Package org.broadinstitute.gatk.engine.traversals

Source Code of org.broadinstitute.gatk.engine.traversals.TraverseActiveRegionsUnitTest

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.engine.traversals;

import com.google.java.contract.PreconditionError;
import htsjdk.samtools.*;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.engine.arguments.ValidationExclusion;
import org.broadinstitute.gatk.engine.datasources.reads.*;
import org.broadinstitute.gatk.engine.filters.ReadFilter;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation;
import org.broadinstitute.gatk.engine.walkers.Walker;
import org.broadinstitute.gatk.utils.GenomeLocSortedSet;
import org.broadinstitute.gatk.utils.SampleUtils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.gatk.utils.interval.IntervalMergingRule;
import org.broadinstitute.gatk.utils.interval.IntervalUtils;
import org.broadinstitute.gatk.utils.sam.*;
import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.gatk.engine.executive.WindowMaker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;


import java.io.File;
import java.io.IOException;
import java.util.*;

/**
* Created with IntelliJ IDEA.
* User: thibault
* Date: 11/13/12
* Time: 2:47 PM
*
* Test the Active Region Traversal Contract
* http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract
*/
public class TraverseActiveRegionsUnitTest extends BaseTest {
    private final static boolean ENFORCE_CONTRACTS = false;
    private final static boolean DEBUG = false;

    @DataProvider(name = "TraversalEngineProvider")
    public Object[][] makeTraversals() {
        final List<Object[]> traversals = new LinkedList<Object[]>();
        traversals.add(new Object[]{new TraverseActiveRegions<>()});
        return traversals.toArray(new Object[][]{});
    }

    private IndexedFastaSequenceFile reference;
    private SAMSequenceDictionary dictionary;
    private GenomeLocParser genomeLocParser;

    private List<GenomeLoc> intervals;

    private File testBAM;

    @BeforeClass
    private void init() throws IOException {
        //reference = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); // hg19Reference));
        reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
        dictionary = reference.getSequenceDictionary();
        genomeLocParser = new GenomeLocParser(dictionary);

        // TODO: reads with indels
        // TODO: reads which span many regions
        // TODO: reads which are partially between intervals (in/outside extension)
        // TODO: duplicate reads
        // TODO: read at the end of a contig
        // TODO: reads which are completely outside intervals but within extension
        // TODO: test the extension itself
        // TODO: unmapped reads

        intervals = new ArrayList<GenomeLoc>();
        intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));
        intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999));
        intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000));
        intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100));
        intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList();

        List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
        reads.add(buildSAMRecord("simple", "1", 100, 200));
        reads.add(buildSAMRecord("overlap_equal", "1", 10, 20));
        reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21));
        reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009));
        reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008));
        reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000));
        reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050));
        reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990));
        reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000));
        reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385));
        reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400));
        reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414));
        reads.add(buildSAMRecord("simple20", "20", 10025, 10075));

        createBAM(reads);
    }

    private void createBAM(List<GATKSAMRecord> reads) throws IOException {
        testBAM = createTempFile("TraverseActiveRegionsUnitTest", ".bam");

        SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM);
        for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) {
            out.addAlignment(read);
        }
        out.close();

        new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit();
        new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit();
    }

    @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
    public void testAllBasesSeen(TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

        List<GenomeLoc> activeIntervals = getIsActiveIntervals(t, walker, intervals);
        // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call
        verifyEqualIntervals(intervals, activeIntervals);
    }

    private List<GenomeLoc> getIsActiveIntervals(final TraverseActiveRegions t, DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
        List<GenomeLoc> activeIntervals = new ArrayList<GenomeLoc>();
        for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) {
            t.traverse(walker, dataProvider, 0);
            activeIntervals.addAll(walker.isActiveCalls);
        }

        return activeIntervals;
    }

    @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class)
    public void testIsActiveRangeLow (TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1);
        getActiveRegions(t, walker, intervals).values();
    }

    @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class)
    public void testIsActiveRangeHigh (TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1);
        getActiveRegions(t, walker, intervals).values();
    }

    @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
    public void testActiveRegionCoverage(TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), true);

        Collection<ActiveRegion> activeRegions = getActiveRegions(t, walker, intervals).values();
        verifyActiveRegionCoverage(intervals, activeRegions);
    }

    private void verifyActiveRegionCoverage(List<GenomeLoc> intervals, Collection<ActiveRegion> activeRegions) {
        List<GenomeLoc> intervalStarts = new ArrayList<GenomeLoc>();
        List<GenomeLoc> intervalStops = new ArrayList<GenomeLoc>();

        for (GenomeLoc interval : intervals) {
            intervalStarts.add(interval.getStartLocation());
            intervalStops.add(interval.getStopLocation());
        }

        Map<GenomeLoc, ActiveRegion> baseRegionMap = new HashMap<GenomeLoc, ActiveRegion>();

        for (ActiveRegion activeRegion : activeRegions) {
            for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) {
                // Contract: Regions do not overlap
                Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region");
                baseRegionMap.put(activeLoc, activeRegion);
            }

            GenomeLoc start = activeRegion.getLocation().getStartLocation();
            if (intervalStarts.contains(start))
                intervalStarts.remove(start);

            GenomeLoc stop = activeRegion.getLocation().getStopLocation();
            if (intervalStops.contains(stop))
                intervalStops.remove(stop);
        }

        for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) {
            // Contract: Each location in the interval(s) is in exactly one region
            // Contract: The total set of regions exactly matches the analysis interval(s)
            Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region");
            baseRegionMap.remove(baseLoc);
        }

        // Contract: The total set of regions exactly matches the analysis interval(s)
        Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals");

        // Contract: All explicit interval boundaries must also be region boundaries
        Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location");
        Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location");
    }

    @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
    public void testActiveRegionExtensionOnContig(TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

        Collection<ActiveRegion> activeRegions = getActiveRegions(t, walker, intervals).values();
        for (ActiveRegion activeRegion : activeRegions) {
            GenomeLoc loc = activeRegion.getExtendedLoc();

            // Contract: active region extensions must stay on the contig
            Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig");
            int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength();
            Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig");
        }
    }

    @Test(enabled = true && !DEBUG, dataProvider = "TraversalEngineProvider")
    public void testPrimaryReadMapping(TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals),
                EnumSet.of(ActiveRegionReadState.PRIMARY),
                true);

        // Contract: Each read has the Primary state in a single region (or none)
        // This is the region of maximum overlap for the read (earlier if tied)

        // simple: Primary in 1:1-999
        // overlap_equal: Primary in 1:1-999
        // overlap_unequal: Primary in 1:1-999
        // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999
        // outside_intervals: none
        // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // simple20: Primary in 20:10000-10100

        Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(t, walker, intervals);
        ActiveRegion region;

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
        verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        verifyReadMapping(region, "boundary_unequal", "boundary_1_pre", "boundary_equal", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        verifyReadMapping(region);

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        verifyReadMapping(region, "simple20");
    }

    @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
    public void testNonPrimaryReadMapping(TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals),
                EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY),
                true);

        // Contract: Each read has the Primary state in a single region (or none)
        // This is the region of maximum overlap for the read (earlier if tied)

        // Contract: Each read has the Non-Primary state in all other regions it overlaps

        // simple: Primary in 1:1-999
        // overlap_equal: Primary in 1:1-999
        // overlap_unequal: Primary in 1:1-999
        // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999
        // outside_intervals: none
        // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // simple20: Primary in 20:10000-10100

        Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(t, walker, intervals);
        ActiveRegion region;

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
        verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        verifyReadMapping(region, "simple20");
    }

    @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
    public void testExtendedReadMapping(TraverseActiveRegions t) {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals),
                EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED),
                true);

        // Contract: Each read has the Primary state in a single region (or none)
        // This is the region of maximum overlap for the read (earlier if tied)

        // Contract: Each read has the Non-Primary state in all other regions it overlaps
        // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended

        // simple: Primary in 1:1-999
        // overlap_equal: Primary in 1:1-999
        // overlap_unequal: Primary in 1:1-999
        // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
        // outside_intervals: none
        // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // simple20: Primary in 20:10000-10100

        Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(t, walker, intervals);
        ActiveRegion region;

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
        verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        verifyReadMapping(region, "simple20");
    }

    @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
    public void testUnmappedReads(TraverseActiveRegions t) {
        // TODO
    }

    private void verifyReadMapping(ActiveRegion region, String... reads) {
        Assert.assertNotNull(region, "Region was unexpectedly null");
        final Set<String> regionReads = new HashSet<String>();
        for (SAMRecord read : region.getReads()) {
            Assert.assertFalse(regionReads.contains(read.getReadName()), "Duplicate reads detected in region " + region + " read " + read.getReadName());
            regionReads.add(read.getReadName());
        }

        Collection<String> wantReads = new ArrayList<String>(Arrays.asList(reads));
        for (SAMRecord read : region.getReads()) {
            String regionReadName = read.getReadName();
            Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " incorrectly assigned to active region " + region);
            wantReads.remove(regionReadName);
        }

        Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region + ", wanted " + (wantReads.isEmpty() ? "" : wantReads.iterator().next()));
    }

    private Map<GenomeLoc, ActiveRegion> getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
        return getActiveRegions(t, walker, intervals, testBAM);
    }

    private Map<GenomeLoc, ActiveRegion> getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List<GenomeLoc> intervals, final File bam) {
        for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam))
            t.traverse(walker, dataProvider, 0);

        return walker.mappedActiveRegions;
    }

    private Collection<GenomeLoc> toSingleBaseLocs(GenomeLoc interval) {
        List<GenomeLoc> bases = new ArrayList<GenomeLoc>();
        if (interval.size() == 1)
            bases.add(interval);
        else {
            for (int location = interval.getStart(); location <= interval.getStop(); location++)
                bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location));
        }

        return bases;
    }

    private Collection<GenomeLoc> toSingleBaseLocs(List<GenomeLoc> intervals) {
        Set<GenomeLoc> bases = new TreeSet<GenomeLoc>();    // for sorting and uniqueness
        for (GenomeLoc interval : intervals)
            bases.addAll(toSingleBaseLocs(interval));

        return bases;
    }

    private void verifyEqualIntervals(List<GenomeLoc> aIntervals, List<GenomeLoc> bIntervals) {
        Collection<GenomeLoc> aBases = toSingleBaseLocs(aIntervals);
        Collection<GenomeLoc> bBases = toSingleBaseLocs(bIntervals);

        Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size());

        Iterator<GenomeLoc> aIter = aBases.iterator();
        Iterator<GenomeLoc> bIter = bBases.iterator();
        while (aIter.hasNext() && bIter.hasNext()) {
            GenomeLoc aLoc = aIter.next();
            GenomeLoc bLoc = bIter.next();
            Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc);
        }
    }

    // copied from LocusViewTemplate
    protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) {
        SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test");
        header.setSequenceDictionary(dictionary);
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        GATKSAMRecord record = new GATKSAMRecord(header);

        record.setReadName(readName);
        record.setReferenceIndex(dictionary.getSequenceIndex(contig));
        record.setAlignmentStart(alignmentStart);

        Cigar cigar = new Cigar();
        int len = alignmentEnd - alignmentStart + 1;
        cigar.add(new CigarElement(len, CigarOperator.M));
        record.setCigar(cigar);
        record.setReadString(new String(new char[len]).replace("\0", "A"));
        record.setBaseQualities(new byte[len]);
        record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test")));

        return record;
    }

    private List<LocusShardDataProvider> createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List<GenomeLoc> intervals, File bamFile) {
        GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
        engine.setGenomeLocParser(genomeLocParser);

        Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
        SAMReaderID readerID = new SAMReaderID(bamFile, new Tags());
        samFiles.add(readerID);

        SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser,
                false,
                ValidationStringency.STRICT,
                null,
                null,
                new ValidationExclusion(),
                new ArrayList<ReadFilter>(),
                new ArrayList<ReadTransformer>(),
                false, (byte)30, false, true, null, IntervalMergingRule.ALL);

        engine.setReadsDataSource(dataSource);
        final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());

        traverseActiveRegions.initialize(engine, walker);
        List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
        for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) {
            for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) {
                providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
            }
        }

        return providers;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // Combinatorial tests to ensure reads are going into the right regions
    //
    // ---------------------------------------------------------------------------------------------------------

    @DataProvider(name = "CombinatorialARTTilingProvider")
    public Object[][] makeCombinatorialARTTilingProvider() {
        final List<Object[]> tests = new LinkedList<Object[]>();

        final List<Integer> starts = Arrays.asList(
                1, // very start of the chromosome
                ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary
                ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary
        );

        final List<EnumSet<ActiveRegionReadState>> allReadStates = Arrays.asList(
                EnumSet.of(ActiveRegionReadState.PRIMARY),
                EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY),
                EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)
        );

        final int maxTests = Integer.MAX_VALUE;
        int nTests = 0;
        for ( final int readLength : Arrays.asList(100) ) {
            for ( final int skips : Arrays.asList(0, 10) ) {
                for ( final int start : starts ) {
                    for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) {
                        for ( final int nLoci : Arrays.asList(1, 1000) ) {
                            final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci);
                            bamBuilder.setReadLength(readLength);
                            bamBuilder.setSkipNLoci(skips);
                            bamBuilder.setAlignmentStart(start);
                            for ( EnumSet<ActiveRegionReadState> readStates : allReadStates ) {
                                for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) {
                                    nTests++;
                                    if ( nTests < maxTests ) // && nTests == 1238 )
                                        tests.add(new Object[]{new TraverseActiveRegions<>(), nTests, activeRegions, readStates, bamBuilder});
                                }
                            }
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    private Collection<GenomeLocSortedSet> enumerateActiveRegions(final int start, final int stop) {
        // should basically cut up entire region into equal sized chunks, of
        // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive
        // Need to make sure we include some edge cases:
        final List<GenomeLocSortedSet> activeRegions = new LinkedList<GenomeLocSortedSet>();

        for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) {
            for ( final boolean startWithActive : Arrays.asList(true, false) ) {
                activeRegions.add(makeActiveRegionMask(start, stop, stepSize,  startWithActive));
            }
        }

        // active region is the whole interval
        activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop)));

        // active region extends up to the end of the data, but doesn't include start
        activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop)));

        return activeRegions;
    }

    private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) {
        final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser);

        boolean includeRegion = startWithActive;
        for ( int left = start; left < stop; left += stepSize) {
            final int right = left + stepSize;
            final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right);
            if ( includeRegion )
                active.add(region);
            includeRegion = ! includeRegion;
        }

        return active;
    }


    @Test(enabled = true && ! DEBUG, dataProvider = "CombinatorialARTTilingProvider")
    public void testARTReadsInActiveRegions(final TraverseActiveRegions<Integer, Integer> traversal, final int id, final GenomeLocSortedSet activeRegions, final EnumSet<ActiveRegionReadState> readStates, final ArtificialBAMBuilder bamBuilder) {
        logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder);
        final List<GenomeLoc> intervals = Arrays.asList(
                genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())
        );

        final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false);
        walker.setStates(readStates);

        final Map<GenomeLoc, ActiveRegion> activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile());

        final Set<String> alreadySeenReads = new HashSet<String>(); // for use with the primary / non-primary
        for ( final ActiveRegion region : activeRegionsMap.values() ) {
            final Set<String> readNamesInRegion = readNamesInRegion(region);
            int nReadsExpectedInRegion = 0;
            for ( final GATKSAMRecord read : bamBuilder.makeReads() ) {
                final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read);

                boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED)
                        ? region.getExtendedLoc().overlapsP(readLoc)
                        : region.getLocation().overlapsP(readLoc);

                if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) {
                    if ( alreadySeenReads.contains(read.getReadName()) )
                        shouldBeInRegion = false;
                    else if ( shouldBeInRegion )
                        alreadySeenReads.add(read.getReadName());
                }

                String msg = readNamesInRegion.contains(read.getReadName()) == shouldBeInRegion ? "" : "Region " + region +
                        " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite";
                Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, msg);

                nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0;
            }

            Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected");
        }
    }

    private Set<String> readNamesInRegion(final ActiveRegion region) {
        final Set<String> readNames = new LinkedHashSet<String>(region.getReads().size());
        for ( final SAMRecord read : region.getReads() )
            readNames.add(read.getReadName());
        return readNames;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // Make sure all insertion reads are properly included in the active regions
    //
    // ---------------------------------------------------------------------------------------------------------

    @Test(dataProvider = "TraversalEngineProvider", enabled = true && ! DEBUG)
    public void ensureAllInsertionReadsAreInActiveRegions(final TraverseActiveRegions<Integer, Integer> traversal) {

        final int readLength = 10;
        final int start = 20;
        final int nReadsPerLocus = 10;
        final int nLoci = 3;

        final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci);
        bamBuilder.setReadLength(readLength);
        bamBuilder.setAlignmentStart(start);

        // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region
        GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength);
        allI.setCigarString(readLength + "I");
        allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0)));

        bamBuilder.addReads(allI);

        final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser());
        activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30));
        final List<GenomeLoc> intervals = Arrays.asList(
                genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())
        );

        final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false);

        final Map<GenomeLoc, ActiveRegion> activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile());

        final ActiveRegion region = activeRegionsMap.values().iterator().next();
        int nReadsExpectedInRegion = 0;

        final Set<String> readNamesInRegion = readNamesInRegion(region);
        for ( final GATKSAMRecord read : bamBuilder.makeReads() ) {
            Assert.assertTrue(readNamesInRegion.contains(read.getReadName()),
                    "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't");
            nReadsExpectedInRegion++;
        }

        Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected");
    }
}
TOP

Related Classes of org.broadinstitute.gatk.engine.traversals.TraverseActiveRegionsUnitTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.