Package org.broadinstitute.gatk.tools.walkers.diagnostics

Source Code of org.broadinstitute.gatk.tools.walkers.diagnostics.CoveredByNSamplesSites

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.tools.walkers.diagnostics;


import org.broadinstitute.gatk.engine.walkers.By;
import org.broadinstitute.gatk.engine.walkers.DataSource;
import org.broadinstitute.gatk.engine.walkers.RodWalker;
import org.broadinstitute.gatk.engine.walkers.TreeReducible;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.commandline.ArgumentCollection;
import org.broadinstitute.gatk.utils.commandline.Output;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;


import java.io.*;
import java.util.Collection;

/**
* Print intervals file with all the variant sites for which most of the samples have good coverage
*
* <p>
* CoveredByNSamplesSites is a GATK tool for filtering out sites based on their coverage.
* The sites that pass the filter are printed out to an intervals file.
*
* See argument defaults for what constitutes "most" samples and "good" coverage. These parameters can be modified from the command line.
* </p>
*
* <h3>Input</h3>
* <p>
* A variant file and optionally min coverage and sample percentage values.
* </p>
*
* <h3>Output</h3>
* <p>
* An intervals file.
* </p>
*
* <h3>Example</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
*   -R ref.fasta \
*   -T CoveredByNSamplesSites \
*   -V input.vcf \
*   -out output.intervals \
*   -minCov 15
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@By(DataSource.REFERENCE_ORDERED_DATA)
public class CoveredByNSamplesSites extends RodWalker<GenomeLoc, Integer> implements TreeReducible<Integer> {

    @Output(fullName = "OutputIntervals", shortName = "out", doc = "Name of file for output intervals")
    PrintStream outputStream;

    @ArgumentCollection
    protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

    @Argument(fullName = "minCoverage", shortName = "minCov",doc = "only samples that have coverage bigger than minCoverage will be counted",required = false)
    int minCoverage = 10;

    @Argument(fullName = "percentageOfSamples", shortName = "percentage", doc = "only sites where at least percentageOfSamples of the samples have good coverage, will be emitted", required = false)
    double percentageOfSamples = 0.9;

    @Override
    public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null )
            return null;

        Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
        if ( VCs.size() == 0 )
            return null;

        boolean emitSite = false;
        for(VariantContext vc : VCs){
            int coveredSamples = 0;
            final GenotypesContext genotypes = vc.getGenotypes();
            final int numOfGenotypes = genotypes.size();
            for(Genotype g : genotypes){
                if(g.getDP() >= minCoverage)
                    coveredSamples++;
            }
            if((double)coveredSamples/numOfGenotypes > percentageOfSamples){
                emitSite = true;
            }
        }
        if (emitSite)
            return ref.getLocus();
        else
            return null;
    }

    @Override
    public Integer reduceInit() { return 0; }

    @Override
    public Integer reduce(GenomeLoc value, Integer sum) {
        if ( value != null ) {
            outputStream.println(value);
            sum++;
        }
        return sum;
    }

    @Override
    public Integer treeReduce(Integer lhs, Integer rhs) {
        return lhs + rhs;
    }

    /**
     *
     * @param result the number of sites that passed the filter.
     */
    public void onTraversalDone(Integer result) {
        logger.info(result+" sites that have "+(percentageOfSamples*100)+"% of the samples with at least "+minCoverage+" coverage.\n");
    }



}
TOP

Related Classes of org.broadinstitute.gatk.tools.walkers.diagnostics.CoveredByNSamplesSites

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.