Package org.broadinstitute.gatk.utils.locusiterator

Source Code of org.broadinstitute.gatk.utils.locusiterator.AlignmentStateMachine

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.locusiterator;

import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;

/**
* Steps a single read along its alignment to the genome
*
* The logical model for generating extended events is as follows: the "record state"
* implements the traversal along the reference; thus stepForwardOnGenome() returns
* on every and only on actual reference bases. This can be a (mis)match or a deletion
* (in the latter case, we still return on every individual reference base the deletion spans).
*
* User: depristo
* Date: 1/5/13
* Time: 1:08 PM
*/
@Invariant({
        "nCigarElements >= 0",
        "cigar != null",
        "read != null",
        "currentCigarElementOffset >= -1",
        "currentCigarElementOffset <= nCigarElements"
})
public class AlignmentStateMachine {
    /**
     * Our read
     */
    private final GATKSAMRecord read;
    private final Cigar cigar;
    private final int nCigarElements;
    private int currentCigarElementOffset = -1;

    /**
     * how far are we offset from the start of the read bases?
     */
    private int readOffset;

    /**
     * how far are we offset from the alignment start on the genome?
     */
    private int genomeOffset;

    /**
     * Our cigar element
     */
    private CigarElement currentElement;

    /**
     * how far are we into our cigarElement?
     */
    private int offsetIntoCurrentCigarElement;

    @Requires({"read != null", "read.getAlignmentStart() != -1", "read.getCigar() != null"})
    public AlignmentStateMachine(final GATKSAMRecord read) {
        this.read = read;
        this.cigar = read.getCigar();
        this.nCigarElements = cigar.numCigarElements();
        initializeAsLeftEdge();
    }

    /**
     * Initialize the state variables to put this machine one bp before the
     * start of the alignment, so that a call to stepForwardOnGenome() will advance
     * us to the first proper location
     */
    @Ensures("isLeftEdge()")
    private void initializeAsLeftEdge() {
        readOffset = offsetIntoCurrentCigarElement = genomeOffset = -1;
        currentElement = null;
    }

    /**
     * Get the read we are aligning to the genome
     * @return a non-null GATKSAMRecord
     */
    @Ensures("result != null")
    public GATKSAMRecord getRead() {
        return read;
    }

    /**
     * Get the reference index of the underlying read
     *
     * @return the reference index of the read
     */
    @Ensures("result == getRead().getReferenceIndex()")
    public int getReferenceIndex() {
        return getRead().getReferenceIndex();
    }

    /**
     * Is this the left edge state?  I.e., one that is before or after the current read?
     * @return true if this state is an edge state, false otherwise
     */
    public boolean isLeftEdge() {
        return readOffset == -1;
    }

    /**
     * Are we on the right edge?  I.e., is the current state off the right of the alignment?
     * @return true if off the right edge, false if otherwise
     */
    public boolean isRightEdge() {
        return readOffset == read.getReadLength();
    }

    /**
     * What is our current offset in the read's bases that aligns us with the reference genome?
     *
     * @return the current read offset position.  If an edge will be == -1
     */
    @Ensures("result >= -1")
    public int getReadOffset() {
        return readOffset;
    }

    /**
     * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
     *
     * @return the current offset from the alignment start on the genome.  If this state is
     * at the left edge the result will be -1;
     */
    @Ensures("result >= -1")
    public int getGenomeOffset() {
        return genomeOffset;
    }

    /**
     * Get the position (1-based as standard) of the current alignment on the genome w.r.t. the read's alignment start
     * @return the position on the genome of the current state in absolute coordinates
     */
    @Ensures("result > 0")
    public int getGenomePosition() {
        return read.getAlignmentStart() + getGenomeOffset();
    }

    /**
     * Gets #getGenomePosition but as a 1 bp GenomeLoc
     * @param genomeLocParser the parser to use to create the genome loc
     * @return a non-null genome location with start position of getGenomePosition
     */
    @Requires("genomeLocParser != null")
    @Ensures("result != null")
    public GenomeLoc getLocation(final GenomeLocParser genomeLocParser) {
        // TODO -- may return wonky results if on an edge (could be 0 or could be beyond genome location)
        return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
    }

    /**
     * Get the cigar element we're currently aligning with.
     *
     * For example, if the cigar string is 2M2D2M and we're in the second step of the
     * first 2M, then this function returns the element 2M.  After calling stepForwardOnGenome
     * this function would return 2D.
     *
     * @return the cigar element, or null if we're the left edge
     */
    @Ensures("result != null || isLeftEdge() || isRightEdge()")
    public CigarElement getCurrentCigarElement() {
        return currentElement;
    }

    /**
     * Get the offset of the current cigar element among all cigar elements in the read
     *
     * Suppose our read's cigar is 1M2D3M, and we're at the first 1M.  This would
     * return 0.  Stepping forward puts us in the 2D, so our offset is 1.  Another
     * step forward would result in a 1 again (we're in the second position of the 2D).
     * Finally, one more step forward brings us to 2 (for the 3M element)
     *
     * @return the offset of the current cigar element in the reads's cigar.  Will return -1 for
     * when the state is on the left edge, and be == the number of cigar elements in the
     * read when we're past the last position on the genome
     */
    @Ensures({"result >= -1", "result <= nCigarElements"})
    public int getCurrentCigarElementOffset() {
        return currentCigarElementOffset;
    }

    /**
     * Get the offset of the current state into the current cigar element
     *
     * That is, suppose we have a read with cigar 2M3D4M, and we're right at
     * the second M position.  offsetIntoCurrentCigarElement would be 1, as
     * it's two elements into the 2M cigar.  Now stepping forward we'd be
     * in cigar element 3D, and our offsetIntoCurrentCigarElement would be 0.
     *
     * @return the offset (from 0) of the current state in the current cigar element.
     *  Will be 0 on the right edge, and -1 on the left.
     */
    @Ensures({"result >= 0 || (result == -1 && isLeftEdge())", "!isRightEdge() || result == 0"})
    public int getOffsetIntoCurrentCigarElement() {
        return offsetIntoCurrentCigarElement;
    }

    /**
     * Convenience accessor of the CigarOperator of the current cigar element
     *
     * Robust to the case where we're on the edge, and currentElement is null, in which
     * case this function returns null as well
     *
     * @return null if this is an edge state
     */
    @Ensures("result != null || isLeftEdge() || isRightEdge()")
    public CigarOperator getCigarOperator() {
        return currentElement == null ? null : currentElement.getOperator();
    }

    @Override
    public String toString() {
        return String.format("%s ro=%d go=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, offsetIntoCurrentCigarElement, currentElement);
    }

    // -----------------------------------------------------------------------------------------------
    //
    // Code for setting up prev / next states
    //
    // -----------------------------------------------------------------------------------------------

    /**
     * Step the state machine forward one unit
     *
     * Takes the current state of this machine, and advances the state until the next on-genome
     * cigar element (M, X, =, D) is encountered, at which point this function returns with the
     * cigar operator of the current element.
     *
     * Assumes that the AlignmentStateMachine is in the left edge state at the start, so that
     * stepForwardOnGenome() can be called to move the machine to the first alignment position.  That
     * is, the normal use of this code is:
     *
     * AlignmentStateMachine machine = new AlignmentStateMachine(read)
     * machine.stepForwardOnGenome()
     * // now the machine is at the first position on the genome
     *
     * When stepForwardOnGenome() advances off the right edge of the read, the state machine is
     * left in a state such that isRightEdge() returns true and returns null, indicating the
     * the machine cannot advance further.  The machine may explode, though this is not contracted,
     * if stepForwardOnGenome() is called after a previous call returned null.
     *
     * @return the operator of the cigar element that machine stopped at, null if we advanced off the end of the read
     */
    @Ensures("result != null || isRightEdge()")
    public CigarOperator stepForwardOnGenome() {
        // loop until we either find a cigar element step that moves us one base on the genome, or we run
        // out of cigar elements
        while ( true ) {
            // we enter this method with readOffset = index of the last processed base on the read
            // (-1 if we did not process a single base yet); this can be last matching base,
            // or last base of an insertion
            if (currentElement == null || (offsetIntoCurrentCigarElement + 1) >= currentElement.getLength()) {
                currentCigarElementOffset++;
                if (currentCigarElementOffset < nCigarElements) {
                    currentElement = cigar.getCigarElement(currentCigarElementOffset);
                    offsetIntoCurrentCigarElement = -1;
                    // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
                    // we reenter in order to re-check offsetIntoCurrentCigarElement against currentElement's length
                    continue;
                } else {
                    if (currentElement != null && currentElement.getOperator() == CigarOperator.D)
                        throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");

                    // we're done, so set the offset of the cigar to 0 for cleanliness, as well as the current element
                    offsetIntoCurrentCigarElement = 0;
                    readOffset = read.getReadLength();
                    currentElement = null;

                    // Reads that contain indels model the genomeOffset as the following base in the reference.  Because
                    // we fall into this else block only when indels end the read, increment genomeOffset  such that the
                    // current offset of this read is the next ref base after the end of the indel.  This position will
                    // model a point on the reference somewhere after the end of the read.
                    genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:

                    // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
                    return null;
                }
            }

            offsetIntoCurrentCigarElement++;
            boolean done = false;
            switch (currentElement.getOperator()) {
                case H: // ignore hard clips
                case P: // ignore pads
                    offsetIntoCurrentCigarElement = currentElement.getLength();
                    break;
                case I: // insertion w.r.t. the reference
                case S: // soft clip
                    offsetIntoCurrentCigarElement = currentElement.getLength();
                    readOffset += currentElement.getLength();
                    break;
                case D: // deletion w.r.t. the reference
                    if (readOffset < 0)             // we don't want reads starting with deletion, this is a malformed cigar string
                        throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
                    // should be the same as N case
                    genomeOffset++;
                    done = true;
                    break;
                case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
                    genomeOffset++;
                    done = true;
                    break;
                case M:
                case EQ:
                case X:
                    readOffset++;
                    genomeOffset++;
                    done = true;
                    break;
                default:
                    throw new IllegalStateException("Case statement didn't deal with cigar op: " + currentElement.getOperator());
            }

            if ( done )
                return currentElement.getOperator();
        }
    }

    /**
     * Create a new PileupElement based on the current state of this element
     *
     * Must not be a left or right edge
     *
     * @return a pileup element
     */
    @Ensures("result != null")
    public final PileupElement makePileupElement() {
        if ( isLeftEdge() || isRightEdge() )
            throw new IllegalStateException("Cannot make a pileup element from an edge alignment state");
        return new PileupElement(read,
                getReadOffset(),
                getCurrentCigarElement(),
                getCurrentCigarElementOffset(),
                getOffsetIntoCurrentCigarElement());
    }
}
TOP

Related Classes of org.broadinstitute.gatk.utils.locusiterator.AlignmentStateMachine

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.