Package org.broad.igv.tools

Source Code of org.broad.igv.tools.Preprocessor$Tile

/*
* Copyright (c) 2007-2012 The Broad Institute, Inc.
* SOFTWARE COPYRIGHT NOTICE
* This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
*
* This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
* Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
*/

package org.broad.igv.tools;

import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.feature.Chromosome;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.tdf.*;
import org.broad.igv.tools.parsers.CNParser;
import org.broad.igv.tools.parsers.DataConsumer;
import org.broad.igv.tools.parsers.ToolsWiggleParser;
import org.broad.igv.tools.parsers.UnsortedException;
import org.broad.igv.track.TrackType;
import org.broad.igv.track.WindowFunction;
import org.broad.igv.util.collections.FloatArrayList;
import org.broad.igv.util.collections.IntArrayList;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;

/**
* @author jrobinso
*/
public class Preprocessor implements DataConsumer {

    private static Logger log = Logger.getLogger(Preprocessor.class);

    boolean compressed = true;
    private boolean skipZeroes = false;
    private int nZoom = 7;
    int maxExtFactor = 0;
    Zoom[] zoomLevels;
    int nTracks;
    Genome genome;
    Collection<WindowFunction> windowFunctions;
    private String currentChr = "";
    int currentChrLength;
    private int sizeEstimate;
    int nPtsProcessed = 0;
    StatusMonitor statusMonitor;
    double percentComplete = 0.0;
    int lastStartPosition = 0;
    HashSet<String> skippedChromosomes = new HashSet();
    TDFWriter writer;
    Raw rawData;
    Zoom genomeZoom;
    File outputFile;
    ListAccumulator allDataStats;
    List<String> chromosomes = new ArrayList();
    Set<String> visitedChromosomes = new HashSet();
    Map<String, String> attributes = new HashMap();

    List<WindowFunction> allDataFunctions = Arrays.asList(
            WindowFunction.mean,
            WindowFunction.median,
            WindowFunction.min,
            WindowFunction.max,
            WindowFunction.percentile2,
            WindowFunction.percentile10,
            WindowFunction.percentile90,
            WindowFunction.percentile98);


    public Preprocessor(File outputFile,
                        Genome genome,
                        Collection<WindowFunction> windowFunctions,
                        int sizeEstimate,
                        StatusMonitor monitor) {

        this.outputFile = outputFile;
        this.genome = genome;
        this.windowFunctions = windowFunctions;
        this.sizeEstimate = sizeEstimate;
        this.genome = genome;
        this.statusMonitor = monitor;

        allDataStats = new ListAccumulator(allDataFunctions);
        if (statusMonitor == null) {
            PrintStream monStream = this.outputFile == null ? System.err : System.out;
            statusMonitor = new CommandLineStatusMonitor(monStream);
        }
    }

    /**
     * Called to set initial parameters.  It is required that this be called
     * prior to writing the file
     */
    public void setTrackParameters(TrackType trackType, String trackLine, String[] trackNames) {
        setTrackParameters(trackType, trackLine, trackNames, true);
    }

    /**
     * Called to set initial parameters.  It is required that this be called
     * prior to writing the file
     */
    public void setTrackParameters(TrackType trackType, String trackLine, String[] trackNames, boolean computeWholeGenome) {

        if (trackLine != null) {
            log.info(trackLine);
        }

        if (outputFile != null && writer == null) {
            writer = new TDFWriter(outputFile, genome.getId(), trackType, trackLine, trackNames, windowFunctions, compressed);
            nTracks = trackNames.length;

            // Convert genome coordinates from bp to kbp
            if (computeWholeGenome) {
                int genomeLength = (int) (genome.getNominalLength() / 1000);
                genomeZoom = new Zoom(Globals.CHR_ALL, 0, genomeLength);
            }

            TDFGroup rootGroup = writer.getRootGroup();
            rootGroup.setAttribute("genome", genome.getId());
            rootGroup.setAttribute("maxZoom", String.valueOf(nZoom));

        }
    }


    /**
     * Add an array of data for the given interval.  The array contains a value for each sample/track in this
     * dataset.  The name is an optional probe or feature name.
     */
    public void addData(String chr, int start, int end, float[] data, String name) {

        if (writer == null) {
            return;
        }

        if (skipZeroes) {
            boolean allZeroes = true;
            for (int i = 0; i < data.length; i++) {
                if (data[i] != 0) {
                    allZeroes = false;
                    break;
                }
            }
            if (allZeroes) {
                return;
            }
        }

        // Check for stop signal
        if (statusMonitor != null && statusMonitor.isInterrupted()) {
            throw new PreprocessingException("Preprocessing Halted.");
        }

        if (skippedChromosomes.contains(chr)) {
            return;
        }

        if (currentChr != null && chr.equals(currentChr)) {
            if (start < (lastStartPosition - maxExtFactor)) {
                String msg = "Error: Data is not sorted @ " + chr + " " + start +
                        "  (last position = " + lastStartPosition +
                        "   max ext factor = " + maxExtFactor + ")";
                UnsortedException e = new UnsortedException(msg);
                log.error(msg, e);
                throw e;
            }
        } else {
            newChromosome(chr);
        }

        // Check a second time, in case it just got added
        if (skippedChromosomes.contains(chr)) {
            return;
        }

        // Is this data in range for the chromosome?
        int chrLength = genome.getChromosome(chr).getLength();
        if (start > chrLength) {
            log.warn("Ignoring data from non-existent locus.  Probe = " + name +
                    "  Locus = " + chr + ":" + start + "-" + end + ". " + chr + " length = " + chrLength);
            return;
        }


        // Add to raw data
        rawData.addData(start, end, data, name);

        // Zoom levels
        for (Zoom zl : zoomLevels) {
            zl.addData(start, end, data);
        }

        // Whole genome
        long offset = genome.getCumulativeOffset(chr);
        int gStart = (int) ((offset + start) / 1000);
        int gEnd = Math.max(gStart + 1, (int) ((offset + end) / 1000));


        // Don't include "chrM" in the whole genome view or stats
        if (genomeZoom != null && !(chr.equals("chrM") || chr.equals("M") || chr.equals("MT"))) {
            genomeZoom.addData(gStart, gEnd, data);
            for (int i = 0; i < data.length; i++) {
                allDataStats.add(gEnd - gStart, data[i]);
            }
        }

        lastStartPosition = start;

    }


    /**
     * Start a new chromosome.  Note that data is sorted by chromosome, then start position.
     */
    public void newChromosome(String chr) {

        if (visitedChromosomes.contains(chr)) {
            String msg = "Error: Data is not ordered by start position. Chromosome " + chr +
                    " appears in multiple blocks";
            PreprocessingException e = new PreprocessingException(msg);
            log.error(msg, e);
            throw e;
        }
        visitedChromosomes.add(chr);


        Chromosome c = genome.getChromosome(chr);
        if (c == null) {
            log.warn("Chromosome: " + chr + " not found in .genome file.  Skipping.");
            skippedChromosomes.add(chr);
        } else {

            chromosomes.add(chr);

            log.info("Processing chromosome " + chr);
            if (zoomLevels != null) {
                for (Zoom zl : zoomLevels) {
                    zl.close();
                }
            }
            if (rawData != null) {
                rawData.close();
            }

            currentChr = chr;
            currentChrLength = c.getLength();
            zoomLevels = new Zoom[getNZoom() + 1];
            for (int z = 0; z <= getNZoom(); z++) {
                zoomLevels[z] = new Zoom(chr, z, currentChrLength);
            }

            rawData = new Raw(chr, currentChrLength, 100000);
        }
        lastStartPosition = 0;

    }


    /**
     * Called at end-of-file
     */
    public void parsingComplete() {


    }

    public void finish() {
        if (writer == null) {
            return;
        }

        /**
         * We output the chromosomes in the order specified by the Genome,
         * since that was how the overall genome data was calculated.
         * However we only include those chromosomes which were actually found
         * in the file
         */
        StringBuffer chrString = new StringBuffer();
        Iterator<String> iter = genome.getAllChromosomeNames().iterator();
        while (iter.hasNext()) {
            String chromoName = iter.next();
            if (!chromosomes.contains(chromoName)) {
                continue;
            }
            chrString.append(chromoName);
            if (iter.hasNext()) {
                chrString.append(",");
            }
        }
        writer.getRootGroup().setAttribute(TDFWriter.CHROMOSOMES, chrString.toString());

        for (Map.Entry<String, String> entry : attributes.entrySet()) {
            writer.getRootGroup().setAttribute(entry.getKey(), entry.getValue());
        }

        if (zoomLevels != null) {
            for (Zoom zl : zoomLevels) {
                zl.close();
            }
        }

        if (genomeZoom != null) {
            genomeZoom.close();
        }

        if (rawData == null) {
            // TODO -- delete .tdf file?
            log.warn("No features were found that matched chromosomes in genome: " + genome.getId());
        } else {
            rawData.close();

            // Record max/min
            allDataStats.finish();
            TDFGroup group = writer.getGroup("/");
            group.setAttribute(TDFGroup.USE_PERCENTILE_AUTOSCALING, "true");
            for (WindowFunction wf : allDataFunctions) {
                group.setAttribute(wf.getValue(), String.valueOf(allDataStats.getValue(wf)));
            }
            writer.closeFile();
        }

        if (statusMonitor != null) {
            statusMonitor.setPercentComplete(100);
        }
    }


    public void setType(String type) {
        //this.type = type;
    }

    public void setSortTolerance(int tolerance) {
        maxExtFactor = tolerance;
    }

    public void setAttribute(String key, String value) {
        attributes.put(key, value);
    }

    /**
     * @param sizeEstimate the sizeEstimate to set
     */
    public void setSizeEstimate(int sizeEstimate) {
        this.sizeEstimate = sizeEstimate;
    }

    public void setSkipZeroes(boolean skipZeroes) {
        this.skipZeroes = skipZeroes;
    }

    public int getNZoom() {
        return nZoom;
    }

    public void setNZoom(int nZoom) {
        this.nZoom = nZoom;
    }


    /**
     * Class representing a tile of raw (as opposed to summarized) data.
     */
    class RawTile {
        String dsName;
        int tileNumber;
        int tileStart;
        int tileEnd;
        IntArrayList startArray;
        IntArrayList endArray;
        ArrayList<String> nameList;
        FloatArrayList[] dataArray;

        RawTile(String dsName, int tileNumber, int start, int end) {
            this.dsName = dsName;
            this.tileNumber = tileNumber;
            this.tileStart = start;
            this.tileEnd = end;
            startArray = new IntArrayList();
            endArray = new IntArrayList();
            dataArray = new FloatArrayList[nTracks];
            for (int i = 0; i < nTracks; i++) {
                dataArray[i] = new FloatArrayList();
            }
        }

        void addData(int start, int end, float[] data, String name) {


            if (start > tileEnd) {
                log.warn("Warning: start position > tile end");

            }

            if (end < tileStart) {
                log.warn("Warning: end position > tile end");
            }


            if (name != null && nameList == null) {
                nameList = new ArrayList();
            }


            int dataStart = Math.max(tileStart, start);
            int dataEnd = Math.min(tileEnd, end);
            startArray.add(dataStart);
            endArray.add(dataEnd);
            for (int i = 0; i < data.length; i++) {
                dataArray[i].add(data[i]);
            }
            if (name != null) {
                nameList.add(name);
            }
        }

        void close() {
            try {
                if (startArray.size() > 0) {
                    int[] s = startArray.toArray();
                    int[] e = endArray.toArray();
                    float[][] d = new float[dataArray.length][dataArray[0].size()];
                    for (int i = 0; i < dataArray.length; i++) {
                        d[i] = dataArray[i].toArray();
                    }


                    String[] n = nameList == null ? null : nameList.toArray(new String[]{});
                    TDFBedTile tile = new TDFBedTile(tileStart, s, e, d, n);
                    writer.writeTile(dsName, tileNumber, tile);
                    startArray.clear();
                    endArray.clear();
                    for (int i = 0; i < dataArray.length; i++) {
                        dataArray[i].clear();
                    }
                }
            } catch (IOException ex) {
                log.error(ex.getMessage(), ex);
            }
        }
    }


    /**
     * Class representing the raw dataset
     */
    class Raw {


        String chr;
        String dsName;
        TDFDataset dataset;
        int tileWidth;
        Map<Integer, RawTile> activeTiles = new HashMap();

        Raw(String chr, int chrLength, int tileWidth) {

            this.tileWidth = tileWidth;
            int nTiles = (int) (chrLength / tileWidth) + 1;
            dsName = "/" + chr + "/raw";
            dataset = writer.createDataset(dsName, TDFDataset.DataType.FLOAT, tileWidth, nTiles);

        }

        /**
         * @param start
         * @param end
         * @param data
         */
        public void addData(int start, int end, float[] data, String name) {

            int startTileNumber = (int) (start / tileWidth);
            int endTileNumber = (int) (end / tileWidth);

            // Check for closed tiles -- tiles we are guaranteed to not revisit
            int tmp = (start - maxExtFactor) / tileWidth;
            while (!activeTiles.isEmpty()) {
                Integer tileNumber = activeTiles.keySet().iterator().next();
                if (tileNumber < tmp) {
                    RawTile t = activeTiles.get(tileNumber);
                    t.close();
                    activeTiles.remove(tileNumber);
                } else {
                    break;
                }
            }

            // Add data to all tiles it spans.  The data will be effectively "cut" if it spans multiple tiles.
            for (int t = startTileNumber; t <= endTileNumber; t++) {
                RawTile tile = activeTiles.get(t);
                if (tile == null) {
                    tile = new RawTile(dsName, t, t * tileWidth, (t + 1) * tileWidth);
                    activeTiles.put(t, tile);
                }
                tile.addData(start, end, data, name);
            }

            // Update progress -- assume uniform distribution
            if (statusMonitor != null && sizeEstimate > 0) {
                int p = (int) ((100.0 * nPtsProcessed) / (1.5 * sizeEstimate));
                if (p > percentComplete) {
                    percentComplete = p;
                    statusMonitor.setPercentComplete(percentComplete);
                }
            }
            nPtsProcessed++;
        }

        void close() {
            for (RawTile t : activeTiles.values()) {
                t.close();
            }
            activeTiles = null;
        }


    }

    /**
     * Class representing all the data for a particular zoom level.
     */
    class Zoom {

        int level;
        int tileWidth;
        LinkedHashMap<Integer, Tile> activeTiles = new LinkedHashMap();
        Map<WindowFunction, TDFDataset> datasets = new HashMap();


        Zoom(String chr, int level, int chrLength) {
            int nTiles = (int) Math.pow(2, level);
            tileWidth = chrLength / nTiles + 1;
            this.level = level;

            // Create datasets -- one for each window function
            for (WindowFunction wf : windowFunctions) {
                String dsName = "/" + chr + "/z" + level + "/" + wf.toString();
                datasets.put(wf, writer.createDataset(dsName, TDFDataset.DataType.FLOAT, tileWidth, nTiles));
            }
        }

        public void addData(int start, int end, float[] data) {

            int startTile = start / tileWidth;
            int endTile = end / tileWidth;

            // Check for closed tiles
            int tmp = (start - maxExtFactor) / tileWidth;
            while (!activeTiles.isEmpty()) {
                Integer tileNumber = activeTiles.keySet().iterator().next();
                if (tileNumber < tmp) {
                    Tile t = activeTiles.get(tileNumber);
                    t.close();
                    activeTiles.remove(tileNumber);
                } else {
                    break;
                }
            }


            for (int i = startTile; i <= endTile; i++) {
                Tile t = activeTiles.get(i);
                if (t == null) {
                    t = new Tile(datasets, level, i, 700, tileWidth);
                    activeTiles.put(i, t);
                }
                t.addData(start, end, data);
            }
        }

        // Close all active tiles
        public void close() {
            for (Tile t : activeTiles.values()) {
                t.close();
            }
        }
    }

    /**
     * A tile of summarized data
     */
    class Tile {
        int totalCount;
        int zoomLevel;
        int tileNumber;
        int tileStart;
        int lastFinishedBin = 0;
        double binWidth;
        int nBins;
        int nonEmptyBins;
        ListAccumulator[][] accumulators;
        Map<WindowFunction, TDFDataset> datasets;

        Tile(Map<WindowFunction, TDFDataset> datasets, int zoomLevel, int tileNumber, int nBins, int tileWidth) {
            this.totalCount = 0;
            this.datasets = datasets;
            this.zoomLevel = zoomLevel;
            this.tileNumber = tileNumber;
            this.tileStart = tileNumber * tileWidth;
            this.nBins = nBins;
            this.binWidth = ((double) tileWidth) / nBins;
            this.accumulators = new ListAccumulator[nTracks][nBins];
        }

        /**
         * Add a data point.  Positions are zero based exclusive  (UCSC convention)
         *
         * @param start
         * @param end
         * @param data  array of values at this position,  1 value per track
         */
        void addData(int start, int end, float[] data) {
            totalCount++;

            int startBin = Math.max(0, (int) ((start - tileStart) / binWidth));
            int endBin = Math.min(nBins - 1, (int) ((end - tileStart) / binWidth));

            int tmp = (int) ((start - tileStart - maxExtFactor) / binWidth);


            for (int t = 0; t < nTracks; t++) {
                for (int b = lastFinishedBin; b < tmp; b++) {
                    if (accumulators[t][b] != null) {
                        accumulators[t][b].finish();
                    }
                }
                lastFinishedBin = Math.max(0, tmp - 1);

                for (int b = startBin; b <= endBin; b++) {
                    if (accumulators[t][b] == null) {
                        accumulators[t][b] = new ListAccumulator(datasets.keySet());
                    }
                    accumulators[t][b].add(end - start, data[t]);
                }
            }
        }


        /**
         *
         */
        void close() {

            // Count non-empty bins.  All tracks should be the same
            nonEmptyBins = 0;

            for (int t = 0; t < nTracks; t++) {
                for (int i = 0; i < nBins; i++) {
                    if (accumulators[t][i] != null) {
                        accumulators[t][i].finish();
                        if (t == 0) {
                            nonEmptyBins++;
                        }
                    }
                }
            }

            TDFTile tile = null;

            for (WindowFunction wf : datasets.keySet()) {


                // If < 50% of bins are empty use vary step tile, otherwise use fixed step
                if (nonEmptyBins < 0.5 * nBins) {
                    int[] starts = new int[nonEmptyBins];
                    float[][] data = new float[nTracks][nonEmptyBins];
                    int n = 0;
                    for (int i = 0; i < nBins; i++) {
                        for (int t = 0; t < nTracks; t++) {
                            ListAccumulator acc = accumulators[t][i];
                            if (acc != null) {
                                data[t][n] = acc.getValue(wf);
                                if (t == nTracks - 1) {
                                    starts[n] = (int) (tileStart + (i * binWidth));
                                    n++;
                                }
                            }
                        }
                    }
                    tile = new TDFVaryTile((int) tileStart, binWidth, starts, data);

                } else {
                    float[][] data = new float[nTracks][nBins];
                    for (int t = 0; t < nTracks; t++) {
                        for (int i = 0; i < nBins; i++) {
                            data[t][i] = accumulators[t][i] == null ? Float.NaN : accumulators[t][i].getValue(wf);
                        }
                    }
                    tile = new TDFFixedTile(tileStart, tileStart, binWidth, data);
                }

                String dsName = datasets.get(wf).getName();
                try {
                    writer.writeTile(dsName, tileNumber, tile);
                } catch (IOException exc) {
                    log.error("Error writing tile: " + dsName + " [" + tileNumber + "]", exc);
                    throw new PreprocessingException(exc.getMessage());
                }
            }
        }
    }

    public static boolean isAlignmentFile(String ext) {
        return ext.equalsIgnoreCase(".bam") || ext.equalsIgnoreCase(".sam") ||
                ext.equalsIgnoreCase(".aligned") ||
                ext.equalsIgnoreCase(".bedz") || ext.equalsIgnoreCase(".bed");
    }


    public void preprocess(File iFile, int maxZoomValue, String typeString) throws IOException {

        setNZoom(maxZoomValue);


        String tmp = (typeString == null ? iFile.getAbsolutePath() : typeString).toLowerCase();
        if (tmp.endsWith(".txt")) tmp = tmp.substring(0, tmp.length() - 4);
        if (tmp.endsWith(".gz")) tmp = tmp.substring(0, tmp.length() - 3);
        if (tmp.endsWith("wig") || tmp.endsWith("bedgraph") || tmp.endsWith("cpg") || tmp.endsWith("map")) {
            ToolsWiggleParser wg = new ToolsWiggleParser(iFile.getAbsolutePath(), this, genome);
            wg.parse();
        } else if (tmp.endsWith("cn") || tmp.endsWith("xcn") || tmp.endsWith("igv") || tmp.endsWith("snp")) {
            CNParser cnParser = new CNParser(iFile.getAbsolutePath(), this, genome);
            cnParser.parse();
        } else {
            String msg = "Error: cannot convert files of type '" + tmp + "' to TDF format.";
            msg += "\nTry specifying the file type with the --fileType parameter.";
            throw new RuntimeException(msg);
        }
    }

    /**
     * Strip of "gz" and "txt" extensions to get to true format extension.
     *
     * @param filename
     * @return
     */
    public static String getExtension(String filename) {

        if (filename.endsWith(".gz")) {
            filename = filename.substring(0, filename.length() - 3);
        }

        if (filename.toLowerCase().endsWith(".txt")) {
            filename = filename.substring(0, filename.length() - 4);
        }

        int idx = filename.lastIndexOf('.');
        if (idx < 0) {
            return "";
        } else {
            return filename.substring(idx).toLowerCase();
        }
    }
}
TOP

Related Classes of org.broad.igv.tools.Preprocessor$Tile

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.