Source Code of edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTesterMain

/*
 * Created on June 24, 2002, 2:11 PM
 * LeaveOneOutTesterMain.java is the main program for leave-one-out testing.
 */
/**
 *
 * @author  wangqion
 * @version 
 */
package edu.msu.cme.rdp.classifier.train.validation.leaveoneout;


/**
 * This is the Main class to do leave-one-out testing
 */
import edu.msu.cme.rdp.classifier.cli.CmdOptions;
import java.io.*;


import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;


import edu.msu.cme.rdp.classifier.train.LineageSequence;
import edu.msu.cme.rdp.classifier.train.LineageSequenceParser;
import edu.msu.cme.rdp.classifier.train.validation.NBClassifier;
import edu.msu.cme.rdp.classifier.train.validation.TreeFactory;
import edu.msu.cme.rdp.readseq.utils.orientation.GoodWordIterator;


public class LeaveOneOutTesterMain {


    private static final Options options = new Options();
    // long options
    public static final String QUERYFILE_LONG_OPT = "queryFile";
    public static final String OUTFILE_LONG_OPT = "outputFile";
    public static final String TRAIN_SEQFILE_LONG_OPT = "trainSeqFile";
    public static final String TRAIN_TAXONFILE_LONG_OPT = "trainTaxonFile";
    public static final String LENGTH_LONG_OPT = "length";
    //short options
    public static final String QUERYFILE_SHORT_OPT = "q";
    public static final String OUTFILE_SHORT_OPT = "o";
    public static final String TRAIN_SEQFILE_SHORT_OPT = "s";
    public static final String TRAIN_TAXONFILE_SHORT_OPT = "t";
    public static final String LENGTH_SHORT_OPT = "l";
    public static final String HIDETAXON_SHORT_OPT = "h";
    //public static final String KMER_SHORT_OPT = "k";
    
    // description of the options
    public static final String TRAIN_SEQFILE_DESC = "training files in fasta format labelled with the lineage information. "
            + "\nThe header of this fasta file starts with '>', followed by the sequence name, white space(s) and a list taxon names seperated by ';' with highest rank taxon first"
            + "\nex: Root;Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Enterobacter";
    public static final String TRAIN_TAXONFILE_DESC = "contains the hierarchical taxonomy information, taxon name and rank together is unique."
            + " \nThe format looks like the following: taxid*taxon name*parent taxid*depth*rank"
            + " Note taxid, the parent taxid and depth should be in integer format. depth indicates the depth from the root taxon.";
    public static final String LENGTH_DESC = "the default is to test the entire query sequence. "
            + "if specifiy a length, a region of the query sequence with the specified length will be random choosen for testing";
    public static final String QUERYFILE_DESC = "query file contains sequences, same format as the training sequence file";
    public static final String OUTFILE_DESC = "stat of leave-one-out testing including correctness rate at each rank, misclassified rate for each taxon ";
    
    static {
        options.addOption(new Option(TRAIN_SEQFILE_SHORT_OPT, TRAIN_SEQFILE_LONG_OPT, true, TRAIN_SEQFILE_DESC));
        options.addOption(new Option(TRAIN_TAXONFILE_SHORT_OPT, TRAIN_TAXONFILE_LONG_OPT, true, TRAIN_TAXONFILE_DESC + " Recommend removing duplicate seqeunces using command rmdupseq"));
        options.addOption(new Option(QUERYFILE_SHORT_OPT, QUERYFILE_LONG_OPT, true, QUERYFILE_DESC));
        options.addOption(new Option(OUTFILE_SHORT_OPT, OUTFILE_LONG_OPT, true, OUTFILE_DESC));
        options.addOption(new Option(LENGTH_SHORT_OPT, LENGTH_LONG_OPT, true, LENGTH_DESC));
        options.addOption(new Option(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT, CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT, true, CmdOptions.MIN_WORDS_DESC));
        options.addOption(new Option(HIDETAXON_SHORT_OPT, "hideTaxon", false, 
                "If set, remove the lowest taxon where a query sequence originally labelled from the training set. Default only remove the query seq from training set"));        
        //options.addOption(new Option(KMER_SHORT_OPT, "kmersize", true, "the size of the kmer (word), default is 8. Recommend 6-9"));
    }
    
    /** Creates a new Classification*/
    public LeaveOneOutTesterMain(String taxFile, String trainseqFile, String testFile, String outFile, 
            int numGoodBases, int min_bootstrap_words, boolean hideTaxon) throws IOException {
        boolean useSeed = true;  // use seed for random word selection


        TreeFactory factory = new TreeFactory(new FileReader(taxFile));
        // create a tree
        createTree(factory, trainseqFile);


        BufferedWriter outWriter = new BufferedWriter(new FileWriter(outFile));
        LineageSequenceParser parser = new LineageSequenceParser(new File(testFile));
        LeaveOneOutTester tester = new LeaveOneOutTester(outWriter, numGoodBases);


        outWriter.write("taxon file: " + taxFile + "\n" + "train sequence file: " + trainseqFile + "\n");
        outWriter.write("word size: " + GoodWordIterator.getWordsize() + "\n");
        outWriter.write("minimum number of words for bootstrap: " + min_bootstrap_words + "\n");


        if (numGoodBases > 0) {    // do partial   


            outWriter.write("query sequence file: " + testFile + "\n"
                    + "classify partial sequence, number of good bases=" + numGoodBases + "\n");
        } else {
            outWriter.write("query sequence file: " + testFile + "\n"
                    + "classify full-length sequence \n");


        }
        outWriter.write("test rank: " + factory.getLowestRank());


        tester.classify(factory, parser, useSeed, min_bootstrap_words, hideTaxon);


    }


    /** reads from the stream, parses the sequences and creates the tree */
    private void createTree(TreeFactory factory, String input) throws IOException {
        LineageSequenceParser parser = new LineageSequenceParser(new File(input));


        while (parser.hasNext()) {
            factory.addSequence((LineageSequence) parser.next());
        }


        //after all the training set is being parsed, calculate the prior probability for all the words.
        factory.calculateWordPrior();
        // create the word occurrence for all the nodes, this is necessary if test on different level
        // besides genus but requires lot of memory for large taxonomy model
        // factory.getRoot().createWordOccurrenceFromSubclasses();
        //factory.displayTreePhylo(factory.getRoot(), "1", 0);
    }


    public static void main(String[] args) throws FileNotFoundException, IOException {


        String queryFile = null;
        String outputFile = null;
        String trainSeqFile = null;
        String trainTaxonFile = null;
        int length = 0;
        int min_bootstrap_words = NBClassifier.MIN_BOOTSTRSP_WORDS;
        boolean hideTaxon = false;
         
        try {
            CommandLine line = new PosixParser().parse(options, args);


            if (line.hasOption(QUERYFILE_SHORT_OPT)) {
                queryFile = line.getOptionValue(QUERYFILE_SHORT_OPT);
            } else {
                throw new Exception("query file must be specified");
            }
            if (line.hasOption(OUTFILE_SHORT_OPT)) {
                outputFile = line.getOptionValue(OUTFILE_SHORT_OPT);
            } else {
                throw new Exception("output file must be specified");
            }


            if (line.hasOption(TRAIN_SEQFILE_SHORT_OPT)) {
                trainSeqFile = line.getOptionValue(TRAIN_SEQFILE_SHORT_OPT);
            } else {
                throw new Exception("training sequence file must be specified");
            }
            if (line.hasOption(TRAIN_TAXONFILE_SHORT_OPT)) {
                trainTaxonFile = line.getOptionValue(TRAIN_TAXONFILE_SHORT_OPT);
            } else {
                throw new Exception("training taxon file must be specified");
            }


            if (line.hasOption(LENGTH_SHORT_OPT)) {
                length = Integer.parseInt(line.getOptionValue(LENGTH_SHORT_OPT));
                if (length <= 0) {
                    throw new IllegalArgumentException(length + " must be a positive number ");
                }
            }
            if (line.hasOption(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT)) {
                min_bootstrap_words = Integer.parseInt(line.getOptionValue(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT));
                if (min_bootstrap_words < NBClassifier.MIN_BOOTSTRSP_WORDS) {
                    throw new IllegalArgumentException(CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT + " must be at least " + NBClassifier.MIN_BOOTSTRSP_WORDS);
                }                
            }
            
            if (line.hasOption(HIDETAXON_SHORT_OPT)) {
                hideTaxon = true;
            }        
            /*
            if (line.hasOption(KMER_SHORT_OPT)) {
                int kmer = Integer.parseInt(line.getOptionValue(KMER_SHORT_OPT));
                if (kmer < 1) {
                    throw new IllegalArgumentException(length + " must be a positive number ");
                }
                GoodWordIterator.setWordSize(kmer);
            }*/
        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120, "LeaveOneOutTesterMain", "", options, "", true);
            return;
        }




        LeaveOneOutTesterMain tester = new LeaveOneOutTesterMain(trainTaxonFile, trainSeqFile, 
                queryFile, outputFile, length, min_bootstrap_words, hideTaxon);


    }
}
Source Code of edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTesterMain

Related Classes of edu.msu.cme.rdp.classifier.train.validation.leaveoneout.LeaveOneOutTesterMain