Package com.tamingtext.classifier.maxent

Source Code of com.tamingtext.classifier.maxent.TestMaxent

/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*        http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/

package com.tamingtext.classifier.maxent;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;

import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizer;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.mahout.classifier.ClassifierResult;
import org.apache.mahout.classifier.ResultAnalyzer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tamingtext.util.FileUtil;

public class TestMaxent {
 
  private static final Logger log = LoggerFactory.getLogger(TestMaxent.class);
 
  /**
   * @param args
   */
  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
   
    Option helpOpt = DefaultOptionCreator.helpOption();
   
    Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
      "The input directory")
        .withShortName("i").create();
   
    Option modelOpt = obuilder.withLongName("model").withRequired(true).withArgument(
      abuilder.withName("index").withMinimum(1).withMaximum(1).create()).withDescription(
      "The directory containing the index model").withShortName("m").create();

    Group group = gbuilder.withName("Options").withOption(helpOpt)
        .withOption(inputDirOpt).withOption(modelOpt).create();
   
    try {
      Parser parser = new Parser();
     
      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      String inputPath  = (String) cmdLine.getValue(inputDirOpt);
      File f = new File(inputPath);
      if (!f.isDirectory()) {
        throw new IllegalArgumentException(f + " is not a directory or does not exit");
      }
      File[] inputFiles = FileUtil.buildFileList(f);
     
      File   modelDir  = new File((String) cmdLine.getValue(modelOpt));
      execute(inputFiles, modelDir);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
    }
   
  }

  private static void execute(File[] inputFiles, File modelFile)
      throws IOException, FileNotFoundException {
    //<start id="maxent.examples.test.setup"/>
    NameFinderFeatureGenerator nffg //<co id="tmx.feature"/>
      = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg
      = new BagOfWordsFeatureGenerator();

    InputStream modelStream = //<co id="tmx.modelreader"/>
        new FileInputStream(modelFile);
    DoccatModel model = new DoccatModel(modelStream);
    DocumentCategorizer categorizer //<co id="tmx.categorizer"/>
      = new DocumentCategorizerME(model, nffg, bowfg);
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
  
    int catCount = categorizer.getNumberOfCategories();
    Collection<String> categories
      = new ArrayList<String>(catCount);
    for (int i=0; i < catCount; i++) {
      categories.add(categorizer.getCategory(i));
    }
    ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/>
        new ResultAnalyzer(categories, "unknown");
    runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/>
    /*<calloutlist>
    <callout arearefs="tmx.feature">Setup Feature Generators</callout>
    <callout arearefs="tmx.modelreader">Load Model</callout>
    <callout arearefs="tmx.categorizer">Create Categorizer</callout>
    <callout arearefs="tmx.results">Prepare Result Analyzer</callout>
    <callout arearefs="tmx.run">Execute Test</callout>
    </calloutlist>*/
    //<end id="maxent.examples.test.setup"/>
  }

  private static void runTest(File[] inputFiles,
      DocumentCategorizer categorizer,
      Tokenizer tokenizer, ResultAnalyzer resultAnalyzer)
      throws FileNotFoundException, IOException {
    String line;
    //<start id="maxent.examples.test.execute"/>
    for (File ff: inputFiles) {
      BufferedReader in = new BufferedReader(new FileReader(ff));
      while ((line = in.readLine()) != null) {
        String[] parts = line.split("\t");
        if (parts.length != 2) continue;
       
        String docText   = parts[1]; //<co id="tmt.preprocess"/>
        String[] tokens  = tokenizer.tokenize(docText);
       
        double[] probs   = categorizer.categorize(tokens); //<co id="tmt.categorize"/>
        String label     = categorizer.getBestCategory(probs);
        int    bestIndex = categorizer.getIndex(label);
        double score     = probs[bestIndex];

        ClassifierResult result //<co id="tmt.collect"/>
          = new ClassifierResult(label, score);
        resultAnalyzer.addInstance(parts[0], result);
      }
      in.close();
    }
   
    System.err.println(resultAnalyzer.toString()); //<co id="tmt.summarize"/>
    /*<calloutlist>
     * <callout arearefs="tmt.preprocess">Preprocess text</callout>
     * <callout arearefs="tmt.categorize">Categorize</callout>
     * <callout arearefs="tmt.collect">Analyze Results</callout>
     * <callout arearefs="tmt.summarize">Present Results</callout>
     * </calloutlist>*/
    //<end id="maxent.examples.test.execute"/>
  }
}
TOP

Related Classes of com.tamingtext.classifier.maxent.TestMaxent

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.