Package org.languagetool.dev

Source Code of org.languagetool.dev.ConfusionSetCoverage

/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
* USA
*/
package org.languagetool.dev;

import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.language.English;
import org.languagetool.rules.ConfusionProbabilityRule;
import org.languagetool.rules.ConfusionSetLoader;
import org.languagetool.tools.StringTools;

import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

/**
* Measure how many homophones there are per sentence on average. Useful so we can estimate
* the number of ngram lookups needed by the confusion rule.
* @since 2.7
*/
class ConfusionSetCoverage {

  private int sentences = 0;
  private int homophones = 0;
  private int lookupsNeeded = 0;

  private void run(String filename) throws IOException {
    Map<String, ConfusionProbabilityRule.ConfusionSet> confusionSet = getConfusionSet();
    try (FileReader reader = new FileReader(filename)) {
      String text = StringTools.readerToString(reader);
      JLanguageTool languageTool = new JLanguageTool(new English());
      List<AnalyzedSentence> analyzedSentences = languageTool.analyzeText(text);
      for (AnalyzedSentence sentence : analyzedSentences) {
        runOnSentence(sentence, confusionSet);
      }
    }
    System.out.println("Homophones set: " + confusionSet.size() + " items");
    System.out.println("Sentences: " + sentences);
    System.out.println("Homophones: " + homophones + " = " + ((float)homophones/sentences) + " per sentence");
    System.out.println("Lookups: " + lookupsNeeded + " = " + ((float)lookupsNeeded/sentences) + " per sentence");
    System.out.println(" (Lookups is the number of lookups needed to see which word in the homophones set " +
            "is more common. Actually even more ngram lookups will be needed, depending on what ngrams we have.)");
  }

  private Map<String, ConfusionProbabilityRule.ConfusionSet> getConfusionSet() throws IOException {
    ConfusionSetLoader loader = new ConfusionSetLoader();
    InputStream homophoneStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream("/en/homophones.txt");
    return loader.loadConfusionSet(homophoneStream);
  }

  private void runOnSentence(AnalyzedSentence sentence, Map<String, ConfusionProbabilityRule.ConfusionSet> confusionSet) {
    sentences++;
    for (AnalyzedTokenReadings token : sentence.getTokensWithoutWhitespace()) {
      String tokenStr = token.getToken();
      if (confusionSet.containsKey(tokenStr)) {
        homophones++;
        lookupsNeeded += confusionSet.get(tokenStr).getSet().size();
      }
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.out.println("Usage: " + ConfusionSetCoverage.class.getSimpleName() + " <textfile>");
      System.exit(1);
    }
    ConfusionSetCoverage coverage = new ConfusionSetCoverage();
    coverage.run(args[0]);
  }

}
TOP

Related Classes of org.languagetool.dev.ConfusionSetCoverage

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.