/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.index;
import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_FIELD;
import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_FIELD_VAL;
import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_VALUE;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.SOURCE_FIELD_NAME;
import java.io.File;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TimeLimitingCollector;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Counter;
import org.languagetool.AnalyzedSentence;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.tools.ContextTools;
/**
* A class with a main() method that takes a rule id and the location of an
* index, runs the query for that rule on the index, and prints all matches.
* Transparently handles rules that are not supported, i.e. it runs the rule on the
* candidate matches, up to a limit.
*
* @author Tao Lin
* @author Daniel Naber
*/
public class Searcher {
private static boolean WIKITEXT_OUTPUT = false;
private final Directory directory;
private int maxHits = 1000;
private int maxSearchTimeMillis = 5000;
private IndexSearcher indexSearcher;
private DirectoryReader reader;
private boolean limitSearch = true;
/**
 * Creates a searcher for the given index directory. The directory is only stored
 * here; no reader is opened by the constructor.
 *
 * @param directory the Lucene directory that contains the index to search
 */
public Searcher(Directory directory) {
this.directory = directory;
}
/**
 * Opens a fresh reader on the index directory and creates the searcher on top of it.
 * Both are stored in the instance fields.
 *
 * @throws IOException if the index cannot be opened
 */
private void open() throws IOException {
  final DirectoryReader freshReader = DirectoryReader.open(directory);
  reader = freshReader;
  indexSearcher = new IndexSearcher(freshReader);
}
/**
 * Closes the reader stored in the instance field, if there is one.
 *
 * @throws IOException if closing the reader fails
 */
private void close() throws IOException {
  if (reader == null) {
    return;
  }
  reader.close();
}
/**
 * Reads the document count stored in the index, using a temporary reader that is
 * closed again before this method returns.
 *
 * @return the stored document count, or -1 if it cannot be determined
 * @throws IOException if the index cannot be read
 */
public int getDocCount() throws IOException {
  try (DirectoryReader tempReader = DirectoryReader.open(directory)) {
    return getDocCount(new IndexSearcher(tempReader));
  }
}
/**
 * Looks up the document whose {@code MAX_DOC_COUNT_FIELD} equals
 * {@code MAX_DOC_COUNT_FIELD_VAL} and returns its {@code MAX_DOC_COUNT_VALUE}
 * field parsed as an integer.
 *
 * @param indexSearcher the searcher to run the lookup on
 * @return the parsed value, or -1 unless exactly one such document exists
 * @throws IOException if the index cannot be read
 */
private int getDocCount(IndexSearcher indexSearcher) throws IOException {
  final TermQuery countQuery = new TermQuery(new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL));
  final TopDocs hits = indexSearcher.search(countQuery, 1);
  if (hits.totalHits == 1) {
    final Document countDoc = indexSearcher.doc(hits.scoreDocs[0].doc);
    return Integer.parseInt(countDoc.get(MAX_DOC_COUNT_VALUE));
  }
  return -1;
}
/**
 * @return the maximum number of Lucene hits that are collected per search
 */
public int getMaxHits() {
  return this.maxHits;
}
/**
 * @param maxHits the maximum number of Lucene hits to collect per search
 */
public void setMaxHits(int maxHits) {
this.maxHits = maxHits;
}
/**
 * @return the time budget for a single search, in milliseconds
 */
public int getMaxSearchTimeMillis() {
  return this.maxSearchTimeMillis;
}
/**
 * @param maxSearchTimeMillis the time budget for a single search, in milliseconds
 */
public void setMaxSearchTimeMillis(int maxSearchTimeMillis) {
this.maxSearchTimeMillis = maxSearchTimeMillis;
}
/**
 * Builds a relaxed Lucene query for the given rule, runs it on the index in a separate
 * thread and checks the candidate sentences with the rule.
 *
 * <p>When the search is limited, the worker thread is given {@code maxSearchTimeMillis}
 * to finish; if it is still running after that, it is interrupted and a
 * {@link SearchTimeoutException} is thrown. The index is opened at the start of the
 * call and closed again before the method returns, whether it succeeds or fails.
 *
 * @param rule the pattern rule to search matches for
 * @param language the language used to build the query and to check the sentences
 * @return the matching sentences plus statistics about the search
 * @throws IOException if the index cannot be opened, read or closed
 * @throws UnsupportedPatternRuleException presumably raised by the query builder for
 *         rules it cannot turn into a query (declared here, not thrown directly)
 * @throws NullPointerException if the query builder returns {@code null}
 * @throws SearchTimeoutException if the search did not finish within the time limit
 * @throws RuntimeException if the calling thread is interrupted while waiting, or if
 *         the search thread failed with any other exception (set as the cause)
 */
public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language) throws IOException, UnsupportedPatternRuleException {
  // it seems wasteful to re-open the index every time, but I had strange problems (OOM, Array out of bounds, ...)
  // when not doing so...
  open();
  try {
    final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language);
    final Query query = patternRuleQueryBuilder.buildRelaxedQuery(rule);
    if (query == null) {
      throw new NullPointerException("Cannot search on null query for rule: " + rule.getId());
    }
    System.out.println("Running query: " + query.toString(FIELD_NAME_LOWERCASE));
    final SearchRunnable runnable = new SearchRunnable(indexSearcher, query, language, rule);
    final Thread searchThread = new Thread(runnable);
    searchThread.start();
    try {
      // using a TimeLimitingCollector is not enough, as it doesn't cover all time required to
      // search for a complicated regex, so limit the run time of the whole thread instead:
      if (limitSearch) {
        searchThread.join(maxSearchTimeMillis);
      } else {
        searchThread.join();
      }
    } catch (InterruptedException e) {
      // we were asked to stop: stop the worker too and keep our own interrupt flag set
      searchThread.interrupt();
      Thread.currentThread().interrupt();
      throw new RuntimeException("Search thread got interrupted for query " + query, e);
    }
    // A thread that is still alive after join() has exceeded its time budget. Do not rely on
    // isInterrupted() for this: an interrupt sent to an already finished thread may still be
    // recorded, which would report every successful search as a timeout.
    if (searchThread.isAlive()) {
      searchThread.interrupt();
      throw new SearchTimeoutException("Search timeout of " + maxSearchTimeMillis + "ms reached for query " + query);
    }
    final Exception exception = runnable.getException();
    if (exception != null) {
      if (exception instanceof SearchTimeoutException) {
        throw (SearchTimeoutException) exception;
      }
      throw new RuntimeException("Exception during search for query " + query + " on rule " + rule.getId(), exception);
    }
    final List<MatchingSentence> matchingSentences = runnable.getMatchingSentences();
    final int sentencesChecked = getSentenceCheckCount(query, indexSearcher);
    final SearcherResult searcherResult = new SearcherResult(matchingSentences, sentencesChecked, query);
    searcherResult.setHasTooManyLuceneMatches(runnable.hasTooManyLuceneMatches());
    searcherResult.setLuceneMatchCount(runnable.getLuceneMatchCount());
    if (runnable.hasTooManyLuceneMatches()) {
      // more potential matches than we can check in an acceptable time :-(
      searcherResult.setDocCount(maxHits);
    } else {
      searcherResult.setDocCount(getDocCount(indexSearcher));
    }
    //TODO: the search itself could also timeout, don't just ignore that:
    //searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited);
    return searcherResult;
  } finally {
    close();
  }
}
/**
 * Runs the query on the index, collecting up to {@code maxHits} documents ordered by
 * the given sort, wrapped in a {@link TimeLimitingCollector}.
 *
 * <p>The collector's clock is advanced by a helper thread once per {@code waitMillis};
 * the tick budget is {@code maxSearchTimeMillis / waitMillis} (integer division). The
 * helper thread is a daemon and is stopped as soon as the search returns, so it neither
 * lingers after the search nor delays JVM shutdown.
 *
 * @param query the query to run
 * @param sort the sort order for the collected documents
 * @return the documents collected so far and whether the time limit cut the search short
 * @throws IOException if the index cannot be read
 */
private PossiblyLimitedTopDocs getTopDocs(Query query, Sort sort) throws IOException {
  final TopFieldCollector topCollector = TopFieldCollector.create(sort, maxHits, true, false, false, false);
  final Counter clock = Counter.newCounter(true);
  final int waitMillis = 1000;
  // TODO: if we interrupt the whole thread anyway, do we still need the TimeLimitingCollector?
  final TimeLimitingCollector collector = new TimeLimitingCollector(topCollector, clock, maxSearchTimeMillis / waitMillis);
  collector.setBaseline(0);
  final Thread counterThread = new Thread() {
    @Override
    public void run() {
      final long startTime = System.currentTimeMillis();
      while (true) {
        final long runTimeMillis = System.currentTimeMillis() - startTime;
        if (runTimeMillis > maxSearchTimeMillis) {
          // make sure there's no lingering thread for too long
          return;
        }
        clock.addAndGet(1);
        try {
          Thread.sleep(waitMillis);
        } catch (InterruptedException e) {
          // the search is over, nobody needs the clock anymore
          return;
        }
      }
    }
  };
  counterThread.setName("LuceneSearchTimeoutThread");
  counterThread.setDaemon(true);
  counterThread.start();
  boolean timeLimitActivated = false;
  try {
    indexSearcher.search(query, collector);
  } catch (TimeLimitingCollector.TimeExceededException e) {
    // not an error: we return what has been collected so far and flag the result
    timeLimitActivated = true;
  } finally {
    counterThread.interrupt();
  }
  return new PossiblyLimitedTopDocs(topCollector.topDocs(), timeLimitActivated);
}
/**
 * Collects all pattern rules of the given language that carry the given id. The result
 * is a list because several rules can share one id.
 *
 * @param ruleId the id to look for
 * @param language the language whose rules are inspected
 * @return the pattern rules with that id, never empty
 * @throws IOException declared for the rule loading done by {@code JLanguageTool}
 * @throws PatternRuleNotFoundException if no pattern rule has the given id
 */
List<PatternRule> getRuleById(String ruleId, Language language) throws IOException {
  final JLanguageTool langTool = new JLanguageTool(language);
  langTool.activateDefaultPatternRules();
  final List<PatternRule> found = new ArrayList<>();
  for (Rule candidate : langTool.getAllRules()) {
    if (!candidate.getId().equals(ruleId)) {
      continue;
    }
    if (candidate instanceof PatternRule) {
      found.add((PatternRule) candidate);
    }
  }
  if (found.isEmpty()) {
    throw new PatternRuleNotFoundException(ruleId, language);
  }
  return found;
}
/**
 * Returns an upper bound for the number of sentences that get checked: the number of
 * documents in the index, capped at {@code maxHits}.
 *
 * @param query currently not used
 * @param indexSearcher the searcher whose reader provides the index size
 * @return the smaller of {@code maxHits} and the number of documents in the index
 */
private int getSentenceCheckCount(Query query, IndexSearcher indexSearcher) {
  // we actually check up to maxHits sentences:
  // TODO: ??
  return Math.min(maxHits, indexSearcher.getIndexReader().numDocs());
}
/**
 * Loads every document of the given search result, checks its sentence with the given
 * LanguageTool instance and keeps those sentences that produce at least one rule match.
 *
 * @param indexSearcher the searcher used to load the documents
 * @param topDocs the search result whose documents are checked
 * @param languageTool the checker to run on each sentence
 * @return one entry per sentence with at least one rule match, in search result order
 * @throws IOException if a document cannot be loaded or checked
 */
private List<MatchingSentence> findMatchingSentences(IndexSearcher indexSearcher, TopDocs topDocs, JLanguageTool languageTool) throws IOException {
  final List<MatchingSentence> result = new ArrayList<>();
  for (ScoreDoc hit : topDocs.scoreDocs) {
    final Document storedDoc = indexSearcher.doc(hit.doc);
    final String sentence = storedDoc.get(FIELD_NAME);
    final List<RuleMatch> ruleMatches = languageTool.check(sentence);
    if (ruleMatches.isEmpty()) {
      continue;
    }
    final String source = storedDoc.get(SOURCE_FIELD_NAME);
    final String title = storedDoc.get(Indexer.TITLE_FIELD_NAME);
    final AnalyzedSentence analyzed = languageTool.getAnalyzedSentence(sentence);
    result.add(new MatchingSentence(sentence, source, title, analyzed, ruleMatches));
  }
  return result;
}
/**
 * Creates a LanguageTool instance in which all initially active rules are disabled and
 * only the given pattern rule is added and enabled.
 *
 * @param lang the language of the new instance
 * @param patternRule the only rule that should be active
 * @return the configured instance
 */
private JLanguageTool getLanguageToolWithOneRule(Language lang, PatternRule patternRule) {
  final JLanguageTool singleRuleTool = new JLanguageTool(lang);
  for (Rule activeRule : singleRuleTool.getAllActiveRules()) {
    final String activeRuleId = activeRule.getId();
    singleRuleTool.disableRule(activeRuleId);
  }
  singleRuleTool.addRule(patternRule);
  // rule might be off by default
  final String patternRuleId = patternRule.getId();
  singleRuleTool.enableDefaultOffRule(patternRuleId);
  return singleRuleTool;
}
class PossiblyLimitedTopDocs {
TopDocs topDocs;
boolean resultIsTimeLimited;
PossiblyLimitedTopDocs(TopDocs topDocs, boolean resultIsTimeLimited) {
this.topDocs = topDocs;
this.resultIsTimeLimited = resultIsTimeLimited;
}
}
/**
 * Validates the command line: exactly three arguments, optionally followed by
 * {@code --no_limit} as the fourth. On any other input the usage is printed to stderr
 * and the JVM exits with status 1.
 *
 * @param args the command line arguments as passed to {@code main}
 */
private static void ensureCorrectUsageOrExit(String[] args) {
  final boolean argCountOk = args.length == 3 || args.length == 4;
  // only checked for a fourth argument; surplus arguments are rejected by the count above
  final boolean optionOk = args.length < 4 || "--no_limit".equals(args[3]);
  if (argCountOk && optionOk) {
    return;
  }
  System.err.println("Usage: Searcher <ruleId> <languageCode> <indexDir> [--no_limit]");
  System.err.println("\truleId Id of the rule to search for (or comma-separated list of ids)");
  System.err.println("\tlanguageCode short language code, e.g. 'en' for English");
  System.err.println("\tindexDir path to a directory containing the index");
  System.err.println("\t--no_limit do not limit search time");
  System.exit(1);
}
/**
 * Runs one search on the index and checks the candidate sentences with a single rule.
 * Results and any exception are stored in fields instead of being thrown, so that the
 * code that started the thread can pick them up once the run has ended.
 */
class SearchRunnable implements Runnable {

  private final IndexSearcher indexSearcher;
  private final Query query;
  private final Language language;
  private final PatternRule rule;

  private List<MatchingSentence> matchingSentences;
  private Exception exception;
  private boolean tooManyLuceneMatches;
  private int luceneMatchCount;

  SearchRunnable(IndexSearcher indexSearcher, Query query, Language language, PatternRule rule) {
    this.indexSearcher = indexSearcher;
    this.query = query;
    this.language = language;
    this.rule = rule;
  }

  /**
   * Creates the single-rule checker, runs the Lucene query, checks the resulting
   * sentences and prints the time each of the three steps took. Any exception is
   * caught and made available via {@link #getException()}.
   */
  @Override
  public void run() {
    try {
      // do not sort by relevance as this will move the shortest documents to the top
      final Sort docCountOrder = new Sort(new SortField("docCount", SortField.Type.INT));

      final long creationStart = System.currentTimeMillis();
      final JLanguageTool singleRuleTool = getLanguageToolWithOneRule(language, rule);
      final long creationMillis = System.currentTimeMillis() - creationStart;

      final long luceneStart = System.currentTimeMillis();
      final PossiblyLimitedTopDocs searchResult = getTopDocs(query, docCountOrder);
      final long luceneMillis = System.currentTimeMillis() - luceneStart;

      final long matchingStart = System.currentTimeMillis();
      final TopDocs candidates = searchResult.topDocs;
      final int candidateCount = candidates.scoreDocs.length;
      luceneMatchCount = candidates.totalHits;
      tooManyLuceneMatches = candidateCount >= maxHits;
      matchingSentences = findMatchingSentences(indexSearcher, candidates, singleRuleTool);
      final long matchingMillis = System.currentTimeMillis() - matchingStart;

      System.out.println("Check done in " + creationMillis + "/" + luceneMillis + "/" + matchingMillis
          + "ms (LT creation/Lucene/matching) for " + candidateCount + " docs");
    } catch (Exception e) {
      exception = e;
    }
  }

  /**
   * @return the exception that ended the run, or {@code null} if there was none
   */
  Exception getException() {
    return exception;
  }

  /**
   * There were more Lucene matches than we can actually check with LanguageTool in
   * an acceptable time, so real matches might be lost.
   */
  boolean hasTooManyLuceneMatches() {
    return tooManyLuceneMatches;
  }

  /**
   * @return the total hit count reported by Lucene for the query
   */
  int getLuceneMatchCount() {
    return luceneMatchCount;
  }

  /**
   * @return the sentences with at least one rule match, or {@code null} if the run
   *         did not get that far
   */
  List<MatchingSentence> getMatchingSentences() {
    return matchingSentences;
  }
}
/**
 * Creates a context formatter that does not escape HTML and wraps the error in
 * {@code **} markers.
 *
 * @param contextSize the context size to configure on the formatter
 * @return the configured formatter
 */
private static ContextTools getContextTools(int contextSize) {
  final String marker = "**";
  final ContextTools tools = new ContextTools();
  tools.setEscapeHtml(false);
  tools.setContextSize(contextSize);
  tools.setErrorMarkerStart(marker);
  tools.setErrorMarkerEnd(marker);
  return tools;
}
/**
 * Command line entry point: searches the index for matches of one or more rules and
 * prints them, followed by timing information.
 *
 * <p>Arguments: {@code <ruleId> <languageCode> <indexDir> [--no_limit]}, where
 * {@code ruleId} may be a comma-separated list. By default the search is limited;
 * with {@code --no_limit} the search time limit is switched off and the maximum
 * number of hits is raised to 100,000.
 *
 * @param args the command line arguments, validated by {@code ensureCorrectUsageOrExit}
 * @throws Exception if the language or a rule cannot be found, or the search fails
 */
public static void main(String[] args) throws Exception {
  ensureCorrectUsageOrExit(args);
  final long startTime = System.currentTimeMillis();
  final String[] ruleIds = args[0].split(",");
  final String languageCode = args[1];
  final Language language = Language.getLanguageForShortName(languageCode);
  final File indexDir = new File(args[2]);
  // the search is limited unless the user explicitly opts out
  final boolean noLimit = args.length > 3 && "--no_limit".equals(args[3]);
  final Searcher searcher = new Searcher(new SimpleFSDirectory(indexDir));
  if (noLimit) {
    searcher.setMaxHits(100_000);
  }
  searcher.limitSearch = !noLimit;
  final ContextTools contextTools = getContextTools(140);
  int totalMatches = 0;
  for (String ruleId : ruleIds) {
    final long ruleStartTime = System.currentTimeMillis();
    for (PatternRule rule : searcher.getRuleById(ruleId, language)) {
      System.out.println("===== " + ruleId + "[" + rule.getSubId() + "] =========================================================");
      final SearcherResult searcherResult = searcher.findRuleMatchesOnIndex(rule, language);
      int i = 1;
      if (searcherResult.getMatchingSentences().size() == 0) {
        System.out.println("[no matches]");
      }
      for (MatchingSentence ruleMatch : searcherResult.getMatchingSentences()) {
        for (RuleMatch match : ruleMatch.getRuleMatches()) {
          String context = contextTools.getContext(match.getFromPos(), match.getToPos(), ruleMatch.getSentence());
          if (WIKITEXT_OUTPUT) {
            // a context of size 0 yields just the marked error text, which we then strip of
            // the leading/trailing "..." and of the "**" markers
            ContextTools contextTools2 = getContextTools(0);
            String coveredText = contextTools2.getContext(match.getFromPos(), match.getToPos(), ruleMatch.getSentence());
            coveredText = coveredText.replaceFirst("^\\.\\.\\.", "").replaceFirst("\\.\\.\\.$", "");
            coveredText = coveredText.replaceFirst("^\\*\\*", "").replaceFirst("\\*\\*$", "");
            String encodedTextWithQuotes = URLEncoder.encode("\"" + coveredText + "\"", "UTF-8");
            String searchLink = "https://de.wikipedia.org/w/index.php?search=" + encodedTextWithQuotes + "&title=Spezial%3ASuche&go=Artikel";
            // the replacement contains raw sentence text, so '$' and '\' must be taken literally
            String wikiLink = java.util.regex.Matcher.quoteReplacement("[" + searchLink + " " + coveredText + "]");
            context = context.replaceAll("\\*\\*.*?\\*\\*", wikiLink);
            String encTitle = URLEncoder.encode(ruleMatch.getTitle(), "UTF-8");
            String encodedText = URLEncoder.encode(coveredText, "UTF-8");
            System.out.println("# [[" + ruleMatch.getTitle() + "]]: " + context +
                " ([http://wikipedia.ramselehof.de/wikiblame.php?user_lang=de&lang=de&project=wikipedia&article=" + encTitle +
                "&needle=" + encodedText + "&skipversions=0&ignorefirst=0&limit=500&searchmethod=int&order=desc&start=Start WikiBlame])");
          } else {
            System.out.println(i + ": " + context + " [" + ruleMatch.getSource() + "]");
          }
        }
        totalMatches += ruleMatch.getRuleMatches().size();
        i++;
      }
      System.out.println("Time: " + (System.currentTimeMillis() - ruleStartTime) + "ms");
    }
  }
  System.out.println("Total time: " + (System.currentTimeMillis() - startTime) + "ms, " + totalMatches + " matches");
}
}