Package org.apache.nutch.spell

Source Code of org.apache.nutch.spell.SpellCheckerBean

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.spell;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;

import org.apache.lucene.search.IndexSearcher;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;

/**
* Parses queries and sends them to NGramSpeller for spell checking.
*
* @author Andy Liu <andyliu1227@gmail.com&gt
*/
public class SpellCheckerBean {
  public static final Log LOG = LogFactory.getLog(SpellCheckerBean.class);

  IndexSearcher spellingSearcher;

  //
  // Configuration parameters used by NGramSpeller . Hardcoded for now.
  // 
  final int minThreshold = 5;

  final int ng1 = 3;

  final int ng2 = 4;

  final int maxr = 10;

  final int maxd = 5;

  final float bStart = 2.0f;

  final float bEnd = 1.0f;

  final float bTransposition = 6.5f;

  // configuration variable names
  public static final String SPELLING_INDEX_LOCATION = "spell.index.dir";

  public static final String SPELLING_DOCFREQ_THRESHOLD = "spell.docfreq.threshold";

  public static final String SPELLING_DOCFREQ_THRESHOLD_FACTOR = "spell.docfreq.threshold.factor";

  String indexLocation;

  int threshold;

  int thresholdFactor;
 
  Configuration conf;

  public SpellCheckerBean(Configuration conf) {
    this.conf=conf;
    indexLocation = conf.get(SPELLING_INDEX_LOCATION, "./spelling");
    threshold = conf.getInt(SPELLING_DOCFREQ_THRESHOLD, 100);
    thresholdFactor = conf.getInt(SPELLING_DOCFREQ_THRESHOLD_FACTOR, 10);
    try {
      spellingSearcher = new IndexSearcher(indexLocation);
    } catch (IOException ioe) {
      LOG.info("error opening spell checking index");
      ioe.printStackTrace(LogUtil.getInfoStream(LOG));
    }
  }

  /** Cache in Configuration. */
  public static SpellCheckerBean get(Configuration conf) {
    SpellCheckerBean spellCheckerBean = (SpellCheckerBean) conf
        .getObject(SpellCheckerBean.class.getName());

    if (spellCheckerBean == null) {
      LOG.info("creating new spell checker bean");
      spellCheckerBean = new SpellCheckerBean(conf);
      conf.setObject(SpellCheckerBean.class.getName(), spellCheckerBean);
    }
    return spellCheckerBean;
  }

  public SpellCheckerTerms checkSpelling(Query query, String queryString) {

    return checkSpelling(query, queryString, threshold, thresholdFactor);
  }

  /**
   * Parses original query, retrieves suggestions from ngrams spelling index
   *
   * @param originalQuery
   *          Query to be spell-checked
   * @param docFreqThreshold
   *          Terms in query that have a docFreq lower than this threshold
   *          qualify as "mispelled"
   * @param factorThreshold
   *          The suggested term must have a docFreq at least factorThreshold
   *          times more than the mispelled term. Set to 1 to disable.
   * @return terms with corrected spelling
   */
  public SpellCheckerTerms checkSpelling(Query query, String queryString,
      int docFreqThreshold, int factorThreshold) {
    SpellCheckerTerms spellCheckerTerms = null;

    try {
      spellCheckerTerms = parseOriginalQuery(query, queryString);

      for (int i = 0; i < spellCheckerTerms.size(); i++) {
        SpellCheckerTerm currentTerm = spellCheckerTerms.getSpellCheckerTerm(i);
        String originalTerm = currentTerm.getOriginalTerm();

        spellCheckerTerms.getSpellCheckerTerm(i).setOriginalDocFreq(
            getDocFreq(originalTerm));

        //
        // Spell checking is not effective for words under 4 letters long
        // Any words over 25 letters long isn't worth checking either.
        //
        if (originalTerm.length() < 4)
          continue;

        if (originalTerm.length() > 25)
          continue;

        List lis = new ArrayList(maxr);

        String[] suggestions = NGramSpeller.suggestUsingNGrams(spellingSearcher
            , originalTerm, ng1, ng2, maxr, bStart, bEnd,
            bTransposition, maxd, lis, true);

        Iterator it = lis.iterator();

        while (it.hasNext()) {
          if(LOG.isDebugEnabled()){
            LOG.debug(it.next().toString());
          }
        }

        if (suggestions.length > 0) {
          currentTerm.setSuggestedTerm(suggestions[0]);

          if (lis != null) {
            NGramSpeller.SpellSuggestionDetails detail = (NGramSpeller.SpellSuggestionDetails) lis
                .get(0);
            currentTerm.setSuggestedTermDocFreq(detail.docFreq);
          }

          // We use document frequencies of the original term and the suggested
          // term to guess
          // whether or not a term is mispelled. The criteria is as follows:
          //
          // 1. The term's document frequency must be under a constant threshold
          // 2. The suggested term's docFreq must be greater than the original
          // term's docFreq * constant factor
          //
          if ((currentTerm.originalDocFreq < docFreqThreshold)
              && ((currentTerm.originalDocFreq * factorThreshold) < (currentTerm.suggestedTermDocFreq))) {
            spellCheckerTerms.setHasMispelledTerms(true);
            currentTerm.setMispelled(true);
          }
        }

      }

    } catch (Throwable t) {
      t.printStackTrace();
    }

    return spellCheckerTerms;
  }

  /**
   *
   * Parses the query and preserves characters and formatting surrounding terms
   * to be spell-checked. This is done so that we can present the query in the
   * "Did you mean: XYZ" message in the same format the user originally typed
   * it.
   *
   * @param originalQuery
   *          text to be parsed
   * @return spell checker terms
   */
  public SpellCheckerTerms parseOriginalQuery(Query query, String queryString)
      throws IOException {
    String[] terms = query.getTerms();
    SpellCheckerTerms spellCheckerTerms = new SpellCheckerTerms();

    int previousTermPos = 0;
    for (int i = 0; i < terms.length; i++) {

      int termPos = queryString.toLowerCase().indexOf(terms[i]);

      String charsBefore = "";
      String charsAfter = "";

      // Is this the first term? If so, we need to check for characters
      // before the first term.
      if (i == 0) {

        if (termPos > 0) {
          charsBefore = queryString.substring(0, termPos);
        }

        // We're in-between terms...
      } else {
        int endOfLastTerm = previousTermPos + terms[i - 1].length();

        if (endOfLastTerm < termPos) {
          charsBefore = queryString.substring(endOfLastTerm, termPos);
        }
      }

      // Is this the last term? If so, we need to check for characters
      // after the last term.
      if (i == (terms.length - 1)) {

        int endOfCurrentTerm = termPos + terms[i].length();

        if (endOfCurrentTerm < queryString.length()) {
          charsAfter = queryString.substring(endOfCurrentTerm, queryString
              .length());
        }

      }

      previousTermPos = termPos;

      spellCheckerTerms.add(new SpellCheckerTerm(terms[i], charsBefore,
          charsAfter));

    }

    return spellCheckerTerms;

  }

  public SpellCheckerTerms parseOriginalQuery(String queryString)
      throws IOException {
    return parseOriginalQuery(Query.parse(queryString, conf), queryString);
  }

  /**
   * Retrieves docFreq as stored within spelling index. Alternatively, we could
   * simply consult the main index for a docFreq() of a term (which would be
   * faster) but it's nice to have a separate, spelling index that can stand on
   * its own.
   *
   * @param term
   * @return document frequency of term
   */
  private int getDocFreq(String term) throws IOException {
    /*
     * Hits hits = this.spellingSearcher.getLuceneSearcher().search(new
     * TermQuery(new Term( NGramSpeller.F_WORD, term))); if (hits.length() > 0) {
     * Document doc = hits.doc(0); String docFreq =
     * doc.get(NGramSpeller.F_FREQ); return Integer.parseInt(docFreq); }
     */
    return 0;
  }

  public static void main(String[] args) throws Throwable {
    if (args.length < 1) {
      System.out.println("usage: SpellCheckerBean [ngrams spelling index]");
      return;
    }

    Configuration conf = NutchConfiguration.create();

    conf.set("spell.index.dir", args[0]);

    SpellCheckerBean checker = new SpellCheckerBean(conf);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));

    String line;

    while ((line = in.readLine()) != null) {
      Query query = Query.parse(line, conf);
      SpellCheckerTerms terms = checker.checkSpelling(query, line);
      StringBuffer buf = new StringBuffer();

      for (int i = 0; i < terms.size(); i++) {
        SpellCheckerTerm currentTerm = terms.getSpellCheckerTerm(i);
        buf.append(currentTerm.getCharsBefore());

        if (currentTerm.isMispelled()) {
          buf.append(currentTerm.getSuggestedTerm());
        } else {
          buf.append(currentTerm.getOriginalTerm());
        }
      }

      System.out.println("Spell checked: " + buf);
    }
  }

  public void init() {
    //do initialization here
  }

  public String[] suggest(Query query) {
    // TODO Auto-generated method stub
    return null;
  }

  public String getID() {
    return "SPELLER";
  }
}
TOP

Related Classes of org.apache.nutch.spell.SpellCheckerBean

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.