Package org.apache.nutch.analysis

Source Code of org.apache.nutch.analysis.CommonGrams$ArrayTokens

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.io.*;
import java.util.*;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.searcher.Query.*;

/** Construct n-grams for frequently occuring terms and phrases while indexing.
* Optimize phrase queries to use the n-grams. Single terms are still indexed
* too, with n-grams overlaid.  This is achieved through the use of {@link
* Token#setPositionIncrement(int)}.*/
public class CommonGrams {
  private static final Log LOG = LogFactory.getLog(CommonGrams.class);
  private static final char SEPARATOR = '-';
  /** The key used to cache commonTerms in Configuration */
  private static final String KEY = CommonGrams.class.getName();

  private HashMap commonTerms = new HashMap();
 
  /**
   * The constructor.
   * @param conf
   */
  public CommonGrams(Configuration conf) {
      init(conf);
  }

  private static class Filter extends TokenFilter {
    private HashSet common;
    private Token previous;
    private LinkedList gramQueue = new LinkedList();
    private LinkedList nextQueue = new LinkedList();
    private StringBuffer buffer = new StringBuffer();

    /** Construct an n-gram producing filter. */
    public Filter(TokenStream input, HashSet common) {
      super(input);
      this.common = common;
    }

    /** Inserts n-grams into a token stream. */
    public Token next() throws IOException {
      if (gramQueue.size() != 0)                  // consume any queued tokens
        return (Token)gramQueue.removeFirst();

      final Token token = popNext();
      if (token == null)
        return null;

      if (!isCommon(token)) {                     // optimize simple case
        previous = token;
        return token;
      }

      gramQueue.add(token);                       // queue the token

      ListIterator i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous)) // queue prev gram first
          gramQueue.addFirst(gramToken(previous, gram));

        Token next = peekNext(i);
        if (next == null)
          break;

        gram = gramToken(gram, next);             // queue next gram last
        gramQueue.addLast(gram);
      }

      previous = token;
      return (Token)gramQueue.removeFirst();
    }

    /** True iff token is for a common term. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Pops nextQueue or, if empty, reads a new token. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0)
        return (Token)nextQueue.removeFirst();
      else
        return input.next();
    }

    /** Return next token in nextQueue, extending it when empty. */
    private Token peekNext(ListIterator i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null)
          return null;
        i.add(next);
        i.previous();
      }
      return (Token)i.next();
    }

    /** Construct a compound token. */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
  }

  /** Construct using the provided config file. */
  private void init(Configuration conf) {
    // First, try to retrieve some commonTerms cached in configuration.
    commonTerms = (HashMap) conf.getObject(KEY);
    if (commonTerms != null) { return; }

    // Otherwise, read the terms.file
    try {
      commonTerms = new HashMap();
      Reader reader = conf.getConfResourceAsReader
        (conf.get("analysis.common.terms.file"));
      BufferedReader in = new BufferedReader(reader);
      String line;
      while ((line = in.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || "".equals(line)) // skip comments
          continue;
        TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
        Token token = ts.next();
        if (token == null) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Line does not contain a field name: " + line);
          }
          continue;
        }
        String field = token.termText();
        token = ts.next();
        if (token == null) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Line contains only a field name, no word: " + line);
          }
          continue;
        }
        String gram = token.termText();
        while ((token = ts.next()) != null) {
          gram = gram + SEPARATOR + token.termText();
        }
        HashSet table = (HashSet)commonTerms.get(field);
        if (table == null) {
          table = new HashSet();
          commonTerms.put(field, table);
        }
        table.add(gram);
      }
      conf.setObject(KEY, commonTerms);
    } catch (IOException e) {
      throw new RuntimeException(e.toString());
    }
  }

  /** Construct a token filter that inserts n-grams for common terms.  For use
   * while indexing documents.  */
  public TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, (HashSet)commonTerms.get(field));
  }

  /** Utility to convert an array of Query.Terms into a token stream. */
  private static class ArrayTokens extends TokenStream {
    private Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) {
      this.terms = phrase.getTerms();
    }

    public Token next() {
      if (index == terms.length)
        return null;
      else
        return new Token(terms[index].toString(), index, ++index);
    }
  }

  /** Optimizes phrase queries to use n-grams when possible. */
  public String[] optimizePhrase(Phrase phrase, String field) {
   
    if (LOG.isTraceEnabled()) {
      LOG.trace("Optimizing " + phrase + " for " + field);
    }
    ArrayList result = new ArrayList();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    Token token, prev=null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        if (token.getPositionIncrement() != 0 && prev != null)
          result.add(prev.termText());
        prev = token;
        position += token.getPositionIncrement();
        if ((position + arity(token.termText())) == phrase.getTerms().length)
          break;
      }
    } catch (IOException e) {
      throw new RuntimeException(e.toString());
    }
    if (prev != null)
      result.add(prev.termText());

    return (String[])result.toArray(new String[result.size()]);
  }

  private int arity(String gram) {
    int index = 0;
    int arity = 0;
    while ((index = gram.indexOf(SEPARATOR, index+1)) != -1) {
      arity++;
    }
    return arity;
  }

  /** For debugging. */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }
    TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
    CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
    ts = commonGrams.getFilter(ts, "url");
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }
    String[] optimized = commonGrams.optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }
 
}
TOP

Related Classes of org.apache.nutch.analysis.CommonGrams$ArrayTokens

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.