Package it.unibz.instasearch.indexing

Source Code of it.unibz.instasearch.indexing.FileAnalyzer

/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*     Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing;

import it.unibz.instasearch.indexing.tokenizers.CamelCaseTokenizer;
import it.unibz.instasearch.indexing.tokenizers.DotSplitTokenizer;
import it.unibz.instasearch.indexing.tokenizers.WordSplitTokenizer;
import it.unibz.instasearch.indexing.tokenizers.standard.StandardTokenizer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LengthFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;


public class FileAnalyzer extends Analyzer {

  private int minWordLength;
 
  public FileAnalyzer(int minWordLength){
    super();
   
    this.minWordLength = minWordLength;
  }
 
  public TokenStream tokenStream(Reader reader) {
   
    TokenStream result = new StandardTokenizer(reader); // splits at ". ", etc.
   
    // result = new SysoFilter(result);

    result = new WordSplitTokenizer(result);   // non-alphanumerics
    result = new DotSplitTokenizer(result);   // all.package.names, hyphen-separated-words
    result = new CamelCaseTokenizer(result);   // CamelCaseIdentifiers
   
      result = new LengthFilter(result, minWordLength, 128);
      result = new LowerCaseFilter(result);

      return result;
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return tokenStream(reader);
  }

  // used when debugging
  public static class SysoFilter extends TokenFilter
  {
    private TermAttribute termAtt;
   
    public SysoFilter(TokenStream input)
    {
      super(input);
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException
    {
      if (input.incrementToken()) {
        System.out.println("TERM: " + termAtt.term());
        return true;
      }
     
      return false;
    }
  }
}
TOP

Related Classes of it.unibz.instasearch.indexing.FileAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.