Examples of org.apache.lucene.analysis.core.StopFilter

org.apache.lucene.analysis.core.StopFilter
Removes stop words from a token stream.
You must specify the required {@link Version} compatibility when creating StopFilter:
- As of 3.1, StopFilter correctly handles Unicode 4.0 supplementary characters in stopwords, and position increments are preserved.

  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);
  }

View Full Code Here

    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new ElisionFilter(result, DEFAULT_ARTICLES);
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stopwords);
      if(!excltable.isEmpty())
        result = new KeywordMarkerFilter(result, excltable);
      if (matchVersion.onOrAfter(Version.LUCENE_36)) {
        result = new FrenchLightStemFilter(result);
      } else {
        result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
      }
      return new TokenStreamComponents(source, result);
    } else {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new StopFilter(matchVersion, result, stopwords);
      if(!excltable.isEmpty())
        result = new KeywordMarkerFilter(result, excltable);
      result = new FrenchStemFilter(result);
      // Convert to lowercase after stemming!
      return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));

View Full Code Here

  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    result = new StandardFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(excltable != null && !excltable.isEmpty())
      result = new KeywordMarkerFilter(result, excltable);
    return new TokenStreamComponents(source, new BrazilianStemFilter(result));
  }

View Full Code Here

  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter( matchVersion, result, stopwords);
    result = new KeywordMarkerFilter(result, exclusionSet);
    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
      result = new GermanNormalizationFilter(result);
      result = new GermanLightStemFilter(result);
    } else if (matchVersion.onOrAfter(Version.LUCENE_31)) {

View Full Code Here

  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new StempelFilter(result, new StempelStemmer(stemTable));
    return new TokenStreamComponents(source, result);
  }

View Full Code Here

    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
    if (!stopWords.isEmpty()) {
      result = new StopFilter(matchVersion, result, stopWords);
    }
    return new TokenStreamComponents(tokenizer, result);
  }

View Full Code Here

    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
          return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
        }
      };


    Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
        new Input("wizard of oz", 50)

View Full Code Here

    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
          return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
        }
      };


    Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
        new Input("wizard of of oz", 50)

View Full Code Here

    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if(matchVersion.onOrAfter(Version.LUCENE_4_8))
      result = new ApostropheFilter(result);
    result = new TurkishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new TurkishStemmer());
    return new TokenStreamComponents(source, result);
  }

View Full Code Here

  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter( matchVersion, result, stopwords);
    if (matchVersion.onOrAfter(Version.LUCENE_3_1)) {
      if(!this.stemExclusionTable.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionTable);
      result = new CzechStemFilter(result);
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.core.StopFilter

com.foundationdb.server.service.text.SelectiveCaseAnalyzer

com.googlecode.lucene.PorterAnalyzer

com.livingsocial.hive.udf.Tokenize$MyAnalyzer

gov.nysenate.openleg.lucene.OpenLegislationAnalyzer

org.apache.blur.analysis.NoStopWordStandardAnalyzer

org.apache.lucene.analysis.ar.ArabicAnalyzer

org.apache.lucene.analysis.bg.BulgarianAnalyzer

org.apache.lucene.analysis.br.BrazilianAnalyzer

org.apache.lucene.analysis.ca.CatalanAnalyzer

org.apache.lucene.analysis.cjk.CJKAnalyzer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.