Examples of org.apache.lucene.analysis.core.LowerCaseFilter

org.apache.lucene.analysis.core.LowerCaseFilter
Normalizes token text to lower case.
You must specify the required {@link Version}compatibility when creating LowerCaseFilter:
- As of 3.1, supplementary characters are properly lowercased.

      Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      // run the widthfilter first before bigramming, it sometimes combines characters.
      TokenStream result = new CJKWidthFilter(source);
      result = new LowerCaseFilter(matchVersion, result);
      result = new CJKBigramFilter(result);
      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
    } else {
      final Tokenizer source = new CJKTokenizer(reader);
      return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));

View Full Code Here

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);

View Full Code Here

    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, standardfilter does it for us.
    if (matchVersion.onOrAfter(Version.LUCENE_3_1))
      result = new EnglishPossessiveFilter(matchVersion, result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);

View Full Code Here

      Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_3_1)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new ElisionFilter(result, DEFAULT_ARTICLES);
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stopwords);
      if(!excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);
      if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
        result = new FrenchLightStemFilter(result);
      } else {
        result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
      }
      return new TokenStreamComponents(source, result);
    } else {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new StopFilter(matchVersion, result, stopwords);
      if(!excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);
      result = new FrenchStemFilter(result);
      // Convert to lowercase after stemming!
      return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
    }
  }

View Full Code Here


    assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
    assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});
    
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
    String[] lowerCaseTokens = new String[tokens1.length];
    for (int i = 0; i < tokens1.length; i++)
      lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
  }

View Full Code Here

    if (matchVersion.onOrAfter(Version.LUCENE_3_1)) {
      source = new StandardTokenizer(matchVersion, reader);
    } else {
      source = new ArabicLetterTokenizer(matchVersion, reader);
    }
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    result = new ArabicNormalizationFilter(result);
    /* additional persian-specific normalization */
    result = new PersianNormalizationFilter(result);
    /*
     * the order here is important: the stopword list is normalized with the

View Full Code Here

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
      result = new PortugueseLightStemFilter(result);

View Full Code Here

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter( matchVersion, result, stopwords);
    result = new SetKeywordMarkerFilter(result, exclusionSet);
    if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
      result = new GermanNormalizationFilter(result);
      result = new GermanLightStemFilter(result);

View Full Code Here

    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      if (matchVersion.onOrAfter(Version.LUCENE_3_1)) {
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new StandardFilter(matchVersion, source);
        result = new LowerCaseFilter(matchVersion, result);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(
            result, stemExclusionSet);
        result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
        return new TokenStreamComponents(source, result);
      } else {
        final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
        TokenStream result = new LowerCaseFilter(matchVersion, source);
        result = new StopFilter(matchVersion, result, stopwords);
        if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(
          result, stemExclusionSet);
        result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
        return new TokenStreamComponents(source, result);

View Full Code Here

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new ArmenianStemmer());
    return new TokenStreamComponents(source, result);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.core.LowerCaseFilter

com.code972.elasticsearch.analysis.HebrewExactAnalyzer

com.foundationdb.server.service.text.SelectiveCaseAnalyzer

com.googlecode.lucene.PorterAnalyzer

com.livingsocial.hive.udf.Tokenize$MyAnalyzer

de.arago.lucene.util.LowCaseAnalyzer

gov.nysenate.openleg.lucene.OpenLegislationAnalyzer

org.apache.blur.analysis.NoStopWordStandardAnalyzer

org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer

org.apache.lucene.analysis.ar.ArabicAnalyzer

org.apache.lucene.analysis.bg.BulgarianAnalyzer

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.