Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.ReusableAnalyzerBase
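
ReusableAnalyzerBase (Lucene 3.x; merged into Analyzer itself in Lucene 4.0) reduces an analyzer to a single factory method: subclasses implement createComponents(String, Reader), and the base class implements tokenStream() and reusableTokenStream() on top of it, caching the returned Tokenizer/filter chain per thread so it is reset and reused rather than rebuilt on every call. A minimal, self-contained sketch of the pattern all the examples below follow (assuming Lucene 3.6; the demo class, the "body" field name, and the sample text are illustrative only):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ReusableAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // The subclass supplies only the tokenizer/filter chain.
    ReusableAnalyzerBase analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, reader);
        return new TokenStreamComponents(tokenizer,
            new LowerCaseFilter(Version.LUCENE_36, tokenizer));
      }
    };

    // reusableTokenStream() resets the cached components instead of building new ones.
    TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("National Hockey League"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // prints: national, hockey, league
    }
    ts.end();
    ts.close();
  }
}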


  public void testMultiwordOffsets() throws Exception {
    b = new SynonymMap.Builder(true);
    final boolean keepOrig = true;
    add("national hockey league", "nhl", keepOrig);
    final SynonymMap map = b.build();
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };
    // …
  }
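
Completed as a standalone program, this first example shows why the test checks offsets: with keepOrig == true the injected "nhl" token spans the entire three-word phrase. A hedged sketch (assuming Lucene 3.6; WhitespaceTokenizer stands in for the test's MockTokenizer, and the field name is arbitrary):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

public class MultiwordSynonymDemo {
  public static void main(String[] args) throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true); // true = dedup rules
    // Multiword inputs are encoded with the \u0000 word separator; the test's
    // add() helper performs this encoding internally.
    builder.add(new CharsRef("national\u0000hockey\u0000league"),
                new CharsRef("nhl"), true); // keepOrig = true
    final SynonymMap map = builder.build();

    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, reader);
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
      }
    };

    TokenStream ts = a.reusableTokenStream("f", new StringReader("national hockey league"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints the original words plus "nhl" spanning the whole phrase: [0,22]
      System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}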


    "Romanian", "Russian", "Spanish", "Swedish", "Turkish"
  };
 
  public void testEmptyTerm() throws IOException {
    for (final String lang : SNOWBALL_LANGS) {
      Analyzer a = new ReusableAnalyzerBase() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new KeywordTokenizer(reader);
          return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, lang));
        }
      };
      // …
    }
  }
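
The test above exercises every bundled Snowball stemmer with an empty input. For a concrete feel of what SnowballFilter does, here is a minimal sketch (assuming Lucene 3.6; the "English" stemmer name and the sample words are illustrative):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SnowballDemo {
  public static void main(String[] args) throws IOException {
    // KeywordTokenizer emits the whole input as a single token, so each word
    // below passes through the stemmer unsplit.
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, "English"));
      }
    };

    for (String word : new String[] { "running", "runs", "ran" }) {
      TokenStream ts = a.reusableTokenStream("f", new StringReader(word));
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(word + " -> " + term); // e.g. running -> run
      }
      ts.end();
      ts.close();
    }
  }
}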

  public void testRandomHugeStringsMockGraphAfter() throws Exception {
    // Randomly inject graph tokens after JapaneseTokenizer:
    checkRandomData(random,
                    new ReusableAnalyzerBase() {
                      @Override
                      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                        Tokenizer tokenizer = new JapaneseTokenizer(reader, readDict(), false, Mode.SEARCH);
                        TokenStream graph = new MockGraphTokenFilter(random, tokenizer);
                        return new TokenStreamComponents(tokenizer, graph);
                      }
                    },
                    // … (remaining checkRandomData arguments elided)
  }


  public void testLatticeToDot() throws Exception {
    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
    final Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        JapaneseTokenizer tokenizer = new JapaneseTokenizer(reader, readDict(), false, Mode.SEARCH);
        tokenizer.setGraphvizFormatter(gv2);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    // …
  }
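
GraphvizFormatter records the tokenizer's Viterbi lattice as a side effect of consuming the stream; once the stream is exhausted, the collected Graphviz "dot" source can be retrieved and rendered. A hedged sketch (assuming the Lucene 3.6 kuromoji module; a null user dictionary stands in for the test's readDict(), and the sample text is illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.ja.GraphvizFormatter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;

public class LatticeToDotDemo {
  public static void main(String[] args) throws IOException {
    GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());

    JapaneseTokenizer tokenizer = new JapaneseTokenizer(
        new StringReader("関西国際空港"), null, true, Mode.SEARCH);
    tokenizer.setGraphvizFormatter(gv);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // advancing through the tokens populates the lattice recording
    }
    tokenizer.end();
    tokenizer.close();

    System.out.println(gv.finish()); // dot source; render with Graphviz, e.g. dot -Tpng
  }
}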

  // LUCENE-3642
  // EdgeNgram blindly adds the term's length to its offset, but this can take things out of
  // bounds with respect to the original text if a previous filter increases the length of the
  // word (in this case æ -> ae), so here we behave like WordDelimiterFilter and preserve any
  // modified offsets.
  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new NGramTokenFilter(filters, 2, 2);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    // … the test then asserts that every gram keeps the original token's offsets:
        new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 });
  }
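
Run outside the test harness, the same chain makes the fix visible. A minimal sketch (assuming Lucene 3.6; WhitespaceTokenizer stands in for the test's MockTokenizer, and the field name is arbitrary) that prints each bigram with its offsets; after ASCIIFoldingFilter rewrites æ to ae, every gram still reports the original token's boundaries [0,11] rather than offsets computed against the folded, longer text:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class NGramOffsetDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, reader);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer); // æ -> ae lengthens the term
        filters = new NGramTokenFilter(filters, 2, 2);           // bigrams over the folded term
        return new TokenStreamComponents(tokenizer, filters);
      }
    };

    TokenStream ts = analyzer.reusableTokenStream("f", new StringReader("mosfellsbær"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // e.g. "ae [0,11]" -- never past the original 11-character input
      System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}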
 
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer,
            new NGramTokenFilter(tokenizer, 2, 15));
      }
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
  }
 
  public void testEmptyTerm() throws Exception {
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer,
            new NGramTokenFilter(tokenizer, 2, 15));
    };
    // …
  }

 
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.FRONT, 2, 15);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }   
    };
    checkRandomData(random, a, 10000*RANDOM_MULTIPLIER, 20, false, false);
    checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192, false, false);
   
    Analyzer b = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 2, 15);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }   
    };
    // …
  }

 
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new ReusableAnalyzerBase() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new NGramTokenizer(reader, 2, 15);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }   
    };
    // …
  }


  /** Creates a RandomIndexWriter with a random config, using TEST_VERSION_CURRENT and a whitespace-tokenizing, lowercasing analyzer. */
  public RandomIndexWriter(Random r, Directory dir) throws IOException {
    this(r, dir, LuceneTestCase.newIndexWriterConfig(r, LuceneTestCase.TEST_VERSION_CURRENT,
        new ReusableAnalyzerBase() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName,
              Reader reader) {
            Tokenizer tokenizer = new WhitespaceTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader);
            return new TokenStreamComponents(tokenizer,
                new LowerCaseFilter(LuceneTestCase.TEST_VERSION_CURRENT, tokenizer));
          }
        }));
  }
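
A short usage sketch (assuming the Lucene 3.x test framework; the directory, document field, and demo class are illustrative): this constructor is what lets a test write new RandomIndexWriter(random, dir) without supplying its own analyzer.

import java.util.Random;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class RandomIndexWriterDemo {
  public static void main(String[] args) throws Exception {
    Random random = new Random();
    Directory dir = new RAMDirectory();

    // Picks a randomized IndexWriterConfig and the whitespace+lowercase
    // ReusableAnalyzerBase shown above.
    RandomIndexWriter writer = new RandomIndexWriter(random, dir);

    Document doc = new Document();
    doc.add(new Field("body", "national hockey league",
                      Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);

    IndexReader reader = writer.getReader();
    // … run queries against reader …
    reader.close();
    writer.close();
    dir.close();
  }
}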
