Package org.apache.lucene.wikipedia.analysis

Examples of org.apache.lucene.wikipedia.analysis.WikipediaTokenizer


    this.stopSet = stopSet;
  }
 
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new WikipediaTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stopSet);
    return result;
  }
View Full Code Here


   * @return
   * @throws IOException
   */
  static Set<String> getTokens(Article article) throws IOException {
    Set<String> tokenList = new HashSet<String>();
    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
    TermAttribute term = tok.addAttribute(TermAttribute.class);
    try {
      while (tok.incrementToken()) {
        String token = term.term();
        if (!StringUtils.isEmpty(token))
          tokenList.add(token);
      }
    } catch (IOException e) {
      log.error("Error tokenizing text", e);
    } finally {
      try {
        tok.end();
      } catch (IOException e) {
        log.error("Error calling end()", e);
      } finally {
        try {
          tok.close();
        } catch (IOException e) {
          log.error("Error closing tokenizer", e);
        }
      }
    }
View Full Code Here

    this.stopSet = stopSet;
  }
 
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new WikipediaTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stopSet);
    return result;
  }
View Full Code Here

    this.stopSet = stopSet;
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new WikipediaTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(true, result, stopSet);
    return result;
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.wikipedia.analysis.WikipediaTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.