Examples of TokenizerFactory
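
Real-world snippets showing how TokenizerFactory implementations are created and used, collected from open-source projects. Most examples target the Lucene 4.x class org.apache.lucene.analysis.util.TokenizerFactory; the final group uses its predecessor from the Solr 3.x line, org.apache.solr.analysis.TokenizerFactory. For orientation, here is a minimal self-contained sketch (not taken from any of the projects below) of the usual lookup-and-tokenize flow against the Lucene 4.8+ API, where create() no longer takes a Reader and the input is attached with setReader(); the SPI name "whitespace" and the luceneMatchVersion parameter follow the conventions visible in the examples:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TokenizerFactoryDemo {
  public static void main(String[] args) throws Exception {
    // Look up the factory by its SPI name: the class simple name minus
    // the "TokenizerFactory" suffix, matched case-insensitively.
    Map<String, String> params = new HashMap<>();
    params.put("luceneMatchVersion", "4.8"); // many 4.x factories insist on this parameter
    TokenizerFactory factory = TokenizerFactory.forName("whitespace", params);

    // Create the tokenizer, then attach the input (Lucene 4.8+ API).
    Tokenizer tokenizer = factory.create();
    tokenizer.setReader(new StringReader("hello tokenizer factory"));

    // Standard TokenStream consumption loop.
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}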


Examples of org.apache.lucene.analysis.util.TokenizerFactory

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizerFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TestHMMChineseTokenizerFactory extends BaseTokenStreamTestCase {

  /** Test showing the behavior */
  public void testSimple() throws Exception {
    Reader reader = new StringReader("我购买了道具和服装。");
    TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String,String>());
    // Lucene 4.8+ API: create() takes no Reader; the input is attached via setReader().
    Tokenizer tokenizer = factory.create();
    tokenizer.setReader(reader);
    // TODO: fix smart chinese to not emit punctuation tokens
    // at the moment: you have to clean up with WDF, or use the stoplist, etc.
    assertTokenStreamContents(tokenizer,
       new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
  }
}

Examples of org.apache.lucene.analysis.util.TokenizerFactory

 
  private void doTestTokenizer(String tokenizer) throws IOException {
    Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
    TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
    if (factory != null) {
      // we managed to fully create an instance. check a few more things:
     
      // if it implements MultiTermAware, sanity check its impl
      if (factory instanceof MultiTermAwareComponent) {
        // ... (MultiTermAware sanity checks continue)

Examples of org.apache.lucene.analysis.util.TokenizerFactory

    // prepare bi-gram tokenizer factory
    Map<String, String> args = new HashMap<>();
    args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4");
    args.put("minGramSize","2");
    args.put("maxGramSize","2");
    TokenizerFactory tf = new NGramTokenizerFactory(args);
   
    // (ab)->(bc)->(cd)->[ef][fg][gh]
    List<String> rules = new ArrayList<>();
    rules.add( "abcd=>efgh" );
    synMap = new SlowSynonymMap( true );
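
As a hedged aside (not part of the original test): exercising the factory configured above with the pre-4.8 create(Reader) API makes the comment concrete, since the bi-gram tokenizer splits "abcd" into exactly the ab/bc/cd terms that the rule maps onto the ef/fg/gh bi-grams.

    Tokenizer ngramTokenizer = tf.create(new StringReader("abcd")); // emits "ab", "bc", "cd"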

Examples of org.apache.lucene.analysis.util.TokenizerFactory

      if (Tokenizer.class.isAssignableFrom(c)) {
        String clazzName = c.getSimpleName();
        assertTrue(clazzName.endsWith("Tokenizer"));
        String simpleName = clazzName.substring(0, clazzName.length() - 9);
        assertNotNull(TokenizerFactory.lookupClass(simpleName));
        TokenizerFactory instance = null;
        try {
          instance = TokenizerFactory.forName(simpleName, args);
          assertNotNull(instance);
          if (instance instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) instance).inform(loader);
          }
          assertSame(c, instance.create(new StringReader("")).getClass());
        } catch (IllegalArgumentException e) {
          if (e.getCause() instanceof NoSuchMethodException) {
            // there is no corresponding ctor available
            throw e;
          }
          // TODO: for now, pass, because some factories don't yet have a default config that always works
        }
      } else if (TokenFilter.class.isAssignableFrom(c)) {
        String clazzName = c.getSimpleName();
        assertTrue(clazzName.endsWith("Filter"));
        String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6));
        assertNotNull(TokenFilterFactory.lookupClass(simpleName));
        TokenFilterFactory instance = null;
        try {
          instance = TokenFilterFactory.forName(simpleName, args);
          assertNotNull(instance);
          if (instance instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) instance).inform(loader);
          }
          Class<? extends TokenStream> createdClazz = instance.create(new KeywordTokenizer(new StringReader(""))).getClass();
          // only check the created instance if the factory actually wrapped the input
          if (KeywordTokenizer.class != createdClazz) {
            assertSame(c, createdClazz);
          }
        } catch (IllegalArgumentException e) {
          if (e.getCause() instanceof NoSuchMethodException) {
            // there is no corresponding ctor available
            throw e;
          }
          // TODO: for now, pass, because some factories don't yet have a default config that always works
        }
      } else if (CharFilter.class.isAssignableFrom(c)) {
        String clazzName = c.getSimpleName();
        assertTrue(clazzName.endsWith("CharFilter"));
        String simpleName = clazzName.substring(0, clazzName.length() - 10);
        assertNotNull(CharFilterFactory.lookupClass(simpleName));
        CharFilterFactory instance = null;
        try {
          instance = CharFilterFactory.forName(simpleName, args);
          assertNotNull(instance);
          if (instance instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) instance).inform(loader);
          }
          Class<? extends Reader> createdClazz = instance.create(new StringReader("")).getClass();
          // only check the created instance if the factory actually wrapped the input
          if (StringReader.class != createdClazz) {
            assertSame(c, createdClazz);
          }
        } catch (IllegalArgumentException e) {
          if (e.getCause() instanceof NoSuchMethodException) {
            // there is no corresponding ctor available
            throw e;
          }
          // TODO: for now, pass, because some factories don't yet have a default config that always works
        }
      }
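
The branches above all lean on the analysis SPI naming convention: a factory is registered under its class simple name minus the Tokenizer/Filter/CharFilter suffix, and lookup is case-insensitive. A minimal sketch of the same round trip outside the test harness, assuming the same pre-4.8 create(Reader) and KeywordTokenizer(Reader) API the test uses:

    Map<String, String> args = new HashMap<>();
    args.put("luceneMatchVersion", "4.4"); // required by many 4.x factories
    TokenFilterFactory factory = TokenFilterFactory.forName("LowerCase", args); // resolves LowerCaseFilterFactory
    TokenStream stream = factory.create(new KeywordTokenizer(new StringReader("HELLO")));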

Examples of org.apache.lucene.analysis.util.TokenizerFactory

  /**
   * Builds a Lucene Analyzer from a Hibernate Search AnalyzerDef, wiring the tokenizer and filter factories it declares.
   */
  public static Analyzer buildAnalyzer(AnalyzerDef analyzerDef, Version luceneMatchVersion) throws IOException {
    ResourceLoader defaultResourceLoader = new HibernateSearchResourceLoader();
    TokenizerDef token = analyzerDef.tokenizer();
    final Map<String, String> tokenMapsOfParameters = getMapOfParameters( token.params(), luceneMatchVersion );
    TokenizerFactory tokenFactory = instanceFromClass( TokenizerFactory.class, token.factory(), "Tokenizer factory", tokenMapsOfParameters );
    injectResourceLoader( tokenFactory, defaultResourceLoader, tokenMapsOfParameters );

    final int length = analyzerDef.filters().length;
    final int charLength = analyzerDef.charFilters().length;
    TokenFilterFactory[] filters = new TokenFilterFactory[length];
    // ... (instantiate and configure each filter factory)
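
For context, a hedged sketch (not from this project's code) of the kind of AnalyzerDef this builder consumes: Hibernate Search's annotations name the Lucene factory classes directly, e.g. on an entity or package:

    @AnalyzerDef(name = "customAnalyzer",
        tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
        filters = @TokenFilterDef(factory = LowerCaseFilterFactory.class))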

Examples of org.apache.lucene.analysis.util.TokenizerFactory

    FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
    Analyzer a = ft.getAnalyzer();
    Assert.assertEquals(a.getClass(), TokenizerChain.class);
   
    TokenizerChain tc = (TokenizerChain) a;
    TokenizerFactory tf = tc.getTokenizerFactory();
    Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);
   
    MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
   
    Assert.assertNotNull(mtf.dic);
    return mtf.dic;
  }

Examples of org.apache.solr.analysis.TokenizerFactory

    final boolean ignoreCase = getBoolean("ignoreCase", false);
    this.ignoreCase = ignoreCase;

    String tf = args.get("tokenizerFactory");

    final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf, args);

    Analyzer analyzer = new ReusableAnalyzerBase() {
      @Override
      protected ReusableAnalyzerBase.TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
        TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    // ... (the analyzer is used when parsing the synonym rules)

Examples of org.apache.solr.analysis.TokenizerFactory


  private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String, String> args) {
    TokenizerFactory tokFactory = (TokenizerFactory) loader.newInstance(cname);
    tokFactory.init(args);
    return tokFactory;
  }
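
A hypothetical call site for this helper (hedged: the "solr." shorthand, the argument values, and the input text are illustrative; in a real schema, Solr supplies luceneMatchVersion itself):

    Map<String, String> args = new HashMap<String, String>();
    args.put("luceneMatchVersion", "LUCENE_31"); // normally injected by the schema loader
    TokenizerFactory factory = loadTokenizerFactory(loader, "solr.WhitespaceTokenizerFactory", args);
    Tokenizer tokenizer = factory.create(new StringReader("split on whitespace"));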

Examples of org.apache.solr.analysis.TokenizerFactory


    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();

    NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();

    if( cfiltfacs != null ){
      String source = value;
      for(CharFilterFactory cfiltfac : cfiltfacs ){
        CharStream reader = CharReader.get(new StringReader(source));
        reader = cfiltfac.create(reader);
        source = writeCharStream(namedList, reader);
      }
    }

    TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
    List<AttributeSource> tokens = analyzeTokenStream(tokenStream);

    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));

    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);
    // ...

Examples of org.apache.solr.analysis.TokenizerFactory

      if (cfilters.size() > 0) {
        aninfo.add("charFilters", cfilters);
      }
     
      SimpleOrderedMap<Object> tokenizer = new SimpleOrderedMap<Object>();
      TokenizerFactory tfac = tchain.getTokenizerFactory();
      tokenizer.add("className", tfac.getClass().getName());
      tokenizer.add("args", tfac.getArgs());
      aninfo.add("tokenizer", tokenizer);

      TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
      SimpleOrderedMap<Map<String, Object>> filters = new SimpleOrderedMap<Map<String, Object>>();
      for (TokenFilterFactory filtfac : filtfacs) {
        // ... (add a className/args entry for each filter factory)
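
For a plain whitespace tokenizer, the resulting tokenizer entry in aninfo might look like this (hypothetical values, for illustration only):

    tokenizer = {
      className = org.apache.solr.analysis.WhitespaceTokenizerFactory,
      args = {luceneMatchVersion=LUCENE_31}
    }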