Package com.clearnlp.tokenization

Examples of com.clearnlp.tokenization.AbstractTokenizer


public class Tmp
{
  public Tmp(String[] args) throws Exception
  {
    BufferedReader reader = UTInput.createBufferedFileReader(args[0]);
    AbstractTokenizer tokenizer = NLPGetter.getTokenizer("en");
    Map<String,String> redirects = Maps.newHashMap();
//    Pattern paren = Pattern.compile("\\(.+?\\)$");
    Set<String> titles = Sets.newHashSet();
    String line, title, redirect;
    PTHtml html = new PTHtml();
View Full Code Here


    String readerType = reader.getType();
    boolean bTwit = isTwit(eConfig);
    PrintStream fout;
   
    AbstractSegmenter segmenter = readerType.equals(AbstractReader.TYPE_RAW) ? getSegmenter(eConfig, bTwit) : null;
    AbstractTokenizer tokenizer = readerType.equals(AbstractReader.TYPE_LINE) ? getTokenizer(eConfig, bTwit) : null;
    AbstractComponent[] components = null;
   
    if (modelFile != null && !modelFile.equals(UNConstant.EMPTY))
    {
      if (new File(modelFile).isFile())
View Full Code Here

 
//  ===================================== COMPONENT GETTERS =====================================

  protected AbstractSegmenter getSegmenter(Element eConfig, boolean twit) throws IOException
  {
    AbstractTokenizer tokenizer = getTokenizer(eConfig, twit);
    String language = getLanguage(eConfig);
   
    return NLPGetter.getSegmenter(language, tokenizer);
  }
View Full Code Here

    return NLPGetter.getSegmenter(language, tokenizer);
  }
 
  protected AbstractTokenizer getTokenizer(Element eConfig, boolean twit) throws IOException
  {
    AbstractTokenizer tokenizer = NLPGetter.getTokenizer(getLanguage(eConfig));
    tokenizer.setTwit(twit);
   
    return tokenizer;
  }
View Full Code Here

  {
    initArgs(args);
   
    try
    {
      AbstractTokenizer tokenizer = NLPGetter.getTokenizer(s_language);
      AbstractSegmenter segmenter = i_format.equals(AbstractReader.TYPE_RAW) ? NLPGetter.getSegmenter(s_language, tokenizer) : null;
      List<String[]>    filenames = getFilenames(s_inputPath, s_inputExt, s_outputExt);
      boolean outLine = o_format.equals(AbstractReader.TYPE_LINE);
      tokenizer.setTwit(b_twit);
     
      for (String[] io : filenames)
      {
        System.out.println(io[0]);
        tokenize(tokenizer, segmenter, io[0], io[1], outLine);
View Full Code Here

TOP

Related Classes of com.clearnlp.tokenization.AbstractTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.