Package opennlp.tools.tokenize

Examples of opennlp.tools.tokenize.Tokenizer


    @Test
    public void testFallbackToSimpleTokenizer() throws IOException {
        // for languages without a tokenizer model a fallback to the
        // SimpleTokenizer is expected
        Tokenizer tokenizer = openNLP.getTokenizer("ru");
        Assert.assertNotNull(tokenizer);
        Assert.assertEquals(SimpleTokenizer.INSTANCE, tokenizer);
    }
View Full Code Here
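
Because the SimpleTokenizer is rule-based and needs no trained model, it is always available as a fallback. A minimal sketch of what the shared instance does (the input string is illustrative):

    import opennlp.tools.tokenize.SimpleTokenizer;

    // splits on character class changes (letters, digits, punctuation),
    // so no language-specific model is required
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize("No model? No problem.");
    // -> ["No", "model", "?", "No", "problem", "."]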


     * @param language the language or <code>null</code> to build a
     * {@link SimpleTokenizer}
     * @return the {@link Tokenizer} for the given language.
     */
    public Tokenizer getTokenizer(String language) {
        Tokenizer tokenizer = null;
        if (language != null) {
            try {
                TokenizerModel model = getTokenizerModel(language);
                if (model != null) {
                    tokenizer = new TokenizerME(model); // reuse the model loaded above instead of loading it twice
View Full Code Here
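
The snippet above is cut off before the error handling. A plausible completion, assuming the getTokenizerModel(String) helper and log field shown elsewhere on this page (the catch clause is a sketch, not the original code):

    public Tokenizer getTokenizer(String language) {
        Tokenizer tokenizer = null;
        if (language != null) {
            try {
                TokenizerModel model = getTokenizerModel(language);
                if (model != null) {
                    tokenizer = new TokenizerME(model);
                }
            } catch (RuntimeException e) { // hypothetical handling; the original is truncated here
                log.warn("Unable to load the tokenizer model for language " + language, e);
            }
        }
        // fall back to the SimpleTokenizer, as exercised by the test above
        return tokenizer == null ? SimpleTokenizer.INSTANCE : tokenizer;
    }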

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
       
        Tokenizer tokenizer = getTokenizer(language);
        if(tokenizer == null){
            log.warn("Tokenizer for language {} is no longer available. "
                    + "This might happen if the model becomes unavailable during enhancement. "
                    + "If this happens more often it might also indicate an bug in the used "
                    + "EnhancementJobManager implementation as the availability is also checked "
                    + "in the canEnhance(..) method of this Enhancement Engine.");
            return;
        }
        //Try to use sentences for tokenizing
        Iterator<? extends Section> sections = at.getSentences();
        if(!sections.hasNext()){
            //if no sentences are annotated
            sections = Collections.singleton(at).iterator();
        }
       
        //for all sentences (or the whole Text - if no sentences available)
        while(sections.hasNext()){
            Section section = sections.next();
            //Tokenize section
            opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
            for (int i = 0; i < tokenSpans.length; i++) {
                Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
                log.trace(" > add {}", token);
            }
        }
View Full Code Here
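
tokenizePos(..) returns character offsets rather than token strings, which is what lets the engine register each Token against the underlying text. The same pattern, standalone:

    import opennlp.tools.tokenize.SimpleTokenizer;
    import opennlp.tools.util.Span;

    String text = "OpenNLP computes token offsets.";
    Span[] spans = SimpleTokenizer.INSTANCE.tokenizePos(text);
    for (Span span : spans) {
        // each Span carries start/end offsets into the original text
        System.out.println(span.getStart() + ".." + span.getEnd()
                + " -> " + span.getCoveredText(text));
    }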

        NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

        List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE; // the public constructor is deprecated; use the shared instance
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            //LOG.debug("Sentence: " + sentence);

            // extract the names in the current sentence
            String[] tokens = tokenizer.tokenize(sentence);
            Span[] tokenspan = tokenizer.tokenizePos(sentence);
            Span[] nameSpans = finder.find(tokens);
            double[] probs = finder.probs();

            if (nameSpans != null && nameSpans.length > 0) {
                //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString());
View Full Code Here
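
A condensed sketch of the tokenize-then-find pipeline used above; the model path "en-ner-person.bin" and the sample sentence are illustrative, and clearAdaptiveData() follows the NameFinderME javadoc, which asks for a reset between independent documents:

    import java.io.FileInputStream;
    import java.io.InputStream;
    import opennlp.tools.namefind.NameFinderME;
    import opennlp.tools.namefind.TokenNameFinderModel;
    import opennlp.tools.tokenize.SimpleTokenizer;
    import opennlp.tools.util.Span;

    try (InputStream in = new FileInputStream("en-ner-person.bin")) {
        NameFinderME finder = new NameFinderME(new TokenNameFinderModel(in));
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Pierre Vinken joined the board.");
        Span[] nameSpans = finder.find(tokens);   // spans are token indices, not characters
        double[] probs = finder.probs();          // one probability per detected name
        String[] names = Span.spansToStrings(nameSpans, tokens);
        finder.clearAdaptiveData();               // reset document-level adaptive features
    }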

        SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

        Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

        NameFinderME finder = new NameFinderME(nameFinderModel);
        Tokenizer tokenizer = openNLP.getTokenizer(language);
        Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
        for (int i = 0; i < sentenceSpans.length; i++) {
            String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

            // build a context by concatenating three sentences to be used for
            // similarity ranking / disambiguation + contextual snippet in the
            // extraction structure
            List<String> contextElements = new ArrayList<String>();
            if (i > 0) {
                CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
                contextElements.add(previousSentence.toString().trim());
            }
            contextElements.add(sentence); // already trimmed above
            if (i + 1 < sentenceSpans.length) {
                CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
                contextElements.add(nextSentence.toString().trim());
            }
            String context = StringUtils.join(contextElements, " ");

            // extract the names in the current sentence and
            // store them together with the current context
            Span[] tokenSpans = tokenizer.tokenizePos(sentence);
            String[] tokens = Span.spansToStrings(tokenSpans, sentence);
            Span[] nameSpans = finder.find(tokens);
            double[] probs = finder.probs();
            //int lastStartPosition = 0;
            for (int j = 0; j < nameSpans.length; j++) {
View Full Code Here
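
sentPosDetect(..) keeps sentence offsets into the full text, so the covered text of neighbouring spans can be joined into the three-sentence context window built above. A reduced sketch, reusing the sentenceDetector from the snippet:

    String text = "First sentence. Second sentence. Third sentence.";
    Span[] sents = sentenceDetector.sentPosDetect(text);
    for (int i = 0; i < sents.length; i++) {
        String prev = i > 0 ? sents[i - 1].getCoveredText(text).toString().trim() + " " : "";
        String next = i + 1 < sents.length ? " " + sents[i + 1].getCoveredText(text).toString().trim() : "";
        // previous + current + next sentence, as in the code above
        String context = prev + sents[i].getCoveredText(text).toString().trim() + next;
    }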


    FeatureGenerator[] featureGenerators = DoccatTrainerTool
        .createFeatureGenerators(params.getFeatureGenerators());

    Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
        .getTokenizer());

    DoccatEvaluationMonitor[] listenersArr = listeners
        .toArray(new DoccatEvaluationMonitor[listeners.size()]);
View Full Code Here

    CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile);

    FeatureGenerator[] featureGenerators = createFeatureGenerators(params
        .getFeatureGenerators());

    Tokenizer tokenizer = createTokenizer(params.getTokenizer());

    DoccatModel model;
    try {
      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
          tokenizer, featureGenerators);
View Full Code Here
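
The factory bundles the tokenizer and the feature generators with the model, so the same tokenization is applied at training time and at categorization time. A hedged sketch of how training might continue from here, assuming an OpenNLP 1.6-style API and a sample stream prepared by the surrounding tool code:

    // sampleStream (ObjectStream<DocumentSample>) and params (TrainingParameters)
    // are assumed to be set up earlier by the tool
    DoccatModel model = DocumentCategorizerME.train("en", sampleStream, params, factory);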

   * {@link SimpleTokenizer}.
   */
  @Override
  public double[] categorize(String documentText,
      Map<String, Object> extraInformation) {
    Tokenizer tokenizer = model.getFactory().getTokenizer();
    return categorize(tokenizer.tokenize(documentText), extraInformation);
  }
View Full Code Here

  /**
   * Categorizes the given text. The text is tokenized with the tokenizer
   * from the model's factory before it is passed to the feature generation.
   */
  public double[] categorize(String documentText) {
    Tokenizer tokenizer = model.getFactory().getTokenizer();
    return categorize(tokenizer.tokenize(documentText),
        Collections.<String, Object> emptyMap());
  }
View Full Code Here
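
Putting the pieces together, an end-to-end sketch that loads a trained model and categorizes a document; the model path "en-doccat.bin" is illustrative:

    import java.io.FileInputStream;
    import java.io.InputStream;
    import opennlp.tools.doccat.DoccatModel;
    import opennlp.tools.doccat.DocumentCategorizerME;

    try (InputStream in = new FileInputStream("en-doccat.bin")) {
        DoccatModel model = new DoccatModel(in);
        DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
        double[] outcomes = categorizer.categorize("Some document text to classify.");
        // outcomes holds one probability per category defined in the model
        String best = categorizer.getBestCategory(outcomes);
    }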
