Package opennlp.tools.tokenize

Examples of opennlp.tools.tokenize.Tokenizer


    @Test
    public void testFallbackToSimpleTokenizer() throws IOException {
        // for languages without a tokenizer model a fallback to the
        // SimpleTokenizer is expected
        Tokenizer tokenizer = openNLP.getTokenizer("ru");
        Assert.assertNotNull(tokenizer);
        Assert.assertEquals(SimpleTokenizer.INSTANCE, tokenizer);
    }
View Full Code Here
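
Because the SimpleTokenizer is rule-based and needs no trained model, it is always available as a fallback. A minimal sketch of what the shared instance does (the input string is illustrative):

    import opennlp.tools.tokenize.SimpleTokenizer;

    // splits on character class changes (letters, digits, punctuation),
    // so no language-specific model is required
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize("No model? No problem.");
    // -> ["No", "model", "?", "No", "problem", "."]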


     * @param language the language or <code>null</code> to build a
     * {@link SimpleTokenizer}
     * @return the {@link Tokenizer} for the given language.
     */
    public Tokenizer getTokenizer(String language) {
        Tokenizer tokenizer = null;
        if (language != null) {
            try {
                TokenizerModel model = getTokenizerModel(language);
                if (model != null) {
                    tokenizer = new TokenizerME(model); // reuse the model loaded above instead of loading it twice
View Full Code Here
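
The snippet above is cut off before the error handling. A plausible completion, assuming the getTokenizerModel(String) helper and log field shown elsewhere on this page (the catch clause is a sketch, not the original code):

    public Tokenizer getTokenizer(String language) {
        Tokenizer tokenizer = null;
        if (language != null) {
            try {
                TokenizerModel model = getTokenizerModel(language);
                if (model != null) {
                    tokenizer = new TokenizerME(model);
                }
            } catch (RuntimeException e) { // hypothetical handling; the original is truncated here
                log.warn("Unable to load the tokenizer model for language " + language, e);
            }
        }
        // fall back to the SimpleTokenizer, as exercised by the test above
        return tokenizer == null ? SimpleTokenizer.INSTANCE : tokenizer;
    }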

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
       
        Tokenizer tokenizer = getTokenizer(language);
        if(tokenizer == null){
            log.warn("Tokenizer for language {} is no longer available. "
                    + "This might happen if the model becomes unavailable during enhancement. "
                    + "If this happens more often it might also indicate an bug in the used "
                    + "EnhancementJobManager implementation as the availability is also checked "
                    + "in the canEnhance(..) method of this Enhancement Engine.");
            return;
        }
        //Try to use sentences for tokenizing
        Iterator<? extends Section> sections = at.getSentences();
        if(!sections.hasNext()){
            //if no sentences are annotated
            sections = Collections.singleton(at).iterator();
        }
       
        //for all sentences (or the whole Text - if no sentences available)
        while(sections.hasNext()){
            Section section = sections.next();
            //Tokenize section
            opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
            for (int i = 0; i < tokenSpans.length; i++) {
                Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
                log.trace(" > add {}", token);
            }
        }
View Full Code Here
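
tokenizePos(..) returns character offsets rather than token strings, which is what lets the engine register each Token against the underlying text. The same pattern, standalone:

    import opennlp.tools.tokenize.SimpleTokenizer;
    import opennlp.tools.util.Span;

    String text = "OpenNLP computes token offsets.";
    Span[] spans = SimpleTokenizer.INSTANCE.tokenizePos(text);
    for (Span span : spans) {
        // each Span carries start/end offsets into the original text
        System.out.println(span.getStart() + ".." + span.getEnd()
                + " -> " + span.getCoveredText(text));
    }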

        NameFinderME finder = new NameFinderME((TokenNameFinderModel) nameFinderModel);

        List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE; // the public constructor is deprecated; use the shared instance
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            //LOG.debug("Sentence: " + sentence);

            // extract the names in the current sentence
            String[] tokens = tokenizer.tokenize(sentence);
            Span[] tokenspan = tokenizer.tokenizePos(sentence);
            Span[] nameSpans = finder.find(tokens);
            double[] probs = finder.probs();

            if (nameSpans != null && nameSpans.length > 0) {
                //System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString());
View Full Code Here
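
A condensed sketch of the tokenize-then-find pipeline used above; the model path "en-ner-person.bin" and the sample sentence are illustrative, and clearAdaptiveData() follows the NameFinderME javadoc, which asks for a reset between independent documents:

    import java.io.FileInputStream;
    import java.io.InputStream;
    import opennlp.tools.namefind.NameFinderME;
    import opennlp.tools.namefind.TokenNameFinderModel;
    import opennlp.tools.tokenize.SimpleTokenizer;
    import opennlp.tools.util.Span;

    try (InputStream in = new FileInputStream("en-ner-person.bin")) {
        NameFinderME finder = new NameFinderME(new TokenNameFinderModel(in));
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Pierre Vinken joined the board.");
        Span[] nameSpans = finder.find(tokens);   // spans are token indices, not characters
        double[] probs = finder.probs();          // one probability per detected name
        String[] names = Span.spansToStrings(nameSpans, tokens);
        finder.clearAdaptiveData();               // reset document-level adaptive features
    }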

        SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));

        Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);

        NameFinderME finder = new NameFinderME(nameFinderModel);
        Tokenizer tokenizer = openNLP.getTokenizer(language);
        Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
        for (int i = 0; i < sentenceSpans.length; i++) {
            String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();

            // build a context by concatenating three sentences to be used for
            // similarity ranking / disambiguation + contextual snippet in the
            // extraction structure
            List<String> contextElements = new ArrayList<String>();
            if (i > 0) {
                CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
                contextElements.add(previousSentence.toString().trim());
            }
            contextElements.add(sentence); // already trimmed above
            if (i + 1 < sentenceSpans.length) {
                CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
                contextElements.add(nextSentence.toString().trim());
            }
            String context = StringUtils.join(contextElements, " ");

            // extract the names in the current sentence and
            // store them together with the current context
            Span[] tokenSpans = tokenizer.tokenizePos(sentence);
            String[] tokens = Span.spansToStrings(tokenSpans, sentence);
            Span[] nameSpans = finder.find(tokens);
            double[] probs = finder.probs();
            //int lastStartPosition = 0;
            for (int j = 0; j < nameSpans.length; j++) {
View Full Code Here
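
sentPosDetect(..) keeps sentence offsets into the full text, so the covered text of neighbouring spans can be joined into the three-sentence context window built above. A reduced sketch, reusing the sentenceDetector from the snippet:

    String text = "First sentence. Second sentence. Third sentence.";
    Span[] sents = sentenceDetector.sentPosDetect(text);
    for (int i = 0; i < sents.length; i++) {
        String prev = i > 0 ? sents[i - 1].getCoveredText(text).toString().trim() + " " : "";
        String next = i + 1 < sents.length ? " " + sents[i + 1].getCoveredText(text).toString().trim() : "";
        // previous + current + next sentence, as in the code above
        String context = prev + sents[i].getCoveredText(text).toString().trim() + next;
    }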


    FeatureGenerator[] featureGenerators = DoccatTrainerTool
        .createFeatureGenerators(params.getFeatureGenerators());

    Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
        .getTokenizer());

    DoccatEvaluationMonitor[] listenersArr = listeners
        .toArray(new DoccatEvaluationMonitor[listeners.size()]);
View Full Code Here

    CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile);

    FeatureGenerator[] featureGenerators = createFeatureGenerators(params
        .getFeatureGenerators());

    Tokenizer tokenizer = createTokenizer(params.getTokenizer());

    DoccatModel model;
    try {
      DoccatFactory factory = DoccatFactory.create(params.getFactory(),
          tokenizer, featureGenerators);
View Full Code Here
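
The factory bundles the tokenizer and the feature generators with the model, so the same tokenization is applied at training time and at categorization time. A hedged sketch of how training might continue from here, assuming an OpenNLP 1.6-style API and a sample stream prepared by the surrounding tool code:

    // sampleStream (ObjectStream<DocumentSample>) and params (TrainingParameters)
    // are assumed to be set up earlier by the tool
    DoccatModel model = DocumentCategorizerME.train("en", sampleStream, params, factory);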

   * {@link SimpleTokenizer}.
   */
  @Override
  public double[] categorize(String documentText,
      Map<String, Object> extraInformation) {
    Tokenizer tokenizer = model.getFactory().getTokenizer();
    return categorize(tokenizer.tokenize(documentText), extraInformation);
  }
View Full Code Here

  /**
   * Categorizes the given text. The text is tokenized with the tokenizer
   * from the model's factory before it is passed to the feature generation.
   */
  public double[] categorize(String documentText) {
    Tokenizer tokenizer = model.getFactory().getTokenizer();
    return categorize(tokenizer.tokenize(documentText),
        Collections.<String, Object> emptyMap());
  }
View Full Code Here
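
Putting the pieces together, an end-to-end sketch that loads a trained model and categorizes a document; the model path "en-doccat.bin" is illustrative:

    import java.io.FileInputStream;
    import java.io.InputStream;
    import opennlp.tools.doccat.DoccatModel;
    import opennlp.tools.doccat.DocumentCategorizerME;

    try (InputStream in = new FileInputStream("en-doccat.bin")) {
        DoccatModel model = new DoccatModel(in);
        DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
        double[] outcomes = categorizer.categorize("Some document text to classify.");
        // outcomes holds one probability per category defined in the model
        String best = categorizer.getBestCategory(outcomes);
    }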
