Examples of AnalysedText


Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);

        String language = getLanguage(this,ci,false);
        if(!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
            throw new IllegalStateException("The detected language is NOT 'ja'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        //start with the Tokenizer
        TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
        //build the analyzing chain by adding all TokenFilters
        for(TokenFilterFactory filterFactory : filterFactories){
            tokenStream = filterFactory.create(tokenStream);
        }

        //Try to extract sentences based on POS tags ...
        int sentStartOffset = -1;
        //NER data
        List<NerData> nerList = new ArrayList<NerData>();
        int nerSentIndex = 0; //the next index where the NerData.context needs to be set
        NerData ner = null;
        OffsetAttribute offset = null;
        try {
            tokenStream.reset(); //required with Solr 4
            while (tokenStream.incrementToken()){
                offset = tokenStream.addAttribute(OffsetAttribute.class);
                Token token = at.addToken(offset.startOffset(), offset.endOffset());
                //Get the POS attribute and init the PosTag
                PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
                PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
                if(posTag == null){
                    posTag = adhocTags.get(posAttr.getPartOfSpeech());
                    if(posTag == null){
                        posTag = new PosTag(posAttr.getPartOfSpeech());
                        adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                        log.warn(" ... missing PosTag mapping for {}",posAttr.getPartOfSpeech());
                    }
                }
                //Sentence detection by POS tag
                if(sentStartOffset < 0){ //the previous token ended a sentence
                    sentStartOffset = offset.startOffset();
                }
                if(posTag.hasPos(Pos.Point)) {
                    Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                    //add the sentence as context to the NerData instances
                    while(nerSentIndex < nerList.size()){
                        nerList.get(nerSentIndex).context = sent.getSpan();
                        nerSentIndex++;
                    }
                    sentStartOffset = -1;
                }
                //POS
                token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
                //NER
                NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
                if(ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))){
                    //write NER annotation
                    Chunk chunk = at.addChunk(ner.start, ner.end);
                    chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                    //NOTE that the fise:TextAnnotation are written later based on the nerList
                    //clean up
                    ner = null;
                }
                if(nerTag != null){
                    if(ner == null){
                        ner = new NerData(nerTag, offset.startOffset());
                        nerList.add(ner);
                    }
                    ner.end = offset.endOffset();
                }
                BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
                MorphoFeatures morpho = null;
                if(baseFormAttr != null && baseFormAttr.getBaseForm() != null){
                    morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                    morpho.addPos(posTag); //and add the posTag
                }
                InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
                //NOTE: inflection form and type are read but currently not mapped
                //to the MorphoFeatures
                inflectionAttr.getInflectionForm();
                inflectionAttr.getInflectionType();
                if(morpho != null){ //if present add the morpho features to the token
                    token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
                }
            }
            //we still need to write the last sentence
            Sentence lastSent = null;
            if(offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset){
                lastSent = at.addSentence(sentStartOffset, offset.endOffset());
            }
            //and set the context of the remaining named entities
            while(nerSentIndex < nerList.size()){
                if(lastSent != null){
                    nerList.get(nerSentIndex).context = lastSent.getSpan();
                } else { //no sentence detected
                    nerList.get(nerSentIndex).context = at.getSpan();
                }
                nerSentIndex++;
            }
        } catch (IOException e) {
            throw new EngineException(this, ci, "Exception while reading from "
                + "AnalyzedText contentpart",e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {/* ignore */}
        }
        //finally write the NER annotations to the metadata of the ContentItem
        final MGraph metadata = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            Language lang = new Language("ja");
            for(NerData nerData : nerList){
                UriRef ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
                    at.getSpan().substring(nerData.start, nerData.end),lang)));
                metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
                metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
                metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
                metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                    new PlainLiteralImpl(nerData.context, lang)));
View Full Code Here
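
The snippet above buffers named entities in NerData instances before writing them to the ContentItem metadata. NerData is a simple holder; the following is a minimal sketch with the fields inferred from the usage above (tag, start, end, context); the actual class shipped with the engine may differ.

    //Minimal sketch of the NerData holder used above (fields inferred from
    //usage; hypothetical, the engine's actual class may differ)
    class NerData {
        final NerTag tag;  //the type of the named entity
        final int start;   //start offset within the text
        int end;           //end offset (extended while the entity grows)
        String context;    //the sentence text, set once the sentence is known
        NerData(NerTag tag, int start) {
            this.tag = tag;
            this.start = start;
        }
    }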

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

                new Object[]{ getName(), ci.getUri(), language});
            return CANNOT_ENHANCE;
        }
        //we need a detected language, the AnalyzedText contentPart with Tokens.
        AnalysedText at = getAnalysedText(this, ci, false);
        return at != null && at.getTokens().hasNext() ?
                ENHANCE_ASYNC : CANNOT_ENHANCE;
    }
View Full Code Here
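
The snippet above shows only the tail of a typical canEnhance(..) implementation. As a minimal sketch assembled from the patterns on this page (the getLanguage(..), isLangaugeConfigured(..) and getAnalysedText(..) helpers appear in the other snippets), the complete method typically looks like this; details vary per engine:

    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        String language = getLanguage(this, ci, false); //do not fail
        if(language == null || !isLangaugeConfigured(this, languageConfiguration, language, false)){
            log.debug("Engine {} ignores ContentItem {} because language {} is not configured.",
                new Object[]{ getName(), ci.getUri(), language});
            return CANNOT_ENHANCE;
        }
        //we need the AnalysedText contentPart with Tokens
        AnalysedText at = getAnalysedText(this, ci, false); //do not fail
        return at != null && at.getTokens().hasNext() ?
                ENHANCE_ASYNC : CANNOT_ENHANCE;
    }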

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

    public void computeEnhancements(ContentItem ci) throws EngineException {
        log.trace(" enhance ci {}",ci.getUri());
        if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
            throw new EngineException(this,ci,"Offline mode is not supported by the used EntitySearcher!",null);
        }
        AnalysedText at = getAnalysedText(this, ci, true);
        log.debug("  > AnalysedText {}",at);
        String language = getLanguage(this, ci, true);
        if(log.isDebugEnabled()){
            log.debug("computeEnhancements for ContentItem {} language {} text={}",
                new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
        }
        log.debug("  > Language {}",language);
        LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
        if(languageConfig == null){
            throw new IllegalStateException("The language '"+language+"' is not configured "
View Full Code Here

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

            log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.",
                new Object[]{ getName(), ci.getUri(), language});
            return CANNOT_ENHANCE;
        }
        //we need a detected language, the AnalyzedText contentPart with Tokens.
        AnalysedText at = getAnalysedText(this, ci, false);
        return at != null && at.getTokens().hasNext() ?
                ENHANCE_ASYNC : CANNOT_ENHANCE;
    }
View Full Code Here

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

                ENHANCE_ASYNC : CANNOT_ENHANCE;
    }
   
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
        if(languageConfig == null){
            throw new IllegalStateException("The language '"+language+"' is not configured "
                    + "to be processed by this Engine. As this is already checked within the "
                    + "canEnhance(..) method this may indicate an bug in the used "
                    + "EnhanceemntJobManager implementation!");
        }
        if(log.isDebugEnabled()){
            log.debug("compute co-mentions for ContentItem {} language {}  text={}",
                new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
        }
        //create the in-memory database for the mentioned Entities
        ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(
            labelTokenizer, language, linkerConfig.getDefaultLanguage());
        MGraph metadata = ci.getMetadata();
View Full Code Here

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfiguration, language, true);
        ChunkerME chunker = initChunker(language);
        if(chunker == null){
            return;
        }
        //init the Phrase TagSet
        TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
        if(tagSet == null){
            log.warn("No Phrase TagSet registered for Language '{}'. Will build an "
                + "adhoc set based on encountered Tags!",language);
            //for now only created to avoid checks for tagSet == null
            //TODO: in future we might want to automatically create models based
            //on tagged texts. However this makes no sense as long as we can not
            //persist TagSets.
            tagSet = new TagSet<PhraseTag>("dummy", language);
        }
        //holds PhraseTags created for chunk tags that were not part of the TagSet
        //(will hold all PhraseTags in case no TagSet is registered for the language)
        Map<String,PhraseTag> adhocTags = languageAdhocTags.get(language);
        if(adhocTags == null){
            adhocTags = new HashMap<String,PhraseTag>();
            languageAdhocTags.put(language, adhocTags);
        }       
        ci.getLock().writeLock().lock();
        try {
            Iterator<? extends Section> sentences = at.getSentences();
            if(!sentences.hasNext()){ //no sentences ... iterate over the whole text
                sentences = Collections.singleton(at).iterator();
            }
            List<String> tokenTextList = new ArrayList<String>(64);
            List<String> posList = new ArrayList<String>(64);
            List<Token> tokenList = new ArrayList<Token>(64);
            //process each sentence separately
            while(sentences.hasNext()){
                // (1) get Tokens and POS information for the sentence
                Section sentence = sentences.next();
                Iterator<Token> tokens = sentence.getTokens();
                while(tokens.hasNext()){
                    Token token = tokens.next();
                    tokenList.add(token);
                    tokenTextList.add(token.getSpan());
                    Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                    if(posValue == null){
                        throw new EngineException("Missing POS value for Token '"
                            + token.getSpan()+"' of ContentItem "+ci.getUri()
                            + "(Sentence: '"+sentence.getSpan()+"'). This may "
                            + "indicate that a POS tagging Engine is missing in "
                            + "the EnhancementChain or that the used POS tagging "
                            + "does not provide POS tags for each token!");
                    } else {
                        posList.add(posValue.value().getTag());
                    }
                }
                String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
                String[] tokenPos = posList.toArray(new String[posList.size()]);
                if(log.isTraceEnabled()){
                    log.trace("Tokens: {}"+Arrays.toString(tokenStrings));
                }
                tokenTextList.clear(); //free memory
                posList.clear(); //free memory
               
                // (2) Chunk the sentence
               
                String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
                double[] chunkProb = chunker.probs();
                if(log.isTraceEnabled()){
                    log.trace("Chunks: {}"+Arrays.toString(chunkTags));
                }
                tokenStrings = null; //free memory
                tokenPos = null; //free memory
               
                // (3) Process the results and write the Annotations
                double chunkProps = 0;
                int chunkTokenCount = 0;
                PhraseTag tag = null;
                int i;
                /*
                 * This assumes:
                 *  - 'B-{tag}' ... starts a new chunk
                 *  - 'O' ......... no chunk (ends the current chunk)
                 *  - anything else continues the current chunk
                 */
                for(i=0;i<tokenList.size();i++){
                    boolean start = chunkTags[i].charAt(0) == 'B';
                    boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                    if(end){ //add the current phrase
                        //add at AnalysedText level, because offsets are absolute
                        //NOTE we are already at the next token when we detect the end
                        Chunk chunk = at.addChunk(
                            tokenList.get(i-chunkTokenCount).getStart(),
                            tokenList.get(i-1).getEnd());
                        chunk.addAnnotation(PHRASE_ANNOTATION,
                            new Value<PhraseTag>(tag,
                                    chunkProps/(double)chunkTokenCount));
                        //reset the state
                        tag = null;
                        chunkTokenCount = 0;
                        chunkProps = 0;
                    }
                    if(start){ //create the new tag
                        tag = getPhraseTag(tagSet,adhocTags,
                            chunkTags[i].substring(2), language); //skip 'B-'
                       
                    }
                    if(tag != null){ //count this token for the current chunk
                        chunkProps = chunkProps + chunkProb[i];
                        chunkTokenCount++;
                    }
                }
                if(tag != null){
                    Chunk chunk = at.addChunk(
                        tokenList.get(i-chunkTokenCount).getStart(),
                        tokenList.get(i-1).getEnd());
                    chunk.addAnnotation(PHRASE_ANNOTATION,
                        new Value<PhraseTag>(tag,
                                chunkProps/(double)chunkTokenCount));
View Full Code Here
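
The loop above decodes the chunker's BIO-style tags: 'B-{tag}' starts a new chunk, 'O' ends the current one and anything else continues it. To make the scheme concrete, here is a small, self-contained sketch of the same decoding logic over token indices (a hypothetical helper, not part of the engine); for the tags [B-NP, I-NP, O, B-VP, I-VP] it yields the token ranges [0,2) and [3,5):

    import java.util.ArrayList;
    import java.util.List;

    //Hypothetical illustration of the BIO decoding used above
    static List<int[]> decodeBio(String[] chunkTags) {
        List<int[]> chunks = new ArrayList<int[]>();
        int chunkStart = -1; //token index where the current chunk started
        for (int i = 0; i < chunkTags.length; i++) {
            boolean start = chunkTags[i].charAt(0) == 'B';
            boolean end = chunkStart >= 0 && (start || chunkTags[i].charAt(0) == 'O');
            if (end) { //the current chunk ends before token i
                chunks.add(new int[]{chunkStart, i});
                chunkStart = -1;
            }
            if (start) { //token i starts a new chunk
                chunkStart = i;
            }
        }
        if (chunkStart >= 0) { //close a chunk that runs to the last token
            chunks.add(new int[]{chunkStart, chunkTags.length});
        }
        return chunks;
    }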

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
        Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
        //deserialize
        AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
        AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null,
            atFactory.createAnalysedText(textBlob.getValue()));
        Assert.assertEquals(analysedTextWithData, parsedAt);
        Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
        Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
        while(origSpanIt.hasNext() && parsedSpanIt.hasNext()){
            Span orig = origSpanIt.next();
            Span parsed = parsedSpanIt.next();
            Assert.assertEquals(orig, parsed);
            Set<String> origKeys = orig.getKeys();
View Full Code Here
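
The test above only shows the parsing direction. The `data` byte array would be produced by the matching serializer; a minimal sketch, assuming the AnalyzedTextSerializer of the Stanbol nlp-json module (the exact setup in the test class may differ):

    //Sketch of the serialization counterpart (assumption: default
    //AnalyzedTextSerializer instance of the nlp-json module)
    AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    serializer.serialize(analysedTextWithData, out, Charset.forName("UTF-8"));
    byte[] data = out.toByteArray();
    String serialized = new String(data, Charset.forName("UTF-8"));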

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

   
    @Test
    public void testEngineDe() throws IOException, EngineException {
        ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
        Assert.assertNotNull(ci);
        AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
        Assert.assertNotNull(at);
        ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
        Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
       
        //Add some Tokens with POS annotations to test the usage of
        //existing POS annotations by the lemmatizer
        Token verbrachten = at.addToken(de_verbStart,de_verbStart+de_verb.length());
        verbrachten.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("V",LexicalCategory.Verb), de_verbProb));
       
        Token schonen = at.addToken(de_adjectiveStart,de_adjectiveStart+de_adjective.length());
        schonen.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("ADJ",LexicalCategory.Adjective), de_adjectiveProb));
       
        Token urlaub = at.addToken(de_nounStart,de_nounStart+de_noun.length());
        urlaub.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("NC",LexicalCategory.Noun), de_nounProb));
       
        Assert.assertEquals("Can not enhance Test ContentItem",
            EnhancementEngine.ENHANCE_ASYNC,engine.canEnhance(ci));
        //compute the enhancements
        try {
            engine.computeEnhancements(ci);
        } catch (EngineException e) {
            RemoteServiceHelper.checkServiceUnavailable(e);
            return; //deactivate test
        }
        //now validate the enhancements
        boolean foundVerb = false;
        boolean foundAdjective = false;
        boolean foundNoun = false;
        for(Iterator<Token> tokens = at.getTokens(); tokens.hasNext();){
            Token token = tokens.next();
            log.info("Token: {}",token);
            List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
            if(de_verb.equals(token.getSpan())){
                foundVerb = !mfs.isEmpty();
View Full Code Here

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);

        String language = getLanguage(this,ci,false);
        if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
            throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        //first the sentences
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            while(sentences.incrementToken()){
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if(log.isTraceEnabled()) {
                    log.trace("detected {}:{}",s,s.getSpan());
                }
            }
        } catch (IOException e) {
View Full Code Here
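
The snippet above only performs sentence detection. In the smartcn analyzer chain the detected sentences are then typically segmented into words; a hedged sketch of that follow-up step, assuming Lucene smartcn's WordTokenFilter (the engine's actual code may differ):

    //Sketch of the follow-up word segmentation (assumption: Lucene smartcn)
    TokenStream tokens = new WordTokenFilter(new SentenceTokenizer(
        new CharSequenceReader(at.getText())));
    try {
        tokens.reset();
        while(tokens.incrementToken()){
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            at.addToken(offset.startOffset(), offset.endOffset());
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Unable to tokenize text", e);
    } finally {
        try {
            tokens.close();
        } catch (IOException e) {/* ignore */}
    }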

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

     *          expected
     */
    @Override
    public void computeEnhancements(final ContentItem ci) throws EngineException {
        //get/create the AnalysedText
        final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
        final Blob blob = at.getBlob();
        //send the text to the server
        final String language = getLanguage(this, ci, true);
        final HttpPost request = new HttpPost(analysisServiceUrl);
        request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
        request.setEntity(new InputStreamEntity(
            blob.getStream(), blob.getContentLength(),
            ContentType.create(blob.getMimeType(),
                blob.getParameter().get("charset"))));
        //execute the request
        try {
            AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {
                public AnalysedText run() throws ClientProtocolException, IOException {
                    return httpClient.execute(request, new AnalysisResponseHandler(at));
                }
            });
        } catch (PrivilegedActionException pae) {
            Exception e = pae.getException();
            if(e instanceof ClientProtocolException) {
                throw new EngineException(this, ci, "Exception while executing Request "
                    + "on RESTful NLP Analysis Service at "+analysisServiceUrl, e);
            } else if(e instanceof IOException) {
                throw new EngineException(this, ci, "Exception while executing Request "
                        + "on RESTful NLP Analysis Service at "+analysisServiceUrl, e);
            } else {
                throw RuntimeException.class.cast(e);
            }
        }
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence,SpanTypeEnum.Chunk));
        Sentence context = null;
        MGraph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try { //write TextAnnotations for Named Entities
            while(spans.hasNext()){
                Span span = spans.next();
                switch (span.getType()) {
                    case Sentence:
                        context = (Sentence)span; //remember the sentence as context
                        break;
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if(nerAnno != null){
                            UriRef ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            //add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT,
                                new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START,
                                lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END,
                                lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                                new PlainLiteralImpl(context == null ?
                                        getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart()) :
                                            context.getSpan(), lang)));
                            //add the NER type
                            if(nerAnno.value().getType() != null){
                                metadata.add(new TripleImpl(ta,DC_TYPE,nerAnno.value().getType()));
                            }
View Full Code Here
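
When no Sentence is available as context, the snippet falls back to a getDefaultSelectionContext(..) helper. As a purely hypothetical sketch of such a fallback (the engine's actual helper may behave differently), one can select a fixed-size window of text around the mention:

    //Hypothetical fallback: a fixed-size window of text around the selection
    private static String getDefaultSelectionContext(String text, String selection, int start) {
        final int window = 50; //chars of context on each side (assumption)
        int begin = Math.max(0, start - window);
        int end = Math.min(text.length(), start + selection.length() + window);
        return text.substring(begin, end);
    }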