Package org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.paoding

Source Code of org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.paoding.PaodingLabelTokenizer

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.paoding;

import java.io.IOException;
import java.io.StringReader;
import java.security.AccessController;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
import org.osgi.framework.Constants;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Service
@Component(
    configurationFactory=true,
    policy=ConfigurationPolicy.OPTIONAL,
    metatype=true)
@Properties(value={
        @Property(name=Constants.SERVICE_RANKING,intValue=200) //the smartcn one uses 100
})
public class PaodingLabelTokenizer implements LabelTokenizer {

    private Logger log = LoggerFactory.getLogger(PaodingAnalyzer.class);
   
    private static final String[] EMPTY = new String[]{};
   
    @Activate
    protected void activate(ComponentContext ctx){
        log.info(" ... activating {}",PaodingLabelTokenizer.class);
    }
   
    @Deactivate
    protected void deactivate(ComponentContext ctx){
        log.info(" ... deactivating {}",PaodingLabelTokenizer.class);
    }
   
    @Override
    public String[] tokenize(String label, String language) {
        if(label == null){
            throw new IllegalArgumentException("The parsed Label MUST NOT be NULL!");
        }
        if("zh".equals(language) || (language != null && language.length() > 4 &&
                language.charAt(2) == '-' && language.startsWith("zh"))){
            if(label.isEmpty()){
                return EMPTY;
            }
            PaodingAnalyzer pa;
            try {
                pa = AccessController.doPrivileged(new PrivilegedExceptionAction<PaodingAnalyzer>() {
                    public PaodingAnalyzer run() throws Exception {
                        return new PaodingAnalyzer();
                    }
                });
            } catch (PrivilegedActionException pae){
                Exception e = pae.getException();
                log.error("Unable to initialise PoadingAnalyzer",e);
                return null;
            }
            TokenStream ts = pa.tokenStream("dummy", new StringReader(label));
            List<String> tokens = new ArrayList<String>(8);
            int lastEnd = 0;
            try {
                while(ts.incrementToken()){
                    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
                    //when tokenizing labels we need to preserve all chars
                    if(offset.startOffset() > lastEnd){ //add token for stopword
                        tokens.add(label.substring(lastEnd,offset.startOffset()));
                    }
                    tokens.add(label.substring(offset.startOffset(), offset.endOffset()));
                    lastEnd = offset.endOffset();
                }
                return tokens.toArray(new String[tokens.size()]);           
            } catch (IOException e) {
                log.warn("IOException while tokenizing label '"+label+"'",e);
                return null;
            }
        } else {
            return null;
        }
    }

}
TOP

Related Classes of org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.paoding.PaodingLabelTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.