Package org.mediameter.cliff.extractor

Source Code of org.mediameter.cliff.extractor.StanfordNamedEntityExtractor

package org.mediameter.cliff.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Properties;

import org.mediameter.cliff.places.substitutions.Blacklist;
import org.mediameter.cliff.places.substitutions.CustomSubstitutionMap;
import org.mediameter.cliff.places.substitutions.WikipediaDemonymMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.bericotech.clavin.extractor.LocationOccurrence;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Triple;

/**
*/
public class StanfordNamedEntityExtractor{

    public final static Logger logger = LoggerFactory.getLogger(StanfordNamedEntityExtractor.class);

    public static final String CUSTOM_SUBSTITUTION_FILE = "custom-substitutions.csv";
    public static final String LOCATION_BLACKLIST_FILE = "location-blacklist.txt";
    public static final String PERSON_TO_PLACE_FILE = "person-to-place-replacements.csv";
   
    // the actual named entity recognizer (NER) object
    private AbstractSequenceClassifier<CoreMap> namedEntityRecognizer;
       
    private WikipediaDemonymMap demonyms;
    private CustomSubstitutionMap customSubstitutions;
    private CustomSubstitutionMap personToPlaceSubstitutions;
    private Blacklist locationBlacklist;
   
    private Model model;
   
    // Don't change the order of this, unless you also change the default in the cliff.properties file
    public enum Model {
        ENGLISH_ALL_3CLASS, ENGLISH_CONLL_4CLASS
    }
   
    /**
     * Default constructor. Instantiates a {@link StanfordNamedEntityExtractor}
     * with the standard English language model
     *
     * @throws ClassCastException
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public StanfordNamedEntityExtractor() throws ClassCastException, IOException, ClassNotFoundException {
        this(Model.ENGLISH_ALL_3CLASS);
    }
   
    public StanfordNamedEntityExtractor(Model modelToUse) throws ClassCastException, IOException, ClassNotFoundException {
        model = modelToUse;
        switch(model){
        case ENGLISH_ALL_3CLASS:
            initializeWithModelFiles("english.all.3class.distsim.crf.ser.gz", "english.all.3class.distsim.prop" );
            break;
        case ENGLISH_CONLL_4CLASS:
            initializeWithModelFiles("english.conll.4class.distsim.crf.ser.gz", "english.conll.4class.distsim.prop"); // makes it take about 30% longer :-(
            break;
        }
        demonyms = new WikipediaDemonymMap();
        customSubstitutions = new CustomSubstitutionMap(CUSTOM_SUBSTITUTION_FILE);
        locationBlacklist = new Blacklist(LOCATION_BLACKLIST_FILE);
        personToPlaceSubstitutions = new CustomSubstitutionMap(PERSON_TO_PLACE_FILE,false);
    }
   
    /**
     * Builds a {@link StanfordNamedEntityExtractor} by instantiating the
     * Stanford NER named entity recognizer with a specified
     * language model.
     *
     * @param NERmodel                      path to Stanford NER language model
     * @param NERprop                       path to property file for Stanford NER language model
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws ClassCastException
     */
    //@SuppressWarnings("unchecked")
    private void initializeWithModelFiles(String NERmodel, String NERprop) throws IOException, ClassCastException, ClassNotFoundException {
        InputStream mpis = this.getClass().getClassLoader().getResourceAsStream("models/" + NERprop);
        Properties mp = new Properties();
        mp.load(mpis);
        namedEntityRecognizer = (AbstractSequenceClassifier<CoreMap>)
                CRFClassifier.getJarClassifier("/models/" + NERmodel, mp);
    }

    /**
     * Get extracted locations from a plain-text body.
     *
     * @param text                      Text content to perform extraction on.
     * @param manuallyReplaceDemonyms   Can slow down performance quite a bit
     * @return          All the entities mentioned
     */
    public ExtractedEntities extractEntities(String textToParse,boolean manuallyReplaceDemonyms) {
        ExtractedEntities entities = new ExtractedEntities();

        if (textToParse==null || textToParse.length()==0){
            logger.warn("input to extractEntities was null or zero!");
            return entities;
        }

        String text = textToParse;
        if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
            logger.debug("Replacing all demonyms by hand");
            text = demonyms.replaceAll(textToParse);
        }
       
        // extract entities as <Entity Type, Start Index, Stop Index>
        List<Triple<String, Integer, Integer>> extractedEntities =
                namedEntityRecognizer.classifyToCharacterOffsets(text);

        if (extractedEntities != null) {
            for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
                String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
                int position = extractedEntity.second();
                switch(extractedEntity.first){
                case "PERSON":
                    if(personToPlaceSubstitutions.contains(entityName)){
                        entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
                        logger.debug("Changed person "+entityName+" to a place");
                    } else {
                        PersonOccurrence person = new PersonOccurrence(entityName, position);
                        entities.addPerson( person );
                    }
                    break;
                case "LOCATION":
                    if(!locationBlacklist.contains(entityName)){
                        entities.addLocation( getLocationOccurrence(entityName, position) );
                    } else {
                       logger.debug("Ignored blacklisted location "+entityName);
                    }
                    break;
                case "ORGANIZATION":
                    OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
                    entities.addOrganization( organization );
                    break;
                case "MISC":    // if you're using the slower 4class model
                    if (demonyms.contains(entityName)) {
                        logger.debug("Found and adding a MISC demonym "+entityName);
                        entities.addLocation( getLocationOccurrence(entityName, position) );
                    }
                    break;
                default:
                    logger.error("Unknown NER type :"+ extractedEntity.first);
                }
            }
        }

        return entities;
    }
   
    private LocationOccurrence getLocationOccurrence(String entityName, int position){
        String fixedName = entityName;
        if (demonyms.contains(entityName)) {
            fixedName = demonyms.getSubstitution(entityName);
            logger.debug("Demonym substitution: "+entityName+" to "+fixedName);
        } else if(customSubstitutions.contains(entityName)) {
            fixedName = customSubstitutions.getSubstitution(entityName);
            logger.debug("Custom substitution: "+entityName+" to "+fixedName);
        }
        return new LocationOccurrence(fixedName, position);
    }
   
}
TOP

Related Classes of org.mediameter.cliff.extractor.StanfordNamedEntityExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.