/*
* Copyright 2011 DBpedia Spotlight Development Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
*/
package org.dbpedia.spotlight.spot;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
import opennlp.tools.util.model.BaseModel;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dbpedia.spotlight.exceptions.ConfigurationException;
import org.dbpedia.spotlight.exceptions.SpottingException;
import org.dbpedia.spotlight.model.Feature;
import org.dbpedia.spotlight.model.SurfaceForm;
import org.dbpedia.spotlight.model.SurfaceFormOccurrence;
import org.dbpedia.spotlight.model.Text;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Spotter that uses Named Entity Recognition (NER) models from OpenNLP. Only spots People, Organisations and Locations.
*
* TODO remove hardcoding of opennlp models. get from configuration
*
* @author Rohana Rajapakse (GOSS Interactive Limited) - implemented the class
* @author pablomendes adjustments to logging, class rename, integrated with the rest of architecture
*/
public class NESpotter implements Spotter {
private final Log LOG = LogFactory.getLog(this.getClass());
protected static BaseModel sentenceModel = null;
protected static Map<String, Object[]> entityTypes = new HashMap<String, Object[]>() {
{
put(OpenNLPUtil.OpenNlpModels.person.toString(), null);
put(OpenNLPUtil.OpenNlpModels.location.toString(), null);
put(OpenNLPUtil.OpenNlpModels.organization.toString(), null);
}
};
public NESpotter(String onlpModelDir, String i18nLanguageCode, Map<String, String> openNLPModelsURI) throws ConfigurationException {
try {
if (NESpotter.sentenceModel == null) {
NESpotter.sentenceModel = OpenNLPUtil.loadModel(onlpModelDir, i18nLanguageCode + OpenNLPUtil.OpenNlpModels.SentenceModel.filename(), OpenNLPUtil.OpenNlpModels.SentenceModel.toString());
}
if (NESpotter.entityTypes.get(OpenNLPUtil.OpenNlpModels.person.toString()) == null) {
buildNameModel(onlpModelDir,OpenNLPUtil.OpenNlpModels.person.toString(), new URI(openNLPModelsURI.get(OpenNLPUtil.OpenNlpModels.person.toString())),i18nLanguageCode);
}
if (NESpotter.entityTypes.get(OpenNLPUtil.OpenNlpModels.location.toString()) == null) {
buildNameModel(onlpModelDir, OpenNLPUtil.OpenNlpModels.location.toString(), new URI(openNLPModelsURI.get(OpenNLPUtil.OpenNlpModels.location.toString())),i18nLanguageCode);
}
if (NESpotter.entityTypes.get(OpenNLPUtil.OpenNlpModels.organization.toString()) == null) {
buildNameModel(onlpModelDir, OpenNLPUtil.OpenNlpModels.organization.toString(), new URI(openNLPModelsURI.get(OpenNLPUtil.OpenNlpModels.organization.toString())),i18nLanguageCode);
}
} catch (Exception e) {
throw new ConfigurationException("Error initializing NESpotter", e);
}
}
protected BaseModel buildNameModel(String directoryPath, String modelType, URI typeUri, String i18nLanguageCode) throws IOException, ConfigurationException {
String fname = OpenNLPUtil.OpenNlpModels.valueOf(modelType).filename();
String modelRelativePath = i18nLanguageCode + fname;
BaseModel model = OpenNLPUtil.loadModel(directoryPath, modelRelativePath, modelType);
entityTypes.put(modelType, new Object[] { typeUri, model });
return model;
}
@Override
public List<SurfaceFormOccurrence> extract(Text text) throws SpottingException {
List<SurfaceFormOccurrence> ret = new ArrayList<SurfaceFormOccurrence>();
try {
for (Map.Entry<String, Object[]> type : entityTypes.entrySet()) {
List<SurfaceFormOccurrence> res = null;
//TODO pass type information within SurfaceFormOccurrence to later stages
String typeLabel = type.getKey();
Object[] typeInfo = type.getValue();
URI typeUri = (URI) typeInfo[0];
BaseModel nameFinderModel = (BaseModel) typeInfo[1];
res = extractNameOccurrences(nameFinderModel, text, typeUri);
if (res != null && !res.isEmpty()) {
if (ret == null) {
ret = res;
}
else {
ret.addAll(res);
}
}
}
} catch (Exception e) {
throw new SpottingException(e);
}
return ret;
}
String name = "NESpotter";
@Override
public String getName() {
return name;
}
@Override
public void setName(String n) {
this.name = n;
}
protected List<SurfaceFormOccurrence> extractNameOccurrences(BaseModel nameFinderModel, Text text, URI oType) {
String intext = text.text();
SentenceDetectorME sentenceDetector = new SentenceDetectorME((SentenceModel)sentenceModel);
String[] sentences = sentenceDetector.sentDetect(intext);
Span[] sentenceEndings = sentenceDetector.sentPosDetect(intext);
int[] sentencePositions = new int[sentences.length + 1];
for (int k=0; k<sentenceEndings.length; k++) {
sentencePositions[k] = sentenceEndings[k].getStart();
}
NameFinderME finder = new NameFinderME((TokenNameFinderModel)nameFinderModel);
List<SurfaceFormOccurrence> sfOccurrences = new ArrayList<SurfaceFormOccurrence>();
Tokenizer tokenizer = new SimpleTokenizer();
for (int i = 0; i < sentences.length; i++) {
String sentence = sentences[i];
//LOG.debug("Sentence: " + sentence);
// extract the names in the current sentence
String[] tokens = tokenizer.tokenize(sentence);
Span[] tokenspan = tokenizer.tokenizePos(sentence);
Span[] nameSpans = finder.find(tokens);
double[] probs = finder.probs();
if (nameSpans != null && nameSpans.length > 0) {
//System.out.println("Tokens: " +(new ArrayList(Arrays.asList(tokens))).toString());
//System.out.println("NameSpans: " +(new ArrayList(Arrays.asList(nameSpans))).toString());
for (Span span : nameSpans) {
StringBuilder buf = new StringBuilder();
//System.out.println("StartSpan: " + span.getStart() + " EndSpan: " + span.getEnd());
for (int j = span.getStart(); j < span.getEnd(); j++) {
//System.out.println(tokens[i] + " appended to " + buf.toString());
buf.append(tokens[j]);
if(j<span.getEnd()-1) buf.append(" ");
}
String surfaceFormStr = buf.toString().trim();
if (surfaceFormStr.contains(".")) {
surfaceFormStr = correctPhrase(surfaceFormStr, sentence);
}
int entStart = sentencePositions[i] + tokenspan[span.getStart()].getStart();
int entEnd = sentencePositions[i] +tokenspan[span.getEnd()-1].getEnd();
/*
System.out.println("\n\nRR-NE Found = " + buf.toString());
System.out.println("Start = " + entStart);
System.out.println("End = " + entEnd);
System.out.println("Sentence = " + sentence);
System.out.println("Text = " + text);
*/
SurfaceForm surfaceForm = new SurfaceForm(surfaceFormStr);
SurfaceFormOccurrence sfocc = new SurfaceFormOccurrence(surfaceForm, text, entStart);
sfocc.features().put("type", new Feature("type",oType.toString()));
sfOccurrences.add(sfocc);
}
}
}
finder.clearAdaptiveData();
if (LOG.isDebugEnabled()) {
LOG.debug("Occurrences found: " +StringUtils.join(sfOccurrences, ", "));
}
return sfOccurrences;
}
private String correctPhrase(String phrs, String intext) {
//first remove " ."
while (phrs.contains(" .")){
phrs = phrs.replace(" .", ".");
}
if (!intext.contains(phrs)) {
while (phrs.contains(". ")){
phrs = phrs.replace(". ", ".");
}
}
//System.out.println(phrs);
return phrs;
}
}