Package org.dbpedia.spotlight.evaluation

Source Code of org.dbpedia.spotlight.evaluation.ExtractionEvaluator

/*
* Copyright 2011 Pablo Mendes, Max Jakob
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*  http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*
*  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
*/

package org.dbpedia.spotlight.evaluation;

import au.com.bytecode.opencsv.CSVReader;
import org.apache.jcs.access.exception.CacheException;
import org.dbpedia.spotlight.annotate.DefaultParagraphAnnotator;
import org.dbpedia.spotlight.exceptions.ConfigurationException;
import org.dbpedia.spotlight.exceptions.InputException;
import org.dbpedia.spotlight.model.*;
import org.dbpedia.spotlight.spot.cooccurrence.training.AnnotatedDataset;
import org.json.JSONException;

import java.io.*;
import java.util.*;

/**
* @author Joachim Daiber
*/
public class ExtractionEvaluator {

  public static void main(String[] args) throws IOException, ConfigurationException, JSONException, InputException, CacheException {

    SpotlightConfiguration configuration = new SpotlightConfiguration("conf/server.properties");
    SpotlightFactory spotlightFactory = new SpotlightFactory(configuration);

    AnnotatedDataset evaluationCorpus =
        new AnnotatedDataset(new File(args[0]),
            AnnotatedDataset.Format.CSAW, spotlightFactory);

    int tp = 0, fp = 0, fn = 0;

    Set<DBpediaResourceOccurrence> goldAnnotations = evaluationCorpus.toDBpediaResourceOccurrences();
    writeAnnotations(spotlightFactory, evaluationCorpus,
        new File(args[1]));

    /**
     * Read all annotations made by a configuration of DBpedia Spotlight.
     */
    List<List<DBpediaResourceOccurrence>> allAnnotatedTexts = readAnnotations(new File(args[1]),
      evaluationCorpus);

    /**
     * Remove all
     */
    Set<DBpediaResourceOccurrence> allAnnotatedOccurrences = new HashSet<DBpediaResourceOccurrence>();
    Set<Text> texts = new HashSet<Text>();
    for(List<DBpediaResourceOccurrence> annotatedOccurrenceText : allAnnotatedTexts) {
      if(annotatedOccurrenceText.size() == 0)
        continue;

      texts.add(annotatedOccurrenceText.get(0).context());
      allAnnotatedOccurrences.addAll(annotatedOccurrenceText);

      for(DBpediaResourceOccurrence dbpediaResourceOccurrence : annotatedOccurrenceText) {
        if(goldAnnotations.contains(dbpediaResourceOccurrence))
          tp++;
        else
          fp++;
      }
    }

    /**
     * Remove all annotations whose texts have not been
     * annotated.
     */
    Set<DBpediaResourceOccurrence> goldAnnotationsForTexts =  new HashSet<DBpediaResourceOccurrence>();
    for(DBpediaResourceOccurrence goldAnnotation : goldAnnotations) {
      if(texts.contains(goldAnnotation.context())) {
        goldAnnotationsForTexts.add(goldAnnotation);
      }
    }


    System.out.println("Gold annotations: " + goldAnnotations.size());
    System.out.println("Gold annotations with annotated equivalent: " + goldAnnotationsForTexts.size());


    for(DBpediaResourceOccurrence goldAnnotation : goldAnnotationsForTexts) {
      if(!allAnnotatedOccurrences.contains(goldAnnotation))
        fn++;
    }


    System.out.println("TP: " + tp);
    System.out.println("FP: " + fp);
    System.out.println("FN: " + fn);
   
    float precision = (tp / (float) (tp + fn));
    System.out.println("P: " + precision);

    float recall = (tp / (float) (tp + fp));
    System.out.println("R: " + recall);
    System.out.println("F1: " + ((2 * precision * recall/ (recall+precision)));


  }

  private static List<List<DBpediaResourceOccurrence>> readAnnotations(File folder, AnnotatedDataset evaluationCorpus) {

    File[] files = folder.listFiles();

    List<List<DBpediaResourceOccurrence>> allAnnotatedOccurrences
        = new LinkedList<List<DBpediaResourceOccurrence>>();

    for(File file : files) {

      if(!file.getName().endsWith(".tsv")) {
        continue;
      }

      int textID = Integer.parseInt(file.getName().replace(".tsv", ""));

      List<DBpediaResourceOccurrence> occurrencesInText = new ArrayList<DBpediaResourceOccurrence>();
      try {
        System.out.println("Read " + file);
        CSVReader csvReader = new CSVReader(new FileReader(file), '\t');

        System.out.println(evaluationCorpus.getTexts().size());
        Text text = evaluationCorpus.getTexts().get(textID - 1);
        System.out.println(text);

        String[] line;
        while((line = csvReader.readNext()) != null){

          DBpediaResource dbpediaResource = new DBpediaResource(line[1]);
          SurfaceForm surfaceForm = new SurfaceForm(line[2]);
          int offset = Integer.parseInt(line[4]);

          DBpediaResourceOccurrence dbpediaResourceOccurrence
              = new DBpediaResourceOccurrence(dbpediaResource, surfaceForm, text, offset);
          occurrencesInText.add(dbpediaResourceOccurrence);
        }

      } catch (FileNotFoundException e) {
        e.printStackTrace();
      } catch (IOException e) {
        e.printStackTrace();
      } catch (IndexOutOfBoundsException e) {
        /*
          A line in the CSV file could not be read, this is likely caused by the file not
          being wrote properly, so we will ignore it.
         */
        System.out.println("Error reading file " + file + e);
        continue;
      } catch (NumberFormatException e) {
        /*
          A line in the CSV file could not be read, this is likely caused by the file not
          being wrote properly, so we will ignore it.
         */
        System.out.println("Error reading file " + file + e);
        continue;
      }

      System.out.println("Read file " + file);
      allAnnotatedOccurrences.add(occurrencesInText);
    }

    return allAnnotatedOccurrences;
  }

  private static void writeAnnotations(SpotlightFactory spotlightFactory, AnnotatedDataset evaluationCorpus,
                     File annotationsFolder) throws IOException {


    DefaultParagraphAnnotator annotator = null;
    //spotlightFactory.annotator(); --> HACK: DefaultAnnotator is not DefaultParagraphAnnotator

    int i = 0;
    for(Text text : evaluationCorpus.getTexts()){
      i++;
      if(new File(annotationsFolder, i + ".tsv").exists()) {
        continue;
      }

      try{
        List<DBpediaResourceOccurrence> annotatedOccurrences = annotator.annotate(text.text());

        FileWriter fileWriter = new FileWriter(new File(annotationsFolder, i + ".tsv"));
        for(DBpediaResourceOccurrence annotatedOccurrence : annotatedOccurrences){
          fileWriter.write(annotatedOccurrence.toTsvString() + "\n");
        }
        fileWriter.close();
      }catch (Exception ignored) {}
    }

  }

}
TOP

Related Classes of org.dbpedia.spotlight.evaluation.ExtractionEvaluator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.