Source Code of edu.stanford.nlp.pipeline.StanfordCoreNLPITest

package edu.stanford.nlp.pipeline;


import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import junit.framework.Assert;
import junit.framework.TestCase;


import java.io.*;
import java.util.List;
import java.util.Properties;


public class StanfordCoreNLPITest extends TestCase {


  public void testRequires() throws Exception {
    Properties props = new Properties();
    try {
      // Put the lemma before pos and you have a problem
      props.setProperty("annotators", "tokenize,ssplit,lemma,pos,ner,parse");
      new StanfordCoreNLP(props);
      throw new RuntimeException("Should have thrown an exception");
    } catch (IllegalArgumentException e) {
      // yay
    }


    // This should be okay: parse can take the place of pos
    props.setProperty("annotators", "tokenize,ssplit,parse,lemma,ner");
    new StanfordCoreNLP(props);
  }


  public void test() throws Exception {
    // create a properties that enables all the anotators
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse");


    // run an annotation through the pipeline
    String text = "Dan Ramage is working for\nMicrosoft. He's in Seattle! \n";
    Annotation document = new Annotation(text);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);


    // check that tokens are present
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    Assert.assertNotNull(tokens);
    Assert.assertEquals(12, tokens.size());


    // check that sentences are present
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    Assert.assertNotNull(sentences);
    Assert.assertEquals(2, sentences.size());


    // check that pos, lemma and ner and parses are present
    for (CoreMap sentence: sentences) {
      List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      Assert.assertNotNull(sentenceTokens);
      for (CoreLabel token: sentenceTokens) {
        Assert.assertNotNull(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
        Assert.assertNotNull(token.get(CoreAnnotations.LemmaAnnotation.class));
        Assert.assertNotNull(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      }


      // check for parse tree
      Assert.assertNotNull(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }


    // test pretty print
    StringWriter stringWriter = new StringWriter();
    pipeline.prettyPrint(document, new PrintWriter(stringWriter));
    String result = stringWriter.getBuffer().toString();
    Assert.assertTrue("Tokens are wrong in " + result,
            StringUtils.find(result, "\\[Text=Dan .*PartOfSpeech=NNP Lemma=Dan NamedEntityTag=PERSON\\]"));
    Assert.assertTrue("Parses are wrong in " + result,
        result.contains("(NP (PRP He))"));
    Assert.assertTrue("Parses are wrong in " + result,
        result.contains("(VP (VBZ 's)"));
    Assert.assertTrue("Sentence header is wrong in " + result,
        result.contains("Sentence #1 (7 tokens)"));
    Assert.assertTrue("Dependencies are wrong in " + result,
        result.contains("nsubj(working-4, Ramage-2)"));


    // test XML
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    pipeline.xmlPrint(document, os);
    result = new String(os.toByteArray(), "UTF-8");
    Assert.assertTrue("XML header is wrong in " + result,
        result.startsWith("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"));
    Assert.assertTrue("XML root is wrong in " + result,
        result.contains("<?xml-stylesheet href=\"CoreNLP-to-HTML.xsl\" type=\"text/xsl\"?>"));
    Assert.assertTrue("XML word info is wrong in " + result,
            StringUtils.find(result, "<token id=\"2\">\\s*" +
                    "<word>Ramage</word>\\s*" +
                    "<lemma>Ramage</lemma>\\s*" +
                    "<CharacterOffsetBegin>4</CharacterOffsetBegin>\\s*" +
                    "<CharacterOffsetEnd>10</CharacterOffsetEnd>\\s*" +
                    "<POS>NNP</POS>\\s*" +
                    "<NER>PERSON</NER>"));
    Assert.assertTrue("XML dependencies are wrong in " + result,
            StringUtils.find(result, "<dep type=\"nn\">\\s*<governor idx=\"2\">" +
                    "Ramage</governor>\\s*<dependent idx=\"1\">Dan</dependent>\\s*</dep>"));
  }




  private static void checkNer(String message, String[][][] expected, CoreMap coremap, String coremapOutput) {
    List<CoreMap> sentences = coremap.get(CoreAnnotations.SentencesAnnotation.class);
    assertEquals(message + ": number of sentences for\n" + coremapOutput, expected.length, sentences.size());
    for (int i = 0; i < expected.length; i++) {
      CoreMap sentence = sentences.get(i);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      assertEquals(message + ": number of tokens for sentence " + (i+1) + "\n" + coremapOutput, expected[i].length, tokens.size());
      for (int j = 0; j < expected[i].length; j++) {
        String text = expected[i][j][0];
        String ner = expected[i][j][1];
        String debug = "sentence " + (i+1) + ", token " + (j+1);
        assertEquals(message + ": text mismatch for " + debug + "\n" + coremapOutput, text, tokens.get(j).word());
        assertEquals(message + ": ner mismatch for " + debug + "(" + tokens.get(j).word() + ")\n" + coremapOutput, ner, tokens.get(j).ner());
      }
    }


  }
  public void testRegexNer() throws Exception {
    // Check the regexner is integrated with the StanfordCoreNLP
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner");
    props.setProperty("regexner.ignorecase", "true");  // Maybe ignorecase should be on by default...


    String text = "Barack Obama is the 44th President of the United States.  He is the first African American president.";
    Annotation document = new Annotation(text);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);


    StringWriter stringWriter = new StringWriter();
    pipeline.prettyPrint(document, new PrintWriter(stringWriter));
    String result = stringWriter.getBuffer().toString();


    // Check the named entity types
    String[][][] expected = {
      {
        {"Barack", "PERSON"},
        {"Obama", "PERSON"},
        {"is", "O"},
        {"the", "O"},
        {"44th", "ORDINAL"},
        {"President", "TITLE"},
        {"of", "O"},
        {"the", "O"},
        {"United", "LOCATION"},
        {"States", "LOCATION"},
        {".", "O"},
      },
      {
        {"He", "O"},
        {"is", "O"},
        {"the", "O"},
        {"first", "ORDINAL"},
        {"African", "MISC"},
        {"American", "NATIONALITY"},
        {"president", "TITLE"},
        {".", "O"},
      },
    };


    checkNer("testRegexNer", expected, document, result);
  }


  public void testRelationExtractor() throws Exception {
    // Check the regexner is integrated with the StanfordCoreNLP
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,relation");
    //props.setProperty("sup.relation.model", "/home/sonalg/javanlp/tmp/roth_relation_model_pipeline.ser");
    String text = "Barack Obama, a Yale professor, is president.";
    Annotation document = new Annotation(text);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.annotate(document);
    CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    List<RelationMention> rel = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
    assertEquals(rel.get(0).getType(),"Work_For");
//    StringWriter stringWriter = new StringWriter();
//    pipeline.prettyPrint(document, new PrintWriter(stringWriter));
//    String result = stringWriter.getBuffer().toString();
//    System.out.println(result);
  }






  /* This test no longer supported. Do not mess with AnnotatorPool outside of StanfordCoreNLP */
  /*
  public void testAnnotatorPool() throws Exception {
    AnnotatorPool pool = new AnnotatorPool();
    pool.register("tokenize", new Factory<Annotator>() {
      private static final long serialVersionUID = 1L;
      public Annotator create() {
        return new Tokenize();
      }
    });
    // make sure only one Tokenize is created even with multiple pipelines
    Properties properties = this.newProperties("annotators=tokenize");
    new StanfordCoreNLP(pool, properties);
    new StanfordCoreNLP(pool, properties);
    Assert.assertEquals(1, Tokenize.N);
  }


  private static class Tokenize implements Annotator {
    public static int N = 0;
    public Tokenize() {
      ++N;
    }
    public void annotate(Annotation annotation) {
    }
  }


  private Properties newProperties(String desc) {
    Properties properties = new Properties();
    for (String nameValue: desc.split("\\s*,\\s*")) {
      String[] nameValueArray = nameValue.split("\\s*=\\s*");
      if (nameValueArray.length != 2) {
        throw new IllegalArgumentException("invalid name=value string: " + nameValue);
      }
      properties.setProperty(nameValueArray[0], nameValueArray[1]);
    }
    return properties;
  }
  */


  public void testSerialization()
    throws Exception
  {
    // Test that an annotation can be serialized and deserialized


    StanfordCoreNLP pipeline = new StanfordCoreNLP();
    Annotation document = new Annotation("Stanford University is located in California. It is a great university.");
    pipeline.annotate(document);


    CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    SemanticGraph g = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    processSerialization(g);


    processSerialization(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    processSerialization(sentence.get(CoreAnnotations.TokensAnnotation.class));
    processSerialization(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class));
    processSerialization(sentence);


    Object newDocument = processSerialization(document);
    assertTrue(newDocument instanceof Annotation);
    assertTrue(document.equals(newDocument));
  }


  public static Object processSerialization(Object input)
    throws Exception
  {
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    ObjectOutputStream oout = new ObjectOutputStream(bout);
    oout.writeObject(input);
    oout.flush();
    oout.close();


    ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray());
    ObjectInputStream oin = new ObjectInputStream(bin);
    return oin.readObject();
  }




}
Source Code of edu.stanford.nlp.pipeline.StanfordCoreNLPITest

Related Classes of edu.stanford.nlp.pipeline.StanfordCoreNLPITest