Source Code of ivory.core.tokenize.OpenNLPEnTokenizerTest

package ivory.core.tokenize;


import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import junit.framework.JUnit4TestAdapter;
import org.junit.Test;
import edu.umd.hooka.VocabularyWritable;


public class OpenNLPEnTokenizerTest {


  @Test
  public void testTokensNoStemNoStopNoVocab() throws IOException{
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", false, false, null);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", false, null, null, null);


    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";
    String[] expectedTokens = new String[] {"this", "is", "a", "sentence", ",", "written", "in", "the", "u.s.", ",", "which", "is", "\"", "un", "-", "tokenized", "\"", "(", "i", ".e", ".", ",", "tokenization", "not", "performed", ")", "."};


    String[] tokens = tokenizer.processContent(sentence);
    verifyTokens(tokens, expectedTokens);
  }
  
  @Test
  public void testTokensNoStemNoVocab() throws IOException{
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", false, true, null);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", false, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", null);


    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";
    String[] expectedTokens = new String[] {"sentence", "written", "u.s.", "un", "tokenized", ".e", "tokenization", "performed"};


    String[] tokens = tokenizer.processContent(sentence);
    verifyTokens(tokens, expectedTokens);
  }
  
  @Test
  public void testTokensNoVocab() throws IOException{
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, true, null);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", null);


    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";
    String[] expectedTokens = new String[] {"sentenc", "written", "u.s.", "un", "token", ".e", "token", "perform"};


    String[] tokens = tokenizer.processContent(sentence);
    verifyTokens(tokens, expectedTokens);
  }
  
  @Test
  public void testTokensAllInclusive() throws IOException{
    VocabularyWritable v = new VocabularyWritable();
    v.addOrGet("sentenc");
    v.addOrGet("token");
    
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, true, v);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", v);


    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";
    String[] expectedTokens = new String[] {"sentenc", "token", "token"};


    String[] tokens = tokenizer.processContent(sentence);
    verifyTokens(tokens, expectedTokens);
  }


  @Test
  public void testPreNormalize() throws IOException{
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", null);


    List<String> sentences = readInput("etc/tokenizer-normalize-test.txt");
    int cnt = 0;
    for (String sentence : sentences) {
      String[] tokens = tokenizer.processContent(sentence);
      System.out.println("Tokenizing line " + cnt);
      System.out.println("Original:" + sentence);
      System.out.println("Tokens:" + Arrays.asList(tokens));
      
      if (cnt < 4) {
        assertTrue("number of tokens < 2 => " + tokens.length, tokens.length == 2);
        assertEquals(tokens[0] + "!= world", tokens[0], "world");
        assertEquals(tokens[1] + "!= 's", tokens[1], "'s");
      }else {
        assertTrue("number of tokens < 1 => " + tokens.length, tokens.length == 1);
        assertEquals(tokens[0] + "!= world", tokens[0], "world");
      }
      cnt++;
    }
  }
  
  @Test
  public void testStopword() throws IOException{
    // ALl of the following should properly handle arguments and behave as if not stopword list was provided
    Tokenizer tokenizer1 = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", null);
    Tokenizer tokenizer2 = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, null, null, null);
    Tokenizer tokenizer3 = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "xxxx", "yyyy", null);
    Tokenizer tokenizer4 = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", false, null, null, null);
    Tokenizer tokenizer5 = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", false, "xxxx", "yyyy", null);
    
    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";
    String[] expectedTokens1 = new String[] {"this", "is", "a", "sentenc", ",", "written", "in", "the", "u.s.", ",", "which", "is", "\"", "un", "-", "token", "\"", "(", "i", ".e", ".", ",", "token", "not", "perform", ")", "."};
    String[] expectedTokens2 = new String[] {"this", "is", "a", "sentence", ",", "written", "in", "the", "u.s.", ",", "which", "is", "\"", "un", "-", "tokenized", "\"", "(", "i", ".e", ".", ",", "tokenization", "not", "performed", ")", "."};


    String[] tokens1 = tokenizer1.processContent(sentence);
    String[] tokens2 = tokenizer2.processContent(sentence);
    String[] tokens3 = tokenizer3.processContent(sentence);
    String[] tokens4 = tokenizer4.processContent(sentence);
    String[] tokens5 = tokenizer5.processContent(sentence);


    verifyTokens(tokens1, expectedTokens1);
    verifyTokens(tokens2, expectedTokens1);
    verifyTokens(tokens3, expectedTokens1);
    verifyTokens(tokens4, expectedTokens2);
    verifyTokens(tokens5, expectedTokens2);
  }
  
  private void verifyTokens(String[] tokens, String[] expectedTokens) {
    int tokenCnt = 0;
    for (String token : tokens) {
      System.out.println("Token "+tokenCnt+":"+token);
      assertTrue("token "+tokenCnt+"="+token+"!="+expectedTokens[tokenCnt], token.equals(expectedTokens[tokenCnt]));
      tokenCnt++;
    }    
  }
  
  private List<String> readInput(String file) {
    List<String> lines = new ArrayList<String>();
    try {
      FileInputStream fis = new FileInputStream(file);
      InputStreamReader isr = new InputStreamReader(fis, "UTF8");
      BufferedReader in = new BufferedReader(isr);
      String line;


      while ((line = in.readLine()) != null) {
        lines.add(line);
      }
      in.close();
      return lines;
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
  
  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(OpenNLPEnTokenizerTest.class);
  }
}
Source Code of ivory.core.tokenize.OpenNLPEnTokenizerTest

Related Classes of ivory.core.tokenize.OpenNLPEnTokenizerTest