Package edu.umd.hooka

Examples of edu.umd.hooka.VocabularyWritable.addOrGet()


    vocab.addOrGet("mytholog");
    vocab.addOrGet("greenwood");
    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");
View Full Code Here


    vocab.addOrGet("greenwood");
    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");
View Full Code Here

    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");

    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
View Full Code Here

    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");

    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
View Full Code Here

    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");

    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
    float[] enStemExpectedOOVRates = {1f, 36/37f, 15/18.0f, 7/19f};
View Full Code Here

    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");

    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
    float[] enStemExpectedOOVRates = {1f, 36/37f, 15/18.0f, 7/19f};
    float[] enExpectedOOVRates = {1f, 36/37f, 15/18.0f, 9/19f};
View Full Code Here

        Pattern p = Pattern.compile("(.+)\\tentropy .+nTrans");
        Matcher m = p.matcher(line);
        if ( m.find() ) {
          cur = m.group(1);

          int gerIndex = srcVocab.addOrGet(cur)
          logger.debug("Found: "+cur+" with index: "+gerIndex);


          List<PairOfIntFloat> indexProbPairs = new ArrayList<PairOfIntFloat>();
          float sumOfProbs = 0.0f;
View Full Code Here

          topTrans.clear();
          earlyTerminate = false;    // reset status
          skipTerm = false;
          prev = srcTerm;
          int prevIndex = curIndex;
          curIndex = srcVocab.addOrGet(srcTerm);
          if(curIndex <= prevIndex){
            // we've seen this foreign term before. probably due to tokenization or sorting error in aligner. just ignore.
            logger.debug("FLAG: "+line);
            curIndex = prevIndex;    // revert curIndex value since we're skipping this one
            skipTerm = true;
View Full Code Here

  }
 
  @Test
  public void testTokensAllInclusive() throws IOException{
    VocabularyWritable v = new VocabularyWritable();
    v.addOrGet("sentenc");
    v.addOrGet("token");
   
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, true, v);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", v);
View Full Code Here

 
  @Test
  public void testTokensAllInclusive() throws IOException{
    VocabularyWritable v = new VocabularyWritable();
    v.addOrGet("sentenc");
    v.addOrGet("token");
   
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, true, v);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", v);

    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.