Examples of edu.umd.hooka.VocabularyWritable.addOrGet()

edu.umd.hooka.VocabularyWritable.addOrGet()

    vocab.addOrGet("mytholog");
    vocab.addOrGet("greenwood");
    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");

View Full Code Here

    vocab.addOrGet("greenwood");
    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");

View Full Code Here

    vocab.addOrGet("press");
    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");


    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};

View Full Code Here

    vocab.addOrGet("new");
    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");


    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};

View Full Code Here

    vocab.addOrGet("york");
    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");


    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
    float[] enStemExpectedOOVRates = {1f, 36/37f, 15/18.0f, 7/19f};

View Full Code Here

    vocab.addOrGet("1989");
    vocab.addOrGet("24");
    vocab.addOrGet("26");
    vocab.addOrGet("english");
    vocab.addOrGet("tree");
    vocab.addOrGet("einbaum");


    float[] enStopStemExpectedOOVRates = {1f, 18/19f, 4/7.0f, 0f};
    float[] enStopExpectedOOVRates = {1f, 18/19f, 4/7.0f, 2/12f};
    float[] enStemExpectedOOVRates = {1f, 36/37f, 15/18.0f, 7/19f};
    float[] enExpectedOOVRates = {1f, 36/37f, 15/18.0f, 9/19f};

View Full Code Here

        Pattern p = Pattern.compile("(.+)\\tentropy .+nTrans"); 
        Matcher m = p.matcher(line);
        if ( m.find() ) {
          cur = m.group(1);


          int gerIndex = srcVocab.addOrGet(cur);  
          logger.debug("Found: "+cur+" with index: "+gerIndex);




          List<PairOfIntFloat> indexProbPairs = new ArrayList<PairOfIntFloat>();
          float sumOfProbs = 0.0f;

View Full Code Here

          topTrans.clear();
          earlyTerminate = false;    // reset status
          skipTerm = false;
          prev = srcTerm;
          int prevIndex = curIndex;
          curIndex = srcVocab.addOrGet(srcTerm);
          if(curIndex <= prevIndex){
            // we've seen this foreign term before. probably due to tokenization or sorting error in aligner. just ignore.
            logger.debug("FLAG: "+line);
            curIndex = prevIndex;    // revert curIndex value since we're skipping this one
            skipTerm = true;

View Full Code Here

  }
  
  @Test
  public void testTokensAllInclusive() throws IOException{
    VocabularyWritable v = new VocabularyWritable();
    v.addOrGet("sentenc");
    v.addOrGet("token");
    
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, true, v);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", v);

View Full Code Here

  
  @Test
  public void testTokensAllInclusive() throws IOException{
    VocabularyWritable v = new VocabularyWritable();
    v.addOrGet("sentenc");
    v.addOrGet("token");
    
//    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, true, v);
    Tokenizer tokenizer = TokenizerFactory.createTokenizer("en", "data/tokenizer/en-token.bin", true, "data/tokenizer/en.stop", "data/tokenizer/en.stop.stemmed", v);


    String sentence = "This is a sentence, written in the U.S., which is \"un-tokenized\" (i.e., tokenization not performed).";

View Full Code Here

0 1 2 3 4 5 6

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.