Examples of ivory.core.tokenize.Tokenizer

ivory.core.tokenize.Tokenizer

  }


  // Reads from the special wiki format created by Smith et al. as part of their 2010 paper.
  private void readWikiSentences(String eReadFile, String fReadFile, String pairsFile, String eLang, String fLang,
      Vocab eVocab, Vocab fVocab, String fToken, String eToken, String fStopwordsFile, String eStopwordsFile) {
    Tokenizer eTokenizer = TokenizerFactory.createTokenizer(eLang, eToken, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);
    Tokenizer fTokenizer = TokenizerFactory.createTokenizer(fLang, fToken, true, fStopwordsFile, fStopwordsFile + ".stemmed", null);


    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));

View Full Code Here


  // regular 1 sentence per line, 1 sentence per doc format
  private void readSentences(int sentsPerDoc, String eReadFile, String fReadFile, String eLang, String fLang,
      String fToken, String eToken, String fStopwordsFile, String eStopwordsFile) throws IOException,
      ClassNotFoundException, InstantiationException, IllegalAccessException {
    Tokenizer eTokenizer = TokenizerFactory.createTokenizer(eLang, eToken, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);
    Tokenizer fTokenizer = TokenizerFactory.createTokenizer(fLang, fToken, true, fStopwordsFile, fStopwordsFile + ".stemmed", null);


    float sumFLengs = 0, sumELengs = 0;


    try {
      BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
      BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
      HMapSIW fDoc = new HMapSIW();
      HMapSIW eDoc = new HMapSIW();
      String eLine = null, fLine = null;
      int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0;


      while ((eLine = dis1.readLine()) != null) {
        fLine = dis2.readLine().trim();
        eLine = eLine.trim();


        String[] tokens = fTokenizer.processContent(fLine);      
        lastDocLenF += tokens.length;


        for (String token : tokens) {
          if (!fDoc.containsKey(token)) { // if this is first time we saw token in this sentence
            dfD.increment(token);

View Full Code Here

      eVocabTrg = HadoopAlign.loadVocab(new Path(eVocabTrgFile), localFs);
      fVocabSrc = HadoopAlign.loadVocab(new Path(fVocabSrcFile), localFs);
      fVocabTrg = HadoopAlign.loadVocab(new Path(fVocabTrgFile), localFs);
      f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablef2eFile), true);
      e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(probTablee2fFile), true);
      Tokenizer fTokenizer = TokenizerFactory.createTokenizer(localFs, fLang, fTokenFile, false);
      Tokenizer eTokenizer = TokenizerFactory.createTokenizer(localFs, eLang, eTokenFile, false);
      long startTime = System.currentTimeMillis(); 


      if (pairsFile == null) {
        readSentences(1, eFile, fFile, eLang, fLang,
            fTokenFile, eTokenFile, fStopwordsFile, eStopwordsFile);

View Full Code Here

0 1

TOP

Related Classes of ivory.core.tokenize.Tokenizer

ivory.lsh.eval.BitextClassifierUtils

org.apache.commons.cli.CommandLine

org.apache.commons.cli.CommandLineParser

org.apache.commons.cli.GnuParser

org.apache.commons.cli.HelpFormatter

org.apache.commons.cli.Options

org.apache.hadoop.fs.FSDataInputStream

org.apache.hadoop.fs.Path

org.apache.lucene.analysis.tokenattributes.CharTermAttribute

java.lang.UnsupportedOperationException

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.