// Source Code of joshua.corpus.CorpusArray

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus;


import joshua.corpus.suffix_array.SuffixArray;
import joshua.corpus.suffix_array.SuffixArrayFactory;
import joshua.corpus.vocab.ExternalizableSymbolTable;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.io.BinaryOut;


import java.io.Externalizable;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.Arrays;






/**
 * A compact int[] based representation of a corpus. The class keeps
 * all of the words in their int form in a single array. It also
 * maintains a separate int[] array that lists the start index for
 * each sentence in the corpus. This second array allows us to
 * quickly determine the source sentence of any given position in
 * the corpus using a binary search.
 *
 * @author  Josh Schroeder
 * @since  29 Dec 2004
 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
 */
public class CorpusArray extends AbstractCorpus<ExternalizableSymbolTable> implements Corpus, Externalizable {


//===============================================================
// Constants
//===============================================================




//===============================================================
// Member variables
//===============================================================


  /**
   * Stores an integer based representation of each word in
   * the corpus. The element at index <code>i</code> is the
   * word ID found at corpus position <code>i</code>.
   */
  protected int[] corpus;
  
  
  /**
   * Keeps the starting position in the corpus array for each
   * of the sentences. The length of the sentences array is
   * equal to the number of sentences in the corpus.
   * <p>
   * This array is searched with a binary search, so its
   * entries must be in ascending order.
   */
  protected int[] sentences;
  
  
  /*
   * The vocabulary which maps between the String and int
   * representation of words in the corpus is not declared in
   * this class: the symbolTable field used by the methods below
   * is inherited, and both constructors hand the vocabulary to
   * the superclass constructor. The former local declaration
   * is kept here for reference only.
   */
//  protected SymbolTable symbolTable;
  
  
//===============================================================
// Constructor(s)
//===============================================================


  /** 
   * Constructs an empty corpus.
   * <p>
   * NOTE: Primarily needed for Externalizable interface.
   */
  public CorpusArray() {
    super(new Vocabulary());
//    this.symbolTable = new Vocabulary();
    this.sentences = new int[]{};
    this.corpus = new int[]{};
  }
  
  
  /** 
   * Protected constructor takes in the already prepared
   * member variables.
   *
   * @see SuffixArrayFactory#createCorpusArray
   */
  public CorpusArray (int[] corpus, int[] sentences, ExternalizableSymbolTable vocab) {
    super(vocab);
    this.corpus = corpus;
    this.sentences = sentences;
//    this.symbolTable = vocab;
  }
  
//===============================================================
// Public
//===============================================================
  
  //===========================================================
  // Accessor methods (set/get)
  //===========================================================
  


  
  /**
   * @return the integer representation of the Word at the
   *         specified position in the corpus.
   */
  public int getWordID(int position) {
    return corpus[position];  
  }
  
  
  /**
   * @return the sentence index associated with the specified
   *         position in the corpus.
   */
  public int getSentenceIndex(int position) {
    int index = Arrays.binarySearch(sentences, position);
    // if index is positive, then the position searched
    // for is the first word of a sentence. we return
    // the exact value.
    if (index >= 0) {
        return index;
    } else {
    // otherwise, we are given an negative version of
    // the first number higher than our position. that
    // is the position of where this would be inserted
    // if it was its own sentence, so we make the number
    // positive and subtract 2 (one since since it is
    // by ith element instead of position, one to get
    // the previous index)
      return (index*(-1))-2;
    }
  }
  
  
  /**
   * @return the position in the corpus of the first word of
   *         the specified sentence. If the sentenceID is
   *         outside of the bounds of the sentences, then it
   *         returns the last position in the corpus + 1.
   */
  public int getSentencePosition(int sentenceID) {
    if (sentenceID >= sentences.length) {
      return corpus.length;
    }
    return sentences[sentenceID];
  }
  
  /**
   * Gets the exclusive end position of a sentence in the
   * corpus.
   *
   * @return the position in the corpus one past the last
   *         word of the specified sentence. If the sentenceID
   *         is outside of the bounds of the sentences, then
   *         it returns one past the last position in the
   *         corpus.
   */
  public int getSentenceEndPosition(int sentenceID) {
    if (sentenceID >= sentences.length-1) {
      return corpus.length;
    }
    return sentences[sentenceID+1];
  }
  
  /** 
   * Gets the sentence at the specified index (starting from
   * zero).
   *
   * @return the sentence, or null if the specified sentence
   *         number doesn't exist
   */
  public Phrase getSentence(int sentenceIndex) {
    if (sentenceIndex >= sentences.length) {
      return null;
    } else if (sentenceIndex == sentences.length - 1) {
      return getPhrase(sentences[sentenceIndex], corpus.length);
    } else {
      return getPhrase(sentences[sentenceIndex], sentences[sentenceIndex+1]);
    } 
  }


  
  /**
   * @return the number of words in the corpus.
   */
  public int size() {
    return corpus.length;
  }
  
  
  /**
   * @return the number of sentences in the corpus.
   */
  public int getNumSentences() {
    return sentences.length;
  }
  
  /**
   * Sets the symbol table to the provided object, and changes
   * migrates all internal data to use the new mappings
   * provided by that object.
   */
  public void setSymbolTable(ExternalizableSymbolTable vocab) {
    SymbolTable oldVocab = this.symbolTable;
    
    for (int i=0; i<corpus.length; i++) {
      
      int oldID = corpus[i];
      String word = oldVocab.getWord(oldID);
      int newID = vocab.getID(word);
      
      corpus[i] = newID;
    }
    
    this.symbolTable = vocab;
    oldVocab = null;
  }
  
  
  //===========================================================
  // Methods
  //===========================================================
  
  
  /**
   * Compares the phrase that starts at position start with
   * the subphrase indicated by the start and end points of
   * the phrase.
   *
   * @param corpusStart the point in the corpus where the
   *                    comparison begins
   * @param phrase      the superphrase that the comparsion
   *                    phrase is drawn from
   * @param phraseStart the point in the phrase where the
   *                    comparison begins (inclusive)
   * @param phraseEnd   the point in the phrase where the
   *                    comparison ends (exclusive)
   * @return an int that follows the conventions of
   *         java.util.Comparator.compareTo()
   */
  public int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd) {
    int diff = -1;
    for (int i = 0; i < phraseEnd-phraseStart; i++) {
      if (i + corpusStart >= corpus.length) {
        return -1;
      }
      diff = corpus[i+corpusStart] - phrase.getWordID(i+phraseStart);
      if (diff != 0) {
        return diff;
      }
    }
    return 0;
  }
  
  
  /**
   * compares the phrase that starts at position start with
   * the phrase passed in. Compares the entire phrase.
   */
  public int comparePhrase(int corpusStart, Phrase phrase) {
    return comparePhrase(corpusStart, phrase, 0, phrase.size());
  }
  
  public SymbolTable getVocabulary() {
    return symbolTable;
  }
  
  
  /** 
   * Compares the suffixes starting a positions index1 and
   * index2.
   *
   * @param position1 the position in the corpus where the
   *                  first suffix begins
   * @param position2 the position in the corpus where the
   *                  second suffix begins
   * @param maxComparisonLength a cutoff point to stop the
   *                            comparison
   * @return an int that follows the conventions of
   *         java.util.Comparator.compareTo()
   */
    public int compareSuffixes(int position1, int position2, int maxComparisonLength){
    for (int i = 0; i < maxComparisonLength; i++) {
      if (position1 + i < (corpus.length)
          && position2 + i >= (corpus.length)) {
        return 1;
      }
      if (position2 + i < (corpus.length)
          && position1 + i >= (corpus.length)) {
        return -1;
      }
      
      int diff;
      try {
        diff = corpus[position1 + i] - corpus[position2 + i];
      } catch (ArrayIndexOutOfBoundsException e) {
        throw new Error("Bug in CorpusArray method compareSuffixes: " + e.getMessage());
      }
      
      if (diff != 0) {
        return diff;
      }
    }
    return 0;
    }


    public void write(String corpusFilename, String vocabFilename, String charset) throws IOException {
    
      ObjectOutput vocabOut =
        new BinaryOut(new FileOutputStream(vocabFilename), true);
//        new ObjectOutputStream(new FileOutputStream(vocabFilename));
      symbolTable.setExternalizableEncoding(charset);
      symbolTable.writeExternal(vocabOut);
      vocabOut.flush();
      
      BinaryOut corpusOut = new BinaryOut(new FileOutputStream(corpusFilename), false);
      this.writeExternal(corpusOut);  
      corpusOut.flush();
      
    }
  
  public ContiguousPhrase getPhrase(int startPosition, int endPosition) {
    return new ContiguousPhrase(startPosition, endPosition, this);
  }
  
  
//===============================================================
// Private 
//===============================================================
  
  //===============================================================
  // Methods
  //===============================================================
  
  
//===============================================================
// Static
//===============================================================




//===============================================================
// Main
//===============================================================




  public static void main(String[] args) throws Exception {
    
    if (args.length < 4) {
      System.err.println("Usage: java " + SuffixArray.class.getName() + " corpus vocab.jbin corpus.bin");
      System.exit(0);
    }
    
    String corpusFileName = args[0];
    String binaryVocabFilename = args[1];
    String binaryCorpusFilename = args[2];
    String charset = (args.length > 3) ? args[3] : "UTF-8";
    
    Vocabulary symbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);
    
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);
    
    corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset);
    
  }


  public void readExternal(ObjectInput in) throws IOException,
      ClassNotFoundException {
    
    // Read the vocabulary
    symbolTable.readExternal(in);
    
    int numSentences = in.readInt();
    this.sentences = new int[numSentences];
    for (int i=0; i<numSentences; i++) {
      this.sentences[i] = in.readInt();
    }
    
    int numWords = in.readInt();
    this.corpus = new int[numWords];
    for (int i=0; i<numWords; i++) {
      this.corpus[i] = in.readInt();
    }
    
  }


  public void writeExternal(ObjectOutput out) throws IOException {
    
    // Write the vocabulary
    out.writeObject(symbolTable);
    
    out.writeInt(sentences.length);
    for (int sentencePosition : sentences) {
      out.writeInt(sentencePosition);
    }
    
    out.writeInt(corpus.length);
    for (int word : corpus) {
      out.writeInt(word);
    }
    
  }


//  static final long serialVersionUID = 1L;
}
// Source Code of joshua.corpus.CorpusArray
//
// Related Classes of joshua.corpus.CorpusArray