Examples of Corpus

basic.Corpus
ch.akuhn.hapax.corpus.Corpus
Text corpus, with term frequencies for each document. Both terms and documents are identified by strings. @author Adrian Kuhn
gannuNLP.corpus.Corpus
Class for loading a corpus of SGF files for calculating some useful statistics and extending bag of words with new samples. @author Francisco Viveros-Jiménez
gate.Corpus
joshua.corpus.Corpus
Corpus is an interface that contains methods for accessing the information within a monolingual corpus. @author Chris Callison-Burch @since 7 February 2005 @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus
syntaxLearner.corpus.Corpus
Main class for search. Typically, only one exists for any language. @author Omer Shapira
uk.ac.cam.ha293.tweetlabel.types.Corpus

Examples of joshua.corpus.Corpus

  /* See Javadoc for LexicalProbabilities#lexProbTargetGivenSource(MatchedHierarchicalPhrases,int,HierarchicalPhrase). */
  public float lexProbTargetGivenSource(MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex, HierarchicalPhrase targetPhrase) {
    
    final boolean LOGGING_FINEST = logger.isLoggable(Level.FINEST);
    
    Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
    Corpus targetCorpus = parallelCorpus.getTargetCorpus();
    Alignments alignments = parallelCorpus.getAlignments();
    
    StringBuilder s;
    if (LOGGING_FINEST) {
      s = new StringBuilder();
      s.append("lexProb( ");
      s.append(sourcePhrases.getPattern().toString());
      s.append(" | ");
      s.append(targetPhrase.toString());
      s.append(" )  =  1.0");
    } else {
      s = null;
    }
    
    float targetGivenSource = 1.0f;


    // Iterate over each terminal sequence in the target phrase
    for (int seq=0; seq<targetPhrase.getNumberOfTerminalSequences(); seq++) {
      
      // Iterate over each source index in the current terminal sequence
      for (int targetWordIndex=targetPhrase.getTerminalSequenceStartIndex(seq),
            end=targetPhrase.getTerminalSequenceEndIndex(seq);
          targetWordIndex<end; 
          targetWordIndex++) {
        
        int targetWord = targetCorpus.getWordID(targetWordIndex);
        int[] sourceIndices = alignments.getAlignedSourceIndices(targetWordIndex);
        
        float sum = 0.0f;
        float average;

View Full Code Here

Examples of joshua.corpus.Corpus

      String binaryVocabFileName = joshDir + "/common.vocab";
      ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
    commonVocab.readExternal(in);
    
    String sourceFileName = joshDir + "/source.corpus";
    Corpus sourceCorpusArray = new MemoryMappedCorpusArray(commonVocab, sourceFileName);


    String targetFileName = joshDir + "/target.corpus";
    Corpus targetCorpusArray = new MemoryMappedCorpusArray(commonVocab, targetFileName);
  
    String alignmentFileName = joshDir + "/alignment.grids";
    Alignments alignments = new MemoryMappedAlignmentGrids(alignmentFileName, sourceCorpusArray, targetCorpusArray);
  
    return new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, alignments);

View Full Code Here

Examples of joshua.corpus.Corpus

    this.initializeStateComputers(symbolTable, JoshuaConfiguration.lmOrder, JoshuaConfiguration.ngramStateID);
    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading source language corpus from " +
        binarySourceCorpusFileName);
    Corpus sourceCorpusArray =
      new MemoryMappedCorpusArray(
        this.symbolTable, binarySourceCorpusFileName);
    
    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading source language suffix array from " +
        binarySourceSuffixesFileName);
    Suffixes sourceSuffixArray =
      new MemoryMappedSuffixArray(
          binarySourceSuffixesFileName,
          sourceCorpusArray,
          maxCacheSize);


    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading target language corpus from " +
        binaryTargetCorpusFileName);
    Corpus targetCorpusArray =
      new MemoryMappedCorpusArray(
        this.symbolTable, binaryTargetCorpusFileName);
    
    if (logger.isLoggable(Level.INFO))
      logger.info("Reading target language suffix array from " +

View Full Code Here

Examples of joshua.corpus.Corpus

    Vocabulary sourceVocab = new Vocabulary();
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(sourceFileName, sourceVocab, true);
    numSourceWords = sourceWordsSentences[0];
    numSourceSentences = sourceWordsSentences[1];
    
    Corpus sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceFileName, sourceVocab, numSourceWords, numSourceSentences);
    Suffixes sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, maxCacheSize);
    
    int numTargetWords, numTargetSentences;
    Vocabulary targetVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(targetFileName, targetVocab, true);
    numTargetWords = targetWordsSentences[0];
    numTargetSentences = targetWordsSentences[1];
    
    Corpus targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetFileName, targetVocab, numTargetWords, numTargetSentences);
    Suffixes targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, maxCacheSize);
    
    int trainingSize = sourceCorpusArray.getNumSentences();
    boolean requireTightSpans = true;
    Alignments alignments = new AlignmentGrids(new Scanner(new File(alignmentFileName)), sourceCorpusArray, targetCorpusArray, trainingSize, requireTightSpans);

View Full Code Here

Examples of joshua.corpus.Corpus

          // Read the shared vocabulary from its binary file, then construct the
          // source/target corpus arrays and their suffix arrays on top of it.
          // NOTE(review): `in` is not closed anywhere in the visible lines —
          // confirm it is closed further down in the enclosing method.
          Vocabulary commonVocab = new Vocabulary();
          ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName);
          commonVocab.readExternal(in);


          logger.fine("Loading source corpus...");
          Corpus sourceCorpus = new MemoryMappedCorpusArray(commonVocab, binarySourceFileName);


          logger.fine("Loading source suffix array...");
          Suffixes sourceSuffixes = new MemoryMappedSuffixArray(binarySourceSuffixesFileName, sourceCorpus);
          
          logger.fine("Loading target corpus...");    
          Corpus targetCorpus = new MemoryMappedCorpusArray(commonVocab, binaryTargetFileName);
          
          logger.fine("Loading target suffix array...");
          // NOTE(review): this passes binarySourceSuffixesFileName and sourceCorpus,
          // exactly as the source suffix array above does, even though the log
          // message and the variable name say "target". Looks like a copy-paste
          // slip; presumably it should use the target suffixes file name and
          // targetCorpus — confirm against the enclosing method's variables.
          Suffixes targetSuffixes = new MemoryMappedSuffixArray(binarySourceSuffixesFileName, sourceCorpus);


          logger.fine("Loading alignment grids...");

View Full Code Here

Examples of joshua.corpus.Corpus

    int sentenceNumber = 1;
    int endOfSentence = suffixes.getSentencePosition(sentenceNumber);


    if (logger.isLoggable(Level.FINEST)) logger.finest("END OF SENT: " + endOfSentence);


    Corpus corpus = suffixes.getCorpus();
    int endOfCorpus = corpus.size();
    
    // Start at the beginning of the corpus...
    for (int currentPosition : corpus.corpusPositions()) {
          
      // Start with a phrase length of 1, at the current position...
      for (int i = 1, endOfPhrase = currentPosition + i; 
          // ...ensure the phrase length isn't too long...
          i <= maxPhraseLength  &&

View Full Code Here

Examples of joshua.corpus.Corpus

  ) {
    
    PriorityQueue<Counted<Phrase>> frequentPhrases = new PriorityQueue<Counted<Phrase>>();
    Set<Integer> prunedFrequencies = new HashSet<Integer>();
    
    Corpus corpus = suffixes.getCorpus();
    
    FrequencyClasses frequencyClasses = getFrequencyClasses(suffixes);
    
    for (FrequencyClass frequencyClass : frequencyClasses.withMinimumFrequency(minFrequency)) {

View Full Code Here

Examples of joshua.corpus.Corpus

   * @return Longest common prefix array
   */
  protected static int[] calculateLongestCommonPrefixes(Suffixes suffixes) {


    int length = suffixes.size();
    Corpus corpus = suffixes.getCorpus();


    int[] longestCommonPrefixes = new int[length +1];
    
    // For each element in the suffix array
    for (int i = 1; i < length; i++) {
      int corpusIndex = suffixes.getCorpusIndex(i);
      int prevCorpusIndex = suffixes.getCorpusIndex(i-1);


      // Start by assuming that the two positions 
      //    don't have anything in common
      int commonPrefixSize = 0;
      
      // While the 1st position is not at the end of the corpus...
      while(corpusIndex+commonPrefixSize < length && 
          // ... and the 2nd position is not at the end of the corpus...
          prevCorpusIndex + commonPrefixSize < length &&
          // ... and the nth word at the 1st position ...
          (corpus.getWordID(corpusIndex  + commonPrefixSize) == 
            // ... is the same as the nth word at the 2nd position ...
            corpus.getWordID(prevCorpusIndex + commonPrefixSize) && 
            // ... and the length to consider isn't too long
            commonPrefixSize <= Suffixes.MAX_COMPARISON_LENGTH)) {
        
        // The two positions match for their respective nth words!
        // Increment commonPrefixSize to reflect this fact

View Full Code Here

Examples of joshua.corpus.Corpus

  
  
  private Map<Phrase,InvertedIndex> calculateInvertedIndices() {
    Map<Phrase,InvertedIndex> invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.keySet().size());
    
    Corpus corpus = suffixes.getCorpus();
    int endOfCorpus = corpus.size();
    logger.fine("Corpus has size " + endOfCorpus);
    
    int sentenceNumber = 0;
    int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
    boolean trackMe = false;
    // Start at the beginning of the corpus...
    for (int currentPosition : corpus.corpusPositions()) {
//          
      if (trackMe) 
        {
        logger.fine("At corpus position " + currentPosition);
        }

View Full Code Here

Examples of joshua.corpus.Corpus

  
  public static void main(String[] args) throws IOException, ClassNotFoundException {




    Vocabulary symbolTable;
    Corpus corpusArray;
    Suffixes suffixArray;
    FrequentPhrases frequentPhrases;


    if (args.length == 1) {

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.