Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.ParallelCorpusGrammarFactory

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import java.util.ArrayList;

import joshua.corpus.AlignedParallelCorpus;
import joshua.corpus.Phrase;
import joshua.corpus.RuleExtractor;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexProbs;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.tm.Grammar;
import joshua.decoder.ff.tm.GrammarFactory;
import joshua.prefix_tree.HierarchicalRuleExtractor;
import joshua.prefix_tree.PrefixTree;

/**
* Aligned parallel corpus, capable of extracting a sentence-specific
* translation grammar.
* <p>
* The source side of the aligned parallel corpus is backed by a
* suffix array.
*
* @author Lane Schwartz
*/
public class ParallelCorpusGrammarFactory extends AlignedParallelCorpus implements GrammarFactory {

  /** Source language corpus, represented as a suffix array. */
  private final Suffixes sourceSuffixArray;
 
  /** Lexical translation probability table. */
  private final LexicalProbabilities lexProbs;
 
  /** Responsible for extracting translation rules from a parallel corpus. */
  private final RuleExtractor ruleExtractor;
 
  /**
   * Max span in the source corpus of any extracted hierarchical
   * phrase
   */
  private final int maxPhraseSpan;
 
  /**
   * Maximum number of terminals plus nonterminals allowed
   * in any extracted hierarchical phrase.
   */
  private final int maxPhraseLength;
 
  /**
   * Maximum number of nonterminals allowed on the
   * right-hand side of any extracted rule
   */
  private final int maxNonterminals;
 
  /**
   * Minimum span in the source corpus of any
   * nonterminal in an extracted hierarchical phrase.
   */                          
  private final int minNonterminalSpan;
 
 
  private final String ruleOwner;
 
  private final String defaultLHSSymbol;
 
  private final float oovFeatureCost;
 
  /**
   * Constructs a factory capable of getting a grammar backed
   * by a suffix array.
   *
   * @param sourceSuffixArray Source language corpus,
   *                          represented as a suffix array
   * @param targetSuffixArray Target language corpus
   *                          represented as a suffix array
   * @param alignments        Parallel corpus alignment points
   * @param maxPhraseSpan     Max span in the source corpus of any
   *                          extracted hierarchical phrase
   * @param maxPhraseLength   Maximum number of terminals plus nonterminals
   *                          allowed in any extracted hierarchical phrase
   * @param maxNonterminals   Maximum number of nonterminals allowed on the
   *                          right-hand side of any extracted rule
   * @param ruleOwner     Specifies a name identifier for this grammar
   * @param defaultLHSSymbol TODO
   * @param oovFeatureCost TODO
   */
  public ParallelCorpusGrammarFactory(
      Suffixes sourceSuffixArray,
      Suffixes targetSuffixArray,
      Alignments alignments,
      ArrayList<FeatureFunction> models,
      int sampleSize,
      int maxPhraseSpan,
      int maxPhraseLength,
      int maxNonterminals,
      int minNonterminalSpan,
      float lexProbFloor,
      String ruleOwner,
      String defaultLHSSymbol,
      float oovFeatureCost) {
   
    super((sourceSuffixArray==null)?null:sourceSuffixArray.getCorpus(),
        (targetSuffixArray==null)?null:targetSuffixArray.getCorpus(),
        alignments);
    this.sourceSuffixArray = sourceSuffixArray;
    this.maxPhraseSpan     = maxPhraseSpan;
    this.maxPhraseLength   = maxPhraseLength;
    this.maxNonterminals   = maxNonterminals;
    this.minNonterminalSpan = minNonterminalSpan;
    this.lexProbs          = new LexProbs(this,lexProbFloor);
    this.ruleOwner = ruleOwner;
    this.defaultLHSSymbol = defaultLHSSymbol;
    this.oovFeatureCost = oovFeatureCost;
   
    int maxNonterminalSpan = maxPhraseSpan;
   
    this.ruleExtractor =
      new HierarchicalRuleExtractor(
          sourceSuffixArray,
          targetSuffixArray,
          alignments,
          lexProbs,
          models,
          sampleSize,
          maxPhraseSpan,
          maxPhraseLength,
//          maxNonterminals,
//          minNonterminalSpan
          minNonterminalSpan,
          maxNonterminalSpan
        )
   
    /*
  public HierarchicalRuleExtractor(
      Suffixes suffixArray,
      Corpus targetCorpus,
      Alignments alignments,
      LexicalProbabilities lexProbs,
      int sampleSize,
      int maxPhraseSpan,
      int maxPhraseLength,
      int minNonterminalSpan,
      int maxNonterminalSpan)
    
     */
  }
 
 
  /**
   * Constructs a factory capable of getting a grammar backed
   * by a suffix array.
   *
   * @param sourceSuffixArray Source language corpus,
   *                          represented as a suffix array
   * @param targetSuffixArray Target language corpus
   *                          represented as a suffix array
   * @param alignments        Parallel corpus alignment points
   * @param maxPhraseSpan     Max span in the source corpus of any
   *                          extracted hierarchical phrase
   * @param maxPhraseLength   Maximum number of terminals plus nonterminals
   *                          allowed in any extracted hierarchical phrase
   * @param maxNonterminals   Maximum number of nonterminals allowed on the
   *                          right-hand side of any extracted rule
   * @param ruleOwner     Specifies a name identifier for this grammar
   * @param defaultLHSSymbol TODO
   * @param oovFeatureCost TODO
   */
  public ParallelCorpusGrammarFactory(
      Suffixes sourceSuffixArray,
      Suffixes targetSuffixArray,
      Alignments alignments,
      ArrayList<FeatureFunction> models,
      String lexCountsFilename,
      int sampleSize,
      int maxPhraseSpan,
      int maxPhraseLength,
      int maxNonterminals,
      int minNonterminalSpan, 
      String ruleOwner,
      String defaultLHSSymbol,
      float oovFeatureCost) {
   
    super((sourceSuffixArray==null)?null:sourceSuffixArray.getCorpus(),
        (targetSuffixArray==null)?null:targetSuffixArray.getCorpus(),
        alignments);
    this.sourceSuffixArray = sourceSuffixArray;
    this.maxPhraseSpan     = maxPhraseSpan;
    this.maxPhraseLength   = maxPhraseLength;
    this.maxNonterminals   = maxNonterminals;
    this.minNonterminalSpan = minNonterminalSpan;
    this.lexProbs          = new LexProbs(this, lexCountsFilename);
    this.ruleOwner = ruleOwner;
    this.defaultLHSSymbol = defaultLHSSymbol;
    this.oovFeatureCost = oovFeatureCost;
   
    int maxNonterminalSpan = maxPhraseSpan;
   
    this.ruleExtractor =
      new HierarchicalRuleExtractor(
          sourceSuffixArray,
          targetSuffixArray,
          alignments,
          lexProbs,
          models,
          sampleSize,
          maxPhraseSpan,
          maxPhraseLength,
//          maxNonterminals,
//          minNonterminalSpan
          minNonterminalSpan,
          maxNonterminalSpan
        )
   
    /*
  public HierarchicalRuleExtractor(
      Suffixes suffixArray,
      Corpus targetCorpus,
      Alignments alignments,
      LexicalProbabilities lexProbs,
      int sampleSize,
      int maxPhraseSpan,
      int maxPhraseLength,
      int minNonterminalSpan,
      int maxNonterminalSpan)
    
     */
  }
 
 
  /**
   * Extracts a grammar which contains only those rules
   * relevant for translating the specified sentence.
   *
   * @param sentence A sentence to be translated
   * @return a grammar, structured as a trie, that represents
   *         a set of translation rules
   */
  public Grammar getGrammarForSentence(Phrase sentence) {
   
    int[] words = new int[sentence.size()];
    for (int i = 0; i < words.length; i++) {
      words[i] = sentence.getWordID(i);
    }
   
    PrefixTree prefixTree = new PrefixTree(
//        sourceSuffixArray, targetCorpus, alignments,
//        sourceSuffixArray.getVocabulary(), lexProbs, ruleExtractor,
        this);
   
    prefixTree.add(words);
   
    return prefixTree;
//    return prefixTree.getRoot();
  }
 
  /**
   * Gets the source side suffix array.
   *
   * @return the source side suffix array
   */
  public Suffixes getSuffixArray() {
    return this.sourceSuffixArray;
  }
 
  /**
   * Gets the rule extractor.
   *
   * @return the rule extractor
   */
  public RuleExtractor getRuleExtractor() {
    return this.ruleExtractor;
  }
 
  public LexicalProbabilities getLexProbs() {
    return this.lexProbs;
  }
 
  /**
   * Max span in the source corpus of any extracted hierarchical
   * phrase
   */
  public int getMaxPhraseSpan() {
    return this.maxPhraseSpan;
  }
 
  /**
   * Maximum number of terminals plus nonterminals allowed
   * in any extracted hierarchical phrase.
   */
  public int getMaxPhraseLength() {
    return this.maxPhraseLength;
  }
 
  /**
   * Maximum number of nonterminals allowed on the
   * right-hand side of any extracted rule
   */
  public int getMaxNonterminals() {
    return this.maxNonterminals;
  }
 
  /**
   * Minimum span in the source corpus of any
   * nonterminal in an extracted hierarchical phrase.
   */                          
  public int getMinNonterminalSpan() {
    return this.minNonterminalSpan;
  }
 
  public String getRuleOwner() {
    return this.ruleOwner;
  }
 
  public String getDefaultLHSSymbol() {
    return this.defaultLHSSymbol;
  }
 
  public float getOovFeatureCost() {
    return this.oovFeatureCost;
  }
}
TOP

Related Classes of joshua.corpus.suffix_array.ParallelCorpusGrammarFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.