Package joshua.prefix_tree

Source Code of joshua.prefix_tree.PrefixTree

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.prefix_tree;

import joshua.corpus.Corpus;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.RuleExtractor;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.HierarchicalPhrases;
import joshua.corpus.suffix_array.ParallelCorpusGrammarFactory;
import joshua.corpus.suffix_array.Pattern;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.tm.AbstractGrammar;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.Trie;
import joshua.decoder.ff.tm.hiero.MemoryBasedBatchGrammar;
import joshua.util.Cache;

import java.io.PrintStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* Represents a prefix tree with suffix links, for use in extracting
* hierarchical phrase-based statistical translation rules.
*
* @author Lane Schwartz
* @version $LastChangedDate:2008-11-13 13:13:31 -0600 (Thu, 13 Nov 2008) $
*/
public class PrefixTree extends AbstractGrammar {

  /** Logger for this class. */
  private static final Logger logger = Logger.getLogger(PrefixTree.class.getName());

  /**
   * Integer representation of the nonterminal X.
   * All nonterminals are guaranteed to be represented by negative integers.
   */
  public static final int X = SymbolTable.X;//-1;
 
  /** Operating system-specific end of line character(s). */
  static final byte[] newline = System.getProperty("line.separator").getBytes();
 
  /** Root node of this tree. */
  final RootNode root;

  /**
   * Responsible for performing sampling and creating translation
   * rules.
   */
  final RuleExtractor ruleExtractor;
 
  /**
   * Max span in the source corpus of any extracted hierarchical
   * phrase.
   */
  final int maxPhraseSpan;  
 
  /**
   * Maximum number of terminals plus nonterminals allowed
   * in any extracted hierarchical phrase.
   */
  final int maxPhraseLength;
 
  /**
   * Maximum number of nonterminals allowed in any extracted
   * hierarchical phrase.
   */
  final int maxNonterminals;

  /**
   * Minimum span in the source corpus of any nonterminal in
   * an extracted hierarchical phrase.
   */
  final int minNonterminalSpan;
 
 
  /**
   * Represents a very high cost, corresponding to a very
   * unlikely probability.
   */
  static final float VERY_UNLIKELY = -1.0f * (float) Math.log(1.0e-9);
 
  /**
   * Indicates whether rules with an initial source-side
   * nonterminal should be extracted from phrases at the start
   * of a sentence, even though such rules do not have
   * supporting corporal evidence.
   * <p>
   * This is included for compatibility with Adam Lopez's
   * Hiero rule extractor, in which this setting is set to
   * <code>true</code>.
   * <p>
   * The default value is <code>false</code>.
   */
  boolean sentenceInitialX = false;
 
  /**
   * Indicates whether rules with a final source-side nonterminal
   * should be extracted from phrases at the end of a sentence,
   * even though such rules do not have supporting corporal
   * evidence.
   * <p>
   * This is included for compatibility with Adam Lopez's
   * Hiero rule extractor, in which this setting is set to
   * <code>true</code>.
   * <p>
   * The default value is <code>false</code>.
   */
  boolean sentenceFinalX = false;
 
 
  boolean edgeXMayViolatePhraseSpan = false;
 
 
  /** Unique integer identifier for the root node. */
  static final int ROOT_NODE_ID = -999;
 
  /**
   * Unique integer identifier for the special ⊥ node
   * that represents the suffix of the root node.
   * @see Lopez (2008), footnote 9 on p73
   */
  static final int BOT_NODE_ID = 0;//-2000;

  /** Suffix array representing the source language corpus. */
  final Suffixes suffixArray;
 
  /** Corpus array representing the target language corpus. */
  final Corpus targetCorpus;
 
  /** */
  final ParallelCorpusGrammarFactory parallelCorpus;
 
  /**
   * Represents alignments between words in the source corpus
   * and the target corpus.
   */
  final Alignments alignments;
 
  /** Lexical translation probabilities. */
  final LexicalProbabilities lexProbs;
 
  /** Symbol table */
  final SymbolTable vocab;
 
  /** Empty pattern */
  final Pattern epsilon;
 
  /**
   * Node representing phrases that start with the nonterminal
   * X. This node's parent is the root node of the tree.
   */
  private final Node xnode;

  private Set<Integer> printedNodes = null;
 
  private Map<Integer,String> ntVocab;
 
  private PrintStream out = null;
 
  private final int ruleOwner;
 
  private final int defaultLHS;
 
  private final float oovFeatureCost;
 
  /**
   * Constructs a new prefix tree with suffix links using the
   * GENERATE_PREFIX_TREE algorithm from Lopez (2008) PhD
   * Thesis, Algorithm 2, p 76.
   *
   * @param parallelCorpus
   */
  public PrefixTree(ParallelCorpusGrammarFactory parallelCorpus) {

   
    if (logger.isLoggable(Level.FINER)) logger.finer("\n\n\nConstructing new PrefixTree\n\n");

    this.parallelCorpus = parallelCorpus;
    this.suffixArray = parallelCorpus.getSuffixArray();
    this.targetCorpus = parallelCorpus.getTargetCorpus();
    this.alignments = parallelCorpus.getAlignments();
    this.lexProbs = parallelCorpus.getLexProbs();
    this.ruleExtractor = parallelCorpus.getRuleExtractor();
    this.maxPhraseSpan = parallelCorpus.getMaxPhraseSpan();
    this.maxPhraseLength = parallelCorpus.getMaxPhraseLength();
    this.maxNonterminals = parallelCorpus.getMaxNonterminals();
    this.minNonterminalSpan = parallelCorpus.getMinNonterminalSpan();
    this.vocab = parallelCorpus.getSourceCorpus().getVocabulary();
    this.ruleOwner = vocab.getID(parallelCorpus.getRuleOwner());
    this.defaultLHS = vocab.getID(parallelCorpus.getDefaultLHSSymbol());
    this.oovFeatureCost = parallelCorpus.getOovFeatureCost();
   
    this.root = new RootNode(this,ROOT_NODE_ID);
    Node bot = new BotNode(parallelCorpus, root);
    this.root.linkToSuffix(bot);

    this.ntVocab = new HashMap<Integer,String>();
    ntVocab.put(PrefixTree.X, "X");
   
////    if (suffixArray==null) {
//////      vocab = null;
////    } else {
//    if (suffixArray != null) {
////      vocab = suffixArray.getVocabulary();
//      //int[] bounds = {0, suffixArray.size()-1};
//      root.setBounds(0, suffixArray.size()-1);
//    }
//    root.sourceHierarchicalPhrases = HierarchicalPhrases.emptyList(vocab);

    // Define epsilon to be an empty pattern
    epsilon = new Pattern(vocab);

   
    // 1: children(p_eps) <-- children(p_eps) U p_x

    if (maxNonterminals > 0) {  // Create and set up the X node that comes off of ROOT
     
      // Add a link from root node to X
      xnode = root.addChild(X);

      // Add a suffix link from X back to root
      Node suffixLink = root.calculateSuffixLink(X);

      if (logger.isLoggable(Level.FINEST)) {
        String oldSuffixLink = (xnode.suffixLink==null) ? "null" : "id"+xnode.suffixLink.objectID;
        String newSuffixLink = (suffixLink==null) ? "null" : "id"+suffixLink.objectID;
        logger.finest("Changing suffix link from " + oldSuffixLink + " to " + newSuffixLink + " for node " + xnode.toShortString(vocab) + " with token " + X);
      }

      xnode.linkToSuffix(suffixLink);
    } else {
      this.xnode = null;
    }

    if (logger.isLoggable(Level.FINEST)) logger.finest("CURRENT TREE:  " + root);

  }

  /**
   * Constructs a new prefix tree with suffix links using the
   * GENERATE_PREFIX_TREE algorithm from Lopez (2008) PhD
   * Thesis, Algorithm 2, p 76.
   * <p>
   * This constructor does not take a suffix array parameter.
   * Instead any prefix tree constructed by this constructor
   * will assume that all possible phrases of this sentence
   * are valid phrases.
   * <p>
   * This constructor is meant to be used primarily for testing
   * purposes.
   *
   * @param sentence
   * @param maxPhraseSpan
   * @param maxPhraseLength
   * @param maxNonterminals
   */
  PrefixTree(SymbolTable vocab, int maxPhraseSpan, int maxPhraseLength, int maxNonterminals) {
    this(new ParallelCorpusGrammarFactory((Suffixes) null, (Suffixes) null, (Alignments) null, null, Integer.MAX_VALUE, maxPhraseSpan, maxPhraseLength, maxNonterminals, 2, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost));
  }


  /**
   * Sets a print stream to which newly extracted rules will be written.
   *
   * @param out a print stream
   *            to which newly extracted rules will be written
   */
  public void setPrintStream(PrintStream out) {
    logger.info("Setting output stream");
    this.out = out;
    this.printedNodes = new HashSet<Integer>();
  }
 
  /**
   * Modify this prefix tree by adding phrases for this
   * sentence.
   *
   * @param sentence
   */
  public void add(int[] sentence) {
   
    long startTime = System.nanoTime();
   
    int START_OF_SENTENCE = 0;
    int END_OF_SENTENCE = sentence.length - 1;
   
    Queue<Tuple> queue = new LinkedList<Tuple>();

    if (logger.isLoggable(Level.FINER)) logger.finer("Last sentence index == I == " + END_OF_SENTENCE);

    // 2: for i from 1 to I
    for (int i=START_OF_SENTENCE; i<=END_OF_SENTENCE; i++) {
      //if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (" + i + ","+ i +","+root+",{"+intToString(sentence[i])+"})");
      if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (\u03b5," + i + ","+ i +","+root.toShortString(vocab) +")");
     
      // 3: Add <f_i, i, i+1, p_eps> to queue
      queue.add(new Tuple(epsilon, i, i, root));
    }

    if (this.maxNonterminals > 0) {  Pattern xpattern = new Pattern(vocab,X);
     
      int start = START_OF_SENTENCE;
      if (!sentenceInitialX) start += 1;
   
      // 4: for i from 1 to I
      for (int i=start; i<=END_OF_SENTENCE; i++) {
        //if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (" + (i-1) + ","+(i)+","+root+",{"+X+","+intToString(sentence[i])+"})");
        if (logger.isLoggable(Level.FINEST)) logger.finest("Adding tuple (X," + (i-1) + ","+ i +","+xnode.toShortString(vocab) +")");
       
        // 5: Add <X f_i, i-1, i+1, p_x> to queue
        if (edgeXMayViolatePhraseSpan) {
          queue.add(new Tuple(xpattern, i, i, xnode))
        } else {
          queue.add(new Tuple(xpattern, i-1, i, xnode));
        }
      }
    }


    // 6: While queue is not empty do
    while (! queue.isEmpty()) {

      if (logger.isLoggable(Level.FINER)) {
        logger.finer("\n");
        if (logger.isLoggable(Level.FINEST)) logger.finest("CURRENT TREE:      " + root);
      }
     
      // 7: Pop <alpha, i, j, p_alphaBeta> from queue
      Tuple tuple = queue.remove();

      int i = tuple.spanStart;
      int j = tuple.spanEnd;
      Node prefixNode = tuple.prefixNode;
      Pattern prefixPattern = tuple.pattern;

//      if (prefixNode.objectID==329 //) {
//          || (prefixNode.objectID==28 && i==13 && j==17)) {
//        int x = -1;
//        x++;
//      }
     
      if (logger.isLoggable(Level.FINER)) logger.finer("Have tuple (" +prefixPattern+","+ i + ","+j+","+prefixNode.toShortString(vocab)+")");

      if (j <= END_OF_SENTENCE) {

        // 8: If p_alphaBetaF_i elementOf children(p_alphaBeta) then
        if (prefixNode.hasChild(sentence[j])) {

          if (logger.isLoggable(Level.FINER)) logger.finer("EXISTING node for \"" + sentence[j] + "\" from " + prefixNode.toShortString(vocab) + " to node " + prefixNode.getChild(sentence[j]).toShortString(vocab) + " with pattern " + prefixPattern);

          // child is p_alphaBetaF_j
          Node child = prefixNode.getChild(sentence[j]);
         
          // 9: If p_alphaBetaF_j is inactive then
          if (! child.active) {
           
            // 10: Continue to next item in queue
            continue;
           
            // 11: Else
          } else {
           
            // 12: EXTEND_QUEUE(alpha beta f_j, i, j, f_1^I)
            if (logger.isLoggable(Level.FINER)) {
              logger.finer("Calling EXTEND_QUEUE("+i+","+j+","+prefixPattern+","+prefixNode.toShortString(vocab));
              if (logger.isLoggable(Level.FINEST)) logger.finest("TREE BEFOR EXTEND: " + root);
            }
            extendQueue(queue, i, j, sentence, new Pattern(prefixPattern,sentence[j]), child);
            if (logger.isLoggable(Level.FINEST)) logger.finest("TREE AFTER EXTEND: " + root);
           
          }

        } else { // 13: Else

          // 14: children(alphaBeta) <-- children(alphaBeta) U p_alphaBetaF_j
          //     (Add new child node)
          if (logger.isLoggable(Level.FINER)) logger.finer("Adding new node to node " + prefixNode.toShortString(vocab));
          Node newNode = prefixNode.addChild(sentence[j]);
          if (logger.isLoggable(Level.FINER)) {
            String word = (suffixArray==null) ? ""+sentence[j] : suffixArray.getVocabulary().getWord(sentence[j]);
            logger.finer("Created new node " + newNode.toShortString(vocab) +" for \"" + word + "\" and \n  added it to " + prefixNode.toShortString(vocab));
          }


          // 15: p_beta <-- suffix_link(p_alpha_beta)
          //     suffixNode in this code is p_beta_f_j, not p_beta
          Node suffixNode = prefixNode.calculateSuffixLink(sentence[j]);

          if (logger.isLoggable(Level.FINEST)) {
            String oldSuffixLink = (newNode.suffixLink==null) ? "null" : "id"+newNode.suffixLink.objectID;
            String newSuffixLink = (suffixNode==null) ? "null" : "id"+suffixNode.objectID;
            logger.finest("Changing suffix link from " + oldSuffixLink + " to " + newSuffixLink + " for node " + newNode.toShortString(vocab) + " (prefix node " + prefixNode.toShortString(vocab) + " ) with token " + sentence[j]);
          }
         
          newNode.linkToSuffix( suffixNode );


          // 16: if p_beta_f_j is inactive then
          if (! suffixNode.active) {
           
            // 17: Mark p_alpha_beta_f_j inactive
            newNode.active = false; //Node.INACTIVE;
           
            // 18: else
          } else {

            Pattern extendedPattern = new Pattern(prefixPattern,sentence[j]);

            MatchedHierarchicalPhrases result = null;
           
            if (suffixArray != null) {
             
              // 19: Q_alpha-beta-f_j <-- query(alpha-beta-f_j, Q_alpha-beta, Q_beta-f_j)
              result = query(extendedPattern, newNode, prefixNode, suffixNode);
             
            }

            // 20: if Q_alpha_beta_f_j = ∅ (meaning that no results were found for this query)
            //if (result != null && result.isEmpty()) {// && prefixNode != xnode) {
            if (result != null && result.isEmpty()) {
             
              // 21: Mark p_alpha_beta_f_j inactive
              newNode.active = false; //Node.INACTIVE;
             
              // 22: else
            } else {
             
              // 23: Mark p_alpha_beta_f_j active
              newNode.active = true; //Node.ACTIVE;
             
              // 24: EXTEND_QUEUE(alpha beta f_j, i, j, f_1^I)
              extendQueue(queue, i, j, sentence, extendedPattern, newNode);
             
            }
          }
        }
      }

    }

    long endTime = System.nanoTime();
    long microseconds = (endTime - startTime) / 1000;
    float milliseconds = microseconds / 1000.0f;
    logger.info("Sentence total extraction time:\t"+ milliseconds + " milliseconds");
   
   
    if (logger.isLoggable(Level.FINER)) {
      logger.finer("\n");
      if (logger.isLoggable(Level.FINEST)) logger.finest("FINAL TREE:  " + root);
    }
  }
 

  /**
   * Implements the root QUERY algorithm (Algorithm 4) of
   * Adam Lopez's (2008) doctoral thesis.
   *
   * @param pattern Pattern to search for
   * @param node Node in the prefix tree
   * @param prefixNode Prefix node
   * @param suffixNode Suffix node
   * @return List of matched hierarchical phrases for the specified pattern.
   *
   * @see "Lopez (2008)"
   */
  public MatchedHierarchicalPhrases query(Pattern pattern, Node node, Node prefixNode, Node suffixNode) {

    if (logger.isLoggable(Level.FINER)) logger.finer("PrefixTree.query( " + pattern + ",\n\t   new node " + node + ",\n\tprefix node " + prefixNode + ",\n\tsuffix node " + suffixNode + ")");
    long startTime = System.nanoTime();
   
    MatchedHierarchicalPhrases result;

//    boolean stop = false;
//    if (pattern.toString().startsWith("[de ")) {
//      logger.warning("Found it! " + pattern.toString() + " yahoo");
//      int x;
//      x=5;
//      x+=1;
//      stop = true;
//    }
//   
//    if (stop) {
//      if (stop) {
//        logger.info("Stopping");
//        logger.info("Did you stop?");
//      }
//    }
//   
   
    if (suffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) {
      result = suffixArray.getCachedHierarchicalPhrases().get(pattern);
      int[] bounds = suffixArray.findPhrase(pattern, 0, pattern.size(), prefixNode.lowBoundIndex, prefixNode.highBoundIndex);
      if (bounds!=null) {
        node.setBounds(bounds[0],bounds[1]);
      }
    } else {
      if (pattern.toString().startsWith("[de ")) {
        int x = 5;
        x++;
      }

      int arity = pattern.arity();

      // 1: if alpha=u then
      //    If the pattern is contiguous, look up the pattern in the suffix array
      if (arity == 0) {

        // 2: SUFFIX-ARRAY-LOOKUP(SA_f, a alpha b, l_a_alpha, h_a_alpha
        // Get the first and last index in the suffix array for the specified pattern
        int[] bounds = suffixArray.findPhrase(pattern, 0, pattern.size(), prefixNode.lowBoundIndex, prefixNode.highBoundIndex);
        if (bounds==null) {
          result = HierarchicalPhrases.emptyList(pattern);
          suffixArray.cacheMatchingPhrases(result);
          //TODO Should node.setBounds(bounds) be called here?
        } else {
          node.setBounds(bounds[0],bounds[1]);
          int[] startingPositions = suffixArray.getAllPositions(bounds);
          result = suffixArray.createTriviallyHierarchicalPhrases(startingPositions, pattern, vocab);
        }


      } else { // 3: else --- alpha is a discontiguous pattern

        // 8: If M_a_alpha_b has been precomputed (then result will be non-null)
        // 9: Retrieve M_a_alpha_b from cache of precomputations


        // 10: else
        if (suffixArray.getCachedHierarchicalPhrases().containsKey(pattern)) { 
          result = suffixArray.getMatchingPhrases(pattern);
        } else {

          // 16: M_a_alpha_b <-- QUERY_INTERSECT(M_a_alpha, M_alpha_b)

          int[] sourceWords = prefixNode.getSourcePattern().getWordIDs();

          // Special handling of case when prefixNode is the X off of root (hierarchicalPhrases for that node is empty)
          if (arity==1 && sourceWords[0] < 0 && sourceWords[sourceWords.length-1] < 0){

            result = suffixNode.getMatchedPhrases().copyWithInitialX();

          } else {

            // Normal query intersection case (when prefixNode != X off of root)

            if (logger.isLoggable(Level.FINEST)) logger.finest("Calling queryIntersect("+pattern+" M_a_alpha.pattern=="+prefixNode.getSourcePattern() + ", M_alpha_b.pattern=="+suffixNode.getSourcePattern()+")");

            result = HierarchicalPhrases.queryIntersect(pattern, prefixNode.getMatchedPhrases(), suffixNode.getMatchedPhrases(), minNonterminalSpan, maxPhraseSpan, suffixArray);

          }

          suffixArray.cacheMatchingPhrases(result);
        }
      }
    }
   
    long finalQueryTime = System.nanoTime();
    if (logger.isLoggable(Level.FINE)) {
      long elapsedQueryTime = finalQueryTime - startTime;
      long microseconds = elapsedQueryTime / 1000;
      float milliseconds = microseconds / 1000.0f;
      logger.fine("Time to query pattern:\t" + pattern.toString() + "\t" + milliseconds + " milliseconds\t" + result.size() + " instances");
    }
   
    // 17: Return M_a_alpha_b
    List<Rule> rules = ruleExtractor.extractRules(result);
//    node.storeResults(result, rules);
    storeResults(node, result, rules);
   
    if (logger.isLoggable(Level.FINE)) {
      long elapsedTime = System.nanoTime() - finalQueryTime;
      long microseconds = elapsedTime / 1000;
      float milliseconds = microseconds / 1000.0f;
      logger.fine("Time to extract rules for pattern:\t" + pattern.toString() + "\t" + milliseconds + " milliseconds\t" + result.size() + " instances");
    }

    return result;

  }
 
  @SuppressWarnings("deprecation")
  private void storeResults(Node node, MatchedHierarchicalPhrases result, List<Rule> rules) {
    if (printedNodes==null || !printedNodes.contains(node.objectID)) {
      node.storeResults(result, rules);

      if (out==null) {
        logger.finer("Not printing rules");
      } else {

        for (Rule rule : rules) {
          String ruleString = rule.toString(ntVocab, suffixArray.getVocabulary(), targetCorpus.getVocabulary());
          if (logger.isLoggable(Level.FINEST)) logger.finest("Rule: " + ruleString);
          out.println(ruleString);
        }
        printedNodes.add(node.objectID);

      }
    }
  }
 
  /**
   * Implements Function EXTEND_QUEUE from Lopez (2008) PhD
   * Thesis, Algorithm 2, p 76
   *
   * @param queue Queue of tuples
   * @param i Start index of the pattern in the source input
   *          sentence (inclusive, 1-based).
   * @param j End index of the pattern in the source input
   *          sentence (inclusive, 1-based).
   * @param sentence
   * @param pattern Pattern corresponding to the prefix node.
   *                In Lopez's terminology, this pattern is
   *                alpha f_j.
   * @param node Node in the prefix tree to which a new node
   *             (corresponding to the pattern) will eventually
   *             be attached.
   */
  private void extendQueue(Queue<Tuple> queue, int i, int j, int[] sentence, Pattern pattern, Node node) {

    int J = j;
    if (!sentenceFinalX) J += 1;

    int endOfPhraseSpan = (j+1)-i+1;

   
    // 1: if |alpha| < MaxPhraseLength  and  j-i+1<=MaxPhraseSpan then    
    if (pattern.size() < maxPhraseLength  && J<sentence.length) {

      if (endOfPhraseSpan <= maxPhraseSpan) {
        // 2: Add <alpha f_j, i, j+1, p_alpha> to queue
        //    (add new tuple to the queue)
        if (logger.isLoggable(Level.FINEST)) logger.finest("\nextendQueue: Adding tuple (" +pattern+","+ i + ","+ (j+1) +","+node+")");//(new Pattern(alphaPattern,sentence[j+1]))+"})");
        queue.add(new Tuple(pattern, i, j+1, node));//, sentence[j+1]));
      }

      if (edgeXMayViolatePhraseSpan) endOfPhraseSpan -= 1;
     
      // 3: if arity(alpha) < MaxNonterminals then
      if (pattern.arity() < maxNonterminals && endOfPhraseSpan <= maxPhraseSpan) {
        Node xNode;

        if (! node.children.containsKey(X)) {

          // 4: children(p_alpha) <-- children(p_alpha) U p_alphaX
          //    (add new child node in tree and mark in as active)
          xNode = node.addChild(X);
          if (logger.isLoggable(Level.FINEST)) logger.finest("Adding node for \"" + X + "\" from " + node + " to new node " + xNode + " with alphaPattern " + pattern + "  (in extendQueue)");

          Node suffixLink = node.calculateSuffixLink(X);

          if (logger.isLoggable(Level.FINEST)) {
            String oldSuffixLink = (xNode.suffixLink==null) ? "null" : "id"+xNode.suffixLink.objectID;
            String newSuffixLink = (suffixLink==null) ? "null" : "id"+suffixLink.objectID;
            logger.finest("Changing suffix link from " + oldSuffixLink + " to " + newSuffixLink + " for node " + xNode + " (prefix node " + node + " ) with token " + X);
          }

          xNode.linkToSuffix( suffixLink );

        } else {
          xNode = node.children.get(X);
          if (logger.isLoggable(Level.FINEST)) logger.finest("X Node is already " + xNode + " for prefixNode " + node);
        }

        // 5: Mark p_alphaX active
        xNode.active = true; //Node.ACTIVE;
       
        int[] patternWords = pattern.getWordIDs();
       
        // 6: Q_alphaX <-- Q_alpha
        {
          SymbolTable vocab = (suffixArray==null) ? null : suffixArray.getVocabulary();
          Pattern xpattern = new Pattern(vocab, patternWords, X);
         
//          HierarchicalPhrases phrasesWithFinalX = new HierarchicalPhrases(xpattern, node.sourceHierarchicalPhrases);
          MatchedHierarchicalPhrases phrasesWithFinalX;
          if (suffixArray==null) {
            // This should only happen in certain unit tests
            logger.severe("This should only be encountered during unit testing!");
            if (node.sourceHierarchicalPhrases==null) {
              node.sourceHierarchicalPhrases = HierarchicalPhrases.emptyList((SymbolTable) null);
              node.sourcePattern = node.sourceHierarchicalPhrases.getPattern();
            }
            phrasesWithFinalX = node.getMatchedPhrases().copyWithFinalX();
          } else {
            Cache<Pattern,MatchedHierarchicalPhrases> cache = suffixArray.getCachedHierarchicalPhrases();
            if (cache.containsKey(xpattern)) {
              phrasesWithFinalX = cache.get(xpattern);
            } else {
              phrasesWithFinalX = node.getMatchedPhrases().copyWithFinalX();
              suffixArray.cacheMatchingPhrases(phrasesWithFinalX);
            }
          } 
         
          List<Rule> rules = (ruleExtractor==null) ?
                Collections.<Rule>emptyList() :
                ruleExtractor.extractRules(phrasesWithFinalX);
          //xNode.storeResults(phrasesWithFinalX, rules);
          storeResults(xNode, phrasesWithFinalX, rules);
        }
     
        if (logger.isLoggable(Level.FINEST)) logger.finest("Alpha pattern is " + pattern);

        // For efficiency, don't add any tuples to the queue whose patterns would exceed the max allowed number of tokens
        if (patternWords.length+2 <= maxPhraseLength) {
         
          int I = sentence.length;
          if (!sentenceFinalX) I -= 1;
         
          int min = (I<i+maxPhraseSpan) ? I : i+maxPhraseSpan-1;
          Pattern patternX = new Pattern(pattern, X);

          // 7: for k from j+1 to min(I, i+MaxPhraseLength) do
          for (int k=j+2; k<=min; k++) {

            // 8: Add <alpha f_j X, i, k, p_alphaX> to queue
            if (logger.isLoggable(Level.FINEST)) logger.finest("extendQueue: Adding tuple ("+patternX+","+i+","+k+","+xNode+ " ) in EXTEND_QUEUE ****************************************" );
            queue.add(new Tuple(patternX, i, k, xNode));

          }
        } else if (logger.isLoggable(Level.FINEST)) {
          logger.finest("Not extending " + pattern + "+X ");
        }
      }
    }
   

  }


//  /**
//   * Gets the root node of this tree.
//   *
//   * @return the root node of this tree
//   */
//  public Grammar getRoot() {
//    return root;
//  }
 
//  /**
//   * Gets all translation rules stored in this tree.
//   *
//   * @return all translation rules stored in this tree
//   */
//  public List<Rule> getAllRules() {
//   
//    return root.getAllRules();
//   
//  }

  /* See Javadoc for java.lang.Object#toString. */
  public String toString() {
    return root.toTreeString("", vocab);
  }

  /**
   * Gets the number of nodes in this tree.
   * <p>
   * This method recursively traverses through all nodes
   * in the tree every time this method is called.
   *
   * @return the number of nodes in this tree
   */
  public int size() {
    return root.size();
  }

 
  /**
   * Constructs an invalid, dummy prefix tree.
   * <p>
   * The unit tests for Node require a dummy PrefixTree.
   */
  private PrefixTree() {
    root = null;
    parallelCorpus = null;
    suffixArray = null;
    targetCorpus = null;
    alignments = null;
    lexProbs = null;
    xnode = null;
    ruleExtractor = null;
    this.epsilon = null;
    this.vocab = null;
    this.maxPhraseSpan = Integer.MIN_VALUE;
    this.maxPhraseLength = Integer.MIN_VALUE;
    this.maxNonterminals = Integer.MIN_VALUE;
    this.minNonterminalSpan = Integer.MAX_VALUE;
    this.ruleOwner = Integer.MIN_VALUE;
    this.defaultLHS = Integer.MIN_VALUE;
    this.oovFeatureCost = Float.NaN;
  }
 
  /**
   * Gets an invalid, dummy prefix tree.
   * <p>
   * For testing purposes only.
   *
   * @return an invalid, dummy prefix tree
   */
  static PrefixTree getDummyPrefixTree() {
    return new PrefixTree();
  }
 
 
  public Rule constructManualRule(int lhs, int[] sourceWords,
      int[] targetWords, float[] scores, int arity) {
    return new BilingualRule(lhs, sourceWords, targetWords, scores, arity, this.ruleOwner, 0, getOOVRuleID());
  }

  public Rule constructOOVRule(int numFeatures, int sourceWord, int targetWord,
      boolean hasLM) {
    int[] french      = new int[1];
    french[0]         = sourceWord;
    int[] english       = new int[1];
    english[0]          = targetWord;
    float[] feat_scores = new float[numFeatures];
   
    // TODO: This is a hack to make the decoding without a LM works
    /**when a ngram LM is used, the OOV word will have a cost 100.
     * if no LM is used for decoding, so we should set the cost of some
     * TM feature to be maximum
     * */
    if ( (!hasLM) && numFeatures > 0) {
      feat_scores[0] = oovFeatureCost;
    }
   
    return new BilingualRule(
        this.defaultLHS, french, english,
        feat_scores, 0, this.ruleOwner,
        0, getOOVRuleID());

  }

  public int getNumRules() {
    return root.getNumRules();
  }

  public int getOOVRuleID() {
    return MemoryBasedBatchGrammar.OOV_RULE_ID;
  }

  public Trie getTrieRoot() {
    return root;
  }

  public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
    return (endIndex - startIndex <= this.maxPhraseSpan);
  }
 
}
TOP

Related Classes of joshua.prefix_tree.PrefixTree

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.