Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.HierarchicalPhrase

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import joshua.corpus.Corpus;
import joshua.corpus.LabeledSpan;
import joshua.corpus.Phrase;
import joshua.corpus.Span;

import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;


/**
* HierarchicalPhrase is a class that represents a single matched
* hierarchical phrase, and provides the methods necessary for
* accessing the Pattern that it matches and for computing the
* intersection used in the suffix array lookups of discontinuous
* phrases.
* <p>
* For efficiency and space savings, this class should only be used
* when it is does not make sense to use MatchedHierarchicalPhrases.
*
* In cases where many hierarchical phrases share a common pattern,
* that class should be preferred, as it is able to store the phrases
* in a much more memory-efficient manner compared to storing a
* collection of HierarchicalPhrase objects.
*
* @author Chris Callison-Burch and Lane Schwartz
* @since July 31 2008
* @version $LastChangedDate:2008-09-18 12:47:23 -0500 (Thu, 18 Sep 2008) $
*/
public class HierarchicalPhrase extends Pattern {
 
  /**
   * Represents one index into the source corpus for each
   * terminal that starts a contiguous sequence of terminals.
   */
  protected final int[] terminalSequenceStartIndices;
 
 
  /** Exclusive ending indices of each terminal sequence. */
  protected final int[] terminalSequenceEndIndices;
 
 
  /** Monolingual corpus from which this phrase was extracted. */
  protected final Corpus corpusArray;
 
 
  /**
   * Index of the sentence in the corpus where this phrase
   * was extracted.
   */
  protected final int sentenceNumber;
 
 
  // We are NOT storing this, because of cases where the HP
  // starts with a nonterminal. In such cases, we want a
  // single HP to represent all HPs that start with that
  // nonterminal, and storing the startIndex would spuriously
  // require a list of HPs to store this.
  //
  //protected final int startIndex;
 
 
  /**
   * Constructs a hierarchical phrase rooted at a location
   * in a corpus.
   *
   * @param patternWords Terminals and nonterminals comprising
   *                     the pattern
   * @param span         Span in the corpus from the first
   *                     terminal  to the final terminal in the phrase
   * @param nonterminalSpans Locations of all nonterminals
   *                     in this phrase.
   * @param corpus       Representation of a monolingual
   *                     corpus.
   */
  public HierarchicalPhrase(
      int[]             patternWords,
      Span              span,
      List<LabeledSpan> nonterminalSpans,
      Corpus corpus) {
   
    super(corpus.getVocabulary(), patternWords);
   
    // Compute the list of terminal spans
    List<Span> terminalSpans = new ArrayList<Span>();
    {
      int possibleStart = span.start;

      int nonterminalIndex = 0;
      Span nonterminal = span;

      for (LabeledSpan labeledNTSpan : nonterminalSpans) {

        nonterminal = labeledNTSpan.getSpan();

        if (nonterminal.start > possibleStart) {
          terminalSpans.add(new Span(possibleStart, nonterminal.start));
        }

        possibleStart = nonterminal.end;

        nonterminalIndex++;
      }

      if (span.end > possibleStart) {
        terminalSpans.add(new Span(possibleStart, span.end));
      }
    }

    // Initialize the sequence arrays
    this.terminalSequenceStartIndices = new int[terminalSpans.size()];
    this.terminalSequenceEndIndices = new int[terminalSpans.size()];
    {
      int terminalSpanNumber = 0;
      for (Span terminalSpan : terminalSpans) {
        terminalSequenceStartIndices[terminalSpanNumber] = terminalSpan.start;
        terminalSequenceEndIndices[terminalSpanNumber] = terminalSpan.end;
        terminalSpanNumber++;
      }
    }
   
   
    this.corpusArray = corpus;
   
    this.sentenceNumber = corpus.getSentenceIndex(terminalSequenceStartIndices[0]);
  }
 
 
  /**
   * Returns the index in the corpus of the first terminal
   * in this phrase.
   *
   * @return the index in the corpus of the first terminal
   *         in this phrase.
   */
  public int getFirstTerminalCorpusIndex() {
    return terminalSequenceStartIndices[0];
  }
 
 
  /**
   * Returns the index in the corpus of the last terminal in
   * this phrase.
   *
   * @return the index in the corpus of the last terminal in
   *         this phrase.
   */
  public int getLastTerminalCorpusIndex() {
    return terminalSequenceEndIndices[terminalSequenceEndIndices.length-1];
  }
 
  /**
   * Gets whether this phrase contains a terminal at the
   * specified index.
   *
   * @return <code>true</code> this phrase contains a terminal
   *         at the specified index <code>false</code> otherwise
   */
  public boolean containsTerminalAt(int alignedPointIndex) {

    for (int i=0; i<terminalSequenceStartIndices.length; i++) {
      if (alignedPointIndex >= terminalSequenceStartIndices[i] &&
          alignedPointIndex < terminalSequenceEndIndices[i]) {
        return true;
      }
    }   
   
    return false;
  }
 
 
  /**
   * Returns the index of the sentence in the corpus that the
   * phrase is a part of.
   *
   * @return the index of the sentence in the corpus that the
   *         phrase is a part of.
   */
  public int getSentenceNumber() {
    return sentenceNumber;
  }
 
 
  /**
   * Gets all possible subphrases of this phrase, up to and
   * including the phrase itself. For example, the phrase "I
   * like cheese ." would return the following:
   * <ul>
   * <li>I
   * <li>like
   * <li>cheese
   * <li>.
   * <li>I like
   * <li>like cheese
   * <li>cheese .
   * <li>I like cheese
   * <li>like cheese .
   * <li>I like cheese .
   * </ul>
   *
   * @return ArrayList of all possible subphrases.
   */
  public List<Phrase> getSubPhrases() {
    return getSubPhrases(size());
  }
 
 
  /**
   * Returns a list of subphrases only of length
   * <code>maxLength</code> or smaller.
   *
   * @param maxLength the maximum length phrase to return.
   * @return ArrayList of all possible subphrases of length
   *         maxLength or less
   * @see #getSubPhrases()
   */
  public List<Phrase> getSubPhrases(int maxLength) {
    if (maxLength > size()) {
      return getSubPhrases(size());
    }
    List<Phrase> phrases = new ArrayList<Phrase>();
   
    for (int i = 0; i < size(); i++) {
      for (int j = i + 1; (j <= size()) && (j - i <= maxLength); j++) {
        Phrase subPhrase = subPhrase(i,j);
        phrases.add(subPhrase);
      }
    }
    return phrases;
  }
 
 
  /**
   * Returns a string representation of the phrase.
   *
   * @return a space-delimited string of the terminals and
   *         nonterminals in the phrase.
   */
  public String toString() {
    StringBuilder s = new StringBuilder();
   
    int[] wordIDs = getWordIDs();
    for (int i = 0; i < wordIDs.length; i++) {
      if (wordIDs[i]<0) {
        s.append('X');
      } else {
        s.append(corpusArray.getVocabulary().getWord(wordIDs[i]));
      }
      s.append(' ');
    }
   
    s.append(Arrays.toString(terminalSequenceStartIndices));
   
    return s.toString();
  }
 
 
  /**
   * Gets the number of contiguous sequences of terminals
   * in the pattern corresponding to this phrase.
   *
   * @return the number of contiguous sequences of terminals
   *         in the pattern corresponding to this phrase
   */
  public int getNumberOfTerminalSequences() {
    return terminalSequenceStartIndices.length;
  }
 
  /**
   * Gets the starting index in the corpus for a specified
   * contiguous sequence of terminals in this phrase.
   *
   * @param sequenceIndex Index specifying a contiguous sequence
   *                      of terminals in this pattern
   * @return the starting index in the corpus for a specified
   *         contiguous sequence of terminals in this phrase
   */
  public int getTerminalSequenceStartIndex(int sequenceIndex) {
    return this.terminalSequenceStartIndices[sequenceIndex];
  }
 
  /**
   * Gets the ending index in the corpus for a specified
   * contiguous sequence of terminals in this phrase.
   *
   * @param sequenceIndex Index specifying a contiguous sequence
   *                      of terminals in this pattern
   * @return the ending index in the corpus for a specified
   *         contiguous sequence of terminals in this phrase
   */
  public int getTerminalSequenceEndIndex(int sequenceIndex) {
   
    return this.terminalSequenceEndIndices[sequenceIndex];
   
  }
 
 
  /**
   * Indicates whether the first element of this phrase's
   * pattern is a nonterminal.
   *        
   * @return <code>true</code> if this phrase starts with a
   *         nonterminal, <code>false</code> otherwise.
   */
  public boolean startsWithNonterminal() {
    return (words[0] < 0);
  }
 
 
  /**
   * Indicates whether the last element of this phrase's
   * pattern is a nonterminal.
   *        
   * @return <code>true</code> if this phrase ends with a
   *         nonterminal, <code>false</code> otherwise.
   */
  public boolean endsWithNonterminal() {
    return (words[words.length-1] < 0);
  }
 
 
  /**
   * Creates a new phrase object from the indexes provided.
   * <P>
   * NOTE: subList merely creates a "view" of the existing
   * Phrase object. Memory taken up by other Words in the
   * Phrase is not freed since the underlying subList object
   * still points to the complete Phrase List.
   * <p>
   * <em>This method is currently broken</em>,
   * and is guaranteed to throw an <code>Error</code>.
   *
   * @see ArrayList#subList(int, int)
   */
  public Phrase subPhrase(int start, int end) {
   
    // FIXME Lane This method should almost certainly return a HierarchicalPhrase in some cases
   
    //return new ContiguousPhrase(terminalSequenceStartIndices[0]+start, terminalSequenceStartIndices[0]+end, corpusArray);
    throw new Error("This code is broken. It needs to be fixed.");
  }
 
}
TOP

Related Classes of joshua.corpus.suffix_array.HierarchicalPhrase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.