/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import joshua.corpus.Corpus;
import joshua.corpus.LabeledSpan;
import joshua.corpus.Phrase;
import joshua.corpus.Span;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
/**
* HierarchicalPhrase is a class that represents a single matched
* hierarchical phrase, and provides the methods necessary for
* accessing the Pattern that it matches and for computing the
* intersection used in the suffix array lookups of discontinuous
* phrases.
* <p>
* For efficiency and space savings, this class should only be used
* when it is does not make sense to use MatchedHierarchicalPhrases.
*
* In cases where many hierarchical phrases share a common pattern,
* that class should be preferred, as it is able to store the phrases
* in a much more memory-efficient manner compared to storing a
* collection of HierarchicalPhrase objects.
*
* @author Chris Callison-Burch and Lane Schwartz
* @since July 31 2008
* @version $LastChangedDate:2008-09-18 12:47:23 -0500 (Thu, 18 Sep 2008) $
*/
public class HierarchicalPhrase extends Pattern {
/**
* Represents one index into the source corpus for each
* terminal that starts a contiguous sequence of terminals.
*/
protected final int[] terminalSequenceStartIndices;
/** Exclusive ending indices of each terminal sequence. */
protected final int[] terminalSequenceEndIndices;
/** Monolingual corpus from which this phrase was extracted. */
protected final Corpus corpusArray;
/**
* Index of the sentence in the corpus where this phrase
* was extracted.
*/
protected final int sentenceNumber;
// We are NOT storing this, because of cases where the HP
// starts with a nonterminal. In such cases, we want a
// single HP to represent all HPs that start with that
// nonterminal, and storing the startIndex would spuriously
// require a list of HPs to store this.
//
//protected final int startIndex;
/**
* Constructs a hierarchical phrase rooted at a location
* in a corpus.
*
* @param patternWords Terminals and nonterminals comprising
* the pattern
* @param span Span in the corpus from the first
* terminal to the final terminal in the phrase
* @param nonterminalSpans Locations of all nonterminals
* in this phrase.
* @param corpus Representation of a monolingual
* corpus.
*/
public HierarchicalPhrase(
int[] patternWords,
Span span,
List<LabeledSpan> nonterminalSpans,
Corpus corpus) {
super(corpus.getVocabulary(), patternWords);
// Compute the list of terminal spans
List<Span> terminalSpans = new ArrayList<Span>();
{
int possibleStart = span.start;
int nonterminalIndex = 0;
Span nonterminal = span;
for (LabeledSpan labeledNTSpan : nonterminalSpans) {
nonterminal = labeledNTSpan.getSpan();
if (nonterminal.start > possibleStart) {
terminalSpans.add(new Span(possibleStart, nonterminal.start));
}
possibleStart = nonterminal.end;
nonterminalIndex++;
}
if (span.end > possibleStart) {
terminalSpans.add(new Span(possibleStart, span.end));
}
}
// Initialize the sequence arrays
this.terminalSequenceStartIndices = new int[terminalSpans.size()];
this.terminalSequenceEndIndices = new int[terminalSpans.size()];
{
int terminalSpanNumber = 0;
for (Span terminalSpan : terminalSpans) {
terminalSequenceStartIndices[terminalSpanNumber] = terminalSpan.start;
terminalSequenceEndIndices[terminalSpanNumber] = terminalSpan.end;
terminalSpanNumber++;
}
}
this.corpusArray = corpus;
this.sentenceNumber = corpus.getSentenceIndex(terminalSequenceStartIndices[0]);
}
/**
* Returns the index in the corpus of the first terminal
* in this phrase.
*
* @return the index in the corpus of the first terminal
* in this phrase.
*/
public int getFirstTerminalCorpusIndex() {
return terminalSequenceStartIndices[0];
}
/**
* Returns the index in the corpus of the last terminal in
* this phrase.
*
* @return the index in the corpus of the last terminal in
* this phrase.
*/
public int getLastTerminalCorpusIndex() {
return terminalSequenceEndIndices[terminalSequenceEndIndices.length-1];
}
/**
* Gets whether this phrase contains a terminal at the
* specified index.
*
* @return <code>true</code> this phrase contains a terminal
* at the specified index <code>false</code> otherwise
*/
public boolean containsTerminalAt(int alignedPointIndex) {
for (int i=0; i<terminalSequenceStartIndices.length; i++) {
if (alignedPointIndex >= terminalSequenceStartIndices[i] &&
alignedPointIndex < terminalSequenceEndIndices[i]) {
return true;
}
}
return false;
}
/**
* Returns the index of the sentence in the corpus that the
* phrase is a part of.
*
* @return the index of the sentence in the corpus that the
* phrase is a part of.
*/
public int getSentenceNumber() {
return sentenceNumber;
}
/**
* Gets all possible subphrases of this phrase, up to and
* including the phrase itself. For example, the phrase "I
* like cheese ." would return the following:
* <ul>
* <li>I
* <li>like
* <li>cheese
* <li>.
* <li>I like
* <li>like cheese
* <li>cheese .
* <li>I like cheese
* <li>like cheese .
* <li>I like cheese .
* </ul>
*
* @return ArrayList of all possible subphrases.
*/
public List<Phrase> getSubPhrases() {
return getSubPhrases(size());
}
/**
* Returns a list of subphrases only of length
* <code>maxLength</code> or smaller.
*
* @param maxLength the maximum length phrase to return.
* @return ArrayList of all possible subphrases of length
* maxLength or less
* @see #getSubPhrases()
*/
public List<Phrase> getSubPhrases(int maxLength) {
if (maxLength > size()) {
return getSubPhrases(size());
}
List<Phrase> phrases = new ArrayList<Phrase>();
for (int i = 0; i < size(); i++) {
for (int j = i + 1; (j <= size()) && (j - i <= maxLength); j++) {
Phrase subPhrase = subPhrase(i,j);
phrases.add(subPhrase);
}
}
return phrases;
}
/**
* Returns a string representation of the phrase.
*
* @return a space-delimited string of the terminals and
* nonterminals in the phrase.
*/
public String toString() {
StringBuilder s = new StringBuilder();
int[] wordIDs = getWordIDs();
for (int i = 0; i < wordIDs.length; i++) {
if (wordIDs[i]<0) {
s.append('X');
} else {
s.append(corpusArray.getVocabulary().getWord(wordIDs[i]));
}
s.append(' ');
}
s.append(Arrays.toString(terminalSequenceStartIndices));
return s.toString();
}
/**
* Gets the number of contiguous sequences of terminals
* in the pattern corresponding to this phrase.
*
* @return the number of contiguous sequences of terminals
* in the pattern corresponding to this phrase
*/
public int getNumberOfTerminalSequences() {
return terminalSequenceStartIndices.length;
}
/**
* Gets the starting index in the corpus for a specified
* contiguous sequence of terminals in this phrase.
*
* @param sequenceIndex Index specifying a contiguous sequence
* of terminals in this pattern
* @return the starting index in the corpus for a specified
* contiguous sequence of terminals in this phrase
*/
public int getTerminalSequenceStartIndex(int sequenceIndex) {
return this.terminalSequenceStartIndices[sequenceIndex];
}
/**
* Gets the ending index in the corpus for a specified
* contiguous sequence of terminals in this phrase.
*
* @param sequenceIndex Index specifying a contiguous sequence
* of terminals in this pattern
* @return the ending index in the corpus for a specified
* contiguous sequence of terminals in this phrase
*/
public int getTerminalSequenceEndIndex(int sequenceIndex) {
return this.terminalSequenceEndIndices[sequenceIndex];
}
/**
* Indicates whether the first element of this phrase's
* pattern is a nonterminal.
*
* @return <code>true</code> if this phrase starts with a
* nonterminal, <code>false</code> otherwise.
*/
public boolean startsWithNonterminal() {
return (words[0] < 0);
}
/**
* Indicates whether the last element of this phrase's
* pattern is a nonterminal.
*
* @return <code>true</code> if this phrase ends with a
* nonterminal, <code>false</code> otherwise.
*/
public boolean endsWithNonterminal() {
return (words[words.length-1] < 0);
}
/**
* Creates a new phrase object from the indexes provided.
* <P>
* NOTE: subList merely creates a "view" of the existing
* Phrase object. Memory taken up by other Words in the
* Phrase is not freed since the underlying subList object
* still points to the complete Phrase List.
* <p>
* <em>This method is currently broken</em>,
* and is guaranteed to throw an <code>Error</code>.
*
* @see ArrayList#subList(int, int)
*/
public Phrase subPhrase(int start, int end) {
// FIXME Lane This method should almost certainly return a HierarchicalPhrase in some cases
//return new ContiguousPhrase(terminalSequenceStartIndices[0]+start, terminalSequenceStartIndices[0]+end, corpusArray);
throw new Error("This code is broken. It needs to be fixed.");
}
}