/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.alignment;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.Corpus;
import joshua.util.io.BinaryOut;
/**
* List of alignment grids representing all alignment data for an
* aligned parallel corpus.
* <p>
* Instances of this class are created from human-readable alignment
* text files.
*
* @author Lane Schwartz
*/
public class AlignmentGrids extends AbstractAlignmentGrids {

    /** Logger for this class. */
    private static final Logger logger =
        Logger.getLogger(AlignmentGrids.class.getName());

    /**
     * List of individual alignment grids, one entry per line read
     * from the alignment scanner. An entry is <code>null</code>
     * when the corresponding line could not be turned into a grid.
     */
    private final List<AlignmentGrid> alignments;

    /**
     * Constructs a list of AlignmentGrid objects.
     * <p>
     * The size parameter is used to allocate the initial
     * capacity of the backing list. If this number is off,
     * things will still work, but memory usage may be less
     * optimal.
     * <p>
     * The object returned by this constructor will require
     * tight spans.
     *
     * @param alignmentScanner Scanner over the alignment data;
     *                     each line is handed to the
     *                     AlignmentGrid constructor
     * @param sourceCorpus Source corpus, passed to the superclass
     * @param targetCorpus Target corpus, passed to the superclass
     * @param expectedSize Expected number of training sentences.
     *                     This parameter merely specifies the initial
     *                     capacity of an array list.
     */
    public AlignmentGrids(Scanner alignmentScanner, Corpus sourceCorpus, Corpus targetCorpus, int expectedSize) {
        this(alignmentScanner, sourceCorpus, targetCorpus, expectedSize, true);
    }

    /**
     * Constructs a list of AlignmentGrid objects.
     * <p>
     * The size parameter is used to allocate the initial
     * capacity of the backing list. If this number is off,
     * things will still work, but memory usage may be less
     * optimal.
     * <p>
     * Every line of the scanner produces exactly one list entry.
     * If constructing the grid for a line throws, a warning is
     * logged and <code>null</code> is stored for that line so
     * that list indices stay aligned with line numbers.
     *
     * @param alignmentScanner Scanner over the alignment data;
     *                     each line is handed to the
     *                     AlignmentGrid constructor
     * @param sourceCorpus Source corpus, passed to the superclass
     * @param targetCorpus Target corpus, passed to the superclass
     * @param expectedSize Expected number of training sentences.
     *                     This parameter merely specifies the initial
     *                     capacity of an array list.
     * @param requireTightSpans Tight-span setting, passed to the
     *                     superclass
     */
    public AlignmentGrids(Scanner alignmentScanner, Corpus sourceCorpus, Corpus targetCorpus, int expectedSize, boolean requireTightSpans) {
        super(sourceCorpus, targetCorpus, requireTightSpans);

        this.alignments = new ArrayList<AlignmentGrid>(expectedSize);

        boolean finest = logger.isLoggable(Level.FINEST);

        // Integer division: this is 0 whenever expectedSize < 10, so it
        // must never be used as a divisor without checking it first.
        int tenthSize = expectedSize / 10;

        int lineNumber = 0;

        while (alignmentScanner.hasNextLine()) {

            String line = alignmentScanner.nextLine();

            try {
                AlignmentGrid grid = new AlignmentGrid(line);
                alignments.add(grid);
            } catch (Exception e) {
                // Any failure while building the grid ends up here; the
                // message attributes it to sentence length. A null
                // placeholder keeps indices in step with line numbers.
                logger.warning("Sentence pair number " + lineNumber + " was too long, skipping this item");
                alignments.add(null);
            }

            lineNumber++;

            // Progress is reported in tenths of expectedSize; skip the
            // report entirely when a tenth rounds down to zero, which
            // would otherwise cause a division by zero.
            if (finest && tenthSize > 0 && (lineNumber % tenthSize == 0)) {
                logger.finest("AlignmentGrids construction " +
                        (lineNumber/tenthSize)+"0% complete");
            }
        }
    }

    /* See Javadoc for AbstractAlignmentGrids. */
    protected int[] getSourcePoints(int sentenceID, int targetSpanStart, int targetSpanEnd) {
        AlignmentGrid grid = alignments.get(sentenceID);
        if (grid == null) {
            // Sentence was skipped at load time: no alignment points.
            return new int[0];
        }
        return grid.getSourcePoints(targetSpanStart, targetSpanEnd);
    }

    /* See Javadoc for AbstractAlignmentGrids. */
    protected int[] getTargetPoints(int sentenceID, int sourceSpanStart, int sourceSpanEnd) {
        AlignmentGrid grid = alignments.get(sentenceID);
        if (grid == null) {
            // Sentence was skipped at load time: no alignment points.
            return new int[0];
        }
        return grid.getTargetPoints(sourceSpanStart, sourceSpanEnd);
    }

    /**
     * Serializes this object as binary data.
     * <p>
     * The data is written in this order:
     * <ol>
     * <li>the number of grids (one int)</li>
     * <li>the width of each grid (one int per grid)</li>
     * <li>the height of each grid (one int per grid)</li>
     * <li>a running total of alignment points, starting with 0
     *     and followed by one int per grid</li>
     * <li>the coordinates of every grid (shorts)</li>
     * <li>the transposed coordinates of every grid (shorts)</li>
     * </ol>
     * A <code>null</code> grid (a skipped sentence) is written with
     * width 0, height 0, and no points; its running-total entry
     * repeats the previous total.
     *
     * @param out The stream to write this object to.
     * @throws IOException Includes any I/O exceptions that may occur
     * @see java.io.Externalizable#writeExternal
     */
    public void writeExternal(ObjectOutput out) throws IOException {

        // Start by writing the number of alignments
        int size = alignments.size();
        logger.fine("Exporting size = " + size + ": 1 integer (4 bytes)");
        out.writeInt(size);

        // Write the widths of each grid
        logger.fine("Exporting widths: " + size + " integers (" + size*4 + ") bytes");
        for (AlignmentGrid grid : alignments) {
            out.writeInt(grid != null ? grid.width : 0);
        }

        // Write the heights of each grid
        logger.fine("Exporting heights: " + size + " integers (" + size*4 + ") bytes");
        for (AlignmentGrid grid : alignments) {
            out.writeInt(grid != null ? grid.height : 0);
        }

        // Write the cumulative number of alignment points after each grid
        logger.fine("Exporting pointCounters: " + (size+1) + " integers (" + (size+1)*4 + ") bytes");
        int pointCounter = 0;
        out.writeInt(pointCounter);
        for (AlignmentGrid grid : alignments) {
            if (grid != null) {
                pointCounter += grid.coordinates.length;
            }
            // A skipped sentence contributes zero points, so the running
            // total is repeated. Writing a literal 0 here would reset the
            // offsets and corrupt the boundaries of neighbouring grids.
            out.writeInt(pointCounter);
        }
        logger.finer("\tfinal pointCounter value was: " + pointCounter);

        // Write the alignment points
        logger.fine("Exporting grid coordinates: " + pointCounter + " shorts (" + pointCounter*2 + ") bytes");
        for (AlignmentGrid grid : alignments) {
            if (grid != null) {
                for (short point : grid.coordinates) {
                    out.writeShort(point);
                }
            }
        }

        // Write the reverse alignment points
        logger.fine("Exporting reverse grid coordinates: " + pointCounter + " shorts (" + pointCounter*2 + ") bytes");
        for (AlignmentGrid grid : alignments) {
            if (grid != null) {
                for (short point : grid.transposedCoordinates) {
                    out.writeShort(point);
                }
            }
        }
    }

    /* See Javadoc for Alignments interface. */
    public int size() {
        return this.alignments.size();
    }

    /**
     * Main method used to read a human-readable alignments
     * file and write it to disk as binary data.
     * <p>
     * Exits with a non-zero status after printing a usage message
     * when the wrong number of arguments is supplied.
     *
     * @param args File names for an existing human-readable
     *             alignments file and for the binary data file
     *             to be written
     * @throws IOException Includes any I/O exceptions that may occur
     */
    public static void main(String[] args) throws IOException {

        if (args.length != 2) {
            System.err.println("Usage: java " + AlignmentGrids.class.getName() + " alignments alignments.bin");
            // Wrong usage is an error; report it through the exit status.
            System.exit(1);
        }

        String alignmentsFileName = args[0];
        String binaryAlignmentsFileName = args[1];

        File alignmentsFile = new File(alignmentsFileName);

        // The scanner is only needed while the grids are being built;
        // release the underlying file even if construction fails.
        Scanner scanner = new Scanner(alignmentsFile);
        AlignmentGrids grids;
        try {
            grids = new AlignmentGrids(scanner, null, null, 10);
        } finally {
            scanner.close();
        }

        // Make sure the output is closed even if writing fails part-way.
        BinaryOut out = new BinaryOut(binaryAlignmentsFileName);
        try {
            grids.writeExternal(out);
            out.flush();
        } finally {
            out.close();
        }
    }
}