package org.maltparserx.core.syntaxgraph.writer;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.PatternSyntaxException;
import org.maltparserx.core.exception.MaltChainedException;
import org.maltparserx.core.helper.Util;
import org.maltparserx.core.io.dataformat.ColumnDescription;
import org.maltparserx.core.io.dataformat.DataFormatException;
import org.maltparserx.core.io.dataformat.DataFormatInstance;
import org.maltparserx.core.syntaxgraph.PhraseStructure;
import org.maltparserx.core.syntaxgraph.TokenStructure;
import org.maltparserx.core.syntaxgraph.node.NonTerminalNode;
import org.maltparserx.core.syntaxgraph.node.PhraseStructureNode;
import org.maltparserx.core.syntaxgraph.node.TokenNode;
import org.maltparserx.core.syntaxgraph.reader.TigerXMLHeader;
/**
*
*
* @author Johan Hall
*/
public class TigerXMLWriter implements SyntaxGraphWriter {
private enum RootHandling {
TALBANKEN, NORMAL
};
private BufferedWriter writer;
private DataFormatInstance dataFormatInstance;
private String optionString;
private int sentenceCount;
private TigerXMLHeader header;
// private boolean hasWriteTigerXMLHeader = false;
private RootHandling rootHandling;
private String sentencePrefix = "s";
private StringBuilder sentenceID;
private StringBuilder tmpID;
private StringBuilder rootID;
private int START_ID_OF_NONTERMINALS = 500;
private boolean labeledTerminalID;
private String VROOT_SYMBOL = "VROOT";
private boolean useVROOT = false;
// private String fileName = null;
// private String charsetName = null;
private boolean closeStream = true;
public TigerXMLWriter() {
sentenceID = new StringBuilder();
tmpID = new StringBuilder();
rootID = new StringBuilder();
labeledTerminalID = false;
}
public void open(String fileName, String charsetName) throws MaltChainedException {
try {
// this.fileName = fileName;
// this.charsetName = charsetName;
open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
} catch (FileNotFoundException e) {
throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
} catch (UnsupportedEncodingException e) {
throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
}
}
public void open(OutputStream os, String charsetName) throws MaltChainedException {
try {
if (os == System.out || os == System.err) {
closeStream = false;
}
open(new OutputStreamWriter(os, charsetName));
} catch (UnsupportedEncodingException e) {
throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
}
}
private void open(OutputStreamWriter osw) throws MaltChainedException {
setWriter(new BufferedWriter(osw));
setSentenceCount(0);
}
public void writeProlog() throws MaltChainedException {
// if (fileName == null || charsetName == null) {
writeHeader();
// }
}
public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
if (syntaxGraph == null || dataFormatInstance == null) {
return;
}
if (syntaxGraph.hasTokens()) {
sentenceCount++;
final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
try {
sentenceID.setLength(0);
sentenceID.append(sentencePrefix);
if (phraseStructure.getSentenceID() != 0) {
sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
} else {
sentenceID.append(Integer.toString(sentenceCount));
}
writer.write(" <s id=\"");
writer.write(sentenceID.toString());
writer.write("\">\n");
setRootID(phraseStructure);
writer.write(" <graph root=\"");
writer.write(rootID.toString());
writer.write("\" ");
writer.write("discontinuous=\"");
writer.write(Boolean.toString(!phraseStructure.isContinuous()));
writer.write("\">\n");
writeTerminals(phraseStructure);
if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
writeNonTerminals(phraseStructure);
} else {
writer.write(" <nonterminals/>\n");
}
writer.write(" </graph>\n");
writer.write(" </s>\n");
} catch (IOException e) {
throw new DataFormatException("The TigerXML writer could not write to file. ", e);
}
}
}
private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
useVROOT = false;
PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) {
useVROOT = true;
break;
}
}
if (useVROOT) {
rootID.setLength(0);
rootID.append(sentenceID);
rootID.append('_');
rootID.append(VROOT_SYMBOL);
} else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
rootID.setLength(0);
rootID.append(sentenceID);
rootID.append("_1");
} else {
rootID.setLength(0);
rootID.append(sentenceID);
rootID.append('_');
// if (rootHandling.equals(RootHandling.NORMAL)) {
rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
// } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
// rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
// }
}
}
public void writeEpilog() throws MaltChainedException {
writeTail();
}
public BufferedWriter getWriter() {
return writer;
}
public void setWriter(BufferedWriter writer) {
this.writer = writer;
}
public void close() throws MaltChainedException {
try {
if (writer != null) {
writer.flush();
if (closeStream) {
writer.close();
}
writer = null;
}
} catch (IOException e) {
throw new DataFormatException("Could not close the output file. ", e);
}
}
private void writeHeader() throws MaltChainedException {
try {
if (header == null) {
header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
}
writer.write(header.toTigerXML());
// hasWriteTigerXMLHeader = true;
} catch (IOException e) {
throw new DataFormatException("The TigerXML writer could not write to file. ", e);
}
}
private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
try {
writer.write(" <terminals>\n");
for (int index : phraseStructure.getTokenIndices()) {
final PhraseStructureNode t = phraseStructure.getTokenNode(index);
writer.write(" <t ");
if (!labeledTerminalID) {
tmpID.setLength(0);
tmpID.append(sentenceID);
tmpID.append('_');
tmpID.append(Integer.toString(t.getIndex()));
writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
}
for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
writer.write(column.getName().toLowerCase());
writer.write("=\"");
writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
writer.write("\" ");
}
writer.write("/>\n");
}
writer.write(" </terminals>\n");
} catch (IOException e) {
throw new DataFormatException("The TigerXML writer is not able to write. ", e);
}
}
public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
try {
SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
for (int index : phraseStructure.getNonTerminalIndices()) {
heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
}
writer.write(" <nonterminals>\n");
boolean done = false;
int h = 1;
while (!done) {
done = true;
for (int index : phraseStructure.getNonTerminalIndices()) {
if (heights.get(index) == h) {
NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
tmpID.setLength(0);
tmpID.append(sentenceID);
tmpID.append('_');
tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
writeNonTerminal(nt, tmpID.toString());
done = false;
}
}
h++;
}
writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
writer.write(" </nonterminals>\n");
} catch (IOException e) {
throw new DataFormatException("The TigerXML writer is not able to write. ", e);
}
}
public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
try {
writer.write(" <nt");
writer.write(" id=\"");writer.write(id);writer.write("\" ");
for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
if (nt.hasLabel(column.getSymbolTable())) {
writer.write(column.getName().toLowerCase());
writer.write("=");
writer.write("\"");
writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
writer.write("\" ");
}
}
writer.write(">\n");
for (int i = 0, n = nt.nChildren(); i < n; i++) {
PhraseStructureNode child = nt.getChild(i);
writer.write(" <edge ");
for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
if (child.hasParentEdgeLabel(column.getSymbolTable())) {
writer.write(column.getName().toLowerCase());
writer.write("=\"");
writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
writer.write("\" ");
}
}
if (child instanceof TokenNode) {
if (!labeledTerminalID) {
tmpID.setLength(0);
tmpID.append(sentenceID);
tmpID.append('_');
tmpID.append(Integer.toString(child.getIndex()));
writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
} else {
writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\"");
}
} else {
tmpID.setLength(0);
tmpID.append(sentenceID);
tmpID.append('_');
tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
}
writer.write(" />\n");
}
writer.write(" </nt>\n");
} catch (IOException e) {
throw new DataFormatException("The TigerXML writer is not able to write. ", e);
}
}
private void writeTail() throws MaltChainedException {
try {
writer.write(" </body>\n");
writer.write("</corpus>\n");
writer.flush();
// if (fileName != null && charsetName != null) {
// writer.close();
// writer = null;
// BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
// if (header == null) {
// header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
// }
//
// headerWriter.write(header.toTigerXML());
// headerWriter.flush();
// headerWriter.close();
// }
} catch (IOException e) {
throw new DataFormatException("The TigerXML writer is not able to write. ", e);
}
}
public int getSentenceCount() {
return sentenceCount;
}
public void setSentenceCount(int sentenceCount) {
this.sentenceCount = sentenceCount;
}
public DataFormatInstance getDataFormatInstance() {
return dataFormatInstance;
}
public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
this.dataFormatInstance = dataFormatInstance;
labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
}
public String getOptions() {
return optionString;
}
public void setOptions(String optionString) throws MaltChainedException {
this.optionString = optionString;
rootHandling = RootHandling.NORMAL;
String[] argv;
try {
argv = optionString.split("[_\\p{Blank}]");
} catch (PatternSyntaxException e) {
throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
}
for (int i=0; i < argv.length-1; i++) {
if(argv[i].charAt(0) != '-') {
throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
}
if(++i>=argv.length) {
throw new DataFormatException("The last argument does not have any value. ");
}
switch(argv[i-1].charAt(1)) {
case 'r':
if (argv[i].equals("n")) {
rootHandling = RootHandling.NORMAL;
} else if (argv[i].equals("tal")) {
rootHandling = RootHandling.TALBANKEN;
}
break;
case 's':
try {
START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
} catch (NumberFormatException e){
throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
}
break;
case 'v':
VROOT_SYMBOL = argv[i];
break;
default:
throw new DataFormatException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
}
}
}
}