/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.vxquery.xmlparser;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.vxquery.datamodel.accessors.TaggedValuePointable;
import org.apache.vxquery.datamodel.accessors.nodes.NodeTreePointable;
import org.apache.vxquery.datamodel.builders.nodes.AbstractNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.AttributeNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.CommentNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.DictionaryBuilder;
import org.apache.vxquery.datamodel.builders.nodes.DocumentNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.ElementNodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.PINodeBuilder;
import org.apache.vxquery.datamodel.builders.nodes.TextNodeBuilder;
import org.apache.vxquery.datamodel.values.ValueTag;
import org.apache.vxquery.types.BuiltinTypeQNames;
import org.apache.vxquery.types.ElementType;
import org.apache.vxquery.types.NameTest;
import org.apache.vxquery.types.NodeType;
import org.apache.vxquery.types.SequenceType;
import org.apache.vxquery.xmlquery.query.XQueryConstants;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.ext.LexicalHandler;
import edu.uci.ics.hyracks.api.comm.IFrameWriter;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
/**
 * SAX {@link ContentHandler} / {@link LexicalHandler} that turns parser events into the binary
 * node-tree representation produced by the VXQuery node builders.
 * <p>
 * The handler works in one of two modes, chosen by the child path steps given at construction:
 * <ul>
 * <li><b>Document mode</b> (no path steps): the whole document is built into an internal buffer.
 * It is emitted as a single tuple from {@link #endDocument()} when a frame and appender were
 * supplied, and can also be copied out with {@link #writeDocument(ArrayBackedValueStorage)}.</li>
 * <li><b>Sub-element mode</b> (one or more path steps): events are skipped until an element is
 * reached whose depth equals the number of steps and whose ancestors (and itself) match the steps
 * by namespace URI and local name. Each such element is built on its own and emitted as a tuple
 * when it ends.</li>
 * </ul>
 * Instances hold mutable parse state and are not designed for concurrent use.
 */
public class SAXContentHandler implements ContentHandler, LexicalHandler {
    // XML node builders
    private final AttributeNodeBuilder anb;
    private final CommentNodeBuilder cnb;
    private final DictionaryBuilder db;
    private final DocumentNodeBuilder docb;
    private final PINodeBuilder pinb;
    private final TextNodeBuilder tnb;
    // Builders of the currently open (non-skipped) elements, innermost last.
    private final List<ElementNodeBuilder> enbStack;
    // Pool of element builders available for reuse.
    private final List<ElementNodeBuilder> freeENBList;

    // Frame writing variables
    private FrameTupleAppender appender;
    private ByteBuffer frame;
    private FrameTupleAccessor fta;
    private int tupleIndex;
    private IFrameWriter writer;

    // Element writing and path step variables
    // True while events are being ignored (outside of any element selected for output).
    private boolean skipping;
    // Per path step: expected local name, expected namespace URI, and whether the element
    // currently open at that depth matches the step. All three are null in document mode.
    private String[] childLocalName = null;
    private String[] childUri = null;
    private boolean[] subElement = null;
    private final TaggedValuePointable tvp;

    // Basic tracking and setting variables
    private final boolean attachTypes;
    // Accumulates character data until the next non-text event flushes it as one text node.
    private final StringBuilder buffer;
    private final boolean createNodeIds;
    // Number of currently open elements (1 for the root element).
    private int depth;
    private final ArrayBackedValueStorage docABVS;
    private final ArrayBackedValueStorage elementABVS;
    private boolean pendingText;
    private int nodeIdCounter;
    private final ITreeNodeIdProvider nodeIdProvider;
    private final ArrayBackedValueStorage resultABVS;
    private final ArrayBackedValueStorage tempABVS;

    /**
     * Creates a handler in document mode without frame output.
     *
     * @param attachTypes
     *            whether type names are recorded on element and attribute nodes
     * @param nodeIdProvider
     *            source of tree ids; when {@code null} no node ids are written
     */
    public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider) {
        // XML node builders
        anb = new AttributeNodeBuilder();
        cnb = new CommentNodeBuilder();
        db = new DictionaryBuilder();
        docb = new DocumentNodeBuilder();
        pinb = new PINodeBuilder();
        tnb = new TextNodeBuilder();
        enbStack = new ArrayList<ElementNodeBuilder>();
        freeENBList = new ArrayList<ElementNodeBuilder>();

        // Element writing and path step variables
        skipping = true;
        tvp = (TaggedValuePointable) TaggedValuePointable.FACTORY.createPointable();

        // Basic tracking and setting variables
        this.attachTypes = attachTypes;
        buffer = new StringBuilder();
        createNodeIds = nodeIdProvider != null;
        depth = 0;
        docABVS = new ArrayBackedValueStorage();
        elementABVS = new ArrayBackedValueStorage();
        pendingText = false;
        nodeIdCounter = 0;
        this.nodeIdProvider = nodeIdProvider;
        resultABVS = new ArrayBackedValueStorage();
        tempABVS = new ArrayBackedValueStorage();
    }

    /**
     * Creates a handler that emits its results as tuples into the given frame.
     * {@link #setupElementWriter(IFrameWriter, FrameTupleAccessor, int)} must be called before
     * parsing so that tuples can actually be written.
     *
     * @param attachTypes
     *            whether type names are recorded on element and attribute nodes
     * @param nodeIdProvider
     *            source of tree ids; when {@code null} no node ids are written
     * @param frame
     *            frame the appender writes into
     * @param appender
     *            appender used to add result tuples to {@code frame}
     * @param childSequenceTypes
     *            element path steps selecting the elements to emit; {@code null} or empty selects
     *            document mode
     */
    public SAXContentHandler(boolean attachTypes, ITreeNodeIdProvider nodeIdProvider, ByteBuffer frame,
            FrameTupleAppender appender, List<SequenceType> childSequenceTypes) {
        this(attachTypes, nodeIdProvider);

        // Frame writing variables
        this.frame = frame;
        this.appender = appender;
        setChildPathSteps(childSequenceTypes);
    }

    /**
     * Records the namespace URI and local name of every path step. Leaves the handler in document
     * mode when there are no steps.
     */
    private void setChildPathSteps(List<SequenceType> childSeq) {
        if (childSeq == null || childSeq.isEmpty()) {
            return;
        }
        final int steps = childSeq.size();
        subElement = new boolean[steps];
        childUri = new String[steps];
        childLocalName = new String[steps];

        int index = 0;
        for (SequenceType sType : childSeq) {
            NodeType nodeType = (NodeType) sType.getItemType();
            ElementType eType = (ElementType) nodeType;
            NameTest nameTest = eType.getNameTest();
            childUri[index] = getStringFromBytes(nameTest.getUri());
            childLocalName[index] = getStringFromBytes(nameTest.getLocalName());
            ++index;
        }
    }

    /**
     * Supplies the output target used when result tuples are appended.
     *
     * @param writer
     *            receives the frame whenever it fills up
     * @param fta
     *            accessor of the input tuple whose fields are copied in front of each result
     * @param tupleIndex
     *            index of that input tuple within {@code fta}
     */
    public void setupElementWriter(IFrameWriter writer, FrameTupleAccessor fta, int tupleIndex) {
        this.writer = writer;
        this.fta = fta;
        this.tupleIndex = tupleIndex;
    }

    /**
     * Buffers character data; consecutive calls are merged into a single text node that is
     * written by the next structural event.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (skipping) {
            return;
        }
        buffer.append(ch, start, length);
        pendingText = true;
    }

    /**
     * Completes the document node and, when a frame and appender are configured, emits it as a
     * tuple. Does nothing while skipping (which is always the case here in sub-element mode).
     */
    @Override
    public void endDocument() throws SAXException {
        if (skipping) {
            return;
        }
        try {
            flushText();
            docb.endChildrenChunk();
            docb.finish();
            if (frame != null && appender != null) {
                writeElement();
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Clears the match flag of the path step at the current depth, if that depth is tracked.
     */
    private void endElementChildPathStep() {
        if (subElement != null && depth >= 1 && depth <= subElement.length) {
            subElement[depth - 1] = false;
        }
    }

    /**
     * Closes the current element. If it is the element selected by the path steps it is finished
     * and emitted as a tuple; otherwise it is closed as a child of its parent.
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if (skipping) {
            // A skipped element may still have matched its path step (e.g. a matching ancestor),
            // so its flag has to be cleared just like for a written element.
            endElementChildPathStep();
            --depth;
            return;
        }
        try {
            // Must be evaluated before writeElement()/endElementChildPathStep() change the state.
            boolean nonSkipped = foundFirstNonSkippedElement();
            flushText();
            ElementNodeBuilder enb = enbStack.remove(enbStack.size() - 1);
            enb.endChildrenChunk();
            endChildInParent(enb, nonSkipped);
            freeENB(enb);
            if (nonSkipped) {
                writeElement();
            }
            endElementChildPathStep();
        } catch (IOException e) {
            throw new SAXException(e);
        }
        --depth;
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
    }

    /**
     * Adds a processing-instruction node with the given target and content to the current parent.
     */
    @Override
    public void processingInstruction(String target, String data) throws SAXException {
        if (skipping) {
            return;
        }
        try {
            flushText();
            startChildInParent(pinb);
            tempABVS.reset();
            tempABVS.getDataOutput().writeUTF(target);
            if (createNodeIds) {
                pinb.setLocalNodeId(nodeIdCounter++);
            }
            pinb.setTarget(tempABVS);
            tempABVS.reset();
            tempABVS.getDataOutput().writeUTF(data);
            pinb.setContent(tempABVS);
            endChildInParent(pinb);
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    @Override
    public void setDocumentLocator(Locator locator) {
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
    }

    /**
     * Prepares the handler for a new document: resets the per-document parse state and the
     * dictionary and, in document mode, opens the document node.
     */
    @Override
    public void startDocument() throws SAXException {
        resetParseState();
        db.reset();
        docABVS.reset();
        if (skipping) {
            return;
        }
        try {
            docb.reset(docABVS);
            if (createNodeIds) {
                docb.setLocalNodeId(nodeIdCounter++);
            }
            docb.startChildrenChunk();
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Restores the state expected at the beginning of a document. After a parse that ran to
     * completion every assignment here is a no-op; it only matters when the handler is reused
     * after a parse that was aborted by an exception. The node id counter is intentionally left
     * untouched.
     */
    private void resetParseState() {
        depth = 0;
        pendingText = false;
        buffer.setLength(0);
        // Return builders left over from an aborted parse to the pool.
        while (!enbStack.isEmpty()) {
            freeENB(enbStack.remove(enbStack.size() - 1));
        }
        // Document mode builds everything; sub-element mode skips until a path match is found.
        skipping = subElement != null;
    }

    /**
     * Updates the path-step flag for the element being opened at the current depth and reports
     * whether this element is the one selected by the complete path.
     *
     * @return {@code true} when a new output element starts here; skipping is switched off in
     *         that case
     */
    private boolean startElementChildPathStep(String uri, String localName) {
        if (subElement != null && depth >= 1 && depth <= subElement.length) {
            final int step = depth - 1;
            // Assign in both directions: a flag left over from an earlier sibling must not make
            // a non-matching element look like part of the path.
            subElement[step] = childUri[step].equals(uri) && childLocalName[step].equals(localName);
        }
        boolean start = foundFirstNonSkippedElement();
        if (start) {
            skipping = false;
        }
        return start;
    }

    /**
     * Opens an element node (as a new output tree when it is the element selected by the path
     * steps, otherwise as a child of the current parent), writes its name, optional type and node
     * id, and all of its attributes.
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
        ++depth;
        boolean start = startElementChildPathStep(uri, localName);
        if (skipping) {
            return;
        }
        try {
            flushText();
            ElementNodeBuilder enb = createENB();
            startChildInParent(enb, start);
            int uriCode = db.lookup(uri);
            int localNameCode = db.lookup(localName);
            int prefixCode = db.lookup(prefixOf(name));
            enb.setName(uriCode, localNameCode, prefixCode);
            if (attachTypes) {
                int typeUriCode = db.lookup(XQueryConstants.XS_NSURI);
                int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_STR);
                int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX);
                enb.setType(typeUriCode, typeLocalNameCode, typePrefixCode);
            }
            if (createNodeIds) {
                enb.setLocalNodeId(nodeIdCounter++);
            }
            writeAttributes(enb, atts);
            enb.startChildrenChunk();
            enbStack.add(enb);
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the attribute chunk of {@code enb}: one attribute node per SAX attribute, with its
     * value stored as an untyped-atomic tagged string.
     */
    private void writeAttributes(ElementNodeBuilder enb, Attributes atts) throws IOException {
        enb.startAttributeChunk();
        final int nAttrs = atts.getLength();
        for (int i = 0; i < nAttrs; ++i) {
            // Dictionary lookups are kept in this exact order (prefix, local name, URI, then
            // type names) so that the codes assigned by the dictionary do not change.
            int aPrefixCode = db.lookup(prefixOf(atts.getQName(i)));
            int aLocalNameCode = db.lookup(atts.getLocalName(i));
            int aUriCode = db.lookup(atts.getURI(i));
            String aValue = atts.getValue(i);
            tempABVS.reset();
            DataOutput tempOut = tempABVS.getDataOutput();
            tempOut.write(ValueTag.XS_UNTYPED_ATOMIC_TAG);
            tempOut.writeUTF(aValue);
            enb.startAttribute(anb);
            anb.setName(aUriCode, aLocalNameCode, aPrefixCode);
            if (attachTypes) {
                int typeUriCode = db.lookup(XQueryConstants.XS_NSURI);
                int typeLocalNameCode = db.lookup(BuiltinTypeQNames.UNTYPED_ATOMIC_STR);
                int typePrefixCode = db.lookup(XQueryConstants.XS_PREFIX);
                anb.setType(typeUriCode, typeLocalNameCode, typePrefixCode);
            }
            if (createNodeIds) {
                anb.setLocalNodeId(nodeIdCounter++);
            }
            anb.setValue(tempABVS);
            enb.endAttribute(anb);
        }
        enb.endAttributeChunk();
    }

    /**
     * Returns the part of a qualified name before the first colon, or the empty string when the
     * name has no colon.
     */
    private static String prefixOf(String qName) {
        int idx = qName.indexOf(':');
        return idx < 0 ? "" : qName.substring(0, idx);
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
    }

    /**
     * Adds a comment node with the given text to the current parent.
     */
    @Override
    public void comment(char[] ch, int start, int length) throws SAXException {
        if (skipping) {
            return;
        }
        try {
            flushText();
            startChildInParent(cnb);
            // A local string is used instead of the shared text buffer so that a failure below
            // cannot leave comment text behind to be mixed into the next text node.
            tempABVS.reset();
            tempABVS.getDataOutput().writeUTF(new String(ch, start, length));
            if (createNodeIds) {
                cnb.setLocalNodeId(nodeIdCounter++);
            }
            cnb.setValue(tempABVS);
            endChildInParent(cnb);
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the buffered character data, if any, as one text node under the current parent and
     * clears the buffer.
     */
    private void flushText() throws IOException {
        if (!pendingText) {
            return;
        }
        // Use the same parent selection as comments and processing instructions, so an empty
        // element stack falls back to the document node instead of failing on an index of -1.
        startChildInParent(tnb);
        tempABVS.reset();
        tempABVS.getDataOutput().writeUTF(buffer.toString());
        if (createNodeIds) {
            tnb.setLocalNodeId(nodeIdCounter++);
        }
        tnb.setValue(tempABVS);
        endChildInParent(tnb);
        buffer.setLength(0);
        pendingText = false;
    }

    @Override
    public void endCDATA() throws SAXException {
    }

    @Override
    public void endDTD() throws SAXException {
    }

    @Override
    public void endEntity(String name) throws SAXException {
    }

    @Override
    public void startCDATA() throws SAXException {
    }

    @Override
    public void startDTD(String name, String publicId, String systemId) throws SAXException {
    }

    @Override
    public void startEntity(String name) throws SAXException {
    }

    /**
     * Serializes the finished tree (the document in document mode, otherwise the current
     * element) as a node-tree value, appends it as a tuple and switches back to skipping.
     *
     * @throws IOException
     *             if the value cannot be written or the tuple does not fit into an empty frame
     */
    public void writeElement() throws IOException {
        resultABVS.reset();
        DataOutput out = writeNodeTreeHeader(resultABVS);
        ArrayBackedValueStorage tree = subElement == null ? docABVS : elementABVS;
        out.write(tree.getByteArray(), tree.getStartOffset(), tree.getLength());
        tvp.set(resultABVS.getByteArray(), resultABVS.getStartOffset(), resultABVS.getLength());
        addNodeToTuple(tvp, tupleIndex);
        skipping = true;
    }

    /**
     * Appends the built document as a node-tree value to {@code abvs}. The storage is not reset
     * first.
     *
     * @param abvs
     *            storage that receives the serialized node tree
     * @throws IOException
     *             if writing to the storage fails
     */
    public void writeDocument(ArrayBackedValueStorage abvs) throws IOException {
        DataOutput out = writeNodeTreeHeader(abvs);
        out.write(docABVS.getByteArray(), docABVS.getStartOffset(), docABVS.getLength());
    }

    /**
     * Writes the part of a node-tree value that precedes the root node: the value tag, the header
     * flags byte, the tree id (only when node ids are enabled) and the dictionary.
     *
     * @return the data output of {@code abvs}, positioned for the root node bytes
     */
    private DataOutput writeNodeTreeHeader(ArrayBackedValueStorage abvs) throws IOException {
        DataOutput out = abvs.getDataOutput();
        out.write(ValueTag.NODE_TREE_TAG);
        byte header = NodeTreePointable.HEADER_DICTIONARY_EXISTS_MASK;
        if (attachTypes) {
            header |= NodeTreePointable.HEADER_TYPE_EXISTS_MASK;
        }
        if (createNodeIds) {
            header |= NodeTreePointable.HEADER_NODEID_EXISTS_MASK;
        }
        out.write(header);
        if (createNodeIds) {
            out.writeInt(nodeIdProvider.getId());
        }
        db.write(abvs);
        return out;
    }

    /** Takes an element builder from the pool, creating one when the pool is empty. */
    private ElementNodeBuilder createENB() {
        if (freeENBList.isEmpty()) {
            return new ElementNodeBuilder();
        }
        return freeENBList.remove(freeENBList.size() - 1);
    }

    /** Returns an element builder to the pool. */
    private void freeENB(ElementNodeBuilder enb) {
        freeENBList.add(enb);
    }

    /** Returns the builder of the innermost open element; the stack must not be empty. */
    private ElementNodeBuilder peekENBStackTop() {
        return enbStack.get(enbStack.size() - 1);
    }

    private void startChildInParent(AbstractNodeBuilder nb) throws IOException {
        startChildInParent(nb, false);
    }

    /**
     * Starts {@code nb} either as the root of a new output element (written to the element
     * buffer), as a child of the document node (no open element), or as a child of the innermost
     * open element.
     */
    private void startChildInParent(AbstractNodeBuilder nb, boolean startNewElement) throws IOException {
        if (startNewElement) {
            elementABVS.reset();
            nb.reset(elementABVS);
        } else if (enbStack.isEmpty()) {
            docb.startChild(nb);
        } else {
            peekENBStackTop().startChild(nb);
        }
    }

    private void endChildInParent(AbstractNodeBuilder nb) throws IOException {
        endChildInParent(nb, false);
    }

    /**
     * Counterpart of {@link #startChildInParent(AbstractNodeBuilder, boolean)}: finishes a
     * stand-alone output element or closes the child in its parent.
     */
    private void endChildInParent(AbstractNodeBuilder nb, boolean endNewElement) throws IOException {
        if (endNewElement) {
            nb.finish();
        } else if (enbStack.isEmpty()) {
            docb.endChild(nb);
        } else {
            peekENBStackTop().endChild(nb);
        }
    }

    /**
     * Appends the result tuple; when the frame is full it is flushed to the writer and the append
     * is retried once on the emptied frame.
     */
    private void addNodeToTuple(TaggedValuePointable result, int t) throws HyracksDataException {
        // Send to the writer.
        if (addNodeToTupleAppender(result, t)) {
            return;
        }
        FrameUtils.flushFrame(frame, writer);
        appender.reset(frame, true);
        if (!addNodeToTupleAppender(result, t)) {
            throw new HyracksDataException("Could not write frame.");
        }
    }

    /**
     * Appends all fields of input tuple {@code t} followed by the result value.
     *
     * @return {@code false} as soon as one field does not fit into the frame
     */
    private boolean addNodeToTupleAppender(TaggedValuePointable result, int t) throws HyracksDataException {
        // First copy all new fields over.
        final int fieldCount = fta.getFieldCount();
        for (int f = 0; f < fieldCount; ++f) {
            if (!appender.appendField(fta, t, f)) {
                return false;
            }
        }
        return appender.appendField(result.getByteArray(), result.getStartOffset(), result.getLength());
    }

    /** Decodes a string stored in {@code bytes} starting at offset 0. */
    private String getStringFromBytes(byte[] bytes) {
        StringBuilder sb = new StringBuilder();
        UTF8StringPointable.toString(sb, bytes, 0);
        return sb.toString();
    }

    /**
     * Determines if the correct path step is active: the current depth equals the number of path
     * steps and every step is flagged as matched.
     */
    private boolean foundFirstNonSkippedElement() {
        if (subElement == null || subElement.length != depth) {
            // Not the correct depth.
            return false;
        }
        for (boolean b : subElement) {
            if (!b) {
                // Found a path step that did not match.
                return false;
            }
        }
        return true;
    }
}