Package org.apache.uima.cas.impl

Source Code of org.apache.uima.cas.impl.XCASSerializer$XCASDocSerializer

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.cas.impl;

import java.io.IOException;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.internal.util.IntStack;
import org.apache.uima.internal.util.IntVector;
import org.apache.uima.internal.util.StringUtils;
import org.apache.uima.internal.util.rb_trees.IntRedBlackTree;
import org.apache.uima.util.XMLSerializer;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

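/**
* XCAS serializer. Create a serializer from a type system, then encode individual CASes by writing
* to a SAX content handler. All per-document state is held in a fresh inner serializer object for
* each call to serialize(). Note, however, that the child count reported by
* {@link #getNumChildren()} is stored on this shared instance and is overwritten by every element
* start, so it is not reliable when several threads serialize through the same instance.
*/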
public class XCASSerializer {

  private int numChildren;

  public int getNumChildren() {
    return numChildren;
  }

  /**
   * Use an inner class to hold the data for serializing a CAS. Each call to serialize() creates its
   * own instance.
   *
   *
   */
  private class XCASDocSerializer {

    // Where the output goes: the SAX content handler receiving all generated events.
    private ContentHandler ch;

    // The CAS we're serializing.
    private CASImpl cas;

    // Any FS reference we've touched goes in here. Maps an FS address to the number of the index
    // repository it was found in, or to NOT_INDEXED / MULTIPLY_INDEXED.
    private IntRedBlackTree queued;

    // Value stored in "queued" for an FS that was reached through a reference only.
    private static final int NOT_INDEXED = -1;

    // Value stored in "queued" for an FS found in more than one index repository; the
    // repositories themselves are listed in "dupVectors".
    private static final int MULTIPLY_INDEXED = -2;

    // Never stored in "queued"; used as a probe value when only the presence of the key matters.
    private static final int INVALID_INDEX = -3;

    // Any FS indexed in more than one IR goes in here. Maps the FS address to its position in
    // "dupVectors".
    private IntRedBlackTree duplicates;

    // Number of FS indexed in more than one IR
    int numDuplicates;

    // Vector of IntVectors for duplicates; each IntVector lists the index repositories of one FS.
    Vector dupVectors;

    // All FSs that are in an index somewhere.
    private IntVector indexedFSs;

    // Specific IndexRepository for indexed FSs; entry i belongs to indexedFSs entry i.
    private IntVector indexReps;

    // The current queue for FSs to write out (FSs reached by reference rather than by index).
    private IntStack queue;

    // SofaFS type
    private int sofaTypeCode;

    // Shared attribute list without entries, used for array element tags.
    private final AttributesImpl emptyAttrs = new AttributesImpl();

    // Scratch attribute list, cleared and refilled for every FS that is encoded.
    private AttributesImpl workAttrs = new AttributesImpl();

    // SAX attribute type reported for every attribute.
    private static final String cdataType = "CDATA";

    // For debug statistics: number of FSs passed to encodeFS.
    private int fsCount = 0;

    // Out-Of-TypeSystem Data to be included in produced XCAS. (APL)
    private OutOfTypeSystemData mOutOfTypeSystemData;

    // We write to a SAXDocStack, a simplified interface to a
    // ContentHandler.
    private XCASDocSerializer(ContentHandler ch, CASImpl cas) {
      super();
      this.ch = ch;
      this.cas = cas;
      this.queued = new IntRedBlackTree();
      this.duplicates = new IntRedBlackTree();
      this.numDuplicates = 0;
      this.dupVectors = new Vector();
      this.queue = new IntStack();
      this.indexedFSs = new IntVector();
      this.indexReps = new IntVector();
      this.sofaTypeCode = cas.ll_getTypeSystem().ll_getCodeForType(
              cas.getTypeSystem().getType(CAS.TYPE_NAME_SOFA));

      // Why was this here? Was never being read anywhere:
      // fs = new FeatureStructureImplC(cas, 0);
    }

    /**
     * Add an address to the queue.
     *
     * @param addr
     *          The address.
     * @return <code>false</code> iff we've seen this address before.
     */
    private boolean enqueue(int addr) {
      if (KEY_ONLY_MATCH == isQueued(addr, INVALID_INDEX)) {
        return false;
      }
      int heapVal = cas.getHeapValue(addr);
      // at this point we don't know if this FS is indexed
      queued.put(addr, NOT_INDEXED);
      queue.push(addr);
      final int typeClass = classifyType(heapVal);
      if (typeClass == LowLevelCAS.TYPE_CLASS_FS) {
        if (mOutOfTypeSystemData != null) {
          enqueueOutOfTypeSystemFeatures(addr);
        }
        enqueueFeatures(addr, heapVal);
      } else if (typeClass == LowLevelCAS.TYPE_CLASS_FSARRAY) {
        enqueueFSArray(addr);
      }
      return true;
    }

    /**
     * Same as enqueue, but for indexed FSs.
     *
     * @param addr
     *          The address to enqueue.
     */
    private void enqueueIndexed(int addr, int indexRep) {
      int status = isQueued(addr, indexRep);
      switch (status) {
        case KEY_NOT_FOUND: // most common case, key not found
          queued.put(addr, indexRep);
          indexedFSs.add(addr);
          indexReps.add(indexRep);
          break;

        case KEY_AND_VALUE_MATCH: // next most common, FS already queued
          break;
        case KEY_ONLY_MATCH: // key is there, indexRep not
          int prevIndex = queued.get(addr);
          if (NOT_INDEXED == prevIndex) {
            // this addr added from a previously found reference
            queued.put(addr, indexRep); // set with given index
            break;
          }
          if (MULTIPLY_INDEXED == prevIndex) {
            // this addr already indexed more than once
            int thisDup = duplicates.get(addr);
            ((IntVector) dupVectors.get(thisDup)).add(indexRep);
            break;
          }
          // duplicate index detected!
          duplicates.put(addr, numDuplicates);
          dupVectors.add(new IntVector());
          ((IntVector) dupVectors.get(numDuplicates)).add(prevIndex);
          ((IntVector) dupVectors.get(numDuplicates)).add(indexRep);
          numDuplicates++;
          queued.put(addr, MULTIPLY_INDEXED); // mark this addr as multiply indexed
          break;
      }
      return;
    }

    /**
     * Bad name; check if we've seen this (address, value) before.
     *
     * @param addr
     *          The address.
     * @param value
     *          The index repository
     * @return KEY_AND_VALUE_MATCH iff we've seen (address, value) before. KEY_NOT_FOUND iff the
     *         address has not been seen before. KEY_ONLY_MATCH iff the address has been seen before
     *         with a different value.
     */
    private static final int KEY_AND_VALUE_MATCH = 1;

    private static final int KEY_ONLY_MATCH = -1;

    private static final int KEY_NOT_FOUND = 0;

    private int isQueued(int addr, int value) {
      return containsKeyValuePair(this.queued, addr, value);
    }

    // returns
    // KEY_AND_VALUE_MATCH = 1;
    // KEY_ONLY_MATCH = -1;
    // KEY_NOT_FOUND = 0;
    private final int containsKeyValuePair(IntRedBlackTree rbt, int key, int value) {
      if (rbt.containsKey(key)) {
        if (rbt.get(key) == value) {
          return KEY_AND_VALUE_MATCH;
        }
        return KEY_ONLY_MATCH;
      }
      return KEY_NOT_FOUND;
    }

    /**
     * Version of serialize which also includes OutOfTypeSystemData (obtained from previous
     * deserialization) in the produced XCAS.
     *
     * @throws XMLException
     * @throws IOException
     * @throws SAXException
     */
    private void serialize(boolean encodeDoc, OutOfTypeSystemData outOfTypeSystemData)
            throws IOException, SAXException {
      mOutOfTypeSystemData = outOfTypeSystemData;

      int iElementCount = 0;

      enqueueIndexed();
      enqueueFeaturesOfIndexed();
      if (outOfTypeSystemData != null) {
        // Queues out of type system data.
        int nextId = cas.getHeap().getCurrentTempSize();
        Iterator it = outOfTypeSystemData.fsList.iterator();
        while (it.hasNext()) {
          FSData fs = (FSData) it.next();
          String newId = Integer.toString(nextId++);
          outOfTypeSystemData.idMap.put(fs.id, newId);
          fs.id = newId;
        }
        iElementCount += outOfTypeSystemData.fsList.size();
        enqueueOutOfTypeSystemData(outOfTypeSystemData);
      }
      iElementCount += indexedFSs.size();
      iElementCount += queue.size();

      AttributesImpl rootAttrs = new AttributesImpl();
      rootAttrs.addAttribute(null, null, VERSION_ATTR, cdataType, CURRENT_VERSION);
      startElement(casTagName, rootAttrs, iElementCount);

      // continue with serialization
      encodeIndexed(); // encodes indexedFSs.size() elements
      encodeQueued(); // encodes queue.size() elements
      if (outOfTypeSystemData != null) {
        // encodes aData.fsList.size() elements
        serializeOutOfTypeSystemData(outOfTypeSystemData);
      }
      endElement(casTagName);
    }

    private void addText(String text) throws SAXException {
      ch.characters(text.toCharArray(), 0, text.length());
    }

    private String replaceInvalidXmlChars(String aString) {
      // first do a scan, so we don't have to change anything if there are
      // no
      // bad charactes
      boolean controlCharFound = false;
      for (int i = 0; i < aString.length(); i++) {
        if (!isValidXmlChar(aString.charAt(i))) {
          controlCharFound = true;
          break;
        }
      }
      if (!controlCharFound) {
        return aString;
      }

      // bad character was found, do another pass and replace all bad
      // chars
      char[] chars = aString.toCharArray();
      for (int i = 0; i < chars.length; i++) {
        if (!isValidXmlChar(chars[i])) {
          // replace invalid XML char with unicode replacement char
          chars[i] = 0xFFFD;
        }
      }
      return new String(chars);
    }

    private boolean isValidXmlChar(char c) {
      return (c >= 0x20 && c < 0xFFFE) || c == 0x09 || c == 0x0A || c == 0x0D;
    }

    private void addAttribute(AttributesImpl attrs, String attrName, String attrValue) {
      // special case: if attrName is "sofaString", we need to check for
      // invalid
      // XML characters in the data, and replace them
      if (CAS.FEATURE_BASE_NAME_SOFASTRING.equals(attrName)) {
        attrValue = replaceInvalidXmlChars(attrValue);
      }
      attrs.addAttribute(null, null, attrName, cdataType, attrValue);
    }

    private void startElement(String tag, Attributes attrs, int num) throws SAXException {
      numChildren = num;
      ch.startElement("", "", tag, attrs);
    }

    private void endElement(String tag) throws SAXException {
      ch.endElement("", "", tag);
    }

    /**
     * Encode the indexed FS in the queue.
     *
     * @throws XMLException
     * @throws IOException
     * @throws SAXException
     */
    private void encodeIndexed() throws IOException, SAXException {
      final int max = indexedFSs.size();
      for (int i = 0; i < max; i++) {
        if (MULTIPLY_INDEXED != queued.get(indexedFSs.get(i))) {
          IntVector iv = new IntVector(1);
          iv.add(indexReps.get(i));
          encodeFS(indexedFSs.get(i), iv);
        } else {
          int thisDup = duplicates.get(indexedFSs.get(i));
          encodeFS(indexedFSs.get(i), (IntVector) dupVectors.get(thisDup));
        }
      }
    }

    /**
     * Push the indexed FSs onto the queue.
     */
    private void enqueueIndexed() {
      FSIndexRepositoryImpl ir = (FSIndexRepositoryImpl) cas.getBaseCAS().getBaseIndexRepository();
      int[] fsarray = ir.getIndexedFSs();
      for (int k = 0; k < fsarray.length; k++) {
        enqueueIndexed(fsarray[k], 0);
      }

      // Get indexes for each SofaFS in the CAS
      int numViews = cas.getBaseSofaCount();
      for (int sofaNum = 1; sofaNum <= numViews; sofaNum++) {
        FSIndexRepositoryImpl loopIR = (FSIndexRepositoryImpl) cas.getBaseCAS()
                .getSofaIndexRepository(sofaNum);
        if (loopIR != null) {
          fsarray = loopIR.getIndexedFSs();
          for (int k = 0; k < fsarray.length; k++) {
            enqueueIndexed(fsarray[k], sofaNum);
          }
        }
      }
    }

    private void enqueueFeaturesOfIndexed() {
      final int max = indexedFSs.size();
      for (int i = 0; i < max; i++) {
        int addr = indexedFSs.get(i);
        int heapVal = cas.getHeapValue(addr);
        final int typeClass = classifyType(heapVal);
        if (typeClass == LowLevelCAS.TYPE_CLASS_FS) {
          if (mOutOfTypeSystemData != null) {
            enqueueOutOfTypeSystemFeatures(addr);
          }
          enqueueFeatures(addr, heapVal);
        } else if (typeClass == LowLevelCAS.TYPE_CLASS_FSARRAY) {
          enqueueFSArray(addr);
        }
      }
    }

    /**
     * Encode all other enqueued (non-indexed) FSs.
     *
     * @throws XMLException
     * @throws IOException
     * @throws SAXException
     */
    private void encodeQueued() throws IOException, SAXException {
      int addr;
      while (!queue.empty()) {
        addr = queue.pop();
        encodeFS(addr, null);
      }
    }

    /**
     * Encode an individual FS.
     *
     * @param addr
     *          The address to be encoded.
     * @param isIndexed
     *          If the FS is indexed or not.
     * @throws XMLException
     * @throws IOException
     * @throws SAXException
     */
    private void encodeFS(int addr, IntVector indexRep) throws IOException, SAXException {
      ++fsCount;
      workAttrs.clear();
      // Create an element with the type name as tag.
      // xmlStack.pushElementNode(getTypeName(addr));
      // Add indexed info.

      // if (sofaTypeCode == cas.getHeapValue(addr) &&
      // cas.isBackwardCompatibleCas()) {
      // // Don't encode sofaFS if old style application
      // return;
      // }

      if (indexRep != null) {
        if (indexRep.size() == 1) {
          // xmlStack.addAttribute(INDEXED_ATTR_NAME, TRUE_VALUE);
          addAttribute(workAttrs, INDEXED_ATTR_NAME, Integer.toString(indexRep.get(0)));
        } else {
          String multIndex = Integer.toString(indexRep.get(0));
          for (int mi = 1; mi < indexRep.size(); mi++) {
            multIndex += " " + Integer.toString(indexRep.get(mi));
          }
          addAttribute(workAttrs, INDEXED_ATTR_NAME, multIndex);
        }
      }
      // Add ID attribute. We do this for every FS, since otherwise we
      // would
      // have to do a complete traversal of the heap to find out which FSs
      // is
      // actually referenced.
      // xmlStack.addAttribute(ID_ATTR_NAME, Integer.toString(addr));
      addAttribute(workAttrs, ID_ATTR_NAME, Integer.toString(addr));
      final int typeClass = classifyType(cas.getHeapValue(addr));
      // Call special code according to the type of the FS (special
      // treatment
      // for arrays).
      switch (typeClass) {
        case LowLevelCAS.TYPE_CLASS_FS: {
          String typeName = getTypeName(addr);
          encodeFeatures(addr, workAttrs);
          if (mOutOfTypeSystemData != null) {
            encodeOutOfTypeSystemFeatures(addr, workAttrs); // APL
          }
          String xcasElementName = getXCasElementName(typeName);
          startElement(xcasElementName, workAttrs, 0);
          // xmlStack.commitNode();
          endElement(xcasElementName);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_INTARRAY: {
          IntArrayFSImpl fs = new IntArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          // encodeIntArray(addr, workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_FLOATARRAY: {
          FloatArrayFSImpl fs = new FloatArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          // encodeFloatArray(addr, workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_STRINGARRAY: {
          StringArrayFSImpl fs = new StringArrayFSImpl(addr, cas);
          String[] data = fs.toArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          // encodeStringArray(addr, workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_FSARRAY: {
          encodeFSArray(addr, workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_BOOLEANARRAY: {
          BooleanArrayFSImpl fs = new BooleanArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_BYTEARRAY: {
          ByteArrayFSImpl fs = new ByteArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_SHORTARRAY: {
          ShortArrayFSImpl fs = new ShortArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_LONGARRAY: {
          LongArrayFSImpl fs = new LongArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          break;
        }
        case LowLevelCAS.TYPE_CLASS_DOUBLEARRAY: {
          DoubleArrayFSImpl fs = new DoubleArrayFSImpl(addr, cas);
          String[] data = fs.toStringArray();
          encodePrimitiveTypeArrayFS(data, getTypeName(addr), workAttrs);
          break;
        }
        default: {
          // Internal error.
          System.err.println("Error classifying FS type.");
        }
      }
      // xmlStack.popNode();

    }

    private void encodePrimitiveTypeArrayFS(String[] data, String typeName, AttributesImpl attrs)
            throws SAXException {

      addAttribute(attrs, ARRAY_SIZE_ATTR, Integer.toString(data.length));
      startElement(typeName, attrs, data.length);

      for (int i = 0; i < data.length; i++) {
        startElement(ARRAY_ELEMENT_TAG, emptyAttrs, 1);
        addText(data[i]);
        endElement(ARRAY_ELEMENT_TAG);
      }
      endElement(typeName);
    }

    private void encodeFSArray(int addr, AttributesImpl attrs) throws SAXException {
      final String typeName = getTypeName(addr);
      final int size = cas.ll_getArraySize(addr);
      int pos = cas.getArrayStartAddress(addr);
      // xmlStack.addAttribute(ARRAY_SIZE_ATTR, Integer.toString(size));
      // xmlStack.commitNode();
      addAttribute(attrs, ARRAY_SIZE_ATTR, Integer.toString(size));
      startElement(typeName, attrs, size);
      for (int i = 0; i < size; i++) {
        String val = null;
        // xmlStack.pushTextNode(ARRAY_ELEMENT_TAG);
        // xmlStack.commitNode();
        int heapVal = cas.getHeapValue(pos);
        if (heapVal == CASImpl.NULL && mOutOfTypeSystemData != null) {
          // This array element may have been a reference to an OOTS
          // FS.
          List ootsElems = (List) mOutOfTypeSystemData.arrayElements.get(new Integer(addr));
          if (ootsElems != null) {
            Iterator iter = ootsElems.iterator();
            while (iter.hasNext()) // TODO: iteration could be slow
            // for large arrays
            {
              ArrayElement ootsElem = (ArrayElement) iter.next();
              if (ootsElem.index == i) {
                val = (String) mOutOfTypeSystemData.idMap.get(ootsElem.value);
                break;
              }
            }
          }
        } else if (heapVal != CASImpl.NULL) {
          val = Integer.toString(heapVal);
        }

        if (val != null) {
          startElement(ARRAY_ELEMENT_TAG, emptyAttrs, 1);
          addText(val);
        } else {
          startElement(ARRAY_ELEMENT_TAG, emptyAttrs, 0);
        }
        // xmlStack.popNode();
        endElement(ARRAY_ELEMENT_TAG);
        ++pos;
      }

      endElement(typeName);
    }

    private void enqueueFSArray(int addr) {
      final int size = cas.ll_getArraySize(addr);
      int pos = cas.getArrayStartAddress(addr);
      int val;
      for (int i = 0; i < size; i++) {
        val = cas.getHeapValue(pos);
        if (val != CASImpl.NULL) {
          enqueue(val);
        }
        ++pos;
      }
    }

    /**
     * Encode features of a regular (non-array) FS.
     *
     * @param addr
     */
    private void encodeFeatures(int addr, AttributesImpl attrs) {
      int heapValue = cas.getHeapValue(addr);
      int[] feats = ts.ll_getAppropriateFeatures(heapValue);
      int featAddr, featVal;
      String featName, attrValue;
      boolean nameMapping = false;
      if (sofaTypeCode == heapValue) {
        // set flag for SofaID mapping
        nameMapping = true;
      }

      for (int i = 0; i < feats.length; i++) {
        featAddr = addr + cas.getFeatureOffset(feats[i]);
        featVal = cas.getHeapValue(featAddr);
        featName = featureNames[feats[i]];
        if (!cas.ll_isRefType(ts.range(feats[i]))) {
          attrValue = cas.getFeatureValueAsString(addr, feats[i]);
          if (nameMapping && featName.equals(CAS.FEATURE_BASE_NAME_SOFAID) && uimaContext != null) {
            // map absolute SofaID to that expected by Component
            attrValue = uimaContext.mapSofaIDToComponentSofaName(attrValue);
          }
        } else {
          if (featVal == CASImpl.NULL) {
            attrValue = null;
          } else {
            attrValue = Integer.toString(featVal);
          }
        }

        if (attrValue != null && featName != null) {
          addAttribute(attrs, featName, attrValue);
        }
      }
    }

    private void enqueueFeatures(int addr, int heapValue) {
      int[] feats = ts.ll_getAppropriateFeatures(heapValue);
      int featAddr, featVal;

      for (int i = 0; i < feats.length; i++) {
        featAddr = addr + cas.getFeatureOffset(feats[i]);
        featVal = cas.getHeapValue(featAddr);
        if (cas.ll_isRefType(ts.range(feats[i]))) {
          if (featVal == CASImpl.NULL) {
            // break;
          } else {
            enqueue(featVal);
          }

        }
      }
    }

    /**
     * Encode Out-Of-TypeSystem Features.
     *
     * @param addr
     */
    private void encodeOutOfTypeSystemFeatures(int addr, AttributesImpl attrs) {
      List attrList = (List) mOutOfTypeSystemData.extraFeatureValues.get(new Integer(addr));
      if (attrList != null) {
        Iterator it = attrList.iterator();
        while (it.hasNext()) {
          String[] attr = (String[]) it.next();
          // remap ID if necessary
          if (attr[0].startsWith("_ref_")) {
            if (attr[1].startsWith("a")) { // reference to OOTS FS
              // - remap
              attr[1] = (String) mOutOfTypeSystemData.idMap.get(attr[1]);
            }
          }
          addAttribute(attrs, attr[0], attr[1]);
        }
      }
    }

    /**
     * Encode Out-Of-TypeSystem Features.
     *
     * @param addr
     */
    private void enqueueOutOfTypeSystemFeatures(int addr) {
      List attrList = (List) mOutOfTypeSystemData.extraFeatureValues.get(new Integer(addr));
      if (attrList != null) {
        Iterator it = attrList.iterator();
        while (it.hasNext()) {
          String[] attr = (String[]) it.next();
          // remap ID if necessary
          if (attr[0].startsWith("_ref_")) {
            // references whose ID starts with the character 'a' are references to out of type
            // system FS. All other references should be to in-typesystem FS, which we need to
            // enqueue.
            if (!attr[1].startsWith("a")) {
              enqueue(Integer.parseInt(attr[1]));
            }
          }
        }
      }
    }

    private final String getTypeName(int addr) {
      return ts.ll_getTypeForCode(cas.getHeapValue(addr)).getName();
    }

    private final int classifyType(int type) {
      return cas.ll_getTypeClass(type);
    }

    /**
     * Produces XCAS from Out-Of-Typesystem data. (APL)
     */
    private void enqueueOutOfTypeSystemData(OutOfTypeSystemData aData) {
      Iterator it = aData.fsList.iterator();
      while (it.hasNext()) {
        FSData fs = (FSData) it.next();
        Iterator attrIt = fs.featVals.entrySet().iterator();
        while (attrIt.hasNext()) {
          Map.Entry entry = (Map.Entry) attrIt.next();
          String attrName = (String) entry.getKey();
          if (attrName.startsWith("_ref_")) {
            String attrVal = (String) entry.getValue();
            // references whose ID starts with the character 'a' are references to out of type
            // system FS. All other references should be to in-typesystem FS, which we need to
            // enqueue.
            if (!attrVal.startsWith("a")) {
              enqueue(Integer.parseInt(attrVal));
            }
          }
        }
      }
    }

    private void serializeOutOfTypeSystemData(OutOfTypeSystemData aData) throws SAXException {
      Iterator it = aData.fsList.iterator();
      while (it.hasNext()) {
        FSData fs = (FSData) it.next();
        workAttrs.clear();
        // Add indexed info.
        if (fs.indexRep != null) {
          // xmlStack.addAttribute(INDEXED_ATTR_NAME, TRUE_VALUE);
          addAttribute(workAttrs, INDEXED_ATTR_NAME, fs.indexRep);
        }
        // Add ID attribute (remap to new unique integer ID).
        addAttribute(workAttrs, ID_ATTR_NAME, fs.id);

        // Add other attributes (remap OOTS refs)
        Iterator attrIt = fs.featVals.entrySet().iterator();
        while (attrIt.hasNext()) {
          Map.Entry entry = (Map.Entry) attrIt.next();
          String attrName = (String) entry.getKey();
          String attrVal = (String) entry.getValue();
          if (attrName.startsWith("_ref_")) {
            if (attrVal.startsWith("a")) {
              // "a" prefix indicates a reference from one OOTS FS
              // to another OOTS FS;
              // we need to remap those IDs to the actual IDs used
              // in the XCAS
              attrVal = (String) mOutOfTypeSystemData.idMap.get(attrVal);
            }
          }
          addAttribute(workAttrs, attrName, attrVal);
        }
        // send events
        String xcasElementName = getXCasElementName(fs.type);
        startElement(xcasElementName, workAttrs, 0);
        endElement(xcasElementName);
      }
    }

  }

  /**
   * Gets the XCAS element name for a CAS type name. The element name is usually the same as the
   * type name, but the sequences _colon_ and _dash_ are translated to the characters : and -,
   * respectively.
   *
   * @param aCasTypeName
   *          CAS type name
   * @return XCAS element name for this type name
   */
  private String getXCasElementName(String aTagName) {
    if (aTagName.indexOf(':') == -1 && aTagName.indexOf('-') == -1) {
      return aTagName;
    } else {
      // Note: This is really slow so we avoid if possible. -- RJB
      return StringUtils
              .replaceAll(StringUtils.replaceAll(aTagName, ":", "_colon_"), "-", "_dash_");
    }
  }

  // Name of the root element.
  public static final String casTagName = "CAS";

  // Name of the version attribute on the root element.
  public static final String VERSION_ATTR = "version";
 
  // Value written for the version attribute.
  public static final String CURRENT_VERSION = "2";

  public static final String DEFAULT_DOC_TYPE_NAME = "uima.tcas.Document";

  public static final String DEFAULT_DOC_TEXT_FEAT = "text";

  // Attribute listing the index repositories an FS belongs to.
  public static final String INDEXED_ATTR_NAME = "_indexed";

  // Prefix of attribute names for features whose range is not primitive.
  public static final String REF_PREFIX = "_ref_";

  // Attribute holding the id of an FS.
  public static final String ID_ATTR_NAME = "_id";

  public static final String CONTENT_ATTR_NAME = "_content";

  // Attribute holding the length of an array.
  public static final String ARRAY_SIZE_ATTR = "size";

  // Name of the child element written for each array entry.
  public static final String ARRAY_ELEMENT_TAG = "i";

  public static final String TRUE_VALUE = "true";
 
  // Type system this serializer was created for.
  private TypeSystemImpl ts;

  // Optional context used to map Sofa ids; may be null.
  private UimaContext uimaContext;

  // Create own cache of feature names because of _ref_ prefixes. Indexed by feature code.
  private String[] featureNames;

  // name of tag to contain document text
  private String docTypeName = DEFAULT_DOC_TYPE_NAME;

  // value of _content attribute for document text element
  private String docTextFeature = DEFAULT_DOC_TEXT_FEAT;

  public XCASSerializer(TypeSystem ts, UimaContext uimaContext) {
    super();
    // System.out.println("Creating serializer for type system.");
    this.ts = (TypeSystemImpl) ts;
    this.uimaContext = uimaContext;
    // Create feature name cache.
    final int featArraySize = this.ts.getNumberOfFeatures() + 1;
    this.featureNames = new String[featArraySize];
    FeatureImpl feat;
    String featName;
    Iterator it = this.ts.getFeatures();
    while (it.hasNext()) {
      feat = (FeatureImpl) it.next();
      if (feat.getRange().isPrimitive()) {
        featName = feat.getShortName();
      } else {
        featName = REF_PREFIX + feat.getShortName();
      }
      this.featureNames[feat.getCode()] = featName;
    }
  }

  public XCASSerializer(TypeSystem ts) {
    this(ts, null);
  }

  /**
   * Write the CAS data to a SAX content handler.
   *
   * @param cas
   *          The CAS to be serialized.
   * @param contentHandler
   *          The SAX content handler the data is written to.
   * @throws IOException
   * @throws SAXException
   */
  public void serialize(CAS cas, ContentHandler contentHandler) throws IOException, SAXException {
    serialize(cas, contentHandler, true);
  }

  /**
   * Write the CAS data to a SAX content handler.
   *
   * @param cas
   *          The CAS to be serialized.
   * @param contentHandler
   *          The SAX content handler the data is written to.
   * @param encodeDoc
   *          If set to false, no uima.tcas.Document structure will be created, and the document
   *          text will not be serialized.
   * @throws IOException
   * @throws SAXException
   */
  public void serialize(CAS cas, ContentHandler contentHandler, boolean encodeDoc)
          throws IOException, SAXException {
    serialize(cas, contentHandler, encodeDoc, null);
  }

  /**
   * Write the CAS data to a SAX content handler.
   *
   * @param cas
   *          The CAS to be serialized.
   * @param contentHandler
   *          The SAX content handler the data is written to.
   * @param encodeDoc
   *          If set to false, no uima.tcas.Document structure will be created, and the document
   *          text will not be serialized.
   * @param outOfTypeSystemData
   *          data not part of the CAS type system, which should be inserted into the XCAS output
   *
   * @throws IOException
   * @throws SAXException
   */
  public void serialize(CAS cas, ContentHandler contentHandler, boolean encodeDoc,
          OutOfTypeSystemData outOfTypeSystemData) throws IOException, SAXException {
    contentHandler.startDocument();
    XCASDocSerializer ser = new XCASDocSerializer(contentHandler, ((CASImpl) cas).getBaseCAS());
    ser.serialize(encodeDoc, outOfTypeSystemData);
    contentHandler.endDocument();
    // System.out.println("Done serializing " + ser.fsCount + " FSs.");
  }

  /**
   * Gets the name of the type representing the document. This will become the name of the XML
   * element that will hold the document text.
   *
   * @return the document type name
   */
  public String getDocumentTypeName() {
    return docTypeName;
  }

  /**
   * Gets the name of the type representing the document. This will become the name of the XML
   * element that will hold the document text. If not set, defaults to
   * {@link #DEFAULT_DOC_TYPE_NAME}.
   *
   * @param aDocTypeName
   *          the document type name
   */
  public void setDocumentTypeName(String aDocTypeName) {
    docTypeName = aDocTypeName;
  }

  /**
   * Gets the name of the feature holding the documeng text. This will become the value of the
   * _content attribute on the document element.
   *
   * @return the document text feature
   */
  public String getDocumentTextFeature() {
    return docTextFeature;
  }

  /**
   * Sets the name of the feature holding the documeng text. This will become the value of the
   * _content attribute on the document element. If not set, defaults to
   * {@link #DEFAULT_DOC_TEXT_FEAT}. If set to null, no _content attribute will be emitted.
   *
   * @param aDocTextFeature
   *          the document text feature
   */
  public void setDocumentTextFeature(String aDocTextFeature) {
    docTextFeature = aDocTextFeature;
  }

  /**
   * Serializes an XCAS to a stream.
   *
   * @param aCAS
   *          CAS to serialize.
   * @param aStream
   *          output stream to which to write the XCAS XML document
   *
   * @throws SAXException
   *           if a problem occurs during XCAS serialization
   * @throws IOException
   *           if an I/O failure occurs
   */
  public static void serialize(CAS aCAS, OutputStream aStream) throws SAXException, IOException {
    XCASSerializer.serialize(aCAS, aStream, false);
  }

  /**
   * Serializes an XCAS to a stream.
   *
   * @param aCAS
   *          CAS to serialize.
   * @param aStream
   *          output stream to which to write the XCAS XML document
   * @param isFormattedOutput
   *          if true the XCAS will be serialized formatted
   *
   * @throws SAXException
   *           if a problem occurs during XCAS serialization
   * @throws IOException
   *           if an I/O failure occurs
   */
  public static void serialize(CAS aCAS, OutputStream aStream, boolean isFormattedOutput)
          throws SAXException, IOException {
    XCASSerializer xcasSerializer = new XCASSerializer(aCAS.getTypeSystem());
    XMLSerializer sax2xml = new XMLSerializer(aStream, isFormattedOutput);
    xcasSerializer.serialize(aCAS, sax2xml.getContentHandler());
  }

}
TOP

Related Classes of org.apache.uima.cas.impl.XCASSerializer$XCASDocSerializer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.