Package com.gentics.cr.lucene.indexer.transformer.ppt

Source Code of com.gentics.cr.lucene.indexer.transformer.ppt.MSPowerPointParser

package com.gentics.cr.lucene.indexer.transformer.ppt;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;

import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;

/**
*
* Last changed: $Date: 2009-06-24 17:10:19 +0200 (Mi, 24 Jun 2009) $
* @version $Revision: 99 $
* @author $Author: supnig@constantinopel.at $
*/
public class MSPowerPointParser implements POIFSReaderListener {

  private InputStream is;
  private ByteArrayOutputStream writer;

  /**
   * Create new Instance of MSPowerPointParser.
   * @param is
   */
  public MSPowerPointParser(InputStream is) {
    this.is = is;
  }

  /**
   * Get contents of ppt document.
   */
  public String getContents() {
    String contents = "";
    try {
      POIFSReader reader = new POIFSReader();
      writer = new ByteArrayOutputStream();
      reader.registerListener(this);
      reader.read(is);
      contents = writer.toString(getEncoding());
    } catch (Exception ex) {
      ex.printStackTrace();
    } finally {
      try {
        this.is.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    return contents;

  }

  /**
   * Hashmap containing the mapping between codepages (office documents) and encodings (java streams).
   */
  private final static HashMap<Integer, String> ENCODINGMAPPING = new HashMap<Integer, String>();
  static {
    ENCODINGMAPPING.put(1252, "WINDOWS-1252");
  }

  private String getEncoding() {
    if (ps != null) {
      // get the encoding from the document:
      // http://poi.terra-intl.com/hpsf/how-to.html
      int codepage = ps.getFirstSection().getCodepage();
      if (ENCODINGMAPPING.containsKey(codepage)) {
        return ENCODINGMAPPING.get(codepage);
      }
    }
    //return system default charset
    return java.nio.charset.Charset.defaultCharset().toString();
  }

  PropertySet ps = null;

  /**
   * @param event
   */
  public void processPOIFSReaderEvent(POIFSReaderEvent event) {
    try {
      if (event.getName().equalsIgnoreCase("PowerPoint Document")) {
        DocumentInputStream input = event.getStream();
        byte[] buffer = new byte[input.available()];
        input.read(buffer, 0, input.available());
        processContent(0, buffer.length, buffer);
      } else if (event.getName().equalsIgnoreCase("DocumentSummaryInformation")
          || event.getName().equalsIgnoreCase("SummaryInformation")) {
        ps = PropertySetFactory.create(event.getStream());
      }
    } catch (Exception e) {
      throw new RuntimeException("Cannot process PPT Document.", e);
    }
  }

  private void processContent(int beginIndex, int endIndex, byte[] buffer) {
    while (beginIndex < endIndex) {
      int containerFlag = LittleEndian.getUShort(buffer, beginIndex);
      int recordType = LittleEndian.getUShort(buffer, beginIndex + 2);
      long recordLength = LittleEndian.getUInt(buffer, beginIndex + 4);
      beginIndex += 8;
      if ((containerFlag & 0x0f) == 0x0f) {
        processContent(beginIndex, beginIndex + (int) recordLength, buffer);
      } else if (recordType == 4008) {
        writer.write(buffer, beginIndex, (int) recordLength);
        writer.write(' ');
      }
      beginIndex += (int) recordLength;
    }
  }

}
TOP

Related Classes of com.gentics.cr.lucene.indexer.transformer.ppt.MSPowerPointParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.