Package org.dspace.app.mediafilter

Source Code of org.dspace.app.mediafilter.WordFilter

/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.IOException;

import org.apache.log4j.Logger;

import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;

/*
*
* to do: helpful error messages - can't find mediafilter.cfg - can't
* instantiate filter - bitstream format doesn't exist.
*/
public class WordFilter extends MediaFilter
{

    private static Logger log = Logger.getLogger(WordFilter.class);

    public String getFilteredName(String oldFilename)
    {
        return oldFilename + ".txt";
    }

    /**
     * @return String bundle name
     * 
     */
    public String getBundleName()
    {
        return "TEXT";
    }

    /**
     * @return String bitstreamformat
     */
    public String getFormatString()
    {
        return "Text";
    }

    /**
     * @return String description
     */
    public String getDescription()
    {
        return "Extracted text";
    }

    /**
     * @param source
     *            source input stream
     *
     * @return InputStream the resulting input stream
     */
    public InputStream getDestinationStream(InputStream source)
            throws Exception
    {
        // get input stream from bitstream
        // pass to filter, get string back
        try 
        {
            WordTextExtractorFactory factory = new WordTextExtractorFactory();
            TextExtractor e = factory.textExtractor(source);
            String extractedText = e.getText();

            // if verbose flag is set, print out extracted text
            // to STDOUT
            if (MediaFilterManager.isVerbose)
            {
                System.out.println(extractedText);
            }

            // generate an input stream with the extracted text
            byte[] textBytes = extractedText.getBytes();
            ByteArrayInputStream bais = new ByteArrayInputStream(textBytes);

            return bais; // will this work? or will the byte array be out of scope?
        }
        catch (IOException ioe)
        {
            System.out.println("Invalid Word Format");
            log.error("Error detected - Word File format not recognized: "
                    + ioe.getMessage(), ioe);
            throw ioe;
        }
    }
}
TOP

Related Classes of org.dspace.app.mediafilter.WordFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.