Package org.apache.tika.parser

Examples of org.apache.tika.parser.Parser


*/
public class Bzip2ParserTest extends AbstractPkgTest {

    @Test
    public void testBzip2Parsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
                "/test-documents/test-documents.tbz2");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here


    /**
     * Tests that the ParseContext parser is correctly
     *  fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = ZipParserTest.class.getResourceAsStream(
               "/test-documents/test-documents.tbz2");
       try {
           parser.parse(stream, handler, metadata, trackingContext);
       } finally {
           stream.close();
       }
      
       // Should find a single entry, for the (compressed) tar file
View Full Code Here

*/
public class GzipParserTest extends AbstractPkgTest {

    @Test
    public void testGzipParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = GzipParserTest.class.getResourceAsStream(
                "/test-documents/test-documents.tgz");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

    /**
     * Tests that the ParseContext parser is correctly
     *  fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = ZipParserTest.class.getResourceAsStream(
               "/test-documents/test-documents.tgz");
       try {
           parser.parse(stream, handler, metadata, trackingContext);
       } finally {
           stream.close();
       }
      
       // Should find a single entry, for the (compressed) tar file
View Full Code Here

       assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
    }
   
    @Test
    public void testSvgzParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = GzipParserTest.class.getResourceAsStream(
                "/test-documents/testSVG.svgz");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

*/
public class ZipParserTest extends AbstractPkgTest {

    @Test
    public void testZipParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = ZipParserTest.class.getResourceAsStream(
                "/test-documents/test-documents.zip");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

    /**
     * Tests that the ParseContext parser is correctly
     *  fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = ZipParserTest.class.getResourceAsStream(
               "/test-documents/test-documents.zip");
       try {
           parser.parse(stream, handler, metadata, trackingContext);
       } finally {
           stream.close();
       }
      
       // Should have found all 9 documents
View Full Code Here

        assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
        assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);

        // Also make sure EMBEDDED_RELATIONSHIP_ID was
        // passed when parsing the embedded docs:
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
        context.set(EmbeddedDocumentExtractor.class, relIDs);
        InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip");
        try {
          parser.parse(input,
                       new BodyContentHandler(),
                       new Metadata(),
                       context);
        } finally {
            input.close();
View Full Code Here

    /**
     * Test that we can extract information from
     *  a M4A MP4 Audio file
     */
    @Test
    public void testMP4ParsingAudio() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = MP4ParserTest.class.getResourceAsStream(
                "/test-documents/testMP4.m4a");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        // Check core properties
        assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
        assertEquals("2012-01-28T18:39:18Z", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("2012-01-28T18:39:18Z", metadata.get(Metadata.CREATION_DATE));
        assertEquals("2012-01-28T18:40:25Z", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2012-01-28T18:40:25Z", metadata.get(Metadata.DATE));

        // Check the textual contents
        String content = handler.toString();
        assertTrue(content.contains("Test Title"));
        assertTrue(content.contains("Test Artist"));
        assertTrue(content.contains("Test Album"));
        assertTrue(content.contains("2008"));
        assertTrue(content.contains("Test Comment"));
        assertTrue(content.contains("Test Genre"));
       
        // Check XMPDM-typed audio properties
        assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
        assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
        assertEquals("Test Composer", metadata.get(XMPDM.COMPOSER));
        assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE));
        assertEquals("Test Genre", metadata.get(XMPDM.GENRE));
        assertEquals("Test Comments", metadata.get(XMPDM.LOG_COMMENT.getName()));
        assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER));
       
        assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
        //assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); // TODO Extract
        assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
       
       
        // Check again by file, rather than stream
        TikaInputStream tstream = TikaInputStream.get(
              MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a"));
        tstream.getFile();
        try {
           parser.parse(tstream, handler, metadata, new ParseContext());
        } finally {
           tstream.close();
        }
    }
View Full Code Here

            // Embed the metadata into a copy of the original output stream
            embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null);

            ParseContext context = new ParseContext();
            Parser parser = getParser();
            context.set(Parser.class, parser);

            // Setup the extracting content handler
            ByteArrayOutputStream result = new ByteArrayOutputStream();
            OutputStreamWriter outputWriter = new OutputStreamWriter(result,DEFAULT_CHARSET);
            ContentHandler handler = new BodyContentHandler(outputWriter);

            // Create a new metadata object to read the new metadata into
            Metadata embeddedMetadata = new Metadata();

            // Setup a re-read of the now embeded temp file
            FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile);

            parser.parse(embeddedFileInputStream, handler, embeddedMetadata,
                    context);

            tmp.dispose();

            String outputString = null;
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.Parser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.