Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


    /**
     * Runs both type assertions for a single test document by delegating to
     * {@code assertTypeByData} and then to the two-argument
     * {@code assertTypeByNameAndData} overload (neither is shown in this excerpt).
     *
     * @param file          test document name, passed unchanged to both helpers
     * @param byData        expected type handed to {@code assertTypeByData}
     * @param byNameAndData expected type handed to {@code assertTypeByNameAndData}
     * @throws Exception if either delegated assertion throws
     */
    private void assertType(String file, String byData, String byNameAndData) throws Exception {
       assertTypeByData(file, byData);
       assertTypeByNameAndData(file, byNameAndData);
    }
    private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
       TikaInputStream stream = TikaInputStream.get(
               TestContainerAwareDetector.class.getResource(
                       "/test-documents/" + dataFile));
       try {
           Metadata m = new Metadata();
           if (name != null)
              m.add(Metadata.RESOURCE_NAME_KEY, name);
          
           assertEquals(
                   MediaType.parse(type),
                   detector.detect(stream, m));
       } finally {
           stream.close();
       }
    }
View Full Code Here


    }

    @Test
    public void testOpenContainer() throws Exception {
        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/testPPT.ppt"));
        try {
            assertNull(stream.getOpenContainer());
            assertEquals(
                    MediaType.parse("application/vnd.ms-powerpoint"),
                    detector.detect(stream, new Metadata()));
            assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
        } finally {
            stream.close();
        }
    }
View Full Code Here

    }

    private void assertRemovalTempfiles(String fileName) throws Exception {
        int numberOfTempFiles = countTemporaryFiles();

        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/" + fileName));
        try {
            detector.detect(stream, new Metadata());
        } finally {
            stream.close();
        }

        assertEquals(numberOfTempFiles, countTemporaryFiles());
    }
View Full Code Here

    @Test
    public void testTruncatedFiles() throws Exception {
        // First up a truncated OOXML (zip) file
      
        // With only the data supplied, the best we can do is the container
        TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        Metadata m = new Metadata();
        try {
            assertEquals(
                    MediaType.application("x-tika-ooxml"),
                    detector.detect(xlsx, m));
        } finally {
            xlsx.close();
        }
       
        // With truncated data + filename, we can use the filename to specialise
        xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        try {
            assertEquals(
                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
                    detector.detect(xlsx, m));
        } finally {
            xlsx.close();
        }
       

        // Now a truncated OLE2 file
        TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400);
        m = new Metadata();
        try {
            assertEquals(
                    MediaType.application("x-tika-msoffice"),
                    detector.detect(xls, m));
        } finally {
            xls.close();
        }
       
        // Finally a truncated OLE2 file, with a filename available
        xls = getTruncatedFile("testEXCEL.xls", 400);
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
        try {
            assertEquals(
                    MediaType.application("vnd.ms-excel"),
                    detector.detect(xls, m));
        } finally {
            xls.close();
        }
   }
View Full Code Here

    public void testAdobeFontMetricParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        TikaInputStream stream = TikaInputStream.get(
                AdobeFontMetricParserTest.class.getResource(
                        "/test-documents/testAFM.afm"));

        try {
            parser.parse(stream, handler, metadata, context);
        } finally {
            stream.close();
        }

        assertEquals("application/x-font-adobe-metric", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("TestFullName", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Fri Jul 15 17:50:51 2011", metadata.get(Metadata.CREATION_DATE));
View Full Code Here

    // Shared MediaType constants: image/png plus the application/x-emf and
    // application/x-msmetafile types, built through the MediaType factory methods.
    public static final MediaType TYPE_PNG = MediaType.image("png");
    public static final MediaType TYPE_EMF = MediaType.application("x-emf");
    public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile");

    protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
        TikaInputStream stream = getTestFile(filename);
        try {
            assertEquals(true, extractor.isSupported(stream));

            // Process it
            TrackingHandler handler = new TrackingHandler();
            if(recurse) {
                extractor.extract(stream, extractor, handler);
            } else {
                extractor.extract(stream, null, handler);
            }

            // So they can check what happened
            return handler;
        } finally {
            stream.close();
        }
    }
View Full Code Here

        //assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); // TODO Extract
        assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
       
       
        // Check again by file, rather than stream
        TikaInputStream tstream = TikaInputStream.get(
              MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a"));
        tstream.getFile();
        try {
           parser.parse(tstream, handler, metadata, new ParseContext());
        } finally {
           tstream.close();
        }
    }
View Full Code Here

    @Test
    public void testEmbedded() throws Exception {
        InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
        try {
            ContainerExtractor extractor = new ParserContainerExtractor();
            TikaInputStream stream = TikaInputStream.get(input);

            assertEquals(true, extractor.isSupported(stream));

            // Process it
            AbstractPOIContainerExtractionTest.TrackingHandler handler = new AbstractPOIContainerExtractionTest.TrackingHandler();
View Full Code Here

      public void write(OutputStream outputStream) throws IOException, WebApplicationException {
        Writer writer = new OutputStreamWriter(outputStream, "UTF-8");

        BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));

        TikaInputStream tis = TikaInputStream.get(is);

        try {
            parser.parse(tis, body, metadata);
        } catch (SAXException e) {
          throw new WebApplicationException(e);
        } catch (EncryptedDocumentException e) {
          logger.warn(String.format(
                  "%s: Encrypted document",
                  info.getPath()
          ), e);

          throw new WebApplicationException(e, Response.status(422).build());
        } catch (TikaException e) {
          logger.warn(String.format(
            "%s: Text extraction failed",
            info.getPath()
          ), e);

          if (e.getCause()!=null && e.getCause() instanceof WebApplicationException) {
            throw (WebApplicationException) e.getCause();
          }

          if (e.getCause()!=null && e.getCause() instanceof IllegalStateException) {
            throw new WebApplicationException(Response.status(422).build());
          }

          if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException) {
            throw new WebApplicationException(Response.status(422).build());
          }

          throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
        } finally {
          tis.close();
        }
      }
    };
  }
View Full Code Here

        }
        catch ( TransformerConfigurationException e ) {
          throw new WebApplicationException( e );
        }

        TikaInputStream tis = TikaInputStream.get(is);

        try {
          parser.parse(tis, content, metadata);
        }
        catch (SAXException e) {
          throw new WebApplicationException(e);
        }
        catch (EncryptedDocumentException e) {
          logger.warn(String.format(
            "%s: Encrypted document",
            info.getPath()
          ), e);
          throw new WebApplicationException(e, Response.status(422).build());
        }
        catch (TikaException e) {
          logger.warn(String.format(
            "%s: Text extraction failed",
            info.getPath()
          ), e);

          if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
            throw (WebApplicationException) e.getCause();

          if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
            throw new WebApplicationException(Response.status(422).build());

          if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
            throw new WebApplicationException(Response.status(422).build());

          throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
        }
        finally {
          tis.close();
        }
      }
    };
  }
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.