Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


    private void handleEmbeddedOLE(PackagePart part, ContentHandler handler)
            throws IOException, SAXException {
        POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
        try {
            Metadata metadata = new Metadata();
            TikaInputStream stream = null;

            DirectoryNode root = fs.getRoot();
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
           
            if (root.hasEntry("CONTENTS")
View Full Code Here


            if (extractor.shouldParseEmbedded(entrydata)) {
                // For detectors to work, we need a mark/reset supporting
                // InputStream, which ArchiveInputStream isn't, so wrap
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(archive, tmp);
                    extractor.parseEmbedded(tis, xhtml, entrydata, true);
                } finally {
                    tmp.dispose();
                }
            }
View Full Code Here

    private void assertType(String file, String byData, String byNameAndData) throws Exception {
       assertTypeByData(file, byData);
       assertTypeByNameAndData(file, byNameAndData);
    }
    private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
       TikaInputStream stream = TikaInputStream.get(
               TestContainerAwareDetector.class.getResource(
                       "/test-documents/" + dataFile));
       try {
           Metadata m = new Metadata();
           if (name != null)
              m.add(Metadata.RESOURCE_NAME_KEY, name);
          
           assertEquals(
                   MediaType.parse(type),
                   detector.detect(stream, m));
       } finally {
           stream.close();
       }
    }
View Full Code Here

                "application/vnd.stardivision.writer");

    }

    public void testOpenContainer() throws Exception {
        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/testPPT.ppt"));
        try {
            assertNull(stream.getOpenContainer());
            assertEquals(
                    MediaType.parse("application/vnd.ms-powerpoint"),
                    detector.detect(stream, new Metadata()));
            assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
        } finally {
            stream.close();
        }
    }
View Full Code Here

    }

    private void assertRemovalTempfiles(String fileName) throws Exception {
        int numberOfTempFiles = countTemporaryFiles();

        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/" + fileName));
        try {
            detector.detect(stream, new Metadata());
        } finally {
            stream.close();
        }

        assertEquals(numberOfTempFiles, countTemporaryFiles());
    }
View Full Code Here

    public void testTruncatedFiles() throws Exception {
        // First up a truncated OOXML (zip) file
      
        // With only the data supplied, the best we can do is the container
        TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        Metadata m = new Metadata();
        try {
            assertEquals(
                    MediaType.application("x-tika-ooxml"),
                    detector.detect(xlsx, m));
        } finally {
            xlsx.close();
        }
       
        // With truncated data + filename, we can use the filename to specialise
        xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        try {
            assertEquals(
                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
                    detector.detect(xlsx, m));
        } finally {
            xlsx.close();
        }
       

        // Now a truncated OLE2 file
        TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400);
        m = new Metadata();
        try {
            assertEquals(
                    MediaType.application("x-tika-msoffice"),
                    detector.detect(xls, m));
        } finally {
            xls.close();
        }
       
        // Finally a truncated OLE2 file, with a filename available
        xls = getTruncatedFile("testEXCEL.xls", 400);
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
        try {
            assertEquals(
                    MediaType.application("vnd.ms-excel"),
                    detector.detect(xls, m));
        } finally {
            xls.close();
        }
   }
View Full Code Here

           
            try {
               ObjectData data = oleShape.getObjectData();

               if(data != null) {
                  TikaInputStream stream =
                     TikaInputStream.get(data.getData());
                  try {
                     String mediaType = null;
                     if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                        mediaType = "application/vnd.ms-excel";
                     }
                     handleEmbeddedResource(
                           stream, Integer.toString(oleShape.getObjectID()),
                           mediaType, xhtml, false);
                  } finally {
                     stream.close();
                  }
               }
            } catch( NullPointerException e ) {
               /* getObjectData throws NPE some times. */
            }
View Full Code Here

        }

        // If this is a TikaInputStream wrapping an already
        // parsed NPOIFileSystem/DirectoryNode, just get the
        // names from the root:
        TikaInputStream tis = TikaInputStream.cast(input);
        Set<String> names = null;
        if (tis != null) {
            Object container = tis.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
            } else if (container instanceof DirectoryNode) {
                names = getTopLevelNames((DirectoryNode) container);
            }
        }

        if (names == null) {
            // Check if the document starts with the OLE header
            input.mark(8);
            try {
                if (input.read() != 0xd0 || input.read() != 0xcf
                    || input.read() != 0x11 || input.read() != 0xe0
                    || input.read() != 0xa1 || input.read() != 0xb1
                    || input.read() != 0x1a || input.read() != 0xe1) {
                    return MediaType.OCTET_STREAM;
                }
            } finally {
                input.reset();
            }
        }

        // We can only detect the exact type when given a TikaInputStream
        if (names == null && tis != null) {
            // Look for known top level entry names to detect the document type
            names = getTopLevelNames(tis);
        }
       
        // Detect based on the names (as available)
        if (tis != null &&
            tis.getOpenContainer() != null &&
            tis.getOpenContainer() instanceof NPOIFSFileSystem) {
            return detect(names, ((NPOIFSFileSystem)tis.getOpenContainer()).getRoot());
        } else {
            return detect(names, null);
        }
    }
View Full Code Here

  public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) {
    // Use the delegate parser to parse this entry
   
    TemporaryResources tmp = new TemporaryResources();
    try {
      final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
      if (stream instanceof TikaInputStream) {
        final Object container = ((TikaInputStream) stream).getOpenContainer();
        if (container != null) {
          newStream.setOpenContainer(container);
        }
      }
      record = record.copy();

      record.replaceValues(Fields.ATTACHMENT_BODY, newStream);
View Full Code Here

       
        // For detectors to work, we need a mark/reset supporting
        // InputStream, which ArchiveInputStream isn't, so wrap
        TemporaryResources tmp = new TemporaryResources();
        try {
          TikaInputStream tis = TikaInputStream.get(archive, tmp);
          return extractor.parseEmbedded(tis, entrydata, name, getChild());
        } finally {
          try {
            tmp.dispose();
          } catch (TikaException e) {
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.