Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(stream, tmp);
            new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
            new JempboxExtractor(metadata).parse(tis);
        } finally {
            tmp.dispose();
        }
View Full Code Here


    // Media type constants for GIF, PNG and EMF content, built with the
    // MediaType.image(...) / MediaType.application(...) factory methods.
    public static final MediaType TYPE_GIF = MediaType.image("gif");
    public static final MediaType TYPE_PNG = MediaType.image("png");
    // EMF is registered under the application tree with subtype "x-msmetafile".
    public static final MediaType TYPE_EMF = MediaType.application("x-msmetafile");

    protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
        TikaInputStream stream = getTestFile(filename);
        try {
            assertEquals(true, extractor.isSupported(stream));

            // Process it
            TrackingHandler handler = new TrackingHandler();
            if(recurse) {
                extractor.extract(stream, extractor, handler);
            } else {
                extractor.extract(stream, null, handler);
            }

            // So they can check what happened
            return handler;
        } finally {
            stream.close();
        }
    }
View Full Code Here

            try {
                os = new FileOutputStream(outputFile);

                if (inputStream instanceof TikaInputStream) {
                    TikaInputStream tin = (TikaInputStream) inputStream;

                    if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
                        POIFSFileSystem fs = new POIFSFileSystem();
                        copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
                        fs.writeFilesystem(os);
                    } else {
                        IOUtils.copy(inputStream, os);
                    }
                } else {
View Full Code Here

        // Is it an embedded OLE2 document, or an embedded OOXML document?
        try {
            Entry ooxml = dir.getEntry("Package");

            // It's OOXML
            TikaInputStream stream = TikaInputStream.get(
                    new DocumentInputStream((DocumentEntry) ooxml));
            try {
                ZipContainerDetector detector = new ZipContainerDetector();
                MediaType type = detector.detect(stream, new Metadata());
                handleEmbeddedResource(stream, null, type.toString(), xhtml, true);
                return;
            } finally {
                stream.close();
            }
        } catch(FileNotFoundException e) {
            // It's regular OLE2
        }

       // Need to dump the directory out to a new temp file, so
       //  it's standalone
       POIFSFileSystem newFS = new POIFSFileSystem();
       copy(dir, newFS.getRoot());

       File tmpFile = File.createTempFile("tika", ".ole2");
       try {
           FileOutputStream out = new FileOutputStream(tmpFile);
           newFS.writeFilesystem(out);
           out.close();

           // What kind of document is it?
           Metadata metadata = new Metadata();
           POIFSDocumentType type = POIFSDocumentType.detectType(dir);

           TikaInputStream embedded;

           if (type==POIFSDocumentType.OLE10_NATIVE) {
               Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
               ByteArrayOutputStream bos = new ByteArrayOutputStream();
               IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
               byte[] data = bos.toByteArray();

               try {
                    Ole10Native ole = new Ole10Native(data, 0);
                    byte[] dataBuffer = ole.getDataBuffer();

                    metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());

                    embedded = TikaInputStream.get(dataBuffer);
               } catch (Ole10NativeException ex) {
                 embedded = TikaInputStream.get(data);
               }
           } else {
               metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
               metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());

               embedded = TikaInputStream.get(tmpFile);
           }

           try {
               if (extractor.shouldParseEmbedded(metadata)) {
                   extractor.parseEmbedded(embedded, xhtml, metadata, true);
               }
           } finally {
               embedded.close();
           }
       } finally {
           tmpFile.delete();
       }
    }
View Full Code Here

        } finally {
            input.reset();
        }

        // We can only detect the exact type when given a TikaInputStream
        TikaInputStream tis = TikaInputStream.cast(input);
        if (tis != null) {
            try {
                ZipFile zip = new ZipFile(tis.getFile());
                try {
                    MediaType type = detectOpenDocument(zip);
                    if (type == null) {
                        type = detectOfficeOpenXML(zip, tis);
                    }
View Full Code Here

            throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();

        NPOIFSFileSystem filesystem;
        TikaInputStream tstream = TikaInputStream.cast(stream);
        if (tstream == null) {
            filesystem =
                new NPOIFSFileSystem(new CloseShieldInputStream(stream));
        } else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) {
            filesystem = (NPOIFSFileSystem) tstream.getOpenContainer();
        } else if (tstream.hasFile()) {
            filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
        } else {
            filesystem =
                new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
        }
View Full Code Here

                       break;
                    default:
                       mimeType =  "image/unknown";
                       break;
                    }
                    TikaInputStream stream = TikaInputStream.get(blip.getPicturedata());
                   
                    // Handle the embedded resource
                    extractor.handleEmbeddedResource(
                          stream, null, mimeType,
                          handler, true
View Full Code Here

           
            try {
               ObjectData data = oleShape.getObjectData();

               if(data != null) {
                  TikaInputStream stream =
                     TikaInputStream.get(data.getData());
                  try {
                     String mediaType = null;
                     if ("Excel.Chart.8".equals(oleShape.getProgID())) {
                        mediaType = "application/vnd.ms-excel";
                     }
                     handleEmbeddedResource(
                           stream, Integer.toString(oleShape.getObjectID()),
                           mediaType, xhtml, false);
                  } finally {
                     stream.close();
                  }
               }
            } catch( NullPointerException e ) {
               /* getObjectData throws NPE sometimes. */
            }
View Full Code Here

        } finally {
            input.reset();
        }

        // We can only detect the exact type when given a TikaInputStream
        TikaInputStream tis = TikaInputStream.cast(input);
        if (tis != null) {
            // Look for known top level entry names to detect the document type
            Set<String> names = getTopLevelNames(tis);
            if (names.contains("Workbook")) {
                return XLS;
View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TrueTypeFont font;
        TTFParser parser = new TTFParser();
        TikaInputStream tis = TikaInputStream.cast(stream);
        if (tis != null && tis.hasFile()) {
            font = parser.parseTTF(tis.getFile());
        } else {
            font = parser.parseTTF(stream);
        }

        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.