Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


  public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) {
    // Use the delegate parser to parse this entry
   
    TemporaryResources tmp = new TemporaryResources();
    try {
      final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
      if (stream instanceof TikaInputStream) {
        final Object container = ((TikaInputStream) stream).getOpenContainer();
        if (container != null) {
          newStream.setOpenContainer(container);
        }
      }
      record = record.copy();

      record.replaceValues(Fields.ATTACHMENT_BODY, newStream);
View Full Code Here


       
        // For detectors to work, we need a mark/reset supporting
        // InputStream, which ArchiveInputStream isn't, so wrap
        TemporaryResources tmp = new TemporaryResources();
        try {
          TikaInputStream tis = TikaInputStream.get(archive, tmp);
          return extractor.parseEmbedded(tis, entrydata, name, getChild());
        } finally {
          try {
            tmp.dispose();
          } catch (TikaException e) {
View Full Code Here

            IOUtils.closeQuietly(resourceStream);
        }
    }
   
    protected MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException {
        final TikaInputStream tikaInputStreamStream = TikaInputStream.get(new CloseShieldInputStream(inputStream));
        try {
            final Detector detector = new DefaultDetector();
            final Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
           
View Full Code Here

    private void assertType(String file, String byData, String byNameAndData) throws Exception {
       assertTypeByData(file, byData);
       assertTypeByNameAndData(file, byNameAndData);
    }
    private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
       TikaInputStream stream = TikaInputStream.get(
               TestContainerAwareDetector.class.getResource(
                       "/test-documents/" + dataFile));
       try {
           Metadata m = new Metadata();
           if (name != null)
              m.add(Metadata.RESOURCE_NAME_KEY, name);
          
           assertEquals(
                   MediaType.parse(type),
                   detector.detect(stream, m));
       } finally {
           stream.close();
       }
    }
View Full Code Here

                "application/vnd.stardivision.writer");

    }

    public void testOpenContainer() throws Exception {
        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/testPPT.ppt"));
        try {
            assertNull(stream.getOpenContainer());
            assertEquals(
                    MediaType.parse("application/vnd.ms-powerpoint"),
                    detector.detect(stream, new Metadata()));
            assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
        } finally {
            stream.close();
        }
    }
View Full Code Here

    }

    private void assertRemovalTempfiles(String fileName) throws Exception {
        int numberOfTempFiles = countTemporaryFiles();

        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/" + fileName));
        try {
            detector.detect(stream, new Metadata());
        } finally {
            stream.close();
        }

        assertEquals(numberOfTempFiles, countTemporaryFiles());
    }
View Full Code Here

    public void testTruncatedFiles() throws Exception {
        // First up a truncated OOXML (zip) file
      
        // With only the data supplied, the best we can do is the container
        TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        Metadata m = new Metadata();
        try {
            assertEquals(
                    MediaType.application("x-tika-ooxml"),
                    detector.detect(xlsx, m));
        } finally {
            xlsx.close();
        }
       
        // With truncated data + filename, we can use the filename to specialise
        xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        try {
            assertEquals(
                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
                    detector.detect(xlsx, m));
        } finally {
            xlsx.close();
        }
       

        // Now a truncated OLE2 file
        TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400);
        m = new Metadata();
        try {
            assertEquals(
                    MediaType.application("x-tika-msoffice"),
                    detector.detect(xls, m));
        } finally {
            xls.close();
        }
       
        // Finally a truncated OLE2 file, with a filename available
        xls = getTruncatedFile("testEXCEL.xls", 400);
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
        try {
            assertEquals(
                    MediaType.application("vnd.ms-excel"),
                    detector.detect(xls, m));
        } finally {
            xls.close();
        }
   }
View Full Code Here

            try {
                os = new FileOutputStream(outputFile);

                if (inputStream instanceof TikaInputStream) {
                    TikaInputStream tin = (TikaInputStream) inputStream;

                    if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
                        POIFSFileSystem fs = new POIFSFileSystem();
                        copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
                        fs.writeFilesystem(os);
                    } else {
                        IOUtils.copy(inputStream, os);
                    }
                } else {
View Full Code Here

        boolean hasMetadataCommandArguments =
                (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
        boolean serializeMetadataCommandArgumentsToken = false;
        boolean replacedMetadataCommandArgumentsToken = false;

        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
        File tempOutputFile = null;

        List<String> commandMetadataSegments = null;
        if (hasMetadataCommandArguments) {
            commandMetadataSegments = getCommandMetadataSegments(metadata);
        }

        // Build our command
        List<String> origCmd = Arrays.asList(command);
        List<String> cmd = new ArrayList<String>();
        for (String commandSegment : origCmd) {
            if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
                commandSegment = commandSegment.replace(
                        ExternalParser.INPUT_FILE_TOKEN,
                        tikaInputStream.getFile().toString());
                inputToStdIn = false;
            }
            if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
                tempOutputFile = tmp.createTemporaryFile();
                commandSegment = commandSegment.replace(
                        ExternalParser.OUTPUT_FILE_TOKEN,
                        tempOutputFile.toString());
                outputFromStdOut = false;
            }
            if (commandSegment
                    .indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                serializeMetadataCommandArgumentsToken = true;
            }
            if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
                if (hasMetadataCommandArguments) {
                    for (String commandMetadataSegment : commandMetadataSegments) {
                        cmd.add(commandMetadataSegment);
                    }
                }
                replacedMetadataCommandArgumentsToken = true;
            } else {
                cmd.add(commandSegment);
            }
        }
        if (hasMetadataCommandArguments) {
            if (serializeMetadataCommandArgumentsToken) {
                // Find all metadata tokens and replace with encapsulated metadata
                int i = 0;
                for (String commandSegment : cmd) {
                    if (commandSegment
                            .indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                        commandSegment = commandSegment.replace(
                                METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
                                serializeMetadata(commandMetadataSegments));
                        cmd.set(i, commandSegment);
                    }
                    i++;
                }
            } else if (!replacedMetadataCommandArgumentsToken
                    && !serializeMetadataCommandArgumentsToken) {
                // Tack metadata onto the end of the cmd as arguments
                cmd.addAll(commandMetadataSegments);
            }
        }

        // Execute
        Process process;
        if (cmd.toArray().length == 1) {
            process = Runtime.getRuntime().exec(cmd.toArray(new String[] {})[0]);
        } else {
            process = Runtime.getRuntime().exec(cmd.toArray(new String[] {}));
        }

        ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();

        try {
            sendStdErrToOutputStream(process, stdErrOutputStream);

            if (inputToStdIn) {
                sendInputStreamToStdIn(inputStream, process);
            } else {
                // We're not writing to std in this case so close
                process.getOutputStream().close();
            }

            if (outputFromStdOut) {
                sendStdOutToOutputStream(process, outputStream);
            } else {
                tmp.dispose();
                try {
                    process.waitFor();
                } catch (InterruptedException ignore) {
                }
                // The command is finished, read the output file into the given output stream
                InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile);
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        } finally {
            if (outputFromStdOut) {
                try {
                    process.waitFor();
                } catch (InterruptedException ignore) {
                }
            } else {
                try {
                    // Clean up temp output files
                    tempOutputFile.delete();
                } catch (Exception e) {
                }
            }
            if (!inputToStdIn) {
                // Clean up temp input files
                tikaInputStream.getFile().delete();
            }
            IOUtils.closeQuietly(outputStream);
            IOUtils.closeQuietly(stdErrOutputStream);
            if (process.exitValue() != 0) {
                throw new TikaException("There was an error executing the command line" +
View Full Code Here

                //because of the "all detectors" approach (see below), we need to avoid a self-closing stream here
                stream = ((SelfClosingInputStream) stream).wrappedStream();
            }
            TemporaryResources tmp = new TemporaryResources();
            try {
                TikaInputStream tikaInputStream = TikaInputStream.get(stream, tmp);
                // There is content and possibly a name ...
                autoDetectedMimeType = allDetectors.detect(tikaInputStream, metadata);
            } catch (Exception e) {
                LOGGER.debug(e, "Unable to extract mime-type");
            } finally {
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.