Package org.apache.tika.sax

Examples of org.apache.tika.sax.ToXMLContentHandler


    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
      ParseContext context = new ParseContext();
      context.set(Parser.class, parser);

      try {
          ContentHandler handler = new ToXMLContentHandler();
          parser.parse(input, handler, metadata, context);
          return new XMLResult(handler.toString(), metadata);
      } finally {
          input.close();
      }
  }
View Full Code Here


                    "writing the text/plain version of the parsed content",e);
            }
            final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
            final ContentHandler textHandler = new BodyContentHandler( //only the Body
                new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable
            final ToXMLContentHandler xhtmlHandler;
            final ContentHandler mainHandler;
            ContentSink xhtmlSink = null;
            try {
                if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
                    try {
                        xhtmlSink = ciFactory.createContentSink(XHTML +"; charset="+UTF8.name());
                    } catch (IOException e) {
                        throw new EngineException("Error while initialising Blob for" +
                                "writing the application/xhtml+xml version of the parsed content",e);
                    }
                    try {
                        xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(),UTF8.name());
                    } catch (UnsupportedEncodingException e) {
                        throw new EngineException("This system does not support the encoding "+UTF8,e);
                    }
                    mainHandler = new MultiHandler(textHandler,xhtmlHandler);
                } else {
View Full Code Here

    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
      ParseContext context = new ParseContext();
      context.set(Parser.class, parser);

      try {
          ContentHandler handler = new ToXMLContentHandler();
          parser.parse(input, handler, metadata, context);
          return new XMLResult(handler.toString(), metadata);
      } finally {
          input.close();
      }
  }
View Full Code Here

            //set the already parsed contentType
            metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
            final StringWriter writer = new StringWriter();
            final ContentHandler textHandler = new BodyContentHandler( //only the Body
                new PlainTextHandler(writer, false,skipLinebreaks)); //skip ignoreable
            final ToXMLContentHandler xhtmlHandler;
            final ContentHandler mainHandler;
            if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
                xhtmlHandler = new ToXMLContentHandler();
                mainHandler = new MultiHandler(textHandler,xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
            }
            try {
                parser.parse(in, mainHandler, metadata, context);
            } catch (Exception e) {
                throw new EngineException("Unable to convert ContentItem "+
                        ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
                        "plain text!",e);
            }
            IOUtils.closeQuietly(in);
            if(log.isDebugEnabled()){
                log.debug("Plain Content: \n{}",writer.toString());
            }
            String random = randomUUID().toString();
            UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
            ci.addPart(textBlobUri,
                new InMemoryBlob(writer.toString(),
                    TEXT_PLAIN.toString())); //string -> no encoding
            if(xhtmlHandler != null){
                if(log.isDebugEnabled()){
                    log.debug("XML Content: \n{}",xhtmlHandler.toString());
                }
                UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
                ci.addPart(xhtmlBlobUri,
                    new InMemoryBlob(xhtmlHandler.toString(),
                        "application/xhtml+xml")); //string -> no encoding
            }
            //add the extracted metadata
            if(log.isDebugEnabled()){
                for(String name : metadata.names()){
View Full Code Here

    protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
      ParseContext context = new ParseContext();
      context.set(Parser.class, parser);

      try {
          ContentHandler handler = new ToXMLContentHandler();
          parser.parse(input, handler, metadata, context);
          return new XMLResult(handler.toString(), metadata);
      } finally {
          input.close();
      }
  }
View Full Code Here

        config.setExtractUniqueInlineImagesOnly(false);
        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);


        Metadata metadata = new Metadata();
        ContentHandler handler = new ToXMLContentHandler();
        String path = "/test-documents/testPDF_childAttachments.pdf";
        InputStream stream = null;
        try {
            stream = TikaInputStream.get(this.getClass().getResource(path));
            parser.parse(stream, handler, metadata, context);
        } finally {
            IOUtils.closeQuietly(stream);
        }

        String xml = handler.toString();
        //regular attachment
        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", xml);
        //inline image
        assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", xml);
View Full Code Here

    /**
     * Example of extracting the contents as HTML, as a string.
     */
    public String parseToHTML() throws IOException, SAXException, TikaException {
        ContentHandler handler = new ToXMLContentHandler();
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc");
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try {
            parser.parse(stream, handler, metadata);
            return handler.toString();
        } finally {
            stream.close();
        }
    }
View Full Code Here

     * Example of extracting just the body as HTML, without the
     *  head part, as a string
     */
    public String parseBodyToHTML() throws IOException, SAXException, TikaException {
        ContentHandler handler = new BodyContentHandler(
                new ToXMLContentHandler());
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc");
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try {
View Full Code Here

        // Only get things under html -> body -> div (class=header)
        XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
        Matcher divContentMatcher = xhtmlParser.parse(
                "/xhtml:html/xhtml:body/xhtml:div/descendant::node()");       
        ContentHandler handler = new MatchingContentHandler(
                new ToXMLContentHandler(), divContentMatcher);
       
        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc");
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        try {
View Full Code Here

    if (System.getProperty("java.version").startsWith("1.5")) {
      return;
    }

    Parser parser = new EnviHeaderParser();
    ToXMLContentHandler handler = new ToXMLContentHandler();
    Metadata metadata = new Metadata();

    InputStream stream = EnviHeaderParser.class
        .getResourceAsStream("/test-documents/envi_test_header.hdr");
    assertNotNull("Test ENVI file not found", stream);
    try {
      parser.parse(stream, handler, metadata, new ParseContext());
    } finally {
      stream.close();
    }

    // Check content of test file
    String content = handler.toString();
        assertTrue(content.contains("<body><p>ENVI</p>"));
    assertTrue(content.contains("<p>samples = 2400</p>"));
    assertTrue(content.contains("<p>lines   = 2400</p>"));
    assertTrue(content.contains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>"));
    assertTrue(content.contains("content=\"application/envi.hdr\""));
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.ToXMLContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.