Package fr.eolya.extraction.tika

Examples of fr.eolya.extraction.tika.TikaWrapper.process()


                  String swfToHtmlPath = Utils.getValidPropertyPath(config.getProperty("/crawler/param[@name='swfToHtmlPath']", ""), null, "HOME");

                  TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_HTML);
                  tikaWrapper.setSwfToHtmlPath(swfToHtmlPath);
                  tikaWrapper.process(urlLoader.getStream(), TikaWrapper.CONTENT_TYPE_SWF);
                  rawPage = tikaWrapper.getText();
                 
                  //MultiFormatTextExtractor extractor = new MultiFormatTextExtractor();
                  //extractor.setSwfToHtmlPath(swfToHtmlPath);
                  //rawPage = extractor.swfInputStreamToHtml(urlLoader.getStream());
View Full Code Here


          wrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT);
        }
      }
     
      wrapper.setTempPath(tmpPath);
      wrapper.process(input, tikaContentType);
     
      parserText = wrapper.getText();
      parserContentType = wrapper.getMetaContentType();
      if (contentType.startsWith("text/html")) {
        parserTitle = htmlParser.getBestTitle(wrapper.getMetaTitle());
View Full Code Here

                //ws.clear();
               
                //String rawPage = extractor.htmlPageToText(data, page, "");
                //String title = extractor.getTitle();
        TikaWrapper tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_HTML);
        tikaWrapper.process(urlLoader.getStream());
        String rawPage = tikaWrapper.getText();
        String title = tikaWrapper.getMetaTitle();

                ret += "<page_0><![CDATA[" + rawPage + "]]>" + "</page_0>";
                ret += "<title_0><![CDATA[" + title + "]]>" + "</title_0>";
View Full Code Here

                ret += "<title_0><![CDATA[" + title + "]]>" + "</title_0>";
               
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_article");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_1><![CDATA[" + rawPage + "]]>" + "</page_1>";
                ret += "<title_1><![CDATA[" + title + "]]>" + "</title_1>";
View Full Code Here

                ret += "<title_1><![CDATA[" + title + "]]>" + "</title_1>";
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_default");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_2><![CDATA[" + rawPage + "]]>" + "</page_2>";
                ret += "<title_2><![CDATA[" + title + "]]>" + "</title_2>";
View Full Code Here

                ret += "<title_2><![CDATA[" + title + "]]>" + "</title_2>";
                //rawPage = extractor.htmlPageToText(data, page, "boilerpipe_canola");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_3><![CDATA[" + rawPage + "]]>" + "</page_3>";
                ret += "<title_3><![CDATA[" + title + "]]>" + "</title_3>";
View Full Code Here

                ret += "<title_3><![CDATA[" + title + "]]>" + "</title_3>";

                //rawPage = extractor.htmlPageToText(data, page, "snacktory");
                //title = extractor.getTitle();
        tikaWrapper = new TikaWrapper(TikaWrapper.OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY, TikaWrapper.CONTENT_TYPE_HTML);
        tikaWrapper.process(urlLoader.getStream());
        rawPage = tikaWrapper.getText();
        title = tikaWrapper.getMetaTitle();
                ret += "<page_4><![CDATA[" + rawPage + "]]>" + "</page_4>";
                ret += "<title_4><![CDATA[" + title + "]]>" + "</title_4>";
               
View Full Code Here

                } else {
                    //text = extractor.htmlPageToText(page, "", "");
                  in = IOUtils.toInputStream(page);
                }
               
        tikaWrapper.process(in, TikaWrapper.CONTENT_TYPE_HTML);
        text = tikaWrapper.getText();
       
                if (title==null || "".equals(title))
                    title = tikaWrapper.getMetaTitle();
               
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.