Examples of ParseStatus

com.ibm.icu.impl.locale.ParseStatus
com.proverby.api.ParseStatus
org.apache.nutch.parse.ParseStatus
@author Andrzej Bialecki <ab@getopt.org>
org.apache.nutch.storage.ParseStatus
sun.util.locale.ParseStatus

Examples of org.apache.nutch.parse.ParseStatus

    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

      }


      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }


      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));


        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();


            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }


            // Calculate page signature. 
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);

View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

        Content content = new Content(urlStr, urlStr, bytes.get(), contentType,
          new Metadata(), getConf());
        
        // set the url version into the metadata
        content.getMetadata().set(URL_VERSION, version);
        ParseStatus pstatus = null;
        pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
        reporter.progress();
      }
      catch (Throwable t) { // unexpected exception

View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

        // check that contentType is one we can handle
        String contentType = content.getContentType();
        if (contentType != null
                && (!contentType.startsWith("text/xml") && !contentType
                        .startsWith("application/rss+xml")))
            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
                    "Content-Type not text/xml or application/rss+xml: "
                            + contentType).getEmptyParse();


        List theRSSChannels = null;


        try {
            byte[] raw = content.getContent();


            // create a new FeedParser...
            FeedParser parser = FeedParserFactory.newFeedParser();


            // create a listener for handling our callbacks
            FeedParserListener listener = new FeedParserListenerImpl();


            // start parsing our feed and have the onItem methods called
            parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
            null);


            theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();


        } catch (Exception e) { // run time exception
            e.printStackTrace();
            LOG.fine("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
            return new ParseStatus(ParseStatus.FAILED,
                    "Can't be handled as rss document. " + e).getEmptyParse();
        }


        StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
        List theOutlinks = new Vector();

View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

  public Parse getParse(Content content) {


    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }


      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();

View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

        return;
      }
      


      parseUtil.process(key, page);
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus != null) {
        context.getCounter("ParserStatus",
            ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
      }


      context.write(key, page);
    }

View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

    };


    @Override
    public void map(String key, WebPage page, Context context)
    throws IOException, InterruptedException {
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus == null || !ParseStatusUtils.isSuccess(pstatus)
          || pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
        return; // filter urls not parsed
      }


      Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
      if (!batchId.equals(REINDEX)) {

View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

    if (outlinks.size() > 0) {
      Outlink[] old = parse.getOutlinks();
      String title = parse.getTitle();
      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;

View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
          ByteBuffer bb = page.getSignature();
          res.put(f, StringUtil.toHexString(bb));
        } else if ("content".equals(f)) {

View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+ url);
      }
    }


    ParseStatus status = new ParseStatus();
    status.setMajorCode(ParseStatusCodes.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }


    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.