Examples of ParseStatus


Examples of org.apache.nutch.parse.ParseStatus

    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

      }

      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

        Content content = new Content(urlStr, urlStr, bytes.get(), contentType,
          new Metadata(), getConf());
       
        // set the url version into the metadata
        content.getMetadata().set(URL_VERSION, version);
        ParseStatus pstatus = null;
        pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
        reporter.progress();
      }
      catch (Throwable t) { // unexpected exception
View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

        // check that contentType is one we can handle
        String contentType = content.getContentType();
        if (contentType != null
                && (!contentType.startsWith("text/xml") && !contentType
                        .startsWith("application/rss+xml")))
            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
                    "Content-Type not text/xml or application/rss+xml: "
                            + contentType).getEmptyParse();

        List theRSSChannels = null;

        try {
            byte[] raw = content.getContent();

            // create a new FeedParser...
            FeedParser parser = FeedParserFactory.newFeedParser();

            // create a listener for handling our callbacks
            FeedParserListener listener = new FeedParserListenerImpl();

            // start parsing our feed and have the onItem methods called
            parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
            null);

            theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();

        } catch (Exception e) { // run time exception
            e.printStackTrace();
            LOG.fine("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
            return new ParseStatus(ParseStatus.FAILED,
                    "Can't be handled as rss document. " + e).getEmptyParse();
        }

        StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
        List theOutlinks = new Vector();
View Full Code Here

Examples of org.apache.nutch.parse.ParseStatus

  public Parse getParse(Content content) {

    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();
View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

        return;
      }
     

      parseUtil.process(key, page);
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus != null) {
        context.getCounter("ParserStatus",
            ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
      }

      context.write(key, page);
    }   
View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

    };

    @Override
    public void map(String key, WebPage page, Context context)
    throws IOException, InterruptedException {
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus == null || !ParseStatusUtils.isSuccess(pstatus)
          || pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
        return; // filter urls not parsed
      }

      Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
      if (!batchId.equals(REINDEX)) {
View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

    if (outlinks.size() > 0) {
      Outlink[] old = parse.getOutlinks();
      String title = parse.getTitle();
      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;
View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
          ByteBuffer bb = page.getSignature();
          res.put(f, StringUtil.toHexString(bb));
        } else if ("content".equals(f)) {
View Full Code Here

Examples of org.apache.nutch.storage.ParseStatus

      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+ url);
      }
    }

    ParseStatus status = new ParseStatus();
    status.setMajorCode(ParseStatusCodes.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.