Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.ParseStatus


        return;
      }
     

      parseUtil.process(key, page);
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus != null) {
        context.getCounter("ParserStatus",
            ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
      }

      context.write(key, page);
    }   
View Full Code Here


    };

    @Override
    public void map(String key, WebPage page, Context context)
    throws IOException, InterruptedException {
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus == null || !ParseStatusUtils.isSuccess(pstatus)
          || pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
        return; // filter urls not parsed
      }

      Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
      if (!batchId.equals(REINDEX)) {
View Full Code Here

    if (outlinks.size() > 0) {
      Outlink[] old = parse.getOutlinks();
      String title = parse.getTitle();
      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;
View Full Code Here

          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
          ByteBuffer bb = page.getSignature();
          res.put(f, StringUtil.toHexString(bb));
        } else if ("content".equals(f)) {
View Full Code Here

      if (LOG.isTraceEnabled()) {
        LOG.trace("found "+outlinks.length+" outlinks in "+ url);
      }
    }

    ParseStatus status = new ParseStatus();
    status.setMajorCode(ParseStatusCodes.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
View Full Code Here

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = ParseStatusUtils.STATUS_SUCCESS;
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
      status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
      status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
    }

    Parse parse = new Parse(text, title, outlinks, status);
    parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
View Full Code Here

    if (outlinks.size() > 0) {
      Outlink[] old = parse.getOutlinks();
      String title = parse.getTitle();
      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getParseStatus();
      String text = parse.getText();
      Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
      return new Parse(text, title, newlinks, status);
    }
    return parse;
View Full Code Here

        return;
      }
     

      URLWebPage redirectedPage = parseUtil.process(key, page);
      ParseStatus pstatus = page.getParseStatus();
      if (pstatus != null) {
        context.getCounter("ParserStatus",
            ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
      }

      if (redirectedPage != null) {
        context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
                      redirectedPage.getDatum());
View Full Code Here

          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
          ByteBuffer bb = page.getSignature();
          res.put(f, StringUtil.toHexString(bb.array()));
        } else if ("content".equals(f)) {
View Full Code Here

    }
    return null;
  }

  public static Parse getEmptyParse(Exception e, Configuration conf) {
    ParseStatus status = new ParseStatus();
    status.setMajorCode(ParseStatusCodes.FAILED);
    status.setMinorCode(ParseStatusCodes.FAILED_EXCEPTION);
    status.addToArgs(new Utf8(e.toString()));

    return new Parse("", "", new Outlink[0], status);
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.ParseStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.