Examples of ProtocolStatus


Examples of org.apache.nutch.protocol.ProtocolStatus

    try {
      URL u = new URL(urlString);
     
      try {
        if (!robots.isAllowed(this, u)) {
          return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
        }
      } catch (Throwable e) {
        // XXX Maybe bogus: assume this is allowed.
        if (logger.isTraceEnabled()) {
          logger.trace("Exception checking robot rules for " + url + ": " + e);
        }
      }
     
      long crawlDelay = robots.getCrawlDelay(this, u);
      long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
      if (maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
        // skip this page, otherwise the thread would block for too long.
        LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="
                + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000));
        Content c = new Content(u.toString(), u.toString(), EMPTY_CONTENT,
                null, null, this.conf);
        return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK);
      }
      String host;
      try {
        host = blockAddr(u, delay);
      } catch (BlockedException be) {
        return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
      }
      Response response;
      try {
        response = getResponse(u, datum, false); // make a request
      } finally {
        unblockAddr(host, delay);
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 301:   // moved permanently
          case 305:   // use proxy (Location is URL of proxy)
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 302:   // found (temporarily moved)
          case 303:   // see other (redirect after POST)
          case 307:   // temporary redirect
            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
            break;
          case 304:   // not modified
            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
            break;
          default:
            protocolStatusCode = ProtocolStatus.MOVED;
        }
        // handle this in the higher layer.
        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
      } else if (code == 400) { // bad request, mark as GONE
        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else if (code == 401) { // requires authorization, but no valid auth provided.
        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                + urlString));
      } else if (code == 404) {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
      } else if (code == 410) { // permanently GONE
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                + u));
      }
    } catch (Throwable e) {
      e.printStackTrace(LogUtil.getErrorStream(logger));
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
              }
            }
            final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
            final ProtocolStatus status = output.getStatus();
            final Content content = output.getContent();
            // unblock queue
            fetchQueues.finishFetchItem(fit);

            context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);

            int length = 0;
            if (content!=null && content.getContent()!=null) length= content.getContent().length;
            updateStatus(length);

            switch(status.getCode()) {

            case ProtocolStatusCodes.WOULDBLOCK:
              // retry ?
              fetchQueues.addFetchItem(fit);
              break;

            case ProtocolStatusCodes.SUCCESS:        // got a page
              output(fit, content, status, CrawlStatus.STATUS_FETCHED);
              break;

            case ProtocolStatusCodes.MOVED:         // redirect
            case ProtocolStatusCodes.TEMP_MOVED:
              byte code;
              boolean temp;
              if (status.getCode() == ProtocolStatusCodes.MOVED) {
                code = CrawlStatus.STATUS_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlStatus.STATUS_REDIR_TEMP;
                temp = true;
              }
              final String newUrl = ProtocolStatusUtils.getMessage(status);
              handleRedirect(fit.url, newUrl, temp,  FetcherJob.PROTOCOL_REDIR, fit.page);
              output(fit, content, status, code);
              break;
            case ProtocolStatusCodes.EXCEPTION:
              logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
              /* FALLTHROUGH */
            case ProtocolStatusCodes.RETRY:          // retry
            case ProtocolStatusCodes.BLOCKED:
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
              break;

            case ProtocolStatusCodes.GONE:           // gone
            case ProtocolStatusCodes.NOTFOUND:
            case ProtocolStatusCodes.ACCESS_DENIED:
            case ProtocolStatusCodes.ROBOTS_DENIED:
              output(fit, null, status, CrawlStatus.STATUS_GONE);
              break;

            case ProtocolStatusCodes.NOTMODIFIED:
              output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
              break;

            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn("Unknown ProtocolStatus: " + status.getCode());
              }
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
            }

          } catch (final Throwable t) {                 // unexpected exception
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

        } else { // convert to exception
          throw new FtpError(code);
        }
      }
    } catch (Exception e) {
      ProtocolStatus ps = ProtocolStatusUtils.makeStatus(
          ProtocolStatusCodes.EXCEPTION, e.toString());
      return new ProtocolOutput(null, ps);
    }
  }
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

                  Bytes.toStringBinary(entry.getValue()));
            }
          }
          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

                  Bytes.toStringBinary(entry.getValue().array()));
            }
          }
          res.put(f, simpleMeta);
        } else if ("protocolStatus".equals(f)) {
          ProtocolStatus ps = page.getProtocolStatus();
          res.put(f, ProtocolStatusUtils.toString(ps));
        } else if ("parseStatus".equals(f)) {
          ParseStatus ps = page.getParseStatus();
          res.put(f, ParseStatusUtils.toString(ps));
        } else if ("signature".equals(f)) {
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

      return "BLOCKED";
    return "UNKNOWN_CODE_" + code;
  }

  public static ProtocolStatus makeStatus(int code) {
    ProtocolStatus pstatus = new ProtocolStatus();
    pstatus.setCode(code);
    pstatus.setLastModified(0);
    return pstatus;
  }
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

    pstatus.setLastModified(0);
    return pstatus;
  }

  public static ProtocolStatus makeStatus(int code, String message) {
    ProtocolStatus pstatus = makeStatus(code);
    pstatus.addToArgs(new Utf8(message));
    return pstatus;
  }
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

                  final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                }
              }
              final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
              final ProtocolStatus status = output.getStatus();
              final Content content = output.getContent();
              // unblock queue
              fetchQueues.finishFetchItem(fit);

              context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);

              int length = 0;
              if (content!=null && content.getContent()!=null) length= content.getContent().length;
              updateStatus(length);

              switch(status.getCode()) {

              case ProtocolStatusCodes.WOULDBLOCK:
                // retry ?
                fetchQueues.addFetchItem(fit);
                break;

              case ProtocolStatusCodes.SUCCESS:        // got a page
                output(fit, content, status, CrawlStatus.STATUS_FETCHED);
                break;

              case ProtocolStatusCodes.MOVED:         // redirect
              case ProtocolStatusCodes.TEMP_MOVED:
                byte code;
                boolean temp;
                if (status.getCode() == ProtocolStatusCodes.MOVED) {
                  code = CrawlStatus.STATUS_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlStatus.STATUS_REDIR_TEMP;
                  temp = true;
                }
                output(fit, content, status, code);
                final String newUrl = ProtocolStatusUtils.getMessage(status);
                handleRedirect(fit.url, newUrl, temp,  FetcherJob.PROTOCOL_REDIR);
                redirecting = false;
                break;
              case ProtocolStatusCodes.EXCEPTION:
                logError(fit.url, ProtocolStatusUtils.getMessage(status));
                /* FALLTHROUGH */
              case ProtocolStatusCodes.RETRY:          // retry
              case ProtocolStatusCodes.BLOCKED:
                output(fit, null, status, CrawlStatus.STATUS_RETRY);
                break;

              case ProtocolStatusCodes.GONE:           // gone
              case ProtocolStatusCodes.NOTFOUND:
              case ProtocolStatusCodes.ACCESS_DENIED:
              case ProtocolStatusCodes.ROBOTS_DENIED:
                output(fit, null, status, CrawlStatus.STATUS_GONE);
                break;

              case ProtocolStatusCodes.NOTMODIFIED:
                output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
                break;

              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(fit, null, status, CrawlStatus.STATUS_RETRY);
              }

              if (redirecting && redirectCount > maxRedirect) {
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      ProtocolStatus ps = ProtocolStatusUtils.makeStatus(
          ProtocolStatusCodes.EXCEPTION, e.toString());
      return new ProtocolOutput(null, ps);
    }
  }
View Full Code Here

Examples of org.apache.nutch.storage.ProtocolStatus

                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                }
              }
            }
            final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
            final ProtocolStatus status = output.getStatus();
            final Content content = output.getContent();
            // unblock queue
            fetchQueues.finishFetchItem(fit);

            context.getCounter("FetcherStatus", ProtocolStatusUtils.getName(status.getCode())).increment(1);

            int length = 0;
            if (content!=null && content.getContent()!=null) length= content.getContent().length;
            updateStatus(length);

            switch(status.getCode()) {

            case ProtocolStatusCodes.WOULDBLOCK:
              // retry ?
              fetchQueues.addFetchItem(fit);
              break;

            case ProtocolStatusCodes.SUCCESS:        // got a page
              output(fit, content, status, CrawlStatus.STATUS_FETCHED);
              break;

            case ProtocolStatusCodes.MOVED:         // redirect
            case ProtocolStatusCodes.TEMP_MOVED:
              byte code;
              boolean temp;
              if (status.getCode() == ProtocolStatusCodes.MOVED) {
                code = CrawlStatus.STATUS_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlStatus.STATUS_REDIR_TEMP;
                temp = true;
              }
              final String newUrl = ProtocolStatusUtils.getMessage(status);
              handleRedirect(fit.url, newUrl, temp,  FetcherJob.PROTOCOL_REDIR, fit.page);
              output(fit, content, status, code);
              break;
            case ProtocolStatusCodes.EXCEPTION:
              logFetchFailure(fit.url, ProtocolStatusUtils.getMessage(status));
              /* FALLTHROUGH */
            case ProtocolStatusCodes.RETRY:          // retry
            case ProtocolStatusCodes.BLOCKED:
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
              break;

            case ProtocolStatusCodes.GONE:           // gone
            case ProtocolStatusCodes.NOTFOUND:
            case ProtocolStatusCodes.ACCESS_DENIED:
            case ProtocolStatusCodes.ROBOTS_DENIED:
              output(fit, null, status, CrawlStatus.STATUS_GONE);
              break;

            case ProtocolStatusCodes.NOTMODIFIED:
              output(fit, null, status, CrawlStatus.STATUS_NOTMODIFIED);
              break;

            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn("Unknown ProtocolStatus: " + status.getCode());
              }
              output(fit, null, status, CrawlStatus.STATUS_RETRY);
            }

          } catch (final Throwable t) {                 // unexpected exception
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.