Examples of ProtocolOutput


Examples of org.apache.nutch.protocol.ProtocolOutput

        response = new FileResponse(u, datum, this, getConf());   // make a request
 
        int code = response.getCode();
 
        if (code == 200) {                          // got a good response
          return new ProtocolOutput(response.toContent());              // return it
 
        } else if (code >= 300 && code < 400) {     // handle redirect
          if (redirects == MAX_REDIRECTS)
            throw new FileException("Too many redirects: " + url);
          u = new URL(response.getHeader("Location"));
          redirects++;               
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }
 
        } else {                                    // convert to exception
          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

      long delay = serverDelay;
     
      if (checkRobots) {
        try {
          if (!robots.isAllowed(this, u)) {
            return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
          }
        } catch (Throwable e) {
          // XXX Maybe bogus: assume this is allowed.
          if (logger.isTraceEnabled()) {
            logger.trace("Exception checking robot rules for " + url + ": " + e);
          }
        }

        long crawlDelay = robots.getCrawlDelay(this, u);
        delay = crawlDelay > 0 ? crawlDelay : serverDelay;
      }
      if (checkBlocking && maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
        // skip this page, otherwise the thread would block for too long.
        LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="
                + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000));
        return new ProtocolOutput(null, ProtocolStatus.STATUS_WOULDBLOCK);
      }
      String host = null;
      if (checkBlocking) {
        try {
          host = blockAddr(u, delay);
        } catch (BlockedException be) {
          return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
        }
      }
      Response response;
      try {
        response = getResponse(u, datum, false); // make a request
      } finally {
        if (checkBlocking) unblockAddr(host, delay);
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 301:   // moved permanently
          case 305:   // use proxy (Location is URL of proxy)
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 302:   // found (temporarily moved)
          case 303:   // see other (redirect after POST)
          case 307:   // temporary redirect
            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
            break;
          case 304:   // not modified
            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
            break;
          default:
            protocolStatusCode = ProtocolStatus.MOVED;
        }
        // handle this in the higher layer.
        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
      } else if (code == 400) { // bad request, mark as GONE
        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else if (code == 401) { // requires authorization, but no valid auth provided.
        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                + urlString));
      } else if (code == 404) {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
      } else if (code == 410) { // permanently GONE
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                + u));
      }
    } catch (Throwable e) {
      e.printStackTrace(LogUtil.getErrorStream(logger));
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

   
//    if (verbose) {
//      LOGGER.setLevel(Level.FINE);
//    }
   
    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
    Content content = out.getContent();
   
    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());
      System.out.println("Content Length: " +
                         content.getMetadata().get(Response.CONTENT_LENGTH));
      System.out.println("Content:");
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 301:   // moved permanently
          case 305:   // use proxy (Location is URL of proxy)
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 302:   // found (temporarily moved)
          case 303:   // see other (redirect after POST)
          case 307:   // temporary redirect
            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
            break;
          case 304:   // not modified
            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
            break;
          default:
            protocolStatusCode = ProtocolStatus.MOVED;
        }
        // handle this in the higher layer.
        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
      } else if (code == 400) { // bad request, mark as GONE
        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else if (code == 401) { // requires authorization, but no valid auth provided.
        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                + urlString));
      } else if (code == 404) {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
      } else if (code == 410) { // permanently GONE
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u));
      } else {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                + u));
      }
    } catch (Throwable e) {
      logger.error("Failed to get protocol output", e);
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

   
//    if (verbose) {
//      LOGGER.setLevel(Level.FINE);
//    }
   
    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
    Content content = out.getContent();
   
    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());
      System.out.println("Content Length: " +
                         content.getMetadata().get(Response.CONTENT_LENGTH));
      System.out.println("Content:");
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

      if (LOG.isTraceEnabled())
        LOG.trace("cache miss " + url);

      try {
        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
        ProtocolStatus status = output.getStatus();

        if (status.getCode() == ProtocolStatus.SUCCESS) {
          robotRules =  parseRules(url.toString(), output.getContent().getContent(),
                                  CONTENT_TYPE, agentNames);
        } else {                                      
          robotRules = EMPTY_RULES;                 // use default rules
        }
      } catch (Throwable t) {
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();

    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
   
    IndexWriters writers = new IndexWriters(getConf());
   
    if (!output.getStatus().isSuccess()) {
      System.out.println("Fetch failed with protocol status: " + output.getStatus());
      return 0;
    }
        
    Content content = output.getContent();

    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

   */
  public void setContentType(String testTextFile) throws ProtocolException {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    Assert.assertNotNull(urlString);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
        datum);
    Assert.assertNotNull(output);
    Assert.assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
        + output.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, output
        .getStatus().getCode());
    Assert.assertNotNull(output.getContent());
    Assert.assertNotNull(output.getContent().getContentType());
    Assert.assertEquals(expectedMimeType, output.getContent().getContentType());
    Assert.assertNotNull(output.getContent().getMetadata());
    Assert.assertEquals(expectedMimeType,
        output.getContent().getMetadata().get(Response.CONTENT_TYPE));

  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

        response = new FileResponse(u, datum, this, getConf());   // make a request
 
        int code = response.getCode();
 
        if (code == 200) {                          // got a good response
          return new ProtocolOutput(response.toContent());              // return it
 
        } else if (code == 304) {                   // got not modified
          return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED);

        } else if (code == 401) {                   // access denied / no read permissions
          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED));

        } else if (code == 404) {                   // no such file
          return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);

        } else if (code >= 300 && code < 400) {     // handle redirect
          if (redirects == MAX_REDIRECTS)
            throw new FileException("Too many redirects: " + url);
          u = new URL(response.getHeader("Location"));
          redirects++;               
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }
 
        } else {                                    // convert to exception
          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolOutput

        response = new FtpResponse(u, datum, this, getConf());   // make a request
 
        int code = response.getCode();
 
        if (code == 200) {                          // got a good response
          return new ProtocolOutput(response.toContent());              // return it
 
        } else if (code >= 300 && code < 400) {     // handle redirect
          if (redirects == MAX_REDIRECTS)
            throw new FtpException("Too many redirects: " + url);
          u = new URL(response.getHeader("Location"));
          redirects++;               
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }
        } else {                                    // convert to exception
          throw new FtpError(code);
        }
      }
    } catch (Exception e) {
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.