Examples of CrawlController


Examples of edu.uci.ics.crawler4j.crawler.CrawlController

    public static void main(String[] args) throws Exception {
      String rootFolder = "/tmp";
      int numberOfCrawlers = 1;

      CrawlController controller = new CrawlController(rootFolder);
      controller.addSeed("http://hadoop.apache.org/");
      controller.addSeed("http://hadoop.apache.org/common/");
      controller.addSeed("http://hadoop.apache.org/hdfs/");
      controller.addSeed("http://hadoop.apache.org/mapreduce/");
      controller.addSeed("http://avro.apache.org/");
      controller.addSeed("http://hbase.apache.org/");
      controller.addSeed("http://hive.apache.org/");
      controller.addSeed("http://pig.apache.org/");
      controller.addSeed("http://zookeeper.apache.org/");
      controller.setPolitenessDelay(1000);
      controller.setMaximumCrawlDepth(2);
      controller.setMaximumPagesToFetch(1);

      controller.start(MyCrawler.class, numberOfCrawlers);
    }
View Full Code Here

Examples of org.archive.crawler.framework.CrawlController

    }
   
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi)
    throws InterruptedException {
        CrawlController controller = getCrawlController();
        StatisticsTracker stats = getStatisticsTracker();
        long allowedRuntimeMs = getRuntimeSeconds() * 1000L;
        long currentRuntimeMs = stats.getCrawlElapsedTime();
        if(currentRuntimeMs > allowedRuntimeMs){
            Operation op = getExpirationOperation();
            if(op != null){
                if (op.equals(Operation.PAUSE)) {
                    controller.requestCrawlPause();
                } else if (op.equals(Operation.TERMINATE)){
                    controller.requestCrawlStop(CrawlStatus.FINISHED_TIME_LIMIT);
                } else if (op.equals(Operation.BLOCK_URIS)) {
                    curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
                    curi.getAnnotations().add("Runtime exceeded " + allowedRuntimeMs +
                            "ms");
                    return ProcessResult.FINISH;
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.