
Source Code of com.manning.hip.testdata.Crawler$MyCrawler

package com.manning.hip.testdata;

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.regex.Pattern;

public class Crawler {

  public static void main(String[] args) throws Exception {
    String rootFolder = "/tmp";
    int numberOfCrawlers = 1;

    // The controller manages the crawl; rootFolder is where crawler4j
    // keeps its intermediate crawl data.
    CrawlController controller = new CrawlController(rootFolder);
    controller.addSeed("http://hadoop.apache.org/");
    controller.addSeed("http://hadoop.apache.org/common/");
    controller.addSeed("http://hadoop.apache.org/hdfs/");
    controller.addSeed("http://hadoop.apache.org/mapreduce/");
    controller.addSeed("http://avro.apache.org/");
    controller.addSeed("http://hbase.apache.org/");
    controller.addSeed("http://hive.apache.org/");
    controller.addSeed("http://pig.apache.org/");
    controller.addSeed("http://zookeeper.apache.org/");

    // Wait one second between requests, follow links at most two
    // levels deep, and stop after a single page has been fetched.
    controller.setPolitenessDelay(1000);
    controller.setMaximumCrawlDepth(2);
    controller.setMaximumPagesToFetch(1);

    controller.start(MyCrawler.class, numberOfCrawlers);
  }


  public static class MyCrawler extends WebCrawler {

    // Skip URLs that point at style sheets, scripts, and binary or
    // media content.
    private final Pattern filters = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    public MyCrawler() {
    }

    // Only visit pages on apache.org that don't match the filter above.
    @Override
    public boolean shouldVisit(WebURL url) {
      String href = url.getURL().toLowerCase();
      return !filters.matcher(href).matches() &&
          href.contains("apache.org");
    }

    @Override
    public void visit(Page page) {
      String url = page.getWebURL().getURL();

      // Standard out contains a single line per URL, with the URL
      // followed by all the words found on the page.
      String text = page.getText().replaceAll("[^a-zA-Z]+", " ");
      System.out.println(url + "\t" + text);

      // Standard err contains a line for each outgoing link from the
      // page we're crawling.
      for (WebURL link : page.getURLs()) {
        System.err.println(url + "\t" + link.getURL());
      }
    }
  }

}
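
The crawler writes the page text for each URL to standard out and the outgoing links to standard err, so one simple way to capture a crawl as files for later processing is to redirect both streams before starting it. The following is a minimal sketch, not part of the original listing; the CrawlerRunner class name and the /tmp output paths are assumptions.

package com.manning.hip.testdata;

import java.io.FileOutputStream;
import java.io.PrintStream;

public class CrawlerRunner {
  public static void main(String[] args) throws Exception {
    // Hypothetical output locations for the two streams.
    System.setOut(new PrintStream(
        new FileOutputStream("/tmp/crawl-pages.txt"), true));
    System.setErr(new PrintStream(
        new FileOutputStream("/tmp/crawl-links.txt"), true));

    // Delegate to Crawler.main(), which configures the CrawlController
    // and starts MyCrawler.
    Crawler.main(args);
  }
}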