/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.example.simple;

import edu.uci.ics.crawler4j.crawler.CrawlController;

/**
* @author Yasser Ganjisaffar <yganjisa at uci dot edu>
*/
public class Controller {
public static void main(String[] args) throws Exception {
if (args.length < 2) {
System.out.println("Please specify 'root folder' and 'number of crawlers'.");
return;
}
/*
* rootFolder is the folder where intermediate crawl data is
* stored.
*/
String rootFolder = args[0];
/*
* numberOfCrawlers is the number of concurrent crawler threads
* that should be started for this crawl.
*/
int numberOfCrawlers = Integer.parseInt(args[1]);
/*
* Instantiate the controller for this crawl. Note that if you want
* your crawl to be resumable (i.e. you can continue a crawl that was
* previously interrupted or crashed), you can either set
* crawler.enable_resume to true in the crawler4j.properties file or
* use the second parameter of the CrawlController constructor.
*
* Note: if you enable the resume feature and want to start a fresh
* crawl, you need to delete the contents of rootFolder manually.
*/
CrawlController controller = new CrawlController(rootFolder);
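/*
* Alternatively, the resume feature can be enabled through the constructor.
* A sketch, assuming the second parameter is a boolean flag that turns on
* resuming (check the CrawlController constructor of your crawler4j version):
*
* CrawlController controller = new CrawlController(rootFolder, true);
*/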
/*
* For each crawl, you need to add some seed URLs.
* These are the first URLs to be fetched; the crawler
* then follows the links found in those pages.
*/
controller.addSeed("http://www.ics.uci.edu/~yganjisa/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/");
/*
* Be polite:
* Make sure that we don't send more than 5 requests per
* second (200 milliseconds between requests).
*/
controller.setPolitenessDelay(200);
/*
* Optional:
* You can set the maximum crawl depth here.
* The default value is -1 for unlimited depth
*/
controller.setMaximumCrawlDepth(2);
/*
* Optional:
* You can set the maximum number of pages to crawl.
* The default value is -1 for an unlimited number of pages.
*/
controller.setMaximumPagesToFetch(500);
/*
* Do you need to set a proxy?
* If so, you can use:
* controller.setProxy("proxyserver.example.com", 8080);
* OR
* controller.setProxy("proxyserver.example.com", 8080, username, password);
*/
/*
* Note: you can configure several other parameters by modifying the
* crawler4j.properties file.
*/
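/*
* crawler4j.properties uses the standard Java key=value properties format.
* For example, it contains the crawler.enable_resume property mentioned
* above (the full list of available property names depends on the
* crawler4j version and is documented in the default properties file):
*
* crawler.enable_resume=true
*/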
/*
* Start the crawl. This is a blocking operation, meaning that your
* code will only reach the line after this call once crawling is
* finished.
*/
controller.start(MyCrawler.class, numberOfCrawlers);
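/*
* MyCrawler is the crawler class used above; it is defined in a separate
* file in this package. A minimal sketch (assuming the WebCrawler API of
* this crawler4j version, in which shouldVisit() receives a WebURL and
* visit() receives a Page) would look roughly like this:
*
* public class MyCrawler extends WebCrawler {
*
*     // Only follow links within www.ics.uci.edu
*     public boolean shouldVisit(WebURL url) {
*         return url.getURL().toLowerCase().startsWith("http://www.ics.uci.edu/");
*     }
*
*     // Process a downloaded page, e.g. print its URL
*     public void visit(Page page) {
*         System.out.println("Visited: " + page.getWebURL().getURL());
*     }
* }
*/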
}
}