Source Code of edu.uci.ics.crawler4j.example.simple.Controller

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.uci.ics.crawler4j.example.simple;

import edu.uci.ics.crawler4j.crawler.CrawlController;

/**
* @author Yasser Ganjisaffar <yganjisa at uci dot edu>
*/

public class Controller {

    public static void main(String[] args) throws Exception {
      if (args.length < 2) {
        System.out.println("Please specify 'root folder' and 'number of crawlers'.");
        return;
      }
     
      /*
       * rootFolder is the folder where intermediate crawl data is
       * stored.
       */
      String rootFolder = args[0];
     
      /*
       * numberOfCrawlers specifies the number of concurrent threads
       * that should be started for crawling.
       */
      int numberOfCrawlers = Integer.parseInt(args[1]);
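
      /*
       * Example invocation (the folder path and thread count here are
       * illustrative, not prescribed by crawler4j):
       *
       *   java edu.uci.ics.crawler4j.example.simple.Controller /tmp/crawl 4
       */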
     
      /*
       * Instantiate the controller for this crawl. Note that if you want
       * your crawl to be resumable (meaning that you can resume the crawl
       * after it is interrupted or crashes), you can either set
       * crawler.enable_resume to true in the crawler4j.properties file or
       * use the second parameter of the CrawlController constructor.
       *
       * Note: if you enable the resume feature and want to start a fresh
       * crawl, you need to delete the contents of rootFolder manually.
       */
      CrawlController controller = new CrawlController(rootFolder);
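
      /*
       * A sketch of the resumable variant mentioned above. The boolean
       * second parameter is an assumption; check the CrawlController
       * constructors in your crawler4j version:
       *
       *   CrawlController controller = new CrawlController(rootFolder, true);
       */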
     
      /*
       * For each crawl, you need to add some seed URLs. These are
       * the first URLs to be fetched; the crawler then follows the
       * links found in these pages.
       */
      controller.addSeed("http://www.ics.uci.edu/~yganjisa/");
      controller.addSeed("http://www.ics.uci.edu/~lopes/");
      controller.addSeed("http://www.ics.uci.edu/");
     
      /*
       * Be polite:
       * Make sure that we don't send more than 5 requests per
       * second (200 milliseconds between requests).
       */
      controller.setPolitenessDelay(200);
     
      /*
       * Optional:
       * You can set the maximum crawl depth here.
       * The default value is -1 for unlimited depth.
       */
      controller.setMaximumCrawlDepth(2);
     
      /*
       * Optional:
       * You can set the maximum number of pages to crawl.
       * The default value is -1 for an unlimited number of pages.
       */
      controller.setMaximumPagesToFetch(500);
     
      /*
       * Do you need to set a proxy?
       * If so, you can use:
       * controller.setProxy("proxyserver.example.com", 8080);
       * OR
       * controller.setProxy("proxyserver.example.com", 8080, username, password);
       */
     
      /*
       * Note: you can configure several other parameters by modifying
       * crawler4j.properties file
       */
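
      /*
       * For example, crawler4j.properties might contain the resume flag
       * mentioned in the note above (other keys and their names depend
       * on your crawler4j version):
       *
       *   crawler.enable_resume = true
       */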
     
      /*
       * Start the crawl. This is a blocking operation: the code after
       * this line runs only once crawling has finished.
       */
      controller.start(MyCrawler.class, numberOfCrawlers);
    }

}
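
The call to controller.start above references MyCrawler, which is not shown on this page. Below is a minimal sketch of what such a companion class could look like, assuming the classic crawler4j extension points shouldVisit and visit; the exact method signatures and package locations vary between crawler4j versions, so treat them as assumptions rather than the project's actual example.

package edu.uci.ics.crawler4j.example.simple;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    /*
     * Decide whether a discovered URL should be fetched. This sketch
     * stays within the domain of the seeds added in Controller.
     */
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return href.startsWith("http://www.ics.uci.edu/");
    }

    /*
     * Called after a page has been fetched and parsed.
     */
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}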
