Source Code of edu.uci.ics.crawler4j.crawler.WebCrawler

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;

import java.util.ArrayList;
import java.util.List;

/**
 * WebCrawler is the Runnable class that is executed by each crawler
 * thread.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class WebCrawler implements Runnable {

  protected static final Logger logger = Logger.getLogger(WebCrawler.class.getName());

  /**
   * The id associated with the crawler thread running this instance
   */
  protected int myId;

  /**
   * The controller instance that has created this crawler thread. This
   * reference to the controller can be used for getting configurations of the
   * current crawl or adding new seeds during runtime.
   */
  protected CrawlController myController;

  /**
   * The thread within which this crawler instance is running.
   */
  private Thread myThread;

  /**
   * The parser that is used by this crawler instance to parse the content of
   * the fetched pages.
   */
  private Parser parser;

  /**
   * The fetcher that is used by this crawler instance to fetch the content of
   * pages from the web.
   */
  private PageFetcher pageFetcher;

  /**
   * The RobotstxtServer instance that is used by this crawler instance to
   * determine whether the crawler is allowed to crawl the content of each
   * page.
   */
  private RobotstxtServer robotstxtServer;

  /**
   * The DocIDServer that is used by this crawler instance to map each URL to
   * a unique docid.
   */
  private DocIDServer docIdServer;

  /**
   * The Frontier object that manages the crawl queue.
   */
  private Frontier frontier;

  /**
   * Is the current crawler instance waiting for new URLs? This field is
   * mainly used by the controller to detect whether all crawler instances
   * are waiting for new URLs; if so, there is no more work and crawling
   * can be stopped.
   */
  private boolean isWaitingForNewURLs;

  /**
   * Initializes the current instance of the crawler
   *
   * @param id
   *            the id of this crawler instance
   * @param crawlController
   *            the controller that manages this crawling session
   */
  public void init(int id, CrawlController crawlController) {
    this.myId = id;
    this.pageFetcher = crawlController.getPageFetcher();
    this.robotstxtServer = crawlController.getRobotstxtServer();
    this.docIdServer = crawlController.getDocIdServer();
    this.frontier = crawlController.getFrontier();
    this.parser = new Parser(crawlController.getConfig());
    this.myController = crawlController;
    this.isWaitingForNewURLs = false;
  }

  /**
   * Get the id of the current crawler instance
   *
   * @return the id of the current crawler instance
   */
  public int getMyId() {
    return myId;
  }

  public CrawlController getMyController() {
    return myController;
  }

  /**
   * This function is called just before starting the crawl by this crawler
   * instance. It can be used for setting up data structures or performing
   * any other initialization needed by this crawler instance.
   */
  public void onStart() {
    // Do nothing by default
    // Subclasses can override this to add their custom functionality
  }

  /**
   * This function is called just before the termination of the current
   * crawler instance. It can be used for persisting in-memory data or other
   * finalization tasks.
   */
  public void onBeforeExit() {
    // Do nothing by default
    // Subclasses can override this to add their custom functionality
  }

  /**
   * This function is called once the header of a page is fetched. It can be
   * overridden by subclasses to perform custom logic for different status
   * codes. For example, 404 pages can be logged.
   *
   * @param webUrl
   *            the URL whose header was fetched
   * @param statusCode
   *            the HTTP status code (or custom fetch status) returned for this URL
   * @param statusDescription
   *            a textual description of the status code
   */
  protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
    // Do nothing by default
    // Subclasses can override this to add their custom functionality
  }
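
  // Example (hypothetical subclass, not part of crawler4j itself): logging 404
  // responses, as the Javadoc above suggests. HttpStatus.SC_NOT_FOUND comes from
  // org.apache.http.HttpStatus, which is already imported by this class.
  //
  //   @Override
  //   protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
  //     if (statusCode == HttpStatus.SC_NOT_FOUND) {
  //       logger.warn("Got a 404 for: " + webUrl.getURL());
  //     }
  //   }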

  /**
   * This function is called if the content of a URL could not be fetched.
   *
   * @param webUrl
   *            the URL whose content could not be fetched
   */
  protected void onContentFetchError(WebURL webUrl) {
    // Do nothing by default
    // Subclasses can override this to add their custom functionality
  }

  /**
   * This function is called if there has been an error in parsing the
   * content of a URL.
   *
   * @param webUrl
   *            the URL whose content could not be parsed
   */
  protected void onParseError(WebURL webUrl) {
    // Do nothing by default
    // Subclasses can override this to add their custom functionality
  }

  /**
   * The CrawlController instance that has created this crawler instance will
   * call this function just before terminating this crawler thread. Classes
   * that extend WebCrawler can override this function to pass their local
   * data to their controller. The controller then puts this local data into a
   * List that can be used for processing the crawlers' local data (if needed).
   *
   * @return the local data of this crawler instance, or null if there is none
   */
  public Object getMyLocalData() {
    return null;
  }
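
  // Example (hypothetical subclass, not part of crawler4j itself): collecting a
  // per-crawler page counter and handing it back to the controller, which gathers
  // the returned objects into a list (getCrawlersLocalData() in recent crawler4j
  // versions; verify against the version you use).
  //
  //   private int visitedPages = 0;   // incremented inside visit()
  //
  //   @Override
  //   public Object getMyLocalData() {
  //     return visitedPages;
  //   }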

  public void run() {
    onStart();
    while (true) {
      List<WebURL> assignedURLs = new ArrayList<>(50);
      isWaitingForNewURLs = true;
      frontier.getNextURLs(50, assignedURLs);
      isWaitingForNewURLs = false;
      if (assignedURLs.size() == 0) {
        if (frontier.isFinished()) {
          return;
        }
        try {
          Thread.sleep(3000);
        } catch (InterruptedException e) {
          e.printStackTrace();
        }
      } else {
        for (WebURL curURL : assignedURLs) {
          if (curURL != null) {
            processPage(curURL);
            frontier.setProcessed(curURL);
          }
          if (myController.isShuttingDown()) {
            logger.info("Exiting because of controller shutdown.");
            return;
          }
        }
      }
    }
  }

  /**
   * Classes that extend WebCrawler can override this function to tell the
   * crawler whether the given URL should be crawled or not. The default
   * implementation indicates that all URLs should be included in the crawl.
   *
   * @param url
   *            the URL for which we want to know whether it should be
   *            included in the crawl or not.
   * @return true if the URL should be included in the crawl, false otherwise.
   */
  public boolean shouldVisit(WebURL url) {
    return true;
  }

  /**
   * Classes that extend WebCrawler can override this function to process
   * the content of the fetched and parsed page.
   *
   * @param page
   *            the page object that has just been fetched and parsed.
   */
  public void visit(Page page) {
    // Do nothing by default
    // Subclasses can override this to add their custom functionality
  }

  private void processPage(WebURL curURL) {
    if (curURL == null) {
      return;
    }
    PageFetchResult fetchResult = null;
    try {
      fetchResult = pageFetcher.fetchHeader(curURL);
      int statusCode = fetchResult.getStatusCode();
      handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
      if (statusCode != HttpStatus.SC_OK) {
        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
          if (myController.getConfig().isFollowRedirects()) {
            String movedToUrl = fetchResult.getMovedToUrl();
            if (movedToUrl == null) {
              return;
            }
            int newDocId = docIdServer.getDocId(movedToUrl);
            if (newDocId > 0) {
              // Redirect page is already seen
              return;
            }

            WebURL webURL = new WebURL();
            webURL.setURL(movedToUrl);
            webURL.setParentDocid(curURL.getParentDocid());
            webURL.setParentUrl(curURL.getParentUrl());
            webURL.setDepth(curURL.getDepth());
            webURL.setDocid(-1);
            webURL.setAnchor(curURL.getAnchor());
            if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
              webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
              frontier.schedule(webURL);
            }
          }
        } else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
          logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
        }
        return;
      }

      if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
        if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
          // Redirect page is already seen
          return;
        }
        curURL.setURL(fetchResult.getFetchedUrl());
        curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
      }

      Page page = new Page(curURL);
      int docid = curURL.getDocid();

      if (!fetchResult.fetchContent(page)) {
        onContentFetchError(curURL);
        return;
      }

      if (!parser.parse(page, curURL.getURL())) {
        onParseError(curURL);
        return;
      }

      ParseData parseData = page.getParseData();
      if (parseData instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) parseData;

        List<WebURL> toSchedule = new ArrayList<>();
        int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
        for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
          webURL.setParentDocid(docid);
          webURL.setParentUrl(curURL.getURL());
          int newdocid = docIdServer.getDocId(webURL.getURL());
          if (newdocid > 0) {
            // This is not the first time that this URL has been
            // visited, so we set its depth to a negative number.
            webURL.setDepth((short) -1);
            webURL.setDocid(newdocid);
          } else {
            webURL.setDocid(-1);
            webURL.setDepth((short) (curURL.getDepth() + 1));
            if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
              if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                toSchedule.add(webURL);
              }
            }
          }
        }
        frontier.scheduleAll(toSchedule);
      }
      try {
        visit(page);
      } catch (Exception e) {
        logger.error("Exception while running the visit method. Message: '" + e.getMessage() + "' at " + e.getStackTrace()[0]);
      }

    } catch (Exception e) {
      logger.error(e.getMessage() + ", while processing: " + curURL.getURL());
    } finally {
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
  }

  public Thread getThread() {
    return myThread;
  }

  public void setThread(Thread myThread) {
    this.myThread = myThread;
  }

  public boolean isNotWaitingForNewURLs() {
    return !isWaitingForNewURLs;
  }

}
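
Typical usage (a sketch, not part of the file above): subclass WebCrawler, override
shouldVisit and visit, and hand the subclass to a CrawlController. The wiring below
follows the crawler4j quickstart (CrawlConfig, PageFetcher, RobotstxtConfig,
RobotstxtServer, CrawlController); exact constructor signatures may differ slightly
between crawler4j versions, and the seed URL and storage folder are placeholders.

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

  // Skip common binary/static resources.
  private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|zip|pdf))$");

  @Override
  public boolean shouldVisit(WebURL url) {
    String href = url.getURL().toLowerCase();
    return !FILTERS.matcher(href).matches() && href.startsWith("http://www.example.com/");
  }

  @Override
  public void visit(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      logger.info("Visited: " + page.getWebURL().getURL() + " ("
          + htmlParseData.getOutgoingUrls().size() + " outgoing links)");
    }
  }

  public static void main(String[] args) throws Exception {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawler4j");  // intermediate crawl data is stored here
    config.setMaxDepthOfCrawling(3);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("http://www.example.com/");
    controller.start(MyCrawler.class, 4);  // 4 concurrent crawler threads
  }
}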