Package com.googlecode.flaxcrawler.frontier

Source Code of com.googlecode.flaxcrawler.frontier.DefaultScheduler

package com.googlecode.flaxcrawler.frontier;

import com.googlecode.flaxcrawler.concurrent.TaskQueue;
import com.googlecode.flaxcrawler.model.CrawlerTask;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.log4j.Logger;

/**
* Standard scheduler implementation. Starts an asyncronous worker thread that reads urls from the queue
* and adds them to the {@code TaskQueue}.
*/
public class DefaultScheduler implements Scheduler {

    private Logger log = Logger.getLogger(this.getClass());
    private TaskQueue taskQueue;
    private StatisticsService statisticsService;
    private final Object syncRoot = new Object();
    private final Queue<CrawlerTask> schedulerQueue = new LinkedList<CrawlerTask>();
    private final Thread workerThread;

    public DefaultScheduler(TaskQueue taskQueue, StatisticsService statisticsService) {
        this.statisticsService = statisticsService;
        this.taskQueue = taskQueue;

        workerThread = new Thread(new Runnable() {

            public void run() {
                doWorkLoop();
            }
        });
        workerThread.setDaemon(true);
        workerThread.start();
        log.info("Scheduler was successfully initialized and started");
    }

    public void schedule(CrawlerTask crawlerTask) {
        synchronized (syncRoot) {
            log.debug("Enqueueing task " + crawlerTask.getUrl() + " to the scheduler queue");
            schedulerQueue.add(crawlerTask);
        }
    }

    /**
     * Reads urls from queue and adds them to the TaskQueue (if url was not crawled yet)
     */
    private void doWorkLoop() {
        while (true) {
            CrawlerTask task = null;

            try {
                synchronized (syncRoot) {
                    task = schedulerQueue.poll();
                }
                if (task != null) {
                    if (!statisticsService.isCrawled(task.getUrl())) {
                        taskQueue.enqueue(task);
                        statisticsService.afterScheduling(task);
                        log.debug("Scheduled crawling of the " + task.getUrl());
                    } else {
                        log.debug("Url " + task.getUrl() + " was already crawled");
                    }
                }

                // Yielding context to another thread
                Thread.sleep(1);
            } catch (Exception ex) {
                log.error("Error processing task " + task == null ? "NOTASK" : task.getUrl() + " from the scheduler queue", ex);
            }
        }
    }
}
TOP

Related Classes of com.googlecode.flaxcrawler.frontier.DefaultScheduler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.