Source Code of org.archive.crawler.postprocessor.DispositionProcessor

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.crawler.postprocessor;


import static org.archive.modules.CoreAttributeConstants.A_FETCH_BEGAN_TIME;
import static org.archive.modules.CoreAttributeConstants.A_FETCH_COMPLETED_TIME;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_FAILED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_LOST;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEEMED_NOT_FOUND;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED;

import java.util.Map;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.IgnoreRobotsPolicy;
import org.archive.modules.net.Robotstxt;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;


/**
* A step, late in the processing of a CrawlURI, that marks up the
* CrawlURI with values affecting its frontier disposition, and updates
* information that may have been affected by the fetch, such as robots
* data and other per-server stats.
*
* (Formerly called CrawlStateUpdater, when it did less.)
*
* @author gojomo
* @version $Date$, $Revision$
*/
public class DispositionProcessor extends Processor {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = -1072728147960180091L;
    private static final Logger logger =
        Logger.getLogger(DispositionProcessor.class.getName());

    protected ServerCache serverCache;
    public ServerCache getServerCache() {
        return this.serverCache;
    }
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }
   
    /**
     * How many multiples of the last fetch's elapsed time to wait before
     * recontacting the same server.
     */
    {
        setDelayFactor(5.0f);
    }
    public float getDelayFactor() {
        return (Float) kp.get("delayFactor");
    }
    public void setDelayFactor(float factor) {
        kp.put("delayFactor",factor);
    }
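
    // Example: with delayFactor 5.0, a fetch that took 900 ms yields a nominal
    // 4500 ms wait, which minDelayMs/maxDelayMs below then clamp.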

    /**
     * Always wait at least this long after one completion before recontacting
     * the same server, regardless of the delay-factor multiple.
     */
    {
        setMinDelayMs(3000);
    }
    public int getMinDelayMs() {
        return (Integer) kp.get("minDelayMs");
    }
    public void setMinDelayMs(int minDelay) {
        kp.put("minDelayMs",minDelay);
    }
   
    /**
     * The longest 'Crawl-Delay' (in seconds) given in a site's robots.txt
     * that will be respected; larger values are clamped to this threshold.
     */
    {
        setRespectCrawlDelayUpToSeconds(300);
    }
    public int getRespectCrawlDelayUpToSeconds() {
        return (Integer) kp.get("respectCrawlDelayUpToSeconds");
    }
    public void setRespectCrawlDelayUpToSeconds(int respect) {
        kp.put("respectCrawlDelayUpToSeconds",respect);
    }
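
    // Example: "Crawl-Delay: 10" in a site's robots.txt yields a 10-second
    // wait (10 <= 300), while "Crawl-Delay: 600" is clamped to the 300-second
    // threshold in politenessDelayFor() below.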

    /** Never wait more than this long, regardless of the delay-factor multiple. */
    {
        setMaxDelayMs(30000);
    }
    public int getMaxDelayMs() {
        return (Integer) kp.get("maxDelayMs");
    }
    public void setMaxDelayMs(int maxDelay) {
        kp.put("maxDelayMs",maxDelay);
    }   

    /** Maximum per-host bandwidth usage, in KB/sec; 0 disables the limit. */
    {
        setMaxPerHostBandwidthUsageKbSec(0);
    }
    public int getMaxPerHostBandwidthUsageKbSec() {
        return (Integer) kp.get("maxPerHostBandwidthUsageKbSec");
    }
    public void setMaxPerHostBandwidthUsageKbSec(int max) {
        kp.put("maxPerHostBandwidthUsageKbSec",max);
    }
   
    /**
     * Whether to set a CrawlURI's force-retired directive, retiring
     * its queue when it finishes. Mainly intended for URI-specific
     * overlay settings; setting true globally will just retire all queues
     * after they offer one URI, rapidly ending a crawl.
     */
    {
        setForceRetire(false);
    }
    public boolean getForceRetire() {
        return (Boolean) kp.get("forceRetire");
    }
    public void setForceRetire(boolean force) {
        kp.put("forceRetire",force);
    }
   
    /**
     * Auto-discovered module providing configured (or overridden)
     * User-Agent value and RobotsHonoringPolicy
     */
    protected CrawlMetadata metadata;
    public CrawlMetadata getMetadata() {
        return metadata;
    }
    @Autowired
    public void setMetadata(CrawlMetadata provider) {
        this.metadata = provider;
    }

    public DispositionProcessor() {
        super();
    }
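
    // Typical wiring, as a sketch (in a full Heritrix 3 crawl the ServerCache
    // and CrawlMetadata are injected by Spring through the @Autowired setters
    // above rather than set by hand; the variable names are placeholders):
    //
    //   DispositionProcessor disposition = new DispositionProcessor();
    //   disposition.setServerCache(serverCache);
    //   disposition.setMetadata(crawlMetadata);
    //   disposition.setDelayFactor(5.0f);      // defaults shown above
    //   disposition.setMinDelayMs(3000);
    //   disposition.setMaxDelayMs(30000);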

    @Override
    protected boolean shouldProcess(CrawlURI puri) {
        return true;
    }
   
    @Override
    protected void innerProcess(CrawlURI curi) {
        // Tally per-server, per-host, per-frontier-class running totals
        CrawlServer server = serverCache.getServerFor(curi.getUURI());

        String scheme = curi.getUURI().getScheme().toLowerCase();
        if ((scheme.equals("http") || scheme.equals("https"))
                && server != null) {
            // Update connection problems counter
            if(curi.getFetchStatus() == S_CONNECT_FAILED || curi.getFetchStatus() == S_CONNECT_LOST ) {
                server.incrementConsecutiveConnectionErrors();
            } else if (curi.getFetchStatus() > 0){
                server.resetConsecutiveConnectionErrors();
            }

            // Update robots info
            try {
                if ("/robots.txt".equals(curi.getUURI().getPath()) && curi.getFetchStatus() != S_DEFERRED) {
                    // short-circuit further retries with DEEMED_NOT_FOUND when
                    // all robots rules are being ignored
                    if (metadata.getRobotsPolicy() instanceof IgnoreRobotsPolicy) {
                        if(curi.getFetchStatus() < 0 && curi.getFetchStatus()!=S_DEFERRED) {
                            // prevent the rest of the usual retries
                            curi.setFetchStatus(S_DEEMED_NOT_FOUND);
                        }
                    }
                   
                    // Update server with robots info
                    // NOTE: in some cases the curi's status can be changed here
                    server.updateRobots(curi);
                }
            }
            catch (URIException e) {
                logger.severe("Failed to get path on " + curi.getUURI());
            }
        }
       
        // set politeness delay
        curi.setPolitenessDelay(politenessDelayFor(curi));
       
        // consider operator-set force-retire
        if (getForceRetire()) {
            curi.setForceRetire(true);
        }
       
        // TODO: set other disposition decisions
        // success, failure, retry(retry-delay)
    }
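
    // Example of the robots shortcut above: when the configured robots policy
    // is IgnoreRobotsPolicy and a robots.txt fetch fails (e.g. with
    // S_CONNECT_FAILED), its status is rewritten to S_DEEMED_NOT_FOUND so the
    // URI is not repeatedly retried.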
   
    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly, make the necessary arrangements so that no other URIs
     * at the same host are visited within the appropriate politeness window.
     *
     * @param curi
     *            The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        Map<String,Object> cdata = curi.getData();
        if (cdata.containsKey(A_FETCH_BEGAN_TIME)
                && cdata.containsKey(A_FETCH_COMPLETED_TIME)) {

            long completeTime = curi.getFetchCompletedTime();
            long durationTaken = (completeTime - curi.getFetchBeginTime());
            durationToWait = (long)(getDelayFactor() * durationTaken);

            long minDelay = getMinDelayMs();
            if (minDelay > durationToWait) {
                // wait at least the minimum
                durationToWait = minDelay;
            }

            long maxDelay = getMaxDelayMs();
            if (durationToWait > maxDelay) {
                // wait no more than the maximum
                durationToWait = maxDelay;
            }
           
            long respectThreshold = getRespectCrawlDelayUpToSeconds() * 1000L;
            if (durationToWait < respectThreshold) {
                // may need to extend wait
                CrawlServer s = getServerCache().getServerFor(curi.getUURI());
                String ua = curi.getUserAgent();
                if (ua == null) {
                    ua = metadata.getUserAgent();
                }
                Robotstxt rep = s.getRobotstxt();
                if (rep != null) {
                    long crawlDelay = (long)(1000 * rep.getDirectivesFor(ua).getCrawlDelay());
                    crawlDelay =
                        (crawlDelay > respectThreshold)
                            ? respectThreshold
                            : crawlDelay;
                    if (crawlDelay > durationToWait) {
                        // wait at least the directive crawl-delay
                        durationToWait = crawlDelay;
                    }
                }
            }
           
            long now = System.currentTimeMillis();
            int maxBandwidthKB = getMaxPerHostBandwidthUsageKbSec();
            if (maxBandwidthKB > 0) {
                // Enforce bandwidth limit
                ServerCache cache = this.getServerCache();
                CrawlHost host = cache.getHostFor(curi.getUURI());
                long minDurationToWait = host.getEarliestNextURIEmitTime()
                        - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                        (long) (processedBytes / maxBandwidth) + now);
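                // e.g. with maxPerHostBandwidthUsageKbSec=100, maxBandwidth is
                // 102.4 bytes/ms, so a 1,024,000-byte fetch pushes the host's
                // next-emit time 10,000 ms into the future (illustrative figures)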

                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }
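
    /*
     * A self-contained, illustrative sketch of the politeness-delay arithmetic
     * in politenessDelayFor() above, using assumed sample values (a 400 ms
     * fetch, the default settings shown earlier, and a robots.txt Crawl-Delay
     * of 10 seconds); it is not called by the processor itself.
     */
    static long exampleDelayCalculation() {
        long durationTaken = 400;                             // fetch took 400 ms
        long durationToWait = (long) (5.0f * durationTaken);  // delayFactor 5.0 -> 2000 ms
        durationToWait = Math.max(durationToWait, 3000L);     // minDelayMs 3000 -> 3000 ms
        durationToWait = Math.min(durationToWait, 30000L);    // maxDelayMs 30000 -> unchanged
        long respectThreshold = 300 * 1000L;                  // respectCrawlDelayUpToSeconds 300
        long crawlDelay = 10 * 1000L;                         // robots.txt "Crawl-Delay: 10"
        if (durationToWait < respectThreshold) {
            // honor the directive, but never beyond the respect threshold
            durationToWait = Math.max(durationToWait, Math.min(crawlDelay, respectThreshold));
        }
        return durationToWait;                                // 10000 ms
    }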
}