Package org.apache.nutch.protocol.ftp

Source Code of org.apache.nutch.protocol.ftp.FtpRobotRulesParser

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol.ftp;

import java.net.URL;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RobotRulesParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;

/**
* This class is used for parsing robots for urls belonging to FTP protocol.
* It extends the generic {@link RobotRulesParser} class and contains
* Ftp protocol specific implementation for obtaining the robots file.
*/
public class FtpRobotRulesParser extends RobotRulesParser {

  private static final String CONTENT_TYPE = "text/plain";
  public static final Logger LOG = LoggerFactory.getLogger(FtpRobotRulesParser.class);

  FtpRobotRulesParser() { }

  public FtpRobotRulesParser(Configuration conf) {
    super(conf);
  }

  /**
   * The hosts for which the caching of robots rules is yet to be done,
   * it sends a Ftp request to the host corresponding to the {@link URL}
   * passed, gets robots file, parses the rules and caches the rules object
   * to avoid re-work in future.
   *
   *  @param ftp The {@link Protocol} object
   *  @param url URL
   * 
   *  @return robotRules A {@link BaseRobotRules} object for the rules
   */
  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {

    String protocol = url.getProtocol().toLowerCase()// normalize to lower case
    String host = url.getHost().toLowerCase();          // normalize to lower case

    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);

    boolean cacheRule = true;

    if (robotRules == null) {                     // cache miss
      if (LOG.isTraceEnabled())
        LOG.trace("cache miss " + url);

      try {
        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
        ProtocolStatus status = output.getStatus();

        if (status.getCode() == ProtocolStatus.SUCCESS) {
          robotRules =  parseRules(url.toString(), output.getContent().getContent(),
                                  CONTENT_TYPE, agentNames);
        } else {                                      
          robotRules = EMPTY_RULES;                 // use default rules
        }
      } catch (Throwable t) {
        if (LOG.isInfoEnabled()) {
          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
        }
        cacheRule = false;
        robotRules = EMPTY_RULES;
      }

      if (cacheRule)
        CACHE.put(protocol + ":" + host, robotRules)// cache rules for host
    }
    return robotRules;
  }
}
TOP

Related Classes of org.apache.nutch.protocol.ftp.FtpRobotRulesParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.