Package spiderman.plugin.impl

Source Code of spiderman.plugin.impl.BeginPointImpl

package spiderman.plugin.impl;

import java.net.URL;

import org.eweb4j.spiderman.plugin.BeginPoint;
import org.eweb4j.spiderman.spider.SpiderListener;
import org.eweb4j.spiderman.task.Task;
import org.eweb4j.spiderman.xml.Site;
import org.eweb4j.spiderman.xml.ValidHost;
import org.eweb4j.spiderman.xml.ValidHosts;
import org.eweb4j.util.CommonUtil;


public class BeginPointImpl implements BeginPoint{

  public Task confirmTask(Task task) throws Exception{
    //如果不是在给定的合法host列表里则不给于抓取
    ValidHosts vhs = task.site.getValidHosts();
    if (vhs == null || vhs.getValidHost() == null || vhs.getValidHost().isEmpty()){
      if (!CommonUtil.isSameHost(task.site.getUrl(), task.url))
        return null;
    }else{
      boolean isOk = false;
      String taskHost = new URL(task.url).getHost();
      for (ValidHost h : vhs.getValidHost()){
        if (taskHost.equals(h.getValue())){
          isOk = true;
          break;
        }
      }
     
      if (!isOk)
        return null;
    }
   
    return task;
  }

  public void init(Site site, SpiderListener listener) {
   
  }

  public void destroy() {
  }

}
TOP

Related Classes of spiderman.plugin.impl.BeginPointImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.