Package org.vietspider.html

Examples of org.vietspider.html.NodeIterator


    return node;
  }

  public StringBuilder getTextContent(HTMLNode node){
    StringBuilder value = new StringBuilder();
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(Name.CONTENT)) {
        value.append(n.getValue());   
      }   
    }
    /*if(node.getConfig().name() == Name.CONTENT){
View Full Code Here


  public static void main(String[] args) throws Exception {
    URL url = new URL("http://java.sun.com/");
    HTMLDocument document = new HTMLParser2().createDocument(url.openStream(), "utf-8");
   
    NodeIterator iterator =  document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(node.isNode(Name.SCRIPT)) {
        if(node.hasChildren() && node.getChildren().size() > 0) {
          System.out.println("===================================================");
          System.out.println(node.getChild(0).getTextValue());
        }
View Full Code Here

    }
  }
 
  private void searchDataNode(HTMLNode root) {*/
    NodePathParser pathParser = new NodePathParser();
    NodeIterator iterator = root.iterator();
  
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.DIV)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("id");
      if(attribute == null) continue;
      String value = attribute.getValue();
View Full Code Here

  }
 
  private HTMLNode searchPageNode(HTMLNode root) {
    HTMLNode parent = root.getParent();
    if(parent == null) return null;
    NodeIterator iterator = parent.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      String content  = node.getTextValue().toLowerCase();
      if(content.indexOf("page") < 0
          && content.indexOf("trang") < 0) continue;
      HTMLNode table = upParent(node, Name.TABLE);
View Full Code Here

    }
    return null;
  }
 
  private HTMLNode searchUserNode(HTMLNode root) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.A)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("class");
      if(attribute == null) continue;
      String value = attribute.getValue();
View Full Code Here

    }
    return null;
  }
 
  private HTMLNode searchContentNode(HTMLNode root, String clazz) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.DIV)) continue;
      Attributes attributes = node.getAttributes();
      Attribute attribute = attributes.get("id");
      if(attribute == null) continue;
      String value = attribute.getValue();
View Full Code Here

    }
    return null;
  }
 
  private boolean isPageList(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.A)) continue;
      Attributes attributes = n.getAttributes();
      Attribute attribute = attributes.get("href");
      if(attribute == null) continue;
      String value = attribute.getValue();
View Full Code Here

    }
    return false;
  }
 
  private HTMLNode searchPageNode2(HTMLNode node) {
    NodeIterator iterator = node.iterator();
    HTMLNode table = null;
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.A)) continue;
      List<HTMLNode> children = n.getChildren();
      if(children == null
          || children.size() != 1
          || !children.get(0).isNode(Name.CONTENT)) continue;
      String text = children.get(0).getTextValue();
      try {
        Integer.parseInt(text.trim());
        table = upParent(n, Name.TABLE);
        break;
      } catch (Exception e) {
      }
    }
    if(table == null) return null;
    NodeHandler nodeHandler = new NodeHandler();
    iterator = node.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.CONTENT)) continue;
      String text = n.getTextValue().toLowerCase().trim();
      if(text.startsWith("trang") || text.startsWith("page")) {
        if(nodeHandler.count(text) < 5) return table;
      }
View Full Code Here

      return null;
    }
   
    HTMLNode parent = node.getParent();
    if(parent == null) return null;
    NodeIterator iterator = parent.iterator();
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(!n.isNode(Name.CONTENT)) continue;
      if(n.getParent().isNode(Name.A)) continue;
      String content = n.getTextValue();
      if(indexOf(title, content)) return upParent(n, Name.TD, Name.DIV, Name.STRONG);
    }
View Full Code Here

 
  public void remove(HTMLNode root, HTMLNode first, HTMLNode last) {
    boolean remove = true;
   
    List<HTMLNode> removes = new ArrayList<HTMLNode>();
    NodeIterator nodeIterator = root.iterator();
//    System.out.println(new String(first.getValue()));
    while(nodeIterator.hasNext()) {
      HTMLNode node = nodeIterator.next();
      switch (node.getName()) {
      case UL:
        if(isLinkContainer(node)) removes.add(node);
        break;
      case DIV:
View Full Code Here

TOP

Related Classes of org.vietspider.html.NodeIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.