Source Code of org.vietspider.html.path2.DocumentExtractor

/***************************************************************************
 * Copyright 2001-2007 The VietSpider         All rights reserved.       *
 **************************************************************************/
package org.vietspider.html.path2;


import java.util.ArrayList;
import java.util.List;


import org.vietspider.html.HTMLDocument;
import org.vietspider.html.HTMLNode;
import org.vietspider.html.parser.HTMLParser2;


/** 
 * Author : Nhu Dinh Thuan
 *          nhudinhthuan@yahoo.com
 * Dec 6, 2007  
 */
public class DocumentExtractor {
  
  public HTMLDocument extractFirst(HTMLDocument document, NodePath [] nodePaths) {
    HTMLDocument [] documents = extractRow(document, nodePaths);
    if(documents.length < 1) return null;
    return documents[0];
  }
  
  public HTMLDocument extract(HTMLDocument document, NodePath... nodePaths) {
    HTMLNode root = document.getRoot();
//    CharsToken tokens = document.getTokens();
    
    HTMLNode newRoot = HTMLParser2.clone(root);
    HTMLDocument newDocument  = new HTMLDocument();
//    CharsToken newTokens = new CharsToken(newDocument);
//    newTokens.push((NodeImpl)newRoot);


    LookupNode lookupNode = new LookupNode();
    
    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> htmlNodes = lookupNode.lookupNodes(root, nodePaths[i]);
      if(htmlNodes == null ) continue;
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
//        lookupNode.extractTokens(tokens, newTokens, htmlNode);
        htmlNode.clone(newRoot);
      }
    }
    
    newDocument.setRoot(newRoot);
    return newDocument;
  }


  public HTMLDocument[] extractRow(HTMLDocument document, NodePath[] nodePaths) {
    List<List<HTMLNode>> nodeLists = new ArrayList<List<HTMLNode>>();
    HTMLNode root = document.getRoot();


    LookupNode lookupNode = new LookupNode();
    
    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> matchValues = lookupNode.lookupNodes(root, nodePaths[i]);
      if(matchValues != null) nodeLists.add(matchValues);
    }


    if(nodeLists.size() == 0 || nodeLists.get(0) == null) return new HTMLDocument[0];


    List<HTMLNode> nodes = nodeLists.get(0);
    HTMLDocument [] newDocuments = new HTMLDocument[nodes.size()]; 


    for(int i = 0; i < nodes.size(); i++) {
      HTMLNode newRoot = HTMLParser2.clone(root);
      if(nodes.get(i) == null) continue;
      nodes.get(i).clone(newRoot);
      for(int j = 1; j < nodeLists.size(); j++) {
        List<HTMLNode> nextNodes = nodeLists.get(j);
        if(i > nextNodes.size()) break;
        try {
          if(nextNodes.get(i) == null) continue;
          nextNodes.get(i).clone(newRoot);
        } catch (Exception e) {
          continue;
        }
      }
      newDocuments[i] = new HTMLDocument(newRoot);
    }
    return newDocuments;
  }
  


}
Source Code of org.vietspider.html.path2.DocumentExtractor

Related Classes of org.vietspider.html.path2.DocumentExtractor