Package org.vietspider.html

Examples of org.vietspider.html.HTMLNode


 
  // Tag names A and MARQUEE kept as an "ignore" list.
  // NOTE(review): the code that consults this array is not visible in this excerpt — confirm usage.
  private Name [] ignores = {Name.A, Name.MARQUEE};

  public void searchNodes(NodeIterator iterator, List<HTMLNode> nodes, Name name) {
    while(iterator.hasNext()) {
      HTMLNode n = iterator.next();
      if(n.isNode(name)) nodes.add(n);
    }
  } 
View Full Code Here


    return new NodePath(toNodes(htmlNode));
  }
 
  public Node[] toNodes(HTMLNode htmlNode) {
    if(htmlNode == null) return new Node[0];
    HTMLNode htmlParent = htmlNode.getParent();
    List<Node> list = new ArrayList<Node>();
    while(htmlParent != null){
      list.add(toNode(htmlParent, htmlNode));
      htmlNode = htmlParent;
      htmlParent = htmlNode.getParent();
View Full Code Here

    }
    return isIgnoreTag(node.getParent(), level+1);
  }

  public HTMLNode searchParent(HTMLDocument document, int level) throws Exception  {
    HTMLNode  body = searchBody(document);
    ContentRenderer renderer = createContentRenderer(body);

    int length = renderer.getTextValue().length();
    List<HTMLNode> renderNodes = renderer.getNodePositions(0, length);
    NodeRenderer nodeRenderer = new NodeRenderer(renderer, renderNodes, 0, length);
View Full Code Here

  }
 
  public List<HTMLNode> searchNodes(HTMLDocument document, int level, int step) throws Exception  {
    List<HTMLNode> values = new ArrayList<HTMLNode>();
   
    HTMLNode body = searchBody(document);
    ContentRenderer renderer = createContentRenderer(body);

    int length = renderer.getTextValue().length();
    List<HTMLNode> renderNodes = renderer.getNodePositions(0, length);
    NodeRenderer nodeRenderer = new NodeRenderer(renderer, renderNodes, 0, length);
View Full Code Here

    HTMLDocument doc = new HTMLParser2().createDocument(tokens);
    NodePathParser pathParser = new NodePathParser();
    HTMLExtractor extractor  = new HTMLExtractor();
   
    NodePath nodePath  = pathParser.toPath("BODY");
    HTMLNode body = extractor.lookNode(doc.getRoot(), nodePath);
   
    TextRenderer renderer = new TextRenderer(body, TextRenderer.HANDLER);
    String value = renderer.getTextValue().toString();
    return value.trim().split("\n");
  }
View Full Code Here

 
  protected HTMLNode searchBody(HTMLDocument document) throws Exception {
    RefsDecoder decoder = new RefsDecoder();
    NodeIterator iterator = document.getRoot().iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if(!node.isNode(Name.CONTENT)) continue;
      char [] chars = node.getValue();
      chars = decoder.decode(chars);

      chars = CharsUtil.cutAndTrim(chars, 0, chars.length);
      chars =  java.text.Normalizer.normalize(new String(chars), Normalizer.Form.NFC).toCharArray();
      node.setValue(chars);             
   

    HTMLExtractor extractor  = new HTMLExtractor();
    NodePathParser pathParser = new NodePathParser();
View Full Code Here

    if(documents.length < 1) return null;
    return documents[0];
  }
 
  public HTMLDocument extract(HTMLDocument document, NodePath... nodePaths) {
    HTMLNode root = document.getRoot();
//    CharsToken tokens = document.getTokens();
   
    HTMLNode newRoot = HTMLParser2.clone(root);
    HTMLDocument newDocument  = new HTMLDocument();
//    CharsToken newTokens = new CharsToken(newDocument);
//    newTokens.push((NodeImpl)newRoot);

    LookupNode lookupNode = new LookupNode();
View Full Code Here

    return newDocument;
  }

  public HTMLDocument[] extractRow(HTMLDocument document, NodePath[] nodePaths) {
    List<List<HTMLNode>> nodeLists = new ArrayList<List<HTMLNode>>();
    HTMLNode root = document.getRoot();

    LookupNode lookupNode = new LookupNode();
   
    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> matchValues = lookupNode.lookupNodes(root, nodePaths[i]);
      if(matchValues != null) nodeLists.add(matchValues);
    }

    if(nodeLists.size() == 0 || nodeLists.get(0) == null) return new HTMLDocument[0];

    List<HTMLNode> nodes = nodeLists.get(0);
    HTMLDocument [] newDocuments = new HTMLDocument[nodes.size()];

    for(int i = 0; i < nodes.size(); i++) {
      HTMLNode newRoot = HTMLParser2.clone(root);
      if(nodes.get(i) == null) continue;
      nodes.get(i).clone(newRoot);
      for(int j = 1; j < nodeLists.size(); j++) {
        List<HTMLNode> nextNodes = nodeLists.get(j);
        if(i > nextNodes.size()) break;
View Full Code Here

  public TextRenderer (HTMLNode root,
      final List<HTMLNode> contents, final int type, final boolean constain) {
    NodeIterator iterator = root.iterator();
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      switch (node.getName()) {
      case CONTENT:
        char [] chars = node.getValue();
        if(!isEmpty(chars)) {
          if(isValid(contents, node, constain)) {
            int start = builder.length();
            for(int k = 0; k < chars.length; k++) {
              builder.append(chars[k] == '\n' ? ' ' : chars[k]);
            }
            HTMLNode parent = node.getParent();
            if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
           
            int end = builder.length();
            positions.add(new NodePosition(node, start, end));
          }
        }
View Full Code Here

    if(documents.length < 1) return null;
    return documents[0];
  }
 
  public HTMLDocument extract(HTMLDocument document, NodePath... nodePaths) {
    HTMLNode root = document.getRoot();
//    CharsToken tokens = document.getTokens();
   
    HTMLNode newRoot = HTMLParser2.clone(root);
    HTMLDocument newDocument  = new HTMLDocument();
//    CharsToken newTokens = new CharsToken(newDocument);
//    newTokens.push((NodeImpl)newRoot);

    for(int i = 0; i < nodePaths.length; i++) {
      List<HTMLNode> htmlNodes = matchNodes(root, nodePaths[i]);
      if(htmlNodes == null ) continue;
      for(HTMLNode htmlNode : htmlNodes) {
        if(htmlNode == null) continue;
//        extractTokens(tokens, newTokens, htmlNode);
        newRoot.addChild(htmlNode);
//        htmlNode.setParent(newRoot);
      }
    }
   
    newDocument.setRoot(newRoot);
View Full Code Here

TOP

Related Classes of org.vietspider.html.HTMLNode

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.