Package org.vietspider.html.path2

Examples of org.vietspider.html.path2.HTMLExtractor.extract()


    NodePath [] nodePaths = new NodePath[paths.length];
    for(int i=0; i<paths.length; i++){
      nodePaths[i] = pathParser.toPath(paths[i]);
    }
   
    HTMLDocument doc = htmlExtractor.extract(document, nodePaths);
    System.out.println(doc.getTextValue());
   
    paths = new String[]{
        "DIV[*].BR[*]",
    };
View Full Code Here


    HTMLParser2 parser2 = new HTMLParser2();
    HTMLDocument document  = parser2.createDocument(file, null);
    NodePathParser pathParser = new NodePathParser();
    NodePath nodePath = pathParser.toPath(path);
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    HTMLNode node = htmlExtractor.extract(document, new NodePath[]{nodePath}).getRoot();
    System.out.println(node.getTextValue());
  }

  public HTMLDocument removeNode(String name) throws Exception {
    HTMLParser2 parser2 = new HTMLParser2();
View Full Code Here

      HTMLDocument document = parser2.createDocument(url.openStream(), "utf-8");

      NodePathParser pathParser = new NodePathParser();
      NodePath nodePath = pathParser.toPath("BODY[0].DIV[0].TABLE[0].TBODY[0].TR[1].TD[3].DIV[10]");
      HTMLExtractor htmlExtractor = new HTMLExtractor();
      HTMLNode node = htmlExtractor.extract(document, new NodePath[]{nodePath}).getRoot();

      System.out.println(node.getTextValue());

      System.out.println("=================================================================");
View Full Code Here

    NodePathParser pathParser = new NodePathParser();
   
    NodePath nodePath = pathParser.toPath(path);
   
    HTMLExtractor htmlExtractor = new HTMLExtractor();
    document = htmlExtractor.extract(document, new NodePath[]{nodePath});
   
    List<HTMLNode> children = document.getRoot().getChildren();
   
    //print header
    List<String> headers = new ArrayList<String>();
View Full Code Here

        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].DIV[1].DIV[0].DIV[0].TABLE[0]",
        "BODY[0].DIV[1].TABLE[1].TBODY[0].TR[0].TD[0].TABLE[*].TBODY[0].TR[0].TD[1].DIV[0].DIV[0].DIV[0].TABLE[0]"
    };
   
    NodePath [] postPaths = pathParser.toNodePath(postPathValues);
    HTMLDocument document2 = htmlExtractor.extract(document, postPaths);
   
    String userPathValue = "TABLE[*].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].TABLE[0].TBODY[0].TR[0].TD[0].NOBR[0].A[0]";
    NodePath userPath = pathParser.toPath(userPathValue);
    List<HTMLNode> userNodes = htmlExtractor.matchNodes(document2.getRoot(), userPath);
   
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.