Package org.vietspider.html.renderer

Source Code of org.vietspider.html.renderer.ContentRendererBak

/***************************************************************************
* Copyright 2001-2008 The VietSpider         All rights reserved.       *
**************************************************************************/
package org.vietspider.html.renderer;

import java.util.ArrayList;
import java.util.List;

import org.vietspider.html.HTMLNode;
import org.vietspider.html.Name;
import org.vietspider.html.NodeIterator;
import org.vietspider.html.parser.NodeImpl;
import org.vietspider.token.TypeToken;


/**
* Author : Nhu Dinh Thuan
*          nhudinhthuan@yahoo.com
* Sep 30, 2008 
*/
public final class ContentRendererBak {
 
  private List<NodePosition> positions = new ArrayList<NodePosition>();
 
  private String text = "";
  private LinkChecker linkChecker;
 
  public ContentRendererBak (HTMLNode root,
      List<HTMLNode> ignores, List<HTMLNode> wrappers, LinkChecker linkChecker) {
    this.linkChecker = linkChecker;
    StringBuilder builder = new StringBuilder();
   
    NodeIterator iterator = root.iterator(ignores);
    while(iterator.hasNext()) {
      HTMLNode node = iterator.next();
      switch (node.getName()) {
      case CONTENT:
        char [] chars = node.getValue();
        if(!isEmpty(chars)) {
//          if(isValid(contents, node, constain)) {
            int start = builder.length();
            for(int k = 0; k < chars.length; k++) {
              builder.append(chars[k] == '\n' ? ' ' : chars[k]);
            }
            HTMLNode parent = node.getParent();
            if(parent != null && parent.isNode(Name.SPAN)) builder.append(' ');
           
            int end = builder.length();
            positions.add(new NodePosition(node, start, end));
          }
//        }
        break;
      case IMG:
//        positions.add(new NodePosition(node, -1, -1));
        break;
      case H1:
      case H2:
      case H3:
      case H4:
      case H5:
      case H6:
      case P:
      case BR:
      case LI:       
        if(!isEndWithNewLine(builder)) {
          builder.append('\n');
        }
        break;
       
      case IFRAME:
        System.out.println(builder.length());
        separateBlock(builder, 2);
//        System.out.println("====>" + builder.length());
        break;
      case DIV:
        separateBlock(builder, node, 2, wrappers);
        break;
      case TABLE:
        separateBlock(builder, node, 4, wrappers);
        break;
      case TR:
        separateBlock(builder, node, 2, wrappers);
        break;
      case TD:
        separateBlock(builder, node, 2, wrappers);
        break;
      case SCRIPT:
      case STYLE:
        NodeImpl nodeImpl = (NodeImpl) node;
        if(nodeImpl.getType() == TypeToken.TAG && iterator.hasNext()) iterator.next();
        break;
      default:
        if(builder.length() > 0) {
          char c = builder.charAt(builder.length()-1);
          if(!(Character.isWhitespace(c)
              || Character.isSpaceChar(c))) {
            builder.append(' ');
          }
        }
      break;
      }
    }
    text = builder.toString();
  }
 
//  private boolean isValid(List<HTMLNode> list, HTMLNode node, boolean constain) {
//    if(list == null) return true;
//    boolean listConstain = list.contains(node);
//    return constain ? listConstain : !listConstain;
//  }

  public String getTextValue() { return text; }

  private final boolean isEndWithNewLine(StringBuilder value) {
    int i = value.length()-1;
    while(i > -1) {
      char c = value.charAt(i);
      if(c == '\n') {
        return true;
      } else if(Character.isWhitespace(c)
          || Character.isSpaceChar(c)) {
        i--;
        continue;
      }
      return false;
    }
    return true;
  }

  public List<HTMLNode> getNodePositions(int start, int end) {
    List<HTMLNode> nodes = new ArrayList<HTMLNode>();
    for(int i = 0; i < positions.size(); i++) {
      NodePosition np = positions.get(i);
      if(np.getStart() < 0
          || np.getStart() < start) continue;
      if(np.getEnd() > end + 1) break;
      nodes.add(np.getNode());
    }
    return nodes;
  }
 
  public List<NodePosition> getPositionCollecton(int start, int end) {
    List<NodePosition> nodes = new ArrayList<NodePosition>();
    for(int i = 0; i < positions.size(); i++) {
      NodePosition np = positions.get(i);
      if(np.getStart() < 0
          || np.getStart() < start) continue;
      if(np.getEnd() > end + 1) break;
      nodes.add(np.clone(start));
    }
    return nodes;
  }

  private final boolean isEmpty(char [] chars) {
    int i = 0;
    while(i < chars.length) {
      if(Character.isWhitespace(chars[i])
          || Character.isSpaceChar(chars[i])) {
        i++;
        continue;
      }
      return false;
    }
    return true;
  }

  public List<NodePosition> getPositions() { return positions; }

  public void setPositions(List<NodePosition> positions) { this.positions = positions; }

  private void separateBlock(StringBuilder builder,
      HTMLNode node, int time, List<HTMLNode> wrappers) {
    if(!isWrapperContent(node) || wrappers.contains(node)) return;
    separateBlock(builder, time);
  }
 
  private void separateBlock(StringBuilder builder, int time) {
    int index = builder.length() - 1;
    while(index > -1) {
      char c = builder.charAt(index);
      if(Character.isSpaceChar(c)
          || Character.isWhitespace(c)) {
        index--;
      } else {
        break;
      }
    }
    StringBuilder pattern = new StringBuilder();
    for(int i  = 0; i < time; i++) {
      pattern.append('\\');
    }
    builder.insert(index + 1, pattern.toString());
//    if(node.isNode(Name.TABLE)) {
//      builder.insert(index + 1, "\\\\\\\\");
//    } else {
//      builder.insert(index + 1, "\\\\\\");
//    }
  }
 
  private boolean isWrapperContent(HTMLNode node){
    List<HTMLNode> children = node.getChildren();
    if(children == null) return false;
    for(int i = 0; i < children.size(); i++) {
      HTMLNode child = children.get(i);
      if(child.isNode(Name.CONTENT)
         || isWrapperContent(child)) return true;
    }
    return false;
  }

  public LinkChecker getLinkChecker() { return linkChecker; }

}
TOP

Related Classes of org.vietspider.html.renderer.ContentRendererBak

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.