Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Segment


    return count;
  }

  private void processCurrentSegment() {
    Segment segment = parser.getCurrentSegment();
    // If this tag is inside the previous tag (e.g. a server tag) then
    // ignore it as it was already output along with the previous tag.
    if( segment.getEnd() <= lastSegEnd ) {
      return;
    }
    lastSegEnd = segment.getEnd();
    if( segment instanceof Tag ) {
      if( segment instanceof StartTag ) {
        processStartTag( (StartTag)segment );
      } else if ( segment instanceof EndTag ) {
        processEndTag( (EndTag)segment );
      } else {
        writer.write( segment.toString() );
      }
    } else {
      processText( segment );
    }
  }
View Full Code Here


    return count;
  }

  private void processCurrentSegment() {
    Segment segment = parser.getCurrentSegment();
    // if this tag is inside the previous tag (e.g. a server tag) then
    // ignore it as it was already output along with the previous tag.
    if( segment.getEnd() <= lastSegEnd ) {
      return;
    }
    lastSegEnd = segment.getEnd();
    if( segment instanceof Tag ) {
      if( segment instanceof StartTag ) {
        processStartTag( (StartTag)segment );
      } else if ( segment instanceof EndTag ) {
        processEndTag( (EndTag)segment );
      } else {
        writer.write( segment.toString() );
      }
    } else {
      processText( segment );
    }
  }
View Full Code Here

     * @param html
     * @return
     */
    public static String htmlToText(String html) {
        Source htmlSource = new Source(html);
        Segment htmlSeg = new Segment(htmlSource, 0, htmlSource.length());
        Renderer htmlRend = new Renderer(htmlSeg);
        return htmlRend.toString();
    }
View Full Code Here

            if (preTags.size() > 0) {
                result = preTags.get(0).getContent().toString();
            } else {

                log.debug("parsing element: {}", valueElement.toString());
                Segment valueText = valueElement.getContent();
                int elementsSize = valueText.getAllElements().size();
                String fieldText = "";
                if (elementsSize > 0) {
                    if (valueText.toString().contains("<br")) {
                        String delimitedText = valueText
                                .toString()
                                .replace("<br>", ";")
                                .replace("<br/>", ";")
                                .replace("<br />", ";");
                        log.debug("delimited text: {}", delimitedText);
                        Source newElement = new Source(delimitedText);
                        fieldText = newElement.getTextExtractor().toString();
                        if (fieldText.endsWith(";")) {
                            fieldText = fieldText.substring(0, fieldText.length() - 1).trim();
                        }
                    } else {
                        fieldText = valueText.getTextExtractor().toString();
                    }

                } else {
                    fieldText = valueText.getTextExtractor().toString();
                }
                // check for links..
                List<Element> links = valueElement.getAllElements(HTMLElementName.A);
                if (links.size() > 0) {
                    for (Element link : links) {
View Full Code Here

                        document.insert(element.getBegin(), buf.toString()); // 插入块指令
                    }
                    // ---- 指令属性处理 ----
                    for (int i = 0; i < directiveAttributes.size(); i++) {
                        Attribute attribute = (Attribute) directiveAttributes.get(i);
                        document.remove(new Segment(source, attribute.getBegin() - 1, attribute.getEnd())); // 移除属性
                    }

                    if (attributes != null) {
                        //检查扩展的ifattr指令
                        for (Attribute attribute : attributes) {
                            if (attribute != null) {
                                String name = attribute.getName();
                                if (ifattr.equals(name)) {
                                    String val = attribute.getValue();
                                    String[] arr = val.split(",");
                                    String attrName = arr[0].trim();
                                    String expression = arr[1].trim();

                                    //修改原attribute
                                    Attribute oriattr = attributes.get(attrName);
                                    if (oriattr != null) {
                                        String buf = String.format("#if(%s)%s=\"%s\"#end()", expression, oriattr.getName(), oriattr.getValue());
                                        document.replace(new Segment(source, oriattr.getBegin(), oriattr.getEnd()), buf);
                                        document.remove(new Segment(source, attribute.getBegin(), attribute.getEnd())); // 移除ifattr控制属性
                                    }
                                }
                            }
                        }

                        //检查扩展的setattr指令
                        for (Attribute attribute : attributes) {
                            if (attribute != null) {
                                String name = attribute.getName();
                                if (setattr.equals(name)) {
                                    String val = attribute.getValue();
                                    String[] arr = val.split(",");
                                    String attrName = arr[0].trim();
                                    String expression = arr[1].trim();

                                    //将控制指令直接替换为动态属性赋值
                                    Attribute oriattr = attributes.get(attrName);
                                    String buf = String.format("%s=\"%s\"", attrName, expression);
                                    document.replace(new Segment(source, attribute.getBegin(), attribute.getEnd()), buf);

                                    //如果有已经存在的静态属性,直接删去即可
                                    if (oriattr != null) {
                                        document.remove(new Segment(source, attribute.getBegin(), attribute.getEnd())); // 移除setattr控制属性
                                    }
                                }
                            }
                        }
                    }
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Segment

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.