Source Code of edu.uci.ics.crawler4j.crawler.HTMLParser

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package edu.uci.ics.crawler4j.crawler;


import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.TextExtractor;


import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;


import edu.uci.ics.crawler4j.url.URLCanonicalizer;


/**
 * @author Yasser Ganjisaffar <yganjisa at uci dot edu>
 */


public class HTMLParser {


  private String text;
  private String title;


  private BulletParser bulletParser;
  private TextExtractor textExtractor;
  private LinkExtractor linkExtractor;


  private static final int MAX_OUT_LINKS = Configurations.getIntProperty(
      "fetcher.max_outlinks", 5000);


  private Set<String> urls;


  public HTMLParser() {
    bulletParser = new BulletParser();
    textExtractor = new TextExtractor();
    linkExtractor = new LinkExtractor();
    
    linkExtractor.setIncludeImagesSources(Configurations
        .getBooleanProperty("crawler.include_images", false));
  }


  public void parse(String htmlContent, String contextURL) {
    urls = new HashSet<String>();
    char[] chars = htmlContent.toCharArray();


    bulletParser.setCallback(textExtractor);
    bulletParser.parse(chars);
    text = textExtractor.text.toString().trim();
    title = textExtractor.title.toString().trim();


    bulletParser.setCallback(linkExtractor);
    bulletParser.parse(chars);
    Iterator<String> it = linkExtractor.urls.iterator();
    
    String baseURL = linkExtractor.base();
    if (baseURL != null) {
      contextURL = baseURL;
    }


    int urlCount = 0;
    while (it.hasNext()) {
      String href = it.next();
      href = href.trim();
      if (href.length() == 0) {
        continue;
      }
      String hrefWithoutProtocol = href.toLowerCase();
      if (href.startsWith("http://")) {
        hrefWithoutProtocol = href.substring(7);
      }
      if (hrefWithoutProtocol.indexOf("javascript:") < 0
          && hrefWithoutProtocol.indexOf("@") < 0) {
        URL url = URLCanonicalizer.getCanonicalURL(href, contextURL);
        if (url != null) {
          urls.add(url.toExternalForm());
          urlCount++;
          if (urlCount > MAX_OUT_LINKS) {
            break;
          }  
        }        
      }
    }
  }


  public String getText() {
    return text;
  }


  public String getTitle() {
    return title;
  }


  public Set<String> getLinks() {
    return urls;
  }
}
Source Code of edu.uci.ics.crawler4j.crawler.HTMLParser

Related Classes of edu.uci.ics.crawler4j.crawler.HTMLParser