package com.crawljax.core;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.inject.Inject;
import javax.xml.xpath.XPathExpressionException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.crawljax.browser.EmbeddedBrowser;
import com.crawljax.condition.eventablecondition.EventableCondition;
import com.crawljax.condition.eventablecondition.EventableConditionChecker;
import com.crawljax.core.configuration.CrawlElement;
import com.crawljax.core.configuration.CrawlRules;
import com.crawljax.core.configuration.CrawljaxConfiguration;
import com.crawljax.core.configuration.PreCrawlConfiguration;
import com.crawljax.core.state.Identification;
import com.crawljax.core.state.StateVertex;
import com.crawljax.forms.FormHandler;
import com.crawljax.util.DomUtils;
import com.crawljax.util.XPathHelper;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSortedSet;
import com.google.inject.assistedinject.Assisted;
/**
* This class extracts candidate elements from the DOM tree, based on the tags provided by the user.
* Elements can also be excluded.
*/
public class CandidateElementExtractor {
private static final Logger LOG = LoggerFactory.getLogger(CandidateElementExtractor.class);
private final ExtractorManager checkedElements;
private final EmbeddedBrowser browser;
private final FormHandler formHandler;
private final boolean crawlFrames;
private final ImmutableMultimap<String, CrawlElement> excludeCrawlElements;
private final ImmutableList<CrawlElement> includedCrawlElements;
private final boolean clickOnce;
private final boolean randomizeElementsOrder;
private final ImmutableSortedSet<String> ignoredFrameIdentifiers;
private final boolean followExternalLinks;
private final String siteHostName;
/**
 * Creates a new CandidateElementExtractor and caches the crawl settings it needs.
 *
 * @param checker
 *            the ExtractorManager used for marking handled elements and for retrieving the
 *            EventableConditionChecker
 * @param browser
 *            the current browser instance used in the Crawler
 * @param formHandler
 *            the form handler.
 * @param config
 *            the crawl configuration; its crawl rules (included/excluded elements, frame
 *            settings, click-once, randomization, external links) and the host of its URL
 *            are read here.
 */
@Inject
public CandidateElementExtractor(ExtractorManager checker, @Assisted EmbeddedBrowser browser,
        FormHandler formHandler, CrawljaxConfiguration config) {
    this.checkedElements = checker;
    this.browser = browser;
    this.formHandler = formHandler;

    final CrawlRules crawlRules = config.getCrawlRules();
    final PreCrawlConfiguration preCrawl = crawlRules.getPreCrawlConfig();

    this.excludeCrawlElements = asMultiMap(preCrawl.getExcludedElements());

    // Included elements come from two places: the pre-crawl configuration and the
    // input specification. Both are merged into one immutable list.
    Builder<CrawlElement> included = ImmutableList.builder();
    included.addAll(preCrawl.getIncludedElements());
    included.addAll(crawlRules.getInputSpecification().getCrawlElements());
    this.includedCrawlElements = included.build();

    this.crawlFrames = crawlRules.shouldCrawlFrames();
    this.clickOnce = crawlRules.isClickOnce();
    this.randomizeElementsOrder = crawlRules.isRandomizeCandidateElements();
    this.ignoredFrameIdentifiers = crawlRules.getIgnoredFrameIdentifiers();
    this.followExternalLinks = crawlRules.followExternalLinks();
    this.siteHostName = config.getUrl().getHost();
}
/**
 * Indexes the given crawl elements by their tag name.
 *
 * @param elements
 *            the crawl elements to index.
 * @return an immutable multimap from tag name to the crawl elements declared for that tag.
 */
private ImmutableMultimap<String, CrawlElement> asMultiMap(
        ImmutableList<CrawlElement> elements) {
    final ImmutableMultimap.Builder<String, CrawlElement> byTagName =
            ImmutableMultimap.builder();
    for (int index = 0; index < elements.size(); index++) {
        CrawlElement crawlElement = elements.get(index);
        byTagName.put(crawlElement.getTagName(), crawlElement);
    }
    return byTagName.build();
}
/**
 * Extracts the candidate elements from the DOM currently loaded in the browser, using the
 * crawl tags defined by the user.
 *
 * @param currentState
 *            the state in which this extract method is requested.
 * @return the candidate elements that are not excluded; empty when the crawl conditions are
 *         not satisfied for the browser.
 * @throws CrawljaxException
 *             if the DOM could not be read.
 */
public ImmutableList<CandidateElement> extract(StateVertex currentState)
        throws CrawljaxException {
    boolean crawlConditionsHold = checkedElements.checkCrawlCondition(browser);
    if (!crawlConditionsHold) {
        LOG.info("State {} did not satisfy the CrawlConditions.", currentState.getName());
        return ImmutableList.of();
    }
    LOG.debug("Looking in state: {} for candidate elements", currentState.getName());

    List<CandidateElement> candidates = new LinkedList<>();
    try {
        Document dom = DomUtils.asDocument(browser.getStrippedDomWithoutIframeContent());
        // The top-level document has no related frame, hence the empty identifier.
        extractElements(dom, candidates, "");
    } catch (IOException e) {
        LOG.error(e.getMessage(), e);
        throw new CrawljaxException(e);
    }

    if (randomizeElementsOrder) {
        Collections.shuffle(candidates);
    }
    LOG.debug("Found {} new candidate elements to analyze!", candidates.size());
    return ImmutableList.copyOf(candidates);
}
/**
 * Collects the candidate elements of the given document, descending into its FRAME and IFRAME
 * children first and then evaluating every included crawl element against the document.
 *
 * <p>
 * The frame traversal does not depend on the crawl element being evaluated, so it is done
 * once per document instead of once per crawl element. Repeating it per crawl element
 * re-fetched every frame DOM from the browser and re-extracted all of its elements each time.
 *
 * @param dom
 *            the document to inspect.
 * @param results
 *            the list the found candidate elements are appended to.
 * @param relatedFrame
 *            the identifier of the frame this document belongs to; empty for the top-level
 *            document.
 */
private void extractElements(Document dom, List<CandidateElement> results,
        String relatedFrame) {
    LOG.debug("Extracting elements for related frame '{}'", relatedFrame);
    if (includedCrawlElements.isEmpty()) {
        // Nothing can be extracted, so do not touch the browser for frame content either.
        return;
    }

    addFramesCandidates(dom, results, relatedFrame, dom.getElementsByTagName("FRAME"));
    addFramesCandidates(dom, results, relatedFrame, dom.getElementsByTagName("IFRAME"));

    for (CrawlElement tag : includedCrawlElements) {
        LOG.debug("Extracting TAG: {}", tag);
        evaluateElements(dom, tag, results, relatedFrame);
    }
}
/**
 * Extracts candidate elements from every frame in the given node list that has an
 * identification and is not ignored.
 *
 * @param dom
 *            the document that contains the frames (not read by this method).
 * @param results
 *            the list the found candidate elements are appended to.
 * @param relatedFrame
 *            the identifier of the enclosing frame; may be null or empty.
 * @param frameNodes
 *            the frame elements to inspect; may be null, in which case nothing happens.
 */
private void addFramesCandidates(Document dom, List<CandidateElement> results,
        String relatedFrame, NodeList frameNodes) {
    if (frameNodes == null) {
        return;
    }
    // Nested frames are addressed as "<parent>.<child>".
    final String parentPrefix = Strings.isNullOrEmpty(relatedFrame) ? "" : relatedFrame + ".";

    for (int index = 0; index < frameNodes.getLength(); index++) {
        Element frame = (Element) frameNodes.item(index);
        String nameId = DomUtils.getFrameIdentification(frame);
        if (nameId == null) {
            continue;
        }
        String frameIdentification = parentPrefix + nameId;
        // TODO Stefan; Here the IgnoreFrameChecker is used, also in
        // WebDriverBackedEmbeddedBrowser. We must get this in 1 place.
        if (isFrameIgnored(frameIdentification)) {
            continue;
        }
        LOG.debug("frame Identification: {}", frameIdentification);
        try {
            Document frameDom = DomUtils.asDocument(browser.getFrameDom(frameIdentification));
            extractElements(frameDom, results, frameIdentification);
        } catch (IOException e) {
            // A broken frame must not abort extraction of the remaining frames.
            LOG.info("Got exception while inspecting a frame: {} continuing...",
                    frameIdentification, e);
        }
    }
}
/**
 * Decides whether the frame with the given identifier must be skipped.
 *
 * @param string
 *            the full frame identifier.
 * @return true when frames are not crawled at all, or when the identifier equals an ignored
 *         identifier or matches one that uses the '%' wildcard.
 */
private boolean isFrameIgnored(String string) {
    if (!crawlFrames) {
        return true;
    }
    for (String ignorePattern : ignoredFrameIdentifiers) {
        boolean ignored;
        if (ignorePattern.contains("%")) {
            // '%' is the user-facing wildcard; translate it to its regex equivalent.
            ignored = string.matches(ignorePattern.replace("%", ".*"));
        } else {
            ignored = ignorePattern.equals(string);
        }
        if (ignored) {
            return true;
        }
    }
    return false;
}
/**
 * Finds all elements in the document that match the given crawl element and evaluates each
 * of them as a candidate.
 *
 * @param dom
 *            the document to search.
 * @param crawl
 *            the crawl element describing which elements to look for.
 * @param results
 *            the list the found candidate elements are appended to.
 * @param relatedFrame
 *            the identifier of the frame the document belongs to.
 */
private void evaluateElements(Document dom, CrawlElement crawl,
        List<CandidateElement> results, String relatedFrame) {
    try {
        EventableConditionChecker conditionChecker =
                checkedElements.getEventableConditionChecker();
        for (Element match : getNodeListForTagElement(dom, crawl, conditionChecker)) {
            evaluateElement(results, relatedFrame, crawl, match);
        }
    } catch (CrawljaxException e) {
        LOG.warn("Catched exception during NodeList For Tag Element retrieval", e);
    }
}
/**
 * Returns the elements from the DOM tree that have the crawl element's tag name, satisfy its
 * XPath restriction (if any), are not yet checked and are not excluded.
 *
 * @param dom
 *            the document to search.
 * @param crawlElement
 *            the crawl element whose tag name selects the elements.
 * @param eventableConditionChecker
 *            the checker used to look up and evaluate the eventable condition.
 * @return the matching elements; empty when the crawl element has no tag name.
 */
private ImmutableList<Element> getNodeListForTagElement(Document dom,
        CrawlElement crawlElement,
        EventableConditionChecker eventableConditionChecker) {
    final String tagName = crawlElement.getTagName();
    if (tagName == null) {
        return ImmutableList.of();
    }

    final Builder<Element> collected = ImmutableList.builder();
    final EventableCondition eventableCondition =
            eventableConditionChecker.getEventableCondition(crawlElement.getId());
    // TODO Stefan; this part of the code should be re-factored, Hack-ed it this way to prevent
    // performance problems.
    final ImmutableList<String> expressions =
            getFullXpathForGivenXpath(dom, eventableCondition);

    final NodeList tagMatches = dom.getElementsByTagName(tagName);
    for (int index = 0; index < tagMatches.getLength(); index++) {
        Element element = (Element) tagMatches.item(index);
        boolean matchesXpath = elementMatchesXpath(eventableConditionChecker,
                eventableCondition, expressions, element);
        LOG.debug("Element {} matches Xpath={}", DomUtils.getElementString(element),
                matchesXpath);
        /*
         * TODO Stefan This is a possible Thread-Interleaving problem, as isChecked can return
         * false and when needed to add it can return true.
         */
        String id = element.getNodeName() + ": " + DomUtils.getAllElementAttributes(element);
        boolean isCandidate = matchesXpath
                && !checkedElements.isChecked(id)
                && !isExcluded(dom, element, eventableConditionChecker);
        if (!isCandidate) {
            LOG.debug("Element {} was not added", element);
            continue;
        }
        addElement(element, collected, crawlElement);
    }
    return collected.build();
}
/**
 * Checks whether the element lies under one of the given XPath expressions.
 *
 * @param eventableConditionChecker
 *            the checker that performs the XPath comparison.
 * @param eventableCondition
 *            the condition of the crawl element; may be null.
 * @param expressions
 *            the resolved XPath expressions the element must be located under.
 * @param element
 *            the element to test.
 * @return true when there is no XPath restriction or the element satisfies it; false when
 *         the check does not pass or fails with a runtime exception.
 */
private boolean elementMatchesXpath(EventableConditionChecker eventableConditionChecker,
        EventableCondition eventableCondition, ImmutableList<String> expressions,
        Element element) {
    if (eventableCondition == null || eventableCondition.getInXPath() == null) {
        // No restriction configured: every element qualifies.
        return true;
    }
    try {
        String elementXpath = XPathHelper.getXPathExpression(element);
        return eventableConditionChecker.checkXPathUnderXPaths(elementXpath, expressions);
    } catch (RuntimeException e) {
        return false;
    }
}
/**
 * Resolves the XPath restriction of the given condition to the XPath expressions found in
 * the document.
 *
 * @param dom
 *            the document to resolve against.
 * @param eventableCondition
 *            the condition carrying the XPath restriction; may be null.
 * @return the resolved expressions, or an empty list when there is no restriction or the
 *         expression could not be evaluated.
 */
private ImmutableList<String> getFullXpathForGivenXpath(Document dom,
        EventableCondition eventableCondition) {
    if (eventableCondition == null || eventableCondition.getInXPath() == null) {
        return ImmutableList.of();
    }
    try {
        ImmutableList<String> resolved = XPathHelper.getXpathForXPathExpressions(dom,
                eventableCondition.getInXPath());
        LOG.debug("Xpath {} resolved to xpaths in document: {}",
                eventableCondition.getInXPath(), resolved);
        return resolved;
    } catch (XPathExpressionException e) {
        LOG.debug("Could not load XPath expressions for {}", eventableCondition, e);
        return ImmutableList.of();
    }
}
/**
 * Adds the element to the builder and increases the elements counter, unless the crawl
 * element targets anchors and the element's href should be ignored.
 *
 * @param element
 *            the element to add.
 * @param builder
 *            the builder collecting the accepted elements.
 * @param crawlElement
 *            the crawl element that selected this element.
 */
private void addElement(Element element, Builder<Element> builder, CrawlElement crawlElement) {
    boolean isAnchor = "A".equalsIgnoreCase(crawlElement.getTagName());
    if (isAnchor && hrefShouldBeIgnored(element)) {
        return;
    }
    builder.add(element);
    LOG.debug("Adding element {}", element);
    checkedElements.increaseElementsCounter();
}
/**
 * Decides whether the href of the given element rules it out as a candidate.
 *
 * @param element
 *            the element whose "href" attribute is inspected.
 * @return true when the href points to a downloadable file, is a mailto link, or is external
 *         while external links are not followed.
 */
private boolean hrefShouldBeIgnored(Element element) {
    final String href = Strings.nullToEmpty(element.getAttribute("href"));
    if (isFileForDownloading(href)) {
        return true;
    }
    if (href.startsWith("mailto:")) {
        return true;
    }
    return !followExternalLinks && isExternal(href);
}
/**
 * Checks whether the given href points to a host other than the crawled site's host.
 *
 * @param href
 *            the link target to check; must not be null.
 * @return true when the href starts with "http", can be parsed, has a host component, and
 *         that host differs (ignoring case) from the site host; false otherwise.
 */
private boolean isExternal(String href) {
    if (!href.startsWith("http")) {
        return false;
    }
    try {
        String host = URI.create(href).getHost();
        if (host == null) {
            // No authority component, e.g. a relative link such as "http-faq.html" or an
            // opaque URI such as "http:foo". Without a host it cannot be called external.
            return false;
        }
        return !host.equalsIgnoreCase(siteHostName);
    } catch (IllegalArgumentException e) {
        LOG.info("Unreadable externa link {}", href);
        return false;
    }
}
/**
 * Matches hrefs that end in a literal ".pdf", ".ps", ".zip" or ".mp3" with at least one
 * character in front of the dot. Compiled once because a Pattern is immutable and reusable.
 */
private static final Pattern DOWNLOADABLE_FILE_PATTERN =
        Pattern.compile(".+\\.(?:pdf|ps|zip|mp3)");

/**
 * @param href
 *            the string to check
 * @return true if href ends with the pdf, ps, zip or mp3 file extension. The dot is matched
 *         literally, so a path such as "/maps" is not mistaken for a ".ps" file.
 */
private static boolean isFileForDownloading(String href) {
    return DOWNLOADABLE_FILE_PATTERN.matcher(href).matches();
}
/**
 * Turns the given DOM element into one or more candidate elements and appends those that
 * should be crawled to the results.
 *
 * @param results
 *            the list the accepted candidate elements are appended to.
 * @param relatedFrame
 *            the identifier of the frame the element lives in.
 * @param crawl
 *            the crawl element that selected the source element.
 * @param sourceElement
 *            the DOM element to evaluate.
 */
private void evaluateElement(List<CandidateElement> results, String relatedFrame,
        CrawlElement crawl, Element sourceElement) {
    final EventableCondition eventableCondition =
            checkedElements.getEventableConditionChecker().getEventableCondition(
                    crawl.getId());
    final String xpath = XPathHelper.getXPathExpression(sourceElement);

    final boolean hasLinkedInputs = eventableCondition != null
            && eventableCondition.getLinkedInputFields() != null
            && eventableCondition.getLinkedInputFields().size() > 0;

    List<CandidateElement> candidateElements;
    if (hasLinkedInputs) {
        // Input fields are connected to this element: one candidate per input value
        // combination.
        candidateElements =
                formHandler.getCandidateElementsForInputs(sourceElement, eventableCondition);
    } else {
        // No linked inputs: a single candidate identified by its XPath.
        Identification byXpath = new Identification(Identification.How.xpath, xpath);
        candidateElements = new ArrayList<CandidateElement>();
        candidateElements.add(new CandidateElement(sourceElement, byXpath, relatedFrame));
    }

    for (CandidateElement candidate : candidateElements) {
        // With click-once enabled, only candidates that markChecked accepts are kept.
        if (clickOnce && !checkedElements.markChecked(candidate)) {
            continue;
        }
        LOG.debug("Found new candidate element: {} with eventableCondition {}",
                candidate.getUniqueString(), eventableCondition);
        candidate.setEventableCondition(eventableCondition);
        results.add(candidate);
        /*
         * TODO add element to checkedElements after the event is fired! also add string
         * without 'atusa' attribute to make sure an form action element is only clicked for
         * its defined values
         */
    }
}
/**
 * Determines whether the element must be left out of the candidates.
 *
 * @param dom
 *            the document the element belongs to.
 * @param element
 *            the element to check.
 * @param eventableConditionChecker
 *            the checker used to look up and evaluate the exclusion conditions.
 * @return true if element should be excluded. Also when an ancestor of the given element is
 *         marked for exclusion, which allows for recursive exclusion of elements from
 *         candidates.
 */
private boolean isExcluded(Document dom, Element element,
        EventableConditionChecker eventableConditionChecker) {
    // Ancestors are checked first so an excluded container excludes its whole subtree.
    final Node parent = element.getParentNode();
    final boolean ancestorExcluded = parent instanceof Element
            && isExcluded(dom, (Element) parent, eventableConditionChecker);
    if (ancestorExcluded) {
        return true;
    }

    final String tagKey = element.getTagName().toUpperCase();
    for (CrawlElement exclusion : excludeCrawlElements.get(tagKey)) {
        EventableCondition eventableCondition =
                eventableConditionChecker.getEventableCondition(exclusion.getId());
        boolean matchesXPath;
        try {
            String asXpath = XPathHelper.getXPathExpression(element);
            matchesXPath =
                    eventableConditionChecker.checkXpathStartsWithXpathEventableCondition(
                            dom, eventableCondition, asXpath);
        } catch (CrawljaxException | XPathExpressionException e) {
            LOG.debug("Could not check exclusion by Xpath for element because {}",
                    e.getMessage());
            matchesXPath = false;
        }
        if (matchesXPath) {
            LOG.info("Excluded element because of xpath: " + element);
            return true;
        }
    }
    return false;
}
/**
 * Asks the ExtractorManager to evaluate the crawl conditions for the browser of this
 * extractor.
 *
 * @return the result of the ExtractorManager's crawl condition check.
 */
public boolean checkCrawlCondition() {
    final boolean conditionsSatisfied = checkedElements.checkCrawlCondition(browser);
    return conditionsSatisfied;
}
}