Package org.archive.modules

Examples of org.archive.modules.CrawlURI

CrawlURI is Heritrix's per-URI state object: it wraps a UURI together with its discovery path, via-URI and link context, and carries scheduling directives, fetch-attempt counts, annotations, discovered out-links, and an arbitrary per-URI data map. The excerpts below show how crawler components create, schedule, report on, and attach data to CrawlURI instances.


        // alternative is synchronizing and we don't want to do this --
        // it causes hang ups as controller waits on a lock for this thread,
        // something it gets easily enough on old threading model but something
        // it can wait interminably for on NPTL threading model.
        // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM.
        CrawlURI c = currentCuri;
        if(c != null) {
            pw.print(" ");
            c.shortReportLineTo(pw);
            pw.print("    ");
            pw.print(c.getFetchAttempts());
            pw.print(" attempts");
            pw.println();
            pw.print("    ");
            pw.print("in processor: ");
            pw.print(currentProcessorName);
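
The excerpt above copies the volatile currentCuri field into a local before reporting, so the reporting thread never synchronizes with the worker that owns the URI. A minimal, hypothetical reporter class showing the same pattern (the field names currentCuri and currentProcessorName are taken from the excerpt; the class itself is illustrative, not Heritrix code):

    import java.io.PrintWriter;
    import org.archive.modules.CrawlURI;

    class CuriReporter {
        volatile CrawlURI currentCuri;          // written by the worker thread
        volatile String currentProcessorName;   // written by the worker thread

        void shortReportTo(PrintWriter pw) {
            CrawlURI c = currentCuri;           // single read; c cannot be nulled out from under us
            if (c == null) {
                pw.println(" [no CrawlURI]");
                return;
            }
            pw.print(" ");
            c.shortReportLineTo(pw);            // one-line summary of the URI
            pw.println();
            pw.println("    " + c.getFetchAttempts() + " attempts");
            pw.println("    in processor: " + currentProcessorName);
        }
    }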


    protected CrawlURI peekItem(final WorkQueueFrontier frontier)
    throws IOException {
        final BdbMultipleWorkQueues queues = ((BdbFrontier) frontier)
            .getWorkQueues();
        DatabaseEntry key = new DatabaseEntry(origin);
        CrawlURI curi = null;
        int tries = 1;
        while(true) {
            try {
                curi = queues.get(key);
            } catch (DatabaseException e) {
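
The peekItem() excerpt is cut off inside its DatabaseException handler. A hypothetical, bounded-retry sketch of the same idea follows; MAX_TRIES, the helper class, and the give-up behaviour are assumptions rather than the project's actual code, and the imports assume Heritrix 3's layout (BdbMultipleWorkQueues in org.archive.crawler.frontier, DatabaseEntry and DatabaseException from Berkeley DB JE):

    import java.io.IOException;
    import com.sleepycat.je.DatabaseEntry;
    import com.sleepycat.je.DatabaseException;
    import org.archive.crawler.frontier.BdbMultipleWorkQueues;
    import org.archive.modules.CrawlURI;

    class HeadPeeker {
        static final int MAX_TRIES = 3;   // illustrative bound

        CrawlURI peek(BdbMultipleWorkQueues queues, DatabaseEntry key)
                throws IOException {
            int tries = 1;
            while (true) {
                try {
                    return queues.get(key);   // head CrawlURI for this queue's key
                } catch (DatabaseException e) {
                    if (tries >= MAX_TRIES) {
                        throw new IOException("giving up after " + tries + " tries", e);
                    }
                    tries++;
                }
            }
        }
    }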

    @Override
    public Map<String, Object> shortReportMap() {
        Map<String,Object> data = new LinkedHashMap<String, Object>();
        data.put("serialNumber", serialNumber);
        CrawlURI c = currentCuri;
        if (c != null) {
            data.put("currentURI", c.toString());
            data.put("currentProcessor", currentProcessorName);
            data.put("fetchAttempts", c.getFetchAttempts());
        } else {
            data.put("currentURI", null);
        }

        long now = System.currentTimeMillis();
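
shortReportMap() above returns a LinkedHashMap, so callers can rely on insertion order when rendering the report. A small, hypothetical consumer that flattens such a map into a single status line:

    import java.util.Map;

    class ShortReportFormatter {
        // Joins the ordered key/value pairs produced by shortReportMap()
        // into one space-separated status line.
        static String format(Map<String, Object> data) {
            StringBuilder sb = new StringBuilder();
            for (Map.Entry<String, Object> entry : data.entrySet()) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(entry.getKey()).append('=').append(entry.getValue());
            }
            return sb.toString();
        }
    }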

                throw new RuntimeException(e); // can't happen
            }
            JSONObject jo = new JSONObject(decodedBody);
           
            if ("GET".equals(jo.getString("method"))) {
                CrawlURI curi;
                try {
                    curi = makeCrawlUri(jo);
                    // bypasses scoping (unless rechecking is configured)
                    getFrontier().schedule(curi);
                    if (logger.isLoggable(Level.FINE)) {
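
The handler above turns a JSON message into a CrawlURI (via makeCrawlUri(), shown in the next excerpt) and hands it straight to the frontier. A stripped-down, hypothetical sketch of that scheduling step, assuming a Frontier reference is passed in directly and that the org.archive.crawler.framework.Frontier interface of Heritrix 3 is in use:

    import org.apache.commons.httpclient.URIException;
    import org.archive.crawler.framework.Frontier;
    import org.archive.modules.CrawlURI;
    import org.archive.net.UURIFactory;

    class DirectScheduler {
        // Builds a CrawlURI from a plain URL string and schedules it;
        // as the excerpt notes, schedule() bypasses scoping unless
        // rechecking is configured.
        void scheduleUrl(Frontier frontier, String url) throws URIException {
            CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
            frontier.schedule(curi);
        }
    }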

            JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata");
            String parentHopPath = parentUrlMetadata.getString("pathFromSeed");
            String hopPath = parentHopPath + Hop.INFERRED.getHopString();

            CrawlURI curi = new CrawlURI(uuri, hopPath, via, LinkContext.INFERRED_MISC);
           
            // set the heritable data from the parent url, passed back to us via amqp
            // XXX brittle, only goes one level deep, and only handles strings and arrays, the latter of which it converts to a Set.
            // 'heritableData': {'source': 'https://facebook.com/whitehouse/', 'heritable': ['source', 'heritable']}
            JSONObject heritableData = parentUrlMetadata.getJSONObject("heritableData");
            for (String key: (Set<String>) heritableData.keySet()) {
                Object value = heritableData.get(key);
                if (value instanceof JSONArray) {
                    Set<String> valueSet = new HashSet<String>();
                    JSONArray arr = ((JSONArray) value);
                    for (int i = 0; i < arr.length(); i++) {
                        valueSet.add(arr.getString(i));
                    }
                    curi.getData().put(key, valueSet);
                } else {
                    curi.getData().put(key, heritableData.get(key));
                }
            }

            // set the http headers from the amqp message
            Map<String, String> customHttpRequestHeaders = new HashMap<String, String>();
            for (Object key : joHeaders.keySet()) {
                customHttpRequestHeaders.put(key.toString(),
                        joHeaders.getString(key.toString()));
            }
            curi.getData().put("customHttpRequestHeaders", customHttpRequestHeaders);

            /* Use HighestUriQueuePrecedencePolicy to ensure these high priority
             * urls really get crawled ahead of others.
             * See https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes
             */
            curi.setSchedulingDirective(SchedulingConstants.HIGH);
            curi.setPrecedence(1);
           
            //curi.setForceFetch(true);

            curi.getAnnotations().add(A_RECEIVED_FROM_AMQP);

            return curi;
        }
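
makeCrawlUri() above stows the AMQP-supplied headers in the CrawlURI's data map under the key "customHttpRequestHeaders". A hypothetical reader illustrating how a later processor could retrieve them (only the key name comes from the excerpt; the class is illustrative):

    import java.util.Collections;
    import java.util.Map;
    import org.archive.modules.CrawlURI;

    class CustomHeaderReader {
        // Returns the headers stashed by makeCrawlUri(), or an empty map
        // if this URI did not arrive via AMQP.
        @SuppressWarnings("unchecked")
        static Map<String, String> headersFor(CrawlURI curi) {
            Object value = curi.getData().get("customHttpRequestHeaders");
            if (value instanceof Map) {
                return (Map<String, String>) value;
            }
            return Collections.emptyMap();
        }
    }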

        // alternative is synchronizing and we don't want to do this --
        // it causes hang ups as controller waits on a lock for this thread,
        // something it gets easily enough on old threading model but something
        // it can wait interminably for on NPTL threading model.
        // See [ 994946 ] Pause/Terminate ignored on 2.6 kernel 1.5 JVM.
        CrawlURI c = currentCuri;
        if(c != null) {
            w.print(" ");
            w.print(currentProcessorName);
            w.print(" ");
            w.print(c.toString());
            w.print(" (");
            w.print(c.getFetchAttempts());
            w.print(") ");
        } else {
            w.print(" [no CrawlURI] ");
        }
       

        if (base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        try {
            UURI n = UURIFactory.getInstance(base + "/" + file);
            CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);           
        }
    }

            String scheme = uuri.getScheme();
            String auth = uuri.getEscapedAuthority();
            String path = uuri.getEscapedCurrentHierPath();
            UURI parent = UURIFactory.getInstance(scheme + "://" + auth + path);

            CrawlURI link = curi.createCrawlURI(parent, LinkContext.NAVLINK_MISC,
                    Hop.NAVLINK);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logger.log(Level.WARNING, "URI error during extraction.", e);
        }
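
The two excerpts above show the same out-link idiom: derive a related UURI, wrap it with createCrawlURI(), and append it to the current URI's out-links. A self-contained, hypothetical helper combining the pieces (the class and method names are illustrative, and the import paths assume Heritrix 3's layout):

    import java.util.logging.Level;
    import java.util.logging.Logger;
    import org.apache.commons.httpclient.URIException;
    import org.archive.modules.CrawlURI;
    import org.archive.modules.extractor.Hop;
    import org.archive.modules.extractor.LinkContext;
    import org.archive.net.UURI;
    import org.archive.net.UURIFactory;

    class NavLinkHelper {
        private static final Logger logger =
                Logger.getLogger(NavLinkHelper.class.getName());

        // Adds "<base>/<file>" as a NAVLINK out-link of curi, trimming a
        // trailing slash from base, as in the first excerpt above.
        static void addNavLink(CrawlURI curi, String base, String file) {
            if (base.endsWith("/")) {
                base = base.substring(0, base.length() - 1);
            }
            try {
                UURI n = UURIFactory.getInstance(base + "/" + file);
                CrawlURI link = curi.createCrawlURI(n, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
                curi.getOutLinks().add(link);
            } catch (URIException e) {
                logger.log(Level.WARNING, "URI error during extraction.", e);
            }
        }
    }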

    this.filter.setDestination(this);
    }
   
    public void testAdding() throws URIException {
        this.filter.add(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addNow(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addForce(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        // Should only have added this URI once.
        assertTrue("Count is off", this.filter.count() == 1);
    }

        int count = 0;
        final int MAX_COUNT = 1000;
        for (; count < MAX_COUNT; count++) {
          UURI u = UURIFactory.getInstance("http://www" +
              count + ".archive.org/" + count + "/index.html");
          this.filter.add(u.toString(), new CrawlURI(u));
          if (count > 0 && ((count % 100) == 0)) {
            list.add(u);
          }
        }
        this.logger.info("Added " + count + " in " +
            (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Added random " + list.size() + " in " +
            (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Deleted random " + list.size() + " in " +
            (System.currentTimeMillis() - start));
        // Looks like delete doesn't work.
        assertTrue("Count is off: " + this.filter.count(),

