Package de.anomic.crawler.retrieval

Examples of de.anomic.crawler.retrieval.Request.url()
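
Request.url() returns the target of a queued crawl request as a URL object; the snippets below (from the YaCy crawler) normalize it for XML output, hash it for use as an index key, and read its host for politeness delays. A minimal sketch of the recurring pattern, assuming only the API shape visible in the snippets (nextEntry() is hypothetical, and the DigestURI type name is an assumption):

    // Hedged sketch, not tied to a specific YaCy version.
    Request entry = nextEntry();                  // hypothetical: obtain a queued crawl request
    DigestURI url = entry.url();                  // the crawl target
    String link = url.toNormalform(true, false);  // canonical URL string, as used for the XML output below
    byte[] urlhash = url.hash();                  // compact URL hash, used as GUID and index key
    String host = url.getHost();                  // may be null; the balancer falls back to localhost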


                prop.putXML("item_" + c + "_link", entry.url().toNormalform(true, false));
                prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
                prop.putXML("item_" + c + "_description", entry.name());
                prop.put("item_" + c + "_author", "");
                prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.appdate()));
                prop.put("item_" + c + "_guid", entry.url().hash());
                c++;
                maxCount--;
            }
            prop.put("item", c);
            prop.putXML("response", "ok");
View Full Code Here


              return null;
            }
            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = (
                    profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
                    (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
                    ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may trigger loading robots.txt from the server

            assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
            assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());

            if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops

            if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
                //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
View Full Code Here
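
The ternary above computes the politeness delay: strategies that never hit the network get zero, everything else asks Latency.waitingRemaining(), which consults the robots.txt database and the per-host access history. The same decision written as a standalone helper, a hedged sketch reusing names from the snippet (the helper itself and the CrawlProfile parameter type are stand-ins, not in the source):

            // Hedged sketch of the sleep-time decision above; sleeptimeFor() is not in the source.
            private long sleeptimeFor(final Request crawlEntry, final CrawlProfile profileEntry) {
                if (profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY) return 0; // never loads from the network
                if (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url())) return 0; // already cached
                // consults robots.txt and per-host latency; may load robots.txt from the server
                return Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
            }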


                    //System.out.println("*** delayed +=" + nexthash);
                    this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash);
                }
                try {
                    this.urlFileIndex.put(rowEntry);
                    String host = crawlEntry.url().getHost();
                    if (host == null) host = localhost;
                    this.domainStacks.remove(host);
                    failhash = nexthash;
                } catch (final RowSpaceExceededException e) {
                    Log.logException(e);
View Full Code Here

            }
            break;
        }
        if (crawlEntry != null) {
            if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear();
            try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {}
        }
      }
      if (crawlEntry == null) return null;

        if (delay && sleeptime > 0) {
View Full Code Here

        if (delay && sleeptime > 0) {
            // force a busy wait here
            // in the best case this should never happen if the balancer works properly;
            // this is only a protection against the worst case, where the crawler could
            // behave in a DoS-like manner
            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
            long loops = sleeptime / 1000;
            long rest = sleeptime % 1000;
            if (loops < 3) {
                rest = rest + 1000 * loops;
                loops = 0;
            }
            if (rest > 0) {try {this.wait(rest); } catch (final InterruptedException e) {}}
            for (int i = 0; i < loops; i++) {
                Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                try {this.wait(1000); } catch (final InterruptedException e) {}
            }
        }
        this.ddc.remove(crawlEntry.url().hash());
        Latency.update(crawlEntry.url());
View Full Code Here
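
The wait above is split into whole seconds plus a sub-second remainder so the remaining time can be logged once per second; delays under three seconds are folded into a single wait(). As a worked example (hedged, not from the source): sleeptime = 4700 gives loops = 4 and rest = 700, i.e. one wait(700) followed by four logged wait(1000) calls, while sleeptime = 2500 collapses to a single wait(2500). Afterwards the URL hash is removed from this.ddc and Latency.update() refreshes the per-host record that waitingRemaining() consults.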
