Package: com.flaptor.hounder.crawler.pagedb

Examples of com.flaptor.hounder.crawler.pagedb.PageDB


        try {
            server = new WebServer(8085);
            server.addResourceHandler("/", tmpDir+"/web");
            server.start();

            PageDB db = new PageDB(tmpDir+"/testdb");
            db.open(PageDB.WRITE);
            db.addPage(in);
            db.close();

            crawler = new Crawler();

            int tries = 0;
            int maxTries = 10;
            do {
                tries++;

                crawler.crawl(1);

                db.open(PageDB.READ);
                Iterator<Page> pages = db.iterator();
                assertTrue("The crawler lost or discarded the test page", pages.hasNext());
                out = pages.next();
                assertFalse("The crawler has more than the test page", pages.hasNext());
                db.close();
            } while (out.getRetries() > 0 && tries <= maxTries);

        } finally {
            if (null != crawler) {
                crawler.cleanup();
View Full Code Here


        try {
            server = new WebServer(8087);
            server.addResourceHandler("/", tmpDir+"/web");
            server.start();

            PageDB db = new PageDB(tmpDir+"/testdb");
            db.open(PageDB.WRITE);
            db.addPage(in);
            db.close();

            crawler = new Crawler();

            crawler.crawl(2);

            db.open(PageDB.READ);
            Iterator<Page> pages = db.iterator();
            assertTrue("The crawler lost or discarded all test pages", pages.hasNext());
            one = pages.next();
            assertTrue("The crawler lost or discarded the second test page", pages.hasNext());
            two = pages.next();
            assertFalse("The crawler has more than two pages", pages.hasNext());
            db.close();
        } finally {
            if (null != crawler) {
                crawler.cleanup();
            }
            server.requestStop();
View Full Code Here

        runRandomCrawl(100, 10, 30);
    }

    private int getPageDBFetchedCount (long[] fetchAttempts) throws Exception {
        int changed = 0;
        PageDB db = new PageDB(tmpDir+"/testdb");
        for (Page page : db) {
            int num = page.getUrl().charAt(26)-'0';
            long attempt = page.getLastAttempt();
            if (attempt != fetchAttempts[num]) {
                fetchAttempts[num] = attempt;
                changed++;
            }
        }
        db.close();
        return changed;
    }
View Full Code Here

        db.close();
        return changed;
    }

    private float[] getPageDBPriorities () throws Exception {
        PageDB db = new PageDB(tmpDir+"/testdb");
        float[] pri = new float[(int)db.getSize()+1];
        for (Page page : db) {
            int num = page.getUrl().charAt(26)-'0';
            pri[num] = page.getPriority();
        }
        db.close();
        return pri;
    }
View Full Code Here

        server.stop();
    }


    private float[] getPageDBScores (SimWeb web) throws Exception {
        PageDB db = new PageDB(tmpDir+"/testdb");
        float[] score = new float[(int)db.getSize()];
        for (Page page : db) {
            int num = SimWeb.urlToPage(page.getUrl());
            score[num] = page.getScore();
        }
        db.close();
        return score;
    }
View Full Code Here

        db.close();
        return score;
    }

    private float[] getPageDBAntiScores (SimWeb web) throws Exception {
        PageDB db = new PageDB(tmpDir+"/testdb");
        int size = (int)db.getSize();
        float[] antiScore = new float[(int)db.getSize()];
        for (Page page : db) {
            int num = SimWeb.urlToPage(page.getUrl());
            antiScore[num] = page.getAntiScore();
        }
        db.close();
        return antiScore;
    }
View Full Code Here

        PageDistributor distributor = new PageDistributor(nodes, node, mapper);
        Page page1 = PageTest.randomPage();
        page1.setUrl("http://example.com/test0=0");
        IRemotePageCatcher stubCatcher= distributor.getCatcher(page1);
        stubCatcher.addPage(page1);
        PageDB db = localCatcher.getCatch();
        db.open(PageDB.READ);
        Iterator<Page> pages = db.iterator();
        assertTrue("The page sent through rmi did not survive the adventure.", pages.hasNext());
        Page page2 = pages.next();
        assertTrue("The page has been changed by the trip through rmi:\n  1: "+page1+"\n  2: "+page2, page1.equals(page2));
        assertFalse("Sent one page through rmi and more than one came out the other end.", pages.hasNext());
        db.close();
    }
View Full Code Here

            done = 0;
            new DBCloser(0).start();
            new DBCloser(1).start();
            while (done < 2) Execute.sleep(100);

            PageDB db = new PageDB(dbNames[0]);
            db.open(DPageDB.READ);
            Iterator<Page> pages = db.iterator();
            assertTrue("The first distributed pagedb should have two pages, yet it has none.", pages.hasNext());
            Page page1 = pages.next();
            assertTrue("One of the pages in the first distributed pagedb is not expected: "+page1.getUrl(), page1.equals(page00) || page1.equals(page10));
            assertTrue("The first distributed pagedb should have two pages, yet it has one.", pages.hasNext());
            Page page2 = pages.next();
            assertTrue("One of the pages in the first distributed pagedb is not expected: "+page2.getUrl(), page2.equals(page00) || page2.equals(page10));
            assertFalse("One of the pages in the first distributed pagedb has been cloned:"+page1.getUrl(), page1.equals(page2));
            assertFalse("The first distributed pagedb should have two pages, yet it has more.", pages.hasNext());
            db.close();

            db = new PageDB(dbNames[1]);
            db.open(DPageDB.READ);
            pages = db.iterator();
            assertTrue("The second distributed pagedb should have two pages, yet it has none.", pages.hasNext());
            page1 = pages.next();
            assertTrue("One of the pages in the second distributed pagedb is not expected: "+page1.getUrl(), page1.equals(page01) || page1.equals(page11));
            assertTrue("The second distributed pagedb should have two pages, yet it has one.", pages.hasNext());
            page2 = pages.next();
            assertTrue("One of the pages in the second distributed pagedb is not expected: "+page2.getUrl(), page2.equals(page01) || page2.equals(page11));
            assertFalse("One of the pages in the second distributed pagedb has been cloned:"+page1.getUrl(), page1.equals(page2));
            assertFalse("The second pagedb should have two pages, yet it has more.", pages.hasNext());
            db.close();
        }
    }
View Full Code Here

    @Override
    public void close () throws IOException {
        // Merging only applies when the db was opened for writing; the low
        // nibble of 'mode' holds the open mode — TODO confirm the flag layout
        // against the PageDB constants.
        if ((mode & 0x0F) != READ) {
            logger.info("DPageDB ready for close. Will merge "+pageCatcher.catches()+" catched pages.");
            if (pageCatcher.catches() > 0) {
                // Drain the catcher's temporary pagedb into this db, marking
                // each page as local now that it has arrived at its home node.
                PageDB db = pageCatcher.getCatch();
                db.open(READ);
                for (Page page : db) {
                    page.setLocal(true);
                    super.addPage (page);
                }
                // Quiet close, then remove the temporary catch directory.
                Execute.close(db);
                db.deleteDir(false);
            }
            if (catcherIsLocal) {
                // Only stop the catcher we own; a shared/remote catcher is
                // someone else's responsibility.
                pageCatcher.stop();
            }
        }
View Full Code Here

    // start a new pagedb for storing catched pages.
    // Opened WRITE+APPEND+UNSORTED: pages arrive from remote nodes in no
    // particular order and are sorted later during the merge at close time
    // — TODO confirm against the PageDB open-mode semantics.
    private synchronized void newCatcher () {
        try {
            String dirname = catchDir + ".tmp";
            catchdb = new PageDB(dirname);
            catchdb.open(PageDB.WRITE + PageDB.APPEND + PageDB.UNSORTED);
        } catch (IOException e) {
            logger.error(e,e);
            // A catcher db is essential; surface the failure to the caller.
            throw new RuntimeException("Creating a new PageCatcher pagedb", e);
        }
View Full Code Here

TOP

Related Classes of com.flaptor.hounder.crawler.pagedb.PageDB

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.