Examples of bixo.datum.ParsedDatum

bixo.datum.ParsedDatum


        ParserPolicy policy = new ParserPolicy( ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                                                linkTags,
                                                linkAttributeTypes);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Verify outlinks are correct (and we only get the a href ones).
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(4, outlinks.length);
        
        Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl());
        Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl());
        Assert.assertEquals("link1", outlinks[1].getAnchor());

View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
    
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
    
    // Verify content is correct
    Assert.assertEquals("Simple", parsedDatum.getTitle());
    
    compareTermsInStrings("Simple Content", parsedDatum.getParsedText());
  }

View Full Code Here

    public void testHtmlParsing() throws Exception {
        URL path = SimpleParserTest.class.getResource("/simple-page.html");


        BaseParser parser = new SimpleParser();
        FetchedDatum content = makeFetchedDatum(path);
        ParsedDatum parse = parser.parse(content);
        Assert.assertNotNull(parse.getParsedText());
        
        // TODO - add back in title text to simple-page, when we generate this
        File parsedTextFile = new File(SimpleParserTest.class.getResource("/" + "simple-page.txt").getFile());
        String expectedString = FileUtils.readFileToString(parsedTextFile, "utf-8");
        String actualString = parse.getParsedText();
        
        // Trim of leading returns so split() doesn't give us an empty term
        // TODO - use our own split that skips leading/trailing separators
        compareTermsInStrings(expectedString, actualString.replaceFirst("^[\\n]+", ""));


        // TODO reenable when Tika bug is fixed re not emitting <img> links.
        // Outlink[] outlinks = parse.getOutlinks();
        // Assert.assertEquals(10, outlinks.length);
        
        Assert.assertEquals("TransPac Software", parse.getTitle());
    }

View Full Code Here

                return new Outlink[0];
            }
        },
        new ParserPolicy());
        
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Verify content is correct
        Assert.assertEquals("Simple", parsedDatum.getTitle());
        
        compareTermsInStrings("Custom", parsedDatum.getParsedText());
    }

View Full Code Here

        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
        
        ParserPolicy policy = new ParserPolicy(Integer.MAX_VALUE);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Verify we got no URLs
        Assert.assertEquals(0, parsedDatum.getOutlinks().length);
    }

View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
    
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
    
    // Verify content is correct
    Assert.assertEquals("Simple", parsedDatum.getTitle());
    
    compareTermsInStrings("Simple Content", parsedDatum.getParsedText());
    Assert.assertEquals("en", parsedDatum.getLanguage());


    }

View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
    
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
    
    // Verify content is correct
    Assert.assertEquals("DublinCore Language Example", parsedDatum.getTitle());
    
    compareTermsInStrings("DublinCore Language Example Content", parsedDatum.getParsedText());
    
    Assert.assertEquals("ja", parsedDatum.getLanguage());


    }

View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
    
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
    
    // Verify content is correct
    Assert.assertEquals("SimpleHttpEquiv", parsedDatum.getTitle());
    
    compareTermsInStrings("SimpleHttpEquiv Content", parsedDatum.getParsedText());
    
    Assert.assertEquals("ja", parsedDatum.getLanguage());


    }

View Full Code Here

        // Call parser.parse
        ParserPolicy policy = new ParserPolicy( ParserPolicy.NO_MAX_PARSE_DURATION,
                                                BaseLinkExtractor.ALL_LINK_TAGS,
                                                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
        SimpleParser parser = new SimpleParser(new SimpleContentExtractor(), new SimpleLinkExtractor(), policy, true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Verify outlinks are correct
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(1, outlinks.length);
        Assert.assertEquals("http://domain.com/song.mid", outlinks[0].getToUrl());
    }

View Full Code Here

        ContentBytes content = new ContentBytes(htmlText.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
        
        // Call parser.parse
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Now take the resulting HTML, process it using Dom4J
        SAXReader reader = new SAXReader(new Parser());
        reader.setEncoding("UTF-8");
        String htmlWithMarkup = parsedDatum.getParsedText();
        Document doc = reader.read(new StringInputStream(htmlWithMarkup));
        
        // We have to do helicopter stunts since HTML has a global namespace on it, set
        // at the <html> element level.
        XPath xpath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p");

View Full Code Here

0 1 2

TOP

Related Classes of bixo.datum.ParsedDatum

bixo.examples.crawl.CreateUrlDatumFromOutlinksFunction

bixo.examples.crawl.CreateWritableSeqFileData

bixo.parser.DOMParser

bixo.parser.FakeParser

bixo.parser.SimpleParser

bixo.parser.SimpleParserTest

bixo.parser.TikaCallable

bixo.pipes.ParsePipe$ParseFunction

bixo.tools.FetchAndParseTool

cascading.tuple.Fields

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.