Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.Parse


  public void testNone() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parser parser;
    Parse parse;

    urlString = "file:" + sampleDir + fileSeparator + none;
    protocol = ProtocolFactory.getProtocol(urlString);
    content = protocol.getContent(urlString);
    parser = ParserFactory.getParser(content.getContentType(), urlString);
    try {
      parse = parser.getParse(content);
      Properties metadata = parse.getData().getMetadata();
    } catch (ParseException e) {
      return;
    }
    fail("Expected ParseException");
View Full Code Here


    public void testIt() throws ProtocolException, ParseException {
        String urlString;
        Protocol protocol;
        Content content;
        Parser parser;
        Parse parse;

        for (int i = 0; i < sampleFiles.length; i++) {
            urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

            protocol = ProtocolFactory.getProtocol(urlString);
            content = protocol.getProtocolOutput(urlString).getContent();

            parser = ParserFactory.getParser(content.getContentType(),
                    urlString);
            parse = parser.getParse(content);

            //check that there are 3 outlinks:
            //http://test.channel.com
            //http://www-scf.usc.edu/~mattmann/
            //http://www.nutch.org

            ParseData theParseData = parse.getData();

            Outlink[] theOutlinks = theParseData.getOutlinks();

            assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);
View Full Code Here

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {
View Full Code Here

  public void testId3v2() throws ProtocolException, ParseException {

    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);
    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
    assertEquals("02", metadata.get("TRCK-Text"));
    assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
    assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v2", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v2", metadata.get("TALB-Text"));
    assertEquals("postgresql encoded by id3v2", metadata.get("TENC-Text"));

    assertEquals("postgresql title id3v2 - "
        + "postgresql album id3v2 - "
        + "postgresql artist id3v2", parse.getData().getTitle());
    assertEquals("http://localhost/", parse.getData().getOutlinks()[0].getToUrl());

  }
View Full Code Here

  public void testId3v1() throws ProtocolException, ParseException {

    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + id3v1;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
    assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v1", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v1", metadata.get("TALB-Text"));

    assertEquals("postgresql title id3v1 - "
        + "postgresql album id3v1 - "
        + "postgresql artist id3v1", parse.getData().getTitle());

  }
View Full Code Here

  public void testNone() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + none;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);
//    Metadata metadata = parse.getData().getParseMeta();
    if (parse.getData().getStatus().isSuccess()) {
      fail("Expected ParseException");
    }
  }
View Full Code Here

   * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
   * <br>Only the first occurrence of language is stored.
   */
  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
   
    Parse parse = parseResult.get(content.getUrl());

    // Trying to find the document's language
    LanguageParser parser = new LanguageParser(doc);
    String lang = parser.getLanguage();

    if (lang != null) {
      parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
    }
    return parseResult;
  }
View Full Code Here

    try {
      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {
        Content content = getContent(docs[t]);
        Parse parse = parser.parse(content).get(content.getUrl());
        assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
      }
    } catch (Exception e) {
      e.printStackTrace(System.out);
      fail(e.toString());
    }
View Full Code Here

    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
      final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
      if (url != null) {
View Full Code Here

  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      assertEquals("121", parse.getData().getMeta("width"));
      assertEquals("48", parse.getData().getMeta("height"));
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.Parse

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.