Examples of ParseData

cn.edu.hfut.dmic.webcollector.parser.ParseData
@author hu
edu.uci.ics.crawler4j.parser.ParseData
info.bliki.api.ParseData
wikimedia.org/w/api.php">Wikimedia API
net.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()
org.apache.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()

Examples of org.apache.nutch.parse.ParseData

    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) {
      cnt = 0L;
      long errors = 0L;
      ParseData value = new ParseData();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (!value.getStatus().isSuccess()) errors++;
        }
        mreaders[i].close();
      }
      stats.parsed = cnt;
      stats.parseErrors = errors;

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

      title = "";


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    
  public Parse getParse(Content content)
  {
//    return new ParseImpl(content.getUrl(),  TODO MC BUG - don't index url as content
    return new ParseImpl("",
      new ParseData(ParseStatus.STATUS_SUCCESS,
      "", new Outlink[0], content.getMetadata()));
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    add(urlStr, doc, "encoding", parse.getData().getMeta(ENCODING_KEY),
      false, true, true, false, false);


    // Get metadatas.
    MapWritable mw = datum.getMetaData();
    ParseData pd = parse.getData();


    // Add as stored, indexed, and untokenized but not lowercased.
    add(urlStr, doc, ARCCOLLECTION_KEY,
      getMetadataValue(ARCCOLLECTION_KEY, pd, mw),
      false, true, true, false);

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    }


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
      outlinks, content.getMetadata());
    parseData.setConf(this.conf);
    
    return new ParseImpl(text, parseData);
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    }


    if (text == null) { text = ""; }
    if (title == null) { title = ""; }


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);


    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData

    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");


    IndexingFilters filters = new IndexingFilters(conf);
    NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
     
    Assert.assertNull(doc);
  }

View Full Code Here

Examples of org.apache.nutch.parse.ParseData


    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);


    IndexingFilters filters1 = new IndexingFilters(conf);
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());


    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
    Metadata md = new Metadata();
    md.add("example","data");
    // set content metadata property defined in MetadataIndexer
    conf.set("index.content.md","example");
    // add MetadataIndxer filter
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters2 = new IndexingFilters(conf);
    NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());
    Assert.assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());
  }

View Full Code Here

0 1 2 3 4 5

TOP

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.