Examples of ParseText


Examples of org.apache.nutch.parse.ParseText

              ((ParseData)o).setParseMeta(new Metadata());
            }
            ((ParseData)o).getParseMeta().set(SEGMENT_NAME_KEY.toString(), segment);
          } else if (o instanceof ParseText) {
            String text = ((ParseText)o).getText();
            o = new ParseText(SEGMENT_NAME_KEY.toString() +
                    segment + SEGMENT_NAME_KEY.toString() + text);
            wrapper.set(o);
          } else {
            throw new IOException("Unknown value type: " + o.getClass().getName() + "(" + o + ")");
          }
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

                int idx = text.indexOf(nameMarker, nameMarker.length());
                if (idx != -1) {
                  text = text.substring(idx + nameMarker.length());
                }
              }
              o = new ParseText(text);
            }
            pt_out = ensureMapFile(slice, ParseText.DIR_NAME, ParseText.class);
            pt_out.append(key, o);
          }
        }
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    if (parse != null) {
      ParseData data = parse.getData();
      data.getContentMeta().remove(Response.CONTENT_TYPE);
      mergeMetadata(data.getParseMeta(), parseMeta);
      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
              .getContentMeta(), data.getParseMeta()));
    } else {
      contentMeta.remove(Response.CONTENT_TYPE);
      parseResult.put(link, new ParseText(text), new ParseData(
          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
          parseMeta));
    }

  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
    long curSize = 0;
    countSeg1 = 0;
    while (curSize < blkSize * 2) {
      k.set("seg1-" + df.format(countSeg1));
      w.append(k, new ParseText("seg1 text " + countSeg1));
      countSeg1++;
      curSize += 40; // roughly ...
    }
    w.close();
    System.err.println(" - done: " + countSeg1 + " records.");
    System.err.println("Creating large segment 2...");
    ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
    w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
    curSize = 0;
    countSeg2 = 0;
    while (curSize < blkSize * 2) {
      k.set("seg2-" + df.format(countSeg2));
      w.append(k, new ParseText("seg2 text " + countSeg2));
      countSeg2++;
      curSize += 40; // roughly ...
    }
    w.close();
    System.err.println(" - done: " + countSeg2 + " records.");
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
      while (r.next(k, v)) {
        String ks = k.toString();
        String vs = v.getText();
        if (ks.startsWith("seg1-")) {
          cnt1++;
          assertTrue(vs.startsWith("seg1 "));
        } else if (ks.startsWith("seg2-")) {
          cnt2++;
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());

      // replace original parse obj with new one
      parseResult.put(content.getUrl(), new ParseText(text), parseData);
    }
    return parseResult;
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.