Examples of ParseText


Examples of org.apache.nutch.parse.ParseText

              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;

    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

   */
  public synchronized boolean next(FetcherOutput fo, Content co,
          ParseText pt, ParseData pd) throws IOException {
    boolean valid = true;
    Content rco = (co == null) ? _co : co;
    ParseText rpt = (pt == null) ? _pt : pt;
    ParseData rpd = (pd == null) ? _pd : pd;
    if (fetcherReader.next(fo) == null) valid = false;
    if (contentReader != null)
      if (contentReader.next(rco) == null) valid = false;
    if (parseTextReader != null)
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

  public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
    reset();
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long recNo = 0L;
    if (!sorted) {
      while(next(fo, co, pt, pd)) {
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
    } else {
      File unsortedFile = new File(segmentDir, ".unsorted");
      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
      seqReader.close();
      nfs.delete(sortedFile);
    }
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

      nfs.mkdirs(outDir);
      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
        if (ir.isDeleted(n)) {
          //System.out.println("-del");
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;

    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    FileStatus fileStatus = fs.getFileStatus(ptPath);
    long blkSize = fileStatus.getBlockSize();
   
    while (curSize < blkSize * 2) {
      k.set("seg1-" + df.format(countSeg1));
      w.append(k, new ParseText("seg1 text " + countSeg1));
      countSeg1++;
      curSize += 40; // roughly ...
    }
    w.close();
    System.err.println(" - done: " + countSeg1 + " records.");
    System.err.println("Creating large segment 2...");
    ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
    w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
    curSize = 0;
    countSeg2 = 0;
    while (curSize < blkSize * 2) {
      k.set("seg2-" + df.format(countSeg2));
      w.append(k, new ParseText("seg2 text " + countSeg2));
      countSeg2++;
      curSize += 40; // roughly ...
    }
    w.close();
    System.err.println(" - done: " + countSeg2 + " records.");
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    FileStatus[] stats = fs.listStatus(out);
    // there should be just one path
    assertEquals(1, stats.length);
    Path outSeg = stats[0].getPath();
    Text k = new Text();
    ParseText v = new ParseText();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
    int cnt1 = 0, cnt2 = 0;
    for (MapFile.Reader r : readers) {
      while (r.next(k, v)) {
        String ks = k.toString();
        String vs = v.getText();
        if (ks.startsWith("seg1-")) {
          cnt1++;
          assertTrue(vs.startsWith("seg1 "));
        } else if (ks.startsWith("seg2-")) {
          cnt2++;
View Full Code Here

Examples of org.apache.nutch.parse.ParseText

    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
    String lastPTname = null;
    TreeMap linked = new TreeMap();
    while (values.hasNext()) {
      ObjectWritable wrapper = (ObjectWritable)values.next();
      Object o = wrapper.get();
      if (o instanceof CrawlDatum) {
        CrawlDatum val = (CrawlDatum)o;
        // check which output dir it belongs to
        UTF8 part = (UTF8)val.getMetaData().get(SEGMENT_PART_KEY);
        if (part == null)
          throw new IOException("Null segment part, key=" + key);
        UTF8 uName = (UTF8)val.getMetaData().get(SEGMENT_NAME_KEY);
        if (uName == null)
          throw new IOException("Null segment name, key=" + key);
        String name = uName.toString();
        String partString = part.toString();
        if (partString.equals(CrawlDatum.GENERATE_DIR_NAME)) {
          if (lastG == null) {
            lastG = val;
            lastGname = name;
          } else {
            // take newer
            if (lastGname.compareTo(name) < 0) {
              lastG = val;
              lastGname = name;
            }
          }
        } else if (partString.equals(CrawlDatum.FETCH_DIR_NAME)) {
          if (lastF == null) {
            lastF = val;
            lastFname = name;
          } else {
            // take newer
            if (lastFname.compareTo(name) < 0) {
              lastF = val;
              lastFname = name;
            }
          }
        } else if (partString.equals(CrawlDatum.PARSE_DIR_NAME)) {
          if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
            if (lastSig == null) {
              lastSig = val;
              lastSigname = name;
            } else {
              // take newer
              if (lastSigname.compareTo(name) < 0) {
                lastSig = val;
                lastSigname = name;
              }
            }
            continue;
          }
          // collect all LINKED values from the latest segment
          ArrayList segLinked = (ArrayList)linked.get(name);
          if (segLinked == null) {
            segLinked = new ArrayList();
            linked.put(name, segLinked);
          }
          segLinked.add(val);
        } else {
          throw new IOException("Cannot determine segment part: " + partString);
        }
      } else if (o instanceof Content) {
        String name = ((Content)o).getMetadata().get(SEGMENT_NAME_KEY.toString());
        if (lastC == null) {
          lastC = (Content)o;
          lastCname = name;
        } else {
          if (lastCname.compareTo(name) < 0) {
            lastC = (Content)o;
            lastCname = name;
          }
        }
      } else if (o instanceof ParseData) {
        String name = ((ParseData)o).getParseMeta().get(SEGMENT_NAME_KEY.toString());
        if (lastPD == null) {
          lastPD = (ParseData)o;
          lastPDname = name;
        } else {
          if (lastPDname.compareTo(name) < 0) {
            lastPD = (ParseData)o;
            lastPDname = name;
          }
        }
      } else if (o instanceof ParseText) {
        String text = ((ParseText)o).getText();
        String name = null;
        int idx = text.indexOf(nameMarker, nameMarker.length());
        if (idx != -1) {
          name = text.substring(nameMarker.length(), idx);
        } else {
          throw new IOException("Missing segment name marker in ParseText, key " + key + ": " + text);
        }
        if (lastPT == null) {
          lastPT = (ParseText)o;
          lastPTname = name;
        } else {
          if (lastPTname.compareTo(name) < 0) {
            lastPT = (ParseText)o;
            lastPTname = name;
          }
        }
      }
    }
    curCount++;
    UTF8 sliceName = null;
    ObjectWritable wrapper = new ObjectWritable();
    if (sliceSize > 0) {
      sliceName = new UTF8(String.valueOf(curCount / sliceSize));
    }
    // now output the latest values
    if (lastG != null) {
      if (sliceName != null) {
        lastG.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
      }
      wrapper.set(lastG);
      output.collect(key, wrapper);
    }
    if (lastF != null) {
      if (sliceName != null) {
        lastF.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
      }
      wrapper.set(lastF);
      output.collect(key, wrapper);
    }
    if (lastSig != null) {
      if (sliceName != null) {
        lastSig.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
      }
      wrapper.set(lastSig);
      output.collect(key, wrapper);
    }
    if (lastC != null) {
      if (sliceName != null) {
        lastC.getMetadata().set(sliceMarker, sliceName.toString());
      }
      wrapper.set(lastC);
      output.collect(key, wrapper);
    }
    if (lastPD != null) {
      if (sliceName != null) {
        lastPD.getParseMeta().set(sliceMarker, sliceName.toString());
      }
      wrapper.set(lastPD);
      output.collect(key, wrapper);
    }
    if (lastPT != null) {
      if (sliceName != null) {
        lastPT = new ParseText(sliceMarker + sliceName + sliceMarker
                + lastPT.getText());
      }
      wrapper.set(lastPT);
      output.collect(key, wrapper);
    }
    if (linked.size() > 0) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.