Examples of ClueWarcForwardIndex


Examples of edu.umd.cloud9.collection.clue.ClueWarcForwardIndex

      String inputFile = conf.get("InputFile");
      String outputFile = conf.get("OutputFile");

      ClueWarcForwardIndex[] indexes = new ClueWarcForwardIndex[10];

      indexes[0] = new ClueWarcForwardIndex();
      indexes[0].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.01.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[1] = new ClueWarcForwardIndex();
      indexes[1].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.02.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[2] = new ClueWarcForwardIndex();
      indexes[2].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.03.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[3] = new ClueWarcForwardIndex();
      indexes[3].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.04.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[4] = new ClueWarcForwardIndex();
      indexes[4].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.05.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[5] = new ClueWarcForwardIndex();
      indexes[5].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.06.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[6] = new ClueWarcForwardIndex();
      indexes[6].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.07.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[7] = new ClueWarcForwardIndex();
      indexes[7].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.08.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[8] = new ClueWarcForwardIndex();
      indexes[8].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.09.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[9] = new ClueWarcForwardIndex();
      indexes[9].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.10.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      FileSystem fs = FileSystem.get(conf);
View Full Code Here

Examples of edu.umd.cloud9.collection.clue.ClueWarcForwardIndex

      String inputFile = conf.get("InputFile");
      String outputFile = conf.get("OutputFile");
      String findexFile = conf.get("ForwardIndexFile");
      String docnoMapping = conf.get("DocnoMappingFile");

      ClueWarcForwardIndex findex = new ClueWarcForwardIndex();
      findex.loadIndex(new Path(findexFile), new Path(docnoMapping), FileSystem.get(conf));

      FileSystem fs = FileSystem.get(conf);

      sLogger.info("reading " + inputFile);

      FSLineReader reader = new FSLineReader(new Path(inputFile), fs);
      FSDataOutputStream writer = fs.create(new Path(outputFile), true);

      Text line = new Text();
      while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        String docid = arr[2];
        int rank = Integer.parseInt(arr[3]);

        long start = System.currentTimeMillis();
        String url = findex.getDocument(docid).getHeaderMetadataItem("WARC-Target-URI");
        long duration = System.currentTimeMillis() - start;

        reporter.incrCounter(MyCounter.Count, 1);
        reporter.incrCounter(MyCounter.Time, duration);
View Full Code Here

Examples of edu.umd.cloud9.collection.clue.ClueWarcForwardIndex

        "-collection=" + collectionPathRepacked,
        "-index=" + index };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    ClueWarcForwardIndex findex = new ClueWarcForwardIndex();
    findex.loadIndex(new Path(index), new Path(mappingFile), fs);

    assertTrue(findex.getDocument(14069750).getContent()
        .contains("Vizergy: How Design and SEO work together"));
    assertTrue(findex.getDocument("clueweb09-en0008-76-19728").getContent()
        .contains("Jostens - Homeschool Yearbooks"));
    assertEquals(1, findex.getFirstDocno());
    assertEquals(50220423, findex.getLastDocno());
  }
View Full Code Here

Examples of edu.umd.cloud9.collection.clue.ClueWarcForwardIndex

      String inputFile = conf.get("InputFile");
      String outputFile = conf.get("OutputFile");

      ClueWarcForwardIndex[] indexes = new ClueWarcForwardIndex[10];

      indexes[0] = new ClueWarcForwardIndex();
      indexes[0].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.01.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[1] = new ClueWarcForwardIndex();
      indexes[1].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.02.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[2] = new ClueWarcForwardIndex();
      indexes[2].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.03.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[3] = new ClueWarcForwardIndex();
      indexes[3].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.04.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[4] = new ClueWarcForwardIndex();
      indexes[4].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.05.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[5] = new ClueWarcForwardIndex();
      indexes[5].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.06.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[6] = new ClueWarcForwardIndex();
      indexes[6].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.07.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[7] = new ClueWarcForwardIndex();
      indexes[7].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.08.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[8] = new ClueWarcForwardIndex();
      indexes[8].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.09.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      indexes[9] = new ClueWarcForwardIndex();
      indexes[9].loadIndex(new Path("/shared/ClueWeb09/collection.compressed.block/findex.en.10.dat"),
               new Path("/shared/ClueWeb09/docno-mapping.dat"), FileSystem.get(conf));

      FileSystem fs = FileSystem.get(conf);
View Full Code Here

Examples of edu.umd.cloud9.collection.clue.ClueWarcForwardIndex

      String inputFile = conf.get("InputFile");
      String outputFile = conf.get("OutputFile");
      String findexFile = conf.get("ForwardIndexFile");
      String docnoMapping = conf.get("DocnoMappingFile");

      ClueWarcForwardIndex findex = new ClueWarcForwardIndex();
      findex.loadIndex(new Path(findexFile), new Path(docnoMapping), FileSystem.get(conf));

      FileSystem fs = FileSystem.get(conf);

      sLogger.info("reading " + inputFile);

      LineReader reader = new LineReader(fs.open(new Path(inputFile)));
      FSDataOutputStream writer = fs.create(new Path(outputFile), true);

      Text line = new Text();
      while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        String docid = arr[2];
        int rank = Integer.parseInt(arr[3]);

        long start = System.currentTimeMillis();
        String url = findex.getDocument(docid).getHeaderMetadataItem("WARC-Target-URI");
        long duration = System.currentTimeMillis() - start;

        reporter.incrCounter(MyCounter.Count, 1);
        reporter.incrCounter(MyCounter.Time, duration);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.