Package org.apache.crunch.impl.mr

Examples of org.apache.crunch.impl.mr.MRPipeline.readTextFile()
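The snippets below are excerpts from Apache Crunch integration tests and examples. As a minimal, self-contained sketch of the method's basic usage (class name and the argument-based paths are placeholders, not taken from the excerpts), reading a text file into a PCollection of lines and writing it back out looks like this:

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;

public class ReadTextFileExample {
  public static void main(String[] args) throws Exception {
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(ReadTextFileExample.class);
    // Reference a text file (placeholder path) as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Write the lines back out and execute the underlying MapReduce job(s).
    pipeline.writeTextFile(lines, args[1]);
    pipeline.done();
  }
}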


  @Test
  public void testWritables() throws Exception {
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
    runMinMax(shakes, WritableTypeFamily.getInstance());
    pipeline.done();
  }



  @Test
  public void testAvro() throws Exception {
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
    runMinMax(shakes, AvroTypeFamily.getInstance());
    pipeline.done();
  }


  @Test
  public void testCollectUrls() throws Exception {
    Pipeline p = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    String urlsInputPath = tmpDir.copyResourceFileName("urls.txt");
    PTable<String, Collection<String>> urls = Aggregate.collectValues(p.readTextFile(urlsInputPath).parallelDo(
        new SplitFn(), tableOf(strings(), strings())));
    for (Pair<String, Collection<String>> e : urls.materialize()) {
      String key = e.first();
      int expectedSize = 0;
      if ("www.A.com".equals(key)) {
View Full Code Here

  }
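SplitFn is referenced above but not shown. A plausible shape for it, assuming urls.txt holds whitespace-separated (key, value) pairs per line, is a DoFn that emits one pair per line so Aggregate.collectValues can gather all values seen for the same key (a hypothetical reconstruction, not the original class):

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.Pair;

// Hypothetical sketch: split each input line into a (key, value) pair.
class SplitFn extends DoFn<String, Pair<String, String>> {
  @Override
  public void process(String input, Emitter<Pair<String, String>> emitter) {
    String[] fields = input.split("\\s+");
    emitter.emit(Pair.of(fields[0], fields[1]));
  }
}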

  @Test
  public void testCollectValues_Writables() throws IOException {
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    Map<Integer, Collection<Text>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
        .parallelDo(new MapStringToTextPair(), Writables.tableOf(Writables.ints(), Writables.writables(Text.class)))
        .collectValues().materializeToMap();

    assertEquals(1, collectionMap.size());
    // ... assertions on the collected Text values are truncated in this excerpt ...
  }
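MapStringToTextPair is not shown above. Given that the test expects a single key in the resulting map, a plausible (hypothetical) implementation keys every line of set2.txt under the constant 1:

import org.apache.crunch.MapFn;
import org.apache.crunch.Pair;
import org.apache.hadoop.io.Text;

// Hypothetical sketch: map each line to (1, line-as-Text) so collectValues()
// gathers every line into one collection under a single key.
class MapStringToTextPair extends MapFn<String, Pair<Integer, Text>> {
  @Override
  public Pair<Integer, Text> map(String input) {
    return Pair.of(1, new Text(input));
  }
}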

  @Test
  public void testCollectValues_Avro() throws IOException {

    MapStringToEmployeePair mapFn = new MapStringToEmployeePair();
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    Map<Integer, Collection<Employee>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
        .parallelDo(mapFn, Avros.tableOf(Avros.ints(), Avros.records(Employee.class))).collectValues()
        .materializeToMap();

    assertEquals(1, collectionMap.size());
    // ... assertions on the collected Employee records are truncated in this excerpt ...
  }

    MRPipeline pipeline = new MRPipeline(BloomFilterFactory.class);
    FileStatus[] listStatus = FileSystem.get(pipeline.getConfiguration()).listStatus(inputPath);
    PTable<String, BloomFilter> filterTable = null;
    for (FileStatus fileStatus : listStatus) {
      Path path = fileStatus.getPath();
      PCollection<String> readTextFile = pipeline.readTextFile(path.toString());
      pipeline.getConfiguration().set(BloomFilterFn.CRUNCH_FILTER_NAME, path.getName());
      PTable<String, BloomFilter> currentTable = createFilterTable(readTextFile, filterFn);
      if (filterTable != null) {
        filterTable = filterTable.union(currentTable);
      } else {
        // Natural completion of this branch (not shown in the excerpt): seed the
        // table with the first file's filters.
        filterTable = currentTable;
      }
    }

  @Test
  public void materializedColShouldBeWritten() throws Exception {
    File textFile = tmpDir.copyResourceFile("shakes.txt");
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> genericCollection = pipeline.readTextFile(textFile.getAbsolutePath());
    pipeline.run();
    PCollection<String> filter = genericCollection.filter("Filtering data", new FilterFn<String>() {
      @Override
      public boolean accept(String input) {
        return true;
      }
    });
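The excerpt stops right after the filter is defined. A sketch of how such a test typically continues (the output path and the second run are assumptions, not the original test body): materialize the filtered collection, write it out, and run the pipeline again so the materialized collection is actually written.

    filter.materialize();
    String outputPath = tmpDir.getFile("output").getAbsolutePath();  // hypothetical output location
    pipeline.writeTextFile(filter, outputPath);
    pipeline.run();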

 
 
  @Test
  public void testPGroupedTableToMultipleOutputs() throws IOException{
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PGroupedTable<String, String> groupedLineTable = pipeline
        .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
        .by(IdentityFn.<String>getInstance(), Writables.strings())
        .groupByKey();
   
    PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
    PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();
   
    File outputDirA = tmpDir.getFile("output_a");
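The excerpt is cut off after the first output directory is created. A sketch of how the two ungrouped tables would typically be written to separate outputs in the same run (the second directory name and the final assertion are assumptions):

    File outputDirB = tmpDir.getFile("output_b");
    pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath());
    pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath());
    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());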

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size and count
    CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn.pairAggregator(CombineFn.SUM_LONGS,
        CombineFn.SUM_LONGS);
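The excerpt ends at the combiner definition. A sketch of how that combiner would typically be applied, assuming a hypothetical ExtractIPAndSizeWithCount DoFn (not part of the excerpt) that parses each log line into (ip, (responseSize, 1)):

    PTable<String, Pair<Long, Long>> sizeAndCountByIP = lines.parallelDo(
        new ExtractIPAndSizeWithCount(),  // hypothetical log-parsing DoFn
        Writables.tableOf(Writables.strings(),
            Writables.pairs(Writables.longs(), Writables.longs())));
    PTable<String, Pair<Long, Long>> totals =
        sizeAndCountByIP.groupByKey().combineValues(stringPairOfLongsSumCombiner);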

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size
    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();

    // Table of (ip, sum(response size))
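Continuing the truncated "(ip, sum(response size))" comment above, a sketch of how that table would typically be built, again using a hypothetical parsing DoFn:

    PTable<String, Long> bytesByIP = lines.parallelDo(
        new ExtractIPResponseSize(),  // hypothetical DoFn emitting (ip, responseSize) pairs
        Writables.tableOf(Writables.strings(), Writables.longs()));
    PTable<String, Long> totalBytesByIP =
        bytesByIP.groupByKey().combineValues(longSumCombiner);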
