Package org.apache.crunch.impl.mr

Examples of org.apache.crunch.impl.mr.MRPipeline.readTextFile()
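Before the individual excerpts, here is a minimal, self-contained sketch of the typical readTextFile() usage; the class name and the input/output paths are hypothetical and only illustrate the call pattern.

    import org.apache.crunch.PCollection;
    import org.apache.crunch.PTable;
    import org.apache.crunch.Pipeline;
    import org.apache.crunch.impl.mr.MRPipeline;

    public class ReadTextFileSketch {
      public static void main(String[] args) throws Exception {
        // An MRPipeline plans and runs the job graph on MapReduce.
        Pipeline pipeline = new MRPipeline(ReadTextFileSketch.class);
        // readTextFile() lazily references the file as a PCollection of its lines.
        PCollection<String> lines = pipeline.readTextFile("/tmp/input.txt");
        // Count the distinct lines and write the result back out as text.
        PTable<String, Long> counts = lines.count();
        pipeline.writeTextFile(counts, "/tmp/line-counts");
        // done() runs any outstanding jobs and cleans up.
        pipeline.done();
      }
    }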


    out2.flush();
    out2.close();

    final MRPipeline pipeline = new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());

    final PCollection<String> values1 = pipeline.readTextFile(path1.toString());
    final PCollection<String> values2 = pipeline.readTextFile(path2.toString());

    final PTable<Text, Text> convertedValues1 = convertStringToText(values1);
    final PTable<Text, Text> convertedValues2 = convertStringToText(values2);


    out2.close();

    final MRPipeline pipeline = new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());

    final PCollection<String> values1 = pipeline.readTextFile(path1.toString());
    final PCollection<String> values2 = pipeline.readTextFile(path2.toString());

    final PTable<Text, Text> convertedValues1 = convertStringToText(values1);
    final PTable<Text, Text> convertedValues2 = convertStringToText(values2);

    // for map side join
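The convertStringToText() helper called in both excerpts above is not part of the listing. A plausible sketch, assuming each input line holds a tab-separated key/value record (imports of MapFn, Pair, PCollection, PTable, Writables, and org.apache.hadoop.io.Text assumed):

    // Hypothetical reconstruction of convertStringToText(); the tab-separated
    // record layout is an assumption, not taken from the original test.
    private static PTable<Text, Text> convertStringToText(PCollection<String> lines) {
      return lines.parallelDo(new MapFn<String, Pair<Text, Text>>() {
        @Override
        public Pair<Text, Text> map(String input) {
          String[] fields = input.split("\t", 2);
          String value = fields.length > 1 ? fields[1] : "";
          return Pair.of(new Text(fields[0]), new Text(value));
        }
      }, Writables.tableOf(Writables.writables(Text.class), Writables.writables(Text.class)));
    }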

  @Test
  public void materializedColShouldBeWritten() throws Exception {
    File textFile = tmpDir.copyResourceFile("shakes.txt");
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> genericCollection = pipeline.readTextFile(textFile.getAbsolutePath());
    pipeline.run();
    PCollection<String> filter = genericCollection.filter("Filtering data", FilterFns.<String>ACCEPT_ALL());
    filter.materialize();
    pipeline.run();
    File file = tmpDir.getFile("output.txt");
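The excerpt stops before the assertions. As a hedged illustration of what materialize() provides, the filtered collection can be iterated directly on the client once the pipeline has run:

    // materialize() returns a client-side Iterable over the PCollection's
    // contents; iterating it forces the pipeline to run if it has not already.
    Iterable<String> materialized = filter.materialize();
    for (String line : materialized) {
      System.out.println(line);
    }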

 
 
  @Test
  public void testPGroupedTableToMultipleOutputs() throws IOException{
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PGroupedTable<String, String> groupedLineTable = pipeline
        .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
        .by(IdentityFn.<String>getInstance(), Writables.strings())
        .groupByKey();
   
    PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
    PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();
   
    File outputDirA = tmpDir.getFile("output_a");
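The rest of the test is not shown. A minimal sketch of writing the two ungrouped tables to separate text outputs (the second output name follows the excerpt's naming and is an assumption):

    File outputDirB = tmpDir.getFile("output_b");
    // Write each ungrouped view of the grouped table to its own text output.
    pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath());
    pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath());
    pipeline.done();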

 
  public PipelineExecution run() throws IOException {
    String shakes = tmpDir.copyResourceFileName("shakes.txt");
    String out = tmpDir.getFileName("cancel");
    Pipeline p = new MRPipeline(CancelJobsIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> words = p.readTextFile(shakes);
    p.write(words.count().top(20), To.textFile(out));
    return p.runAsync(); // may need a hack to slow down job start-up if this test becomes flaky.
  }
}
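Since run() hands back the PipelineExecution from runAsync(), the caller can monitor or cancel the job. A hedged sketch of how the returned execution might be used:

    // Asynchronous control over the running pipeline (sketch only; kill() and
    // waitUntilDone() both throw InterruptedException).
    PipelineExecution execution = run();
    if (execution.getStatus() == PipelineExecution.Status.RUNNING) {
      execution.kill();        // request cancellation of the underlying jobs
    }
    execution.waitUntilDone(); // block until the pipeline reaches a terminal state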

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Aggregator used for summing up response size and count
    Aggregator<Pair<Long, Long>> agg = pairAggregator(SUM_LONGS(), SUM_LONGS());

    // Table of (ip, sum(response size), count)
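The code that actually builds the (ip, sum(response size), count) table is cut off here. A hedged sketch, assuming the IP is the first space-separated field of each log line and the response size is the last one:

    // Sketch only: the field positions in the log line are assumptions.
    PTable<String, Pair<Long, Long>> totals = lines.parallelDo(
        new MapFn<String, Pair<String, Pair<Long, Long>>>() {
          @Override
          public Pair<String, Pair<Long, Long>> map(String line) {
            String[] fields = line.split(" ");
            long responseSize = Long.parseLong(fields[fields.length - 1]);
            // Emit (ip, (responseSize, 1)) so the aggregator can sum both members.
            return Pair.of(fields[0], Pair.of(responseSize, 1L));
          }
        },
        Writables.tableOf(Writables.strings(),
            Writables.pairs(Writables.longs(), Writables.longs())))
        .groupByKey()
        .combineValues(agg);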

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(TotalWordCount.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Define a function that splits each line in a PCollection of Strings into a
    // PCollection made up of the individual words in the file.
    PCollection<Long> numberOfWords = lines.parallelDo(new DoFn<String, Long>() {
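The DoFn body is truncated in this excerpt. A hedged sketch of how such a word-count DoFn might be completed, with an aggregate step that sums the per-line counts:

    // Sketch only: emit the number of words on each line, then sum the counts.
    PCollection<Long> numberOfWords = lines.parallelDo(new DoFn<String, Long>() {
      @Override
      public void process(String line, Emitter<Long> emitter) {
        emitter.emit((long) line.split("\\s+").length);
      }
    }, Writables.longs());
    PCollection<Long> totalCount = numberOfWords.aggregate(Aggregators.SUM_LONGS());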

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(SecondarySortExample.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Define a function that parses each line in a PCollection of Strings into
    // a pair of pairs, the first of which will be grouped by (first member) and
    // then sorted by (second member). The second pair is the payload, which is
    // passed along in an Iterable object.
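The parsing code itself is not included in the excerpt. A hedged sketch of producing the pair-of-pairs structure the comment describes, assuming comma-separated lines of the form key,sortValue,payload; the resulting table would then typically be handed to the org.apache.crunch.lib.SecondarySort helpers:

    // Sketch only: the comma-separated line format is an assumption.
    PTable<String, Pair<Long, String>> parsed = lines.parallelDo(
        new MapFn<String, Pair<String, Pair<Long, String>>>() {
          @Override
          public Pair<String, Pair<Long, String>> map(String line) {
            String[] fields = line.split(",", 3);
            // (group key, (sort key, payload))
            return Pair.of(fields[0], Pair.of(Long.parseLong(fields[1]), fields[2]));
          }
        },
        Writables.tableOf(Writables.strings(),
            Writables.pairs(Writables.longs(), Writables.strings())));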

    MRPipeline pipeline = new MRPipeline(BloomFilterFactory.class);
    FileStatus[] listStatus = FileSystem.get(pipeline.getConfiguration()).listStatus(inputPath);
    PTable<String, BloomFilter> filterTable = null;
    for (FileStatus fileStatus : listStatus) {
      Path path = fileStatus.getPath();
      PCollection<String> readTextFile = pipeline.readTextFile(path.toString());
      pipeline.getConfiguration().set(BloomFilterFn.CRUNCH_FILTER_NAME, path.getName());
      PTable<String, BloomFilter> currentTable = createFilterTable(readTextFile, filterFn);
      if (filterTable != null) {
        filterTable = filterTable.union(currentTable);
      } else {
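The excerpt ends inside the else-branch, which presumably just seeds the table with the first file's filters. A hedged sketch of finishing the loop and pulling the combined result to the client:

      // Sketch only: the truncated else-branch presumably reads
      //   filterTable = currentTable;
      // Once the loop finishes, the union of per-file filter tables can be
      // materialized into a client-side map.
      Map<String, BloomFilter> filters = filterTable.materializeToMap();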

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Combiner used for summing up response size and count
    CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn.pairAggregator(CombineFn.SUM_LONGS,
        CombineFn.SUM_LONGS);
