Examples of org.apache.crunch.impl.mr.MRPipeline.readTextFile()

Class org.apache.crunch.impl.mr.MRPipeline

Examples of org.apache.crunch.impl.mr.MRPipeline.readTextFile()

org.apache.crunch.impl.mr.MRPipeline.readTextFile()

      return 1;
    }
    // Create an object to coordinate pipeline creation and execution.
    Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
    // Reference a given text file as a collection of Strings.
    PCollection<String> lines = pipeline.readTextFile(args[1]);


    // Define a function that splits each line in a PCollection of Strings into
    // a PCollection made up of the individual words in the file.
    PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {

View Full Code Here


  @Test
  public void testMaterializeAvroPersonAndReflectsPair_GroupedTable() throws IOException {
    Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
    Pipeline pipeline = new MRPipeline(MaterializeIT.class);
    List<Pair<StringWrapper, Person>> pairList = Lists.newArrayList(pipeline
        .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
        .parallelDo(new StringToStringWrapperPersonPairMapFn(),
            Avros.pairs(Avros.reflects(StringWrapper.class), Avros.records(Person.class)))
        .materialize());

View Full Code Here

  
  @Test
  public void testDeepCopyCustomTuple() throws Exception {
    Pipeline p = new MRPipeline(DeepCopyCustomTuplesIT.class, tmpDir.getDefaultConfiguration());
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    PCollection<String> shakes = p.readTextFile(shakesInputPath);
    Iterable<String> out = shakes
        .parallelDo(new PreProcFn(), tableOf(ints(), pairs(ints(), pids)))
        .groupByKey()
        .parallelDo(new PostProcFn(), strings())
        .materialize();

View Full Code Here


  @Test
  public void materializedColShouldBeWritten() throws Exception {
    File textFile = tmpDir.copyResourceFile("shakes.txt");
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PCollection<String> genericCollection = pipeline.readTextFile(textFile.getAbsolutePath());
    pipeline.run();
    PCollection<String> filter = genericCollection.filter("Filtering data", FilterFns.<String>ACCEPT_ALL());
    filter.materialize();
    pipeline.run();
    File file = tmpDir.getFile("output.txt");

View Full Code Here

  
  
  @Test
  public void testPGroupedTableToMultipleOutputs() throws IOException{
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PGroupedTable<String, String> groupedLineTable = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt")).by(IdentityFn.<String>getInstance(), Writables.strings()).groupByKey();
    
    PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
    PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();
    
    File outputDirA = tmpDir.getFile("output_a");

View Full Code Here


  @Test
  public void testMRMaterializeToMap() throws IOException {
    Pipeline p = new MRPipeline(MaterializeToMapIT.class, tmpDir.getDefaultConfiguration());
    String inputFile = tmpDir.copyResourceFileName("set1.txt");
    PCollection<String> c = p.readTextFile(inputFile);
    PTypeFamily tf = c.getTypeFamily();
    PTable<Integer, String> t = c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(), tf.strings()));
    Map<Integer, String> m = t.materializeToMap();
    assertMatches(m);
  }

View Full Code Here

  }


  private PTable<Integer, String> getMRPTable() throws IOException {
    Pipeline p = new MRPipeline(MaterializeToMapIT.class, tmpDir.getDefaultConfiguration());
    String inputFile = tmpDir.copyResourceFileName("set1.txt");
    PCollection<String> c = p.readTextFile(inputFile);
    PTypeFamily tf = c.getTypeFamily();
    PTable<Integer, String> table = c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(),
        tf.strings()));
    return table;
  }

View Full Code Here

  
  @Test
  public void testMapSideOutputs() throws Exception {
    Pipeline pipeline = new MRPipeline(CleanTextIT.class, tmpDir.getDefaultConfiguration());
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
    
    PCollection<String> cleanShakes = shakespeare.parallelDo(CLEANER, Avros.strings());
    File cso = tmpDir.getFile("cleanShakes");
    cleanShakes.write(To.textFile(cso.getAbsolutePath()));

View Full Code Here


  @Test
  public void testEnumPTypes() throws IOException {
    String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
    Pipeline pipeline = new MRPipeline(EnumPairIT.class);
    PCollection<String> set1 = pipeline.readTextFile(inputFile1);
    PTable<String, etypes> data = set1.parallelDo(new DoFn<String, Pair<String, etypes>>() {
      @Override
      public void process(String input, Emitter<Pair<String, etypes>> emitter) {
        emitter.emit(new Pair<String, etypes>(input, etypes.type1));
      }

View Full Code Here

  }


  public static void run(PTypeFamily typeFamily, TemporaryPath tmpDir) throws Exception {
    Pipeline pipeline = new MRPipeline(MapsIT.class, tmpDir.getDefaultConfiguration());
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
    Iterable<Pair<String, Map<String, Long>>> output = shakespeare
        .parallelDo(new DoFn<String, Pair<String, Map<String, Long>>>() {
          @Override
          public void process(String input, Emitter<Pair<String, Map<String, Long>>> emitter) {
            String last = null;

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.