Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder


    // Define the Schema according to the text file
    Schema schema = new Schema("schema",
        Fields.parse("id:int,name:string,country_code:string,district:string,population:int"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("id"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */
    // Args: schema, header flag, strict quotes, separator, quote char, escape char, field selector, null-string token.
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ',', '"', '\\');

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    try {
      Job job = builder.createJob();
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1out + "\n" + line2out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

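Because TupleTextInputFormat already parses each CSV line into an ITuple, the IdentityTupleMapper above simply forwards tuples unchanged. As a minimal sketch of a custom mapper in its place, this hypothetical FilterByPopulation assumes Pangool's TupleMapper<ITuple, NullWritable> signature for tuple-producing inputs; the class name and the threshold are invented:

    // Hypothetical replacement for IdentityTupleMapper: forward only large cities.
    public static class FilterByPopulation extends TupleMapper<ITuple, NullWritable> {

      @Override
      public void map(ITuple tuple, NullWritable value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        // "population" is declared as int in the schema above.
        if((Integer) tuple.get("population") > 100000) {
          collector.write(tuple);
        }
      }
    }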


    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, true, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, true, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    Assert.assertEquals(outHeader + "\n" + line1 + "\n" + line2 + "\n" + line3,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
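Here the field list is built incrementally with the programmatic Field API; for the plain types this is equivalent to the Fields.parse style of the first example, while the enum field keeps its Field.createEnum call. A minimal sketch of the combined form (strField1's type is an assumption, since only its name is visible in the excerpt):

    // Plain fields via Fields.parse, the enum field programmatically (a sketch).
    List<Field> fields = new ArrayList<Field>(
        Fields.parse("strField1:string,booleanField:boolean"));
    fields.add(Field.createEnum("enumField", TestEnum.class));
    Schema schema = new Schema("schema", fields);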

    // Define a FieldSelector to select only columns 1, 4, 6
    // 0 is the first column
    FieldSelector selector = new FieldSelector(1, 4, 6);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("floatField"); // but we don't care, really
    // Define the Input Format and the Output Format!
    // Add the selector to the input format
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER, selector,
        TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    // This is what we expect as output after field selection
    line1 = "10.0 100 true";
    line2 = "20.0 200 false";
    line3 = "30.0 300 true";
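To make the selection concrete, here is a hypothetical seven-column input line and how FieldSelector(1, 4, 6) maps it onto the three-field schema; the column values other than those selected are invented:

    // Hypothetical input line (column 0 is the first column):
    //
    //   foo 10.0 bar baz 100 qux true
    //    0   1    2   3   4   5   6
    //
    // The selector keeps columns 1, 4 and 6, which are bound in order to the
    // schema's fields, producing the expected output line "10.0 100 true".
    FieldSelector selector = new FieldSelector(1, 4, 6);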

    Path inPath = new Path("src/test/resources/broken-encoding.txt");
    HadoopUtils.deleteIfExists(fS, outPath);
   
    Schema schema = new Schema("schema", Fields.parse("plugin:string?, count:int?"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("plugin"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */

    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\', null, null); // no field selector, no null-string token
    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, schema);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, outPath);
  }
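The trailing '?' in the Fields.parse string declares a field as nullable. A short sketch of what that permits, using Pangool's standard Tuple API for illustration:

    // Nullable fields accept null values in a tuple (a sketch).
    Schema schema = new Schema("schema", Fields.parse("plugin:string?, count:int?"));
    Tuple tuple = new Tuple(schema);
    tuple.set("plugin", null); // legal: the field is declared nullable
    tuple.set("count", null);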

    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */

    // Fixed-width variant: fieldsPos gives the character positions of each field; no header, no null-string token.
    InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
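This variant reads fixed-width text: there is no separator character, and fieldsPos (not shown in the excerpt) gives the character positions of the fields. A hypothetical sketch, assuming the array holds one start/end offset pair per schema field; the concrete positions are invented:

    // Hypothetical fixed-width layout: one (start, end) character-offset pair per field.
    int[] fieldsPos = new int[] {
        0, 9,   // strField1
        10, 14, // booleanField
        15, 24  // enumField
    };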

    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */

    InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
View Full Code Here

    CommonUtils.writeTXT("ignore-me", new File(INPUT));

    getConf().set("mapred.output.compress", "true");
    getConf().set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    Schema baseSchema = new Schema("schema", Fields.parse("name:string, money:int, country:string"));
    builder.addIntermediateSchema(baseSchema);
    builder.setGroupByFields("country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("money", Order.DESC)
        .add("name", Order.ASC));
    builder.addInput(new Path(INPUT), new HadoopInputFormat(TextInputFormat.class),
        new MyInputProcessor());
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(SequenceFileOutputFormat.class),
        DoubleWritable.class, NullWritable.class);
    // Configure extra outputs
    builder.addNamedOutput(OUTPUT_1, new HadoopOutputFormat(SequenceFileOutputFormat.class), Utf8.class,
        Utf8.class);
    builder.addNamedOutput(OUTPUT_2, new HadoopOutputFormat(SequenceFileOutputFormat.class),
        IntWritable.class, NullWritable.class);
    builder.addNamedTupleOutput(TUPLEOUTPUT_1, baseSchema);

    getConf().setClass(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, SequenceFileOutputFormat.class,
        OutputFormat.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    // Check outputs

    checkCompression(firstReducerOutput(OUTPUT + "/" + OUTPUT_1), DefaultCodec.class);
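MyInputProcessor and MyGroupHandler are not shown in the excerpt. As a rough sketch, a mapper feeding the "name, money, country" schema could look like the following; the tab-separated line layout and the schema lookup by name are assumptions based on Pangool's usual TupleMapper pattern:

    // A hypothetical MyInputProcessor: parse text lines into the intermediate schema.
    public static class MyInputProcessor extends TupleMapper<LongWritable, Text> {

      private Tuple tuple;

      @Override
      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        if(tuple == null) {
          // Lazily bind a reusable tuple to the intermediate schema.
          tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema("schema"));
        }
        String[] fields = value.toString().split("\t");
        tuple.set("name", fields[0]);
        tuple.set("money", Integer.parseInt(fields[1]));
        tuple.set("country", fields[2]);
        collector.write(tuple);
      }
    }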

    HadoopUtils.deleteIfExists(fS, outPath);
    HadoopUtils.deleteIfExists(fS, outPathText);

    Schema originalSchema = new Schema("schema", Fields.parse("title:string, content:string"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(originalSchema);
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));

    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, originalSchema);
    builder.addInput(inPath, new HadoopInputFormat(TextInputFormat.class), new MyInputProcessor());

    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    // Use the job's output as input to a new TupleMRBuilder.
    // To exercise schema evolution, we read the Tuple file with a different Schema:
    // the "title" field is dropped and a new nullable field is added.
    Schema evolvedSchema = new Schema("evolved", Fields.parse("content:string, new_field:string?"));

    builder = new TupleMRBuilder(conf);
    builder.addTupleInput(outPath, evolvedSchema, new IdentityTupleMapper());
    builder.addIntermediateSchema(evolvedSchema);
    builder.setGroupByFields("content");
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(outPathText, new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);

    job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals("bar2 foo2\nfoo1 bar1",
        Files.toString(new File(OUT_TEXT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
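MyGroupHandler is also not shown. A hypothetical reducer matching the Text/NullWritable output declared above, written against Pangool's standard TupleReducer signature; the exact output formatting is an assumption:

    // A hypothetical MyGroupHandler: emit one text line per tuple in the group.
    public static class MyGroupHandler extends TupleReducer<Text, NullWritable> {

      @Override
      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        for(ITuple tuple : tuples) {
          collector.write(new Text(tuple.get("new_field") + " " + tuple.get("content")),
              NullWritable.get());
        }
      }
    }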

   
    Configuration conf = new Configuration();
    FileSystem fS = FileSystem.get(conf);
   
    Path out = new Path("out-" + TupleOfTupleOfTuples.class.getName());
    TupleMRBuilder builder = new TupleMRBuilder(conf);
    fS.delete(out, true);
   
    builder.setTupleOutput(out, getMetaSchema2());
    builder.addIntermediateSchema(getMetaSchema2());
    builder.addInput(new Path("src/test/resources/foo-file.txt"), new HadoopInputFormat(TextInputFormat.class), new MyHandler());
    builder.setGroupByFields("group");
    builder.setTupleReducer(new IdentityTupleReducer());
    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Path toRead = new Path(out, "part-r-00000");
    assertTrue(fS.exists(toRead));
    TupleFile.Reader reader = new TupleFile.Reader(fS, conf, toRead);
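From here the test would typically iterate the Tuple file record by record. A short sketch, assuming TupleFile.Reader exposes a next(ITuple) method in the style of SequenceFile.Reader:

    // Read every tuple back from part-r-00000 (a sketch).
    Tuple tuple = new Tuple(getMetaSchema2());
    while(reader.next(tuple)) {
      System.out.println(tuple);
    }
    reader.close();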

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("word", Type.STRING));
    fields.add(Field.create("count", Type.INT));

    TupleMRBuilder cg = new TupleMRBuilder(conf);
    cg.addIntermediateSchema(new Schema("schema", fields));
    cg.setJarByClass(TestCombiner.class);
    cg.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Split());
    cg.setOutput(new Path(output), new HadoopOutputFormat(SequenceFileOutputFormat.class), Utf8.class,
        IntWritable.class);
    cg.setGroupByFields("word");
    cg.setOrderBy(new OrderBy().add("word", Order.ASC));
    cg.setTupleReducer(new Count());
    cg.setTupleCombiner(new CountCombiner());

    return cg;
  }
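Split, Count and CountCombiner are not shown. A sketch of a counting reducer over this word/count schema, following Pangool's TupleReducer pattern; the Utf8 string constructor is an assumption:

    // A hypothetical Count reducer: sum the partial counts for each word.
    public static class Count extends TupleReducer<Utf8, IntWritable> {

      @Override
      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        int count = 0;
        for(ITuple tuple : tuples) {
          count += (Integer) tuple.get("count");
        }
        collector.write(new Utf8(group.get("word").toString()), new IntWritable(count));
      }
    }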
