Package com.datasalt.pangool.tuplemr

Examples of com.datasalt.pangool.tuplemr.TupleMRBuilder


    // Define the Schema according to the text file
    Schema schema = new Schema("schema",
        Fields.parse("id:int,name:string,country_code:string,district:string,population:int"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("id"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */
    // Args: schema, header flag, strict quotes, separator, quote char, escape char, field selector, null-string token.
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ',', '"', '\\');

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    try {
      Job job = builder.createJob();
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1out + "\n" + line2out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

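Because TupleTextInputFormat already parses each CSV line into an ITuple, the IdentityTupleMapper above simply forwards tuples unchanged. As a minimal sketch of a custom mapper in its place, this hypothetical FilterByPopulation assumes Pangool's TupleMapper<ITuple, NullWritable> signature for tuple-producing inputs; the class name and the threshold are invented:

    // Hypothetical replacement for IdentityTupleMapper: forward only large cities.
    public static class FilterByPopulation extends TupleMapper<ITuple, NullWritable> {

      @Override
      public void map(ITuple tuple, NullWritable value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        // "population" is declared as int in the schema above.
        if((Integer) tuple.get("population") > 100000) {
          collector.write(tuple);
        }
      }
    }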


    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, true, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, true, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    Assert.assertEquals(outHeader + "\n" + line1 + "\n" + line2 + "\n" + line3,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
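Here the field list is built incrementally with the programmatic Field API; for the plain types this is equivalent to the Fields.parse style of the first example, while the enum field keeps its Field.createEnum call. A minimal sketch of the combined form (strField1's type is an assumption, since only its name is visible in the excerpt):

    // Plain fields via Fields.parse, the enum field programmatically (a sketch).
    List<Field> fields = new ArrayList<Field>(
        Fields.parse("strField1:string,booleanField:boolean"));
    fields.add(Field.createEnum("enumField", TestEnum.class));
    Schema schema = new Schema("schema", fields);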

    // Define a FieldSelector to select only columns 1, 4, 6
    // 0 is the first column
    FieldSelector selector = new FieldSelector(1, 4, 6);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("floatField"); // but we don't care, really
    // Define the Input Format and the Output Format!
    // Add the selector to the input format
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER, selector,
        TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    // This is what we expect as output after field selection
    line1 = "10.0 100 true";
    line2 = "20.0 200 false";
    line3 = "30.0 300 true";
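To make the selection concrete, here is a hypothetical seven-column input line and how FieldSelector(1, 4, 6) maps it onto the three-field schema; the column values other than those selected are invented:

    // Hypothetical input line (column 0 is the first column):
    //
    //   foo 10.0 bar baz 100 qux true
    //    0   1    2   3   4   5   6
    //
    // The selector keeps columns 1, 4 and 6, which are bound in order to the
    // schema's fields, producing the expected output line "10.0 100 true".
    FieldSelector selector = new FieldSelector(1, 4, 6);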

    Path inPath = new Path("src/test/resources/broken-encoding.txt");
    HadoopUtils.deleteIfExists(fS, outPath);
   
    Schema schema = new Schema("schema", Fields.parse("plugin:string?, count:int?"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("plugin"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */

    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\', null, null); // no field selector, no null-string token
    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, schema);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    HadoopUtils.deleteIfExists(fS, outPath);
  }
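The trailing '?' in the Fields.parse string declares a field as nullable. A short sketch of what that permits, using Pangool's standard Tuple API for illustration:

    // Nullable fields accept null values in a tuple (a sketch).
    Schema schema = new Schema("schema", Fields.parse("plugin:string?, count:int?"));
    Tuple tuple = new Tuple(schema);
    tuple.set("plugin", null); // legal: the field is declared nullable
    tuple.set("count", null);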

    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */

    // Fixed-width variant: fieldsPos gives the character positions of each field; no header, no null-string token.
    InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
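This variant reads fixed-width text: there is no separator character, and fieldsPos (not shown in the excerpt) gives the character positions of the fields. A hypothetical sketch, assuming the array holds one start/end offset pair per schema field; the concrete positions are invented:

    // Hypothetical fixed-width layout: one (start, end) character-offset pair per field.
    int[] fieldsPos = new int[] {
        0, 9,   // strField1
        10, 14, // booleanField
        15, 24  // enumField
    };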

    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));

    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really
    /*
     * Define the Input Format and the Output Format!
     */

    InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
View Full Code Here

    CommonUtils.writeTXT("ignore-me", new File(INPUT));

    getConf().set("mapred.output.compress", "true");
    getConf().set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    Schema baseSchema = new Schema("schema", Fields.parse("name:string, money:int, country:string"));
    builder.addIntermediateSchema(baseSchema);
    builder.setGroupByFields("country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("money", Order.DESC)
        .add("name", Order.ASC));
    builder.addInput(new Path(INPUT), new HadoopInputFormat(TextInputFormat.class),
        new MyInputProcessor());
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(SequenceFileOutputFormat.class),
        DoubleWritable.class, NullWritable.class);
    // Configure extra outputs
    builder.addNamedOutput(OUTPUT_1, new HadoopOutputFormat(SequenceFileOutputFormat.class), Utf8.class,
        Utf8.class);
    builder.addNamedOutput(OUTPUT_2, new HadoopOutputFormat(SequenceFileOutputFormat.class),
        IntWritable.class, NullWritable.class);
    builder.addNamedTupleOutput(TUPLEOUTPUT_1, baseSchema);

    getConf().setClass(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, SequenceFileOutputFormat.class,
        OutputFormat.class);
    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    // Check outputs

    checkCompression(firstReducerOutput(OUTPUT + "/" + OUTPUT_1), DefaultCodec.class);
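MyInputProcessor and MyGroupHandler are not shown in the excerpt. As a rough sketch, a mapper feeding the "name, money, country" schema could look like the following; the tab-separated line layout and the schema lookup by name are assumptions based on Pangool's usual TupleMapper pattern:

    // A hypothetical MyInputProcessor: parse text lines into the intermediate schema.
    public static class MyInputProcessor extends TupleMapper<LongWritable, Text> {

      private Tuple tuple;

      @Override
      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        if(tuple == null) {
          // Lazily bind a reusable tuple to the intermediate schema.
          tuple = new Tuple(context.getTupleMRConfig().getIntermediateSchema("schema"));
        }
        String[] fields = value.toString().split("\t");
        tuple.set("name", fields[0]);
        tuple.set("money", Integer.parseInt(fields[1]));
        tuple.set("country", fields[2]);
        collector.write(tuple);
      }
    }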

    HadoopUtils.deleteIfExists(fS, outPath);
    HadoopUtils.deleteIfExists(fS, outPathText);

    Schema originalSchema = new Schema("schema", Fields.parse("title:string, content:string"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(originalSchema);
    builder.setGroupByFields("title");
    builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));

    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setTupleOutput(outPath, originalSchema);
    builder.addInput(inPath, new HadoopInputFormat(TextInputFormat.class), new MyInputProcessor());

    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    // Use the job's output as input to a new TupleMRBuilder.
    // To exercise schema evolution, we read the Tuple file with a different Schema:
    // the "title" field is dropped and a new nullable field is added.
    Schema evolvedSchema = new Schema("evolved", Fields.parse("content:string, new_field:string?"));

    builder = new TupleMRBuilder(conf);
    builder.addTupleInput(outPath, evolvedSchema, new IdentityTupleMapper());
    builder.addIntermediateSchema(evolvedSchema);
    builder.setGroupByFields("content");
    builder.setTupleReducer(new MyGroupHandler());
    builder.setOutput(outPathText, new HadoopOutputFormat(TextOutputFormat.class), Text.class,
        NullWritable.class);

    job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals("bar2 foo2\nfoo1 bar1",
        Files.toString(new File(OUT_TEXT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
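MyGroupHandler is also not shown. A hypothetical reducer matching the Text/NullWritable output declared above, written against Pangool's standard TupleReducer signature; the exact output formatting is an assumption:

    // A hypothetical MyGroupHandler: emit one text line per tuple in the group.
    public static class MyGroupHandler extends TupleReducer<Text, NullWritable> {

      @Override
      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        for(ITuple tuple : tuples) {
          collector.write(new Text(tuple.get("new_field") + " " + tuple.get("content")),
              NullWritable.get());
        }
      }
    }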

   
    Configuration conf = new Configuration();
    FileSystem fS = FileSystem.get(conf);
   
    Path out = new Path("out-" + TupleOfTupleOfTuples.class.getName());
    TupleMRBuilder builder = new TupleMRBuilder(conf);
    fS.delete(out, true);
   
    builder.setTupleOutput(out, getMetaSchema2());
    builder.addIntermediateSchema(getMetaSchema2());
    builder.addInput(new Path("src/test/resources/foo-file.txt"), new HadoopInputFormat(TextInputFormat.class), new MyHandler());
    builder.setGroupByFields("group");
    builder.setTupleReducer(new IdentityTupleReducer());
    Job job = builder.createJob();
    try {
      job.waitForCompletion(true);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Path toRead = new Path(out, "part-r-00000");
    assertTrue(fS.exists(toRead));
    TupleFile.Reader reader = new TupleFile.Reader(fS, conf, toRead);
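From here the test would typically iterate the Tuple file record by record. A short sketch, assuming TupleFile.Reader exposes a next(ITuple) method in the style of SequenceFile.Reader:

    // Read every tuple back from part-r-00000 (a sketch).
    Tuple tuple = new Tuple(getMetaSchema2());
    while(reader.next(tuple)) {
      System.out.println(tuple);
    }
    reader.close();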

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("word", Type.STRING));
    fields.add(Field.create("count", Type.INT));

    TupleMRBuilder cg = new TupleMRBuilder(conf);
    cg.addIntermediateSchema(new Schema("schema", fields));
    cg.setJarByClass(TestCombiner.class);
    cg.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Split());
    cg.setOutput(new Path(output), new HadoopOutputFormat(SequenceFileOutputFormat.class), Utf8.class,
        IntWritable.class);
    cg.setGroupByFields("word");
    cg.setOrderBy(new OrderBy().add("word", Order.ASC));
    cg.setTupleReducer(new Count());
    cg.setTupleCombiner(new CountCombiner());

    return cg;
  }
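Split, Count and CountCombiner are not shown. A sketch of a counting reducer over this word/count schema, following Pangool's TupleReducer pattern; the Utf8 string constructor is an assumption:

    // A hypothetical Count reducer: sum the partial counts for each word.
    public static class Count extends TupleReducer<Utf8, IntWritable> {

      @Override
      public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
          Collector collector) throws IOException, InterruptedException, TupleMRException {
        int count = 0;
        for(ITuple tuple : tuples) {
          count += (Integer) tuple.get("count");
        }
        collector.write(new Utf8(group.get("word").toString()), new IntWritable(count));
      }
    }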
