Examples of HadoopInputFormat


Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    // Note that the order in which we defined the fields of the Schema is not relevant here
    cg.setGroupByFields("topic", "word");
    // Here we instantiate a mapper with stop words:
    // Note that we don't need to use the DistributedCache for that becasuse mappers, reducers, etc themselves are instantiable
    StopWordMapper mapper = new StopWordMapper(stopWords);
    cg.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), mapper);
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    cg.setTupleOutput(new Path(args[1]), TopicalWordCount.getSchema());
    cg.setTupleReducer(new CountReducer());
    cg.setTupleCombiner(new CountReducer());
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    builder.setGroupByFields("first");
    builder.setOrderBy(new OrderBy().add("first",Order.ASC).add("second",Order.ASC));
    // Input / output and such
    builder.setTupleReducer(new Handler());
    builder.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    builder.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    builder.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.addIntermediateSchema(getPangoolRetweetSchema());
    mr.setGroupByFields("tweet_id");
    mr.setOrderBy(new OrderBy().add("tweet_id", Order.ASC).addSchemaOrder(Order.ASC));

    mr.addInput(tweetsPath, new AvroInputFormat<Record>(getAvroTweetSchema()), new TweetsMapper());
    mr.addInput(retweetsPath, new HadoopInputFormat(TextInputFormat.class), new RetweetsMapper());
    mr.setOutput(outputPath, new AvroOutputFormat<Record>(getAvroOutputSchema()), AvroWrapper.class, NullWritable.class);

    mr.setTupleReducer(new Red());

    Job job = mr.createJob();
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    //here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(),"topic","word");
    mr.setOrderBy(new OrderBy().add("my_avro",Order.ASC,customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    TupleMRBuilder mr = new TupleMRBuilder(conf, "Pangool Topical Word Count");
    mr.addIntermediateSchema(getSchema());
    // We will count each (topicId, word) pair
    // Note that the order in which we defined the fields of the Schema is not relevant here
    mr.setGroupByFields("topic", "word");
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    job.addIntermediateSchema(schema);
    job.setGroupByFields("min", "max");
    job.setCustomPartitionFields("min");
    // Define the input and its associated mapper
    // The mapper will just emit the (min, max) pairs to the reduce stage
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new TupleMapper<LongWritable, Text>() {

      Tuple tuple = new Tuple(schema);

      @Override
      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector) throws IOException,
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.setFieldAliases("urlMap",new Aliases().add("url","nonCanonicalUrl"));
    mr.setGroupByFields("url");
    mr.setOrderBy(new OrderBy().add("url", Order.ASC).addSchemaOrder(Order.ASC));
    mr.setTupleReducer(new Handler());
    mr.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    mr.addInput(new Path(input1), new HadoopInputFormat(TextInputFormat.class), new UrlMapProcessor());
    mr.addInput(new Path(input2), new HadoopInputFormat(TextInputFormat.class), new UrlProcessor());
    mr.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    mr.addIntermediateSchema(getSchema());
    mr.setGroupByFields("my_avro");
    //here the custom comparator that groups by "topic,word" is used.
    MyAvroComparator customComp = new MyAvroComparator(getAvroSchema(),"topic","word");
    mr.setOrderBy(new OrderBy().add("my_avro",Order.ASC,customComp));
    mr.addInput(new Path(args[0]), new HadoopInputFormat(TextInputFormat.class), new TokenizeMapper());
    // We'll use a TupleOutputFormat with the same schema than the intermediate schema
    mr.setTupleOutput(new Path(args[1]), getSchema());
    mr.setTupleReducer(new CountReducer());
    mr.setTupleCombiner(new CountReducer());
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    builder.setGroupByFields("first");
    builder.setOrderBy(new OrderBy().add("first",Order.ASC).add("second",Order.ASC));
    // Input / output and such
    builder.setTupleReducer(new Handler());
    builder.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    builder.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class), new IProcessor());
    builder.createJob().waitForCompletion(true);

    return 1;
  }
View Full Code Here

Examples of com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat

    deleteOutput(output);
   
    MapOnlyJobBuilder b = new MapOnlyJobBuilder(conf);
    b.setMapper(new GrepHandler(regex));
    b.setOutput(new Path(output), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
    b.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class));
    b.createJob().waitForCompletion(true);
   
    return 0;
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.