Source Code of com.splout.db.benchmark.IdentityJob

package com.splout.db.benchmark;

/*
* #%L
* Splout SQL Hadoop library
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat.FieldSelector;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleTextOutputFormat;
import com.datasalt.pangool.utils.HadoopUtils;

/**
* This Job implements an identity Map/Reduce in two ways: one using the plain Hadoop API and the other using the Pangool API
* for parsing CSV files. With this Job we can measure 1) the overhead of using Splout's store generator tools compared to a plain identity Hadoop Job,
* and 2) which part of this overhead is due solely to parsing CSV files with Pangool.
*/
public class IdentityJob implements Tool {

  @Parameter(required = true, names = { "-i", "--inputpath" }, description = "The input path for the identity Job. Must be textual files.")
  private String inputPath;

  @Parameter(required = true, names = { "-o", "--outputpath" }, description = "The output path for the identity Job.")
  private String outputPath;

  @Parameter(required = false, names = { "-ps", "--pangoolSchema" }, description = "Provide a Pangool-schema and Pangool will be used for parsing the input text file into a Tuple. Using this option one can measure the overhead of using Pangool's textual input format.")
  private String pangoolSchema = null;

  @Parameter(required = false, names = { "-gb", "--groupBy" }, description = "If pangoolSchema is provided, a groupBy clause must be provided too. Use a field in your schema that makes the data as evenly spread across reducers as possible.")
  private String groupBy = null;

  // Basic CSV parsing parameters, optionally used if pangoolSchema != null; can be overridden
  // --------------------------------//
  @Parameter(names = { "-sep", "--separator" }, description = "The separator character of your text input file, defaults to a space.")
  private String separator = " ";

  @Parameter(names = { "-quo", "--quotes" }, description = "The quotes character of your input file, defaults to none.")
  private String quotes = TupleTextInputFormat.NO_QUOTE_CHARACTER + "";

  @Parameter(names = { "-esc", "--escape" }, description = "The escape character of your input file, defaults to none.")
  private String escape = TupleTextInputFormat.NO_ESCAPE_CHARACTER + "";

  @Parameter(names = { "-sh", "--skipheading" }, description = "Specify this flag for skipping the header line of your text file.")
  private boolean skipHeading = false;
  // --------------------------------//

  private Configuration conf;

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
      jComm.parse(params);
    } catch(ParameterException e) {
      System.err.println(e.getMessage());
      jComm.usage();
      return -1;
    }

    Path outP = new Path(outputPath);
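    // Clear any previous output so the Job can be re-run against the same path.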
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    boolean success;
    if(pangoolSchema == null) {
      // Use plain Hadoop API
      Job job = new Job(conf);
      job.setInputFormatClass(TextInputFormat.class);
      FileInputFormat.setInputPaths(job, inputPath);
      FileOutputFormat.setOutputPath(job, outP);
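      // No Mapper or Reducer is set on purpose: Hadoop's default Mapper and Reducer
      // pass every record through unchanged, which makes this branch a pure identity job.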

      success = job.waitForCompletion(true);

    } else {
      if(groupBy == null) {
        System.err.println("If pangoolSchema is used, groupBy must also be used.");
        jComm.usage();
        return -1;
      }

      Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
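      // Fields.parse (above) takes a comma-separated list of "name:type" pairs,
      // e.g. "id:int,name:string" (illustrative field names, not from the Splout code base).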
      Path inputP = new Path(inputPath);

      // Use Pangool API - parse CSV, etc
      TupleMRBuilder builder = new TupleMRBuilder(conf);
      TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
          separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
      TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
          quotes.charAt(0), escape.charAt(0));
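      // Pipeline: TupleTextInputFormat parses each text line into a Tuple, the identity
      // mapper and reducer pass Tuples through unchanged, and TupleTextOutputFormat writes
      // them back as text. Any extra runtime over the plain Hadoop branch is therefore
      // attributable to Pangool's CSV parsing and Tuple handling.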

      builder.addIntermediateSchema(schema);
      builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
      builder.setGroupByFields(groupBy);
      builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
      builder.setTupleReducer(new IdentityTupleReducer());
      builder.setJarByClass(this.getClass());
     
      success = builder.createJob().waitForCompletion(true);
    }

    return success ? 0 : -1;
  }

  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new IdentityJob(), args));
  }
}
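
Usage note: a minimal sketch of launching IdentityJob from another class via ToolRunner, assuming the Splout benchmark classes are on the classpath. The launcher class name, the paths and the example schema below are illustrative placeholders, not part of the Splout code base.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class IdentityJobLauncher {
  public static void main(String[] args) throws Exception {
    // Hypothetical input/output paths; replace with real benchmark locations.
    String[] jobArgs = { "-i", "/benchmark/input", "-o", "/benchmark/output" };
    // To exercise the Pangool branch instead, append e.g.:
    // "-ps", "id:int,name:string", "-gb", "id" (hypothetical field names)
    int exitCode = ToolRunner.run(new Configuration(),
        new com.splout.db.benchmark.IdentityJob(), jobArgs);
    System.exit(exitCode);
  }
}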