package com.ontology2.bakemono.joins;
import com.google.common.base.Joiner;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;
import com.ontology2.bakemono.Main;
import com.ontology2.bakemono.configuration.HadoopTool;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VIntWritable;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import java.util.List;
@HadoopTool("fetchWithMatchingObjects3")
public class FetchTriplesWithMatchingObjectsTool implements Tool {
private static Log logger= LogFactory.getLog(FetchTriplesWithMatchingObjectsTool.class);
private Configuration conf;
@Override
public Configuration getConf() {
return this.conf;
}
@Override
public void setConf(Configuration arg0) {
this.conf=arg0;
}
@Override
public int run(String[] arg0) throws Exception {
try {
PeekingIterator<String> a= Iterators.peekingIterator(Iterators.forArray(arg0));
Integer reduceTasks = parseRArgument(a);
if (!a.hasNext())
usage();
// The first argument is the list of objects
String inputA=a.next();
if (!a.hasNext())
usage();
// Middle positional parameters are sources of triples
List<String> paths= Lists.newArrayList(a);
// The last positional parameter is the output path
String output=paths.get(paths.size() - 1);
paths.remove(paths.size()-1);
logger.info("Writing to output path "+output);
conf.set("mapred.compress.map.output", "true");
conf.set("mapred.output.compression.type", "BLOCK");
conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
conf.set(SetJoinMapper.INPUTS+".1",inputA);
for(String path:paths)
conf.set(SetJoinMapper.INPUTS+".2",path);
Job job=new Job(conf,"fetchTriplesWithMatchingObjects");
job.setJarByClass(this.getClass());
job.setMapperClass(FetchTriplesWithMatchingObjectsMapper.class);
job.setReducerClass(AcceptWithMatchingKeyReducer.class);
job.setGroupingComparatorClass(TaggedTextKeyGroupComparator.class);
job.setPartitionerClass(TaggedKeyPartitioner.class);
if(reduceTasks==null) {
reduceTasks=1; // about right for AWS runs
}
job.setNumReduceTasks(reduceTasks);
job.setMapOutputKeyClass(TaggedTextItem.class);
job.setMapOutputValueClass(TaggedTextItem.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(inputA));
for(String path:paths)
FileInputFormat.addInputPath(job, new Path(path));
FileOutputFormat.setOutputPath(job, new Path(output));
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
job.setOutputFormatClass(TextOutputFormat.class);
return job.waitForCompletion(true) ? 0 : 1;
} catch(Main.IncorrectUsageException iue) {
return 2;
}
}
public static Integer parseRArgument(PeekingIterator<String> a)
throws Main.IncorrectUsageException {
Integer reduceTasks=null;
while(a.hasNext() && a.peek().startsWith("-")) {
String flagName=a.next().substring(1).intern();
if (!a.hasNext())
usage();
String flagValue=a.next();
if (flagName=="r") {
reduceTasks=Integer.parseInt(flagValue);
} else {
usage();
};
}
return reduceTasks;
}
private static void usage() throws Main.IncorrectUsageException {
throw new Main.IncorrectUsageException("incorrect arguments");
};
}