Package ivory.lsh.pwsim

Source Code of ivory.lsh.pwsim.GenerateChunkedPermutedTables$MyReducer

package ivory.lsh.pwsim;

import ivory.lsh.data.BitsSignatureTable;
import ivory.lsh.data.PairOfIntSignature;
import ivory.lsh.data.Permutation;
import ivory.lsh.data.PermutationByBit;
import ivory.lsh.data.Signature;
import ivory.lsh.driver.PwsimEnvironment;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URI;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.SequenceFileUtils;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.util.PowerTool;

@SuppressWarnings("deprecation")
public class GenerateChunkedPermutedTables extends PowerTool {
  private static final Logger sLogger = Logger.getLogger(GenerateChunkedPermutedTables.class);
  static {
    sLogger.setLevel(Level.WARN);
  }

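  // Counters: signatures processed by the mappers, signature tables (chunks) emitted by the
  // reducers, and the total number of signatures written into chunks (signatures in an
  // overlap region are counted once per chunk they appear in).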
  static enum Count {
    Signatures, Chunks, SignaturesInChunks
  }

  public GenerateChunkedPermutedTables(Configuration conf) {
    super(conf);
  }

  /**
   * Maps each signature to Q random permutations, using the permuters stored in the local
   * cache file: <docno,signature> --> <(permno,signature),docno>
   *
   * @author ferhanture
   */
  @SuppressWarnings("unchecked")
  public static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, Signature, PairOfIntSignature, IntWritable> {

    static Path[] localFiles;
    static List<Writable> randomPermutations;
    static int numOfPermutations;
    static Signature permutedSignature;
    static Constructor pairConstructor;
    static PairOfIntSignature pair;

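    // configure() instantiates the signature and pair classes reflectively from the job
    // configuration, then loads the Q random permutation functions: from the index path when
    // running locally, or from the distributed cache on a cluster.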
    public void configure(JobConf job) {
      numOfPermutations = job.getInt("Ivory.NumOfPermutations", -1);
      int numOfBits = job.getInt("Ivory.NumOfBits", -1);
      Class signatureClass = null;

      try {
        Class pairClass = Class.forName(job.get("Ivory.PairClass"));
        pairConstructor = pairClass.getConstructor(int.class, Signature.class);

        signatureClass = Class.forName(job.get("Ivory.SignatureClass"));
        Constructor intConstructor = signatureClass.getConstructor(int.class);
        permutedSignature = (Signature) intConstructor.newInstance(numOfBits);
        pair = (PairOfIntSignature) pairConstructor.newInstance(0, permutedSignature);
      } catch (Exception e) {
        throw new RuntimeException("config exception", e);
      }

      sLogger.debug("Reading permutations file....");
      sLogger.debug("PwsimEnvironment.cluster: " + PwsimEnvironment.cluster);
      sLogger.debug("PwsimEnvironment.cluster: " + PwsimEnvironment.cluster);
      // sLogger.debug ("job.get(\"mapred.job.tracker\"): " + job.get ("mapred.job.tracker"));
      // sLogger.debug ("job.get(\"mapred.job.tracker\").equals (\"local\"): " + job.get
      // ("mapred.job.tracker").equals ("local"));
      try {
        if (job.get("mapred.job.tracker").equals("local")) {
          String rootPath = job.get("Ivory.IndexPath");
          String randomPermFile = PwsimEnvironment.getFileNameWithPars(rootPath, "Permsbit");
          randomPermutations = SequenceFileUtils.readValues(new Path(randomPermFile), FileSystem
              .getLocal(job));
          // randomPermutations =
          // edu.umd.cloud9.util.SequenceFileUtils.readFile("index/random.perms", new Text(""), -1);
        } else {
          localFiles = DistributedCache.getLocalCacheFiles(job);
          // sLogger.debug ("localFiles [0]: " + localFiles [0]);
          randomPermutations = SequenceFileUtils
              .readValues(localFiles[0], FileSystem.getLocal(job));
        }
      } catch (Exception e) {
        throw new RuntimeException("Error reading random permutations!", e);
      }
      sLogger.debug("Done reading file.");
    }

    public void map(IntWritable docno, Signature signature,
        OutputCollector<PairOfIntSignature, IntWritable> output, Reporter reporter)
        throws IOException {
      sLogger.debug("Mapping signature " + docno);
      sLogger.debug("Permuting " + signature + "...");

      // Map each signature to Q random permutations
      for (int i = 0; i < numOfPermutations; i++) {
        signature.perm((ArrayListOfIntsWritable) randomPermutations.get(i), permutedSignature);
        // sLogger.debug("Permutation "+i+" : "+permutedSign);
        try {
          pair.setInt(i);
          pair.setSignature(permutedSignature);
          output.collect(pair, docno);
        } catch (Exception e) {
          throw new RuntimeException("output.collect exception", e);
        }
        sLogger.debug("emitted");
      }
      reporter.incrCounter(Count.Signatures, 1);
    }
  }

  public static class MyPartitioner implements Partitioner<PairOfIntSignature, IntWritable> {
    // partition with permutation number only
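    // so that all signatures permuted by the same function are routed to the same reducer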
    public int getPartition(PairOfIntSignature key, IntWritable value, int numReducers) {
      return key.getInt() % numReducers;
    }

    public void configure(JobConf conf) {
    }
  }

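  /**
   * Collects the permuted signatures of each permutation table (grouped by permutation
   * number and sorted by signature) into a fixed-size buffer and emits the buffer as a
   * BitsSignatureTable whenever it reaches Ivory.ChunckSize entries. Consecutive chunks
   * share the last Ivory.OverlapSize signatures, so a sliding-window comparison over each
   * chunk can still reach neighbors that straddle a chunk boundary.
   */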
  public static class MyReducer extends MapReduceBase implements
      Reducer<PairOfIntSignature, IntWritable, IntWritable, BitsSignatureTable> {
    static int permNo = -1;
    Signature[] signatures = null;
    int[] docNos = null;
    int curTableSize = 0;
    int overlapSize = -1;
    int chunckSize = -1;

    public void configure(JobConf conf) {
      // sLogger.setLevel(Level.DEBUG);
      overlapSize = conf.getInt("Ivory.OverlapSize", -1);
      chunckSize = conf.getInt("Ivory.ChunckSize", -1);
      if (overlapSize >= chunckSize)
        throw new RuntimeException("Invalid Ivory.OverlapSize (" + overlapSize
            + ") or Ivory.ChunckSize (" + chunckSize + ")");
      signatures = new Signature[chunckSize];
      docNos = new int[chunckSize];
    }

    BitsSignatureTable table = new BitsSignatureTable();
    IntWritable outKey = new IntWritable();
    OutputCollector<IntWritable, BitsSignatureTable> mOutput = null;
    PairOfIntSignature lastKey = null;
    Reporter mReporter = null;

    public void reduce(PairOfIntSignature key, Iterator<IntWritable> val,
        OutputCollector<IntWritable, BitsSignatureTable> output, Reporter reporter)
        throws IOException {
      mReporter = reporter;
      // all signatures:
      // (1) belong to the same permutation table, and
      // (2) sorted
      mOutput = output;
      lastKey = key;
      // sLogger.debug(key.toString());
      while (val.hasNext()) {
        docNos[curTableSize] = val.next().get();
        signatures[curTableSize] = key.getSignature();
        curTableSize++;
        if (curTableSize == chunckSize) {
          table.set(signatures, docNos, curTableSize);
          outKey.set(key.getInt());
          output.collect(outKey, table);
          reporter.incrCounter(Count.SignaturesInChunks, table.getNumOfSignatures());
          reporter.incrCounter(Count.Chunks, 1);
          shiftOverlap();
        }
      }
    }

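    // Copies the last overlapSize entries to the front of the buffer so they are repeated
    // at the beginning of the next chunk.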
    private void shiftOverlap() {
      if (overlapSize >= curTableSize)
        return;

      // overlapSize < curTableSize ==> shift up
      int j = 0;
      for (int i = curTableSize - overlapSize; i < curTableSize; i++) {
        signatures[j] = signatures[i];
        docNos[j] = docNos[i];
        j++;
      }
      curTableSize = j;
    }

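    // Flushes whatever remains in the buffer as a final, possibly smaller, chunk.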
    @Override
    public void close() throws IOException {
      if (curTableSize == 0 || mOutput == null)
        return;
      table.set(signatures, docNos, curTableSize);
      outKey.set(lastKey.getInt());
      mOutput.collect(outKey, table);
      mReporter.incrCounter(Count.SignaturesInChunks, table.getNumOfSignatures());
      mReporter.incrCounter(Count.Chunks, 1);

      // sLogger.debug(nSignaturesRead);
    }

  }

  @Override
  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  // create Q permutation functions and write them to file
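  // Each permutation is written as an ArrayListOfIntsWritable keyed by its index; an existing
  // permutations file is reused rather than regenerated.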
  public static String createPermutations(FileSystem fs, JobConf job, String rootPath, int numBits,
      int numOfPermutations) throws Exception {
    String randomPermFile = PwsimEnvironment.getFileNameWithPars(rootPath, "Permsbit");
    if (fs.exists(new Path(randomPermFile))) {
      sLogger.info("Random permutations output path already exists!");
      return randomPermFile;
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, new Path(randomPermFile),
        IntWritable.class, ArrayListOfIntsWritable.class);
    Permutation p = new PermutationByBit(numBits);
    for (int i = 0; i < numOfPermutations; i++) {
      ArrayListOfIntsWritable perm = p.nextPermutation();
      writer.append(new IntWritable(i), perm);
      sLogger.debug(i + ":" + perm);
    }
    writer.close();
    sLogger.info("Random permutations written.");
    return randomPermFile;
  }

  public static final String[] RequiredParameters = { "Ivory.NumMapTasks", "Ivory.NumReduceTasks",
      "Ivory.CollectionName", "Ivory.IndexPath", "Ivory.NumOfPermutations", "Ivory.ChunckSize",
      "Ivory.OverlapSize" };

  @Override
  public int runTool() throws Exception {
    sLogger.setLevel(Level.INFO);
    int numOfPermutations = getConf().getInt("Ivory.NumOfPermutations", -1);
    int numBits = getConf().getInt("Ivory.NumOfBits", -1);
    if (numOfPermutations < 0 || numBits < 0)
      throw new RuntimeException("parameters not read properly");

    String rootPath = getConf().get("Ivory.IndexPath");

    JobConf job = new JobConf(getConf(), GenerateChunkedPermutedTables.class);

    FileSystem fs = FileSystem.get(job);
    String inputPath, outputPath, fileno;
    String collectionName = job.get("Ivory.CollectionName");

    if (job.getBoolean("Ivory.SignaturesPartitioned", false)) {
      inputPath = job.get("Ivory.PartitionFile");
      fileno = inputPath.substring(inputPath.lastIndexOf('-') + 1);
      outputPath = PwsimEnvironment.getFileNameWithPars(rootPath, "P-Tables") + "/" + fileno;
      job.setJobName("GenerateChunkedPermutedTables: " + collectionName + "-p#"
          + Integer.parseInt(fileno));
    } else {
      inputPath = PwsimEnvironment.getFileNameWithPars(rootPath, "Signatures" + job.get("Type"));
      outputPath = PwsimEnvironment.getFileNameWithPars(rootPath, "Tables");
      job.setJobName("GenerateChunkedPermutedTables: " + numOfPermutations + "_" + collectionName);
    }
    int numMappers = job.getInt("Ivory.NumMapTasks", 100);
    int numReducers = job.getInt("Ivory.NumReduceTasks", 100);

    if (fs.exists(new Path(outputPath))) {
      sLogger.info("Permuted tables already exist! Quitting...");
      return 0;
    }

    // create Q permutation functions and write them to file
    String randomPermFile = createPermutations(fs, job, rootPath, numBits, numOfPermutations);
    DistributedCache.addCacheFile(new URI(randomPermFile), job);
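    // The cached permutations file is read back in MyMapper.configure() on each worker node.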

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // ONLY FOR CROSS-LINGUAL CASE
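    // (the source-language signatures are added as a second input so that both collections
    // end up in the same permuted tables)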
    if (PwsimEnvironment.isCrossLingual) {
      String srcLangInputPath = PwsimEnvironment.getFileNameWithPars(job.get("SrcLangDir"),
          "Signatures");
      FileInputFormat.addInputPath(job, new Path(srcLangInputPath));
    }
    Path[] paths = FileInputFormat.getInputPaths(job);

    for (Path path : paths) {
      sLogger.info("Added " + path);
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    // job.setInt("mapred.map.max.attempts", 10);
    // job.setInt("mapred.reduce.max.attempts", 10);
    job.setInt("mapred.task.timeout", 600000000);

    job.setNumMapTasks(numMappers);
    job.setNumReduceTasks(numReducers);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(Class.forName(job.get("Ivory.PairClass")));
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BitsSignatureTable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("Collection: " + collectionName);
    sLogger.info("Number of bits/signature(D): " + numBits);
    sLogger.info("Number of permutations(Q): " + numOfPermutations);
    sLogger.info("Overlap size: " + getConf().getInt("Ivory.OverlapSize", -1));
    sLogger.info("Chunk size: " + getConf().getInt("Ivory.ChunckSize", -1));

    JobClient.runJob(job);

    return 0;
  }
}
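
For reference, below is a minimal driver sketch showing how this tool might be configured and invoked. The parameter values, paths, and the signature/pair class names are illustrative assumptions, not taken from the listing above.

// Hypothetical driver: all values below are assumptions chosen for illustration.
import org.apache.hadoop.conf.Configuration;

import ivory.lsh.pwsim.GenerateChunkedPermutedTables;

public class PermutedTablesDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("Ivory.IndexPath", "/path/to/index");          // hypothetical index location
    conf.set("Ivory.CollectionName", "sample-collection");  // hypothetical collection name
    conf.setInt("Ivory.NumMapTasks", 100);
    conf.setInt("Ivory.NumReduceTasks", 100);
    conf.setInt("Ivory.NumOfPermutations", 10);             // Q permutation tables
    conf.setInt("Ivory.NumOfBits", 64);                     // D bits per signature
    conf.setInt("Ivory.ChunckSize", 10000);                 // signatures per emitted chunk
    conf.setInt("Ivory.OverlapSize", 1000);                 // signatures shared by consecutive chunks
    // Assumed signature/pair implementations; substitute the classes used by your pipeline.
    conf.set("Ivory.SignatureClass", "ivory.lsh.data.NBitSignature");
    conf.set("Ivory.PairClass", "ivory.lsh.data.PairOfIntNBitSignature");
    new GenerateChunkedPermutedTables(conf).runTool();
  }
}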