/*
 * A task and its child task have both been converted from a join to a map join.
 * Check whether the two tasks can be merged into one.
 */
private void mergeMapJoinTaskWithChildMapJoinTask(MapRedTask task, Configuration conf) {
MapRedTask childTask = (MapRedTask)task.getChildTasks().get(0);
MapredWork work = task.getWork();
MapredLocalWork localWork = work.getMapLocalWork();
MapredWork childWork = childTask.getWork();
MapredLocalWork childLocalWork = childWork.getMapLocalWork();
// Can the tasks be merged? The parent task must process exactly one alias.
Map<String, Operator<? extends OperatorDesc>> aliasToWork = work.getAliasToWork();
if (aliasToWork.size() > 1) {
return;
}
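// Walk the single operator chain down to its leaf.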
Operator<? extends OperatorDesc> op = aliasToWork.values().iterator().next();
while (op.getChildOperators() != null && !op.getChildOperators().isEmpty()) {
// Don't perform this optimization for multi-table inserts
if (op.getChildOperators().size() > 1) {
return;
}
op = op.getChildOperators().get(0);
}
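// The merge is only possible when the chain ends in a FileSinkOperator, i.e. the
// parent task just writes an intermediate directory that the child task reads.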
if (!(op instanceof FileSinkOperator)) {
return;
}
FileSinkOperator fop = (FileSinkOperator)op;
String workDir = fop.getConf().getDirName();
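// The child task must read exactly one input path, and it must be the
// directory the parent's FileSinkOperator writes to.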
Map<String, ArrayList<String>> childPathToAliases = childWork.getPathToAliases();
if (childPathToAliases.size() > 1) {
return;
}
// The child reads from a different directory, so the tasks cannot be merged
if (!childPathToAliases.keySet().iterator().next().equals(workDir)) {
return;
}
// Neither task may use a bucketed map join
if ((localWork.getBucketMapjoinContext() != null) ||
(childLocalWork.getBucketMapjoinContext() != null)) {
return;
}
// The child task must also process exactly one alias
if (childWork.getAliasToWork().size() > 1) {
return;
}
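// The small tables of both tasks must together stay under the map-join size
// threshold, since the merged task has to hold all of them in memory at once.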
long mapJoinSize = HiveConf.getLongVar(conf,
HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
long localTableTotalSize = 0;
for (String alias : localWork.getAliasToWork().keySet()) {
Long tabSize = aliasToSize.get(alias);
if (tabSize == null) {
/* If the size is unavailable, conservatively treat it as exceeding
 * mapJoinSize; the merge cannot happen, so bail out.
 */
return;
}
localTableTotalSize += tabSize;
if (localTableTotalSize > mapJoinSize) {
return;
}
}
for (String alias : childLocalWork.getAliasToWork().keySet()) {
Long tabSize = aliasToSize.get(alias);
if (tabSize == null) {
/* If the size is unavailable, conservatively treat it as exceeding
 * mapJoinSize; the merge cannot happen, so bail out.
 */
return;
}
localTableTotalSize += tabSize;
if (localTableTotalSize > mapJoinSize) {
return;
}
}
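// The root operator of the child's single alias will be spliced onto the
// parent's operator tree in place of the FileSinkOperator.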
Operator<? extends OperatorDesc> childAliasOp =
childWork.getAliasToWork().values().iterator().next();
if (fop.getParentOperators().size() > 1) {
return;
}
// Merge the two trees: remove the FileSinkOperator from the first tree and
// splice the root of the second tree in its place
Operator<? extends Serializable> parentFOp = fop.getParentOperators().get(0);
parentFOp.getChildOperators().remove(fop);
parentFOp.getChildOperators().add(childAliasOp);
List<Operator<? extends OperatorDesc>> parentOps =
new ArrayList<Operator<? extends OperatorDesc>>();
parentOps.add(parentFOp);
childAliasOp.setParentOperators(parentOps);
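// Fold the child work's alias and partition metadata into the parent work.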
work.getAliasToPartnInfo().putAll(childWork.getAliasToPartnInfo());
for (Map.Entry<String, PartitionDesc> childWorkEntry :
childWork.getPathToPartitionInfo().entrySet()) {
// Copy a path entry only when its partition descriptor belongs to a merged alias
if (childWork.getAliasToPartnInfo().containsValue(childWorkEntry.getValue())) {
work.getPathToPartitionInfo().put(childWorkEntry.getKey(), childWorkEntry.getValue());
}
}
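// Merge the local works so the merged task fetches the small tables of both joins.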
localWork.getAliasToFetchWork().putAll(childLocalWork.getAliasToFetchWork());
localWork.getAliasToWork().putAll(childLocalWork.getAliasToWork());
// Remove the child task from the task tree and adopt its children
List<Task<? extends Serializable>> oldChildTasks = childTask.getChildTasks();
task.setChildTasks(oldChildTasks);
if (oldChildTasks != null) {
for (Task<? extends Serializable> oldChildTask : oldChildTasks) {
oldChildTask.getParentTasks().remove(childTask);
oldChildTask.getParentTasks().add(task);
}
}
}