Package cascading.tap.hadoop

Examples of cascading.tap.hadoop.Hfs
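Hfs is the Cascading tap for reading and writing files on a Hadoop-compatible file system. The snippets below come from the Cascading code base and its tests. First, a minimal round-trip sketch, assuming Cascading 2.x on the Hadoop platform; the path "tmp/hfs-example" and the field names are illustrative:

    // a minimal sketch, not taken from the extracted examples below
    JobConf conf = new JobConf();
    HadoopFlowProcess flowProcess = new HadoopFlowProcess( conf );

    // write two tuples to an Hfs tap, replacing any existing output
    Hfs sink = new Hfs( new TextLine( new Fields( "line" ) ), "tmp/hfs-example", SinkMode.REPLACE );
    TupleEntryCollector collector = sink.openForWrite( flowProcess );

    collector.add( new Tuple( "first line" ) );
    collector.add( new Tuple( "second line" ) );

    collector.close();

    // read the same path back; TextLine sources the byte offset and the line text
    Hfs source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "tmp/hfs-example" );
    TupleEntryIterator iterator = source.openForRead( flowProcess );

    while( iterator.hasNext() )
      System.out.println( iterator.next().getString( "line" ) );

    iterator.close();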


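Writing a flow step's serialized state to a temporary Hfs path and registering it with the distributed cache when it is too large for the job configuration: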
    LOG.info( "writing step state to dist cache, too large for job conf, size: {}", stepState.length() );

    String statePath = Hfs.getTempPath( conf ) + "/" + kind + "-state-" + id;

    Hfs temp = new Hfs( new TextLine(), statePath, SinkMode.REPLACE );

    try
      {
      TupleEntryCollector writer = temp.openForWrite( new HadoopFlowProcess( conf ) );

      writer.add( new Tuple( stepState ) );

      writer.close();
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to write step state to Hadoop FS: " + temp.getIdentifier() );
      }

    URI uri = new Path( statePath ).toUri();
    DistributedCache.addCacheFile( uri, conf );


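Reading the step state back from the local path where the distributed cache materialized it: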
    if( stepStatePath == null )
      throw new FlowException( "unable to find step state from distributed cache" );

    LOG.info( "reading step state from local path: {}", stepStatePath );

    // Lfs extends Hfs and reads from the local file system
    Hfs temp = new Lfs( new TextLine( new Fields( "line" ) ), stepStatePath.toString() );

    TupleEntryIterator reader = null;

    try
      {
      reader = temp.openForRead( new HadoopFlowProcess( jobConf ) );

      if( !reader.hasNext() )
        throw new FlowException( "step state path is empty: " + temp.getIdentifier() );

      return reader.next().getString( 0 );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to find state path: " + temp.getIdentifier(), exception );
      }
    finally
      {
      try
        {
        if( reader != null )
          reader.close();
        }
      catch( IOException exception )
        {
        // best effort, ignore a failure to close the state reader
        }
      }
    }

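A planner expression deciding whether two adjacent Hfs taps can be collapsed: the successor's scheme must source what it sinks, both taps must use the same URI scheme, and their source fields must match: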
    FlowElement edgeSource = elementGraph.getEdgeSource( scope );
    FlowElement edgeTarget = elementGraph.getEdgeTarget( scope );

    if( !( edgeSource instanceof Hfs ) || !( edgeTarget instanceof Hfs ) )
      throw new IllegalStateException( "non Hfs Taps matched" );

    Hfs predecessor = (Hfs) edgeSource;
    Hfs successor = (Hfs) edgeTarget;

    // does this scheme source what it sinks
    if( !successor.getScheme().isSymmetrical() )
      return false;

    HadoopPlanner flowPlanner = (HadoopPlanner) plannerContext.getFlowPlanner();

    URI tempURIScheme = flowPlanner.getDefaultURIScheme( predecessor ); // temp uses default fs
    URI successorURIScheme = flowPlanner.getURIScheme( successor );

    if( !tempURIScheme.equals( successorURIScheme ) )
      return false;

    // both schemes are symmetrical; collapsing is safe only if both taps read the same fields
    // should be called after fields are resolved
    if( !predecessor.getSourceFields().equals( successor.getSourceFields() ) )
      return false;

    return true;
    }

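Building a map of source taps, keyed by path, from the input paths already set on a JobConf: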
    Path[] paths = FileInputFormat.getInputPaths( jobConf );

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for( Path path : paths )
      taps.put( path.toString(), new Hfs( new NullScheme(), path.toString() ) );

    return taps;
    }

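Building the sink tap map from the output path set on a JobConf: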
    {
    Map<String, Tap> taps = new HashMap<String, Tap>();

    String path = FileOutputFormat.getOutputPath( jobConf ).toString();

    taps.put( path, new Hfs( new NullScheme(), path, deleteSinkOnInit ) );

    return taps;
    }

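Wrapping a prepared JobConf in a MapReduceFlow and using Hfs taps to validate the input and output tuple counts: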
    String outputPath = getOutputPath( "flowTest" );
    FileOutputFormat.setOutputPath( conf, new Path( outputPath ) );

    Flow flow = new MapReduceFlow( "mrflow", conf, true );

    validateLength( new Hfs( new TextLine(), inputFileApache ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

    flow.complete();

    validateLength( new Hfs( new TextLine(), outputPath ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );
    }

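Chaining two Cascading flows and three identity MapReduce jobs into a single Cascade; the flows are passed to the connector out of order and the Cascade schedules them by their dependencies: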
  public void testCascade() throws IOException
    {
    getPlatform().copyFromLocal( inputFileApache );

    // Set up two standard Cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), remove( inputFileApache, false ) );
    String sinkPath4 = getOutputPath( "flow4" );
    Tap sink1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), remove( sinkPath4, true ), true );
    Flow firstFlow = getPlatform().getFlowConnector( getProperties() ).connect( source1, sink1, new Pipe( "first-flow" ) );

    String sinkPath5 = getOutputPath( "flow5" );
    Tap sink2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), remove( sinkPath5, true ), true );
    Flow secondFlow = getPlatform().getFlowConnector( getProperties() ).connect( sink1, sink2, new Pipe( "second-flow" ) );

    JobConf defaultConf = HadoopPlanner.createJobConf( getProperties() );

    JobConf firstConf = new JobConf( defaultConf );
    firstConf.setJobName( "first-mr" );

    firstConf.setOutputKeyClass( LongWritable.class );
    firstConf.setOutputValueClass( Text.class );

    firstConf.setMapperClass( IdentityMapper.class );
    firstConf.setReducerClass( IdentityReducer.class );

    firstConf.setInputFormat( TextInputFormat.class );
    firstConf.setOutputFormat( TextOutputFormat.class );

    FileInputFormat.setInputPaths( firstConf, new Path( remove( sinkPath5, true ) ) );
    String sinkPath1 = getOutputPath( "flow1" );
    FileOutputFormat.setOutputPath( firstConf, new Path( remove( sinkPath1, true ) ) );

    Flow firstMR = new MapReduceFlow( firstConf, true );

    JobConf secondConf = new JobConf( defaultConf );
    secondConf.setJobName( "second-mr" );

    secondConf.setOutputKeyClass( LongWritable.class );
    secondConf.setOutputValueClass( Text.class );

    secondConf.setMapperClass( IdentityMapper.class );
    secondConf.setReducerClass( IdentityReducer.class );

    secondConf.setInputFormat( TextInputFormat.class );
    secondConf.setOutputFormat( TextOutputFormat.class );

    FileInputFormat.setInputPaths( secondConf, new Path( remove( sinkPath1, true ) ) );
    String sinkPath2 = getOutputPath( "flow2" );
    FileOutputFormat.setOutputPath( secondConf, new Path( remove( sinkPath2, true ) ) );

    Flow secondMR = new MapReduceFlow( secondConf, true );

    JobConf thirdConf = new JobConf( defaultConf );
    thirdConf.setJobName( "third-mr" );

    thirdConf.setOutputKeyClass( LongWritable.class );
    thirdConf.setOutputValueClass( Text.class );

    thirdConf.setMapperClass( IdentityMapper.class );
    thirdConf.setReducerClass( IdentityReducer.class );

    thirdConf.setInputFormat( TextInputFormat.class );
    thirdConf.setOutputFormat( TextOutputFormat.class );

    FileInputFormat.setInputPaths( thirdConf, new Path( remove( sinkPath2, true ) ) );
    String sinkPath3 = getOutputPath( "flow3" );
    FileOutputFormat.setOutputPath( thirdConf, new Path( remove( sinkPath3, true ) ) );

    Flow thirdMR = new MapReduceFlow( thirdConf, true );

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect( firstFlow, secondFlow, thirdMR, firstMR, secondMR );

    cascade.complete();

    validateLength( new Hfs( new TextLine(), sinkPath3 ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );
    }

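Planner test: two GroupBy branches off the same source pipe, merged into a tail GroupBy, should plan into three flow steps: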
  public void testSameSourceForBranch() throws IOException
    {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    sources.put( "a", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" ) );

    Pipe pipeA = new Pipe( "a" );

    Pipe group1 = new GroupBy( "a1", pipeA, Fields.FIRST );
    Pipe group2 = new GroupBy( "a2", pipeA, Fields.FIRST );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    assertEquals( "not equal: steps.size()", 3, flow.getFlowSteps().size() );
    }

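Planner test: the same Hfs tap instance bound to two different pipe names should still plan into three flow steps: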
  public void testSameTaps() throws IOException
    {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    Hfs tap = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", tap );
    sources.put( "b", tap );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    assertEquals( "not equal: steps.size()", 3, flow.getFlowSteps().size() );
    }

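Planner test: a pipe assembly where the head pipe "b" has no source tap bound should be rejected by the planner: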
  public void testDanglingHead() throws IOException
    {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    Hfs source = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", source );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not catch missing source tap" );
      }
    catch( PlannerException exception )
      {
      // expected, the head pipe "b" has no source tap bound to it
      }
    }
