Package cascading.tap.hadoop

Examples of cascading.tap.hadoop.Hfs
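Hfs is the Cascading tap for reading and writing files on a Hadoop-compatible file system. The snippets below come from the Cascading code base and its tests. First, a minimal round-trip sketch, assuming Cascading 2.x on the Hadoop platform; the path "tmp/hfs-example" and the field names are illustrative:

    // a minimal sketch, not taken from the extracted examples below
    JobConf conf = new JobConf();
    HadoopFlowProcess flowProcess = new HadoopFlowProcess( conf );

    // write two tuples to an Hfs tap, replacing any existing output
    Hfs sink = new Hfs( new TextLine( new Fields( "line" ) ), "tmp/hfs-example", SinkMode.REPLACE );
    TupleEntryCollector collector = sink.openForWrite( flowProcess );

    collector.add( new Tuple( "first line" ) );
    collector.add( new Tuple( "second line" ) );

    collector.close();

    // read the same path back; TextLine sources the byte offset and the line text
    Hfs source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "tmp/hfs-example" );
    TupleEntryIterator iterator = source.openForRead( flowProcess );

    while( iterator.hasNext() )
      System.out.println( iterator.next().getString( "line" ) );

    iterator.close();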


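Writing a flow step's serialized state to a temporary Hfs path and registering it with the distributed cache when it is too large for the job configuration: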
    LOG.info( "writing step state to dist cache, too large for job conf, size: {}", stepState.length() );

    String statePath = Hfs.getTempPath( conf ) + "/" + kind + "-state-" + id;

    Hfs temp = new Hfs( new TextLine(), statePath, SinkMode.REPLACE );

    try
      {
      TupleEntryCollector writer = temp.openForWrite( new HadoopFlowProcess( conf ) );

      writer.add( new Tuple( stepState ) );

      writer.close();
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to write step state to Hadoop FS: " + temp.getIdentifier() );
      }

    URI uri = new Path( statePath ).toUri();
    DistributedCache.addCacheFile( uri, conf );


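Reading the step state back from the local path where the distributed cache materialized it: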
    if( stepStatePath == null )
      throw new FlowException( "unable to find step state from distributed cache" );

    LOG.info( "reading step state from local path: {}", stepStatePath );

    // Lfs extends Hfs and reads from the local file system
    Hfs temp = new Lfs( new TextLine( new Fields( "line" ) ), stepStatePath.toString() );

    TupleEntryIterator reader = null;

    try
      {
      reader = temp.openForRead( new HadoopFlowProcess( jobConf ) );

      if( !reader.hasNext() )
        throw new FlowException( "step state path is empty: " + temp.getIdentifier() );

      return reader.next().getString( 0 );
      }
    catch( IOException exception )
      {
      throw new FlowException( "unable to find state path: " + temp.getIdentifier(), exception );
      }
    finally
      {
      try
        {
        if( reader != null )
          reader.close();
        }
      catch( IOException exception )
        {
        // best effort, ignore a failure to close the state reader
        }
      }
    }

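A planner expression deciding whether two adjacent Hfs taps can be collapsed: the successor's scheme must source what it sinks, both taps must use the same URI scheme, and their source fields must match: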
    FlowElement edgeSource = elementGraph.getEdgeSource( scope );
    FlowElement edgeTarget = elementGraph.getEdgeTarget( scope );

    if( !( edgeSource instanceof Hfs ) || !( edgeTarget instanceof Hfs ) )
      throw new IllegalStateException( "non Hfs Taps matched" );

    Hfs predecessor = (Hfs) edgeSource;
    Hfs successor = (Hfs) edgeTarget;

    // does this scheme source what it sinks
    if( !successor.getScheme().isSymmetrical() )
      return false;

    HadoopPlanner flowPlanner = (HadoopPlanner) plannerContext.getFlowPlanner();

    URI tempURIScheme = flowPlanner.getDefaultURIScheme( predecessor ); // temp uses default fs
    URI successorURIScheme = flowPlanner.getURIScheme( successor );

    if( !tempURIScheme.equals( successorURIScheme ) )
      return false;

    // both schemes are symmetrical; collapsing is safe only if both taps read the same fields
    // should be called after fields are resolved
    if( !predecessor.getSourceFields().equals( successor.getSourceFields() ) )
      return false;

    return true;
    }

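Building a map of source taps, keyed by path, from the input paths already set on a JobConf: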
    Path[] paths = FileInputFormat.getInputPaths( jobConf );

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for( Path path : paths )
      taps.put( path.toString(), new Hfs( new NullScheme(), path.toString() ) );

    return taps;
    }

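Building the sink tap map from the output path set on a JobConf: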
    {
    Map<String, Tap> taps = new HashMap<String, Tap>();

    String path = FileOutputFormat.getOutputPath( jobConf ).toString();

    taps.put( path, new Hfs( new NullScheme(), path, deleteSinkOnInit ) );

    return taps;
    }

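Wrapping a prepared JobConf in a MapReduceFlow and using Hfs taps to validate the input and output tuple counts: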
    String outputPath = getOutputPath( "flowTest" );
    FileOutputFormat.setOutputPath( conf, new Path( outputPath ) );

    Flow flow = new MapReduceFlow( "mrflow", conf, true );

    validateLength( new Hfs( new TextLine(), inputFileApache ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

    flow.complete();

    validateLength( new Hfs( new TextLine(), outputPath ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );
    }

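Chaining two Cascading flows and three identity MapReduce jobs into a single Cascade; the flows are passed to the connector out of order and the Cascade schedules them by their dependencies: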
  public void testCascade() throws IOException
    {
    getPlatform().copyFromLocal( inputFileApache );

    // Set up two standard Cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), remove( inputFileApache, false ) );
    String sinkPath4 = getOutputPath( "flow4" );
    Tap sink1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), remove( sinkPath4, true ), true );
    Flow firstFlow = getPlatform().getFlowConnector( getProperties() ).connect( source1, sink1, new Pipe( "first-flow" ) );

    String sinkPath5 = getOutputPath( "flow5" );
    Tap sink2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), remove( sinkPath5, true ), true );
    Flow secondFlow = getPlatform().getFlowConnector( getProperties() ).connect( sink1, sink2, new Pipe( "second-flow" ) );

    JobConf defaultConf = HadoopPlanner.createJobConf( getProperties() );

    JobConf firstConf = new JobConf( defaultConf );
    firstConf.setJobName( "first-mr" );

    firstConf.setOutputKeyClass( LongWritable.class );
    firstConf.setOutputValueClass( Text.class );

    firstConf.setMapperClass( IdentityMapper.class );
    firstConf.setReducerClass( IdentityReducer.class );

    firstConf.setInputFormat( TextInputFormat.class );
    firstConf.setOutputFormat( TextOutputFormat.class );

    FileInputFormat.setInputPaths( firstConf, new Path( remove( sinkPath5, true ) ) );
    String sinkPath1 = getOutputPath( "flow1" );
    FileOutputFormat.setOutputPath( firstConf, new Path( remove( sinkPath1, true ) ) );

    Flow firstMR = new MapReduceFlow( firstConf, true );

    JobConf secondConf = new JobConf( defaultConf );
    secondConf.setJobName( "second-mr" );

    secondConf.setOutputKeyClass( LongWritable.class );
    secondConf.setOutputValueClass( Text.class );

    secondConf.setMapperClass( IdentityMapper.class );
    secondConf.setReducerClass( IdentityReducer.class );

    secondConf.setInputFormat( TextInputFormat.class );
    secondConf.setOutputFormat( TextOutputFormat.class );

    FileInputFormat.setInputPaths( secondConf, new Path( remove( sinkPath1, true ) ) );
    String sinkPath2 = getOutputPath( "flow2" );
    FileOutputFormat.setOutputPath( secondConf, new Path( remove( sinkPath2, true ) ) );

    Flow secondMR = new MapReduceFlow( secondConf, true );

    JobConf thirdConf = new JobConf( defaultConf );
    thirdConf.setJobName( "third-mr" );

    thirdConf.setOutputKeyClass( LongWritable.class );
    thirdConf.setOutputValueClass( Text.class );

    thirdConf.setMapperClass( IdentityMapper.class );
    thirdConf.setReducerClass( IdentityReducer.class );

    thirdConf.setInputFormat( TextInputFormat.class );
    thirdConf.setOutputFormat( TextOutputFormat.class );

    FileInputFormat.setInputPaths( thirdConf, new Path( remove( sinkPath2, true ) ) );
    String sinkPath3 = getOutputPath( "flow3" );
    FileOutputFormat.setOutputPath( thirdConf, new Path( remove( sinkPath3, true ) ) );

    Flow thirdMR = new MapReduceFlow( thirdConf, true );

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect( firstFlow, secondFlow, thirdMR, firstMR, secondMR );

    cascade.complete();

    validateLength( new Hfs( new TextLine(), sinkPath3 ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );
    }

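Planner test: two GroupBy branches off the same source pipe, merged into a tail GroupBy, should plan into three flow steps: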
  public void testSameSourceForBranch() throws IOException
    {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    sources.put( "a", new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" ) );

    Pipe pipeA = new Pipe( "a" );

    Pipe group1 = new GroupBy( "a1", pipeA, Fields.FIRST );
    Pipe group2 = new GroupBy( "a2", pipeA, Fields.FIRST );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    assertEquals( "not equal: steps.size()", 3, flow.getFlowSteps().size() );
    }

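Planner test: the same Hfs tap instance bound to two different pipe names should still plan into three flow steps: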
  public void testSameTaps() throws IOException
    {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    Hfs tap = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", tap );
    sources.put( "b", tap );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

    assertEquals( "not equal: steps.size()", 3, flow.getFlowSteps().size() );
    }

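Planner test: a pipe assembly where the head pipe "b" has no source tap bound should be rejected by the planner: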
  public void testDanglingHead() throws IOException
    {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    Hfs source = new Hfs( new TextLine( new Fields( "first", "second" ) ), "input/path/a" );
    sources.put( "a", source );

    Pipe pipeA = new Pipe( "a" );
    Pipe pipeB = new Pipe( "b" );

    Pipe group1 = new GroupBy( pipeA );
    Pipe group2 = new GroupBy( pipeB );

    Pipe merge = new GroupBy( "tail", Pipe.pipes( group1, group2 ), new Fields( "first", "second" ) );

    sinks.put( merge.getName(), new Hfs( new TextLine(), "output/path" ) );

    try
      {
      Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );
      fail( "did not catch missing source tap" );
      }
    catch( PlannerException exception )
      {
      // expected, the head pipe "b" has no source tap bound to it
      }
    }
