Examples of eu.stratosphere.api.java.ExecutionEnvironment

eu.stratosphere.api.java.ExecutionEnvironment
The ExecutionEnvironment is the context in which a program is executed. A {@link LocalEnvironment} will cause execution in the current JVM, a {@link RemoteEnvironment} will cause execution on a remote setup.
The environment provides methods to control the job execution (such as setting the parallelism) and to interact with the outside world (data access).
Please note that the execution environment needs strong type information for the input and return types of all operations that are executed. This means that the environment needs to know that the return value of an operation is, for example, a Tuple of String and Integer. Because the Java compiler throws much of the generic type information away, most methods attempt to re-obtain that information using reflection. In certain cases, it may be necessary to manually supply that information to some of the methods. @see LocalEnvironment @see RemoteEnvironment

    
    if(!parseParameters(args)) {
      return;
    }


    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();


    // get input data
    DataSet<Lineitem> li = getLineitemDataSet(env);
    DataSet<Order> or = getOrdersDataSet(env);
    DataSet<Customer> cust = getCustomerDataSet(env);
    
    // Filter market segment "AUTOMOBILE"
    cust = cust.filter(
              new FilterFunction<Customer>() {
                @Override
                public boolean filter(Customer value) {
                  return value.getMktsegment().equals("AUTOMOBILE");
                }
              });


    // Filter all Orders with o_orderdate < 12.03.1995
    or = or.filter(
            new FilterFunction<Order>() {
              private DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
              private Date date;
              
              {  
                Calendar cal = Calendar.getInstance();
                cal.set(1995, 3, 12);
                date = cal.getTime(); 
              }
              
              @Override
              public boolean filter(Order value) throws ParseException {
                Date orderDate = format.parse(value.getOrderdate());
                return orderDate.before(date);
              }
            });
    
    // Filter all Lineitems with l_shipdate > 12.03.1995
    li = li.filter(
            new FilterFunction<Lineitem>() {
              private DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
              private Date date;
              
              {
                Calendar cal = Calendar.getInstance();
                cal.set(1995, 3, 12);
                date = cal.getTime();
              }
              
              @Override
              public boolean filter(Lineitem value) throws ParseException {
                Date shipDate = format.parse(value.getShipdate());
                return shipDate.after(date);
              }
            });


    // Join customers with orders and package them into a ShippingPriorityItem
    DataSet<ShippingPriorityItem> customerWithOrders = 
        cust.join(or)
          .where(0)
          .equalTo(0)
          .with(
              new JoinFunction<Customer, Order, ShippingPriorityItem>() {
                @Override
                public ShippingPriorityItem join(Customer first, Order second) {
                  return new ShippingPriorityItem(0, 0.0, second.getOrderdate(),
                      second.getShippriority(), second.getOrderkey());
                }
              });
    
    // Join the last join result with Lineitems
    DataSet<ShippingPriorityItem> joined = 
        customerWithOrders.join(li)
                  .where(4)
                  .equalTo(0)
                  .with(
                      new JoinFunction<ShippingPriorityItem, Lineitem, ShippingPriorityItem>() {
                        @Override
                        public ShippingPriorityItem join(ShippingPriorityItem first, Lineitem second) {
                          first.setL_Orderkey(second.getOrderkey());
                          first.setRevenue(second.getExtendedprice() * (1 - second.getDiscount()));
                          return first;
                        }
                      });
    
    // Group by l_orderkey, o_orderdate and o_shippriority and compute revenue sum
    joined = joined
        .groupBy(0, 2, 3)
        .aggregate(Aggregations.SUM, 1);
    
    // emit result
    joined.writeAsCsv(outputPath, "\n", "|");
    
    // execute program
    env.execute("TPCH Query 3 Example");
    
  }

View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
    
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
    // read input data
    DataSet<Edge> edges = getEdgeDataSet(env);
    
    // annotate edges with degrees
    DataSet<EdgeWithDegrees> edgesWithDegrees = edges
        .flatMap(new EdgeDuplicator())
        .groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new DegreeCounter())
        .groupBy(EdgeWithDegrees.V1,EdgeWithDegrees.V2).reduce(new DegreeJoiner());
    
    // project edges by degrees
    DataSet<Edge> edgesByDegree = edgesWithDegrees
        .map(new EdgeByDegreeProjector());
    // project edges by vertex id
    DataSet<Edge> edgesById = edgesByDegree
        .map(new EdgeByIdProjector());
    
    DataSet<Triad> triangles = edgesByDegree
        // build triads
        .groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new TriadBuilder())
        // filter triads
        .join(edgesById).where(Triad.V2,Triad.V3).equalTo(Edge.V1,Edge.V2).with(new TriadFilter());


    // emit result
    if(fileOutput) {
      triangles.writeAsCsv(outputPath, "\n", ",");
    } else {
      triangles.print();
    }
    
    // execute program
    env.execute("Triangle Enumeration Example");
    
  }

View Full Code Here

    
    if(!parseParameters(args)) {
      return;
    }


    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();


    // get input data
    DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env);
    DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env);
    DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env);
    
    // Retain documents with keywords
    DataSet<Tuple1<String>> filterDocs = documents
        .filter(new FilterDocByKeyWords())
        .project(0).types(String.class);


    // Filter ranks by minimum rank
    DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks
        .filter(new FilterByRank());


    // Filter visits by visit date
    DataSet<Tuple1<String>> filterVisits = visits
        .filter(new FilterVisitsByDate())
        .project(0).types(String.class);


    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks = 
        filterDocs.join(filterRanks)
              .where(0).equalTo(1)
              .projectSecond(0,1,2)
              .types(Integer.class, String.class, Integer.class);


    // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
    DataSet<Tuple3<Integer, String, Integer>> result = 
        joinDocsRanks.coGroup(filterVisits)
                .where(1).equalTo(0)
                .with(new AntiJoinVisits());


    // emit result
    if(fileOutput) {
      result.writeAsCsv(outputPath, "\n", "|");
    } else {
      result.print();
    }


    // execute program
    env.execute("WebLogAnalysis Example");
    
  }

View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
  
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
    // get input data
    DataSet<Point> points = getPointDataSet(env);
    DataSet<Centroid> centroids = getCentroidDataSet(env);
    
    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(numIterations);
    
    DataSet<Centroid> newCentroids = points
      // compute closest centroid for each point
      .map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
      // count and sum point coordinates for each centroid
      .map(new CountAppender())
      .groupBy(0).reduce(new CentroidAccumulator())
      // compute new centroids from point counts and coordinate sums
      .map(new CentroidAverager());
    
    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);
    
    DataSet<Tuple2<Integer, Point>> clusteredPoints = points
        // assign points to final clusters
        .map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");
    
    // emit result
    if(fileOutput) {
      clusteredPoints.writeAsCsv(outputPath, "\n", " ");
    } else {
      clusteredPoints.print();
    }


    // execute program
    env.execute("KMeans Example");
    
  }

View Full Code Here

    
    if(!parseParameters(args)) {
      return;
    }
    
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();


    // get orders data set: (orderkey, orderstatus, orderdate, orderpriority, shippriority)
    DataSet<Tuple5<Integer, String, String, String, Integer>> orders = getOrdersDataSet(env);


    // get lineitem data set: (orderkey, extendedprice)
    DataSet<Tuple2<Integer, Double>> lineitems = getLineitemDataSet(env);


    // orders filtered by year: (orderkey, custkey)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
        // filter orders
        orders.filter(
                new FilterFunction<Tuple5<Integer, String, String, String, Integer>>() {
                  @Override
                  public boolean filter(Tuple5<Integer, String, String, String, Integer> t) {
                    // status filter
                    if(!t.f1.equals(STATUS_FILTER)) {
                      return false;
                    // year filter
                    } else if(Integer.parseInt(t.f2.substring(0, 4)) <= YEAR_FILTER) {
                      return false;
                    // order priority filter
                    } else if(!t.f3.startsWith(OPRIO_FILTER)) {
                      return false;
                    }
                    return true;
                  }
                })
        // project fields out that are no longer required
        .project(0,4).types(Integer.class, Integer.class);


    // join orders with lineitems: (orderkey, shippriority, extendedprice)
    DataSet<Tuple3<Integer, Integer, Double>> lineitemsOfOrders = 
        ordersFilteredByYear.joinWithHuge(lineitems)
                  .where(0).equalTo(0)
                  .projectFirst(0,1).projectSecond(1)
                  .types(Integer.class, Integer.class, Double.class);


    // extendedprice sums: (orderkey, shippriority, sum(extendedprice))
    DataSet<Tuple3<Integer, Integer, Double>> priceSums =
        // group by order and sum extendedprice
        lineitemsOfOrders.groupBy(0,1).aggregate(Aggregations.SUM, 2);


    // emit result
    priceSums.writeAsCsv(outputPath);
    
    // execute program
    env.execute("Relational Query Example");
    
  }

View Full Code Here

  public void testUnionNewApiAssembly() {
    final int NUM_INPUTS = 4;
    
    // construct the plan: it consists of multiple flat maps that are all unioned,
    // and the resulting "unioned" dataSet is then grouped
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
    DataSet<String> source = env.readTextFile(IN_FILE);
    DataSet<Tuple2<String, Integer>> lastUnion = source.flatMap(new DummyFlatMap());
  
    for (int i = 1; i< NUM_INPUTS; i++){
      lastUnion = lastUnion.union(source.flatMap(new DummyFlatMap()));
    }
    
    DataSet<Tuple2<String, Integer>> result = lastUnion.groupBy(0).aggregate(Aggregations.SUM, 1);
    result.writeAsText(OUT_FILE);
  
    // return the plan
    Plan plan = env.createProgramPlan("Test union on new java-api");
    OptimizedPlan oPlan = compileNoStats(plan);
    NepheleJobGraphGenerator jobGen = new NepheleJobGraphGenerator();
    
    // Compile plan to verify that no error is thrown
    jobGen.compileJobGraph(oPlan);

View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
    
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
    // read vertex and edge data
    DataSet<Long> vertices = getVertexDataSet(env);
    DataSet<Tuple2<Long, Long>> edges = getEdgeDataSet(env).flatMap(new UndirectEdge());
    
    // assign the initial components (equal to the vertex id)
    DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>());
        
    // open a delta iteration
    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
        verticesWithInitialId.iterateDelta(verticesWithInitialId, maxIterations, 0);
    
    // apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller
    DataSet<Tuple2<Long, Long>> changes = iteration.getWorkset().join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin())
        .groupBy(0).aggregate(Aggregations.MIN, 1)
        .join(iteration.getSolutionSet()).where(0).equalTo(0)
        .with(new ComponentIdFilter());


    // close the delta iteration (delta and new workset are identical)
    DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes);
    
    // emit result
    if(fileOutput) {
      result.writeAsCsv(outputPath, "\n", " ");
    } else {
      result.print();
    }
        
    // execute program
    env.execute("Connected Components Example");
        
  }

View Full Code Here

    
    if(!parseParameters(args)) {
      return;
    }
    
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();


    // get customer data set: (custkey, name, address, nationkey, acctbal) 
    DataSet<Tuple5<Integer, String, String, Integer, Double>> customers = getCustomerDataSet(env);


    // get orders data set: (orderkey, custkey, orderdate)
    DataSet<Tuple3<Integer, Integer, String>> orders = getOrdersDataSet(env);


    // get lineitem data set: (orderkey, extendedprice, discount, returnflag)
    DataSet<Tuple4<Integer, Double, Double, String>> lineitems = getLineitemDataSet(env);


    // get nation data set: (nationkey, name)
    DataSet<Tuple2<Integer, String>> nations = getNationsDataSet(env);


    // orders filtered by year: (orderkey, custkey)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
        // filter by year
        orders.filter(
                new FilterFunction<Tuple3<Integer,Integer, String>>() {
                  @Override
                  public boolean filter(Tuple3<Integer, Integer, String> t) {
                    int year = Integer.parseInt(t.f2.substring(0, 4));
                    return year > 1990;
                  }
                })
        // project fields out that are no longer required
        .project(0,1).types(Integer.class, Integer.class);


    // lineitems filtered by flag: (orderkey, extendedprice, discount)
    DataSet<Tuple3<Integer, Double, Double>> lineitemsFilteredByFlag = 
        // filter by flag
        lineitems.filter(new FilterFunction<Tuple4<Integer, Double, Double, String>>() {
                    @Override
                    public boolean filter(Tuple4<Integer, Double, Double, String> t)
                        throws Exception {
                      return t.f3.equals("R");
                    }
                })
        // project fields out that are no longer required
        .project(0,1,2).types(Integer.class, Double.class, Double.class);


    // join orders with lineitems: (custkey, extendedprice, discount)
    DataSet<Tuple3<Integer, Double, Double>> lineitemsOfCustomerKey = 
        ordersFilteredByYear.joinWithHuge(lineitemsFilteredByFlag)
                  .where(0).equalTo(0)
                  .projectFirst(1).projectSecond(1,2)
                  .types(Integer.class, Double.class, Double.class);


    // aggregate for revenue: (custkey, revenue)
    DataSet<Tuple2<Integer, Double>> revenueOfCustomerKey = lineitemsOfCustomerKey
        // calculate the revenue for each item
        .map(new MapFunction<Tuple3<Integer, Double, Double>, Tuple2<Integer, Double>>() {
              @Override
              public Tuple2<Integer, Double> map(Tuple3<Integer, Double, Double> t) {
                // revenue per item = l_extendedprice * (1 - l_discount)
                return new Tuple2<Integer, Double>(t.f0, t.f1 * (1 - t.f2));
              }
          })
        // aggregate the revenues per item to revenue per customer
        .groupBy(0).aggregate(Aggregations.SUM, 1);


    // join customer with nation (custkey, name, address, nationname, acctbal)
    DataSet<Tuple5<Integer, String, String, String, Double>> customerWithNation = customers
            .joinWithTiny(nations)
            .where(3).equalTo(0)
            .projectFirst(0,1,2).projectSecond(1).projectFirst(4)
            .types(Integer.class, String.class, String.class, String.class, Double.class);


    // join customer (with nation) with revenue (custkey, name, address, nationname, acctbal, revenue)
    DataSet<Tuple6<Integer, String, String, String, Double, Double>> customerWithRevenue = 
        customerWithNation.join(revenueOfCustomerKey)
        .where(0).equalTo(0)
        .projectFirst(0,1,2,3,4).projectSecond(1)
        .types(Integer.class, String.class, String.class, String.class, Double.class, Double.class);


    // emit result
    customerWithRevenue.writeAsCsv(outputPath);
    
    // execute program
    env.execute("TPCH Query 10 Example");
    
  }

View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
    
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
    // get input data
    DataSet<Tuple1<Long>> pagesInput = getPagesDataSet(env);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env);
    
    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
        map(new RankAssigner((1.0d / numPages)));
    
    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput = 
        linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());
    
    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);
    
    DataSet<Tuple2<Long, Double>> newRanks = iteration
        // join pages with outgoing edges and distribute rank
        .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
        // collect and sum ranks
        .groupBy(0).aggregate(SUM, 1)
        // apply dampening factor
        .map(new Dampener(DAMPENING_FACTOR, numPages));
    
    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
        newRanks, 
        newRanks.join(iteration).where(0).equalTo(0)
        // termination condition
        .filter(new EpsilonFilter()));


    // emit result
    if(fileOutput) {
      finalPageRanks.writeAsCsv(outputPath, "\n", " ");
    } else {
      finalPageRanks.print();
    }


    // execute program
    env.execute("Basic Page Rank Example");
    
  }

View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
    
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
    // get input data
    DataSet<String> text = getTextDataSet(env);
    
    DataSet<Tuple2<String, Integer>> counts = 
        // split up the lines in pairs (2-tuples) containing: (word,1)
        text.flatMap(new Tokenizer())
        // group by the tuple field "0" and sum up tuple field "1"
        .groupBy(0)
        .aggregate(Aggregations.SUM, 1);


    // emit result
    if(fileOutput) {
      counts.writeAsCsv(outputPath, "\n", " ");
    } else {
      counts.print();
    }
    
    // execute program
    env.execute("WordCount Example");
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of eu.stratosphere.api.java.ExecutionEnvironment

$.Job

$.WordCountJob

eu.stratosphere.api.avro.testjar.AvroExternalJarProgram

eu.stratosphere.api.java.functions.SemanticPropertiesTranslationTest

eu.stratosphere.api.java.io.CsvReader

eu.stratosphere.api.java.io.jdbc.example.JDBCExample

eu.stratosphere.api.java.io.TextInputFormat

eu.stratosphere.api.java.io.TextValueInputFormat

eu.stratosphere.api.java.operator.AggregateOperatorTest

eu.stratosphere.api.java.operator.CoGroupOperatorTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.