Package eu.stratosphere.api.java

Examples of eu.stratosphere.api.java.ExecutionEnvironment


   
    if(!parseParameters(args)) {
      return;
    }

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data
    DataSet<Lineitem> li = getLineitemDataSet(env);
    DataSet<Order> or = getOrdersDataSet(env);
    DataSet<Customer> cust = getCustomerDataSet(env);
   
    // Filter market segment "AUTOMOBILE"
    cust = cust.filter(
              new FilterFunction<Customer>() {
                @Override
                public boolean filter(Customer value) {
                  return value.getMktsegment().equals("AUTOMOBILE");
                }
              });

    // Filter all Orders with o_orderdate < 12.03.1995
    or = or.filter(
            new FilterFunction<Order>() {
              private DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
              private Date date;
             
             
                Calendar cal = Calendar.getInstance();
                cal.set(1995, 3, 12);
                date = cal.getTime();
              }
             
              @Override
              public boolean filter(Order value) throws ParseException {
                Date orderDate = format.parse(value.getOrderdate());
                return orderDate.before(date);
              }
            });
   
    // Filter all Lineitems with l_shipdate > 12.03.1995
    li = li.filter(
            new FilterFunction<Lineitem>() {
              private DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
              private Date date;
             
              {
                Calendar cal = Calendar.getInstance();
                cal.set(1995, 3, 12);
                date = cal.getTime();
              }
             
              @Override
              public boolean filter(Lineitem value) throws ParseException {
                Date shipDate = format.parse(value.getShipdate());
                return shipDate.after(date);
              }
            });

    // Join customers with orders and package them into a ShippingPriorityItem
    DataSet<ShippingPriorityItem> customerWithOrders =
        cust.join(or)
          .where(0)
          .equalTo(0)
          .with(
              new JoinFunction<Customer, Order, ShippingPriorityItem>() {
                @Override
                public ShippingPriorityItem join(Customer first, Order second) {
                  return new ShippingPriorityItem(0, 0.0, second.getOrderdate(),
                      second.getShippriority(), second.getOrderkey());
                }
              });
   
    // Join the last join result with Lineitems
    DataSet<ShippingPriorityItem> joined =
        customerWithOrders.join(li)
                  .where(4)
                  .equalTo(0)
                  .with(
                      new JoinFunction<ShippingPriorityItem, Lineitem, ShippingPriorityItem>() {
                        @Override
                        public ShippingPriorityItem join(ShippingPriorityItem first, Lineitem second) {
                          first.setL_Orderkey(second.getOrderkey());
                          first.setRevenue(second.getExtendedprice() * (1 - second.getDiscount()));
                          return first;
                        }
                      });
   
    // Group by l_orderkey, o_orderdate and o_shippriority and compute revenue sum
    joined = joined
        .groupBy(0, 2, 3)
        .aggregate(Aggregations.SUM, 1);
   
    // emit result
    joined.writeAsCsv(outputPath, "\n", "|");
   
    // execute program
    env.execute("TPCH Query 3 Example");
   
  }
View Full Code Here


    if(!parseParameters(args)) {
      return;
    }
   
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   
    // read input data
    DataSet<Edge> edges = getEdgeDataSet(env);
   
    // annotate edges with degrees
    DataSet<EdgeWithDegrees> edgesWithDegrees = edges
        .flatMap(new EdgeDuplicator())
        .groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new DegreeCounter())
        .groupBy(EdgeWithDegrees.V1,EdgeWithDegrees.V2).reduce(new DegreeJoiner());
   
    // project edges by degrees
    DataSet<Edge> edgesByDegree = edgesWithDegrees
        .map(new EdgeByDegreeProjector());
    // project edges by vertex id
    DataSet<Edge> edgesById = edgesByDegree
        .map(new EdgeByIdProjector());
   
    DataSet<Triad> triangles = edgesByDegree
        // build triads
        .groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new TriadBuilder())
        // filter triads
        .join(edgesById).where(Triad.V2,Triad.V3).equalTo(Edge.V1,Edge.V2).with(new TriadFilter());

    // emit result
    if(fileOutput) {
      triangles.writeAsCsv(outputPath, "\n", ",");
    } else {
      triangles.print();
    }
   
    // execute program
    env.execute("Triangle Enumeration Example");
   
  }
View Full Code Here

   
    if(!parseParameters(args)) {
      return;
    }

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data
    DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env);
    DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env);
    DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env);
   
    // Retain documents with keywords
    DataSet<Tuple1<String>> filterDocs = documents
        .filter(new FilterDocByKeyWords())
        .project(0).types(String.class);

    // Filter ranks by minimum rank
    DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks
        .filter(new FilterByRank());

    // Filter visits by visit date
    DataSet<Tuple1<String>> filterVisits = visits
        .filter(new FilterVisitsByDate())
        .project(0).types(String.class);

    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
        filterDocs.join(filterRanks)
              .where(0).equalTo(1)
              .projectSecond(0,1,2)
              .types(Integer.class, String.class, Integer.class);

    // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
    DataSet<Tuple3<Integer, String, Integer>> result =
        joinDocsRanks.coGroup(filterVisits)
                .where(1).equalTo(0)
                .with(new AntiJoinVisits());

    // emit result
    if(fileOutput) {
      result.writeAsCsv(outputPath, "\n", "|");
    } else {
      result.print();
    }

    // execute program
    env.execute("WebLogAnalysis Example");
   
  }
View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
 
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   
    // get input data
    DataSet<Point> points = getPointDataSet(env);
    DataSet<Centroid> centroids = getCentroidDataSet(env);
   
    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(numIterations);
   
    DataSet<Centroid> newCentroids = points
      // compute closest centroid for each point
      .map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
      // count and sum point coordinates for each centroid
      .map(new CountAppender())
      .groupBy(0).reduce(new CentroidAccumulator())
      // compute new centroids from point counts and coordinate sums
      .map(new CentroidAverager());
   
    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);
   
    DataSet<Tuple2<Integer, Point>> clusteredPoints = points
        // assign points to final clusters
        .map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");
   
    // emit result
    if(fileOutput) {
      clusteredPoints.writeAsCsv(outputPath, "\n", " ");
    } else {
      clusteredPoints.print();
    }

    // execute program
    env.execute("KMeans Example");
   
  }
View Full Code Here

   
    if(!parseParameters(args)) {
      return;
    }
   
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get orders data set: (orderkey, orderstatus, orderdate, orderpriority, shippriority)
    DataSet<Tuple5<Integer, String, String, String, Integer>> orders = getOrdersDataSet(env);

    // get lineitem data set: (orderkey, extendedprice)
    DataSet<Tuple2<Integer, Double>> lineitems = getLineitemDataSet(env);

    // orders filtered by status, year and order priority: (orderkey, shippriority)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
        // filter orders
        orders.filter(
                new FilterFunction<Tuple5<Integer, String, String, String, Integer>>() {
                  @Override
                  public boolean filter(Tuple5<Integer, String, String, String, Integer> t) {
                    // status filter
                    if(!t.f1.equals(STATUS_FILTER)) {
                      return false;
                    // year filter
                    } else if(Integer.parseInt(t.f2.substring(0, 4)) <= YEAR_FILTER) {
                      return false;
                    // order priority filter
                    } else if(!t.f3.startsWith(OPRIO_FILTER)) {
                      return false;
                    }
                    return true;
                  }
                })
        // project fields out that are no longer required
        .project(0,4).types(Integer.class, Integer.class);

    // join orders with lineitems: (orderkey, shippriority, extendedprice)
    DataSet<Tuple3<Integer, Integer, Double>> lineitemsOfOrders =
        ordersFilteredByYear.joinWithHuge(lineitems)
                  .where(0).equalTo(0)
                  .projectFirst(0,1).projectSecond(1)
                  .types(Integer.class, Integer.class, Double.class);

    // extendedprice sums: (orderkey, shippriority, sum(extendedprice))
    DataSet<Tuple3<Integer, Integer, Double>> priceSums =
        // group by order and sum extendedprice
        lineitemsOfOrders.groupBy(0,1).aggregate(Aggregations.SUM, 2);

    // emit result
    priceSums.writeAsCsv(outputPath);
   
    // execute program
    env.execute("Relational Query Example");
   
  }
View Full Code Here

  public void testUnionNewApiAssembly() {
    final int NUM_INPUTS = 4;
   
    // construct the plan: it consists of multiple flat maps, all unioned,
    // and the "unioned" dataSet will be grouped
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   
    DataSet<String> source = env.readTextFile(IN_FILE);
    DataSet<Tuple2<String, Integer>> lastUnion = source.flatMap(new DummyFlatMap());
 
    for (int i = 1; i< NUM_INPUTS; i++){
      lastUnion = lastUnion.union(source.flatMap(new DummyFlatMap()));
    }
   
    DataSet<Tuple2<String, Integer>> result = lastUnion.groupBy(0).aggregate(Aggregations.SUM, 1);
    result.writeAsText(OUT_FILE);
 
    // return the plan
    Plan plan = env.createProgramPlan("Test union on new java-api");
    OptimizedPlan oPlan = compileNoStats(plan);
    NepheleJobGraphGenerator jobGen = new NepheleJobGraphGenerator();
   
    // Compile plan to verify that no error is thrown
    jobGen.compileJobGraph(oPlan);
View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
   
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   
    // read vertex and edge data
    DataSet<Long> vertices = getVertexDataSet(env);
    DataSet<Tuple2<Long, Long>> edges = getEdgeDataSet(env).flatMap(new UndirectEdge());
   
    // assign the initial components (equal to the vertex id)
    DataSet<Tuple2<Long, Long>> verticesWithInitialId = vertices.map(new DuplicateValue<Long>());
       
    // open a delta iteration
    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
        verticesWithInitialId.iterateDelta(verticesWithInitialId, maxIterations, 0);
   
    // apply the step logic: join with the edges, select the minimum neighbor, update if the component of the candidate is smaller
    DataSet<Tuple2<Long, Long>> changes = iteration.getWorkset().join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin())
        .groupBy(0).aggregate(Aggregations.MIN, 1)
        .join(iteration.getSolutionSet()).where(0).equalTo(0)
        .with(new ComponentIdFilter());

    // close the delta iteration (delta and new workset are identical)
    DataSet<Tuple2<Long, Long>> result = iteration.closeWith(changes, changes);
   
    // emit result
    if(fileOutput) {
      result.writeAsCsv(outputPath, "\n", " ");
    } else {
      result.print();
    }
       
    // execute program
    env.execute("Connected Components Example");
       
  }
View Full Code Here

   
    if(!parseParameters(args)) {
      return;
    }
   
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get customer data set: (custkey, name, address, nationkey, acctbal)
    DataSet<Tuple5<Integer, String, String, Integer, Double>> customers = getCustomerDataSet(env);

    // get orders data set: (orderkey, custkey, orderdate)
    DataSet<Tuple3<Integer, Integer, String>> orders = getOrdersDataSet(env);

    // get lineitem data set: (orderkey, extendedprice, discount, returnflag)
    DataSet<Tuple4<Integer, Double, Double, String>> lineitems = getLineitemDataSet(env);

    // get nation data set: (nationkey, name)
    DataSet<Tuple2<Integer, String>> nations = getNationsDataSet(env);

    // orders filtered by year: (orderkey, custkey)
    DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
        // filter by year
        orders.filter(
                new FilterFunction<Tuple3<Integer,Integer, String>>() {
                  @Override
                  public boolean filter(Tuple3<Integer, Integer, String> t) {
                    int year = Integer.parseInt(t.f2.substring(0, 4));
                    return year > 1990;
                  }
                })
        // project fields out that are no longer required
        .project(0,1).types(Integer.class, Integer.class);

    // lineitems filtered by flag: (orderkey, extendedprice, discount)
    DataSet<Tuple3<Integer, Double, Double>> lineitemsFilteredByFlag =
        // filter by flag
        lineitems.filter(new FilterFunction<Tuple4<Integer, Double, Double, String>>() {
                    @Override
                    public boolean filter(Tuple4<Integer, Double, Double, String> t)
                        throws Exception {
                      return t.f3.equals("R");
                    }
                })
        // project fields out that are no longer required
        .project(0,1,2).types(Integer.class, Double.class, Double.class);

    // join orders with lineitems: (custkey, extendedprice, discount)
    DataSet<Tuple3<Integer, Double, Double>> lineitemsOfCustomerKey =
        ordersFilteredByYear.joinWithHuge(lineitemsFilteredByFlag)
                  .where(0).equalTo(0)
                  .projectFirst(1).projectSecond(1,2)
                  .types(Integer.class, Double.class, Double.class);

    // aggregate for revenue: (custkey, revenue)
    DataSet<Tuple2<Integer, Double>> revenueOfCustomerKey = lineitemsOfCustomerKey
        // calculate the revenue for each item
        .map(new MapFunction<Tuple3<Integer, Double, Double>, Tuple2<Integer, Double>>() {
              @Override
              public Tuple2<Integer, Double> map(Tuple3<Integer, Double, Double> t) {
                // revenue per item = l_extendedprice * (1 - l_discount)
                return new Tuple2<Integer, Double>(t.f0, t.f1 * (1 - t.f2));
              }
          })
        // aggregate the revenues per item to revenue per customer
        .groupBy(0).aggregate(Aggregations.SUM, 1);

    // join customer with nation (custkey, name, address, nationname, acctbal)
    DataSet<Tuple5<Integer, String, String, String, Double>> customerWithNation = customers
            .joinWithTiny(nations)
            .where(3).equalTo(0)
            .projectFirst(0,1,2).projectSecond(1).projectFirst(4)
            .types(Integer.class, String.class, String.class, String.class, Double.class);

    // join customer (with nation) with revenue (custkey, name, address, nationname, acctbal, revenue)
    DataSet<Tuple6<Integer, String, String, String, Double, Double>> customerWithRevenue =
        customerWithNation.join(revenueOfCustomerKey)
        .where(0).equalTo(0)
        .projectFirst(0,1,2,3,4).projectSecond(1)
        .types(Integer.class, String.class, String.class, String.class, Double.class, Double.class);

    // emit result
    customerWithRevenue.writeAsCsv(outputPath);
   
    // execute program
    env.execute("TPCH Query 10 Example");
   
  }
View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
   
    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   
    // get input data
    DataSet<Tuple1<Long>> pagesInput = getPagesDataSet(env);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env);
   
    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
        map(new RankAssigner((1.0d / numPages)));
   
    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
        linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());
   
    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);
   
    DataSet<Tuple2<Long, Double>> newRanks = iteration
        // join pages with outgoing edges and distribute rank
        .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
        // collect and sum ranks
        .groupBy(0).aggregate(SUM, 1)
        // apply dampening factor
        .map(new Dampener(DAMPENING_FACTOR, numPages));
   
    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
        newRanks,
        newRanks.join(iteration).where(0).equalTo(0)
        // termination condition
        .filter(new EpsilonFilter()));

    // emit result
    if(fileOutput) {
      finalPageRanks.writeAsCsv(outputPath, "\n", " ");
    } else {
      finalPageRanks.print();
    }

    // execute program
    env.execute("Basic Page Rank Example");
   
  }
View Full Code Here

    if(!parseParameters(args)) {
      return;
    }
   
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   
    // get input data
    DataSet<String> text = getTextDataSet(env);
   
    DataSet<Tuple2<String, Integer>> counts =
        // split up the lines in pairs (2-tuples) containing: (word,1)
        text.flatMap(new Tokenizer())
        // group by the tuple field "0" and sum up tuple field "1"
        .groupBy(0)
        .aggregate(Aggregations.SUM, 1);

    // emit result
    if(fileOutput) {
      counts.writeAsCsv(outputPath, "\n", " ");
    } else {
      counts.print();
    }
   
    // execute program
    env.execute("WordCount Example");
  }
View Full Code Here

TOP

Related Classes of eu.stratosphere.api.java.ExecutionEnvironment

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.