Package org.apache.crunch.impl.spark

Examples of org.apache.crunch.impl.spark.SparkPipeline
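
The snippets below are drawn from the Apache Crunch integration tests. As a minimal standalone sketch of the pattern they all share (file-path arguments here are placeholders), a SparkPipeline is constructed from a Spark master URL and an application name, used like any other Crunch Pipeline, and shut down with done():

  import org.apache.crunch.PCollection;
  import org.apache.crunch.Pipeline;
  import org.apache.crunch.impl.spark.SparkPipeline;

  public class SparkPipelineExample {
    public static void main(String[] args) {
      // "local" runs against an in-process Spark master; the second
      // argument becomes the Spark application name.
      Pipeline pipeline = new SparkPipeline("local", "example");

      // Read a text file into a PCollection and write it back out.
      PCollection<String> lines = pipeline.readTextFile(args[0]);
      pipeline.writeTextFile(lines, args[1]);

      // Execute the lazily built plan and release Spark resources.
      pipeline.done();
    }
  }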


  @Test
  public void testMapsideJoin_LeftOuterJoin() throws IOException {
    runMapsideLeftOuterJoin(new SparkPipeline("local", "mapside"), false);
  }

  @Test
  public void testMapsideJoin_LeftOuterJoin_Materialized() throws IOException {
    runMapsideLeftOuterJoin(new SparkPipeline("local", "mapside"), true);
  }
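
The runMapsideLeftOuterJoin helper is not shown on this page; judging by the test names, its boolean flag controls whether one side of the join is materialized first. A hedged sketch of the underlying join call (the materialize-flag constructor of MapsideJoinStrategy is an assumption about this version of Crunch):

  // Sketch: a map-side left outer join with Crunch's JoinStrategy API.
  // The boolean mirrors the tests' "materialized" flag (an assumption).
  static PTable<String, Pair<String, String>> leftOuter(
      PTable<String, String> left, PTable<String, String> right, boolean materialize) {
    JoinStrategy<String, String, String> strategy =
        new MapsideJoinStrategy<String, String, String>(materialize);
    return strategy.join(left, right, JoinType.LEFT_OUTER_JOIN);
  }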


  @Test
  public void testUnionOfGroupedOutputAndNonGroupedOutput() throws IOException {
    String inputPath = tempDir.copyResourceFileName("set1.txt");
    String inputPath2 = tempDir.copyResourceFileName("set2.txt");

    Pipeline pipeline = new SparkPipeline("local", "unionresults");

    PCollection<String> set1Lines = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<Pair<String, Long>> set1Lengths = set1Lines.parallelDo(new StringLengthMapFn(),
        Writables.pairs(Writables.strings(), Writables.longs()));
    PCollection<Pair<String, Long>> set2Counts = pipeline.read(At.textFile(inputPath2, Writables.strings())).count();

    PCollection<Pair<String, Long>> union = set1Lengths.union(set2Counts);

    Set<Pair<String, Long>> unionValues = Sets.newHashSet(union.materialize());
    assertEquals(7, unionValues.size());

    Set<Pair<String, Long>> expectedPairs = Sets.newHashSet();
    expectedPairs.add(Pair.of("b", 10L));
    expectedPairs.add(Pair.of("c", 10L));
    expectedPairs.add(Pair.of("a", 10L));
    expectedPairs.add(Pair.of("e", 10L));
    expectedPairs.add(Pair.of("a", 1L));
    expectedPairs.add(Pair.of("c", 1L));
    expectedPairs.add(Pair.of("d", 1L));

    assertEquals(expectedPairs, unionValues);

    pipeline.done();
  }
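
StringLengthMapFn is defined elsewhere in the test class. A hypothetical reconstruction consistent with the expected pairs above (every set1 line is tagged with the constant 10L):

  // Hypothetical helper matching the expected output: each input line is
  // paired with the constant 10L.
  static class StringLengthMapFn extends MapFn<String, Pair<String, Long>> {
    @Override
    public Pair<String, Long> map(String input) {
      return Pair.of(input, 10L);
    }
  }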

  @Test
  public void testMultiGroupBy() throws Exception {
    String inputPath = tempDir.copyResourceFileName("set1.txt");
    String inputPath2 = tempDir.copyResourceFileName("set2.txt");
    String output = tempDir.getFileName("output");

    Pipeline pipeline = new SparkPipeline("local", "multigroupby");

    PCollection<String> set1Lines = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<Pair<String, Long>> set1Lengths = set1Lines.parallelDo(new StringLengthMapFn(),
        Writables.pairs(Writables.strings(), Writables.longs()));
    PTable<String, Long> set2Counts = pipeline.read(At.textFile(inputPath2, Writables.strings())).count();
    PTables.asPTable(set2Counts.union(set1Lengths)).groupByKey().ungroup()
        .write(At.sequenceFile(output, Writables.strings(), Writables.longs()));
    PipelineResult res = pipeline.done();
    assertEquals(4, res.getStageResults().get(0).getCounterValue("my", "counter"));
  }
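
The final assertion reads a user counter, so some DoFn in the full test must increment it; the snippet above does not show that step. A hedged sketch of how a Crunch DoFn bumps such a counter (the fn body is an assumption; only the group and counter names come from the assertion):

  // Hypothetical DoFn incrementing the ("my", "counter") value asserted above.
  static class CountingFn extends DoFn<Pair<String, Long>, Pair<String, Long>> {
    @Override
    public void process(Pair<String, Long> input, Emitter<Pair<String, Long>> emitter) {
      increment("my", "counter");  // DoFn.increment(groupName, counterName)
      emitter.emit(input);
    }
  }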

  @Test
  public void testMultiWrite() throws Exception {
    String inputPath = tempDir.copyResourceFileName("set1.txt");
    String inputPath2 = tempDir.copyResourceFileName("set2.txt");
    String output = tempDir.getFileName("output");

    Pipeline pipeline = new SparkPipeline("local", "multiwrite");

    PCollection<String> set1Lines = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PTable<String, Long> set1Lengths = set1Lines.parallelDo(new StringLengthMapFn(),
        Writables.tableOf(Writables.strings(), Writables.longs()));
    PTable<String, Long> set2Counts = pipeline.read(At.textFile(inputPath2, Writables.strings())).count();

    TableSourceTarget<String, Long> inter = At.sequenceFile(output, Writables.strings(), Writables.longs());
    set1Lengths.write(inter);
    set2Counts.write(inter, Target.WriteMode.APPEND);

    pipeline.run();

    PTable<String, Long> in = pipeline.read(inter);
    Set<Pair<String, Long>> values = Sets.newHashSet(in.materialize());
    assertEquals(7, values.size());

    Set<Pair<String, Long>> expectedPairs = Sets.newHashSet();
    expectedPairs.add(Pair.of("b", 10L));
    expectedPairs.add(Pair.of("c", 10L));
    expectedPairs.add(Pair.of("a", 10L));
    expectedPairs.add(Pair.of("e", 10L));
    expectedPairs.add(Pair.of("a", 1L));
    expectedPairs.add(Pair.of("c", 1L));
    expectedPairs.add(Pair.of("d", 1L));

    assertEquals(expectedPairs, values);

    pipeline.done();
  }
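
Note how the second write reuses the same SequenceFile target with Target.WriteMode.APPEND, so both tables land under one output path; with the default write mode, writing to a target that already exists would fail. Crunch's Target.WriteMode also offers OVERWRITE and CHECKPOINT.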

  @Rule
  public transient TemporaryPath tmpDir = new TemporaryPath();

  @Test
  public void testWritableSortAsc() throws Exception {
    runSingle(new SparkPipeline("local", "sort"), WritableTypeFamily.getInstance(), Sort.Order.ASCENDING,
        "A\tand this text as well");
  }

  @Test
  public void testWritableSortDesc() throws Exception {
    runSingle(new SparkPipeline("local", "sort"), WritableTypeFamily.getInstance(), Sort.Order.DESCENDING,
        "B\tthis doc has some text");
  }

  @Test
  public void testWritableSortAscDesc() throws Exception {
    runPair(new SparkPipeline("local", "sort"), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING), "A",
        "this doc has this text");
  }

  @Test
  public void testWritableSortSecondDescFirstAsc() throws Exception {
    runPair(new SparkPipeline("local", "sort"), WritableTypeFamily.getInstance(), by(2, DESCENDING), by(1, ASCENDING), "A",
        "this doc has this text");
  }

  @Test
  public void testWritableSortTripleAscDescAsc() throws Exception {
    runTriple(new SparkPipeline("local", "sort"), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
        by(3, ASCENDING), "A", "this", "doc");
  }

  @Test
  public void testWritableSortQuadAscDescAscDesc() throws Exception {
    runQuad(new SparkPipeline("local", "sort"), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
        by(3, ASCENDING), by(4, DESCENDING), "A", "this", "doc", "has");
  }
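
The sort tests all delegate to helpers (runSingle, runPair, runTriple, runQuad) that are not shown here. The API they exercise is org.apache.crunch.lib.Sort, where by(column, order) builds a 1-based ColumnOrder. A minimal sketch of the two-column case:

  // Sketch of the Sort API behind runPair: order a PCollection of pairs
  // by the first column ascending, then the second descending.
  static PCollection<Pair<String, String>> sortAscDesc(PCollection<Pair<String, String>> pairs) {
    return Sort.sortPairs(pairs,
        ColumnOrder.by(1, Sort.Order.ASCENDING),
        ColumnOrder.by(2, Sort.Order.DESCENDING));
  }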