Package org.apache.crunch.impl.mr

Examples of org.apache.crunch.impl.mr.MRPipeline.readTextFile()


  @Test
  public void testAvro() throws Exception {
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
    PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
    runMinMax(shakes, AvroTypeFamily.getInstance());
    pipeline.done();
  }

  @Test
View Full Code Here


  @Test
  public void testCollectUrls() throws Exception {
    Pipeline p = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    String urlsInputPath = tmpDir.copyResourceFileName("urls.txt");
    PTable<String, Collection<String>> urls = Aggregate.collectValues(p.readTextFile(urlsInputPath).parallelDo(
        new SplitFn(), tableOf(strings(), strings())));
    for (Pair<String, Collection<String>> e : urls.materialize()) {
      String key = e.first();
      int expectedSize = 0;
      if ("www.A.com".equals(key)) {
View Full Code Here

  }

  @Test
  public void testCollectValues_Writables() throws IOException {
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    Map<Integer, Collection<Text>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
        .parallelDo(new MapStringToTextPair(), Writables.tableOf(Writables.ints(), Writables.writables(Text.class)))
        .collectValues().materializeToMap();

    assertEquals(1, collectionMap.size());
View Full Code Here

  @Test
  public void testCollectValues_Avro() throws IOException {

    MapStringToEmployeePair mapFn = new MapStringToEmployeePair();
    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
    Map<Integer, Collection<Employee>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
        .parallelDo(mapFn, Avros.tableOf(Avros.ints(), Avros.records(Employee.class))).collectValues()
        .materializeToMap();

    assertEquals(1, collectionMap.size());
View Full Code Here

  @Test
  public void testEnumPTypes() throws IOException {
    String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
    Pipeline pipeline = new MRPipeline(EnumPairIT.class);
    PCollection<String> set1 = pipeline.readTextFile(inputFile1);
    PTable<String, etypes> data = set1.parallelDo(new DoFn<String, Pair<String, etypes>>() {
      @Override
      public void process(String input, Emitter<Pair<String, etypes>> emitter) {
        emitter.emit(new Pair<String, etypes>(input, etypes.type1));
      }
View Full Code Here

        Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());

        String customersFile = tmpDir.copyResourceFileName("customers.txt");
        String ordersFile = tmpDir.copyResourceFileName("orders.txt");
        String addressesFile = tmpDir.copyResourceFileName("addresses.txt");
        PTable<String, String> customersTable = pipeline.readTextFile(customersFile)
                .parallelDo("Split customers", new StringToPairMapFn(), tableOf(strings(), strings()));
        PTable<String, String> ordersTable = pipeline.readTextFile(ordersFile)
                .parallelDo("Split orders", new StringToPairMapFn(), tableOf(strings(), strings()));

        PTable<String, String> assignedOrders = new BloomFilterJoinStrategy<String, String, String>(5)
View Full Code Here

        String customersFile = tmpDir.copyResourceFileName("customers.txt");
        String ordersFile = tmpDir.copyResourceFileName("orders.txt");
        String addressesFile = tmpDir.copyResourceFileName("addresses.txt");
        PTable<String, String> customersTable = pipeline.readTextFile(customersFile)
                .parallelDo("Split customers", new StringToPairMapFn(), tableOf(strings(), strings()));
        PTable<String, String> ordersTable = pipeline.readTextFile(ordersFile)
                .parallelDo("Split orders", new StringToPairMapFn(), tableOf(strings(), strings()));

        PTable<String, String> assignedOrders = new BloomFilterJoinStrategy<String, String, String>(5)
                .join(customersTable, ordersTable, JoinType.INNER_JOIN)
                .parallelDo(new MapFn<Pair<String, Pair<String, String>>, Pair<String, String>>() {
View Full Code Here

                    public Pair<String, String> map(Pair<String, Pair<String, String>> input) {
                        return Pair.of(input.first(), input.second().second());
                    }
                }, tableOf(strings(), strings()));

        PTable<String, String> addressesTable = pipeline.readTextFile(addressesFile)
                .parallelDo("Split addresses", new StringToPairMapFn(), tableOf(strings(), strings()))
                .filter(new FilterFn<Pair<String, String>>() {
                    @Override
                    public boolean accept(Pair<String, String> input) {
                        // This is odd but it is the simpler way of simulating this would take longer than
View Full Code Here

   * another.
   */
  @Test
  public void testFusedMappersObjectReuseBug() throws IOException {
    Pipeline pipeline = new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration());
    PCollection<StringWrapper> stringWrappers = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
        .parallelDo(new StringWrapper.StringToStringWrapperMapFn(), Avros.reflects(StringWrapper.class));

    PCollection<String> stringsA = stringWrappers.parallelDo(new AppendFn("A"), stringWrappers.getPType())
        .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());
    PCollection<String> stringsB = stringWrappers.parallelDo(new AppendFn("B"), stringWrappers.getPType())
View Full Code Here

  };

  @Test
  public void testKill() throws Exception {
    Pipeline pipeline = new MRPipeline(FailIT.class, tempDir.getDefaultConfiguration());
    PCollection<String> p = pipeline.readTextFile(tempDir.copyResourceFileName("shakes.txt"));
    PCollection<Integer> result = p.parallelDo(new InverseFn(), Writables.ints());
    result.cache();

    PipelineExecution execution = pipeline.runAsync();
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.