Package org.apache.hadoop.mapred

Examples of org.apache.hadoop.mapred.JobConf.addInputPath()
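
The snippets on this page are truncated excerpts from Apache Nutch, so none of them compiles on its own. A minimal, self-contained sketch of the shared pattern, written against the old org.apache.hadoop.mapred API in which JobConf.addInputPath() exists, might look like the following; the input and output paths are placeholders, and the identity mapper/reducer merely stand in for real job classes.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.hadoop.mapred.lib.IdentityMapper;
    import org.apache.hadoop.mapred.lib.IdentityReducer;

    public class AddInputPathSketch {
      public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(AddInputPathSketch.class);
        job.setJobName("addInputPath sketch");

        // Each call appends one more path to the job's input list.
        job.addInputPath(new Path("/data/input-a"));
        job.addInputPath(new Path("/data/input-b"));

        job.setInputFormat(TextInputFormat.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);

        // TextInputFormat produces (LongWritable offset, Text line) pairs,
        // which the identity mapper/reducer pass through unchanged.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputPath(new Path("/data/output"));
        job.setOutputFormat(TextOutputFormat.class);

        JobClient.runJob(job);
      }
    }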


    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);
View Full Code Here


    Path outFolder = new Path(output);

    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setOutputPath(outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
View Full Code Here

               "/readdb-topN-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    job.setOutputPath(tempDir);
View Full Code Here

    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("CrawlDbReader.topN", topN);

    job.addInputPath(tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    job.setOutputPath(outFolder);
View Full Code Here
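
The two topN snippets above are the prepare and collect halves of a single operation: the first job writes intermediate results into a randomly named temporary directory, and the second job picks that directory up with addInputPath(tempDir). Below is a generic sketch of that chaining pattern, assuming the old mapred API; the paths, job names, and final cleanup are illustrative, not the actual Nutch code.

    import java.io.IOException;
    import java.util.Random;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.SequenceFileInputFormat;
    import org.apache.hadoop.mapred.SequenceFileOutputFormat;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;

    public class ChainedJobsSketch {
      public static void main(String[] args) throws IOException {
        Configuration config = new Configuration();
        Path tempDir = new Path("/tmp/chain-temp-"
            + new Random().nextInt(Integer.MAX_VALUE));

        // First job: read the real input and stage (offset, line) records in tempDir.
        // The default identity mapper/reducer are used, so records pass through unchanged.
        JobConf prepare = new JobConf(config);
        prepare.setJobName("chain prepare");
        prepare.addInputPath(new Path("/data/input"));
        prepare.setInputFormat(TextInputFormat.class);
        prepare.setOutputPath(tempDir);
        prepare.setOutputFormat(SequenceFileOutputFormat.class);
        prepare.setOutputKeyClass(LongWritable.class);
        prepare.setOutputValueClass(Text.class);
        JobClient.runJob(prepare);

        // Second job: the temporary directory becomes the input of the next stage.
        JobConf collect = new JobConf(config);
        collect.setJobName("chain collect");
        collect.addInputPath(tempDir);
        collect.setInputFormat(SequenceFileInputFormat.class);
        collect.setOutputPath(new Path("/data/output"));
        collect.setOutputFormat(TextOutputFormat.class);
        collect.setOutputKeyClass(LongWritable.class);
        collect.setOutputValueClass(Text.class);
        JobClient.runJob(collect);

        // Remove the intermediate data once the second job has finished.
        FileSystem.get(config).delete(tempDir, true);
      }
    }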

    }

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));
View Full Code Here

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));
View Full Code Here

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
View Full Code Here

    job.setJobName("read " + segment);

    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
View Full Code Here

    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentReader.class);
View Full Code Here

    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentReader.class);
View Full Code Here
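
In the SegmentReader-style snippets above, each flag that is set adds one more segment subdirectory, so the mapper reads the union of every path added. JobConf.addInputPath() was deprecated in later releases of the old mapred API in favor of the static helpers on org.apache.hadoop.mapred.FileInputFormat, which maintain the same input-path list. A brief sketch follows; the segment path is a placeholder, and the literal directory names stand in for the CrawlDatum/Content constants used above.

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.JobConf;

    public class MultiInputSketch {
      public static void main(String[] args) {
        JobConf job = new JobConf();
        Path segment = new Path("/crawl/segments/20180101000000"); // placeholder

        // Equivalent to job.addInputPath(...): each call appends to the same list.
        FileInputFormat.addInputPath(job, new Path(segment, "crawl_generate"));
        FileInputFormat.addInputPath(job, new Path(segment, "crawl_fetch"));

        // The job will read the union of everything added so far.
        Path[] inputs = FileInputFormat.getInputPaths(job);
        System.out.println(inputs.length + " input paths configured");
      }
    }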
