Package it.cnr.isti.hpc.wikipedia.cli

Source Code of it.cnr.isti.hpc.wikipedia.cli.GetDumpSummaryCLI

/**
*  Copyright 2011 Diego Ceccarelli
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package it.cnr.isti.hpc.wikipedia.cli;

import it.cnr.isti.hpc.cli.AbstractCommandLineInterface;
import it.cnr.isti.hpc.io.reader.JsonRecordParser;
import it.cnr.isti.hpc.io.reader.RecordReader;
import it.cnr.isti.hpc.log.ProgressLogger;
import it.cnr.isti.hpc.wikipedia.article.Article;
import it.cnr.isti.hpc.wikipedia.article.ArticleSummarizer;

/**
* Takes the JSON dump and produces a summary file in which each line
* contains: <br/>
* <br/>
* <code>
* type &lt;tab&gt; wid &lt;tab&gt; wikititle &lt;tab&gt; redirect/short summary
* </code> <br/>
* <br/>
*
* The last field contains the redirection (prefixed with "-> ") if the type
* is redirect, otherwise the short summary.
*
*
* @author Diego Ceccarelli, diego.ceccarelli@isti.cnr.it created on 21/nov/2011
*/
public class GetDumpSummaryCLI extends AbstractCommandLineInterface {

  private static String[] params = new String[] { INPUT, OUTPUT };

  private static final String USAGE = "java -cp $jar "
      + GetDumpSummaryCLI.class
      + " -input wikipedia-json-dump -output titles";

  private final static String TAB = "\t";

  public GetDumpSummaryCLI(String[] args) {
    super(args, params, USAGE);
  }

  public static void main(String[] args) {

    GetDumpSummaryCLI cli = new GetDumpSummaryCLI(args);
    ProgressLogger pl = new ProgressLogger("dumped {} titles", 10000);
    cli.openOutput();
    RecordReader<Article> reader = new RecordReader<Article>(
        cli.getInput(), new JsonRecordParser<Article>(Article.class));
    ArticleSummarizer summarizer = new ArticleSummarizer();

    for (Article a : reader) {
      pl.up();
      cli.writeInOutput(a.getTypeName());
      cli.writeInOutput(TAB);
      cli.writeInOutput(String.valueOf(a.getWikiId()));
      cli.writeInOutput(TAB);
      cli.writeInOutput(a.getWikiTitle());
      cli.writeInOutput(TAB);
      if (a.isRedirect()) {
        cli.writeInOutput("-> " + a.getRedirect());
      } else {
        cli.writeInOutput(summarizer.getSummary(a));
      }
      cli.writeInOutput("\n");

    }
    cli.closeOutput();
  }
}
TOP

Related Classes of it.cnr.isti.hpc.wikipedia.cli.GetDumpSummaryCLI

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.