Package cn.edu.hfut.dmic.webcollector.generator

Source Code of cn.edu.hfut.dmic.webcollector.generator.DbReader

/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
package cn.edu.hfut.dmic.webcollector.generator;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.io.DatumReader;
import org.apache.avro.reflect.ReflectDatumReader;

/**
* 读Avro文件的Reader
* @author hu
* @param <T> 待读取数据的数据类型
*/
public class DbReader<T> {

    Class<T> type;
    Iterator<T> iterator;
    DataFileReader<T> dataFileReader;

    /**
     * 构造一个从avro文件中读取指定类型数据的Reader
     * @param type 指定的数据类型
     * @param dbfile 待读取的avro文件
     * @throws IOException
     */
    public DbReader(Class<T> type,File dbfile) throws IOException {
        this.type=type;
        DatumReader<T> datumReader = new ReflectDatumReader<T>(type);
        dataFileReader = new DataFileReader<T>(dbfile, datumReader);
        iterator = dataFileReader.iterator();
    }

    /**
     * 构造一个从avro文件中读取指定类型数据的Reader
     * @param type 指定的数据类型
     * @param dbpath 待读取的avro文件的路径
     * @throws IOException
     */
    public DbReader(Class<T> type,String dbpath) throws IOException {
        this(type,new File(dbpath));
    }

    /**
     * 读取下一条数据,在文件结束时调用该方法会出错,所以在调用readNext方法前需要使
     * 用hasNext方法来判断文件是否结束
     * @return 下一条数据
     */
    public T readNext() {
        return iterator.next();
    }

    /**
     * 判断是否已读取到avro文件结尾
     * @return 是否已读取到avro文件结尾
     */
    public boolean hasNext(){
        return iterator.hasNext();
    }
   
    /**
     * 关闭该Reader
     * @throws IOException
     */
    public void close() throws IOException {
        dataFileReader.close();
    }

   
    public static void main(String[] args) throws IOException{
        if(args.length==0){
            System.err.println("Usage dbpath");          
            main(new String[]{"/home/hu/data/crawl_hfut1/crawldb/current/info.avro"});
            return;
        }
        String dbpath=args[0];
        DbReader<CrawlDatum> reader=new DbReader<CrawlDatum>(CrawlDatum.class,dbpath);
        int sum=0;
        int sum_fetched=0;
        int sum_unfetched=0;
       
       
        CrawlDatum crawldatum=null;

        System.out.println("start read:");
        while(reader.hasNext()){
            crawldatum=reader.readNext();
            System.out.println(crawldatum.getUrl());
            sum++;
            switch(crawldatum.getStatus()){
                case CrawlDatum.STATUS_DB_FETCHED:
                    sum_fetched++;
                    break;
                case CrawlDatum.STATUS_DB_UNFETCHED:
                    sum_unfetched++;
                    break;
                   
            }
           
        
        }
        reader.close();
      
       
    }
   
}
TOP

Related Classes of cn.edu.hfut.dmic.webcollector.generator.DbReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.