Package org.fnlp.data.reader

Source Code of org.fnlp.data.reader.FileReader

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.data.reader;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.LinkedList;

import org.fnlp.ml.types.Instance;
/**
* @author xpqiu
* @version 1.0
* 文档数据读取如下:
* 输入为数据存放路径(子文件夹不处理)
* 不同类别存放在各自文件中
* 类别:文件名
* 数据:文件内的一行字符
* package edu.fudan.ml.data
*/
public class FileReader extends Reader {

  LinkedList<File> files;
  Instance cur;
  Charset charset;
  String content = null;
  BufferedReader reader;
  int line;
  File currentFile;
  private String filter;

  public FileReader(String path) {
    this(path, "UTF-8",null);
  }
  /**
   *
   * @param path 路径名
   * @param charsetName 字符编码
   * @param filter 文件类型过滤
   */
  public FileReader(String path, String charsetName, String filter) {
    files = new LinkedList<File>();
    this.filter = filter;
    File fpath = new File(path);

    if(fpath.isDirectory()) {
      File[] flist = fpath.listFiles();
      for(int i=0;i<flist.length;i++){
        if(flist[i].isFile()){
          if(filter==null)
            files.push(flist[i]);
          else if(flist[i].getName().endsWith(filter))
            files.push(flist[i]);           
        }
      }
    }else{
      System.err.println("输入必须为目录");
    }
    if(files.size()==0)
      System.err.println("找不到合法文件");
    charset = Charset.forName(charsetName);
    getFile();
  }

  private boolean getFile() {
    currentFile = files.poll();
    if(currentFile==null)
      return false;
    try {
      FileInputStream in = new FileInputStream(currentFile);
      reader = new BufferedReader(new InputStreamReader(in,
          "UTF-8"));
    } catch (FileNotFoundException e) {     
      System.err.println("文件不存在");
      return false;
    } catch (UnsupportedEncodingException e) {
      System.err.println("文件编码错误");
      return false;
    }
    line=0;
    return true;
  }

  public boolean hasNext() {
    while(true){
      try{
        content = reader.readLine();
        line++;
        if(content==null){
          reader.close();
          if(!getFile()){
            return false;
          }
          continue;
        }
        content = content.trim();
        if(content.length()==0)
          continue;
        else
          return true;     
      }catch (IOException e) {
        System.err.println("读文件错误。文件名:"+currentFile.getName()+"行数:"+(line-1));
        return false;
      }
    }
  }

  public Instance next() {
    int idx = currentFile.getName().indexOf(".");
    return new Instance (content,currentFile.getName().substring(0, idx));
  }
}
TOP

Related Classes of org.fnlp.data.reader.FileReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.