Package org.ictclas4j.bean

Source Code of org.ictclas4j.bean.Dictionary

package org.ictclas4j.bean;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;

import org.apache.log4j.Logger;
import org.ictclas4j.util.Utility;

import com.gftech.util.GFNet;
import com.gftech.util.GFString;

public class Dictionary {
  /**
   * �ʵ��,��6768��,GB2312����(before) 22034����gbk����+��ĸ���֣�now)
   */
  private WordTable[] wts;

  private int wordCount;// �ʵĸ���

  private long totalFreq;// �ܴ�Ƶ

  private int dict_count;

  static Logger logger = Logger.getLogger(Dictionary.class);

  public Dictionary() {
    this(null,false);
  }
  public Dictionary(String fileName) {
    this(fileName,false);
  }
 
  public Dictionary( boolean isExtend) {
    this(null,isExtend);
  }
 
  public Dictionary(String fileName,boolean isExtend) {
    init(isExtend);
    load(fileName);
  }

  public void init(boolean isExtend) {
    wordCount = 0;
    totalFreq = 0;
    dict_count = isExtend ? Utility.GBK_NUM_EXT : Utility.GB_NUM;
    wts = new WordTable[dict_count];

  }

  /**
   * �Ӵʵ���м��ش���.��6768��������ݿ�(����5���Ǻ����ַ�),ÿ�������ݿ�������ɸ�С���ݿ�,
   * ÿ��С���ݿ�Ϊһ������,�����ݿ���ÿ���������ǹ�һ���ֿ�ͷ��.
   *
   * @param fileName
   *            ���Ĵʵ��ļ���
   * @return
   */
  public boolean load(String fileName) {
    int i = 0, j = 0;
    File file = new File(fileName);
    if (!file.canRead())
      return false;// fail while opening the file

    try {
      long offset = 0;
      WordTable wt = new WordTable();
      SegAtom sa = new SegAtom();
      HashMap<String, SegAtom> wordMap = null;
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      for (i = 0; i < dict_count; i++) {
        try {
          WordTable wtClone = wt.clone();
          logger.debug("��" + i);
          // �ʵ����д����������ʱ���õ�λ����(Сͷ��ǰ)��ʽ,��Ҫת��һ��
          int count = GFNet.readInt32(in);
          logger.debug(" count:" + count);
          wtClone.setWordCount(count);
          int wordMaxLen = GFNet.readUInt8(in);
          wtClone.setWordMaxLen(wordMaxLen);
          offset += 5;
          wordMap = new HashMap<String, SegAtom>();
          for (j = 0; j < count; j++, wordCount++) {
            SegAtom saClone = sa.clone();
            int bc = saClone.read(in, 0);
            offset += bc;
            logger.debug(saClone);
            wordMap.put(saClone.getWord(), saClone);
            totalFreq += saClone.getTotalFreq();
          }
          wtClone.setWordMap(wordMap);
          wts[i] = wtClone;
        } catch (CloneNotSupportedException e) {
          logger.fatal("Load dict:", e);
        }
      }

      in.close();
    } catch (FileNotFoundException e) {
      logger.fatal("load dict " + fileName + ":", e);
    } catch (IOException e) {
      logger.fatal("load dict " + fileName + ":", e);
      logger.fatal("i:" + i + ",j:" + j);
    }

    return true;
  }

  /**
   *
   * @param fileName
   * @return
   */
  public boolean save(String fileName) {

    File file = new File(fileName);
    try {
      DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
      for (int i = 0; i < dict_count; i++) {

        int count = 0;
        WordTable wt = wts[i];
        count = wt.getWordCount();
        GFNet.writeInt32(out, count);
        GFNet.writeInt8(out, wt.getWordMaxLen());
        Collection<SegAtom> atoms = wt.getWordMap().values();
        for (SegAtom atom : atoms) {
          int size = atom.write(out);
          System.out.println(i + "," + size);
        }
      }
      out.close();
      return true;
    } catch (FileNotFoundException e) {
      logger.error(e);
    } catch (IOException e) {
      logger.error(e);
    }
    return false;
  }

  public SegAtom getSegAtom(String word, int index) {
    SegAtom result = null;

    if (word != null && word.length() > 0) {
      if (index > 0 && index < wts.length) {
        WordTable wt = wts[index];
        result = wt.getSegAtom(word);

      }
    }
    return result;
  }
 
  public boolean addSegAtom(SegAtom sa,int index){
   
    if(sa!=null && index>=0 && index<dict_count){
      if(wts!=null){
        WordTable wt=wts[index];
        if(wt!=null){
          wt.addSegAtom(sa);
        }
      }
    }
    return false;
  }

  // ��ȡͬһ���ֿ�ͷ���������Ĺؼ��ʳ���
  public int getWordMaxLen(String word, int index) {
    int result = 0;
    if (word != null && word.length() > 0) {
      if (index > 0 && index < wts.length) {
        WordTable wt = wts[index];
        return wt.getWordMaxLen();
      }
    }
    return result;
  }

  public boolean strEqual(String b1, String b2) {
    if (b1 == null && b2 == null)
      return true;
    else if (b1 != null && b2 != null) {
      return b1.equals(b2);
    }
    return false;
  }

  public int getWordType(String word) {
    if (word != null) {
      int type = Utility.charType(word);
      int len = word.length();

      if (len > 0 && type == Utility.CT_CHINESE && GFString.isAllChinese(word))
        return Utility.WT_CHINESE;
      else if (len > 0 && type == Utility.CT_DELIMITER)
        return Utility.WT_DELIMITER;

    }
    return Utility.WT_OTHER;
  }

  /**
   * �жϹؼ����Ƿ���ڸô���
   *
   * @param word
   * @param pos
   * @return
   */
  public boolean isExist(String word, int pos, int index) {
    if (word != null) {
      SegAtom atom = getSegAtom(word, index);
      if (atom != null) {
        return atom.hasPos(pos);
      }
    }

    return false;
  }

  public int getFreq(String word, int pos, int index) {
    if (word != null) {
      SegAtom atom = getSegAtom(word, index);
      if (atom != null) {
        return atom.getFreqByPos(pos);
      }
    }
    return 0;
  }

  public long totalFreq() {
    return totalFreq;
  }

  public int wordCount() {
    return wordCount;
  }

  public WordTable[] getWts() {
    return wts;
  }

  public void setWts(WordTable[] wts) {
    this.wts = wts;
  }

}
TOP

Related Classes of org.ictclas4j.bean.Dictionary

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.