// Source Code of org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter

/**
 * Copyright 2009 www.imdict.net
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.lucene.analysis.cn.smart.hhmm;


import java.util.List;


import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;


public class HHMMSegmenter {


  private static WordDictionary wordDict = WordDictionary.getInstance();


  /**
   * 寻找sentence中所有可能的Token，最后再添加两个特殊Token，"始##始",
   * "末##末"，"始##始"Token的起始位置是-1,"末##末"Token的起始位置是句子的长度
   * 
   * @param sentence 输入的句子，不包含"始##始","末##末"等
   * @param coreDict 核心字典
   * @return 所有可能的Token
   * @see MultiTokenMap
   */
  private SegGraph createSegGraph(String sentence) {
    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    CharType[] charTypeArray = getCharTypes(sentence);
    StringBuffer wordBuf = new StringBuffer();
    SegToken token;
    int frequency = 0; // word的出现次数
    boolean hasFullWidth;
    WordType wordType;
    char[] charArray;


    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case SPACE_LIKE:
          i++;
          break;
        case HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // 不管单个汉字能不能构成词，都将单个汉字存到segGraph中去，否则会造成分此图断字
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);


          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // 就是我们要找的词， 也就是说找到了从i到j的一个成词SegToken，并且不是单字词
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }


            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;


            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // idArray作为前缀已经找到过(foundWordIndex!=-1),
              // 因此加长过后的idArray只可能出现在foundWordIndex以后,
              // 故从foundWordIndex之后开始查找
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case FULLWIDTH_LETTER:
          hasFullWidth = true;
        case LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // 找到了从i到j的一个Token，类型为LETTER的字符串
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case FULLWIDTH_DIGIT:
          hasFullWidth = true;
        case DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // 找到了从i到j的一个Token，类型为NUMBER的字符串
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case DELIMITER:
          j = i + 1;
          // 标点符号的weight不用查了，选个最大的频率即可
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // 把不认识的字符当作未知串看待，例如GB2312编码之外的字符，每个字符当作一个
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }


    // 为segGraph增加两个新Token： "始##始","末##末"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);


    // "末##末"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);


    return segGraph;
  }


  /**
   * 为sentence中的每个字符确定唯一的字符类型
   * 
   * @see Utility.charType(char)
   * @param sentence 输入的完成句子
   * @return 返回的字符类型数组，如果输入为null，返回也是null
   */
  private static CharType[] getCharTypes(String sentence) {
    int length = sentence.length();
    CharType[] charTypeArray = new CharType[length];
    // 生成对应单个汉字的字符类型数组
    for (int i = 0; i < length; i++) {
      charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
    }


    return charTypeArray;
  }


  public List<SegToken> process(String sentence) {
    SegGraph segGraph = createSegGraph(sentence);
    BiSegGraph biSegGraph = new BiSegGraph(segGraph);
    List<SegToken> shortPath = biSegGraph.getShortPath();
    return shortPath;
  }
}
// Source Code of org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter

// Related Classes of org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter