Package org.apache.lucene.analysis.cn.smart.hhmm

Source Code of org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter

/**
* Copyright 2009 www.imdict.net
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.util.List;

import org.apache.lucene.analysis.cn.smart.CharType;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;

public class HHMMSegmenter {

  private static WordDictionary wordDict = WordDictionary.getInstance();

  /**
   * 寻找sentence中所有可能的Token,最后再添加两个特殊Token,"始##始",
   * "末##末","始##始"Token的起始位置是-1,"末##末"Token的起始位置是句子的长度
   *
   * @param sentence 输入的句子,不包含"始##始","末##末"等
   * @param coreDict 核心字典
   * @return 所有可能的Token
   * @see MultiTokenMap
   */
  private SegGraph createSegGraph(String sentence) {
    int i = 0, j;
    int length = sentence.length();
    int foundIndex;
    CharType[] charTypeArray = getCharTypes(sentence);
    StringBuffer wordBuf = new StringBuffer();
    SegToken token;
    int frequency = 0; // word的出现次数
    boolean hasFullWidth;
    WordType wordType;
    char[] charArray;

    SegGraph segGraph = new SegGraph();
    while (i < length) {
      hasFullWidth = false;
      switch (charTypeArray[i]) {
        case SPACE_LIKE:
          i++;
          break;
        case HANZI:
          j = i + 1;
          wordBuf.delete(0, wordBuf.length());
          // 不管单个汉字能不能构成词,都将单个汉字存到segGraph中去,否则会造成分此图断字
          wordBuf.append(sentence.charAt(i));
          charArray = new char[] { sentence.charAt(i) };
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
              frequency);
          segGraph.addToken(token);

          foundIndex = wordDict.getPrefixMatch(charArray);
          while (j <= length && foundIndex != -1) {
            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
              // 就是我们要找的词, 也就是说找到了从i到j的一个成词SegToken,并且不是单字词
              frequency = wordDict.getFrequency(charArray);
              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                  frequency);
              segGraph.addToken(token);
            }

            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
              j++;

            if (j < length && charTypeArray[j] == CharType.HANZI) {
              wordBuf.append(sentence.charAt(j));
              charArray = new char[wordBuf.length()];
              wordBuf.getChars(0, charArray.length, charArray, 0);
              // idArray作为前缀已经找到过(foundWordIndex!=-1),
              // 因此加长过后的idArray只可能出现在foundWordIndex以后,
              // 故从foundWordIndex之后开始查找
              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
              j++;
            } else {
              break;
            }
          }
          i++;
          break;
        case FULLWIDTH_LETTER:
          hasFullWidth = true;
        case LETTER:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
              hasFullWidth = true;
            j++;
          }
          // 找到了从i到j的一个Token,类型为LETTER的字符串
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case FULLWIDTH_DIGIT:
          hasFullWidth = true;
        case DIGIT:
          j = i + 1;
          while (j < length
              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
              hasFullWidth = true;
            j++;
          }
          // 找到了从i到j的一个Token,类型为NUMBER的字符串
          charArray = Utility.NUMBER_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
          token = new SegToken(charArray, i, j, wordType, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        case DELIMITER:
          j = i + 1;
          // 标点符号的weight不用查了,选个最大的频率即可
          frequency = Utility.MAX_FREQUENCE;
          charArray = new char[] { sentence.charAt(i) };
          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
          segGraph.addToken(token);
          i = j;
          break;
        default:
          j = i + 1;
          // 把不认识的字符当作未知串看待,例如GB2312编码之外的字符,每个字符当作一个
          charArray = Utility.STRING_CHAR_ARRAY;
          frequency = wordDict.getFrequency(charArray);
          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
          segGraph.addToken(token);
          i = j;
          break;
      }
    }

    // 为segGraph增加两个新Token: "始##始","末##末"
    charArray = Utility.START_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
    segGraph.addToken(token);

    // "末##末"
    charArray = Utility.END_CHAR_ARRAY;
    frequency = wordDict.getFrequency(charArray);
    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
        frequency);
    segGraph.addToken(token);

    return segGraph;
  }

  /**
   * 为sentence中的每个字符确定唯一的字符类型
   *
   * @see Utility.charType(char)
   * @param sentence 输入的完成句子
   * @return 返回的字符类型数组,如果输入为null,返回也是null
   */
  private static CharType[] getCharTypes(String sentence) {
    int length = sentence.length();
    CharType[] charTypeArray = new CharType[length];
    // 生成对应单个汉字的字符类型数组
    for (int i = 0; i < length; i++) {
      charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
    }

    return charTypeArray;
  }

  public List<SegToken> process(String sentence) {
    SegGraph segGraph = createSegGraph(sentence);
    BiSegGraph biSegGraph = new BiSegGraph(segGraph);
    List<SegToken> shortPath = biSegGraph.getShortPath();
    return shortPath;
  }
}
TOP

Related Classes of org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.