Package com.chenlb.mmseg4j.analysis

Source Code of com.chenlb.mmseg4j.analysis.CutLetterDigitFilter

package com.chenlb.mmseg4j.analysis;

import java.io.IOException;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.chenlb.mmseg4j.Word;

/**
* 切分“字母和数”混在一起的过虑器。比如:mb991ch 切为 "mb 991 ch"
*
* @author chenlb 2009-10-14 下午04:03:18
*/
public class CutLetterDigitFilter extends TokenFilter {

  protected Queue<Token> tokenQueue = new LinkedList<Token>();
 
  private TermAttribute termAtt;
    private OffsetAttribute offsetAtt;
    private TypeAttribute typeAtt;
    private Token reusableToken;
 
  public CutLetterDigitFilter(TokenStream input) {
    super(input);
   
    reusableToken = new Token();
    termAtt = (TermAttribute)addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute)addAttribute(OffsetAttribute.class);
    typeAtt = (TypeAttribute)addAttribute(TypeAttribute.class);
  }

  //兼容 lucene 2.9
  public Token next(Token reusableToken) throws IOException {
    return nextToken(reusableToken);
  }
 
  private Token nextToken(Token reusableToken) throws IOException {
    assert reusableToken != null;
   
    //先使用上次留下来的。
    Token nextToken = tokenQueue.poll();
    if(nextToken != null) {
      return nextToken;
    }
   
    /*//在 TokenUtils.nextToken 已经调用了 inc
    if(!input.incrementToken()) {
      return null;
    }*/
   
    /*TermAttribute termAtt = (TermAttribute)input.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute)input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute)input.getAttribute(TypeAttribute.class);
   
    nextToken = reusableToken.reinit(termAtt.termBuffer(), 0, termAtt.termLength(), offsetAtt.startOffset(), offsetAtt.endOffset(), typeAtt.type());*/
   
    nextToken = TokenUtils.nextToken(input, reusableToken);
   
    if(nextToken != null &&
        (Word.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
          || Word.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))
        ) {
      final char[] buffer = nextToken.termBuffer();
      final int length = nextToken.termLength();
      byte lastType = (byte) Character.getType(buffer[0])//与上次的字符是否同类
      int termBufferOffset = 0;
      int termBufferLength = 0;
      for(int i=0;i<length;i++) {
        byte type = (byte) Character.getType(buffer[i]);
        if(type <= Character.MODIFIER_LETTER) {
          type = Character.LOWERCASE_LETTER;
        }
        if(type != lastType) {  //与上一次的不同
          addToken(nextToken, termBufferOffset, termBufferLength, lastType);
         
          termBufferOffset += termBufferLength;
          termBufferLength = 0;
         
          lastType = type;
        }
       
        termBufferLength++;
      }
      if(termBufferLength > 0) {  //最后一次
        addToken(nextToken, termBufferOffset, termBufferLength, lastType);
      }
      nextToken = tokenQueue.poll();
    }
   
    return nextToken;
  }
 
  private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    Token token = new Token(oriToken.termBuffer(), termBufferOffset, termBufferLength,
        oriToken.startOffset()+termBufferOffset, oriToken.startOffset()+termBufferOffset+termBufferLength);
   
    if(type == Character.DECIMAL_DIGIT_NUMBER) {
      token.setType(Word.TYPE_DIGIT);
    } else {
      token.setType(Word.TYPE_LETTER);
    }
   
    tokenQueue.offer(token);
  }

  public void close() throws IOException {
    super.close();
    tokenQueue.clear();
  }

  public void reset() throws IOException {
    super.reset();
    tokenQueue.clear();
  }

  public boolean incrementToken() throws IOException {
    clearAttributes();
    Token token = nextToken(reusableToken);
    if(token != null) {
      termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
      offsetAtt.setOffset(token.startOffset(), token.endOffset());
      typeAtt.setType(token.type());
      return true;
    } else {
      end();
      return false;
    }
  }
 
    public void end() {
      try {
        reset();
      } catch(IOException e) {}
    }
}
TOP

Related Classes of com.chenlb.mmseg4j.analysis.CutLetterDigitFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.