Package org.tartarus.snowball

Examples of org.tartarus.snowball.EnglishStemmer


   * @return the keyword list
   */
  public KeywordList extractKeyword(String string, boolean onlyNoun)
  {
    List<Keyword> ret = new ArrayList<Keyword>();
    EnglishStemmer engStemmer = new EnglishStemmer();
   
    try {
      List<MExpression> meList = leaveJustBest(postProcess(analyze(string)));

      Morpheme mp = null;
      MCandidate mc = null;
      MExpression me = null;
      Keyword keyword = null;
      List<Morpheme> mpList = new ArrayList<Morpheme>();
      for( int i = 0, size = meList == null ? 0 : meList.size(); i < size; i++ ) {
        me = meList.get(i);
        mc = me.get(0);

        int jSize = mc.size();
        if( jSize == 1 ) {
          mp = mc.get(0);
          mp.setString(me.getExp());
          mpList.add(mp);
        } else {
          // 분할되지 않은 리스트 형태로 형태소를 넣어준다.
          for( int j = 0; j < jSize; j++ )
            mpList.add(mc.get(j));
        }

      }

      // 복합 UOM 확인
      for( int endIdx = mpList.size() - 1; endIdx > 0; endIdx-- ) {
        for( int startIdx = Math.max(endIdx - MAX_UOM_SIZE, 0); startIdx < endIdx; startIdx++ ) {
          String tempName = "";
          for( int i = startIdx; i <= endIdx; i++ ) {
            tempName += mpList.get(i).getString();
          }

          // 다수의 토큰으로 이루어진 UOM 확인
          if( UOMDic.contains(tempName) ) {
            for( ; startIdx < endIdx; endIdx-- ) {
              mpList.remove(startIdx + 1);
            }
            mp = mpList.get(startIdx);
            mp.setString(tempName);
            mp.setCharSet(CharSetType.COMBINED);
            mp.setTag(POSTag.NNM);
          }
          // 다수의 토큰으로 이루어진 화학식 확인
          else if( ChemFormulaDic.contains(tempName) ) {
            for( ; startIdx < endIdx; endIdx-- ) {
              mpList.remove(startIdx + 1);
            }
            mp = mpList.get(startIdx);
            mp.setString(tempName);
            mp.setCharSet(CharSetType.COMBINED);
            mp.setTag(POSTag.UN);
          }
          // 다수의 토큰으로 이루어진 명사 확인 ((주), Web2.0)류의 키워드
          else if( CompNounDic.contains(tempName) ) {
            for( ; startIdx < endIdx; endIdx-- ) {
              mpList.remove(startIdx + 1);
            }
            if( !JunkWordDic.contains(tempName) ) {
              mp = mpList.get(startIdx);
              mp.setString(tempName);
              mp.setCharSet(CharSetType.COMBINED);
              mp.setTag(POSTag.NNG);
              mp.setComposed(true);
            }
          }
        }
      }

      // 키워드 추출
      for( int i = 0, size = mpList.size(); i < size; i++ ) {
        mp = mpList.get(i);
        mp.setString(mp.getString().toLowerCase());

        // stemming 및 키워드 추출
        if( (!onlyNoun || mp.isTagOf(POSTag.N) ) 
            && !JunkWordDic.contains(mp.getString()) )
        {

          // do stemming english word
          if( mp.isTagOf(POSTag.UN)
              && mp.getCharSet() == CharSetType.ENGLISH )
          {
            keyword = new Keyword(mp);
            engStemmer.setCurrent(keyword.getString().toLowerCase());
            engStemmer.stem();
            keyword.setString(engStemmer.getCurrent());
            ret.add(keyword);
          }
          // 사랑하 로 추출된 경우 명사 '사랑'을 색인어로 추출
          else if( mp.isTagOf(POSTag.V) ) {
            String temp = mp.getString();
View Full Code Here

TOP

Related Classes of org.tartarus.snowball.EnglishStemmer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.