Package com.atilika.kuromoji.util

Source Code of com.atilika.kuromoji.util.UnknownDictionaryBuilder

/**
* Copyright 2010-2013 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.  A copy of the
* License is distributed with this work in the LICENSE.md file.  You may
* also obtain a copy of the License from
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.util;

import com.atilika.kuromoji.dict.UnknownDictionary;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;

public class UnknownDictionaryBuilder {
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*"; // TODO: Is this correct for other dictionaries than IPADIC?  -Christian
 
  private String encoding = "euc-jp";

    public UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }
 
  public UnknownDictionary build(String dirname) throws IOException {
    UnknownDictionary unkDictionary = null;
    unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def")//Should be only one file
    readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
    return unkDictionary;
  }
 
  public UnknownDictionary readDictionaryFile(String filename)
    throws IOException {
    return readDictionaryFile(filename, encoding);
  }

  public UnknownDictionary readDictionaryFile(String filename, String encoding)
    throws IOException {
    UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
   
    FileInputStream inputStream = new FileInputStream(filename);
    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
    LineNumberReader lineReader = new LineNumberReader(streamReader);
   
    dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));

    String line = null;
    while ((line = lineReader.readLine()) != null) {
      dictionary.put(CSVUtil.parse(line)); // Probably we don't need to validate entry
    }
   
    lineReader.close();
    return dictionary;
  }
 
  public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
    FileInputStream inputStream = new FileInputStream(filename);
    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
    LineNumberReader lineReader = new LineNumberReader(streamReader);

    String line = null;
   
    while ((line = lineReader.readLine()) != null) {
      line = line.replaceAll("^\\s", "");
      line = line.replaceAll("\\s*#.*", "");
      line = line.replaceAll("\\s+", " ");
     
      // Skip empty line or comment line
      if(line.length() == 0) {
        continue;
      }
     
      if(line.startsWith("0x")) {  // Category mapping
        String[] values = line.split(" ", 2)// Split only first space
       
        if(!values[0].contains("..")) {
          int cp = Integer.decode(values[0]).intValue();
          dictionary.putCharacterCategory(cp, values[1]);         
        } else {
          String[] codePoints = values[0].split("\\.\\.");
          int cpFrom = Integer.decode(codePoints[0]).intValue();
          int cpTo = Integer.decode(codePoints[1]).intValue();
         
          for(int i = cpFrom; i <= cpTo; i++){
            dictionary.putCharacterCategory(i, values[1]);         
          }
        }
      } else // Invoke definition
        String[] values = line.split(" "); // Consecutive space is merged above
        String characterClassName = values[0];
        int invoke = Integer.parseInt(values[1]);
        int group = Integer.parseInt(values[2]);
        int length = Integer.parseInt(values[3]);
        dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
      }
    }

    lineReader.close();
  }
}
TOP

Related Classes of com.atilika.kuromoji.util.UnknownDictionaryBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.