Source Code of org.analyse.core.util.UnicodeUtils

package org.analyse.core.util;


import java.text.CharacterIterator;
import java.text.Normalizer;
import java.text.StringCharacterIterator;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;


/**
 * Static helpers for reducing Unicode text to plain Basic Latin (ASCII)
 * characters.
 */
public class UnicodeUtils
{
    /**
     * Replacements for letters that Unicode compatibility decomposition (NFKD)
     * leaves untouched, but which have a conventional ASCII transliteration.
     * Without an entry here such a letter would simply be dropped from the
     * output because it lies outside the Basic Latin block.
     */
    private static final Map<Character, String> DECOMPOSED_CHARS = buildDecomposedChars();

    /**
     * Builds the read-only transliteration table used by
     * {@link #decomposeToBasicLatin(String)}.
     *
     * @return an unmodifiable map from a non-decomposable letter to its ASCII
     *         replacement
     */
    private static Map<Character, String> buildDecomposedChars()
    {
        Map<Character, String> replacements = new HashMap<Character, String>();
        replacements.put('\u00DF', "ss"); // sharp s
        replacements.put('\u00C6', "Ae"); // capital ligature AE
        replacements.put('\u00E6', "ae"); // small ligature ae
        replacements.put('\u00D8', "O");  // capital O with stroke
        replacements.put('\u00F8', "o");  // small o with stroke
        replacements.put('\u0152', "Oe"); // capital ligature OE
        replacements.put('\u0153', "oe"); // small ligature oe
        return Collections.unmodifiableMap(replacements);
    }


    /**
     * Strips accents from an input String, and decomposes combined characters
     * into multiple basic ASCII characters.
     *
     * The method is based on the Unicode KD normalization form. It iterates
     * over the resulting characters, replaces the few letters that have an
     * explicit transliteration (sharp s, the AE and OE ligatures, O with
     * stroke), and then strips everything else that is not in the Basic Latin
     * Unicode block.
     *
     * Based on http://www.codeproject.com/KB/cs/UnicodeNormalization.aspx
     * (found while Google-ing "stripping accents unicode string"), but with
     * legacy Java 1.6 classes. Also inspired by
     * http://www.nntp.perl.org/group/perl.i18n/2008/05/msg209.html
     *
     * @param accentedString
     *            A string that contains accents. Must not be {@code null}.
     * @return The same string, without accents and without any character
     *         outside the Basic Latin block.
     * @throws NullPointerException
     *             if {@code accentedString} is {@code null} (raised by
     *             {@link Normalizer#normalize}).
     * @see Normalizer.Form#NFKD
     * @see Character.UnicodeBlock#BASIC_LATIN
     */
    public static String decomposeToBasicLatin(String accentedString)
    {
        String normalizedString = Normalizer.normalize(accentedString, Normalizer.Form.NFKD);
        StringBuilder unaccentedString = new StringBuilder(normalizedString.length());

        // Iterate by index rather than with a CharacterIterator: the iterator's
        // DONE sentinel is U+FFFF, so a string containing that code unit would
        // be cut short at that position.
        for (int i = 0; i < normalizedString.length(); i++)
        {
            char c = normalizedString.charAt(i);
            String replacement = DECOMPOSED_CHARS.get(c);
            if (replacement != null)
            {
                unaccentedString.append(replacement);
            }
            else if (Character.UnicodeBlock.BASIC_LATIN.equals(Character.UnicodeBlock.of(c)))
            {
                unaccentedString.append(c);
            }
            // Anything else (combining marks, non-Latin letters, ...) is dropped.
        }
        return unaccentedString.toString();
    }
}
Source Code of org.analyse.core.util.UnicodeUtils

Related Classes of org.analyse.core.util.UnicodeUtils