package net.sf.saxon.codenorm;
import net.sf.saxon.om.FastStringBuffer;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* This class reads the Unicode character database, extracts information needed
* to perform unicode normalization, and writes this information out in the form of the
* Java "source" module UnicodeData.java. This class is therefore executed (via its main()
* method) at the time Saxon is built - it only needs to be rerun when the Unicode data tables
* have changed.
* <p>
* The class is derived from the sample program NormalizerData.java published by the
* Unicode consortium. That code has been modified so that instead of building the run-time
* data structures directly, they are written to a Java "source" module, which is then
* compiled. Also, the ability to construct a condensed version of the data tables has been
* removed.
* <p>
* Copyright (c) 1991-2005 Unicode, Inc.
* For terms of use, see http://www.unicode.org/terms_of_use.html
* For documentation, see UAX#15.<br>
* @author Mark Davis
* @author Michael Kay: Saxon modifications.
*/
class UnicodeDataGenerator {
static final String copyright = "Copyright � 1998-1999 Unicode, Inc.";
/**
* Testing flags
*/
private static final boolean DEBUG = false;
/**
* Constants for the data file version to use.
*/
// static final boolean NEW_VERSION = true;
private static String dir;
private static String UNICODE_DATA = "UnicodeData.txt";
private static String COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt";
private static List canonicalClassKeys = new ArrayList(30000);
private static List canonicalClassValues = new ArrayList(30000);
private static List decompositionKeys = new ArrayList(6000);
private static List decompositionValues = new ArrayList(6000);
private static List exclusionList = new ArrayList(200);
private static List compatibilityList = new ArrayList(8000);
private UnicodeDataGenerator() {
}
/**
* Called exactly once by NormalizerData to build the static data
*/
static void build() {
try {
readExclusionList();
buildDecompositionTables();
} catch (java.io.IOException e) {
System.err.println("Can't load data file." + e + ", " + e.getMessage());
}
}
// =============================================================
// Building Decomposition Tables
// =============================================================
/**
* Reads exclusion list and stores the data
*/
// Modified by MHK: the original code expects the hex character code to be always four hex digits
private static void readExclusionList() throws java.io.IOException {
if (DEBUG) System.out.println("Reading Exclusions");
BufferedReader in = new BufferedReader(new FileReader(dir + '/' + COMPOSITION_EXCLUSIONS), 5*1024);
while (true) {
// read a line, discarding comments and blank lines
String line = in.readLine();
if (line == null) break;
int comment = line.indexOf('#'); // strip comments
if (comment != -1) line = line.substring(0,comment);
if (line.length() == 0) continue; // ignore blanks
// store -1 in the excluded table for each character hit
int z = line.indexOf(' ');
if (z < 0) {
z = line.length();
}
int value = Integer.parseInt(line.substring(0,z),16);
exclusionList.add(new Integer(value));
}
in.close();
}
/**
* Builds a decomposition table from a UnicodeData file
*/
private static void buildDecompositionTables()
throws java.io.IOException {
if (DEBUG) System.out.println("Reading Unicode Character Database");
BufferedReader in = new BufferedReader(new FileReader(dir + '/' + UNICODE_DATA), 64*1024);
int value;
int counter = 0;
while (true) {
// read a line, discarding comments and blank lines
String line = in.readLine();
if (line == null) break;
int comment = line.indexOf('#'); // strip comments
if (comment != -1) line = line.substring(0,comment);
if (line.length() == 0) continue;
if (DEBUG) {
counter++;
if ((counter & 0xFF) == 0) System.out.println("At: " + line);
}
// find the values of the particular fields that we need
// Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
int start = 0;
int end = line.indexOf(';'); // code
try {
value = Integer.parseInt(line.substring(start,end),16);
} catch (NumberFormatException e) {
throw new IllegalStateException("Bad hex value in line:\n" + line);
}
if (true && value == '\u00c0') {
System.out.println("debug: " + line);
}
end = line.indexOf(';', end+1); // name
//String name = line.substring(start,end);
end = line.indexOf(';', end+1); // general category
end = line.indexOf(';', start=end+1); // canonical class
// check consistency: canonical classes must be from 0 to 255
int cc = Integer.parseInt(line.substring(start,end));
if (cc != (cc & 0xFF)) System.err.println("Bad canonical class at: " + line);
canonicalClassKeys.add(new Integer(value));
canonicalClassValues.add(new Integer(cc));
//canonicalClass.put(value,cc);
end = line.indexOf(';', end+1); // BIDI
end = line.indexOf(';', start=end+1); // decomp
// decomp requires more processing.
// store whether it is canonical or compatibility.
// store the decomp in one table, and the reverse mapping (from pairs) in another
if (start != end) {
String segment = line.substring(start, end);
boolean compat = segment.charAt(0) == '<';
if (compat) {
compatibilityList.add(new Integer(value));
//isCompatibility.set(value);
}
String decomp = fromHex(segment);
// check consistency: all canon decomps must be singles or pairs!
if (decomp.length() < 1 || decomp.length() > 2 && !compat) {
System.err.println("Bad decomp at: " + line);
}
decompositionKeys.add(new Integer(value));
decompositionValues.add(decomp);
//decompose.put(value, decomp);
// only compositions are canonical pairs
// skip if script exclusion
// if (!compat && !isExcluded.get(value)) {
// char first = '\u0000';
// char second = decomp.charAt(0);
// if (decomp.length() > 1) {
// first = second;
// second = decomp.charAt(1);
// }
//
// // store composition pair in single integer
//
// pair = (first << 16) | second;
// if (DEBUG && value == '\u00C0') {
// System.out.println("debug2: " + line);
// }
// compose.put(pair, value);
// } else if (DEBUG) {
// System.out.println("Excluding: " + decomp);
// }
}
}
in.close();
if (DEBUG) System.out.println("Done reading Unicode Character Database");
// add algorithmic Hangul decompositions
// this is more compact if done at runtime, but for simplicity we
// do it this way.
// if (DEBUG) System.out.println("Adding Hangul");
//
// for (int SIndex = 0; SIndex < SCount; ++SIndex) {
// int TIndex = SIndex % TCount;
// char first, second;
// if (TIndex != 0) { // triple
// first = (char)(SBase + SIndex - TIndex);
// second = (char)(TBase + TIndex);
// } else {
// first = (char)(LBase + SIndex / NCount);
// second = (char)(VBase + (SIndex % NCount) / TCount);
// }
// pair = (first << 16) | second;
// value = SIndex + SBase;
// decompose.put(value, String.valueOf(first) + second);
// compose.put(pair, value);
// }
// if (DEBUG) System.out.println("Done adding Hangul");
}
/**
* Hangul composition constants
*/
// static final int
// SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
// LCount = 19, VCount = 21, TCount = 28,
// NCount = VCount * TCount, // 588
// SCount = LCount * NCount; // 11172
/**
* Utility: Parses a sequence of hex Unicode characters separated by spaces
*/
// Modified by MHK. Original code assumed the characters were each 4 hex digits!
public static String fromHex(String source) {
FastStringBuffer result = new FastStringBuffer(8);
for (int i = 0; i < source.length(); ++i) {
char c = source.charAt(i);
switch (c) {
case ' ': break; // ignore
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
case '8': case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
int z = source.indexOf(' ',i);
if (z < 0) {
z = source.length();
}
try {
result.append((char)Integer.parseInt(source.substring(i, z),16));
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Bad hex value in " + source);
}
i = z; // skip rest of number
break;
case '<': int j = source.indexOf('>',i); // skip <...>
if (j > 0) {
i = j;
break;
} // else fall through--error
default:
throw new IllegalArgumentException("Bad hex value in " + source);
}
}
return result.toString();
}
/**
* Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
*/
public static String hex(char i) {
String result = Integer.toString(i, 16).toUpperCase();
return "0000".substring(result.length(),4) + result;
}
/**
* Utility: Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
*/
public static String hex(String s, String sep) {
FastStringBuffer result = new FastStringBuffer(20);
for (int i = 0; i < s.length(); ++i) {
if (i != 0) result.append(sep);
result.append(hex(s.charAt(i)));
}
return result.toString();
}
/**
* Generate the Java output from the data structure
*/
private static void generateJava(PrintStream o) {
o.println("package net.sf.saxon.codenorm;");
o.println("");
o.println("//This module was generated by running net.sf.saxon.codenorm.UnicodeDataGenerator");
o.println("//*** DO NOT EDIT! ***");
o.println("//The strange format of this file is carefully chosen to avoid breaking Java compiler limits");
o.println("");
o.println("public class UnicodeData {");
// Output the canonical class table
o.println("public static final String[] canonicalClassKeys = {");
printArray(o, canonicalClassKeys.iterator());
o.println("};");
o.println("public static final String[] canonicalClassValues = {");
printArray(o, canonicalClassValues.iterator());
o.println("};");
// Output the decomposition values (not including Hangul algorithmic decompositions)
o.println("public static final String[] decompositionKeys = {");
printArray(o, decompositionKeys.iterator());
o.println("};");
o.println("public static final String[] decompositionValues = {");
printStringArray(o, decompositionValues.iterator());
o.println("};");
// Output the composition exclusions
o.println("public static final String[] exclusionList = {");
printArray(o, exclusionList.iterator());
o.println("};");
// Output the compatibility list
o.println("public static final String[] compatibilityList = {");
printArray(o, compatibilityList.iterator());
o.println("};");
o.println("}");
}
/**
* Output an array of integer values
*/
private static void printArray(PrintStream o, Iterator iter) {
int count = 0;
FastStringBuffer buff = new FastStringBuffer(128);
if (!iter.hasNext()) return;
buff.append('"');
while (true) {
if (++count == 20) {
count = 0;
buff.append("\",");
o.println(buff.toString());
buff.setLength(0);
buff.append('"');
}
int next = ((Integer)iter.next()).intValue();
buff.append(Integer.toString(next, 32)); // values are written in base-32 notation
if (iter.hasNext()) {
buff.append(",");
} else {
buff.append("\"");
o.println(buff.toString());
return;
}
}
}
/**
* Output an array of string values (using backslash-uuuu notation where appropriate)
*/
private static void printStringArray(PrintStream o, Iterator iter) {
int count = 0;
FastStringBuffer buff = new FastStringBuffer(128);
if (!iter.hasNext()) return;
while (true) {
if (++count == 20) {
count = 0;
o.println(buff.toString());
buff.setLength(0);
}
String next = (String)iter.next();
appendJavaString(next, buff);
if (iter.hasNext()) {
buff.append(", ");
} else {
o.println(buff.toString());
return;
}
}
}
private static void appendJavaString(String value, FastStringBuffer buff) {
buff.append('"');
for (int i=0; i<value.length(); i++) {
char c = value.charAt(i);
if (c == '\\') {
buff.append("\\\\");
} else if (c == '"') {
buff.append("\\\"");
} else if (c > 32 && c < 127) {
buff.append(c);
} else {
buff.append("\\u");
char b0 = "0123456789abcdef".charAt(c & 0xf);
char b1 = "0123456789abcdef".charAt((c>>4) & 0xf);
char b2 = "0123456789abcdef".charAt((c>>8) & 0xf);
char b3 = "0123456789abcdef".charAt((c>>12) & 0xf);
buff.append(b3);
buff.append(b2);
buff.append(b1);
buff.append(b0);
}
}
buff.append('"');
}
/**
* Main program. Run this program to regenerate the Java module UnicodeData.java against revised data
* from the Unicode character database.
* <p>
* Usage: java UnicodeDataGenerator dir >UnicodeData.java
* <p>
* where dir is the directory containing the files UnicodeData.text and CompositionExclusions.txt from the
* Unicode character database.
*/
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: java UnicodeDataGenerator dir UnicodeData.java");
System.err.println("where dir is the directory containing the files UnicodeData.text and" +
" CompositionExclusions.txt from the Unicode character database");
}
dir = args[0];
build();
PrintStream o = new PrintStream(new FileOutputStream(new File(args[1])));
generateJava(o);
}
}