Source Code of com.bluecast.xml.UTF8XMLDecoder

/*
 * $Id: UTF8XMLDecoder.java,v 1.5 2004/07/11 09:37:37 yuvalo Exp $
 *
 * (C) Copyright 2002-2004 by Yuval Oren. All rights reserved.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */




package  com.bluecast.xml;
import com.bluecast.io.CharsetDecoder;
import com.bluecast.io.IllegalCharException;


import  java.io.CharConversionException;




/**
 * A decoder for UTF-8 text. Also converts
 * carriage returns into linefeeds and CRLF into LF.
 *
 * @author Yuval Oren, yuval@bluecast.com
 * @version $Revision: 1.5 $
 */


final public class UTF8XMLDecoder
        implements XMLDecoder {
    private boolean sawCR = false;


    public CharsetDecoder newCharsetDecoder() { return newXMLDecoder(); }
    public XMLDecoder newXMLDecoder() { return new UTF8XMLDecoder(); }


    public int minBytesPerChar () {
        return  1;
    }


    public int maxBytesPerChar () {
        return  3;
    }


    public void reset() { sawCR = false; }


    public void decode (byte[] in_buf, int in_off, int in_len, char[] out_buf,
            int out_off, int out_len, int[] result) throws CharConversionException {
        int i, o;


        for (i = o = 0; i < in_len && o < out_len; i++) {
            // A UTF-8 character can be 1 or more bytes. Length
            // is determined by the bitmask of the first byte
            int c, c2, c3, c4;
            c = in_buf[in_off + i];
            // Check for 1 byte first, since it's the most common.
            // One byte: 0xxxxxxx
            if ((c & 0x80) == 0) {
            /* We got the character */
            }
            else {
                // It's at least two bytes long
                if (++i < in_len)
                    c2 = in_buf[in_off + i];
                else {
                    result[0] = i - 1;
                    result[1] = o;
                    return;
                }


                // Two bytes: 110xxxxx 10xxxxxx
                if ((c & 0xE0) == 0xC0) {
                    // Subsequent bytes must begin with 0x10
                    if ((c2 & 0x80) != 0x80)
                        throw new CharConversionException(
                            "Malformed UTF-8 character: 0x"
                            + Integer.toHexString(c&0xFF) + " 0x"
                            + Integer.toHexString(c2&0xFF));


                    c = ((c & 0x1F) << 6) | (c2 & 0x3F);
                    // Make sure this is not an overlong character:
                    // this character must not fit within 7 bits
                    if ((c & 0x780) == 0)
                        throw  new CharConversionException("2-byte UTF-8 character is overlong: 0x"
                                + Integer.toHexString(in_buf[in_off + i - 1] & 0xFF)
                                + " 0x" + Integer.toHexString(c2 & 0xFF));




                }
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                else if ((c & 0xF0) == 0xE0) {
                    if (++i < in_len)
                        c3 = in_buf[in_off + i];
                    else {
                        result[0] = i - 2;
                        result[1] = o;
                        return;
                    }


                    // Subsequent bytes must begin with 0x10
                    if ( ((c2 & 0x80) != 0x80) ||
                         ((c3 & 0x80) != 0x80) )
                        throw new CharConversionException(
                            "Malformed UTF-8 character: 0x"
                            + Integer.toHexString(c&0xFF) + " 0x"
                            + Integer.toHexString(c2&0xFF) + " 0x"
                            + Integer.toHexString(c3&0xFF));




                    c = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
                    // Make sure this is not an overlong character:
                    // this character must not fit within 11 bits
                    if ((c & 0xF800) == 0)
                        throw  new CharConversionException("3-byte UTF-8 character is overlong: 0x"
                                + Integer.toHexString(in_buf[in_off+i-2] & 0xFF)
                                + " 0x" + Integer.toHexString(c2 & 0xFF) + " 0x" + Integer.toHexString(c3 & 0xFF));
                }


                // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                // This is returned as a surrogate pair of characters
                else if ((c & 0xF0) == 0xF0) {
                    if (i + 2 < in_len) {
                        c3 = in_buf[in_off + (++i)];
                        c4 = in_buf[in_off + (++i)];
                    }
                    else {
                        result[0] = i - 2;
                        result[1] = o;
                        return;
                    }


                    // Subsequent bytes must begin with 0x10
                    if ( ((c2 & 0x80) != 0x80) ||
                         ((c3 & 0x80) != 0x80) ||
                         ((c4 & 0x80) != 0x80) )
                        throw new CharConversionException(
                            "Malformed UTF-8 character: 0x"
                            + Integer.toHexString(c&0xFF) + " 0x"
                            + Integer.toHexString(c2&0xFF) + " 0x"
                            + Integer.toHexString(c3&0xFF) + " 0x"
                            + Integer.toHexString(c4&0xFF));


                    c = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);


                    if (c < 0x10000 || c > 0x10FFFF)
                        throw new IllegalCharException("Illegal XML character: 0x"
                            + Integer.toHexString(c));


                    // Construct the surrogate pair
                    c -= 0x10000;
                    out_buf[out_off + (o++)] = (char)((c >> 10) | 0xD800);
                    out_buf[out_off + (o++)] = (char)((c & ((1 << 10) - 1)) | 0xDC00);
                    sawCR = false;
                    continue;
                }
                else {
                    throw  new CharConversionException("Characters larger than 4 bytes are "
                            + "not supported: byte 0x" + Integer.toHexString(c & 0xFF) + " implies a length of more than 4 bytes");
                }


                if ( (c >= 0xD800 && c < 0xE000)
                    || (c == 0xFFFE || c == 0xFFFF) )
                    throw new IllegalCharException("Illegal XML character: 0x"
                            + Integer.toHexString(c));
            }
            // Now condense CRLF into LF and transform a lone CR into LF
            if (c >= 0x20) {
                sawCR = false;
                out_buf[out_off + o++] = (char)c;
            }
            else {
                switch (c) {
                    case '\n':
                        if (sawCR) {
                            sawCR = false;
                        }
                        else
                            out_buf[out_off + o++] = '\n';
                        break;


                    case '\r':
                        sawCR = true;
                        out_buf[out_off + o++] = '\n';
                        break;


                    case '\t':
                        out_buf[out_off + o++] = '\t';
                        break;


                    default:
                        throw new IllegalCharException(
                        "Illegal XML character: 0x"
                            + Integer.toHexString(c));
                }
            }
        }
        result[0] = i;
        result[1] = o;
    }




    public void decodeXMLDecl (byte[] in_buf, int in_off, int in_len, char[] out_buf,
            int out_off, int out_len, int[] result) throws CharConversionException {
        int i, o;
        inputLoop:
        for (i = o = 0; i < in_len && o < out_len; i++) {
            // A UTF-8 character can be 1 or more bytes. Length
            // is determined by the bitmask of the first byte
            int c = in_buf[in_off + i];


            // Check for 1 byte first, since it's the most common.
            // One byte: 0xxxxxxx
            if ((c & 0x80) != 0) {
                // XML declarations don't use more than 1 byte per char.
                break inputLoop;
            }


            // Now condense CRLF into LF and transform a lone CR into LF
            if (c >= 0x20) {
                sawCR = false;
                out_buf[out_off + o++] = (char)c;


                // Stop decoding after the first '>' because anything more
                // can't be part of the declaration
                if (c == '>') {
                    i++;
                    break inputLoop;
                }
            }
            else {
                switch (c) {
                    case '\n':
                        if (sawCR) {
                            sawCR = false;
                        }
                        else
                            out_buf[out_off + o++] = '\n';
                        break;


                    case '\r':
                        sawCR = true;
                        out_buf[out_off + o++] = '\n';
                        break;


                    case '\t':
                        out_buf[out_off + o++] = '\t';
                        break;


                    default:
                        // Illegal character. Stop decoding
                        break inputLoop;
                }
            }
        }
        result[0] = i;
        result[1] = o;
    }






}
Source Code of com.bluecast.xml.UTF8XMLDecoder

Related Classes of com.bluecast.xml.UTF8XMLDecoder