Package com.salas.bb.utils.xml

Source Code of com.salas.bb.utils.xml.UTF8Reader

// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: UTF8Reader.java,v 1.3 2006/01/08 05:00:10 kyank Exp $
//

package com.salas.bb.utils.xml;

import java.io.InputStream;
import java.io.IOException;
import java.io.CharConversionException;
import java.io.Reader;

/**
 * High-speed reader of any UTF-8-like stream. It is capable of reading both valid and invalid
 * streams. When it finds an invalid UTF-8 sequence (a lead byte that is not followed by the
 * expected number of continuation bytes) it emits the offending lead byte as an ISO-8859-1
 * character and resumes parsing from the byte that follows it. This approach lets malformed
 * streams be parsed, but the correctness of the interpretation of invalid sequences is
 * <b>not</b> guaranteed.
 * <p>
 * The one exception is a complete 4-byte sequence that decodes to a value above U+10FFFF:
 * it is reported with a {@link CharConversionException}.
 * <p>
 * Code points above U+FFFF are returned as UTF-16 surrogate pairs. The underlying stream is
 * closed automatically once its end has been reached.
 */
public final class UTF8Reader extends Reader
{
    /** Source of raw bytes; <code>null</code> once the reader has been closed. */
    private InputStream in;
    /** Raw bytes read from the stream; <code>null</code> once the reader has been closed. */
    private byte[]      buffer;
    /** Index of the next byte in <code>buffer</code> to decode. */
    private int         start;
    /** Index one past the last valid byte in <code>buffer</code>. */
    private int         finish;

    /** Low surrogate waiting to be delivered by the next read, or 0 if there is none. */
    private char        secondHalf;

    /** Value bits of the multi-byte character collected so far. */
    private int         multibyteChar;
    /** Number of continuation bytes still expected for the current multi-byte character. */
    private int         multibyteCharsToGo;
    /** Number of continuation bytes already consumed for the current multi-byte character. */
    private int         multibyteCharsRead;

    /**
     * Creates UTF-8 reader-interpreter for the stream.
     *
     * @param stream source stream.
     */
    public UTF8Reader(InputStream stream)
    {
        in = stream;
        buffer = new byte[8192];

        finish = 0;
        start = 0;

        resetMultibyte();
    }

    /**
     * Closes the underlying stream and releases the internal buffer. After the reader has been
     * closed, {@link #read(char[], int, int)} reports the end of the stream.
     * Closing a previously-closed reader has no effect.
     * <p>
     * The reader is marked as closed even if closing the underlying stream fails, so a
     * failed close is never retried against the same stream.
     *
     * @throws IOException If an I/O error occurs
     */
    public void close() throws IOException
    {
        if (in != null)
        {
            try
            {
                in.close();
            }
            finally
            {
                // Release the state even when the underlying close() throws, so that
                // a subsequent close() is a no-op as documented.
                buffer = null;
                in = null;
                start = 0;
                finish = 0;
            }
        }
    }

    /**
     * Tell whether this stream is ready to be read.
     *
     * @return TRUE if the next read() is guaranteed not to block for input,
     *         false otherwise.  Note that returning false does not guarantee that the
     *         next read will block.
     *
     * @throws IOException If an I/O error occurs
     */
    public boolean ready()
        throws IOException
    {
        // A pending low surrogate is delivered by the next read() without touching the
        // stream, buffered bytes are decoded without touching the stream, and a closed
        // reader answers immediately with end-of-stream.
        return secondHalf != 0 || finish > start || in == null || in.available() != 0;
    }

    /**
     * Reads at most <code>len</code> characters from the stream into the target buffer
     * starting from the specified <code>offset</code>.
     *
     * @param buf       target buffer.
     * @param offset    offset in buffer.
     * @param len       max number of characters to read in.
     *
     * @return number of characters read, 0 if <code>len</code> is not positive,
     *         or -1 if the stream is over.
     *
     * @throws CharConversionException if a complete 4-byte sequence decodes to a value
     *                                 above U+10FFFF.
     * @throws IOException in case of I/O error.
     */
    public int read(char[] buf, int offset, int len)
        throws IOException
    {
        int index = 0;
        int ch = 0;

        if (len <= 0) return 0;

        // Deliver the low surrogate left over from the previous call first
        if (secondHalf != 0)
        {
            buf[offset + index] = secondHalf;
            index++;
            secondHalf = 0;
        }

        while (index < len)
        {
            if (finish <= start)
            {
                // Byte buffer is exhausted -- refill it from the stream
                int readCount = -1;

                if (in != null)
                {
                    int readOffset = 0;

                    // compact if necessary: keep the lead byte and the continuation bytes
                    // read so far at the beginning of the buffer, so that the sequence can
                    // still be rewound if it turns out to be broken
                    if (multibyteCharsToGo > 0)
                    {
                        int off = start - (1 + multibyteCharsRead);
                        int length = finish - off;
                        System.arraycopy(buffer, off, buffer, 0, length);
                        readOffset = length;
                        start = length;
                    } else
                    {
                        start = 0;
                    }
                    readCount = in.read(buffer, readOffset, buffer.length - readOffset);
                }

                if (readCount <= 0)
                {
                    if (multibyteCharsToGo > 0)
                    {
                        // Stream finished, but we have not finished job yet:
                        // the sequence is truncated, so emit its lead byte as is and
                        // re-parse whatever continuation bytes were already consumed
                        finish = start;
                        index = saveMultiByteStartAndRewind(buf, offset, index);
                        continue;
                    } else
                    {
                        // Close and exit
                        close();
                        ch = -1;
                        break;
                    }
                } else
                {
                    finish = start + readCount;
                }
            }

            // Get next char
            ch = buffer[start] & 0x0ff;

            if (multibyteCharsToGo > 0)
            {
                // multi-byte sequence continues...
                if ((ch & 0xc0) == 0x80)
                {
                    // valid continuation byte
                    multibyteChar = (multibyteChar << 6) | (ch & 0x3f);
                    multibyteCharsToGo--;
                    multibyteCharsRead++;
                    start++;

                    if (multibyteCharsToGo == 0)
                    {
                        // finished reading multi-byte successfully -- write it to the target
                        // buffer and forget

                        // Unicode supports c <= 0x0010 ffff ...
                        if (multibyteChar > 0x0010ffff)
                        {
                            throw new CharConversionException("UTF-8 encoding of character 0x00" +
                                Integer.toHexString(multibyteChar) +
                                " can't be converted to Unicode.");
                        } else if (multibyteChar > 0xffff)
                        {
                            // Convert UCS-4 char to UTF-16
                            multibyteChar -= 0x10000;
                            secondHalf = (char)(0xDC00 + (multibyteChar & 0x03ff));
                            multibyteChar = 0xD800 + (multibyteChar >> 10);
                        }

                        buf[offset + index++] = (char)multibyteChar;
                        // The low surrogate is written now if it fits; otherwise it stays
                        // in secondHalf and is delivered by the next call
                        if (secondHalf != 0 && index < len)
                        {
                            buf[offset + index++] = secondHalf;
                            secondHalf = 0;
                        }
                        resetMultibyte();
                    }
                } else
                {
                    // the sequence got broken -- write first byte as is and rewind to the
                    // first continuation byte
                    index = saveMultiByteStartAndRewind(buf, offset, index);
                }
            } else
            {
                // Find multi-byte sequence start, others - ASCII of ISO-8859-1
                if ((ch & 0x0E0) == 0x0C0)
                {
                    // 2 bytes (0x0080 - 0x07FF)
                    multibyteChar = ch & 0x1F;
                    multibyteCharsToGo = 1;
                    ch = -1;
                } else if ((ch & 0x0F0) == 0x0E0)
                {
                    // 3 bytes (0x0800 - 0xFFFF)
                    multibyteChar = ch & 0x0F;
                    multibyteCharsToGo = 2;
                    ch = -1;
                } else if ((ch & 0x0F8) == 0x0F0)
                {
                    // 4 bytes (0x0001 0000  <= c  <= 0x001F FFFF)
                    multibyteChar = ch & 0x07;
                    multibyteCharsToGo = 3;
                    ch = -1;
                }

                // Write if there's anything to write (a lead byte produces no output yet)
                if (ch != -1) buf[offset + index++] = (char)ch;
                start++;
            }
        }

        // The loop is left either with a filled target buffer or at the end of the stream
        return (index > 0) ? index : (ch == -1) ? -1 : 0;
    }

    /**
     * Saves starting byte of false multi-byte sequence into buffer and rewind to
     * first continuation byte to start further parsing from (if any continuation bytes
     * were read of course). The multi-byte state is reset afterwards.
     *
     * @param buf       target buffer.
     * @param offset    offset in target buffer.
     * @param i         current index in target buffer relative to offset.
     *
     * @return new index value.
     */
    private int saveMultiByteStartAndRewind(char[] buf, int offset, int i)
    {
        // Step back over the continuation bytes consumed so far; the lead byte is then
        // located right before the new position
        start -= multibyteCharsRead;
        buf[offset + i++] = (char)(buffer[start - 1] & 0xFF);
        resetMultibyte();

        return i;
    }

    /**
     * Resets all multi-byte properties into initial state.
     */
    private void resetMultibyte()
    {
        multibyteChar = 0;
        multibyteCharsToGo = 0;
        multibyteCharsRead = 0;
    }
}
TOP

Related Classes of com.salas.bb.utils.xml.UTF8Reader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.