Package com.ibm.icu.util

Source Code of com.ibm.icu.util.CharsTrie

/*
*******************************************************************************
*   Copyright (C) 2011-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   created on: 2011jan06
*   created by: Markus W. Scherer
*   ported from ICU4C ucharstrie.h/.cpp
*/

package com.ibm.icu.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.NoSuchElementException;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.BytesTrie.Result;

/**
* Light-weight, non-const reader class for a CharsTrie.
* Traverses a char-serialized data structure with minimal state,
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
*
* <p>This class is not intended for public subclassing.
*
* @stable ICU 4.8
* @author Markus W. Scherer
*/
public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> {
    /**
     * Constructs a CharsTrie reader instance.
     *
     * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder,
     * with the offset indicating the first char of that sequence.
     * The CharsTrie object will not read more chars than
     * the CharsTrieBuilder generated in the corresponding build() call.
     *
     * <p>The CharSequence is not copied/cloned and must not be modified while
     * the CharsTrie object is in use.
     *
     * @param trieChars CharSequence that contains the serialized trie.
     * @param offset Root offset of the trie in the CharSequence.
     * @stable ICU 4.8
     */
    public CharsTrie(CharSequence trieChars, int offset) {
        chars_=trieChars;
        pos_=root_=offset;
        remainingMatchLength_=-1;
    }

    /**
     * Clones this trie reader object and its state,
     * but not the char array which will be shared.
     * @return A shallow clone of this trie.
     * @stable ICU 4.8
     */
    @Override
    public Object clone() throws CloneNotSupportedException {
        return super.clone()// A shallow copy is just what we need.
    }

    /**
     * Resets this trie to its initial state.
     * @return this
     * @stable ICU 4.8
     */
    public CharsTrie reset() {
        pos_=root_;
        remainingMatchLength_=-1;
        return this;
    }

    /**
     * CharsTrie state object, for saving a trie's current state
     * and resetting the trie back to this state later.
     * @stable ICU 4.8
     */
    public static final class State {
        /**
         * Constructs an empty State.
         * @stable ICU 4.8
         */
        public State() {}
        private CharSequence chars;
        private int root;
        private int pos;
        private int remainingMatchLength;
    }

    /**
     * Saves the state of this trie.
     * @param state The State object to hold the trie's state.
     * @return this
     * @see #resetToState
     * @stable ICU 4.8
     */
    public CharsTrie saveState(State state) /*const*/ {
        state.chars=chars_;
        state.root=root_;
        state.pos=pos_;
        state.remainingMatchLength=remainingMatchLength_;
        return this;
    }

    /**
     * Resets this trie to the saved state.
     * @param state The State object which holds a saved trie state.
     * @return this
     * @throws IllegalArgumentException if the state object contains no state,
     *         or the state of a different trie
     * @see #saveState
     * @see #reset
     * @stable ICU 4.8
     */
    public CharsTrie resetToState(State state) {
        if(chars_==state.chars && chars_!=null && root_==state.root) {
            pos_=state.pos;
            remainingMatchLength_=state.remainingMatchLength;
        } else {
            throw new IllegalArgumentException("incompatible trie state");
        }
        return this;
    }

    /**
     * Determines whether the string so far matches, whether it has a value,
     * and whether another input char can continue a matching string.
     * @return The match/value Result.
     * @stable ICU 4.8
     */
    public Result current() /*const*/ {
        int pos=pos_;
        if(pos<0) {
            return Result.NO_MATCH;
        } else {
            int node;
            return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
                    valueResults_[node>>15] : Result.NO_VALUE;
        }
    }

    /**
     * Traverses the trie from the initial state for this input char.
     * Equivalent to reset().next(inUnit).
     * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
     * @return The match/value Result.
     * @stable ICU 4.8
     */
    public Result first(int inUnit) {
        remainingMatchLength_=-1;
        return nextImpl(root_, inUnit);
    }

    /**
     * Traverses the trie from the initial state for the
     * one or two UTF-16 code units for this input code point.
     * Equivalent to reset().nextForCodePoint(cp).
     * @param cp A Unicode code point 0..0x10ffff.
     * @return The match/value Result.
     * @stable ICU 4.8
     */
    public Result firstForCodePoint(int cp) {
        return cp<=0xffff ?
            first(cp) :
            (first(UTF16.getLeadSurrogate(cp)).hasNext() ?
                next(UTF16.getTrailSurrogate(cp)) :
                Result.NO_MATCH);
    }

    /**
     * Traverses the trie from the current state for this input char.
     * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
     * @return The match/value Result.
     * @stable ICU 4.8
     */
    public Result next(int inUnit) {
        int pos=pos_;
        if(pos<0) {
            return Result.NO_MATCH;
        }
        int length=remainingMatchLength_;  // Actual remaining match length minus 1.
        if(length>=0) {
            // Remaining part of a linear-match node.
            if(inUnit==chars_.charAt(pos++)) {
                remainingMatchLength_=--length;
                pos_=pos;
                int node;
                return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
                        valueResults_[node>>15] : Result.NO_VALUE;
            } else {
                stop();
                return Result.NO_MATCH;
            }
        }
        return nextImpl(pos, inUnit);
    }

    /**
     * Traverses the trie from the current state for the
     * one or two UTF-16 code units for this input code point.
     * @param cp A Unicode code point 0..0x10ffff.
     * @return The match/value Result.
     * @stable ICU 4.8
     */
    public Result nextForCodePoint(int cp) {
        return cp<=0xffff ?
            next(cp) :
            (next(UTF16.getLeadSurrogate(cp)).hasNext() ?
                next(UTF16.getTrailSurrogate(cp)) :
                Result.NO_MATCH);
    }

    /**
     * Traverses the trie from the current state for this string.
     * Equivalent to
     * <pre>
     * Result result=current();
     * for(each c in s)
     *   if(!result.hasNext()) return Result.NO_MATCH;
     *   result=next(c);
     * return result;
     * </pre>
     * @param s Contains a string.
     * @param sIndex The start index of the string in s.
     * @param sLimit The (exclusive) end index of the string in s.
     * @return The match/value Result.
     * @stable ICU 4.8
     */
    public Result next(CharSequence s, int sIndex, int sLimit) {
        if(sIndex>=sLimit) {
            // Empty input.
            return current();
        }
        int pos=pos_;
        if(pos<0) {
            return Result.NO_MATCH;
        }
        int length=remainingMatchLength_;  // Actual remaining match length minus 1.
        for(;;) {
            // Fetch the next input unit, if there is one.
            // Continue a linear-match node.
            char inUnit;
            for(;;) {
                if(sIndex==sLimit) {
                    remainingMatchLength_=length;
                    pos_=pos;
                    int node;
                    return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
                            valueResults_[node>>15] : Result.NO_VALUE;
                }
                inUnit=s.charAt(sIndex++);
                if(length<0) {
                    remainingMatchLength_=length;
                    break;
                }
                if(inUnit!=chars_.charAt(pos)) {
                    stop();
                    return Result.NO_MATCH;
                }
                ++pos;
                --length;
            }
            int node=chars_.charAt(pos++);
            for(;;) {
                if(node<kMinLinearMatch) {
                    Result result=branchNext(pos, node, inUnit);
                    if(result==Result.NO_MATCH) {
                        return Result.NO_MATCH;
                    }
                    // Fetch the next input unit, if there is one.
                    if(sIndex==sLimit) {
                        return result;
                    }
                    if(result==Result.FINAL_VALUE) {
                        // No further matching units.
                        stop();
                        return Result.NO_MATCH;
                    }
                    inUnit=s.charAt(sIndex++);
                    pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
                    node=chars_.charAt(pos++);
                } else if(node<kMinValueLead) {
                    // Match length+1 units.
                    length=node-kMinLinearMatch;  // Actual match length minus 1.
                    if(inUnit!=chars_.charAt(pos)) {
                        stop();
                        return Result.NO_MATCH;
                    }
                    ++pos;
                    --length;
                    break;
                } else if((node&kValueIsFinal)!=0) {
                    // No further matching units.
                    stop();
                    return Result.NO_MATCH;
                } else {
                    // Skip intermediate value.
                    pos=skipNodeValue(pos, node);
                    node&=kNodeTypeMask;
                }
            }
        }
    }

    /**
     * Returns a matching string's value if called immediately after
     * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
     * getValue() can be called multiple times.
     *
     * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
     * @return The value for the string so far.
     * @stable ICU 4.8
     */
    public int getValue() /*const*/ {
        int pos=pos_;
        int leadUnit=chars_.charAt(pos++);
        assert(leadUnit>=kMinValueLead);
        return (leadUnit&kValueIsFinal)!=0 ?
            readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit);
    }

    /**
     * Determines whether all strings reachable from the current state
     * map to the same value, and if so, returns that value.
     * @return The unique value in bits 32..1 with bit 0 set,
     *         if all strings reachable from the current state
     *         map to the same value; otherwise returns 0.
     * @stable ICU 4.8
     */
    public long getUniqueValue() /*const*/ {
        int pos=pos_;
        if(pos<0) {
            return 0;
        }
        // Skip the rest of a pending linear-match node.
        long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0);
        // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
        return (uniqueValue<<31)>>31;
    }

    /**
     * Finds each char which continues the string from the current state.
     * That is, each char c for which it would be next(c)!=Result.NO_MATCH now.
     * @param out Each next char is appended to this object.
     *            (Only uses the out.append(c) method.)
     * @return The number of chars which continue the string from here.
     * @stable ICU 4.8
     */
    public int getNextChars(Appendable out) /*const*/ {
        int pos=pos_;
        if(pos<0) {
            return 0;
        }
        if(remainingMatchLength_>=0) {
            append(out, chars_.charAt(pos))// Next unit of a pending linear-match node.
            return 1;
        }
        int node=chars_.charAt(pos++);
        if(node>=kMinValueLead) {
            if((node&kValueIsFinal)!=0) {
                return 0;
            } else {
                pos=skipNodeValue(pos, node);
                node&=kNodeTypeMask;
            }
        }
        if(node<kMinLinearMatch) {
            if(node==0) {
                node=chars_.charAt(pos++);
            }
            getNextBranchChars(chars_, pos, ++node, out);
            return node;
        } else {
            // First unit of the linear-match node.
            append(out, chars_.charAt(pos));
            return 1;
        }
    }

    /**
     * Iterates from the current state of this trie.
     * @return A new CharsTrie.Iterator.
     * @stable ICU 4.8
     */
    public Iterator iterator() {
        return new Iterator(chars_, pos_, remainingMatchLength_, 0);
    }

    /**
     * Iterates from the current state of this trie.
     * @param maxStringLength If 0, the iterator returns full strings.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @return A new CharsTrie.Iterator.
     * @stable ICU 4.8
     */
    public Iterator iterator(int maxStringLength) {
        return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength);
    }

    /**
     * Iterates from the root of a char-serialized BytesTrie.
     * @param trieChars CharSequence that contains the serialized trie.
     * @param offset Root offset of the trie in the CharSequence.
     * @param maxStringLength If 0, the iterator returns full strings.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @return A new CharsTrie.Iterator.
     * @stable ICU 4.8
     */
    public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) {
        return new Iterator(trieChars, offset, -1, maxStringLength);
    }

    /**
     * Return value type for the Iterator.
     * @stable ICU 4.8
     */
    public static final class Entry {
        /**
         * The string.
         * @stable ICU 4.8
         */
        public CharSequence chars;
        /**
         * The value associated with the string.
         * @stable ICU 4.8
         */
        public int value;

        private Entry() {
        }
    }

    /**
     * Iterator for all of the (string, value) pairs in a CharsTrie.
     * @stable ICU 4.8
     */
    public static final class Iterator implements java.util.Iterator<Entry> {
        private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
            chars_=trieChars;
            pos_=initialPos_=offset;
            remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
            maxLength_=maxStringLength;
            int length=remainingMatchLength_;  // Actual remaining match length minus 1.
            if(length>=0) {
                // Pending linear-match node, append remaining bytes to str_.
                ++length;
                if(maxLength_>0 && length>maxLength_) {
                    length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
                }
                str_.append(chars_, pos_, pos_+length);
                pos_+=length;
                remainingMatchLength_-=length;
            }
        }

        /**
         * Resets this iterator to its initial state.
         * @return this
         * @stable ICU 4.8
         */
        public Iterator reset() {
            pos_=initialPos_;
            remainingMatchLength_=initialRemainingMatchLength_;
            skipValue_=false;
            int length=remainingMatchLength_+1// Remaining match length.
            if(maxLength_>0 && length>maxLength_) {
                length=maxLength_;
            }
            str_.setLength(length);
            pos_+=length;
            remainingMatchLength_-=length;
            stack_.clear();
            return this;
        }

        /**
         * @return true if there are more elements.
         * @stable ICU 4.8
         */
        public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }

        /**
         * Finds the next (string, value) pair if there is one.
         *
         * If the string is truncated to the maximum length and does not
         * have a real value, then the value is set to -1.
         * In this case, this "not a real value" is indistinguishable from
         * a real value of -1.
         * @return An Entry with the string and value of the next element.
         * @throws NoSuchElementException - iteration has no more elements.
         * @stable ICU 4.8
         */
        public Entry next() {
            int pos=pos_;
            if(pos<0) {
                if(stack_.isEmpty()) {
                    throw new NoSuchElementException();
                }
                // Pop the state off the stack and continue with the next outbound edge of
                // the branch node.
                long top=stack_.remove(stack_.size()-1);
                int length=(int)top;
                pos=(int)(top>>32);
                str_.setLength(length&0xffff);
                length>>>=16;
                if(length>1) {
                    pos=branchNext(pos, length);
                    if(pos<0) {
                        return entry_;  // Reached a final value.
                    }
                } else {
                    str_.append(chars_.charAt(pos++));
                }
            }
            if(remainingMatchLength_>=0) {
                // We only get here if we started in a pending linear-match node
                // with more than maxLength remaining units.
                return truncateAndStop();
            }
            for(;;) {
                int node=chars_.charAt(pos++);
                if(node>=kMinValueLead) {
                    if(skipValue_) {
                        pos=skipNodeValue(pos, node);
                        node&=kNodeTypeMask;
                        skipValue_=false;
                    } else {
                        // Deliver value for the string so far.
                        boolean isFinal=(node&kValueIsFinal)!=0;
                        if(isFinal) {
                            entry_.value=readValue(chars_, pos, node&0x7fff);
                        } else {
                            entry_.value=readNodeValue(chars_, pos, node);
                        }
                        if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
                            pos_=-1;
                        } else {
                            // We cannot skip the value right here because it shares its
                            // lead unit with a match node which we have to evaluate
                            // next time.
                            // Instead, keep pos_ on the node lead unit itself.
                            pos_=pos-1;
                            skipValue_=true;
                        }
                        entry_.chars=str_;
                        return entry_;
                    }
                }
                if(maxLength_>0 && str_.length()==maxLength_) {
                    return truncateAndStop();
                }
                if(node<kMinLinearMatch) {
                    if(node==0) {
                        node=chars_.charAt(pos++);
                    }
                    pos=branchNext(pos, node+1);
                    if(pos<0) {
                        return entry_;  // Reached a final value.
                    }
                } else {
                    // Linear-match node, append length units to str_.
                    int length=node-kMinLinearMatch+1;
                    if(maxLength_>0 && str_.length()+length>maxLength_) {
                        str_.append(chars_, pos, pos+maxLength_-str_.length());
                        return truncateAndStop();
                    }
                    str_.append(chars_, pos, pos+length);
                    pos+=length;
                }
            }
        }

        /**
         * Iterator.remove() is not supported.
         * @throws UnsupportedOperationException (always)
         * @stable ICU 4.8
         */
        public void remove() {
            throw new UnsupportedOperationException();
        }

        private Entry truncateAndStop() {
            pos_=-1;
            // We reset entry_.chars every time we return entry_
            // just because the caller might have modified the Entry.
            entry_.chars=str_;
            entry_.value=-1// no real value for str
            return entry_;
        }

        private int branchNext(int pos, int length) {
            while(length>kMaxBranchLinearSubNodeLength) {
                ++pos;  // ignore the comparison unit
                // Push state for the greater-or-equal edge.
                stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length());
                // Follow the less-than edge.
                length>>=1;
                pos=jumpByDelta(chars_, pos);
            }
            // List of key-value pairs where values are either final values or jump deltas.
            // Read the first (key, value) pair.
            char trieUnit=chars_.charAt(pos++);
            int node=chars_.charAt(pos++);
            boolean isFinal=(node&kValueIsFinal)!=0;
            int value=readValue(chars_, pos, node&=0x7fff);
            pos=skipValue(pos, node);
            stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length());
            str_.append(trieUnit);
            if(isFinal) {
                pos_=-1;
                entry_.chars=str_;
                entry_.value=value;
                return -1;
            } else {
                return pos+value;
            }
        }

        private CharSequence chars_;
        private int pos_;
        private int initialPos_;
        private int remainingMatchLength_;
        private int initialRemainingMatchLength_;
        private boolean skipValue_;  // Skip intermediate value which was already delivered.

        private StringBuilder str_=new StringBuilder();
        private int maxLength_;
        private Entry entry_=new Entry();

        // The stack stores longs for backtracking to another
        // outbound edge of a branch node.
        // Each long has the offset in chars_ in bits 62..32,
        // the str_.length() from before the node in bits 15..0,
        // and the remaining branch length in bits 31..16.
        // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
        // but the code looks more confusing that way.)
        private ArrayList<Long> stack_=new ArrayList<Long>();
    }

    private void stop() {
        pos_=-1;
    }

    // Reads a compact 32-bit integer.
    // pos is already after the leadUnit, and the lead unit has bit 15 reset.
    private static int readValue(CharSequence chars, int pos, int leadUnit) {
        int value;
        if(leadUnit<kMinTwoUnitValueLead) {
            value=leadUnit;
        } else if(leadUnit<kThreeUnitValueLead) {
            value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos);
        } else {
            value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
        }
        return value;
    }
    private static int skipValue(int pos, int leadUnit) {
        if(leadUnit>=kMinTwoUnitValueLead) {
            if(leadUnit<kThreeUnitValueLead) {
                ++pos;
            } else {
                pos+=2;
            }
        }
        return pos;
    }
    private static int skipValue(CharSequence chars, int pos) {
        int leadUnit=chars.charAt(pos++);
        return skipValue(pos, leadUnit&0x7fff);
    }

    private static int readNodeValue(CharSequence chars, int pos, int leadUnit) {
        assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
        int value;
        if(leadUnit<kMinTwoUnitNodeValueLead) {
            value=(leadUnit>>6)-1;
        } else if(leadUnit<kThreeUnitNodeValueLead) {
            value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos);
        } else {
            value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
        }
        return value;
    }
    private static int skipNodeValue(int pos, int leadUnit) {
        assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
        if(leadUnit>=kMinTwoUnitNodeValueLead) {
            if(leadUnit<kThreeUnitNodeValueLead) {
                ++pos;
            } else {
                pos+=2;
            }
        }
        return pos;
    }

    private static int jumpByDelta(CharSequence chars, int pos) {
        int delta=chars.charAt(pos++);
        if(delta>=kMinTwoUnitDeltaLead) {
            if(delta==kThreeUnitDeltaLead) {
                delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
                pos+=2;
            } else {
                delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++);
            }
        }
        return pos+delta;
    }

    private static int skipDelta(CharSequence chars, int pos) {
        int delta=chars.charAt(pos++);
        if(delta>=kMinTwoUnitDeltaLead) {
            if(delta==kThreeUnitDeltaLead) {
                pos+=2;
            } else {
                ++pos;
            }
        }
        return pos;
    }

    private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };

    // Handles a branch node for both next(unit) and next(string).
    private Result branchNext(int pos, int length, int inUnit) {
        // Branch according to the current unit.
        if(length==0) {
            length=chars_.charAt(pos++);
        }
        ++length;
        // The length of the branch is the number of units to select from.
        // The data structure encodes a binary search.
        while(length>kMaxBranchLinearSubNodeLength) {
            if(inUnit<chars_.charAt(pos++)) {
                length>>=1;
                pos=jumpByDelta(chars_, pos);
            } else {
                length=length-(length>>1);
                pos=skipDelta(chars_, pos);
            }
        }
        // Drop down to linear search for the last few units.
        // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
        // and divides length by 2.
        do {
            if(inUnit==chars_.charAt(pos++)) {
                Result result;
                int node=chars_.charAt(pos);
                if((node&kValueIsFinal)!=0) {
                    // Leave the final value for getValue() to read.
                    result=Result.FINAL_VALUE;
                } else {
                    // Use the non-final value as the jump delta.
                    ++pos;
                    // int delta=readValue(pos, node);
                    int delta;
                    if(node<kMinTwoUnitValueLead) {
                        delta=node;
                    } else if(node<kThreeUnitValueLead) {
                        delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++);
                    } else {
                        delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1);
                        pos+=2;
                    }
                    // end readValue()
                    pos+=delta;
                    node=chars_.charAt(pos);
                    result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
                }
                pos_=pos;
                return result;
            }
            --length;
            pos=skipValue(chars_, pos);
        } while(length>1);
        if(inUnit==chars_.charAt(pos++)) {
            pos_=pos;
            int node=chars_.charAt(pos);
            return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
        } else {
            stop();
            return Result.NO_MATCH;
        }
    }

    // Requires remainingLength_<0.
    private Result nextImpl(int pos, int inUnit) {
        int node=chars_.charAt(pos++);
        for(;;) {
            if(node<kMinLinearMatch) {
                return branchNext(pos, node, inUnit);
            } else if(node<kMinValueLead) {
                // Match the first of length+1 units.
                int length=node-kMinLinearMatch;  // Actual match length minus 1.
                if(inUnit==chars_.charAt(pos++)) {
                    remainingMatchLength_=--length;
                    pos_=pos;
                    return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
                            valueResults_[node>>15] : Result.NO_VALUE;
                } else {
                    // No match.
                    break;
                }
            } else if((node&kValueIsFinal)!=0) {
                // No further matching units.
                break;
            } else {
                // Skip intermediate value.
                pos=skipNodeValue(pos, node);
                node&=kNodeTypeMask;
            }
        }
        stop();
        return Result.NO_MATCH;
    }

    // Helper functions for getUniqueValue().
    // Recursively finds a unique value (or whether there is not a unique one)
    // from a branch.
    // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
    // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
    private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
                                                  long uniqueValue) {
        while(length>kMaxBranchLinearSubNodeLength) {
            ++pos;  // ignore the comparison unit
            uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue);
            if(uniqueValue==0) {
                return 0;
            }
            length=length-(length>>1);
            pos=skipDelta(chars, pos);
        }
        do {
            ++pos;  // ignore a comparison unit
            // handle its value
            int node=chars.charAt(pos++);
            boolean isFinal=(node&kValueIsFinal)!=0;
            node&=0x7fff;
            int value=readValue(chars, pos, node);
            pos=skipValue(pos, node);
            if(isFinal) {
                if(uniqueValue!=0) {
                    if(value!=(int)(uniqueValue>>1)) {
                        return 0;
                    }
                } else {
                    uniqueValue=((long)value<<1)|1;
                }
            } else {
                uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
                if(uniqueValue==0) {
                    return 0;
                }
            }
        } while(--length>1);
        // ignore the last comparison byte
        return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
    }
    // Recursively finds a unique value (or whether there is not a unique one)
    // starting from a position on a node lead unit.
    // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
    // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
    private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
        int node=chars.charAt(pos++);
        for(;;) {
            if(node<kMinLinearMatch) {
                if(node==0) {
                    node=chars.charAt(pos++);
                }
                uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue);
                if(uniqueValue==0) {
                    return 0;
                }
                pos=(int)(uniqueValue>>>33);
                node=chars.charAt(pos++);
            } else if(node<kMinValueLead) {
                // linear-match node
                pos+=node-kMinLinearMatch+1// Ignore the match units.
                node=chars.charAt(pos++);
            } else {
                boolean isFinal=(node&kValueIsFinal)!=0;
                int value;
                if(isFinal) {
                    value=readValue(chars, pos, node&0x7fff);
                } else {
                    value=readNodeValue(chars, pos, node);
                }
                if(uniqueValue!=0) {
                    if(value!=(int)(uniqueValue>>1)) {
                        return 0;
                    }
                } else {
                    uniqueValue=((long)value<<1)|1;
                }
                if(isFinal) {
                    return uniqueValue;
                }
                pos=skipNodeValue(pos, node);
                node&=kNodeTypeMask;
            }
        }
    }

    // Helper functions for getNextChars().
    // getNextChars() when pos is on a branch node.
    private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
        while(length>kMaxBranchLinearSubNodeLength) {
            ++pos;  // ignore the comparison unit
            getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out);
            length=length-(length>>1);
            pos=skipDelta(chars, pos);
        }
        do {
            append(out, chars.charAt(pos++));
            pos=skipValue(chars, pos);
        } while(--length>1);
        append(out, chars.charAt(pos));
    }
    private static void append(Appendable out, int c) {
        try {
            out.append((char)c);
        } catch(IOException e) {
            throw new RuntimeException(e);
        }
    }

    // CharsTrie data structure
    //
    // The trie consists of a series of char-serialized nodes for incremental
    // Unicode string/char sequence matching. (char=16-bit unsigned integer)
    // The root node is at the beginning of the trie data.
    //
    // Types of nodes are distinguished by their node lead unit ranges.
    // After each node, except a final-value node, another node follows to
    // encode match values or continue matching further units.
    //
    // Node types:
    //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
    //    The value is for the string/char sequence so far.
    //  - Match node, optionally with an intermediate value in a different compact format.
    //    The value, if present, is for the string/char sequence so far.
    //
    //  Aside from the value, which uses the node lead unit's high bits:
    //
    //  - Linear-match node: Matches a number of units.
    //  - Branch node: Branches to other nodes according to the current input unit.
    //    The node unit is the length of the branch (number of units to select from)
    //    minus 1. It is followed by a sub-node:
    //    - If the length is at most kMaxBranchLinearSubNodeLength, then
    //      there are length-1 (key, value) pairs and then one more comparison unit.
    //      If one of the key units matches, then the value is either a final value for
    //      the string so far, or a "jump" delta to the next node.
    //      If the last unit matches, then matching continues with the next node.
    //      (Values have the same encoding as final-value nodes.)
    //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
    //      there is one unit and one "jump" delta.
    //      If the input unit is less than the sub-node unit, then "jump" by delta to
    //      the next sub-node which will have a length of length/2.
    //      (The delta has its own compact encoding.)
    //      Otherwise, skip the "jump" delta to the next sub-node
    //      which will have a length of length-length/2.

    // Match-node lead unit values, after masking off intermediate-value bits:

    // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
    // the length is one more than the next unit.

    // For a branch sub-node with at most this many entries, we drop down
    // to a linear search.
    /*package*/ static final int kMaxBranchLinearSubNodeLength=5;

    // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
    /*package*/ static final int kMinLinearMatch=0x30;
    /*package*/ static final int kMaxLinearMatchLength=0x10;

    // Match-node lead unit bits 14..6 for the optional intermediate value.
    // If these bits are 0, then there is no intermediate value.
    // Otherwise, see the *NodeValue* constants below.
    /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
    /*package*/ static final int kNodeTypeMask=kMinValueLead-1// 0x003f

    // A final-value node has bit 15 set.
    /*package*/ static final int kValueIsFinal=0x8000;

    // Compact value: After testing and masking off bit 15, use the following thresholds.
    /*package*/ static final int kMaxOneUnitValue=0x3fff;

    /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1// 0x4000
    /*package*/ static final int kThreeUnitValueLead=0x7fff;

    /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1// 0x3ffeffff

    // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
    /*package*/ static final int kMaxOneUnitNodeValue=0xff;
    /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6)// 0x4040
    /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;

    /*package*/ static final int kMaxTwoUnitNodeValue=
        ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1// 0xfdffff

    // Compact delta integers.
    /*package*/ static final int kMaxOneUnitDelta=0xfbff;
    /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1// 0xfc00
    /*package*/ static final int kThreeUnitDeltaLead=0xffff;

    /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1// 0x03feffff

    // Fixed value referencing the CharsTrie words.
    private CharSequence chars_;
    private int root_;

    // Iterator variables.

    // Pointer to next trie unit to read. NULL if no more matches.
    private int pos_;
    // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
    private int remainingMatchLength_;
}
TOP

Related Classes of com.ibm.icu.util.CharsTrie

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.