Package it.unimi.dsi.mg4j.index

Source Code of it.unimi.dsi.mg4j.index.BitStreamIndexReader



package it.unimi.dsi.mg4j.index;





/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2003-2010 Paolo Boldi and Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.ints.IntSet;
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.dsi.mg4j.index.AbstractIndexIterator;
import it.unimi.dsi.mg4j.index.AbstractIndexReader;
import it.unimi.dsi.mg4j.index.BitStreamIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.mg4j.search.IntervalIterator;
import it.unimi.dsi.mg4j.search.IntervalIterators;
import it.unimi.dsi.Util;


import it.unimi.dsi.bits.Fast;


import java.io.IOException;
import java.util.NoSuchElementException;

import org.apache.log4j.Logger;


/** A bitstream-based {@linkplain IndexReader index reader}. */


public class BitStreamIndexReader extends AbstractIndexReader {
@SuppressWarnings("unused")
private static final Logger LOGGER = Util.getLogger( BitStreamIndexReader.class );

/** The reference index. */
protected final BitStreamIndex index;

private final static boolean ASSERTS = false;
private final static boolean DEBUG = false;

/** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */
protected final BitStreamIndexReaderIndexIterator indexIterator;

/** Creates a new skip index reader, with the specified underlying {@link Index} and input bit stream.
   *
   * @param index the index.
   * @param ibs the underlying bit stream.
   */
public BitStreamIndexReader( final BitStreamIndex index, final InputBitStream ibs ) {
  this.index = index;
  // The iterator constructor reads parent.index, so the index field must be assigned before this line.
  this.indexIterator = new BitStreamIndexReaderIndexIterator( this, ibs );
}

protected static final class BitStreamIndexReaderIndexIterator extends AbstractIndexIterator implements IndexIterator {
  /** The enclosing instance. */
  private final BitStreamIndexReader parent;
  /** The reference index. */
  protected final BitStreamIndex index;
  /** The underlying input bit stream. */
  protected final InputBitStream ibs;
  /** The enclosed interval iterator. */
  private final IndexIntervalIterator intervalIterator;
  /** A singleton map associating the key index with the enclosed interval iterator. */
  private final Reference2ReferenceMap<Index,IntervalIterator> singletonIntervalIterator;
  /** The key index. */
  private final Index keyIndex;

  /** The cached copy of {@link #index index.hasPositions}. */
  protected final boolean hasPositions;
  /** The cached copy of {@link #index index.hasCounts}. */
  protected final boolean hasCounts;
  /** The cached copy of {@link #index index.hasPayloads}. */
  protected final boolean hasPayloads;
  /** Whether the underlying index has skips. */
  protected final boolean hasSkips;

  /** The cached copy of {@link #index index.pointerCoding}. */
  protected final Coding pointerCoding;

  /** The cached copy of {@link #index index.countCoding}. */
  protected final Coding countCoding;


  /** The cached copy of {@link #index index.positionCoding}. */
  protected final Coding positionCoding;


  /** The payload, in case the index of this reader has payloads, or <code>null</code>. */
  protected final Payload payload;


  /** The parameter <code>b</code> for Golomb coding of pointers. */
  protected int b;
  /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
  protected int log2b;

  /** The current term. */
  protected int currentTerm = -1;
  /** The current frequency. */
  protected int frequency;
  /** Whether the current term has pointers at all (this happens when the {@link #frequency} is smaller than the number of documents). */
  protected boolean hasPointers;
  /** The current count (if this index contains counts). */
  protected int count;
  /** The last document pointer we read from current list, -1 if we just read the frequency,
     * {@link Integer#MAX_VALUE} if we are beyond the end of list. */
  protected int currentDocument;
  /** The number of the document record we are going to read inside the current inverted list. */
  protected int numberOfDocumentRecord;
  /** This variable tracks the current state of the reader. */
  protected int state;


  /** Whether the index will use variable quanta. */
  private boolean variableQuanta;
    /** The parameter <code>h</code> (the maximum height of a skip tower). */
  public final int height;
  /** The quantum. */
  public int quantum;
  /** The bit mask giving the remainder of the division by {@link #quantum}. */
  public int quantumModuloMask;
  /** The shift giving result of the division by {@link #quantum}. */
  public int quantumDivisionShift;
  /** The maximum height of a skip tower in the current block. May be less than {@link #height} if the block is defective,
     * and will be -1 on defective quanta (no tower at all). */
  private int maxh;
  /** The maximum valid index of the current skip tower, if any. */
  private int s;
  /** The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If {@link #maxh} is negative, the value is undefined. */
  private int lowest;
  /** We have <var>w</var> = <var>Hq</var>. */
  private long w;
  /** The bit mask giving the remainder of the division by {@link #w}. */
  private long wModuloMask;
  /** The shift giving result of the division by {@link #w}. */
  private int wDivisionShift;
  /** The Golomb modulus for a top pointer skip, for each level. */
  private int[] towerTopB;
  /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */
  private int[] towerTopLog2B;
  /** The Golomb modulus for a lower pointer skip, for each level. */
  private int[] towerLowerB;
  /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */
  private int[] towerLowerLog2B;
  /** The prediction for a pointer skip, for each level. */
  private int[] pointerPrediction;
  /** An array to decode bit skips. */
  private long[] bitSkip;
  /** An array to decode the pointer skips. */
  private int[] pointerSkip;
  /** The number of bits read just after reading the last skip tower. */
  private long readBitsAtLastSkipTower;
  /** The document pointer corresponding to the last skip tower. */
  private int pointerAtLastSkipTower;
  /** The current quantum bit length, as provided by the index. */
  private int quantumBitLength;
  /** The current entry bit length, as provided by the index. */
  private int entryBitLength;
  /** This value of {@link #state} means that we are positioned just before a tower. */
  private static final int BEFORE_TOWER = 0;


  /** The initial size of {@link #positionCache}. */
  private static final int POSITION_CACHE_INITIAL_SIZE = 16;

  /** This value of {@link #state} can be assumed only in indices that contain a payload; it
     * means that we are positioned just before the payload for the current document record. */
  private static final int BEFORE_PAYLOAD = 1;

  /** This value of {@link #state} can be assumed only in indices that contain counts; it
     * means that we are positioned just before the count for the current document record. */
  private static final int BEFORE_COUNT = 2;

  /** This value of {@link #state} can be assumed only in indices that contain document positions;
     * it means that we are positioned just before the position list of the current document record. */
  private static final int BEFORE_POSITIONS = 3;

  /** This value of {@link #state} means that we are at the start of a new document record,
     * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} == {@link #frequency}),
     * in which case we are at the end of the inverted list, and {@link #endOfList()} is true. */
  private static final int BEFORE_POINTER = 4;

  /** The cached position array. */
  protected int[] positionCache;

  public BitStreamIndexReaderIndexIterator( final BitStreamIndexReader parent, final InputBitStream ibs ) {
   this.parent = parent;
   this.ibs = ibs;
   index = parent.index;
   keyIndex = index.keyIndex;
   pointerCoding = index.pointerCoding;


   hasPayloads = index.hasPayloads;
   payload = hasPayloads ? index.payload.copy() : null;
   hasCounts = index.hasCounts;
   countCoding = index.countCoding;
   hasPositions = index.hasPositions;
   positionCoding = index.positionCoding;
   if ( hasPositions ) positionCache = new int[ POSITION_CACHE_INITIAL_SIZE ];
   intervalIterator = index.hasPositions ? new IndexIntervalIterator() : null;
   singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps.singleton( keyIndex, (IntervalIterator)intervalIterator ) : null;
   if ( ( index.quantum == -1 ) != ( index.height == -1 ) ) throw new IllegalArgumentException();
   height = index.height;
   hasSkips = quantum != -1 && height != -1;
   if ( hasSkips ) {
    if ( ! ( variableQuanta = index.quantum == 0 ) ) {
     quantum = index.quantum;
     quantumModuloMask = quantum - 1;
     quantumDivisionShift = Fast.mostSignificantBit( quantum );
     w = ( 1L << height ) * quantum;
     wModuloMask = w - 1;
     wDivisionShift = Fast.mostSignificantBit( w );
    }
    bitSkip = new long[ height + 1 ];
    pointerSkip = new int[ height + 1 ];
    towerTopB = new int[ height + 1 ];
    towerTopLog2B = new int[ height + 1 ];
    towerLowerB = new int[ height + 1 ];
    towerLowerLog2B = new int[ height + 1 ];
    pointerPrediction = new int[ height + 1 ];
   }
   else {
    w = wModuloMask = quantumModuloMask = quantumDivisionShift = wDivisionShift = 0;
    bitSkip = null;
    pointerSkip = towerTopB = towerTopLog2B = towerLowerB = towerLowerLog2B = pointerPrediction = null;
   }
  }
  private void ensureHasPositions() {
   if ( ! hasPositions ) throw new UnsupportedOperationException( "Index " + index + " does not contain positions" );
  }
  /** Positions the index on the inverted list of a given term.
     *
     * <p>This method can be called at any time. Note that it is <em>always</em> possible
     * to call this method with argument 0, even if offsets have not been loaded.
     *
     * @param term a term.
     */
  protected void position( final int term ) throws IOException {
   if ( term == 0 ) {
    ibs.position( 0 );
    ibs.readBits( 0 );
   }
   else {
    if ( index.offsets == null ) throw new IllegalStateException( "You cannot position an index without offsets" );
    final long offset = index.offsets.getLong( term );
    ibs.position( offset );
    // TODO: Can't we set this to 0?
    ibs.readBits( offset );
   }
   currentTerm = term;
   readFrequency();
  }
  public int termNumber() {
   return currentTerm;
  }
  protected IndexIterator advance() throws IOException {
   if ( currentTerm == index.numberOfTerms - 1 ) return null;
   if ( currentTerm != -1 ) {
    skipTo( Integer.MAX_VALUE );
    nextDocument(); // This guarantees we have no garbage before the frequency
   }
   currentTerm++;
   readFrequency();
   return this;
  }
  private void readFrequency() throws IOException {
   // Read the frequency
   switch( index.frequencyCoding ) {
   case GAMMA:
    frequency = ibs.readGamma() + 1;
    break;
   case SHIFTED_GAMMA:
    frequency = ibs.readShiftedGamma() + 1;
    break;
   case DELTA:
    frequency = ibs.readDelta() + 1;
    break;
   default:
    throw new IllegalStateException( "The required frequency coding (" + index.frequencyCoding + ") is not supported." );
   }
   hasPointers = frequency < index.numberOfDocuments;
   // We compute the modulus used for pointer Golomb coding
   if ( pointerCoding == Coding.GOLOMB ) {
    if ( hasPointers ) {
     b = BitStreamIndex.golombModulus( frequency, index.numberOfDocuments );
     log2b = Fast.mostSignificantBit( b );
    }
   }
   if ( hasSkips ) {
    if ( variableQuanta ) {
     quantumDivisionShift = frequency > 1 ? ibs.readGamma() - 1 : -1;
     if ( quantumDivisionShift == -1 ) quantumDivisionShift = Fast.ceilLog2( frequency ) + 1;
     quantum = 1 << quantumDivisionShift;
     quantumModuloMask = quantum - 1;
     w = ( 1L << height ) * quantum;
     wModuloMask = w - 1;
     wDivisionShift = Fast.mostSignificantBit( w );
    }
    quantumBitLength = entryBitLength = -1;
    lowest = Integer.MAX_VALUE;
    if ( ASSERTS ) for( int i = height; i > Math.min( height, Fast.mostSignificantBit( frequency >> quantumDivisionShift ) ); i-- ) towerTopB[ i ] = towerLowerB[ i ] = pointerPrediction[ i ] = -1;
    final long pointerQuantumSigma = BitStreamIndex.quantumSigma( frequency, index.numberOfDocuments, quantum );
    for( int i = Math.min( height, Fast.mostSignificantBit( frequency >> quantumDivisionShift ) ); i >= 0; i-- ) {
     towerTopB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i + 1 );
     towerTopLog2B[ i ] = Fast.mostSignificantBit( towerTopB[ i ] );
     towerLowerB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i );
     towerLowerLog2B[ i ] = Fast.mostSignificantBit( towerLowerB[ i ] );
     pointerPrediction[ i ] = (int)( ( quantum * ( 1L << i ) * index.numberOfDocuments + frequency / 2 ) / frequency );
    }
   }
   count = -1;
   currentDocument = -1;
   numberOfDocumentRecord = -1;
   state = BEFORE_POINTER;
  }
  public Index index() {
   return keyIndex;
  }
  public int frequency() {
   return frequency;
  }
  private void ensureCurrentDocument() {
   if ( currentDocument < 0 ) throw new IllegalStateException( "nextDocument() has never been called for (term=" + currentTerm + ")" );
   if ( currentDocument == Integer.MAX_VALUE ) throw new IllegalStateException( "This reader is positioned beyond the end of list of (term=" + currentTerm + ")" );
  }
  /** Returns whether there are no more document records in the current inverted list.
     *
     * <p>This method returns true if the last document pointer of the current inverted
     * list has been read. It makes no distinction as to where (inside the last document
     * record) this reader is currently positioned. In particular, this method will
     * return true independently of whether count and positions have been read or not (we
     * note by passing that this is the only sensible behaviour, as you can build indices
     * with or without counts/positions).
     *
     * <p>This method will return true also when this reader is positioned <em>beyond</em>
     * the last document pointer. In this case, {@link #currentDocumentPointer()} will
     * return {@link Integer#MAX_VALUE}.
     *
     * @return true if there are no more document records in the current inverted list.
     */
  private boolean endOfList() {
   if ( ASSERTS ) assert numberOfDocumentRecord <= frequency;
   return numberOfDocumentRecord >= frequency - 1;
  }
  public int document() {
   if ( ASSERTS ) ensureCurrentDocument();
   return currentDocument;
  }
  public Payload payload() throws IOException {
   if ( DEBUG ) System.err.println( this + ".payload()" );
   if ( ASSERTS ) ensureCurrentDocument();
   if ( ! hasPayloads )
    throw new UnsupportedOperationException( "This index ("+ index + ") does not contain payloads" );
   if ( state <= BEFORE_PAYLOAD ) {
    if ( state == BEFORE_TOWER ) readTower();
    payload.read( ibs );
    state = hasCounts ? BEFORE_COUNT : BEFORE_POINTER;
   }
   return payload;
  }
  public int count() throws IOException {
   if ( DEBUG ) System.err.println( this + ".count()" );
   if ( count != -1 ) return count;
   if ( ASSERTS ) ensureCurrentDocument();
   if ( ! hasCounts )
    throw new UnsupportedOperationException( "This index (" + index + ") does not contain counts" );
   if ( state == BEFORE_TOWER ) readTower();
   if ( state == BEFORE_PAYLOAD ) payload.read( ibs );
  {
   if ( ASSERTS ) if ( state != BEFORE_COUNT ) throw new IllegalStateException();
   state = hasPositions ? BEFORE_POSITIONS : BEFORE_POINTER;
   switch( countCoding ) {
   case UNARY:
    count = ibs.readUnary() + 1;
    break;
   case SHIFTED_GAMMA:
    count = ibs.readShiftedGamma() + 1;
    break;
   case GAMMA:
    count = ibs.readGamma() + 1;
    break;
   case DELTA:
    count = ibs.readDelta() + 1;
    break;
   default: throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );
   }
  }
   return count;
  }
  /** We read positions, assuming state <= BEFORE_POSITIONS */
  protected void updatePositionCache() throws IOException {
   if ( ASSERTS ) assert state <= BEFORE_POSITIONS;
   if ( ! hasPositions )
    throw new UnsupportedOperationException( "Index " + index + " does not contain positions" );
   if ( state < BEFORE_POSITIONS ) {
    if ( state == BEFORE_TOWER ) readTower();
    if ( state == BEFORE_PAYLOAD ) payload.read( ibs );
    if ( state == BEFORE_COUNT )
  {
   if ( ASSERTS ) if ( state != BEFORE_COUNT ) throw new IllegalStateException();
   switch( countCoding ) {
   case UNARY:
    count = ibs.readUnary() + 1;
    break;
   case SHIFTED_GAMMA:
    count = ibs.readShiftedGamma() + 1;
    break;
   case GAMMA:
    count = ibs.readGamma() + 1;
    break;
   case DELTA:
    count = ibs.readDelta() + 1;
    break;
   default: throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );
   }
  }
   }
    if ( count > positionCache.length ) positionCache = new int[ Math.max( positionCache.length * 2, count ) ];
    final int[] occ = positionCache;
    state = BEFORE_POINTER;
    switch( positionCoding ) {
    case SHIFTED_GAMMA:
     ibs.readShiftedGammas( occ, count );
     for( int i = 1; i < count; i++ ) occ[ i ] += occ[ i - 1 ] + 1;
     return;
    case GAMMA:
     ibs.readGammas( occ, count );
     for( int i = 1; i < count; i++ ) occ[ i ] += occ[ i - 1 ] + 1;
     return;
    case DELTA:
     ibs.readDeltas( occ, count );
     for( int i = 1; i < count; i++ ) occ[ i ] += occ[ i - 1 ] + 1;
     return;
    case GOLOMB:
     if ( ASSERTS ) assert index.sizes != null;
     int docSize = index.sizes.getInt( currentDocument );
     if ( count < 3 ) for( int i = 0; i < count; i++ ) occ[ i ] = ibs.readMinimalBinary( docSize );
     else {
      final int bb = BitStreamIndex.golombModulus( count, docSize );
      int prev = -1;
      if ( bb != 0 ) {
       final int log2bb = Fast.mostSignificantBit( bb );
       for( int i = 0; i < count; i++ ) occ[ i ] = prev = ibs.readGolomb( bb, log2bb ) + prev + 1;
      }
      else for ( int i = 0; i < count; i++ ) occ[ i ] = i;
     }
     return;
    case SKEWED_GOLOMB:
     if ( ASSERTS ) assert index.sizes != null;
     int docSize2 = index.sizes.getInt( currentDocument );
     if ( count < 3 ) for( int i = 0; i < count; i++ ) occ[ i ] = ibs.readMinimalBinary( docSize2 );
     else {
      final int sb = ibs.readMinimalBinary( docSize2 ) + 1;
      int prev2 = -1;
      for( int i = 0; i < count; i++ ) occ[ i ] = prev2 = ibs.readSkewedGolomb( sb ) + prev2 + 1;
     }
     return;
    case INTERPOLATIVE:
     it.unimi.dsi.mg4j.io.InterpolativeCoding.read( ibs, occ, 0, count, 0, index.sizes.getInt( currentDocument ) - 1 );
     return;
    default:
     throw new IllegalStateException( "The required position coding (" + index.positionCoding + ") is not supported." );
    }
  }
  public IntIterator positions() throws IOException {
   if ( ASSERTS ) ensureCurrentDocument();
   if ( state <= BEFORE_POSITIONS ) updatePositionCache();
   return IntIterators.wrap( positionCache, 0, count );
  }
  public int[] positionArray() throws IOException {
   if ( ASSERTS ) ensureCurrentDocument();
   if ( state <= BEFORE_POSITIONS ) updatePositionCache();
   return positionCache;
  }
  // TODO: check who's using this (positionArray() is actually faster now)
  public int positions( final int[] position ) throws IOException {
   if ( ASSERTS ) ensureCurrentDocument();
   if ( state <= BEFORE_POSITIONS ) updatePositionCache(); // And also that positions have been read
   if ( position.length < count ) return -count;
   for( int i = count; i-- != 0; ) position[ i ] = this.positionCache[ i ];
   return count;
  }
  public int nextDocument() throws IOException {
   if ( DEBUG ) System.err.println( "{" + this + "} nextDocument()" );
   if ( state != BEFORE_POINTER ) {
    if ( state == BEFORE_TOWER ) readTower();
    if ( state == BEFORE_PAYLOAD ) payload.read( ibs );
    if ( state == BEFORE_COUNT )
  {
   if ( ASSERTS ) if ( state != BEFORE_COUNT ) throw new IllegalStateException();
   state = hasPositions ? BEFORE_POSITIONS : BEFORE_POINTER;
   switch( countCoding ) {
   case UNARY:
    count = ibs.readUnary() + 1;
    break;
   case SHIFTED_GAMMA:
    count = ibs.readShiftedGamma() + 1;
    break;
   case GAMMA:
    count = ibs.readGamma() + 1;
    break;
   case DELTA:
    count = ibs.readDelta() + 1;
    break;
   default: throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );
   }
  }
    if ( state == BEFORE_POSITIONS ) {
     // Here we just skip; note that the state change is necessary if endOfList() is true
     state = BEFORE_POINTER;
     switch( positionCoding ) {
     case SHIFTED_GAMMA:
      ibs.skipShiftedGammas( count );
      break;
     case GAMMA:
      ibs.skipGammas( count );
      break;
     case DELTA:
      ibs.skipDeltas( count );
    break;
     case GOLOMB:
      if ( ASSERTS ) assert index.sizes != null;
      int docSize = index.sizes.getInt( currentDocument );
      if ( count < 3 ) for( int i = 0; i < count; i++ ) ibs.readMinimalBinary( docSize );
      else {
       final int bb = BitStreamIndex.golombModulus( count, docSize );
       if ( bb != 0 ) {
        final int log2bb = Fast.mostSignificantBit( bb );
        for( int i = 0; i < count; i++ ) ibs.readGolomb( bb, log2bb );
       }
      }
      break;
     case SKEWED_GOLOMB:
      if ( ASSERTS ) assert index.sizes != null;
      docSize = index.sizes.getInt( currentDocument );
      if ( count < 3 ) for( int i = 0; i < count; i++ ) ibs.readMinimalBinary( docSize );
      else {
       final int sb = ibs.readMinimalBinary( docSize ) + 1;
       for( int i = 0; i < count; i++ ) ibs.readSkewedGolomb( sb );
      }
      break;
     case INTERPOLATIVE:
      it.unimi.dsi.mg4j.io.InterpolativeCoding.read( ibs, null, 0, count, 0, index.sizes.getInt( currentDocument ) - 1 );
      break;
     default:
      throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." );
     }
    }
   }
   if ( endOfList() ) return -1;
   if ( hasPointers ) {// We do not write pointers for everywhere occurring terms.
    switch( pointerCoding ) {
    case UNARY:
     currentDocument += ibs.readUnary() + 1;
     break;
    case SHIFTED_GAMMA:
     currentDocument += ibs.readShiftedGamma() + 1;
     break;
    case GAMMA:
     currentDocument += ibs.readGamma() + 1;
     break;
    case DELTA:
     currentDocument += ibs.readDelta() + 1;
     break;
    case GOLOMB:
     currentDocument += ibs.readGolomb( b, log2b ) + 1;
     break;
    default:
     throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );
    }
   }
   else currentDocument++;
   numberOfDocumentRecord++;
   if ( hasPayloads )
    state = BEFORE_PAYLOAD;
   else if ( hasCounts )
    state = BEFORE_COUNT;
   count = -1;
   if ( hasSkips && ( numberOfDocumentRecord & quantumModuloMask ) == 0 ) state = BEFORE_TOWER;
   return currentDocument;
  }
  /** Reads the entire skip tower for the current position.
     */
  private void readTower() throws IOException {
   // Passing -1 as target pointer guarantees that no entry of the tower is skipped.
   readTower( -1 );
  }
  /** Reads the skip tower for the current position, possibly skipping part of the tower.
     *
     * <P>Note that this method will update {@link #state} only if it reads the entire tower,
     * otherwise the state remains {@link #BEFORE_TOWER}.
     *
     * @param pointer the tower will be read up to the first entry smaller than or equal to this pointer; use
     * -1 to guarantee that the entire tower will be read.
     */
  private void readTower( final int pointer ) throws IOException {
   int i, j, k, cacheOffset, cache, towerLength = 0;
   long bitsAtTowerStart = 0;
   boolean truncated = false;
   if ( ASSERTS ) assert numberOfDocumentRecord % quantum == 0;
   if ( ASSERTS ) if ( state != BEFORE_TOWER ) throw new IllegalStateException( "readTower() called in state " + state );
   // Offset of the current record inside the current block of w records...
   cacheOffset = (int)( numberOfDocumentRecord & wModuloMask );
   // ...and index of the current quantum inside that block.
   k = cacheOffset >> quantumDivisionShift;
   if ( ASSERTS ) if ( k == 0 ) { // Invalidate current tower data
    it.unimi.dsi.fastutil.ints.IntArrays.fill( pointerSkip, Integer.MAX_VALUE );
    it.unimi.dsi.fastutil.longs.LongArrays.fill( bitSkip, Integer.MAX_VALUE );
   }
   // Compute the height of the current skip tower.
   s = ( k == 0 )? height : Fast.leastSignificantBit( k );
   // Number of document records from the start of the current block to the end of the list.
   cache = (int)( frequency - w * ( numberOfDocumentRecord >> wDivisionShift ) );
   if ( cache < w ) {
    // Fewer than w records are left from the start of the block, so the tower may have to be lowered.
    maxh = Fast.mostSignificantBit( ( cache >> quantumDivisionShift ) - k );
    if ( maxh < s ) {
     s = maxh;
     truncated = true;
    } else truncated = false;
   }
   else {
    cache = (int)w;
    maxh = height;
    truncated = k == 0;
   }
   //assert w == cache || k == 0 || lastMaxh == Fast.mostSignificantBit( k ^ ( cache/quantum ) )  : lastMaxh +","+ (Fast.mostSignificantBit( k ^ ( cache/quantum ) ));
   i = s;
   // A negative s means that there is no tower at all, so nothing is read from the stream.
   if ( s >= 0 ) {
    if ( k == 0 ) {
     // At the start of a block the bit lengths are stored: as absolute values the first time
     // in a list (they are reset to -1 when the frequency is read), as differences afterwards.
     if ( quantumBitLength < 0 ) {
      quantumBitLength = ibs.readDelta();
      entryBitLength = ibs.readDelta();
     }
     else {
      quantumBitLength += Fast.nat2int( ibs.readDelta() );
      entryBitLength += Fast.nat2int( ibs.readDelta() );
     }
     if ( DEBUG ) System.err.println( "{" + this + "} quantum bit length=" + quantumBitLength + " entry bit length=" + entryBitLength );
    }
    if ( DEBUG ) System.err.println( "{" + this + "} Reading tower; pointer=" + pointer + " maxh=" + maxh + " s=" + s );
    if ( s > 0 ) {
     // The tower length is stored as a correction to the estimate entryBitLength * ( s + 1 ).
     towerLength = entryBitLength * ( s + 1 ) + Fast.nat2int( ibs.readDelta() );
     if ( DEBUG ) System.err.println( "{" + this + "} Tower length=" + towerLength );
    }
    // We store the number of bits read at the start of the tower (just after the length).
    bitsAtTowerStart = ibs.readBits();
    if ( truncated ) {
     if ( DEBUG ) System.err.println( "{" + this + "} Truncated--reading tops" );
     // We read the tower top.
     pointerSkip[ s ] = Fast.nat2int( ibs.readGolomb( towerTopB[ s ], towerTopLog2B[ s ] ) ) + pointerPrediction[ s ];
     bitSkip[ s ] = quantumBitLength * ( 1 << s ) + entryBitLength * ( ( 1 << s + 1 ) - s - 2 ) + Fast.nat2int( ibs.readLongDelta() );
    }
    else {
     // We copy the tower top from the lowest inherited entry suitably updated.
     pointerSkip[ s ] = pointerSkip[ s + 1 ] - ( currentDocument - pointerAtLastSkipTower );
     bitSkip[ s ] = bitSkip[ s + 1 ] - ( bitsAtTowerStart - readBitsAtLastSkipTower ) - towerLength;
    }
    // We read the remaining part of the tower, at least until we point after pointer.
    if ( currentDocument + pointerSkip[ i ] > pointer ) {
     for( i = s - 1; i >= 0; i-- ) {
      // Each lower entry is stored as a correction to half of the entry above it.
      pointerSkip[ i ] = Fast.nat2int( ibs.readGolomb( towerLowerB[ i ], towerLowerLog2B[ i ] ) ) + pointerSkip[ i + 1 ] / 2;
      bitSkip[ i ] = ( bitSkip[ i + 1 ] - entryBitLength * ( i + 1 ) ) / 2 - Fast.nat2int( ibs.readLongDelta() );
      if ( DEBUG ) if ( currentDocument + pointerSkip[ i ] <= pointer ) System.err.println( "{" + this + "} stopping reading at i=" + i + " as currentDocument (" + currentDocument + ") plus pointer skip (" + pointerSkip[ i ] + ") is smaller than or equal target (" + pointer +")" );
      if ( currentDocument + pointerSkip[ i ] <= pointer ) break;
     }
    }
   }
   /* If we did not read the entire tower, we need to fix the skips we read (as they
       * are offsets from the *end* of the tower) and moreover we must make unusable the
       * rest of the tower (for asserts). */
   if ( i > 0 ) {
    final long fix = ibs.readBits() - bitsAtTowerStart;
    for( j = s; j >= i; j-- ) bitSkip[ j ] += towerLength - fix;
    if ( ASSERTS ) for( ; j >= 0; j-- ) pointerSkip[ j ] = Integer.MAX_VALUE;
   }
   else
    // The whole tower has been consumed (or there was none): we move on to the rest of the record.
    state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT : BEFORE_POINTER;
   // We update the inherited tower.
   final long deltaBits = ibs.readBits() - readBitsAtLastSkipTower;
   final int deltaPointers = currentDocument - pointerAtLastSkipTower;
   for( j = Fast.mostSignificantBit( k ^ ( cache >> quantumDivisionShift ) ); j >= s + 1; j-- ) {
    bitSkip[ j ] -= deltaBits;
    pointerSkip[ j ] -= deltaPointers;
   }
   // From now on, skips are relative to the position and the pointer recorded here.
   readBitsAtLastSkipTower = ibs.readBits();
   pointerAtLastSkipTower = currentDocument;
   lowest = i < 0 ? 0 : i;
   if ( DEBUG ) {
    System.err.println( "{" + this + "} " + "Computed skip tower (lowest: " + lowest + ") for document record number " + numberOfDocumentRecord + " (pointer " + currentDocument + ") from " + Math.max( i , 0 ) + ": " );
    System.err.print( "% " );
    for( j = 0; j <= s; j++ ) System.err.print( pointerSkip[ j ] + ":" + bitSkip[ j ] + " " );
    System.err.print( " [" );
    for( ; j <= height; j++ ) System.err.print( pointerSkip[ j ] + ":" + bitSkip[ j ] + " " );
    System.err.print( "]" );
    System.err.println();
   }
  }
  /*
    public int skip( final int n ) throws IOException {
      int i, k, cacheOffset, start = numberOfDocumentRecord, skip = 0;

      if ( DEBUG ) System.err.println( "{" + this + "} " + "Going to enter linear skip code with lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n + ", endOfList()=" + endOfList() );
      if ( n < 0 ) throw new IllegalArgumentException();
      if ( n == 0 ) return 0;
     
      // If we are just at the start of a list, let us read the first pointer.
      if ( numberOfDocumentRecord == -1 ) readDocumentPointer();
      if ( state == BEFORE_TOWER ) readTower( -1 );

      if ( DEBUG ) System.err.println( "{" + this + "} " + "Entering skip code with lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n + ", endOfList()=" + endOfList() );

      for(;;) {
        if ( DEBUG ) System.err.println( "{" + this + "} " + "In for loop, lastDoc=" + currentDocument + ", maxh=" + maxh + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n );

        cacheOffset = (int)( numberOfDocumentRecord & wModuloMask );
        k = cacheOffset >> quantumDivisionShift;

        if ( maxh < 0 ) break; // Defective quantum--no tower.

        for( i = Fast.mostSignificantBit( k ^ ( Math.min( w, frequency - numberOfDocumentRecord + cacheOffset ) >> quantumDivisionShift ) ); i >= 0; i-- )
          if ( ( skip = ( ( k & - ( 1 << i ) ) + ( 1 << i ) ) * quantum - cacheOffset ) <= n ) break;
       
        if ( i >= 0 ) {
          ibs.skip( bitSkip[ i ] - ( ibs.readBits() - readBitsAtLastSkipTower ) );
          state = BEFORE_TOWER;
          currentDocument = pointerSkip[ i ] + pointerAtLastSkipTower;
          numberOfDocumentRecord += skip;
          // If we skipped beyond the end of the list, we invalidate the current document.
          if ( numberOfDocumentRecord == frequency ) currentDocument = -1;
          readTower( -1 );
          count = -1; // We must invalidate count as readDocumentPointer() would do.
          if ( endOfList() ) return numberOfDocumentRecord - start;
        }
        else break;
      }

      if ( DEBUG ) System.err.println( "{" + this + "} " + "Completing sequentially, lastDoc=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", n=" + n );

      while( numberOfDocumentRecord - start < n ) {
        if ( endOfList() ) break;
        readDocumentPointer();
      }
      return numberOfDocumentRecord - start;
    }
  */
  /**
   * Advances this iterator until the current document pointer is at least <code>p</code>.
   *
   * <p>The method works in three steps:
   * <ol>
   * <li>if no pointer has been read yet (<code>numberOfDocumentRecord == -1</code>) the first one is
   * fetched with <code>nextDocument()</code>; if the current pointer is already at least
   * <code>p</code> it is returned immediately;
   * <li>if <code>hasSkips</code> is set, the skip tower is consulted repeatedly: at each round the
   * last scanned level whose target pointer (<code>pointerAtLastSkipTower + pointerSkip[ i ]</code>)
   * does not exceed <code>p</code> is chosen, the underlying bit stream is moved forward by the
   * matching bit skip, and a new tower is read;
   * <li>the remaining distance is covered by calling <code>nextDocument()</code> until the current
   * pointer is at least <code>p</code>.
   * </ol>
   *
   * @param p the document pointer to reach.
   * @return the value of <code>currentDocument</code> once it is at least <code>p</code>;
   * {@link Integer#MAX_VALUE} if <code>nextDocument()</code> returns -1 during the final sequential
   * scan; if <code>endOfList()</code> holds right after a tower jump, <code>Integer.MAX_VALUE</code>
   * when <code>p</code> is <code>Integer.MAX_VALUE</code> and <code>currentDocument</code> otherwise.
   * @throws IOException if reading or skipping on the underlying bit stream fails.
   */
  public int skipTo( final int p ) throws IOException {
   if ( DEBUG ) System.err.println( this + ".skipTo(" + p + ") [currentDocument=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", endOfList()=" + endOfList() );
   // If we are just at the start of a list, let us read the first pointer.
   if ( numberOfDocumentRecord == -1 ) nextDocument(); // TODO: shouldn't we just read the tower?
   // Nothing to do if we are already on or beyond the requested pointer.
   if ( currentDocument >= p ) {
    if ( DEBUG ) System.err.println( this + ": No skip necessary, returning " + currentDocument );
    return currentDocument;
   }
   if ( hasSkips ) {
    if ( state == BEFORE_TOWER ) readTower( p );
    // Local alias of the field, used throughout the loop below.
    final int[] pointerSkip = this.pointerSkip;
    for(;;) {
     if ( ASSERTS ) assert maxh < 0 || lowest > 0 || pointerSkip[ 0 ] != Integer.MAX_VALUE;
     // If on a defective quantum (no tower) or p is inside the current quantum (no need to scan the tower) we bail out.
     if ( maxh < 0 || lowest == 0 && pointerAtLastSkipTower + pointerSkip[ 0 ] > p ) break;
     if ( DEBUG ) System.err.println( this + ": In for loop, currentDocument=" + currentDocument + ", maxh=" + maxh + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", p=" + p );
     // Offset of the current record (masked with wModuloMask) and the index of its quantum.
     final int cacheOffset = (int)( numberOfDocumentRecord & wModuloMask );
     final int k = cacheOffset >> quantumDivisionShift;
     // Upper bound for the level scan below.
     // NOTE(review): presumably the highest tower level valid at this quantum; confirm against readTower().
     final int top = Fast.mostSignificantBit( k ^ ( Math.min( w, frequency - numberOfDocumentRecord + cacheOffset ) >> quantumDivisionShift ) );
     int i;
     // Scan levels upwards and stop at the first one whose target pointer exceeds p.
     for( i = lowest; i <= top; i++ ) {
      if ( ASSERTS ) if ( ( k & 1 << i ) != 0 ) assert pointerSkip[ i ] == pointerSkip[ i + 1 ];
      if ( ASSERTS ) assert pointerSkip[ i ] != Integer.MAX_VALUE : "Invalid pointer skip " + i + " (lowest=" + lowest + ", top=" + top + ")";
      if ( pointerAtLastSkipTower + pointerSkip[ i ] > p ) break;
     }
     // Step back to the last level whose target does not exceed p; a negative value means no level can be used.
     if ( --i < 0 ) break;
     if ( ASSERTS ) assert i >= lowest : i + " < " + lowest;
     if ( DEBUG ) System.err.println( this + ": Safely after for with i=" + i + ", P[i]=" + pointerSkip[i] + ", A[i]=" + bitSkip[i] );
     if ( DEBUG ) System.err.println( this + ": [" + ibs.readBits() + "] Skipping " + ( bitSkip[ i ] - ( ibs.readBits() - readBitsAtLastSkipTower ) ) + " bits (" + ( ( ( k & - ( 1 << i ) ) + ( 1 << i ) ) * quantum - cacheOffset ) + " records) to get to document pointer " + ( currentDocument + pointerSkip[ i ] ) );
     // Skip bitSkip[ i ] bits, less those already consumed since readBitsAtLastSkipTower was recorded.
     ibs.skip( bitSkip[ i ] - ( ibs.readBits() - readBitsAtLastSkipTower ) );
     state = BEFORE_TOWER;
     currentDocument = pointerSkip[ i ] + pointerAtLastSkipTower;
     // Advance the record counter by the number of records jumped over.
     numberOfDocumentRecord += ( ( k & - ( 1 << i ) ) + ( 1 << i ) ) * quantum - cacheOffset;
     // If we skipped beyond the end of the list, we invalidate the current document.
     if ( numberOfDocumentRecord == frequency ) {
      currentDocument = Integer.MAX_VALUE;
      state = BEFORE_POINTER; // We are actually before a frequency, but we must avoid that calls to nextDocument() read anything
     }
     else readTower( p ); // Note that if we are exactly on the destination pointer, we will read the entire tower.
     count = -1; // We must invalidate count as readDocumentPointer() would do.
     if ( endOfList() ) {
      if ( DEBUG ) System.err.println( this + ".toSkip(): end-of-list, returning " + currentDocument );
      // Note that if p == Integer.MAX_VALUE, we are certainly beyond end-of-list
      return p == Integer.MAX_VALUE ? Integer.MAX_VALUE : currentDocument;
     }
    }
    if ( DEBUG ) System.err.println( this + ": Completing sequentially, currentDocument=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", p=" + p );
   }
   // Cover the remaining distance one document at a time.
   while( currentDocument < p ) {
    if ( DEBUG ) System.err.println( this + ": Skipping sequentially (second), currentDocument=" + currentDocument + ", numberOfDocumentRecord=" + numberOfDocumentRecord + ", p=" + p );
    // nextDocument() returning -1 is treated as end of list.
    if ( nextDocument() == -1 ) {
     if ( DEBUG ) System.err.println( this + ": end-of-list, returning MAX_VALUE" );
     return Integer.MAX_VALUE;
    }
   }
   if ( DEBUG ) System.err.println( this + ".toSkip(): Returning " + currentDocument );
   return currentDocument;
  }
  /**
   * Disposes of this iterator by closing <code>parent</code>.
   *
   * @throws IOException if <code>parent.close()</code> fails.
   */
  public void dispose() throws IOException {
   parent.close();
  }
  public boolean hasNext() {
   return ! endOfList();
  }
  public int nextInt() {
   if ( ! hasNext() ) throw new NoSuchElementException();
   try {
    return nextDocument();
   }
   catch ( IOException e ) {
    throw new RuntimeException( e );
   }
  }
  public String toString() {
   return index + " [" + currentTerm + "]" + ( weight != 1 ? "{" + weight + "}" : "" );
  }
  /** An interval iterator returning the positions of the current document as singleton intervals. */
  private final class IndexIntervalIterator extends AbstractObjectIterator<Interval> implements IntervalIterator {
   int pos = -1;
   public void reset() throws IOException {
    pos = -1;
    if ( state <= BEFORE_POSITIONS ) updatePositionCache(); // This guarantees the position cache is ok
   }
   public void intervalTerms( final IntSet terms ) {
    terms.add( BitStreamIndexReaderIndexIterator.this.currentTerm );
   }
   public boolean hasNext() {
    return pos < count - 1;
   }
   public Interval next() {
    if ( ! hasNext() ) throw new NoSuchElementException();
    return Interval.valueOf( positionCache[ ++pos ] );
   }
   public Interval nextInterval() {
    return pos < count - 1 ? Interval.valueOf( positionCache[ ++pos ] ) : null;
   }
   public int extent() {
    return 1;
   }
   public String toString() {
    return index + ": " + term + "[doc=" + currentDocument + ", count=" + count + ", pos=" + pos + "]";
   }
  };
  public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
   intervalIterator();
   ensureHasPositions();
   return singletonIntervalIterator;
  }
  public IntervalIterator intervalIterator() throws IOException {
   return intervalIterator( keyIndex );
  }
  public IntervalIterator intervalIterator( final Index index ) throws IOException {
   if ( ASSERTS ) ensureCurrentDocument();
   if ( index != keyIndex || hasPayloads ) return IntervalIterators.TRUE;
   ensureHasPositions();
   if ( ASSERTS ) assert intervalIterator != null;
   intervalIterator.reset();
   return intervalIterator;
  }
  public ReferenceSet<Index> indices() {
   return index.singletonSet;
  }
}
private IndexIterator documents( final CharSequence term, final int termNumber ) throws IOException {
  indexIterator.term( term );
  indexIterator.position( termNumber );
  return indexIterator;
}
public IndexIterator documents( final int term ) throws IOException {
  return documents( null, term );
}
public IndexIterator documents( final CharSequence term ) throws IOException {
  if ( closed ) throw new IllegalStateException( "This " + getClass().getSimpleName() + " has been closed" );
  if ( index.termMap != null ) {
   final int termIndex = (int)index.termMap.getLong( term );
   if ( termIndex == -1 ) return index.getEmptyIndexIterator( term, termIndex );
   return documents( term, termIndex );
  }
  throw new UnsupportedOperationException( "Index " + index + " has no term map" );
}
  @Override
  public IndexIterator nextIterator() throws IOException {
   return indexIterator.advance();
  }
public String toString() {
  return getClass().getSimpleName() + "[" + index + "]";
}
public void close() throws IOException {
  super.close();
  indexIterator.ibs.close();
}
}
TOP

Related Classes of it.unimi.dsi.mg4j.index.BitStreamIndexReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.