Package it.unimi.dsi.io

Examples of it.unimi.dsi.io.InputBitStream


    this.address = address;
  }
 
  @Override
  public InputBitStream getInputBitStream( final int bufferSize ) throws IOException {
    return new InputBitStream( new RemoteInputStream( address ), bufferSize );
  }
View Full Code Here


      last = next;
      next = -1;

      try {
        // TODO: this is *very rough* and preliminary
        if ( index.hasPayloads ) payload.read( new InputBitStream( inputStream, 0 ) );
        if ( index.hasCounts ) {
          count = inputStream.readInt();
          if ( index.hasPositions ) {
            for ( int i = 0; i < count; i++ ) position[ i ] = inputStream.readInt();
            intervalIterator.reset();
View Full Code Here

    this.positions = positions;
  }

  @Override
  public InputBitStream getInputBitStream( int bufferSizeUnused ) {
    // The buffer-size argument is not used: the bit stream is built with the
    // single-argument constructor on whatever getInputStream() returns.
    return new InputBitStream( getInputStream() );
  }
View Full Code Here

    return index.copy();
  }

  @Override
  public InputBitStream getPositionsInputBitStream( int bufferSizeUnused ) throws IOException {
    // The buffer-size argument is not used: the bit stream is built with the
    // single-argument constructor on whatever getPositionsInputStream() returns.
    return new InputBitStream( getPositionsInputStream() );
  }
View Full Code Here

        final boolean hasPositions = completeness.compareTo( Completeness.POSITIONS ) >= 0;
        int count = -1, moreCount = -1;
       
        for ( int i = 0; i < numTerms; i++ ) {
          bapl = termMap.get( termArray[ i ] );
          final InputBitStream ibs = new InputBitStream( bapl.buffer );
          frequency = bapl.frequency; // This could be much more than the actual frequency in virtual indices

          // Calculate posting bit positions and corresponding pointers
          for ( int j = 0; j < frequency; j++ ) {
            bitPos[ j ] = ibs.readBits(); // Cache bit poisition
            pointer[ j ] = ibs.readDelta(); // Cache pointer
            if ( hasCounts ) count = ibs.readGamma() + 1;
            if ( hasPositions ) ibs.skipDeltas( count ); // Skip document positions
          }

          // Sort stably pointers and positions by increasing pointer
          it.unimi.dsi.fastutil.Arrays.quickSort( 0, frequency, new AbstractIntComparator() {
            public int compare( final int i0, final int i1 ) {
              final int t = pointer[ i0 ] - pointer[ i1 ];
              if ( t != 0 ) return t;
              final long u = bitPos[ i0 ] - bitPos[ i1 ]; // We need a stable sort
              return u < 0 ? -1 : u > 0 ? 1 : 0;
            }
          },
          new Swapper() {
            public void swap( final int i0, final int i1 ) {
              final long t = bitPos[ i0 ]; bitPos[ i0 ] = bitPos[ i1 ]; bitPos[ i1 ] = t;
              final int p = pointer[ i0 ]; pointer[ i0 ] = pointer[ i1 ]; pointer[ i1 ] = p;
            }
          } );

          int actualFrequency = frequency;
          // Compute actual frequency for virtual indices
          if ( indexingIsVirtual ) {
            actualFrequency = 1;
            for ( int j = 1; j < frequency; j++ ) if ( pointer[ j ] != pointer[ j - 1 ] ) actualFrequency++;
            if ( ASSERTS ) {
              for ( int j = 1; j < frequency; j++ ) {
                assert pointer[ j ] >= pointer[ j - 1 ];
                assert pointer[ j ] != pointer[ j - 1 ] || bitPos[ j ] > bitPos[ j - 1 ];
              }
            }
          }

          indexWriter.newInvertedList();
          indexWriter.writeFrequency( actualFrequency );

          int currPointer;
          for ( int j = 0; j < frequency; j++ ) {
            ibs.position( bitPos[ j ] );
            obs = indexWriter.newDocumentRecord();
            indexWriter.writeDocumentPointer( obs, currPointer = ibs.readDelta() );
            if ( ASSERTS ) assert currPointer == pointer[ j ];
            if ( hasCounts ) count = ibs.readGamma() + 1;
            if ( hasPositions ) {
              ibs.readDeltas( pos, count );
              for ( int p = 1; p < count; p++ ) pos[ p ] += pos[ p - 1 ] + 1;
            }

            if ( indexingIsVirtual ) {
              while( j < frequency - 1 ) {
                ibs.position( bitPos[ j + 1 ] );
                if ( currPointer != ibs.readDelta() ) break;
                j++;
                if ( hasCounts ) moreCount = ibs.readGamma() + 1;
                if ( hasPositions ) {
                  pos = IntArrays.grow( pos, count + moreCount, count );
                  pos[ count ] = ibs.readDelta();
                  if ( ASSERTS ) assert pos[ count ] > pos[ count - 1 ];
                  for ( int p = 1; p < moreCount; p++ ) pos[ count + p ] = pos[ count + p - 1 ] + 1 + ibs.readDelta();
                }
                count += moreCount;
              }
              if ( maxCount < count ) maxCount = count;
            }

            if ( hasCounts ) indexWriter.writePositionCount( obs, count );
            if ( hasPositions ) indexWriter.writeDocumentPositions( obs, pos, 0, count, -1 );
          }

          frequencies.writeGamma( actualFrequency );
          globCounts.writeLongGamma( bapl.globCount );
        }

        indexWriter.close();
        final Properties properties = indexWriter.properties();
        totPostings += properties.getLong( "postings" );
        properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
        properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
        properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
        properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
        if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
        properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

        if ( indexingIsRemapped ) {
          // We must permute sizes
          final int[] document = new int[ documentCount ], size = new int[ documentCount ];
          final InputBitStream sizes = new InputBitStream( batchBasename + DiskBasedIndex.SIZES_EXTENSION );
          for ( int i = 0; i < documentCount; i++ ) {
            document[ i ] = sizes.readGamma();
            size[ i ] = sizes.readGamma();
          }
          sizes.close();
         
          it.unimi.dsi.fastutil.Arrays.quickSort( 0, documentCount, new AbstractIntComparator() {
            public int compare( int x, int y ) {
              return document[ x ] - document[ y ];
            }
          }, new Swapper() {
            public void swap( int x, int y ) {
              int t = document[ x ];
              document[ x ] = document[ y ];
              document[ y ] = t;
              t = size[ x ];
              size[ x ] = size[ y ];
              size[ y ] = t;
            }
          } );


          final OutputBitStream permutedSizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
          for ( int i = 0, d = 0; i < documentCount; i++ ) {
            while ( d++ < document[ i ] )
              permutedSizes.writeGamma( 0 );
            permutedSizes.writeGamma( size[ i ] );
          }
          permutedSizes.close();
        }
      }
     
      if ( indexingIsVirtual ) {
        final OutputBitStream sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
        for ( int i = 0; i < currSize.length; i++ ) sizes.writeGamma( currSize[ i ] );
        sizes.close();
      }

      globCounts.close();
      frequencies.close();
      termMap.clear();
View Full Code Here

   * @param obs an output bit stream.
   * @param bitLength the number of bits to be scanned.
   * @throws IOException
   */
  public void stripPointers( final OutputBitStream obs, final long bitLength ) throws IOException {
    final InputBitStream ibs = new InputBitStream( buffer );
    int count;
    while( ibs.readBits() < bitLength ) {
      ibs.readDelta(); // Discard pointer
      if ( completeness >= COUNTS.ordinal() ) {
        count = ibs.readGamma() + 1;
        obs.writeGamma( count - 1 );
        if ( completeness >= POSITIONS.ordinal() ) while( count-- != 0 ) obs.writeDelta( ibs.readDelta() );
      }
    }
  }
View Full Code Here

 
  private void partitionSizes() throws IOException {     
    final File sizesFile = new File( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
    if ( sizesFile.exists() ) {
      LOGGER.info( "Partitioning sizes..." );
      final InputBitStream sizes = new InputBitStream ( sizesFile );
      final OutputBitStream localSizes[] = new OutputBitStream[ numIndices ];
      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );

      // ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
      int size, localIndex;
      if ( globalIndex.numberOfDocuments == strategy.numberOfDocuments( 0 ) ) {
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
          localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
          if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
          for( int l = numIndices; l-- != 0; ) if ( l != localIndex ) localSizes[ l ].writeGamma( 0 );
        }
      }
      else {
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
          localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
          if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
        }
      }

      sizes.close();
      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
    }
  }
View Full Code Here

    IndexIterator indexIterator;
   
    for ( int i = 0; i < numIndices; i++ ) {
      tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
      temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
      direct[ i ] = new InputBitStream( temp[ i ].buffer() );
      indirect[ i ] = new InputBitStream( tempFile[ i ] );
      if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
    }
    int usedIndices;
    MutableString currentTerm = new MutableString();
    Payload payload = null;
    int frequency, globalPointer, localIndex, localPointer, count = -1;

    pl.expectedUpdates = globalIndex.numberOfPostings;
    pl.itemsName = "postings";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
      terms.readLine( currentTerm );
      indexIterator = indexReader.nextIterator();
      usedIndices = 0;
      frequency = indexIterator.frequency();
     
      for ( int j = 0; j < frequency; j++ ) {
        globalPointer = indexIterator.nextDocument();               
        localIndex = strategy.localIndex( globalPointer )

        if ( localFrequency[ localIndex ] == 0 ) {
          // First time we see a document for this index.
          currentTerm.println( localTerms[ localIndex ] );
          numTerms[ localIndex ]++;
          usedIndex[ usedIndices++ ] = localIndex;
          if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
        }
       
        /* Store temporarily posting data; note that we save the global pointer as we
         * will have to access the size list. */
       
        localFrequency[ localIndex ]++;
        numPostings[ localIndex ]++;
        temp[ localIndex ].writeGamma( globalPointer );

        if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
        if ( havePayloads ) payload.write( temp[ localIndex ] );
       
        if ( haveCounts ) {
          count = indexIterator.count();
          temp[ localIndex ].writeGamma( count );
          globCount[ localIndex ] += count;       
          if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;        
          if ( havePositions ) {
            final int[] pos = indexIterator.positionArray();
            // TODO: compress this stuff
            for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
          }
        }
      }
     
      // We now run through the indices used by this term and copy from the temporary buffer.

      OutputBitStream obs;
     
      for( int k = 0; k < usedIndices; k++ ) {
        final int i = usedIndex[ k ];

        localFrequencies[ i ].writeGamma( localFrequency[ i ] );
        if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
        if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
        globCount[ i ] = 0;
       
        InputBitStream ibs;
        indexWriter[ i ].newInvertedList();

        temp[ i ].align();
        if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
        else {
          // We cannot read directly from the internal buffer.
          ibs = indirect[ i ];
          ibs.flush();
          temp[ i ].flush();
        }

        ibs.position( 0 );
         
        indexWriter[ i ].writeFrequency( localFrequency[ i ] );
        for( int j = 0; j < localFrequency[ i ]; j++ ) {
          obs = indexWriter[ i ].newDocumentRecord();
          globalPointer = ibs.readGamma();
          localPointer = strategy.localPointer( globalPointer )
          indexWriter[ i ].writeDocumentPointer( obs, localPointer );
          if ( havePayloads ) {
            payload.read( ibs );
            indexWriter[ i ].writePayload( obs, payload );
          }
          if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
          if ( havePositions ) {
            for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
            indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
          }
         
        }
        temp[ i ].position( 0 );
View Full Code Here

   * an additional final element of index <code>T</code> that gives the number
   * of bytes of the index file.
   */

  public static LongList readOffsets( final CharSequence filename, final int T ) throws IOException {
    final InputBitStream in = new InputBitStream( filename.toString() );
    final long[] offset = new long[ T + 1 ];
    LOGGER.debug( "Loading offsets..." );
    offset[ 0 ] = in.readLongGamma();
    for( int i = 0; i < T; i++ ) offset[ i + 1 ] = in.readLongGamma() + offset[ i ];
    LOGGER.debug( "Completed." );
    in.close();
    return LongArrayList.wrap( offset );
  }
View Full Code Here

   * @return a list of integers backed by an array.
   */

  public static IntList readSizes( final CharSequence filename, final int N ) throws IOException {
    final int[] size = new int[ N ];
    final InputBitStream in = new InputBitStream( filename.toString() );
    LOGGER.debug( "Loading sizes..." );
    in.readGammas( size, N );     
    LOGGER.debug( "Completed." );
    in.close();
    return IntArrayList.wrap( size );
  }
View Full Code Here

TOP

Related Classes of it.unimi.dsi.io.InputBitStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.