Examples of ProgressLogger


Examples of it.unimi.dsi.logging.ProgressLogger
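The snippets below all follow the same lifecycle: construct the logger, set expectedUpdates and itemsName, call start(), advance the counter with update() or lightUpdate() inside the main loop, and close with done(). Here is a minimal self-contained sketch of that pattern; the class name, the SLF4J logger setup (as in recent dsiutils releases), and the item count are illustrative assumptions, not code taken from the examples:

  import it.unimi.dsi.logging.ProgressLogger;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;

  public class ProgressLoggerSketch {
    private static final Logger LOGGER = LoggerFactory.getLogger( ProgressLoggerSketch.class );

    public static void main( String[] args ) {
      final int numItems = 1000000; // illustrative workload size
      final ProgressLogger pl = new ProgressLogger( LOGGER );
      pl.expectedUpdates = numItems; // enables percentage and time-to-completion estimates
      pl.itemsName = "items";        // used when reporting the processing rate
      pl.start( "Processing items..." );
      for( int i = 0; i < numItems; i++ ) {
        // ... per-item work would go here ...
        pl.lightUpdate();            // low-overhead counter increment for tight loops
      }
      pl.done();                     // logs total elapsed time and average speed
    }
  }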

      for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
    }
  }
 
  public void run() throws Exception {
    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
    final IntList sizeList = globalIndex.sizes;
    partitionSizes();
   
    final int[] position = new int[ globalIndex.maxCount ];
    final int[] localFrequency = new int[ numIndices ];
    final int[] usedIndex = new int[ numIndices ];
    final InputBitStream[] direct = new InputBitStream[ numIndices ];
    final InputBitStream[] indirect = new InputBitStream[ numIndices ];
    final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[ numIndices ] : null;
    final File[] tempFile = new File[ numIndices ];
    final CachingOutputBitStream[] temp = new CachingOutputBitStream[ numIndices ];
    IndexIterator indexIterator;
   
    for ( int i = 0; i < numIndices; i++ ) {
      tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
      temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
      direct[ i ] = new InputBitStream( temp[ i ].buffer() );
      indirect[ i ] = new InputBitStream( tempFile[ i ] );
      if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
    }
    int usedIndices;
    MutableString currentTerm = new MutableString();
    Payload payload = null;
    int frequency, globalPointer, localIndex, localPointer, count = -1;

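    // Configure the logger before start(): expectedUpdates enables percentage/ETA reporting and itemsName labels the logged rate.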
    pl.expectedUpdates = globalIndex.numberOfPostings;
    pl.itemsName = "postings";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
      terms.readLine( currentTerm );
      indexIterator = indexReader.nextIterator();
      usedIndices = 0;
      frequency = indexIterator.frequency();
     
      for ( int j = 0; j < frequency; j++ ) {
        globalPointer = indexIterator.nextDocument();               
        localIndex = strategy.localIndex( globalPointer );

        if ( localFrequency[ localIndex ] == 0 ) {
          // First time we see a document for this index.
          currentTerm.println( localTerms[ localIndex ] );
          numTerms[ localIndex ]++;
          usedIndex[ usedIndices++ ] = localIndex;
          if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
        }
       
        /* Store temporarily posting data; note that we save the global pointer as we
         * will have to access the size list. */
       
        localFrequency[ localIndex ]++;
        numPostings[ localIndex ]++;
        temp[ localIndex ].writeGamma( globalPointer );

        if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
        if ( havePayloads ) payload.write( temp[ localIndex ] );
       
        if ( haveCounts ) {
          count = indexIterator.count();
          temp[ localIndex ].writeGamma( count );
          globCount[ localIndex ] += count;       
          if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;        
          if ( havePositions ) {
            final int[] pos = indexIterator.positionArray();
            // TODO: compress this stuff
            for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
          }
        }
      }
     
      // We now run through the indices used by this term and copy from the temporary buffer.

      OutputBitStream obs;
     
      for( int k = 0; k < usedIndices; k++ ) {
        final int i = usedIndex[ k ];

        localFrequencies[ i ].writeGamma( localFrequency[ i ] );
        if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
        if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
        globCount[ i ] = 0;
       
        InputBitStream ibs;
        indexWriter[ i ].newInvertedList();

        temp[ i ].align();
        if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
        else {
          // We cannot read directly from the internal buffer.
          ibs = indirect[ i ];
          ibs.flush();
          temp[ i ].flush();
        }

        ibs.position( 0 );
         
        indexWriter[ i ].writeFrequency( localFrequency[ i ] );
        for( int j = 0; j < localFrequency[ i ]; j++ ) {
          obs = indexWriter[ i ].newDocumentRecord();
          globalPointer = ibs.readGamma();
          localPointer = strategy.localPointer( globalPointer );
          indexWriter[ i ].writeDocumentPointer( obs, localPointer );
          if ( havePayloads ) {
            payload.read( ibs );
            indexWriter[ i ].writePayload( obs, payload );
          }
          if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
          if ( havePositions ) {
            for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
            indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
          }
         
        }
        temp[ i ].position( 0 );
        temp[ i ].writtenBits( 0 );
        localFrequency[ i ] = 0;
      }
     
      usedIndices = 0;
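      // update() increments the counter by one, so the remaining frequency - 1 postings for this term are added to pl.count directly.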
      pl.count += frequency - 1;
      pl.update();
    }

    pl.done();

    Properties globalProperties = new Properties();
    globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
   

Examples of it.unimi.dsi.logging.ProgressLogger

    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
   
    MutableString s = new MutableString();

    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
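    // The three-argument constructor takes the logger, the minimum interval between log lines, and the name of the logged items.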
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
   
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
        s.replace( document.uri() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( uriStream );
        uriStream.write( '\n' );
      }
      if ( titleStream != null ) {
        s.replace( document.title() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( titleStream );
        titleStream.write( '\n' );
      }
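      // lightUpdate() is the low-overhead variant of update(), intended for tight loops: it checks the clock only occasionally.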
      progressLogger.lightUpdate();
    }
   
    progressLogger.done();
    if ( uriStream != null ) uriStream.close();
    if ( titleStream != null ) titleStream.close();
  }

Examples of it.unimi.dsi.logging.ProgressLogger

    if ( uniqueURIs ) filter = new BloomFilter( jsapResult.getInt( "uniqueUris" ) );
   
    final Collection<? extends CharSequence> collection;
    if ( termFile == null ) {
      ArrayList<MutableString> termList = new ArrayList<MutableString>();
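      // A ProgressLogger built with the no-argument constructor uses default settings; here it tracks lines read from standard input.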
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "URIs";
      final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );
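      // The logger passed to LineIterator is updated as lines are read, so the loop below needs no explicit update() calls.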
     
      pl.start( "Reading URIs..." );
      MutableString uri;
      while( termIterator.hasNext() ) {
        uri = termIterator.next();
        if ( uniqueURIs ) makeUnique( filter, uri );
        termList.add( uri.copy() );
      }
      pl.done();
     
      collection = termList;
    }
    else {
      if ( uniqueURIs ) {
        // Create temporary file with unique URIs
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        pl.start( "Copying URIs..." );
        final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ) ), bufferSize ), pl );
        File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
        temp.deleteOnExit();
        termFile = temp.toString();
        final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( termFile ), bufferSize );
        MutableString uri;
        while( termIterator.hasNext() ) {
          uri = termIterator.next();
          makeUnique( filter, uri );
          uri.writeUTF8( outputStream );
          outputStream.write( '\n' );
        }
        pl.done();
        outputStream.close();
      }
      collection = new FileLinesCollection( termFile, "UTF-8" );
    }
    LOGGER.debug( "Building function..." );

Examples of it.unimi.dsi.logging.ProgressLogger

    final boolean isVirtual = jsapResult.getBoolean( "virtual" );

    int i, t = 0;

    final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );
    final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];
    final int numberOfTerms[] = new int[ indexedField.length ];
    final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ indexedField.length ];
    final IndexReader[] indexReader = new IndexReader[ index.length ];
    final InputBitStream[] frequencies = new InputBitStream[ index.length ];
    final int[][] count = new int[ index.length ][];
    final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;
    final int[][] occ = new int[ index.length ][];
    final int[][] wordInPos = new int[ index.length ][];
    final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];
    int totalTerms = 0;
   
    boolean allBitStreamIndices = true;
   
    for( i = 0; i < index.length; i++ ) {
      final String basenameField = basename + (stem ? "-" + factory.fieldName( indexedField[ i ] ) : "" );
      index[ i ] = Index.getInstance( basenameField );
      if ( ! ( index[ i ] instanceof BitStreamIndex ) ) allBitStreamIndices = false;
     
      if ( termLists ) {
        terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( basenameField + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).allLines() );
        numberOfTerms[ i ] = terms[ i ].size();
      }
      else numberOfTerms[ i ] = index[ i ].numberOfTerms;
      totalTerms += numberOfTerms[ i ];
     
      // This will be matched with the number of occurrences per document
      count[ i ] = new int[ index[ i ].numberOfDocuments ];

      occ[ i ] = index[ i ].maxCount > 0 ? new int[ index[ i ].maxCount ] : IntArrays.EMPTY_ARRAY;
      wordInPos[ i ] = new int[ Math.max( 0, index[ i ].properties.getInt( Index.PropertyKeys.MAXDOCSIZE ) ) ];
      indexReader[ i ] = index[ i ].getReader();
     
      if ( new File( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION ).exists() ) frequencies[ i ] = new InputBitStream( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION );
      termsInDoc[ i ] = new Int2IntOpenHashMap();
    }


    int currDoc = 0,
    // Term position in the current document.
    pos = 0, f = 0, p;

    pl.itemsName = "lists";
    pl.expectedUpdates = totalTerms;
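    // The same ProgressLogger is restarted for each verification pass below; expectedUpdates is re-assigned before every start().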
   
    int indexFrequency = -1;
   
    // Sequential scan
    if ( !jsapResult.getBoolean( "noSeq" ) ) {
      try {
        for ( i = 0; i < index.length; i++ ) {
          int numberOfPostings = 0;
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start( "Verifying sequentially index " + index[ i ] + "..." );

          if ( allBitStreamIndices ) {
            for ( t = 0; t < numberOfTerms[ i ]; t++ ) {
              pl.update();
              IndexIterator indexIterator = indexReader[ i ].nextIterator();
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {
                System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );
                return;
              }

              while ( indexFrequency-- != 0 ) {
                p = indexIterator.nextDocument();
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
                if (index[i].hasPositions) indexIterator.positionArray(); // Just to force reading in high-performance indices
              }
              if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );
            }
           
            // Check document sizes
            if ( ! isVirtual && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts )
              for ( p = 0; p < index[ i ].numberOfDocuments; p++ )
                if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] )
                  System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );
           
          }
          else { // Non-bitstream indices
            for (t = 0; t < numberOfTerms[ i ]; t++) {
              pl.update();
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              indexFrequency = indexIterator.frequency();
              numberOfPostings += indexFrequency;
              if (frequencies[i] != null && indexFrequency != (f = frequencies[i].readGamma())) {
                System.err.println("Error in frequency for term " + t
                    + ": expected " + f + " documents, found "
                    + indexFrequency);
                return;
              }
             
              int prevp = -1;
              while (indexFrequency-- != 0) {
                p = indexIterator.nextDocument();
                if ( prevp >= p ) throw new AssertionError( "previous pointer: " + prevp + "; current pointer: " + p );
                prevp = p;
                if (index[i].hasCounts) count[i][p] += indexIterator.count();
              }
            }
          }
          pl.done();
         
          if ( ! isVirtual && numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );
          long numberOfOccurrences = 0;
          if ( index[ i ].hasCounts ) {
            for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];
            if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );
          }
        }
      } catch ( Exception e ) {
        System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );
        System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );
        throw e;
      }
    }
 
    IntArrayList l = new IntArrayList();
    ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();
   
    if ( ! jsapResult.getBoolean( "noSkip" ) ) {
      int start = 0, end = 0, result;
      try {
        for (i = 0; i < index.length; i++) {
         
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying all skips in " + index[i] + "...");

          for (t = 0; t < numberOfTerms[ i ]; t++) {
            l.clear();
            positions.clear();
            IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
            int d;
            while( ( d = documents.nextDocument() ) != -1 ) {
              l.add( d );
              if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );
            }
           
            for( start = 0; start < l.size(); start++ ) {
              for( end = start + 1; end < l.size(); end++ ) {
                IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
               
                result = indexIterator.skipTo( l.getInt( start ) );
                if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
                result = indexIterator.skipTo( l.getInt( end ) );
                if ( indexIterator.document() != l.getInt( end ) || result != l.getInt( end ) ) throw new AssertionError( "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + start + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
               
                if ( index[ i ].hasPositions ) {
                  // This catches wrong state reconstruction after skips.
                  indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
                  indexIterator.skipTo( l.getInt( start ) );
                  if ( indexIterator.document() != l.getInt( start ) ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( start ) );
                  if ( indexIterator.count() != positions.get( start ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( start ).length );
                  if ( ! Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                  indexIterator.skipTo( l.getInt( end ) );
                  if ( indexIterator.document() != l.getInt( end )  ) throw new AssertionError(indexIterator.document() + " != " + l.getInt( end ) );
                  if ( indexIterator.count() != positions.get( end ).length ) throw new AssertionError(indexIterator.count() + " != " + positions.get( end ).length );
                  if ( ! Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )
                     ) throw new AssertionError(Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) ) );
                }
               
              }
             
              IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
             
              result = indexIterator.skipTo( l.getInt( start ) );
              if ( indexIterator.document() != l.getInt( start ) || result != l.getInt( start ) ) throw new AssertionError("Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")" );
              result = indexIterator.skipTo( Integer.MAX_VALUE );
              if ( indexIterator.hasNext() || result != Integer.MAX_VALUE ) throw new AssertionError("Trying to skip beyond end of list (term " + t + ") after a skip to " + start + " returned " + result + " (hasNext()=" + indexIterator.hasNext() + ")" );
             
             
            }
            pl.update();
          }
          pl.done();
        }
      }
      catch( Throwable e  ) {
        System.err.println( "Exception during all-skip test (index=" + index[ i ] + ", term=" + t + ", start=" + start + ", end=" + end + ")" );
        throw e;
      }
     }
   

    if ( ! jsapResult.getBoolean( "noComp" ) ) {
      IndexReader additionalReader;
      IntLinkedOpenHashSet s0 = new IntLinkedOpenHashSet();
      IntOpenHashSet s1 = new IntOpenHashSet();
      IntAVLTreeSet s2 = new IntAVLTreeSet();
      IntIterator it;
      IndexIterator indexIterator, additionalIterator;
      it.unimi.dsi.mg4j.search.DocumentIterator documentIterator;
      int u = 0;
     
      try {
        for (i = 0; i < index.length; i++) {
          pl.expectedUpdates = numberOfTerms[ i ];
          pl.start("Verifying composite iterators in " + index[i] + "...");
          additionalReader = index[ i ].getReader();
         
          for (t = 0; t < numberOfTerms[ i ]; t++) {
            for (u = 0; u < numberOfTerms[ i ]; u++) {
              s0.clear();
              s1.clear();
              // TODO: in case we have positions, we should check them, too
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s0 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s1 );
              s0.retainAll( s1 );
              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );
              it = s0.iterator();
              documentIterator = AndDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s0.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();

              s2.clear();
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t ), s2 );
              IntIterators.pour( termLists ? indexReader[ i ].documents( terms[ i ].get( u ) ) : indexReader[ i ].documents( u ), s2 );

              indexIterator =  termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );
              additionalIterator = termLists ? additionalReader.documents( terms[ i ].get( u ) ) : additionalReader.documents( u );

              it = s2.iterator();
              documentIterator = OrDocumentIterator.getInstance( indexIterator, additionalIterator );
              for( int j = s2.size(); j-- != 0; ) if ( it.nextInt() != documentIterator.nextDocument() ) throw new AssertionError();
              if ( documentIterator.hasNext() ) throw new AssertionError();
            }

            pl.update();
          }
          pl.done();
          additionalReader.close();
        }
      }
      catch( Throwable e ) {
        System.err.println( "Exception during composite iterator test (index=" + index[ i ] + ", first term=" + t + ", second term =" + u + ")" );
        throw e;
      }
    }
   
    if ( ! isVirtual && jsapResult.getBoolean( "random" ) ) {
     
      // Random access scan
      pl.expectedUpdates = index[ 0 ].numberOfDocuments;
      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );

      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }

              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();

                  IndexIterator indexIterator = indexReader[ i ].documents( t );

                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ][ j ] + ") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );

                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ][ j ] + ") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              pos = 0;
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                IndexIterator indexIterator = indexReader[ i ].documents( word );
                if ( currDoc != indexIterator.skipTo( currDoc ) )
                  LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + word );
                else if ( index[ i ].hasPositions ) {
                  indexIterator.positions( occ[ i ] );
                  if ( IntArrayList.wrap( occ[ i ], indexIterator.count() ).indexOf( pos ) == -1 )
                    LOGGER.error( index[ i ] + ": Position " + pos + " does not appear in the position list of term " + word + " in document " + currDoc );
                }
                pos++;
              }
            }
          }
          document.close();
          pl.update();
          docCounter++;
        }
      }

      pl.done();
    }
   
    for( IndexReader ir : indexReader ) ir.close();
  }

Examples of it.unimi.dsi.logging.ProgressLogger

    this.factory = factory;
    this.bufferSize = bufferSize;
    this.descriptors = new ObjectArrayList<TRECDocumentDescriptor>();
    this.useGzip = useGzip;

    final ProgressLogger progressLogger = new ProgressLogger( LOGGER );
    progressLogger.expectedUpdates = file.length;
    progressLogger.itemsName = "files";

    progressLogger.start( "Parsing " + ( useGzip ? "GZip" : "plain" ) + " files" );

    for ( int i = 0; i < file.length; i++ ) {
      parseContent( i, openFileStream( file[ i ] ) );
      progressLogger.update();
    }

    progressLogger.done();
  }

Examples of it.unimi.dsi.logging.ProgressLogger

    LongArrayList p = new LongArrayList();
    pointers = new ObjectArrayList<EliasFanoMonotoneLongBigList>( file.length );
    firstDocument = new int[ file.length + 1 ];
    int count = 0;
   
    final ProgressLogger pl = new ProgressLogger( LOGGER );
    pl.expectedUpdates = file.length;
    pl.itemsName = "files";
    pl.start( "Scanning files..." );
   
    // Scan files and retrieve page pointers
    for( String f : file ) {
      p.clear();
      final FastBufferedInputStream fbis = gzipped ? new FastBufferedInputStream( new GZIPInputStream( new FileInputStream( f ) ) ) : new FastBufferedInputStream( new FileInputStream( f ) );
      long position;
      for(;;) {
        position = fbis.position();
        if ( readLine( fbis ) == -1 ) break;
        if ( startsWith( lineBuffer, DOC_MARKER ) ) p.add( position );
        if ( phrase && startsWith( lineBuffer, SENTENCE_MARKER ) ) p.add( position );
      }
     
      count += p.size();
      p.add( fbis.position() );
      fbis.close();
     
      pointers.add( new EliasFanoMonotoneLongBigList( p ) );
      firstDocument[ pointers.size() ] = count;
     
      pl.update();
    }
   
    pl.done();
   
    size = count;
  }

Examples of it.unimi.dsi.logging.ProgressLogger

    );
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    DocumentSequence documentSequence = Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );
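    // This variant supplies the items name ("documents") and relies on the default log interval.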
    final ProgressLogger progressLogger = new ProgressLogger( LOGGER, "documents" );
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    final ZipDocumentCollectionBuilder zipDocumentCollectionBuilder = new ZipDocumentCollectionBuilder( jsapResult.getString( "basename" ), documentSequence.factory(), !jsapResult.getBoolean( "approximated") );
    zipDocumentCollectionBuilder.open( "" );
    zipDocumentCollectionBuilder.build( documentSequence );
  }

Examples of it.unimi.dsi.logging.ProgressLogger

     * @throws Exception
     */
    @SuppressWarnings("unchecked")
    public WebgraphWebServer(String pathToWebgraphBase, int port) throws Exception {
        _port = port;
        ProgressLogger pl = new ProgressLogger();

        String fcl = pathToWebgraphBase + ".fcl";
        _node2url = (List<CharSequence>) BinIO.loadObject(fcl);

        String mph = pathToWebgraphBase + ".mph";

Examples of it.unimi.dsi.logging.ProgressLogger

        this.bufferSize = bufferSize;
        this.descriptors = new ObjectBigArrayBigList<>();
        this.compression = compression;
        this.metadataFile = metadataFile;

        final ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        progressLogger.expectedUpdates = files.length;
        progressLogger.itemsName = "files";

        progressLogger.start("Parsing files with compression \"" + compression
                + "\"");

        final DataOutputStream metadataStream = new DataOutputStream(new FileOutputStream(metadataFile));
        for (int i = 0; i < files.length; i++) {
            parseContent(i, openFileStream(files[i]), metadataStream);
            progressLogger.update();
        }
        metadataStream.close();

        metadataRandomAccess = new RandomAccessFile(metadataFile, "r");

        progressLogger.done();
    }

Examples of net.sf.picard.util.ProgressLogger

  }

  public void validateSamRecords(final Iterable<SAMRecord> samRecords,
      final SAMFileHeader header) {
    SAMRecordIterator iter = (SAMRecordIterator) samRecords.iterator();
    final ProgressLogger progress = new ProgressLogger(log, 10000000,
        "Validated Read");
    try {
      while (iter.hasNext()) {
        SAMRecord record = iter.next();

        final long recordNumber = progress.getCount() + 1;
        final Collection<SAMValidationError> errors = record.isValid();
        if (errors != null) {
          for (final SAMValidationError error : errors) {
            error.setRecordNumber(recordNumber);
            addError(error);
          }
        }

        validateMateFields(record, recordNumber);
        validateSortOrder(record, recordNumber);
        validateReadGroup(record, header);
        final boolean cigarIsValid = validateCigar(record, recordNumber);
        if (cigarIsValid) {
          validateNmTag(record, recordNumber);
        }
        validateSecondaryBaseCalls(record, recordNumber);
        validateTags(record, recordNumber);
        if (sequenceDictionaryEmptyAndNoWarningEmitted
            && !record.getReadUnmappedFlag()) {
          addError(new SAMValidationError(
              Type.MISSING_SEQUENCE_DICTIONARY,
              "Sequence dictionary is empty", null));
          sequenceDictionaryEmptyAndNoWarningEmitted = false;

        }
        progress.record(record);
      }
    } catch (SAMFormatException e) {
      // increment record number because the iterator behind the
      // SAMFileReader
      // reads one record ahead so we will get this failure one record
      // ahead
      final String msg = "SAMFormatException on record "
          + ( progress.getCount() + 1 );
      out.println(msg);
      throw new PicardException(msg, e);
    } catch (FileTruncatedException e) {
      addError(new SAMValidationError(Type.TRUNCATED_FILE,
          "File is truncated", null));