Examples of DocumentCollection


Examples of it.unimi.di.big.mg4j.document.DocumentCollection

            list.add(s);
        String[] files = list.toArray(new String[list.size()]);
        Arrays.sort(files);


        final DocumentCollection collection;
        String docType = conf.format;
        File metadataFile = new File(output.getAbsolutePath() + ".metadata");

        switch (docType) {
            case "trec":
                Properties properties = new Properties();
                properties.setProperty(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
                final TRECDocumentFactory documentFactory = new TRECDocumentFactory(properties);

                collection = new TRECDocumentCollection(files,
                            documentFactory, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;

            case "warc/0.18":
                collection = new WARCDocumentCollection(files, SegmentedDocumentCollection.DEFAULT_BUFFER_SIZE, compression, metadataFile);
                break;
            default:
                LOGGER.error(String.format("Unknown document type [%s]", docType));
                System.exit(-1);
                throw new AssertionError();
        }

        // Store the collection
        BinIO.storeObject(collection, out);

        LOGGER.info("Found {} documents in the collection", collection.size());
        return 0;
    }
View Full Code Here

Examples of it.unimi.di.big.mg4j.document.DocumentCollection

    }

  @Override
  public int execute() throws Throwable {
        final DocumentCollection collection = collectionCf.init();
        index.init();

        // Read model & topics
        logger.info("Reading model");
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setNamespaceAware(true);
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        org.w3c.dom.Document xml = taskFile == null  ? dBuilder.parse(System.in) : dBuilder.parse(new FileInputStream(taskFile));

        JAXBContext context = JAXBContext.newInstance(BM25.class);
        Unmarshaller um = context.createUnmarshaller();

        RetrievalModel model = null;
        QuerySet querySet = null;

        for(Element child: XMLUtils.elements(xml.getDocumentElement().getChildNodes())) {
            if (XMLUtils.is(child, ADHOC_MODEL)) {
                for(Element grandchild: XMLUtils.elements(child.getChildNodes())) {
                    model = (RetrievalModel) um.unmarshal(grandchild);
                    break;
                }
            }
            if (XMLUtils.is(child, TOPICS)) {
                final String type = child.getAttribute("type");
                switch(type) {
                    case "trec":
                        try(BufferedReader reader = new BufferedReader(new FileReader(child.getAttribute("path")))) {
                            querySet = TRECTopic.readTopics(reader, false);
                        }
                        break;
                    default:
                        throw new RuntimeException(String.format("Cannot handle topics of type %s", type));
                }
                // Do something
            }
        }

        if (model == null)
            throw new IllegalArgumentException("No model was present in the XML description file");
        if (querySet == null)
            throw new IllegalArgumentException("No topics were present in the XML description file");

        logger.info(String.format("Starting with model [%s] and %d topics", model, querySet.queries().size()));

        // Discarded documents
    TRECJudgments discarded = discardedQRELFile == null ? null
        : new TRECJudgments(discardedQRELFile);



        // Queries
    Set<String> topicIds = GenericHelper.newHashSet();
    Map<String, ? extends Topic> topics = querySet.queries();
    for (String id : topics.keySet()) {
      logger.debug(new LazyString("Considering topic %s (%b/%b/%b)", id, topics.keySet()
          .contains(id), onlyTopics.isEmpty(), onlyTopics
          .contains(id)));
      if (topics.keySet().contains(id)
          && (onlyTopics.isEmpty() || onlyTopics.contains(id))) {
        topicIds.add(id);
      }
    }

    if (topicIds.isEmpty()) {
      logger.error("No topics to be answered");
      return 1;
    }

    // Iterates on topics
    timer.start();
    TaskTimer.Task task = timer.new Task("Answering topics", "topics",
        topicIds.size());
    PrintStream output = System.out;

    model.init(collection, index);
    int totalRetrieved = 0;
    for (String topicId : topicIds) {
      logger.info(String.format("Answering topic %s", topicId));

      Topic topic = topics.get(topicId);
      ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>> results = new ObjectArrayList<>();

      Set<String> discardedDocuments = null;
      if (discarded != null) {
        Map<String, Integer> map = discarded.get(topicId);
        if (map != null)
          discardedDocuments = map.keySet();

      }

      // Ask for results (request some extra documents in case some
      // are discarded afterwards)
      model.process(topic, results,
          capacity
              + (discardedDocuments == null ? 0
                  : discardedDocuments.size()), timer);

      final int retrieved = results.size();
      totalRetrieved += retrieved;
      logger.info(String.format("Returned %d results", retrieved));
      int added = 0;
      for (int i = 0; i < retrieved && added < capacity; i++) {
        DocumentScoreInfo dsi = results.get(i);
        Document document = collection.document(dsi.document);
                System.err.println("URI: " + document.uri());
                System.err.println("URI["+dsi.document+"]: " + collection.metadata(dsi.document).get(PropertyBasedDocumentFactory.MetadataKeys.URI));
        final String docno = (String) collection.metadata(dsi.document).get(PropertyBasedDocumentFactory.MetadataKeys.URI);

        // If it was not a discarded document
        if (discardedDocuments == null
            || !discardedDocuments.contains(docno)) {
          output.format("%s Q0 %s %d %g %s%n", topicId, docno, i,
View Full Code Here

Examples of it.unimi.dsi.mg4j.document.DocumentCollection

      });

    final JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;

    final DocumentCollection documentCollection = (DocumentCollection)(jsapResult.userSpecified( "collection" ) ? AbstractDocumentSequence.load( jsapResult.getString( "collection" ) ) :
      jsapResult.userSpecified( "objectCollection" ) ? jsapResult.getObject( "objectCollection" ): null );
    final List<? extends CharSequence> titleList = (List<? extends CharSequence>) (
        jsapResult.userSpecified( "titleList" ) ? BinIO.loadObject( jsapResult.getString( "titleList" ) ) :
          jsapResult.userSpecified( "titleFile" ) ? new FileLinesList( jsapResult.getString( "titleFile" ), "UTF-8" ) :
            null );    final String[] basenameWeight = jsapResult.getStringArray( "basenameWeight" );
View Full Code Here

Examples of it.unimi.dsi.mg4j.document.DocumentCollection

        ObjectArrayList<ResultItem> resultItems = new ObjectArrayList<ResultItem>();

        if ( ! results.isEmpty() ) {
          SelectedInterval[] selectedInterval = null;

          final DocumentCollection collection = documentCollection != null ? documentCollection.copy() : null;

          for( int i = 0; i < results.size(); i++ ) {
            DocumentScoreInfo<Reference2ObjectMap<Index,SelectedInterval[]>> dsi = results.get( i );
            LOGGER.debug( "Intervals for item " + i );
            final ResultItem resultItem = new ResultItem( dsi.document, dsi.score );
            resultItems.add( resultItem );

            if ( collection != null ) {
              final Document document = collection.document( dsi.document );
              // If both collection and title list are present, we override the collection title (cfr. Query)
              resultItem.title = StringEscapeUtils.escapeHtml( titleList != null ? titleList.get( resultItem.doc ).toString() : document.title().toString() );
              if ( useUri ) {
                if ( document.uri() != null ) resultItem.uri = StringEscapeUtils.escapeHtml( document.uri().toString() );
              }
              else {
                if ( document.uri() != null ) {
                  String stringUri = document.uri().toString();
                  // TODO: this is a quick patch to get the file server running with relative files
                  final String documentUri = URLEncoder.encode( derelativise
                  ? new File( stringUri.startsWith( "file:" ) ? stringUri.substring( 5 ) : stringUri ).getAbsoluteFile().toURI().toASCIIString()
                      : document.uri().toString(), "UTF-8" );
                  resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType + "&uri=" + documentUri );
                }
                else resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType );
              }
             
              MarkingMutableString snippet = new MarkingMutableString( TextMarker.HTML_STRONG, MarkingMutableString.HTML_ESCAPE );
             
              for( int j = 0; j < sortedIndex.length; j++ ) {
                if ( ! sortedIndex[ j ].hasPositions || dsi.info == null ) continue;
                selectedInterval = dsi.info.get( sortedIndex[ j ] );
                if ( selectedInterval != null ) {
                  final int field = documentCollection.factory().fieldIndex( sortedIndex[ j ].field );
                  // If the field is not present (e.g., because of parallel indexing) or it is not text we skip
                  if ( field == -1 || documentCollection.factory().fieldType( field ) != DocumentFactory.FieldType.TEXT ) continue;
                  LOGGER.debug( "Found intervals for " + sortedIndex[ j ].field + " (" + field + ")" );
                  final Reader content = (Reader)document.content( field );
                  snippet.startField( selectedInterval ).appendAndMark( document.wordReader( field ).setReader( content ) ).endField();
                }
                if ( LOGGER.isDebugEnabled() ) LOGGER.debug( sortedIndex[ j ].field + ": " + ( selectedInterval == null ? null : Arrays.asList( selectedInterval ) ) );
                document.close();
              }
             
              resultItem.text = snippet;
            }
            else {
              if ( titleList != null ) {
                // TODO: this is a bit radical
                resultItem.title = resultItem.uri = titleList.get( resultItem.doc );
              }
              else {
                resultItem.title = "Document #" +  resultItem.doc;
                resultItem.uri = new MutableString( "./Item?doc=" ).append( resultItem.doc ).append( "&m=" ).append( urlEncodedMimeType );
              }
             
              MutableString text = new MutableString();
              for( Iterator<Index> j = indexMap.values().iterator(); j.hasNext(); ) {
                final Index index = j.next();
                selectedInterval = dsi.info.get( index );
                if ( selectedInterval != null )
                  text.append( "<p>" ).append( index.field ).append( ": " ).append( Arrays.asList( selectedInterval ) );
                LOGGER.debug( index.field + ": " + ( selectedInterval == null ? null : Arrays.asList( selectedInterval ) ) );
              }
              resultItem.text = text;
            }
          }
         
          if ( collection != null ) collection.close();
        }

       
        // Note that if we pass an array to the template we lose the possibility of measuring its length.
        context.put( "result", resultItems );
View Full Code Here

Examples of it.unimi.dsi.mg4j.document.DocumentCollection

    return HttpQueryServer.setLiberalResourceLoading( super.loadConfiguration( config ) );
  }

  public Template handleRequest( final HttpServletRequest request, final HttpServletResponse response, final Context context ) throws Exception {
    if ( request.getParameter( "doc" ) != null ) {
      DocumentCollection collection = (DocumentCollection)getServletContext().getAttribute( "collection" );
      response.setContentType( request.getParameter( "m" ) );
      response.setCharacterEncoding( "UTF-8" );
      final Document document = collection.document( Integer.parseInt( request.getParameter( "doc" ) ) );
      final DocumentFactory factory = collection.factory();
      final ObjectArrayList<String> fields = new ObjectArrayList<String>();
      final int numberOfFields = factory.numberOfFields();
     
      LOGGER.debug( "ParsingFactory declares " + numberOfFields + " fields"  );
     
View Full Code Here

Examples of it.unimi.dsi.mg4j.document.DocumentCollection

  private static final int skip = Integer.getInteger( InputStreamItem.class.getName() + ".skip", 0 ).intValue();
 
  protected void doGet( final HttpServletRequest request, final HttpServletResponse response ) throws IOException {
    try {
      if ( request.getParameter( "m" ) != null && request.getParameter( "doc" ) != null ) {
        DocumentCollection collection = (DocumentCollection)getServletContext().getAttribute( "collection" );
        if ( collection == null ) LOGGER.fatal( "The servlet context does not contain a document collection." );
        response.setContentType( request.getParameter( "m" ) );
        response.setCharacterEncoding( "UTF-8" );
        InputStream rawContent = collection.stream( Integer.parseInt( request.getParameter( "doc" ) ) );
        for( int i = skip; i-- != 0; ) rawContent.reset();
        IOUtils.copy( rawContent, response.getOutputStream() );
      }
    } catch( RuntimeException e ) {
      e.printStackTrace();
View Full Code Here

Examples of lotus.domino.DocumentCollection

  }

  public void exportDocuments(Database database) throws IOException, NotesException {
    target.startExport();
    try {
      DocumentCollection docs = database.getAllDocuments();
      try {
        for(Document doc=docs.getFirstDocument(); doc!=null; doc=docs.getNextDocument(doc)) {
          exportDocument(doc);
        }
      } finally {
        docs.recycle();
      }
    } finally {
      target.endExport();
    }
  }
View Full Code Here

Examples of lotus.domino.DocumentCollection

     
      // Getting the specified Notes database.
      Database database = session.getDatabase( "", stringDatabase );
     
      // Getting a collection of all documents from the database.
      DocumentCollection documentcollection = database.getAllDocuments();
     
      // Getting the first document from the database
      Document document = documentcollection.getFirstDocument();
     
      // Start to write to cells at this row.
      int intRowToStart = 0;
     
      // The current row.
      int intRow = intRowToStart;
     
      // The current column.
      int intColumn = 0;
     
      // Process all documents
      while ( document != null ) {
        // Getting the name of the stock.
        String stringName = document.getItemValueString( "Name" );
       
        // Inserting the name to a specified cell.
        insertIntoCell( intColumn, intRow, stringName, xspreadsheet, "" );
       
        // Getting the number of stocks.
        double intNumber = document.getItemValueInteger( "Number" );
       
        // Inserting the number of stocks to a specified cell.
        insertIntoCell( intColumn + 1, intRow, String.valueOf( intNumber ),
        xspreadsheet, "V" );
       
        // Getting current share price.
        double doubleSharePrice = document.getItemValueDouble( "SharePrice" );
       
        // Inserting the current share price to a specified cell.
        insertIntoCell( intColumn + 2, intRow, String.valueOf( doubleSharePrice ),
        xspreadsheet, "V" );
       
        // Inserting the total value.
        insertIntoCell( intColumn + 3, intRow, "=B"
        + String.valueOf( intRow + 1 ) + "*C" + String.valueOf( intRow + 1 ),
        xspreadsheet, "" );
       
        // Increasing the current row.
        intRow++;
       
        // Getting the next document from the collection.
        document = documentcollection.getNextDocument();
      }
     
      // Summing all specific amounts.
      insertIntoCell( intColumn + 3, intRow, "=sum(D"
      + String.valueOf( intRowToStart + 1 ) + ":D"
View Full Code Here

Examples of lotus.domino.DocumentCollection

     
            // Getting the specified Notes database.
            Database database = session.getDatabase( "", stringDatabase );
           
            // Getting a collection of all documents from the database.
            DocumentCollection documentCollection = database.getAllDocuments();
     
            // Getting the first document from the database
            Document document = documentCollection.getFirstDocument();
     
            // Start to write to cells at this row.
            int intRowToStart = 0;
     
            // The current row.
            int intRow = intRowToStart;
     
            // The current column.
            int intColumn = 0;
     
            // Process all documents
            while ( document != null ) {
                // Getting the name of the stock.
                String stringName = document.getItemValueString("Name");
               
                // Inserting the name to a specified cell.
                insertIntoCell(intColumn, intRow, stringName, xSpreadsheet, "");
               
                // Getting the number of stocks.
                double intNumber = document.getItemValueInteger( "Number" );
       
                // Inserting the number of stocks to a specified cell.
                insertIntoCell( intColumn + 1, intRow, String.valueOf(intNumber),
                                xSpreadsheet, "V" );
       
                // Getting current share price.
                double doubleSharePrice = document.getItemValueDouble("SharePrice");
       
                // Inserting the current share price to a specified cell.
                insertIntoCell(intColumn + 2, intRow,
                               String.valueOf(doubleSharePrice),
                               xSpreadsheet, "V");
       
                // Inserting the total value.
                insertIntoCell(intColumn + 3, intRow, "=B"
                               + String.valueOf( intRow + 1 )
                               + "*C" + String.valueOf(intRow + 1),
                               xSpreadsheet, "");
       
                // Increasing the current row.
                intRow++;
               
                // Getting the next document from the collection.
                document = documentCollection.getNextDocument();
            }
     
            // Summing all specific amounts.
            insertIntoCell(intColumn + 3, intRow, "=sum(D"
                           + String.valueOf( intRowToStart + 1 ) + ":D"
View Full Code Here

Examples of lotus.domino.DocumentCollection

     
            // Getting the specified Notes database.
            Database database = session.getDatabase( "", stringDatabase );
           
            // Getting a collection of all documents from the database.
            DocumentCollection documentCollection = database.getAllDocuments();
     
            // Getting the first document from the database
            Document document = documentCollection.getFirstDocument();
     
            // Start to write to cells at this row.
            int intRowToStart = 0;
     
            // The current row.
            int intRow = intRowToStart;
     
            // The current column.
            int intColumn = 0;
     
            // Process all documents
            while ( document != null ) {
                // Getting the name of the stock.
                String stringName = document.getItemValueString("Name");
               
                // Inserting the name to a specified cell.
                insertIntoCell(intColumn, intRow, stringName, xSpreadsheet, "");
               
                // Getting the number of stocks.
                double intNumber = document.getItemValueInteger( "Number" );
       
                // Inserting the number of stocks to a specified cell.
                insertIntoCell( intColumn + 1, intRow, String.valueOf(intNumber),
                                xSpreadsheet, "V" );
       
                // Getting current share price.
                double doubleSharePrice = document.getItemValueDouble("SharePrice");
       
                // Inserting the current share price to a specified cell.
                insertIntoCell(intColumn + 2, intRow,
                               String.valueOf(doubleSharePrice),
                               xSpreadsheet, "V");
       
                // Inserting the total value.
                insertIntoCell(intColumn + 3, intRow, "=B"
                               + String.valueOf( intRow + 1 )
                               + "*C" + String.valueOf(intRow + 1),
                               xSpreadsheet, "");
       
                // Increasing the current row.
                intRow++;
               
                // Getting the next document from the collection.
                document = documentCollection.getNextDocument();
            }
     
            // Summing all specific amounts.
            insertIntoCell(intColumn + 3, intRow, "=sum(D"
                           + String.valueOf( intRowToStart + 1 ) + ":D"
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.