Source Code of com.google.enterprise.connector.notes.NotesCrawlerThread

// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.google.enterprise.connector.notes;


import com.google.common.annotations.VisibleForTesting;
import com.google.enterprise.connector.notes.client.NotesDatabase;
import com.google.enterprise.connector.notes.client.NotesDocument;
import com.google.enterprise.connector.notes.client.NotesDocumentCollection;
import com.google.enterprise.connector.notes.client.NotesEmbeddedObject;
import com.google.enterprise.connector.notes.client.NotesItem;
import com.google.enterprise.connector.notes.client.NotesRichTextItem;
import com.google.enterprise.connector.notes.client.NotesSession;
import com.google.enterprise.connector.notes.client.NotesView;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.SpiConstants.ActionType;


import java.net.URLEncoder;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class NotesCrawlerThread extends Thread {
  // Fully qualified class name, passed as the source class to logp/entering.
  private static final String CLASS_NAME = NotesCrawlerThread.class.getName();
  // Logger shared by all crawler thread instances.
  private static final Logger LOGGER = Logger.getLogger(CLASS_NAME);
  // Prefix prepended to a meta field name when it is written to a crawl doc.
  static final String META_FIELDS_PREFIX = "x.";


  // Owning connector; run() polls it for the shutdown flag.
  private final NotesConnector nc;
  // Connector session: source of the Notes session, server/database names,
  // spool directory, MIME/extension rules and the document manager.
  private final NotesConnectorSession ncs;
  // Notes session created in connectQueue() and closed in disconnectQueue().
  private NotesSession ns = null;
  // Database obtained from ncs.getServer()/ncs.getDatabase(); the templates
  // view and the crawl queue view are read from it.
  private NotesDatabase cdb = null;
  // Template document currently loaded by loadTemplateDoc(), or null.
  @VisibleForTesting
  NotesDocument templateDoc = null;
  // Form configuration document selected by loadForm(), or null when no
  // form document matched.
  @VisibleForTesting
  NotesDocument formDoc = null;
  // Response documents of templateDoc (the candidate form documents).
  @VisibleForTesting
  NotesDocumentCollection formsdc = null;
  // Replica ID of the database currently held in srcdb; "" when none is open.
  private String openDbRepId = "";
  // Source database that documents are prefetched from.
  private NotesDatabase srcdb = null;
  // Crawl queue view of cdb, opened in connectQueue().
  private NotesView crawlQueue = null;


  // MetaField mappings parsed from the template each time one is loaded.
  @VisibleForTesting
  List<MetaField> metaFields;


  NotesCrawlerThread(NotesConnector Connector, NotesConnectorSession Session) {
    final String METHOD = "NotesCrawlerThread";
    LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
        "NotesCrawlerThread being created.");


    nc = Connector;
    ncs = Session;
  }


  // Since we are multi-threaded, each thread has its own objects
  // which are not shared.  Hence the calling thread must pass
  // the Domino objects to this method.
  @VisibleForTesting
  static synchronized NotesDocument getNextFromCrawlQueue(
      NotesSession ns, NotesView crawlQueue) {
    final String METHOD = "getNextFromCrawlQueue";
    try {
      crawlQueue.refresh();
      NotesDocument nextDoc = crawlQueue.getFirstDocument();
      if (nextDoc == null) {
        return null;
      }
      LOGGER.logp(Level.FINER, CLASS_NAME, METHOD, "Prefetching document");
      nextDoc.replaceItemValue(NCCONST.NCITM_STATE, NCCONST.STATEINCRAWL);
      nextDoc.save(true);


      return nextDoc;
    } catch (Exception e) {
      LOGGER.log(Level.SEVERE, CLASS_NAME, e);
    } finally {
    }
    return null;
  }


  protected void loadTemplateDoc(String TemplateName)
      throws RepositoryException {
    final String METHOD = "loadTemplate";
    LOGGER.entering(CLASS_NAME, METHOD);


    // Is a template document all ready loaded?
    if (null != templateDoc) {
      // Is this the one we need?
      if (TemplateName.equals(
              templateDoc.getItemValueString(NCCONST.TITM_TEMPLATENAME))) {
        return;
      }
      templateDoc.recycle();
      templateDoc = null;
      if (null != formsdc) {
        formsdc.recycle();
      }
      formsdc = null;
      if (null != formDoc) {
        formDoc.recycle();
      }
      formDoc = null;
    }
    NotesView vw = cdb.getView(NCCONST.VIEWTEMPLATES);
    templateDoc = vw.getDocumentByKey(TemplateName, true);
    formsdc = templateDoc.getResponses();


    // Parse any configured MetaFields once per template load.
    Vector templateMetaFields =
        templateDoc.getItemValue(NCCONST.TITM_METAFIELDS);
    metaFields = new ArrayList<MetaField>(templateMetaFields.size());
    for (Object o : templateMetaFields) {
      metaFields.add(new MetaField((String) o));
    }
    if (LOGGER.isLoggable(Level.FINEST)) {
      LOGGER.finest("template MetaFields: '" + templateMetaFields
          + "'; parsed MetaFields: " + metaFields);
    }
    vw.recycle();
  }


  protected void loadForm(String FormName) throws RepositoryException {
    final String METHOD = "loadForm";
    LOGGER.entering(CLASS_NAME, METHOD);


    if (null != formDoc) {
      if (FormName == formDoc.getItemValueString(NCCONST.FITM_LASTALIAS)) {
        return;
      }
      formDoc.recycle();
      formDoc = null;
    }
    if (null == formsdc) {
      return;
    }
    formDoc = formsdc.getFirstDocument();
    while (null != formDoc) {
      String formDocName = formDoc.getItemValueString(NCCONST.FITM_LASTALIAS);
      if (formDocName.equals(FormName)) {
        return;
      }
      NotesDocument prevDoc = formDoc;
      formDoc = formsdc.getNextDocument(prevDoc);
      prevDoc.recycle();
    }
  }


  /*
   *   Some comments on Domino.
   *
   *   Reader security is only enforced in Domino if there are
   *   Readers fields on the document and they are non-blank
   *
   *   Authors fields also provide read access to the document if
   *   document level security is enforced.  However if there are
   *   authors fields, but not any non-blank readers fields,
   *   document level security will not be enforced.
   */
  protected boolean getDocumentReaderNames(NotesDocument crawlDoc,
      NotesDocument srcDoc) throws RepositoryException {
    final String METHOD = "getDocumentReaderNames";
    LOGGER.entering(CLASS_NAME, METHOD);


    Vector<?> allItems = srcDoc.getItems();
    try {
      Vector<String> authorReaders = new Vector<String>();
      boolean hasReaders = false;
      Vector<Integer> authorItems = new Vector<Integer>();


      // Find the Readers field(s), if any. There can be more
      // than one Readers field.
      for (int i = 0; i < allItems.size(); i++) {
        NotesItem item = (NotesItem) allItems.elementAt(i);
        if (item.isReaders()) {
          boolean hasCurrentItemReaders =
              copyValues(item, authorReaders, "readers");
          hasReaders = hasCurrentItemReaders || hasReaders;
        } else if (item.isAuthors()) {
          authorItems.add(i);
        }
      }
      // If there are Readers, add any Authors to the Readers list
      // for AuthZ purposes. With no Readers, database security applies.
      if (hasReaders && authorItems.size() > 0) {
        for (Integer i : authorItems) {
          copyValues((NotesItem) allItems.elementAt(i),
              authorReaders, "authors");
        }
      }


      LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
          "Document readers for "
          + crawlDoc.getItemValueString(NCCONST.ITM_DOCID)
          + " are " + authorReaders);
      if (authorReaders.size() > 0) {
        crawlDoc.replaceItemValue(NCCONST.NCITM_DOCAUTHORREADERS,
            authorReaders);
        crawlDoc.replaceItemValue(NCCONST.NCITM_DOCREADERS, authorReaders);
      }
      return hasReaders;
    } finally {
      srcDoc.recycle(allItems);
    }
  }


  private boolean copyValues(NotesItem item, Vector<String> destination,
      String description) throws RepositoryException {
    final String METHOD = "copyValues";
    Vector values = item.getValues();
    int count = 0;
    if (null != values) {
      LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
          "Adding " + description + " " + values.toString());
      for (; count < values.size(); count++) {
        destination.add(values.elementAt(count).toString().toLowerCase());
      }
    }
    return count > 0;
  }


  // This function will set google security fields for the document
  protected void setDocumentSecurity(NotesDocument crawlDoc,
      NotesDocument srcDoc) throws RepositoryException {
    final String METHOD = "setDocumentSecurity";
    LOGGER.entering(CLASS_NAME, METHOD);


    String AuthType = crawlDoc.getItemValueString(NCCONST.NCITM_AUTHTYPE);


    if (AuthType.equals(NCCONST.AUTH_NONE)) {
      crawlDoc.replaceItemValue(NCCONST.ITM_ISPUBLIC, Boolean.TRUE.toString());
      return;
    }
    if (AuthType.equals(NCCONST.AUTH_ACL)) {
      crawlDoc.replaceItemValue(NCCONST.ITM_ISPUBLIC, Boolean.FALSE.toString());


      ;  // TODO: Handle document ACLs
      return;
    }
    if (AuthType.equals(NCCONST.AUTH_CONNECTOR)) {
      crawlDoc.replaceItemValue(NCCONST.ITM_ISPUBLIC, Boolean.FALSE.toString());
      ;
      return;
    }
  }


  protected void evaluateField(NotesDocument crawlDoc, NotesDocument srcDoc,
      String formula, String ItemName, String Default)
      throws RepositoryException {
    final String METHOD = "evaluateField";
    LOGGER.entering(CLASS_NAME, METHOD);


    Vector<?> VecEvalResult = null;
    String Result = null;
    try {
      LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
          "Evaluating formula for item " + ItemName + " : src is: " + formula);
      VecEvalResult = ns.evaluate(formula, srcDoc);
      // Make sure we dont' get an empty vector or an empty string
      if (VecEvalResult != null) {
        if (VecEvalResult.size() > 0) {
          Result = VecEvalResult.elementAt(0).toString();
          LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
              "Evaluating formula result is: " + Result);
        }
      }
      if (null == Result) {
        Result = Default;
      }
      if (Result.length() == 0) {
        Result = Default;
      }
    } catch (RepositoryException e) {
      LOGGER.log(Level.SEVERE, CLASS_NAME, e);
    } finally {
      crawlDoc.replaceItemValue(ItemName, Result);
    }
    LOGGER.exiting(CLASS_NAME, METHOD);
  }




  // TODO: Consider mapping other fields so they can be used for
  // dynamic navigation.  This could be an configurable option.


  // This function will map the fields from the source database
  // to the crawl doc using the configuration specified in
  // formDoc
  protected void mapFields(NotesDocument crawlDoc, NotesDocument srcDoc)
      throws RepositoryException {
    final String METHOD = "mapFields";
    LOGGER.entering(CLASS_NAME, METHOD);


    // Copy the standard fields
    String NotesURL = srcDoc.getNotesURL();
    String HttpURL = getHTTPURL(crawlDoc);
    crawlDoc.replaceItemValue(NCCONST.ITM_DOCID, HttpURL);
    crawlDoc.replaceItemValue(NCCONST.ITM_DISPLAYURL, HttpURL);
    crawlDoc.replaceItemValue(NCCONST.ITM_GMETAFORM,
        srcDoc.getItemValueString(NCCONST.ITMFORM));
    crawlDoc.replaceItemValue(NCCONST.ITM_LASTMODIFIED,
        srcDoc.getLastModified());
    crawlDoc.replaceItemValue(NCCONST.ITM_GMETAWRITERNAME, srcDoc.getAuthors());
    crawlDoc.replaceItemValue(NCCONST.ITM_GMETALASTUPDATE,
        srcDoc.getLastModified());
    crawlDoc.replaceItemValue(NCCONST.ITM_GMETACREATEDATE, srcDoc.getCreated());


    // We need to generate the title and description using a formula
    String formula = null;
    // When there is no form configuration use the config from the template
    if (formDoc != null) {
      formula = formDoc.getItemValueString(NCCONST.FITM_SEARCHRESULTSFORMULA);
    }
    else {
      formula = templateDoc.getItemValueString(NCCONST.TITM_SEARCHRESULTSFIELDS);
    }
    evaluateField(crawlDoc, srcDoc, formula, NCCONST.ITM_TITLE, "");


    // Again..when there is no form configuration use the config
    // from the template
    if (formDoc != null) {
      formula = formDoc.getItemValueString(NCCONST.FITM_DESCRIPTIONFORMULA);
    }
    else {
      formula = templateDoc.getItemValueString(NCCONST.TITM_DESCRIPTIONFIELDS);
    }
    evaluateField(crawlDoc, srcDoc, formula, NCCONST.ITM_GMETADESCRIPTION, "");
    LOGGER.exiting(CLASS_NAME, METHOD);


    // DO NOT MAP THIS FIELD - it will force the GSA to try and crawl this URL
    // crawlDoc.replaceItemValue(NCCONST.ITM_SEARCHURL, HttpURL);
  }


  @VisibleForTesting
  void mapMetaFields(NotesDocument crawlDoc, NotesDocument srcDoc)
      throws RepositoryException {
    final String METHOD = "mapMetaFields";
    LOGGER.entering(CLASS_NAME, METHOD);
    NotesItem item = null;
    for (MetaField mf : metaFields) {
      try {
        if (null == mf.getFieldName()) {
          if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
                "Skipping null fieldname");
          }
          continue;
        }
        String configForm = mf.getFormName();
        if (null != configForm) {
          String docForm = srcDoc.getItemValueString(NCCONST.ITMFORM);
          if (!configForm.equalsIgnoreCase(docForm)) {
            if (LOGGER.isLoggable(Level.FINEST)) {
              LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
                  "Skipping metafields because configured form {0} does not "
                  + "match doc form {1}",
                  new Object[] { configForm, docForm });
            }
            continue;
          }
        }
        if (!srcDoc.hasItem(mf.getFieldName())) {
          if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
                "Source doc does not have field: " + mf.getFieldName());
          }
          continue;
        }
        // If there are multiple items with the same name (not a
        // common Notes occurrence), only the first item will be
        // mapped.
        item = srcDoc.getFirstItem(mf.getFieldName());
        if (null == item.getValues()) {
          if (LOGGER.isLoggable(Level.FINEST)) {
            LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
                "Source doc does not have value for: " + mf.getFieldName());
          }
          continue;
        }
        Object content = item;
        if (item.getType() == NotesItem.RICHTEXT) {
          content = item.getText(2 * 1024);
        }
        if (crawlDoc.hasItem(META_FIELDS_PREFIX + mf.getMetaName())) {
          LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
              "Mapping meta fields: meta field {0} already exists in crawl doc",
              mf.getMetaName());
          // If multiple Notes fields are mapped to the same meta
          // field, only the first mapping will be used.
          continue;
        }
        crawlDoc.replaceItemValue(META_FIELDS_PREFIX + mf.getMetaName(),
            content);
        if (LOGGER.isLoggable(Level.FINEST)) {
          LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
              "Mapped meta field : " + META_FIELDS_PREFIX
              + mf.getMetaName() + " =  " + content);
        }
      } catch (RepositoryException e) {
        LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
            "Error mapping MetaField " + mf, e);
      } finally {
        if (null != item) {
          item.recycle();
        }
      }
    }
    LOGGER.exiting(CLASS_NAME, METHOD);
  }


  protected String getHTTPURL(NotesDocument crawlDoc)
      throws RepositoryException {


    String httpURL = null;
    String server = null;


    // Get the domain name associated with the server
    server = crawlDoc.getItemValueString(NCCONST.NCITM_SERVER);
    String domain = ncs.getDomain(server);


    httpURL = String.format("http://%s%s/%s/0/%s",
        crawlDoc.getItemValueString(NCCONST.NCITM_SERVER),
        domain,
        crawlDoc.getItemValueString(NCCONST.NCITM_REPLICAID),
        crawlDoc.getItemValueString(NCCONST.NCITM_UNID));
    return httpURL;
  }


  protected String getContentFields(NotesDocument srcDoc)
      throws RepositoryException {
    final String METHOD = "getContentFields";
    LOGGER.entering(CLASS_NAME, METHOD);


    // TODO:  Handle stored forms
    StringBuffer content = new StringBuffer();
    // If we have a form document then we have a specified list
    // of fields to index
    if (null != formDoc) {
      Vector<?> v = formDoc.getItemValue(NCCONST.FITM_FIELDSTOINDEX);
      for (int i = 0; i < v.size(); i++) {
        String fieldName = v.elementAt(i).toString();
        // Fields beginning with $ are reserved fields in Domino
        // Do not index the Form field ever
        if ((fieldName.charAt(0) == '$') ||
            (fieldName.equalsIgnoreCase("form"))) {
          continue;
        }
        content.append("\n");
        NotesItem tmpItem = srcDoc.getFirstItem(fieldName);
        if (null != tmpItem) {
          // Must use getText to get more than 64k of text
          content.append(tmpItem.getText(2 * 1024 * 1024));
          tmpItem.recycle();
        }
      }
      LOGGER.exiting(CLASS_NAME, METHOD);
      return content.toString();
    }


    // Otherwise we will index all allowable fields
    Vector <?> vi = srcDoc.getItems();
    for (int j = 0; j < vi.size(); j++) {
      NotesItem itm = (NotesItem) vi.elementAt(j);
      String ItemName = itm.getName();
      if ((ItemName.charAt(0) == '$') || (ItemName.equalsIgnoreCase("form"))) {
        continue;
      }
      int type = itm.getType();
      switch (type) {
        case NotesItem.TEXT:
        case NotesItem.NUMBERS:
        case NotesItem.DATETIMES:
        case NotesItem.RICHTEXT:
        case NotesItem.NAMES:
        case NotesItem.AUTHORS:
        case NotesItem.READERS:
          content.append("\n");
          NotesItem tmpItem = srcDoc.getFirstItem(ItemName);
          if (null != tmpItem) {
            // Must use getText to get more than 64k of text
            content.append(tmpItem.getText(2 * 1024 * 1024));
            tmpItem.recycle();
          }
          break;
        default:
          break;
      }
    }
    LOGGER.exiting(CLASS_NAME, METHOD);
    return content.toString();
  }


  protected boolean prefetchDoc(NotesDocument crawlDoc) {
    final String METHOD = "prefetchDoc";
    LOGGER.entering(CLASS_NAME, METHOD);


    String NotesURL = null;
    NotesDocument srcDoc = null;
    try {
      NotesURL = crawlDoc.getItemValueString(NCCONST.ITM_GMETANOTESLINK);
      LOGGER.logp(Level.FINER, CLASS_NAME, METHOD,
          "Prefetching document " + NotesURL);


      // Get the template for this document
      loadTemplateDoc(crawlDoc.getItemValueString(NCCONST.NCITM_TEMPLATE));
      if (null == templateDoc) {
        LOGGER.logp(Level.FINER, CLASS_NAME, METHOD,
            "No template found for document " +
            crawlDoc.getItemValueString(NCCONST.ITM_GMETANOTESLINK));
        return false;
      }


      // Check to see if the database we all ready have open is
      // the right one by comparing replicaids
      String crawlDocDbRepId = crawlDoc.getItemValueString(
          NCCONST.NCITM_REPLICAID);
      if (!crawlDocDbRepId.contentEquals(openDbRepId)) {
        // Different ReplicaId - Recycle and close the old database
        if (srcdb != null) {
          srcdb.recycle();
          srcdb= null;
        }
        // Open the new database
        srcdb = ns.getDatabase(null, null);
        srcdb.openByReplicaID(crawlDoc.getItemValueString(
                NCCONST.NCITM_SERVER), crawlDocDbRepId);
        openDbRepId = crawlDocDbRepId;
      }


      // Load our source document
      srcDoc = srcdb.getDocumentByUNID(crawlDoc.getItemValueString(
              NCCONST.NCITM_UNID));
      // Get the form configuration for this document
      loadForm(srcDoc.getItemValueString(NCCONST.ITMFORM));
      if (null == formDoc) {
        LOGGER.logp(Level.FINER, CLASS_NAME, METHOD,
            "No form definition found.  Using template definition " +
            "to process document " + NotesURL);
      }


      boolean hasReaders = getDocumentReaderNames(crawlDoc, srcDoc);
      if (hasReaders) {
        if (NCCONST.AUTH_ACL.equals(
            crawlDoc.getItemValueString(NCCONST.NCITM_AUTHTYPE))) {


          // Continue processing doc if GSA supports inherited
          // ACLs. Return false if not; doc won't be indexed.
          if (!((NotesTraversalManager) ncs.getTraversalManager())
              .getTraversalContext().supportsInheritedAcls()) {
            LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
                "Document " + NotesURL + " has document-level security, "
                + "but the connector is configured to use database-level "
                + "Policy ACLs. This document will not be indexed.");
            return false;
          }
        }
      }
      setDocumentSecurity(crawlDoc, srcDoc);


      mapFields(crawlDoc, srcDoc);
      mapMetaFields(crawlDoc, srcDoc);


      // Process the attachments associated with this document
      // When there are multiple attachments with the same name
      // Lotus Notes automatically generates unique names for next document
      Vector<?> va = ns.evaluate("@AttachmentNames", srcDoc);
      Vector<String> docIds = new Vector<String>();


      NotesItem attachItems = crawlDoc.replaceItemValue(
          NCCONST.ITM_GMETAATTACHMENTS, "");
      for (int i = 0; i < va.size(); i++) {
        String attachName = va.elementAt(i).toString();


        if (attachName.length() == 0) {
          continue;
        }
        String xtn = null;
        int period = attachName.lastIndexOf(".");
        if (period == -1) {
          xtn = "";
        } else {
          xtn = attachName.substring(period + 1);
        }
        if (!ncs.isExcludedExtension(xtn.toLowerCase())) {
          String docId = createAttachmentDoc(crawlDoc, srcDoc,
              attachName, ncs.getMimeType(xtn));
          if (docId != null) {
            attachItems.appendToTextList(attachName);
            docIds.add(docId);
          } else {
            LOGGER.log(Level.FINER,
                "Attachment document was not created for {0}", attachName);
          }
        } else {
          LOGGER.logp(Level.FINER, CLASS_NAME, METHOD,
              "Excluding attachment in " + NotesURL + " : " + attachName);
        }
      }
      crawlDoc.replaceItemValue(NCCONST.ITM_GMETAALLATTACHMENTS, va);
      crawlDoc.replaceItemValue(NCCONST.ITM_GMETAATTACHMENTDOCIDS, docIds);


      // Get our content after processing attachments
      // We don't want the document content in the attachment docs
      // Our content must be stored as non-summary rich text to
      // avoid the 32/64K limits in Domino
      NotesRichTextItem contentItem = crawlDoc.createRichTextItem(
          NCCONST.ITM_CONTENT);
      String content = getContentFields(srcDoc);
      contentItem.appendText(content);
      contentItem.setSummary(false);


      // Update the status of the document to be fetched.
      crawlDoc.replaceItemValue(NCCONST.ITM_ACTION, ActionType.ADD.toString());
      srcDoc.recycle();


      // Check attachments against H2 database and create delete requests for
      // attachments which no longer exist in source document.
      NotesDocId notesDocId =
          new NotesDocId(crawlDoc.getItemValueString(NCCONST.ITM_DOCID));
      enqueue(notesDocId, docIds);


      return true;
    } catch (Exception e) {
      LOGGER.log(Level.SEVERE, CLASS_NAME, e);
      return false;
    } finally {
      LOGGER.exiting(CLASS_NAME, METHOD);
    }
  }


  /**
   * Create delete requests for attachments which no longer exist in the
   * source document.
   * 
   * @param notesId google:docid of the parent document
   * @param attachIds hashes of current attachment names
   */
  void enqueue(NotesDocId notesId, Vector<String> attachIds) {
    LOGGER.log(Level.FINEST, "Send delete requests for attachments which "
        + "no longer exist in source document [UNID: {0}]", notesId);
    Set<String> curAttachIds = new HashSet<String>(attachIds);


    NotesDocumentManager docMgr = ncs.getNotesDocumentManager();
    Connection conn = null;
    try {
      conn = docMgr.getDatabaseConnection();
      Set<String> allAttachIds = docMgr.getAttachmentIds(conn,
          notesId.getDocId(), notesId.getReplicaId());
      for (String attachId : allAttachIds) {
        if (!curAttachIds.contains(attachId)) {
          LOGGER.log(Level.FINEST, "{0} attachment is in cache but not in "
              + "source document, send delete request to GSA", attachId);
          try {
            // Send deletion for each attachment
            String attachmentUrl = String.format(NCCONST.SITM_ATTACHMENTDOCID,
                notesId.toString(), attachId);
            createDeleteRequest(attachmentUrl);
          } catch (RepositoryException e) {
            LOGGER.log(Level.WARNING,
                "Failed to create delete request for attachment: " + attachId);
          }
        }
      }
    } catch (SQLException e) {
      LOGGER.log(Level.SEVERE, "Unable to connect to H2 database", e);
    } finally {
      if (conn != null) {
        docMgr.releaseDatabaseConnection(conn);
      }
    }
  }


  private void createDeleteRequest(String googleDocId)
      throws RepositoryException {
    LOGGER.log(Level.FINEST, "Send deletion request to GSA for {0}",
        googleDocId);
    NotesDocument deleteReq = cdb.createDocument();
    deleteReq.appendItemValue(NCCONST.ITMFORM, NCCONST.FORMCRAWLREQUEST);
    deleteReq.replaceItemValue(NCCONST.ITM_ACTION,
        ActionType.DELETE.toString());
    deleteReq.replaceItemValue(NCCONST.ITM_DOCID, googleDocId);
    deleteReq.replaceItemValue(NCCONST.NCITM_STATE, NCCONST.STATEFETCHED);
    deleteReq.save(true);
    deleteReq.recycle();
  }


  /**
   * Creates a document for an attachment in the GSA Configuration database.  If
   * the file size is exceeding the limit or the MIME type is not supported,
   * only metadata and the attachment file name will be indexed.
   * 
   * @param crawlDoc document being crawled in the Crawl Queue view
   * @param srcDoc source document where the attachment is located
   * @param AttachmentName string file name without encoding
   * @param MimeType string MIME type computed from file extension
   * @return attachment document ID string if the attachment document is created
   *         and its content will be indexed.
   *         null string if the attachment document is not created and its
   *         content will not be indexed.
   * @throws RepositoryException if embedded object is not accessible
   */
  public String createAttachmentDoc(NotesDocument crawlDoc,
      NotesDocument srcDoc, String AttachmentName, String MimeType)
      throws RepositoryException {
    final String METHOD = "createAttachmentDoc";
    String AttachmentURL = null;
    LOGGER.entering(CLASS_NAME, METHOD);
    NotesEmbeddedObject eo = null;
    NotesDocument attachDoc = null;


    try {
      // Error access the attachment
      eo = srcDoc.getAttachment(AttachmentName);


      if (eo == null) {
        LOGGER.log(Level.FINER, "Attachment could not be accessed {0}",
            AttachmentName);
        return null;
      }


      if (eo.getType() != NotesEmbeddedObject.EMBED_ATTACHMENT) {
        // The object is not an attachment - could be an OLE object or link
        LOGGER.logp(Level.FINER, CLASS_NAME, METHOD,
            "Ignoring embedded object " + AttachmentName);
        eo.recycle();
        return null;
      }


      // Don't send attachments larger than the limit
      if (eo.getFileSize() > ncs.getMaxFileSize()) {
        LOGGER.logp(Level.FINER, CLASS_NAME, METHOD,
            "Attachment larger than the configured limit and content " +
            "will not be sent. " + AttachmentName);
      }


      attachDoc = cdb.createDocument();
      crawlDoc.copyAllItems(attachDoc, true);


      // Store the filename of this attachment in the attachment crawl doc.
      attachDoc.replaceItemValue(NCCONST.ITM_GMETAATTACHMENTFILENAME,
          AttachmentName);
      attachDoc.save();


      // Compute display URL
      String encodedAttachmentName = null;
      try {
        encodedAttachmentName = URLEncoder.encode(AttachmentName, "UTF-8");
      } catch (Exception e) {
        attachDoc.recycle();
        eo.recycle();
        return null;
      }
      AttachmentURL = String.format(NCCONST.SITM_ATTACHMENTDISPLAYURL,
          getHTTPURL(crawlDoc), encodedAttachmentName);
      attachDoc.replaceItemValue(NCCONST.ITM_DISPLAYURL, AttachmentURL);
      LOGGER.log(Level.FINEST, "Attachment display url: {0}", AttachmentURL);


      // Compute docid
      String attachNameHash = Util.hash(AttachmentName);
      if (attachNameHash == null) {
        return null;
      }
      String docURL = String.format(NCCONST.SITM_ATTACHMENTDOCID,
          getHTTPURL(crawlDoc), attachNameHash);
      attachDoc.replaceItemValue(NCCONST.ITM_DOCID, docURL);
      LOGGER.log(Level.FINEST, "Attachment document docid: {0}", docURL);
      
      // Only if we have a supported mime type and file size is not exceeding
      // the limit do we send the content, or only metadata and file name will
      // be sent.
      if ((0 != MimeType.length()) &&
          eo.getFileSize() <= ncs.getMaxFileSize()) {
        attachDoc.replaceItemValue(NCCONST.ITM_MIMETYPE, MimeType);
        String attachmentPath = getAttachmentFilePath(crawlDoc, attachNameHash);
        eo.extractFile(attachmentPath);
        attachDoc.replaceItemValue(NCCONST.ITM_CONTENTPATH, attachmentPath);
      } else {
        // Not a supported attachment so sending meta data only
        // with the filename as content
        attachDoc.replaceItemValue(NCCONST.ITM_CONTENT, AttachmentName);
        attachDoc.replaceItemValue(NCCONST.ITM_MIMETYPE,
            NCCONST.DEFAULT_MIMETYPE);
      }
      eo.recycle();


      // Set the state of this document to be fetched
      attachDoc.replaceItemValue(NCCONST.ITM_ACTION, ActionType.ADD.toString());
      attachDoc.replaceItemValue(NCCONST.NCITM_STATE, NCCONST.STATEFETCHED);
      attachDoc.save();
      attachDoc.recycle();
      LOGGER.exiting(CLASS_NAME, METHOD);
      return attachNameHash;
    } catch (Exception e) {
      LOGGER.logp(Level.SEVERE, CLASS_NAME, METHOD,
          "Error pre-fetching attachment: " + AttachmentName +
          " in document: " + srcDoc.getNotesURL(), e);
      if (null != eo) {
        eo.recycle();
      }
      if (null != attachDoc) {
        attachDoc.replaceItemValue(NCCONST.NCITM_STATE, NCCONST.STATEERROR);
        attachDoc.save();
        attachDoc.recycle();
      }
      return null;
    }
  }


  // This function will generate an unique file path for an attachment object.
  // Consider the situation where a document is updated twice and
  // appears in the submitq twice In this case, the first submit
  // will delete the doc.  The second submit will then send an
  // empty doc So we must use the UNID of the crawl request to
  // generate the unique filename
  public String getAttachmentFilePath(NotesDocument crawlDoc,
      String attachName) throws RepositoryException {
    String dirName = String.format("%s/attachments/%s/%s",
        ncs.getSpoolDir(),
        cdb.getReplicaID(),
        crawlDoc.getUniversalID());
    new java.io.File(dirName).mkdirs();
    String FilePath = String.format("%s/%s", dirName, attachName);
    //TODO:  Ensure that FilePath is a valid Windows filepath
    return FilePath;
  }


  public void connectQueue() throws RepositoryException {
    if (null == ns) {
      ns = ncs.createNotesSession();
    }
    if (null == cdb) {
      cdb = ns.getDatabase(ncs.getServer(), ncs.getDatabase());
    }
    if (crawlQueue == null) {
      crawlQueue = cdb.getView(NCCONST.VIEWCRAWLQ);
    }
  }




  /*
   * We accumulate objects as pre-fetch documents
   * De-allocate these in reverse order
   */
  public void disconnectQueue()  {
    final String METHOD = "disconnectQueue";
    LOGGER.entering(CLASS_NAME, METHOD);
    try {
      if (null != templateDoc) {
        templateDoc.recycle();
      }
      templateDoc = null;


      if (null != formDoc) {
        formDoc.recycle();
      }
      formDoc = null;


      if (null != formsdc) {
        formsdc.recycle();
      }
      formsdc = null;


      if (null != srcdb) {
        openDbRepId = "";
        srcdb.recycle();
        srcdb = null;
      }


      if (null != crawlQueue) {
        crawlQueue.recycle();
      }
      crawlQueue = null;


      if (null != cdb) {
        cdb.recycle();
      }
      cdb = null;


      if (null != ns) {
        ncs.closeNotesSession(ns);
      }
      ns = null;
    } catch (RepositoryException e) {
      LOGGER.log(Level.WARNING, CLASS_NAME, e);
    } finally {
      LOGGER.exiting(CLASS_NAME, METHOD);
    }
  }


  @Override
  public void run() {
    final String METHOD = "run";
    int exceptionCount = 0;
    LOGGER.entering(CLASS_NAME, METHOD);
    NotesPollerNotifier npn = ncs.getNotifier();
    while (nc.getShutdown() == false) {
      try {
        NotesDocument crawlDoc = null;
        // Only get from the queue if there is more than 300MB in the
        // spool directory
        java.io.File spoolDir = new java.io.File(ncs.getSpoolDir());
        LOGGER.logp(Level.FINE, CLASS_NAME, METHOD,
            "Spool free space is " + spoolDir.getFreeSpace());
        if (spoolDir.getFreeSpace()/1000000 < 300) {
          LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
              "Insufficient space in spool directory to process " +
              "new documents.  Need at least 300MB.");
          npn.waitForWork();
          LOGGER.logp(Level.FINE, CLASS_NAME, METHOD,
              "Crawler thread resuming after spool directory had " +
              "insufficient space.");
          continue;
        }
        LOGGER.logp(Level.FINEST, CLASS_NAME, METHOD,
            "Connecting to crawl queue.");
        connectQueue();
        crawlDoc = getNextFromCrawlQueue(ns, crawlQueue);
        if (crawlDoc == null) {
          LOGGER.logp(Level.FINE, CLASS_NAME, METHOD, this.getName() +
              ": Crawl queue is empty.  Crawler thread sleeping.");
          // If we have finished processing the queue shutdown our connections
          disconnectQueue();
          npn.waitForWork();
          LOGGER.logp(Level.FINE, CLASS_NAME, METHOD, this.getName() +
              "Crawler thread resuming after crawl queue was empty.");
          continue;
        }
        if (prefetchDoc(crawlDoc)) {
          crawlDoc.replaceItemValue(NCCONST.NCITM_STATE, NCCONST.STATEFETCHED);
        } else  {
          crawlDoc.replaceItemValue(NCCONST.NCITM_STATE, NCCONST.STATEERROR);
        }
        crawlDoc.save(true);
        crawlDoc.recycle();
      } catch (Exception e) {
        LOGGER.log(Level.SEVERE, CLASS_NAME, e);
        // Lets say the server we are connected to goes down
        // while we are crawling We don't want to fill up the
        // logs with errors so go to sleep after 5 exceptions
        exceptionCount++;
        
        // If we run into an exception we should close our session.
        disconnectQueue();
        
        if (exceptionCount > 5) {
          LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
              "Too many exceptions.  Crawler thread sleeping.");
          npn.waitForWork();
          LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
              "Crawler thread resuming after too many exceptions " +
              "were encountered.");
        }
      }
    }
    disconnectQueue();
    LOGGER.logp(Level.FINE, CLASS_NAME, METHOD,
        "Connector shutdown - NotesCrawlerThread exiting.");
    LOGGER.exiting(CLASS_NAME, METHOD);
  }


  @VisibleForTesting
  static class MetaField {
    private static final Pattern formFieldMetaPattern =
        Pattern.compile("\\A(.+)===([^=]+)=([^=]+)\\z");
    private static final Pattern fieldMetaPattern =
        Pattern.compile("\\A([^=]+)=([^=]+)\\z");
    private static final Pattern fieldPattern =
        Pattern.compile("\\A([^=]+)\\z");


    private String formName;
    private String fieldName;
    private String metaName;


    MetaField(String configString) {
      String METHOD = "MetaField";
      if (configString == null) {
        return;
      }
      configString = configString.trim();
      if (configString.length() == 0) {
        return;
      }


      Matcher matcher = formFieldMetaPattern.matcher(configString);
      if (matcher.matches()) {
        formName = matcher.group(1);
        fieldName = matcher.group(2);
        metaName = matcher.group(3);
        return;
      }
      matcher = fieldMetaPattern.matcher(configString);
      if (matcher.matches()) {
        fieldName = matcher.group(1);
        metaName = matcher.group(2);
        return;
      }
      matcher = fieldPattern.matcher(configString);
      if (matcher.matches()) {
        fieldName = matcher.group(1);
        metaName = fieldName;
        return;
      }
      LOGGER.logp(Level.WARNING, CLASS_NAME, METHOD,
          "Unable to parse custom meta field definition; skipping: "
          + configString);
    }


    String getFormName() {
      return formName;
    }


    String getFieldName() {
      return fieldName;
    }


    String getMetaName() {
      return metaName;
    }


    @Override
    public String toString() {
      return "[form: " + formName + "; field: " + fieldName
          + "; meta: " + metaName + "]";
    }
  }
}
Source Code of com.google.enterprise.connector.notes.NotesCrawlerThread

Related Classes of com.google.enterprise.connector.notes.NotesCrawlerThread