Source Code of com.google.enterprise.connector.otex.LivelinkRetriever$AgingDocumentCache

// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.google.enterprise.connector.otex;


import com.google.enterprise.connector.otex.client.Client;
import com.google.enterprise.connector.otex.client.ClientValue;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.RepositoryDocumentException;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.Retriever;
import com.google.enterprise.connector.spi.SpiConstants;
import com.google.enterprise.connector.spi.Value;
import com.google.enterprise.connector.util.Clock;
import com.google.enterprise.connector.util.SystemClock;


import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.logging.Logger;


/**
 * A {@link Retriever} that provides access to document content,
 * based upon a docid.
 */
class LivelinkRetriever implements Retriever {
  /** The logger for this class. */
  private static final Logger LOGGER =
      Logger.getLogger(LivelinkRetriever.class.getName());


  /** The connector contains configuration information. */
  private final LivelinkConnector connector;


  /** A concrete strategy for retrieving the content from the server. */
  private final ContentHandler contentHandler;


  /**
   * The traversal client provides access to the server as the
   * traversal user.
   */
  private final Client traversalClient;


  /** Clock for getting the current time of day. */
  private final Clock clock;


  /**
   * Cache of recently read Document meta-data.
   *
   * In the case where the GSA has requested content "If-Modified-Since"
   * the initial crawl, the Connector Manager will fetch the LastModify
   * date to see if it can avoid returning content.  The Connector Manager
   * could also request the content MimeType so that it might set the
   * content type of the output stream.  The getContent() method also
   * needs access to the metadata so that it can properly determine
   * whether the document actually has content to return.  In these cases,
   * it is beneficial to avoid a double fetch of metadata from the Livelink
   * repository.
   *
   * This cache holds the Document objects recently returned from getMetaData()
   * for a short period, in case that information is needed again.
   * The documents age out of the cache relatively quickly (1 minute), as
   * the information should only be needed for a single servlet request.
   * The cache is configured to hold 100 items, as the GSA may open many
   * concurrent content retrieval requests.
   */
  private final Map<Integer, AgingDocument> documentCache;


  /**
   * The fields used for constructing Properties for getMetaData().
   */
  private static final Field[] FIELDS;


  /**
   * The columns needed in the database query, which are obtained
   * from the field names in {@code FIELDS}.
   */
  private static final String[] SELECT_LIST;


  static {
    // ListNodes requires the DataID and PermID columns to be
    // included here. This implementation requires DataID,
    // ModifyDate, MimeType, SubType, and DataSize.
    ArrayList<Field> list = new ArrayList<Field>();
    list.add(new Field("DataID", "ID", SpiConstants.PROPNAME_DOCID));
    list.add(new Field("PermID"));
    list.add(new Field("ModifyDate", "ModifyDate",
                       SpiConstants.PROPNAME_LASTMODIFIED));
    list.add(new Field("MimeType", "MimeType",
                       SpiConstants.PROPNAME_MIMETYPE));
    list.add(new Field("SubType", "SubType"));
    // Workaround LAPI NumberFormatException/NullPointerException bug
    // returning negative longs.
    list.add(Field.fromExpression("GoogleDataSize as DataSize",
        "DataSize", "DataSize" ));
    FIELDS = list.toArray(new Field[0]);


    SELECT_LIST = new String[FIELDS.length];
    for (int i = 0; i < FIELDS.length; i++) {
      SELECT_LIST[i] = FIELDS[i].selectExpression;
    }
  }


  LivelinkRetriever(LivelinkConnector connector, Client traversalClient,
      ContentHandler contentHandler) throws RepositoryException {
    this.connector = connector;
    this.traversalClient = traversalClient;
    this.contentHandler = contentHandler;


    // This cache holds the Document objects recently returned by getMetaData()
    // for a short period, in case that information is needed again.
    // The documents age out of the cache relatively quickly (1 minute), as
    // the information should only be needed for a single servlet request.
    // The cache is configured hold 100 items, as the GSA may open many
    // concurrent content retrieval requests.
    clock = new SystemClock();
    documentCache =
        Collections.synchronizedMap(new AgingDocumentCache(100, 60 * 1000));
  }


  /**
   * Return a {@link Document} instance populated with meta-data for the
   * document identified by {@code docid}.  The meta-data <em>should</em>
   * minimally include the {@code google:lastmodified} Property.
   *
   * @return a Document instance with Properties containing document meta-data
   * @throws RepositoryDocumentException if there was a document-specific
   *         error accessing the metadata, for instance the document does not
   *         exist or should be skipped
   * @throws RepositoryException if there was a problem accessing the document
   *         repository
   */
  @Override
  public Document getMetaData(String docid) throws RepositoryException {
    try {
      int objid = getObjectId(docid);
      AgingDocument doc = documentCache.get(objid);
      if (doc != null) {
        return doc;
      }
      String view = "(select b.*, "
          + "case when DataSize < 0 then 0 else DataSize end as GoogleDataSize "
          + "from WebNodes b)";
      ClientValue node = traversalClient.ListNodes("DataID = " + docid,
                                                   view, SELECT_LIST);
      if (node == null || node.size() == 0) {
        throw new RepositoryDocumentException("Not found: " + docid);
      }
      doc = new AgingDocument(objid);
      // Collect the recarray-based properties.
      for (int i = 0; i < FIELDS.length; i++) {
        if (FIELDS[i].propertyNames.length > 0) {
          ClientValue value = node.toValue(0, FIELDS[i].fieldName);
          if (value.isDefined()) {
            doc.addProperty(FIELDS[i], value);
          }
        }
      }
      documentCache.put(objid, doc);
      return doc;
    } catch (RepositoryDocumentException rde) {
      throw rde;
    } catch (RepositoryException re) {
      if (isDocumentError()) {
        throw new RepositoryDocumentException(
            "Failed to access document content for docid " + docid, re);
      } else {
        throw re;
      }
    }
  }


  /**
   * Return an {@code InputStream} that may be used to access content for the
   * document identified by {@code docid}.
   *
   * @param docid the document identifier
   * @return an InputStream for the document content or {@code null} if the
   *         document has no content.
   * @throws RepositoryDocumentException if there was a document-specific
   *         error accessing the content, for instance the document does not
   *         exist or should be skipped
   * @throws RepositoryException if there was a problem accessing the document
   *         repository
   */
  // NOTE: This duplicates some, but not all, of the logic from
  // LivelinkDocumentList.collectContentProperty().  Changes here
  // should probably be reflected there, and vice-versa.
  // TODO: Extract the common logic out into a shared utility method.
  @Override
  public InputStream getContent(String docid) throws RepositoryException {
    try {
      int objid = getObjectId(docid);
      Document doc = getMetaData(docid);


      // TODO: There are subtle differences between how content
      // is handled here and how it is handled in the TraversalManager.
      // For instance, skipping content based upon MimeType or document size
      // is not done here because we don't have access to a TraversalContext.
      // Empty content is also handled differently in the Connector Manager
      // for Retriever content, so we might consider providing meaningful
      // stub content here.


      // TODO: Make this list configurable.
      switch (getSingleValueInt(doc, "SubType", -1)) {
        case -1:     // No subType?
        case 356:    // Blog
        case 357:    // Blog Entry
        case 123469: // Forum
        case 123470: // Forum Topics & Replies
        case 123475: // FAQ
        case 123476: // FAQ Entry
          return null;
        default:
          break;
      }


      // DataSize is the only non-nullable column from
      // DVersData that appears in the WebNodes view,
      // but there are cases (such as categories) where
      // there are rows in DVersData but FetchVersion
      // fails. So we're guessing here that if MimeType
      // is non-null then there should be a blob.
      if (Value.getSingleValueString(doc, "MimeType") == null) {
        return null;
      }


      // XXX: This value might be wrong. There are
      // data size callbacks which can change this
      // value. For example, the value returned by
      // GetObjectInfo may be different than the
      // value retrieved from the database.
      int dataSize = getSingleValueInt(doc, "DataSize", 0);
      if (dataSize <= 0) {
        return null;
      }


      return contentHandler.getInputStream(0, objid, 0, dataSize);
    } catch (RepositoryDocumentException rde) {
      throw rde;
    } catch (RepositoryException re) {
      if (isDocumentError()) {
        throw new RepositoryDocumentException(
            "Failed to access document content for docid " + docid, re);
      } else {
        throw re;
      }
    }
  }


  /** Extract the Livelink ObjectID from the supplied docid string. */
  private int getObjectId(String docid) throws RepositoryException {
    try {
      return Integer.parseInt(docid);
    } catch (NumberFormatException nfe) {
      LOGGER.warning("Invalid docid: " + docid);
      throw new RepositoryDocumentException("Invalid docid: " + docid, nfe);
    }
  }


  /** Convenience function for access to a single int value from a document. */
  private int getSingleValueInt(Document doc, String propertyName,
      int defaultValue) throws RepositoryException {
    String value = Value.getSingleValueString(doc, propertyName);
    if (value == null) {
      return defaultValue;
    }
    try {
      return Integer.parseInt(value);
    } catch (NumberFormatException nfe) {
      throw new RepositoryException(propertyName + " is not an integer: "
                                    + value, nfe);
    }
  }


  /**
   * Return true if an encountered error seems specific to the document,
   * or false if it appears systemic.
   */
  // NOTE: This logic was lifted from the error handler in LivelinkDocumentList.
  private boolean isDocumentError() {
    // Ping the Livelink Server to try to determine whether the
    // error is systemic or related to this specific document.
    // This is a guess, because it could be a transient error that
    // will not repeat itself on the ping.
    // TODO: Do a better job of distinguishing transient exceptions
    // from document exceptions when throwing LivelinkException
    // (including LivelinkIOException and LapiException).
    try {
      traversalClient.GetCurrentUserID();
      return true;
    } catch (RepositoryException e) {
      // The failure seems to be systemic, rather than a problem
      // with this particular document.
      return false;
    }
  }


  /**
   * A subclass of LivelinkDocument with a timestamp of its creation,
   * so that it might expire.
   */
  private class AgingDocument extends LivelinkDocument {
    // Object creation timestamp.
    public final long timestamp;


    public AgingDocument(int docid) throws RepositoryException {
      super(docid, FIELDS.length);
      timestamp = clock.getTimeMillis();
    }
  }


  /** A LivelinkDocument cache that discards any document that is expired. */
  // TODO: Remove aged documents in the background.
  private class AgingDocumentCache extends CacheMap<Integer, AgingDocument> {
    // Maximum age of document, before discarding.
    private final long timeout;


    public AgingDocumentCache(int size, long timeout) {
      super(size, size * 8);
      this.timeout = timeout;
    }


    /**
     * Returns cached Document, if not expired.  Returns {@code null} for
     * expired documents or cache misses.
     */
    public AgingDocument get(Integer key) {
      AgingDocument doc = super.get(key);
      if (doc == null) {
        return null;
      } else if (doc.timestamp + timeout > clock.getTimeMillis()) {
        return doc;
      } else {
        remove(key);
        return null;
      }
    }
  }
}
Source Code of com.google.enterprise.connector.otex.LivelinkRetriever$AgingDocumentCache

Related Classes of com.google.enterprise.connector.otex.LivelinkRetriever$AgingDocumentCache