// Source Code of com.gnizr.core.search.SearchIndexManager$Request

/*
 * gnizr is a trademark of Image Matters LLC in the United States.
 * 
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either expressed or implied. See the License
 * for the specific language governing rights and limitations under the License.
 * 
 * The Initial Contributor of the Original Code is Image Matters LLC.
 * Portions created by the Initial Contributor are Copyright (C) 2007
 * Image Matters LLC. All Rights Reserved.
 */
package com.gnizr.core.search;


import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;


import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.LockObtainFailedException;


/**
 * Creates and manages the search index of bookmarks. Search index is created
 * using Lucene API. The implementation for changing search index (i.e., add,
 * delete or update a bookmark index) is executed in the separate internal
 * thread.
 * 
 * @author Harry Chen
 * @since 2.4.0
 * 
 */
public class SearchIndexManager implements Serializable {


  /**
   * Serialization version identifier of this class.
   */
  private static final long serialVersionUID = 8598646761200259815L;


  // log4j logger used by this class and by its inner worker.
  private static final Logger logger = Logger
      .getLogger(SearchIndexManager.class);


  // Configuration supplied through setProfile(); init() reads the search
  // index directory from it and refuses to start when it is null.
  private SearchIndexProfile profile;


  // Requests waiting to be picked up by the worker. Created in init(), so it
  // is null until init() has completed successfully.
  private LinkedBlockingQueue<Request> documentQueue;


  // Daemon thread running the worker; created and started in init().
  // NOTE(review): java.lang.Thread is not Serializable although this class
  // declares Serializable -- confirm whether instances are ever serialized
  // after init() has run.
  private Thread workerThread;


  // Runnable executed by workerThread; applies queued requests to the index.
  private UpdateIndexWorker worker;


  // Sentinel document: destroy() queues it, and the worker stops its loop
  // when it encounters it.
  private static final Document POISON_DOC = new Document();


  // Set by the constructor; when true, init() recreates an empty index.
  private boolean forceIndexReset;


  // Location of the index: <profile search index directory>/INDEX_DIR.
  // Assigned in init().
  private File indexDirectory;


  // Name of the sub-directory, below the configured search index directory,
  // that holds the bookmark index.
  private static final String INDEX_DIR = "bmarks-idx";


  /**
   * Creates an instance of this class. If a search index database exists, use
   * it as it.
   */
  public SearchIndexManager() {
    this.forceIndexReset = false;
  }


  /**
   * Creates an instance of this class and optionally defines whether or not
   * to force an existing search index database should be cleared.
   * 
   * @param resetIndex
   *            <code>true</code> if an existing search index database
   *            should be cleared upon initialization.
   */
  public SearchIndexManager(boolean resetIndex) {
    this.forceIndexReset = resetIndex;
  }


  /**
   * Initializes this class instance and outputs errors in the log file if the
   * <code>profile</code> is <code>null</code>. Forces the index database
   * to be cleared if the <code>resetIndex</code> is set to
   * <code>true</code> in the constructor. Starts the internal thread for
   * managing the search index database.
   */
  public void init() {
    if (profile == null) {
      logger
          .error("Can't initialize search index database -- profile undefined.");
      // quit. as soon as
      return;
    }


    if (profile.getSearchIndexDirectory() == null) {
      logger.error("Undefined search index database directory: null");
      throw new NullPointerException(
          "Search index directory is undefined in the configuration file.");
    } else {
      File dir = new File(profile.getSearchIndexDirectory());
      if (dir.exists() && dir.isDirectory() == false) {
        logger.error("Defined path is not a directory. Path: "
            + dir.toString());
        throw new RuntimeException(
            "Defined search index directory is not a system directory.");
      }
      indexDirectory = new File(profile.getSearchIndexDirectory(),
          INDEX_DIR);
    }


    if (forceIndexReset == true) {
      logger.info("Overwriting the existing index store, if it exists.");
      createEmptyIndexDirectory();
    }


    documentQueue = new LinkedBlockingQueue<Request>();
    worker = new UpdateIndexWorker();
    workerThread = new Thread(worker);
    workerThread.setDaemon(true);
    workerThread.start();
  }


  private void createEmptyIndexDirectory() {
    IndexWriter writer = null;
    try {
      writer = new IndexWriter(indexDirectory, DocumentCreator
          .createDocumentAnalyzer(), true);
    } catch (Exception e) {
      logger.error("Unable to reset the search index. Path "
          + indexDirectory.toString(), e);
    } finally {
      if (writer != null) {
        try {
          writer.optimize();
          writer.close();
        } catch (Exception e) {
          logger.error(e);
        }
      }
    }
  }


  /**
   * Checks whether the internal index thread is performing any active
   * indexing. This is an estimated result.
   * 
   * @return <code>true</code> if the the internal thread is performing
   *         indexing. Returns <code>false</code> otherwise.
   */
  public boolean isIndexProcessActive() {
    if(getIndexProcessWorkLoad() > 0){
      return true;
    }
    return false;
  }


  /**
   * Gets an estimated count of the workload of the internal index thread. 
   * This number is the sum of <code>getIndexProcessPending</code> and 
   * <code>getIndexProcessWorking</code>. Use
   * with caution. This number is only an estimate.
   * 
   * @return workload count
   */
  public int getIndexProcessWorkLoad() {
    int numWorking = worker.getWorkQueueSize();
    int numPending = documentQueue.size();
    return numPending + numWorking;
  }
  
  /**
   * Gets an estimated count of the amount of work that is 
   * pending to be processed. Use with caution. This number 
   * is only an estimate. 
   * @return pending work count.
   */
  public int getIndexProcessPending(){
    return documentQueue.size();
  }


  /**
   * Gets an estimated count of the amount of work 
   * that is currently being processed (i.e., not pending). 
   * Use with caution. This number is only an estimate.
   * @return amount of work being processed.
   */
  public int getIndexProcessWorking(){
    return worker.getWorkQueueSize();
  }
                                      


  /**
   * Checks whether the thread that is responsible for adding, deleting and
   * updating bookmark index is still alive.
   * 
   * @return <code>true</code> if the worker thread that performs index
   *         modification is still alive. Returns <code>false</code>,
   *         otherwise.
   */
  public boolean isActive() {
    return workerThread.isAlive();
  }


  /**
   * Cleans up the internal resources used by this class instance.
   * <p>
   * <b>IMPORTANT</b>: it's necessary to call this method if the client wants
   * to this class to be properly unloaded from the JVM.
   * </p>
   */
  public void destroy() {
    if (workerThread != null) {
      try {
        if (workerThread.isAlive() == true) {
          addIndex(POISON_DOC);
        }
      } catch (Exception e) {
        logger.error("SearchIndexManager destory(): " + e);
      }
    }
  }


  /**
   * Returns the profile used for configuring this class instance.
   * 
   * @return current profile
   */
  public SearchIndexProfile getProfile() {
    return profile;
  }


  /**
   * Sets the profile used for configuring the class instance.
   * 
   * @param profile
   *            profile to use
   */
  public void setProfile(SearchIndexProfile profile) {
    this.profile = profile;
  }


  /**
   * Appends a <code>Document</code> to the queue of documents to be updated
   * in the search index database. The update request is performed in an
   * asynchronous fashion. There is no guarantee that the update will be
   * completed right after this method has been called.
   * 
   * @param doc
   *            a document to be updated.
   * @throws InterruptedException
   * 
   */
  public void updateIndex(Document doc) throws InterruptedException {
    if (doc != null) {
      documentQueue.put(new Request(doc, UPD));
    }
  }


  /**
   * Appends a <code>Document</code> to the queue of documents to be added
   * in the search index database. The add request is performed in an
   * asynchronous fashion. There is no guarantee that the add will be
   * completed right after this method has been called.
   * 
   * @param doc
   *            a document to be added.
   * @throws InterruptedException
   * 
   */
  public void addIndex(Document doc) throws InterruptedException {
    if (doc != null) {
      documentQueue.put(new Request(doc, ADD));
    }
  }


  /**
   * Appends a <code>Document</code> to the queue of documents to be deleted
   * from the search index database. The delete request is performed in an
   * asynchronous fashion. There is no guarantee that the delete will be
   * completed right after this method has been called.
   * 
   * @param doc
   *            a document to be deleted.
   * @throws InterruptedException
   * 
   */
  public void deleteIndex(Document doc) throws InterruptedException {
    if (doc != null) {
      documentQueue.put(new Request(doc, DEL));
    }
  }


  /**
   * Instructs this class to clear all index records from the existing search
   * index database as soon as possible. This request is executed in an
   * asynchronous fashion. There is no guarantee that this request can be
   * completed right after the call.
   * 
   * @throws InterruptedException
   */
  public void resetIndex() throws InterruptedException {
    documentQueue.put(new Request(null, RST));
  }


  // Request type codes stored in Request.type and dispatched on by the worker.
  // ADD: add the document to the index (queued by addIndex).
  private static final int ADD = 1;
  // DEL: delete the document from the index (queued by deleteIndex).
  private static final int DEL = 2;
  // UPD: update the document in the index (queued by updateIndex).
  private static final int UPD = 3;
  // RST: recreate an empty index; carries no document (queued by resetIndex).
  private static final int RST = 4;


  private class Request {
    Document doc;
    int type;


    public Request(Document doc, int type) {
      this.doc = doc;
      this.type = type;
    }


    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("req:" + type + ",doc:" + doc);
      return sb.toString();
    }
  }


  /**
   * Finds the representative bookmark document for a ginve URL hash.
   * 
   * @param urlHash
   *            a URL MD5 Hash.
   * 
   * @return a Lucene document of the representative bookmark
   */
  public Document findLeadDocument(String urlHash) {
    IndexReader reader = null;
    TermDocs termDocs = null;
    Document leadDoc = null;
    try {
      boolean exists = IndexReader.indexExists(indexDirectory);
      if (exists == true) {
        reader = IndexReader.open(indexDirectory);
        Term key = new Term(DocumentCreator.FIELD_URL_MD5, urlHash);
        termDocs = reader.termDocs(key);
        boolean found = false;
        while (termDocs.next() && found == false) {
          int pos = termDocs.doc();
          // use FieldSelector for more efficient loading of Fields.
          // load only what's needed to determine a leading document
          Document d = reader.document(pos, new FieldSelector() {
            private static final long serialVersionUID = 1426724242925499003L;


            public FieldSelectorResult accept(String field) {
              if (field.equals(DocumentCreator.FIELD_INDEX_TYPE)) {
                return FieldSelectorResult.LOAD_AND_BREAK;
              } else {
                return FieldSelectorResult.NO_LOAD;
              }
            }
          });
          String[] values = d
              .getValues(DocumentCreator.FIELD_INDEX_TYPE);
          if (values != null) {
            List<String> vList = Arrays.asList(values);
            if (vList.contains(DocumentCreator.INDEX_TYPE_LEAD) == true) {
              leadDoc = reader.document(pos);
              found = true;
            }
          }
        }
      }
    } catch (Exception e) {
      logger.error("FindLeadDocument failed to find doc: " + urlHash
          + ", exception=" + e);
    } finally {
      try {
        if (termDocs != null) {
          termDocs.close();
        }
        if (reader != null) {
          reader.close();
        }
      } catch (Exception e) {
        logger
            .error("FindLeadDocument can't close reader or termDocs: "
                + e);
      }
    }
    return leadDoc;
  }


  /**
   * Finds a non-representative bookmark document for a given URL hash.
   * 
   * @param urlHash
   *            a URL MD5 Hash.
   * 
   * @return a Lucene document of a non-representative bookmark
   */
  public Document findNonLeadDocument(String urlHash) {
    IndexReader reader = null;
    TermDocs termDocs = null;
    Document leadDoc = null;
    try {
      boolean exists = IndexReader.indexExists(indexDirectory);
      if (exists == true) {
        reader = IndexReader.open(indexDirectory);
        Term key = new Term(DocumentCreator.FIELD_URL_MD5, urlHash);
        termDocs = reader.termDocs(key);
        boolean found = false;
        while (termDocs.next() && found == false) {
          int pos = termDocs.doc();
          // use FieldSelector for more efficient loading of Fields.
          // load only what's needed to determine a leading document
          Document d = reader.document(pos, new FieldSelector() {
            private static final long serialVersionUID = 1426724242925499003L;


            public FieldSelectorResult accept(String field) {
              if (field.equals(DocumentCreator.FIELD_INDEX_TYPE)) {
                return FieldSelectorResult.LOAD_AND_BREAK;
              } else {
                return FieldSelectorResult.NO_LOAD;
              }
            }


          });
          String[] values = d
              .getValues(DocumentCreator.FIELD_INDEX_TYPE);
          if (values != null) {
            List<String> vList = Arrays.asList(values);
            if (vList.contains(DocumentCreator.INDEX_TYPE_LEAD) == false) {
              leadDoc = reader.document(pos);
              found = true;
            }
          } else {
            leadDoc = reader.document(pos);
            found = true;
          }
        }
      }
    } catch (Exception e) {
      logger.error("FindLeadDocument failed to find doc hash: " + urlHash
          + ", exception=" + e);
    } finally {
      try {
        if (termDocs != null) {
          termDocs.close();
        }
        if (reader != null) {
          reader.close();
        }
      } catch (Exception e) {
        logger
            .error("FindLeadDocument can't close reader or termDocs: "
                + e);
      }
    }
    return leadDoc;
  }


  private class UpdateIndexWorker implements Runnable {
    // Upper bound on the number of requests batchRequest() collects into a
    // single batch.
    private static final int MAX_WORK_SIZE = Integer.MAX_VALUE;
    // Batch currently being processed by run(); getWorkQueueSize() reports
    // its size.
    // NOTE(review): assigned by the worker thread and read through
    // getWorkQueueSize() without synchronization -- confirm that callers
    // only need an estimate.
    private Queue<Request> workQueue = new LinkedList<Request>();
  
    /**
     * Blocks until at least one request is available and then collects as
     * many of the following queued requests as possible into one batch.
     * <p>
     * The first request, obtained with <code>take()</code>, is always part
     * of the batch. Every further request is first only peeked at; it is
     * removed from <code>inputQueue</code> and appended to the batch unless
     * it ends the batch, in which case it stays in the queue for the next
     * call. A request ends the batch when it
     * </p>
     * <ul>
     * <li>carries no document (as reset requests do),</li>
     * <li>carries <code>POISON_DOC</code>, or</li>
     * <li>carries a URL hash that already occurred in this batch.</li>
     * </ul>
     * <p>
     * Collecting also stops when the queue is empty or the batch has reached
     * <code>MAX_WORK_SIZE</code>. If the first request itself has no
     * document or carries the poison document, the batch consists of that
     * request alone.
     * </p>
     * 
     * @param inputQueue
     *            queue to take requests from. NOTE(review): the
     *            peek-then-poll sequence assumes this method is the only
     *            consumer of the queue -- confirm.
     * @return the batch; contains at least the first request taken.
     * @throws InterruptedException
     *             if interrupted while waiting for the first request.
     */
    private Queue<Request> batchRequest(BlockingQueue<Request> inputQueue)
        throws InterruptedException {
      // URL hashes of the documents inspected so far in this batch
      HashSet<String> seenMD5Url = new HashSet<String>();
      Queue<Request> batchWork = new LinkedList<Request>();
      // set once the request under inspection ends the batch
      boolean cutOff = false;
      // true only while req is the request removed by take()
      boolean isFromTake = true;
      Request req = inputQueue.take();    
      while (req != null && cutOff == false && batchWork.size() < MAX_WORK_SIZE) {        
        Document doc = req.doc;
        if(doc != null){
          if(POISON_DOC.equals(doc) == true){
            cutOff = true;            
          }else{
            String md5Url = doc.get(DocumentCreator.FIELD_URL_MD5);
            if(md5Url != null){
              if(seenMD5Url.contains(md5Url) == true){
                // second request for the same URL: leave it for the next batch
                cutOff = true;
              }else{
                seenMD5Url.add(md5Url);
              }
            }
          }
        }else{
          // no document (e.g. a reset request)
          cutOff = true;
        }
        if(isFromTake == true){
          // already removed from the queue by take(); always processed
          batchWork.add(req);
          isFromTake = false;
        }else{
          if(cutOff == false){
            // req was only peeked at; remove it from the queue now
            batchWork.add(inputQueue.poll());
          }
        }
        // look at the next request without removing it
        req = inputQueue.peek();
      }
      return batchWork;
    }


    private IndexWriter createIndexWriter() throws CorruptIndexException,
        LockObtainFailedException, IOException {
      Analyzer analyzer = DocumentCreator.createDocumentAnalyzer();
      return new IndexWriter(indexDirectory, analyzer);
    }


    public int getWorkQueueSize(){
      return workQueue.size();
    }
    
    public void run() {
      boolean stopRunning = false;
      while (true && stopRunning == false) {
        IndexWriter writer = null;
        try {
          workQueue = batchRequest(documentQueue);
          logger.debug("UpdateIndexWorker: batching # of Request :"
              + workQueue.size());
          Request aReq = workQueue.poll();
          if(aReq != null && aReq.type != RST){
            writer = createIndexWriter();
          }
          while (aReq != null) {
            Document doc = aReq.doc;
            if (doc != null && POISON_DOC.equals(doc)) {
              logger.debug("Terminate UpdateIndexWorker.");
              stopRunning = true;
            } else if (aReq.type == RST) {
              logger.debug("===================================> Do RESET.");
              doReset();
            } else {            
              if (aReq.type == ADD && doc != null) {
                
                doAdd(doc, writer);
              } else if (aReq.type == UPD && doc != null) {
                doUpdate(doc, writer);
              } else if (aReq.type == DEL && doc != null) {
                doDelete(doc, writer);
              } 
            }
            aReq = workQueue.poll();
          }
        } catch (InterruptedException e) {
          logger.debug("UpdateIndexWorker is interrupted.");
          stopRunning = true;
        } catch (Exception e) {
          logger.error(e);
        } finally{
          if(writer != null){
            try{
              writer.optimize();
              writer.flush();
              writer.close();
            }catch(Exception e){
              logger.error(e);
            }
          }
        }
      }
    }


    private void doReset() {
      createEmptyIndexDirectory();
    }


    private void doAdd(Document doc, IndexWriter writer)
        throws CorruptIndexException, IOException {
      if (doc == null) {
        throw new NullPointerException(
            "Can't add document to the index. Doc is NULL");
      }
      String urlHash = doc.get(DocumentCreator.FIELD_URL_MD5);
      if (urlHash == null) {
        throw new NullPointerException(
            "Can't add document to the index. Doc is missing URL hash. doc:"
                + doc);
      }
      Document leadDoc = findLeadDocument(urlHash);
      if (leadDoc == null) {
        doc = DocumentCreator.addIndexTypeLead(doc);
        logger.debug("Added Lead Index Type to doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
      }


      writer.addDocument(doc);
      logger.debug("Added doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
    }


    private void doDelete(Document doc, IndexWriter writer)
        throws CorruptIndexException, IOException {
      if (doc == null) {
        throw new NullPointerException(
            "Can't delete document from the index. Doc is NULL");
      }


      Term t = new Term(DocumentCreator.FIELD_BOOKMARK_ID, doc
          .get(DocumentCreator.FIELD_BOOKMARK_ID));
      writer.deleteDocuments(t);
      logger.debug("Deleted doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
      
      String urlHash = doc.get(DocumentCreator.FIELD_URL_MD5);
      if (urlHash == null) {
        throw new NullPointerException(
            "Can't delete document from the index. Doc is missing URL hash. doc:"
                + doc);
      }
      writer.flush();
      Document leadDoc = findLeadDocument(urlHash);
      if (leadDoc == null) {
        Document nonleadDoc = findNonLeadDocument(urlHash);
        if (nonleadDoc != null) {
          logger.debug("After deleting found a new Lead document. doc = " + nonleadDoc.get(DocumentCreator.FIELD_BOOKMARK_ID));
          nonleadDoc = DocumentCreator.addIndexTypeLead(nonleadDoc);
          doUpdate(nonleadDoc, writer);
        }
      }
    }


    private void doUpdate(Document doc, IndexWriter writer)
        throws CorruptIndexException, IOException {
      if (doc == null) {
        throw new NullPointerException(
            "Can't update document in the index. Doc is NULL");
      }
      String urlHash = doc.get(DocumentCreator.FIELD_URL_MD5);
      if (urlHash == null) {
        throw new NullPointerException(
            "Can't update document in the index. Doc is missing URL hash. doc:"
                + doc);
      }


      Term t = new Term(DocumentCreator.FIELD_BOOKMARK_ID, doc
          .get(DocumentCreator.FIELD_BOOKMARK_ID));
      writer.updateDocument(t, doc);
      logger.debug("Updated doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
    }
  }


  /**
   * Returns the directory where the search index database is stored
   * 
   * @return the <code>File</code> that represents the storage location of
   *         the index. Even if the return value is not <code>null</code>,
   *         there is no guarantee that this directory exists in the system.
   * @return the directory of the search index database.
   */
  public File getIndexDirectory() {
    return indexDirectory;
  }


}
// Source Code of com.gnizr.core.search.SearchIndexManager$Request

// Related Classes of com.gnizr.core.search.SearchIndexManager$Request