Source Code of org.apache.jackrabbit.core.query.lucene.SearchIndex

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  The ASF licenses this file to You
 * under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.jackrabbit.core.query.lucene;


import org.apache.jackrabbit.core.ItemManager;
import org.apache.jackrabbit.core.SessionImpl;
import org.apache.jackrabbit.core.NodeId;
import org.apache.jackrabbit.core.NodeIdIterator;
import org.apache.jackrabbit.core.query.AbstractQueryHandler;
import org.apache.jackrabbit.core.query.ExecutableQuery;
import org.apache.jackrabbit.core.query.QueryHandlerContext;
import org.apache.jackrabbit.core.query.TextFilter;
import org.apache.jackrabbit.core.query.QueryHandler;
import org.apache.jackrabbit.core.state.NodeState;
import org.apache.jackrabbit.core.state.NodeStateIterator;
import org.apache.jackrabbit.name.NoPrefixDeclaredException;
import org.apache.jackrabbit.name.QName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.commons.collections.iterators.AbstractIteratorDecorator;


import javax.jcr.RepositoryException;
import javax.jcr.query.InvalidQueryException;
import java.io.IOException;
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;


/**
 * Implements a {@link org.apache.jackrabbit.core.query.QueryHandler} using
 * Lucene.
 */
public class SearchIndex extends AbstractQueryHandler {


    /** The logger instance for this class */
    private static final Logger log = LoggerFactory.getLogger(SearchIndex.class);


    /**
     * Name of the file to persist search internal namespace mappings
     */
    private static final String NS_MAPPING_FILE = "ns_mappings.properties";


    /**
     * The default value for property {@link #minMergeDocs}.
     */
    public static final int DEFAULT_MIN_MERGE_DOCS = 100;


    /**
     * The default value for property {@link #maxMergeDocs}.
     */
    public static final int DEFAULT_MAX_MERGE_DOCS = 100000;


    /**
     * the default value for property {@link #mergeFactor}.
     */
    public static final int DEFAULT_MERGE_FACTOR = 10;


    /**
     * Default text filters.
     */
    public static final String DEFAULT_TEXT_FILTERS = TextPlainTextFilter.class.getName();


    /**
     * The actual index
     */
    private MultiIndex index;


    /**
     * The analyzer we use for indexing.
     */
    private Analyzer analyzer;


    /**
     * List of {@link org.apache.jackrabbit.core.query.TextFilter} instance.
     */
    private List textFilters;


    /**
     * The location of the search index.
     * <p/>
     * Note: This is a <b>mandatory</b> parameter!
     */
    private String path;


    /**
     * minMergeDocs config parameter.
     */
    private int minMergeDocs = DEFAULT_MIN_MERGE_DOCS;


    /**
     * volatileIdleTime config parameter.
     */
    private int volatileIdleTime = 3;


    /**
     * maxMergeDocs config parameter
     */
    private int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;


    /**
     * mergeFactor config parameter
     */
    private int mergeFactor = DEFAULT_MERGE_FACTOR;


    /**
     * Number of documents that are buffered before they are added to the index.
     */
    private int bufferSize = 10;


    /**
     * Compound file flag
     */
    private boolean useCompoundFile = true;


    /**
     * Flag indicating whether document order is enable as the default ordering.
     */
    private boolean documentOrder = true;


    /**
     * If set <code>true</code> the index is checked for consistency on startup.
     * If <code>false</code> a consistency check is only performed when there
     * are entries in the redo log on startup.
     * <p/>
     * Default value is: <code>false</code>.
     */
    private boolean forceConsistencyCheck = false;


    /**
     * If set <code>true</code> errors detected by the consistency check are
     * repaired. If <code>false</code> the errors are only reported in the log.
     * <p/>
     * Default value is: <code>true</code>.
     */
    private boolean autoRepair = true;


    /**
     * The uuid resolver cache size.
     * <p/>
     * Default value is: <code>1000</code>.
     */
    private int cacheSize = 1000;


    /**
     * Indicates if this <code>SearchIndex</code> is closed and cannot be used
     * anymore.
     */
    private boolean closed = false;


    /**
     * Default constructor.
     */
    public SearchIndex() {
        this.analyzer = new StandardAnalyzer(new String[]{});
        setTextFilterClasses(DEFAULT_TEXT_FILTERS);
    }


    /**
     * Initializes this <code>QueryHandler</code>. This implementation requires
     * that a path parameter is set in the configuration. If this condition
     * is not met, a <code>IOException</code> is thrown.
     *
     * @throws IOException if an error occurs while initializing this handler.
     */
    protected void doInit() throws IOException {
        QueryHandlerContext context = getContext();
        if (path == null) {
            throw new IOException("SearchIndex requires 'path' parameter in configuration!");
        }


        Set excludedIDs= new HashSet();
        if (context.getExcludedNodeId() != null) {
            excludedIDs.add(context.getExcludedNodeId());
        }


        File indexDir = new File(path);


        NamespaceMappings nsMappings;
        if (context.getParentHandler() instanceof SearchIndex) {
            // use system namespace mappings
            SearchIndex sysIndex = (SearchIndex) context.getParentHandler();
            nsMappings = sysIndex.getNamespaceMappings();
        } else {
            // read local namespace mappings
            File mapFile = new File(indexDir, NS_MAPPING_FILE);
             nsMappings = new NamespaceMappings(mapFile);
        }


        index = new MultiIndex(indexDir, this, context.getItemStateManager(),
                context.getRootId(), excludedIDs, nsMappings);
        if (index.getRedoLogApplied() || forceConsistencyCheck) {
            log.info("Running consistency check...");
            try {
                ConsistencyCheck check = ConsistencyCheck.run(index,
                        context.getItemStateManager());
                if (autoRepair) {
                    check.repair(true);
                } else {
                    List errors = check.getErrors();
                    if (errors.size() == 0) {
                        log.info("No errors detected.");
                    }
                    for (Iterator it = errors.iterator(); it.hasNext(); ) {
                        ConsistencyCheckError err = (ConsistencyCheckError) it.next();
                        log.info(err.toString());
                    }
                }
            } catch (Exception e) {
                log.warn("Failed to run consistency check on index: " + e);
            }
        }
        log.info("Index initialized: " + path);
    }


    /**
     * Adds the <code>node</code> to the search index.
     * @param node the node to add.
     * @throws RepositoryException if an error occurs while indexing the node.
     * @throws IOException if an error occurs while adding the node to the index.
     */
    public void addNode(NodeState node) throws RepositoryException, IOException {
        throw new UnsupportedOperationException("addNode");
    }


    /**
     * Removes the node with <code>uuid</code> from the search index.
     * @param id the id of the node to remove from the index.
     * @throws IOException if an error occurs while removing the node from
     * the index.
     */
    public void deleteNode(NodeId id) throws IOException {
        throw new UnsupportedOperationException("deleteNode");
    }


    /**
     * This implementation forwards the call to
     * {@link MultiIndex#update(java.util.Iterator, java.util.Iterator)} and
     * transforms the two iterators to the required types.
     *
     * @param remove uuids of nodes to remove.
     * @param add    NodeStates to add. Calls to <code>next()</code> on this
     *               iterator may return <code>null</code>, to indicate that a
     *               node could not be indexed successfully.
     * @throws RepositoryException if an error occurs while indexing a node.
     * @throws IOException         if an error occurs while updating the index.
     */
    public void updateNodes(NodeIdIterator remove, NodeStateIterator add)
            throws RepositoryException, IOException {
        checkOpen();
        index.update(new AbstractIteratorDecorator(remove) {
            public Object next() {
                NodeId id = (NodeId) super.next();
                return new Term(FieldNames.UUID, id.getUUID().toString());
            }
        }, new AbstractIteratorDecorator(add) {
            public Object next() {
                NodeState state = (NodeState) super.next();
                if (state == null) {
                    return null;
                }
                Document doc = null;
                try {
                    doc = createDocument(state, getNamespaceMappings());
                } catch (RepositoryException e) {
                    log.error("Exception while creating document for node: " +
                            state.getNodeId() + ": " + e.toString());
                }
                return doc;
            }
        });
    }


    /**
     * Creates a new query by specifying the query statement itself and the
     * language in which the query is stated.  If the query statement is
     * syntactically invalid, given the language specified, an
     * InvalidQueryException is thrown. <code>language</code> must specify a query language
     * string from among those returned by QueryManager.getSupportedQueryLanguages(); if it is not
     * then an <code>InvalidQueryException</code> is thrown.
     *
     * @param session the session of the current user creating the query object.
     * @param itemMgr the item manager of the current user.
     * @param statement the query statement.
     * @param language the syntax of the query statement.
     * @throws InvalidQueryException if statement is invalid or language is unsupported.
     * @return A <code>Query</code> object.
     */
    public ExecutableQuery createExecutableQuery(SessionImpl session,
                                             ItemManager itemMgr,
                                             String statement,
                                             String language)
            throws InvalidQueryException {
        QueryImpl query = new QueryImpl(session, itemMgr, this,
                getContext().getPropertyTypeRegistry(), statement, language);
        query.setRespectDocumentOrder(documentOrder);
        return query;
    }


    /**
     * Closes this <code>QueryHandler</code> and frees resources attached
     * to this handler.
     */
    public void close() {
        index.close();
        getContext().destroy();
        closed = true;
        log.info("Index closed: " + path);
    }


    /**
     * Executes the query on the search index.
     * @param queryImpl the query impl.
     * @param query the lucene query.
     * @param orderProps name of the properties for sort order.
     * @param orderSpecs the order specs for the sort order properties.
     * <code>true</code> indicates ascending order, <code>false</code> indicates
     * descending.
     * @return the lucene Hits object.
     * @throws IOException if an error occurs while searching the index.
     */
    public QueryHits executeQuery(QueryImpl queryImpl,
                                  Query query,
                                  QName[] orderProps,
                                  boolean[] orderSpecs) throws IOException {
        checkOpen();
        QueryHandler parentHandler = getContext().getParentHandler();
        IndexReader parentReader = null;
        if (parentHandler instanceof SearchIndex) {
            parentReader = ((SearchIndex) parentHandler).index.getIndexReader();
        }


        SortField[] sortFields = createSortFields(orderProps, orderSpecs);


        IndexReader reader = index.getIndexReader();
        if (parentReader != null) {
            // todo FIXME not type safe
            CachingMultiReader[] readers = {(CachingMultiReader) reader,
                                            (CachingMultiReader) parentReader};
            reader = new CombinedIndexReader(readers);
        }


        IndexSearcher searcher = new IndexSearcher(reader);
        Hits hits;
        if (sortFields.length > 0) {
            hits = searcher.search(query, new Sort(sortFields));
        } else {
            hits = searcher.search(query);
        }
        return new QueryHits(hits, reader);
    }


    /**
     * Returns the analyzer in use for indexing.
     * @return the analyzer in use for indexing.
     */
    public Analyzer getTextAnalyzer() {
        return analyzer;
    }


    /**
     * Returns an unmodifiable list of {@link TextFilter} configured for
     * this search index.
     *
     * @return unmodifiable list of text filters.
     */
    protected List getTextFilters() {
        return textFilters;
    }


    /**
     * Returns the namespace mappings for the internal representation.
     * @return the namespace mappings for the internal representation.
     */
    public NamespaceMappings getNamespaceMappings() {
        return index.getNamespaceMappings();
    }


    /**
     * Creates the SortFields for the order properties.
     *
     * @param orderProps the order properties.
     * @param orderSpecs the order specs for the properties.
     * @return an array of sort fields
     */
    protected SortField[] createSortFields(QName[] orderProps,
                                           boolean[] orderSpecs) {
        List sortFields = new ArrayList();
        for (int i = 0; i < orderProps.length; i++) {
            String prop = null;
            if (QName.JCR_SCORE.equals(orderProps[i])) {
                // order on jcr:score does not use the natural order as
                // implemented in lucene. score ascending in lucene means that
                // higher scores are first. JCR specs that lower score values
                // are first.
                sortFields.add(new SortField(null, SortField.SCORE, orderSpecs[i]));
            } else {
                try {
                    prop = orderProps[i].toJCRName(getNamespaceMappings());
                } catch (NoPrefixDeclaredException e) {
                    // will never happen
                }
                sortFields.add(new SortField(prop, SharedFieldSortComparator.PROPERTIES, !orderSpecs[i]));
            }
        }
        return (SortField[]) sortFields.toArray(new SortField[sortFields.size()]);
    }


    /**
     * Creates a lucene <code>Document</code> from a node state using the
     * namespace mappings <code>nsMappings</code>.
     * @param node the node state to index.
     * @param nsMappings the namespace mappings of the search index.
     * @return a lucene <code>Document</code> that contains all properties
     *  of <code>node</code>.
     * @throws RepositoryException if an error occurs while indexing the
     *  <code>node</code>.
     */
    protected Document createDocument(NodeState node, NamespaceMappings nsMappings)
            throws RepositoryException {
        return NodeIndexer.createDocument(node, getContext().getItemStateManager(),
                nsMappings, textFilters);
    }


    /**
     * Returns the actual index.
     *
     * @return the actual index.
     */
    protected MultiIndex getIndex() {
        return index;
    }


    //----------------------------< internal >----------------------------------


    /**
     * Combines multiple {@link CachingMultiReader} into a <code>MultiReader</code>
     * with {@link HierarchyResolver} support.
     */
    protected static final class CombinedIndexReader extends MultiReader implements HierarchyResolver {


        /**
         * The sub readers.
         */
        private CachingMultiReader[] subReaders;


        /**
         * Doc number starts for each sub reader
         */
        private int[] starts;


        public CombinedIndexReader(CachingMultiReader[] indexReaders) throws IOException {
            super(indexReaders);
            this.subReaders = indexReaders;
            this.starts = new int[subReaders.length + 1];


            int maxDoc = 0;
            for (int i = 0; i < subReaders.length; i++) {
                starts[i] = maxDoc;
                maxDoc += subReaders[i].maxDoc();
            }
            starts[subReaders.length] = maxDoc;
        }


        /**
         * @inheritDoc
         */
        public int getParent(int n) throws IOException {
            int i = readerIndex(n);
            DocId id = subReaders[i].getParentDocId(n - starts[i]);
            id = id.applyOffset(starts[i]);
            return id.getDocumentNumber(this);
        }


        /**
         * Returns the reader index for document <code>n</code>.
         * Implementation copied from lucene MultiReader class.
         *
         * @param n document number.
         * @return the reader index.
         */
        private int readerIndex(int n) {
            int lo = 0;                                      // search starts array
            int hi = subReaders.length - 1;                  // for first element less


            while (hi >= lo) {
                int mid = (lo + hi) >> 1;
                int midValue = starts[mid];
                if (n < midValue) {
                    hi = mid - 1;
                } else if (n > midValue) {
                    lo = mid + 1;
                } else {                                      // found a match
                    while (mid + 1 < subReaders.length && starts[mid + 1] == midValue) {
                        mid++;                                  // scan to last match
                    }
                    return mid;
                }
            }
            return hi;
        }


    }


    //--------------------------< properties >----------------------------------


    /**
     * Sets the analyzer in use for indexing. The given analyzer class name
     * must satisfy the following conditions:
     * <ul>
     *   <li>the class must exist in the class path</li>
     *   <li>the class must have a public default constructor</li>
     *   <li>the class must be a Lucene Analyzer</li>
     * </ul>
     * <p>
     * If the above conditions are met, then a new instance of the class is
     * set as the analyzer. Otherwise a warning is logged and the current
     * analyzer is not changed.
     * <p>
     * This property setter method is normally invoked by the Jackrabbit
     * configuration mechanism if the "analyzer" parameter is set in the
     * search configuration.
     *
     * @param analyzerClassName the analyzer class name
     */
    public void setAnalyzer(String analyzerClassName) {
        try {
            Class analyzerClass = Class.forName(analyzerClassName);
            analyzer = (Analyzer) analyzerClass.newInstance();
        } catch (Exception e) {
            log.warn("Invalid Analyzer class: " + analyzerClassName, e);
        }
    }


    /**
     * Returns the class name of the analyzer that is currently in use.
     *
     * @return class name of analyzer in use.
     */
    public String getAnalyzer() {
        return analyzer.getClass().getName();
    }


    /**
     * Sets the location of the search index.
     *
     * @param path the location of the search index.
     */
    public void setPath(String path) {
        this.path = path;
    }


    /**
     * Returns the location of the search index. Returns <code>null</code> if
     * not set.
     *
     * @return the location of the search index.
     */
    public String getPath() {
        return path;
    }


    /**
     * The lucene index writer property: useCompoundFile
     */
    public void setUseCompoundFile(boolean b) {
        useCompoundFile = b;
    }


    /**
     * Returns the current value for useCompoundFile.
     *
     * @return the current value for useCompoundFile.
     */
    public boolean getUseCompoundFile() {
        return useCompoundFile;
    }


    /**
     * The lucene index writer property: minMergeDocs
     */
    public void setMinMergeDocs(int minMergeDocs) {
        this.minMergeDocs = minMergeDocs;
    }


    /**
     * Returns the current value for minMergeDocs.
     *
     * @return the current value for minMergeDocs.
     */
    public int getMinMergeDocs() {
        return minMergeDocs;
    }


    /**
     * Sets the property: volatileIdleTime
     *
     * @param volatileIdleTime idle time in seconds
     */
    public void setVolatileIdleTime(int volatileIdleTime) {
        this.volatileIdleTime = volatileIdleTime;
    }


    /**
     * Returns the current value for volatileIdleTime.
     *
     * @return the current value for volatileIdleTime.
     */
    public int getVolatileIdleTime() {
        return volatileIdleTime;
    }


    /**
     * The lucene index writer property: maxMergeDocs
     */
    public void setMaxMergeDocs(int maxMergeDocs) {
        this.maxMergeDocs = maxMergeDocs;
    }


    /**
     * Returns the current value for maxMergeDocs.
     *
     * @return the current value for maxMergeDocs.
     */
    public int getMaxMergeDocs() {
        return maxMergeDocs;
    }


    /**
     * The lucene index writer property: mergeFactor
     */
    public void setMergeFactor(int mergeFactor) {
        this.mergeFactor = mergeFactor;
    }


    /**
     * Returns the current value for the merge factor.
     *
     * @return the current value for the merge factor.
     */
    public int getMergeFactor() {
        return mergeFactor;
    }


    /**
     * @see VolatileIndex#setBufferSize(int)
     */
    public void setBufferSize(int size) {
        bufferSize = size;
    }


    /**
     * Returns the current value for the buffer size.
     *
     * @return the current value for the buffer size.
     */
    public int getBufferSize() {
        return bufferSize;
    }


    public void setRespectDocumentOrder(boolean docOrder) {
        documentOrder = docOrder;
    }


    public boolean getRespectDocumentOrder() {
        return documentOrder;
    }


    public void setForceConsistencyCheck(boolean b) {
        forceConsistencyCheck = b;
    }


    public boolean getForceConsistencyCheck() {
        return forceConsistencyCheck;
    }


    public void setAutoRepair(boolean b) {
        autoRepair = b;
    }


    public boolean getAutoRepair() {
        return autoRepair;
    }


    public void setCacheSize(int size) {
        cacheSize = size;
    }


    public int getCacheSize() {
        return cacheSize;
    }


    /**
     * Sets a new set of text filter classes that are in use for indexing
     * binary properties. The <code>filterClasses</code> must be a comma
     * separated <code>String</code> of fully qualified class names implementing
     * {@link org.apache.jackrabbit.core.query.TextFilter}. Each class must
     * provide a default constructor.
     * </p>
     * Filter class names that cannot be resolved are skipped and a warn message
     * is logged.
     *
     * @param filterClasses comma separated list of filter class names
     */
    public void setTextFilterClasses(String filterClasses) {
        List filters = new ArrayList();
        StringTokenizer tokenizer = new StringTokenizer(filterClasses, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String className = tokenizer.nextToken();
            try {
                Class filterClass = Class.forName(className);
                TextFilter filter = (TextFilter) filterClass.newInstance();
                filters.add(filter);
            } catch (Exception e) {
                log.warn("Invalid TextFilter class: " + className, e);
            } catch (LinkageError e) {
                log.warn("Missing dependency for text filter: " + className);
                log.warn(e.toString());
            }
        }
        textFilters = Collections.unmodifiableList(filters);
    }


    /**
     * Returns the fully qualified class names of the text filter instances
     * currently in use. The names are comma separated.
     *
     * @return class names of the text filters in use.
     */
    public String getTextFilterClasses() {
        StringBuffer names = new StringBuffer();
        String delim = "";
        for (Iterator it = textFilters.iterator(); it.hasNext(); ) {
            names.append(delim);
            names.append(it.next().getClass().getName());
            delim = ",";
        }
        return names.toString();
    }


    //----------------------------< internal >----------------------------------


    /**
     * Checks if this <code>SearchIndex</code> is open, otherwise throws
     * an <code>IOException</code>.
     *
     * @throws IOException if this <code>SearchIndex</code> had been closed.
     */
    private void checkOpen() throws IOException {
        if (closed) {
            throw new IOException("query handler closed and cannot be used anymore.");
}
    }
}
Source Code of org.apache.jackrabbit.core.query.lucene.SearchIndex

Related Classes of org.apache.jackrabbit.core.query.lucene.SearchIndex