Package org.apache.lucene.sandbox.queries

Source Code of org.apache.lucene.sandbox.queries.DuplicateFilter

package org.apache.lucene.sandbox.queries;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;

/**
* Filter to remove duplicate values from search results.
* <p>
* WARNING: for this to work correctly, you may have to wrap
* your reader as it cannot current deduplicate across different
* index segments.
*
* @see SlowCompositeReaderWrapper
*/
public class DuplicateFilter extends Filter {
  // TODO: make duplicate filter aware of ReaderContext such that we can
  // filter duplicates across segments

  /**
   * KeepMode determines which document id to consider as the master, all others being
   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
   */
  public enum KeepMode {
    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
  }

  private KeepMode keepMode;

  /**
   * "Full" processing mode starts by setting all bits to false and only setting bits
   * for documents that contain the given field and are identified as none-duplicates.
   * <p/>
   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
   * given field. This approach avoids the need to read DocsEnum for terms that are seen
   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
   * faster approach , the downside is that bitsets produced will include bits set for
   * documents that do not actually contain the field given.
   */

  public enum ProcessingMode {
    PM_FULL_VALIDATION, PM_FAST_INVALIDATION
  }

  private ProcessingMode processingMode;

  private String fieldName;

  public DuplicateFilter(String fieldName) {
    this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
  }

  public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
    this.fieldName = fieldName;
    this.keepMode = keepMode;
    this.processingMode = processingMode;
  }

  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
      return fastBits(context.reader(), acceptDocs);
    } else {
      return correctBits(context.reader(), acceptDocs);
    }
  }

  private FixedBitSet correctBits(AtomicReader reader, Bits acceptDocs) throws IOException {
    FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
    Terms terms = reader.fields().terms(fieldName);

    if (terms == null) {
      return bits;
    }

    TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;
    while (true) {
      BytesRef currTerm = termsEnum.next();
      if (currTerm == null) {
        break;
      } else {
        docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
        int doc = docs.nextDoc();
        if (doc != DocIdSetIterator.NO_MORE_DOCS) {
          if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
            bits.set(doc);
          } else {
            int lastDoc = doc;
            while (true) {
              lastDoc = doc;
              doc = docs.nextDoc();
              if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                break;
              }
            }
            bits.set(lastDoc);
          }
        }
      }
    }
    return bits;
  }

  private FixedBitSet fastBits(AtomicReader reader, Bits acceptDocs) throws IOException {
    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
    bits.set(0, reader.maxDoc()); //assume all are valid
    Terms terms = reader.fields().terms(fieldName);

    if (terms == null) {
      return bits;
    }

    TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docs = null;
    while (true) {
      BytesRef currTerm = termsEnum.next();
      if (currTerm == null) {
        break;
      } else {
        if (termsEnum.docFreq() > 1) {
          // unset potential duplicates
          docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE);
          int doc = docs.nextDoc();
          if (doc != DocIdSetIterator.NO_MORE_DOCS) {
            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
              doc = docs.nextDoc();
            }
          }

          int lastDoc = -1;
          while (true) {
            lastDoc = doc;
            bits.clear(lastDoc);
            doc = docs.nextDoc();
            if (doc == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
          }

          if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
            // restore the last bit
            bits.set(lastDoc);
          }
        }
      }
    }

    return bits;
  }

  public String getFieldName() {
    return fieldName;
  }

  public void setFieldName(String fieldName) {
    this.fieldName = fieldName;
  }

  public KeepMode getKeepMode() {
    return keepMode;
  }

  public void setKeepMode(KeepMode keepMode) {
    this.keepMode = keepMode;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if ((obj == null) || (obj.getClass() != this.getClass())) {
      return false;
    }

    DuplicateFilter other = (DuplicateFilter) obj;
    return keepMode == other.keepMode &&
        processingMode == other.processingMode &&
        fieldName != null && fieldName.equals(other.fieldName);
  }

  @Override
  public int hashCode() {
    int hash = 217;
    hash = 31 * hash + keepMode.hashCode();
    hash = 31 * hash + processingMode.hashCode();
    hash = 31 * hash + fieldName.hashCode();
    return hash;
  }

  public ProcessingMode getProcessingMode() {
    return processingMode;
  }

  public void setProcessingMode(ProcessingMode processingMode) {
    this.processingMode = processingMode;
  }
}
TOP

Related Classes of org.apache.lucene.sandbox.queries.DuplicateFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.