Package org.apache.lucene.search

Source Code of org.apache.lucene.search.DuplicateFilter

package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.BitSet;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.OpenBitSet;

public class DuplicateFilter extends Filter
{
 
  String fieldName;
 
  /**
   * KeepMode determines which document id to consider as the master, all others being
   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
   */
  int keepMode=KM_USE_FIRST_OCCURRENCE;
  public static final int KM_USE_FIRST_OCCURRENCE=1;
  public static final int KM_USE_LAST_OCCURRENCE=2;
 
  /**
   * "Full" processing mode starts by setting all bits to false and only setting bits
   * for documents that contain the given field and are identified as none-duplicates.

   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
   * given field. This approach avoids the need to read TermDocs for terms that are seen
   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
   * faster approach , the downside is that bitsets produced will include bits set for
   * documents that do not actually contain the field given.
   *
   */
  int processingMode=PM_FULL_VALIDATION;
  public static final int PM_FULL_VALIDATION=1;
  public static final int PM_FAST_INVALIDATION=2;
 

 
  public DuplicateFilter(String fieldName)
  {
    this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
  }
 

  public DuplicateFilter(String fieldName, int keepMode, int processingMode)
  {
    this.fieldName = fieldName;
    this.keepMode = keepMode;
    this.processingMode = processingMode;
  }

  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
  {
    if(processingMode==PM_FAST_INVALIDATION)
    {
      return fastBits(reader);
    }
    else
    {
      return correctBits(reader);
    }
  }
 
  private OpenBitSet correctBits(IndexReader reader) throws IOException
  {
   
    OpenBitSet bits=new OpenBitSet(reader.maxDoc()); //assume all are INvalid
    Term startTerm=new Term(fieldName);
    TermEnum te = reader.terms(startTerm);
    if(te!=null)
    {
      Term currTerm=te.term();
      while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
      {
        int lastDoc=-1;
        //set non duplicates
        TermDocs td = reader.termDocs(currTerm);
        if(td.next())
        {
          if(keepMode==KM_USE_FIRST_OCCURRENCE)
          {
            bits.set(td.doc());
          }
          else
          {
            do
            {
              lastDoc=td.doc();
            }while(td.next());
            bits.set(lastDoc);
          }
        }
        if(!te.next())
        {
          break;
        }
        currTerm=te.term();
      }
    }
    return bits;
  }
 
  private OpenBitSet fastBits(IndexReader reader) throws IOException
  {
   
    OpenBitSet bits=new OpenBitSet(reader.maxDoc());
    bits.set(0,reader.maxDoc()); //assume all are valid
    Term startTerm=new Term(fieldName);
    TermEnum te = reader.terms(startTerm);
    if(te!=null)
    {
      Term currTerm=te.term();
     
      while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
      {
        if(te.docFreq()>1)
        {
          int lastDoc=-1;
          //unset potential duplicates
          TermDocs td = reader.termDocs(currTerm);
          td.next();
          if(keepMode==KM_USE_FIRST_OCCURRENCE)
          {
            td.next();
          }
          do
          {
            lastDoc=td.doc();
            bits.clear(lastDoc);
          }while(td.next());
          if(keepMode==KM_USE_LAST_OCCURRENCE)
          {
            //restore the last bit
            bits.set(lastDoc);
          }         
        }
        if(!te.next())
        {
          break;
        }
        currTerm=te.term();
      }
    }
    return bits;
  }

  /**
   * @param args
   * @throws IOException
   * @throws Exception
   */
  public static void main(String[] args) throws Exception
  {
    IndexReader r=IndexReader.open("/indexes/personCentricAnon");
//    IndexReader r=IndexReader.open("/indexes/enron");
    long start=System.currentTimeMillis();
//    DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
//    DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
    DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
//    DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
//    df.setProcessingMode(PM_SLOW_VALIDATION);
    BitSet b = df.bits(r);
    long end=System.currentTimeMillis()-start;
    System.out.println(b.cardinality()+" in "+end+" ms ");

  }


  public String getFieldName()
  {
    return fieldName;
  }


  public void setFieldName(String fieldName)
  {
    this.fieldName = fieldName;
  }


  public int getKeepMode()
  {
    return keepMode;
  }


  public void setKeepMode(int keepMode)
  {
    this.keepMode = keepMode;
  }


  public boolean equals(Object obj)
  {
    if(this == obj)
      return true;
    if((obj == null) || (obj.getClass() != this.getClass()))
      return false;
    DuplicateFilter other = (DuplicateFilter)obj;
    return keepMode == other.keepMode &&
    processingMode == other.processingMode &&
      (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
  }



  public int hashCode()
  {
    int hash = 217;
    hash = 31 * hash + keepMode;
    hash = 31 * hash + processingMode;
    hash = 31 * hash + fieldName.hashCode();
    return hash; 
  }


  public int getProcessingMode()
  {
    return processingMode;
  }


  public void setProcessingMode(int processingMode)
  {
    this.processingMode = processingMode;
  }
 
 

}
TOP

Related Classes of org.apache.lucene.search.DuplicateFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.