// original code from Apache Solr - ported to work with Lucene 3.x and reformatted to Search coding style
/*
* Hibernate, Relational Persistence for Idiomatic Java
*
* Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors. All third-party contributions are
* distributed under license by Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* SynonymFilter handles multi-token synonyms with variable position increment offsets.
* <p/>
* The matched tokens from the input stream may be optionally passed through (includeOrig=true)
* or discarded. If the original tokens are included, the position increments may be modified
* to retain absolute positions after merging with the synonym tokenstream.
* <p/>
* Generated synonyms will start at the same position as the first matched source token.
*
* @version $Id: SynonymFilter.java 991055 2010-08-31 01:40:19Z rmuir $
*/
public final class SynonymFilter extends TokenFilter {

	private final SynonymMap map; // Map<String, SynonymMap>
	private Iterator<AttributeSource> replacement; // iterator over generated tokens

	//
	// Defer creation of the buffer until the first time it is used to
	// optimize short fields with no matches.
	//
	private LinkedList<AttributeSource> buffer; // pushed-back (unmatched) tokens, consumed before pulling from input
	private LinkedList<AttributeSource> matched; // tokens consumed by the current match attempt

	// Tracks end-of-stream so we never call input.incrementToken() again after it
	// has returned false, as required by the TokenStream consumer workflow.
	// Without this guard the backtracking in match() can pull past the end
	// repeatedly (fixed upstream in Lucene as LUCENE-3113).
	private boolean exhausted;

	/**
	 * Creates a filter that injects synonyms from {@code map} into the stream {@code in}.
	 *
	 * @param in the input token stream
	 * @param map the synonym trie; each node maps a term to a sub-map of longer matches
	 */
	public SynonymFilter(TokenStream in, SynonymMap map) {
		super( in );
		this.map = map;
		// just ensuring these attributes exist...
		addAttribute( TermAttribute.class );
		addAttribute( PositionIncrementAttribute.class );
		addAttribute( OffsetAttribute.class );
		addAttribute( TypeAttribute.class );
	}

	/*
	 * Need to worry about multiple scenarios:
	 * - need to go for the longest match
	 * a b => foo #shouldn't match if "a b" is followed by "c d"
	 * a b c d => bar
	 * - need to backtrack - retry matches for tokens already read
	 * a b c d => foo
	 * b c => bar
	 * If the input stream is "a b c x", one will consume "a b c d"
	 * trying to match the first rule... all but "a" should be
	 * pushed back so a match may be made on "b c".
	 * - don't try and match generated tokens (thus need separate queue)
	 * matching is not recursive.
	 * - handle optional generation of original tokens in all these cases,
	 * merging token streams to preserve token positions.
	 * - preserve original positionIncrement of first matched token
	 */
	@Override
	public boolean incrementToken() throws IOException {
		while ( true ) {
			// if there are any generated tokens, return them... don't try any
			// matches against them, as we specifically don't want recursion.
			if ( replacement != null && replacement.hasNext() ) {
				copy( this, replacement.next() );
				return true;
			}

			// common case fast-path of first token not matching anything
			AttributeSource firstTok = nextTok();
			if ( firstTok == null ) {
				return false;
			}

			TermAttribute termAtt = firstTok.addAttribute( TermAttribute.class );
			SynonymMap result = map.submap != null ? map.submap
					.get( termAtt.termBuffer(), 0, termAtt.termLength() ) : null;
			if ( result == null ) {
				copy( this, firstTok );
				return true;
			}

			// fast-path failed, clone ourselves if needed
			// (match() will keep reading from input and overwrite our attributes)
			if ( firstTok == this ) {
				firstTok = cloneAttributes();
			}

			// OK, we matched a token, so find the longest match.
			matched = new LinkedList<AttributeSource>();
			result = match( result );

			if ( result == null ) {
				// no match, simply return the first token read.
				copy( this, firstTok );
				return true;
			}

			// reuse, or create new one each time?
			ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>( result.synonyms.length + matched.size() + 1 );

			//
			// there was a match... let's generate the new tokens, merging
			// in the matched tokens (position increments need adjusting)
			//
			AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
			boolean includeOrig = result.includeOrig();

			AttributeSource origTok = includeOrig ? firstTok : null;
			PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute( PositionIncrementAttribute.class );
			int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
			int repPos = 0; // curr position in replacement token stream
			int pos = 0; // current position in merged token stream

			for ( int i = 0; i < result.synonyms.length; i++ ) {
				Token repTok = result.synonyms[i];
				AttributeSource newTok = firstTok.cloneAttributes();
				TermAttribute newTermAtt = newTok.addAttribute( TermAttribute.class );
				OffsetAttribute newOffsetAtt = newTok.addAttribute( OffsetAttribute.class );
				PositionIncrementAttribute newPosIncAtt = newTok.addAttribute( PositionIncrementAttribute.class );

				// generated tokens span from the start of the first matched token
				// to the end of the last matched token
				OffsetAttribute lastOffsetAtt = lastTok.addAttribute( OffsetAttribute.class );
				newOffsetAtt.setOffset( newOffsetAtt.startOffset(), lastOffsetAtt.endOffset() );
				newTermAtt.setTermBuffer( repTok.termBuffer(), 0, repTok.termLength() );
				repPos += repTok.getPositionIncrement();
				if ( i == 0 ) {
					repPos = origPos;
				} // make position of first token equal to original

				// if necessary, insert original tokens and adjust position increment
				while ( origTok != null && origPos <= repPos ) {
					PositionIncrementAttribute origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
					origPosInc.setPositionIncrement( origPos - pos );
					generated.add( origTok );
					pos += origPosInc.getPositionIncrement();
					origTok = matched.isEmpty() ? null : matched.removeFirst();
					if ( origTok != null ) {
						origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
						origPos += origPosInc.getPositionIncrement();
					}
				}

				newPosIncAtt.setPositionIncrement( repPos - pos );
				generated.add( newTok );
				pos += newPosIncAtt.getPositionIncrement();
			}

			// finish up any leftover original tokens
			while ( origTok != null ) {
				PositionIncrementAttribute origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
				origPosInc.setPositionIncrement( origPos - pos );
				generated.add( origTok );
				pos += origPosInc.getPositionIncrement();
				origTok = matched.isEmpty() ? null : matched.removeFirst();
				if ( origTok != null ) {
					origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
					origPos += origPosInc.getPositionIncrement();
				}
			}

			// what if we replaced a longer sequence with a shorter one?
			// a/0 b/5 => foo/0
			// should I re-create the gap on the next buffered token?

			replacement = generated.iterator();
			// Now return to the top of the loop to read and return the first
			// generated token.. The reason this is done is that we may have generated
			// nothing at all, and may need to continue with more matching logic.
		}
	}

	/**
	 * Returns the next token to consider for matching: a pushed-back token from
	 * {@code buffer} if any, otherwise the next token pulled from the input
	 * (in which case {@code this} is returned, as the attributes live on this
	 * filter instance). Returns {@code null} when the stream is exhausted.
	 */
	private AttributeSource nextTok() throws IOException {
		if ( buffer != null && !buffer.isEmpty() ) {
			return buffer.removeFirst();
		}
		else {
			// guard: the TokenStream contract forbids calling incrementToken()
			// again once it has returned false
			if ( !exhausted && input.incrementToken() ) {
				return this;
			}
			else {
				exhausted = true;
				return null;
			}
		}
	}

	/** Pushes back an unmatched token so it can be re-examined on a later pass. */
	private void pushTok(AttributeSource t) {
		if ( buffer == null ) {
			buffer = new LinkedList<AttributeSource>();
		}
		buffer.addFirst( t );
	}

	/**
	 * Recursively consumes tokens from the stream, descending the synonym trie
	 * to find the longest match. Consumed tokens that end up being part of the
	 * match are prepended to {@link #matched}; tokens read past the match are
	 * pushed back onto {@link #buffer}.
	 *
	 * @param map the trie node reached by the tokens consumed so far
	 * @return the deepest node with synonyms reachable from here, or {@code null} if none
	 */
	private SynonymMap match(SynonymMap map) throws IOException {
		SynonymMap result = null;

		if ( map.submap != null ) {
			AttributeSource tok = nextTok();
			if ( tok != null ) {
				// clone ourselves.
				if ( tok == this ) {
					tok = cloneAttributes();
				}
				// check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
				TermAttribute termAtt = tok.getAttribute( TermAttribute.class );
				SynonymMap subMap = map.submap.get( termAtt.termBuffer(), 0, termAtt.termLength() );

				if ( subMap != null ) {
					// recurse
					result = match( subMap );
				}

				if ( result != null ) {
					matched.addFirst( tok );
				}
				else {
					// push back unmatched token
					pushTok( tok );
				}
			}
		}

		// if no longer sequence matched, then if this node has synonyms, it's the match.
		if ( result == null && map.synonyms != null ) {
			result = map;
		}

		return result;
	}

	/** Copies all attribute state from {@code source} onto {@code target}. */
	private void copy(AttributeSource target, AttributeSource source) {
		if ( target != source ) {
			if ( source.hasAttributes() ) {
				State sourceState = source.captureState();
				target.restoreState( sourceState );
			}
		}
	}

	@Override
	public void reset() throws IOException {
		input.reset();
		replacement = null;
		exhausted = false;
		// discard any tokens pushed back from a previous (possibly not fully
		// drained) stream, so a reused filter doesn't leak stale tokens
		if ( buffer != null ) {
			buffer.clear();
		}
	}
}