Source Code of org.sindice.siren.qparser.keyword.processors.DatatypeAnalyzerProcessor

/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.sindice.siren.qparser.keyword.processors;


import java.io.IOException;
import java.io.StringReader;
import java.util.LinkedList;
import java.util.List;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.GroupQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.NoTokenFoundQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.OrQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.RangeQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TextableQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.TokenizedPhraseQueryNode;
import org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorImpl;
import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys;
import org.apache.lucene.queryparser.flexible.standard.nodes.RegexpQueryNode;
import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode;
import org.apache.lucene.search.MultiPhraseQuery;
import org.sindice.siren.qparser.keyword.config.KeywordQueryConfigHandler.KeywordConfigurationKeys;
import org.sindice.siren.qparser.keyword.nodes.DatatypeQueryNode;
import org.sindice.siren.qparser.keyword.nodes.TwigQueryNode;
import org.sindice.siren.qparser.keyword.nodes.WildcardNodeQueryNode;


/**
 * This processor analyzes query terms based on their datatype.
 *
 * <p>
 *
 * This processor retrieves the {@link Analyzer} associated with the TAG
 * {@link DatatypeQueryNode#DATATYPE_TAGID} in the key
 * {@link KeywordConfigurationKeys#DATATYPES_ANALYZERS}, and uses it on the
 * {@link FieldQueryNode} text, which is not a {@link WildcardQueryNode},
 * a {@link FuzzyQueryNode}, a {@link RegexpQueryNode} or the bound of a
 * {@link RangeQueryNode}.
 *
 * <p>
 *
 * If no term is returned by the analyzer, a {@link NoTokenFoundQueryNode} object
 * is returned. A {@link WildcardNodeQueryNode} is returned instead if an ancestor
 * is a {@link TwigQueryNode}.
 *
 * <p>
 *
 * If the analyzer returns only one term, the
 * returned term is set to the {@link FieldQueryNode} and it is returned.
 *
 * <p>
 *
 * If the analyzer returns more than one term at different positions, a
 * {@link TokenizedPhraseQueryNode} is created. If they are all at the same
 * position, an {@link OrQueryNode} object is created and returned.
 *
 * <p>
 *
 * If the analyzer returns multiple terms and the parent node is a
 * {@link TokenizedPhraseQueryNode}, a {@link QueryNodeException} is thrown
 * because {@link MultiPhraseQuery} are not supported in SIREn.
 */
public class DatatypeAnalyzerProcessor
extends QueryNodeProcessorImpl {


  private boolean positionIncrementsEnabled;
  private int nbTwigs = 0;


  @Override
  protected QueryNode preProcessNode(final QueryNode node)
  throws QueryNodeException {
    if (node instanceof TwigQueryNode) {
      nbTwigs++;
    }
    return node;
  }


  @Override
  protected QueryNode postProcessNode(final QueryNode node)
  throws QueryNodeException {
    if (node instanceof TextableQueryNode
        && !(node instanceof WildcardQueryNode)
        && !(node instanceof FuzzyQueryNode)
        && !(node instanceof RegexpQueryNode)
        && !(node.getParent() instanceof RangeQueryNode)) {


      this.positionIncrementsEnabled = false;
      final Boolean positionIncrementsEnabled = this.getQueryConfigHandler().get(ConfigurationKeys.ENABLE_POSITION_INCREMENTS);
      if (positionIncrementsEnabled != null) {
          this.positionIncrementsEnabled = positionIncrementsEnabled;
      }


      final FieldQueryNode fieldNode = ((FieldQueryNode) node);
      final String text = fieldNode.getTextAsString();
      final String field = fieldNode.getFieldAsString();
      final String datatype = (String) fieldNode.getTag(DatatypeQueryNode.DATATYPE_TAGID);


      if (datatype == null) {
        return node;
      }


      final Analyzer analyzer = this.getQueryConfigHandler()
                                .get(KeywordConfigurationKeys.DATATYPES_ANALYZERS)
                                .get(datatype);
      if (analyzer == null) {
        throw new QueryNodeException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX, "No analyzer associated with " + datatype));
      }


      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;


      final TokenStream source;
      try {
        source = analyzer.tokenStream(field, new StringReader(text));
        source.reset();
      } catch (final IOException e1) {
        throw new RuntimeException(e1);
      }
      final CachingTokenFilter buffer = new CachingTokenFilter(source);


      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }


      try {
        while (buffer.incrementToken()) {
          numTokens++;
          final int positionIncrement = (posIncrAtt != null) ? posIncrAtt
              .getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        }
      } catch (final IOException e) {
        // ignore
      }


      try {
        // rewind the buffer stream
        buffer.reset();
        // close original stream - all tokens buffered
        source.close();
      } catch (final IOException e) {
        // ignore
      }


      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }
      final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);


      if (numTokens == 0) {
        if (nbTwigs != 0) { // Twig special case
          return new WildcardNodeQueryNode();
        }
        return new NoTokenFoundQueryNode();
      }
      else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();
        } catch (final IOException e) {
          // safe to ignore, because we know the number of tokens
        }
        fieldNode.setText(term);
        return fieldNode;
      }
      else {
        // no phrase query:
        final LinkedList<QueryNode> children = new LinkedList<QueryNode>();


        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          String term = null;
          final int positionIncrement = 1;


          try {
            final boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();


          } catch (final IOException e) {
            // safe to ignore, because we know the number of tokens
          }


          final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);


          if (this.positionIncrementsEnabled) {
            position += positionIncrement;
            newFieldNode.setPositionIncrement(position);
          } else {
            newFieldNode.setPositionIncrement(i);
          }


          children.add(new FieldQueryNode(field, term, -1, -1));
        }


        if (node.getParent() instanceof TokenizedPhraseQueryNode) {
          throw new QueryNodeException(new MessageImpl("Cannot build a MultiPhraseQuery"));
        }
        // If multiple terms at one single position, this must be a query
        // expansion. Perform a OR between the terms.
        if (severalTokensAtSamePosition && positionCount == 1) {
          return new GroupQueryNode(new OrQueryNode(children));
        }
        // if several tokens at same position && position count > 1, then
        // results can be unexpected
        else {
          final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
          for (int i = 0; i < children.size(); i++) {
            pq.add(children.get(i));
          }
          return pq;
        }
      }
    } else if (node instanceof TwigQueryNode) {
      nbTwigs--;
      assert nbTwigs >= 0;
    }
    return node;
  }


  @Override
  protected List<QueryNode> setChildrenOrder(final List<QueryNode> children)
  throws QueryNodeException {
    return children;
  }


}
Source Code of org.sindice.siren.qparser.keyword.processors.DatatypeAnalyzerProcessor

Related Classes of org.sindice.siren.qparser.keyword.processors.DatatypeAnalyzerProcessor