package org.apache.blur.utils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import org.apache.blur.analysis.FieldManager;
import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.lucene.search.SuperQuery;
import org.apache.blur.thrift.generated.Selector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.util.BytesRef;
/**
 * Static helpers that fetch the documents matching a term and rebuild them
 * with the parts that match a highlight query wrapped in caller supplied
 * pre/post tags.
 */
public class HighlightHelper {

  private static final Log LOG = LogFactory.getLog(HighlightHelper.class);

  /**
   * Field names that are copied into the highlighted document untouched.
   */
  private static final Collection<String> FIELDS_NOT_TO_HIGHLIGHT = createFieldsNotToHighlight();

  /**
   * Builds the set of field names that are never highlighted.
   */
  private static Collection<String> createFieldsNotToHighlight() {
    Collection<String> fields = new HashSet<String>();
    fields.add(BlurConstants.ROW_ID);
    fields.add(BlurConstants.RECORD_ID);
    fields.add(BlurConstants.PRIME_DOC);
    fields.add(BlurConstants.FAMILY);
    return fields;
  }

  /**
   * Searches for the documents containing <code>term</code> (optionally
   * restricted to the column families listed in the selector), pages through
   * them using the selector's start record and max records to fetch, and
   * returns each fetched document after running it through
   * {@link #highlight(int, Document, Query, FieldManager, IndexReader, String, String)}.
   * If highlighting fails with an {@link InvalidTokenOffsetsException} the
   * error is logged and the un-highlighted document is returned in its place.
   *
   * @param reader
   *          the reader to search and to load stored fields from.
   * @param term
   *          the term every returned document has to contain.
   * @param fieldSelector
   *          the stored field visitor used to load each document; it is reset
   *          after every document.
   * @param selector
   *          supplies the column families to fetch, the start record and the
   *          max records to fetch.
   * @param highlightQuery
   *          the query whose matches are highlighted.
   * @param fieldManager
   *          supplies the query analyzer and field-less field information.
   * @param preTag
   *          the text placed before each highlighted match.
   * @param postTag
   *          the text placed after each highlighted match.
   * @return the highlighted documents, empty if no document contains the term.
   * @throws IOException
   *           if the index cannot be read.
   */
  public static List<Document> highlightDocuments(IndexReader reader, Term term,
      ResetableDocumentStoredFieldVisitor fieldSelector, Selector selector, Query highlightQuery,
      FieldManager fieldManager, String preTag, String postTag) throws IOException {
    List<Document> docs = new ArrayList<Document>();
    int docFreq = reader.docFreq(term);
    if (docFreq <= 0) {
      // Nothing contains the term, and the searcher rejects a request for
      // zero hits, so there is nothing to search for.
      return docs;
    }
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    TopDocs topDocs = indexSearcher.search(buildRecordQuery(term, selector), docFreq);
    int available = Math.min(topDocs.totalHits, topDocs.scoreDocs.length);
    int start = selector.getStartRecord();
    // Computed as a long so that a very large max records value (for example
    // Integer.MAX_VALUE) combined with a non zero start cannot overflow.
    long end = Math.min((long) start + selector.getMaxRecordsToFetch(), (long) available);
    for (int i = start; i < end; i++) {
      int doc = topDocs.scoreDocs[i].doc;
      indexSearcher.doc(doc, fieldSelector);
      Document document = fieldSelector.getDocument();
      try {
        document = highlight(doc, document, highlightQuery, fieldManager, reader, preTag, postTag);
      } catch (InvalidTokenOffsetsException e) {
        // Best effort, fall back to the document as it was loaded.
        LOG.error("Unknown error while tring to highlight", e);
      }
      docs.add(document);
      fieldSelector.reset();
    }
    return docs;
  }

  /**
   * Builds the query used to locate the documents to highlight: the term
   * itself, and when the selector names column families, additionally a
   * required match on at least one of those families.
   */
  private static Query buildRecordQuery(Term term, Selector selector) {
    TermQuery termQuery = new TermQuery(term);
    if (selector.getColumnFamiliesToFetchSize() <= 0) {
      return termQuery;
    }
    BooleanQuery families = new BooleanQuery();
    for (String familyName : selector.getColumnFamiliesToFetch()) {
      families.add(new TermQuery(new Term(BlurConstants.FAMILY, familyName)), BooleanClause.Occur.SHOULD);
    }
    BooleanQuery restricted = new BooleanQuery();
    restricted.add(termQuery, BooleanClause.Occur.MUST);
    restricted.add(families, BooleanClause.Occur.MUST);
    return restricted;
  }

  /**
   * Builds a new document holding the highlighted version of the given
   * document. The field-less field and the fields in
   * {@link #FIELDS_NOT_TO_HIGHLIGHT} are copied as they are. Every other field
   * is only present in the result when it matched: numeric fields as a single
   * value wrapped in the tags, text fields as one entry per scoring fragment.
   *
   * NOTE: This method will not preserve the correct field types, every
   * highlighted value is added as a stored {@link StringField}.
   *
   * @param docId
   *          the id of the document in the reader, used to obtain the token
   *          stream of each field.
   * @param document
   *          the stored fields of the document.
   * @param query
   *          the query whose matches are highlighted.
   * @param fieldManager
   *          supplies the query analyzer and field-less field information.
   * @param reader
   *          the reader the document was loaded from.
   * @param preTag
   *          the text placed before each highlighted match.
   * @param postTag
   *          the text placed after each highlighted match.
   * @return the newly built highlighted document.
   * @throws IOException
   *           if the index cannot be read.
   * @throws InvalidTokenOffsetsException
   *           if the highlighter encounters token offsets outside of the text.
   */
  public static Document highlight(int docId, Document document, Query query, FieldManager fieldManager,
      IndexReader reader, String preTag, String postTag) throws IOException, InvalidTokenOffsetsException {
    String fieldLessFieldName = fieldManager.getFieldLessFieldName();
    // With no target field this only unwraps the SuperQuery instances.
    Query fixedQuery = fixSuperQuery(query, null, fieldLessFieldName);
    Analyzer analyzer = fieldManager.getAnalyzerForQuery();
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(preTag, postTag);
    Document result = new Document();
    for (IndexableField f : document) {
      String name = f.name();
      if (fieldLessFieldName.equals(name) || FIELDS_NOT_TO_HIGHLIGHT.contains(name)) {
        result.add(f);
        continue;
      }
      String text = f.stringValue();
      Number numericValue = f.numericValue();
      // Fields that are also indexed into the field-less field have to be
      // highlighted by queries written against the field-less field, so those
      // queries are re-targeted at this field.
      Query fieldFixedQuery;
      if (fieldManager.isFieldLessIndexed(name)) {
        fieldFixedQuery = fixSuperQuery(query, name, fieldLessFieldName);
      } else {
        fieldFixedQuery = fixedQuery;
      }
      if (numericValue != null) {
        if (shouldNumberBeHighlighted(name, numericValue, fieldFixedQuery)) {
          String numberHighlight = preTag + text + postTag;
          result.add(new StringField(name, numberHighlight, Store.YES));
        }
      } else {
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(fieldFixedQuery, name));
        TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, name, analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
        for (int j = 0; j < frag.length; j++) {
          if ((frag[j] != null) && (frag[j].getScore() > 0)) {
            result.add(new StringField(name, frag[j].toString(), Store.YES));
          }
        }
      }
    }
    return result;
  }

  /**
   * Rebuilds the query tree with every {@link SuperQuery} replaced by the
   * query it wraps and, when <code>name</code> is not null, with the leaf
   * queries on the field-less field re-targeted at <code>name</code>.
   */
  private static Query fixSuperQuery(Query query, String name, String fieldLessFieldName) {
    if (query instanceof BooleanQuery) {
      BooleanQuery bq = (BooleanQuery) query;
      BooleanQuery newBq = new BooleanQuery();
      for (BooleanClause booleanClause : bq) {
        newBq.add(fixSuperQuery(booleanClause.getQuery(), name, fieldLessFieldName), booleanClause.getOccur());
      }
      return newBq;
    } else if (query instanceof SuperQuery) {
      SuperQuery sq = (SuperQuery) query;
      return setFieldIfNeeded(sq.getQuery(), name, fieldLessFieldName);
    } else {
      return setFieldIfNeeded(query, name, fieldLessFieldName);
    }
  }

  /**
   * Returns a copy of the query that targets <code>name</code> when the query
   * is one of the supported leaf types and is written against the field-less
   * field. In every other case, including a null <code>name</code>, the query
   * is returned unchanged.
   */
  private static Query setFieldIfNeeded(Query query, String name, String fieldLessFieldName) {
    if (name == null) {
      return query;
    }
    if (query instanceof TermQuery) {
      TermQuery tq = (TermQuery) query;
      Term term = tq.getTerm();
      if (term.field().equals(fieldLessFieldName)) {
        return new TermQuery(new Term(name, term.bytes()));
      }
    } else if (query instanceof WildcardQuery) {
      WildcardQuery wq = (WildcardQuery) query;
      Term term = wq.getTerm();
      if (term.field().equals(fieldLessFieldName)) {
        return new WildcardQuery(new Term(name, term.bytes()));
      }
    } else if (query instanceof MultiPhraseQuery) {
      MultiPhraseQuery mpq = (MultiPhraseQuery) query;
      int[] positions = mpq.getPositions();
      List<Term[]> termArrays = mpq.getTermArrays();
      if (isTermField(termArrays, fieldLessFieldName)) {
        MultiPhraseQuery multiPhraseQuery = new MultiPhraseQuery();
        multiPhraseQuery.setSlop(mpq.getSlop());
        for (int i = 0; i < termArrays.size(); i++) {
          multiPhraseQuery.add(changeFields(termArrays.get(i), name), positions[i]);
        }
        return multiPhraseQuery;
      }
    } else if (query instanceof PhraseQuery) {
      PhraseQuery pq = (PhraseQuery) query;
      Term[] terms = pq.getTerms();
      int[] positions = pq.getPositions();
      // An empty phrase has no field to inspect and is left alone.
      if (terms.length > 0 && terms[0].field().equals(fieldLessFieldName)) {
        PhraseQuery phraseQuery = new PhraseQuery();
        for (int i = 0; i < terms.length; i++) {
          phraseQuery.add(new Term(name, terms[i].bytes()), positions[i]);
        }
        phraseQuery.setSlop(pq.getSlop());
        return phraseQuery;
      }
    } else if (query instanceof PrefixQuery) {
      PrefixQuery pq = (PrefixQuery) query;
      Term term = pq.getPrefix();
      if (term.field().equals(fieldLessFieldName)) {
        return new PrefixQuery(new Term(name, term.bytes()));
      }
    } else if (query instanceof TermRangeQuery) {
      TermRangeQuery trq = (TermRangeQuery) query;
      if (trq.getField().equals(fieldLessFieldName)) {
        return new TermRangeQuery(name, trq.getLowerTerm(), trq.getUpperTerm(), trq.includesLower(),
            trq.includesUpper());
      }
    }
    return query;
  }

  /**
   * Copies the terms, keeping their bytes but replacing the field with
   * <code>name</code>.
   */
  private static Term[] changeFields(Term[] terms, String name) {
    Term[] newTerms = new Term[terms.length];
    for (int i = 0; i < terms.length; i++) {
      newTerms[i] = new Term(name, terms[i].bytes());
    }
    return newTerms;
  }

  /**
   * Tells whether the first term of the first position is on the given field.
   * Returns false when there is no such term.
   */
  private static boolean isTermField(List<Term[]> termArrays, String fieldName) {
    if (termArrays.isEmpty()) {
      return false;
    }
    Term[] terms = termArrays.get(0);
    return terms.length > 0 && terms[0].field().equals(fieldName);
  }

  /**
   * Decides whether a numeric field value should be highlighted. Boolean
   * queries are searched recursively, skipping prohibited clauses, and the
   * value is highlighted as soon as one {@link NumericRangeQuery} on the same
   * field contains it. Values that are not an Integer, Long, Float or Double
   * and all other query types are never highlighted.
   *
   * @param name
   *          the name of the field holding the value.
   * @param numericValue
   *          the stored value of the field.
   * @param query
   *          the query to inspect.
   * @return true if the value matches a numeric range in the query.
   */
  public static boolean shouldNumberBeHighlighted(String name, Number numericValue, Query query) {
    if (query instanceof BooleanQuery) {
      BooleanQuery booleanQuery = (BooleanQuery) query;
      for (BooleanClause booleanClause : booleanQuery) {
        if (booleanClause.isProhibited()) {
          continue;
        }
        if (shouldNumberBeHighlighted(name, numericValue, booleanClause.getQuery())) {
          return true;
        }
      }
      return false;
    }
    if (!(query instanceof NumericRangeQuery)) {
      return false;
    }
    if (numericValue instanceof Integer) {
      return checkInteger(name, numericValue, query);
    } else if (numericValue instanceof Double) {
      return checkDouble(name, numericValue, query);
    } else if (numericValue instanceof Float) {
      return checkFloat(name, numericValue, query);
    } else if (numericValue instanceof Long) {
      return checkLong(name, numericValue, query);
    }
    return false;
  }

  /**
   * Tells whether the Long value lies within the numeric range query.
   *
   * @param name
   *          the name of the field holding the value.
   * @param numericValue
   *          the value, has to be a {@link Long}.
   * @param query
   *          the query, has to be a {@link NumericRangeQuery}.
   * @return true if the query is on the same field and its range contains the
   *         value; a null bound of the range is treated as unbounded.
   */
  public static boolean checkLong(String name, Number numericValue, Query query) {
    long value = (Long) numericValue;
    NumericRangeQuery<?> nrq = (NumericRangeQuery<?>) query;
    if (!name.equals(nrq.getField())) {
      return false;
    }
    return isWithinIntegralRange(value, nrq.getMin(), nrq.getMax(), nrq.includesMin(), nrq.includesMax());
  }

  /**
   * Tells whether the Float value lies within the numeric range query.
   *
   * @param name
   *          the name of the field holding the value.
   * @param numericValue
   *          the value, has to be a {@link Float}.
   * @param query
   *          the query, has to be a {@link NumericRangeQuery}.
   * @return true if the query is on the same field and its range contains the
   *         value; a null bound of the range is treated as unbounded.
   */
  public static boolean checkFloat(String name, Number numericValue, Query query) {
    float value = (Float) numericValue;
    NumericRangeQuery<?> nrq = (NumericRangeQuery<?>) query;
    if (!name.equals(nrq.getField())) {
      return false;
    }
    return isWithinFloatingRange(value, nrq.getMin(), nrq.getMax(), nrq.includesMin(), nrq.includesMax());
  }

  /**
   * Tells whether the Double value lies within the numeric range query.
   *
   * @param name
   *          the name of the field holding the value.
   * @param numericValue
   *          the value, has to be a {@link Double}.
   * @param query
   *          the query, has to be a {@link NumericRangeQuery}.
   * @return true if the query is on the same field and its range contains the
   *         value; a null bound of the range is treated as unbounded.
   */
  public static boolean checkDouble(String name, Number numericValue, Query query) {
    double value = (Double) numericValue;
    NumericRangeQuery<?> nrq = (NumericRangeQuery<?>) query;
    if (!name.equals(nrq.getField())) {
      return false;
    }
    return isWithinFloatingRange(value, nrq.getMin(), nrq.getMax(), nrq.includesMin(), nrq.includesMax());
  }

  /**
   * Tells whether the Integer value lies within the numeric range query.
   *
   * @param name
   *          the name of the field holding the value.
   * @param numericValue
   *          the value, has to be an {@link Integer}.
   * @param query
   *          the query, has to be a {@link NumericRangeQuery}.
   * @return true if the query is on the same field and its range contains the
   *         value; a null bound of the range is treated as unbounded.
   */
  public static boolean checkInteger(String name, Number numericValue, Query query) {
    int value = (Integer) numericValue;
    NumericRangeQuery<?> nrq = (NumericRangeQuery<?>) query;
    if (!name.equals(nrq.getField())) {
      return false;
    }
    return isWithinIntegralRange(value, nrq.getMin(), nrq.getMax(), nrq.includesMin(), nrq.includesMax());
  }

  /**
   * Range check for int and long values. A null bound means that side of the
   * range is open.
   */
  private static boolean isWithinIntegralRange(long value, Number min, Number max, boolean includesMin,
      boolean includesMax) {
    if (min != null) {
      long lower = min.longValue();
      boolean aboveLower = includesMin ? value >= lower : value > lower;
      if (!aboveLower) {
        return false;
      }
    }
    if (max != null) {
      long upper = max.longValue();
      boolean belowUpper = includesMax ? value <= upper : value < upper;
      if (!belowUpper) {
        return false;
      }
    }
    return true;
  }

  /**
   * Range check for float and double values. A null bound means that side of
   * the range is open. The comparisons are written in their positive form so
   * that a NaN value or a NaN bound never counts as a match.
   */
  private static boolean isWithinFloatingRange(double value, Number min, Number max, boolean includesMin,
      boolean includesMax) {
    if (min != null) {
      double lower = min.doubleValue();
      boolean aboveLower = includesMin ? value >= lower : value > lower;
      if (!aboveLower) {
        return false;
      }
    }
    if (max != null) {
      double upper = max.doubleValue();
      boolean belowUpper = includesMax ? value <= upper : value < upper;
      if (!belowUpper) {
        return false;
      }
    }
    return true;
  }
}