Package org.carrot2.source.lucene

Source Code of org.carrot2.source.lucene.LuceneDocumentSourceTest

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.source.lucene;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.core.test.QueryableDocumentSourceTestBase;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

/**
* Tests Lucene document source.
*/
public class LuceneDocumentSourceTest extends
    QueryableDocumentSourceTestBase<LuceneDocumentSource>
{
    private static SimpleAnalyzer analyzer;
    private static RAMDirectory directory;

    @SuppressWarnings("deprecation")
    @BeforeClass
    public static void prepareIndex() throws Exception
    {
        directory = new RAMDirectory();
        analyzer = new SimpleAnalyzer(Version.LUCENE_CURRENT);
        LuceneIndexUtils.createAndPopulateIndex(directory, analyzer);
    }

    @Before
    public void prepareComponent()
    {
        this.initAttributes.put(
            AttributeUtils.getKey(LuceneDocumentSource.class, "directory"), directory);

        this.initAttributes.put(
            AttributeUtils.getKey(SimpleFieldMapper.class, "titleField"), "title");

        this.initAttributes.put(
            AttributeUtils.getKey(SimpleFieldMapper.class, "contentField"), "snippet");

        this.initAttributes.put(
            AttributeUtils.getKey(SimpleFieldMapper.class, "urlField"), "url");

        this.initAttributes.put(
            AttributeUtils.getKey(SimpleFieldMapper.class, "searchFields"),
            Arrays.asList(new String []
            {
                "title", "snippet"
            }));
    }

    @Override
    public Class<LuceneDocumentSource> getComponentClass()
    {
        return LuceneDocumentSource.class;
    }

    @Override
    protected boolean hasUtfResults()
    {
        return false;
    }

    @Override
    protected String getSmallQueryText()
    {
        return "software";
    }

    @Override
    protected int getSmallQuerySize()
    {
        return 13;
    }

    @Override
    protected String getLargeQueryText()
    {
        return "data mining";
    }

    @Override
    protected int getLargeQuerySize()
    {
        return 100;
    }

    @Test
    public void testCustomFormatter() throws Exception
    {
        this.initAttributes.put(
            AttributeUtils.getKey(SimpleFieldMapper.class, "formatter"),
            SimpleHTMLFormatter.class);

        runQuery(getLargeQueryText(), getLargeQuerySize());

        int highlights = 0;
        for (Document d : getDocuments())
        {
            if (((String) d.getField(Document.SUMMARY)).indexOf("") >= 0)
            {
                highlights++;
            }
        }

        assertThat(highlights).as("Number of highlights").isGreaterThan(10);
    }

    @Test
    public void testCustomQuery() throws Exception
    {
        final BooleanQuery query = new BooleanQuery();
        query.add(new TermQuery(new Term("snippet", "data")), Occur.MUST);

        this.processingAttributes.put(AttributeNames.QUERY, query);

        assertThat(runQuery(null, getLargeQuerySize())).as("Number of results")
            .isGreaterThan(10);
    }

    @Test
    public void testAdvancedQueries() throws Exception
    {
        assertThat(runQuery("\"data mining\"", getLargeQuerySize())).as(
            "Number of results").isEqualTo(99);
    }

    @Test
    public void testMultiEntryField() throws Exception
    {
        runQuery("\"termb\"", getLargeQuerySize());

        final List<Document> list = getDocuments();
        assertThat(list.size()).isEqualTo(1);
        assertThat(list.get(0).getSummary()).contains("terma");
        assertThat(list.get(0).getSummary()).contains("termb");
    }

    /**
     * Test case for CARROT-820.
     */
    @Test
    public void testCatchAllQueryWithHighlighting() throws Exception
    {
        SimpleFieldMapperDescriptor.attributeBuilder(processingAttributes).formatter(
            PlainTextFormatter.class);
        runQuery("*:*", 2);

        final List<Document> list = getDocuments();
        assertThat(list.size()).isEqualTo(2);
        assertThat(list.get(0).getSummary()).isNotEmpty();
        assertThat(list.get(0).getSummary()).isNotEmpty();
    }

    @Test
    public void luceneScorePassing() throws Exception
    {
        final int results = 10;
        assertThat(runQuery("\"data mining\"", results)).as("Number of results")
            .isEqualTo(results);
        for (Document document : getDocuments())
        {
            assertThat(document.getScore()).isNotNull().isGreaterThan(0);
        }
    }

    /**
     * Keeping Lucene documents by default is not a good idea, because it would cause the
     * cache size to grow very quickly.
     */
    @Test
    public void luceneDocumentNotPassedByDefault() throws Exception
    {
        final int results = 10;
        assertThat(runQuery("\"data mining\"", results)).as("Number of results")
            .isEqualTo(results);
        for (Document document : getDocuments())
        {
            for (Object field : document.getFields().values())
            {
                // Lucene Document class is final
                assertThat(field.getClass()).as("Field type").isNotEqualTo(
                    org.apache.lucene.document.Document.class);
            }
        }
    }

    @Test
    public void luceneDocumentPassing() throws Exception
    {
        LuceneDocumentSourceDescriptor.attributeBuilder(processingAttributes)
            .keepLuceneDocuments(true);

        final int results = 10;
        assertThat(runQuery("\"data mining\"", results)).as("Number of results")
            .isEqualTo(results);
        for (Document document : getDocuments())
        {
            assertThat(document.getField(LuceneDocumentSource.LUCENE_DOCUMENT_FIELD))
                .isInstanceOf(org.apache.lucene.document.Document.class);
        }
    }

    @Test
    public void luceneDocumentNotSerialized() throws Exception
    {
        final int results = 2;
        CommonAttributesDescriptor.attributeBuilder(processingAttributes)
            .query("\"data mining\"").results(results);
        LuceneDocumentSourceDescriptor.attributeBuilder(processingAttributes)
            .keepLuceneDocuments(true);
        final ProcessingResult result = getSimpleController(initAttributes).process(
            processingAttributes, LuceneDocumentSource.class);
        assertThat(result.getDocuments().size()).as("Number of results").isEqualTo(
            results);

        final StringWriter json = new StringWriter();
        result.serializeJson(json);
        assertThat(json.toString()).doesNotContain("\"luceneDocument\"");

        final ByteArrayOutputStream xml = new ByteArrayOutputStream();
        result.serialize(xml);
        assertThat(xml.toString("UTF-8")).doesNotContain(
            "org.apache.lucene.document.Document");
    }
}
TOP

Related Classes of org.carrot2.source.lucene.LuceneDocumentSourceTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.