Package de.jungblut.datastructure

Source Code of de.jungblut.datastructure.InvertedIndexTest

package de.jungblut.datastructure;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;
import java.util.Set;

import org.junit.Test;

import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;

import de.jungblut.datastructure.InvertedIndex.DocumentDistanceMeasurer;
import de.jungblut.datastructure.InvertedIndex.DocumentMapper;
import de.jungblut.distance.CosineDistance;
import de.jungblut.math.DoubleVector;
import de.jungblut.math.sparse.SparseDoubleVector;
import de.jungblut.nlp.TokenizerUtils;

public class InvertedIndexTest {

  private static final List<String> phrases = Arrays.asList("I eat the dog",
      "You like the dog", "this is the best I have ever seen");

  @Test
  public void testInvertedIndex() {

    InvertedIndex<String, String> invIndex = getBuiltIndex();

    List<DistanceResult<String>> res = invIndex
        .query("something with the dog I like");
    assertEquals(3, res.size());
    assertEquals(phrases.get(1), res.get(0).get());
    assertEquals(phrases.get(0), res.get(1).get());
    assertEquals(phrases.get(2), res.get(2).get());

    res = invIndex.query("something with the dog I like", 0.8d);
    assertEquals(2, res.size());
    assertEquals(phrases.get(1), res.get(0).get());
    assertEquals(phrases.get(0), res.get(1).get());

    res = invIndex.query("something with the dog I like", 1, 0.8d);
    assertEquals(1, res.size());
    assertEquals(phrases.get(1), res.get(0).get());

    res = invIndex.query("something with the dog I like", 1, 0.8d);
    assertEquals(1, res.size());
    assertEquals(phrases.get(1), res.get(0).get());

    res = invIndex.query("something with the dog I like", 1, 0.5d);
    assertEquals(0, res.size());
  }

  @Test
  public void testVectorInvertedIndex() {

    InvertedIndex<DoubleVector, Integer> invIndex = InvertedIndex
        .createVectorIndex(new CosineDistance());
    DoubleVector v1 = new SparseDoubleVector(4);
    v1.set(1, 0.6931471805599453);
    v1.set(0, 1.3862943611198906);
    DoubleVector v2 = new SparseDoubleVector(4);
    v2.set(2, 0.6931471805599453);
    v2.set(1, 1.3862943611198906);

    invIndex.build(Arrays.asList(v1, v2));

    DoubleVector v3 = new SparseDoubleVector(4);
    v3.set(3, 0.2);
    v3.set(1, 1);
    List<DistanceResult<DoubleVector>> res = invIndex.query(v3);
    assertEquals(2, res.size());
    assertEquals(v2, res.get(0).get());
    assertEquals(v1, res.get(1).get());

  }

  public static InvertedIndex<String, String> getBuiltIndex() {
    // create a white space tokenizing index that measures the jaccard
    // distance.
    InvertedIndex<String, String> invIndex = InvertedIndex.create(
        new DocumentMapper<String, String>() {
          @Override
          public Set<String> mapDocument(String doc) {
            return Sets.newHashSet(TokenizerUtils.whiteSpaceTokenize(doc));
          }
        }, new DocumentDistanceMeasurer<String, String>() {
          @Override
          public double measure(String reference, Set<String> referenceKeys,
              String doc, Set<String> docKeys) {
            SetView<String> union = Sets.union(referenceKeys, docKeys);
            SetView<String> intersection = Sets.intersection(referenceKeys,
                docKeys);
            return 1d - (intersection.size() / (double) union.size());
          }
        });

    invIndex.build(phrases);
    return invIndex;
  }
}
TOP

Related Classes of de.jungblut.datastructure.InvertedIndexTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.