package com.googlecode.gaal.analysis.impl;
import static org.junit.Assert.fail;
import java.io.FileNotFoundException;
import java.util.NavigableSet;
import java.util.TreeSet;
import org.junit.Before;
import org.junit.Test;
import com.googlecode.gaal.analysis.api.Context;
import com.googlecode.gaal.analysis.api.Filter;
import com.googlecode.gaal.analysis.impl.EmbeddedIntervalExtractor.EmbeddedContext;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.api.IntSequence;
import com.googlecode.gaal.data.api.Multiset;
import com.googlecode.gaal.data.impl.ArraySequence;
import com.googlecode.gaal.data.impl.CorpusTest;
import com.googlecode.gaal.data.impl.HashMultiset;
import com.googlecode.gaal.suffix.api.EmbeddedSuffixTree.EmbeddedInterval;
import com.googlecode.gaal.suffix.api.IntervalTree.Interval;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree.BinaryInterval;
import com.googlecode.gaal.suffix.impl.LinearizedSuffixTreeImpl;
public class ContextExtractorTest {
private static final int WINDOW_SIZE = 5;
private Corpus<String> corpus;
private Iterable<EmbeddedInterval> embeddedExtractor;
private Iterable<Context> nestedExtractor;
private LinearizedSuffixTree lst;
@Before
public void setUp() throws Exception {
this.corpus = CorpusTest.createSmallCorpus();
lst = new LinearizedSuffixTreeImpl(corpus.sequence(), corpus.alphabetSize());
embeddedExtractor = new EmbeddedContextExtractor(lst, corpus, new NonSingletonBwtSetBuilder(),
new NoContextFilter<EmbeddedInterval>(), WINDOW_SIZE);
nestedExtractor = new NestedMaximalityContextExtractor<String>(corpus, false);
}
@Test
public void testPrint() throws FileNotFoundException {
Iterable<EmbeddedInterval> contextExtractor = new EmbeddedContextExtractor(lst, corpus,
new SupermaximalSetBuilder(), new NoContextFilter<EmbeddedInterval>(), WINDOW_SIZE);
for (EmbeddedInterval embeddedInterval : contextExtractor) {
Context context = new EmbeddedContext(embeddedInterval);
System.out.printf("%s %s %s\n", corpus.toString(context.leftSequence(), " "),
corpus.toString(context.fillerSet(), " "), corpus.toString(context.rightSequence(), " "));
}
}
@Test
public void testEmbeddedContexts() {
System.out.println("Embedded Suffix Tree Test");
int counter = 0;
for (EmbeddedInterval embeddedInterval : embeddedExtractor) {
Context context = new EmbeddedContext(embeddedInterval);
IntSequence left = context.leftSequence();
IntSequence right = context.rightSequence();
Multiset<IntSequence> expectedSet = fillerSet(left, right);
Multiset<IntSequence> resultSet = context.fillerSet();
if (!resultSet.equals(expectedSet)) {
String leftString = corpus.toString(left, " ");
String rightString = corpus.toString(right, " ");
String expectedString = corpus.toString(expectedSet, " ");
String resultString = corpus.toString(resultSet, " ");
System.out.printf("%s %s %s\n", leftString, expectedString, rightString);
System.err.printf("%s %s %s\n", leftString, resultString, rightString);
fail("fill mismatch");
}
if (expectedSet.size() != context.fillerSetSize()) {
System.err.printf("%d != %d\n", expectedSet.size(), context.fillerSetSize());
fail("filler set size mismatch");
}
counter++;
}
System.out.printf("total contexts: %d\n", counter);
}
@Test
public void testNestedContexts() {
System.out.println("Nested Maximality Test");
int counter = 0;
for (Context context : nestedExtractor) {
IntSequence left = context.leftSequence();
IntSequence right = context.rightSequence();
Multiset<IntSequence> expectedSet = fillerSet(left, right);
Multiset<IntSequence> resultSet = context.fillerSet();
for (IntSequence filler : resultSet) {
int expectedQuantity = expectedSet.getQuantity(filler);
int resultQuantity = resultSet.getQuantity(filler);
if (expectedQuantity == 0) {
String leftString = corpus.toString(left, " ");
String rightString = corpus.toString(right, " ");
String expectedString = corpus.toString(expectedSet, " ");
String resultString = corpus.toString(resultSet, " ");
System.out.printf("%s %s %s\n", leftString, expectedString, rightString);
System.err.printf("%s %s %s\n", leftString, resultString, rightString);
fail("fill mismatch");
} else if (expectedQuantity != resultQuantity) {
System.out.printf("%d != %d\n", expectedQuantity, resultQuantity);
}
}
if (expectedSet.size() != context.fillerSetSize()) {
System.out.printf("filler set size mismatch: %d != %d\n", expectedSet.size(), context.fillerSetSize());
}
counter++;
}
System.out.printf("total contexts: %d\n", counter);
}
private Multiset<IntSequence> fillerSet(IntSequence left, IntSequence right) {
BinaryInterval leftInterval = lst.search(left);
Multiset<IntSequence> expectedSet = null;
if (leftInterval == null) {
System.err.printf("left context not found: %s\n", left);
fail("left context not found");
} else {
Interval rightInterval = lst.search(right);
if (rightInterval == null) {
System.err.printf("right context not found: %s\n", right);
fail("right context not found");
} else {
NavigableSet<Integer> leftIndices = intervalIndices(leftInterval);
int[] suffixTable = lst.getSuffixTable();
expectedSet = new HashMultiset<IntSequence>();
for (int i = rightInterval.left(); i <= rightInterval.right(); i++) {
int end = suffixTable[i];
Integer start = leftIndices.floor(end);
if (start != null && end - start < WINDOW_SIZE) {
IntSequence fillerSeq = lst.getSequence().subSequence(start, end);
boolean hasSpecial = false;
for (int j = 0; j < fillerSeq.size(); j++) {
if (corpus.isSeparator(fillerSeq.get(j))) {
hasSpecial = true;
break;
}
}
if (!hasSpecial)
expectedSet.add(fillerSeq);
}
}
}
}
return expectedSet;
}
public IntSequence toIntSequence(String string) {
String[] stringArray = string.split(" ");
int[] intArray = new int[stringArray.length];
for (int i = 0; i < intArray.length; i++) {
intArray[i] = corpus.toInt(stringArray[i]);
}
return new ArraySequence(intArray);
}
private NavigableSet<Integer> intervalIndices(BinaryInterval interval) {
int lcp = interval.lcp();
NavigableSet<Integer> intervalIndices = new TreeSet<Integer>();
int[] suffixTable = lst.getSuffixTable();
for (int i = interval.left(); i <= interval.right(); i++) {
intervalIndices.add(suffixTable[i] + lcp);
}
return intervalIndices;
}
private static class NoContextFilter<C> implements Filter<C> {
@Override
public boolean passed(C context) {
return true;
}
}
}