Source Code of org.apache.uima.ruta.textruler.learner.whisk.token.Whisk

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.ruta.textruler.learner.whisk.token;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleList;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearner.TextRulerLearnerState;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.whisk.token.WhiskRuleItem.MLWhiskOtherConstraint;

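/**
 * A TextRuler learner implementing a token-based variant of the WHISK covering
 * algorithm: for each positive example that is not yet covered by the rule list,
 * it grows a seed rule anchored on the slot filler and then greedily extends the
 * rule with further terms until the Laplacian error estimate no longer improves.
 */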
public class Whisk extends TextRulerBasicLearner {

  public final static String WINDOSIZE_KEY = "windowSize";

  public final static String ERROR_THRESHOLD_KEY = "errorThreshold";

  public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";

  public final static int STANDARD_WINDOWSIZE = 5;

  public final static float STANDARD_ERROR_THRESHOLD = 0.1f;

  public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";

  TextRulerRuleList ruleList;

  protected Set<TextRulerExample> coveredExamples;

  protected int windowSize = STANDARD_WINDOWSIZE;

  protected double errorThreshold = STANDARD_ERROR_THRESHOLD;

  protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;

  int roundNumber = 0;

  int allExamplesCount = 0;

  private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();

  public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, delegate);
  }

  @Override
  public boolean collectNegativeCoveredInstancesWhenTesting() {
    return false;
  }

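  /**
   * Main learning loop: creates the positive examples for the (single) target slot
   * and grows a new rule for every example that is still uncovered. A grown rule is
   * added to the rule list if it covers no negatives or its Laplacian does not
   * exceed the configured error threshold.
   */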
  @Override
  protected void doRun() {

    // We don't use the same overall structure as the original WHISK, since we do
    // not repeat the whole process for newly added training documents at the
    // user's request. Like the other algorithms we learn from the whole training
    // set, so we do not need, for example, to test the intermediate rule base on a
    // newly "incoming" training document: all rules have already been tested on
    // all training documents!

    // This version of WHISK is not tested for multi-slot learning, since the
    // seminar announcements are not well suited for that task: they do not all
    // contain all 4 slots, some slots occur more than once in a document, and
    // their order is not always the same either! So this is only tested for the
    // single-slot case, even though it is built to handle multi-slot examples!

    // This is the inner loop of the WHISK pseudo-code:
    //   for each inst in Training
    //     for each tag

    cachedTestedRuleStatistics.clear();
    ruleList = new TextRulerRuleList();
    coveredExamples = new HashSet<TextRulerExample>();

    sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);
    // only single-slot target for now:
    TextRulerTarget target = new TextRulerTarget(slotNames[0], this);
    exampleDocuments.createExamplesForTarget(target);

    TextRulerExampleDocument[] docs = exampleDocuments.getSortedDocumentsInCacheOptimizedOrder();

    allExamplesCount = exampleDocuments.getAllPositiveExamples().size();

    for (TextRulerExampleDocument inst : docs) {
      List<TextRulerExample> tags = inst.getPositiveExamples();

      // for each uncovered example -> induce a new rule:
      for (TextRulerExample tag : tags) {
        if (!coveredExamples.contains(tag)) {
          roundNumber++;
          WhiskRule newRule = growRule(inst, tag);
          if (shouldAbort())
            break;
          // if (newRule == null)
          // break;
          // else
          if (newRule != null
                  && (newRule.getCoveringStatistics().getCoveredNegativesCount() == 0 || newRule
                          .getLaplacian() <= errorThreshold)) {
            ruleList.addRule(newRule);
            coveredExamples.addAll(newRule.getCoveringStatistics().getCoveredPositiveExamples());
            sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING, true);
          }
        }
      }
      if (shouldAbort())
        return;
    }
    sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
    cachedTestedRuleStatistics.clear();
  }

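  /**
   * Grows a rule from the given seed example: an empty rule with one slot pattern
   * per slot annotation is anchored for each slot and then repeatedly extended with
   * additional terms as long as it still covers negatives and the Laplacian keeps
   * improving.
   */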
  protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
    sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
            false);
    WhiskRule theRule = new WhiskRule(this, example.getTarget(), example);
    int numberOfSlotsInTag = example.getAnnotations().length;
    for (int i = 0; i < numberOfSlotsInTag; i++)
      theRule.getPatterns().add(new TextRulerSlotPattern());

    List<WhiskRuleItem> allTerms = getAllTermsOfExample(example);

    sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
            false);
    for (int i = 0; i < numberOfSlotsInTag; i++) {
      theRule = anchor(theRule, doc, example, allTerms, i);
      if (shouldAbort())
        return null;
    }

    sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
            false);
    if (theRule != null) {
      double oldLaplacian = theRule.getLaplacian();
      int subRoundNumber = 0;
      // repeat while we still make errors...
      while (theRule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
        WhiskRule extendedRule = extendRule(theRule, doc, example, allTerms, subRoundNumber);
        if (extendedRule == null) {
          // this way we get the previous rule
          // as the best rule...
          break;
        }
        theRule = extendedRule;
        TextRulerToolkit.log("----------------------------");
        TextRulerToolkit.log("BEST EXTENSION IS: " + theRule.getRuleString());
        TextRulerToolkit.log("Laplacian: " + theRule.getLaplacian() + "    ; "
                + theRule.getCoveringStatistics());
        subRoundNumber++;

        double newLaplacian = theRule.getLaplacian();
        if (newLaplacian >= oldLaplacian) {
          break;
        }
        oldLaplacian = newLaplacian;
      }
      TextRulerToolkit.log("----------------------------");
      TextRulerToolkit.log("FINAL RULE IS : " + theRule.getRuleString());
    }
    return theRule;
  }

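  /**
   * Builds all one-term extensions of the given rule from terms that lie within the
   * window around the slot filler: the plain term, a variant with the exact token
   * content hidden (for regexp items), and, if a POS tag root type is configured,
   * variants carrying a POS tag constraint. All candidates are tested on the
   * training set and the candidate with the lowest Laplacian wins, with ties broken
   * in favor of the more general rule (fewer constraint points). May return null if
   * no candidate qualifies.
   */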
  protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
    WhiskRule bestRule = null;
    double bestL = 1.0;
    int bestRuleConstraintPoints = -1;
    if (rule.getLaplacian() <= errorThreshold) {
      bestRule = rule;
      bestL = rule.getLaplacian();
    }

    List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
            example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
    WhiskRuleItem firstSlotTerm = slotTerms.get(0);
    WhiskRuleItem lastSlotTerm = slotTerms.get(slotTerms.size() - 1);

    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
    for (WhiskRuleItem term : allTerms) {
      if (rule.containsTerm(term)) {
        continue;
      }

      boolean rejectTerm = false;
      // for now this works only for slot 0 (no multislot stuff here yet!)
      if (term.getTermNumberInExample() < firstSlotTerm.getTermNumberInExample())
        rejectTerm = firstSlotTerm.getTermNumberInExample() - term.getTermNumberInExample() > windowSize;
      else if (term.getTermNumberInExample() > lastSlotTerm.getTermNumberInExample())
        rejectTerm = term.getTermNumberInExample() - firstSlotTerm.getTermNumberInExample() > windowSize;

      if (rejectTerm) {
        // out of window scope -> skip to next...
        continue;
      }

      WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
      WhiskRuleItem t = proposedRule.searchItemWithTermNumber(term.getTermNumberInExample());

      if (!rulesToTest.contains(proposedRule))
        rulesToTest.add(proposedRule);

      // add a second version where we remove the exact token content if
      // it is a regexp item:
      WhiskRule proposedRule2 = null;
      WhiskRuleItem t2 = null;
      if (t.getWordConstraint().isRegExpConstraint()) {
        proposedRule2 = proposedRule.copy();
        t2 = proposedRule2.searchItemWithTermNumber(term.getTermNumberInExample());
        t2.setHideRegExp(true);
        proposedRule2.setNeedsCompile(true);
        if (!rulesToTest.contains(proposedRule2)) {
          rulesToTest.add(proposedRule2);
        }
      }

      // For WHISK performance testing purposes we also add POS tags. This is not
      // very nice code and not capable of dynamic features, but it allows testing
      // WHISK with POS tag terms:
      if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
        TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
        CAS cas = example.getDocumentCAS();
        TypeSystem ts = cas.getTypeSystem();
        Type posTagsRootType = ts.getType(posTagRootTypeName);
        if (posTagsRootType != null) {
          // POS-Tags created by our test hmm tagger.
          List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
                  tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
          if (posTagAnnotations.size() > 0) {
            AnnotationFS posTag = posTagAnnotations.get(0);
            if (posTag.getBegin() == tokenAnnotation.getBegin()
                    && posTag.getEnd() == tokenAnnotation.getEnd()) {
              TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);

              // 1. most specific term with all constraints we
              // have:
              WhiskRule proposedRule3 = proposedRule.copy();
              WhiskRuleItem t3 = proposedRule3.searchItemWithTermNumber(term
                      .getTermNumberInExample());
              t3.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
              proposedRule3.setNeedsCompile(true);
              if (!rulesToTest.contains(proposedRule3))
                rulesToTest.add(proposedRule3);

              // 2. the same without the regexp thingy:
              if (proposedRule2 != null) {
                WhiskRule proposedRule4 = proposedRule2.copy();
                WhiskRuleItem t4 = proposedRule4.searchItemWithTermNumber(term
                        .getTermNumberInExample());
                t4.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
                proposedRule4.setNeedsCompile(true);
                if (!rulesToTest.contains(proposedRule4))
                  rulesToTest.add(proposedRule4);
              }

              // 3. last but not least: a rule with only the pos
              // tag constraint:
              WhiskRule proposedRule5 = proposedRule.copy();
              WhiskRuleItem t5 = proposedRule5.searchItemWithTermNumber(term
                      .getTermNumberInExample());
              t5.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
              t5.setWordConstraint(null);
              proposedRule5.setNeedsCompile(true);
              if (!rulesToTest.contains(proposedRule5))
                rulesToTest.add(proposedRule5);
            }
          }
        }
      }

    }
    if (rulesToTest.size() == 0)
      return bestRule;

    sendStatusUpdateToDelegate(
            "Round "
                    + roundNumber
                    + "."
                    + subRoundNumber
                    + " - Testing "
                    + rulesToTest.size()
                    + " rules... "
                    + " - uncovered examples: "
                    + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                            + " ; cs=" + cachedTestedRuleStatistics.size()),
            TextRulerLearnerState.ML_RUNNING, false);

    TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
    for (TextRulerRule r : rulesToTest)
      TextRulerToolkit.log(r.getRuleString());
    testRulesIfNotCached(rulesToTest); // testRulesOnDocumentSet(rulesToTest, exampleDocuments);
    if (shouldAbort())
      return null;
    for (TextRulerRule r : rulesToTest) {
      WhiskRule wr = (WhiskRule) r;
      if (wr.getLaplacian() < bestL) {
        bestL = wr.getLaplacian();
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
        TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
        if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
          TextRulerToolkit.log("\tYes, prefered!");
          bestL = wr.getLaplacian();
          bestRule = wr;
          bestRuleConstraintPoints = bestRule.totalConstraintPoints();
        }
      }
    }
    return bestRule;
  }

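  /**
   * Returns a copy of the base rule with the given term inserted. The target
   * pattern (pre-filler, filler or post-filler of a slot) is derived from the
   * term's number in the example; the term either replaces a wildcard with the same
   * term number or is inserted at the correct position, and star wildcards are
   * added towards neighbors that are not adjacent in the seed token stream.
   * Returns null if the resulting rule string would equal the base rule's.
   */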
  protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
    WhiskRule newRule = baseRule.copy();
    int foundSlotNumber = -1; // debug info
    String foundSlotPattern = "";
    int termNumber = term.getTermNumberInExample();
    // determine where this term is located relative to the slots we have...
    TextRulerRulePattern targetPattern = null;
    TextRulerRulePattern previousSlotPostFillerPattern = null;
    for (int i = 0; i < newRule.getPatterns().size(); i++) {
      TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
      // look at the prefiller pattern:
      WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem();
      if (it != null && termNumber <= it.getTermNumberInExample())
        targetPattern = slotPattern.preFillerPattern;
      // now look at the filler pattern:
      if (targetPattern == null && slotPattern.fillerPattern.size() > 0) {
        it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
        // if the term comes before the first filler item, it still belongs to the
        // prefiller pattern, which seems to be empty, so we could not find that out
        // above!
        if (termNumber < it.getTermNumberInExample())
          targetPattern = slotPattern.preFillerPattern;
        else {
          it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
          if (termNumber <= it.getTermNumberInExample()) {
            targetPattern = slotPattern.fillerPattern;
          }
        }
      }
      // now look at the postfiller pattern:
      if (targetPattern == null && slotPattern.postFillerPattern.size() > 0) {
        it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
        // if the term comes before the first postfiller item, it still belongs to
        // the filler pattern, which seems to be empty, so we could not find that
        // out above!
        if (termNumber < it.getTermNumberInExample())
          targetPattern = slotPattern.fillerPattern;
        else {
          it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
          if (termNumber <= it.getTermNumberInExample())
            targetPattern = slotPattern.postFillerPattern;
        }
      }
      if (targetPattern == null) {
        targetPattern = previousSlotPostFillerPattern;
        if (i > 0) {
          TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i - 1);
          foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern ? "PRE FILLER"
                  : (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" : "POST FILLER");
          foundSlotNumber = i - 1;
        }
      } else {
        foundSlotPattern = targetPattern == slotPattern.preFillerPattern ? "PRE FILLER"
                : (targetPattern == slotPattern.fillerPattern ? "FILLER" : "POST FILLER");
        foundSlotNumber = i;
      }
      previousSlotPostFillerPattern = slotPattern.postFillerPattern;
    }

    if (targetPattern == null) {
      targetPattern = previousSlotPostFillerPattern;
      foundSlotNumber = newRule.getPatterns().size() - 1;
      foundSlotPattern = "POST FILLER";
    }

    if (targetPattern == null) {
      TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
    } else {
      // TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString());
      // TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term);
      // TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern);
      // now put that term into the rule:
      int indexInPattern = -1;
      if (targetPattern.size() == 0) {
        targetPattern.add(term.copy());
        indexInPattern = 0;
      } else {
        // 1. search if the term would replace a wildcard:
        WhiskRuleItem wildCard = newRule.searchItemWithTermNumber(termNumber);
        if (wildCard != null) {
          if (!wildCard.isStarWildCard()) {
            TextRulerToolkit
                    .log("ERROR: FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW CAN THAT BE?");
            return null;
          }
          if (!targetPattern.contains(wildCard)) {
            TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
            return null;
          }
          indexInPattern = targetPattern.indexOf(wildCard);
          targetPattern.set(indexInPattern, term.copy());
        } else {
          // not a wildcard, so search for the insertion point:
          for (int i = 0; i < targetPattern.size(); i++) {
            WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
            if (termNumber < it.getTermNumberInExample()) {
              indexInPattern = i;
              break;
            }
          }
          if (indexInPattern < 0) {
            indexInPattern = targetPattern.size();
            targetPattern.add(term.copy());
          } else
            targetPattern.add(indexInPattern, term.copy());
        }
      }
      // Ok, now we have replaced a wildcard with the term or added the term between
      // two other items. We now have to check the neighbors of the new term: if it
      // is a direct neighbor (according to the term number) there is nothing special
      // to do, but if it is not, we have to add a wildcard between the two items (if
      // the neighbor item is not a wildcard itself!).
      WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);

      // look at left neighbor:
      WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
      if (left != null) {
        // TextRulerToolkit.log("LEFT NEIGHBOR FOUND!");

        // so we have a left neighbor. let's see if it also is the
        // neighbor in our seed token stream:
        if (left.getTermNumberInExample() < newTerm.getTermNumberInExample() - 1
                && !left.isStarWildCard()) {
          // no direct neighbor and no wildcard yet, so insert a wildcard between us!
          targetPattern.add(indexInPattern,
                  WhiskRuleItem.newWildCardItem(left.getTermNumberInExample() + 1));
          indexInPattern++;
        }
      }

      // look at right neighbor:
      WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
      if (right != null) {
        // TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!");
        // so we have a right neighbor. let's see if it also is the
        // neighbor in our seed token stream:
        if (right.getTermNumberInExample() > newTerm.getTermNumberInExample() + 1
                && !right.isStarWildCard()) {
          // no direct neighbor and no wildcard yet, so insert a wildcard between us!
          WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(newTerm.getTermNumberInExample() + 1);
          if (indexInPattern + 1 < targetPattern.size())
            targetPattern.add(indexInPattern + 1, wc);
          else
            targetPattern.add(wc);
        }
      }

      newRule.setNeedsCompile(true);
      // TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString());
      // TextRulerToolkit.log("AFTER : "+newRule.getRuleString());
      // TextRulerToolkit.log("");
    }
    if (newRule.getRuleString().equals(baseRule.getRuleString())) // this must not happen!
      return null;
    else
      return newRule;
  }

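  /**
   * Creates the two WHISK base rules for the given slot: base1 anchors on the slot
   * filler itself (its first and last terms with a wildcard in between), base2
   * anchors on the slot context (the terms directly before and after the filler
   * with a wildcard as filler). Both are tested and the one covering more positive
   * examples is returned.
   */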
  protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, List<WhiskRuleItem> allTerms, int slotIndex) {
    TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
    List<WhiskRuleItem> inside = getTermsWithinBounds(allTerms, slotAnnotation.getBegin(),
            slotAnnotation.getEnd());

    if (rule == null || inside.isEmpty()) {
      return null;
    }
    // create base 1 and base 2:
    WhiskRule base1 = rule.copy(); // slot filler rule
    TextRulerSlotPattern slotPattern = base1.getPatterns().get(slotIndex);
    for (int i = 0; i < inside.size(); i++)
      if (i == 0 || (i == inside.size() - 1))
        slotPattern.fillerPattern.add(inside.get(i).copy());
      else if (inside.size() > 2 && i < 2)
        slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(i)
                .getTermNumberInExample()));

    WhiskRule base2 = rule.copy(); // slot context rule
    slotPattern = base2.getPatterns().get(slotIndex);

    int firstOfSlot = allTerms.indexOf(inside.get(0));
    int lastOfSlot = allTerms.indexOf(inside.get(inside.size() - 1));
    if (firstOfSlot > 0)
      slotPattern.preFillerPattern.add(allTerms.get(firstOfSlot - 1));
    slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem(inside.get(0)
            .getTermNumberInExample()));
    if (lastOfSlot + 1 < allTerms.size())
      slotPattern.postFillerPattern.add(allTerms.get(lastOfSlot + 1));

    TextRulerToolkit.log("base1: " + base1.getRuleString());
    TextRulerToolkit.log("base2: " + base2.getRuleString());
    List<TextRulerRule> testRules = new ArrayList<TextRulerRule>();
    testRules.add(base1);
    testRules.add(base2);
    // testRulesOnDocumentSet(testRules, exampleDocuments);
    testRulesIfNotCached(testRules);
    if (shouldAbort())
      return null;
    TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
            + base1.getLaplacian());
    TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
            + base2.getLaplacian());
    if (base2.getCoveringStatistics().getCoveredPositivesCount() > base1.getCoveringStatistics()
            .getCoveredPositivesCount())
      return base2;
    else
      return base1;
  }

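  /**
   * Returns the learned rules as a script string (file header plus the rule list),
   * or a placeholder message if no rules have been learned yet.
   */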
  public String getResultString() {
    if (ruleList != null)
      return getTMFileHeaderString() + ruleList.getRulesString("");
    else
      return "No results available yet!";
  }

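  /**
   * Applies the learner parameters (window size, error threshold, POS tag root
   * type) from the given map; missing keys keep their default values. A minimal
   * sketch of a parameter map (hypothetical values; the types must match the casts
   * below):
   *
   * <pre>
   * Map&lt;String, Object&gt; params = new HashMap&lt;String, Object&gt;();
   * params.put(Whisk.WINDOSIZE_KEY, 5);
   * params.put(Whisk.ERROR_THRESHOLD_KEY, 0.1f);
   * params.put(Whisk.POSTAG_ROOTTYPE_KEY, "org.apache.uima.ml.ML.postag");
   * whisk.setParameters(params);
   * </pre>
   */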
  public void setParameters(Map<String, Object> params) {
    if (TextRulerToolkit.DEBUG)
      saveParametersToTempFolder(params);

    // TODO try catch
    if (params.containsKey(WINDOSIZE_KEY))
      windowSize = (Integer) params.get(WINDOSIZE_KEY);

    if (params.containsKey(ERROR_THRESHOLD_KEY))
      errorThreshold = (Float) params.get(ERROR_THRESHOLD_KEY);

    if (params.containsKey(POSTAG_ROOTTYPE_KEY))
      posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);

  }

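  /**
   * Collects all basic token annotations (Ruta ANY type) of the example document,
   * wraps each in a WhiskRuleItem and numbers the items consecutively so that terms
   * can later be located within the seed token stream.
   */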
  public List<WhiskRuleItem> getAllTermsOfExample(TextRulerExample example) {
    CAS cas = example.getDocumentCAS();
    Type tokensRootType = cas.getTypeSystem().getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);
    List<AnnotationFS> all = TextRulerToolkit.getAnnotationsWithinBounds(cas, 0, cas
            .getDocumentText().length() + 1, TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
            filterSet), tokensRootType);

    List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
    int i = 0;
    for (AnnotationFS afs : all) {
      WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(afs, example.getDocument()));
      term.setTermNumberInExample(i);
      i++;
      result.add(term);
    }
    return result;
  }

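  /**
   * Returns all terms whose token annotations lie completely within [startPos,
   * endPos]; assumes the term list is ordered by position and stops at the first
   * term that ends beyond endPos.
   */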
  public List<WhiskRuleItem> getTermsWithinBounds(List<WhiskRuleItem> allTerms, int startPos,
          int endPos) {
    List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
    for (WhiskRuleItem term : allTerms) {
      TextRulerAnnotation a = term.getWordConstraint().getTokenAnnotation();
      if (a.getBegin() >= startPos && a.getEnd() <= endPos)
        result.add(term);
      if (a.getEnd() > endPos)
        break;
    }
    return result;
  }

  // TODO share this between algorithms (e.g. LP2, RAPIER, WHISK?) and add a
  // maximum cache size, etc., like CasCache?
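  /**
   * Tests only those rules whose covering statistics are not yet cached (keyed by
   * rule string) on the document set and stores the fresh statistics in the cache,
   * so identical rule candidates are never evaluated twice.
   */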
  protected void testRulesIfNotCached(List<TextRulerRule> rules) {
    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

    for (TextRulerRule r : rules) {
      String key = r.getRuleString();
      if (cachedTestedRuleStatistics.containsKey(key)) {
        r.setCoveringStatistics(cachedTestedRuleStatistics.get(key).copy());
        TextRulerToolkit.log("CACHE HIT !");
      } else
        rulesToTest.add(r);
    }

    if (rulesToTest.size() > 0) {
      testRulesOnDocumentSet(rulesToTest, exampleDocuments);
      if (shouldAbort())
        return;
      for (TextRulerRule r : rulesToTest) {
        String key = r.getRuleString();
        cachedTestedRuleStatistics.put(key, r.getCoveringStatistics().copy());
      }
    }
  }

}