// Package: org.apache.uima.ruta.textruler.learner.whisk.generic

// Source code of org.apache.uima.ruta.textruler.learner.whisk.generic.Whisk

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.ruta.textruler.learner.whisk.generic;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRuleList;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.core.TextRulerWordConstraint;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.ruta.textruler.learner.whisk.generic.WhiskRuleItem.MLWhiskOtherConstraint;

/**
 * Implementation of the WHISK rule learning algorithm for the Ruta TextRuler framework. Rules are
 * grown from seed examples and extended term by term (see doRun/growRule/extendRule).
 */
public class Whisk extends TextRulerBasicLearner {

  // configuration parameter keys
  public final static String WINDOWSIZE_KEY = "windowSize";

  public final static String ERROR_THRESHOLD_KEY = "errorThreshold";

  public final static String POSTAG_ROOTTYPE_KEY = "posTagRootType";

  // default values for the configuration parameters above
  public final static int STANDARD_WINDOWSIZE = 5;

  public final static float STANDARD_ERROR_THRESHOLD = 0.1f;

  public final static String STANDARD_POSTAG_ROOTTYPE = "org.apache.uima.ml.ML.postag";

  public static final String CONSIDERED_FEATURES = "consideredFeatures";

  public static final String STANDARD_CONSIDERED_FEATURES = "";

  // rules accepted so far by the learning loop
  TextRulerRuleList ruleList;

  // positive examples already covered by an accepted rule
  protected Set<TextRulerExample> coveredExamples;

  // maximum number of terms considered around a slot filler
  protected int windowSize = STANDARD_WINDOWSIZE;

  // maximum acceptable Laplacian error of a rule
  protected double errorThreshold = STANDARD_ERROR_THRESHOLD;

  // root type of POS tag annotations used for additional rule constraints (may be null/empty)
  protected String posTagRootTypeName = STANDARD_POSTAG_ROOTTYPE;

  // counts outer learning rounds (one per processed seed example)
  int roundNumber = 0;

  // total number of positive examples for the current slot
  int allExamplesCount = 0;

  // names of annotation features considered for additional rule constraints
  private List<String> consideredFeatures = new ArrayList<String>();

  // statistics cache keyed by rule string, used by testRulesIfNotCached to avoid re-testing
  private Map<String, TextRulerStatisticsCollector> cachedTestedRuleStatistics = new HashMap<String, TextRulerStatisticsCollector>();

  /**
   * Creates a new WHISK learner; all arguments are passed through unchanged to the
   * {@link TextRulerBasicLearner} superclass constructor.
   */
  public Whisk(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
    // useDynamicAnchoring = true;
  }

  /**
   * This learner does not collect the covered negative instances themselves during rule testing;
   * it only relies on their counts (see the Laplacian/negatives checks in doRun and extendRule).
   */
  @Override
  public boolean collectNegativeCoveredInstancesWhenTesting() {
    return false;
  }

  @Override
  protected void doRun() {

    // we don't use the same overall structure like the original WHISK since
    // we do not
    // repeat the whole process for some new training documents at the
    // user's request, we
    // learn like the other algorithms from the whole training set, so we
    // for example do not
    // need to test the intermediate rule base on a newly "incoming"
    // training document since we
    // tested all rules already on all training documents !

    // this version of whisk is not tested for mutli slot learning since the
    // seminar announcements
    // are not quite suitable for this task: they do not all contain all 4
    // slots and some of them
    // occur more than once in one document ! And the order of them is not
    // always the same as well!
    // so this is now made only tested for the single slot case even if it
    // is built capable of multislot
    // examples!

    // this is the inner loop of the WHISK pseudo-code:
    // For each inst in Training
    // for each tag

    cachedTestedRuleStatistics.clear();
    ruleList = new TextRulerRuleList();
    coveredExamples = new HashSet<TextRulerExample>();

    sendStatusUpdateToDelegate("Creating examples...", TextRulerLearnerState.ML_RUNNING, false);
    for (int i = 0; i < slotNames.length; i++) {
      TextRulerTarget target = new TextRulerTarget(slotNames[i], this);
      exampleDocuments.createExamplesForTarget(target);

      TextRulerExampleDocument[] docs = exampleDocuments.getSortedDocumentsInCacheOptimizedOrder();

      allExamplesCount = exampleDocuments.getAllPositiveExamples().size();

      for (TextRulerExampleDocument inst : docs) {
        List<TextRulerExample> tags = inst.getPositiveExamples();

        // for each uncovered example -> induce a new rule:
        for (TextRulerExample tag : tags) {
          if (!coveredExamples.contains(tag)) {
            roundNumber++;
            WhiskRule newRule = growRule(inst, tag);
            if (shouldAbort())
              break;
            // if (newRule == null)
            // break;
            // else
            if (newRule != null
                    && (newRule.getCoveringStatistics().getCoveredNegativesCount() == 00 || newRule
                            .getLaplacian() <= errorThreshold)) {
              ruleList.addRule(newRule);
              coveredExamples.addAll(newRule.getCoveringStatistics().getCoveredPositiveExamples());
              sendStatusUpdateToDelegate("New Rule added...", TextRulerLearnerState.ML_RUNNING,
                      true);
            }
          }
        }
        if (shouldAbort())
          return;
      }
    }
    sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
    cachedTestedRuleStatistics.clear();
  }

  protected WhiskRule growRule(TextRulerExampleDocument doc, TextRulerExample example) {
    sendStatusUpdateToDelegate("Creating new rule from seed...", TextRulerLearnerState.ML_RUNNING,
            false);
    WhiskRule theRule = new WhiskRule(this, example.getTarget(), example);
    int numberOfSlotsInTag = example.getAnnotations().length;
    for (int i = 0; i < numberOfSlotsInTag; i++)
      theRule.getPatterns().add(new TextRulerSlotPattern());

    sendStatusUpdateToDelegate("Creating new rule: anchoring...", TextRulerLearnerState.ML_RUNNING,
            false);
    for (int i = 0; i < numberOfSlotsInTag; i++) {
      theRule = anchor(theRule, doc, example, i);
      if (shouldAbort())
        return null;
    }

    sendStatusUpdateToDelegate("Creating new rule: extending...", TextRulerLearnerState.ML_RUNNING,
            false);
    if (theRule != null) {
      double oldLaplacian = theRule.getLaplacian();
      int subRoundNumber = 0;

      // repeat while we still make errors...
      while (theRule.getCoveringStatistics().getCoveredNegativesCount() > 0) {
        WhiskRule extendedRule = extendRule(theRule, doc, example, subRoundNumber);
        if (extendedRule == null) {
          // this way we get the previous rule
          // as the best rule...
          break;
        }
        theRule = extendedRule;
        TextRulerToolkit.log("----------------------------");
        TextRulerToolkit.log("BEST EXTENSION IS: " + theRule.getRuleString());
        TextRulerToolkit.log("Laplacian: " + theRule.getLaplacian() + "    ; "
                + theRule.getCoveringStatistics());
        subRoundNumber++;

        double newLaplacian = theRule.getLaplacian();
        if (newLaplacian >= oldLaplacian) {
          break;
        }
        oldLaplacian = newLaplacian;
      }
      TextRulerToolkit.log("----------------------------");
      TextRulerToolkit.log("FINAL RULE IS : " + theRule.getRuleString());
    }
    return theRule;
  }

  protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, int subRoundNumber) {
    WhiskRule bestRule = null;
    double bestL = 1.0;
    int bestRuleConstraintPoints = -1;
    if (rule.getLaplacian() <= errorThreshold) {
      bestRule = rule;
      bestL = rule.getLaplacian();
    }
    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

    // first only add conditions, e.g., for features

    List<TextRulerSlotPattern> patterns = rule.getPatterns();
    for (TextRulerSlotPattern eachPattern : patterns) {
      for (TextRulerRuleItem item : eachPattern.fillerPattern) {
        if (item instanceof WhiskRuleItem) {
          WhiskRuleItem wri = (WhiskRuleItem) item;
          WhiskRule proposedRule = rule;
          TextRulerWordConstraint wordConstraint = wri.getWordConstraint();
          for (String eachFeature : consideredFeatures) {
            if (wordConstraint != null) {
              Map<String, String> featureMap = wordConstraint.getTokenAnnotation().getFeatureMap();
              String stringValue = featureMap.get(eachFeature);
              if (stringValue != null && !wri.getActivatedFeatures().contains(eachFeature)) {
                wri.activateFeature(eachFeature);
                WhiskRule proposedRuleF = proposedRule.copy();
                wri.deactivateFeature(eachFeature);
                proposedRuleF.setNeedsCompile(true);
                if (!rulesToTest.contains(proposedRuleF)) {
                  rulesToTest.add(proposedRuleF);
                }
              }
            }
          }
          if (wordConstraint != null && wordConstraint.isRegExpConstraint() && wri.isHideRegExp()) {
            wri.setHideRegExp(false);
            WhiskRule proposedRuleF = proposedRule.copy();
            wri.setHideRegExp(true);
            proposedRuleF.setNeedsCompile(true);
            if (!rulesToTest.contains(proposedRuleF)) {
              rulesToTest.add(proposedRuleF);
            }
          }
        }
      }
    }

    List<List<WhiskRuleItem>> slotTerms = getTermsWithinBounds(
            example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd(), example);
    List<List<WhiskRuleItem>> windowTerms = getTermsWithinWindow(slotTerms, example, 0);

    for (List<WhiskRuleItem> eachList : windowTerms) {
      for (WhiskRuleItem term : eachList) {

        if (rule.containsTerm(term)) {
          continue;
        }

        WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
        if (proposedRule == null)
          continue;
        WhiskRuleItem t = term;

        if (!rulesToTest.contains(proposedRule))
          rulesToTest.add(proposedRule);

        // add a second version where we add the exact token content if
        // it is a regexp item:
        WhiskRule proposedRule2 = proposedRule;
        if (t.getWordConstraint().isRegExpConstraint()) {
          t.setHideRegExp(false);
          WhiskRule proposedRuleF = proposedRule.copy();
          t.setHideRegExp(true);
          proposedRuleF.setNeedsCompile(true);
          if (!rulesToTest.contains(proposedRuleF)) {
            rulesToTest.add(proposedRuleF);
          }
        }

        // extend with feature conditions
        WhiskRule proposedRuleF = null;
        for (String eachFeature : consideredFeatures) {
          Map<String, String> featureMap = t.getWordConstraint().getTokenAnnotation()
                  .getFeatureMap();
          String stringValue = featureMap.get(eachFeature);
          if (stringValue != null) {
            t.activateFeature(eachFeature);
            proposedRuleF = proposedRule.copy();
            t.deactivateFeature(eachFeature);
            proposedRuleF.setNeedsCompile(true);
            if (!rulesToTest.contains(proposedRuleF)) {
              rulesToTest.add(proposedRuleF);
            }
          }
        }

        // and now, for WHISK performance testing purposes, we also add POS
        // tags:
        // this is not very nice code and not dynamic feature capable, but
        // for testpurposes
        // in order to test WHISK with PosTag Terms...
        if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
          TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
          CAS cas = example.getDocumentCAS();
          TypeSystem ts = cas.getTypeSystem();
          Type posTagsRootType = ts.getType(posTagRootTypeName);
          if (ts != null) {
            // POS-Tags created by our test hmm tagger.
            List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
                    tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
            if (posTagAnnotations.size() > 0) {
              AnnotationFS posTag = posTagAnnotations.get(0);
              if (posTag.getBegin() == tokenAnnotation.getBegin()
                      && posTag.getEnd() == tokenAnnotation.getEnd()) {
                TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc,
                        consideredFeatures);

                // 1. most specific term with all constraints we
                // have:
                WhiskRule proposedRule3 = proposedRule.copy();
                WhiskRuleItem t3 = term;
                t3.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
                proposedRule3.setNeedsCompile(true);
                if (!rulesToTest.contains(proposedRule3))
                  rulesToTest.add(proposedRule3);

                // 2. the same without the regexp thingy:
                if (proposedRule2 != null) {
                  WhiskRule proposedRule4 = proposedRule2.copy();
                  WhiskRuleItem t4 = term;
                  t4.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                          posTagAnnotation));
                  proposedRule4.setNeedsCompile(true);
                  if (!rulesToTest.contains(proposedRule4))
                    rulesToTest.add(proposedRule4);
                }

                // 3. last but not least: a rule with only the pos
                // tag constraint:
                WhiskRule proposedRule5 = proposedRule.copy();
                WhiskRuleItem t5 = term;
                t5.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
                t5.setWordConstraint(null);
                proposedRule5.setNeedsCompile(true);
                if (!rulesToTest.contains(proposedRule5)) {
                  rulesToTest.add(proposedRule5);
                }

              }
            }
          }
        }
      }
    }
    if (rulesToTest.size() == 0)
      return bestRule;

    sendStatusUpdateToDelegate(
            "Round "
                    + roundNumber
                    + "."
                    + subRoundNumber
                    + " - Testing "
                    + rulesToTest.size()
                    + " rules... "
                    + " - uncovered examples: "
                    + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                            + " ; cs=" + cachedTestedRuleStatistics.size()),
            TextRulerLearnerState.ML_RUNNING, false);

    TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
    for (TextRulerRule r : rulesToTest)
      TextRulerToolkit.log(r.getRuleString());
    testRulesIfNotCached(rulesToTest);

    if (shouldAbort())
      return null;
    for (TextRulerRule r : rulesToTest) {
      WhiskRule wr = (WhiskRule) r;
      if (wr.getLaplacian() < bestL) {
        bestL = wr.getLaplacian();
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
        TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
        if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
          TextRulerToolkit.log("\tYes, prefered!");
          bestL = wr.getLaplacian();
          bestRule = wr;
          bestRuleConstraintPoints = bestRule.totalConstraintPoints();
        }
      }
    }
    return bestRule;
  }

  private List<List<WhiskRuleItem>> getTermsWithinWindow(List<List<WhiskRuleItem>> slotTerms,
          TextRulerExample example, int steps) {
    if (steps == windowSize)
      return slotTerms;
    List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>();

    for (List<WhiskRuleItem> list : slotTerms) {
      List<WhiskRuleItem> termsBefore = getTermsBefore(list.get(0), example);
      List<WhiskRuleItem> termsAfter = getTermsAfter(list.get(list.size() - 1), example);
      if (!termsBefore.isEmpty()) {
        for (WhiskRuleItem before : termsBefore) {
          for (WhiskRuleItem after : termsAfter) {
            List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
            newList.add(before);
            newList.addAll(list);
            newList.add(after);
            result.add(newList);
          }
        }
      } else {
        for (WhiskRuleItem after : termsAfter) {
          List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
          newList.addAll(list);
          newList.add(after);
          result.add(newList);
        }
      }
    }
    result = getTermsWithinWindow(result, example, ++steps);
    return result;
  }

  protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
    if (term == null)
      return null;
    if (term.isStarWildCard() || term.getWordConstraint() == null)
      return null;
    WhiskRule newRule = baseRule.copy();
    // int foundSlotNumber = -1; // debug info
    // String foundSlotPattern = "";
    int termBeginNumber = term.getWordConstraint().getTokenAnnotation().getBegin();
    int termEndNumber = term.getWordConstraint().getTokenAnnotation().getEnd();
    TextRulerRulePattern targetPattern = null;
    TextRulerRulePattern previousSlotPostFillerPattern = null;
    for (int i = 0; i < newRule.getPatterns().size(); i++) {
      TextRulerSlotPattern slotPattern = newRule.getPatterns().get(i);
      WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem(); // look at the
      // prefiller
      // pattern
      if (it != null && it.getWordConstraint() != null
              && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin())
        targetPattern = slotPattern.preFillerPattern;
      if (targetPattern == null && slotPattern.fillerPattern.size() > 0) // now
      // look
      // at
      // the
      // filler
      // pattern
      {
        it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
        if (it.getWordConstraint() != null
                && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) // it's
          // still
          // for
          // the prefiller
          // pattern but it
          // seems to be
          // emtpy so we
          // could not find
          // that out above!
          targetPattern = slotPattern.preFillerPattern;
        else {
          it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
          if (it.getWordConstraint() != null
                  && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) {
            targetPattern = slotPattern.fillerPattern;
          }
        }
      }
      if (targetPattern == null && slotPattern.postFillerPattern.size() > 0) // now
                                                                             // look
                                                                             // at
      // the
      // postfiller
      // pattern
      {
        it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
        if (it.getWordConstraint() != null
                && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) // it's
          // still
          // for
          // the filler
          // pattern but it
          // seems to be
          // emtpy so we
          // could not find
          // that out above!
          targetPattern = slotPattern.fillerPattern;
        else {
          it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
          if (it.getWordConstraint() != null
                  && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin())
            targetPattern = slotPattern.postFillerPattern;
        }
      }
      if (targetPattern == null) {
        targetPattern = previousSlotPostFillerPattern;
        // debug info
        // if (i > 0) {
        // TextRulerSlotPattern prevSlotPattern = newRule.getPatterns().get(i -
        // 1);
        // foundSlotPattern = targetPattern == prevSlotPattern.preFillerPattern
        // ? "PRE FILLER"
        // : (targetPattern == prevSlotPattern.fillerPattern ? "FILLER" :
        // "POST FILLER");
        // foundSlotNumber = i - 1;
        // }
        // } else {
        // foundSlotPattern = targetPattern == slotPattern.preFillerPattern ?
        // "PRE FILLER"
        // : (targetPattern == slotPattern.fillerPattern ? "FILLER" :
        // "POST FILLER");
        // foundSlotNumber = i;
      }
      previousSlotPostFillerPattern = slotPattern.postFillerPattern;
    }

    if (targetPattern == null) {
      targetPattern = previousSlotPostFillerPattern;
      // debug info
      // foundSlotNumber = newRule.getPatterns().size() - 1;
      // foundSlotPattern = "POST FILLER";
    }

    if (targetPattern == null) {
      TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
    } else {
      // TextRulerToolkit.log("Ok, found for Rule: "+newRule.getRuleString());
      // TextRulerToolkit.log("Term: "+term.getTermNumberInExample()+" ; "+term);
      // TextRulerToolkit.log("Slot "+foundSlotNumber+" - Pattern: "+foundSlotPattern);
      // now put that term into the rule:
      int indexInPattern = -1;
      if (targetPattern.size() == 0) {
        targetPattern.add(term.copy());
        indexInPattern = 0;
      } else {
        // 1. search if the term would replace a wildcard:
        WhiskRuleItem wildCard = null;
        for (TextRulerRuleItem i : newRule.getPatterns().get(0).preFillerPattern) {
          if (((WhiskRuleItem) i).isStarWildCard()) {
            WhiskRuleItem left = newRule.searchNeighborOfItem(((WhiskRuleItem) i), true);
            WhiskRuleItem right = newRule.searchNeighborOfItem(((WhiskRuleItem) i), false);
            if (left.getWordConstraint().getTokenAnnotation().getEnd() <= termBeginNumber
                    && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEndNumber)
              wildCard = (WhiskRuleItem) i;
          }
        }
        if (wildCard == null) {
          for (TextRulerRuleItem i : newRule.getPatterns().get(0).fillerPattern) {
            if (((WhiskRuleItem) i).isStarWildCard()) {
              WhiskRuleItem left = newRule.searchNeighborOfItem(((WhiskRuleItem) i), true);
              WhiskRuleItem right = newRule.searchNeighborOfItem(((WhiskRuleItem) i), false);
              if (left != null
                      && left.getWordConstraint().getTokenAnnotation().getEnd() <= termBeginNumber
                      && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEndNumber)
                wildCard = (WhiskRuleItem) i;
            }
          }
        }
        if (wildCard == null) {
          for (TextRulerRuleItem i : newRule.getPatterns().get(0).postFillerPattern) {
            if (((WhiskRuleItem) i).isStarWildCard()) {
              WhiskRuleItem left = newRule.searchNeighborOfItem(((WhiskRuleItem) i), true);
              WhiskRuleItem right = newRule.searchNeighborOfItem(((WhiskRuleItem) i), false);
              if (left.getWordConstraint().getTokenAnnotation().getEnd() <= termBeginNumber
                      && right.getWordConstraint().getTokenAnnotation().getBegin() >= termEndNumber)
                wildCard = (WhiskRuleItem) i;
            }
          }
        }
        if (wildCard != null) {
          if (!wildCard.isStarWildCard()) {
            TextRulerToolkit
                    .log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???");
            return null;
          }
          if (!targetPattern.contains(wildCard)) {
            TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
            return null;
          }
          indexInPattern = targetPattern.indexOf(wildCard);
          targetPattern.set(indexInPattern, term.copy());
        } else {
          // not a wildcard, so search for the insertion point:
          for (int i = 0; i < targetPattern.size(); i++) {
            WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
            if (it.getWordConstraint() != null
                    && termEndNumber <= it.getWordConstraint().getTokenAnnotation().getBegin()) {
              indexInPattern = i;
              break;
            }
          }
          if (indexInPattern < 0) {
            indexInPattern = targetPattern.size();
            targetPattern.add(term.copy());
          } else
            targetPattern.add(indexInPattern, term.copy());
        }
      }
      // ok, now we have replaced a wildcard with the term or added the
      // term between two other items.
      // we now have to check the neighbors of the new term: if it is a
      // direct neighbor (according to the termNumber),
      // we have nothing special to do. but if it is not a direct
      // neighbor, we have to add a wildcard between the two items (if the
      // neighbor item
      // is not a wildcard itself!
      WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);

      // look at left neighbor:
      WhiskRuleItem left = newRule.searchNeighborOfItem(newTerm, true);
      if (left != null && left.getWordConstraint() != null) {
        // TextRulerToolkit.log("LEFT NEIGHBOR FOUND!");

        // so we have a left neighbor. let's see if it also is the
        // neighbor in our seed token stream:
        if (!left.isStarWildCard()) { // no direct neighbor and
          // no wildcard yet,
          // so insert a wildcard between us!
          boolean isValid = isNextValidNeighbor(left, newTerm, newRule.getSeedExample());
          if (!isValid) {
            targetPattern.add(indexInPattern, WhiskRuleItem.newWildCardItem());
            indexInPattern++;
          }
        }
      }

      // look at right neighbor:
      WhiskRuleItem right = newRule.searchNeighborOfItem(newTerm, false);
      if (right != null && right.getWordConstraint() != null) {
        // TextRulerToolkit.log("RIGHT NEIGHBOR FOUND!");
        // so we have a right neighbor. let's see if it also is the
        // neighbor in our seed token stream:
        if (!right.isStarWildCard()) {
          // no direct neighbor and
          // no wildcard yet,
          // so insert a wildcard between us!
          boolean isValid = isNextValidNeighbor(newTerm, right, newRule.getSeedExample());
          if (!isValid) {
            WhiskRuleItem wc = WhiskRuleItem.newWildCardItem();
            if (indexInPattern + 1 < targetPattern.size())
              targetPattern.add(indexInPattern + 1, wc);
            else
              targetPattern.add(wc);
          }
        }
      }

      newRule.setNeedsCompile(true);
      // TextRulerToolkit.log("BEFORE: "+baseRule.getRuleString());
      // TextRulerToolkit.log("AFTER : "+newRule.getRuleString());
      // TextRulerToolkit.log("");
    }
    if (newRule.getRuleString().equals(baseRule.getRuleString())) // this
      // must
      // not be!
      return null;
    else
      return newRule;
  }

  protected WhiskRule anchor(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, int slotIndex) {
    List<WhiskRule> result = new ArrayList<WhiskRule>();
    TextRulerAnnotation slotAnnotation = example.getAnnotations()[slotIndex];
    List<List<WhiskRuleItem>> window = getTermsWithinBounds(slotAnnotation.getBegin(),
            slotAnnotation.getEnd(), example);

    for (List<WhiskRuleItem> inside : window) {

      if (rule == null || inside.isEmpty()) {
        return null;
      }
      // create base 1 and base 2:
      WhiskRule base1 = rule.copy(); // slot filler rule
      TextRulerSlotPattern slotPattern = base1.getPatterns().get(slotIndex);
      // questionable restriction:
      if (inside.size() <= windowSize) { // TODO add parameter for this!
        slotPattern.fillerPattern.addAll(inside);
      } else {
        for (int i = 0; i < inside.size(); i++)
          if (i == 0 || (i == inside.size() - 1))
            slotPattern.fillerPattern.add(inside.get(i).copy());
          else if (inside.size() > 2 && i < 2)
            slotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
      }
      List<WhiskRuleItem> beforeList = getTermsBefore(inside.get(0), example);
      List<WhiskRuleItem> afterList = getTermsAfter(inside.get(inside.size() - 1), example);
      beforeList.add(null);
      afterList.add(null);
      Collection<WhiskRule> tempRules = new HashSet<WhiskRule>();

      // workaround for better rules:
      // only inner begin
      for (WhiskRuleItem eachBefore : beforeList) {
        for (WhiskRuleItem eachAfter : afterList) {
          WhiskRule copy = rule.copy();
          TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
          if (eachBefore != null) {
            textRulerSlotPattern.preFillerPattern.add(eachBefore);
          }
          textRulerSlotPattern.fillerPattern.add(inside.get(0).copy());
          textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
          if (eachAfter != null) {
            textRulerSlotPattern.postFillerPattern.add(eachAfter);
          }
          tempRules.add(copy);
        }
      }
      // only inner end
      for (WhiskRuleItem eachBefore : beforeList) {
        for (WhiskRuleItem eachAfter : afterList) {
          WhiskRule copy = rule.copy();
          TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
          if (eachBefore != null) {
            textRulerSlotPattern.preFillerPattern.add(eachBefore);
          }
          textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
          textRulerSlotPattern.fillerPattern.add(inside.get(inside.size() - 1).copy());
          if (eachAfter != null) {
            textRulerSlotPattern.postFillerPattern.add(eachAfter);
          }
          tempRules.add(copy);
        }
      }

      if (!beforeList.isEmpty()) {
        if (!afterList.isEmpty()) {
          for (WhiskRuleItem eachBefore : beforeList) {
            for (WhiskRuleItem eachAfter : afterList) {
              WhiskRule copy = rule.copy();
              TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
              if (eachBefore != null) {
                textRulerSlotPattern.preFillerPattern.add(eachBefore);
              }
              textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
              if (eachAfter != null) {
                textRulerSlotPattern.postFillerPattern.add(eachAfter);
              }
              tempRules.add(copy);
            }
          }
        } else {
          for (WhiskRuleItem eachBefore : beforeList) {
            WhiskRule copy = rule.copy();
            TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
            textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
            if (eachBefore != null) {
              textRulerSlotPattern.preFillerPattern.add(eachBefore);
            }
            tempRules.add(copy);
          }
        }
      } else {
        for (WhiskRuleItem eachAfter : afterList) {
          WhiskRule copy = rule.copy();
          TextRulerSlotPattern textRulerSlotPattern = copy.getPatterns().get(slotIndex);
          textRulerSlotPattern.fillerPattern.add(WhiskRuleItem.newWildCardItem());
          if (eachAfter != null) {
            textRulerSlotPattern.postFillerPattern.add(eachAfter);
          }
          tempRules.add(copy);
        }
      }
      ArrayList<TextRulerRule> rules = new ArrayList<TextRulerRule>(tempRules);
      testRulesIfNotCached(rules);
      TextRulerRule best = null;
      for (TextRulerRule each : rules) {
        if (best == null) {
          best = each;
        } else {
          if (each.getCoveringStatistics().getCoveredPositivesCount() > best
                  .getCoveringStatistics().getCoveredPositivesCount()) {
            best = each;
          }
        }
      }
      WhiskRule base2 = (WhiskRule) best;
      List<TextRulerRule> testRules = new ArrayList<TextRulerRule>();
      if (base1 != null) {
        TextRulerToolkit.log("base1: " + base1.getRuleString());
        testRules.add(base1);
      }
      if (base2 != null) {
        TextRulerToolkit.log("base2: " + base2.getRuleString());
        testRules.add(base2);
      }
      testRulesIfNotCached(testRules);
      if (shouldAbort()) {
        return null;
      }
      if (base1 != null && base2 == null) {
        TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
                + base1.getLaplacian());
        result.add(base1);
      } else {
        TextRulerToolkit.log("\tbase1: " + base1.getCoveringStatistics() + " --> laplacian = "
                + base1.getLaplacian());
        TextRulerToolkit.log("\tbase2: " + base2.getCoveringStatistics() + " --> laplacian = "
                + base2.getLaplacian());
        if (base2.getCoveringStatistics().getCoveredPositivesCount() > base1
                .getCoveringStatistics().getCoveredPositivesCount()) {
          result.add(base2);
        } else {
          result.add(base1);
        }
      }
    }
    TextRulerRule best = null;
    for (TextRulerRule each : result) {
      if (best == null) {
        best = each;
      } else {
        if (each.getCoveringStatistics().getCoveredPositivesCount() > best.getCoveringStatistics()
                .getCoveredPositivesCount()) {
          best = each;
        }
      }
    }

    return (WhiskRule) best;
  }

  private List<WhiskRuleItem> getTermsAfter(WhiskRuleItem whiskRuleItem, TextRulerExample example) {
    List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
    int end = whiskRuleItem.getWordConstraint().getTokenAnnotation().getEnd();
    CAS cas = example.getDocumentCAS();
    // TODO: access type with string constant
    Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
    AnnotationFS pointer = cas.createAnnotation(frameType, end, Integer.MAX_VALUE);
    FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);
    int nextBegin = -1;
    while (iterator.isValid()) {
      FeatureStructure fs = iterator.get();
      if (fs instanceof AnnotationFS) {
        AnnotationFS a = (AnnotationFS) fs;
        if (!filterSetWithSlotNames.contains(a.getType().getName())) {
          if (nextBegin == -1) {
            nextBegin = a.getBegin();
          } else if (nextBegin != a.getBegin()) {
            break;
          }
          if (a.getBegin() <= nextBegin && a.getBegin() >= end) {
            WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(a,
                    example.getDocument(), consideredFeatures));
            result.add(term);
          }
        }
      }
      iterator.moveToNext();
    }
    return result;
  }

  /**
   * Collects the candidate terms that directly precede the token wrapped by the given rule item.
   * The annotation index is entered at the token's begin offset and walked backwards; all
   * unfiltered annotations sharing the first end offset found are returned. Annotations ending
   * after the example's own annotation are skipped.
   *
   * @param whiskRuleItem the item whose preceding terms are requested
   * @param example the example providing the CAS to search in
   * @return the (possibly empty) list of terms ending at the previous annotation position
   */
  private List<WhiskRuleItem> getTermsBefore(WhiskRuleItem whiskRuleItem, TextRulerExample example) {
    List<WhiskRuleItem> result = new ArrayList<WhiskRuleItem>();
    int begin = whiskRuleItem.getWordConstraint().getTokenAnnotation().getBegin();
    CAS cas = example.getDocumentCAS();

    // TODO: access type with string constant
    Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
    // Zero-length position marker at 'begin' to place the iterator near the token start.
    AnnotationFS pointer = cas.createAnnotation(frameType, begin, begin);
    FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);
    int nextEnd = -1;

    // NOTE(review): the original marked these two steps with "???" — presumably they back the
    // iterator up past the pointer position onto annotations strictly before 'begin'; the exact
    // offset of two steps looks empirical — TODO confirm against the annotation index order.
    iterator.moveToPrevious();
    iterator.moveToPrevious();
    while (iterator.isValid()) {
      FeatureStructure fs = iterator.get();
      if (fs instanceof AnnotationFS) {
        AnnotationFS a = (AnnotationFS) fs;
        if (!filterSetWithSlotNames.contains(a.getType().getName())) {
          // Skip annotations that extend beyond the example annotation itself.
          if (a.getEnd() > example.getAnnotation().getEnd()) {
            iterator.moveToPrevious();
            continue;
          }
          if (nextEnd == -1) {
            // First acceptable annotation fixes the end offset we collect at ...
            nextEnd = a.getEnd();
          } else if (nextEnd != a.getEnd()) {
            // ... and any different end offset terminates the scan.
            break;
          }
          if (a.getEnd() >= nextEnd && a.getEnd() <= begin) {
            WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(a,
                    example.getDocument(), consideredFeatures));
            result.add(term);
          }
        }
      }
      iterator.moveToPrevious();
    }
    return result;
  }

  public String getResultString() {
    if (ruleList != null)
      return getFileHeaderString(true) + ruleList.getRulesString("");
    else
      return "No results available yet!";
  }

  public void setParameters(Map<String, Object> params) {
    if (TextRulerToolkit.DEBUG)
      saveParametersToTempFolder(params);

    // TODO try catch
    if (params.containsKey(WINDOWSIZE_KEY))
      windowSize = (Integer) params.get(WINDOWSIZE_KEY);

    if (params.containsKey(ERROR_THRESHOLD_KEY))
      errorThreshold = (Float) params.get(ERROR_THRESHOLD_KEY);

    if (params.containsKey(POSTAG_ROOTTYPE_KEY))
      posTagRootTypeName = (String) params.get(POSTAG_ROOTTYPE_KEY);

    if (params.containsKey(CONSIDERED_FEATURES)) {
      String list = (String) params.get(CONSIDERED_FEATURES);
      if (!StringUtils.isBlank(list)) {
        String[] split = list.split(",");
        for (String string : split) {
          String trim = string.trim();
          if (!StringUtils.isBlank(trim)) {
            consideredFeatures.add(trim);
          }
        }
      }
    }

  }

  public List<List<WhiskRuleItem>> getTermsWithinBounds(int startPos, int endPos,
          TextRulerExample example) {
    List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>();
    CAS cas = example.getDocumentCAS();
    // TODO: access type with string constant
    Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
    AnnotationFS pointer = cas.createAnnotation(frameType, startPos, endPos);
    FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);
    List<AnnotationFS> startAs = new ArrayList<AnnotationFS>();
    int firstBegin = -1;
    while (iterator.isValid()) {
      FeatureStructure fs = iterator.get();
      AnnotationFS a = (AnnotationFS) fs;

      // TODO change for multislot rules!
      if (a.getBegin() >= startPos && a.getEnd() <= endPos) {
        if (!filterSetWithSlotNames.contains(a.getType().getName())) {
          if (firstBegin == -1) {
            firstBegin = a.getBegin();
          } else if (firstBegin != a.getBegin()) {
            break;
          }
          if (a.getBegin() == firstBegin)
            startAs.add(a);
        }
        iterator.moveToNext();
      } else {
        iterator.moveToNext();
      }
    }

    for (AnnotationFS annotation : startAs) {
      List<WhiskRuleItem> startList = new ArrayList<WhiskRuleItem>();
      WhiskRuleItem term = new WhiskRuleItem(new TextRulerAnnotation(annotation,
              example.getDocument(), consideredFeatures));
      startList.add(term);
      result.add(startList);
    }

    result = addFollowing(result, endPos, example);
    return result;
  }

  /**
   * Recursively extends each candidate term sequence by the terms following its last item, as
   * long as those terms end at or before {@code till}.
   *
   * NOTE(review): if ANY sequence cannot be extended (no following terms, or the next term ends
   * past {@code till}), the method returns the incoming {@code lists} unchanged, discarding
   * extensions already built for earlier sequences in this call — presumably intentional
   * ("all sequences grow in lockstep"), but worth confirming. The recursive call inside the
   * inner loop also re-extends sequences added earlier in the same pass, which looks potentially
   * exponential for long sequences — TODO confirm.
   *
   * @param lists the candidate term sequences built so far (each non-empty)
   * @param till maximal end offset (inclusive) up to which sequences may grow
   * @param example the example providing the CAS to search in
   * @return the extended sequences, or {@code lists} itself if extension stopped
   */
  private List<List<WhiskRuleItem>> addFollowing(List<List<WhiskRuleItem>> lists, int till,
          TextRulerExample example) {
    List<List<WhiskRuleItem>> result = new ArrayList<List<WhiskRuleItem>>();
    for (List<WhiskRuleItem> list : lists) {
      WhiskRuleItem last = list.get(list.size() - 1);
      List<WhiskRuleItem> termsAfter = getTermsAfter(last, example);
      if (termsAfter.isEmpty()) {
        // Nothing follows the last item: stop and keep the sequences as they are.
        return lists;
      }
      for (WhiskRuleItem eachAfter : termsAfter) {
        if (eachAfter.getWordConstraint().getTokenAnnotation().getEnd() <= till) {
          // Branch: copy the sequence, append the follower, and keep extending recursively.
          List<WhiskRuleItem> newList = new ArrayList<WhiskRuleItem>();
          newList.addAll(list);
          newList.add(eachAfter);
          result.add(newList);
          result = addFollowing(result, till, example);
        } else {
          // Follower exceeds the bound: stop and keep the sequences as they are.
          return lists;
        }
      }
    }
    return result;
  }

  // TODO share this between algorithms (e.g. LP2 and RAPIER, WHISK ?) and
  // make a maximum size of the cache, etc. like CasCache?
  protected void testRulesIfNotCached(List<TextRulerRule> rules) {
    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

    for (TextRulerRule r : rules) {
      String key = r.getRuleString();
      if (cachedTestedRuleStatistics.containsKey(key)) {
        r.setCoveringStatistics(cachedTestedRuleStatistics.get(key).copy());
      } else
        rulesToTest.add(r);
    }

    if (rulesToTest.size() > 0) {
      testRulesOnDocumentSet(rulesToTest, exampleDocuments);
      if (shouldAbort())
        return;
      for (TextRulerRule r : rulesToTest) {
        String key = r.getRuleString();
        cachedTestedRuleStatistics.put(key, r.getCoveringStatistics().copy());
      }
    }
  }

  private int getElementIndex(WhiskRule proposedRule, WhiskRuleItem term) {
    if (term == null)
      return -1;
    int index = 0;
    int result = -1;
    for (TextRulerRuleItem i : proposedRule.getPatterns().get(0).preFillerPattern) {
      if (((WhiskRuleItem) i).equals(term)) {
        result = index;
      }
      index++;
    }
    for (TextRulerRuleItem i : proposedRule.getPatterns().get(0).fillerPattern) {
      if (((WhiskRuleItem) i).equals(term)) {
        result = index;
      }
      index++;
    }
    for (TextRulerRuleItem i : proposedRule.getPatterns().get(0).postFillerPattern) {
      if (((WhiskRuleItem) i).equals(term)) {
        result = index;
      }
      index++;
    }
    return result;
  }

  private boolean isNextValidNeighbor(WhiskRuleItem left, WhiskRuleItem right,
          TextRulerExample example) {
    CAS cas = example.getDocumentCAS();
    // TODO: access type with string constant
    Type frameType = cas.getTypeSystem().getType("org.apache.uima.ruta.type.RutaFrame");
    int begin = left.getWordConstraint().getTokenAnnotation().getEnd();
    int end = right.getWordConstraint().getTokenAnnotation().getBegin();
    AnnotationFS pointer = cas.createAnnotation(frameType, begin, end);
    FSIterator iterator = cas.getAnnotationIndex().iterator(pointer);
    while (iterator.isValid()) {
      FeatureStructure fs = iterator.get();
      AnnotationFS a = (AnnotationFS) fs;
      if (a.getBegin() >= begin && a.getEnd() <= end) {
        if (!filterSetWithSlotNames.contains(a.getType().getName())) {
          return false;
        }
      }
      iterator.moveToNext();
    }
    return true;
  }
}
/*
 * Trailer left behind by the code-listing website this file was scraped from; commented out so
 * the file remains valid Java. Original text preserved below:
 *
 * TOP
 *
 * Related Classes of org.apache.uima.ruta.textruler.learner.whisk.generic.Whisk
 *
 * TOP
 * Copyright © 2018 www.massapi.com. All rights reserved.
 * All source code are property of their respective owners. Java is a trademark of Sun
 * Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
 */