Package org.apache.uima.ruta.textruler.learner.lp2

Source Code of org.apache.uima.ruta.textruler.learner.lp2.BasicLP2

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.ruta.textruler.learner.lp2;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.lang3.StringUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.GlobalCASSource;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRule;
import org.apache.uima.ruta.textruler.core.TextRulerRuleList;
import org.apache.uima.ruta.textruler.core.TextRulerShiftExample;
import org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;
import org.apache.uima.util.FileUtils;

public abstract class BasicLP2 extends TextRulerBasicLearner {

  /** Key read by {@code setParameters} to override {@link #windowSize}. */
  public static final String WINDOW_SIZE_KEY = "windowSize";

  /** Key read by {@code setParameters} to override {@link #maxCurrentBestRulesCount}. */
  public static final String CURRENT_BEST_RULES_SIZE_KEY = "currentBestRulesSize";

  /** Key read by {@code setParameters} to override {@link #maxCurrentContextualRulesCount}. */
  public static final String CURRENT_CONTEXTUAL_RULES_SIZE_KEY = "currentContextualRulesSize";

  /** Key read by {@code setParameters} to override {@link #minCoveredPositives}. */
  public static final String MIN_COVERED_POSITIVES_PER_RULE_KEY = "minCoveredPositivesPerRule";

  /** Key read by {@code setParameters} to override {@link #maxErrorThreshold}. */
  public static final String MAX_ERROR_THRESHOLD_KEY = "maxErrorThreshold";

  /** Default value of {@link #windowSize}. */
  public static final int STANDARD_WINDOW_SIZE = 2;

  /** Default value of {@link #maxCurrentBestRulesCount}. */
  public static final int STANDARD_MAX_CURRENT_BEST_RULES_COUNT = 4;

  /** Default value of {@link #maxCurrentContextualRulesCount}. */
  public static final int STANDARD_MAX_CONTEXTUAL_RULES_COUNT = 4;

  /** Default value of {@link #minCoveredPositives}. */
  public static final int STANDARD_MIN_COVERED_POSITIVES_PER_RULE = 1;

  /** Default value of {@link #maxErrorThreshold}. */
  public static final float STANDARD_MAX_ERROR_THRESHOLD = 0.1f;

  // NOTE(review): not referenced inside this class; presumably the annotation name used for
  // shift/correction examples elsewhere - confirm against the users of this constant.
  public static final String CORRECTION_ANNOTATION_NAME = "lp2shift";

  /** Default value of {@link #shiftSize}. */
  private static final int STANDARD_SHIFT_SIZE = 2;

  /** Capacity handed to the {@link #currentBestRules} queue created for every seed example. */
  protected int maxCurrentBestRulesCount = STANDARD_MAX_CURRENT_BEST_RULES_COUNT;

  /** Capacity handed to the {@link #currentContextualRules} queue created for every seed example. */
  protected int maxCurrentContextualRulesCount = STANDARD_MAX_CONTEXTUAL_RULES_COUNT;

  // Only written by setParameters in this class; presumably the context window (in tokens)
  // used by the concrete rule induction in subclasses - TODO confirm.
  protected int windowSize = STANDARD_WINDOW_SIZE;

  /** Maximum shift distance set on the left-correction target in {@code runForSlotName}. */
  protected int shiftSize = STANDARD_SHIFT_SIZE;

  // Only written by setParameters in this class; presumably a lower bound on the positives a
  // rule must cover to be kept by subclasses - TODO confirm.
  protected int minCoveredPositives = STANDARD_MIN_COVERED_POSITIVES_PER_RULE;

  // Only written by setParameters in this class; presumably the highest tolerated error rate
  // of an induced rule in subclasses - TODO confirm.
  protected float maxErrorThreshold = STANDARD_MAX_ERROR_THRESHOLD;

  /** Positive examples of the target that is currently being learned. */
  protected List<TextRulerExample> examples;

  /** Positive examples already covered by a rule in the final best rules pool. */
  protected Set<TextRulerExample> coveredExamples;

  /** Per slot name: size of the token count histogram minus one (see {@code runForSlotName}). */
  protected Map<String, Integer> slotMaximumTokenCountMap = new TreeMap<String, Integer>();

  /** Best rules collected for the seed example that is currently processed. */
  protected LP2CurrentBestRulesQueue currentBestRules;

  /** Contextual rules collected for the seed example that is currently processed. */
  protected LP2CurrentBestRulesQueue currentContextualRules;

  /** Per slot name: pool of best rules of the most recent {@code learnTaggingRules} call. */
  protected Map<String, TextRulerRuleList> bestRulesPoolMap = new TreeMap<String, TextRulerRuleList>();

  /** Per slot name: pool of contextual rules of the most recent {@code learnTaggingRules} call. */
  protected Map<String, TextRulerRuleList> contextRulesPoolMap = new TreeMap<String, TextRulerRuleList>();

  /** Per slot name: rule script text of the finished left boundary best rules. */
  protected Map<String, String> leftBoundaryBestRulesMap = new TreeMap<String, String>();

  /** Per slot name: rule script text of the finished right boundary best rules. */
  protected Map<String, String> rightBoundaryBestRulesMap = new TreeMap<String, String>();

  /** Per slot name: rule script text of the finished left boundary contextual rules. */
  protected Map<String, String> leftBoundaryContextualRulesMap = new TreeMap<String, String>();

  /** Per slot name: rule script text of the finished right boundary contextual rules. */
  protected Map<String, String> rightBoundaryContextualRulesMap = new TreeMap<String, String>();

  public BasicLP2(String inputDir, String prePropTMFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTMFile, tmpDir, slotNames, filterSet, skip, delegate);
    supportBoundaries = true;
  }

  protected TextRulerRuleList learnTaggingRules(TextRulerTarget target,
          TextRulerRuleList contextualRules) {
    if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY)
      sendStatusUpdateToDelegate("Creating Left-Boundary Examples...",
              TextRulerLearnerState.ML_RUNNING, false);
    else if (target.type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
      sendStatusUpdateToDelegate("Creating Right-Boundary Examples...",
              TextRulerLearnerState.ML_RUNNING, false);
    else if (target.type == MLTargetType.SINGLE_LEFT_CORRECTION)
      sendStatusUpdateToDelegate("Creating Left Correction Examples...",
              TextRulerLearnerState.ML_RUNNING, false);
    else
      // if (target.type == MLTargetType.SINGLE_RIGHT_CORRECTION)
      sendStatusUpdateToDelegate("Creating Right Correction Examples...",
              TextRulerLearnerState.ML_RUNNING, false);
    exampleDocuments.clearCurrentExamples();
    exampleDocuments.createExamplesForTarget(target);
    examples = exampleDocuments.getAllPositiveExamples();

    if (shouldAbort())
      return null;
    TextRulerRuleList bestRulesPool = new TextRulerRuleList();
    TextRulerRuleList contextRulesPool = new TextRulerRuleList();
    String slotName = target.getSingleSlotRawTypeName();
    bestRulesPoolMap.put(slotName, bestRulesPool);
    contextRulesPoolMap.put(slotName, contextRulesPool);

    coveredExamples = new HashSet<TextRulerExample>();
    int roundNumber = 0;
    for (TextRulerExample e : examples)
      if (!coveredExamples.contains(e)) {
        if (shouldAbort())
          break;
        roundNumber++;
        currentBestRules = new LP2CurrentBestRulesQueue(maxCurrentBestRulesCount);
        currentContextualRules = new LP2CurrentBestRulesQueue(maxCurrentContextualRulesCount);
        // TextRulerToolkit.log("Example: "+e.getAnnotation().getBegin()+" : "+e.getAnnotation().getEnd());

        induceRulesFromExample(e, roundNumber);

        // TextRulerToolkit.log("Best Rules from this Seed: "+currentBestRules.size());
        // if (TextRulerToolkit.DEBUG && currentBestRules.size()>1)
        // {
        // for (TextRulerRule r : currentBestRules)
        // {
        // TextRulerToolkit.log("\tp="+r.getCoveringStatistics().getCoveredPositivesCount()+"; n="+r.getCoveringStatistics().getCoveredNegativesCount()+";  "+r.getRuleString());
        // for (TextRulerExample ex :
        // r.getCoveringStatistics().getCoveredPositiveExamples())
        // {
        // TextRulerToolkit.log("\t\te="+ex.getAnnotation().getBegin());
        //
        // }
        // }
        // }
        for (LP2Rule bestRule : currentBestRules) {
          addToFinalBestRulesPool(bestRule);
        }
        for (LP2Rule ctxRule : currentContextualRules) {
          addToFinalContextRulesPool(ctxRule);
        }
        sendStatusUpdateToDelegate("New Rules added.", TextRulerLearnerState.ML_RUNNING, true);
      }
    if (TextRulerToolkit.DEBUG) {
      bestRulesPool.saveToRulesFile(getIntermediateRulesFileName(), getFileHeaderString(true));
      // for (TextRulerRule r : bestRulesPool)
      // {
      // TextRulerToolkit.log("p="+r.getCoveringStatistics().getCoveredPositivesCount()+"; n="+r.getCoveringStatistics().getCoveredNegativesCount()+";  "+r.getRuleString());
      // }
    }

    TextRulerRuleList result = bestRulesPool;
    if (contextualRules != null)
      for (TextRulerRule r : contextRulesPool)
        contextualRules.add(r);
    return result;
  }

  @Override
  public CAS loadCAS(String fileName, CAS reuseCAS) {
    CAS cas = super.loadCAS(fileName, reuseCAS);
    prepareCASWithBoundaries(cas);
    return cas;
  }

  public void prepareCASWithBoundaries(CAS cas) {
    for (String slotName : slotNames)
      TextRulerExampleDocument.createBoundaryAnnotationsForCas(cas, slotName, filterSet);
  }

  public void prepareCachedCASesWithBoundaries() {
    for (CAS cas : exampleDocuments.getCachedCASes())
      prepareCASWithBoundaries(cas);
  }

  @Override
  protected void cleanUp() {
    super.cleanUp();
    examples = null;
    coveredExamples = null;
    currentBestRules = null;
    currentContextualRules = null;
    bestRulesPoolMap.clear();
    contextRulesPoolMap.clear();
  }

  @Override
  protected void doRun() {
    TextRulerToolkit.logIfDebug("--- LP2 START");

    prepareCachedCASesWithBoundaries(); // if some cases are already loaded,
    // prepare them! all others get prepared when loaded (see loadCAS)

    for (int i = 0; i < slotNames.length; i++) {
      runForSlotName(slotNames[i]);
    }

    sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
    TextRulerToolkit.logIfDebug("--- LP2 END");
  }

  protected void runForSlotName(String slotName) {
    // 1. get slot length histogram in order to find maximum slot length
    // (counted in tokens)

    sendStatusUpdateToDelegate("Creating slot length histogram...",
            TextRulerLearnerState.ML_RUNNING, false);
    List<Integer> histogram = exampleDocuments.getTokenCountHistogrammForSlotName(slotName,
            TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet));
    if (shouldAbort())
      return;
    slotMaximumTokenCountMap.put(slotName, histogram.size() - 1); // -1 since the
    // zero-histogram point
    // also needs a place!

    TextRulerRuleList ctxRules = new TextRulerRuleList();
    TextRulerRuleList bestRules = learnTaggingRules(new TextRulerTarget(slotName,
            MLTargetType.SINGLE_LEFT_BOUNDARY, this), ctxRules); // learn
    // left
    // boundary
    // best
    // rules
    if (bestRules != null) {
      leftBoundaryBestRulesMap.put(slotName, bestRules.getRulesString(""));
      leftBoundaryContextualRulesMap.put(slotName, ctxRules.getRulesString("\t"));
      bestRules.clear(); // free som memory/references
    }
    if (shouldAbort())
      return;
    ctxRules.clear();
    bestRules = learnTaggingRules(new TextRulerTarget(slotName, MLTargetType.SINGLE_RIGHT_BOUNDARY,
            this), ctxRules); // learn
    // right
    // boundary best
    // rules
    if (bestRules != null) {
      rightBoundaryBestRulesMap.put(slotName, bestRules.getRulesString(""));
      rightBoundaryContextualRulesMap.put(slotName, ctxRules.getRulesString("\t"));
    }

    // TODO add correction rule learn stuff
    // testTaggingRulesAndCreateCorrectionRulesExamples(null, STANDARD_MAX_CONTEXTUAL_RULES_COUNT)

    // correct left start
    TextRulerTarget lsTarget = new TextRulerTarget(slotName, MLTargetType.SINGLE_LEFT_CORRECTION,
            this);
    lsTarget.setMaxShiftDistance(shiftSize);
    TextRulerRuleList correctLeftRules = learnTaggingRules(lsTarget, null);

    // resultString = "CAP{REGEXP(\"PM\")} ALL{->MARKONCE(stimeEND)};";
    // try {
    // FileUtils.saveString2File(resultString, file);
    // } catch (IOException e) {
    // }

    // correct right start
    // TextRulerTarget rsTarget = new TextRulerTarget(slotName,
    // MLTargetType.SINGLE_RIGHT_CORRECTION,
    // this);
    // rsTarget.setMaxShiftDistance(shiftSize);
    // TextRulerRuleList correctRightRules = learnTaggingRules(rsTarget, null);
    //
    sendStatusUpdateToDelegate("SLOT Done", TextRulerLearnerState.ML_RUNNING, true);
    TextRulerToolkit.logIfDebug("--- LP2 END FOR SLOT:" + slotName);
  }

  /**
   * Induces rules from one seed example. {@code learnTaggingRules} calls this once for every
   * positive example that is not yet covered, right after it has created fresh
   * {@code currentBestRules} and {@code currentContextualRules} queues; whatever these queues
   * contain afterwards is moved into the final rule pools.
   *
   * @param e
   *          the seed example to generalize from
   * @param roundNumber
   *          1-based count of the seeds processed so far for the current target
   */
  protected abstract void induceRulesFromExample(TextRulerExample e, int roundNumber);

  protected void addToFinalContextRulesPool(LP2Rule rule) {
    if (TextRulerToolkit.DEBUG)
      TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxpool"
              + RutaEngine.SCRIPT_FILE_EXTENSION, rule.getRuleString() + "\n");
    String slotName = rule.getTarget().getSingleSlotRawTypeName();
    if (!contextRulesPoolMap.get(slotName).contains(rule)) {
      contextRulesPoolMap.get(slotName).add(rule);
      // TextRulerToolkit.log("CONTEXT RULE: "+rule.getRuleString()+" ; "+rule.getCoveringStatistics());
    } else {
      if (TextRulerToolkit.DEBUG) {
        TextRulerToolkit.appendStringToFile(tempDirectory() + "ctxpool"
                + RutaEngine.SCRIPT_FILE_EXTENSION, "\tDUPLICATE\n");
      }
    }

  }

  protected void addToFinalBestRulesPool(LP2Rule rule) {
    if (TextRulerToolkit.DEBUG && false)
      TextRulerToolkit.appendStringToFile(tempDirectory() + "bestpool"
              + RutaEngine.SCRIPT_FILE_EXTENSION, rule.getRuleString() + "\n");
    String slotName = rule.getTarget().getSingleSlotRawTypeName();
    if (!bestRulesPoolMap.get(slotName).contains(rule)) {
      bestRulesPoolMap.get(slotName).add(rule);
      // TextRulerToolkit.log("BEST RULE: "+rule.getRuleString());
      // add all covered positives to covering set
      coveredExamples.addAll(rule.getCoveringStatistics().getCoveredPositiveExamples());
      if (TextRulerToolkit.DEBUG)
        bestRulesPoolMap.get(slotName).saveToRulesFile(getIntermediateRulesFileName(),
                getFileHeaderString(false));
    } else {
      if (TextRulerToolkit.DEBUG && false) {
        TextRulerToolkit.log("KANN SOWAS PASSIEREN ??");
        TextRulerToolkit.appendStringToFile(tempDirectory() + "bestpool"
                + RutaEngine.SCRIPT_FILE_EXTENSION, "\tDUPLICATE\n");
      }
    }

  }

  public String getResultString() {
    StringBuilder sb = new StringBuilder();
    String header = getFileHeaderString(true);
    sb.append(header);

    for (String eachSlot : slotNames) {

      String leftBoundaryBestRulesString = leftBoundaryBestRulesMap.get(eachSlot);
      String rightBoundaryBestRulesString = rightBoundaryBestRulesMap.get(eachSlot);
      String leftBoundaryContextualRulesString = leftBoundaryContextualRulesMap.get(eachSlot);
      String rightBoundaryContextualRulesString = rightBoundaryContextualRulesMap.get(eachSlot);
      TextRulerRuleList bestRulesPool = bestRulesPoolMap.get(eachSlot);
      TextRulerRuleList contextRulesPool = contextRulesPoolMap.get(eachSlot);

      sb.append("\n// Slot: " + TextRulerToolkit.getTypeShortName(eachSlot) + "\n");
      sb.append("// LEFT BOUNDARY RULES:\n");
      if (leftBoundaryBestRulesString != null) {
        sb.append(leftBoundaryBestRulesString);
        sb.append("\n// RIGHT BOUNDARY RULES:\n");
        if (rightBoundaryBestRulesString != null)
          sb.append(rightBoundaryBestRulesString);
        else if (bestRulesPool != null)
          sb.append(bestRulesPool.getRulesString(""));

        sb.append("\nBLOCK(contextualRules_" + TextRulerToolkit.getTypeShortName(eachSlot)
                + ") Document{} {\n"
                + "\tDocument{->ASSIGN(redoContextualRules, false)}; // reset flag\n");
        sb.append("\n\t// LEFT BOUNDARY CONTEXTUAL RULES:\n");
        sb.append(leftBoundaryContextualRulesString);

        sb.append("\n\t// RIGHT BOUNDARY CONTEXTUAL RULES:\n");
        if (rightBoundaryBestRulesString != null)
          sb.append(rightBoundaryContextualRulesString);
        else if (contextRulesPool != null)
          sb.append(contextRulesPool.getRulesString("\t"));

        sb.append("\n\t//Document{IF(redoContextualRules)->CALL(thisFile.contextualRules_"
                + TextRulerToolkit.getTypeShortName(eachSlot) + ")};\n}\n");
      } else if (bestRulesPool != null) {
        sb.append(bestRulesPool.getRulesString(""));
        sb.append("\n\t// LEFT BOUNDARY CONTEXTUAL RULES:\n");
        if (contextRulesPool != null)
          sb.append(contextRulesPool.getRulesString(""));
      }
    }

    for (String eachSlot : slotNames) {
      String leftBoundary = TextRulerToolkit.getTypeShortName((new TextRulerTarget(eachSlot,
              MLTargetType.SINGLE_LEFT_BOUNDARY, this)).getSingleSlotTypeName());
      String rightBoundary = TextRulerToolkit.getTypeShortName((new TextRulerTarget(eachSlot,
              MLTargetType.SINGLE_RIGHT_BOUNDARY, this)).getSingleSlotTypeName());
      String slotMarkName = TextRulerToolkit.getTypeShortName(eachSlot);
      int maxInnerLength = (getMaxTokens(eachSlot) * 3) - 2;
      sb.append("\n//slot-building rules:\n");
      sb.append(leftBoundary + "{IS(" + rightBoundary + ")->UNMARK(" + leftBoundary + "), UNMARK("
              + rightBoundary + "), MARKONCE(" + slotMarkName + ")};\n");
      sb.append(leftBoundary + "{->UNMARK(" + leftBoundary + ")} ");
      if (maxInnerLength > 0) {
        sb.append("ANY[0, " + maxInnerLength + "]? ");
        sb.append(rightBoundary + "{->UNMARK(" + rightBoundary + "), MARKONCE(" + slotMarkName
                + ", 1, 3)};\n");
      } else
        sb.append(rightBoundary + "{->UNMARK(" + rightBoundary + "), MARKONCE(" + slotMarkName
                + ", 1, 2)};\n");

      sb.append("\n//cleaning up:\n" + leftBoundary + "{->UNMARK(" + leftBoundary + ")};\n"
              + rightBoundary + "{->UNMARK(" + rightBoundary + ")};\n");
    }

    return sb.toString();
  }

  private Integer getMaxTokens(String slot) {
    if (slotMaximumTokenCountMap.get(slot) == null) {
      return 0;
    }
    return slotMaximumTokenCountMap.get(slot);
  }

  public void setParameters(Map<String, Object> params) {
    if (TextRulerToolkit.DEBUG)
      saveParametersToTempFolder(params);

    // TODO try catch
    if (params.containsKey(WINDOW_SIZE_KEY))
      windowSize = (Integer) params.get(WINDOW_SIZE_KEY);

    if (params.containsKey(CURRENT_BEST_RULES_SIZE_KEY))
      maxCurrentBestRulesCount = (Integer) params.get(CURRENT_BEST_RULES_SIZE_KEY);

    if (params.containsKey(CURRENT_CONTEXTUAL_RULES_SIZE_KEY))
      maxCurrentContextualRulesCount = (Integer) params.get(CURRENT_CONTEXTUAL_RULES_SIZE_KEY);

    if (params.containsKey(MIN_COVERED_POSITIVES_PER_RULE_KEY))
      minCoveredPositives = (Integer) params.get(MIN_COVERED_POSITIVES_PER_RULE_KEY);

    if (params.containsKey(MAX_ERROR_THRESHOLD_KEY))
      maxErrorThreshold = (Float) params.get(MAX_ERROR_THRESHOLD_KEY);
  }

  protected String correctionRulesInputDirectory(TextRulerTarget target) {
    if (target.isLeftBoundary())
      return tempDirectory() + "leftCorrectionDocs";
    else
      return tempDirectory() + "rightCorrectionDocs";
  }

  protected boolean testTaggingRulesAndCreateCorrectionRulesExamples(TextRulerTarget target,
          int maxDistance) {
    try {
      File dir = new File(correctionRulesInputDirectory(target));
      if (!dir.exists())
        dir.mkdir();
      exampleDocuments.clearCurrentExamples();
      exampleDocuments.createExamplesForTarget(target);
      examples = exampleDocuments.getAllPositiveExamples();

      TextRulerExampleDocument[] sortedDocs = exampleDocuments
              .getSortedDocumentsInCacheOptimizedOrder();
      TypeSystem ts = sortedDocs[0].getCAS().getTypeSystem();
      Type tokensRootType = ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);

      // String allRulesContent = getResultString();
      String allRulesContent = FileUtils.file2String(new File("/testinput/testrules/rules"
              + RutaEngine.SCRIPT_FILE_EXTENSION));
      FileUtils.saveString2File(allRulesContent, new File(getTempRulesFileName()));

      CAS testCAS = getTestCAS();
      for (TextRulerExampleDocument doc : sortedDocs) {
        TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
        doc.resetAndFillTestCAS(testCAS, target);
        CAS docCAS = doc.getCAS();
        ae.process(testCAS);
        compareOriginalDocumentWithTestCAS(doc, testCAS, target, c, true); // test whole ruleset and
        // collect negative
        // examples

        // now we have some covered positive examples that are good, and
        // maybe some negative examples
        // for that we might create Correction Rules... in order to do
        // that we have to create
        // ShiftExamples and map negative examples (incorrect inserted
        // boundaries) with a specific
        // distance to an original positive example...

        // TODO should that be done in both directions ? left and right
        // ?! what happes if we
        // find two potential examples, one left, one right ? --> for
        // now: use the nearer one. if
        // exactly the same distance, use the one where the wrong tag
        // would be IN the slot filler!
        List<TextRulerExample> correctTags = doc.getPositiveExamples();
        List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(
                c.getCoveredNegativeExamples());
        List<TextRulerShiftExample> newExamples = new ArrayList<TextRulerShiftExample>();
        for (TextRulerExample wrongTag : wrongTags) {
          // test, if there's a corresponding positive example
          // somewhere around (within maxDistance)
          List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag
                  .getAnnotation().getBegin(), maxDistance, TextRulerToolkit
                  .getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);
          List<AnnotationFS> right = TextRulerToolkit.getAnnotationsAfterPosition(docCAS, wrongTag
                  .getAnnotation().getEnd(), maxDistance, TextRulerToolkit
                  .getFilterSetWithSlotNames(slotNames, filterSet), tokensRootType);

          // TODO stop after the first found match or create one bad
          // example for each found occurence ??!!
          // for now: stop after one ! so create only ONE bad
          // example...
          int leftDistance = 0;
          TextRulerExample leftCorrectTag = null;
          for (int i = left.size() - 1; i >= 0; i--) {
            leftDistance++;
            TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(left.get(i),
                    doc, target, docCAS.getTypeSystem());
            leftCorrectTag = TextRulerToolkit.exampleListContainsAnnotation(correctTags, needle);
            if (leftCorrectTag != null)
              break;
          }

          int rightDistance = 0;
          TextRulerExample rightCorrectTag = null;
          for (AnnotationFS fs : right) {
            rightDistance++;
            TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(fs, doc,
                    target, docCAS.getTypeSystem());
            rightCorrectTag = TextRulerToolkit.exampleListContainsAnnotation(correctTags, needle);
            if (rightCorrectTag != null)
              break;
          }

          TextRulerExample theCorrectTag = null;
          if (rightDistance < leftDistance && rightCorrectTag != null)
            theCorrectTag = rightCorrectTag;
          else if (rightDistance > leftDistance && leftCorrectTag != null)
            theCorrectTag = leftCorrectTag;
          else // use the one that would lie in the slot filler:
          {
            if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY && rightCorrectTag != null)
              theCorrectTag = rightCorrectTag;
            else
              theCorrectTag = leftCorrectTag;
          }

          if (theCorrectTag != null) {
            TextRulerToolkit.log("FOUND BAD EXAMPLE FOR SHIFTING !!");
            TextRulerShiftExample shiftExample = new TextRulerShiftExample(doc,
                    wrongTag.getAnnotation(), theCorrectTag.getAnnotation(), true, target);
            newExamples.add(shiftExample);
          }
        }
        TextRulerToolkit
                .writeCAStoXMIFile(testCAS, dir + File.pathSeparator + doc.getCasFileName());
      }
      testCAS.reset();
    } catch (Exception e) {
      TextRulerPlugin.error(e);
      return false;
    }

    return true;
  }

  @Override
  public String getFileHeaderString(boolean complete) {
    return super.getFileHeaderString(complete) + "BOOLEAN redoContextualRules;\n\n";
  }

  @Override
  protected boolean checkForMandatoryTypes() {
    if (!super.checkForMandatoryTypes()) {
      return false;
    }

    CAS someCas = getTestCAS();
    TypeSystem ts = someCas.getTypeSystem();
//    GlobalCASSource.releaseCAS(someCas);
    // check if all helper types are present:
    List<String> list = new ArrayList<String>();

    for (String eachSlot : slotNames) {
      list.add(new TextRulerTarget(eachSlot, MLTargetType.SINGLE_LEFT_BOUNDARY, this)
              .getSingleSlotTypeName());
      list.add(new TextRulerTarget(eachSlot, MLTargetType.SINGLE_RIGHT_BOUNDARY, this)
              .getSingleSlotTypeName());
    }

    boolean result = true;
    List<String> missingTypes = new ArrayList<String>();
    for (String s : list) {
      if (ts.getType(s) == null) {
        missingTypes.add(s);
        result = false;
      }
    }
    String missingString = "";
    for (String string : missingTypes) {
      missingString += string + ", ";
    }
    if (!StringUtils.isEmpty(missingString)) {
      missingString = missingString.substring(0, missingString.length() - 2);
    }
    if (!result) {
      sendStatusUpdateToDelegate("Error: Some Slot- or Helper-Types were not found in TypeSystem: "
              + missingString, TextRulerLearnerState.ML_ERROR, false);
    }
    return result;
  }

}
TOP

Related Classes of org.apache.uima.ruta.textruler.learner.lp2.BasicLP2

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.