Package org.apache.ctakes.drugner.fsm.machines.util

Source Code of org.apache.ctakes.drugner.fsm.machines.util.SuffixFrequencyFSM

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.drugner.fsm.machines.util;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.ctakes.core.fsm.condition.IntegerCondition;
import org.apache.ctakes.core.fsm.condition.PunctuationValueCondition;
import org.apache.ctakes.core.fsm.condition.TextValueCondition;
import org.apache.ctakes.core.fsm.condition.WordSetCondition;
import org.apache.ctakes.core.fsm.state.NamedState;
import org.apache.ctakes.core.fsm.token.BaseToken;
import org.apache.ctakes.drugner.fsm.elements.conditions.ContainsSetTextValueCondition;
import org.apache.ctakes.drugner.fsm.output.util.SuffixFrequencyToken;

import net.openai.util.fsm.AnyCondition;
import net.openai.util.fsm.Condition;
import net.openai.util.fsm.Machine;
import net.openai.util.fsm.State;

/**
* Uses one or more finite state machines to detect frequency suffix data in the given
* input of tokens.
* @author Mayo Clinic
*/
public class SuffixFrequencyFSM {
  // Vocabularies consulted by the finite state machines below. All of them
  // are filled in by the constructor before any machine is built.

  // time-unit words ("d", "wk", "hour", ...) accepted as the final token of a
  // frequency phrase by getFrequencyMachine()
  Set iv_textSuffixSet = new HashSet();

  // words that may open a frequency phrase (currently only "every")
  Set iv_textPrefixSet = new HashSet();

  // count words ("once", "twice", "one" .. "nine") that may open a frequency
  // phrase
  Set iv_frequencySet = new HashSet();

  // connecting words ("a", "per", "times", ...) allowed between the opening
  // token and the closing time unit
  Set iv_middleTermSet = new HashSet();

  // NOTE(review): never populated or read in the visible part of this class
  Set iv_periodSet = new HashSet();

  // complete hyphenated frequency expressions matched as a single token
  // ("once-a-day", "three-times-a-week", ...)
  Set iv_hyphenatedSet = new HashSet();

  // single-token frequency terms and abbreviations ("daily", "bid", "prn", ...)
  Set iv_singleWordSet = new HashSet();

  // time-of-day / meal words ("noon", "breakfast", "pm", ...) that may directly
  // follow the opening token of a frequency phrase
  Set iv_specifiedWordSet = new HashSet();

  // contains the finite state machines
  private Set iv_machineSet = new HashSet();

 
  /**
   *
   * Constructor
   *
   */
  public SuffixFrequencyFSM() {
    iv_specifiedWordSet.add("noon");
    iv_specifiedWordSet.add("lunch");
    iv_specifiedWordSet.add("breakfast");
    iv_specifiedWordSet.add("dinner");
    iv_specifiedWordSet.add("morning");
    iv_specifiedWordSet.add("afternoon");
    iv_specifiedWordSet.add("evening");
    iv_specifiedWordSet.add("am");
    iv_specifiedWordSet.add("pm");
    iv_specifiedWordSet.add("specified");

    iv_singleWordSet.add("weekly");
    iv_singleWordSet.add("monthly");
    iv_singleWordSet.add("biweekly");
    iv_singleWordSet.add("daily");
    iv_singleWordSet.add("nightly");
    iv_singleWordSet.add("bid");
    iv_singleWordSet.add("od");
    iv_singleWordSet.add("qd");
    iv_singleWordSet.add("hs");
    iv_singleWordSet.add("prn");
    iv_singleWordSet.add("tid");
    iv_singleWordSet.add("q");

    iv_textSuffixSet.add("d");
    iv_textSuffixSet.add("y");
    iv_textSuffixSet.add("m");
    iv_textSuffixSet.add("mo");
    iv_textSuffixSet.add("yr");
    iv_textSuffixSet.add("day");
    iv_textSuffixSet.add("daily");
    iv_textSuffixSet.add("wk");
    iv_textSuffixSet.add("week");
    iv_textSuffixSet.add("weeks");
    iv_textSuffixSet.add("h");
    iv_textSuffixSet.add("hour");
    iv_textSuffixSet.add("hours");
    iv_textSuffixSet.add("min");
    iv_textSuffixSet.add("month");
    iv_textSuffixSet.add("months");
    iv_textSuffixSet.add("year");

    iv_textPrefixSet.add("every");

    iv_frequencySet.add("once");
    iv_frequencySet.add("twice");
    iv_frequencySet.add("one");
    iv_frequencySet.add("two");
    iv_frequencySet.add("three");
    iv_frequencySet.add("four");
    iv_frequencySet.add("five");
    iv_frequencySet.add("six");
    iv_frequencySet.add("seven");
    iv_frequencySet.add("eight");
    iv_frequencySet.add("nine");

    iv_middleTermSet.add("a");
    iv_middleTermSet.add("an");
    iv_middleTermSet.add("as");
    iv_middleTermSet.add("in");
    iv_middleTermSet.add("the");
    iv_middleTermSet.add("each");
    iv_middleTermSet.add("times");
    iv_middleTermSet.add("time");
    iv_middleTermSet.add("per");
    iv_middleTermSet.add("every");
    iv_middleTermSet.add("at");

    iv_hyphenatedSet.add("once-a-day");
    iv_hyphenatedSet.add("once-a-week");
    iv_hyphenatedSet.add("twice-a-day");
    iv_hyphenatedSet.add("twice-a-week");
    iv_hyphenatedSet.add("once-daily");
    iv_hyphenatedSet.add("twice-daily");
    iv_hyphenatedSet.add("one-a-day");
    iv_hyphenatedSet.add("two-a-day");
    iv_hyphenatedSet.add("three-a-day");
    iv_hyphenatedSet.add("four-a-day");
    iv_hyphenatedSet.add("five-a-day");
    iv_hyphenatedSet.add("six-a-day");
    iv_hyphenatedSet.add("seven-a-day");
    iv_hyphenatedSet.add("eight-a-day");
    iv_hyphenatedSet.add("nine-a-day");
    iv_hyphenatedSet.add("once-weekly");
    iv_hyphenatedSet.add("twice-weekly");
    iv_hyphenatedSet.add("one-a-week");
    iv_hyphenatedSet.add("two-a-week");
    iv_hyphenatedSet.add("three-a-week");
    iv_hyphenatedSet.add("four-a-week");
    iv_hyphenatedSet.add("five-a-week");
    iv_hyphenatedSet.add("six-a-week");
    iv_hyphenatedSet.add("seven-a-week");
    iv_hyphenatedSet.add("eight-a-week");
    iv_hyphenatedSet.add("nine-a-week");
    iv_hyphenatedSet.add("once-monthly");
    iv_hyphenatedSet.add("twice-monthly");
    iv_hyphenatedSet.add("one-a-month");
    iv_hyphenatedSet.add("two-a-month");
    iv_hyphenatedSet.add("three-a-month");
    iv_hyphenatedSet.add("four-a-month");
    iv_hyphenatedSet.add("five-a-month");
    iv_hyphenatedSet.add("six-a-month");
    iv_hyphenatedSet.add("seven-a-month");
    iv_hyphenatedSet.add("eight-a-month");
    iv_hyphenatedSet.add("nine-a-month");
    iv_hyphenatedSet.add("once-hourly");
    iv_hyphenatedSet.add("twice-hourly");
    iv_hyphenatedSet.add("one-an-hour");
    iv_hyphenatedSet.add("two-an-hour");
    iv_hyphenatedSet.add("three-an-hour");
    iv_hyphenatedSet.add("four-an-hour");
    iv_hyphenatedSet.add("five-an-hour");
    iv_hyphenatedSet.add("six-an-hour");
    iv_hyphenatedSet.add("seven-an-hour");
    iv_hyphenatedSet.add("eight-an-hour");
    iv_hyphenatedSet.add("nine-an-hour");

    iv_hyphenatedSet.add("once-nightly");
    iv_hyphenatedSet.add("as-needed");
    iv_hyphenatedSet.add("twice-nightly");
    iv_hyphenatedSet.add("once-every-day");
    iv_hyphenatedSet.add("once-daily");
    iv_hyphenatedSet.add("twice-daily");
    iv_hyphenatedSet.add("one-time-a-day");
    iv_hyphenatedSet.add("two-times-a-day");
    iv_hyphenatedSet.add("three-times-a-day");
    iv_hyphenatedSet.add("four-times-a-day");
    iv_hyphenatedSet.add("five-times-a-day");
    iv_hyphenatedSet.add("six-times-a-day");
    iv_hyphenatedSet.add("seven-times-a-day");
    iv_hyphenatedSet.add("eight-times-a-day");
    iv_hyphenatedSet.add("nine-times-a-day");
    iv_hyphenatedSet.add("once-every-week");
    iv_hyphenatedSet.add("twice-every-day");
    iv_hyphenatedSet.add("one-time-a-week");
    iv_hyphenatedSet.add("two-times-a-week");
    iv_hyphenatedSet.add("three-times-a-week");
    iv_hyphenatedSet.add("four-times-a-week");
    iv_hyphenatedSet.add("five-times-a-week");
    iv_hyphenatedSet.add("six-times-a-week");
    iv_hyphenatedSet.add("seven-times-a-week");
    iv_hyphenatedSet.add("eight-times-a-week");
    iv_hyphenatedSet.add("nine-times-a-week");
    iv_hyphenatedSet.add("once-every-hour");
    iv_hyphenatedSet.add("twice-every-hour");
    iv_hyphenatedSet.add("one-time-a-month");
    iv_hyphenatedSet.add("two-times-a-month");
    iv_hyphenatedSet.add("three-times-a-month");
    iv_hyphenatedSet.add("four-times-a-month");
    iv_hyphenatedSet.add("five-times-a-month");
    iv_hyphenatedSet.add("six-times-a-month");
    iv_hyphenatedSet.add("seven-times-a-month");
    iv_hyphenatedSet.add("eight-times-a-month");
    iv_hyphenatedSet.add("nine-times-a-month");
    iv_hyphenatedSet.add("one-time-each-hour");
    iv_hyphenatedSet.add("two-times-each-hour");
    iv_hyphenatedSet.add("three-times-each-hour");
    iv_hyphenatedSet.add("four-times-each-hour");
    iv_hyphenatedSet.add("five-times-each-hour");
    iv_hyphenatedSet.add("six-times-each-hour");
    iv_hyphenatedSet.add("seven-times-each-hour");
    iv_hyphenatedSet.add("eight-times-each-hour");
    iv_hyphenatedSet.add("nine-times-each-hour");

    iv_machineSet.add(getLatin3AbbreviationMachine());
    iv_machineSet.add(getLatin2AbbreviationMachine());
    iv_machineSet.add(getFrequencyMachine());

  }

  /**
   * Gets a finite state machine that detects the following:
   * <ol>
   *     <li>40mg/d</li>
   *     <li>32.1-47.3mg/wk</li>
   * </ol>
   * @return
   */
  private Machine getLatin3AbbreviationMachine() {
    State startState = new NamedState("START");
    State endState = new NamedState("END");
    endState.setEndStateFlag(true);

    Machine m = new Machine(startState);

    State leftAbbreviateQState = new NamedState("LEFT_Q");
    State leftAbbreviateBState = new NamedState("LEFT_B");
    State leftAbbreviatePState = new NamedState("LEFT_P");
    State leftAbbreviateTState = new NamedState("LEFT_T");

    State middleAbbreviateQtoAState = new NamedState("MID_Q2A");
    State middleAbbreviateQtoDState = new NamedState("MID_Q2D");
    State middleAbbreviateQtoHState = new NamedState("MID_Q2H");
    State middleAbbreviateQtoIState = new NamedState("MID_Q2I");
    State middleAbbreviateQtoMState = new NamedState("MID_Q2M");
    State middleAbbreviateQtoOState = new NamedState("MID_Q2O");
    State middleAbbreviateQtoWState = new NamedState("MID_Q2W");
    State middleAbbreviateQtoPState = new NamedState("MID_Q2P");

    State middleAbbreviatePtoRState = new NamedState("MID_P2R");
    State middleAbbreviateTtoIState = new NamedState("MID_T2I");

    State middleAbbreviateBtoIState = new NamedState("MID_B2I");

    State rightAbbreviateQIDState = new NamedState("RIGHT_QID");
    State rightAbbreviateQADState = new NamedState("RIGHT_QAD");
    State rightAbbreviateQDSState = new NamedState("RIGHT_QDS");
    State rightAbbreviateQHSState = new NamedState("RIGHT_QHS");
    State rightAbbreviateQWKState = new NamedState("RIGHT_QWK");
    State rightAbbreviateQODState = new NamedState("RIGHT_QOD");
    State rightAbbreviateQAMState = new NamedState("RIGHT_QAM");
    State rightAbbreviateQPMState = new NamedState("RIGHT_QPM");
    State rightAbbreviateQMTState = new NamedState("RIGHT_QMT");
    State rightAbbreviateBIDState = new NamedState("RIGHT_BID");
    State rightAbbreviatePRNState = new NamedState("RIGHT_PRN");
    State rightAbbreviateTIDState = new NamedState("RIGHT_TID");

    State firstDotQState = new NamedState("FIRSTDOTQ");
    State firstDotBState = new NamedState("FIRSTDOTB");
    State firstDotPState = new NamedState("FIRSTDOTP");
    State firstDotTState = new NamedState("FIRSTDOTT");

    State secondDotQtoAState = new NamedState("SECONDDOTQ2A");
    State secondDotQtoDState = new NamedState("SECONDDOTQ2D");
    State secondDotQtoHState = new NamedState("SECONDDOTQ2H");
    State secondDotQtoIState = new NamedState("SECONDDOTQ2I");
    State secondDotQtoMState = new NamedState("SECONDDOTQ2M");
    State secondDotQtoOState = new NamedState("SECONDDOTQ2O");
    State secondDotQtoWState = new NamedState("SECONDDOTQ2W");
    State secondDotQtoPState = new NamedState("SECONDDOTQ2P");
    State secondDotBtoIState = new NamedState("SECONDDOTB2I");
    State secondDotPtoRState = new NamedState("SECONDDOTP2R");
    State secondDotTtoIState = new NamedState("SECONDDOTT2I");

    Condition firstDotConditionQ = new PunctuationValueCondition('.');
    Condition firstDotConditionB = new PunctuationValueCondition('.');
    Condition firstDotConditionP = new PunctuationValueCondition('.');
    Condition firstDotConditionT = new PunctuationValueCondition('.');
    Condition secondDotConditionQH = new PunctuationValueCondition('.');
    Condition secondDotConditionQI = new PunctuationValueCondition('.');
    Condition secondDotConditionQA = new PunctuationValueCondition('.');
    Condition secondDotConditionQD = new PunctuationValueCondition('.');
    Condition secondDotConditionQM = new PunctuationValueCondition('.');
    Condition secondDotConditionQO = new PunctuationValueCondition('.');
    Condition secondDotConditionQW = new PunctuationValueCondition('.');
    Condition secondDotConditionQP = new PunctuationValueCondition('.');
    Condition secondDotConditionBI = new PunctuationValueCondition('.');
    Condition secondDotConditionPR = new PunctuationValueCondition('.');
    Condition secondDotConditionTI = new PunctuationValueCondition('.');
    Condition thirdDotConditionQHS = new PunctuationValueCondition('.');
    Condition thirdDotConditionQAD = new PunctuationValueCondition('.');
    Condition thirdDotConditionQID = new PunctuationValueCondition('.');
    Condition thirdDotConditionQDS = new PunctuationValueCondition('.');
    Condition thirdDotConditionQMT = new PunctuationValueCondition('.');
    Condition thirdDotConditionQOD = new PunctuationValueCondition('.');
    Condition thirdDotConditionQWK = new PunctuationValueCondition('.');
    Condition thirdDotConditionQAM = new PunctuationValueCondition('.');
    Condition thirdDotConditionQPM = new PunctuationValueCondition('.');
    Condition thirdDotConditionBID = new PunctuationValueCondition('.');
    Condition thirdDotConditionPRN = new PunctuationValueCondition('.');
    Condition thirdDotConditionTID = new PunctuationValueCondition('.');

    startState.addTransition(new TextValueCondition("q", true),
        leftAbbreviateQState);
    startState.addTransition(new TextValueCondition("b", true),
        leftAbbreviateBState);
    startState.addTransition(new TextValueCondition("p", true),
        leftAbbreviatePState);
    startState.addTransition(new TextValueCondition("t", true),
        leftAbbreviateTState);
    startState.addTransition(new AnyCondition(), startState);

    leftAbbreviateQState.addTransition(firstDotConditionQ, firstDotQState);
    leftAbbreviateQState.addTransition(new AnyCondition(), startState);

    firstDotQState.addTransition(new TextValueCondition("a", true),
        middleAbbreviateQtoAState);
    firstDotQState.addTransition(new TextValueCondition("d", true),
        middleAbbreviateQtoDState);
    firstDotQState.addTransition(new TextValueCondition("h", true),
        middleAbbreviateQtoHState);
    firstDotQState.addTransition(new TextValueCondition("i", true),
        middleAbbreviateQtoIState);
    firstDotQState.addTransition(new TextValueCondition("m", true),
        middleAbbreviateQtoMState);
    firstDotQState.addTransition(new TextValueCondition("o", true),
        middleAbbreviateQtoOState);
    firstDotQState.addTransition(new TextValueCondition("w", true),
        middleAbbreviateQtoWState);
    firstDotQState.addTransition(new TextValueCondition("p", true),
        middleAbbreviateQtoPState);
    firstDotQState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoAState.addTransition(secondDotConditionQA,
        secondDotQtoAState);
    middleAbbreviateQtoAState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoDState.addTransition(secondDotConditionQD,
        secondDotQtoDState);
    middleAbbreviateQtoDState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoHState.addTransition(secondDotConditionQH,
        secondDotQtoHState);
    middleAbbreviateQtoHState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoIState.addTransition(secondDotConditionQI,
        secondDotQtoIState);
    middleAbbreviateQtoIState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoMState.addTransition(secondDotConditionQM,
        secondDotQtoMState);
    middleAbbreviateQtoMState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoOState.addTransition(secondDotConditionQO,
        secondDotQtoOState);
    middleAbbreviateQtoOState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoWState.addTransition(secondDotConditionQW,
        secondDotQtoWState);
    middleAbbreviateQtoWState.addTransition(new AnyCondition(), startState);

    middleAbbreviateQtoPState.addTransition(secondDotConditionQP,
        secondDotQtoPState);
    middleAbbreviateQtoPState.addTransition(new AnyCondition(), startState);

    secondDotQtoAState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateQADState);
    secondDotQtoAState.addTransition(new AnyCondition(), startState);

    secondDotQtoDState.addTransition(new TextValueCondition("s", true),
        rightAbbreviateQDSState);
    secondDotQtoDState.addTransition(new AnyCondition(), startState);

    secondDotQtoHState.addTransition(new TextValueCondition("s", true),
        rightAbbreviateQHSState);
    secondDotQtoHState.addTransition(new AnyCondition(), startState);

    secondDotQtoIState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateQIDState);
    secondDotQtoIState.addTransition(new AnyCondition(), startState);

    secondDotQtoMState.addTransition(new TextValueCondition("t", true),
        rightAbbreviateQMTState);
    secondDotQtoMState.addTransition(new AnyCondition(), startState);

    secondDotQtoOState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateQODState);
    secondDotQtoOState.addTransition(new AnyCondition(), startState);

    secondDotQtoWState.addTransition(new TextValueCondition("k", true),
        rightAbbreviateQWKState);
    secondDotQtoWState.addTransition(new AnyCondition(), startState);

    secondDotQtoAState.addTransition(new TextValueCondition("m", true),
        rightAbbreviateQAMState);
    secondDotQtoAState.addTransition(new AnyCondition(), startState);

    secondDotQtoPState.addTransition(new TextValueCondition("m", true),
        rightAbbreviateQPMState);
    secondDotQtoPState.addTransition(new AnyCondition(), startState);

    secondDotBtoIState.addTransition(new TextValueCondition("d", true),
        endState);
    secondDotBtoIState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQADState.addTransition(thirdDotConditionQAD, endState);
    rightAbbreviateQADState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQDSState.addTransition(thirdDotConditionQDS, endState);
    rightAbbreviateQDSState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQHSState.addTransition(thirdDotConditionQHS, endState);
    rightAbbreviateQHSState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQIDState.addTransition(thirdDotConditionQID, endState);
    rightAbbreviateQIDState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQMTState.addTransition(thirdDotConditionQMT, endState);
    rightAbbreviateQMTState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQODState.addTransition(thirdDotConditionQOD, endState);
    rightAbbreviateQODState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQWKState.addTransition(thirdDotConditionQWK, endState);
    rightAbbreviateQWKState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQAMState.addTransition(thirdDotConditionQAM, endState);
    rightAbbreviateQAMState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQPMState.addTransition(thirdDotConditionQPM, endState);
    rightAbbreviateQPMState.addTransition(new AnyCondition(), startState);

    leftAbbreviateBState.addTransition(firstDotConditionB, firstDotBState);
    leftAbbreviateBState.addTransition(new AnyCondition(), startState);

    firstDotBState.addTransition(new TextValueCondition("i", true),
        middleAbbreviateBtoIState);
    firstDotBState.addTransition(new AnyCondition(), startState);

    middleAbbreviateBtoIState.addTransition(secondDotConditionBI,
        secondDotBtoIState);
    middleAbbreviateBtoIState.addTransition(new AnyCondition(), startState);

    secondDotBtoIState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateBIDState);
    secondDotBtoIState.addTransition(new AnyCondition(), startState);

    rightAbbreviateBIDState.addTransition(thirdDotConditionBID, endState);

    leftAbbreviatePState.addTransition(firstDotConditionP, firstDotPState);
    leftAbbreviatePState.addTransition(new AnyCondition(), startState);

    leftAbbreviateTState.addTransition(firstDotConditionT, firstDotTState);
    leftAbbreviateTState.addTransition(new AnyCondition(), startState);

    firstDotPState.addTransition(new TextValueCondition("r", true),
        middleAbbreviatePtoRState);
    firstDotPState.addTransition(new AnyCondition(), startState);

    firstDotTState.addTransition(new TextValueCondition("i", true),
        middleAbbreviateTtoIState);
    firstDotTState.addTransition(new AnyCondition(), startState);

    middleAbbreviatePtoRState.addTransition(secondDotConditionPR,
        secondDotPtoRState);
    middleAbbreviatePtoRState.addTransition(new AnyCondition(), startState);

    middleAbbreviateTtoIState.addTransition(secondDotConditionTI,
        secondDotTtoIState);
    middleAbbreviateTtoIState.addTransition(new AnyCondition(), startState);

    secondDotPtoRState.addTransition(new TextValueCondition("n", true),
        rightAbbreviatePRNState);
    secondDotPtoRState.addTransition(new AnyCondition(), startState);

    secondDotTtoIState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateTIDState);
    secondDotTtoIState.addTransition(new AnyCondition(), startState);

    rightAbbreviatePRNState.addTransition(thirdDotConditionPRN, endState);
    rightAbbreviateTIDState.addTransition(thirdDotConditionTID, endState);

    endState.addTransition(new AnyCondition(), startState);
    return m;
  }

  /**
   * Gets a finite state machine that detects the following:
   * <ol>
   *     <li>40mg/d</li>
   *     <li>32.1-47.3mg/wk</li>
   * </ol>
   * @return
   */
  private Machine getLatin2AbbreviationMachine() {
    State startState = new NamedState("START");
    State endState = new NamedState("END");
    endState.setEndStateFlag(true);

    Machine m = new Machine(startState);

    State leftAbbreviateQState = new NamedState("LEFT_Q");
    State leftAbbreviateAState = new NamedState("LEFT_A");
    State leftAbbreviateOState = new NamedState("LEFT_O");
    State leftAbbreviateHState = new NamedState("LEFT_H");
    State leftAbbreviatePState = new NamedState("LEFT_P");

    State rightAbbreviateQDState = new NamedState("RIGHT_QD");
    State rightAbbreviateQHState = new NamedState("RIGHT_QH");
    State rightAbbreviateAMState = new NamedState("RIGHT_AM");
    State rightAbbreviateODState = new NamedState("RIGHT_OD");
    State rightAbbreviateHSState = new NamedState("RIGHT_HS");
    State rightAbbreviatePMState = new NamedState("RIGHT_PM");

    State firstDotQState = new NamedState("FIRSTDOTQ");
    State firstDotAState = new NamedState("FIRSTDOTA");
    State firstDotOState = new NamedState("FIRSTDOTO");
    State firstDotHState = new NamedState("FIRSTDOTH");
    State firstDotPState = new NamedState("FIRSTDOTP");

    Condition firstQDDotCondition = new PunctuationValueCondition('.');
    Condition secondQDDotCondition = new PunctuationValueCondition('.');
    Condition firstODDotCondition = new PunctuationValueCondition('.');
    Condition secondQHDotCondition = new PunctuationValueCondition('.');
    Condition secondODDotCondition = new PunctuationValueCondition('.');
    Condition firstAMDotCondition = new PunctuationValueCondition('.');
    Condition firstPMDotCondition = new PunctuationValueCondition('.');
    Condition secondAMDotCondition = new PunctuationValueCondition('.');
    Condition secondPMDotCondition = new PunctuationValueCondition('.');
    Condition firstHSDotCondition = new PunctuationValueCondition('.');
    Condition secondHSDotCondition = new PunctuationValueCondition('.');

    Condition soloCondition = new WordSetCondition(iv_singleWordSet, true);

    startState.addTransition(new TextValueCondition("q", true),
        leftAbbreviateQState);
    startState.addTransition(new TextValueCondition("a", true),
        leftAbbreviateAState);
    startState.addTransition(new TextValueCondition("o", true),
        leftAbbreviateOState);
    startState.addTransition(new TextValueCondition("h", true),
        leftAbbreviateHState);
    startState.addTransition(new TextValueCondition("p", true),
        leftAbbreviatePState);
    startState.addTransition(new AnyCondition(), startState);

    leftAbbreviateQState.addTransition(firstQDDotCondition, firstDotQState);

    leftAbbreviateQState.addTransition(new AnyCondition(), startState);

    leftAbbreviateAState.addTransition(firstAMDotCondition, firstDotAState);
    leftAbbreviateAState.addTransition(new AnyCondition(), startState);

    leftAbbreviateOState.addTransition(firstODDotCondition, firstDotOState);
    leftAbbreviateOState.addTransition(new AnyCondition(), startState);

    leftAbbreviateHState.addTransition(firstHSDotCondition, firstDotHState);
    leftAbbreviateHState.addTransition(new AnyCondition(), startState);

    leftAbbreviatePState.addTransition(firstPMDotCondition, firstDotPState);
    leftAbbreviatePState.addTransition(new AnyCondition(), startState);

    firstDotQState.addTransition(soloCondition, endState);
    firstDotQState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateQDState);
    firstDotQState.addTransition(new TextValueCondition("h", true),
        rightAbbreviateQHState);
    firstDotQState.addTransition(new AnyCondition(), startState);

    firstDotAState.addTransition(new TextValueCondition("m", true),
        rightAbbreviateAMState);
    firstDotAState.addTransition(new AnyCondition(), startState);

    firstDotOState.addTransition(new TextValueCondition("d", true),
        rightAbbreviateODState);
    firstDotOState.addTransition(new AnyCondition(), startState);

    firstDotHState.addTransition(new TextValueCondition("s", true),
        rightAbbreviateHSState);
    firstDotHState.addTransition(new AnyCondition(), startState);

    firstDotPState.addTransition(new TextValueCondition("m", true),
        rightAbbreviatePMState);
    firstDotPState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQHState.addTransition(secondQHDotCondition, endState);
    rightAbbreviateQHState.addTransition(new AnyCondition(), startState);

    rightAbbreviateAMState.addTransition(secondAMDotCondition, endState);
    rightAbbreviateAMState.addTransition(new AnyCondition(), startState);

    rightAbbreviateODState.addTransition(secondODDotCondition, endState);
    rightAbbreviateODState.addTransition(new AnyCondition(), startState);

    rightAbbreviateQDState.addTransition(secondQDDotCondition, endState);
    rightAbbreviateQDState.addTransition(new AnyCondition(), startState);

    rightAbbreviatePMState.addTransition(secondPMDotCondition, endState);
    rightAbbreviatePMState.addTransition(new AnyCondition(), startState);

    rightAbbreviateHSState.addTransition(secondHSDotCondition, endState);
    rightAbbreviateHSState.addTransition(new AnyCondition(), startState);

    endState.addTransition(new AnyCondition(), startState);

    return m;
  }

  /**
   * Gets a finite state machine that detects the following
   * ('once', 'twice', # or text#) a day/week/month/year:
   * <ol>
   *     <li>once a day</li>
   *     <li>three times a day</li>
   *     <li>once-a-day</li>
   * </ol>
   * @return
   */
  private Machine getFrequencyMachine() {
    State startState = new NamedState("START");
    State endState = new NamedState("END");
    endState.setEndStateFlag(true);

    Machine m = new Machine(startState);

    State leftAbbreviateState = new NamedState("LEFT_FREQ");

    State lastTextState = new NamedState("RIGHT_FREQ");
    State middleATextState = new NamedState("MID_TEXT");
    State firstDashState = new NamedState("FIRSTDASH");
    State secondDashState = new NamedState("SECONDDASH");

    Condition integerCondition = new IntegerCondition();
    Condition firstDashCondition = new PunctuationValueCondition('-');
    Condition secondDashCondition = new PunctuationValueCondition('-');

    Condition numericStartCondition = new WordSetCondition(iv_frequencySet,
        false);
    Condition hyphenatedCondition = new WordSetCondition(iv_hyphenatedSet,
        false);
    Condition firstMiddleTextCondition = new WordSetCondition(
        iv_middleTermSet, true);
    Condition secondMiddleTextCondition = new WordSetCondition(
        iv_middleTermSet, true);
    Condition thirdMiddleTextCondition = new WordSetCondition(
        iv_middleTermSet, true);
    Condition fourthMiddleTextCondition = new WordSetCondition(
        iv_middleTermSet, true);
    Condition lastTextCondition = new WordSetCondition(iv_textSuffixSet,
        false);
    Condition firstTextCondition = new WordSetCondition(iv_textPrefixSet,
        false);
    Condition soloCondition = new WordSetCondition(iv_singleWordSet, true);
    Condition specificWordCondition = new WordSetCondition(
        iv_specifiedWordSet, false);
    Condition containsSoloTermCondition = new ContainsSetTextValueCondition(
        iv_singleWordSet, true);

    startState.addTransition(numericStartCondition, leftAbbreviateState);
    startState.addTransition(firstTextCondition, leftAbbreviateState);
    startState.addTransition(new TextValueCondition("a", true),
        leftAbbreviateState);
    startState.addTransition(integerCondition, leftAbbreviateState);
    startState.addTransition(hyphenatedCondition, endState);
    startState.addTransition(containsSoloTermCondition, endState);
    startState.addTransition(soloCondition, endState);

    startState.addTransition(new AnyCondition(), startState);

    leftAbbreviateState.addTransition(firstMiddleTextCondition,
        middleATextState);
    leftAbbreviateState.addTransition(firstDashCondition, firstDashState);
    leftAbbreviateState.addTransition(soloCondition, endState);
    leftAbbreviateState.addTransition(specificWordCondition, endState);
    leftAbbreviateState.addTransition(hyphenatedCondition, endState);
    leftAbbreviateState.addTransition(new AnyCondition(), startState);

    firstDashState
        .addTransition(thirdMiddleTextCondition, middleATextState);
    firstDashState.addTransition(new AnyCondition(), startState);

    middleATextState
        .addTransition(secondMiddleTextCondition, lastTextState);
    middleATextState.addTransition(secondDashCondition, secondDashState);
    middleATextState.addTransition(lastTextCondition, endState);
    middleATextState.addTransition(new AnyCondition(), startState);

    secondDashState.addTransition(fourthMiddleTextCondition, lastTextState);
    secondDashState.addTransition(lastTextCondition, endState);
    secondDashState.addTransition(new AnyCondition(), startState);

    lastTextState.addTransition(lastTextCondition, endState);
    lastTextState.addTransition(new AnyCondition(), startState);

    endState.addTransition(new AnyCondition(), startState);

    return m;
  }

  /**
   * Executes the finite state machines.
   * @param tokens
   * @return Set SuffixFrequencyToken objects.
   * @throws Exception
   */
  public Set execute(List tokens, Set overrideSet) throws Exception {
    Set rangeSet = new HashSet();

    // maps a fsm to a token start index
    // key = fsm , value = token start index
    Map tokenStartMap = new HashMap();

    Iterator overrideTokenItr = overrideSet.iterator();
    // key = start offset, value = override BaseToken object
    Map overrideTokenMap = new HashMap();
    while (overrideTokenItr.hasNext()) {
      BaseToken t = (BaseToken) overrideTokenItr.next();
      Integer key = new Integer(t.getStartOffset());
      overrideTokenMap.put(key, t);
    }

    boolean overrideOn = false;
    int overrideEndOffset = -1;
    for (int i = 0; i < tokens.size(); i++) {
      BaseToken token = (BaseToken) tokens.get(i);

      Integer key = new Integer(token.getStartOffset());

      if (overrideOn) {
        if (token.getStartOffset() >= overrideEndOffset) {
          overrideOn = false;
          overrideEndOffset = -1;
        } else {
          // step to next iteration of for loop
          continue;
        }
      } else {
        if (overrideTokenMap.containsKey(key)) {
          // override one or more tokens until the override
          // token is complete
          token = (BaseToken) overrideTokenMap.get(key);
          overrideOn = true;
          overrideEndOffset = token.getEndOffset();
        }
      }

      Iterator machineItr = iv_machineSet.iterator();
      while (machineItr.hasNext()) {
        Machine fsm = (Machine) machineItr.next();

        fsm.input(token);

        State currentState = fsm.getCurrentState();
        if (currentState.getStartStateFlag()) {
          tokenStartMap.put(fsm, new Integer(i));
        }
        if (currentState.getEndStateFlag()) {
          Object o = tokenStartMap.get(fsm);
          int tokenStartIndex;
          if (o == null) {
            // By default, all machines start with
            // token zero.
            tokenStartIndex = 0;
          } else {
            tokenStartIndex = ((Integer) o).intValue();
            // skip ahead over single token we don't want
            tokenStartIndex++;
          }
          BaseToken startToken = (BaseToken) tokens
              .get(tokenStartIndex);
          BaseToken endToken = token;
          SuffixFrequencyToken segmentToken = new SuffixFrequencyToken(
              startToken.getStartOffset(), endToken
                  .getEndOffset());
          rangeSet.add(segmentToken);
          fsm.reset();
        }
      }
    }

    // cleanup
    tokenStartMap.clear();

    // reset machines
    Iterator itr = iv_machineSet.iterator();
    while (itr.hasNext()) {
      Machine fsm = (Machine) itr.next();
      fsm.reset();
    }

    return rangeSet;
  }

  /**
   * Executes the finite state machines.
   * @param tokens
   * @return Set of FractionToken objects.
   * @throws Exception
   */
  public Set execute(List tokens) throws Exception {
    Set fractionSet = new HashSet();

    // maps a fsm to a token start index
    // key = fsm , value = token start index
    Map tokenStartMap = new HashMap();

    for (int i = 0; i < tokens.size(); i++) {
      BaseToken token = (BaseToken) tokens.get(i);

      Iterator machineItr = iv_machineSet.iterator();
      while (machineItr.hasNext()) {
        Machine fsm = (Machine) machineItr.next();

        fsm.input(token);

        State currentState = fsm.getCurrentState();
        if (currentState.getStartStateFlag()) {
          tokenStartMap.put(fsm, new Integer(i));
        }
        if (currentState.getEndStateFlag()) {
          Object o = tokenStartMap.get(fsm);
          int tokenStartIndex;
          if (o == null) {
            // By default, all machines start with
            // token zero.
            tokenStartIndex = 0;
          } else {
            tokenStartIndex = ((Integer) o).intValue();
            // skip ahead over single token we don't want
            tokenStartIndex++;
          }
          BaseToken startToken = (BaseToken) tokens
              .get(tokenStartIndex);
          BaseToken endToken = token;
          SuffixFrequencyToken fractionToken = new SuffixFrequencyToken(
              startToken.getStartOffset(), endToken
                  .getEndOffset());
          fractionSet.add(fractionToken);
          fsm.reset();
        }
      }
    }

    // cleanup
    tokenStartMap.clear();

    // reset machines
    Iterator itr = iv_machineSet.iterator();
    while (itr.hasNext()) {
      Machine fsm = (Machine) itr.next();
      fsm.reset();
    }

    return fractionSet;
  }
}
TOP

Related Classes of org.apache.ctakes.drugner.fsm.machines.util.SuffixFrequencyFSM

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.