Package at.ofai.gate.extendedgazetteer

Source Code of at.ofai.gate.extendedgazetteer.AbstractExtendedGazetteer$FSM

/*
* AbstractExtendedGazetteer.java
*
* $Id: AbstractExtendedGazetteer.java 2386 2011-05-02 20:23:30Z johann $
*
*/



package at.ofai.gate.extendedgazetteer;


import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.*;

import gate.*;
import gate.annotation.AnnotationSetImpl;
import gate.creole.*;
import gate.creole.gazetteer.*;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.*;
import java.io.BufferedReader;
import java.net.URL;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;


/**
* This is the common base class for ExtendedGazetteer and
* IndirectExtendedGazetteer so that the code for both only needs to be defined
* once.
*
* @author Johann Petrak
*/
public abstract class AbstractExtendedGazetteer
  extends DefaultGazetteer {

  /**
   * Serialization version identifier.
   */
  private static final long serialVersionUID = 1L;
  // Backing field of the "Generate prefix annotations?" runtime parameter.
  private Boolean prefixAnnotations = false;
  // Backing field of the "Generate suffix annotations?" runtime parameter.
  private Boolean suffixAnnotations = false;
  // When true, createLookups() stores the matched text in a "string" feature.
  private Boolean includeStrings = false;
  // Character class consulted by isWithinWord() to decide what belongs to a word.
  private WCClass wordCharsClass = WCClass.NONWHITESPACE;
  // wordBoundaryCharsValue with all \p{Z} characters removed; this is the
  // copy that isWithinWord() and genUniqueFSMKey() actually read.
  private String wordBoundaryChars = "";
  // Additional word boundary characters exactly as passed to the setter.
  private String wordBoundaryCharsValue = "";
  // Additional word characters exactly as passed to the setter.
  private String wordCharsValue = "";
  // wordCharsValue with all \p{Z} characters removed; this is the copy that
  // isWithinWord() and genUniqueFSMKey() actually read.
  private String wordChars = "";

  public Boolean getPrefixAnnotations() {
    return prefixAnnotations;
  }

  // Parameters just inherited from the DefaultGazetteer
  @CreoleParameter(comment = "The document to be processed")
  @RunTime
  @Override
  public void setDocument(Document doc) {
    document = doc;
  }
  @Override
  public Document getDocument() {
    return document;
  }
 
  @CreoleParameter(
      comment = "The character used to separate features for entries in gazetteer lists. Accepts strings like "\t" and will unescape it to the relevant character. If not specified, this gazetteer does not support extra features.",
      defaultValue = "\\t"
      )
  @Optional
  @Override
  public void setGazetteerFeatureSeparator(String sep) {
    gazetteerFeatureSeparator = sep;
  }
  @Override
  public String getGazetteerFeatureSeparator() {
    return gazetteerFeatureSeparator;
  }
 
  @CreoleParameter(comment = "The annotation set to be used for the generated annotations")
  @Optional
  @RunTime
  @Override
  public void setAnnotationSetName(String name) {
    annotationSetName = name;
  }
  @Override
  public String getAnnotationSetName() {
    return annotationSetName;
  }
 
  @CreoleParameter(comment = "The URL to the file with list of lists", defaultValue="")
  @Override
  public void setListsURL(java.net.URL theURL) {
    listsURL = theURL;
  }
  @Override
  public java.net.URL getListsURL() {
    return listsURL;
  }
 
  @CreoleParameter(comment = "The encoding used for reading the definitions",
    defaultValue="UTF-8")
  @Override
  public void setEncoding(String enc) {
    encoding = enc;
  }
  @Override
  public String getEncoding() {
    return encoding;
  }
 
  @CreoleParameter(comment = "Should this gazetteer differentiate on case",
    defaultValue = "true")
  @Override
  public void setCaseSensitive(Boolean yesno) {
    caseSensitive = yesno;
  }
  @Override
  public Boolean getCaseSensitive() {
    return caseSensitive;
  }
 
  @CreoleParameter(comment = "Should this gazetteer only match whole words",
    defaultValue = "true")
  @Override
  @RunTime
  public void setWholeWordsOnly(Boolean yesno) {
    wholeWordsOnly = yesno;
  }
  @Override
  public Boolean getWholeWordsOnly() {
    return wholeWordsOnly;
  }
 
  // Only relevant if wholewordsonly is true: if true, prefix
  // matches (i.e. matches where we have a suffix) are allowed too
  @CreoleParameter(comment = "Allow prefix matching",defaultValue="false")
  @RunTime
  @Optional
  public void setAllowPrefixMatches(Boolean yesno) {
    allowPrefixMatches = yesno;
  }
  public Boolean getAllowPrefixMatches() { return allowPrefixMatches ; }
  private Boolean allowPrefixMatches = false;
 
  @CreoleParameter(comment = "Allow suffix matching",defaultValue="false")
  @RunTime
  @Optional
  public void setAllowSuffixMatches(Boolean yesno) {
    allowSuffixMatches = yesno;
  }
  public Boolean getAllowSuffixMatches() { return allowSuffixMatches ; }
  private Boolean allowSuffixMatches = false;
 
  @CreoleParameter(
    comment = "Disallow whole word matches if wholeWordsOnly is set to eanable prefix or suffix matches",
    defaultValue = "false")
  @RunTime
  @Optional
  public void setDisallowWholeMatches(Boolean b) { disallowWholeMatches = b ; }
  public Boolean getDisallowWholeMatches() { return disallowWholeMatches ; }
  private Boolean disallowWholeMatches = false;
 
  @CreoleParameter(comment = "Should this gazetteer only match the longest string starting from any offset?",
    defaultValue = "true")
  @Override
  @RunTime
  public void setLongestMatchOnly(Boolean yesno) {
    longestMatchOnly = yesno;
  }
  @Override
  public Boolean getLongestMatchOnly() {
    return longestMatchOnly;
  }
 
  // Parameters added by this PR
 
  @CreoleParameter(comment = "Generate prefix annotations?", defaultValue="false")
  @RunTime
  public void setPrefixAnnotations(Boolean newPrefixAnnotations) {
    prefixAnnotations = newPrefixAnnotations;
  }

  public Boolean getSuffixAnnotations() {
    return suffixAnnotations;
  }

  @CreoleParameter(comment = "Generate suffix annotations?", defaultValue="false")
  @RunTime
  public void setSuffixAnnotations(Boolean newSuffixAnnotations) {
    suffixAnnotations = newSuffixAnnotations;
  }

  public Boolean getIncludeStrings() {
    return includeStrings;
  }

  @CreoleParameter(comment = "Include original string as feature?", defaultValue="false")
  @RunTime
  public void setIncludeStrings(Boolean newIncludeStrings) {
    includeStrings = newIncludeStrings;
  }

  public WCClass getWordCharsClass() {
    return wordCharsClass;
  }


  @CreoleParameter(comment = "Types of characters that make up words?",
    defaultValue="LETTERS")
  public void setWordCharsClass(WCClass newWordCharsClass) {
    wordCharsClass = newWordCharsClass;
  }

  public String getWordBoundaryChars() {
    return wordBoundaryCharsValue;
  }

  @CreoleParameter(comment = "Additional word boundary characters?",
    defaultValue="")
  public void setWordBoundaryChars(String newWordBoundaryChars) {
    wordBoundaryCharsValue = newWordBoundaryChars;
    wordBoundaryChars = wordBoundaryCharsValue.replaceAll("\\p{Z}", "");
  }

  public String getWordChars() {
    return wordCharsValue;
  }

  @CreoleParameter(comment = "Additional word characters?",
    defaultValue="")
  public void setWordChars(String newWordChars) {
    wordCharsValue = newWordChars;   
    wordChars = wordCharsValue.replaceAll("\\p{Z}", "");
  }

  @CreoleParameter(comment = "Non-editable and memory saving mode, also enables FSM re-use for multiple copies", defaultValue="false")
  public void setMemorySavingMode(Boolean yesno) {
    memorySavingMode = yesno;
  }
  public Boolean getMemorySavingMode() {
    return memorySavingMode;
  }
  protected Boolean memorySavingMode;

  @CreoleParameter(comment = "Canonicalize strings - save memory with many repeated feature value strings", defaultValue="false")
  public void setCanonicalizeStrings(Boolean yesno) {
    canonicalizeStrings = yesno;
  }
  public Boolean getCanonicalizeStrings() {
    return canonicalizeStrings;
  }
  protected Boolean canonicalizeStrings = false;

  @RunTime
  @Optional
  @CreoleParameter(comment = "Type name of output annotations to create. Overwrites the definition in the def file, if present!",
    defaultValue = "")
  public void setAnnotationTypeName(String value) {
    annotationTypeName = value;
  }
  public String getAnnotationTypeName() {
    return annotationTypeName;
  }
  protected String annotationTypeName = "";
 
  // in the program, we use only this method to find the annotation type
  // name to assign: if the annotation type is set as a runtime parameter,
  // always use that, otherwise use the one we get from the gazetteer list
  // specification in the def file.
  public String getAnnotationTypeName(String defaultType) {
    if(annotationTypeName == null || annotationTypeName.isEmpty()) {
      return defaultType;
    } else {
      return annotationTypeName;
    }
  }
 
 
  @RunTime
  @Optional
  @CreoleParameter(comment = "Annotation setname.typename - only within these, the annotation will be done")
  public void setContainingAnnotationSpec(String spec) {
    containingAnnotationSpec = spec;
  }
  public String getContainingAnnotationSpec() {
    return containingAnnotationSpec;
  }
  // Raw "setname.typename" specification as set through the parameter.
  protected String containingAnnotationSpec = null;
  // Type part of the specification; recomputed by doExecute() on every run.
  protected String containingAnnotationType = null;
  // Set part of the specification; recomputed by doExecute() on every run.
  protected String containingAnnotationSet = null;

  // Feature separator after unescaping; computed in incrementFSM() and
  // handed to every GazetteerNode created in loadListFile().
  protected String unescapedSeparator = null;

  // Registry of the state machines loaded in memory saving mode, keyed by
  // genUniqueFSMKey(). Static, i.e. shared by all instances of this class.
  protected static Map<String,FSM> loadedGazetteers = new HashMap<String,FSM>();

  // Logger named after the concrete class; created in the constructor.
  protected Logger logger;

  protected class FSM {
    public Set<FSMState> fsmStates;
    public FSMState initialState;
    public FSM(Set<FSMState> states, FSMState init) {
      fsmStates = states;
      initialState = init;
    }
    public int refcount = 0;
  }

  // TODO: for now, make this a parameter?
  // NOTE(review): not referenced in the visible part of this file; matching
  // in doAnnotateRegion() uses Character.toUpperCase(char), which does not
  // take a locale - confirm where this field is used.
  protected Locale caseConversionLocale = Locale.ENGLISH;

  // Body of a regular expression character class listing the characters
  // that are treated as whitespace. The doubled backslashes keep the \\uXXXX
  // escapes intact so that the regex engine, not the Java compiler,
  // interprets them.
  private static final String ws_chars =
    "\\u0009" // CHARACTER TABULATION
                        + "\\u000A" // LINE FEED (LF)
                        + "\\u000B" // LINE TABULATION
                        + "\\u000C" // FORM FEED (FF)
                        + "\\u000D" // CARRIAGE RETURN (CR)
                        + "\\u0020" // SPACE
                        + "\\u0085" // NEXT LINE (NEL)
                        + "\\u00A0" // NO-BREAK SPACE
                        + "\\u1680" // OGHAM SPACE MARK
                        + "\\u180E" // MONGOLIAN VOWEL SEPARATOR
                        + "\\u2000" // EN QUAD
                        + "\\u2001" // EM QUAD
                        + "\\u2002" // EN SPACE
                        + "\\u2003" // EM SPACE
                        + "\\u2004" // THREE-PER-EM SPACE
                        + "\\u2005" // FOUR-PER-EM SPACE
                        + "\\u2006" // SIX-PER-EM SPACE
                        + "\\u2007" // FIGURE SPACE
                        + "\\u2008" // PUNCTUATION SPACE
                        + "\\u2009" // THIN SPACE
                        + "\\u200A" // HAIR SPACE
                        + "\\u2028" // LINE SEPARATOR
                        + "\\u2029" // PARAGRAPH SEPARATOR
                        + "\\u202F" // NARROW NO-BREAK SPACE
                        + "\\u205F" // MEDIUM MATHEMATICAL SPACE
                        + "\\u3000" // IDEOGRAPHIC SPACE
                        ;       
  // Character class matching exactly one of the whitespace characters above.
  private static final String ws_class = "[" + ws_chars + "]";
  // Pattern source matching a run of one or more whitespace characters.
  private static final String ws_patternstring = ws_class + "+";
  // Compiled form of ws_patternstring; assigned in init(). According to the
  // comment there it serves to replace whitespace in gazetteer entries with
  // a single space.
  private static Pattern ws_pattern;
 
  /**
   * Build a gazetteer using the default lists from the gate resources
   */
  public AbstractExtendedGazetteer() {
    super();
    logger = Logger.getLogger(this.getClass().getName());
  }

  @Override
  public Resource init() throws ResourceInstantiationException {
    // precompile the pattern used to replace all unicode whitespace in gazetteer
    // entries with a single space.
    ws_pattern = Pattern.compile(ws_patternstring);
    // silently remove all whitespace from the wordChars and also from
    // wordBoundaryChars (because Whitespace will always be a word Boundary)
    //System.out.println("Running init!");
    if(memorySavingMode) {
      incrementFSM();
      return this;
    } else {
      //System.out.println("Default mode!!");
      return super.init();
    }
  }

  protected String genUniqueFSMKey() {
    return
      "wcc="+wordCharsClass +
      " wchs="+wordChars + " wbchs=" +wordBoundaryChars + 
      " cs="+caseSensitive + " url="+ listsURL;
  }

  @Override
  public void cleanup() {
    if(memorySavingMode) {
      decrementFSM();
    }
  }

  private synchronized void incrementFSM() throws ResourceInstantiationException {
    //System.out.println("Memory saving mode!");
    String uniqueFSMKey = genUniqueFSMKey();
    FSM fsm = loadedGazetteers.get(uniqueFSMKey);
    if(fsm != null) {
      // The FSM for this file/parm combination already has been compiled, just
      // reuse it for this PR
      fsmStates = fsm.fsmStates;
      initialState = fsm.initialState;
      fsm.refcount++;
      logger.info("Reusing already generated FSM for "+uniqueFSMKey);
    } else {
      // The FSM for this file/parm combination does not exist yet, create it!
      this.fsmStates = new HashSet();
      initialState = new FSMState(this);
      if (gazetteerFeatureSeparator != null && !gazetteerFeatureSeparator.isEmpty()) {
        unescapedSeparator = Strings.unescape(gazetteerFeatureSeparator);
      }
      try {
        loadData();
      } catch (Exception ex) {
        throw new ResourceInstantiationException("Could not initialize", ex);
      }
      fsm = new FSM(fsmStates,initialState);
      fsm.refcount++;
      loadedGazetteers.put(uniqueFSMKey, fsm);
      logger.info("New FMS loaded for "+uniqueFSMKey);
    }   
  }
 
  private synchronized void decrementFSM() {
    String key = genUniqueFSMKey();
    FSM fsm = loadedGazetteers.get(key);
    fsm.refcount--;
    if (fsm.refcount == 0) {
      loadedGazetteers.remove(key);
      logger.info("Removing FSM for "+key);
    }   
  }

  private synchronized void removeFSM() {
    String key = genUniqueFSMKey();
    loadedGazetteers.remove(key);
    logger.info("reInit(): force-removing FSM for "+key);
  }
 
  @Override
  /**
   * CAUTION: this should only be called when no other PR that may use the FSM is still
   * running in a different thread!
   * This will always force-recreate the FSM from the current LST file, no matter if the
   * FSM is used by another PR.
   */
  public void reInit() throws ResourceInstantiationException {
    // the default for this from DefaultGazettteer is to simply do an init().
    // However, if we are using memory saving mode, a simple reInit() will not
    // actually reload anything if the same filename and parms are used by another
    // PR.
    if(memorySavingMode) {
      removeFSM();
      init();
    } else {
      init();
    }
  }

  protected void loadData() throws UnsupportedEncodingException, IOException, ResourceInstantiationException  {
    BufferedReader defReader =
      new BomStrippingInputStreamReader((listsURL).openStream(), encoding);
    String line;
    //System.out.println("Loading data");
    while (null != (line = defReader.readLine())) {
       String[] fields = line.split(":");
       if(fields.length == 0) {
         System.err.println("Empty line in file "+listsURL);
       } else {
         String listFileName = "";
         String majorType = "";
         String minorType = "";
         String languages = "";
         String annotationType = ANNIEConstants.LOOKUP_ANNOTATION_TYPE;
         listFileName = fields[0];
         if(fields.length > 1) {
           majorType = fields[1];
         }
         if(fields.length > 2) {
           minorType = fields[2];
         }
         if(fields.length > 3) {
           languages = fields[3];
         }
         if(fields.length > 4) {
           annotationType = fields[4];
         }
         if(fields.length > 5) {
           defReader.close();
           throw new ResourceInstantiationException("Line has more that 5 fields in def file "+listsURL);
         }
         logger.debug("Reading from "+listFileName+", "+majorType+"/"+minorType+"/"+languages+"/"+annotationType);
         loadListFile(listFileName,majorType,minorType,languages,annotationType);
      }
    } //while
    defReader.close();
  }

  void loadListFile(String listFileName,String majorType,String minorType,
      String languages, String annotationType)
  throws MalformedURLException, IOException {
    // create a completely new fsm
    // query SOLR
    //System.out.println("Loading list file "+listFileName);
    Lookup defaultLookup = new Lookup(listFileName, majorType, minorType,
            languages, annotationType);
    URL lurl = new URL(listsURL,listFileName);
    BufferedReader listReader =
      new BomStrippingInputStreamReader(lurl.openStream(), encoding);
    String line;
    int lines = 0;
    while (null != (line = listReader.readLine())) {
      GazetteerNode node = new GazetteerNode(line, unescapedSeparator, false);
      Lookup lookup = defaultLookup;
      Map<String,Object> fm = node.getFeatureMap();
      if(fm != null && fm.size() > 0) {
        lookup = new Lookup(listFileName, majorType, minorType,
                languages, annotationType);
        // if the only keys we have in the map are one of majorType, minorType,
        // or language, then just overwrite the values in the new lookup
        // object, we wont need an extra new feature map for that!
        Set<String> keyset = fm.keySet();
        if(keyset.size() <= 4) {
          Map<String,Object> newfm = null;
          for(String key : keyset) {
            if(key.equals("majorType")) {
              String tmp = (String)fm.get("majorType");
              if(canonicalizeStrings) { tmp.intern(); }
              lookup.majorType = tmp;
            } else if(key.equals("minorType")) {
              String tmp = (String)fm.get("minorType");
              if(canonicalizeStrings) { tmp.intern(); }
              lookup.minorType = tmp;
            } else if(key.equals("languages")) {
              String tmp = (String)fm.get("languages");
              if(canonicalizeStrings) { tmp.intern(); }
              lookup.languages = tmp;
            } else if(key.equals("annotationType")) {
              String tmp = (String)fm.get("annotationType");
              if(canonicalizeStrings) { tmp.intern(); }
              lookup.annotationType = tmp;
            } else {
              // some other key, we copy over to the new map
              if(newfm == null) {
                newfm = new HashMap<String,Object>();
              }
              String tmp = (String)fm.get(key);
              if(canonicalizeStrings) { tmp.intern(); }
              newfm.put(key, tmp);
            }
          }
          // if we had to allocate the map, store it
          if(newfm != null) {
            lookup.features = newfm;
          }
        } else {
          if(canonicalizeStrings) {
            for(String key : fm.keySet()) {
              String tmp = (String)fm.get(key);
              tmp.intern();
              fm.put(key,tmp);
            }
          }
          lookup.features = fm;
        }
      }
      //System.out.print("adding lookup for node: "+node);
      addLookup(node.getEntry(), lookup);
      lines++;
    } // while
    logger.debug("Lines read: "+lines);
  }



  /**
   * Tests whether a character is internal to a word (i.e. if it's a letter or
   * a combining mark (spacing or not)).
   * @param ch the character to be tested
   * @return a boolean value
   */
  public boolean isWithinWord(char ch) {
    if (wordCharsClass == WCClass.LETTERS) {
      return Character.isLetter(ch);
    } else if (wordCharsClass == WCClass.LETTERSORHYPHEN) {
      return Character.isLetter(ch) || ch == '-';
    } else if (wordCharsClass == WCClass.DIGITS) {
      return Character.isDigit(ch);
    } else if (wordCharsClass == WCClass.LETTERSORDIGITS) {
      return (Character.isDigit(ch) || Character.isLetter(ch));
    }
    // or, if we have WC_NONWHITESPACE
    if (Character.getType(ch) == Character.COMBINING_SPACING_MARK
      || Character.getType(ch) == Character.NON_SPACING_MARK) {
      return true;
    }
    if (Character.isWhitespace(ch) || Character.isSpaceChar(ch)) {
      return false;
    }

    if (wordChars.indexOf(ch) != -1) {
      return true;
    }
    if (wordBoundaryChars.indexOf(ch) != -1) {
      return false;
    }

    return true;  
  }

   public void execute() throws ExecutionException{
    doExecute(document);
   }

   public void doExecute(Document theDocument) throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if(theDocument == null) {
      throw new ExecutionException(
        "No document to process!"
      );
    }

    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = theDocument.getAnnotations();
    else annotationSet = theDocument.getAnnotations(annotationSetName);

    if(getContainingAnnotationSpec() == null || getContainingAnnotationSpec().isEmpty()) {
      containingAnnotationType = null;
      containingAnnotationSet = null;     
    } else {
      String[] set_type = containingAnnotationSpec.split("\\.",2);
      if(set_type.length == 1) {
        containingAnnotationSet = "";
        containingAnnotationType = set_type[0];
      } else if(set_type.length == 2) {
        containingAnnotationSet = set_type[0];
        containingAnnotationType = set_type[1];
      } else {
        throw new GateRuntimeException("ContaininAnnotationSpec not valid");
      }
    }
   
   
    fireStatusChanged("Performing look-up in " + theDocument.getName() + "...");
    if(containingAnnotationType == null) {
      String content = theDocument.getContent().toString();
      doAnnotateRegion(theDocument,content,content,0,annotationSet);
    } else {
      AnnotationSet containings = theDocument.getAnnotations(containingAnnotationSet);
      containings = containings.get(containingAnnotationType);
      for(Annotation containing : containings) {
        String content = theDocument.getContent().toString();
        String region = gate.Utils.stringFor(theDocument, containing);
        doAnnotateRegion(theDocument,content,region,
          containing.getStartNode().getOffset().intValue(),annotationSet);
      }
    }

    fireProcessFinished();
    fireStatusChanged("Look-up complete!");
  } // execute

   public void doAnnotateRegion(
     Document theDocument,
     String content, String region,
     int regionOffset, AnnotationSet annotationSet)
     throws ExecutionException{
    interrupted = false;
    int length = region.length();
    char currentChar;
    FSMState currentState = initialState;
    FSMState nextState;
    FSMState lastMatchingState = null;
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    int charIdx = 0;
    int oldCharIdx = 0;

    // we do prefix annotation when the parameter requires it
    // and when we either have wholeWordsOnly false, which means
    // we can annotate anywhere as part of a word, or if we
    // have wholeWordsOnly true and in addition also annotate
    // word suffixes (which leaves us with a prefix).
    boolean doPrefixAnnotation =
      getPrefixAnnotations() &&
        (!wholeWordsOnly || getAllowSuffixMatches());
    // analogously for the suffix annotations
    boolean doSuffixAnnotation =
      getSuffixAnnotations() &&
        (!wholeWordsOnly || getAllowPrefixMatches());

    while(charIdx < length) {
      currentChar = region.charAt(charIdx);
      if(!isWithinWord(currentChar)) currentChar = ' ';
      else currentChar = caseSensitive.booleanValue() ?
                          currentChar :
                          Character.toUpperCase(currentChar);
      nextState = currentState.next(currentChar);
      if(nextState == null) {
        //the matching stopped
        //if we had a successful match then act on it;
        if(lastMatchingState != null){
          createLookups(theDocument,lastMatchingState,
            matchedRegionStart+regionOffset,
            matchedRegionEnd+regionOffset,
            annotationSet, doPrefixAnnotation, doSuffixAnnotation,
            content);
          lastMatchingState = null;
        }
        //reset the FSM
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;
        currentState = initialState;
      } else{//go on with the matching
        currentState = nextState;
        // if we have a successful state, i.e. an end state:
        // either we are matching just whole words, then make sure that
        // the begginning of this match and the current end are at word boundaries;
        // or we do not just match whole words, then go on.
        if(currentState.isFinal() &&
           (
            // if !wholeWordsOnly, everything is allowed
            (!wholeWordsOnly)
             ||
            // if wholewordsonly, check start if suffix not allowed
            // and check end if prefix not allowed
            // In other words, either suffix is allowed or check start
            // and either prefix is allowed or check end
            (!allowSuffixMatches && !allowPrefixMatches &&
              (matchedRegionStart == 0 ||
               !isWithinWord(region.charAt(matchedRegionStart - 1))) &&
              (charIdx + 1 >= region.length()   ||
                !isWithinWord(region.charAt(charIdx + 1)))
            ) ||
            (!allowSuffixMatches && allowPrefixMatches &&
              (matchedRegionStart == 0 ||
               !isWithinWord(region.charAt(matchedRegionStart - 1))))
             ||
             (!allowPrefixMatches && allowSuffixMatches &&
              (charIdx + 1 >= region.length()   ||
               !isWithinWord(region.charAt(charIdx + 1))))
             ||
             (allowPrefixMatches && allowSuffixMatches &&
                ( ( (matchedRegionStart == 0 ||
                     !isWithinWord(region.charAt(matchedRegionStart - 1))) &&
                    (charIdx + 1 < region.length()   &&
                     isWithinWord(region.charAt(charIdx + 1)))) ||
                  ( (matchedRegionStart > 0 ||
                     isWithinWord(region.charAt(matchedRegionStart - 1))) &&
                    (charIdx + 1 >= region.length()   ||
                     !isWithinWord(region.charAt(charIdx + 1))))
                )               
             )
           )
          ){
            // no that we are in here we have one additional situation still
            // not checked: if wholeWordsOnly and disallowWhole we
            // prevent the following, if we have indeed a whole word check.
            // that way we can create gazetteer runs where only prefixes and/or
            // suffixes are allowed
            if(wholeWordsOnly && disallowWholeMatches &&
               (matchedRegionStart == 0 ||
                !isWithinWord(region.charAt(matchedRegionStart - 1))) &&
               (charIdx + 1 >= region.length()   ||
                !isWithinWord(region.charAt(charIdx + 1)))) {
              // do not do anything !!!
            } else {
          //we have a new match
          // if there is a previous matching state to act upon and we do not
          // just annotate the longest match, then annotate that previous
          // match before updating the last matching state.
          // TODO: all matches here will share the prefix with the longest match
          // and all matches will have a suffix ... should we annotate those
          // seperately?
          // for now, yes we will add all the prefix and suffix annotations
          if(!longestMatchOnly && lastMatchingState != null){
            createLookups(theDocument,lastMatchingState,
              matchedRegionStart+regionOffset,
              matchedRegionEnd+regionOffset,
              annotationSet,
              doPrefixAnnotation, doSuffixAnnotation,
              content);
          }
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
            } // check for wholeWordsOnly, disallowWholeMatches and whole match
        }
        charIdx ++;
        if(charIdx == region.length()){
          //we can't go on, use the last matching state and restart matching
          //from the next char
          if(lastMatchingState != null){
            //let's add the new annotation(s)
            createLookups(theDocument,lastMatchingState,
              matchedRegionStart+regionOffset,
              matchedRegionEnd+regionOffset,
              annotationSet,
              doPrefixAnnotation, doSuffixAnnotation,
              content);
            lastMatchingState = null;
          }
          //reset the FSM
          charIdx = matchedRegionStart + 1;
          matchedRegionStart = charIdx;
          currentState = initialState;
        }
      }
      //fire the progress event
      if(charIdx - oldCharIdx > 256) {
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException(
            "The execution of the " + getName() +
            " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)
    //we've finished. If we had a stored match, then apply it.
    if(lastMatchingState != null) {
      createLookups(theDocument,lastMatchingState,
        matchedRegionStart+regionOffset,
        matchedRegionEnd+regionOffset,
        annotationSet,
        doPrefixAnnotation, doSuffixAnnotation,
        content);
    }
    fireProcessFinished();
    fireStatusChanged("Look-up complete!");
  } // execute
 
 
 
 
  protected void createLookups(
    Document theDocument,
    FSMState matchingState, int matchedRegionStart,
          int matchedRegionEnd, AnnotationSet annotationSet,
          boolean doPrefixAnnotation, boolean doSuffixAnnotation,
          String content)
  {
    Iterator<Lookup> lookupIter = matchingState.getLookupSet().iterator();
    while(lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup)lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if (null!= currentLookup.oClass && null!=currentLookup.ontology){
        fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
      }

      if(null != currentLookup.minorType)
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
      if(null != currentLookup.languages)
        fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
      if(null != currentLookup.features) {
        fm.putAll(currentLookup.features);
      }
      // added features for ExtendedGazetteer
      fm.put("firstcharCategory", Character.getType(content.charAt(matchedRegionStart)));
      if (Character.isUpperCase(content.charAt(matchedRegionStart))) {
        fm.put("firstcharUpper", true);
      } else {
        fm.put("firstcharUpper", false);
      }
      boolean atEnd = false;
      boolean atBeginning = false;
      boolean isWholeWord = false;
      if (!isWithinWord(content.charAt(Math.min(content.length() - 1, matchedRegionEnd + 1)))) {
        fm.put("atEnd", true);
        atEnd = true;
      } else {
        fm.put("atEnd", false);
      }
      if (matchedRegionStart > 0 && isWithinWord(content.charAt(matchedRegionStart - 1))) {
        fm.put("atBeginning", false);
      } else {
        fm.put("atBeginning", true);
        atBeginning = true;
      }
      if(atBeginning && atEnd) { isWholeWord = true; }
      fm.put("wholeWord", isWholeWord);
      // this would not be necessary any more now that JAPE can handle the
      // @string metafeature but we keep it for backwards compatibility
      if (includeStrings) {
        fm.put("string", content.substring(matchedRegionStart, matchedRegionEnd + 1));
      }
      if (null != currentLookup.features) {
        fm.putAll(currentLookup.features);
      }
      Integer lookupid =
        addAnAnnotation(theDocument,annotationSet, matchedRegionStart, matchedRegionEnd+1,
          getAnnotationTypeName(currentLookup.annotationType), fm);
      // prefix annotation, if wanted: starting with the region start,
      // go back, if possible while we still have word characters and annotate
      // those. Link the prefix to the lookup annotation above.
      //
      if(doPrefixAnnotation) {
        int prefixLength = determinePrefixLength(matchedRegionStart,content);
        if(prefixLength > 0) {
          // Annotate from matchedRegionStart-prefixLength to matchedRegionStart
          int prefixFrom = matchedRegionStart-prefixLength;
          int prefixTo = matchedRegionStart;
          FeatureMap fmprefix = Factory.newFeatureMap();
          fmprefix.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
          if (null != currentLookup.oClass && null != currentLookup.ontology) {
            fmprefix.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
            fmprefix.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
          }
          if (null != currentLookup.minorType) {
            fmprefix.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
            if (null != currentLookup.languages) {
              fmprefix.put("language", currentLookup.languages);
            }
          }
          fmprefix.put("firstcharCategory", Character.getType(content.charAt(prefixFrom)));
          if (Character.isUpperCase(content.charAt(prefixFrom))) {
            fmprefix.put("firstcharUpper", true);
          } else {
            fmprefix.put("firstcharUpper", false);
          }
          if (includeStrings) {
            fmprefix.put("string", content.substring(prefixFrom, prefixTo));
          }
          if (null != currentLookup.features) {
            fmprefix.putAll(currentLookup.features);
          }
          Integer prefid =
            addAnAnnotation(theDocument,annotationSet, prefixFrom, prefixTo,
            getAnnotationTypeName(currentLookup.annotationType) + "_prefix", fmprefix);
          fm.put("prefix_id", prefid);
          fmprefix.put("lookup_id", lookupid);
        }
      }
      // suffix annotation, if wanted: starting with the character after the
      // region end, go forward, if possible, while we still have word
      // characters and annotate those. Link the suffix to the lookup annotation
      // above.
      if (doSuffixAnnotation) {
        int suffixLength = determineSuffixLength(matchedRegionEnd, content);
        if (suffixLength > 0) {
          // Annotate from matchedRegionEnd+1 to matchedRegionEnd+1+suffixlength
          int suffixFrom = matchedRegionEnd + 1;
          int suffixTo = matchedRegionEnd + 1 + suffixLength;
          FeatureMap fmsuffix = Factory.newFeatureMap();
          fmsuffix.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
          if (null != currentLookup.oClass && null != currentLookup.ontology) {
            fmsuffix.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
            fmsuffix.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
          }
          if (null != currentLookup.minorType) {
            fmsuffix.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
            if (null != currentLookup.languages) {
              fmsuffix.put("language", currentLookup.languages);
            }
          }
          fmsuffix.put("string", content.substring(suffixFrom, suffixTo));
          if (null != currentLookup.features) {
            fmsuffix.putAll(currentLookup.features);
          }
          Integer sufid =
            addAnAnnotation(theDocument,annotationSet, suffixFrom, suffixTo,
            getAnnotationTypeName(currentLookup.annotationType) + "_suffix", fmsuffix);
          fm.put("suffix_id", sufid);
          fmsuffix.put("lookup_id", lookupid);
        }
      }
    }//while(lookupIter.hasNext())
  }

  // given the region start index of a lookup, find the length of any prefix,
  // or return 0 if no prefix is present.
  protected int determinePrefixLength(int regionStart, String content) {
    int length = 0;
    while( regionStart-length-1 >= 0 // as long as not crossing the beginning
      && isWithinWord(content.charAt(regionStart-length-1))
      )
      length++;
    return length;
  }

  // helper method that adds an annotation to an annotation set and if the
  // annotationset is a AnnotationSetImpl (currently all are, but who knows ...),
  // returns the id of the annotation, otherwise null.
  // This allows to link prefix, suffix and lookup annotations by their IDs
  protected Integer addAnAnnotation(Document theDocument,
    AnnotationSet set, int from, int to, String type, FeatureMap fm) {
    Integer id = null;
    if (set instanceof AnnotationSetImpl) {
      AnnotationSetImpl setasannimpl = (AnnotationSetImpl) set;
      try {
        id = setasannimpl.add(new Long(from), new Long(to), type, fm);
      } catch (InvalidOffsetException ex) {
        throw new GateRuntimeException("Invalid offset exception - doclen/from/to="
          + theDocument.getContent().size() + "/" + from + "/" + to, ex);
      }
    } else {
      try {
        set.add(new Long(from), new Long(to), type, fm);
      } catch (InvalidOffsetException ex) {
        throw new GateRuntimeException("Invalid offset exception - doclen/from/to="
          + theDocument.getContent().size() + "/" + from + "/" + to + " / ", ex);
      }
    }
    return id;
  }


  // given the region end index of a lookup (the index of the last character
  // of the lookup) find the length of any suffix, or return 0 if no suffix
  // is present.
  protected int determineSuffixLength(int regionEnd, String content) {
    int length = 0;
    while( regionEnd+length+1 < content.length()
      && isWithinWord(content.charAt(regionEnd+length+1))
      )
      length++;
    return length;
  }




  @Override
  public void addLookup(String text, Lookup lookup) {
    // 1) instead of translating every character that is not within a word
    // on the fly when adding states, first normalize the text string and then
    // trim it. If the resulting word is empty, skip the whole processing because
    // the original consisted only of characters that are not word characters!
    // 2) if something remains and we want not exact case matching, convert
    // the whole string to both only upper and only lower case first, then
    // compare the lengths. If the lengths differ, add both in addition to
    // the original!
   
    int textLen = text.length();
    char[] textChars = new char[textLen];
    for(int i = 0; i < textLen; i++) {
      if(isWithinWord(text.charAt(i))) {
        textChars[i] = text.charAt(i);
      } else {
        textChars[i] = ' ';
      }
    }
   
    String textNormalized = new String(textChars).trim();
    // convert anything that is a sequence of whitespace to a single space
    // WAS: textNormalized = textNormalized.replaceAll("  +", " ");
    textNormalized = ws_pattern.matcher(textNormalized).replaceAll(" ");
    if(textNormalized.isEmpty()) {
      //System.out.println("Ignoring, is empty");
      return;
    }
   
    // TODO: at some point this should get changed to allow for both totally
    // ignoring case (as now) and for matching either the original or a
    // case-normalization (in the runtime). This would also need a setting
    // for specifying what the case normalization should be (e.g. UPPERCASE).
   
    // For now, we always normalize to upper case when case is ignored.
    // The gazetteer should contain lowercase or firstCaseUpper words, but
    // better not ALLCAPS in order for lower case characters which get mapped
    // to two characters in uppercase to be mapped correctly.
    // For these special cases, we add two UPPERCASE normalizations:
    // the one with the two characters and the one where the char.toUpperCase
    // is used.
    if(!caseSensitive) {
      String textNormalizedUpper = textNormalized.toUpperCase(caseConversionLocale);
      if(textNormalizedUpper.length() != textNormalized.length()) {
        actualAddLookup(textNormalizedUpper, lookup);
        char[] textChars2 = new char[textNormalized.length()];
        for(int i=0; i<textNormalized.length(); i++) {
          textChars2[i] = Character.toUpperCase(textNormalized.charAt(i));
        }
        actualAddLookup(new String(textChars2), lookup);
      } else {
        // if both version are of the same length, it is sufficient to add the
        // upper case version
        actualAddLookup(textNormalizedUpper, lookup);
      }
    } else {
      actualAddLookup(textNormalized, lookup);
    }
   

  } // addLookup

  public void actualAddLookup(String text, Lookup lookup) {
    char currentChar;
    FSMState currentState = initialState;
    FSMState nextState;
    Lookup oldLookup;

    //System.out.println("Adding "+text+"|"+lookup);
   
    for(int i = 0; i< text.length(); i++) {
        currentChar = text.charAt(i);
      nextState = currentState.next(currentChar);
      if(nextState == null){
        nextState = new FSMState(this);
        currentState.put(currentChar, nextState);
        if(currentChar == ' ') nextState.put(' ',nextState);
      }
      currentState = nextState;
    } //for(int i = 0; i< text.length(); i++)

    currentState.addLookup(lookup);
    //System.out.println("text=>"+text + "<, " + lookup.majorType + "|" + lookup.minorType);

  } // addLookup

  /**
   * Our way to duplicate the PR differs from how DefaultGazetteer, from which we inherit
   * does it, so this method overrides the inherited behavior, we simply create a new instance
   * of the gazetteer, FSM sharing will be done automatically if memory saving mode is
   * true.
   */
  @Override
  public Resource duplicate(Factory.DuplicationContext ctx)
      throws ResourceInstantiationException {
    return Factory.defaultDuplicate(this, ctx);
  }
 

} // ExtendedGazetteer
TOP

Related Classes of at.ofai.gate.extendedgazetteer.AbstractExtendedGazetteer$FSM

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.