// Source code of com.tamingtext.qa.AnswerTypeContextGenerator

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */


package com.tamingtext.qa;


import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;


import net.didion.jwnl.JWNLException;
import opennlp.tools.coref.mention.JWNLDictionary;
import opennlp.tools.parser.Parse;


/**
 * Identify the features of the question by leveraging Wordnet and using the parse of the question
 * and the question word (how, why, what, etc.) to output a context, which is the set
 * of features that can be passed to the model.  Can use Wordnet to examine what words are
 * also predictive by looking at the synsets.
 *
 * It is a rule-based feature selector
 */
public class AnswerTypeContextGenerator {
  
  private static final Pattern falseHeadsPattern = Pattern.compile("^(name|type|kind|sort|form|one|breed|names|variety)$");
  private static final Pattern copulaPattern = Pattern.compile("^(is|are|'s|were|was|will)$");
  private static final Pattern queryWordPattern = Pattern.compile("^(who|what|when|where|why|how|whom|which|name)$");
  private static final Pattern useFocusNounPattern = Pattern.compile("^(who|what|which|name)$");
  private static final Pattern howModifierTagPattern = Pattern.compile("^(JJ|RB)");
  private JWNLDictionary wordnet;


  /**
   *
   * @param dictDir The dict directory under the wordnet dir
   */
  public AnswerTypeContextGenerator(File dictDir) {
    try {
      wordnet = new JWNLDictionary(dictDir.getAbsolutePath());
    }
    catch (IOException e) {
      throw new RuntimeException("Unable to initalize",e);
    }
    catch (JWNLException e) {
      throw new RuntimeException("Unable to initalize",e);
    }
  }


  private boolean isQueryWord(String word) {
    return(queryWordPattern.matcher(word).matches());
  }


  private int movePastPrep(int i, Parse[] toks) {
    if (i < toks.length && (toks[i].toString().equals("of") || toks[i].toString().equals("for"))) {
      i++;
    }
    return (i);
  }


  private  int movePastOf(int i, Parse[] toks) {
    if (i < toks.length && toks[i].toString().equals("of")) {
      i++;
    }
    return (i);
  }


  private int movePastCopula(int i, Parse[] toks) {
    if (i < toks.length && toks[i].getType().startsWith("V")) {
      if (copulaPattern.matcher(toks[i].toString()).matches()) {
        i++;
      }
    }
    return (i);
  }


  
  private Parse[] getNounPhrases(Parse parse) {
    List<Parse> nps = new ArrayList<Parse>(10);
    List<Parse> parts = new ArrayList<Parse>();
    parts.add(parse);
    while (parts.size() > 0) {
      List<Parse> newParts = new ArrayList<Parse>();
      for (int pi=0,pn=parts.size();pi<pn;pi++) {
        Parse cp = parts.get(pi);
        if (cp.getType().equals("NP") && cp.isFlat()) {
          nps.add(cp);
        }
        else if (!cp.isPosTag()) {
          newParts.addAll(Arrays.asList(cp.getChildren()));
        }
      }
      parts = newParts;
    }
    return nps.toArray(new Parse[nps.size()]);
  }
  
  private Parse getContainingNounPhrase(Parse token) {
    Parse parent = token.getParent();
    if (parent.getType().equals("NP")) {
      return parent;
    }
    return null;
  }
  
  private int getTokenIndexFollowingPhrase(Parse p,Parse[] toks) {
    Parse[] ptok = p.getTagNodes();
    Parse lastToken = ptok[ptok.length-1];
    for (int ti=0,tl=toks.length;ti<tl;ti++) {
      if (toks[ti] == lastToken) {
        return(ti+1);
      }
    }
    return(toks.length);
  }


  
  private Parse findFocusNounPhrase(String queryWord, int qwi, Parse[] toks) {
    if (queryWord.equals("who")) {
      int npStart = movePastCopula(qwi + 1, toks);
      if (npStart > qwi + 1) { // check to ensure there is a copula
        Parse np = getContainingNounPhrase(toks[npStart]);
        if (np != null) {
          return (np);
        }
      }
    }
    else if (queryWord.equals("what")) {
      int npStart = movePastCopula(qwi + 1, toks);
      Parse np = getContainingNounPhrase(toks[npStart]);
      //removed copula case 
      if (np != null) {
        Parse head = np.getHead();
        if (falseHeadsPattern.matcher(head.toString()).matches()) {
          npStart += np.getChildren().length;
          int np2Start = movePastPrep(npStart, toks);
          if (np2Start > npStart) {
            Parse snp = getContainingNounPhrase(toks[np2Start]);
            if (snp != null) {
              return (snp);
            }
          }
        }
        return (np);
      }
    }
    else if (queryWord.equals("which")) {
      //check for false query words like which VBD
      int npStart = movePastCopula(qwi + 1, toks);
      if (npStart > qwi + 1) {
        return (getContainingNounPhrase(toks[npStart]));
      }
      else {
        npStart = movePastOf(qwi + 1, toks);
        return (getContainingNounPhrase(toks[npStart]));
      }
    }
    else if (queryWord.equals("how")) {
      if (qwi + 1 < toks.length) {
        return (getContainingNounPhrase(toks[qwi + 1]));
      }
    }
    else if (qwi == 0 && queryWord.equals("name")) {
      int npStart = qwi + 1;
      Parse np = getContainingNounPhrase(toks[npStart]);
      if (np != null) {
        Parse head = np.getHead();
        if (falseHeadsPattern.matcher(head.toString()).matches()) {
          npStart += np.getChildren().length;
          int np2Start = movePastPrep(npStart, toks);
          if (np2Start > npStart) {
            Parse snp = getContainingNounPhrase(toks[np2Start]);
            if (snp != null) {
              return (snp);
            }
          }
        }
        return (np);
      }
    }
    return (null);
  }
  
  private String[] getLemmas(Parse np) {
    // make sure we're getting a single word.
    String word = np.getHead().toString().toLowerCase();
    return wordnet.getLemmas(word,"NN");
  }
  
  private Set<String> getSynsetSet(Parse np) {


    Set<String> synsetSet = new HashSet<String>();
    String[] lemmas = getLemmas(np);
    for (int li = 0; li < lemmas.length; li++) {
      String[] synsets = wordnet.getParentSenseKeys(lemmas[li],"NN",0);
      for (int si=0,sn=synsets.length;si<sn;si++) {
        synsetSet.add(synsets[si]);
      }
    }
    return (synsetSet);
  }


  private void generateWordNetFeatures(Parse focusNoun, List<String> features) {


    Parse[] toks = focusNoun.getTagNodes();
    if (toks[toks.length - 1].getType().startsWith("NNP")) {
      return;
    }
    //check wordnet
    Set<String> synsets = getSynsetSet(focusNoun);


    for (String synset : synsets) {
      features.add("s=" + synset);
    }
  }


  private void generateWordFeatures(Parse focusNoun, List<String> features) {
    Parse[] toks = focusNoun.getTagNodes();
    int nsi = 0;
    for (; nsi < toks.length - 1; nsi++) {
      features.add("mw=" + toks[nsi]);
      features.add("mt=" + toks[nsi].getType());
    }
    features.add("hw=" + toks[nsi]);
    features.add("ht=" + toks[nsi].getType());
  }
  
  public String[] getContext(Parse query) {
    Parse focalNoun = null;
    String queryWord = null;
    List<String> features = new ArrayList<String>();
    features.add("def");
    Parse[] nps = getNounPhrases(query);
    Parse[] toks = query.getTagNodes();
    int fnEnd = 0;
    int i = 0;
    boolean fnIsLast = false;
    for (; i < toks.length; i++) {
      String tok = toks[i].toString().toLowerCase();
      if (isQueryWord(tok)) {
        queryWord = tok;
        focalNoun = findFocusNounPhrase(queryWord, i, toks);
        if (tok.equals("how") && i+1 < toks.length) {
          if (howModifierTagPattern.matcher(toks[i+1].getType()).find()) {
            queryWord=tok+"_"+toks[i+1].toString();
          }
        }
        if (focalNoun != null) {
          fnEnd = getTokenIndexFollowingPhrase(focalNoun,toks);
        }
        if (focalNoun != null && focalNoun.equals(nps[nps.length - 1])) {
          fnIsLast  = true;
        }
        break;
      }
    }
    int ri = i + 1;
    if (focalNoun != null) {
      ri = fnEnd + 1;
    }
    for (; ri < toks.length; ri++) {
      features.add("rw=" + toks[ri].toString());
    }
    if (queryWord != null) {
      features.add("qw=" + queryWord);
      String verb = null;
      //skip first verb for some query words like how much
      for (int vi = i + 1; vi < toks.length; vi++) {
        String tag = toks[vi].getType();
        if (tag != null && tag.startsWith("V")) {
          verb = toks[vi].toString();
          break;
        }
      }
      if (focalNoun == null) {
        features.add("qw_verb=" + queryWord + "_" + verb);
        features.add("verb="+ verb);
        features.add("fn=null");
      }
      else if (useFocusNounPattern.matcher(queryWord).matches()) {
        generateWordFeatures(focalNoun, features);
        generateWordNetFeatures(focalNoun, features);
      }
      if (fnIsLast) {
        features.add("fnIsLast=" + fnIsLast);
      }
    }
    return(features.toArray(new String[features.size()]));
  }
 
}
// Source code of com.tamingtext.qa.AnswerTypeContextGenerator

// Related classes of com.tamingtext.qa.AnswerTypeContextGenerator