Package edu.stanford.nlp.international.arabic.pipeline

Source Code of edu.stanford.nlp.international.arabic.pipeline.LDCPosMapper

package edu.stanford.nlp.international.arabic.pipeline;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;

/**
* Maps pre-terminal ATB morphological analyses to the shortened Bies tag set.
*
* @author Spence Green
*
*/
public class LDCPosMapper implements Mapper {

  protected Pattern startOfTagMap = Pattern.compile("\\(tag-map");
  protected Pattern endOfTagMap = Pattern.compile("^\\s*\\)\\s*$");
  protected Pattern mapping = Pattern.compile("\\((\\S+)\\s+(\\S+)\\)\\s*$");
  protected int numExpectedTokens = 2;

  private boolean addDT = false;
  private final Pattern determiner = Pattern.compile("DET");
  private final Pattern nounBaseTag = Pattern.compile("NN");
  private final Pattern adjBaseTag = Pattern.compile("JJ");
  private final Pattern LDCdeterminer = Pattern.compile("DT\\+");

  protected final Map<String,String> tagMap;
  protected final Set<String> tagsToEscape;

  public LDCPosMapper() {
    this(false);
  }

  public LDCPosMapper(boolean addDeterminer) {
    addDT = addDeterminer;
    tagMap = Generics.newHashMap();

    //Pre-terminal tags that do not appear in LDC tag maps
    tagsToEscape = Generics.newHashSet();
    tagsToEscape.add("-NONE-");             //Traces
    tagsToEscape.add("PUNC");               //Punctuation
  }

  /**
   *
   * @param posTag The preterminal tag
   * @param terminal The optional terminal, which may be used for context
   */
  public String map(String posTag, String terminal) {
    String rawTag = posTag.trim();

    if(tagMap.containsKey(rawTag))
      return tagMap.get(rawTag);
    else if(tagsToEscape.contains(rawTag))
      return rawTag;

    System.err.printf("%s: No mapping for %s%n", this.getClass().getName(),rawTag);

    return rawTag;
  }

  //Modifies the shortened tag based on information contained in the longer tag
  private String processShortTag(String longTag, String shortTag) {
    if(shortTag == null) return null;

    //Hacks to make p5+ mappings compatible with p1-3
    if(shortTag.startsWith("DT+"))
      shortTag = LDCdeterminer.matcher(shortTag).replaceAll("");
    if(longTag.equals("NUMERIC_COMMA"))
      shortTag = "PUNC";

    //As recommended by (Kulick et al., 2006)
    if(addDT && (longTag != null)) {
      Matcher detInLongTag = determiner.matcher(longTag);
      Matcher someKindOfNoun = nounBaseTag.matcher(shortTag);
      Matcher someKindOfAdj = adjBaseTag.matcher(shortTag);

      if(detInLongTag.find() && (someKindOfNoun.find() || someKindOfAdj.find()))
        shortTag = "DT" + shortTag.trim();
    }

    if(tagMap.containsKey(longTag)) {
      String existingShortTag = tagMap.get(longTag);
      if(!existingShortTag.equals(shortTag))
        System.err.printf("%s: Union of mapping files will cause overlap for %s (current: %s new: %s)%n", this.getClass().getName(),longTag,existingShortTag,shortTag);
      return existingShortTag;
    }

    return shortTag;
  }

  public void setup(File path, String... options) {
    if(path == null || !path.exists()) return;

    LineNumberReader reader = null;
    try {
      reader = new LineNumberReader(new FileReader(path));
      boolean insideTagMap = false;
      for(String line; (line = reader.readLine()) != null; ) {
        line = line.trim();

        Matcher isStartSymbol = startOfTagMap.matcher(line);
        insideTagMap = (isStartSymbol.matches() || insideTagMap);

        if(insideTagMap) {
          //Comment line
          if(line.startsWith(";")) continue;

          Matcher mappingLine = mapping.matcher(line);
          if(mappingLine.find()) {
            if(mappingLine.groupCount() == numExpectedTokens) {
              String finalShortTag = processShortTag(mappingLine.group(1),mappingLine.group(2));
              tagMap.put(mappingLine.group(1), finalShortTag);
            }
            else
              System.err.printf("%s: Skipping bad mapping in %s (line %d)%n",this.getClass().getName(),path.getPath(),reader.getLineNumber());
          }

          Matcher isEndSymbol = endOfTagMap.matcher(line);
          if(isEndSymbol.matches()) break;
        }
      }

      reader.close();

    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open mapping file %s%n", this.getClass().getName(),path.getPath());
    } catch (IOException e) {
      int lineNum = (reader == null) ? -1 : reader.getLineNumber();
      System.err.printf("%s: Error reading %s (line %d)%n",this.getClass().getName(),path.getPath(),lineNum);
    }
  }

  public boolean canChangeEncoding(String parent, String element) {
    //POS tags aren't encoded, so no need to check
    return true;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    for (String longTag : tagMap.keySet()) {
      sb.append(longTag).append('\t').append(tagMap.get(longTag)).append('\n');
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    Mapper mapper = new LDCPosMapper(true);
    File mapFile = new File("/u/nlp/data/Arabic/ldc/atb-latest/p1/docs/atb1-v4.0-taglist-conversion-to-PennPOS-forrelease.lisp");
    mapper.setup(mapFile);

    String test1 = "DET+NOUN+NSUFF_FEM_SG+CASE_DEF_ACC";
    String test2 = "ADJXXXXX";
    String test3 = "REL_ADV";
    String test4 = "NUMERIC_COMMA";

    System.out.printf("%s --> %s\n",test1,mapper.map(test1, null));
    System.out.printf("%s --> %s\n",test2,mapper.map(test2, null));
    System.out.printf("%s --> %s\n",test3,mapper.map(test3, null));
    System.out.printf("%s --> %s\n",test4,mapper.map(test4, null));
  }

}
TOP

Related Classes of edu.stanford.nlp.international.arabic.pipeline.LDCPosMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.