Package edu.stanford.nlp.trees.international.spanish

Source Code of edu.stanford.nlp.trees.international.spanish.SpanishTreebankLanguagePack

package edu.stanford.nlp.trees.international.spanish;

import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
import edu.stanford.nlp.trees.CollinsHeadFinder;
import edu.stanford.nlp.trees.HeadFinder;


/**
* Language pack for the Spanish treebank.
*
* @author mcdm
*/
public class SpanishTreebankLanguagePack extends AbstractTreebankLanguagePack {

  private static final long serialVersionUID = -7059939700276532428L;

  // The AnCora treebank is distributed in ISO 8859-1 XML, but the
  // processed treebank (PTB-style) is UTF-8
  public static final String STB_ENCODING = "ISO8859_1";

  private static final String[] punctTags = {
    "faa", "fat", "fc", "fca", "fct", "fd", "fe", "fg", "fh", "fia", "fit",
    "fla", "flt", "fp", "fpa", "fpt", "fra", "frc", "fs", "ft", "fx", "fz",
    "f0"
  };

  private static final String[] sentenceFinalPunctTags = {
    "fat", "fit", "fp", "fs"
  };

  private static final String[] punctWords = {
    "¡", "!", ",", "[", "]", ":", "\"", "-", "/", "¿", "?", "{", "}", ".",
    "=LRB=", "=RRB=", "«", "»", "…", "...", "%", ";", "_", "+", "=", "&", "@"
  };

  private static final String[] sentenceFinalPunctWords = {
    "!", "?", ".", "…", "..."
  };

  private static final String[] startSymbols = {"ROOT"};

  private static final char[] annotationIntroducingChars = {'^', '[', '-'};

  /**
   * Return the input Charset encoding for the Treebank. See
   * documentation for the <code>Charset</code> class.
   *
   * @return Name of Charset
   */
  @Override
  public String getEncoding() {
    return STB_ENCODING;
  }

  /**
   * Return a tokenizer which might be suitable for tokenizing text that will be used with this
   * Treebank/Language pair, without tokenizing carriage returns (i.e., treating them as white
   * space).  The implementation in AbstractTreebankLanguagePack returns a factory for {@link
   * WhitespaceTokenizer}.
   *
   * @return A tokenizer
   */
  @Override
  public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
    return SpanishTokenizer.factory(new CoreLabelTokenFactory(),
        "invertible,ptb3Escaping=true,splitAll=true");
  }

  /**
   * Returns a String array of punctuation tags for this treebank/language.
   *
   * @return The punctuation tags
   */
  @Override
  public String[] punctuationTags() {
    return punctTags;
  }


  /**
   * Returns a String array of punctuation words for this treebank/language.
   *
   * @return The punctuation words
   */
  @Override
  public String[] punctuationWords() {
    return punctWords;
  }


  /**
   * Returns a String array of sentence final punctuation tags for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  @Override
  public String[] sentenceFinalPunctuationTags() {
    return sentenceFinalPunctTags;
  }

  /**
   * Returns a String array of sentence final punctuation words for this
   * treebank/language.
   *
   * @return The sentence final punctuation tags
   */
  public String[] sentenceFinalPunctuationWords() {
    return sentenceFinalPunctWords;
  }

  /**
   * Return an array of characters at which a String should be truncated to give the basic syntactic
   * category of a label. The idea here is that Penn treebank style labels follow a syntactic
   * category with various functional and crossreferencing information introduced by special
   * characters (such as "NP-SBJ=1").  This would be truncated to "NP" by the array containing '-'
   * and "=".
   *
   * @return An array of characters that set off label name suffixes
   */
  @Override
  public char[] labelAnnotationIntroducingCharacters() {
    return annotationIntroducingChars;
  }

  /**
   * Returns a String array of treebank start symbols.
   *
   * @return The start symbols
   */
  @Override
  public String[] startSymbols() {
    return startSymbols;
  }

  /**
   * Returns the extension of treebank files for this treebank.
   */
  public String treebankFileExtension() {
    return "xml";
  }

  /** {@inheritDoc} */
  public HeadFinder headFinder() {
    return new SpanishHeadFinder(this);
  }

  /** {@inheritDoc} */
  public HeadFinder typedDependencyHeadFinder() {
    return new SpanishHeadFinder(this);
  }

}
TOP

Related Classes of edu.stanford.nlp.trees.international.spanish.SpanishTreebankLanguagePack

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.