Package edu.stanford.nlp.process

Source Code of edu.stanford.nlp.process.WordToSentenceProcessor

package edu.stanford.nlp.process;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Document;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.SequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;

/**
* Transforms a List of words into a List of Lists of words (that is, a List
* of sentences), by grouping the words.  The word stream is assumed to
* already be adequately tokenized, and this class just divides the List into
* sentences, perhaps discarding some separator tokens as it goes.
* <p>
* The main behavior is to look for sentence ending tokens like "." or "?!?",
* and to split after them and any following sentence closers like ")".
* Overlaid on this is an overall choice of state: The WordToSentenceProcessor
* can be a non-splitter, which always returns one sentence. Otherwise, the
* WordToSentenceProcessor will also split based on paragraphs using one of
* these three states: (1) Ignore line breaks in splitting sentences,
* (2) Treat each line as a separate paragraph, or (3) Treat two consecutive
* line breaks as marking the end of a paragraph. The details of sentence
* breaking within paragraphs is controlled based on the following three
* variables:
* <ul>
* <li>sentenceBoundaryTokens are tokens that are left in a sentence, but are
* to be regarded as ending a sentence.  A canonical example is a period.
* If two of these follow each other, the second will be a sentence
* consisting of only the sentenceBoundaryToken.
* <li>sentenceBoundaryFollowers are tokens that are left in a sentence, and
* which can follow a sentenceBoundaryToken while still belonging to
* the previous sentence.  They cannot begin a sentence (except at the
* beginning of a document).  A canonical example is a close parenthesis
* ')'.
* <li>sentenceBoundaryToDiscard are tokens which separate sentences and
* which should be thrown away.  In web documents, a typical example would
* be a '{@code <p>}' tag.  If two of these follow each other, they are
* coalesced: no empty Sentence is output.  The end-of-file is not
* represented in this Set, but the code behaves as if it were a member.
* <li>regionElementRegex A regular expression for element names containing
* a sentence region. Only tokens in such elements will be included in
* sentences. The start and end tags themselves are not included in the
* sentence.
* </ul>
*
* Instances of this class are now immutable. ☺
*
* @author Joseph Smarr (jsmarr@stanford.edu)
* @author Christopher Manning
* @author Teg Grenager (grenager@stanford.edu)
* @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
*
* @param <IN> The type of the tokens in the sentences
*/
public class WordToSentenceProcessor<IN> implements ListProcessor<IN, List<IN>> {

  // todo [cdm Aug 2012]: This should be unified with the PlainTextIterator
  // in DocumentPreprocessor, perhaps by making this one implement Iterator.
  // (DocumentProcessor once used to use this class, but now doesn't....)

  public enum NewlineIsSentenceBreak { NEVER, ALWAYS, TWO_CONSECUTIVE }

  public static final String DEFAULT_BOUNDARY_REGEX = "\\.|[!?]+";
  public static final Set<String> DEFAULT_BOUNDARY_FOLLOWERS = Collections.unmodifiableSet(Generics.newHashSet(
          Arrays.asList(")", "]", "}", "\"", "'", "''", "\u2019", "\u201D", "-RRB-", "-RSB-", "-RCB-", ")", "]", "}")));
  public static final Set<String> DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD = Collections.unmodifiableSet(Generics.newHashSet(
          Arrays.asList(WhitespaceLexer.NEWLINE, PTBLexer.NEWLINE_TOKEN)));

  private static final boolean DEBUG = false;


  /**
   * Regex for tokens (Strings) that qualify as sentence-final tokens.
   */
  private final Pattern sentenceBoundaryTokenPattern;

  /**
   * Regex for multi token sequences that qualify as sentence-final tokens.
   * (i.e. use if you want to sentence split on 2 or more newlines)
   */
  private final SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern;

  /**
   * Set of tokens (Strings) that qualify as tokens that can follow
   * what normally counts as an end of sentence token, and which are
   * attributed to the preceding sentence.  For example ")" coming after
   * a period.
   */
  private final Set<String> sentenceBoundaryFollowers;

  /**
   * List of regex Pattern that are sentence boundaries to be discarded.
   * This is normally newline tokens or representations of them.
   */
  private final Set<String> sentenceBoundaryToDiscard;

  /** Patterns that match the start and end tags of XML elements. These will
   *  be discarded, but taken to mark a sentence boundary.
   *  The value will be null if there are no such elements being used
   *  (for efficiency).
   */
  private final List<Pattern> xmlBreakElementsToDiscard;

  /**
   * List of regex Patterns that are not to be treated as sentence boundaries but should be discarded
   * (i.e. these may have been used with context to identify sentence boundaries but are not needed any more)
   */
  private final List<Pattern> tokenPatternsToDiscard;

  private final Pattern sentenceRegionBeginPattern;

  private final Pattern sentenceRegionEndPattern;

  private final NewlineIsSentenceBreak newlineIsSentenceBreak;

  private final boolean isOneSentence;

  private final boolean allowEmptySentences;

  public static NewlineIsSentenceBreak stringToNewlineIsSentenceBreak(String name) {
    if ("always".equals(name)) {
      return NewlineIsSentenceBreak.ALWAYS;
    } else if ("never".equals(name)) {
      return NewlineIsSentenceBreak.NEVER;
    } else if (name != null && name.contains("two")) {
      return NewlineIsSentenceBreak.TWO_CONSECUTIVE;
    } else {
      throw new IllegalArgumentException("Not a valid NewlineIsSentenceBreak name");
    }
  }

  /** This is a sort of hacked in other way to end sentences.
   *  Tokens with the ForcedSentenceEndAnnotation set to true
   *  will also end a sentence.
   */
  @SuppressWarnings("OverlyStrongTypeCast")
  private boolean isForcedEndToken(IN o) {
    if (o instanceof CoreMap) {
      Boolean forcedEndValue =
              ((CoreMap)o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
      return forcedEndValue != null && forcedEndValue;
    } else {
      return false;
    }
  }

  @SuppressWarnings("OverlyStrongTypeCast")
  private String getString(IN o) {
    if (o instanceof HasWord) {
      HasWord h = (HasWord) o;
      return h.word();
    } else if (o instanceof String) {
      return (String) o;
    } else if (o instanceof CoreMap) {
      return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
    } else {
      throw new RuntimeException("Expected token to be either Word or String.");
    }
  }

  private static boolean matches(List<Pattern> patterns, String word) {
    for(Pattern p: patterns){
      Matcher m = p.matcher(word);
      if (m.matches()) {
        return true;
      }
    }
    return false;
  }

  private boolean matchesXmlBreakElementToDiscard(String word) {
    return matches(xmlBreakElementsToDiscard, word);
  }

  private boolean matchesTokenPatternsToDiscard(String word) {
    return matches(tokenPatternsToDiscard, word);
  }

  @Override
  public List<List<IN>> process(List<? extends IN> words) {
    if (isOneSentence) {
      // put all the words in one sentence
      List<List<IN>> sentences = Generics.newArrayList();
      sentences.add(new ArrayList<IN>(words));
      return sentences;
    } else {
      return wordsToSentences(words);
    }
  }

  /**
   * Returns a List of Lists where each element is built from a run
   * of Words in the input Document. Specifically, reads through each word in
   * the input document and breaks off a sentence after finding a valid
   * sentence boundary token or end of file.
   * Note that for this to work, the words in the
   * input document must have been tokenized with a tokenizer that makes
   * sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
   *
   * @param words A list of already tokenized words (must implement HasWord or be a String).
   * @return A list of sentences.
   * @see #WordToSentenceProcessor(String, Set, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
   */
  public List<List<IN>> wordsToSentences(List<? extends IN> words) {
    IdentityHashMap<Object, Boolean> isSentenceBoundary = new IdentityHashMap<Object, Boolean>();
    if (sentenceBoundaryMultiTokenPattern != null) {
      // Do initial pass using tokensregex to identify multi token patterns that need to be matched
      // and add the last token to our table of sentence boundary tokens
      SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
      while (matcher.find()) {
        List nodes = matcher.groupNodes();
        if (nodes != null && nodes.size() > 0) {
          isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
        }
      }
    }

    // Split tokens into sentences!!!
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = new ArrayList<IN>();
    List<IN> lastSentence = null;
    boolean insideRegion = false;
    boolean inWaitForForcedEnd = false;
    boolean lastTokenWasNewline = false;
    for (IN o: words) {
      String word = getString(o);
      boolean forcedEnd = isForcedEndToken(o);

      boolean inMultiTokenExpr = false;
      boolean discardToken = false;
      if (o instanceof CoreMap) {
        // Hacky stuff to ensure sentence breaks do not happen in certain cases
        CoreMap cm = (CoreMap) o;
        Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
        if (!forcedEnd) {
          if (forcedUntilEndValue != null && forcedUntilEndValue)
            inWaitForForcedEnd = true;
          else {
            MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
            if (mt != null && !mt.isEnd()) {
              // In the middle of a multi token mention, make sure sentence is not ended here
              inMultiTokenExpr = true;
            }
          }
        }
      }
      if (tokenPatternsToDiscard != null) {
        discardToken = matchesTokenPatternsToDiscard(word);
      }

      if (DEBUG) {
        EncodingPrintWriter.err.println("Word is " + word, "UTF-8");
      }
      if (sentenceRegionBeginPattern != null && ! insideRegion) {
        if (DEBUG) {
          System.err.println("  outside region; deleted");
        }
        if (sentenceRegionBeginPattern.matcher(word).matches()) {
          insideRegion = true;
          if (DEBUG) {
            System.err.println("  entering region");
          }
        }
        lastTokenWasNewline = false;
        continue;
      }

      if (lastSentence != null && currentSentence.isEmpty() && sentenceBoundaryFollowers.contains(word)) {
        if (!discardToken) lastSentence.add(o);
        if (DEBUG) {
          System.err.println(discardToken? "discarded":"  added to last sentence");
        }
        lastTokenWasNewline = false;
        continue;
      }

      boolean newSent = false;
      String debugText = (discardToken)? "discarded":"added to current";
      if (inWaitForForcedEnd && !forcedEnd) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          System.err.println("  is in wait for forced end; " + debugText);
        }
      } else if (inMultiTokenExpr && !forcedEnd) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          System.err.println("  is in multi token expr; " + debugText);
        }
      } else if (sentenceBoundaryToDiscard.contains(word)) {
        if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
          newSent = true;
        } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
          if (lastTokenWasNewline) {
            newSent = true;
          }
        }
        lastTokenWasNewline = true;
        if (DEBUG) {
          System.err.println("  discarded sentence boundary");
        }
      } else {
        lastTokenWasNewline = false;
        if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
          newSent = true;
          if (DEBUG) {
            System.err.println("  is XML break element; discarded");
          }
        } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
          insideRegion = false;
          newSent = true;
          // Marked sentence boundaries
        } else if (isSentenceBoundary.containsKey(o) && isSentenceBoundary.get(o)) {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            System.err.println("  is sentence boundary (matched multi-token pattern); " + debugText);
          }
          newSent = true;
        } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            System.err.println("  is sentence boundary; " + debugText);
          }
          newSent = true;
        } else if (forcedEnd) {
          if (!discardToken) currentSentence.add(o);
          inWaitForForcedEnd = false;
          newSent = true;
          if (DEBUG) {
            System.err.println("  annotated to be the end of a sentence; " + debugText);
          }
        } else {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            System.err.println("  " + debugText);
          }
        }
      }

      if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
        if (DEBUG) {
          System.err.println("  beginning new sentence");
        }
        sentences.add(currentSentence);
        // adds this sentence now that it's complete
        lastSentence = currentSentence;
        currentSentence = new ArrayList<IN>(); // clears the current sentence
      }
    }

    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if ( ! currentSentence.isEmpty()) {
      sentences.add(currentSentence); // adds last sentence
    }

    return sentences;
  }

  public <L, F> Document<L, F, List<IN>> processDocument(Document<L, F, IN> in) {
    Document<L, F, List<IN>> doc = in.blankDocument();
    doc.addAll(process(in));
    return doc;
  }


  /**
   * Create a {@code WordToSentenceProcessor} using a sensible default
   * list of tokens for sentence ending for English/Latin writing systems.
   * The default set is: {".","?","!"} and
   * any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!.
   * A sequence of two or more consecutive line breaks is taken as a paragraph break
   * which also splits sentences. This is the usual constructor for sentence
   * breaking reasonable text, which uses hard-line breaking, so two
   * blank lines indicate a paragraph break.
   */
  public WordToSentenceProcessor() {
    this(false);
  }

  /**
   * Create a {@code WordToSentenceProcessor} using a sensible default
   * list of tokens for sentence ending for English/Latin writing systems.
   * The default set is: {".","?","!"} and
   * any combination of ! or ?, as in !!!?!?!?!!!?!!?!!!.
   * You can specify the treatment of newlines as sentence breaks as one
   * of ignored, every newline is a sentence break, or only two or more
   * consecutive newlines are a sentence break.
   *
   * @param newlineIsSentenceBreak Strategy for treating newlines as
   *                               paragraph breaks.
   */
  public WordToSentenceProcessor(NewlineIsSentenceBreak newlineIsSentenceBreak) {
    this(DEFAULT_BOUNDARY_REGEX, newlineIsSentenceBreak, false);
  }

  /**
   * Create a {@code WordToSentenceProcessor} which never breaks the input
   * into multiple sentences. If the argument is true, the input stream
   * is always output as one sentence. (If it is false, this is
   * equivalent to the no argument constructor, so why use this?)
   *
   * @param isOneSentence Marker argument: true means to treat input
   *                      as one sentence
   */
  public WordToSentenceProcessor(boolean isOneSentence) {
    this(DEFAULT_BOUNDARY_REGEX, NewlineIsSentenceBreak.TWO_CONSECUTIVE, isOneSentence);
  }


  /**
   * This one just massages the above 3 constructors to the maximal constructor.
   *
   * @param boundaryTokenRegex The set of boundary tokens
   * @param newlineIsSentenceBreak Strategy for treating newlines as sentence breaks
   * @param isOneSentence Whether to treat whole text as one sentence
   *                      (if true, the other two parameters are ignored).
   */
  private WordToSentenceProcessor(String boundaryTokenRegex,
                                 NewlineIsSentenceBreak newlineIsSentenceBreak,
                                 boolean isOneSentence) {
    this(boundaryTokenRegex, DEFAULT_BOUNDARY_FOLLOWERS, DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD,
            null, null, newlineIsSentenceBreak, null, null, isOneSentence, false);
  }

  /**
   * Flexibly set the set of acceptable sentence boundary tokens, but with
   * a default set of allowed boundary following tokens. Also can set sentence boundary
   * to discard tokens and xmlBreakElementsToDiscard and set the treatment of newlines
   * (boundaryToDiscard) as sentence ends.
   *
   * This one is convenient in allowing any of the first 3 arguments to be null,
   * and then the usual defaults are substituted for it.
   * The allowed set of boundary followers is:
   * {")", "]", "}", "\"", "'", "''", "’", "”", "-RRB-", "-RSB-", "-RCB-", ")", "]", "}"}.
   * The default set of discarded separator tokens includes the
   * newline tokens used by WhitespaceLexer and PTBLexer.
   *
   * @param boundaryTokenRegex The set of boundary tokens. If null, use default.
   * @param boundaryToDiscard The set of regex for sentence boundary tokens that should be discarded.
   *                          If null, use default.
   * @param xmlBreakElementsToDiscard xml element names like "p", which will be recognized,
   *                                  treated as sentence ends, and discarded.
   *                                  If null, use none.
   * @param newlineIsSentenceBreak Strategy for counting line ends (boundaryToDiscard) as sentence ends.
   */
  public WordToSentenceProcessor(String boundaryTokenRegex,
                                 Set<String> boundaryToDiscard, Set<String> xmlBreakElementsToDiscard,
                                 NewlineIsSentenceBreak newlineIsSentenceBreak) {
    this(boundaryTokenRegex == null ? DEFAULT_BOUNDARY_REGEX : boundaryTokenRegex,
            DEFAULT_BOUNDARY_FOLLOWERS,
            boundaryToDiscard == null || boundaryToDiscard.isEmpty() ? DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD : boundaryToDiscard,
            xmlBreakElementsToDiscard == null ? Collections.<String>emptySet() : xmlBreakElementsToDiscard,
            null, newlineIsSentenceBreak, null, null, false, false);
  }

  /**
   * Flexibly set the set of acceptable sentence boundary tokens, but with
   * a default set of allowed boundary following tokens. Also can set sentence boundary
   * to discard tokens and xmlBreakElementsToDiscard and set the treatment of newlines
   * (boundaryToDiscard) as sentence ends.
   *
   * This one is convenient in allowing any of the first 3 arguments to be null,
   * and then the usual defaults are substituted for it.
   * The allowed set of boundary followers is:
   * {")", "]", "}", "\"", "'", "''", "’", "”", "-RRB-", "-RSB-", "-RCB-", ")", "]", "}"}.
   * The default set of discarded separator tokens includes the
   * newline tokens used by WhitespaceLexer and PTBLexer.
   *
   * @param boundaryTokenRegex The set of boundary tokens. If null, use default.
   * @param boundaryToDiscard The set of regex for sentence boundary tokens that should be discarded.
   *                          If null, use default.
   * @param xmlBreakElementsToDiscard xml element names like "p", which will be recognized,
   *                                  treated as sentence ends, and discarded.
   *                                  If null, use none.
   * @param newlineIsSentenceBreak Strategy for counting line ends (boundaryToDiscard) as sentence ends.
   */
  public WordToSentenceProcessor(String boundaryTokenRegex,
                                 Set<String> boundaryToDiscard, Set<String> xmlBreakElementsToDiscard,
                                 NewlineIsSentenceBreak newlineIsSentenceBreak,
                                 SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern,
                                 Set<String> tokenRegexesToDiscard) {
    this(boundaryTokenRegex == null ? DEFAULT_BOUNDARY_REGEX : boundaryTokenRegex,
            DEFAULT_BOUNDARY_FOLLOWERS,
            boundaryToDiscard == null || boundaryToDiscard.isEmpty() ? DEFAULT_SENTENCE_BOUNDARIES_TO_DISCARD : boundaryToDiscard,
            xmlBreakElementsToDiscard == null ? Collections.<String>emptySet() : xmlBreakElementsToDiscard,
            null, newlineIsSentenceBreak, sentenceBoundaryMultiTokenPattern, tokenRegexesToDiscard, false, false);
  }


  /**
   * Set the set of Strings that will mark the end of a sentence,
   * and which will be discarded after doing so.
   * This constructor is used for, and usually only for, doing
   * one-sentence-per-line sentence splitting.  Since in such cases, you
   * generally want to strictly preserve the set of lines in the input,
   * it preserves empty lines as empty sentences in the output.
   *
   * @param boundaryToDiscard A Set of String that will be matched
   *                          with .equals() and will mark an
   *                          end of sentence and be discarded.
   */
  public WordToSentenceProcessor(Set<String> boundaryToDiscard) {
    this("", Collections.<String>emptySet(), boundaryToDiscard, null, null,
            NewlineIsSentenceBreak.ALWAYS, null, null, false, true);
  }

  /**
   * Flexibly set parameters for converting a list of tokens into sentences.
   * The whole enchilada.
   *
   * @param boundaryTokenRegex Tokens that match this regex will end a
   *                           sentence, but are retained at the end of
   *                           the sentence.
   * @param boundaryFollowers This is a Set of String that are matched with
   *                          .equals() which are allowed to be tacked onto
   *                          the end of a sentence after a sentence boundary
   *                          token, for example ")".
   * @param boundariesToDiscard This is normally used for newline tokens if
   *                            they are included in the tokenization. They
   *                            may end the sentence (depending on the setting
   *                            of newlineIsSentenceBreak), but at any rate
   *                            are deleted from sentences in the output.
   * @param xmlBreakElementsToDiscard These are elements like "p" or "sent",
   *                                  which will be wrapped into regex for
   *                                  approximate XML matching. They will be
   *                                  deleted in the output, and will always
   *                                  trigger a sentence boundary.
   */
  public WordToSentenceProcessor(String boundaryTokenRegex, Set<String> boundaryFollowers,
                                 Set<String> boundariesToDiscard, Set<String> xmlBreakElementsToDiscard,
                                 String regionElementRegex, NewlineIsSentenceBreak newlineIsSentenceBreak,
                                 SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern,
                                 Set<String> tokenRegexesToDiscard,
                                 boolean isOneSentence, boolean allowEmptySentences) {
    sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
    sentenceBoundaryFollowers = Collections.unmodifiableSet(boundaryFollowers);
    sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundariesToDiscard);
    if (xmlBreakElementsToDiscard == null || xmlBreakElementsToDiscard.isEmpty()) {
      this.xmlBreakElementsToDiscard = null;
    } else {
      this.xmlBreakElementsToDiscard = new ArrayList<Pattern>(xmlBreakElementsToDiscard.size());
      for (String s: xmlBreakElementsToDiscard) {
        String regex = "<\\s*(?:/\\s*)?(?:" + s + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
        // System.err.println("Regex is |" + regex + "|");
        // todo: Historically case insensitive, but maybe better and more proper to make case sensitive?
        this.xmlBreakElementsToDiscard.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
      }
    }
    if (regionElementRegex != null) {
      sentenceRegionBeginPattern = Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
      sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
    } else {
      sentenceRegionBeginPattern = null;
      sentenceRegionEndPattern = null;
    }
    this.newlineIsSentenceBreak = newlineIsSentenceBreak;
    this.isOneSentence = isOneSentence;
    this.allowEmptySentences = allowEmptySentences;
    if (sentenceBoundaryMultiTokenPattern != null) {
      this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;
    } else {
      this.sentenceBoundaryMultiTokenPattern = null;
    }
    if (tokenRegexesToDiscard != null) {
      this.tokenPatternsToDiscard = new ArrayList<Pattern>(tokenRegexesToDiscard.size());
      for (String s: tokenRegexesToDiscard) {
        this.tokenPatternsToDiscard.add(Pattern.compile(s));
      }
    } else {
      this.tokenPatternsToDiscard = null;
    }

    if (DEBUG) {
      EncodingPrintWriter.err.println("WordToSentenceProcessor: boundaryTokens=" + boundaryTokenRegex, "UTF-8");
      EncodingPrintWriter.err.println("  boundaryFollowers=" + boundaryFollowers, "UTF-8");
      EncodingPrintWriter.err.println("  boundariesToDiscard=" + boundariesToDiscard, "UTF-8");
      EncodingPrintWriter.err.println("  xmlBreakElementsToDiscard=" + xmlBreakElementsToDiscard, "UTF-8");
      EncodingPrintWriter.err.println("  regionBeginPattern=" + sentenceRegionBeginPattern, "UTF-8");
      EncodingPrintWriter.err.println("  regionEndPattern=" + sentenceRegionEndPattern, "UTF-8");
      EncodingPrintWriter.err.println("  newlineIsSentenceBreak=" + newlineIsSentenceBreak, "UTF-8");
      EncodingPrintWriter.err.println("  sentenceBoundaryMultiTokenPattern=" + sentenceBoundaryMultiTokenPattern, "UTF-8");
      EncodingPrintWriter.err.println("  tokenPatternsToDiscard=" + tokenPatternsToDiscard, "UTF-8");
      EncodingPrintWriter.err.println("  isOneSentence=" + isOneSentence, "UTF-8");
      EncodingPrintWriter.err.println("  allowEmptySentences=" + allowEmptySentences, "UTF-8");
    }
  }

}
TOP

Related Classes of edu.stanford.nlp.process.WordToSentenceProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.