Package opennlp.tools.formats.ad

Source Code of opennlp.tools.formats.ad.ADPOSSampleStream

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.formats.ad;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
* <b>Note:</b> Do not use this class, internal use only!
*/
public class ADPOSSampleStream implements ObjectStream<POSSample> {

  private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
  private boolean expandME;
  private boolean isIncludeFeatures;

  /**
   * Creates a new {@link POSSample} stream from a line stream, i.e.
   * {@link ObjectStream}< {@link String}>, that could be a
   * {@link PlainTextByLineStream} object.
   *
   * @param lineStream
   *          a stream of lines as {@link String}
   * @param expandME
   *          if true will expand the multiword expressions, each word of the
   *          expression will have the POS Tag that was attributed to the
   *          expression plus the prefix B- or I- (CONLL convention)
   * @param includeFeatures
   *          if true will combine the POS Tag with the feature tags
   */
  public ADPOSSampleStream(ObjectStream<String> lineStream, boolean expandME,
      boolean includeFeatures) {
    this.adSentenceStream = new ADSentenceStream(lineStream);
    this.expandME = expandME;
    this.isIncludeFeatures = includeFeatures;
  }

  /**
   * Creates a new {@link POSSample} stream from a {@link InputStream}
   *
   * @param in
   *          the Corpus {@link InputStream}
   * @param charsetName
   *          the charset of the Arvores Deitadas Corpus
   * @param expandME
   *          if true will expand the multiword expressions, each word of the
   *          expression will have the POS Tag that was attributed to the
   *          expression plus the prefix B- or I- (CONLL convention)
   * @param includeFeatures
   *          if true will combine the POS Tag with the feature tags
   */
  public ADPOSSampleStream(InputStream in, String charsetName,
      boolean expandME, boolean includeFeatures) {

    try {
      this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
          in, charsetName));
      this.expandME = expandME;
      this.isIncludeFeatures = includeFeatures;
    } catch (UnsupportedEncodingException e) {
      // UTF-8 is available on all JVMs, will never happen
      throw new IllegalStateException(e);
    }
  }

  public POSSample read() throws IOException {
    Sentence paragraph;
    while ((paragraph = this.adSentenceStream.read()) != null) {
      Node root = paragraph.getRoot();
      List<String> sentence = new ArrayList<String>();
      List<String> tags = new ArrayList<String>();
      process(root, sentence, tags);

      return new POSSample(sentence, tags);
    }
    return null;
  }

  private void process(Node node, List<String> sentence, List<String> tags) {
    if (node != null) {
      for (TreeElement element : node.getElements()) {
        if (element.isLeaf()) {
          processLeaf((Leaf) element, sentence, tags);
        } else {
          process((Node) element, sentence, tags);
        }
      }
    }
  }

  private void processLeaf(Leaf leaf, List<String> sentence, List<String> tags) {
    if (leaf != null) {
      String lexeme = leaf.getLexeme();
      String tag = leaf.getFunctionalTag();

      if (tag == null) {
        tag = leaf.getLexeme();
      }

      if (isIncludeFeatures && leaf.getMorphologicalTag() != null) {
        tag += " " + leaf.getMorphologicalTag();
      }
      tag = tag.replaceAll("\\s+", "=");

      if (tag == null)
        tag = lexeme;

      if (expandME && lexeme.contains("_")) {
        StringTokenizer tokenizer = new StringTokenizer(lexeme, "_");

        if (tokenizer.countTokens() > 0) {
          List<String> toks = new ArrayList<String>(tokenizer.countTokens());
          List<String> tagsWithCont = new ArrayList<String>(
              tokenizer.countTokens());
          toks.add(tokenizer.nextToken());
          tagsWithCont.add("B-" + tag);
          while (tokenizer.hasMoreTokens()) {
            toks.add(tokenizer.nextToken());
            tagsWithCont.add("I-" + tag);
          }

          sentence.addAll(toks);
          tags.addAll(tagsWithCont);
        } else {
          sentence.add(lexeme);
          tags.add(tag);
        }

      } else {
        sentence.add(lexeme);
        tags.add(tag);
      }
    }

  }

  public void reset() throws IOException, UnsupportedOperationException {
    adSentenceStream.reset();
  }

  public void close() throws IOException {
    adSentenceStream.close();
  }
}
TOP

Related Classes of opennlp.tools.formats.ad.ADPOSSampleStream

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.