Package com.clearnlp.constituent

Source Code of com.clearnlp.constituent.CTNode

/**
* Copyright (c) 2009/09-2012/08, Regents of the University of Colorado
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Copyright 2012/09-2013/04, 2013/11-Present, University of Massachusetts Amherst
* Copyright 2013/05-2013/10, IPSoft Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.clearnlp.constituent;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;

import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntOpenHashSet;
import com.clearnlp.conversion.C2DInfo;
import com.clearnlp.pattern.PTNumber;
import com.clearnlp.propbank.PBLoc;
import com.clearnlp.util.pair.StringIntPair;


/**
* Constituent node.
* @since 1.0.0
* @author Jinho D. Choi ({@code jdchoi77@gmail.com})
*/
public class CTNode implements Comparable<CTNode>
{
  /** The phrase or pos tag of this node. */
  public String      pTag;
  /** The function tags of this node (default: empty). */
  protected Set<String> s_fTags;
  /** The co-index of this node (default: {@code -1}). */
  public int coIndex  = -1;
  /** The gap-index of this node (default: {@code -1}). */
  public int gapIndex = -1;
  /** The word-form of this node (default: {@code null}). */
  public String form  = null;
 
  /** The parent of this node (default: {@code null}). */
  protected CTNode parent     = null;
  /** The antecedent of this node (default: {@code null}). */
  protected CTNode antecedent = null;
  /** The children of this node. */
  protected List<CTNode> ls_children;
  /** The ID (starting at 0) of this node among other terminal nodes within the tree (default: {@code -1}). */
  protected int i_terminalId = -1;
  /** The ID (starting at 0) of this node among other terminal nodes (disregarding empty categories) within the tree (default: {@code -1}). */
  protected int i_tokenId    = -1;
  /** The ID (starting at 0) of this node among its siblings (default: {@code -1}). */
  protected int i_siblingId  = -1;
  /** The PropBank location of this node (default: {@code null}). */
  protected PBLoc  pb_loc    = null;
 
  /** The information of constituent-to-dependency conversion. */
  public C2DInfo c2d = null;
  /** The list of PropBank predicate ID and label pairs. */
  public List<StringIntPair> pbArgs = null;
 
  /**
     * Constructs a constituent node.
     * @param tags {@link CTNode#pTag}{@code (-}{@link CTNode#s_fTags}{@code )*(-}{@link CTNode#coIndex}{@code ){0,1}(=}{@link CTNode#gapIndex}{@code ){0,1}}.
     */
  public CTNode(String tags)
  {
    setTags(tags);
    ls_children = new ArrayList<CTNode>();
  }
 
  /**
     * Constructs a constituent node with the specific word-form.
     * @param tags see the {@code tags} parameter in {@link CTNode#CTNode(String, CTNode)}.
     * @param form the word-form of this node.
     */
  public CTNode(String tags, String form)
  {
    this(tags);
    this.form = form;
  }
 
//  ======================== Getters ========================

  /**
     * Returns all tags of this node in the Penn Treebank format.
     * See the {@code tags} parameter in {@link CTNode#CTNode(String, CTNode)}.
     * @return all tags of this node in the Penn Treebank format.
     */
  public String getTags()
  {
    StringBuilder build = new StringBuilder();
   
    build.append(pTag);
   
    for (String fTag : s_fTags)
    {
      build.append("-");
      build.append(fTag);
    }
   
    if (coIndex != -1)
    {
      build.append("-");
      build.append(coIndex);
    }
   
    if (gapIndex != -1)
    {
      build.append("=");
      build.append(gapIndex);
    }
   
    return build.toString();
  }
 
  /**
     * Returns the set of function tags of this node.
     * @return the set of function tags of this node.
     */
  public Set<String> getFTags()
  {
    return s_fTags;
  }
 
  /**
     * Returns the ID (starting at 0) of this node among other terminal nodes within the tree (default: {@code -1}).
     * @return the ID (starting at 0) of this node among other terminal nodes within the tree (default: {@code -1}).
     */
  public int getTerminalId()
  {
    return i_terminalId;
  }
 
  /**
     * Returns the ID (starting at 0) of this node among other terminal nodes (disregarding empty categories) within the tree (default: {@code -1}).
     * @return the ID (starting at 0) of this node among other terminal nodes (disregarding empty categories) within the tree (default: {@code -1}).
     */
  public int getTokenId()
  {
    return i_tokenId;
  }
 
  /**
     * Returns the ID (starting at 0) of this node among its siblings (default: {@code -1}).
     * @return the ID (starting at 0) of this node among its siblings (default: {@code -1}).
     */
  public int getSiblingId()
  {
    return i_siblingId;
  }
 
  /**
     * Returns the PropBank location of this node (default: {@code null}).
     * @return the PropBank location of this node (default: {@code null}).
     */
  public PBLoc getPBLoc()
  {
    return pb_loc;
  }
 
  /**
     * Returns the parent of this node.
     * @return the parent of this node.
     */
  public CTNode getParent()
  {
    return parent;
  }
 
  /**
     * Returns the antecedent of this node (default: {@code null}).
     * @return the antecedent of this node (default: {@code null}).
     */
  public CTNode getAntecedent()
  {
    return antecedent;
  }
 
  /**
     * Returns a list of all children of this node.
     * @return a list of all children of this node.
     */
  public List<CTNode> getChildren()
  {
    return ls_children;
  }
 
  /**
     * Returns an immutable list of sub-children of this node.
     * The sublist begins at the specific position and extends to the end.
     * @param fstId the ID of the first child (inclusive).
     * @return an immutable list of sub-children of this node.
     * @throws IndexOutOfBoundsException for an illegal ID.
     */
  public List<CTNode> getChildren(int fstId)
  {
    return ls_children.subList(fstId, ls_children.size());
  }
 
  /**
   * Returns an immutable list of sub-children of this node.
     * The sublist begins and ends at the specific positions.
     * @param fstId the ID of the first child (inclusive).
     * @param lstId the ID of the last child (exclusive)
     * @return an immutable list of sub-children of this node.
     * @throws IndexOutOfBoundsException for an illegal ID.
     */
  public List<CTNode> getChildren(int fstId, int lstId)
  {
    return ls_children.subList(fstId, lstId);
  }
 
  public CTNode getChild(int childId)
  {
    return (0 <= childId && childId < ls_children.size()) ? ls_children.get(childId) : null;
  }
 
  public CTNode getFirstChild(String... tags)
  {
    for (CTNode child : ls_children)
    {
      if (child.isTag(tags))
        return child;
    }
   
    return null;
  }
 
  public CTNode getLastChild(String... tags)
  {
    CTNode child;  int i;
   
    for (i=ls_children.size()-1; i>=0; i--)
    {
      child = ls_children.get(i);
     
      if (child.isTag(tags))
        return child;
    }
   
    return null;
  }
 
  public List<CTNode> getAllChildren(String... tags)
  {
    List<CTNode> list = new ArrayList<CTNode>();
   
    for (CTNode child : ls_children)
    {
      if (child.isTag(tags))
        list.add(child);
    }
   
    return list;
  }
 
  public CTNode getFirstDescendant(String... tags)
  {
    return getFirstDescendantAux(ls_children, tags);
  }
 
  private CTNode getFirstDescendantAux(List<CTNode> nodes, String... tags)
  {
    CTNode desc;
   
    for (CTNode node : nodes)
    {
      if (node.isTag(tags))  return node;
     
      desc = getFirstDescendantAux(node.ls_children, tags);
      if (desc != nullreturn desc;
    }
   
    return null;
  }

  public CTNode getPrevSibling()
  {
    List<CTNode> siblings = parent.ls_children;
    return (0 <= i_siblingId-1) ? siblings.get(i_siblingId-1) : null;
  }
 
  public CTNode getPrevSibling(String... tags)
  {
    if (parent == null) return null;
    List<CTNode> siblings = parent.ls_children;
    CTNode node; int i;
   
    for (i=i_siblingId-1; i>=0; i--)
    {
      node = siblings.get(i);
     
      if (node.isTag(tags))
        return node;
    }
   
    return null;
  }
 
  public List<CTNode> getPrevSiblings()
  {
    return (parent != null) ? parent.getChildren(0, this.i_siblingId) : new ArrayList<CTNode>(0);
  }
 
  public CTNode getNextSibling()
  {
    List<CTNode> siblings = parent.ls_children;
    return (i_siblingId+1 < siblings.size()) ? siblings.get(i_siblingId+1) : null;
  }
 
  public CTNode getNextSibling(String... tags)
  {
    if (parent == null) return null;
    List<CTNode> siblings = parent.ls_children;
    CTNode node; int i, size = siblings.size();
   
    for (i=i_siblingId+1; i<size; i++)
    {
      node = siblings.get(i);
     
      if (node.isTag(tags))
        return node;
    }
   
    return null;
  }
 
  public CTNode getNearestAncestor(String... tags)
  {
    CTNode curr = this;
   
    while (curr.parent != null)
    {
      curr = curr.parent;
      if (curr.isTag(tags)) return curr;
    }
   
    return null;
  }
 
  public CTNode getFirstChainedDescendant(String... tags)
  {
    CTNode desc = this, child;
   
    while ((child = desc.getFirstChild(tags)) != null)
      desc = child;
   
    return (desc != this) ? desc : null;
  }
 
  public CTNode getHighestChainedAncestor(String... tags)
  {
    CTNode curr = this;
   
    while (curr.parent != null && curr.parent.isTag(tags))
      curr = curr.parent;
 
    return (curr == this) ? null : curr;
  }
 
  public List<CTNode> getIncludedEmptyCategory(String regex)
  {
    List<CTNode> list = new ArrayList<CTNode>();
   
    getIncludedEmptyCategoriesAux(this, list, regex);
    return list;
  }
 
  private void getIncludedEmptyCategoriesAux(CTNode curr, List<CTNode> list, String regex)
  {
    if (curr.isEmptyCategory() && curr.form.matches(regex))
      list.add(curr);
   
    for (CTNode child : curr.ls_children)
      getIncludedEmptyCategoriesAux(child, list, regex);
  }
 
  public List<CTNode> getSubTerminals()
  {
    List<CTNode> terminals = new ArrayList<CTNode>();
   
    getSubTerminals(this, terminals);
    return terminals;
  }
 
  private void getSubTerminals(CTNode curr, List<CTNode> terminals)
  {
    if (curr.isPhrase())
    {
      for (CTNode child : curr.ls_children)
        getSubTerminals(child, terminals);
    }
    else
      terminals.add(curr);
  }
 
  public IntArrayList getSubTerminalIdList()
  {
    IntArrayList list = new IntArrayList();
   
    for (CTNode node : getSubTerminals())
      list.add(node.getTerminalId());
   
    return list;
  }
 
  public IntOpenHashSet getSubTerminalIdSet()
  {
    IntOpenHashSet set = new IntOpenHashSet();
   
    for (CTNode node : getSubTerminals())
      set.add(node.getTerminalId());
   
    return set;
  }
 
  public List<CTNode> getSubTokens()
  {
    List<CTNode> tokens = new ArrayList<CTNode>();
   
    getSubTokens(this, tokens);
    return tokens;
  }
 
  private void getSubTokens(CTNode curr, List<CTNode> tokens)
  {
    if (curr.isPhrase())
    {
      for (CTNode child : curr.ls_children)
        getSubTokens(child, tokens);
    }
    else if (!curr.isEmptyCategory())
      tokens.add(curr);
  }
 
  public CTNode getFirstTerminal()
  {
    return getFirstTerminalAux(this);
  }
 
  private CTNode getFirstTerminalAux(CTNode node)
  {
    List<CTNode> children = node.getChildren();
    if (children.isEmpty())  return node;
   
    return getFirstTerminalAux(children.get(0));
  }
 
  public CTNode getLastTerminal()
  {
    return getLastTerminalAux(this);
  }
 
  private CTNode getLastTerminalAux(CTNode node)
  {
    List<CTNode> children = node.getChildren();
    if (children.isEmpty())  return node;
   
    return getLastTerminalAux(children.get(children.size()-1));
  }
 
  public int getChildrenSize()
  {
    return ls_children.size();
  }
 
  public CTNode getLowestCommonAncestor(CTNode node)
  {
    if (this.isDescendantOf(node))  return node;
    if (node.isDescendantOf(this))  return this;
   
    CTNode parent = getParent();
   
    while (parent != null)
    {
      if (node.isDescendantOf(parent))
        return parent;
     
      parent = parent.getParent();
    }
   
    return null;
  }
 
//  ======================== Setters ========================
 
  public void setTags(String tags)
  {
    s_fTags = new TreeSet<String>();
   
    if (tags.charAt(0) == '-')
    {
      pTag = tags;
      return;
    }
   
    StringTokenizer tok = new StringTokenizer(tags, "-=", true);
    String delim, tag;
   
    pTag = tok.nextToken();
   
    while (tok.hasMoreTokens())
    {
      delim = tok.nextToken();
      if (!tok.hasMoreTokens())
      {
        System.err.println("Error: illegal tag \""+tags+"\"");
        break;
      }
      tag = tok.nextToken();
     
      if (delim.equals("-"))
      {
        if (PTNumber.containsOnlyDigits(tag))
        {
          if (coIndex == -1)
            coIndex = Integer.parseInt(tag);
          else
            gapIndex = Integer.parseInt(tag);
        }
        else
          s_fTags.add(tag);
      }
      else // if (delim.equals("="))
        gapIndex = Integer.parseInt(tag);
    }
  }
 
  public void addFTag(String fTag)
  {
    s_fTags.add(fTag);
  }
 
  public void addFTags(Collection<String> fTags)
  {
    s_fTags.addAll(fTags);
  }
 
  public void removeFTag(String fTag)
  {
    s_fTags.remove(fTag);
  }
 
  public void removeFTagAll()
  {
    s_fTags.clear();
  }
 
  public void setAntecedent(CTNode ante)
  {
    antecedent = ante;
  }
 
  public void addChild(CTNode child)
  {
    child.parent = this;
    child.i_siblingId = ls_children.size();
   
    ls_children.add(child);
  }
 
  public void addChild(int index, CTNode child)
  {
    ls_children.add(index, child);
   
    child.parent = this;
    resetSiblingIDs(index);
  }
 
  public void setChild(int index, CTNode child)
  {
    ls_children.set(index, child).parent = null;
   
    child.parent = this;
    child.i_siblingId = index;
  }
 
  public void removeChild(int index)
  {
    if (!isChildrenRange(index))
      throw new IndexOutOfBoundsException(Integer.toString(index));
   
    ls_children.remove(index).parent = null;
    resetSiblingIDs(index);
  }
 
  private void resetSiblingIDs(int index)
  {
    int i, size = ls_children.size();
   
    for (i=index; i<size; i++)
      ls_children.get(i).i_siblingId = i;
  }
 
  public void removeChild(CTNode child)
  {
    removeChild(ls_children.indexOf(child));
  }

  public void resetChildren(Collection<CTNode> children)
  {
    ls_children.clear();
   
    for (CTNode child : children)
      addChild(child);
  }
 
//  ======================== Booleans ========================
 
  public boolean isPTag(String pTag)
  {
    return this.pTag.equals(pTag);
  }
 
  public boolean isPTagAny(String... pTags)
  {
    for (String pTag : pTags)
    {
      if (isPTag(pTag))
        return true;
    }
   
    return false;
  }
 
  public boolean matchesPTag(String regex)
  {
    return pTag.matches("^"+regex+"$");
  }
 
  public boolean isFTag(String fTag)
  {
    return (s_fTags.size() == 1) && (s_fTags.contains(fTag));
  }
 
  public boolean hasFTag(String fTag)
  {
    return s_fTags.contains(fTag);
  }
 
  public boolean hasFTagAll(Collection<String> fTags)
  {
    return this.s_fTags.containsAll(fTags);
  }
 
  public boolean hasFTagAny(Collection<String> fTags)
  {
    for (String fTag : fTags)
    {
      if (this.s_fTags.contains(fTag))
        return true;
    }
   
    return false;
  }
 
  public boolean hasFTagAny(String... fTags)
  {
    for (String fTag : fTags)
    {
      if (this.s_fTags.contains(fTag))
        return true;
    }
   
    return false;
  }
 
  public boolean isTag(String... tags)
  {
    String      pTag  = null;
    String      pRex  = null;
    Set<String> fTags = new HashSet<String>();
   
    for (String tag : tags)
    {
      if (tag.equals(CTLib.POS_NONE) || tag.equals(CTLibEn.POS_LRB) || tag.equals(CTLibEn.POS_RRB))
        pTag = tag;
      else
      {
        switch (tag.charAt(0))
        {
        case '-': fTags.add(tag.substring(1))break;
        case '+': pRex = tag.substring(1);    break;
        default : pTag = tag;
        }       
      }
    }
   
    return (pTag == null || isPTag (pTag)) &&
           (pRex == null || matchesPTag(pRex)) &&
           hasFTagAll(fTags);
  }
 
  public boolean isForm(String form)
  {
    return this.form != null && this.form.equals(form);
  }
 
  public boolean isPhrase()
  {
    return !ls_children.isEmpty();
  }
 
  public boolean isEmptyCategory()
  {
    return pTag.equals(CTLib.POS_NONE);
  }
 
  public boolean isEmptyCategoryRec()
  {
    CTNode curr = this;
   
    while (curr.isPhrase())
    {
      if (curr.ls_children.size() > 1)
        return false;
      else
        curr = curr.getChild(0);
    }
   
    return curr.isEmptyCategory();
  }
 
  public boolean isDescendantOf(CTNode node)
  {
    CTNode parent = getParent();

    while (parent != null)
    {
      if (parent == node)
        return true;
     
      parent = parent.getParent();
    }
   
    return false;
  }
 
  public boolean containsTags(String... tags)
  {
    for (CTNode child : ls_children)
    {
      if (child.isTag(tags))
        return true;
    }
   
    return false;
  }
 
  /**
   *
   *
   * @param index
   * @return
   */
  public boolean isChildrenRange(int index)
  {
    return 0 <= index && index < ls_children.size();
  }

 
//  ======================== Strings ========================

  public String toForms(boolean includeNulls, String delim)
  {
    StringBuilder build = new StringBuilder();
   
    for (CTNode node : getSubTerminals())
    {
      if (includeNulls || !node.isEmptyCategory())
      {
        build.append(delim);
        build.append(node.form);
      }
    }
   
    return build.length() == 0 ? "" : build.substring(delim.length());
  }
 
  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  public String toString()
  {
    return toString(false, false);
  }
 
  public String toString(boolean... args)
  {
    boolean includeLineNumbers  = (args.length > 0) ? args[0] : false;
    boolean includeAntePointers = (args.length > 1) ? args[1] : false;
   
    ArrayList<String> lTree = new ArrayList<String>();
    toStringAux(this, lTree, "", includeAntePointers);
   
    StringBuilder build = new StringBuilder();
   
    for (int i=0; i<lTree.size(); i++)
    {
      if (includeLineNumbers)
        build.append(String.format("%3d: %s\n", i, lTree.get(i)));
      else
        build.append(lTree.get(i)+"\n");
    }
     
    build.deleteCharAt(build.length()-1)// remove the last '\n'
    return build.toString();
  }
 
  public String toStringLine()
  {
    ArrayList<String> lTree = new ArrayList<String>();
    toStringAux(this, lTree, "", false);
   
    StringBuilder build = new StringBuilder();
   
    for (int i=0; i<lTree.size(); i++)
      build.append(lTree.get(i).trim()+" ");
     
    build.deleteCharAt(build.length()-1)// remove the last ' '
    return build.toString();
  }
 
  private void toStringAux(CTNode curr, List<String> lTree, String sTags, boolean includeAntePointers)
  {
    if (curr.isPhrase())
    {
      sTags += "("+curr.getTags()+" ";
    //  sTags += "("+curr.getTags()+"-"+curr.pbLoc+" ";
     
      for (CTNode child : curr.ls_children)
      {
        toStringAux(child, lTree, sTags, includeAntePointers);
       
        if (child.i_siblingId == 0)
          sTags = sTags.replaceAll(".", " ");    // indent
      }

      int last = lTree.size() - 1;
      lTree.set(last, lTree.get(last)+")");
    }
    else
    {
      String tag = sTags+"("+curr.getTags()+" "+curr.form+")";
      if (includeAntePointers && curr.antecedent != null)
        tag += "["+curr.antecedent.getTags()+"]";
      lTree.add(tag);
    }
  }
 
  @Override
  public int compareTo(CTNode node)
  {
    return i_terminalId - node.i_terminalId;
  }
}
TOP

Related Classes of com.clearnlp.constituent.CTNode

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.