Source Code of cc.mallet.types.FeatureVector

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */








/**
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */


package cc.mallet.types;


import java.util.HashMap;
import java.util.Iterator;
import java.util.Arrays;
import java.util.logging.*;
import java.io.*;


import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.Vector;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.PropertyList;


/**
 *  A subset of an {@link cc.mallet.types.Alphabet} in which each element of the subset has an associated value.
 *  The subset is represented as a {@link cc.mallet.types.SparseVector}
 *  <p>
 *  A SparseVector represents only the non-zero locations of a vector.  In the case of a FeatureVector,
 *  a location represents the index of an entry in the Alphabet that is contained in
 *  the FeatureVector.
 *  <p>
 *  To loop over the elements of a feature vector, one loops over the consecutive integers between 0
 *  and the number of locations in the feature vector. From these locations one can cheaply
 *  obtain the index of the entry in the underlying Alphabet, the entry itself, and the value
 *  in this feature vector associated the entry.
 *  <p>
 *  A SparseVector (or FeatureVector) can be sparse or dense depending on whether or not
 *  an array if indices is specified at construction time.  If the FeatureVector is dense,
 *  the mapping from location to index is the identity mapping.
 *  <p>
 *  The associated value of an element in a SparseVector (or FeatureVector) can be
 *  a double or binary (0.0 or 1.0), depending on whether an array of doubles is specified at
 *  contruction time.
 *
 *  @see SparseVector
 *  @see Alphabet
 *
 *  @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public class FeatureVector extends SparseVector implements Serializable, AlphabetCarrying
{
  private static Logger logger = MalletLogger.getLogger(FeatureVector.class.getName());


  Alphabet dictionary;
  
  protected FeatureVector (Alphabet dict,
                           int[] indices, double[] values, 
                           int capacity, int size,
                           boolean copy,
                           boolean checkIndicesSorted,
                           boolean removeDuplicates)
  {
    super (indices, values, capacity, size, copy, checkIndicesSorted, removeDuplicates);
    this.dictionary = dict;
  }


  /** Create a dense vector */
  public FeatureVector (Alphabet dict, double[] values)
  {
    super (values);
    this.dictionary = dict;
  }
  
  /** Create non-binary vector, possibly dense if "featureIndices" or possibly sparse, if not */
  public FeatureVector (Alphabet dict,
                        int[] featureIndices,
                        double[] values)
  {
    super (featureIndices, values);
    this.dictionary = dict;
  }


  /** Create binary vector */
  public FeatureVector (Alphabet dict,
                        int[] featureIndices)
  {
    super (featureIndices);
    this.dictionary = dict;
  }


  public static int[] getObjectIndices(Object[] entries, Alphabet dict, boolean addIfNotPresent)
  {
    int[] feats = new int[entries.length];
    for (int i = 0; i < entries.length; i++) {
      feats[i] = dict.lookupIndex (entries[i], addIfNotPresent);
      if (feats[i] == -1)
        throw new IllegalArgumentException ("Object is not in dictionary.");
    }
    return feats;
  }
    
  public FeatureVector (Alphabet dict, Object[] keys, double[] values)
  {
    this (dict, getObjectIndices(keys, dict, true), values);
  }


  private static int[] sortedFeatureIndexSequence (FeatureSequence fs)
  {
    int[] feats = fs.toFeatureIndexSequence ();
    java.util.Arrays.sort (feats);
    return feats;
  }


  public FeatureVector (FeatureSequence fs, boolean binary)
  {
    super (fs.toSortedFeatureIndexSequence(), false, false, true, binary);
    this.dictionary = (Alphabet) fs.getAlphabet();
  }


  public FeatureVector (FeatureSequence fs)
  {
    this (fs, false);
  }
  
  public FeatureVector (Alphabet dict, PropertyList pl, boolean binary,
                        boolean growAlphabet)
  {
    super (dict, pl, binary, growAlphabet);
    this.dictionary = dict;
  }


  public FeatureVector (Alphabet dict, PropertyList pl, boolean binary) {
    this (dict, pl, binary, true);
  }


  private static int[] indicesWithConjunctions (FeatureVector fv, Alphabet newVocab, int[] conjunctions)
  {
    assert (fv.values == null);          // Only works on binary feature vectors
    assert (! (fv instanceof AugmentableFeatureVector));
    Alphabet v = fv.getAlphabet();
    // newVocab should be an augmented copy of v
    assert (v.size() <= newVocab.size())
                        : "fv.vocab.size="+v.size()+" newVocab.size="+newVocab.size();
    int[] newIndices = new int[fv.indices.length * conjunctions.length];
    java.util.Arrays.sort (conjunctions);
    System.arraycopy (fv.indices, 0, newIndices, 0, fv.indices.length);
    int size = fv.indices.length;
    int ci = 0;
    for (int i = 0; i < fv.indices.length; i++) {
      if (ci < conjunctions.length && conjunctions[ci] < fv.indices[i])
        ci++;
      if (conjunctions[ci] == fv.indices[i]) {
        for (int j = 0; j < fv.indices.length; j++) {
          if (conjunctions[ci] != fv.indices[j]) {
            int index = newVocab.lookupIndex (FeatureConjunction.getName (v, conjunctions[ci], fv.indices[j]));
            if (index == newVocab.size()-1 && index % 3 == 0)
              logger.info ("New feature "+ newVocab.lookupObject(index));
            if (index != -1)  // this can be -1 if newVocab.growthStopped
              newIndices[size++] = index;
          }
        }
      }
    }


    // Sort and remove duplicates
    Arrays.sort (newIndices, 0, size);
    for (int i = 1; i < size; i++) {
      if (newIndices[i-1] == newIndices[i]) {
        for (int j = i+1; j < size; j++)
          newIndices[j-1] = newIndices[j];
        size--;
      }
    }


    int[] ret = new int[size];
    System.arraycopy (newIndices, 0, ret, 0, size);
    return ret;
  }


  private static int[] indicesWithConjunctions (FeatureVector fv, Alphabet newVocab,
                                                FeatureSelection fsNarrow,
                                                FeatureSelection fsWide)
  {
    assert (fv.values == null);          // Only works on binary feature vectors
    ////assert (! (fv instanceof AugmentableFeatureVector));
    Alphabet v = fv.getAlphabet();
    // newVocab should be an augmented copy of v
    assert (v.size() <= newVocab.size())
                        : "fv.vocab.size="+v.size()+" newVocab.size="+newVocab.size();
    int length;
    if (fv instanceof AugmentableFeatureVector) {
      length = ((AugmentableFeatureVector)fv).size;
      ((AugmentableFeatureVector)fv).sortIndices();
    } else {
      length = fv.indices.length;
    }
    int[] newIndices = new int[length * length];
    System.arraycopy (fv.indices, 0, newIndices, 0, length);
    int size = length;
    int ci = 0;
    for (int i = 0; i < length; i++) {
      if (fsNarrow != null && !fsNarrow.contains (fv.indices[i]))
        continue;
      for (int j = 0; j < length; j++) {
        if ((fsWide == null || fsWide.contains (fv.indices[j]))
            && fv.indices[i] != fv.indices[j]
            //&& !FeatureConjunction.featuresOverlap (v, fv.indices[i], fv.indices[j]))
          )
        {
          int index = newVocab.lookupIndex (FeatureConjunction.getName (v, fv.indices[i], fv.indices[j]));
          if (index != -1) // this can be -1 if newVocab.growthStopped
            newIndices[size++] = index; 
        }
      }
    }


    // Sort and remove duplicates
    Arrays.sort (newIndices, 0, size);
    for (int i = 1; i < size; i++) {
      if (newIndices[i-1] == newIndices[i]) {
        for (int j = i+1; j < size; j++)
          newIndices[j-1] = newIndices[j];
        size--;
      }
    }    
    int[] ret = new int[size];
    System.arraycopy (newIndices, 0, ret, 0, size);
    return ret;
  }
  
  /** New feature vector containing all the features of "fv", plus new
      features created by making conjunctions between the features in
      "conjunctions" and all the other features. */
  public FeatureVector (FeatureVector fv, Alphabet newVocab, int[] conjunctions)
  {
    this (newVocab, indicesWithConjunctions (fv, newVocab, conjunctions));
  }


  public FeatureVector (FeatureVector fv, Alphabet newVocab,
                        FeatureSelection fsNarrow, FeatureSelection fsWide)
  {
    this (newVocab, indicesWithConjunctions (fv, newVocab, fsNarrow, fsWide));
  }


  /** Construct a new FeatureVector, selecting only those features in fs, and having new
   * (presumably more compact, dense) Alphabet. */
  public static FeatureVector newFeatureVector (FeatureVector fv, Alphabet newVocab, FeatureSelection fs)
  {
    assert (fs.getAlphabet() == fv.dictionary);
    if (fv.indices == null) {
      throw new UnsupportedOperationException("Not yet implemented for dense feature vectors.");
    }
    
    // this numLocations() method call ensures that AugmentableFeatureVectors have been compressed
    int fvNumLocations = fv.numLocations();


    int[] indices = new int[fvNumLocations];
    double[] values = null;
    // if feature vectors are binary
    if (fv.values != null) {
      values = new double[indices.length];
    }
    int size = 0;
    for (int index = 0; index < fvNumLocations; index++) {
      if (fs.contains(fv.indices[index])) {
        try{
          indices[size] = newVocab.lookupIndex(fv.dictionary.lookupObject(fv.indices[index]), true);
        } catch (Exception e) {
          System.out.println (e.toString());
        }
        // if feature vectors are binary
        if (fv.values != null) {
          values[size] = fv.values[index];
        }
        size++;
      }
    }
    return new FeatureVector (newVocab, indices, values, size, size, true, true, false);
  }


  // xxx We need to implement this in FeatureVector subclasses
  public ConstantMatrix cloneMatrix ()
  {
    return new FeatureVector ((Alphabet)dictionary, indices, values);
  }


  public ConstantMatrix cloneMatrixZeroed () {
    assert (values != null);
    if (indices == null)
      return new FeatureVector (dictionary, new double[values.length]);
    else {
      int[] newIndices = new int[indices.length];
      System.arraycopy (indices, 0, newIndices, 0, indices.length);
      return new FeatureVector (dictionary, newIndices, new double[values.length],
                                values.length, values.length, false, false, false);
    }
  }
  
  public String toString ()
  {
    return toString (false);
  }


    // CPAL - added this to output Feature vectors to a text file in a simple format
    public boolean toSimpFile (String FileName, int curdocNo, boolean printcounts)
  {
    //Thread.currentThread().dumpStack();
    StringBuffer sb = new StringBuffer ();
    //System.out.println ("FeatureVector toString dictionary="+dictionary);
    if (values == null) {
        //System.out.println ("FeatureVector toString values==null");
        int indicesLength = numLocations();
        for (int i = 0; i < indicesLength; i++) {
      //System.out.println ("FeatureVector toString i="+i);
      if (dictionary == null)
          sb.append ("["+i+"]");
      else {
          //System.out.println ("FeatureVector toString: i="+i+" index="+indices[i]);
          sb.append (dictionary.lookupObject(indices[i]).toString());
          //sb.append ("("+indices[i]+")");
      }
      //sb.append ("= 1.0 (forced binary)");
      //if (!onOneLine)
          sb.append ('\n');
      //else
      //    sb.append (' ');
        }
    } else {
        //System.out.println ("FeatureVector toString values!=null");
        int valuesLength = numLocations();
        for (int i = 0; i < valuesLength; i++) {
      int idx = indices == null ? i : indices[i];
      if (dictionary == null)
          sb.append ("["+i+"]");
      else {
          //sb.append (dictionary.lookupObject(idx).toString());
          //sb.append ("(" + idx +")");
                sb.append(curdocNo + " " + idx );
      }


      //sb.append ("=");
            // CPAL - optionally include the counts
            if (printcounts)
          sb.append (" " + values[i]);
      //if (!onOneLine)
          sb.append ("\n");
      //else
      //    sb.append (' ');
        }
    }
    //return sb.toString();
        String str = sb.toString();


        File myfile = new File(FileName);
        try{
            FileWriter out = new FileWriter(myfile,true); // true -> append to the file
            out.write(str);
            out.close();
        } catch (IOException e) {
            System.err.println("Feature Vector exception when trying to print a file");
        }


        return true;
  }




  public String toString (boolean onOneLine)
  {
    //Thread.currentThread().dumpStack();
    StringBuffer sb = new StringBuffer ();
    //System.out.println ("FeatureVector toString dictionary="+dictionary);
    if (values == null) {
        //System.out.println ("FeatureVector toString values==null");
        int indicesLength = numLocations();
        for (int i = 0; i < indicesLength; i++) {
      //System.out.println ("FeatureVector toString i="+i);
      if (dictionary == null)
          sb.append ("["+i+"]");
      else {
          //System.out.println ("FeatureVector toString: i="+i+" index="+indices[i]);
          sb.append (dictionary.lookupObject(indices[i]).toString());
          //sb.append ("("+indices[i]+")");
      }
      //sb.append ("= 1.0 (forced binary)");
      if (!onOneLine)
          sb.append ('\n');
      else
          sb.append (' ');
        }
    } else {
        //System.out.println ("FeatureVector toString values!=null");
        int valuesLength = numLocations();
        for (int i = 0; i < valuesLength; i++) {
      int idx = indices == null ? i : indices[i];
      if (dictionary == null)
          sb.append ("["+i+"]");
      else {
          sb.append (dictionary.lookupObject(idx).toString());
          sb.append ("(" + idx +")");
      }
      
      sb.append ("=");
      sb.append (values[i]);
      if (!onOneLine)
          sb.append ("\n");
      else
          sb.append (' ');
        }
    }
    return sb.toString();
  }


  public Alphabet getAlphabet ()
  {
    return dictionary;
  }
  
  public Alphabet[] getAlphabets()
  {
    return new Alphabet[] {dictionary};
  }
  
  public boolean alphabetsMatch (AlphabetCarrying object)
  {
    return dictionary.equals (object.getAlphabet());
  }


  public int location (Object entry)
  {
    if (dictionary == null)
      throw new IllegalStateException ("This FeatureVector has no dictionary.");
    int i = dictionary.lookupIndex (entry, false);
    if (i < 0)
      return -1;
    else
      return location (i);
  }


  public boolean contains (Object entry)
  {
    int loc = location(entry);
    return (loc >= 0 && valueAtLocation(loc) != 0);
  }


  public double value (Object o)
  {
    int loc = location (o);
    if (loc >= 0)
      return valueAtLocation (loc);
    else
      throw new IllegalArgumentException ("Object "+o+" is not a key in the dictionary.");
  }
  
  //Serialization


  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 0;


  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeObject (dictionary);
  }


  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    dictionary = (Alphabet) in.readObject();
  }


}
Source Code of cc.mallet.types.FeatureVector

Related Classes of cc.mallet.types.FeatureVector