Source Code of weka.filters.unsupervised.attribute.PartitionedMultiFilter

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * PartitionedMultiFilter.java
 * Copyright (C) 2006-2010 University of Waikato, Hamilton, New Zealand
 *
 */


package weka.filters.unsupervised.attribute;


import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.filters.AllFilter;
import weka.filters.Filter;
import weka.filters.SimpleBatchFilter;


import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;


/**
 <!-- globalinfo-start -->
 * A filter that applies filters on subsets of attributes and assembles the output into a new dataset. Attributes that are not covered by any of the ranges can be either retained or removed from the output.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -D
 *  Turns on output of debugging information.</pre>
 *
 * <pre> -F &lt;classname [options]&gt;
 *  A filter to apply (can be specified multiple times).</pre>
 *
 * <pre> -R &lt;range&gt;
 *  An attribute range (can be specified multiple times).
 *  For each filter a range must be supplied. 'first' and 'last'
 *  are valid indices. 'inv(...)' around the range denotes an
 *  inverted range.</pre>
 *
 * <pre> -U
 *  Flag for leaving unused attributes out of the output, by default
 *  these are included in the filter output.</pre>
 *
 <!-- options-end -->
 *
 * @author  FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 6971 $
 * @see     weka.filters.StreamableFilter
 */
public class PartitionedMultiFilter
  extends SimpleBatchFilter {


  /** Version identifier used for serialization. */
  private static final long serialVersionUID = -6293720886005713120L;


  /**
   * The filters to apply. The filter at index i is applied to the attributes
   * selected by the range at index i of m_Ranges. Defaults to a single
   * AllFilter.
   */
  protected Filter m_Filters[] = {new AllFilter()};


  /**
   * The attribute ranges, one per filter. Defaults to a single range
   * "first-last".
   */
  protected Range m_Ranges[] = {new Range("first-last")};


  /** Whether unused attributes are left out of the output. */
  protected boolean m_RemoveUnused = false;


  /**
   * The indices of the unused attributes, i.e., the non-class attributes
   * that are not covered by any of the ranges. Filled in by
   * determineUnusedIndices(Instances).
   */
  protected int[] m_IndicesUnused = new int[0];


  /**
   * Returns a string describing this filter.
   * @return     a description of the filter suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "A filter that applies filters on subsets of attributes and "
      + "assembles the output into a new dataset. Attributes that are "
      + "not covered by any of the ranges can be either retained or removed "
      + "from the output.";
  }


  /**
   * Returns an enumeration describing the available options.
   *
   * @return     an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector result = new Vector();
    Enumeration enm = super.listOptions();
    while (enm.hasMoreElements())
      result.add(enm.nextElement());


    result.addElement(new Option(
        "\tA filter to apply (can be specified multiple times).",
        "F", 1, "-F <classname [options]>"));


    result.addElement(new Option(
        "\tAn attribute range (can be specified multiple times).\n"
  + "\tFor each filter a range must be supplied. 'first' and 'last'\n"
  + "\tare valid indices. 'inv(...)' around the range denotes an\n"
  + "\tinverted range.",
        "R", 1, "-R <range>"));


    result.addElement(new Option(
        "\tFlag for leaving unused attributes out of the output, by default\n"
  + "\tthese are included in the filter output.",
        "U", 0, "-U"));


    return result.elements();
  }


  /**
   * Parses a list of options for this object. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -D
   *  Turns on output of debugging information.</pre>
   *
   * <pre> -F &lt;classname [options]&gt;
   *  A filter to apply (can be specified multiple times).</pre>
   *
   * <pre> -R &lt;range&gt;
   *  An attribute range (can be specified multiple times).
   *  For each filter a range must be supplied. 'first' and 'last'
   *  are valid indices. 'inv(...)' around the range denotes an
   *  inverted range.</pre>
   *
   * <pre> -U
   *  Flag for leaving unused attributes out of the output, by default
   *  these are included in the filter output.</pre>
   *
   <!-- options-end -->
   *
   * @param options   the list of options as an array of strings
   * @throws Exception   if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String  tmpStr;
    String  classname;
    String[]  options2;
    Vector  objects;
    Range  range;


    super.setOptions(options);


    setRemoveUnused(Utils.getFlag("U", options));


    objects = new Vector();
    while ((tmpStr = Utils.getOption("F", options)).length() != 0) {
      options2    = Utils.splitOptions(tmpStr);
      classname      = options2[0];
      options2[0] = "";
      objects.add(Utils.forName(Filter.class, classname, options2));
    }


    // at least one filter
    if (objects.size() == 0)
      objects.add(new AllFilter());


    setFilters((Filter[]) objects.toArray(new Filter[objects.size()]));


    objects = new Vector();
    while ((tmpStr = Utils.getOption("R", options)).length() != 0) {
      if (tmpStr.startsWith("inv(") && tmpStr.endsWith(")")) {
  range = new Range(tmpStr.substring(4, tmpStr.length() - 1));
  range.setInvert(true);
      }
      else {
  range = new Range(tmpStr);
      }
      objects.add(range);
    }


    // at least one Range
    if (objects.size() == 0)
      objects.add(new Range("first-last"));


    setRanges((Range[]) objects.toArray(new Range[objects.size()]));


    // is number of filters the same as ranges?
    checkDimensions();
  }


  /**
   * Gets the current settings of the filter.
   *
   * @return     an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector  result;
    String[]  options;
    int    i;
    String  tmpStr;


    result = new Vector();


    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);


    if (getRemoveUnused())
      result.add("-U");


    for (i = 0; i < getFilters().length; i++) {
      result.add("-F");
      result.add(getFilterSpec(getFilter(i)));
    }


    for (i = 0; i < getRanges().length; i++) {
      tmpStr = getRange(i).getRanges();
      if (getRange(i).getInvert())
  tmpStr = "inv(" + tmpStr + ")";
      result.add("-R");
      result.add(tmpStr);
    }


    return (String[]) result.toArray(new String[result.size()]);
  }


  /**
   * checks whether the dimensions of filters and ranges fit together.
   *
   * @throws Exception  if dimensions differ
   */
  protected void checkDimensions() throws Exception {
    if (getFilters().length != getRanges().length)
      throw new IllegalArgumentException(
    "Number of filters (= " + getFilters().length + ") "
    + "and ranges (= " + getRanges().length + ") don't match!");
  }


  /**
   * tests the data whether the filter can actually handle it.
   *
   * @param instanceInfo  the data to test
   * @throws Exception    if the test fails
   */
  protected void testInputFormat(Instances instanceInfo) throws Exception {
    for (int i = 0; i < getRanges().length; i++) {
      Instances newi = new Instances(instanceInfo, 0);
      if (instanceInfo.size() > 0){
  newi.add((Instance)instanceInfo.get(0).copy());
      }
      Range range = getRanges()[i];
      range.setUpper(instanceInfo.numAttributes() - 1);
      Instances subset = generateSubset(newi, range);
      getFilters()[i].setInputFormat(subset);
    }
  }


  /**
   * Sets whether unused attributes (ones that are not covered by any of the
   * ranges) are removed from the output.
   *
   * @param value  if true then the unused attributes get removed
   */
  public void setRemoveUnused(boolean value) {
    m_RemoveUnused = value;
  }


  /**
   * Gets whether unused attributes (ones that are not covered by any of the
   * ranges) are removed from the output.
   *
   * @return    true if unused attributes are removed
   */
  public boolean getRemoveUnused() {
    return m_RemoveUnused;
  }


  /**
   * Returns the tip text for this property.
   *
   * @return      tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
  public String removeUnusedTipText() {
    return
        "If true then unused attributes (ones that are not covered by any "
      + "of the ranges) will be removed from the output.";
  }


  /**
   * Sets the list of possible filters to choose from.
   * Also resets the state of the filter (this reset doesn't affect the
   * options).
   *
   * @param filters  an array of filters with all options set.
   * @see #reset()
   */
  public void setFilters(Filter[] filters) {
    m_Filters = filters;
    reset();
  }


  /**
   * Gets the list of possible filters to choose from.
   *
   * @return     the array of Filters
   */
  public Filter[] getFilters() {
    return m_Filters;
  }


  /**
   * Returns the tip text for this property.
   *
   * @return      tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
  public String filtersTipText() {
    return "The base filters to be used.";
  }


  /**
   * Gets a single filter from the set of available filters.
   *
   * @param index   the index of the filter wanted
   * @return     the Filter
   */
  public Filter getFilter(int index) {
    return m_Filters[index];
  }


  /**
   * returns the filter classname and the options as one string.
   *
   * @param filter  the filter to get the specs for
   * @return    the classname plus options
   */
  protected String getFilterSpec(Filter filter) {
    String        result;


    if (filter == null) {
      result = "";
    }
    else {
      result  = filter.getClass().getName();
      if (filter instanceof OptionHandler)
        result += " "
          + Utils.joinOptions(((OptionHandler) filter).getOptions());
    }


    return result;
  }


  /**
   * Sets the list of possible Ranges to choose from.
   * Also resets the state of the Range (this reset doesn't affect the
   * options).
   *
   * @param Ranges  an array of Ranges with all options set.
   * @see #reset()
   */
  public void setRanges(Range[] Ranges) {
    m_Ranges = Ranges;
    reset();
  }


  /**
   * Gets the list of possible Ranges to choose from.
   *
   * @return     the array of Ranges
   */
  public Range[] getRanges() {
    return m_Ranges;
  }


  /**
   * Returns the tip text for this property.
   *
   * @return      tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
  public String rangesTipText() {
    return "The attribute ranges to be used; 'inv(...)' denotes an inverted range.";
  }


  /**
   * Gets a single Range from the set of available Ranges.
   *
   * @param index   the index of the Range wanted
   * @return     the Range
   */
  public Range getRange(int index) {
    return m_Ranges[index];
  }


  /**
   * determines the indices of unused attributes (ones that are not covered
   * by any of the range).
   *
   * @param data  the data to base the determination on
   * @see     #m_IndicesUnused
   */
  protected void determineUnusedIndices(Instances data) {
    Vector<Integer>  indices;
    int      i;
    int      n;
    boolean    covered;


    // traverse all ranges
    indices = new Vector<Integer>();
    for (i = 0; i < data.numAttributes(); i++) {
      if (i == data.classIndex())
  continue;


      covered = false;
      for (n = 0; n < getRanges().length; n++) {
  if (getRanges()[n].isInRange(i)) {
    covered = true;
    break;
  }
      }


      if (!covered)
  indices.add(new Integer(i));
    }


    // create array
    m_IndicesUnused = new int[indices.size()];
    for (i = 0; i < indices.size(); i++)
      m_IndicesUnused[i] = indices.get(i).intValue();


    if (getDebug())
      System.out.println(
    "Unused indices: " + Utils.arrayToString(m_IndicesUnused));
  }


  /**
   * generates a subset of the dataset with only the attributes from the range
   * (class is always added if present).
   *
   * @param data  the data to work on
   * @param range  the range of attribute to use
   * @return    the generated subset
   * @throws Exception  if creation fails
   */
  protected Instances generateSubset(Instances data, Range range) throws Exception {
    Remove    filter;
    StringBuilder  atts;
    Instances    result;
    int[]    indices;
    int      i;


    // determine attributes
    indices = range.getSelection();
    atts    = new StringBuilder();
    for (i = 0; i < indices.length; i++) {
      if (i > 0)
  atts.append(",");
      atts.append("" + (indices[i] + 1));
    }
    if ((data.classIndex() > -1) && (!range.isInRange(data.classIndex())))
      atts.append("," + (data.classIndex() + 1));


    // setup filter
    filter = new Remove();
    filter.setAttributeIndices(atts.toString());
    filter.setInvertSelection(true);
    filter.setInputFormat(data);


    // generate output
    result = Filter.useFilter(data, filter);


    return result;
  }


  /**
   * renames all the attributes in the dataset (excluding the class if present)
   * by adding the prefix to the name.
   *
   * @param data  the data to work on
   * @param prefix  the prefix for the attributes
   * @return    a copy of the data with the attributes renamed
   * @throws Exception  if renaming fails
   */
  protected Instances renameAttributes(Instances data, String prefix) throws Exception {
    Instances      result;
    int        i;
    ArrayList<Attribute>  atts;


    // rename attributes
    atts = new ArrayList<Attribute>();
    for (i = 0; i < data.numAttributes(); i++) {
      if (i == data.classIndex())
  atts.add((Attribute) data.attribute(i).copy());
      else
  atts.add(data.attribute(i).copy(prefix + data.attribute(i).name()));
    }


    // create new dataset
    result = new Instances(data.relationName(), atts, data.numInstances());
    for (i = 0; i < data.numInstances(); i++) {
      result.add((Instance) data.instance(i).copy());
    }


    // set class if present
    if (data.classIndex() > -1)
      result.setClassIndex(data.classIndex());


    return result;
  }


  /**
   * Determines the output format from the full input dataset. As long as the
   * first batch is not finished, the format is assembled from, in this order:
   * the output attributes of each filter (class excluded, names prefixed with
   * "filtered-&lt;filter index&gt;-"), the unused attributes (names prefixed
   * with "unfiltered-", skipped if unused attributes are removed) and the
   * class attribute as last attribute, if a class is set. Once the first
   * batch is done, the current output format is returned.
   * <p/>
   * Side effects: recomputes the unused indices and (re-)initializes each
   * filter with its attribute subset.
   *
   * @param inputFormat     the input format to base the output format on
   * @return                the output format, or null if the first batch is
   *                        not done yet and the dataset holds no instances
   * @throws Exception      in case the determination goes wrong, e.g., the
   *                        number of filters and ranges differ
   * @see                   #hasImmediateOutputFormat()
   * @see                   #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    Instances       result;
    Instances      processed;
    int             i;
    int        n;
    ArrayList<Attribute>  atts;
    Attribute      att;


    if (!isFirstBatchDone()) {
      // we need the full dataset here, see process(Instances)
      if (inputFormat.numInstances() == 0)
  return null;


      checkDimensions();


      // determine unused indices
      determineUnusedIndices(inputFormat);


      atts = new ArrayList<Attribute>();
      for (i = 0; i < getFilters().length; i++) {
  // NOTE(review): this check repeats the condition of the enclosing if,
  // so it looks like it is always true here - confirm before simplifying
  if (!isFirstBatchDone()) {
    // generate subset
    processed = generateSubset(inputFormat, getRange(i));
    // set input format; if the filter cannot provide its output format
    // right away, push the subset through it to obtain one
    // NOTE(review): process(Instances) has already called setInputFormat
    // and useFilter on this filter before calling this method; presumably
    // this second initialization resets the filter - verify the effect
    // on subsequent batches
    if (!getFilter(i).setInputFormat(processed))
      Filter.useFilter(processed, getFilter(i));
  }


  // get output format
  processed = getFilter(i).getOutputFormat();


  // rename attributes, the filter index in the prefix keeps the output of
  // different filters apart
  processed = renameAttributes(processed, "filtered-" + i + "-");


  // add attributes (the class is skipped here and appended once at the end)
  for (n = 0; n < processed.numAttributes(); n++) {
    if (n == processed.classIndex())
      continue;
    atts.add((Attribute) processed.attribute(n).copy());
  }
      }


      // add unused attributes
      if (!getRemoveUnused()) {
  for (i = 0; i < m_IndicesUnused.length; i++) {
    att = inputFormat.attribute(m_IndicesUnused[i]);
    atts.add(att.copy("unfiltered-" + att.name()));
  }
      }


      // add class if present
      if (inputFormat.classIndex() > -1)
  atts.add((Attribute) inputFormat.classAttribute().copy());


      // generate new dataset, the class (if any) is the last attribute
      result = new Instances(inputFormat.relationName(), atts, 0);
      if (inputFormat.classIndex() > -1)
  result.setClassIndex(result.numAttributes() - 1);
    }
    else {
      // format was fixed with the first batch
      result = getOutputFormat();
    }


    return result;
  }


  /**
   * Processes the given data and returns the assembled result. One attribute
   * subset per range is generated and passed through the filter at the same
   * index; the filtered values, the values of the unused attributes (unless
   * they are removed) and the class value are then combined, instance by
   * instance, into new instances that are added to the output format.
   * <p/>
   * For the first batch, this also sets the upper limits of the ranges,
   * determines the unused indices, initializes the filters and sets the
   * output format.
   *
   * @param instances   the data to process
   * @return            the modified data
   * @throws Exception  in case the processing goes wrong; an
   *                    IllegalStateException is thrown if a filter changed
   *                    the number of instances
   * @see               #batchFinished()
   */
  protected Instances process(Instances instances) throws Exception {
    Instances    result;
    int            i;
    int      n;
    int      m;
    int      index;
    Instances[]    processed;
    Instance    inst;
    Instance    newInst;
    double[]    values;
    Vector    errors;


    if (!isFirstBatchDone()) {
      checkDimensions();


      // set upper limits
      for (i = 0; i < m_Ranges.length; i++)
  m_Ranges[i].setUpper(instances.numAttributes() - 1);


      // determine unused indices
      determineUnusedIndices(instances);
    }


    // pass data through all filters; the filters are only initialized with
    // the first batch
    processed = new Instances[getFilters().length];
    for (i = 0; i < getFilters().length; i++) {
      processed[i] = generateSubset(instances, getRange(i));
      if (!isFirstBatchDone())
  getFilter(i).setInputFormat(processed[i]);
      processed[i] = Filter.useFilter(processed[i], getFilter(i));
    }


    // set output format (can only be determined with full dataset, hence here)
    // NOTE(review): determineOutputFormat returns null for a first batch
    // without instances, in which case result is null from here on
    if (!isFirstBatchDone()) {
      result = determineOutputFormat(instances);
      setOutputFormat(result);
    }
    else {
      result = getOutputFormat();
    }


    // check whether all filters didn't change the number of instances;
    // the rows are combined by position below, so the counts have to match
    errors = new Vector();
    for (i = 0; i < processed.length; i++) {
      if (processed[i].numInstances() != instances.numInstances())
  errors.add(new Integer(i));
    }
    if (errors.size() > 0)
      throw new IllegalStateException(
    "The following filter(s) changed the number of instances: " + errors);


    // assemble data
    for (i = 0; i < instances.numInstances(); i++) {
      inst   = instances.instance(i);
      values = new double[result.numAttributes()];


      // filtered data, in filter order and without the class of each subset
      index = 0;
      for (n = 0; n < processed.length; n++) {
  for (m = 0; m < processed[n].numAttributes(); m++) {
    if (m == processed[n].classIndex())
      continue;
    values[index] = processed[n].instance(i).value(m);
    index++;
  }
      }


      // unused attributes
      if (!getRemoveUnused()) {
  for (n = 0; n < m_IndicesUnused.length; n++) {
    values[index] = inst.value(m_IndicesUnused[n]);
    index++;
  }
      }


      // class, stored in the last slot
      if (instances.classIndex() > -1)
  values[values.length - 1] = inst.value(instances.classIndex());


      // generate and add instance, keeping the weight and the sparse/dense
      // representation of the input instance
      if (inst instanceof SparseInstance)
  newInst = new SparseInstance(instances.instance(i).weight(), values);
      else
  newInst = new DenseInstance(instances.instance(i).weight(), values);
      result.add(newInst);
    }


    return result;
  }


  /**
   * Returns the revision string.
   *
   * @return    the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 6971 $");
  }


  /**
   * Main method for executing this class.
   *
   * @param args should contain arguments for the filter: use -h for help
   */
  public static void main(String[] args) {
    runFilter(new PartitionedMultiFilter(), args);
  }
}
Source Code of weka.filters.unsupervised.attribute.PartitionedMultiFilter

Related Classes of weka.filters.unsupervised.attribute.PartitionedMultiFilter