Source Code of de.lmu.ifi.dbs.elki.evaluation.histogram.ComputeOutlierHistogram$Parameterizer

package de.lmu.ifi.dbs.elki.evaluation.histogram;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures


Copyright (C) 2011
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team


This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.


This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.


You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;


import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.distance.distancefunction.EuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.AggregatingHistogram;
import de.lmu.ifi.dbs.elki.math.FlexiHistogram;
import de.lmu.ifi.dbs.elki.result.HierarchicalResult;
import de.lmu.ifi.dbs.elki.result.HistogramResult;
import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.result.ResultUtil;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.PatternParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
import de.lmu.ifi.dbs.elki.utilities.scaling.IdentityScaling;
import de.lmu.ifi.dbs.elki.utilities.scaling.ScalingFunction;
import de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction;


/**
 * Compute a Histogram to evaluate a ranking algorithm.
 * 
 * The parameter {@code -hist.positive} specifies the class label of "positive"
 * hits.
 * 
 * @author Lisa Reichert
 * @author Erich Schubert
 * 
 * @apiviz.landmark
 * @apiviz.has ScalingFunction
 * @apiviz.has HistogramResult oneway - - «create»
 */
public class ComputeOutlierHistogram implements Evaluator {
  /**
   * Logger for debugging.
   */
  protected static final Logging logger = Logging.getLogger(ComputeOutlierHistogram.class);


  /**
   * The object pattern to identify positive classes
   * <p>
   * Key: {@code -comphist.positive}
   * </p>
   */
  public static final OptionID POSITIVE_CLASS_NAME_ID = OptionID.getOrCreateOptionID("comphist.positive", "Class label for the 'positive' class.");


  /**
   * number of bins for the histogram
   * <p>
   * Default value: {@link EuclideanDistanceFunction}
   * </p>
   * <p>
   * Key: {@code -comphist.bins}
   * </p>
   */
  public static final OptionID BINS_ID = OptionID.getOrCreateOptionID("comphist.bins", "number of bins");


  /**
   * Parameter to specify a scaling function to use.
   * <p>
   * Key: {@code -comphist.scaling}
   * </p>
   */
  public static final OptionID SCALING_ID = OptionID.getOrCreateOptionID("comphist.scaling", "Class to use as scaling function.");


  /**
   * Flag to count frequencies of outliers and non-outliers separately
   * <p>
   * Key: {@code -histogram.splitfreq}
   * </p>
   */
  public static final OptionID SPLITFREQ_ID = OptionID.getOrCreateOptionID("histogram.splitfreq", "Use separate frequencies for outliers and non-outliers.");


  /**
   * Stores the "positive" class.
   */
  private Pattern positiveClassName = null;


  /**
   * Number of bins
   */
  private int bins;


  /**
   * Scaling function to use
   */
  private ScalingFunction scaling;


  /**
   * Flag to make split frequencies
   */
  private boolean splitfreq = false;


  /**
   * Constructor.
   * 
   * @param positive_class_name Class name
   * @param bins Bins
   * @param scaling Scaling
   * @param splitfreq Scale inlier and outlier frequencies independently
   */
  public ComputeOutlierHistogram(Pattern positive_class_name, int bins, ScalingFunction scaling, boolean splitfreq) {
    super();
    this.positiveClassName = positive_class_name;
    this.bins = bins;
    this.scaling = scaling;
    this.splitfreq = splitfreq;
  }


  /**
   * Evaluate a single outlier result as histogram.
   * 
   * @param database Database to process
   * @param or Outlier result
   * @return Result
   */
  public HistogramResult<DoubleVector> evaluateOutlierResult(Database database, OutlierResult or) {
    if(scaling instanceof OutlierScalingFunction) {
      OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
      oscaling.prepare(or);
    }


    ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
    DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);
    // first value for outliers, second for each object
    final AggregatingHistogram<Pair<Double, Double>, Pair<Double, Double>> hist;
    // If we have useful (finite) min/max, use these for binning.
    double min = scaling.getMin();
    double max = scaling.getMax();
    if(Double.isInfinite(min) || Double.isNaN(min) || Double.isInfinite(max) || Double.isNaN(max)) {
      hist = FlexiHistogram.DoubleSumDoubleSumHistogram(bins);
    }
    else {
      hist = AggregatingHistogram.DoubleSumDoubleSumHistogram(bins, min, max);
    }
    // first fill histogram only with values of outliers
    Pair<Double, Double> positive, negative;
    if(!splitfreq) {
      positive = new Pair<Double, Double>(0., 1. / ids.size());
      negative = new Pair<Double, Double>(1. / ids.size(), 0.);
    }
    else {
      positive = new Pair<Double, Double>(0., 1. / outlierIds.size());
      negative = new Pair<Double, Double>(1. / (ids.size() - outlierIds.size()), 0.);
    }
    ids.removeDBIDs(outlierIds);
    // fill histogram with values of each object
    for(DBID id : ids) {
      double result = or.getScores().get(id);
      result = scaling.getScaled(result);
      hist.aggregate(result, negative);
    }
    for(DBID id : outlierIds) {
      double result = or.getScores().get(id);
      result = scaling.getScaled(result);
      hist.aggregate(result, positive);
    }


    // turn into Collection


    Collection<DoubleVector> collHist = new ArrayList<DoubleVector>(hist.getNumBins());
    for(Pair<Double, Pair<Double, Double>> ppair : hist) {
      Pair<Double, Double> data = ppair.getSecond();
      DoubleVector row = new DoubleVector(new double[] { ppair.getFirst(), data.getFirst(), data.getSecond() });
      collHist.add(row);
    }
    return new HistogramResult<DoubleVector>("Outlier Score Histogram", "outlier-histogram", collHist);
  }


  @Override
  public void processNewResult(HierarchicalResult baseResult, Result result) {
    final Database db = ResultUtil.findDatabase(baseResult);
    List<OutlierResult> ors = ResultUtil.filterResults(result, OutlierResult.class);
    if(ors == null || ors.size() <= 0) {
      // logger.warning("No outlier results found for "+ComputeOutlierHistogram.class.getSimpleName());
      return;
    }


    for(OutlierResult or : ors) {
      db.getHierarchy().add(or, evaluateOutlierResult(db, or));
    }
  }


  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Stores the "positive" class.
     */
    protected Pattern positiveClassName = null;


    /**
     * Number of bins
     */
    protected int bins;


    /**
     * Scaling function to use
     */
    protected ScalingFunction scaling;


    /**
     * Flag to make split frequencies
     */
    protected boolean splitfreq = false;


    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      PatternParameter positiveClassNameP = new PatternParameter(POSITIVE_CLASS_NAME_ID, true);
      if(config.grab(positiveClassNameP)) {
        positiveClassName = positiveClassNameP.getValue();
      }


      IntParameter binsP = new IntParameter(BINS_ID, new GreaterConstraint(1), 50);
      if(config.grab(binsP)) {
        bins = binsP.getValue();
      }


      ObjectParameter<ScalingFunction> scalingP = new ObjectParameter<ScalingFunction>(SCALING_ID, ScalingFunction.class, IdentityScaling.class);
      if(config.grab(scalingP)) {
        scaling = scalingP.instantiateClass(config);
      }


      Flag splitfreqF = new Flag(SPLITFREQ_ID);
      if(config.grab(splitfreqF)) {
        splitfreq = splitfreqF.getValue();
      }


    }


    @Override
    protected ComputeOutlierHistogram makeInstance() {
      return new ComputeOutlierHistogram(positiveClassName, bins, scaling, splitfreq);
    }
  }
}
Source Code of de.lmu.ifi.dbs.elki.evaluation.histogram.ComputeOutlierHistogram$Parameterizer

Related Classes of de.lmu.ifi.dbs.elki.evaluation.histogram.ComputeOutlierHistogram$Parameterizer