package de.lmu.ifi.dbs.elki.algorithm.clustering;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2012
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.DendrogramModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableRecordStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBID;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.DistanceUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.result.BasicResult;
import de.lmu.ifi.dbs.elki.result.OrderingFromDataStore;
import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.HierarchyHashmapList;
import de.lmu.ifi.dbs.elki.utilities.datastructures.hierarchy.ModifiableHierarchy;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterEqualConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
/**
* Efficient implementation of the Single-Link Algorithm SLINK of R. Sibson.
* <p>
* Reference: R. Sibson: SLINK: An optimally efficient algorithm for the
* single-link cluster method. <br>
* In: The Computer Journal 16 (1973), No. 1, p. 30-34.
* </p>
*
* @author Elke Achtert
* @param <O> the type of DatabaseObject the algorithm is applied on
* @param <D> the type of Distance used
*/
@Title("SLINK: Single Link Clustering")
@Description("Hierarchical clustering algorithm based on single-link connectivity.")
@Reference(authors = "R. Sibson", title = "SLINK: An optimally efficient algorithm for the single-link cluster method", booktitle = "The Computer Journal 16 (1973), No. 1, p. 30-34.", url = "http://dx.doi.org/10.1093/comjnl/16.1.30")
public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<O, D, Result> {
/**
 * The logger for this class.
 */
private static final Logging logger = Logging.getLogger(SLINK.class);
/**
 * Option ID for the number of clusters to extract from the dendrogram.
 *
 * NOTE(review): the option name and this field's documentation say
 * "minimum", but the description string says "maximum" - confirm which is
 * intended and align the wording.
 */
public static final OptionID SLINK_MINCLUSTERS_ID = OptionID.getOrCreateOptionID("slink.minclusters", "The maximum number of clusters to extract.");
/**
 * The values of the function Pi of the pointer representation: for each
 * object, the id of the object it points to ("parent" in the dendrogram).
 */
private WritableDataStore<DBID> pi;
/**
 * The values of the function Lambda of the pointer representation: for each
 * object, the distance at which it is merged into its pointer target.
 */
private WritableDataStore<D> lambda;
/**
 * Minimum number of clusters to extract; null means extract the full
 * hierarchy (see run()).
 */
private Integer minclusters;
/**
 * Constructor.
 *
 * @param distanceFunction Distance function to use for the single-link
 *        clustering
 * @param minclusters Minimum number of clusters to extract. Can be null,
 *        in which case the full hierarchy is extracted.
 */
public SLINK(DistanceFunction<? super O, D> distanceFunction, Integer minclusters) {
super(distanceFunction);
this.minclusters = minclusters;
}
/**
 * Performs the SLINK algorithm on the given database.
 *
 * Runs the incremental insertion steps 1-4 of Sibson's publication for every
 * object, building the pointer representation (pi, lambda), then extracts a
 * dendrogram from it.
 *
 * @param database Database to run on
 * @param relation Relation to process
 * @return Hierarchical clustering result, with the pointer representation
 *         (pi, lambda) and the cluster order attached as child results
 */
@SuppressWarnings("unchecked")
public Result run(Database database, Relation<O> relation) {
  DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
  Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass();
  // Shared record storage for the pointer representation: pi and lambda.
  WritableRecordStore store = DataStoreUtil.makeRecordStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, DBID.class, distCls);
  pi = store.getStorage(0, DBID.class);
  lambda = store.getStorage(1, distCls);
  // Temporary storage for M(i), used only within a single insertion step.
  WritableDataStore<D> m = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls);
  FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Clustering", relation.size(), logger) : null;
  // has to be an array for monotonicity reasons!
  ModifiableDBIDs processedIDs = DBIDUtil.newArray(relation.size());
  // apply the algorithm: insert one object at a time.
  for(DBID id : relation.iterDBIDs()) {
    step1(id);
    step2(id, processedIDs, distQuery, m);
    step3(id, processedIDs, m);
    step4(id, processedIDs);
    processedIDs.add(id);
    if(progress != null) {
      progress.incrementProcessed(logger);
    }
  }
  if(progress != null) {
    progress.ensureCompleted(logger);
  }
  // We don't need m anymore.
  m.destroy();
  // Build dendrogram. When no cluster count was requested, extract the full
  // hierarchy (one leaf per object).
  int minc = minclusters != null ? minclusters : relation.size();
  BasicResult result = extractClusters(relation.getDBIDs(), pi, lambda, minc);
  // Attach the pointer representation and the cluster order for inspection.
  result.addChildResult(new MaterializedRelation<DBID>("SLINK pi", "slink-order", TypeUtil.DBID, pi, processedIDs));
  result.addChildResult(new MaterializedRelation<D>("SLINK lambda", "slink-order", new SimpleTypeInformation<D>(distCls), lambda, processedIDs));
  result.addChildResult(new OrderingFromDataStore<D>("SLINK order", "slink-order", processedIDs, lambda));
  return result;
}
/**
 * First step: initialize the new object in the pointer representation with
 * P(n+1) = n+1 and L(n+1) = infinity.
 *
 * @param newID the id of the object to be inserted into the pointer
 *        representation
 */
private void step1(DBID newID) {
  // The new object initially points to itself: P(n+1) = n+1 ...
  pi.put(newID, newID);
  // ... at infinite merge distance: L(n+1) = infinity.
  final D inf = getDistanceFunction().getDistanceFactory().infiniteDistance();
  lambda.put(newID, inf);
}
/**
 * Second step: determine the pairwise distances from all objects in the
 * pointer representation to the new object with the specified id.
 *
 * @param newID the id of the object to be inserted into the pointer
 *        representation
 * @param processedIDs the already processed ids
 * @param distFunc Distance function to use
 * @param m temporary distance storage, filled with M(i) = dist(i, n+1)
 */
private void step2(DBID newID, DBIDs processedIDs, DistanceQuery<O, D> distFunc, WritableDataStore<D> m) {
  for(DBID other : processedIDs) {
    // M(i) = dist(i, n+1)
    final D dist = distFunc.distance(other, newID);
    m.put(other, dist);
  }
}
/**
 * Third step: determine the values for P and L.
 *
 * Core SLINK recurrence: for each previously inserted object, decide whether
 * its pointer is redirected to the new object, and propagate the minimum
 * distance upwards along the old pointer target.
 *
 * @param newID the id of the object to be inserted into the pointer
 *        representation
 * @param processedIDs the already processed ids
 * @param m temporary distance storage filled by step2
 */
private void step3(DBID newID, DBIDs processedIDs, WritableDataStore<D> m) {
  // for i = 1..n
  for(DBID cur : processedIDs) {
    final D curLambda = lambda.get(cur);
    final D curM = m.get(cur);
    final DBID target = pi.get(cur);
    // Read M(P(i)) before any update below may overwrite it.
    final D targetM = m.get(target);
    if(curLambda.compareTo(curM) < 0) {
      // L(i) < M(i): keep the old pointer, propagate M(i) upwards:
      // M(P(i)) = min { M(P(i)), M(i) }
      m.put(target, DistanceUtil.min(targetM, curM));
    }
    else {
      // L(i) >= M(i): the new object becomes the pointer target:
      // M(P(i)) = min { M(P(i)), L(i) }
      m.put(target, DistanceUtil.min(targetM, curLambda));
      // L(i) = M(i); P(i) = n+1
      lambda.put(cur, curM);
      pi.put(cur, newID);
    }
  }
}
/**
 * Fourth step: actualize the clusters if necessary.
 *
 * Redirects pointers to the new object wherever the pointer chain would
 * otherwise violate the monotonicity of the lambda values.
 *
 * @param newID the id of the current object
 * @param processedIDs the already processed ids
 */
private void step4(DBID newID, DBIDs processedIDs) {
  // for i = 1..n
  for(DBID cur : processedIDs) {
    final DBID target = pi.get(cur);
    // if L(i) >= L(P(i)), then P(i) = n+1
    if(lambda.get(cur).compareTo(lambda.get(target)) >= 0) {
      pi.put(cur, newID);
    }
  }
}
/**
 * Follow the pointer representation upwards from {@code id} to the
 * representative ("last") object of the cluster containing {@code id} when
 * the dendrogram is cut at distance {@code stopdist}.
 *
 * @param id starting object
 * @param stopdist cut distance; {@code null} means no cut was computed, and
 *        each object is its own representative
 * @param pi Pi store
 * @param lambda Lambda store
 * @return the last object of the cluster containing {@code id}
 */
private DBID lastObjectInCluster(DBID id, D stopdist, final DataStore<DBID> pi, final DataStore<D> lambda) {
  if(stopdist == null) {
    return id;
  }
  DBID currentID = id;
  // Follow the pointers while the merge distance is within the cut.
  // (<= 0 replaces the unidiomatic "< 1" on compareTo results.)
  while(lambda.get(currentID).compareTo(stopdist) <= 0) {
    DBID next = pi.get(currentID);
    if(next.equals(currentID)) {
      // Reached the root (points to itself). Guard against an endless loop
      // when stopdist equals the infinite distance.
      break;
    }
    currentID = next;
  }
  return currentID;
}
/**
 * Extract all clusters from the pi-lambda-representation.
 *
 * Determines a cut distance yielding (at least) minclusters clusters, groups
 * the objects into leaf clusters below that cut, and builds the dendrogram
 * above the leaves via {@link #root}.
 *
 * @param ids Object ids to process
 * @param pi Pi store
 * @param lambda Lambda store
 * @param minclusters Minimum number of clusters to extract
 *
 * @return Hierarchical clustering (dendrogram with a single root cluster)
 */
private Clustering<DendrogramModel<D>> extractClusters(DBIDs ids, final DataStore<DBID> pi, final DataStore<D> lambda, int minclusters) {
FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), logger) : null;
// stopdist: null means no cut - every object stays its own leaf.
D stopdist = null;
// sort by lambda
ArrayModifiableDBIDs order = DBIDUtil.newArray(ids);
order.sort(new CompareByLambda<D>(lambda));
// Scan backwards from the candidate cut position, skipping over groups of
// tied lambda values so the cut never splits a tie group.
int index = ids.size() - minclusters - 1;
while(index >= 0) {
if(lambda.get(order.get(index)).equals(lambda.get(order.get(index + 1)))) {
index--;
}
else {
stopdist = lambda.get(order.get(index));
break;
}
}
// extract the child clusters: map every object to the "last" object of its
// cluster, and record the largest in-cluster merge distance per cluster.
Map<DBID, ModifiableDBIDs> cluster_ids = new HashMap<DBID, ModifiableDBIDs>();
Map<DBID, D> cluster_distances = new HashMap<DBID, D>();
for(DBID id : ids) {
DBID lastObjectInCluster = lastObjectInCluster(id, stopdist, pi, lambda);
ModifiableDBIDs cluster = cluster_ids.get(lastObjectInCluster);
if(cluster == null) {
cluster = DBIDUtil.newArray();
cluster_ids.put(lastObjectInCluster, cluster);
}
cluster.add(id);
// Keep the maximum lambda below the cut as the cluster's merge distance.
D lambda_id = lambda.get(id);
if(stopdist != null && lambda_id.compareTo(stopdist) <= 0 && (cluster_distances.get(lastObjectInCluster) == null || lambda_id.compareTo(cluster_distances.get(lastObjectInCluster)) > 0)) {
cluster_distances.put(lastObjectInCluster, lambda_id);
}
// Progress logging
if(progress != null) {
progress.incrementProcessed(logger);
}
}
if(progress != null) {
progress.ensureCompleted(logger);
}
// build hierarchy
// NOTE(review): progress was already marked completed above, yet root()
// continues to increment it - confirm whether a separate progress object
// was intended for the hierarchy-building phase.
final Clustering<DendrogramModel<D>> dendrogram = new Clustering<DendrogramModel<D>>("Single-Link-Dendrogram", "slink-dendrogram");
ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier = new HierarchyHashmapList<Cluster<DendrogramModel<D>>>();
Cluster<DendrogramModel<D>> root = root(cluster_ids, cluster_distances, pi, lambda, hier, progress);
dendrogram.addCluster(root);
return dendrogram;
}
/**
 * Build the dendrogram tree above the extracted leaf clusters, processing
 * leaves in order of increasing merge distance (lambda) and connecting each
 * leaf's subtree to the subtree of its pointer target.
 *
 * @param cluster_ids map from each cluster's "last" object to its members
 * @param cluster_distances map from each cluster's "last" object to its
 *        merge distance (may contain null values)
 * @param pi Pi store
 * @param lambda Lambda store
 * @param hier hierarchy to populate with parent/child links
 * @param progress progress logging, may be null. NOTE(review): the caller
 *        already completed this progress before passing it in - confirm.
 * @return the root cluster of the dendrogram
 */
private Cluster<DendrogramModel<D>> root(Map<DBID, ModifiableDBIDs> cluster_ids, Map<DBID, D> cluster_distances, final DataStore<DBID> pi, final DataStore<D> lambda, ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier, FiniteProgress progress) {
// Trivial case: a single cluster is its own root.
if(cluster_ids.size() == 1) {
DBID id = cluster_ids.keySet().iterator().next();
String name = "cluster_" + id + "_" + cluster_distances.get(id);
return new Cluster<DendrogramModel<D>>(name, cluster_ids.get(id), new DendrogramModel<D>(cluster_distances.get(id)), hier);
}
// sort leafs by lambda
List<Pair<DBID, D>> leafs = new ArrayList<Pair<DBID, D>>(cluster_ids.size());
for(DBID id : cluster_ids.keySet()) {
leafs.add(new Pair<DBID, D>(id, lambda.get(id)));
}
// Order leaves ascending by lambda; null lambdas sort first.
Collections.sort(leafs, new Comparator<Pair<DBID, D>>() {
@Override
public int compare(Pair<DBID, D> o1, Pair<DBID, D> o2) {
D k1 = lambda.get(o1.first);
D k2 = lambda.get(o2.first);
if(k1 == null && k2 == null) {
return 0;
}
else if(k1 == null) {
return -1;
}
else if(k2 == null) {
return 1;
}
else {
return k1.compareTo(k2);
}
}
});
// create nodes of the dendrogram, merging in ascending lambda order.
Cluster<DendrogramModel<D>> parent = null;
Map<DBID, Cluster<DendrogramModel<D>>> nodes = new HashMap<DBID, Cluster<DendrogramModel<D>>>();
int nodeCount = 0;
int clusterCount = 0;
while(!leafs.isEmpty()) {
// left child: the leaf with the smallest remaining lambda.
Pair<DBID, D> leaf = leafs.remove(0);
DBID leftID = leaf.first;
Cluster<DendrogramModel<D>> left = nodes.get(leftID);
if(left == null) {
// String name = "cluster_" + leftID + "_" +
// cluster_distances.get(leftID);
String name = "cluster_" + (++clusterCount);
left = new Cluster<DendrogramModel<D>>(name, cluster_ids.get(leftID), new DendrogramModel<D>(cluster_distances.get(leftID)), hier);
nodes.put(leftID, left);
}
// right child: the cluster the left leaf's pointer targets.
DBID rightID = pi.get(leftID);
// Self-pointer marks the root of the pointer representation: done.
if(leftID.equals(rightID)) {
break;
}
Cluster<DendrogramModel<D>> right = nodes.get(rightID);
if(right == null) {
// String name = "cluster_" + rightID + "_" +
// cluster_distances.get(rightID);
String name = "cluster_" + (++clusterCount);
right = new Cluster<DendrogramModel<D>>(name, cluster_ids.get(rightID), new DendrogramModel<D>(cluster_distances.get(rightID)), hier);
nodes.put(rightID, right);
}
// parent: joins the topmost ancestors of both subtrees.
// String name = "node" + (++nodeCount) + "_" + leaf.second;
String name = "node_" + (++nodeCount);
parent = createParent(name, lastAncestor(left, hier), lastAncestor(right, hier), leaf.second, hier);
// Progress logging
if(progress != null) {
progress.incrementProcessed(logger);
}
}
// root = parent
return parent;
}
/**
 * Determines the (currently) last ancestor of the specified cluster by
 * walking up the hierarchy until a parentless cluster is reached.
 *
 * @param cluster the child
 * @param hier the cluster hierarchy
 * @return the topmost ancestor, or null if a node with more than one parent
 *         is encountered (which must not happen in a dendrogram)
 */
private Cluster<DendrogramModel<D>> lastAncestor(Cluster<DendrogramModel<D>> cluster, ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier) {
  // Iterative walk instead of recursion; same traversal, same results.
  Cluster<DendrogramModel<D>> current = cluster;
  while(true) {
    List<Cluster<DendrogramModel<D>>> parents = hier.getParents(current);
    if(parents.isEmpty()) {
      return current;
    }
    if(parents.size() > 1) {
      logger.warning("More than one parent in Single-Link dendrogram: " + current + " parents: " + parents);
      return null;
    }
    current = parents.get(0);
  }
}
/**
 * Create an inner dendrogram node joining two subtrees at a merge distance.
 *
 * @param name node label
 * @param leftChild first subtree to attach
 * @param rightChild second subtree to attach
 * @param distance merge distance stored in the node's model
 * @param hier hierarchy receiving the new parent/child links
 * @return the newly created parent node
 */
private Cluster<DendrogramModel<D>> createParent(String name, Cluster<DendrogramModel<D>> leftChild, Cluster<DendrogramModel<D>> rightChild, D distance, ModifiableHierarchy<Cluster<DendrogramModel<D>>> hier) {
  // Inner nodes carry no objects of their own - the members live in the
  // leaf clusters below; only the merge distance is stored here.
  final DendrogramModel<D> model = new DendrogramModel<D>(distance);
  final Cluster<DendrogramModel<D>> node = new Cluster<DendrogramModel<D>>(name, DBIDUtil.EMPTYDBIDS, model, hier);
  // Attach both children, left first.
  hier.add(node, leftChild);
  hier.add(node, rightChild);
  return node;
}
/**
 * Extract all clusters from the pi-lambda-representation.
 *
 * Alternative (currently unused) extraction: walks the objects in ascending
 * lambda order, maintaining partial cluster sets keyed by the pointer target,
 * and materializes a cluster level each time a new distance value is reached.
 *
 * @param ids Object ids to process
 * @param pi Pi store
 * @param lambda Lambda store
 * @param minclusters Minimum number of clusters to extract
 *
 * @return Hierarchical clustering, or null if the top level does not contain
 *         exactly one cluster
 */
@SuppressWarnings("unused")
private Clustering<Model> extractClusters_erich(DBIDs ids, final DataStore<DBID> pi, final DataStore<D> lambda, int minclusters) {
// extract a hierarchical clustering
ArrayModifiableDBIDs order = DBIDUtil.newArray(ids);
// sort by lambda
order.sort(new CompareByLambda<D>(lambda));
D curdist = null;
D stopdist = null;
// Cut distance: lambda at the position leaving minclusters merges undone.
if(minclusters < ids.size()) {
stopdist = lambda.get(order.get(ids.size() - minclusters));
}
ModifiableHierarchy<Cluster<Model>> hier = new HierarchyHashmapList<Cluster<Model>>();
Map<DBID, Cluster<Model>> clusters = new HashMap<DBID, Cluster<Model>>();
// Partial cluster member sets, keyed by the current pointer target.
Map<DBID, ModifiableDBIDs> cids = new HashMap<DBID, ModifiableDBIDs>();
FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), logger) : null;
for(DBID cur : order) {
DBID dest = pi.get(cur);
D l = lambda.get(cur);
// logger.debugFine("DBID " + cur.toString() + " dist: " + l.toString());
if(stopdist != null && stopdist.compareTo(l) > 0) {
// Below the cut: merge cur's partial set into dest's partial set.
ModifiableDBIDs curset = cids.remove(cur);
ModifiableDBIDs destset = cids.get(dest);
if(destset == null) {
if(curset != null) {
destset = curset;
}
else {
destset = DBIDUtil.newHashSet();
destset.add(cur);
}
destset.add(dest);
cids.put(dest, destset);
}
else {
if(curset != null) {
destset.addDBIDs(curset);
}
else {
destset.add(cur);
}
}
curdist = l;
}
else {
if(curdist == null || l.compareTo(curdist) > 0) {
// New distance level reached. Post-process the current objects
for(Entry<DBID, ModifiableDBIDs> ent : cids.entrySet()) {
DBID key = ent.getKey();
ModifiableDBIDs clusids = ent.getValue();
// Make a new cluster
String cname = "Cluster_" + key.toString() + "_" + curdist.toString();
Cluster<Model> cluster = new Cluster<Model>(cname, clusids, ClusterModel.CLUSTER, hier);
// Collect child clusters and clean up the cluster ids, keeping only
// "new" objects.
Iterator<DBID> iter = clusids.iterator();
while(iter.hasNext()) {
DBID child = iter.next();
Cluster<Model> chiclus = clusters.get(child);
if(chiclus != null) {
hier.add(cluster, chiclus);
clusters.remove(child);
iter.remove();
}
}
clusters.put(key, cluster);
}
if(logger.isDebuggingFine()) {
StringBuffer buf = new StringBuffer();
buf.append("Number of clusters at depth ");
buf.append((curdist != null ? curdist.toString() : "null"));
buf.append(": ").append(clusters.size()).append(" ");
buf.append("last-objects:");
for(DBID id : clusters.keySet()) {
buf.append(" ").append(id.toString());
}
logger.debugFine(buf.toString());
}
cids.clear();
curdist = l;
}
// Add the current object to the destinations cluster
{
ModifiableDBIDs destset = cids.get(dest);
if(destset == null) {
destset = DBIDUtil.newHashSet();
cids.put(dest, destset);
destset.add(dest);
}
destset.add(cur);
}
}
// Progress logging
if(progress != null) {
progress.incrementProcessed(logger);
}
}
if(progress != null) {
progress.ensureCompleted(logger);
}
// There should be one cluster remaining at infinite distance...
if(clusters.size() != 1) {
logger.warning("Single-link is expected to have a single cluster at the top level!");
return null;
}
final Clustering<Model> clustering = new Clustering<Model>("Single-Link-Clustering", "slink-clustering");
// FIXME: validate this is works correctly for a single-object dataset!
for(Cluster<Model> cluster : clusters.values()) {
clustering.addCluster(cluster);
}
return clustering;
}
/**
 * Input type: whatever the configured distance function accepts.
 */
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
}
/**
 * Class logger, for the progress logging in the superclass.
 */
@Override
protected Logging getLogger() {
return logger;
}
/**
 * Order a DBID collection by the lambda value.
 *
 * @author Erich Schubert
 *
 * @apiviz.exclude
 *
 * @param <D> Distance type
 */
private static final class CompareByLambda<D extends Distance<D>> implements Comparator<DBID> {
  /**
   * Lambda storage the sort keys are read from.
   */
  private final DataStore<D> lambda;

  /**
   * Constructor.
   *
   * @param lambda Lambda storage
   */
  protected CompareByLambda(DataStore<D> lambda) {
    this.lambda = lambda;
  }

  @Override
  public int compare(DBID id1, DBID id2) {
    // Every id is expected to have a lambda value assigned.
    final D v1 = lambda.get(id1);
    final D v2 = lambda.get(id2);
    assert (v1 != null);
    assert (v2 != null);
    return v1.compareTo(v2);
  }
}
/**
 * Parameterization class.
 *
 * @author Erich Schubert
 *
 * @apiviz.exclude
 */
public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
  /**
   * Number of clusters to extract; stays null when the option is not given.
   */
  protected Integer minclusters = null;

  @Override
  protected void makeOptions(Parameterization config) {
    super.makeOptions(config);
    // Optional parameter; must be at least 1 when specified.
    final IntParameter param = new IntParameter(SLINK_MINCLUSTERS_ID, new GreaterEqualConstraint(1), true);
    if(config.grab(param)) {
      minclusters = param.getValue();
    }
  }

  @Override
  protected SLINK<O, D> makeInstance() {
    return new SLINK<O, D>(distanceFunction, minclusters);
  }
}
}