Package edu.ucla.sspace.clustering

Source Code of edu.ucla.sspace.clustering.NeighborChainAgglomerativeClustering$Link

/*
* Copyright (c) 2012, Lawrence Livermore National Security, LLC. Produced at
* the Lawrence Livermore National Laboratory. Written by Keith Stevens,
* kstevens@cs.ucla.edu OCEC-10-073 All rights reserved.
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.similarity.CosineSimilarity;
import edu.ucla.sspace.similarity.SimilarityFunction;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SymmetricMatrix;
import edu.ucla.sspace.vector.DoubleVector;

import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Properties;
import java.util.Set;


/**
* A Nearest Neighbor Chain Agglomerative Clustering implementation.  This
* approach builds large chains starting from a random node of nearest
* neighbors.  As soon as a recirocal nearest neighbor (RNN) is found, the pair
* is merged.  This approach find an exact solution and runs in O(n^2) time.  It
* also uses O(n^2) space to represent an adjacency matrix based on a similar
* function.
*
* </p>
*
* Currently, this algorithm only implements the UGPMA agglomerative criteria.
* However, it does use any symmetric {@link SimilarityFunction}.
*
* </p>
*
* This implementation is based on the following paper:
*
*   <li style="font-family:Garamond, Georgia, serif">Murtagh, Fionn and
*   Contreras, PedroM. Algorithms for hierarchical clustering: an overview.  In
*   <i>Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery</i>
*   Available <a
*   href="http://onlinelibrary.wiley.com/doi/10.1002/widm.53/full">here</a>
*   </li>
*
* @author Keith Stevens
*/
public class NeighborChainAgglomerativeClustering implements Clustering {

    /**
     * The similarity method used when comparing and merging compelte clusters.
     * See <a
     * href="http://home.dei.polimi.it/matteucc/Clustering/tutorial_html/hierarchical.html">this
     * guide</a> for examples of how each link method operates.
     */
    public enum ClusterLink {
        /**
         * Similarity will be based on the most similar item pair connecting
         * two clusters.
         */
        SINGLE_LINK,

        /**
         * Similarity will be based on most dissimilar item pair connecting
         * two clusters.
         */
        COMPLETE_LINK,

        /**
         * Similarity will be the average similarity of all item pairs
         * connecting two clusters.
         */
        MEAN_LINK,

        /**
         * Similarity will be based on median points for each cluster.
         */
        MEDIAN_LINK,

        /**
         * Similarity will be the unweighted average similarity of all item
         * pairs connecting two clusters.
         */
        MCQUITTY_LINK,
    }

    private final ClusterLink method;

    private final SimilarityFunction simFunc;
   
    /**
     * Creates a {@link NeighborChainAgglomerativeClustering} with no minimum
     * similarity threshold.  When using this constructor, you must specifcy the
     * number of clusters or the results will be undefined.
     */
    public NeighborChainAgglomerativeClustering() {
        this(ClusterLink.MEAN_LINK, new CosineSimilarity());
    }

    /**
     * Creates a {@link NeighborChainAgglomerativeClustering} with the specified
     * minimum similarity threshold.  When clustering without specifying the
     * number of clusters, clustering stops when the most similar clusters have
     * a similarity below this threshold.
     */
    public NeighborChainAgglomerativeClustering(ClusterLink method,
                                                SimilarityFunction simFunc) {
        if (!simFunc.isSymmetric())
            throw new IllegalArgumentException(
                    "Agglomerative Clustering requires a symmetric " +
                    "similarity function");

        this.method = method;
        this.simFunc = simFunc;
    }

    /**
     * Unsupported
     *
     * @throws UnsupportedOperationException
     */
    public Assignments cluster(Matrix m, Properties props) {
        throw new UnsupportedOperationException(
                "Cannot cluster without specifying the number of clusters.");
    }

    /**
     * Returns the agglomerative clustering result using {@link ClusterLink} and
     * {@code SimilarityFunction} specified from the constructor.  The {@link
     * SimilarityFunction} will be used to build a symmetric adjacency matrix
     * and the {@link ClusterLink} will determine how to udpate similarities
     * between newly nerged clusters.
     */
    public Assignments cluster(Matrix m, int numClusters, Properties props) {
        Matrix adj = new SymmetricMatrix(m.rows(), m.rows());
        for (int r = 0; r < m.rows(); ++r) {
            DoubleVector v = m.getRowVector(r);
            for (int c = r+1; c < m.rows(); ++c)
                adj.set(r,c, simFunc.sim(v, m.getRowVector(c)));
        }
        return clusterAdjacencyMatrix(adj, method, numClusters);
    }

    /**
     * Clusters the points represented as an adjacency matrix in {@code adj}
     * using the supplied {@code ClusterLink} method into {@code numCluster}.
     *
     * @param adj A symmetric {@link Matrix} recording similarities between
     *        nodes in a graph.
     * @param method {@link ClusterLink} method that determines how similarities
     *        between clusters will be updated.
     * @param numClusters The desired number of clusters.
     *
     * @return {@link Assignments} from each data point to it's assigned
     *         cluster.
     */
    public static Assignments clusterAdjacencyMatrix(Matrix adj,
                                                     ClusterLink method,
                                                     int numClusters) {
        // A mapping from cluster id's to their point sets.
        Map<Integer, Set<Integer>> clusterMap =
            new HashMap<Integer, Set<Integer>>();

        // A set of clusters to be considered for merging.
        Set<Integer> remaining = new HashSet<Integer>();

        // Create a cluster for every data point and add it to the cluster map
        // and to the examine set.
        for (int r = 0; r < adj.rows(); ++r) {
            remaining.add(r);
            Set<Integer> points = new HashSet<Integer>();
            points.add(r);
            clusterMap.put(r, points);
        }

        // A stack of the nearest neighbor chains.  The tuple stores the id of the
        // current node and the similarity from this node to it's parent in the
        // chain.
        Deque<Link> chain = new LinkedList<Link>();

        // Initializes the chain.
        initializeChain(chain, remaining);

        while (clusterMap.size() > numClusters) {
            // Get the last link in the chain.
            Link top = chain.peek();

            // Find the nearest neighbor using the clusters not in the chain
            // already.
            Link best = findBest(remaining, adj, top.x);

            // Check the similarity for the best neighbor and compare it to that of
            // the current node in the chain.  If the neighbor sim is larger, then
            // the current node and it's parent aren't RNNs.  Otherwise, the current
            // node is RNNs with it's parent.
            if (best.sim > top.sim) {
                // Not RNNs.  So push the best node and it's similarity to the
                // current node onto the chain and remove it from the examine set.
                chain.push(best);
                remaining.remove(best.x);
            } else {
                // Yes top two on stack are RNNs.
               
                // Pop the current node from the top.
                chain.pop();

                // Pop the parent of the best node.
                Link parent = chain.pop();

                // Get the current candidates and remove them from the cluster
                // map.
                Set<Integer> c1Points = clusterMap.get(top.x);
                int c1Size = c1Points.size();
                Set<Integer> c2Points = clusterMap.get(parent.x);
                int c2Size = c2Points.size();
                clusterMap.remove(top.x);
                clusterMap.remove(parent.x);

                // Update the distance from the parent id to all other elements
                // in the cluster map.
                for (int o : clusterMap.keySet())
                    adj.set(parent.x, o,
                            updateSimilarity(method, adj, top.x, c1Size, parent.x, c2Size, 0));

                // Merge the set assignments for the two clusters.
                c1Points.addAll(c2Points);
                // Replace the mapping from parent to now point to the merged cluster.
                clusterMap.put(parent.x, c1Points);

                // Add the id for the new cluster (now using parent's id) into
                // the remaining set.
                remaining.add(parent.x);

                // If the chain is now empty, re-initialize it.
                if (chain.size() == 0)
                    initializeChain(chain, remaining);
            }
        }

        return formAssignments(clusterMap.values(), adj.rows());
    }

    /**
     * Returns the updated similarity between a newly formed cluster containing
     * clusters {@code c1Index} and {@code c2Index} to the cluster {@code
     * otherIndex}.  These similarity updates use the Lance-Williams recurrence
     * relations which hold for both similarity metrics and distance metrics.
     */
    private static double updateSimilarity(ClusterLink method,
                                           Matrix adj,
                                           int c1Index,
                                           int c1Size,
                                           int c2Index,
                                           int c2Size,
                                           int otherIndex) {
        double s1 = adj.get(c1Index, otherIndex);
        double s2 = adj.get(c2Index, otherIndex);
        switch (method) {
            case MEAN_LINK:
                return (c1Size*s1 + c2Size*s2) / (c1Size + c2Size);
            case MEDIAN_LINK:
                return .5*s1 + .5 * s2 - .25*adj.get(c1Index, c2Index);
            case SINGLE_LINK:
                return .5*s1 + .5*s2 + Math.min(s1, s2);
            case COMPLETE_LINK:
                return .5*s1 + .5*s2 + Math.max(s1, s2);
            case MCQUITTY_LINK:
                return .5 * s1 + .5*s2;
            default: throw new IllegalArgumentException(
                             "Unsupported ClusterLink Method");
        }
    }

    /**
     * Initializes the neighbor {@code chain} with the first element in {@code
     * remaining} and removes that elment from the set.
     */
    private static void initializeChain(Deque<Link> chain,
                                        Set<Integer> remaining) {
        Iterator<Integer> iter = remaining.iterator();
        chain.push(new Link(-Double.MAX_VALUE, iter.next()));
        iter.remove();
    }

    /**
     * Returns a {@link Link} representing the nearest cluster in {@code
     * remaining} to {@code cluster} and the similarity between the two clusters
     * as defined in {@code adj}.
     */
    private static Link findBest(Set<Integer> remaining,
                                 Matrix adj,
                                 int cluster) {
        int best = -1;
        double sim = -Double.MAX_VALUE;
        for (int i : remaining) {
            double s = adj.get(cluster, i);
            if (s > sim) {
                best = i;
                sim = s;
            }
        }

        return new Link(sim, best);
    }

    /**
     * Returns the {@link Assignments} corresponding to the set of clusters and
     * number of data points.
     */
    public static Assignments formAssignments(Collection<Set<Integer>> clusters,
                                              int numPoints) {
        int cid = 0;
        Assignment[] assignments = new Assignment[numPoints];
        for (Set<Integer> cluster : clusters) {
            for (int point : cluster)
                assignments[point] = new HardAssignment(cid);
            cid++;
        }
        return new Assignments(clusters.size(), assignments, null);
    }

    /**
     * A simple struct for storing links between two clusters.
     */
    public static class Link {

        /**
         * The similarity between the current cluster and it's parent in the
         * chain.
         */
        public double sim;

        /**
         * The id of the current cluster.
         */
        public int x;

        /**
         * Creates a new {@link Link}.
         */
        public Link(double sim, int x) {
            this.sim = sim;
            this.x = x;
        }
    }
}
TOP

Related Classes of edu.ucla.sspace.clustering.NeighborChainAgglomerativeClustering$Link

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.