Package edu.ucla.sspace.util

Source Code of edu.ucla.sspace.util.SimpleNearestNeighborFinder

/*
* Copyright 2012 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.util;

import edu.ucla.sspace.common.SemanticSpace;

import edu.ucla.sspace.similarity.CosineSimilarity;
import edu.ucla.sspace.similarity.SimilarityFunction;

import edu.ucla.sspace.util.BoundedSortedMultiMap;
import edu.ucla.sspace.util.MultiMap;
import edu.ucla.sspace.util.SortedMultiMap;
import edu.ucla.sspace.util.WorkQueue;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.Vector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;

import java.util.logging.Level;
import java.util.logging.Logger;

import static edu.ucla.sspace.util.LoggerUtil.verbose;
import static edu.ucla.sspace.util.LoggerUtil.info;


/**
* A native {@link NearestNeighborFinder} implemetnation for finding the {@code
* k} most-similar words to a provided word in a {@link SemanticSpace} using a
* brute-force search.  The comparisons required for generating the list maybe
* be run in parallel by configuring an instance of this class to use multiple
* threads.
*
* <p> All instances of this class are thread-safe.
*
* @author David Jurgens
*/
public class SimpleNearestNeighborFinder implements NearestNeighborFinder {

    /**
     * The logger to which clustering status updates will be written.
     */
    private static final Logger LOGGER =
        Logger.getLogger(SimpleNearestNeighborFinder.class.getName());

    /**
     * The work queue where concurrent jobs are run during the nearest-neighbot
     * searching process
     */
    private final WorkQueue workQueue;

    /**
     * The semantic space from which the principle vectors are derived
     */
    private final SemanticSpace sspace;   
   
    /**
     * The similarity function used to compare words
     */
    private final SimilarityFunction simFunc;

    /**
     * Creates this {@code SimpleNearestNeighborFinder} with as many threads as
     * processors, and using the cosine similarity for word comparisons.
     */
    public SimpleNearestNeighborFinder(SemanticSpace sspace) {
        this(sspace, new CosineSimilarity(),
             Runtime.getRuntime().availableProcessors());
    }
   
    /**
     * Creates this {@code SimpleNearestNeighborFinder} with the specified
     * number of threads, using the cosine similarity for word comparisons.
     */
    public SimpleNearestNeighborFinder(SemanticSpace sspace, int numThreads) {
        this(sspace, new CosineSimilarity(), numThreads);
    }

    /**
     * Creates this {@code SimpleNearestNeighborFinder} with as many threads as
     * processings and using the provided similarity function to compare words.
     */
    public SimpleNearestNeighborFinder(SemanticSpace sspace, SimilarityFunction similarity) {
        this(sspace, similarity, Runtime.getRuntime().availableProcessors());
    }

    /**
     * Creates this {@code SimpleNearestNeighborFinder} with the specified
     * number of threads and using the provided similarity function to compare
     * words.
     */
    public SimpleNearestNeighborFinder(SemanticSpace sspace,
                                       SimilarityFunction similarity,
                                       int numThreads) {
        this.sspace = sspace;
        this.simFunc = similarity;
        this.workQueue = WorkQueue.getWorkQueue(numThreads);
    }

    /**
     * {@inheritDoc}
     */
    public SortedMultiMap<Double,String> getMostSimilar(
            String word, int numberOfSimilarWords) {

        Vector v = sspace.getVector(word);
        // if the semantic space did not have the word, then return null
        if (v == null)
            return null;
        // Find the most similar words vectors to this word's vector, which will
        // end up including the word itself.  Therefore, increase the count by
        // one and remove the word's vector after it finished.
        SortedMultiMap<Double,String> mostSim =
            getMostSimilar(v, numberOfSimilarWords + 1);
        Iterator<Map.Entry<Double,String>> iter = mostSim.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry<Double,String> e = iter.next();
            if (word.equals(e.getValue())) {
                iter.remove();
                break;
            }
        }
        return mostSim;
    }

    /**
     * {@inheritDoc}
     */
    public SortedMultiMap<Double,String> getMostSimilar(
             Set<String> terms, int numberOfSimilarWords) {
        if (terms.isEmpty())
            return null;
        // Compute the mean vector for all the terms
        DoubleVector mean = new DenseVector(sspace.getVectorLength());
        int found = 0;
        for (String term : terms) {
            Vector v = sspace.getVector(term);
            if (v == null)
                info(LOGGER, "No vector for term " + term);
            else {
                VectorMath.add(mean, v);
                found++;
            }
        }
        // If none of the provided vectors were in the space, then return null
        if (found == 0)
            return null;

        // Find the most similar words vectors to the mean vector.  There is
        // some chance that the returned list could includes the terms
        // themselves.  Therefore, increase the count by the number of terms
        // (which is the worst case) and remove the terms after it finished.
        SortedMultiMap<Double,String> mostSim =
            getMostSimilar(mean, numberOfSimilarWords + terms.size());
        Iterator<Map.Entry<Double,String>> iter = mostSim.entrySet().iterator();
        Set<Map.Entry<Double,String>> toRemove =
            new HashSet<Map.Entry<Double,String>>();
        while (iter.hasNext()) {
            Map.Entry<Double,String> e = iter.next();
            if (terms.contains(e.getValue()))
                toRemove.add(e);
        }
        for (Map.Entry<Double,String> e : toRemove)
            mostSim.remove(e.getKey(), e.getValue());

        // If we still have more words that were asked for, then prune out the
        // least similar
        while (mostSim.size() > numberOfSimilarWords)
            mostSim.remove(mostSim.firstKey());

        return mostSim;
    }


    /**
     * Finds the <i>k</i> most similar words in the semantic space according to
     * the cosine similarity, returning a mapping from their similarity to the
     * word itself.
     *
     * @return the most similar words to the vector
     */
    public SortedMultiMap<Double,String> getMostSimilar(
            final Vector v, int numberOfSimilarWords) {

        if (v == null)
            return null;
        final int k = numberOfSimilarWords;

        // Partition the sspace's words into sets, according to how many
        // processors are available to process them.
        int numThreads = workQueue.availableThreads();
        int numWords = sspace.getWords().size();
        List<List<String>> wordSets =
            new ArrayList<List<String>>(numThreads);
        for (int i = 0; i < numThreads; ++i)
            wordSets.add(new ArrayList<String>(numWords / numThreads));
        Iterator<String> iter = sspace.getWords().iterator();
        for (int i = 0; iter.hasNext(); ++i)
            wordSets.get(i % numThreads).add(iter.next());
       
               
        // Create a global map to store the results of the principle vectors'
        // terms comparisons
        final SortedMultiMap<Double,String> kMostSimTerms =
            new BoundedSortedMultiMap<Double,String>(k, false);
       
        // Register a task group to process similarity each of the principle
        // component's terms in parallel.
        Object taskId = workQueue.registerTaskGroup(wordSets.size());
        for (List<String> words : wordSets) {
            final List<String> terms = words;

            // Create a new Runnable to evaluate the similarity for each of the
            // terms that were most similar to this principle vector
            workQueue.add(taskId, new Runnable() {
                    public void run() {
                        // Record the similarity in a local data structure so we
                        // avoid locking the global mostSimilar term map
                        SortedMultiMap<Double,String> localMostSimTerms =
                            new BoundedSortedMultiMap<Double,String>(k, false);
                        for (String term : terms) {
                            Vector tVec = sspace.getVector(term);
                            double sim = simFunc.sim(v, tVec);
                            localMostSimTerms.put(sim, term);
                        }
                        // Lock the global map and then add the local results
                        synchronized (kMostSimTerms) {
                            for (Map.Entry<Double,String> e :
                                     localMostSimTerms.entrySet()) {
                                kMostSimTerms.put(e.getKey(), e.getValue());
                            }
                        }
                    }
                });
        }      
        workQueue.await(taskId);

        return kMostSimTerms;
    }
}
TOP

Related Classes of edu.ucla.sspace.util.SimpleNearestNeighborFinder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.