Package edu.ucla.sspace.nonlinear

Source Code of edu.ucla.sspace.nonlinear.LocalityPreservingCooccurrenceSpace

/*
* Copyright 2010 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.nonlinear;

import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.Similarity;
import edu.ucla.sspace.common.Statistics;

import edu.ucla.sspace.hal.WeightingFunction;

import edu.ucla.sspace.matrix.AffinityMatrixCreator;
import edu.ucla.sspace.matrix.AtomicMatrix;
import edu.ucla.sspace.matrix.GrowingSparseMatrix;
import edu.ucla.sspace.matrix.LocalityPreservingProjection;
import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixFile;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.YaleSparseMatrix;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.BoundedSortedMultiMap;
import edu.ucla.sspace.util.MultiMap;
import edu.ucla.sspace.util.ReflectionUtil;
import edu.ucla.sspace.util.Pair;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.Vector;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.ArrayDeque;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;

import java.util.logging.Logger;

import java.util.concurrent.ConcurrentHashMap;


/**
* @author David Jurgens
*
* @see SemanticSpace
* @see LocalityPreservingSemanticAnalysis
* @see AffinityMatrixCreator
* @see LocalityPreservingProjection
*/
public class LocalityPreservingCooccurrenceSpace implements SemanticSpace {

    /**
     * The prefix for naming public properties.
     */
    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.lpsa.LocalityPreservingCooccurrenceSpace";
   
    /**
     * The property to specify the minimum entropy theshold a word should have
     * to be included in the vector space after processing.  The specified value
     * of this property should be a double
     */
    public static final String ENTROPY_THRESHOLD_PROPERTY =
        PROPERTY_PREFIX + ".threshold";

    /**
     * The property to specify the number of words to view before and after each
     * word in focus.
     */
    public static final String WINDOW_SIZE_PROPERTY =
        PROPERTY_PREFIX + ".windowSize";

    /**
     * The property to set the {@link WeightingFunction} to be used with
     * weighting the co-occurrence of neighboring words based on their distance.
     */
    public static final String WEIGHTING_FUNCTION_PROPERTY =
        PROPERTY_PREFIX + ".weighting";

    /**
     * The property to set the number of dimension to which the space should be
     * reduced using the SVD
     */
    public static final String LPCS_DIMENSIONS_PROPERTY =
        PROPERTY_PREFIX + ".dimensions";

    /**
     * The default number of words before and after the focus word to include
     */
    public static final int DEFAULT_WINDOW_SIZE = 5;

    /**
     * The default {@code WeightingFunction} to use.
     */       
    public static final String DEFAULT_WEIGHTING =
        "edu.ucla.sspace.hal.EvenWeighting";

    /**
     * Logger for LocalityPreservingCooccurrenceSpace.
     */
    private static final Logger LOGGER =
        Logger.getLogger(LocalityPreservingCooccurrenceSpace.class.getName());

    /**
     * Map that pairs the word with it's position in the matrix
     */
    private final Map<String,Integer> termToIndex;      

    /**
     * The number of words to consider in one direction to create the symmetric
     * window
     */
    private final int windowSize;
   
    /**
     * The type of weight to apply to a the co-occurrence word based on its
     * relative location
     */
    private final WeightingFunction weighting;

    /**
     * The number that keeps track of the index values of words
     */
    private int wordIndexCounter;

    /**
     * The matrix used for storing weight co-occurrence statistics of those
     * words that occur both before and after.
     */
    private SparseMatrix cooccurrenceMatrix;

    /**
     * An atomic wrapper around the {@link #cooccurrenceMatrix} instance to
     * provide atomic updates during document processing.
     */
    private AtomicMatrix atomicMatrix;   

    /**
     * The reduced matrix
     */
    private Matrix reduced;

    /**
     * The {@link AffinityMatrixCreator}.
     */
    private AffinityMatrixCreator affinityCreator;

    /**
     * Constructs a new instance using the system properties for configuration.
     */
    public LocalityPreservingCooccurrenceSpace(AffinityMatrixCreator creator) {
        this(creator, System.getProperties());
    }
   
    /**
     * Constructs a new instance using the provided properties for
     * configuration.
     */
    public LocalityPreservingCooccurrenceSpace(AffinityMatrixCreator creator,
                                               Properties properties) {
        affinityCreator = creator;
        cooccurrenceMatrix = new GrowingSparseMatrix();
        atomicMatrix = Matrices.synchronizedMatrix(cooccurrenceMatrix);
        reduced = null;
        termToIndex = new ConcurrentHashMap<String,Integer>();
       
        wordIndexCounter = 0;

        String windowSizeProp = properties.getProperty(WINDOW_SIZE_PROPERTY);
        windowSize = (windowSizeProp != null)
            ? Integer.parseInt(windowSizeProp)
            : DEFAULT_WINDOW_SIZE;

        weighting = ReflectionUtil.getObjectInstance(
                properties.getProperty(
                    WEIGHTING_FUNCTION_PROPERTY, DEFAULT_WEIGHTING));
    }

    /**
     * {@inheritDoc}
     */
    public void  processDocument(BufferedReader document) throws IOException {
        Queue<String> nextWords = new ArrayDeque<String>();
        Queue<String> prevWords = new ArrayDeque<String>();
           
        Iterator<String> documentTokens =
            IteratorFactory.tokenizeOrdered(document);
           
        String focus = null;

        // Rather than updating the matrix every time an occurrence is seen,
        // keep a thread-local count of what needs to be modified in the matrix
        // and update after the document has been processed.  This saves
        // potential contention from concurrent writes.
        Map<Pair<Integer>,Double> matrixEntryToCount =
            new HashMap<Pair<Integer>,Double>();
           
        //Load the first windowSize words into the Queue       
        for(int i = 0;  i < windowSize && documentTokens.hasNext(); i++)
            nextWords.offer(documentTokens.next());
           
        while(!nextWords.isEmpty()) {
           
            // Load the top of the nextWords Queue into the focus word
            focus = nextWords.remove();

            // Add the next word to nextWords queue (if possible)
            if (documentTokens.hasNext()) {       
                String windowEdge = documentTokens.next();
                nextWords.offer(windowEdge);
            }           

            // If the filter does not accept this word, skip the semantic
            // processing, continue with the next word
            if (focus.equals(IteratorFactory.EMPTY_TOKEN)) {
            // shift the window
                prevWords.offer(focus);
                if (prevWords.size() > windowSize)
                    prevWords.remove();
                continue;
            }
           
            int focusIndex = getIndexFor(focus);
           
            // Iterate through the words occurring after and add values
            int wordDistance = 1;
            for (String after : nextWords) {
                // skip adding co-occurence values for words that are not
                // accepted by the filter
                if (!after.equals(IteratorFactory.EMPTY_TOKEN)) {
                    int index = getIndexFor(after);
                   
                    // Get the current number of times that the focus word has
                    // co-occurred with this word appearing after it.  Weightb
                    // the word appropriately baed on distance
                    Pair<Integer> p = new Pair<Integer>(focusIndex, index);
                    double value = weighting.weight(wordDistance, windowSize);
                    Double curCount = matrixEntryToCount.get(p);
                    matrixEntryToCount.put(p, (curCount == null)
                                           ? value : value + curCount);
                }
            
                wordDistance++;       
            }

            wordDistance = -1; // in front of the focus word
            for (String before : prevWords) {
                // skip adding co-occurence values for words that are not
                // accepted by the filter
                if (!before.equals(IteratorFactory.EMPTY_TOKEN)) {
                    int index = getIndexFor(before);

                    // Get the current number of times that the focus word has
                    // co-occurred with this word before after it.  Weight the
                    // word appropriately baed on distance
                    Pair<Integer> p = new Pair<Integer>(index, focusIndex);
                    double value = weighting.weight(wordDistance, windowSize);
                    Double curCount = matrixEntryToCount.get(p);
                    matrixEntryToCount.put(p, (curCount == null)
                                           ? value : value + curCount);
                }
                wordDistance--;
            }
                   
            // last, put this focus word in the prev words and shift off the
            // front if it is larger than the window
            prevWords.offer(focus);
            if (prevWords.size() > windowSize)
                prevWords.remove();
        }

        // Once the document has been processed, update the co-occurrence matrix
        // accordingly.
        for (Map.Entry<Pair<Integer>,Double> e : matrixEntryToCount.entrySet()){
            Pair<Integer> p = e.getKey();
            atomicMatrix.addAndGet(p.x, p.y, e.getValue());
        }                   
    }

    /**
     * Returns the index in the co-occurence matrix for this word.  If the word
     * was not previously assigned an index, this method adds one for it and
     * returns that index.
     */
    private final int getIndexFor(String word) {
        Integer index = termToIndex.get(word);
        if (index == null) {    
            synchronized(this) {
                // recheck to see if the term was added while blocking
                index = termToIndex.get(word);
                // if another thread has not already added this word while the
                // current thread was blocking waiting on the lock, then add it.
                if (index == null) {
                    int i = wordIndexCounter++;
                    termToIndex.put(word, i);
                    return i; // avoid the auto-boxing to assign i to index
                }
            }
        }
        return index;
    }
   
    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        // If no documents have been processed, it will be empty       
        return Collections.unmodifiableSet(termToIndex.keySet());           
    }       

    /**
     * {@inheritDoc}
     */
    public Vector getVector(String word) {
        Integer index = termToIndex.get(word);
        if (index == null)
            return null;
        // If the matrix hasn't had columns dropped then the returned vector
        // will be the combination of the word's row and column
        else
            return reduced.getRowVector(index);
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        return reduced.columns();
    }
   
    /**
     * {@inheritDoc}
     */
    public void processSpace(Properties properties) {

        // Set all of the default properties
        int dimensions = 300;

        // Then load any of the user-specified properties
        String dimensionsProp =
            properties.getProperty(LPCS_DIMENSIONS_PROPERTY);
        if (dimensionsProp != null) {
            try {
                dimensions = Integer.parseInt(dimensionsProp);
            } catch (NumberFormatException nfe) {
                throw new IllegalArgumentException(
                    LPCS_DIMENSIONS_PROPERTY + " is not an integer: " +
                    dimensionsProp);
            }
        }
       
        try {
            LOGGER.info("reducing to " + dimensions + " dimensions");
            File tiMap = new File("lpcs-term-index." + Math.random() + ".map");
            PrintWriter pw = new PrintWriter(tiMap);
            for (Map.Entry<String,Integer> e : termToIndex.entrySet())
                pw.println(e.getKey() + "\t" + e.getValue());
            pw.close();
            LOGGER.info("wrote term-index map to " + tiMap);
        } catch (Throwable t) {
            t.printStackTrace();
        }

        // Calculate the affinity matrix for the cooccurrence matrix
        MatrixFile affinityMatrix = affinityCreator.calculate(
                cooccurrenceMatrix);
       
        // Using the affinity matrix as a guide to locality, project the
        // co-occurrence matrix into the lower dimensional subspace
        reduced = LocalityPreservingProjection.project(
            cooccurrenceMatrix, affinityMatrix, dimensions);
    }
       
    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return "nws-semantic-space";
    }
}
TOP

Related Classes of edu.ucla.sspace.nonlinear.LocalityPreservingCooccurrenceSpace

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.