Source Code of edu.ucla.sspace.purandare.PurandareFirstOrder

/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.purandare;

import edu.ucla.sspace.clustering.Assignment;
import edu.ucla.sspace.clustering.Assignments;
import edu.ucla.sspace.clustering.ClutoClustering;

import edu.ucla.sspace.common.SemanticSpace;
import edu.ucla.sspace.common.Similarity.SimType;
import edu.ucla.sspace.common.Statistics;

import edu.ucla.sspace.matrix.AtomicGrowingMatrix;
import edu.ucla.sspace.matrix.AtomicMatrix;
import edu.ucla.sspace.matrix.GrowingSparseMatrix;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.MatrixIO;
import edu.ucla.sspace.matrix.MatrixIO.Format;
import edu.ucla.sspace.matrix.SparseMatrix;
import edu.ucla.sspace.matrix.SparseRowMaskedMatrix;
import edu.ucla.sspace.matrix.YaleSparseMatrix;
import edu.ucla.sspace.matrix.SparseOnDiskMatrix;

import edu.ucla.sspace.text.IteratorFactory;

import edu.ucla.sspace.util.SparseArray;
import edu.ucla.sspace.util.SparseHashArray;
import edu.ucla.sspace.util.WorkerThread;

import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.vector.Vectors;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOError;
import java.io.IOException;

import java.util.ArrayDeque;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;

import java.util.logging.Level;
import java.util.logging.Logger;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;

import java.util.concurrent.atomic.AtomicInteger;


/**
* An implementation of the word sense induction algorithm described by
* Purandare and Pedersen.  This implementation is based on the following
* paper: <ul>
*
*   <li style="font-family:Garamond, Georgia, serif"> Amruta Purandare and Ted
*   Pedersen.  (2004) Word Sense Discrimination by Clustering Contexts in
*   Vector and Similarity Spaces.  <i>Proceedings of Conference on
*   Computational Natural Language Learning (CoNLL)</i>, pp. 41-48, May 6-7,
*   2004, Boston, MA.</li>
*
* </ul>
*
*
* This class offers one configurable parameter.
*
* <dl style="margin-left: 1em">
*
* <dt> <i>Property:</i> <code><b>{@value #MAX_CONTEXTS_PER_WORD}
*      </b></code> <br>
*      <i>Default:</i> {@link Integer#MAX_VALUE}
*
* <dd style="padding-top: .5em">This property sets the upper bound on the
*      maximum number of contexts to be clustered for a single word.  If there
*      are fewer contexts than this value, all of the contexts will be used.
*      Users should consider setting this value if a large corpus is to be
*      used, or if the corpus contains many frequently used words after
*      filtering.<p>
*
* </dl>
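*
* <p>A minimal usage sketch (not part of the original documentation; the
* corpus iterable and the property value below are hypothetical):
*
* <pre>{@code
* Properties props = new Properties();
* props.setProperty(PurandareFirstOrder.MAX_CONTEXTS_PER_WORD, "1000");
* PurandareFirstOrder sspace = new PurandareFirstOrder(props);
* for (BufferedReader doc : corpusDocuments)  // one reader per document
*     sspace.processDocument(doc);
* sspace.processSpace(props);
* // Induced senses are stored as "word", "word-1", "word-2", ...
* DoubleVector firstSense = sspace.getVector("bank");
* DoubleVector secondSense = sspace.getVector("bank-1");
* }</pre>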
*
* @author David Jurgens
*/
public class PurandareFirstOrder implements SemanticSpace {

    private static final Logger LOGGER =
        Logger.getLogger(PurandareFirstOrder.class.getName());

    private static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.purandare.PurandareFirstOrder";

    /**
     * The property to set the upper-bound on the maximum number of contexts to
     * be clustered for a single word.
     */
    public static final String MAX_CONTEXTS_PER_WORD =
        PROPERTY_PREFIX + ".maxContexts";

    /**
     * Map that pairs each word with its index in the co-occurrence matrix and
     * in the context feature vectors.
     */
    private final Map<String,Integer> termToIndex;

    /**
     * A mapping from term to the sense-induced semantic vectors.  The first
     * sense of the term will be the token itself, while additional senses will be
     * denoted with a "-" and a sense number appended to the token.
     */
    private final Map<String,DoubleVector> termToVector;

    /**
     * The window size for identifying co-occurring words that have the
     * potential to be features.
     */
    private final int windowSize;

    /**
     * The window size used for generating feature vectors.
     */
    private final int contextWindowSize;

    /**
     * The matrix used for storing co-occurrence statistics of the words that
     * occur both before and after each focus word.
     */
    private final AtomicMatrix cooccurrenceMatrix;

    /**
     * A count for how many times each term appears in the corpus.
     */
    private final List<AtomicInteger> termCounts;

    /**
     * A compressed version of the corpus that is built as the text version is
     * being processed.  For each document, the file contains two integers, the
     * total number of tokens in that document and the number that survived
     * filtering, followed by one term index per token in the order the tokens
     * appeared, with {@code -1} marking filtered tokens.
     *
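     * <p>For example (with hypothetical term indices), a three-token document
     * whose middle token is rejected by the token filter and whose other two
     * tokens have indices 7 and 12 is written as the {@code int} sequence
     * {@code 3, 2, 7, -1, 12} (total tokens, unfiltered tokens, then one entry
     * per token).
     *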
     * @see #processIntDocument(int, int[], Matrix, int, BitSet)
     */
    private File compressedDocuments;

    /**
     * The output stream used to write the {@link #compressedDocuments} file
     * as the text documents are being processed.
     */
    private DataOutputStream compressedDocumentsWriter;

    /**
     * A counter for the number of documents seen in the corpus.
     */
    private final AtomicInteger documentCounter;

    /**
     * The maximum number of contexts allowable for any word.
     */
    private final int maxContextsPerWord;

    /**
     * A counter that tracks the next index value to be assigned to a word.
     */
    private int wordIndexCounter;

    /**
     * Creates a new instance of {@code PurandareFirstOrder} using the system
     * properties for configuration.
     */
    public PurandareFirstOrder() {
        this(System.getProperties());
    }

    /**
     * Creates a new instance of {@code PurandareFirstOrder} using the provided
     * properties for configuration.
     */
    public PurandareFirstOrder(Properties props) {
        cooccurrenceMatrix = new AtomicGrowingMatrix();
        termToIndex = new ConcurrentHashMap<String,Integer>();
        termToVector = new ConcurrentHashMap<String,DoubleVector>();
        termCounts = new CopyOnWriteArrayList<AtomicInteger>();
        windowSize = 5;
        contextWindowSize = 20;
        documentCounter = new AtomicInteger(0);       
        String maxContextsProp = props.getProperty(MAX_CONTEXTS_PER_WORD);
        if (maxContextsProp == null)
            maxContextsPerWord = Integer.MAX_VALUE;
        else {
            int i = Integer.parseInt(maxContextsProp);
            if (i <= 0)
                throw new IllegalArgumentException(
                    "The number of contexts must be a positive number");
            maxContextsPerWord = i;
        }
        try {
            compressedDocuments =
                File.createTempFile("petersen-documents",".dat");
            compressedDocumentsWriter = new DataOutputStream(
                new BufferedOutputStream(
                    new FileOutputStream(compressedDocuments)));
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * {@inheritDoc}
     */
    public void processDocument(BufferedReader document) throws IOException {
        documentCounter.getAndIncrement();
        Queue<String> nextWords = new ArrayDeque<String>();
        Queue<String> prevWords = new ArrayDeque<String>();

        Iterator<String> documentTokens =
            IteratorFactory.tokenizeOrdered(document);

        String focus = null;

        ByteArrayOutputStream compressedDocument =
            new ByteArrayOutputStream(4096);
        DataOutputStream dos = new DataOutputStream(compressedDocument);
        int tokens = 0; // count how many are in this document
        int unfilteredTokens = 0;
        // Load the first windowSize words into the queue
        for (int i = 0; i < windowSize && documentTokens.hasNext(); i++)
            nextWords.offer(documentTokens.next());

        while (!nextWords.isEmpty()) {
            tokens++;

            // Load the head of the nextWords queue into the focus word
            focus = nextWords.remove();

            // Add the next word to the nextWords queue (if possible)
            if (documentTokens.hasNext()) {
                String windowEdge = documentTokens.next();
                nextWords.offer(windowEdge);
            }

            // If the filter does not accept this word, skip the semantic
            // processing and continue with the next word
            if (focus.equals(IteratorFactory.EMPTY_TOKEN)) {
                // Mark the token as empty using a negative term index in the
                // compressed form of the document
                dos.writeInt(-1);
                // shift the window
                prevWords.offer(focus);
                if (prevWords.size() > windowSize)
                    prevWords.remove();
                continue;
            }

            int focusIndex = getIndexFor(focus);
            // Write the term index into the compressed form of the document
            // for later corpus reprocessing
            dos.writeInt(focusIndex);
            // Update the occurrences of this token
            termCounts.get(focusIndex).incrementAndGet();
            unfilteredTokens++;

            // Iterate through the words occurring after the focus and add
            // co-occurrence values
            for (String after : nextWords) {
                // Skip adding co-occurrence values for words that are not
                // accepted by the filter
                if (!after.equals(IteratorFactory.EMPTY_TOKEN)) {
                    int index = getIndexFor(after);
                    cooccurrenceMatrix.addAndGet(focusIndex, index, 1);
                }
            }

            for (String before : prevWords) {
                // Skip adding co-occurrence values for words that are not
                // accepted by the filter
                if (!before.equals(IteratorFactory.EMPTY_TOKEN)) {
                    int index = getIndexFor(before);
                    cooccurrenceMatrix.addAndGet(focusIndex, index, 1);
                }
            }

            // Last, put this focus word in the prev words and shift off the
            // front if it is larger than the window
            prevWords.offer(focus);
            if (prevWords.size() > windowSize)
                prevWords.remove();
        }

        dos.close();
        byte[] docAsBytes = compressedDocument.toByteArray();

        // Once the document is finished, write the compressed contents to the
        // corpus stream
        synchronized(compressedDocumentsWriter) {
            // Write how many terms were in this document
            compressedDocumentsWriter.writeInt(tokens);
            compressedDocumentsWriter.writeInt(unfilteredTokens);
            compressedDocumentsWriter.write(docAsBytes, 0, docAsBytes.length);
        }
    }

    /**
     * Returns the index in the co-occurrence matrix for this word.  If the word
     * was not previously assigned an index, this method adds one for it and
     * returns that index.
     */
    private final int getIndexFor(String word) {
        Integer index = termToIndex.get(word);
        if (index == null) {
            synchronized(this) {
                // Recheck to see if the term was added while blocking
                index = termToIndex.get(word);
                // If another thread has not already added this word while the
                // current thread was blocking waiting on the lock, then add it
                if (index == null) {
                    int i = wordIndexCounter++;
                    // Add a new counter for this term.  Because wordIndexCounter
                    // starts at zero, i is also the last index in the
                    // termCounts list after this add.
                    termCounts.add(new AtomicInteger(0));
                    termToIndex.put(word, i);
                    return i; // avoid the auto-boxing to assign i to index
                }
            }
        }
        return index;
    }

    /**
     * {@inheritDoc}
     */
    public Set<String> getWords() {
        // If no documents have been processed, it will be empty
        return Collections.unmodifiableSet(termToVector.keySet());
    }

    /**
     * {@inheritDoc}
     */
    public DoubleVector getVector(String word) {       
        return termToVector.get(word);
    }

    /**
     * {@inheritDoc}
     */
    public void processSpace(Properties properties) {
        try {
            // Wrap the call to avoid having all the code in a try/catch.  This
            // is for improved readability purposes only.
            processSpace();
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * Calculates the first-order co-occurrence statistics to determine the
     * feature set for each term, then clusters the feature vectors for each
     * term's contexts, and finally induces the sense-specific vectors for each
     * term.
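     *
     * <p>As a summary of the code below: (1) a feature set is selected for
     * each term by keeping co-occurring terms whose log-likelihood value
     * exceeds 3.841; (2) the compressed corpus is reread to build one feature
     * vector per occurrence (context) of each term; (3) each term's contexts
     * are clustered and the contexts of every sufficiently large cluster are
     * aggregated into a sense vector.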
     */
    @SuppressWarnings("unchecked")
    private void processSpace() throws IOException {
        compressedDocumentsWriter.close();       

        // Generate the reverse index-to-term mapping.  We will need this for
        // assigning specific senses to each term
        String[] indexToTerm = new String[termToIndex.size()];
        for (Map.Entry<String,Integer> e : termToIndex.entrySet())
            indexToTerm[e.getValue()] = e.getKey();
       
        // Compute how many terms were in the corpus.  We will need this for
        // determining the log-likelihood for all co-occurrences.
        int corpusSize = 0;
        for (AtomicInteger i : termCounts)
            corpusSize += i.get();
        final int uniqueTerms = cooccurrenceMatrix.rows();

        LOGGER.info("calculating term features");
        // Create a set for each term that contains the term indices that are
        // determined to be features for the term, i.e. not all co-occurrences
        // will count as the features.  The feature set for each term is
        // determined by computing the log-likelihood for each co-occurrence and
        // only keeping those terms whose log-likelihood value is above a
        // certain threshold.
        final BitSet[] termFeatures = new BitSet[wordIndexCounter];       
        for (int termIndex = 0; termIndex < uniqueTerms; ++termIndex) {
            String term = indexToTerm[termIndex];           
            termFeatures[termIndex] = calculateTermFeatures(term, corpusSize);
        }

        LOGGER.info("reprocessing corpus to generate feature vectors");
       
        // Set up the concurrent data structures so we can process the documents
        // concurrently
        final BlockingQueue<Runnable> workQueue =
            new LinkedBlockingQueue<Runnable>();
        for (int i = 0; i < Runtime.getRuntime().availableProcessors(); ++i) {
            Thread t = new WorkerThread(workQueue);
            t.start();
        }
        final Semaphore termsProcessed = new Semaphore(0);
       
        for (int termIndex = 0; termIndex < uniqueTerms; ++termIndex) {
           
            final String term = indexToTerm[termIndex];
            final int i = termIndex;
            workQueue.offer(new Runnable() {
                    public void run() {
                        try {
                        LOGGER.fine(String.format(
                            "processing term %6d/%d: %s", i, uniqueTerms,term));
                        Matrix contexts = getTermContexts(i, termFeatures[i]);
                        senseInduce(term, contexts);
                        } catch (IOException ioe) {
                            ioe.printStackTrace();
                        } finally {
                            termsProcessed.release();
                        }                      
                    }
                });
        }
       
        // Wait until all the documents have been processed
        try {
            termsProcessed.acquire(uniqueTerms);
        } catch (InterruptedException ie) {
            throw new Error("interrupted while waiting for terms to " +
                            "finish reprocessing", ie);
        }       
        LOGGER.info("finished reprocessing all terms");
    }

    private BitSet calculateTermFeatures(String term, int corpusSize) {
        int termIndex = termToIndex.get(term);
        LOGGER.fine(String.format("Calculating feature set for %6d/%d: %s",
                                  termIndex, cooccurrenceMatrix.rows(), term));
        DoubleVector cooccurrences = cooccurrenceMatrix.getRowVector(termIndex);
        int termCount = termCounts.get(termIndex).get();
        BitSet validFeatures = new BitSet(wordIndexCounter);
       
        // For each of the co-occurring terms, calculate the log-likelihood
        // value for that term's occurrences.  Only terms whose value is above
        // 3.841 (the chi-squared critical value at p < 0.05 with one degree of
        // freedom) will be counted as features
        for (int co = 0; co < cooccurrences.length(); ++co) {
            // Form the contingency table:
            //  a   b
            //  c   d
            double count = cooccurrences.get(co);
            // Don't include words that never co-occur as features
            if (count == 0)
                continue;
           
            // a = the number of times they both co-occur
            double a = count;
            // b = the number of times co occurs without term
            double b = termCounts.get(co).get() - count;
            // c = the number of times term occurs without co
            double c = termCount - count;
            // d = the number of times neither term occurs
            double d = corpusSize - (a +  b + c);
           
            double logLikelihood = logLikelihood(a, b, c, d);
            if (logLikelihood > 3.841)
                validFeatures.set(co);
        }
        if (LOGGER.isLoggable(Level.FINE))
            LOGGER.fine(term + " had " + validFeatures.cardinality()
                        + " features");
        return validFeatures;
    }

    /**
     * For the specified term, reprocesses the entire corpus using the term's
     * features to construct a matrix of all the contexts in which the term
     * appears.  If the term occurs in more contexts than
     * {@link #maxContextsPerWord} allows, a random subset of the contexts is
     * returned.
     *
     * @param termIndex the index of the term for which the context matrix
     *        should be generated
     * @param termFeatures the set of term indices that are valid features when
     *        in the context of {@code termIndex}.
     *
     * @return a {@code Matrix} where each row is a different context for the
     *         term in the corpus
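     *
     * <p>Each returned row has one column per known term (the full
     * co-occurrence vocabulary), but only the columns whose indices are set in
     * {@code termFeatures} are ever populated, so the rows are stored sparsely.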
     */
    private Matrix getTermContexts(int termIndex, BitSet termFeatures)
            throws IOException {
        // Reprocess the corpus in binary format to generate the set of contexts
        // with the appropriate feature vectors
        DataInputStream corpusReader = new DataInputStream(
            new BufferedInputStream(new FileInputStream(compressedDocuments)));

        int documents = documentCounter.get();
        // Use the number of times the term occurred in the corpus to determine
        // how many rows (contexts) the matrix needs.
        SparseMatrix contextsForCurTerm = new YaleSparseMatrix(
            termCounts.get(termIndex).get(), termToIndex.size());
        int contextsSeen = 0;
        for (int d = 0; d < documents; ++d) {
            final int docId = d;

            int tokensInDoc = corpusReader.readInt();
            int unfilteredTokens = corpusReader.readInt();
            // Read in the document
            int[] doc = new int[tokensInDoc];
            for (int i = 0; i < tokensInDoc; ++i)
                doc[i] = corpusReader.readInt();

            int contextsInDoc =
                processIntDocument(termIndex, doc, contextsForCurTerm,
                                   contextsSeen, termFeatures);
            contextsSeen += contextsInDoc;
        }
        corpusReader.close();

        // If the term is to be processed using fewer than all of its contexts,
        // then randomly select the maximum allowable contexts from the matrix
        if (maxContextsPerWord < Integer.MAX_VALUE &&
                contextsForCurTerm.rows() > maxContextsPerWord) {
            BitSet randomContexts = Statistics.randomDistribution(
                maxContextsPerWord, contextsForCurTerm.rows());
            contextsForCurTerm =
                new SparseRowMaskedMatrix(contextsForCurTerm, randomContexts);
        }
       
        return contextsForCurTerm;
    }

    /**
     * Given a matrix for the term where each row is a different context,
     * clusters the rows to identify how many senses the word has.
     *
     * @param term the term for which the senses will be discovered
     * @param contexts a matrix containing all the contexts in which {@code
     *        term} appears, with one context per row
     */
    private void senseInduce(String term, Matrix contexts) throws IOException {
        LOGGER.fine("Clustering " + contexts.rows() + " contexts for " + term);

        // For terms with fewer than seven contexts, set the number of potential
        // clusters lower
        int numClusters = Math.min(7, contexts.rows());
       
        // Cluster each of the rows into seven groups.  Note that if the term is
        // not purely alphabetic (i.e. contains numbers or other symbols), don't
        // bother clustering it.  This is done to reduce the computation time
        // and to avoid clustering non-meaningful terms such as '.' or '''
        if (!(term.matches("[a-zA-Z]+") && numClusters > 6)) { // special case
            SparseDoubleVector meanSenseVector =
                new CompactSparseVector(termToIndex.size());           
            int rows = contexts.rows();
            for (int row = 0; row < rows; ++row)
                VectorMath.add(meanSenseVector, contexts.getRowVector(row));
            termToVector.put(term, meanSenseVector);
            return;
        }

        Assignments clusterAssignment =
            new ClutoClustering().cluster(contexts, numClusters,
                                          ClutoClustering.Method.AGGLOMERATIVE,
                                          ClutoClustering.Criterion.UPGMA);
       
        LOGGER.fine("Generating sense vectors for " + term);
       
        // For each of the clusters, compute the mean sense vector
        int[] clusterSize = new int[numClusters];

        // Use CompactSparseVector to conserve memory given the potentially
        // large number of sense vectors
        SparseDoubleVector[] meanSenseVectors =
            new CompactSparseVector[numClusters];

        for (int i = 0; i < meanSenseVectors.length; ++i)
            meanSenseVectors[i] = new CompactSparseVector(termToIndex.size());

        // For each of the contexts, determine which cluster it was in and sum
        // its value with the other contexts in that cluster
        for (int row = 0; row < clusterAssignment.size(); ++row) {
            // Check whether this row was assigned a cluster
            if (clusterAssignment.get(row).assignments().length == 0)
                continue;               
            int assignment = clusterAssignment.get(row).assignments()[0];               
            clusterSize[assignment]++;

            DoubleVector contextVector = contexts.getRowVector(row);
            VectorMath.add(meanSenseVectors[assignment], contextVector);
        }
       
        // For each of the clusters with more than 2% of the contexts, generate
        // an average sense vector.  Clusters with fewer contexts than that are
        // discarded.
        int senseCounter = 0;
        for (int i = 0; i < numClusters; ++i) {
            int size = clusterSize[i];
            if (size / (double)(contexts.rows()) > 0.02) {
                String termWithSense = (senseCounter == 0)
                    ? term : term + "-" + senseCounter;
                senseCounter++;
                termToVector.put(termWithSense,meanSenseVectors[i]);
            }
        }
        LOGGER.fine("Discovered " + senseCounter + " senses for " + term);
    }

    /**
     * Processes the compressed version of a document where each integer
     * indicates that token's index and identifies all the contexts for the
     * target word, adding them as new rows to the context matrix.
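     *
     * <p>For example (with hypothetical indices), if {@code termIndex} is 3,
     * the feature set is {5}, and the document is {@code [3, -1, 5, 3]}, then
     * two contexts are produced: the first occurrence of term 3 sees feature 5
     * to its right and the second sees it to its left, so each of the two new
     * rows gets a count of 1 in column 5.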
     *
     * @param termIndex the term whose contexts should be extracted
     * @param document the document to be processed where each {@code int} is a
     *        term index
     * @param contextMatrix the matrix that will contain all the contexts for
     *        the term with {@code termIndex} when this method returns
     * @param rowStart the next row index in the matrix where a new context can
     *        be added
     * @param featuresForTerm the set of term indices that are valid features
     *        for the term with {@code termIndex}
     *
     * @return the number of contexts present in this document
     */
    private int processIntDocument(int termIndex, int[] document,
                                   Matrix contextMatrix,
                                   int rowStart,
                                   BitSet featuresForTerm) {
        int contexts = 0;
        for (int i = 0; i < document.length; ++i) {

            int curToken = document[i];
            // Skip processing tokens that are not the current focus
            if (curToken != termIndex)
                continue;

            // Buffer the count of how many times each feature appeared in the
            // context.
            SparseArray<Integer> contextCounts = new SparseHashArray<Integer>();
       
            // Process all the tokens to the left of (prior to) the current token
            for (int left = Math.max(i - contextWindowSize, 0);
                     left < i; ++left) {
                // NOTE: this token value could be -1 if the token's original
                // text was filtered out from the corpus, i.e. was EMPTY_TOKEN
                int token = document[left];
                // Only count co-occurrences that are valid features for the
                // current token
                if (token >= 0 && featuresForTerm.get(token)) {
                    Integer count = contextCounts.get(token);
                    contextCounts.set(token, (count == null) ? 1 : count + 1);
                }
            }

            // Process all the tokens to the right of (after) the current token
            int end = Math.min(i + contextWindowSize, document.length);
            for (int right = i + 1; right < end; ++right) {
                int token = document[right];
                // Only count co-occurrences that are valid features for the
                // current token
                if (token >= 0 && featuresForTerm.get(token)) {
                    Integer count = contextCounts.get(token);
                    contextCounts.set(token, (count == null) ? 1 : count + 1);
                }
            }
           
            // Each word in the document represents a new context, so the
            // specific context instance can be determined from the current word
            // and the number of previously processed words
            int curContext = rowStart + contexts;
            for (int feat : contextCounts.getElementIndices()) {
                //System.out.println("setting row: " + curContext + ", col: " + feat);
                contextMatrix.set(curContext, feat, contextCounts.get(feat));
            }
            // If the current token wasn't skipped, indicate that another
            // context was seen
            contexts++;
        }
        return contexts;
    }

    /**
     * {@inheritDoc}
     */
    public int getVectorLength() {
        // The vector length is dependent upon the total number of features seen
        return termToIndex.size();
    }

    /**
     * {@inheritDoc}
     */
    public String getSpaceName() {
        return "purandare-petersen";
    }

    /**
     * Returns the log-likelihood of the contingency table made up of the four
     * values.
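     *
     * <p>The value computed is the G-test (log-likelihood ratio) statistic
     *
     * <pre>
     *   G = 2 * [ a*ln(a/E(a)) + b*ln(b/E(b)) + c*ln(c/E(c)) + d*ln(d/E(d)) ]
     * </pre>
     *
     * where the expected count E(x) of each cell is its row sum times its
     * column sum divided by the table total, and zero cells contribute zero.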
     */
    private static double logLikelihood(double a, double b,
                                        double c, double d) {
        // Table set up as:
        //  a   b
        //  c   d
        double col1sum = a + c;
        double col2sum = b + d;
        double row1sum = a + b;
        double row2sum = c + d;
        double sum = row1sum + row2sum;
       
        // Calculate the expected values for a, b, c, d
        double aExp = (row1sum / sum) * col1sum;
        double bExp = (row1sum / sum) * col2sum;
        double cExp = (row2sum / sum) * col1sum;
        double dExp = (row2sum / sum) * col2sum;

        // Math.log(0) is -Infinity, and 0 * -Infinity is NaN, which would break
        // the calculation.  Therefore, check whether the value is zero before
        // calculating its contribution.
        double aVal = (a == 0) ? 0 : a * Math.log(a / aExp);
        double bVal = (b == 0) ? 0 : b * Math.log(b / bExp);
        double cVal = (c == 0) ? 0 : c * Math.log(c / cExp);
        double dVal = (d == 0) ? 0 : d * Math.log(d / dExp);

        return 2 * (aVal + bVal + cVal + dVal);           
    }
}