Source Code of dkpro.similarity.algorithms.sspace.SSpaceVectorReader

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl-3.0.txt
 *******************************************************************************/
package dkpro.similarity.algorithms.sspace;


import static java.util.Arrays.asList;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import java.util.Properties;


import no.uib.cipr.matrix.Vector;
import no.uib.cipr.matrix.sparse.SparseVector;


import org.apache.commons.io.FileUtils;


import dkpro.similarity.algorithms.api.SimilarityException;
import dkpro.similarity.algorithms.sspace.util.DocumentVectorBuilder;
import dkpro.similarity.algorithms.sspace.util.LatentSemanticAnalysis;
import dkpro.similarity.algorithms.sspace.util.VectorAdapter;
import dkpro.similarity.algorithms.vsm.store.VectorReader;
import edu.ucla.sspace.common.SemanticSpace;


/**
 * Vector reader accessing a {@link SemanticSpace}.
 *
 * @author Richard Eckart de Castilho
 */
public class SSpaceVectorReader
  extends VectorReader
{
  private final SemanticSpace sspace;
  private final DocumentVectorBuilder builder;


  /**
   * Create a vector source for an existing semantic space. To build such a space, you best
   * use the LsiIndexWriter from DKPro IR.
   *
   * @parameter dimensions An existing semantic space (usually an *.sspace file)
   */
  public SSpaceVectorReader(SemanticSpace aSSpace)
  {
    sspace = aSSpace;
    builder = new DocumentVectorBuilder(aSSpace);
  }


  /**
   * Creates a new semantic space from scratch based on all "txt" documents in the specified path.
   * If you want to have full control over the tokenization, normalization, stop word removal
   * and such, you should build the semantic space yourself and then use the other constructor.
   *
   * @parameter directory Path to the document collection
   * @see {@link #createSemanticSpace}
   */
  public SSpaceVectorReader(File aDirectory)
    throws IOException
  {
    this(createSemanticSpace(aDirectory, -1));
  }


  @Override
  public Vector getVector(String aTerm)
    throws SimilarityException
  {
    Vector vec1 = new SparseVector(getConceptCount());
    builder.buildVector(asList(aTerm), VectorAdapter.create(vec1));
    return vec1;
  }


  @Override
  public int getConceptCount()
    throws SimilarityException
  {
    // Nasty stack-overflow bug in getVectorLength()!
    return sspace.getVector(sspace.getWords().iterator().next()).length();
  }


  @Override
  public String getId()
  {
    return sspace.getSpaceName();
  }


  @Override
  public void close()
  {
    // Nothing to do
  }


  /**
   * Create a LSA space from all "txt" files in the specified directory.
   *
   * @param aInputDir directory containing the input files.
   * @param aMaxDimensions maximum number of dimensions. If 0 or negative, 300 is used, which has
   * been determined as a good default for LSA models. If the directory contains less than 300
   * documents, the number of documents is used as the number of dimensions.
   * @return a semantic space.
   * @throws IOException
   */
  public static SemanticSpace createSemanticSpace(File aInputDir, int aMaxDimensions)
    throws IOException
  {
    LatentSemanticAnalysis sspace = new LatentSemanticAnalysis();


    Collection<File> documents = FileUtils.listFiles(aInputDir, new String[] { "txt" }, true);


    for (File document : documents) {
      BufferedReader reader = new BufferedReader(new FileReader(document));
      sspace.processDocument(reader);
    }


    int dimensions = Math.min(documents.size(), aMaxDimensions <= 0 ? 300 : aMaxDimensions);


    Properties props = new Properties();
    props.setProperty(LatentSemanticAnalysis.LSA_DIMENSIONS_PROPERTY, Integer.toString(dimensions));
    sspace.processSpace(props);


    return sspace;
  }
}
Source Code of dkpro.similarity.algorithms.sspace.SSpaceVectorReader

Related Classes of dkpro.similarity.algorithms.sspace.SSpaceVectorReader