Package cc.mallet.pipe

Source Code of cc.mallet.pipe.Csv2FeatureVector

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.pipe;


import java.util.logging.*;
import java.util.*;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.MalletLogger;


/**
* Converts a string of the form
* <tt>feature_1:val_1 feature_2:val_2 ... feature_k:val_k</tt>
* into a (sparse) FeatureVector.
*
* Features with no ":" character are assumed to have value 1.0.
*
* @author Gary Huang
*/
public class Csv2FeatureVector extends Pipe {

    private static Logger logger = MalletLogger.getLogger(Csv2FeatureVector.class.getName());

    public Csv2FeatureVector(int capacity) {
        this.dataAlphabet = new Alphabet(capacity);
    }
   
    public Csv2FeatureVector() {
        this(1000);
    }
   
    /**
     * Convert the data in the given <tt>Instance</tt> from a <tt>CharSequence</tt>
     * of sparse feature-value pairs to a <tt>FeatureVector</tt>
     */
    public Instance pipe(Instance carrier) {

        CharSequence c = (CharSequence) carrier.getData();
        String[] pairs = c.toString().trim().split("\\s+");
        int[] keys = new int[pairs.length];
        double[] values = new double[pairs.length];

        for (int i = 0; i < pairs.length; i++) {
      int delimIndex = pairs[i].lastIndexOf(":");
      if (delimIndex <= 0 || delimIndex == (pairs[i].length()-1)) {
        keys[i] = dataAlphabet.lookupIndex(pairs[i], true);
        values[i] = 1.0;
      }
      else {
        keys[i] = dataAlphabet.lookupIndex(pairs[i].substring(0, delimIndex), true);
        values[i] = Double.parseDouble(pairs[i].substring(delimIndex+1));
      }
        }

    // [removed code that sorted indices but NOT values -DM]

        FeatureVector fv = new FeatureVector(dataAlphabet, keys, values);
        carrier.setData( fv );
        return carrier;
    }
   
}
TOP

Related Classes of cc.mallet.pipe.Csv2FeatureVector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.