/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.classifier;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
import com.google.common.base.Preconditions;
/**
* Defines the interface for classifiers that take input as a vector. This is implemented
* as an abstract class so that it can implement a number of handy convenience methods
* related to classification of vectors.
*/
public abstract class AbstractVectorClassifier {
// ------ These are all that are necessary to define a vector classifier.
/**
* Returns the number of categories for the target variable. A vector classifier
* will encode it's output using a zero-based 1 of numCategories encoding.
* @return The number of categories.
*/
public abstract int numCategories();
/**
* Classify a vector returning a vector of numCategories-1 scores. It is assumed that
* the score for the missing category is one minus the sum of the scores that are returned.
*
* Note that the missing score is the 0-th score.
* @param instance A feature vector to be classified.
* @return A vector of probabilities in 1 of n-1 encoding.
*/
public abstract Vector classify(Vector instance);
/**
* Classify a vector, but don't apply the inverse link function. For logistic regression
* and other generalized linear models, this is just the linear part of the classification.
* @param features A feature vector to be classified.
* @return A vector of scores. If transformed by the link function, these will become probabilities.
*/
public Vector classifyNoLink(Vector features) {
throw new UnsupportedOperationException(
this.getClass().getName() + " doesn't support classification without a link");
}
/**
* Classifies a vector in the special case of a binary classifier where
* {@link #classify(Vector)} would return a vector with only one element. As such,
* using this method can void the allocation of a vector.
* @param instance The feature vector to be classified.
* @return The score for category 1.
*
* @see #classify(Vector)
*/
public abstract double classifyScalar(Vector instance);
// ------- From here on, we have convenience methods that provide an easier API to use.
/**
* Returns n probabilities, one for each category. If you can use an n-1 coding, and are touchy
* about allocation performance, then the classify method is probably better to use. The 0-th
* element of the score vector returned by this method is the missing score as computed by the
* classify method.
*
* @see #classify(Vector)
* @see #classifyFull(Vector r, Vector instance)
*
* @param instance A vector of features to be classified.
* @return A vector of probabilities, one for each category.
*/
public Vector classifyFull(Vector instance) {
return classifyFull(new DenseVector(numCategories()), instance);
}
/**
* Returns n probabilities, one for each category into a pre-allocated vector. One
* vector allocation is still done in the process of multiplying by the coefficient
* matrix, but that is hard to avoid. The cost of such an ephemeral allocation is
* very small in any case compared to the multiplication itself.
*
* @param r Where to put the results.
* @param instance A vector of features to be classified.
* @return A vector of probabilities, one for each category.
*/
public Vector classifyFull(Vector r, Vector instance) {
r.viewPart(1, numCategories() - 1).assign(classify(instance));
r.setQuick(0, 1.0 - r.zSum());
return r;
}
/**
* Returns n-1 probabilities, one for each category but the last, for each row of a matrix. The
* probability of the missing 0-th category is 1 - rowSum(this result).
*
* @param data The matrix whose rows are vectors to classify
* @return A matrix of scores, one row per row of the input matrix, one column for each but the
* last category.
*/
public Matrix classify(Matrix data) {
Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
for (int row = 0; row < data.numRows(); row++) {
r.assignRow(row, classify(data.getRow(row)));
}
return r;
}
/**
* Returns n probabilities, one for each category, for each row of a matrix.
*
* @param data The matrix whose rows are vectors to classify
* @return A matrix of scores, one row per row of the input matrix, one column for each but the
* last category.
*/
public Matrix classifyFull(Matrix data) {
Matrix r = new DenseMatrix(data.numRows(), numCategories());
for (int row = 0; row < data.numRows(); row++) {
classifyFull(r.viewRow(row), data.getRow(row));
}
return r;
}
/**
* Returns a vector of probabilities of the first category, one for each row of a matrix. This
* only makes sense if there are exactly two categories, but calling this method in that case can
* save a number of vector allocations.
*
* @param data The matrix whose rows are vectors to classify
* @return A vector of scores, with one value per row of the input matrix.
*/
public Vector classifyScalar(Matrix data) {
Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");
Vector r = new DenseVector(data.numRows());
for (int row = 0; row < data.numRows(); row++) {
r.set(row, classifyScalar(data.getRow(row)));
}
return r;
}
/**
* Returns a measure of how good the classification for a particular example actually is.
*
* @param actual The correct category for the example.
* @param data The vector to be classified.
* @return The log likelihood of the correct answer as estimated by the current model. This will
* always be <= 0 and larger (closer to 0) indicates better accuracy. In order to simplify
* code that maintains running averages, we bound this value at -100.
*/
public double logLikelihood(int actual, Vector data) {
if (numCategories() == 2) {
double p = classifyScalar(data);
if (actual > 0) {
return Math.max(-100.0, Math.log(p));
} else {
return Math.max(-100.0, Math.log(1.0 - p));
}
} else {
Vector p = classify(data);
if (actual > 0) {
return Math.max(-100.0, Math.log(p.get(actual - 1)));
} else {
return Math.max(-100.0, Math.log(1.0 - p.zSum()));
}
}
}
}