// Package com.digitalpebble.classification
// Source code of com.digitalpebble.classification.MultiFieldDocument$TokenField

/**
* Copyright 2009 DigitalPebble Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package com.digitalpebble.classification;

import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import com.digitalpebble.classification.Parameters.WeightingMethod;

public class MultiFieldDocument implements Document {
  int label = 0;

  int[] indices;

  int[] freqs;

  // keep a link between a field index and its field num
  int[] indexToField;

  double[] tokensPerField;
 
  private static final Pattern SPACE_PATTERN = Pattern.compile("\\s+");

  private MultiFieldDocument() {
  }

  /***************************************************************************
   * A document is built from an array of Fields, with a reference to a
   * lexicon
   **************************************************************************/
  MultiFieldDocument(Field[] fields, Lexicon lexicon, boolean create) {

    // missing a know field?
    int maxFieldLength = Math
        .max(lexicon.getFields().length, fields.length);

    tokensPerField = new double[maxFieldLength];

    // create a vector for this document
    // from the individual tokens
    TreeMap<TokenField, int[]> tokens = new TreeMap<TokenField, int[]>();
    for (Field f : fields) {

      // get the field num from the lexicon
      final int fieldNum = lexicon.getFieldID(f._name, create);

      // field does not exist
      if (fieldNum == -1)
        continue;

      for (int token = 0; token < f._tokens.length; token++) {
        // remove null strings or empty strings
        if (f._tokens[token] == null)
          continue;
        if (f._tokens[token].length() < 1)
          continue;

        String normToken = simpleNormalisationTokenString(f._tokens[token]);

        // add a new instance to the count
        tokensPerField[fieldNum]++;

        String label = f._name + "_" + normToken;
        TokenField tf = new TokenField(label, fieldNum);
        int[] count = (int[]) tokens.get(tf);
        if (count == null) {
          count = new int[] { 0 };
          tokens.put(tf, count);
        }
        count[0]++;
      }
    }
    indices = new int[tokens.size()];
    freqs = new int[tokens.size()];
    indexToField = new int[tokens.size()];

    int lastused = 0;
    // iterates on the internal vector
    Iterator<Entry<TokenField, int[]>> iter = tokens.entrySet().iterator();
    while (iter.hasNext()) {
      Entry<TokenField, int[]> entry = iter.next();
      TokenField key = entry.getKey();
      int[] localFreq = entry.getValue();

      // gets the index from the lexicon
      int index = -1;
      if (create) {
        index = lexicon.createIndex(key.value);
      } else {
        index = lexicon.getIndex(key.value);
      }
      // if not found in the lexicon
      // we'll just put a conventional value
      // which will help filtering it later
      if (index == -1) {
        index = Integer.MAX_VALUE;
      }
      // add it to the list
      indices[lastused] = index;
      freqs[lastused] = localFreq[0];
      indexToField[lastused] = key.field;
      lastused++;
    }
    // at this stage all the tokens are linked
    // to their indices in the lexicon
    // and we have their raw frequency in the document
    // sort the content of the vector
    quicksort(indices, freqs, indexToField, 0, indices.length - 1);
  }

  /**
   * Returns the label of the document. The String value of the label can be
   * accessed via the Lexicon object.*
   */
  public int getLabel() {
    return label;
  }

  // the label is now set by the lexicon
  // and not directly by the user code
  void setLabel(int lab) {
    label = lab;
  }

  public String getStringSerialization() {
    StringBuffer buffer = new StringBuffer();
    buffer.append(this.getClass().getSimpleName()).append("\t");
    buffer.append(this.label);
    buffer.append("\t").append(tokensPerField.length);
    for (double tokperf : tokensPerField) {
      buffer.append("\t").append(tokperf);
    }
    for (int i = 0; i < indices.length; i++) {
      buffer.append("\t").append(indices[i]).append(":").append(freqs[i])
          .append(":").append(this.indexToField[i]);
    }
    buffer.append("\n");
    return buffer.toString();
  }

  // get a String representation of the document
  // but limiting it to a subset of its fields
  public String getStringSerialization(int[] fieldToKeep) {
    if (fieldToKeep == null || fieldToKeep.length == 0)
      return getStringSerialization();
    StringBuffer buffer = new StringBuffer();
    buffer.append(this.getClass().getSimpleName()).append("\t");
    buffer.append(this.label);
    buffer.append("\t").append(tokensPerField.length);
    for (int fieldNum = 0; fieldNum < tokensPerField.length; fieldNum++) {
      double tokperf = tokensPerField[fieldNum];
      if (java.util.Arrays.binarySearch(fieldToKeep, fieldNum) != -1)
        buffer.append("\t").append(tokperf);
      else
        buffer.append("\t").append("0.0");
    }
    for (int i = 0; i < indices.length; i++) {
      int fieldNum = this.indexToField[i];
      if (java.util.Arrays.binarySearch(fieldToKeep, fieldNum) != -1)
        buffer.append("\t").append(indices[i]).append(":").append(
            freqs[i]).append(":").append(this.indexToField[i]);
    }
    buffer.append("\n");
    return buffer.toString();
  }

  public static Document parse(String line) {
    String[] splits = line.split("\t");
    if (splits.length < 4)
      return null;
    // ignore first part
    MultiFieldDocument newdoc = new MultiFieldDocument();
    try {
      newdoc.label = Integer.parseInt(splits[1]);
      int numFields = Integer.parseInt(splits[2]);

      newdoc.tokensPerField = new double[numFields];

      int currentPos = 3;
      for (int i = 0; i < numFields; i++) {
        String sizeField = splits[currentPos];
        newdoc.tokensPerField[i] = Double.parseDouble(sizeField);
        currentPos++;
      }

      // num features
      int numfeatures = splits.length - currentPos;
      newdoc.freqs = new int[numfeatures];
      newdoc.indices = new int[numfeatures];
      newdoc.indexToField = new int[numfeatures];

      int lastPos = 0;
      for (; currentPos < splits.length; currentPos++) {
        // x:y:z
        String[] subsplits = splits[currentPos].split(":");
        newdoc.indices[lastPos] = Integer.parseInt(subsplits[0]);
        newdoc.freqs[lastPos] = Integer.parseInt(subsplits[1]);
        newdoc.indexToField[lastPos] = Integer.parseInt(subsplits[2]);
        lastPos++;
      }
    } catch (Exception e) {
      return null;
    }
    return newdoc;
  }

  /**
   * Returns a Vector representation of the document. This Vector object is
   * weighted and used by the instances of Learner or TextClassifier
   */
  public Vector getFeatureVector(Lexicon lexicon) {
    Parameters.WeightingMethod method = lexicon.getMethod();
    return getFeatureVector(lexicon, method, null);
  }

  public Vector getFeatureVector(Lexicon lexicon,
      Parameters.WeightingMethod method) {
    return getFeatureVector(lexicon, method, null);
  }

  public Vector getFeatureVector(Lexicon lexicon, Map<Integer, Integer> equiv) {
    Parameters.WeightingMethod method = lexicon.getMethod();
    return getFeatureVector(lexicon, method, equiv);
  }

  public Vector getFeatureVector(Lexicon lexicon,
      Parameters.WeightingMethod method, Map<Integer, Integer> equiv) {
    // we need to iterate on the features
    // of this document and compute a score
    double numDocs = (double) lexicon.getDocNum();

    // have the attribute numbers been changed in
    // the meantime?
    if (equiv != null) {
      for (int pos = 0; pos < indices.length; pos++) {
        Integer newPos = equiv.get(indices[pos]);
        // filtered
        if (newPos == null)
          indices[pos] = Integer.MAX_VALUE;
        else
          indices[pos] = newPos.intValue();
      }
      // resort the indices
      quicksort(indices, freqs, indexToField, 0, indices.length - 1);
    }

    int kept = 0;
    double[] copyvalues = new double[indices.length];
    for (int pos = 0; pos < indices.length; pos++) {
      // need to check that a given term has not
      // been filtered since the creation of the corpus
      // the indices are sorted so we know there is no point
      // in going further
      // Integer.MAX_VALUE == unknown in model
      if (indices[pos] == Integer.MAX_VALUE) {
        break;
      }
      if (lexicon.getDocFreq(indices[pos]) <= 0)
        continue;
      double score = getScore(pos, lexicon, numDocs);
      // removed in meantime?
      if (score == 0)
        continue;
      copyvalues[pos] = score;
      kept++;
    }
    // trim to size
    int[] trimmedindices = new int[kept];
    double[] trimmedvalues = new double[kept];

    // normalize the values?
    if (lexicon.isNormalizeVector())
      normalizeL2(trimmedvalues);

    System.arraycopy(indices, 0, trimmedindices, 0, kept);
    System.arraycopy(copyvalues, 0, trimmedvalues, 0, kept);
    return new Vector(trimmedindices, trimmedvalues);
  }

  /**
   * Returns the score of an attribute given the weighting scheme specified in
   * the lexicon or for a specific field
   **/
  private double getScore(int pos, Lexicon lexicon, double numdocs) {
    double score = 0;
    int indexTerm = this.indices[pos];
    double occurences = (double) this.freqs[pos];

    int fieldNum = this.indexToField[pos];
    double frequency = occurences / tokensPerField[fieldNum];

    // is there a custom weight for this field?
    String fieldName = lexicon.getFields()[fieldNum];
    WeightingMethod method = lexicon.getMethod(fieldName);

    if (method.equals(Parameters.WeightingMethod.BOOLEAN)) {
      score = 1;
    } else if (method.equals(Parameters.WeightingMethod.OCCURRENCES)) {
      score = occurences;
    } else if (method.equals(Parameters.WeightingMethod.FREQUENCY)) {
      score = frequency;
    } else if (method.equals(Parameters.WeightingMethod.TFIDF)) {
      int df = lexicon.getDocFreq(indexTerm);
      double idf = numdocs / (double) df;
      score = frequency * Math.log(idf);
      if (idf == 1)
        score = frequency;
    }
    return score;
  }

  /**
   * Returns the L2 norm factor of this vector's values.
   */
  private void normalizeL2(double[] scores) {
    double square_sum = 0.0;
    for (int i = 0; i < scores.length; i++) {
      square_sum += (scores[i] * scores[i]);
    }
    double norm = Math.sqrt(square_sum);
    if (norm != 0)
      for (int i = 0; i < scores.length; i++) {
        scores[i] = scores[i] / norm;
      }
  }

  private int partition(int[] dims, int[] vals, int[] vals2, int low, int high) {
    double pivotprim = 0;
    int i = low - 1;
    int j = high + 1;
    pivotprim = dims[(low + high) / 2];
    while (i < j) {
      i++;
      while (dims[i] < pivotprim)
        i++;
      j--;
      while (dims[j] > pivotprim)
        j--;
      if (i < j) {
        int tmp = dims[i];
        dims[i] = dims[j];
        dims[j] = tmp;
        int tmpd = vals[i];
        vals[i] = vals[j];
        vals[j] = tmpd;
        int t2mpd = vals2[i];
        vals2[i] = vals2[j];
        vals2[j] = t2mpd;
      }
    }
    return j;
  }

  private void quicksort(int[] dims, int[] vals, int[] vals2, int low,
      int high) {
    if (low >= high)
      return;
    int p = partition(dims, vals, vals2, low, high);
    quicksort(dims, vals, vals2, low, p);
    quicksort(dims, vals, vals2, p + 1, high);
  }

  class TokenField implements Comparable<TokenField> {
    int field;

    String value;

    TokenField(String val, int fieldNum) {
      field = fieldNum;
      value = val;
    }

    public int compareTo(TokenField tf) {
      return value.compareTo(tf.value);
    }

  }

  /**
   * this is done to make sure that the lexicon file will be read properly and
   * won't contain any characters that would break it
   **/
  private static String simpleNormalisationTokenString(String token) {
      return SPACE_PATTERN.matcher(token).replaceAll("_");
  }

}
// TOP
//
// Related Classes of com.digitalpebble.classification.MultiFieldDocument$TokenField
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.