Examples of org.apache.mahout.vectorizer.encoders.Dictionary

Package org.apache.mahout.vectorizer.encoders

Examples of org.apache.mahout.vectorizer.encoders.Dictionary

org.apache.mahout.vectorizer.encoders.Dictionary
Assigns integer codes to strings as they appear.

    encoder.setTraceDictionary(traceDictionary);
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);
    FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
    lines.setTraceDictionary(traceDictionary);
    Dictionary newsGroups = new Dictionary();
    
    OnlineLogisticRegression learningAlgorithm = 
        new OnlineLogisticRegression(
              20, FEATURES, new L1())
            .alpha(1).stepOffset(1000)
            .decayExponent(0.9) 
            .lambda(3.0e-5)
            .learningRate(20);
    
    List<File> files = new ArrayList<File>();
    for (File newsgroup : base.listFiles()) {
      newsGroups.intern(newsgroup.getName());
      files.addAll(Arrays.asList(newsgroup.listFiles()));
    }


    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());

View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }


    Dictionary newsGroups = new Dictionary();


    encoder.setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);


    List<File> files = Lists.newArrayList();
    File[] directories = base.listFiles();
    Arrays.sort(directories, Ordering.usingToString());
    for (File newsgroup : directories) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());
    System.out.printf("%s\n", Arrays.asList(directories));


    double averageLL = 0;
    double averageCorrect = 0;


    int k = 0;
    double step = 0;
    int[] bumps = {1, 2, 5};
    for (File file : files) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);


      Vector v = encodeFeatureVector(file);
      learningAlgorithm.train(actual, v);


      k++;

View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }


    Dictionary newsGroups = new Dictionary();


    encoder.setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());


    double averageLL = 0;
    double averageCorrect = 0;


    int k = 0;
    double step = 0;
    int[] bumps = {1, 2, 5};
    for (File file : files.subList(0, 10000)) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);


      Vector v = encodeFeatureVector(file, actual, leakType);
      learningAlgorithm.train(actual, v);


      k++;

View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }

View Full Code Here

    File base = new File(inputFile);
    //contains the best model
    OnlineLogisticRegression classifier = ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);




    Dictionary newsGroups = new Dictionary();
    Multiset<String> overallCounts = HashMultiset.create();


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    System.out.printf("%d test files\n", files.size());
    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
    for (File file : files) {
      String ng = file.getParentFile().getName();


      int actual = newsGroups.intern(ng);
      NewsgroupHelper helper = new NewsgroupHelper();
      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);//no leak type ensures this is a normal vector
      Vector result = classifier.classifyFull(input);
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, input);
      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
      ra.addInstance(newsGroups.values().get(actual), cr);


    }
    output.printf("%s\n\n", ra.toString());
  }

View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }


    Dictionary newsGroups = new Dictionary();


    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());
    SGDInfo info = new SGDInfo();


    int k = 0;




    for (File file : files) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);


      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      learningAlgorithm.train(actual, v);


      k++;

View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }

View Full Code Here


    // holds target variable
    List<Integer> target = Lists.newArrayList();


    // for decoding target values
    Dictionary dict = new Dictionary();


    // for permuting data later
    List<Integer> order = Lists.newArrayList();


    for (String line : raw.subList(1, raw.size())) {
      // order gets a list of indexes
      order.add(order.size());


      // parse the predictor variables
      Vector v = new DenseVector(5);
      v.set(0, 1);
      int i = 1;
      Iterable<String> values = onComma.split(line);
      for (String value : Iterables.limit(values, 4)) {
        v.set(i++, Double.parseDouble(value));
      }
      data.add(v);


      // and the target
      target.add(dict.intern(Iterables.get(values, 4)));
    }


    // randomize the order ... original data has each species all together
    // note that this randomization is deterministic
    Random random = RandomUtils.getRandom();

View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }

View Full Code Here

    File base = new File(inputFile);
    //contains the best model
    OnlineLogisticRegression classifier =
        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);


    Dictionary newsGroups = new Dictionary();
    Multiset<String> overallCounts = HashMultiset.create();


    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    System.out.println(files.size() + " test files");
    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
    for (File file : files) {
      String ng = file.getParentFile().getName();


      int actual = newsGroups.intern(ng);
      NewsgroupHelper helper = new NewsgroupHelper();
      //no leak type ensures this is a normal vector
      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);
      Vector result = classifier.classifyFull(input);
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, input);
      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
      ra.addInstance(newsGroups.values().get(actual), cr);


    }
    output.println(ra);
  }

View Full Code Here

0 1 2

TOP

Related Classes of org.apache.mahout.vectorizer.encoders.Dictionary

mia.classifier.ch14.TrainNewsGroups

mia.classifier.ch16.train.TrainNewsGroups

org.apache.mahout.classifier.sgd.CsvRecordFactory

org.apache.mahout.classifier.sgd.CsvRecordFactoryTest

org.apache.mahout.classifier.sgd.OnlineLogisticRegressionTest

org.apache.mahout.classifier.sgd.TestASFEmail

org.apache.mahout.classifier.sgd.TestNewsGroups

org.apache.mahout.classifier.sgd.TrainASFEmail

org.apache.mahout.classifier.sgd.TrainNewsGroups

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle, Inc. Contact coftware#gmail.com.