Package org.apache.mahout.vectorizer.encoders

Examples of org.apache.mahout.vectorizer.encoders.Dictionary


    encoder.setTraceDictionary(traceDictionary);
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);
    FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
    lines.setTraceDictionary(traceDictionary);
    Dictionary newsGroups = new Dictionary();
   
    OnlineLogisticRegression learningAlgorithm =
        new OnlineLogisticRegression(
              20, FEATURES, new L1())
            .alpha(1).stepOffset(1000)
            .decayExponent(0.9)
            .lambda(3.0e-5)
            .learningRate(20);
   
    List<File> files = new ArrayList<File>();
    for (File newsgroup : base.listFiles()) {
      newsGroups.intern(newsgroup.getName());
      files.addAll(Arrays.asList(newsgroup.listFiles()));
    }

    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());
View Full Code Here


    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }

    Dictionary newsGroups = new Dictionary();

    encoder.setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    List<File> files = Lists.newArrayList();
    File[] directories = base.listFiles();
    Arrays.sort(directories, Ordering.usingToString());
    for (File newsgroup : directories) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());
    System.out.printf("%s\n", Arrays.asList(directories));

    double averageLL = 0;
    double averageCorrect = 0;

    int k = 0;
    double step = 0;
    int[] bumps = {1, 2, 5};
    for (File file : files) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);

      Vector v = encodeFeatureVector(file);
      learningAlgorithm.train(actual, v);

      k++;
View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }

    Dictionary newsGroups = new Dictionary();

    encoder.setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());

    double averageLL = 0;
    double averageCorrect = 0;

    int k = 0;
    double step = 0;
    int[] bumps = {1, 2, 5};
    for (File file : files.subList(0, 10000)) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);

      Vector v = encodeFeatureVector(file, actual, leakType);
      learningAlgorithm.train(actual, v);

      k++;
View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }
View Full Code Here

    File base = new File(inputFile);
    //contains the best model
    OnlineLogisticRegression classifier = ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);


    Dictionary newsGroups = new Dictionary();
    Multiset<String> overallCounts = HashMultiset.create();

    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    System.out.printf("%d test files\n", files.size());
    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
    for (File file : files) {
      String ng = file.getParentFile().getName();

      int actual = newsGroups.intern(ng);
      NewsgroupHelper helper = new NewsgroupHelper();
      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);//no leak type ensures this is a normal vector
      Vector result = classifier.classifyFull(input);
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, input);
      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
      ra.addInstance(newsGroups.values().get(actual), cr);

    }
    output.printf("%s\n\n", ra.toString());
  }
View Full Code Here

    int leakType = 0;
    if (args.length > 1) {
      leakType = Integer.parseInt(args[1]);
    }

    Dictionary newsGroups = new Dictionary();

    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setProbes(2);
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());
    SGDInfo info = new SGDInfo();

    int k = 0;


    for (File file : files) {
      String ng = file.getParentFile().getName();
      int actual = newsGroups.intern(ng);

      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      learningAlgorithm.train(actual, v);

      k++;
View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }
View Full Code Here

    // holds target variable
    List<Integer> target = Lists.newArrayList();

    // for decoding target values
    Dictionary dict = new Dictionary();

    // for permuting data later
    List<Integer> order = Lists.newArrayList();

    for (String line : raw.subList(1, raw.size())) {
      // order gets a list of indexes
      order.add(order.size());

      // parse the predictor variables
      Vector v = new DenseVector(5);
      v.set(0, 1);
      int i = 1;
      Iterable<String> values = onComma.split(line);
      for (String value : Iterables.limit(values, 4)) {
        v.set(i++, Double.parseDouble(value));
      }
      data.add(v);

      // and the target
      target.add(dict.intern(Iterables.get(values, 4)));
    }

    // randomize the order ... original data has each species all together
    // note that this randomization is deterministic
    Random random = RandomUtils.getRandom();
View Full Code Here

   * @param typeMap               A map describing the types of the predictor variables.
   */
  public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
    this.targetName = targetName;
    this.typeMap = typeMap;
    targetDictionary = new Dictionary();
  }
View Full Code Here

    File base = new File(inputFile);
    //contains the best model
    OnlineLogisticRegression classifier =
        ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);

    Dictionary newsGroups = new Dictionary();
    Multiset<String> overallCounts = HashMultiset.create();

    List<File> files = Lists.newArrayList();
    for (File newsgroup : base.listFiles()) {
      if (newsgroup.isDirectory()) {
        newsGroups.intern(newsgroup.getName());
        files.addAll(Arrays.asList(newsgroup.listFiles()));
      }
    }
    System.out.println(files.size() + " test files");
    ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
    for (File file : files) {
      String ng = file.getParentFile().getName();

      int actual = newsGroups.intern(ng);
      NewsgroupHelper helper = new NewsgroupHelper();
      //no leak type ensures this is a normal vector
      Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);
      Vector result = classifier.classifyFull(input);
      int cat = result.maxValueIndex();
      double score = result.maxValue();
      double ll = classifier.logLikelihood(actual, input);
      ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
      ra.addInstance(newsGroups.values().get(actual), cr);

    }
    output.println(ra);
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.vectorizer.encoders.Dictionary

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.