Package org.apache.mahout.classifier.df.data

Examples of org.apache.mahout.classifier.df.data.Dataset


      log.error("No Decision Forest found!");
      return;
    }

    // load the dataset
    Dataset dataset = Dataset.load(getConf(), datasetPath);
    DataConverter converter = new DataConverter(dataset);

    log.info("Sequential classification...");
    long time = System.currentTimeMillis();

    Random rng = RandomUtils.getRandom();

    List<double[]> resList = Lists.newArrayList();
    if (dataFS.getFileStatus(dataPath).isDir()) {
      //the input is a directory of files
      testDirectory(outputPath, converter, forest, dataset, resList, rng);
    else {
      // the input is one single file
      testFile(dataPath, outputPath, converter, forest, dataset, resList, rng);
    }

    time = System.currentTimeMillis() - time;
    log.info("Classification Time: {}", DFUtils.elapsedTime(time));

    if (analyze) {
      if (dataset.isNumerical(dataset.getLabelId())) {
        RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
        double[][] results = new double[resList.size()][2];
        regressionAnalyzer.setInstances(resList.toArray(results));
        log.info("{}", regressionAnalyzer);
      } else {
        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
        for (double[] r : resList) {
          analyzer.addInstance(dataset.getLabelString(r[0]),
            new ClassifierResult(dataset.getLabelString(r[1]), 1.0));
        }
        log.info("{}", analyzer);
      }
    }
  }
View Full Code Here


    rng = RandomUtils.getRandom();
  }

  private static Data[] generateTrainingDataA() throws DescriptorException {
    // Dataset
    Dataset dataset = DataLoader.generateDataset("C N N C L", false, TRAIN_DATA);
   
    // Training data
    Data data = DataLoader.loadData(dataset, TRAIN_DATA);
    @SuppressWarnings("unchecked")
    List<Instance>[] instances = new List[3];
View Full Code Here

      } else {
        trainData[i] = "C," + (i + 20) + ',' (i + 20);
      }
    }
    // Dataset
    Dataset dataset = DataLoader.generateDataset("C N L", true, trainData);
    Data[] datas = new Data[3];
    datas[0] = DataLoader.loadData(dataset, trainData);

    // Training data
    trainData = new String[20];
View Full Code Here

    int label = Utils.findLabel(descriptor);

    // all the vectors have the same label (0)
    double[][] temp = Utils.randomDoublesWithSameLabel(rng, descriptor, false, 100, 0);
    String[] sData = Utils.double2String(temp);
    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    Data data = DataLoader.loadData(dataset, sData);
    DefaultIgSplit iG = new DefaultIgSplit();

    double expected = 0.0 - 1.0 * Math.log(1.0) / Math.log(2.0);
    assertEquals(expected, iG.entropy(data), EPSILON);
View Full Code Here

    super.setUp();
   
    rng = RandomUtils.getRandom(1);
   
    // Dataset
    Dataset dataset = DataLoader
        .generateDataset("C N N C L", false, TRAIN_DATA);
   
    // Training data
    data = DataLoader.loadData(dataset, TRAIN_DATA);
   
View Full Code Here

      return -1;
    }
   
    // load the data
    FileSystem fs = dataPath.getFileSystem(new Configuration());
    Dataset dataset = Dataset.load(getConf(), datasetPath);
    Data data = DataLoader.loadData(dataset, fs, dataPath);
   
    // take m to be the first integer less than log2(M) + 1, where M is the
    // number of inputs
    int m = (int) Math.floor(FastMath.log(2.0, data.getDataset().nbAttributes()) + 1);
View Full Code Here

      } else {
        trainData[i] = "C," + (i + 20) + ',' + (i + 20);
      }
    }
    // Dataset
    Dataset dataset = DataLoader.generateDataset("C N L", true, trainData);
    Data[] datas = new Data[3];
    datas[0] = DataLoader.loadData(dataset, trainData);

    // Training data
    trainData = new String[20];
View Full Code Here

    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, false, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    String[][] splits = Utils.splitData(sData, NUM_MAPPERS);

    MockTreeBuilder treeBuilder = new MockTreeBuilder();

    LongWritable key = new LongWritable();
View Full Code Here

  private static Split categoricalSplit(Data data, int attr) {
    double[] values = data.values(attr);
    int[][] counts = new int[values.length][data.getDataset().nblabels()];
    int[] countAll = new int[data.getDataset().nblabels()];

    Dataset dataset = data.getDataset();

    // compute frequencies
    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++;
      countAll[(int) dataset.getLabel(instance)]++;
    }

    int size = data.size();
    double hy = entropy(countAll, size); // H(Y)
    double hyx = 0.0; // H(Y|X)
View Full Code Here

    countAll = new int[data.getDataset().nblabels()];
    countLess = new int[data.getDataset().nblabels()];
  }

  void computeFrequencies(Data data, int attr, double[] values) {
    Dataset dataset = data.getDataset();

    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      counts[ArrayUtils.indexOf(values, instance.get(attr))][(int) dataset.getLabel(instance)]++;
      countAll[(int) dataset.getLabel(instance)]++;
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.mahout.classifier.df.data.Dataset

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.