Source Code of net.javlov.policy.SoftmaxActor

/*
* Javlov - a Java toolkit for reinforcement learning with multi-agent support.
*
* Copyright (c) 2009 Matthijs Snel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
package net.javlov.policy;

import java.util.List;

import net.javlov.Action;
import net.javlov.Actor;
import net.javlov.Option;
import net.javlov.Policy;
import net.javlov.QFunction;
import net.javlov.State;

/**
* Selects an action by drawing from a softmax distribution over the stored action
* values. Consequently, this actor can only be used with discrete action sets.
*
* @author Matthijs Snel
*
*/
public class SoftmaxActor implements Actor {

  /**
   * The Q-function that will be used to store action selection probabilities.
   */
  protected QFunction q;
 
  /**
   * The last selected action; stored in order to be able to update it with the TD error.
   */
  protected Option lastOption;
 
  /**
   * Policy used to select actions from probabilities.
   */
  protected Policy pi;
 
  /**
   * Learning rate alpha.
   */
  protected double alpha = 0.1;
 
  /**
   * Constructs an actor based on the provided Q-function and action pool. Note that
   * the {@code QFunction} is here purely used as a storage medium; it will not store
   * Q-values, but instead the probabilities of selecting each action.
   *
   * The values stored
   * in the Q-function do not necessarily need to sum to 1 since this actor uses a softmax
   * distribution over the values, which guarantees that the resulting probabilities sum
   * to 1 anyway. It is however recommended (if a tabular Q-function is used) to initialise
   * the values to > 0, e.g. all 1 / (nr of actions).
   *
   * @param q the Q-function that stores the action probabilities.
   * @param actions the pool of available actions.
   */
  public SoftmaxActor(QFunction q, List<? extends Action> actions) {
    this.q = q;
    pi = new SoftmaxPolicy(q, actions);
  }
 
  /**
   * {@inheritDoc}
   */
  @Override
  public <T> Option getOption(State<T> s) {
    Option o = pi.getOption(s);
    q.setLastOption(o);
    return o;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public double getLearnRate() {
    return alpha;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void init() {
    q.init();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() {
    q.reset();
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void setLearnRate(double alpha) {
    this.alpha = alpha;
  }

  /**
   * Adds the provided TD error, multiplied by the learning rate alpha,
   * to the current probability of the action that was selected last.
   * The probabilities of selecting the other actions are decreased
   * such that all probabilities still sum to 1.
   *
   * This implementation uses the TD error directly to increase the probability of the
   * last selected action, i.e.
   *
   * {@code p(s,a) = p(s,a) + alpha*TDerr},
   *
   * or if eligibility traces are used (simply pass the actor a "traced" q-function):
   *
   * {@code p(s,a) = p(s,a) + alpha*TDerr*e(s,a).}
   *
   * @param TDerr the TD error that will be used to update the probability of the last
   * selected action.
   */
  @Override
  public <T> void update(double TDerr) {
    q.update(TDerr);
  }

  @Override
  public <T> Option getOption(State<T> s, double[] qvalues) {
    throw new UnsupportedOperationException();
  }

  @Override
  public <T> double[] getOptionProbabilities(State<T> s, double[] qvalues) {
    return pi.getOptionProbabilities(s, qvalues);
  }
}
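
The source of net.javlov.policy.SoftmaxPolicy, to which this actor delegates selection, is not shown on this page. As a rough sketch of the softmax (Boltzmann) computation it presumably performs, the following standalone method turns arbitrary stored values into probabilities that sum to 1. The class name and the temperature parameter tau are assumptions for illustration, not javlov API:

public final class SoftmaxSketch {
    /**
     * Computes p_i = exp(v_i / tau) / sum_j exp(v_j / tau).
     * A high temperature tau flattens the distribution towards uniform;
     * tau close to 0 approaches greedy selection.
     */
    static double[] softmax(double[] values, double tau) {
        double max = Double.NEGATIVE_INFINITY;
        for (double v : values)      // subtract the max before exponentiating,
            max = Math.max(max, v);  // for numerical stability
        double[] p = new double[values.length];
        double sum = 0;
        for (int i = 0; i < values.length; i++) {
            p[i] = Math.exp((values[i] - max) / tau);
            sum += p[i];
        }
        for (int i = 0; i < p.length; i++)
            p[i] /= sum;             // normalise: the result sums to 1
        return p;
    }

    public static void main(String[] args) {
        // As the constructor Javadoc notes, the inputs need not sum to 1.
        double[] p = softmax(new double[] { 1.0, 2.0, 0.5 }, 1.0);
        System.out.println(java.util.Arrays.toString(p)); // entries sum to 1
    }
}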
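
The update() Javadoc above gives the traced rule {@code p(s,a) = p(s,a) + alpha*TDerr*e(s,a)}. A minimal tabular sketch of that rule, assuming one eligibility trace per state-action pair that decays by gamma*lambda each step; the arrays and decay scheme are illustrative, not the javlov "traced" QFunction implementation:

// p[s][a]: stored action preferences; e[s][a]: eligibility traces.
static void tracedUpdate(double[][] p, double[][] e,
        double alpha, double tdErr, double gamma, double lambda) {
    for (int s = 0; s < p.length; s++) {
        for (int a = 0; a < p[s].length; a++) {
            p[s][a] += alpha * tdErr * e[s][a]; // apply the error in proportion to the trace
            e[s][a] *= gamma * lambda;          // decay the trace for the next step
        }
    }
}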
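
Putting it together, a hypothetical actor-critic step using the API shown above might look as follows. The QFunction implementation, the state, and the critic are placeholders left as comments; only the SoftmaxActor calls are taken from this listing:

QFunction q = /* some tabular QFunction, initialised to 1.0 / actions.size() */;
List<? extends Action> actions = /* the discrete action pool */;

SoftmaxActor actor = new SoftmaxActor(q, actions);
actor.setLearnRate(0.05);
actor.init();

// Each step: select an option, let the critic compute the TD error,
// then nudge the selected action's stored value by alpha * TDerr.
Option o = actor.getOption(state);
double tdErr = /* computed by the critic from the observed reward */ 0.0;
actor.update(tdErr);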