From 650c1d5edbef3eb3f6fc5d2497c46cca5ff596c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 9 Dec 2017 13:27:33 +0100 Subject: [PATCH 01/14] Added a first version of Qlearning + updated rl interface --- dlib/control/approximate_linear_models.h | 80 +---- .../approximate_linear_models_abstract.h | 125 +------- dlib/control/model_abstract.h | 135 ++++++++ dlib/control/policy.h | 180 +++++++++++ dlib/control/policy_abstract.h | 293 ++++++++++++++++++ dlib/control/qlearning.h | 177 +++++++++++ dlib/control/qlearning_abstract.h | 171 ++++++++++ examples/CMakeLists.txt | 1 + examples/qlearning_sarsa_ex.cpp | 217 +++++++++++++ 9 files changed, 1197 insertions(+), 182 deletions(-) create mode 100644 dlib/control/model_abstract.h create mode 100644 dlib/control/policy.h create mode 100644 dlib/control/policy_abstract.h create mode 100644 dlib/control/qlearning.h create mode 100644 dlib/control/qlearning_abstract.h create mode 100644 examples/qlearning_sarsa_ex.cpp diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 9732d71e90..a0d9c01dcb 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -4,7 +4,6 @@ #define DLIB_APPROXIMATE_LINEAR_MODELS_Hh_ #include "approximate_linear_models_abstract.h" -#include "../matrix.h" namespace dlib { @@ -12,13 +11,13 @@ namespace dlib // ---------------------------------------------------------------------------------------- template < - typename feature_extractor + typename model_type > struct process_sample { - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + typedef typename model_type::reward_type reward_type; process_sample(){} @@ -26,13 +25,13 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const double& r + const reward_type& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - double reward; + reward_type reward; }; template < typename feature_extractor > @@ -53,73 +52,6 @@ namespace dlib deserialize(item.reward, in); } -// ---------------------------------------------------------------------------------------- - - template < - typename feature_extractor - > - class policy - { - public: - - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; - - - policy ( - ) - { - w.set_size(fe.num_features()); - w = 0; - } - - policy ( - const matrix& weights_, - const feature_extractor& fe_ - ) : w(weights_), fe(fe_) {} - - action_type operator() ( - const state_type& state - ) const - { - return fe.find_best_action(state,w); - } - - const feature_extractor& get_feature_extractor ( - ) const { return fe; } - - const matrix& get_weights ( - ) const { return w; } - - - private: - matrix w; - feature_extractor fe; - }; - - template < typename feature_extractor > - inline void serialize(const policy& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.get_feature_extractor(), out); - serialize(item.get_weights(), out); - } - template < typename feature_extractor > - inline void deserialize(policy& item, std::istream& in) - { - int version = 0; - 
deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::policy object."); - feature_extractor fe; - matrix w; - deserialize(fe, in); - deserialize(w, in); - item = policy(w,fe); - } - // ---------------------------------------------------------------------------------------- } diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index 59dac42769..a5dc2a6b13 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -3,20 +3,24 @@ #undef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ #ifdef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ -#include "../matrix.h" +#include "model_abstract.h" namespace dlib { // ---------------------------------------------------------------------------------------- + template < + typename T, + typename U + > struct example_feature_extractor { /*! WHAT THIS OBJECT REPRESENTS This object defines the interface a feature extractor must implement if it - is to be used with the process_sample and policy objects defined at the - bottom of this file. Moreover, it is meant to represent the core part + is to be used with the process_sample and policy objects defined at + policy_abstract.h. Moreover, it is meant to represent the core part of a model used in a reinforcement learning algorithm. In particular, this object models a Q(state,action) function where @@ -24,7 +28,7 @@ namespace dlib where PSI(state,action) is a feature vector and w is a parameter vector. - Therefore, a feature extractor defines how the PSI(x,y) feature vector is + Therefore, a feature extractor defines how the PSI(x,y) feature vector is calculated. It also defines the types used to represent the state and action objects. @@ -35,9 +39,8 @@ namespace dlib functions of this object. !*/ - // The state and actions can be any types so long as you provide typedefs for them. typedef T state_type; - typedef U action_type; + typedef U action_type; // We can also say that the last element in the weight vector w must be 1. This // can be useful for including a prior into your model. const static bool force_last_weight_to_1 = false; @@ -56,20 +59,8 @@ namespace dlib - returns the dimensionality of the PSI() feature vector. !*/ - action_type find_best_action ( - const state_type& state, - const matrix& w - ) const; - /*! - ensures - - returns the action A that maximizes Q(state,A) = dot(w,PSI(state,A)). - That is, this function finds the best action to take in the given state - when our model is parameterized by the given weight vector w. - !*/ - void get_features ( const state_type& state, - const action_type& action, matrix& feats ) const; /*! @@ -83,14 +74,13 @@ namespace dlib // ---------------------------------------------------------------------------------------- template < - typename feature_extractor + typename model_type > struct process_sample { /*! - REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface - defined at the top of this file. + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. WHAT THIS OBJECT REPRESENTS This object holds a training sample for a reinforcement learning algorithm. @@ -99,9 +89,9 @@ namespace dlib receiving this->reward and ending up in the state this->next_state. 
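            As a purely illustrative sketch (my_model is only a placeholder name for
            whatever model type you use), a single observed transition would be
            stored as:
                process_sample<my_model> sample(s, a, s_next, r);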
!*/ - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + typedef typename model_type::reward_type reward_type; process_sample(){} @@ -109,13 +99,13 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const double& r + const reward_type& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - double reward; + reward_type reward; }; template < typename feature_extractor > @@ -128,86 +118,5 @@ namespace dlib // ---------------------------------------------------------------------------------------- - template < - typename feature_extractor - > - class policy - { - /*! - REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface - defined at the top of this file. - - WHAT THIS OBJECT REPRESENTS - This is a policy based on the supplied feature_extractor model. In - particular, it maps from feature_extractor::state_type to the best action - to take in that state. - !*/ - - public: - - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; - - - policy ( - ); - /*! - ensures - - #get_feature_extractor() == feature_extractor() - (i.e. it will have its default value) - - #get_weights().size() == #get_feature_extractor().num_features() - - #get_weights() == 0 - !*/ - - policy ( - const matrix& weights, - const feature_extractor& fe - ); - /*! - requires - - fe.num_features() == weights.size() - ensures - - #get_feature_extractor() == fe - - #get_weights() == weights - !*/ - - action_type operator() ( - const state_type& state - ) const; - /*! - ensures - - returns get_feature_extractor().find_best_action(state,w); - !*/ - - const feature_extractor& get_feature_extractor ( - ) const; - /*! - ensures - - returns the feature extractor used by this object - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_feature_extractor().num_features(). - !*/ - - }; - - template < typename feature_extractor > - void serialize(const policy& item, std::ostream& out); - template < typename feature_extractor > - void deserialize(policy& item, std::istream& in); - /*! - provides serialization support. - !*/ - -// ---------------------------------------------------------------------------------------- - - #endif // DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ diff --git a/dlib/control/model_abstract.h b/dlib/control/model_abstract.h new file mode 100644 index 0000000000..dc7bcbce8c --- /dev/null +++ b/dlib/control/model_abstract.h @@ -0,0 +1,135 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_MODEL_ABSTRACT_Hh_ +#ifdef DLIB_MODEL_ABSTRACT_Hh_ + +#include "approximate_linear_models_abstract.h" +#include "../matrix.h" + +namespace dlib +{ + + template < + template typename feature_extractor_type + > + class example_model + { + /*! + REQUIREMENTS ON feature_extractor + feature_extractor should implement the example_feature_extractor interface defined + at approximate_linear_models_abstract.h. 
+ + WHAT THIS OBJECT REPRESENTS + This is an example interface of a model class. This class represents an environment + where an agent will be deployed at. In particular, this class includes information + about the state space, action space and how to represent those states feature-wise. + !*/ + public: + + // You have to define state, action and reward types. + typedef U state_type; + typedef V action_type; + typedef W reward_type; + + // The feature extractor uses the same types as the model. + typedef feature_extractor_type feature_extractor; + + example_model( + ); + /*! + ensures + - #get_feature_extractor() == feature_extractor() + !*/ + + action_type random_action( + const state_type &state + ) const; + /*! + ensures + - returns a random reachable action from state. + !*/ + + action_type find_best_action( + const state_type &state, + const matrix &w + ) const; + /*! + requires + - w.size() == states_size() + ensures + - returns the action that maximizes the product + dot(w, get_feature_extractor().get_features(state)). + !*/ + + const feature_extractor& get_feature_extractor( + ) const; + /*! + ensures + - returns the feature_extractor used by the model. + !*/ + + auto states_size( + ) const -> decltype(get_feature_extractor().num_features()); + /*! + ensures + - returns get_feature_extractor().num_features(). + !*/ + + auto get_features( + const state_type &state + ) const -> decltype(get_feature_extractor().get_features(state)); + /*! + ensures + - returns get_feature_extractor().get_features(state); + !*/ + + state_type initial_state( + ) const; + /*! + ensures + - returns the initial state of the model. + !*/ + + state_type step( + const state_type &state, + const action_type &action + ) const; + /*! + requires + - action is a valid action from state. + ensures + - returns a state that is possible to be in after doing action + from state. + !*/ + + // The new_state parameter is need because the model doesn't have to be deterministic. + // Nonetheless for now I'll suppose that the reward is deterministic. + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const; + /*! + requires + - is possible to be in new_state after doing action from state. + ensures + - returns the reward obtained for reaching new_state from state after + doing action. + !*/ + + + }; + + template < template typename feature_extractor > + void serialize (const example_model& item, std::ostream& out); + template < template typename feature_extractor > + void deserialize (example_model& item, std::istream& in); + /*! + provides serialization support. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif diff --git a/dlib/control/policy.h b/dlib/control/policy.h new file mode 100644 index 0000000000..620f9b4f79 --- /dev/null +++ b/dlib/control/policy.h @@ -0,0 +1,180 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_POLICY_Hh_ +#define DLIB_POLICY_Hh_ + +#include +#include "../matrix.h" +#include "policy_abstract.h" + +namespace dlib +{ + + template < + typename model_type + > + class greedy_policy + { + public: + + typedef model_type feature_extractor_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + greedy_policy ( + ) + { + w.set_size(model.states_size()); + w = 0; + } + + greedy_policy ( + const matrix& weights_, + const model_type& model_ = model_type() + ) : w(weights_), model(model_) {} + + action_type operator() ( + const state_type& state + ) const + { + return model.find_best_action(state,w); + } + + const model_type& get_model ( + ) const { return model; } + + matrix& get_weights ( + ) { return w; } + + const matrix& get_weights ( + ) const { return w; } + + private: + matrix w; + model_type model; + }; + + template < typename model_type > + inline void serialize(const greedy_policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_model(), out); + serialize(item.get_weights(), out); + } + template < typename model_type > + inline void deserialize(greedy_policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); + model_type model; + matrix w; + deserialize(model, in); + deserialize(w, in); + item = greedy_policy(w,model); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename model_type, + typename generator = std::default_random_engine + > + class epsilon_policy + { + public: + + typedef model_type feature_extractor_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + epsilon_policy ( + double epsilon_, + const generator &gen_ = std::default_random_engine() + ) : epsilon(epsilon_), gen(gen_) + { + w.set_size(model.states_size()); + w = 0; + } + + epsilon_policy ( + double epsilon_, + const matrix& weights_, + const model_type& model_ = model_type(), + const generator gen_ = std::default_random_engine() + ) : w(weights_), model(model_), epsilon(epsilon_), gen(gen_) {} + + action_type operator() ( + const state_type& state + ) const + { + std::bernoulli_distribution d(epsilon); + if(d(gen)){ + // std::cout << "random\n"; + return model.random_action(state); + } + else{ +// std::cout << "best\n"; + return model.find_best_action(state,w); + } + //return d(gen) ? 
model.random_action(state) : model.find_best_action(state,w); + } + + const model_type& get_model ( + ) const { return model; } + + matrix& get_weights ( + ) { return w; } + + const matrix& get_weights ( + ) const { return w; } + + double get_epsilon( + ) const { return epsilon; } + + const generator& get_generator( + ) const { return gen; } + + private: + matrix w; + model_type model; + double epsilon; + + mutable generator gen; + }; + + template < typename model_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_model(), out); + serialize(item.get_weights(), out); + serialize(item.get_epsilon(), out); + serialize(item.get_generator(), out); + } + template < typename model_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); + model_type model; + matrix w; + double epsilon; + generator gen; + deserialize(model, in); + deserialize(w, in); + deserialize(epsilon, in); + deserialize(gen, in); + item = epsilon_policy(w,model, epsilon, gen); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_POLICY_Hh_ diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h new file mode 100644 index 0000000000..41b0e70fd7 --- /dev/null +++ b/dlib/control/policy_abstract.h @@ -0,0 +1,293 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_POLICY_ABSTRACT_Hh_ +#ifdef DLIB_POLICY_ABSTRACT_Hh_ + +#include +#include "../matrix.h" +#include "model_abstract.h" + +namespace dlib +{ + +template < + typename model_type + > +class example_policy +{ + /*! + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This is a policy based on the supplied model_type model. In + particular, it maps from model_type::state_type to a model_type::action + to take in that state. + !*/ + +public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + example_policy ( + ); + /*! + ensures + - #get_model() == model_type() + (i.e. it will have its default value) + - #get_weights().size() == #get_model().states_size() + - #get_weights() == 0 + !*/ + + example_policy ( + const matrix& weights, + const model_type& model + ); + /*! + requires + - model.states_size() == weights.size() + ensures + - #get_model() == model + - #get_weights() == weights + !*/ + + action_type operator() ( + const state_type& state + ) const; + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + +}; + +template < typename model_type > +void serialize(const example_policy& item, std::ostream& out); +template < typename model_type > +void deserialize(example_policy& item, std::istream& in); +/*! 
+ provides serialization support. +!*/ + +// ---------------------------------------------------------------------------------------- + +template < + typename model_type + > +class greedy_policy +{ + /*! + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the policy interface that returns the best action + based on the weights (i.e. it acts in a greedy fashion). + !*/ + +public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + greedy_policy ( + ); + /*! + ensures + - #get_model() == model_type() + (i.e. it will have its default value) + - #get_weights().size() == #get_model().states_size() + - #get_weights() == 0 + !*/ + + greedy_policy ( + const matrix& weights, + const model_type& model + ); + /*! + requires + - model.states_size() == weights.size() + ensures + - #get_model() == model + - #get_weights() == weights + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_model().find_best_action(state, w); + !*/ + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + +}; + +template < typename model_type > +void serialize(const greedy_policy& item, std::ostream& out); +template < typename model_type > +void deserialize(greedy_policy& item, std::istream& in); +/*! + provides serialization support. +!*/ + +// ---------------------------------------------------------------------------------------- + +template < + typename model_type, + typename generator + > +class epsilon_policy +{ + /*! + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. + + REQUIREMENTS ON generator + generator should be a PRNG type like the ones defined in std::random. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the policy interface that returns the best + action for the given state with probability 1-epsilon while it returns + an doable random action with probability epsilon. + !*/ + +public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + epsilon_policy ( + double epsilon, + const generator &gen = std::default_random_engine() + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_model() == model_type() + (i.e. it will have its default value) + - #get_weights().size() == #get_model().states_size() + - #get_weights() == 0 + - #get_epsilon() == epsilon + !*/ + + epsilon_policy ( + double epsilon, + const matrix& weights, + const model_type& model, + const generator &gen = std::default_random_engine() + ); + /*! + requires + - model.states_size() == weights.size() + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_model() == model + - #get_weights() == weights + - #get_epsilon() == epsilon + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_model().find_best_action(state, w) with probability 1-epsilon + and get_model().random_action(state) with probability epsilon. 
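              As an illustration, the selection performed in policy.h amounts to a
              Bernoulli draw with the stored generator, roughly:
                  std::bernoulli_distribution d(get_epsilon());
                  return d(gen) ? get_model().random_action(state)
                                : get_model().find_best_action(state, get_weights());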
+ !*/ + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the epsilon value used by the policy. + !*/ + + const generator& get_generator( + ) const; + /*! + ensures + - returns the generator used by the policy. + !*/ + +}; + +template < typename model_type > +void serialize(const epsilon_policy& item, std::ostream& out); +template < typename model_type > +void deserialize(epsilon_policy& item, std::istream& in); +/*! + provides serialization support. +!*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_POLICY_ABSTRACT_Hh_ diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h new file mode 100644 index 0000000000..70a1fb0634 --- /dev/null +++ b/dlib/control/qlearning.h @@ -0,0 +1,177 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_QLEARNING_Hh_ +#define DLIB_QLEARNING_Hh_ + +#include "policy.h" + +namespace dlib +{ + template < + typename model_type + > + class qlearning + { + public: + explicit qlearning( + double lr = 0.2, + double disc = 0.8, + unsigned int miters = 100u, + double eps = 0.1, + bool v = false + ) : max_iterations(miters), verbose(v) { + set_learning_rate(lr); + set_discount(disc); + set_epsilon(eps); + } + + double get_learning_rate( + ) const { return learning_rate; } + + void set_learning_rate( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t qlearning::set_learning_rate(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + learning_rate = value; + } + + double get_discount( + ) const { return discount; } + + void set_discount( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t qlearning::set_discount(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + discount = value; + } + + unsigned int get_max_iterations( + ) const { return max_iterations; } + + void set_max_iterations( + unsigned int iterations + ) { max_iterations = iterations; } + + double get_epsilon( + ) const { return epsilon; } + + void set_epsilon( + double value + ) + { + DLIB_ASSERT(value >= 0. 
&& value <= 1., + "\t qlearning::set_epsilon(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + epsilon = value; + } + + bool is_verbose( + ) const { return verbose; } + + void be_verbose( + ) { verbose = true; } + + void be_quiet( + ) { verbose = false; } + + greedy_policy train( + const matrix &weights + ) const + { + typedef typename model_type::reward_type reward_type; + + epsilon_policy eps_pol(epsilon, weights); + auto& w = eps_pol.get_weights(); + + DLIB_ASSERT(weights.size() == model.states_size(), + "\t qlearning::train(weights)" + "\n\t invalid inputs were given to this function" + "\n\t weights.size: " << weights.size() << + "\n\t features size: " << model.states_size() + ); + + reward_type total_reward = static_cast(0); + for(auto iter = 0u; iter < max_iterations; ++iter){ + auto state = model.initial_state(); + + reward_type reward = static_cast(0); + while(!model.is_final(state)){ + auto action = eps_pol(state); + auto next_state = model.step(state, action); + auto next_reward = model.reward(state, action, next_state); + + const auto feats = model.get_features(state, action); + const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); + + auto prev = w; + + double correction = reward + discount * dot(w, feats_next_best) - dot(w, feats); + //std::cout << "correction " << correction << "\n"; + w += learning_rate * correction * feats; + + /*for(auto i = 0; i < model.states_size(); i++) + std::cout << w(i) << " "; + std::cout << std::endl; + + for(auto i = 0; i < model.states_size(); i++) + std::cout << feats(i) << " "; + std::cout << std::endl; + + + if(verbose && sum(abs(w-prev)) != 0){ + std::cout << "updated:\n"; + for(auto i = 0; i < model.states_size(); i++){ + if(prev(i) != w(i)) + std::cout << "(" << i/5 << "," << i%5 << ") from " << prev(i) << " to " << w(i) << "\n"; + } + } + */ + + state = next_state; + reward += next_reward; + } + + total_reward += reward; + if(verbose) + std::cout << "iteration: " << iter << "\t reward: " << reward + << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + } + + return greedy_policy(w); + } + + greedy_policy train( + ) const + { + matrix weights; + weights = 0; + return train(weights); + } + + private: + double learning_rate; + double discount; + unsigned int max_iterations; + double epsilon; + bool verbose; + + model_type model; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_QLEARNING_Hh_ diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h new file mode 100644 index 0000000000..9fb227d0ba --- /dev/null +++ b/dlib/control/qlearning_abstract.h @@ -0,0 +1,171 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_QLEARNING_ABSTRACT_Hh_ +#ifdef DLIB_QLEARNING_ABSTRACT_Hh_ + +#include "policy_abstract.h" +#include "model_abstract.h" + +namespace dlib +{ + template < + typename model_type + > + class qlearning + { + /*! + REQUIREMENTS ON model_type + model_type is an implementation of the model interface declared in + model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This objects is an implementation of the well-known reinforcement learning + algorithm Q-learning. This algorithms takes a bunch of process_samples + as input and outputs a policy that have learnt from that in order to take + the better results. 
+ + Supposing we are in state s and action a the learning function has the form: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s, a')) + where lr is the learning_rate and disc the discount. + That formula means that it takes a convex combination of the current qvalue + and the expected qvalue. + !*/ + + public: + qlearning( + ); + /*! + ensures + - #get_learning_rate() == 0.2 + - #get_discount() == 0.8 + - #get_max_iterations() == 100 + - #get_epsilon() == 0.1 + - #is not verbose + !*/ + + explicit qlearning( + double learning_rate, + double discount, + unsigned int max_iterations, + double epsilon, + bool verbose + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1 + - discount >= 0 and discount <= 1 + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_learning_rate() == learning_rate + - #get_discount() == discount + - #get_max_iterations() == max_iterations + - #get_epsilon() == epsilon + - #is_verbose() == verbose + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - returns the learning rate applied to the learning function. + !*/ + + void set_learning_rate( + double learning_rate + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1. + ensures + - #get_learning_rate() == learning_rate + !*/ + + double get_discount( + ) const; + /*! + ensures + - returns the discount applied to the learning function. + !*/ + + void set_discount( + double discount + ); + /*! + requires + - discount >= 0 and discount <= 1. + ensures + - #get_discount() == discount + !*/ + + unsigned int get_max_iterations( + ) const; + /*! + ensures + - returns the maximum number of iterations that qlearning will + perform during the training. + !*/ + + void set_max_iterations( + unsigned int iterations + ); + /*! + ensures + - #get_max_iterations() == iterations + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the probability of doing a non-optimal step while training. + !*/ + + void set_epsilon( + double epsilon + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1. + ensures + - #get_epsilon() == epsilon + !*/ + + bool is_verbose( + ) const; + /*! + ensures + - returns if the class is verbose or not. + !*/ + + void be_verbose( + ); + /*! + ensures + - #is_verbose() == true + !*/ + + void be_quiet( + ); + /*! + ensures + - #is_verbose() == false + !*/ + + greedy_policy train( + const matrix &weights + ) const; + /*! + requires + - weights.size() == model_type.states_size() + ensures + - returns a greedy_policy resulting of doing max_iterations iterations + over the model while applying the learning function to the weights + matrix of the policy. 
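                In terms of the feature representation (Q(s,a) == dot(w, PSI(s,a))),
                the learning function above corresponds to the weight update
                    w += learning_rate * (reward + discount * max_a' dot(w, PSI(s',a'))
                                          - dot(w, PSI(s,a))) * PSI(s,a);
                As a usage sketch only (model_type stands for any class implementing
                the interface in model_abstract.h, e.g. the cliff_model defined in
                examples/qlearning_sarsa_ex.cpp):
                    model_type model;
                    qlearning<model_type> trainer;
                    trainer.be_verbose();
                    trainer.set_max_iterations(100);
                    auto policy = trainer.train();        // starts from all-zero weights
                    auto action = policy(model.initial_state());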
+ !*/ + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_QLEARNING_ABSTRACT_Hh_ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 34e46ca048..c95c4c2c89 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -154,6 +154,7 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER) add_example(dnn_semantic_segmentation_train_ex) add_example(dnn_instance_segmentation_train_ex) add_example(dnn_metric_learning_on_images_ex) + add_example(qlearning_sarsa_ex) endif() diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp new file mode 100644 index 0000000000..44f8a13d34 --- /dev/null +++ b/examples/qlearning_sarsa_ex.cpp @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include + +using namespace dlib; + +template < + typename state_type, + typename action_type + > +class feature_extractor +{ +public: + feature_extractor( + unsigned int h, + unsigned int w, + unsigned int na + ) : height(h), width(w), num_actions(na) {} + + + inline long num_features( + ) const { return num_actions * height * width; } + + matrix get_features( + const state_type &state, + const action_type &action + ) const + { + matrix feats(num_features()); + feats = 0; + //for(auto i = 0u; i < num_actions; i++) + // feats(num_actions * state + i) = 1; + feats(num_actions*state + static_cast(action)) = 1; + + return feats; + } + +private: + int height, width, num_actions; +}; + +template < + int height, + int width, + template class feature_extractor_type + > +class cliff_model +{ +public: + enum class actions {up = 0, right, down, left}; + constexpr static double EPS = 1e-16; + + typedef int state_type; + typedef actions action_type; + typedef int reward_type; + + typedef feature_extractor_type feature_extractor; + + explicit cliff_model( + ) : fe(height, width, 4){} + + action_type random_action( + const state_type& state + ) const + { + std::uniform_int_distribution dist(0,3); + return static_cast(dist(gen)); + } + + action_type find_best_action( + const state_type& state, + const matrix& w + ) const + { + auto best = std::numeric_limits::lowest(); + auto best_indexes = std::vector(); + + for(auto i = 0; i < 4; i++){ + auto feats = get_features(state, static_cast(i)); + auto product = dot(w, feats); + if(product > best){ + best = product; + best_indexes.clear(); + } + + if(std::abs(product - best) < EPS) + best_indexes.push_back(i); + } + + std::uniform_int_distribution dist(0, best_indexes.size()-1); + return static_cast(best_indexes[dist(gen)]); + } + + const feature_extractor& get_feature_extractor( + ) const { return fe; } + + auto states_size( + ) const -> decltype(get_feature_extractor().num_features()) + { + return get_feature_extractor().num_features(); + } + + auto get_features( + const state_type &state, + const action_type &action + ) const -> decltype(get_feature_extractor().get_features(state, action)) + { return get_feature_extractor().get_features(state, action); } + + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const + { + return !is_final(new_state) ? -1 : is_success(new_state) ? 100 : -100; + } + + state_type initial_state( + ) const { return static_cast((height-1) * width); } + + state_type step( + const state_type& state, + const action_type& action + ) const + { + if(out_of_bounds(state, action)) + return state; + + return action == actions::up ? state - width : + action == actions::down ? 
state + width : + action == actions::right ? state + 1 : + state - 1 ; + } + + bool is_success( + const state_type &state + ) const { return state == height*width - 1; } + + bool is_failure( + const state_type &state + ) const { return state/width == height-1 && state%width > 0 && state%width < width-1;} + + bool is_final( + const state_type& state + ) const { return is_success(state) || is_failure(state); } + +private: + bool out_of_bounds( + const state_type& state, + const action_type& action + ) const + { + bool result; + + switch(action){ + case actions::up: + result = state / width == 0; + break; + case actions::down: + result = (state / width == height-2 && state % width > 0 && state % width < width-1) + || state / width == height-1; + break; + case actions::left: + result = state % width == 0; // || state == height*width-1; <- is the goal condition + break; + case actions::right: + result = state % width == width-1 || state == (height-1)*width; + break; + } + + return result; + } + + feature_extractor fe; + mutable std::default_random_engine gen; +}; + +#include +int main(int argc, char** argv) +{ + std::cout << "Hello." << std::endl; + + const auto height = 3u; + const auto width = 5u; + + typedef cliff_model model_type; + + model_type model; + qlearning algorithm; + algorithm.be_verbose(); + algorithm.set_max_iterations(100); + + auto policy = algorithm.train(); + + auto s = model.initial_state(); + int r = 0; //TODO + for(auto i = 0u; i < 100 && !model.is_final(s); i++){ + auto a = policy(s); + auto new_s = model.step(s, a); + r += model.reward(s,a,new_s); + s = new_s; + } + + if(!model.is_final(s)) + std::cout << "Nothing reached after 100 steps." << std::endl; + else if(model.is_failure(s)) + std::cout << "Failed." << std::endl; + else + std::cout << "Success." << std::endl; + + std::cout << "Good bye." << std::endl; + + return 0; +} From f0bd6a889994b339895b140dad660274a4f2ce1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 9 Dec 2017 19:02:01 +0100 Subject: [PATCH 02/14] Added Sarsa and an example. Everything is working alright. --- dlib/control.h | 9 +- .../approximate_linear_models_abstract.h | 11 +- dlib/control/model_abstract.h | 70 +++++-- dlib/control/policy.h | 84 ++++---- dlib/control/policy_abstract.h | 80 ++++--- dlib/control/qlearning.h | 74 +++---- dlib/control/qlearning_abstract.h | 48 +++-- dlib/control/sarsa.h | 164 +++++++++++++++ dlib/control/sarsa_abstract.h | 195 ++++++++++++++++++ examples/qlearning_sarsa_ex.cpp | 167 ++++++++++----- 10 files changed, 671 insertions(+), 231 deletions(-) create mode 100644 dlib/control/sarsa.h create mode 100644 dlib/control/sarsa_abstract.h diff --git a/dlib/control.h b/dlib/control.h index 85d00817d5..9c0ec80781 100644 --- a/dlib/control.h +++ b/dlib/control.h @@ -1,11 +1,14 @@ // Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. 
-#ifndef DLIB_CONTRoL_ -#define DLIB_CONTRoL_ +#ifndef DLIB_CONTROL_ +#define DLIB_CONTROL_ +#include "control/policy.h" #include "control/lspi.h" #include "control/mpc.h" +#include "control/qlearning.h" +#include "control/sarsa.h" -#endif // DLIB_CONTRoL_ +#endif // DLIB_CONTROL_ diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index a5dc2a6b13..74f99da4ab 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -59,14 +59,15 @@ namespace dlib - returns the dimensionality of the PSI() feature vector. !*/ - void get_features ( - const state_type& state, - matrix& feats + matrix get_features ( + const state_type &state, + const action_type &action ) const; /*! + requires + - action is a valid option from state. ensures - - #feats.size() == num_features() - - #feats == PSI(state,action) + - returns PSI(state,action) !*/ }; diff --git a/dlib/control/model_abstract.h b/dlib/control/model_abstract.h index dc7bcbce8c..8abb0f326e 100644 --- a/dlib/control/model_abstract.h +++ b/dlib/control/model_abstract.h @@ -10,7 +10,7 @@ namespace dlib { template < - template typename feature_extractor_type + template class feature_extractor_type > class example_model { @@ -21,8 +21,13 @@ namespace dlib WHAT THIS OBJECT REPRESENTS This is an example interface of a model class. This class represents an environment - where an agent will be deployed at. In particular, this class includes information - about the state space, action space and how to represent those states feature-wise. + where an agent will be deployed at. In other words, it is an interface between the + simulated/real world and the agent that has to be there. In short this class: + - Holds information about the state, action and reward space. + - Delegates the state representation to the feature_extractor. + - Provides an initial state to start the agent. + - Offers an interface to move in the world (look for actions, make steps in it + and get a feedback/reward for them). !*/ public: @@ -76,11 +81,29 @@ namespace dlib !*/ auto get_features( - const state_type &state - ) const -> decltype(get_feature_extractor().get_features(state)); + const state_type &state, + const action_type &action + ) const -> decltype(get_feature_extractor().get_features(state, action)); + /*! + ensures + - returns get_feature_extractor().get_features(state, action); + !*/ + + // The new_state parameter is needed because the model doesn't have to be deterministic. + // Nonetheless for now we will suppose that the rewards are deterministic. + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const; /*! + requires + - action is available in state. + - new_state is a possible outcome when you do action on state. ensures - - returns get_feature_extractor().get_features(state); + - returns the reward obtained by going to new_state from state + doing action. + - the function is deterministic with respect to its arguments. !*/ state_type initial_state( @@ -102,27 +125,38 @@ namespace dlib from state. !*/ - // The new_state parameter is need because the model doesn't have to be deterministic. - // Nonetheless for now I'll suppose that the reward is deterministic. - reward_type reward( - const state_type &state, - const action_type &action, - const state_type &new_state + bool is_success( + const state_type &state ) const; /*! 
- requires - - is possible to be in new_state after doing action from state. ensures - - returns the reward obtained for reaching new_state from state after - doing action. + - returns whether state is a goal state (the agent has done its task properly). + !*/ + + bool is_failure( + const state_type &state + ) const; + /*! + ensures + - returns whether state is a failure state, i.e., a state where the agent has + failed his task. + !*/ + + bool is_final( + const state_type& state + ) const; + /*! + ensures + - returns whether state is a final state, i.e., it is a state where the agent can't + advance anymore. In another words, whether state is a success or failure state. !*/ }; - template < template typename feature_extractor > + template < template class feature_extractor > void serialize (const example_model& item, std::ostream& out); - template < template typename feature_extractor > + template < template class feature_extractor > void deserialize (example_model& item, std::istream& in); /*! provides serialization support. diff --git a/dlib/control/policy.h b/dlib/control/policy.h index 620f9b4f79..5ad8c58322 100644 --- a/dlib/control/policy.h +++ b/dlib/control/policy.h @@ -3,9 +3,10 @@ #ifndef DLIB_POLICY_Hh_ #define DLIB_POLICY_Hh_ -#include #include "../matrix.h" #include "policy_abstract.h" +#include +#include namespace dlib { @@ -22,15 +23,16 @@ namespace dlib typedef typename model_type::action_type action_type; greedy_policy ( - ) + const model_type &model_ + ) : model(model_) { w.set_size(model.states_size()); w = 0; } greedy_policy ( - const matrix& weights_, - const model_type& model_ = model_type() + const model_type &model_, + const matrix& weights_ ) : w(weights_), model(model_) {} action_type operator() ( @@ -51,7 +53,7 @@ namespace dlib private: matrix w; - model_type model; + const model_type &model; }; template < typename model_type > @@ -79,57 +81,40 @@ namespace dlib // ---------------------------------------------------------------------------------------- template < - typename model_type, + typename policy_type, typename generator = std::default_random_engine > class epsilon_policy { public: - - typedef model_type feature_extractor_type; - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; epsilon_policy ( double epsilon_, - const generator &gen_ = std::default_random_engine() - ) : epsilon(epsilon_), gen(gen_) - { - w.set_size(model.states_size()); - w = 0; - } - - epsilon_policy ( - double epsilon_, - const matrix& weights_, - const model_type& model_ = model_type(), - const generator gen_ = std::default_random_engine() - ) : w(weights_), model(model_), epsilon(epsilon_), gen(gen_) {} + const policy_type &policy_, + const generator &gen_ = generator() + ) : policy(policy_), epsilon(epsilon_), gen(gen_) {} action_type operator() ( const state_type& state ) const { std::bernoulli_distribution d(epsilon); - if(d(gen)){ - // std::cout << "random\n"; - return model.random_action(state); - } - else{ -// std::cout << "best\n"; - return model.find_best_action(state,w); - } - //return d(gen) ? model.random_action(state) : model.find_best_action(state,w); + return d(gen) ? 
get_model().random_action(state) : policy(state); } - const model_type& get_model ( - ) const { return model; } + policy_type get_policy( + ) const { return policy; } + + auto get_model ( + ) const -> decltype(get_policy().get_model()) { return policy.get_model(); } matrix& get_weights ( - ) { return w; } + ) { return policy.get_weights(); } const matrix& get_weights ( - ) const { return w; } + ) const { return policy.get_weights(); } double get_epsilon( ) const { return epsilon; } @@ -138,43 +123,44 @@ namespace dlib ) const { return gen; } private: - matrix w; - model_type model; + policy_type policy; double epsilon; mutable generator gen; }; - template < typename model_type, typename generator > - inline void serialize(const epsilon_policy& item, std::ostream& out) + template < typename policy_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out) { int version = 1; serialize(version, out); - serialize(item.get_model(), out); - serialize(item.get_weights(), out); + serialize(item.get_policy(), out); serialize(item.get_epsilon(), out); serialize(item.get_generator(), out); } - template < typename model_type, typename generator > - inline void deserialize(epsilon_policy& item, std::istream& in) + + template < typename policy_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in) { int version = 0; deserialize(version, in); if (version != 1) throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); - model_type model; - matrix w; + + policy_type policy; double epsilon; generator gen; - deserialize(model, in); - deserialize(w, in); + deserialize(policy, in); deserialize(epsilon, in); deserialize(gen, in); - item = epsilon_policy(w,model, epsilon, gen); + item = epsilon_policy(epsilon, policy, gen); } // ---------------------------------------------------------------------------------------- + // For backward compability with lspi + template < typename model_type > + using policy = greedy_policy; //template aliasing is possible post C++11 } #endif // DLIB_POLICY_Hh_ diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h index 41b0e70fd7..991bf96296 100644 --- a/dlib/control/policy_abstract.h +++ b/dlib/control/policy_abstract.h @@ -3,7 +3,6 @@ #undef DLIB_POLICY_ABSTRACT_Hh_ #ifdef DLIB_POLICY_ABSTRACT_Hh_ -#include #include "../matrix.h" #include "model_abstract.h" @@ -31,18 +30,18 @@ class example_policy typedef typename model_type::action_type action_type; example_policy ( + const model_type &model ); /*! ensures - - #get_model() == model_type() - (i.e. it will have its default value) + - #get_model() == model - #get_weights().size() == #get_model().states_size() - #get_weights() == 0 !*/ example_policy ( - const matrix& weights, - const model_type& model + const model_type& model, + const matrix& weights ); /*! requires @@ -111,18 +110,18 @@ class greedy_policy typedef typename model_type::action_type action_type; greedy_policy ( + const model_type &model ); /*! ensures - - #get_model() == model_type() - (i.e. it will have its default value) + - #get_model() == model - #get_weights().size() == #get_model().states_size() - #get_weights() == 0 !*/ greedy_policy ( - const matrix& weights, - const model_type& model + const model_type& model, + const matrix& weights ); /*! 
requires @@ -176,58 +175,42 @@ void deserialize(greedy_policy& item, std::istream& in); // ---------------------------------------------------------------------------------------- template < - typename model_type, + typename policy_type, typename generator > class epsilon_policy { /*! - REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. + REQUIREMENTS ON policy_type + policy_type should implement the example_policy interface defined at the + top of this file. REQUIREMENTS ON generator generator should be a PRNG type like the ones defined in std::random. WHAT THIS OBJECT REPRESENTS - This is an implementation of the policy interface that returns the best - action for the given state with probability 1-epsilon while it returns - an doable random action with probability epsilon. + This is a special policy that returns the best action (according to the + underlying policy) for the given state with probability 1-epsilon + while it returns a valid random action with probability epsilon. !*/ public: - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - epsilon_policy ( - double epsilon, - const generator &gen = std::default_random_engine() - ); - /*! - requires - - epsilon >= 0 and epsilon <= 1 - ensures - - #get_model() == model_type() - (i.e. it will have its default value) - - #get_weights().size() == #get_model().states_size() - - #get_weights() == 0 - - #get_epsilon() == epsilon - !*/ + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; epsilon_policy ( double epsilon, - const matrix& weights, - const model_type& model, - const generator &gen = std::default_random_engine() + const policy_type &policy, + const generator &gen = generator() ); /*! requires - - model.states_size() == weights.size() - epsilon >= 0 and epsilon <= 1 ensures - - #get_model() == model - - #get_weights() == weights - #get_epsilon() == epsilon + - #get_policy() == policy + - #get_generator() == gen !*/ action_type operator() ( @@ -235,15 +218,22 @@ class epsilon_policy ) const; /*! ensures - - returns get_model().find_best_action(state, w) with probability 1-epsilon + - returns get_policy()(state, w) with probability 1-epsilon and get_model().random_action(state) with probability epsilon. !*/ - const model_type& get_model ( + policy_type get_policy( ) const; /*! ensures - - returns the model used by this object + - returns the underlying policy used by the object. + !*/ + + auto get_model ( + ) const -> decltype(get_policy().get_model()); + /*! + ensures + - returns the model used by the underlying policy. !*/ matrix& get_weights ( @@ -278,10 +268,10 @@ class epsilon_policy }; -template < typename model_type > -void serialize(const epsilon_policy& item, std::ostream& out); -template < typename model_type > -void deserialize(epsilon_policy& item, std::istream& in); +template < typename policy_type, typename generator > +inline void serialize(const epsilon_policy& item, std::ostream& out); +template < typename policy_type, typename generator > +inline void deserialize(epsilon_policy& item, std::istream& in); /*! provides serialization support. 
!*/ diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 70a1fb0634..56e944ec74 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -4,12 +4,11 @@ #define DLIB_QLEARNING_Hh_ #include "policy.h" +#include +#include namespace dlib { - template < - typename model_type - > class qlearning { public: @@ -19,7 +18,7 @@ namespace dlib unsigned int miters = 100u, double eps = 0.1, bool v = false - ) : max_iterations(miters), verbose(v) { + ) : iterations(miters), verbose(v) { set_learning_rate(lr); set_discount(disc); set_epsilon(eps); @@ -55,12 +54,12 @@ namespace dlib discount = value; } - unsigned int get_max_iterations( - ) const { return max_iterations; } + unsigned int get_iterations( + ) const { return iterations; } - void set_max_iterations( + void set_iterations( unsigned int iterations - ) { max_iterations = iterations; } + ) { iterations = iterations; } double get_epsilon( ) const { return epsilon; } @@ -86,13 +85,20 @@ namespace dlib void be_quiet( ) { verbose = false; } - greedy_policy train( - const matrix &weights + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy ) const { - typedef typename model_type::reward_type reward_type; + typedef typename std::decay::type::reward_type reward_type; + + if(verbose) + std::cout << "Starting training..." << std::endl; - epsilon_policy eps_pol(epsilon, weights); + const auto &model = policy.get_model(); + epsilon_policy eps_pol(epsilon, policy); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), @@ -103,7 +109,8 @@ namespace dlib ); reward_type total_reward = static_cast(0); - for(auto iter = 0u; iter < max_iterations; ++iter){ + std::cout << "iterations: " << iterations << std::endl; + for(auto iter = 0u; iter < iterations; ++iter){ auto state = model.initial_state(); reward_type reward = static_cast(0); @@ -115,30 +122,9 @@ namespace dlib const auto feats = model.get_features(state, action); const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); - auto prev = w; - double correction = reward + discount * dot(w, feats_next_best) - dot(w, feats); - //std::cout << "correction " << correction << "\n"; w += learning_rate * correction * feats; - /*for(auto i = 0; i < model.states_size(); i++) - std::cout << w(i) << " "; - std::cout << std::endl; - - for(auto i = 0; i < model.states_size(); i++) - std::cout << feats(i) << " "; - std::cout << std::endl; - - - if(verbose && sum(abs(w-prev)) != 0){ - std::cout << "updated:\n"; - for(auto i = 0; i < model.states_size(); i++){ - if(prev(i) != w(i)) - std::cout << "(" << i/5 << "," << i%5 << ") from " << prev(i) << " to " << w(i) << "\n"; - } - } - */ - state = next_state; reward += next_reward; } @@ -149,25 +135,25 @@ namespace dlib << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; } - return greedy_policy(w); + if(verbose) + std::cout << "Training finished." 
<< std::endl; + + return eps_pol.get_policy(); } + template < + typename model_type + > greedy_policy train( - ) const - { - matrix weights; - weights = 0; - return train(weights); - } + const model_type &model + ) const { return train_policy(greedy_policy(model)); } private: double learning_rate; double discount; - unsigned int max_iterations; + unsigned int iterations; double epsilon; bool verbose; - - model_type model; }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index 9fb227d0ba..ccac305890 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -24,11 +24,15 @@ namespace dlib as input and outputs a policy that have learnt from that in order to take the better results. - Supposing we are in state s and action a the learning function has the form: - Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s, a')) + Supposing we are in state s and action a and we are going to a new state s' + the learning function has the form: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s', a')) where lr is the learning_rate and disc the discount. That formula means that it takes a convex combination of the current qvalue and the expected qvalue. + + Note that it is an off-policy reinforcement learning algorithm meaning + that it doesn't take the policy into account while learning. !*/ public: @@ -38,7 +42,7 @@ namespace dlib ensures - #get_learning_rate() == 0.2 - #get_discount() == 0.8 - - #get_max_iterations() == 100 + - #get_iterations() == 100 - #get_epsilon() == 0.1 - #is not verbose !*/ @@ -46,7 +50,7 @@ namespace dlib explicit qlearning( double learning_rate, double discount, - unsigned int max_iterations, + unsigned int iterations, double epsilon, bool verbose ); @@ -58,7 +62,7 @@ namespace dlib ensures - #get_learning_rate() == learning_rate - #get_discount() == discount - - #get_max_iterations() == max_iterations + - #get_iterations() == iterations - #get_epsilon() == epsilon - #is_verbose() == verbose !*/ @@ -97,7 +101,7 @@ namespace dlib - #get_discount() == discount !*/ - unsigned int get_max_iterations( + unsigned int get_iterations( ) const; /*! ensures @@ -105,12 +109,12 @@ namespace dlib perform during the training. !*/ - void set_max_iterations( + void set_iterations( unsigned int iterations ); /*! ensures - - #get_max_iterations() == iterations + - #get_iterations() == iterations !*/ double get_epsilon( @@ -151,16 +155,34 @@ namespace dlib - #is_verbose() == false !*/ + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy + ) const; + /*! + requires + - policy is of the form example_policy, i.e., an instance of + an implementation of the policy interface defined in policy_abstract.h. + ensures + - returns a policy of the type policy_type as the result of applying the + qlearning learning function over iterations runs over using the weight + matrix of the argument as the initial weights. + !*/ + + template < + typename model_type + > greedy_policy train( - const matrix &weights + const model_type &model ) const; /*! requires - - weights.size() == model_type.states_size() + - model_type is an implementation of the example_model interface defined + at model_abstract.h. ensures - - returns a greedy_policy resulting of doing max_iterations iterations - over the model while applying the learning function to the weights - matrix of the policy. 
+ - returns train_policy(greedy_policy(model)); !*/ }; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h new file mode 100644 index 0000000000..bcb978f7fa --- /dev/null +++ b/dlib/control/sarsa.h @@ -0,0 +1,164 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_SARSA_Hh_ +#define DLIB_SARSA_Hh_ + +#include "policy.h" +#include +#include + +namespace dlib +{ + class sarsa + { + public: + explicit sarsa( + double lr = 0.2, + double disc = 0.8, + unsigned int miters = 100u, + double eps = 0.1, + bool v = false + ) : iterations(miters), verbose(v) { + set_learning_rate(lr); + set_discount(disc); + set_epsilon(eps); + } + + double get_learning_rate( + ) const { return learning_rate; } + + void set_learning_rate( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t sarsa::set_learning_rate(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + learning_rate = value; + } + + double get_discount( + ) const { return discount; } + + void set_discount( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t sarsa::set_discount(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + discount = value; + } + + unsigned int get_iterations( + ) const { return iterations; } + + void set_iterations( + unsigned int iterations + ) { iterations = iterations; } + + double get_epsilon( + ) const { return epsilon; } + + void set_epsilon( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t sarsa::set_epsilon(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + epsilon = value; + } + + bool is_verbose( + ) const { return verbose; } + + void be_verbose( + ) { verbose = true; } + + void be_quiet( + ) { verbose = false; } + + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy + ) const + { + typedef typename std::decay::type::reward_type reward_type; + + if(verbose) + std::cout << "Starting training..." << std::endl; + + const auto &model = policy.get_model(); + epsilon_policy eps_pol(epsilon, policy); + auto& w = eps_pol.get_weights(); + + DLIB_ASSERT(weights.size() == model.states_size(), + "\t sarsa::train(weights)" + "\n\t invalid inputs were given to this function" + "\n\t weights.size: " << weights.size() << + "\n\t features size: " << model.states_size() + ); + + reward_type total_reward = static_cast(0); + for(auto iter = 0u; iter < iterations; ++iter){ + auto state = model.initial_state(); + auto action = eps_pol(state); + + reward_type reward = static_cast(0); + while(!model.is_final(state)){ + auto next_state = model.step(state, action); + auto next_action = eps_pol(next_state); + auto next_reward = model.reward(state, action, next_state); + + const auto feats = model.get_features(state, action); + const auto feats_next = model.get_features(next_state, next_action); + + double correction = reward + discount * dot(w, feats_next) - dot(w, feats); + w += learning_rate * correction * feats; + + state = next_state; + action = next_action; + reward += next_reward; + } + + total_reward += reward; + if(verbose) + std::cout << "iteration: " << iter << "\t reward: " << reward + << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + } + + if(verbose) + std::cout << "Training finished." 
<< std::endl; + + return eps_pol.get_policy(); } + + template < + typename model_type + > + greedy_policy train( + const model_type &model + ) const { return train_policy(greedy_policy(model)); } + + private: + double learning_rate; + double discount; + unsigned int iterations; + double epsilon; + bool verbose; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_SARSA_Hh_ diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h new file mode 100644 index 0000000000..f4d559a8d9 --- /dev/null +++ b/dlib/control/sarsa_abstract.h @@ -0,0 +1,195 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_SARSA_ABSTRACT_Hh_ +#ifdef DLIB_SARSA_ABSTRACT_Hh_ + +#include "policy_abstract.h" +#include "model_abstract.h" + +namespace dlib +{ + template < + typename model_type + > + class sarsa + { + /*! + REQUIREMENTS ON model_type + model_type is an implementation of the model interface declared in + model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This object is an implementation of the well-known reinforcement learning + algorithm SARSA. This algorithm takes a bunch of process_samples + as input and outputs a policy that has learnt from them in order to obtain + better results. + + Supposing we are in state s, taking action a and moving to a new state s', + the learning function has the form: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a')) + where lr is the learning_rate, disc the discount and a' is the next action + the algorithm will perform after reaching s'. + That formula means that it takes a convex combination of the current qvalue + and the expected qvalue. + + Note that, unlike qlearning, sarsa is an on-policy reinforcement learning + algorithm meaning that it takes the policy into account while learning. + !*/ + + public: + sarsa( + ); + /*! + ensures + - #get_learning_rate() == 0.2 + - #get_discount() == 0.8 + - #get_iterations() == 100 + - #get_epsilon() == 0.1 + - #is not verbose + !*/ + + explicit sarsa( + double learning_rate, + double discount, + unsigned int iterations, + double epsilon, + bool verbose + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1 + - discount >= 0 and discount <= 1 + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_learning_rate() == learning_rate + - #get_discount() == discount + - #get_iterations() == iterations + - #get_epsilon() == epsilon + - #is_verbose() == verbose + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - returns the learning rate applied to the learning function. + !*/ + + void set_learning_rate( + double learning_rate + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1. + ensures + - #get_learning_rate() == learning_rate + !*/ + + double get_discount( + ) const; + /*! + ensures + - returns the discount applied to the learning function. + !*/ + + void set_discount( + double discount + ); + /*! + requires + - discount >= 0 and discount <= 1. + ensures + - #get_discount() == discount + !*/ + + unsigned int get_iterations( + ) const; + /*! + ensures + - returns the maximum number of iterations that sarsa will + perform during training. + !*/ + + void set_iterations( + unsigned int iterations + ); + /*! + ensures + - #get_iterations() == iterations + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the probability of doing a non-optimal step while training.
+ !*/ + + void set_epsilon( + double epsilon + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1. + ensures + - #get_epsilon() == epsilon + !*/ + + bool is_verbose( + ) const; + /*! + ensures + - returns if the class is verbose or not. + !*/ + + void be_verbose( + ); + /*! + ensures + - #is_verbose() == true + !*/ + + void be_quiet( + ); + /*! + ensures + - #is_verbose() == false + !*/ + + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy + ) const; + /*! + requires + - policy is of the form example_policy, i.e., an instance of + an implementation of the policy interface defined in policy_abstract.h. + ensures + - returns a policy of the type policy_type as the result of applying the + sarsa learning function over iterations runs over using the weight + matrix of the argument as the initial weights. + !*/ + + template < + typename model_type + > + greedy_policy train( + const model_type &model + ) const; + /*! + requires + - model_type is an implementation of the example_model interface defined + at model_abstract.h. + ensures + - returns train_policy(greedy_policy(model)); + !*/ + }; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_SARSA_ABSTRACT_Hh_ diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 44f8a13d34..612eb6c54a 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -1,5 +1,7 @@ -#include +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. #include +#include #include #include #include @@ -7,41 +9,18 @@ using namespace dlib; -template < - typename state_type, - typename action_type - > -class feature_extractor -{ -public: - feature_extractor( - unsigned int h, - unsigned int w, - unsigned int na - ) : height(h), width(w), num_actions(na) {} - - - inline long num_features( - ) const { return num_actions * height * width; } - - matrix get_features( - const state_type &state, - const action_type &action - ) const - { - matrix feats(num_features()); - feats = 0; - //for(auto i = 0u; i < num_actions; i++) - // feats(num_actions * state + i) = 1; - feats(num_actions*state + static_cast(action)) = 1; - - return feats; - } - -private: - int height, width, num_actions; -}; - +// This is the model the agent is going to work with. In particular this class +// represents the a grid with height rows and width cols where of the form +// .......... +// .......... +// IFFFFFFFFG +// where: - F are pits cells (if the agent falls there it fails) +// - I is the initial cell +// - G is the goal cell (the agent goal is to reach that spot) +// - . are free cell where the agent can go. +// the rewards are: -100 for reaching F, 100 for reaching G and -1 for the rest. +// it doesn't allow to go out of bounds, instead the agent will stay in the same cell +// (like if there was a wall there). 
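To make the grid description above concrete before the class itself, here is a minimal sketch (assuming the same row-major state indexing that cliff_model uses below; the helper name is made up for illustration and is not part of the patch) of how a state index maps to a cell and to the reward scheme just described:

    // Sketch only: state s denotes the cell (s / width, s % width), filled row by row.
    inline int example_cell_reward(int s, int height, int width)
    {
        const bool goal = (s == height*width - 1);                     // G, bottom-right corner
        const bool pit  = (s / width == height - 1)                    // bottom row...
                       && (s % width > 0) && (s % width < width - 1);  // ...between I and G
        return goal ? 100 : pit ? -100 : -1;                           // -1 for any other step
    }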
template < int height, int width, @@ -50,9 +29,13 @@ template < class cliff_model { public: + // constants and actions allowed enum class actions {up = 0, right, down, left}; constexpr static double EPS = 1e-16; + constexpr static int HEIGHT = height; + constexpr static int WIDTH = width; + // model types typedef int state_type; typedef actions action_type; typedef int reward_type; @@ -63,7 +46,7 @@ class cliff_model ) : fe(height, width, 4){} action_type random_action( - const state_type& state + const state_type& state // since all movements are always allowed we don't use state ) const { std::uniform_int_distribution dist(0,3); @@ -75,21 +58,23 @@ class cliff_model const matrix& w ) const { + // it looks for the best actions in state according to w auto best = std::numeric_limits::lowest(); auto best_indexes = std::vector(); for(auto i = 0; i < 4; i++){ auto feats = get_features(state, static_cast(i)); auto product = dot(w, feats); + if(product > best){ best = product; best_indexes.clear(); } - if(std::abs(product - best) < EPS) best_indexes.push_back(i); } + // returns a random action between the best ones. std::uniform_int_distribution dist(0, best_indexes.size()-1); return static_cast(best_indexes[dist(gen)]); } @@ -175,41 +160,115 @@ class cliff_model } feature_extractor fe; - mutable std::default_random_engine gen; + mutable std::default_random_engine gen; //mutable because it doesn't changes the model state }; -#include -int main(int argc, char** argv) +// This class is the feature representation of cliff_model states. +// It's just a basic one-shot representation where the feature vector for a point (a,b) doing action c +// is a zero vector of size width*height*num_actions with just a one on (a*width + b)*num_actions + c +template < + typename state_type, + typename action_type + > +class feature_extractor { - std::cout << "Hello." << std::endl; +public: + feature_extractor( + int h, + int w, + int na + ) : height(h), width(w), num_actions(na) {} - const auto height = 3u; - const auto width = 5u; + inline long num_features( + ) const { return num_actions * height * width; } - typedef cliff_model model_type; + matrix get_features( + const state_type &state, + const action_type &action + ) const + { + matrix feats(num_features()); + feats = 0; + //for(auto i = 0u; i < num_actions; i++) + // feats(num_actions * state + i) = 1; + feats(num_actions*state + static_cast(action)) = 1; - model_type model; - qlearning algorithm; - algorithm.be_verbose(); - algorithm.set_max_iterations(100); + return feats; + } - auto policy = algorithm.train(); +private: + int height, width, num_actions; +}; +// Just a helper function to pretty print the state of the agent. +template < + typename model_t + > +void print(std::ostream &os, const model_t &model, const typename model_t::state_type &state) +{ + for(auto i = 0; i < model_t::HEIGHT; i++){ + for(auto j = 0; j < model_t::WIDTH; j++){ + typename model_t::state_type s = model_t::WIDTH * i + j; + os << ( s == state ? 'X' : model.is_success(s) ? 'G' : model.is_failure(s) ? 'F' : '.'); + } + os << std::endl; + } + os << std::endl; +} + +// The function that runs the agent +template < + typename model_t, + typename algorithm_t // qlearning or sarsa + > +void run_example(const model_t &model, algorithm_t &&algorithm) +{ + algorithm.be_verbose(); // uncomment it if you want to see training info. + auto policy = algorithm.train(model); + + std::cout << "Starting final simulation..." 
<< std::endl; auto s = model.initial_state(); - int r = 0; //TODO - for(auto i = 0u; i < 100 && !model.is_final(s); i++){ + auto r = static_cast(0); + int i; + + for(i = 0; i < 100 && !model.is_final(s); i++){ + print(std::cout, model, s); + auto a = policy(s); auto new_s = model.step(s, a); r += model.reward(s,a,new_s); s = new_s; } + print(std::cout, model, s); + std::cout << "Simulation finished." << std::endl; if(!model.is_final(s)) std::cout << "Nothing reached after 100 steps." << std::endl; else if(model.is_failure(s)) - std::cout << "Failed." << std::endl; + std::cout << "Failed after " << i << " steps with reward " << r << "." << std::endl; + else + std::cout << "Success after " << i << " steps with reward " << r << "." << std::endl; +} + +int main(int argc, char** argv) +{ + std::cout << "Hello." << std::endl; + + const auto height = 4u; + const auto width = 5u; + typedef cliff_model model_type; + model_type model; + + char response; + std::cout << "Qlearning or SARSA? (q/s): "; + std::cin >> response; + + if(response == 'q') + run_example(model, qlearning()); + else if(response == 's') + run_example(model, sarsa()); else - std::cout << "Success." << std::endl; + std::cerr << "Invalid option." << std::endl; std::cout << "Good bye." << std::endl; From 8b241b291455568be679580107af068a364c389b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 9 Dec 2017 21:32:51 +0100 Subject: [PATCH 03/14] Added test for the reinforcement learning methods and checked backward compability with lspi. --- dlib/control/approximate_linear_models.h | 2 +- dlib/control/policy.h | 6 + dlib/control/qlearning.h | 11 +- dlib/control/sarsa.h | 4 +- dlib/test/CMakeLists.txt | 2 +- dlib/test/reinforcement_learning.cpp | 264 +++++++++++++++++++++++ examples/qlearning_sarsa_ex.cpp | 37 +++- 7 files changed, 308 insertions(+), 18 deletions(-) create mode 100644 dlib/test/reinforcement_learning.cpp diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index a0d9c01dcb..252b849e49 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -17,7 +17,7 @@ namespace dlib { typedef typename model_type::state_type state_type; typedef typename model_type::action_type action_type; - typedef typename model_type::reward_type reward_type; + typedef double reward_type; process_sample(){} diff --git a/dlib/control/policy.h b/dlib/control/policy.h index 5ad8c58322..f2c8f4855b 100644 --- a/dlib/control/policy.h +++ b/dlib/control/policy.h @@ -35,6 +35,12 @@ namespace dlib const matrix& weights_ ) : w(weights_), model(model_) {} + //backward compability + greedy_policy ( + const matrix& weights_, + const model_type &model_ + ) : w(weights_), model(model_) {} + action_type operator() ( const state_type& state ) const diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 56e944ec74..2f266cfbd8 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -58,8 +58,8 @@ namespace dlib ) const { return iterations; } void set_iterations( - unsigned int iterations - ) { iterations = iterations; } + unsigned int value + ) { iterations = value; } double get_epsilon( ) const { return epsilon; } @@ -109,10 +109,10 @@ namespace dlib ); reward_type total_reward = static_cast(0); - std::cout << "iterations: " << iterations << std::endl; for(auto iter = 0u; iter < iterations; ++iter){ auto state = model.initial_state(); + auto steps = 0u; reward_type reward = static_cast(0); while(!model.is_final(state)){ 
auto action = eps_pol(state); @@ -127,12 +127,15 @@ namespace dlib state = next_state; reward += next_reward; + steps++; } total_reward += reward; if(verbose) std::cout << "iteration: " << iter << "\t reward: " << reward - << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + << "\t mean: " << total_reward/static_cast(iter+1) + << "\t steps: " << steps + << std::endl; } if(verbose) diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index bcb978f7fa..4ff96195b6 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -58,8 +58,8 @@ namespace dlib ) const { return iterations; } void set_iterations( - unsigned int iterations - ) { iterations = iterations; } + unsigned int value + ) { iterations = value; } double get_epsilon( ) const { return epsilon; } diff --git a/dlib/test/CMakeLists.txt b/dlib/test/CMakeLists.txt index 8d05fdce5d..7474d658a8 100644 --- a/dlib/test/CMakeLists.txt +++ b/dlib/test/CMakeLists.txt @@ -118,6 +118,7 @@ set (tests ranking.cpp read_write_mutex.cpp reference_counter.cpp + reinforcement_learning.cpp rls.cpp random_forest.cpp sammon.cpp @@ -160,7 +161,6 @@ set (tests elastic_net.cpp ) - # add all the cpp files we want to compile to this list. This tells # cmake that they are part of our target (which is the executable named dtest) ADD_EXECUTABLE(${target_name} main.cpp tester.cpp ${tests}) diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp new file mode 100644 index 0000000000..d16a37638c --- /dev/null +++ b/dlib/test/reinforcement_learning.cpp @@ -0,0 +1,264 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. + +#include "tester.h" +#include +#include +#include +#include + +namespace +{ + using namespace test; + using namespace dlib; + using namespace std; + dlib::logger dlog("test.rl"); + + template < + int height, + int width, + template class feature_extractor_type + > + class cliff_model + { + public: + // constants and actions allowed + enum class actions {up = 0, right, down, left}; + constexpr static double EPS = 1e-16; + constexpr static int HEIGHT = height; + constexpr static int WIDTH = width; + + // model types + typedef int state_type; + typedef actions action_type; + typedef int reward_type; + + typedef feature_extractor_type feature_extractor; + + explicit cliff_model( + ) : fe(height, width, 4){} + + action_type random_action( + const state_type& state // since all movements are always allowed we don't use state + ) const + { + std::uniform_int_distribution dist(0,3); + return static_cast(dist(gen)); + } + + action_type find_best_action( + const state_type& state, + const matrix& w + ) const + { + // it looks for the best actions in state according to w + auto best = std::numeric_limits::lowest(); + auto best_indexes = std::vector(); + + for(auto i = 0; i < 4; i++){ + auto feats = get_features(state, static_cast(i)); + auto product = dot(w, feats); + + if(product > best){ + best = product; + best_indexes.clear(); + } + if(std::abs(product - best) < EPS) + best_indexes.push_back(i); + } + + // returns a random action between the best ones. 
+ std::uniform_int_distribution dist(0, best_indexes.size()-1); + return static_cast(best_indexes[dist(gen)]); + } + + const feature_extractor& get_feature_extractor( + ) const { return fe; } + + auto states_size( + ) const -> decltype(get_feature_extractor().num_features()) + { + return get_feature_extractor().num_features(); + } + + auto get_features( + const state_type &state, + const action_type &action + ) const -> decltype(get_feature_extractor().get_features(state, action)) + { return get_feature_extractor().get_features(state, action); } + + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const + { + return !is_final(new_state) ? -1 : is_success(new_state) ? 100 : -100; + } + + state_type initial_state( + ) const { return static_cast((height-1) * width); } + + state_type step( + const state_type& state, + const action_type& action + ) const + { + if(out_of_bounds(state, action)) + return state; + + return action == actions::up ? state - width : + action == actions::down ? state + width : + action == actions::right ? state + 1 : + state - 1 ; + } + + bool is_success( + const state_type &state + ) const { return state == height*width - 1; } + + bool is_failure( + const state_type &state + ) const { return state/width == height-1 && state%width > 0 && state%width < width-1;} + + bool is_final( + const state_type& state + ) const { return is_success(state) || is_failure(state); } + + private: + bool out_of_bounds( + const state_type& state, + const action_type& action + ) const + { + bool result; + + switch(action){ + case actions::up: + result = state / width == 0; + break; + case actions::down: + result = (state / width == height-2 && state % width > 0 && state % width < width-1) + || state / width == height-1; + break; + case actions::left: + result = state % width == 0; // || state == height*width-1; <- is the goal condition + break; + case actions::right: + result = state % width == width-1 || state == (height-1)*width; + break; + } + + return result; + } + + feature_extractor fe; + mutable std::default_random_engine gen; //mutable because it doesn't changes the model state + }; + + template < + typename state_type, + typename action_type + > + class feature_extractor + { + public: + feature_extractor( + int h, + int w, + int na + ) : height(h), width(w), num_actions(na) {} + + inline long num_features( + ) const { return num_actions * height * width; } + + matrix get_features( + const state_type &state, + const action_type &action + ) const + { + matrix feats(num_features()); + feats = 0; + //for(auto i = 0u; i < num_actions; i++) + // feats(num_actions * state + i) = 1; + feats(num_actions*state + static_cast(action)) = 1; + + return feats; + } + + private: + int height, width, num_actions; + }; + + template < + int height, + int width, + typename algorithm_t + > + void test(unsigned int iterations) + { + typedef cliff_model model_t; + const int max_steps = 150; + + print_spinner(); + algorithm_t algorithm; + algorithm.set_iterations(iterations); + model_t model; + auto policy = algorithm.train(model); + + auto s = model.initial_state(); + auto r = static_cast(0); + int i; + + for(i = 0; i < max_steps && !model.is_final(s); i++){ + auto a = policy(s); + auto new_s = model.step(s, a); + r += model.reward(s,a,new_s); + s = new_s; + } + + dlog << LINFO << "height, width: " << height << "," << width; + dlog << LINFO << "steps: " << i; + dlog << LINFO << "state: (" << s/width << "," << s%width << ")"; + dlog << LINFO << 
"success: " << (model.is_success(s) ? "true" : "false"); + dlog << LINFO << "failure: " << (model.is_failure(s) ? "true" : "false"); + dlog << LINFO << "reward: " << r; + DLIB_TEST(i != max_steps); + DLIB_TEST(model.is_success(s)); + DLIB_TEST(r > 0); + } + + class rl_tester : public tester + { + public: + rl_tester ( + ) : + tester ( + "test_rl", // the command line argument name for this test + "Run tests on the qlearning and sarsa objects.", // the command line argument description + 0 // the number of command line arguments for this test + ) + { + } + + void perform_test ( + ) + { + // I have to hardcode the number of iterations + // since qlearning is off-policy it can get the wrong answer if it iterates too much + // this could be troublesome if convergence depends too much on randomness + test<4,5,qlearning>(100); + test<5,5,qlearning>(1000); + test<4,7,qlearning>(500); + test<5,10,qlearning>(2000); + + test<4,5,sarsa>(100); + test<5,5,sarsa>(200); + test<4,7,sarsa>(200); + test<5,10,sarsa>(300); + } + }; + + rl_tester a; +} + diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 612eb6c54a..3c18c6db5c 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -204,8 +204,21 @@ class feature_extractor template < typename model_t > -void print(std::ostream &os, const model_t &model, const typename model_t::state_type &state) +void print( + std::ostream &os, + const model_t &model, + const typename model_t::state_type &state, + const matrix &weights, + const typename model_t::action_type &action +) { + std::cout << "weights: "; + for(int i = 0; i < 4; i++) + std::cout << weights(state*4+i) << " "; + std::cout << std::endl; + + std::cout << "action: " << static_cast(action) << "\n"; + for(auto i = 0; i < model_t::HEIGHT; i++){ for(auto j = 0; j < model_t::WIDTH; j++){ typename model_t::state_type s = model_t::WIDTH * i + j; @@ -223,7 +236,7 @@ template < > void run_example(const model_t &model, algorithm_t &&algorithm) { - algorithm.be_verbose(); // uncomment it if you want to see training info. + //algorithm.be_verbose(); // uncomment it if you want to see training info. auto policy = algorithm.train(model); std::cout << "Starting final simulation..." << std::endl; @@ -232,14 +245,14 @@ void run_example(const model_t &model, algorithm_t &&algorithm) int i; for(i = 0; i < 100 && !model.is_final(s); i++){ - print(std::cout, model, s); - auto a = policy(s); auto new_s = model.step(s, a); r += model.reward(s,a,new_s); + + print(std::cout, model, s, policy.get_weights(), a); s = new_s; } - print(std::cout, model, s); + print(std::cout, model, s, policy.get_weights(), static_cast(0)); std::cout << "Simulation finished." << std::endl; if(!model.is_final(s)) @@ -255,7 +268,7 @@ int main(int argc, char** argv) std::cout << "Hello." << std::endl; const auto height = 4u; - const auto width = 5u; + const auto width = 7u; typedef cliff_model model_type; model_type model; @@ -263,10 +276,14 @@ int main(int argc, char** argv) std::cout << "Qlearning or SARSA? (q/s): "; std::cin >> response; - if(response == 'q') - run_example(model, qlearning()); - else if(response == 's') - run_example(model, sarsa()); + if(response == 'q'){ + qlearning algorithm; + algorithm.set_iterations(500); //for this size qlearning doesn't converge with 100 iterations + run_example(model, algorithm); + } + else if(response == 's'){ + run_example(model, sarsa()); //On the other side, sarsa does converge + } else std::cerr << "Invalid option." 
<< std::endl; From d20c510c0fb918606ae57ceeb17da1c787219322 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 11 Dec 2017 23:57:58 +0100 Subject: [PATCH 04/14] Fixed reward bug + model bug + prng support --- dlib/control/policy.h | 8 +++--- dlib/control/qlearning.h | 27 ++++++++++-------- dlib/control/sarsa.h | 33 +++++++++++++--------- dlib/test/reinforcement_learning.cpp | 42 +++++++++++++--------------- examples/qlearning_sarsa_ex.cpp | 26 ++++++++--------- 5 files changed, 71 insertions(+), 65 deletions(-) diff --git a/dlib/control/policy.h b/dlib/control/policy.h index f2c8f4855b..c72ea7c3cf 100644 --- a/dlib/control/policy.h +++ b/dlib/control/policy.h @@ -88,7 +88,7 @@ namespace dlib template < typename policy_type, - typename generator = std::default_random_engine + typename prng_engine = std::default_random_engine > class epsilon_policy { @@ -99,7 +99,7 @@ namespace dlib epsilon_policy ( double epsilon_, const policy_type &policy_, - const generator &gen_ = generator() + const prng_engine &gen_ = prng_engine() ) : policy(policy_), epsilon(epsilon_), gen(gen_) {} action_type operator() ( @@ -125,14 +125,14 @@ namespace dlib double get_epsilon( ) const { return epsilon; } - const generator& get_generator( + const prng_engine& get_generator( ) const { return gen; } private: policy_type policy; double epsilon; - mutable generator gen; + mutable prng_engine gen; }; template < typename policy_type, typename generator > diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 2f266cfbd8..94f6731d75 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -6,6 +6,7 @@ #include "policy.h" #include #include +#include namespace dlib { @@ -86,10 +87,12 @@ namespace dlib ) { verbose = false; } template < - typename policy_type + typename policy_type, + typename prng_engine = std::default_random_engine > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen = prng_engine() ) const { typedef typename std::decay::type::reward_type reward_type; @@ -98,7 +101,7 @@ namespace dlib std::cout << "Starting training..." 
<< std::endl; const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy); + epsilon_policy eps_pol(epsilon, policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), @@ -113,11 +116,11 @@ namespace dlib auto state = model.initial_state(); auto steps = 0u; - reward_type reward = static_cast(0); + reward_type iteration_reward = static_cast(0); while(!model.is_final(state)){ auto action = eps_pol(state); auto next_state = model.step(state, action); - auto next_reward = model.reward(state, action, next_state); + auto reward = model.reward(state, action, next_state); const auto feats = model.get_features(state, action); const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); @@ -126,13 +129,13 @@ namespace dlib w += learning_rate * correction * feats; state = next_state; - reward += next_reward; + iteration_reward += reward; steps++; } - total_reward += reward; + total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << reward + std::cout << "iteration: " << iter << "\t reward: " << iteration_reward << "\t mean: " << total_reward/static_cast(iter+1) << "\t steps: " << steps << std::endl; @@ -145,11 +148,13 @@ namespace dlib } template < - typename model_type + typename model_type, + typename prng_engine = std::default_random_engine > greedy_policy train( - const model_type &model - ) const { return train_policy(greedy_policy(model)); } + const model_type &model, + const prng_engine &gen = prng_engine() + ) const { return train_policy(greedy_policy(model), gen); } private: double learning_rate; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index 4ff96195b6..ba7e75099b 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -86,10 +86,12 @@ namespace dlib ) { verbose = false; } template < - typename policy_type - > + typename policy_type, + typename prng_engine = std::default_random_engine + > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen = prng_engine() ) const { typedef typename std::decay::type::reward_type reward_type; @@ -98,7 +100,7 @@ namespace dlib std::cout << "Starting training..." 
<< std::endl; const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy); + epsilon_policy eps_pol(epsilon, policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), @@ -113,11 +115,12 @@ namespace dlib auto state = model.initial_state(); auto action = eps_pol(state); - reward_type reward = static_cast(0); + auto steps = 0u; + reward_type iteration_reward = static_cast(0); while(!model.is_final(state)){ auto next_state = model.step(state, action); auto next_action = eps_pol(next_state); - auto next_reward = model.reward(state, action, next_state); + auto reward = model.reward(state, action, next_state); const auto feats = model.get_features(state, action); const auto feats_next = model.get_features(next_state, next_action); @@ -127,13 +130,15 @@ namespace dlib state = next_state; action = next_action; - reward += next_reward; + iteration_reward += reward; } - total_reward += reward; + total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << reward - << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + std::cout << "iteration: " << iter << "\t reward: " << iteration_reward + << "\t mean: " << total_reward/static_cast(iter+1) + << "\t steps: " << steps + << std::endl; } if(verbose) @@ -143,11 +148,13 @@ namespace dlib } template < - typename model_type + typename model_type, + typename prng_engine = std::default_random_engine > greedy_policy train( - const model_type &model - ) const { return train_policy(greedy_policy(model)); } + const model_type &model, + const prng_engine &gen = prng_engine() + ) const { return train_policy(greedy_policy(model), gen); } private: double learning_rate; diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index d16a37638c..4d79535440 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -36,7 +36,8 @@ namespace typedef feature_extractor_type feature_extractor; explicit cliff_model( - ) : fe(height, width, 4){} + int seed = 0 + ) : fe(height, width, 4), gen(seed) {} action_type random_action( const state_type& state // since all movements are always allowed we don't use state @@ -138,14 +139,13 @@ namespace result = state / width == 0; break; case actions::down: - result = (state / width == height-2 && state % width > 0 && state % width < width-1) - || state / width == height-1; + result = state / width == height-1; break; case actions::left: - result = state % width == 0; // || state == height*width-1; <- is the goal condition + result = state % width == 0; break; case actions::right: - result = state % width == width-1 || state == (height-1)*width; + result = state % width == width-1; break; } @@ -195,16 +195,17 @@ namespace int width, typename algorithm_t > - void test(unsigned int iterations) + void test() { + constexpr static int seed = 7; + typedef cliff_model model_t; - const int max_steps = 150; + const int max_steps = 100; print_spinner(); algorithm_t algorithm; - algorithm.set_iterations(iterations); - model_t model; - auto policy = algorithm.train(model); + model_t model(seed); + auto policy = algorithm.train(model, std::default_random_engine(seed)); auto s = model.initial_state(); auto r = static_cast(0); @@ -244,18 +245,15 @@ namespace void perform_test ( ) { - // I have to hardcode the number of iterations - // since qlearning is off-policy it can get the wrong answer if it iterates too much - // this could be troublesome if convergence depends 
too much on randomness - test<4,5,qlearning>(100); - test<5,5,qlearning>(1000); - test<4,7,qlearning>(500); - test<5,10,qlearning>(2000); - - test<4,5,sarsa>(100); - test<5,5,sarsa>(200); - test<4,7,sarsa>(200); - test<5,10,sarsa>(300); + test<4,5,qlearning>(); + test<5,5,qlearning>(); + test<4,7,qlearning>(); + test<5,10,qlearning>(); + + test<4,5,sarsa>(); + test<5,5,sarsa>(); + test<4,7,sarsa>(); + test<5,10,sarsa>(); } }; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 3c18c6db5c..826580aaae 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -43,7 +43,8 @@ class cliff_model typedef feature_extractor_type feature_extractor; explicit cliff_model( - ) : fe(height, width, 4){} + int seed = 0 + ) : fe(height, width, 4), gen(seed){} action_type random_action( const state_type& state // since all movements are always allowed we don't use state @@ -145,14 +146,13 @@ class cliff_model result = state / width == 0; break; case actions::down: - result = (state / width == height-2 && state % width > 0 && state % width < width-1) - || state / width == height-1; + result = state / width == height-1; break; case actions::left: - result = state % width == 0; // || state == height*width-1; <- is the goal condition + result = state % width == 0; break; case actions::right: - result = state % width == width-1 || state == (height-1)*width; + result = state % width == width-1; break; } @@ -267,8 +267,8 @@ int main(int argc, char** argv) { std::cout << "Hello." << std::endl; - const auto height = 4u; - const auto width = 7u; + const auto height = 5u; + const auto width = 10u; typedef cliff_model model_type; model_type model; @@ -276,14 +276,10 @@ int main(int argc, char** argv) std::cout << "Qlearning or SARSA? (q/s): "; std::cin >> response; - if(response == 'q'){ - qlearning algorithm; - algorithm.set_iterations(500); //for this size qlearning doesn't converge with 100 iterations - run_example(model, algorithm); - } - else if(response == 's'){ - run_example(model, sarsa()); //On the other side, sarsa does converge - } + if(response == 'q') + run_example(model, qlearning()); + else if(response == 's') + run_example(model, sarsa()); else std::cerr << "Invalid option." << std::endl; From 3e56eb5481a4999802e4f725956049c139108507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Tue, 12 Dec 2017 11:36:39 +0100 Subject: [PATCH 05/14] Commented example + updated abstracts --- dlib/control/policy_abstract.h | 11 +- dlib/control/qlearning_abstract.h | 24 +++- dlib/control/sarsa.h | 6 +- dlib/control/sarsa_abstract.h | 26 ++-- examples/qlearning_sarsa_ex.cpp | 194 +++++++++++++++++++++--------- 5 files changed, 181 insertions(+), 80 deletions(-) diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h index 991bf96296..6d96b3ebdc 100644 --- a/dlib/control/policy_abstract.h +++ b/dlib/control/policy_abstract.h @@ -5,6 +5,7 @@ #include "../matrix.h" #include "model_abstract.h" +#include namespace dlib { @@ -176,7 +177,7 @@ void deserialize(greedy_policy& item, std::istream& in); template < typename policy_type, - typename generator + typename prng_engine = std::default_random_engine() > class epsilon_policy { @@ -185,8 +186,8 @@ class epsilon_policy policy_type should implement the example_policy interface defined at the top of this file. - REQUIREMENTS ON generator - generator should be a PRNG type like the ones defined in std::random. 
+ REQUIREMENTS ON prng_engine + prng_engine should be a PRNG class like the ones defined in std::random. WHAT THIS OBJECT REPRESENTS This is a special policy that returns the best action (according to the @@ -202,7 +203,7 @@ class epsilon_policy epsilon_policy ( double epsilon, const policy_type &policy, - const generator &gen = generator() + const prng_engine &gen = prng_engine() ); /*! requires @@ -259,7 +260,7 @@ class epsilon_policy - returns the epsilon value used by the policy. !*/ - const generator& get_generator( + const prng_engine& get_generator( ) const; /*! ensures diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index ccac305890..182c80ca4f 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -5,6 +5,7 @@ #include "policy_abstract.h" #include "model_abstract.h" +#include namespace dlib { @@ -156,33 +157,44 @@ namespace dlib !*/ template < - typename policy_type + typename policy_type, + typename prng_engine = std::default_random_engine > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen = prng_engine() ) const; /*! requires - policy is of the form example_policy, i.e., an instance of an implementation of the policy interface defined in policy_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - returns a policy of the type policy_type as the result of applying the qlearning learning function over iterations runs over using the weight - matrix of the argument as the initial weights. + matrix of the argument as the initial weights. Besides that, the + exploration is done with an epsilon policy using the given prng. !*/ template < - typename model_type + typename model_type, + typename prng_engine = std::default_random_engine > greedy_policy train( - const model_type &model + const model_type &model, + const prng_engine &gen = prng_engine() ) const; /*! requires - model_type is an implementation of the example_model interface defined at model_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - - returns train_policy(greedy_policy(model)); + - returns train_policy(greedy_policy(model), gen); !*/ }; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index ba7e75099b..772945c048 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -148,9 +148,9 @@ namespace dlib } template < - typename model_type, - typename prng_engine = std::default_random_engine - > + typename model_type, + typename prng_engine = std::default_random_engine + > greedy_policy train( const model_type &model, const prng_engine &gen = prng_engine() diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h index f4d559a8d9..6acd6f06da 100644 --- a/dlib/control/sarsa_abstract.h +++ b/dlib/control/sarsa_abstract.h @@ -5,6 +5,7 @@ #include "policy_abstract.h" #include "model_abstract.h" +#include namespace dlib { @@ -157,33 +158,44 @@ namespace dlib !*/ template < - typename policy_type + typename policy_type, + typename prng_engine = std::default_random_engine > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen ) const; /*! 
requires - policy is of the form example_policy, i.e., an instance of an implementation of the policy interface defined in policy_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - returns a policy of the type policy_type as the result of applying the sarsa learning function over iterations runs over using the weight - matrix of the argument as the initial weights. + matrix of the argument as the initial weights. Besides that, the + exploration is done with an epsilon policy using the given prng. !*/ template < - typename model_type - > + typename model_type, + typename prng_engine = std::default_random_engine + > greedy_policy train( - const model_type &model + const model_type &model, + const prng_engine &gen = prng_engine() ) const; /*! requires - model_type is an implementation of the example_model interface defined at model_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - - returns train_policy(greedy_policy(model)); + - returns train_policy(greedy_policy(model), gen); !*/ }; }; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 826580aaae..431dcff48a 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -1,5 +1,10 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. +// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt +/* + This is an example showing how to use the dlib algorithms Q-learning and SARSA. + These are two simples reinforcement learning algorithms. In short, they take a model + and take steps over and over until they've learnt how to solve the given task properly. +*/ + #include #include #include @@ -8,19 +13,35 @@ #include using namespace dlib; - -// This is the model the agent is going to work with. In particular this class -// represents the a grid with height rows and width cols where of the form -// .......... -// .......... -// IFFFFFFFFG -// where: - F are pits cells (if the agent falls there it fails) -// - I is the initial cell -// - G is the goal cell (the agent goal is to reach that spot) -// - . are free cell where the agent can go. -// the rewards are: -100 for reaching F, 100 for reaching G and -1 for the rest. -// it doesn't allow to go out of bounds, instead the agent will stay in the same cell -// (like if there was a wall there). +using namespace std; + +/* + Both of these algorithms work by a reward system. That means that they assign to each + pair (state, action) an expected reward (Qvalue) and they update those values iteratively + taking steps on a model/simulation and observing the reward they obtain. Like so, they + need a model class that allow them to work in a interactive way. + + The algorithms/agents objective is to maximize the expected reward by taking the proper + steps. +*/ + +/* + This is the model the agent is going to work with in the example. In particular, + this class represents a grid with a given height and width of the form + .......... + .......... + IFFFFFFFFG + where: - F are pit cells (if the agent falls there it fails and simulation ends). + - I is the starting position. + - G is the goal cell (the agent goal is to reach that cell). + - . 
are free cells where the agent can go. + + The agent receives the following reward: -100 for reaching F, 100 for reaching G and a + reward of -1 otherwise. + + This model doesn't allow the agent to go out of bounds, instead it will stay in the same cell + he was before the action (like if there was a wall there) but receiving a reward of -1. +*/ template < int height, int width, @@ -29,41 +50,54 @@ template < class cliff_model { public: - // constants and actions allowed + // actions allowed in the model enum class actions {up = 0, right, down, left}; + constexpr static int num_actions = 4; + + // some constants that we need constexpr static double EPS = 1e-16; constexpr static int HEIGHT = height; constexpr static int WIDTH = width; - // model types + // we define the model's types typedef int state_type; typedef actions action_type; typedef int reward_type; + // this ensures that the feature extractor uses the same underlying types as our model typedef feature_extractor_type feature_extractor; + + // Constructor explicit cliff_model( int seed = 0 - ) : fe(height, width, 4), gen(seed){} + ) : fe(height, width, num_actions), gen(seed){} + + // Functions that will use the agent + + // It returns a random action. It's possible that the allowed actions differ from among states. + // In this case all movements are always allowed so we don't need to use state. action_type random_action( - const state_type& state // since all movements are always allowed we don't use state + const state_type& state ) const { - std::uniform_int_distribution dist(0,3); + uniform_int_distribution dist(0,num_actions-1); return static_cast(dist(gen)); } + // Returns the best action that maximizes the expected reward, that is, + // the action that maximizes dot_product(w, get_features(state, action)) + // w will be the weights assign by the agent to each feature action_type find_best_action( const state_type& state, const matrix& w ) const { - // it looks for the best actions in state according to w - auto best = std::numeric_limits::lowest(); + auto best = numeric_limits::lowest(); auto best_indexes = std::vector(); - for(auto i = 0; i < 4; i++){ + for(auto i = 0; i < num_actions; i++){ auto feats = get_features(state, static_cast(i)); auto product = dot(w, feats); @@ -71,15 +105,18 @@ class cliff_model best = product; best_indexes.clear(); } - if(std::abs(product - best) < EPS) + if(abs(product - best) < EPS) best_indexes.push_back(i); } // returns a random action between the best ones. 
- std::uniform_int_distribution dist(0, best_indexes.size()-1); + uniform_int_distribution dist(0, best_indexes.size()-1); return static_cast(best_indexes[dist(gen)]); } + + // This functions are delegated to the feature extractor + const feature_extractor& get_feature_extractor( ) const { return fe; } @@ -95,6 +132,8 @@ class cliff_model ) const -> decltype(get_feature_extractor().get_features(state, action)) { return get_feature_extractor().get_features(state, action); } + + // This functions gives the rewards, that is, tells the agent how good are its movements reward_type reward( const state_type &state, const action_type &action, @@ -107,6 +146,7 @@ class cliff_model state_type initial_state( ) const { return static_cast((height-1) * width); } + // This is an important function, basically it allows the agent to move in the model's world state_type step( const state_type& state, const action_type& action @@ -121,6 +161,8 @@ class cliff_model state - 1 ; } + // this functions allow the agent to know in which state of the simulation he is in + bool is_success( const state_type &state ) const { return state == height*width - 1; } @@ -160,12 +202,21 @@ class cliff_model } feature_extractor fe; - mutable std::default_random_engine gen; //mutable because it doesn't changes the model state + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; -// This class is the feature representation of cliff_model states. -// It's just a basic one-shot representation where the feature vector for a point (a,b) doing action c -// is a zero vector of size width*height*num_actions with just a one on (a*width + b)*num_actions + c +/* + Usually when we use these types of agents the state space of the model is huge. That could make + the Qfunction to be unmanageable and so we need to use what is known as function approximation. + + Basically it represents the states by a given features instead of the states themselves. That way + what usually was just a single value Q(state, action) now is codified as the linear combination of + learnt weights and the features, that is, Q(state, action) = dot_product(weights, features(state, action)). + + Our example is a toy example and so we don't need to use it. However, to show how it works I use a simple + one-shot representation of the states. That means that I have a vector of features where the feature in the + ith position is one if we provide a specific (state, action) and 0 otherwise. +*/ template < typename state_type, typename action_type @@ -179,6 +230,7 @@ class feature_extractor int na ) : height(h), width(w), num_actions(na) {} + //the size of the vector inline long num_features( ) const { return num_actions * height * width; } @@ -189,9 +241,7 @@ class feature_extractor { matrix feats(num_features()); feats = 0; - //for(auto i = 0u; i < num_actions; i++) - // feats(num_actions * state + i) = 1; - feats(num_actions*state + static_cast(action)) = 1; + feats(num_actions*state + static_cast(action)) = 1; //only this one is 1 return feats; } @@ -200,46 +250,72 @@ class feature_extractor int height, width, num_actions; }; -// Just a helper function to pretty print the state of the agent. +// This is just a helper function to pretty-print the agent's state. 
template < - typename model_t - > + typename model_t + > void print( - std::ostream &os, + ostream &os, const model_t &model, const typename model_t::state_type &state, const matrix &weights, const typename model_t::action_type &action ) { - std::cout << "weights: "; + cout << "weights: "; for(int i = 0; i < 4; i++) - std::cout << weights(state*4+i) << " "; - std::cout << std::endl; + cout << weights(state*4+i) << " "; + cout << endl; - std::cout << "action: " << static_cast(action) << "\n"; + cout << "action: " << static_cast(action) << "\n"; for(auto i = 0; i < model_t::HEIGHT; i++){ for(auto j = 0; j < model_t::WIDTH; j++){ typename model_t::state_type s = model_t::WIDTH * i + j; os << ( s == state ? 'X' : model.is_success(s) ? 'G' : model.is_failure(s) ? 'F' : '.'); } - os << std::endl; + os << endl; } - os << std::endl; + os << endl; } -// The function that runs the agent +/* + This is the function that runs the agent. The code to run both agents are identical so I + chose to use a templated function. + + The difference between executions comes in the way they train. Namely, the way they updated the Qvalue. + Let's suppose that we are in the pair (s, a) and we are going to be in (s', a') in the next step. + + Q-learning is an off-policy algorithm meaning that doesn't consider its trully next move but the best one, + that is, doesn't consider a'. Its update function is like this: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_c Q(s', c)) + That formula means that it takes a convex combination of the current qvalue and the expected qvalue, but + for doing so it considers the action c that maximizes Q(s', c) instead of the one he will take. + + On the other hand SARSA does exactly the same as Q-learning but it considers the action that he will do + in the next step instead of the optimal. So it's an on-policy algorithm. Its update formula is: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a')) + + This seems as a meaningless change, but what produces is that when training SARSA tends to be more conservative + in its movement while Q-learning tries to optimizes no matter what. In cases when you have to avoid failling + (usually a real world example) SARSA is a better option. + + In our example this difference is appreciated in the way they learn. Q-learning will try to go close to the pit + cells all the time (falling a lot in the training process) and SARSA will go one or two cells off the cliff. + + Usually, one decreases the learning ratio as the iterations go on and so SARSA would converge to the same solution + as Q-learning. This is not implemented yet and so the learning rate is constant always. +*/ template < typename model_t, - typename algorithm_t // qlearning or sarsa + typename algorithm_t // this can be qlearning or sarsa > void run_example(const model_t &model, algorithm_t &&algorithm) { - //algorithm.be_verbose(); // uncomment it if you want to see training info. + //algorithm.be_verbose(); // uncomment it if you want to see some training info. auto policy = algorithm.train(model); - std::cout << "Starting final simulation..." << std::endl; + cout << "Starting final simulation..." 
<< endl; auto s = model.initial_state(); auto r = static_cast(0); int i; @@ -249,41 +325,41 @@ void run_example(const model_t &model, algorithm_t &&algorithm) auto new_s = model.step(s, a); r += model.reward(s,a,new_s); - print(std::cout, model, s, policy.get_weights(), a); + print(cout, model, s, policy.get_weights(), a); s = new_s; } - print(std::cout, model, s, policy.get_weights(), static_cast(0)); - std::cout << "Simulation finished." << std::endl; + print(cout, model, s, policy.get_weights(), static_cast(0)); + cout << "Simulation finished." << endl; if(!model.is_final(s)) - std::cout << "Nothing reached after 100 steps." << std::endl; + cout << "Nothing reached after 100 steps." << endl; else if(model.is_failure(s)) - std::cout << "Failed after " << i << " steps with reward " << r << "." << std::endl; + cout << "Failed after " << i << " steps with reward " << r << "." << endl; else - std::cout << "Success after " << i << " steps with reward " << r << "." << std::endl; + cout << "Success after " << i << " steps with reward " << r << "." << endl; } int main(int argc, char** argv) { - std::cout << "Hello." << std::endl; + cout << "Hello." << endl; - const auto height = 5u; - const auto width = 10u; + const auto height = 3u; + const auto width = 7u; typedef cliff_model model_type; model_type model; char response; - std::cout << "Qlearning or SARSA? (q/s): "; - std::cin >> response; + cout << "Qlearning or SARSA? (q/s): "; + cin >> response; if(response == 'q') run_example(model, qlearning()); else if(response == 's') run_example(model, sarsa()); else - std::cerr << "Invalid option." << std::endl; + cerr << "Invalid option." << endl; - std::cout << "Good bye." << std::endl; + cout << "Good bye." << endl; return 0; } From 160db7d83a466416540e45ec59143559f2cd0720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 17 Feb 2018 22:35:58 +0100 Subject: [PATCH 06/14] Applied review notes - Improved English notes - Merged feature_extractor and model (now the user can choose which one to implement, offline or online model) - Improved training function header in qlearning and sarsa --- dlib/control.h | 1 - dlib/control/approximate_linear_models.h | 153 ++++++- .../approximate_linear_models_abstract.h | 402 ++++++++++++++++-- dlib/control/lspi_abstract.h | 2 +- dlib/control/model_abstract.h | 169 -------- dlib/control/policy.h | 172 -------- dlib/control/policy_abstract.h | 284 ------------- dlib/control/qlearning.h | 63 ++- dlib/control/qlearning_abstract.h | 69 +-- dlib/control/sarsa.h | 63 ++- dlib/control/sarsa_abstract.h | 61 +-- dlib/test/reinforcement_learning.cpp | 124 +++--- examples/qlearning_sarsa_ex.cpp | 161 +++---- 13 files changed, 718 insertions(+), 1006 deletions(-) delete mode 100644 dlib/control/model_abstract.h delete mode 100644 dlib/control/policy.h delete mode 100644 dlib/control/policy_abstract.h diff --git a/dlib/control.h b/dlib/control.h index 9c0ec80781..4e9c02878e 100644 --- a/dlib/control.h +++ b/dlib/control.h @@ -3,7 +3,6 @@ #ifndef DLIB_CONTROL_ #define DLIB_CONTROL_ -#include "control/policy.h" #include "control/lspi.h" #include "control/mpc.h" #include "control/qlearning.h" diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 252b849e49..5fe025427c 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -4,6 +4,8 @@ #define DLIB_APPROXIMATE_LINEAR_MODELS_Hh_ #include "approximate_linear_models_abstract.h" +#include +#include namespace dlib { 
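The "merged feature_extractor and model" mentioned in this commit message means the model itself now exposes the feature map, so the trainers only ever talk to the model. As a rough sketch of the online variant, inferred from the calls that qlearning, sarsa and the policy classes make in this series (the exact names and signatures here are an assumption, not the final abstract):

    // Sketch of the members an online model is expected to provide, inferred from usage.
    struct sketch_online_model
    {
        typedef int    state_type;    // whatever identifies a state
        typedef int    action_type;   // whatever identifies an action
        typedef double reward_type;

        // linear function approximation part: Q(s,a) == dot(w, get_features(s,a))
        long num_features() const;
        dlib::matrix<double,0,1> get_features(const state_type&, const action_type&) const;
        action_type find_best_action(const state_type&, const dlib::matrix<double,0,1>&) const;
        action_type random_action(const state_type&) const;   // used by epsilon_policy

        // simulation part, used while training online
        state_type  initial_state() const;
        state_type  step(const state_type&, const action_type&) const;
        reward_type reward(const state_type&, const action_type&, const state_type&) const;
        bool        is_final(const state_type&) const;
    };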
@@ -17,7 +19,6 @@ namespace dlib { typedef typename model_type::state_type state_type; typedef typename model_type::action_type action_type; - typedef double reward_type; process_sample(){} @@ -25,13 +26,13 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const reward_type& r + const double& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - reward_type reward; + double reward; }; template < typename feature_extractor > @@ -52,6 +53,152 @@ namespace dlib deserialize(item.reward, in); } +// ---------------------------------------------------------------------------------------- + + template < + typename model_type + > + class policy + { + public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + policy ( + const model_type& model_ = model_type() + ) : model(model_) + { + weights.set_size(model.num_features()); + weights = 0; + } + + policy ( + const matrix& weights_, + const model_type &model_ + ) : weights(weights_), model(model_) {} + + action_type operator() ( + const state_type& state + ) const + { + return model.find_best_action(state,weights); + } + + const model_type& get_model ( + ) const { return model; } + + const matrix& get_weights ( + ) const { return weights; } + + matrix& get_weights ( + ) { return weights; } + + private: + matrix weights; + const model_type model; + }; + + template < typename model_type > + inline void serialize(const policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_model(), out); + serialize(item.get_weights(), out); + } + template < typename model_type > + inline void deserialize(policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::policy object."); + model_type model; + matrix w; + deserialize(model, in); + deserialize(w, in); + item = policy(w,model); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename policy_type, + typename prng_engine = std::default_random_engine + > + class epsilon_policy + { + public: + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; + + epsilon_policy ( + double epsilon_, + policy_type &policy_, + const prng_engine &gen_ = prng_engine() + ) : underlying_policy(policy_), epsilon(epsilon_), gen(gen_) {} + + action_type operator() ( + const state_type& state + ) const + { + std::bernoulli_distribution d(epsilon); + return d(gen) ? 
get_model().random_action(state) : underlying_policy(state); + } + + const policy_type& get_policy( + ) const { return underlying_policy; } + + auto get_model ( + ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + + matrix& get_weights ( + ) { return underlying_policy.get_weights(); } + + const matrix& get_weights ( + ) const { return underlying_policy.get_weights(); } + + double get_epsilon( + ) const { return epsilon; } + + const prng_engine& get_generator( + ) const { return gen; } + + private: + policy_type& underlying_policy; + double epsilon; + + mutable prng_engine gen; + }; + + template < typename policy_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_policy(), out); + serialize(item.get_epsilon(), out); + serialize(item.get_generator(), out); + } + + template < typename policy_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::policy object."); + + policy_type policy; + double epsilon; + generator gen; + deserialize(policy, in); + deserialize(epsilon, in); + deserialize(gen, in); + item = epsilon_policy(epsilon, policy, gen); + } + // ---------------------------------------------------------------------------------------- } diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index 74f99da4ab..0f14432f92 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -3,35 +3,30 @@ #undef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ #ifdef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ -#include "model_abstract.h" +#include <../matrix_abstract.h> +#include namespace dlib { // ---------------------------------------------------------------------------------------- - template < - typename T, - typename U - > - struct example_feature_extractor - { + struct example_offline_model { /*! WHAT THIS OBJECT REPRESENTS - This object defines the interface a feature extractor must implement if it - is to be used with the process_sample and policy objects defined at - policy_abstract.h. Moreover, it is meant to represent the core part - of a model used in a reinforcement learning algorithm. - - In particular, this object models a Q(state,action) function where - Q(state,action) == dot(w, PSI(state,action)) - where PSI(state,action) is a feature vector and w is a parameter - vector. + This object defines the inferface that any model has to implement if it + is to be used in an offline fashion along with some method like the lspi + method defined in the file lspi_abstract.h. Being offline only means that + it already holds the data and will not interact with the environment to get + them. - Therefore, a feature extractor defines how the PSI(x,y) feature vector is - calculated. It also defines the types used to represent the state and - action objects. + In particular, this object models a Q(state, action) function where + Q(state, action) == dot(w, PSI(state, action)) + where PSI(state, action) is a feature vector and w is a parameter vector. + Therefore, an offline model defines how the PSI(x,y) feature vector is + calculated. It also defines the types used to represent the state and + action objects. 
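+
+            For illustration only (a sketch, not dlib API; my_offline_model is a hypothetical
+            type implementing this interface), evaluating Q for a given state s and action a
+            looks like:
+                my_offline_model model;
+                matrix<double,0,1> w(model.num_features());
+                w = 0;
+                matrix<double,0,1> psi;
+                model.get_features(s, a, psi);              // psi == PSI(s,a)
+                double q = dot(w, psi);                     // Q(s,a) == dot(w, PSI(s,a))
+                auto best_a = model.find_best_action(s, w); // action maximizing Q(s,*) under w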
THREAD SAFETY Instances of this object are required to be threadsafe, that is, it should @@ -39,13 +34,14 @@ namespace dlib functions of this object. !*/ - typedef T state_type; - typedef U action_type; - // We can also say that the last element in the weight vector w must be 1. This + // The states and actions can be any type as long as you provide typedefs for them. + typedef U state_type; + typedef V action_type; + // We can also say that the last element in the weights vector w must be 1. This // can be useful for including a prior into your model. const static bool force_last_weight_to_1 = false; - example_feature_extractor( + example_offline_model( ); /*! ensures @@ -56,20 +52,167 @@ namespace dlib ) const; /*! ensures - - returns the dimensionality of the PSI() feature vector. + - returns the dimensionality of the PSI() feature vector. + !*/ + + action_type find_best_action ( + const state_type& state, + const matrix& w + ) const; + /*! + ensures + - returns the action A that maximizes Q(state, A) = dot(w,PSI(state,A)). + That is, this function finds the best action to take in the given state + when our model is parameterized by the given weight vector w. !*/ - matrix get_features ( - const state_type &state, - const action_type &action + void get_features( + const state_type& state, + const action_type& action, + matrix& feats + ) const; + /*! + ensures + - #feats.size() == num_features() + - #feats == PSI(state, action) + */! + }; + +// ---------------------------------------------------------------------------------------- + + struct example_online_model + { + /*! + WHAT THIS OBJECT REPRESENTS + This object defines the inferface that any model has to implement if it + is to be used in an online fashion along with some method like the qlearning + method defined in the file qlearning_abstract.h. + + Being online means that the model doesn't hold prior data but it interacts + with the environment and performing actions from some given state turning + that state into a new one as well as getting some reward for doing so. + + In particular, this object models a Q(state, action) function where + Q(state, action) == dot(w, PSI(state, action)) + where PSI(state, action) is a feature vector and w is a parameter vector. + + Therefore, an online model defines how the PSI(x,y) feature vector is + calculated, the types used to represent the state, action and reward + objects as well as how to interact with the environment. + + THREAD SAFETY + Instances of this object are required to be threadsafe, that is, it should + be safe for multiple threads to make concurrent calls to the member + functions of this object. + !*/ + + // The states and actions can be any type as long as you provide typedefs for them. + typedef U state_type; + typedef V action_type; + + example_online_model( + ); + /*! + ensures + - this object is properly initialized. + !*/ + + unsigned long num_features( + ) const; + /*! + ensures + - returns the dimensionality of the PSI vector. + !*/ + + action_type find_best_action( + const state_type& state, + const matrix& w + ) const; + /*! + ensures + - returns the action A that maximizes Q(state, A) = dot(w,PSI(state,A)). + That is, this function finds the best action to take in the given state + when our model is parameterized by the given weight vector. + !*/ + + void get_features( + const state_type& state, + const action_type& action, + matrix& feats + ) const; + /*! 
+ ensures + - #feats.size() == num_features() + - #feats == PSI(state, action) + !*/ + + action_type random_action( + const state_type& state + ) const; + /*! + ensures + - returns a random plausible action assuming we are in the given state. + !*/ + + double reward( + const state_type& state, + const action_type& action, + const state_type& new_state + ) const; + /*! + requires + - action is a pausible action from state. + - new_state is a possible outcome when performing action on state. + ensures + - returns the reward obtained by reaching new_state from state + doing action. + !*/ + + state_type initial_state( + ) const; + /*! + ensures + - returns the initial state of the model. + !*/ + + state_type step( + const state_type& state, + const action_type& action ) const; /*! requires - - action is a valid option from state. + - action is a plausible action when we are in state. + ensures + - returns a new state result of being on the given state and doing the given + action. + !*/ + + bool is_success( + const state_type& state + ) const; + /*! + ensures + - returns whether state is a goal state (the agent has finished properly). + !*/ + + bool is_failure( + const state_type& state + ) const; + /*! + ensures + - returns whether state is a failure state, i.e., a state where the agent has + failed its task. + !*/ + + bool is_final( + const state_type& state + ) const; + /*! ensures - - returns PSI(state,action) + - #is_final(state) == is_success(state) || is_failure(state) !*/ + }; // ---------------------------------------------------------------------------------------- @@ -81,7 +224,7 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. + model_type should implement one of the interfaces defined above this file. WHAT THIS OBJECT REPRESENTS This object holds a training sample for a reinforcement learning algorithm. @@ -92,7 +235,6 @@ namespace dlib typedef typename model_type::state_type state_type; typedef typename model_type::action_type action_type; - typedef typename model_type::reward_type reward_type; process_sample(){} @@ -100,19 +242,205 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const reward_type& r + const double& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - reward_type reward; + double reward; + }; + + template < typename model_type > + void serialize (const process_sample& item, std::ostream& out); + template < typename model_type > + void deserialize (process_sample& item, std::istream& in); + /*! + provides serialization support. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename model_type + > + class policy + { + /*! + REQUIREMENTS ON model_type + model_type should implement one of the interfaces defined above this file. + + WHAT THIS OBJECT REPRESENTS + This class represents a greedy policy, that is, it is a policy that given a + state returns the best possible action based on its weight matrix. + !*/ + + public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + policy ( + const model_type& model = model_type() + ); + /*! + ensures + - #get_model() == model + - #get_weights().size() == #get_model().num_features() + - #get_weights() == 0 + !*/ + + policy ( + const matrix& weights, + const model_type& model + ); + /*! 
+ requires + - model.num_features() == weights.size() + ensures + - #get_model() == model + - #get_weights() == weights + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_model().find_best_action(state, this->weights); + !*/ + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the weights that the policy is using. + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the weights that the policy is using. + !*/ + }; + + template < typename model_type > + void serialize(const policy& item, std::ostream& out); + template < typename model_type > + void deserialize(policy& item, std::istream& in); + /*! + provides serialization support. + !*/ + + // ---------------------------------------------------------------------------------------- + + template < + typename policy_type, + typename prng_engine = std::default_random_engine() + > + class epsilon_policy + { + /*! + REQUIREMENTS ON policy_type + policy_type is an object with the same interface as the policy class defined + above. + + REQUIREMENTS ON prng_engine + prng_engine should be a PRNG interface like the ones defined in std::random. + + WHAT THIS OBJECT REPRESENTS + This is a special policy that returns the best action (according to the + underlying policy) for the given state with probability 1-epsilon + while it returns a valid random action with probability epsilon. + + It is mainly used to add some exploration in the training process of the + online reinforcement learning methods such as qlearning and sarsa. + !*/ + + public: + + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; + + epsilon_policy ( + double epsilon, + const policy_type& policy, + const prng_engine& gen = prng_engine() + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_epsilon() == epsilon + - #get_policy() == policy + - #get_generator() == gen + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_policy()(state, w) with probability 1-epsilon + and get_model().random_action(state) with probability epsilon. + !*/ + + const policy_type& get_policy( + ) const; + /*! + ensures + - returns the underlying policy used by the object. + !*/ + + model_type get_model ( + ) const; + /*! + ensures + - returns the model used by the underlying policy. + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the weights that the policy is using. + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the weights that the policy is using. + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the epsilon value used by the policy. + !*/ + + const prng_engine& get_generator( + ) const; + /*! + ensures + - returns the generator used by the policy. + !*/ + }; - template < typename feature_extractor > - void serialize (const process_sample& item, std::ostream& out); - template < typename feature_extractor > - void deserialize (process_sample& item, std::istream& in); + template < typename policy_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out); + template < typename policy_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in); /*! provides serialization support. 
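+
+        For example, either of the policy types above can be written to disk and read back
+        like this (sketch only; my_policy and the file name are hypothetical):
+            std::ofstream fout("my_policy.dat", std::ios::binary);
+            serialize(my_policy, fout);
+            fout.close();
+            std::ifstream fin("my_policy.dat", std::ios::binary);
+            deserialize(my_policy, fin);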
!*/ diff --git a/dlib/control/lspi_abstract.h b/dlib/control/lspi_abstract.h index f262d16f48..649ca54972 100644 --- a/dlib/control/lspi_abstract.h +++ b/dlib/control/lspi_abstract.h @@ -17,7 +17,7 @@ namespace dlib { /*! REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface + feature_extractor should implement the example_offline_model interface defined at the top of dlib/control/approximate_linear_models_abstract.h WHAT THIS OBJECT REPRESENTS diff --git a/dlib/control/model_abstract.h b/dlib/control/model_abstract.h deleted file mode 100644 index 8abb0f326e..0000000000 --- a/dlib/control/model_abstract.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_MODEL_ABSTRACT_Hh_ -#ifdef DLIB_MODEL_ABSTRACT_Hh_ - -#include "approximate_linear_models_abstract.h" -#include "../matrix.h" - -namespace dlib -{ - - template < - template class feature_extractor_type - > - class example_model - { - /*! - REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface defined - at approximate_linear_models_abstract.h. - - WHAT THIS OBJECT REPRESENTS - This is an example interface of a model class. This class represents an environment - where an agent will be deployed at. In other words, it is an interface between the - simulated/real world and the agent that has to be there. In short this class: - - Holds information about the state, action and reward space. - - Delegates the state representation to the feature_extractor. - - Provides an initial state to start the agent. - - Offers an interface to move in the world (look for actions, make steps in it - and get a feedback/reward for them). - !*/ - public: - - // You have to define state, action and reward types. - typedef U state_type; - typedef V action_type; - typedef W reward_type; - - // The feature extractor uses the same types as the model. - typedef feature_extractor_type feature_extractor; - - example_model( - ); - /*! - ensures - - #get_feature_extractor() == feature_extractor() - !*/ - - action_type random_action( - const state_type &state - ) const; - /*! - ensures - - returns a random reachable action from state. - !*/ - - action_type find_best_action( - const state_type &state, - const matrix &w - ) const; - /*! - requires - - w.size() == states_size() - ensures - - returns the action that maximizes the product - dot(w, get_feature_extractor().get_features(state)). - !*/ - - const feature_extractor& get_feature_extractor( - ) const; - /*! - ensures - - returns the feature_extractor used by the model. - !*/ - - auto states_size( - ) const -> decltype(get_feature_extractor().num_features()); - /*! - ensures - - returns get_feature_extractor().num_features(). - !*/ - - auto get_features( - const state_type &state, - const action_type &action - ) const -> decltype(get_feature_extractor().get_features(state, action)); - /*! - ensures - - returns get_feature_extractor().get_features(state, action); - !*/ - - // The new_state parameter is needed because the model doesn't have to be deterministic. - // Nonetheless for now we will suppose that the rewards are deterministic. - reward_type reward( - const state_type &state, - const action_type &action, - const state_type &new_state - ) const; - /*! - requires - - action is available in state. - - new_state is a possible outcome when you do action on state. 
- ensures - - returns the reward obtained by going to new_state from state - doing action. - - the function is deterministic with respect to its arguments. - !*/ - - state_type initial_state( - ) const; - /*! - ensures - - returns the initial state of the model. - !*/ - - state_type step( - const state_type &state, - const action_type &action - ) const; - /*! - requires - - action is a valid action from state. - ensures - - returns a state that is possible to be in after doing action - from state. - !*/ - - bool is_success( - const state_type &state - ) const; - /*! - ensures - - returns whether state is a goal state (the agent has done its task properly). - !*/ - - bool is_failure( - const state_type &state - ) const; - /*! - ensures - - returns whether state is a failure state, i.e., a state where the agent has - failed his task. - !*/ - - bool is_final( - const state_type& state - ) const; - /*! - ensures - - returns whether state is a final state, i.e., it is a state where the agent can't - advance anymore. In another words, whether state is a success or failure state. - !*/ - - - }; - - template < template class feature_extractor > - void serialize (const example_model& item, std::ostream& out); - template < template class feature_extractor > - void deserialize (example_model& item, std::istream& in); - /*! - provides serialization support. - !*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif diff --git a/dlib/control/policy.h b/dlib/control/policy.h deleted file mode 100644 index c72ea7c3cf..0000000000 --- a/dlib/control/policy.h +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. 
-#ifndef DLIB_POLICY_Hh_ -#define DLIB_POLICY_Hh_ - -#include "../matrix.h" -#include "policy_abstract.h" -#include -#include - -namespace dlib -{ - - template < - typename model_type - > - class greedy_policy - { - public: - - typedef model_type feature_extractor_type; - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - greedy_policy ( - const model_type &model_ - ) : model(model_) - { - w.set_size(model.states_size()); - w = 0; - } - - greedy_policy ( - const model_type &model_, - const matrix& weights_ - ) : w(weights_), model(model_) {} - - //backward compability - greedy_policy ( - const matrix& weights_, - const model_type &model_ - ) : w(weights_), model(model_) {} - - action_type operator() ( - const state_type& state - ) const - { - return model.find_best_action(state,w); - } - - const model_type& get_model ( - ) const { return model; } - - matrix& get_weights ( - ) { return w; } - - const matrix& get_weights ( - ) const { return w; } - - private: - matrix w; - const model_type &model; - }; - - template < typename model_type > - inline void serialize(const greedy_policy& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.get_model(), out); - serialize(item.get_weights(), out); - } - template < typename model_type > - inline void deserialize(greedy_policy& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); - model_type model; - matrix w; - deserialize(model, in); - deserialize(w, in); - item = greedy_policy(w,model); - } - -// ---------------------------------------------------------------------------------------- - - template < - typename policy_type, - typename prng_engine = std::default_random_engine - > - class epsilon_policy - { - public: - typedef typename policy_type::state_type state_type; - typedef typename policy_type::action_type action_type; - - epsilon_policy ( - double epsilon_, - const policy_type &policy_, - const prng_engine &gen_ = prng_engine() - ) : policy(policy_), epsilon(epsilon_), gen(gen_) {} - - action_type operator() ( - const state_type& state - ) const - { - std::bernoulli_distribution d(epsilon); - return d(gen) ? 
get_model().random_action(state) : policy(state); - } - - policy_type get_policy( - ) const { return policy; } - - auto get_model ( - ) const -> decltype(get_policy().get_model()) { return policy.get_model(); } - - matrix& get_weights ( - ) { return policy.get_weights(); } - - const matrix& get_weights ( - ) const { return policy.get_weights(); } - - double get_epsilon( - ) const { return epsilon; } - - const prng_engine& get_generator( - ) const { return gen; } - - private: - policy_type policy; - double epsilon; - - mutable prng_engine gen; - }; - - template < typename policy_type, typename generator > - inline void serialize(const epsilon_policy& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.get_policy(), out); - serialize(item.get_epsilon(), out); - serialize(item.get_generator(), out); - } - - template < typename policy_type, typename generator > - inline void deserialize(epsilon_policy& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); - - policy_type policy; - double epsilon; - generator gen; - deserialize(policy, in); - deserialize(epsilon, in); - deserialize(gen, in); - item = epsilon_policy(epsilon, policy, gen); - } - -// ---------------------------------------------------------------------------------------- - - // For backward compability with lspi - template < typename model_type > - using policy = greedy_policy; //template aliasing is possible post C++11 -} - -#endif // DLIB_POLICY_Hh_ diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h deleted file mode 100644 index 6d96b3ebdc..0000000000 --- a/dlib/control/policy_abstract.h +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_POLICY_ABSTRACT_Hh_ -#ifdef DLIB_POLICY_ABSTRACT_Hh_ - -#include "../matrix.h" -#include "model_abstract.h" -#include - -namespace dlib -{ - -template < - typename model_type - > -class example_policy -{ - /*! - REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. - - WHAT THIS OBJECT REPRESENTS - This is a policy based on the supplied model_type model. In - particular, it maps from model_type::state_type to a model_type::action - to take in that state. - !*/ - -public: - - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - example_policy ( - const model_type &model - ); - /*! - ensures - - #get_model() == model - - #get_weights().size() == #get_model().states_size() - - #get_weights() == 0 - !*/ - - example_policy ( - const model_type& model, - const matrix& weights - ); - /*! - requires - - model.states_size() == weights.size() - ensures - - #get_model() == model - - #get_weights() == weights - !*/ - - action_type operator() ( - const state_type& state - ) const; - - const model_type& get_model ( - ) const; - /*! - ensures - - returns the model used by this object - !*/ - - matrix& get_weights ( - ); - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). 
- !*/ - -}; - -template < typename model_type > -void serialize(const example_policy& item, std::ostream& out); -template < typename model_type > -void deserialize(example_policy& item, std::istream& in); -/*! - provides serialization support. -!*/ - -// ---------------------------------------------------------------------------------------- - -template < - typename model_type - > -class greedy_policy -{ - /*! - REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. - - WHAT THIS OBJECT REPRESENTS - This is an implementation of the policy interface that returns the best action - based on the weights (i.e. it acts in a greedy fashion). - !*/ - -public: - - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - greedy_policy ( - const model_type &model - ); - /*! - ensures - - #get_model() == model - - #get_weights().size() == #get_model().states_size() - - #get_weights() == 0 - !*/ - - greedy_policy ( - const model_type& model, - const matrix& weights - ); - /*! - requires - - model.states_size() == weights.size() - ensures - - #get_model() == model - - #get_weights() == weights - !*/ - - action_type operator() ( - const state_type& state - ) const; - /*! - ensures - - returns get_model().find_best_action(state, w); - !*/ - - const model_type& get_model ( - ) const; - /*! - ensures - - returns the model used by this object - !*/ - - matrix& get_weights ( - ); - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - -}; - -template < typename model_type > -void serialize(const greedy_policy& item, std::ostream& out); -template < typename model_type > -void deserialize(greedy_policy& item, std::istream& in); -/*! - provides serialization support. -!*/ - -// ---------------------------------------------------------------------------------------- - -template < - typename policy_type, - typename prng_engine = std::default_random_engine() - > -class epsilon_policy -{ - /*! - REQUIREMENTS ON policy_type - policy_type should implement the example_policy interface defined at the - top of this file. - - REQUIREMENTS ON prng_engine - prng_engine should be a PRNG class like the ones defined in std::random. - - WHAT THIS OBJECT REPRESENTS - This is a special policy that returns the best action (according to the - underlying policy) for the given state with probability 1-epsilon - while it returns a valid random action with probability epsilon. - !*/ - -public: - - typedef typename policy_type::state_type state_type; - typedef typename policy_type::action_type action_type; - - epsilon_policy ( - double epsilon, - const policy_type &policy, - const prng_engine &gen = prng_engine() - ); - /*! - requires - - epsilon >= 0 and epsilon <= 1 - ensures - - #get_epsilon() == epsilon - - #get_policy() == policy - - #get_generator() == gen - !*/ - - action_type operator() ( - const state_type& state - ) const; - /*! - ensures - - returns get_policy()(state, w) with probability 1-epsilon - and get_model().random_action(state) with probability epsilon. - !*/ - - policy_type get_policy( - ) const; - /*! - ensures - - returns the underlying policy used by the object. 
- !*/ - - auto get_model ( - ) const -> decltype(get_policy().get_model()); - /*! - ensures - - returns the model used by the underlying policy. - !*/ - - matrix& get_weights ( - ); - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - double get_epsilon( - ) const; - /*! - ensures - - returns the epsilon value used by the policy. - !*/ - - const prng_engine& get_generator( - ) const; - /*! - ensures - - returns the generator used by the policy. - !*/ - -}; - -template < typename policy_type, typename generator > -inline void serialize(const epsilon_policy& item, std::ostream& out); -template < typename policy_type, typename generator > -inline void deserialize(epsilon_policy& item, std::istream& in); -/*! - provides serialization support. -!*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_POLICY_ABSTRACT_Hh_ diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 94f6731d75..ace925d8b2 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -3,23 +3,27 @@ #ifndef DLIB_QLEARNING_Hh_ #define DLIB_QLEARNING_Hh_ -#include "policy.h" +#include "approximate_linear_models.h" #include #include #include namespace dlib { + template< + typename model_type + > class qlearning { public: + explicit qlearning( double lr = 0.2, double disc = 0.8, unsigned int miters = 100u, double eps = 0.1, bool v = false - ) : iterations(miters), verbose(v) { + ) : iters(miters), verbose(v) { set_learning_rate(lr); set_discount(disc); set_epsilon(eps); @@ -56,11 +60,11 @@ namespace dlib } unsigned int get_iterations( - ) const { return iterations; } + ) const { return iters; } void set_iterations( unsigned int value - ) { iterations = value; } + ) { iters = value; } double get_epsilon( ) const { return epsilon; } @@ -87,43 +91,44 @@ namespace dlib ) { verbose = false; } template < - typename policy_type, typename prng_engine = std::default_random_engine > - policy_type train_policy( - const policy_type &policy, + policy train( + policy training_policy = policy(), const prng_engine &gen = prng_engine() ) const { - typedef typename std::decay::type::reward_type reward_type; - if(verbose) std::cout << "Starting training..." 
<< std::endl; - const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy, gen); + const auto& model = training_policy.get_model(); + epsilon_policy, prng_engine> eps_pol(epsilon, training_policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), - "\t qlearning::train(weights)" + "\t qlearning::train(policy, gen)" "\n\t invalid inputs were given to this function" - "\n\t weights.size: " << weights.size() << - "\n\t features size: " << model.states_size() + "\n\t policy's weights.size: " << weights.size() << + "\n\t num of features: " << model.num_features() ); - reward_type total_reward = static_cast(0); - for(auto iter = 0u; iter < iterations; ++iter){ - auto state = model.initial_state(); + matrix feats(model.num_features()), feats_next_best(model.num_features()); + double total_reward = 0.; + for(auto iter = 0u; iter < iters; ++iter) + { + auto state = model.initial_state(); auto steps = 0u; - reward_type iteration_reward = static_cast(0); - while(!model.is_final(state)){ + double iteration_reward = 0.; + + while(!model.is_final(state)) + { auto action = eps_pol(state); auto next_state = model.step(state, action); auto reward = model.reward(state, action, next_state); - const auto feats = model.get_features(state, action); - const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); + model.get_features(state, action, feats); + model.get_features(next_state, model.find_best_action(next_state, w), feats_next_best); double correction = reward + discount * dot(w, feats_next_best) - dot(w, feats); w += learning_rate * correction * feats; @@ -135,7 +140,8 @@ namespace dlib total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << iteration_reward + std::cout << "iteration: " << iter + << "\t reward: " << iteration_reward << "\t mean: " << total_reward/static_cast(iter+1) << "\t steps: " << steps << std::endl; @@ -144,22 +150,13 @@ namespace dlib if(verbose) std::cout << "Training finished." << std::endl; - return eps_pol.get_policy(); + return training_policy; } - template < - typename model_type, - typename prng_engine = std::default_random_engine - > - greedy_policy train( - const model_type &model, - const prng_engine &gen = prng_engine() - ) const { return train_policy(greedy_policy(model), gen); } - private: double learning_rate; double discount; - unsigned int iterations; + unsigned int iters; double epsilon; bool verbose; }; diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index 182c80ca4f..c96a75654e 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -3,12 +3,14 @@ #undef DLIB_QLEARNING_ABSTRACT_Hh_ #ifdef DLIB_QLEARNING_ABSTRACT_Hh_ -#include "policy_abstract.h" -#include "model_abstract.h" +#include "approximate_linear_models_abstract.h" #include namespace dlib { + +// ---------------------------------------------------------------------------------------- + template < typename model_type > @@ -16,24 +18,23 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type is an implementation of the model interface declared in - model_abstract.h. + model_type should implement the example_online_model interface defined in + the approximate_linear_models_abstract.h file. WHAT THIS OBJECT REPRESENTS - This objects is an implementation of the well-known reinforcement learning - algorithm Q-learning. 
This algorithms takes a bunch of process_samples
-            as input and outputs a policy that have learnt from that in order to take
-            the better results.
+            This object is an implementation of the well-known reinforcement learning
+            algorithm Q-learning. It takes an online model and tries to learn the best
+            possible policy for the model's environment by interacting with it.

-            Supposing we are in state s and action a and we are going to a new state s'
-            the learning function has the form:
+            Supposing we are in state s and action a and we are going to a new state s',
+            then the learning function has the form:
                 Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s', a'))
-            where lr is the learning_rate and disc the discount.
-            That formula means that it takes a convex combination of the current qvalue
-            and the expected qvalue.
+            where lr is the learning_rate and disc is the discount factor.
+            That formula means that it takes a convex combination of the current qvalue,
+            that is, the current expected reward from there, and the new expected qvalue.

             Note that it is an off-policy reinforcement learning algorithm meaning
-            that it doesn't take the policy into account while learning.
+            that it doesn't take the policy it is using into account in the learning process.
        !*/

    public:

@@ -45,7 +46,7 @@ namespace dlib
            - #get_discount() == 0.8
            - #get_iterations() == 100
            - #get_epsilon() == 0.1
-            - #is not verbose
+            - #is_verbose() == false
        !*/

        explicit qlearning(
@@ -122,7 +123,7 @@ namespace dlib
        ) const;
        /*!
            ensures
-                - returns the probability of doing a non-optimal step while training.
+                - returns the probability of taking a random step while training.
        !*/

        void set_epsilon(
@@ -157,45 +158,21 @@ namespace dlib
        !*/

        template <
-            typename policy_type,
            typename prng_engine = std::default_random_engine
            >
-        policy_type train_policy(
-            const policy_type &policy,
-            const prng_engine &gen = prng_engine()
+        policy<model_type> train(
+            policy<model_type> policy = policy<model_type>(),
+            const prng_engine& gen = prng_engine()
        ) const;
        /*!
            requires
-                - policy is of the form example_policy, i.e., an instance of
-                  an implementation of the policy interface defined in policy_abstract.h.
                - prng_engine is a pseudo-random number generator class like the ones
-                  defined in std::random. By default it assumes it to be the standard
-                  default_random_engine class.
+                  defined in std::random. By default it is the standard one.
            ensures
-                - returns a policy of the type policy_type as the result of applying the
-                  qlearning learning function over iterations runs over using the weight
-                  matrix of the argument as the initial weights. Besides that, the
-                  exploration is done with an epsilon policy using the given prng.
+                - returns the policy resulting from applying the learning function over
+                  and over according to the parameters previously fed into this object.
        !*/

-        template <
-            typename model_type,
-            typename prng_engine = std::default_random_engine
-            >
-        greedy_policy train(
-            const model_type &model,
-            const prng_engine &gen = prng_engine()
-        ) const;
-        /*!
-            requires
-                - model_type is an implementation of the example_model interface defined
-                  at model_abstract.h.
-                - prng_engine is a pseudo-random number generator class like the ones
-                  defined in std::random. By default it assumes it to be the standard
-                  default_random_engine class.
- ensures - - returns train_policy(greedy_policy(model), gen); - !*/ }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index 772945c048..aab620f14f 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -3,12 +3,16 @@ #ifndef DLIB_SARSA_Hh_ #define DLIB_SARSA_Hh_ -#include "policy.h" +#include "approximate_linear_models.h" #include #include +#include namespace dlib { + template< + typename model_type + > class sarsa { public: @@ -18,7 +22,7 @@ namespace dlib unsigned int miters = 100u, double eps = 0.1, bool v = false - ) : iterations(miters), verbose(v) { + ) : iters(miters), verbose(v) { set_learning_rate(lr); set_discount(disc); set_epsilon(eps); @@ -55,11 +59,11 @@ namespace dlib } unsigned int get_iterations( - ) const { return iterations; } + ) const { return iters; } void set_iterations( unsigned int value - ) { iterations = value; } + ) { iters = value; } double get_epsilon( ) const { return epsilon; } @@ -86,44 +90,45 @@ namespace dlib ) { verbose = false; } template < - typename policy_type, typename prng_engine = std::default_random_engine > - policy_type train_policy( - const policy_type &policy, + policy train( + policy training_policy = policy(), const prng_engine &gen = prng_engine() ) const { - typedef typename std::decay::type::reward_type reward_type; - if(verbose) std::cout << "Starting training..." << std::endl; - const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy, gen); + const auto& model = training_policy.get_model(); + epsilon_policy, prng_engine> eps_pol(epsilon, training_policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), - "\t sarsa::train(weights)" + "\t sarsa::train(policy, gen)" "\n\t invalid inputs were given to this function" - "\n\t weights.size: " << weights.size() << - "\n\t features size: " << model.states_size() + "\n\t policy's weights.size: " << weights.size() << + "\n\t num of features: " << model.num_features() ); - reward_type total_reward = static_cast(0); - for(auto iter = 0u; iter < iterations; ++iter){ + matrix feats(model.num_features()), feats_next(model.num_features()); + + double total_reward = 0.; + for(auto iter = 0u; iter < iters; ++iter) + { auto state = model.initial_state(); auto action = eps_pol(state); - auto steps = 0u; - reward_type iteration_reward = static_cast(0); - while(!model.is_final(state)){ + double iteration_reward = 0.; + + while(!model.is_final(state)) + { auto next_state = model.step(state, action); auto next_action = eps_pol(next_state); auto reward = model.reward(state, action, next_state); - const auto feats = model.get_features(state, action); - const auto feats_next = model.get_features(next_state, next_action); + model.get_features(state, action, feats); + model.get_features(next_state, next_action, feats_next); double correction = reward + discount * dot(w, feats_next) - dot(w, feats); w += learning_rate * correction * feats; @@ -135,7 +140,8 @@ namespace dlib total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << iteration_reward + std::cout << "iteration: " << iter + << "\t reward: " << iteration_reward << "\t mean: " << total_reward/static_cast(iter+1) << "\t steps: " << steps << std::endl; @@ -144,22 +150,13 @@ namespace dlib if(verbose) std::cout << "Training finished." 
<< std::endl; - return eps_pol.get_policy(); + return training_policy; } - template < - typename model_type, - typename prng_engine = std::default_random_engine - > - greedy_policy train( - const model_type &model, - const prng_engine &gen = prng_engine() - ) const { return train_policy(greedy_policy(model), gen); } - private: double learning_rate; double discount; - unsigned int iterations; + unsigned int iters; double epsilon; bool verbose; }; diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h index 6acd6f06da..a9ebbf522d 100644 --- a/dlib/control/sarsa_abstract.h +++ b/dlib/control/sarsa_abstract.h @@ -16,22 +16,20 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type is an implementation of the model interface declared in - model_abstract.h. + model_type should implement the example_online_model interface defined in + the approximate_linear_models_abstract.h file. WHAT THIS OBJECT REPRESENTS - This objects is an implementation of the well-known reinforcement learning - algorithm Q-learning. This algorithms takes a bunch of process_samples - as input and outputs a policy that have learnt from that in order to take - the better results. + This object is an implementation of the well-known reinforcement learning + algorithm SARSA. It takes an online model and tries to learn the best + possible policy for the model's environment by interacting with it. Supposing we are in state s and action a and we are going to a new state s' - the learning function has the form: + and taking the action a' in s', then the learning function has the form: Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a')) - where lr is the learning_rate, disc the discount and a' is the next action - the algorithm will perform after reaching s'. - That formula means that it takes a convex combination of the current qvalue - and the expected qvalue. + where lr is the learning_rate and disc is the discount factor. + That formula means that it takes a convex combination of the current qvalue, + that is, the current expected reward from there, and the new expected qvalue. Note that, unlike qlearning, sarsa is an on-policy reinforcement learning algorithm meaning that it takes the policy into account while learning. @@ -46,7 +44,7 @@ namespace dlib - #get_discount() == 0.8 - #get_iterations() == 100 - #get_epsilon() == 0.1 - - #is not verbose + - #is_verbose() == false !*/ explicit sarsa( @@ -123,7 +121,7 @@ namespace dlib ) const; /*! ensures - - returns the probability of doing a non-optimal step while training. + - returns the probability of taking a random step while training. !*/ void set_epsilon( @@ -158,46 +156,21 @@ namespace dlib !*/ template < - typename policy_type, typename prng_engine = std::default_random_engine > - policy_type train_policy( - const policy_type &policy, - const prng_engine &gen - ) const; - /*! - requires - - policy is of the form example_policy, i.e., an instance of - an implementation of the policy interface defined in policy_abstract.h. - - prng_engine is a pseudo-random number generator class like the ones - defined in std::random. By default it assumes it to be the standard - default_random_engine class. - ensures - - returns a policy of the type policy_type as the result of applying the - sarsa learning function over iterations runs over using the weight - matrix of the argument as the initial weights. Besides that, the - exploration is done with an epsilon policy using the given prng. 
- !*/ - - template < - typename model_type, - typename prng_engine = std::default_random_engine - > - greedy_policy train( - const model_type &model, + policy_type train( + policy policy = policy(), const prng_engine &gen = prng_engine() ) const; /*! requires - - model_type is an implementation of the example_model interface defined - at model_abstract.h. - prng_engine is a pseudo-random number generator class like the ones - defined in std::random. By default it assumes it to be the standard - default_random_engine class. + defined in std::random. By default it is the standard one. ensures - - returns train_policy(greedy_policy(model), gen); + - returns the policy resulting of applying the learning function over + and over according to the parameters previously fed into this object. !*/ - }; + }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 4d79535440..0db14d45c2 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -16,34 +16,52 @@ namespace template < int height, - int width, - template class feature_extractor_type + int width > class cliff_model { public: - // constants and actions allowed + // actions allowed in the model enum class actions {up = 0, right, down, left}; + constexpr static int num_actions = 4; + + // some constants we need constexpr static double EPS = 1e-16; constexpr static int HEIGHT = height; constexpr static int WIDTH = width; - // model types + // we define the model's types typedef int state_type; typedef actions action_type; - typedef int reward_type; - - typedef feature_extractor_type feature_extractor; + // Constructor explicit cliff_model( int seed = 0 - ) : fe(height, width, 4), gen(seed) {} + ) : gen(seed){} + + + // Functions that will use the agent + + unsigned int num_features( + ) const { return num_actions * height * width; } + void get_features( + const state_type &state, + const action_type &action, + matrix& feats + ) const + { + feats = 0; + feats(num_actions*state + static_cast(action)) = 1; //only this one is 1 + } + + // It's possible that the allowed actions differ among states. + // In this case all movements are always allowed so we don't need to use state. action_type random_action( - const state_type& state // since all movements are always allowed we don't use state + const state_type& state ) const { - std::uniform_int_distribution dist(0,3); + uniform_int_distribution dist(0,num_actions-1); return static_cast(dist(gen)); } @@ -52,43 +70,30 @@ namespace const matrix& w ) const { - // it looks for the best actions in state according to w - auto best = std::numeric_limits::lowest(); + auto best = numeric_limits::lowest(); auto best_indexes = std::vector(); - for(auto i = 0; i < 4; i++){ - auto feats = get_features(state, static_cast(i)); + for(auto i = 0; i < num_actions; i++) + { + matrix feats(num_features()); + get_features(state, static_cast(i), feats); auto product = dot(w, feats); if(product > best){ best = product; best_indexes.clear(); } - if(std::abs(product - best) < EPS) + if(abs(product - best) < EPS) best_indexes.push_back(i); } // returns a random action between the best ones. 
- std::uniform_int_distribution dist(0, best_indexes.size()-1); + uniform_int_distribution dist(0, best_indexes.size()-1); return static_cast(best_indexes[dist(gen)]); } - const feature_extractor& get_feature_extractor( - ) const { return fe; } - - auto states_size( - ) const -> decltype(get_feature_extractor().num_features()) - { - return get_feature_extractor().num_features(); - } - - auto get_features( - const state_type &state, - const action_type &action - ) const -> decltype(get_feature_extractor().get_features(state, action)) - { return get_feature_extractor().get_features(state, action); } - - reward_type reward( + // This functions gives the rewards, that is, tells the agent how good are its movements + double reward( const state_type &state, const action_type &action, const state_type &new_state @@ -100,6 +105,7 @@ namespace state_type initial_state( ) const { return static_cast((height-1) * width); } + // This is an important function, basically it allows the agent to move around the environment state_type step( const state_type& state, const action_type& action @@ -114,6 +120,7 @@ namespace state - 1 ; } + // this functions allow the agent to know in which state of the simulation it's in bool is_success( const state_type &state ) const { return state == height*width - 1; } @@ -127,6 +134,7 @@ namespace ) const { return is_success(state) || is_failure(state); } private: + bool out_of_bounds( const state_type& state, const action_type& action @@ -134,7 +142,8 @@ namespace { bool result; - switch(action){ + switch(action) + { case actions::up: result = state / width == 0; break; @@ -152,67 +161,32 @@ namespace return result; } - feature_extractor fe; - mutable std::default_random_engine gen; //mutable because it doesn't changes the model state - }; - - template < - typename state_type, - typename action_type - > - class feature_extractor - { - public: - feature_extractor( - int h, - int w, - int na - ) : height(h), width(w), num_actions(na) {} - - inline long num_features( - ) const { return num_actions * height * width; } - - matrix get_features( - const state_type &state, - const action_type &action - ) const - { - matrix feats(num_features()); - feats = 0; - //for(auto i = 0u; i < num_actions; i++) - // feats(num_actions * state + i) = 1; - feats(num_actions*state + static_cast(action)) = 1; - - return feats; - } - - private: - int height, width, num_actions; + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; template < int height, int width, - typename algorithm_t + template typename algorithm_t > void test() { constexpr static int seed = 7; - typedef cliff_model model_t; + typedef cliff_model model_t; const int max_steps = 100; print_spinner(); - algorithm_t algorithm; + algorithm_t algorithm; model_t model(seed); - auto policy = algorithm.train(model, std::default_random_engine(seed)); + auto my_policy = algorithm.train(policy(model), std::default_random_engine(seed)); auto s = model.initial_state(); - auto r = static_cast(0); + double r = 0.; int i; for(i = 0; i < max_steps && !model.is_final(s); i++){ - auto a = policy(s); + auto a = my_policy(s); auto new_s = model.step(s, a); r += model.reward(s,a,new_s); s = new_s; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 431dcff48a..53a10fdf1f 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -16,36 +16,41 @@ using namespace dlib; using namespace std; /* - Both of these algorithms work by a reward system. 
That means that they assign to each
-    pair (state, action) an expected reward (Qvalue) and they update those values iteratively
-    taking steps on a model/simulation and observing the reward they obtain. Like so, they
-    need a model class that allow them to work in a interactive way.
+    Both of these algorithms work with a reward system. This means that they assign to each
+    pair (state, action) an expected reward (qvalue) and they iteratively update those values
+    by taking steps on an online model/simulation and observing the reward obtained. Because
+    of this, they need a model class that allows them to work in an interactive way.

-    The algorithms/agents objective is to maximize the expected reward by taking the proper
+    The algorithms/agents' objective is to maximize the expected reward by taking the proper
     steps.
 */

 /*
-    This is the model the agent is going to work with in the example. In particular,
+    Let me now introduce the conceptual model the agent is going to use. Basically,
     this class represents a grid with a given height and width of the form
         ..........
         ..........
         IFFFFFFFFG
-    where: - F are pit cells (if the agent falls there it fails and simulation ends).
+    where: - Fs represent pit cells where the agent can fall and thus fail the simulation.
            - I is the starting position.
-           - G is the goal cell (the agent goal is to reach that cell).
-           - . are free cells where the agent can go.
+           - G is the goal cell that the agent aims to reach.
+           - dots (.) represent free cells that the agent can move through freely.

     The agent receives the following reward: -100 for reaching F, 100 for reaching G and a reward
     of -1 otherwise.

     This model doesn't allow the agent to go out of bounds, instead it will stay in the same cell
-    he was before the action (like if there was a wall there) but receiving a reward of -1.
+    it was in before taking the action (as if there were a wall) and receive a reward of -1.
+
+    Function approximation by feature extraction is a powerful tool for reducing the state space's size.
+    But ours is a toy example, so I use a one-hot representation, meaning that each feature
+    represents a single (state, action) pair and it will be 1 when that pair is active and 0 otherwise.
 */
+
+// This is an implementation of the example_online_model interface defined in approximate_linear_models_abstract.h
 template <
     int height,
-    int width,
-    template class feature_extractor_type
+    int width
     >
 class cliff_model
 {
@@ -54,7 +59,7 @@ class cliff_model
     enum class actions {up = 0, right, down, left};
     constexpr static int num_actions = 4;

-    // some constants that we need
+    // some constants we need
     constexpr static double EPS = 1e-16;
     constexpr static int HEIGHT = height;
     constexpr static int WIDTH = width;
@@ -62,21 +67,30 @@ class cliff_model
     // we define the model's types
     typedef int state_type;
     typedef actions action_type;
-    typedef int reward_type;
-
-    // this ensures that the feature extractor uses the same underlying types as our model
-    typedef feature_extractor_type feature_extractor;
-
     // Constructor
     explicit cliff_model(
         int seed = 0
-    ) : fe(height, width, num_actions), gen(seed){}
+    ) : gen(seed){}


     // Functions that will use the agent

-    // It returns a random action. It's possible that the allowed actions differ from among states.
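+    // A concrete illustration of the one-hot layout described above, assuming the 3x7 grid
+    // used in main(): num_features() == 4*3*7 == 84, the initial state is (3-1)*7 == 14,
+    // and the feature vector for (state 14, action up == 0) has a single 1 at index
+    // 4*14 + 0 == 56.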
+ unsigned int num_features( + ) const { return num_actions * height * width; } + + void get_features( + const state_type &state, + const action_type &action, + matrix& feats + ) const + { + feats.set_size(num_features()); + feats = 0; + feats(num_actions*state + static_cast(action)) = 1; //only this one is 1 + } + + // It's possible that the allowed actions differ among states. // In this case all movements are always allowed so we don't need to use state. action_type random_action( const state_type& state @@ -86,9 +100,6 @@ class cliff_model return static_cast(dist(gen)); } - // Returns the best action that maximizes the expected reward, that is, - // the action that maximizes dot_product(w, get_features(state, action)) - // w will be the weights assign by the agent to each feature action_type find_best_action( const state_type& state, const matrix& w @@ -97,8 +108,10 @@ class cliff_model auto best = numeric_limits::lowest(); auto best_indexes = std::vector(); - for(auto i = 0; i < num_actions; i++){ - auto feats = get_features(state, static_cast(i)); + for(auto i = 0; i < num_actions; i++) + { + matrix feats; + get_features(state, static_cast(i), feats); auto product = dot(w, feats); if(product > best){ @@ -114,27 +127,8 @@ class cliff_model return static_cast(best_indexes[dist(gen)]); } - - // This functions are delegated to the feature extractor - - const feature_extractor& get_feature_extractor( - ) const { return fe; } - - auto states_size( - ) const -> decltype(get_feature_extractor().num_features()) - { - return get_feature_extractor().num_features(); - } - - auto get_features( - const state_type &state, - const action_type &action - ) const -> decltype(get_feature_extractor().get_features(state, action)) - { return get_feature_extractor().get_features(state, action); } - - // This functions gives the rewards, that is, tells the agent how good are its movements - reward_type reward( + double reward( const state_type &state, const action_type &action, const state_type &new_state @@ -146,7 +140,7 @@ class cliff_model state_type initial_state( ) const { return static_cast((height-1) * width); } - // This is an important function, basically it allows the agent to move in the model's world + // This is an important function, basically it allows the agent to move around the environment state_type step( const state_type& state, const action_type& action @@ -161,8 +155,7 @@ class cliff_model state - 1 ; } - // this functions allow the agent to know in which state of the simulation he is in - + // this functions allow the agent to know in which state of the simulation it's in bool is_success( const state_type &state ) const { return state == height*width - 1; } @@ -176,6 +169,7 @@ class cliff_model ) const { return is_success(state) || is_failure(state); } private: + bool out_of_bounds( const state_type& state, const action_type& action @@ -183,7 +177,8 @@ class cliff_model { bool result; - switch(action){ + switch(action) + { case actions::up: result = state / width == 0; break; @@ -201,55 +196,9 @@ class cliff_model return result; } - feature_extractor fe; mutable default_random_engine gen; //mutable because it doesn't changes the model state }; -/* - Usually when we use these types of agents the state space of the model is huge. That could make - the Qfunction to be unmanageable and so we need to use what is known as function approximation. - - Basically it represents the states by a given features instead of the states themselves. 
That way
-    what usually was just a single value Q(state, action) now is codified as the linear combination of
-    learnt weights and the features, that is, Q(state, action) = dot_product(weights, features(state, action)).
-
-    Our example is a toy example and so we don't need to use it. However, to show how it works I use a simple
-    one-shot representation of the states. That means that I have a vector of features where the feature in the
-    ith position is one if we provide a specific (state, action) and 0 otherwise.
-*/
-template <
-    typename state_type,
-    typename action_type
-    >
-class feature_extractor
-{
-public:
-    feature_extractor(
-        int h,
-        int w,
-        int na
-    ) : height(h), width(w), num_actions(na) {}
-
-    //the size of the vector
-    inline long num_features(
-    ) const { return num_actions * height * width; }
-
-    matrix get_features(
-        const state_type &state,
-        const action_type &action
-    ) const
-    {
-        matrix feats(num_features());
-        feats = 0;
-        feats(num_actions*state + static_cast(action)) = 1; //only this one is 1
-
-        return feats;
-    }
-
-private:
-    int height, width, num_actions;
-};
-
 // This is just a helper function to pretty-print the agent's state.
 template <
     typename model_t
@@ -283,7 +232,7 @@ void print(
     This is the function that runs the agent. The code to run both agents are identical so I chose
     to use a templated function.
 
-    The difference between executions comes in the way they train. Namely, the way they updated the Qvalue.
+    The difference between the two algorithms lies in the way they train, namely in how they update the qvalue.
     Let's suppose that we are in the pair (s, a) and we are going to be in (s', a') in the next step.
 
     Q-learning is an off-policy algorithm meaning that doesn't consider its trully next move but the best one,
@@ -296,19 +245,16 @@ void print(
     in the next step instead of the optimal. So it's an on-policy algorithm. Its update formula is:
         Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a'))
 
-    This seems as a meaningless change, but what produces is that when training SARSA tends to be more conservative
-    in its movement while Q-learning tries to optimizes no matter what. In cases when you have to avoid failling
+    This looks like a minor change, but its effect is that, during training, SARSA tends to be more conservative
+    in its movements while Q-learning optimizes them no matter what. In cases where you have to avoid failure
     (usually a real world example) SARSA is a better option.
 
-    In our example this difference is appreciated in the way they learn. Q-learning will try to go close to the pit
+    In our example this difference shows up in the behavior they learn. Q-learning will try to go close to the pit
     cells all the time (falling a lot in the training process) and SARSA will go one or two cells off the cliff.
-
-    Usually, one decreases the learning ratio as the iterations go on and so SARSA would converge to the same solution
-    as Q-learning. This is not implemented yet and so the learning rate is constant always.
 */
 template <
     typename model_t,
-    typename algorithm_t // this can be qlearning or sarsa
+    typename algorithm_t // qlearning or sarsa
    >
 void run_example(const model_t &model, algorithm_t &&algorithm)
 {
@@ -317,7 +263,7 @@ void run_example(const model_t &model, algorithm_t &&algorithm)
     cout << "Starting final simulation..."
<< endl; auto s = model.initial_state(); - auto r = static_cast(0); + double r = 0.; int i; for(i = 0; i < 100 && !model.is_final(s); i++){ @@ -345,17 +291,16 @@ int main(int argc, char** argv) const auto height = 3u; const auto width = 7u; - typedef cliff_model model_type; - model_type model; + cliff_model model; char response; cout << "Qlearning or SARSA? (q/s): "; cin >> response; if(response == 'q') - run_example(model, qlearning()); + run_example(model, qlearning()); else if(response == 's') - run_example(model, sarsa()); + run_example(model, sarsa()); else cerr << "Invalid option." << endl; From 883075368b03568bd2ea71dad89f894c723d420a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 01:55:59 +0100 Subject: [PATCH 07/14] travis hotfix --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 5fe025427c..3666bb905a 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -151,7 +151,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(underlying_policy.get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From c5cf9b307924d27353d3573a1be763e7f3bbdfee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 01:55:59 +0100 Subject: [PATCH 08/14] Revert "travis hotfix" This reverts commit ee5428ada88f0c70247f51ab87f9753e18ee43a6. --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 3666bb905a..5fe025427c 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -151,7 +151,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(underlying_policy.get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From ebc5648e4403565f69455821579a52e9743675cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 12:05:23 +0100 Subject: [PATCH 09/14] real travis hotfix There was a compilation error that happened on gcc4.8.4 on travis but not in my local compiler --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 5fe025427c..8acaf18cf8 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -151,7 +151,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(this->get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From 9650cd8cf3e6f8e8d11544de0b176ed722b4fc78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 12:36:44 +0100 Subject: [PATCH 10/14] templated template parameters must have 
class, not typename --- dlib/test/reinforcement_learning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 0db14d45c2..9933dfdeba 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -167,7 +167,7 @@ namespace template < int height, int width, - template typename algorithm_t + template class algorithm_t > void test() { From cade6fb5aaa185e39ed03672bb6f3bab4c8d9103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 4 Jun 2018 00:57:10 +0200 Subject: [PATCH 11/14] Applied the notes of the second review --- dlib/control/approximate_linear_models.h | 22 ++++- .../approximate_linear_models_abstract.h | 59 +++++------- dlib/control/qlearning.h | 6 +- dlib/control/qlearning_abstract.h | 13 ++- dlib/control/sarsa.h | 6 +- dlib/control/sarsa_abstract.h | 12 +-- dlib/serialize.h | 51 ++++++++++ dlib/test/reinforcement_learning.cpp | 93 ++++++++++++++++++- examples/qlearning_sarsa_ex.cpp | 29 +++++- 9 files changed, 226 insertions(+), 65 deletions(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 8acaf18cf8..5322c2735c 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -78,6 +78,12 @@ namespace dlib const model_type &model_ ) : weights(weights_), model(model_) {} + policy(const policy&) = default; + policy& operator=(const policy&) = default; + + policy(policy&&) = default; + policy& operator=(policy&&) = default; + action_type operator() ( const state_type& state ) const @@ -96,7 +102,7 @@ namespace dlib private: matrix weights; - const model_type model; + model_type model; }; template < typename model_type > @@ -135,23 +141,29 @@ namespace dlib epsilon_policy ( double epsilon_, - policy_type &policy_, + const policy_type& policy_, const prng_engine &gen_ = prng_engine() ) : underlying_policy(policy_), epsilon(epsilon_), gen(gen_) {} + epsilon_policy(const epsilon_policy&) = default; + epsilon_policy& operator=(const epsilon_policy&) = default; + + epsilon_policy(epsilon_policy&&) = default; + epsilon_policy& operator=(epsilon_policy&&) = default; + action_type operator() ( const state_type& state ) const { std::bernoulli_distribution d(epsilon); - return d(gen) ? get_model().random_action(state) : underlying_policy(state); + return d(gen) ? underlying_policy.get_model().random_action(state) : underlying_policy(state); } const policy_type& get_policy( ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(this->get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } @@ -166,7 +178,7 @@ namespace dlib ) const { return gen; } private: - policy_type& underlying_policy; + policy_type underlying_policy; double epsilon; mutable prng_engine gen; diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index 0f14432f92..7462a97552 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -15,23 +15,16 @@ namespace dlib /*! 
WHAT THIS OBJECT REPRESENTS This object defines the inferface that any model has to implement if it - is to be used in an offline fashion along with some method like the lspi - method defined in the file lspi_abstract.h. Being offline only means that - it already holds the data and will not interact with the environment to get - them. + is to be used in an offline fashion along with some class like the lspi + class defined in the file lspi_abstract.h. In particular, this object models a Q(state, action) function where Q(state, action) == dot(w, PSI(state, action)) where PSI(state, action) is a feature vector and w is a parameter vector. - Therefore, an offline model defines how the PSI(x,y) feature vector is - calculated. It also defines the types used to represent the state and - action objects. - - THREAD SAFETY - Instances of this object are required to be threadsafe, that is, it should - be safe for multiple threads to make concurrent calls to the member - functions of this object. + Therefore, an offline model object defines how a model is represented by + defining its actions, states, calculating the feature vectors. However, it + does not provide any way to interactively interact with it. !*/ // The states and actions can be any type as long as you provide typedefs for them. @@ -85,25 +78,22 @@ namespace dlib /*! WHAT THIS OBJECT REPRESENTS This object defines the inferface that any model has to implement if it - is to be used in an online fashion along with some method like the qlearning - method defined in the file qlearning_abstract.h. + is to be used by an object such as the qlearning class defined in the + file qlearning_abstract.h. - Being online means that the model doesn't hold prior data but it interacts - with the environment and performing actions from some given state turning - that state into a new one as well as getting some reward for doing so. + Instances of this object differ from the offline model in the way they + interact with the environment. This object expands the interface of the + offline model with methods that make it suitable for simulations: + going from one state to another, detecting special states and getting + the reward for performing those steps. In particular, this object models a Q(state, action) function where Q(state, action) == dot(w, PSI(state, action)) where PSI(state, action) is a feature vector and w is a parameter vector. - Therefore, an online model defines how the PSI(x,y) feature vector is - calculated, the types used to represent the state, action and reward - objects as well as how to interact with the environment. - - THREAD SAFETY - Instances of this object are required to be threadsafe, that is, it should - be safe for multiple threads to make concurrent calls to the member - functions of this object. + Therefore, an online model object defines how a model is represented by + defining its actions, states, calculating the feature vectors. Besides, it + provides methods to interact with that environment on the fly. !*/ // The states and actions can be any type as long as you provide typedefs for them. @@ -160,12 +150,9 @@ namespace dlib const state_type& new_state ) const; /*! - requires - - action is a pausible action from state. - - new_state is a possible outcome when performing action on state. ensures - returns the reward obtained by reaching new_state from state - doing action. + after you do action. !*/ state_type initial_state( @@ -180,11 +167,9 @@ namespace dlib const action_type& action ) const; /*! 
- requires - - action is a plausible action when we are in state. ensures - - returns a new state result of being on the given state and doing the given - action. + - returns a new state result of doing the given action over the + given state. !*/ bool is_success( @@ -224,7 +209,8 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type should implement one of the interfaces defined above this file. + model_type should implement one of the two interfaces defined above, that is, + example_offline_model or example_online_model. WHAT THIS OBJECT REPRESENTS This object holds a training sample for a reinforcement learning algorithm. @@ -268,7 +254,8 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type should implement one of the interfaces defined above this file. + model_type should implement one of the two interfaces defined above, that is, + example_offline_model or example_online_model. WHAT THIS OBJECT REPRESENTS This class represents a greedy policy, that is, it is a policy that given a @@ -307,7 +294,7 @@ namespace dlib ) const; /*! ensures - - returns get_model().find_best_action(state, this->weights); + - returns get_model().find_best_action(state, get_weights()); !*/ const model_type& get_model ( diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index ace925d8b2..c6b7f7ac70 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -94,8 +94,8 @@ namespace dlib typename prng_engine = std::default_random_engine > policy train( - policy training_policy = policy(), - const prng_engine &gen = prng_engine() + const policy& training_policy = policy(), + const prng_engine& gen = prng_engine() ) const { if(verbose) @@ -150,7 +150,7 @@ namespace dlib if(verbose) std::cout << "Training finished." << std::endl; - return training_policy; + return eps_pol.get_policy(); } private: diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index c96a75654e..39e5c7225e 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -30,11 +30,9 @@ namespace dlib then the learning function has the form: Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s', a')) where lr is the learning_rate and disc is the discount factor. - That formula means that it takes a convex combination of the current qvalue, - that is, the current expected reward from there, and the new expected qvalue. - Note that it is an off-policy reinforcement learning algorithm meaning - that it doesn't take the policy is using into account in the learning process. + The formula above means that it takes a convex combination of the current + qvalue, that is, the current expected reward, and the new expected qvalue. !*/ public: @@ -161,7 +159,7 @@ namespace dlib typename prng_engine = std::default_random_engine > policy train( - policy policy = policy() + const policy& policy = policy() const prng_engine& gen = prng_engine() ) const; /*! @@ -169,8 +167,9 @@ namespace dlib - prng_engine is a pseudo-random number generator class like the ones defined in std::random. By default it is the standard one. ensures - - returns the policy resulting of applying the learning function over - and over according to the parameters previously fed into this object. + - returns the policy obtained by applying to the given policy the learning + function several times according to the parameters previously fed + into this object. 
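+
+            For illustration, a rough usage sketch (my_model is a hypothetical type that
+            implements the example_online_model interface, not something provided by dlib):
+                my_model model;
+                qlearning<my_model> trainer;
+                auto learned_policy = trainer.train(policy<my_model>(model));
+                auto first_action = learned_policy(model.initial_state());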
!*/ }; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index aab620f14f..69d87badc3 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -93,8 +93,8 @@ namespace dlib typename prng_engine = std::default_random_engine > policy train( - policy training_policy = policy(), - const prng_engine &gen = prng_engine() + const policy& training_policy = policy(), + const prng_engine& gen = prng_engine() ) const { if(verbose) @@ -150,7 +150,7 @@ namespace dlib if(verbose) std::cout << "Training finished." << std::endl; - return training_policy; + return eps_pol.get_policy(); } private: diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h index a9ebbf522d..e21bb9b7ec 100644 --- a/dlib/control/sarsa_abstract.h +++ b/dlib/control/sarsa_abstract.h @@ -30,9 +30,6 @@ namespace dlib where lr is the learning_rate and disc is the discount factor. That formula means that it takes a convex combination of the current qvalue, that is, the current expected reward from there, and the new expected qvalue. - - Note that, unlike qlearning, sarsa is an on-policy reinforcement learning - algorithm meaning that it takes the policy into account while learning. !*/ public: @@ -159,16 +156,17 @@ namespace dlib typename prng_engine = std::default_random_engine > policy_type train( - policy policy = policy(), - const prng_engine &gen = prng_engine() + const policy& policy = policy(), + const prng_engine& gen = prng_engine() ) const; /*! requires - prng_engine is a pseudo-random number generator class like the ones defined in std::random. By default it is the standard one. ensures - - returns the policy resulting of applying the learning function over - and over according to the parameters previously fed into this object. + - returns the policy obtained by applying to the given policy the learning + function several times according to the parameters previously fed + into this object. !*/ }; diff --git a/dlib/serialize.h b/dlib/serialize.h index 16d0d15013..1dfaf9b713 100644 --- a/dlib/serialize.h +++ b/dlib/serialize.h @@ -74,6 +74,7 @@ - enumerable where T is a serializable type - map_pair where D and R are both serializable types. - C style arrays of serializable types + - the random devices defined in std::random like std::default_random_engine - Google protocol buffer objects. This file provides deserialization support to the following object types: @@ -91,6 +92,7 @@ - dlib::int64 - float_details - C style arrays of serializable types + - the random devices defined in std::random like std::default_random_engine - Google protocol buffer objects. 
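+
+        As an illustrative sketch (not part of the formal list above), one of these engines can be
+        round-tripped and will keep producing the same sequence afterwards:
+            std::mt19937 rng(42);
+            std::ostringstream sout;
+            dlib::serialize(rng, sout);
+            std::mt19937 rng2;
+            std::istringstream sin(sout.str());
+            dlib::deserialize(rng2, sin);
+            // rng and rng2 now generate identical values
+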
Support for deserialization of objects which implement the enumerable or @@ -156,6 +158,8 @@ #include #include #include +#include +#include #include "uintn.h" #include "interfaces/enumerable.h" #include "interfaces/map_pair.h" @@ -1541,6 +1545,53 @@ namespace dlib } } +// ---------------------------------------------------------------------------------------- + + #define USE_SERIALIZATION_THROUGH_IOSTREAM(T) \ + inline void serialize ( \ + const T& item, \ + std::ostream& out \ + ) \ + { \ + std::stringstream ss; \ + ss.setf(std::ios_base::dec, std::ios_base::basefield); \ + ss.setf(std::ios_base::left, std::ios_base::adjustfield); \ + ss.fill(' '); \ + ss << item; \ + \ + try{ serialize(ss.str(),out); } \ + catch (serialization_error& e) \ + { throw serialization_error(e.info + "\n while serializing object of type std::default_random_engine"); } \ + } \ + \ + inline void deserialize ( \ + T& item, \ + std::istream& in \ + ) \ + { \ + std::string str; \ + try { deserialize(str,in); } \ + catch (serialization_error& e) \ + { throw serialization_error(e.info + "\n while deserializing object of type std::default_random_engine"); } \ + \ + std::stringstream ss(str); \ + ss.setf(std::ios_base::dec, std::ios_base::basefield); \ + ss.setf(std::ios_base::left, std::ios_base::adjustfield); \ + ss.fill(' '); \ + ss >> item; \ + } + + //USE_SERIALIZATION_THROUGH_IOSTREAM(std::default_random_engine) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::minstd_rand) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::minstd_rand0) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::mt19937) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::mt19937_64) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux24_base) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux48_base) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux24) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux48) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::knuth_b) + // ---------------------------------------------------------------------------------------- class proxy_serialize diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 9933dfdeba..7c708a49e7 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -3,6 +3,7 @@ #include "tester.h" #include +#include #include #include #include @@ -12,7 +13,7 @@ namespace using namespace test; using namespace dlib; using namespace std; - dlib::logger dlog("test.rl"); + logger dlog("test.rl"); template < int height, @@ -34,11 +35,13 @@ namespace typedef int state_type; typedef actions action_type; - // Constructor + // Constructors explicit cliff_model( int seed = 0 ) : gen(seed){} + cliff_model(const cliff_model&) = default; + cliff_model& operator=(const cliff_model&) = default; // Functions that will use the agent @@ -71,7 +74,7 @@ namespace ) const { auto best = numeric_limits::lowest(); - auto best_indexes = std::vector(); + std::vector best_indexes; for(auto i = 0; i < num_actions; i++) { @@ -133,6 +136,9 @@ namespace const state_type& state ) const { return is_success(state) || is_failure(state); } + const std::default_random_engine& get_generator( + ) const { return gen; } + private: bool out_of_bounds( @@ -161,9 +167,35 @@ namespace return result; } + template < int H, int W> + friend void serialize(const cliff_model& item, std::ostream& out); + + template < int H, int W> + friend void deserialize(cliff_model& item, std::istream& in); + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; + template < int height, int width > 
+ inline void serialize(const cliff_model& item, std::ostream& out) + { + int version = 1; + dlib::serialize(version, out); + dlib::serialize(item.gen, out); + } + + template < int height, int width > + inline void deserialize(cliff_model& item, std::istream& in) + { + int version = 0; + dlib::deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing reinforcement learning test model object."); + + item = cliff_model(); + dlib::deserialize(item.gen, in); + } + template < int height, int width, @@ -203,6 +235,58 @@ namespace DLIB_TEST(r > 0); } + void policy_serialization_test(){ + cliff_model<3, 5> model(8); + policy gp(model), gres; + + for(uint i = 0u; i < gp.get_weights().size(); i++) + gp.get_weights()(i) = i; + + ostringstream sout; + serialize(gp, sout); + istringstream sin(sout.str()); + + deserialize(gres, sin); + dlog << LINFO << "policy serializing: " << + (gp.get_weights() == gres.get_weights() && gp.get_model().get_generator() == gres.get_model().get_generator()); + DLIB_TEST(gp.get_weights() == gres.get_weights() && gp.get_model().get_generator() == gres.get_model().get_generator()); + } + + void epsilon_policy_serialization_test(){ + cliff_model<3, 5> model(11); + policy gp(model); + + for(uint i = 0u; i < gp.get_weights().size(); i++) + gp.get_weights()(i) = i; + + epsilon_policy ep(0.3, gp); + auto eres = ep; // epsilon_policy is not default constructible + + auto state = ep.get_model().initial_state(); + for(uint i = 0u; i < 3; i++) + state = ep.get_model().step(state, ep(state)); + + ostringstream sout; + serialize(ep, sout); + istringstream sin(sout.str()); + + auto cstate = state; + for(uint i = 0; i < 5; i++) + state = ep.get_model().step(state, ep(state)); + + deserialize(eres, sin); + for(uint i = 0; i < 5; i++) + cstate = eres.get_model().step(cstate, eres(cstate)); + + dlog << LINFO << "epsilon policy serializing: " << + (ep.get_weights() == eres.get_weights() && ep.get_generator() == eres.get_generator() && + ep.get_model().get_generator() == eres.get_model().get_generator() ? "True" : "False"); + dlog << LINFO << "same state stepping after serializing: " << (state == cstate ? 
"True" : "False"); + DLIB_TEST(state == cstate); + DLIB_TEST(ep.get_weights() == eres.get_weights() && ep.get_generator() == eres.get_generator() && + ep.get_model().get_generator() == eres.get_model().get_generator()); + } + class rl_tester : public tester { public: @@ -228,6 +312,9 @@ namespace test<5,5,sarsa>(); test<4,7,sarsa>(); test<5,10,sarsa>(); + + policy_serialization_test(); + epsilon_policy_serialization_test(); } }; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 53a10fdf1f..90ca0d3b0b 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -73,6 +74,8 @@ class cliff_model int seed = 0 ) : gen(seed){} + cliff_model(const cliff_model&) = default; + cliff_model& operator=(const cliff_model&) = default; // Functions that will use the agent @@ -106,7 +109,7 @@ class cliff_model ) const { auto best = numeric_limits::lowest(); - auto best_indexes = std::vector(); + std::vector best_indexes; for(auto i = 0; i < num_actions; i++) { @@ -196,9 +199,33 @@ class cliff_model return result; } + // for accessing to gen on serialization functions (alternatively we could define a getter method) + template < int H, int W> friend void serialize(const cliff_model& item, std::ostream& out); + template < int H, int W> friend void deserialize(cliff_model& item, std::istream& in); + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; +template < int height, int width > +inline void serialize(const cliff_model& item, std::ostream& out) +{ + int version = 1; + dlib::serialize(version, out); + dlib::serialize(item.gen, out); +} + +template < int height, int width > +inline void deserialize(cliff_model& item, std::istream& in) +{ + int version = 0; + dlib::deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing reinforcement learning test model object."); + + item = cliff_model(); + dlib::deserialize(item.gen, in); +} + // This is just a helper function to pretty-print the agent's state. 
template < typename model_t From b6f1fde280a143981a7d83965ec3cf4c083d49f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 4 Jun 2018 01:40:57 +0200 Subject: [PATCH 12/14] changed uint type to int --- dlib/test/reinforcement_learning.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 7c708a49e7..ffb9bd3b15 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -239,7 +239,7 @@ namespace cliff_model<3, 5> model(8); policy gp(model), gres; - for(uint i = 0u; i < gp.get_weights().size(); i++) + for(int i = 0; i < gp.get_weights().size(); i++) gp.get_weights()(i) = i; ostringstream sout; @@ -256,14 +256,14 @@ namespace cliff_model<3, 5> model(11); policy gp(model); - for(uint i = 0u; i < gp.get_weights().size(); i++) + for(int i = 0; i < gp.get_weights().size(); i++) gp.get_weights()(i) = i; epsilon_policy ep(0.3, gp); auto eres = ep; // epsilon_policy is not default constructible auto state = ep.get_model().initial_state(); - for(uint i = 0u; i < 3; i++) + for(int i = 0; i < 3; i++) state = ep.get_model().step(state, ep(state)); ostringstream sout; @@ -271,11 +271,11 @@ namespace istringstream sin(sout.str()); auto cstate = state; - for(uint i = 0; i < 5; i++) + for(int i = 0; i < 5; i++) state = ep.get_model().step(state, ep(state)); deserialize(eres, sin); - for(uint i = 0; i < 5; i++) + for(int i = 0; i < 5; i++) cstate = eres.get_model().step(cstate, eres(cstate)); dlog << LINFO << "epsilon policy serializing: " << From eaa621a1c2dfdf257b320bd3aa3b8b7aee53cd51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 4 Jun 2018 10:47:19 +0200 Subject: [PATCH 13/14] Fixed "cannot call member function without object" --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 5322c2735c..9799f4b9a5 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -163,7 +163,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(this->get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From e3f2d28a37199365c083f0b2ede30063fc98043a Mon Sep 17 00:00:00 2001 From: Davis King Date: Sat, 14 Mar 2020 19:35:00 -0400 Subject: [PATCH 14/14] cleanup and add some tests --- dlib/control.h | 6 ++-- dlib/control/approximate_linear_models.h | 2 +- dlib/serialize.h | 4 +-- dlib/test/reinforcement_learning.cpp | 8 ++--- dlib/test/serialize.cpp | 38 ++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 10 deletions(-) diff --git a/dlib/control.h b/dlib/control.h index 4e9c02878e..8f941244f5 100644 --- a/dlib/control.h +++ b/dlib/control.h @@ -1,13 +1,13 @@ // Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. 
-#ifndef DLIB_CONTROL_ -#define DLIB_CONTROL_ +#ifndef DLIB_CONTRoL_ +#define DLIB_CONTRoL_ #include "control/lspi.h" #include "control/mpc.h" #include "control/qlearning.h" #include "control/sarsa.h" -#endif // DLIB_CONTROL_ +#endif // DLIB_CONTRoL_ diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 9799f4b9a5..760ee25da6 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -4,7 +4,7 @@ #define DLIB_APPROXIMATE_LINEAR_MODELS_Hh_ #include "approximate_linear_models_abstract.h" -#include +#include "../matrix.h" #include namespace dlib diff --git a/dlib/serialize.h b/dlib/serialize.h index 1dfaf9b713..cc6e2c0ec3 100644 --- a/dlib/serialize.h +++ b/dlib/serialize.h @@ -1561,7 +1561,7 @@ namespace dlib \ try{ serialize(ss.str(),out); } \ catch (serialization_error& e) \ - { throw serialization_error(e.info + "\n while serializing object of type std::default_random_engine"); } \ + { throw serialization_error(e.info + "\n while serializing object of type " + #T); } \ } \ \ inline void deserialize ( \ @@ -1572,7 +1572,7 @@ namespace dlib std::string str; \ try { deserialize(str,in); } \ catch (serialization_error& e) \ - { throw serialization_error(e.info + "\n while deserializing object of type std::default_random_engine"); } \ + { throw serialization_error(e.info + "\n while deserializing object of type " + #T); } \ \ std::stringstream ss(str); \ ss.setf(std::ios_base::dec, std::ios_base::basefield); \ diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index ffb9bd3b15..85e9fca88e 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -61,7 +61,7 @@ namespace // It's possible that the allowed actions differ among states. // In this case all movements are always allowed so we don't need to use state. 
action_type random_action( - const state_type& state + const state_type& /*state*/ ) const { uniform_int_distribution dist(0,num_actions-1); @@ -97,8 +97,8 @@ namespace // This functions gives the rewards, that is, tells the agent how good are its movements double reward( - const state_type &state, - const action_type &action, + const state_type &/*state*/, + const action_type &/*action*/, const state_type &new_state ) const { @@ -146,7 +146,7 @@ namespace const action_type& action ) const { - bool result; + bool result = false; switch(action) { diff --git a/dlib/test/serialize.cpp b/dlib/test/serialize.cpp index f8b3384b98..b951b76a4e 100644 --- a/dlib/test/serialize.cpp +++ b/dlib/test/serialize.cpp @@ -1050,6 +1050,43 @@ namespace } } +// ---------------------------------------------------------------------------------------- + + template + void test_std_generator() + { + T rnd; + + for (int i = 0; i < 10; ++i) + rnd(); + + std::stringstream ss; + const int val1 = 123; + const int val2 = 456; + dlib::serialize(val1, ss); + dlib::serialize(rnd, ss); + dlib::serialize(val2, ss); + + T rnd2; + int val1_read, val2_read; + dlib::deserialize(val1_read, ss); + dlib::deserialize(rnd2, ss); + dlib::deserialize(val2_read, ss); + + DLIB_TEST(val1_read == val1); + DLIB_TEST(val2_read == val2); + + for (int i = 0; i < 100; ++i) + DLIB_TEST(rnd() == rnd2()); + } + + void random_generators() + { + test_std_generator(); + test_std_generator(); + test_std_generator(); + } + // ---------------------------------------------------------------------------------------- class serialize_tester : public tester @@ -1078,6 +1115,7 @@ namespace test_array2d_and_matrix_serialization(); test_strings(); test_std_array(); + random_generators(); } } a;