From 650c1d5edbef3eb3f6fc5d2497c46cca5ff596c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 9 Dec 2017 13:27:33 +0100 Subject: [PATCH 01/14] Added a first version of Qlearning + updated rl interface --- dlib/control/approximate_linear_models.h | 80 +---- .../approximate_linear_models_abstract.h | 125 +------- dlib/control/model_abstract.h | 135 ++++++++ dlib/control/policy.h | 180 +++++++++++ dlib/control/policy_abstract.h | 293 ++++++++++++++++++ dlib/control/qlearning.h | 177 +++++++++++ dlib/control/qlearning_abstract.h | 171 ++++++++++ examples/CMakeLists.txt | 1 + examples/qlearning_sarsa_ex.cpp | 217 +++++++++++++ 9 files changed, 1197 insertions(+), 182 deletions(-) create mode 100644 dlib/control/model_abstract.h create mode 100644 dlib/control/policy.h create mode 100644 dlib/control/policy_abstract.h create mode 100644 dlib/control/qlearning.h create mode 100644 dlib/control/qlearning_abstract.h create mode 100644 examples/qlearning_sarsa_ex.cpp diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 9732d71e90..a0d9c01dcb 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -4,7 +4,6 @@ #define DLIB_APPROXIMATE_LINEAR_MODELS_Hh_ #include "approximate_linear_models_abstract.h" -#include "../matrix.h" namespace dlib { @@ -12,13 +11,13 @@ namespace dlib // ---------------------------------------------------------------------------------------- template < - typename feature_extractor + typename model_type > struct process_sample { - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + typedef typename model_type::reward_type reward_type; process_sample(){} @@ -26,13 +25,13 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const double& r + const reward_type& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - double reward; + reward_type reward; }; template < typename feature_extractor > @@ -53,73 +52,6 @@ namespace dlib deserialize(item.reward, in); } -// ---------------------------------------------------------------------------------------- - - template < - typename feature_extractor - > - class policy - { - public: - - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; - - - policy ( - ) - { - w.set_size(fe.num_features()); - w = 0; - } - - policy ( - const matrix& weights_, - const feature_extractor& fe_ - ) : w(weights_), fe(fe_) {} - - action_type operator() ( - const state_type& state - ) const - { - return fe.find_best_action(state,w); - } - - const feature_extractor& get_feature_extractor ( - ) const { return fe; } - - const matrix& get_weights ( - ) const { return w; } - - - private: - matrix w; - feature_extractor fe; - }; - - template < typename feature_extractor > - inline void serialize(const policy& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.get_feature_extractor(), out); - serialize(item.get_weights(), out); - } - template < typename feature_extractor > - inline void deserialize(policy& item, std::istream& in) - { - int version = 0; - 
deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::policy object."); - feature_extractor fe; - matrix w; - deserialize(fe, in); - deserialize(w, in); - item = policy(w,fe); - } - // ---------------------------------------------------------------------------------------- } diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index 59dac42769..a5dc2a6b13 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -3,20 +3,24 @@ #undef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ #ifdef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ -#include "../matrix.h" +#include "model_abstract.h" namespace dlib { // ---------------------------------------------------------------------------------------- + template < + typename T, + typename U + > struct example_feature_extractor { /*! WHAT THIS OBJECT REPRESENTS This object defines the interface a feature extractor must implement if it - is to be used with the process_sample and policy objects defined at the - bottom of this file. Moreover, it is meant to represent the core part + is to be used with the process_sample and policy objects defined at + policy_abstract.h. Moreover, it is meant to represent the core part of a model used in a reinforcement learning algorithm. In particular, this object models a Q(state,action) function where @@ -24,7 +28,7 @@ namespace dlib where PSI(state,action) is a feature vector and w is a parameter vector. - Therefore, a feature extractor defines how the PSI(x,y) feature vector is + Therefore, a feature extractor defines how the PSI(x,y) feature vector is calculated. It also defines the types used to represent the state and action objects. @@ -35,9 +39,8 @@ namespace dlib functions of this object. !*/ - // The state and actions can be any types so long as you provide typedefs for them. typedef T state_type; - typedef U action_type; + typedef U action_type; // We can also say that the last element in the weight vector w must be 1. This // can be useful for including a prior into your model. const static bool force_last_weight_to_1 = false; @@ -56,20 +59,8 @@ namespace dlib - returns the dimensionality of the PSI() feature vector. !*/ - action_type find_best_action ( - const state_type& state, - const matrix& w - ) const; - /*! - ensures - - returns the action A that maximizes Q(state,A) = dot(w,PSI(state,A)). - That is, this function finds the best action to take in the given state - when our model is parameterized by the given weight vector w. - !*/ - void get_features ( const state_type& state, - const action_type& action, matrix& feats ) const; /*! @@ -83,14 +74,13 @@ namespace dlib // ---------------------------------------------------------------------------------------- template < - typename feature_extractor + typename model_type > struct process_sample { /*! - REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface - defined at the top of this file. + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. WHAT THIS OBJECT REPRESENTS This object holds a training sample for a reinforcement learning algorithm. @@ -99,9 +89,9 @@ namespace dlib receiving this->reward and ending up in the state this->next_state. 
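            As a purely illustrative sketch (my_model is only a placeholder name for
            whatever model type you use), a single observed transition would be
            stored as:
                process_sample<my_model> sample(s, a, s_next, r);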
!*/ - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + typedef typename model_type::reward_type reward_type; process_sample(){} @@ -109,13 +99,13 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const double& r + const reward_type& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - double reward; + reward_type reward; }; template < typename feature_extractor > @@ -128,86 +118,5 @@ namespace dlib // ---------------------------------------------------------------------------------------- - template < - typename feature_extractor - > - class policy - { - /*! - REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface - defined at the top of this file. - - WHAT THIS OBJECT REPRESENTS - This is a policy based on the supplied feature_extractor model. In - particular, it maps from feature_extractor::state_type to the best action - to take in that state. - !*/ - - public: - - typedef feature_extractor feature_extractor_type; - typedef typename feature_extractor::state_type state_type; - typedef typename feature_extractor::action_type action_type; - - - policy ( - ); - /*! - ensures - - #get_feature_extractor() == feature_extractor() - (i.e. it will have its default value) - - #get_weights().size() == #get_feature_extractor().num_features() - - #get_weights() == 0 - !*/ - - policy ( - const matrix& weights, - const feature_extractor& fe - ); - /*! - requires - - fe.num_features() == weights.size() - ensures - - #get_feature_extractor() == fe - - #get_weights() == weights - !*/ - - action_type operator() ( - const state_type& state - ) const; - /*! - ensures - - returns get_feature_extractor().find_best_action(state,w); - !*/ - - const feature_extractor& get_feature_extractor ( - ) const; - /*! - ensures - - returns the feature extractor used by this object - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_feature_extractor().num_features(). - !*/ - - }; - - template < typename feature_extractor > - void serialize(const policy& item, std::ostream& out); - template < typename feature_extractor > - void deserialize(policy& item, std::istream& in); - /*! - provides serialization support. - !*/ - -// ---------------------------------------------------------------------------------------- - - #endif // DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ diff --git a/dlib/control/model_abstract.h b/dlib/control/model_abstract.h new file mode 100644 index 0000000000..dc7bcbce8c --- /dev/null +++ b/dlib/control/model_abstract.h @@ -0,0 +1,135 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_MODEL_ABSTRACT_Hh_ +#ifdef DLIB_MODEL_ABSTRACT_Hh_ + +#include "approximate_linear_models_abstract.h" +#include "../matrix.h" + +namespace dlib +{ + + template < + template typename feature_extractor_type + > + class example_model + { + /*! + REQUIREMENTS ON feature_extractor + feature_extractor should implement the example_feature_extractor interface defined + at approximate_linear_models_abstract.h. 
+ + WHAT THIS OBJECT REPRESENTS + This is an example interface of a model class. This class represents an environment + where an agent will be deployed at. In particular, this class includes information + about the state space, action space and how to represent those states feature-wise. + !*/ + public: + + // You have to define state, action and reward types. + typedef U state_type; + typedef V action_type; + typedef W reward_type; + + // The feature extractor uses the same types as the model. + typedef feature_extractor_type feature_extractor; + + example_model( + ); + /*! + ensures + - #get_feature_extractor() == feature_extractor() + !*/ + + action_type random_action( + const state_type &state + ) const; + /*! + ensures + - returns a random reachable action from state. + !*/ + + action_type find_best_action( + const state_type &state, + const matrix &w + ) const; + /*! + requires + - w.size() == states_size() + ensures + - returns the action that maximizes the product + dot(w, get_feature_extractor().get_features(state)). + !*/ + + const feature_extractor& get_feature_extractor( + ) const; + /*! + ensures + - returns the feature_extractor used by the model. + !*/ + + auto states_size( + ) const -> decltype(get_feature_extractor().num_features()); + /*! + ensures + - returns get_feature_extractor().num_features(). + !*/ + + auto get_features( + const state_type &state + ) const -> decltype(get_feature_extractor().get_features(state)); + /*! + ensures + - returns get_feature_extractor().get_features(state); + !*/ + + state_type initial_state( + ) const; + /*! + ensures + - returns the initial state of the model. + !*/ + + state_type step( + const state_type &state, + const action_type &action + ) const; + /*! + requires + - action is a valid action from state. + ensures + - returns a state that is possible to be in after doing action + from state. + !*/ + + // The new_state parameter is need because the model doesn't have to be deterministic. + // Nonetheless for now I'll suppose that the reward is deterministic. + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const; + /*! + requires + - is possible to be in new_state after doing action from state. + ensures + - returns the reward obtained for reaching new_state from state after + doing action. + !*/ + + + }; + + template < template typename feature_extractor > + void serialize (const example_model& item, std::ostream& out); + template < template typename feature_extractor > + void deserialize (example_model& item, std::istream& in); + /*! + provides serialization support. + !*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif diff --git a/dlib/control/policy.h b/dlib/control/policy.h new file mode 100644 index 0000000000..620f9b4f79 --- /dev/null +++ b/dlib/control/policy.h @@ -0,0 +1,180 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. 
+#ifndef DLIB_POLICY_Hh_ +#define DLIB_POLICY_Hh_ + +#include +#include "../matrix.h" +#include "policy_abstract.h" + +namespace dlib +{ + + template < + typename model_type + > + class greedy_policy + { + public: + + typedef model_type feature_extractor_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + greedy_policy ( + ) + { + w.set_size(model.states_size()); + w = 0; + } + + greedy_policy ( + const matrix& weights_, + const model_type& model_ = model_type() + ) : w(weights_), model(model_) {} + + action_type operator() ( + const state_type& state + ) const + { + return model.find_best_action(state,w); + } + + const model_type& get_model ( + ) const { return model; } + + matrix& get_weights ( + ) { return w; } + + const matrix& get_weights ( + ) const { return w; } + + private: + matrix w; + model_type model; + }; + + template < typename model_type > + inline void serialize(const greedy_policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_model(), out); + serialize(item.get_weights(), out); + } + template < typename model_type > + inline void deserialize(greedy_policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); + model_type model; + matrix w; + deserialize(model, in); + deserialize(w, in); + item = greedy_policy(w,model); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename model_type, + typename generator = std::default_random_engine + > + class epsilon_policy + { + public: + + typedef model_type feature_extractor_type; + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + epsilon_policy ( + double epsilon_, + const generator &gen_ = std::default_random_engine() + ) : epsilon(epsilon_), gen(gen_) + { + w.set_size(model.states_size()); + w = 0; + } + + epsilon_policy ( + double epsilon_, + const matrix& weights_, + const model_type& model_ = model_type(), + const generator gen_ = std::default_random_engine() + ) : w(weights_), model(model_), epsilon(epsilon_), gen(gen_) {} + + action_type operator() ( + const state_type& state + ) const + { + std::bernoulli_distribution d(epsilon); + if(d(gen)){ + // std::cout << "random\n"; + return model.random_action(state); + } + else{ +// std::cout << "best\n"; + return model.find_best_action(state,w); + } + //return d(gen) ? 
model.random_action(state) : model.find_best_action(state,w); + } + + const model_type& get_model ( + ) const { return model; } + + matrix& get_weights ( + ) { return w; } + + const matrix& get_weights ( + ) const { return w; } + + double get_epsilon( + ) const { return epsilon; } + + const generator& get_generator( + ) const { return gen; } + + private: + matrix w; + model_type model; + double epsilon; + + mutable generator gen; + }; + + template < typename model_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_model(), out); + serialize(item.get_weights(), out); + serialize(item.get_epsilon(), out); + serialize(item.get_generator(), out); + } + template < typename model_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); + model_type model; + matrix w; + double epsilon; + generator gen; + deserialize(model, in); + deserialize(w, in); + deserialize(epsilon, in); + deserialize(gen, in); + item = epsilon_policy(w,model, epsilon, gen); + } + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_POLICY_Hh_ diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h new file mode 100644 index 0000000000..41b0e70fd7 --- /dev/null +++ b/dlib/control/policy_abstract.h @@ -0,0 +1,293 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_POLICY_ABSTRACT_Hh_ +#ifdef DLIB_POLICY_ABSTRACT_Hh_ + +#include +#include "../matrix.h" +#include "model_abstract.h" + +namespace dlib +{ + +template < + typename model_type + > +class example_policy +{ + /*! + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This is a policy based on the supplied model_type model. In + particular, it maps from model_type::state_type to a model_type::action + to take in that state. + !*/ + +public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + example_policy ( + ); + /*! + ensures + - #get_model() == model_type() + (i.e. it will have its default value) + - #get_weights().size() == #get_model().states_size() + - #get_weights() == 0 + !*/ + + example_policy ( + const matrix& weights, + const model_type& model + ); + /*! + requires + - model.states_size() == weights.size() + ensures + - #get_model() == model + - #get_weights() == weights + !*/ + + action_type operator() ( + const state_type& state + ) const; + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + +}; + +template < typename model_type > +void serialize(const example_policy& item, std::ostream& out); +template < typename model_type > +void deserialize(example_policy& item, std::istream& in); +/*! 
+ provides serialization support. +!*/ + +// ---------------------------------------------------------------------------------------- + +template < + typename model_type + > +class greedy_policy +{ + /*! + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the policy interface that returns the best action + based on the weights (i.e. it acts in a greedy fashion). + !*/ + +public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + greedy_policy ( + ); + /*! + ensures + - #get_model() == model_type() + (i.e. it will have its default value) + - #get_weights().size() == #get_model().states_size() + - #get_weights() == 0 + !*/ + + greedy_policy ( + const matrix& weights, + const model_type& model + ); + /*! + requires + - model.states_size() == weights.size() + ensures + - #get_model() == model + - #get_weights() == weights + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_model().find_best_action(state, w); + !*/ + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + +}; + +template < typename model_type > +void serialize(const greedy_policy& item, std::ostream& out); +template < typename model_type > +void deserialize(greedy_policy& item, std::istream& in); +/*! + provides serialization support. +!*/ + +// ---------------------------------------------------------------------------------------- + +template < + typename model_type, + typename generator + > +class epsilon_policy +{ + /*! + REQUIREMENTS ON model_type + model_type should implement the interface defined at model_abstract.h. + + REQUIREMENTS ON generator + generator should be a PRNG type like the ones defined in std::random. + + WHAT THIS OBJECT REPRESENTS + This is an implementation of the policy interface that returns the best + action for the given state with probability 1-epsilon while it returns + an doable random action with probability epsilon. + !*/ + +public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + epsilon_policy ( + double epsilon, + const generator &gen = std::default_random_engine() + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_model() == model_type() + (i.e. it will have its default value) + - #get_weights().size() == #get_model().states_size() + - #get_weights() == 0 + - #get_epsilon() == epsilon + !*/ + + epsilon_policy ( + double epsilon, + const matrix& weights, + const model_type& model, + const generator &gen = std::default_random_engine() + ); + /*! + requires + - model.states_size() == weights.size() + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_model() == model + - #get_weights() == weights + - #get_epsilon() == epsilon + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_model().find_best_action(state, w) with probability 1-epsilon + and get_model().random_action(state) with probability epsilon. 
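              As an illustration, the selection performed in policy.h amounts to a
              Bernoulli draw with the stored generator, roughly:
                  std::bernoulli_distribution d(get_epsilon());
                  return d(gen) ? get_model().random_action(state)
                                : get_model().find_best_action(state, get_weights());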
+ !*/ + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the parameter vector (w) associated with this object. The length + of the vector is get_model().states_size(). + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the epsilon value used by the policy. + !*/ + + const generator& get_generator( + ) const; + /*! + ensures + - returns the generator used by the policy. + !*/ + +}; + +template < typename model_type > +void serialize(const epsilon_policy& item, std::ostream& out); +template < typename model_type > +void deserialize(epsilon_policy& item, std::istream& in); +/*! + provides serialization support. +!*/ + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_POLICY_ABSTRACT_Hh_ diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h new file mode 100644 index 0000000000..70a1fb0634 --- /dev/null +++ b/dlib/control/qlearning.h @@ -0,0 +1,177 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_QLEARNING_Hh_ +#define DLIB_QLEARNING_Hh_ + +#include "policy.h" + +namespace dlib +{ + template < + typename model_type + > + class qlearning + { + public: + explicit qlearning( + double lr = 0.2, + double disc = 0.8, + unsigned int miters = 100u, + double eps = 0.1, + bool v = false + ) : max_iterations(miters), verbose(v) { + set_learning_rate(lr); + set_discount(disc); + set_epsilon(eps); + } + + double get_learning_rate( + ) const { return learning_rate; } + + void set_learning_rate( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t qlearning::set_learning_rate(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + learning_rate = value; + } + + double get_discount( + ) const { return discount; } + + void set_discount( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t qlearning::set_discount(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + discount = value; + } + + unsigned int get_max_iterations( + ) const { return max_iterations; } + + void set_max_iterations( + unsigned int iterations + ) { max_iterations = iterations; } + + double get_epsilon( + ) const { return epsilon; } + + void set_epsilon( + double value + ) + { + DLIB_ASSERT(value >= 0. 
&& value <= 1., + "\t qlearning::set_epsilon(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + epsilon = value; + } + + bool is_verbose( + ) const { return verbose; } + + void be_verbose( + ) { verbose = true; } + + void be_quiet( + ) { verbose = false; } + + greedy_policy train( + const matrix &weights + ) const + { + typedef typename model_type::reward_type reward_type; + + epsilon_policy eps_pol(epsilon, weights); + auto& w = eps_pol.get_weights(); + + DLIB_ASSERT(weights.size() == model.states_size(), + "\t qlearning::train(weights)" + "\n\t invalid inputs were given to this function" + "\n\t weights.size: " << weights.size() << + "\n\t features size: " << model.states_size() + ); + + reward_type total_reward = static_cast(0); + for(auto iter = 0u; iter < max_iterations; ++iter){ + auto state = model.initial_state(); + + reward_type reward = static_cast(0); + while(!model.is_final(state)){ + auto action = eps_pol(state); + auto next_state = model.step(state, action); + auto next_reward = model.reward(state, action, next_state); + + const auto feats = model.get_features(state, action); + const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); + + auto prev = w; + + double correction = reward + discount * dot(w, feats_next_best) - dot(w, feats); + //std::cout << "correction " << correction << "\n"; + w += learning_rate * correction * feats; + + /*for(auto i = 0; i < model.states_size(); i++) + std::cout << w(i) << " "; + std::cout << std::endl; + + for(auto i = 0; i < model.states_size(); i++) + std::cout << feats(i) << " "; + std::cout << std::endl; + + + if(verbose && sum(abs(w-prev)) != 0){ + std::cout << "updated:\n"; + for(auto i = 0; i < model.states_size(); i++){ + if(prev(i) != w(i)) + std::cout << "(" << i/5 << "," << i%5 << ") from " << prev(i) << " to " << w(i) << "\n"; + } + } + */ + + state = next_state; + reward += next_reward; + } + + total_reward += reward; + if(verbose) + std::cout << "iteration: " << iter << "\t reward: " << reward + << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + } + + return greedy_policy(w); + } + + greedy_policy train( + ) const + { + matrix weights; + weights = 0; + return train(weights); + } + + private: + double learning_rate; + double discount; + unsigned int max_iterations; + double epsilon; + bool verbose; + + model_type model; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_QLEARNING_Hh_ diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h new file mode 100644 index 0000000000..9fb227d0ba --- /dev/null +++ b/dlib/control/qlearning_abstract.h @@ -0,0 +1,171 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_QLEARNING_ABSTRACT_Hh_ +#ifdef DLIB_QLEARNING_ABSTRACT_Hh_ + +#include "policy_abstract.h" +#include "model_abstract.h" + +namespace dlib +{ + template < + typename model_type + > + class qlearning + { + /*! + REQUIREMENTS ON model_type + model_type is an implementation of the model interface declared in + model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This objects is an implementation of the well-known reinforcement learning + algorithm Q-learning. This algorithms takes a bunch of process_samples + as input and outputs a policy that have learnt from that in order to take + the better results. 
+ + Supposing we are in state s and action a the learning function has the form: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s, a')) + where lr is the learning_rate and disc the discount. + That formula means that it takes a convex combination of the current qvalue + and the expected qvalue. + !*/ + + public: + qlearning( + ); + /*! + ensures + - #get_learning_rate() == 0.2 + - #get_discount() == 0.8 + - #get_max_iterations() == 100 + - #get_epsilon() == 0.1 + - #is not verbose + !*/ + + explicit qlearning( + double learning_rate, + double discount, + unsigned int max_iterations, + double epsilon, + bool verbose + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1 + - discount >= 0 and discount <= 1 + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_learning_rate() == learning_rate + - #get_discount() == discount + - #get_max_iterations() == max_iterations + - #get_epsilon() == epsilon + - #is_verbose() == verbose + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - returns the learning rate applied to the learning function. + !*/ + + void set_learning_rate( + double learning_rate + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1. + ensures + - #get_learning_rate() == learning_rate + !*/ + + double get_discount( + ) const; + /*! + ensures + - returns the discount applied to the learning function. + !*/ + + void set_discount( + double discount + ); + /*! + requires + - discount >= 0 and discount <= 1. + ensures + - #get_discount() == discount + !*/ + + unsigned int get_max_iterations( + ) const; + /*! + ensures + - returns the maximum number of iterations that qlearning will + perform during the training. + !*/ + + void set_max_iterations( + unsigned int iterations + ); + /*! + ensures + - #get_max_iterations() == iterations + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the probability of doing a non-optimal step while training. + !*/ + + void set_epsilon( + double epsilon + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1. + ensures + - #get_epsilon() == epsilon + !*/ + + bool is_verbose( + ) const; + /*! + ensures + - returns if the class is verbose or not. + !*/ + + void be_verbose( + ); + /*! + ensures + - #is_verbose() == true + !*/ + + void be_quiet( + ); + /*! + ensures + - #is_verbose() == false + !*/ + + greedy_policy train( + const matrix &weights + ) const; + /*! + requires + - weights.size() == model_type.states_size() + ensures + - returns a greedy_policy resulting of doing max_iterations iterations + over the model while applying the learning function to the weights + matrix of the policy. 
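                In terms of the feature representation (Q(s,a) == dot(w, PSI(s,a))),
                the learning function above corresponds to the weight update
                    w += learning_rate * (reward + discount * max_a' dot(w, PSI(s',a'))
                                          - dot(w, PSI(s,a))) * PSI(s,a);
                As a usage sketch only (model_type stands for any class implementing
                the interface in model_abstract.h, e.g. the cliff_model defined in
                examples/qlearning_sarsa_ex.cpp):
                    model_type model;
                    qlearning<model_type> trainer;
                    trainer.be_verbose();
                    trainer.set_max_iterations(100);
                    auto policy = trainer.train();        // starts from all-zero weights
                    auto action = policy(model.initial_state());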
+ !*/ + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_QLEARNING_ABSTRACT_Hh_ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 34e46ca048..c95c4c2c89 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -154,6 +154,7 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER) add_example(dnn_semantic_segmentation_train_ex) add_example(dnn_instance_segmentation_train_ex) add_example(dnn_metric_learning_on_images_ex) + add_example(qlearning_sarsa_ex) endif() diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp new file mode 100644 index 0000000000..44f8a13d34 --- /dev/null +++ b/examples/qlearning_sarsa_ex.cpp @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include + +using namespace dlib; + +template < + typename state_type, + typename action_type + > +class feature_extractor +{ +public: + feature_extractor( + unsigned int h, + unsigned int w, + unsigned int na + ) : height(h), width(w), num_actions(na) {} + + + inline long num_features( + ) const { return num_actions * height * width; } + + matrix get_features( + const state_type &state, + const action_type &action + ) const + { + matrix feats(num_features()); + feats = 0; + //for(auto i = 0u; i < num_actions; i++) + // feats(num_actions * state + i) = 1; + feats(num_actions*state + static_cast(action)) = 1; + + return feats; + } + +private: + int height, width, num_actions; +}; + +template < + int height, + int width, + template class feature_extractor_type + > +class cliff_model +{ +public: + enum class actions {up = 0, right, down, left}; + constexpr static double EPS = 1e-16; + + typedef int state_type; + typedef actions action_type; + typedef int reward_type; + + typedef feature_extractor_type feature_extractor; + + explicit cliff_model( + ) : fe(height, width, 4){} + + action_type random_action( + const state_type& state + ) const + { + std::uniform_int_distribution dist(0,3); + return static_cast(dist(gen)); + } + + action_type find_best_action( + const state_type& state, + const matrix& w + ) const + { + auto best = std::numeric_limits::lowest(); + auto best_indexes = std::vector(); + + for(auto i = 0; i < 4; i++){ + auto feats = get_features(state, static_cast(i)); + auto product = dot(w, feats); + if(product > best){ + best = product; + best_indexes.clear(); + } + + if(std::abs(product - best) < EPS) + best_indexes.push_back(i); + } + + std::uniform_int_distribution dist(0, best_indexes.size()-1); + return static_cast(best_indexes[dist(gen)]); + } + + const feature_extractor& get_feature_extractor( + ) const { return fe; } + + auto states_size( + ) const -> decltype(get_feature_extractor().num_features()) + { + return get_feature_extractor().num_features(); + } + + auto get_features( + const state_type &state, + const action_type &action + ) const -> decltype(get_feature_extractor().get_features(state, action)) + { return get_feature_extractor().get_features(state, action); } + + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const + { + return !is_final(new_state) ? -1 : is_success(new_state) ? 100 : -100; + } + + state_type initial_state( + ) const { return static_cast((height-1) * width); } + + state_type step( + const state_type& state, + const action_type& action + ) const + { + if(out_of_bounds(state, action)) + return state; + + return action == actions::up ? state - width : + action == actions::down ? 
state + width : + action == actions::right ? state + 1 : + state - 1 ; + } + + bool is_success( + const state_type &state + ) const { return state == height*width - 1; } + + bool is_failure( + const state_type &state + ) const { return state/width == height-1 && state%width > 0 && state%width < width-1;} + + bool is_final( + const state_type& state + ) const { return is_success(state) || is_failure(state); } + +private: + bool out_of_bounds( + const state_type& state, + const action_type& action + ) const + { + bool result; + + switch(action){ + case actions::up: + result = state / width == 0; + break; + case actions::down: + result = (state / width == height-2 && state % width > 0 && state % width < width-1) + || state / width == height-1; + break; + case actions::left: + result = state % width == 0; // || state == height*width-1; <- is the goal condition + break; + case actions::right: + result = state % width == width-1 || state == (height-1)*width; + break; + } + + return result; + } + + feature_extractor fe; + mutable std::default_random_engine gen; +}; + +#include +int main(int argc, char** argv) +{ + std::cout << "Hello." << std::endl; + + const auto height = 3u; + const auto width = 5u; + + typedef cliff_model model_type; + + model_type model; + qlearning algorithm; + algorithm.be_verbose(); + algorithm.set_max_iterations(100); + + auto policy = algorithm.train(); + + auto s = model.initial_state(); + int r = 0; //TODO + for(auto i = 0u; i < 100 && !model.is_final(s); i++){ + auto a = policy(s); + auto new_s = model.step(s, a); + r += model.reward(s,a,new_s); + s = new_s; + } + + if(!model.is_final(s)) + std::cout << "Nothing reached after 100 steps." << std::endl; + else if(model.is_failure(s)) + std::cout << "Failed." << std::endl; + else + std::cout << "Success." << std::endl; + + std::cout << "Good bye." << std::endl; + + return 0; +} From f0bd6a889994b339895b140dad660274a4f2ce1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 9 Dec 2017 19:02:01 +0100 Subject: [PATCH 02/14] Added Sarsa and an example. Everything is working alright. --- dlib/control.h | 9 +- .../approximate_linear_models_abstract.h | 11 +- dlib/control/model_abstract.h | 70 +++++-- dlib/control/policy.h | 84 ++++---- dlib/control/policy_abstract.h | 80 ++++--- dlib/control/qlearning.h | 74 +++---- dlib/control/qlearning_abstract.h | 48 +++-- dlib/control/sarsa.h | 164 +++++++++++++++ dlib/control/sarsa_abstract.h | 195 ++++++++++++++++++ examples/qlearning_sarsa_ex.cpp | 167 ++++++++++----- 10 files changed, 671 insertions(+), 231 deletions(-) create mode 100644 dlib/control/sarsa.h create mode 100644 dlib/control/sarsa_abstract.h diff --git a/dlib/control.h b/dlib/control.h index 85d00817d5..9c0ec80781 100644 --- a/dlib/control.h +++ b/dlib/control.h @@ -1,11 +1,14 @@ // Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. 
-#ifndef DLIB_CONTRoL_ -#define DLIB_CONTRoL_ +#ifndef DLIB_CONTROL_ +#define DLIB_CONTROL_ +#include "control/policy.h" #include "control/lspi.h" #include "control/mpc.h" +#include "control/qlearning.h" +#include "control/sarsa.h" -#endif // DLIB_CONTRoL_ +#endif // DLIB_CONTROL_ diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index a5dc2a6b13..74f99da4ab 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -59,14 +59,15 @@ namespace dlib - returns the dimensionality of the PSI() feature vector. !*/ - void get_features ( - const state_type& state, - matrix& feats + matrix get_features ( + const state_type &state, + const action_type &action ) const; /*! + requires + - action is a valid option from state. ensures - - #feats.size() == num_features() - - #feats == PSI(state,action) + - returns PSI(state,action) !*/ }; diff --git a/dlib/control/model_abstract.h b/dlib/control/model_abstract.h index dc7bcbce8c..8abb0f326e 100644 --- a/dlib/control/model_abstract.h +++ b/dlib/control/model_abstract.h @@ -10,7 +10,7 @@ namespace dlib { template < - template typename feature_extractor_type + template class feature_extractor_type > class example_model { @@ -21,8 +21,13 @@ namespace dlib WHAT THIS OBJECT REPRESENTS This is an example interface of a model class. This class represents an environment - where an agent will be deployed at. In particular, this class includes information - about the state space, action space and how to represent those states feature-wise. + where an agent will be deployed at. In other words, it is an interface between the + simulated/real world and the agent that has to be there. In short this class: + - Holds information about the state, action and reward space. + - Delegates the state representation to the feature_extractor. + - Provides an initial state to start the agent. + - Offers an interface to move in the world (look for actions, make steps in it + and get a feedback/reward for them). !*/ public: @@ -76,11 +81,29 @@ namespace dlib !*/ auto get_features( - const state_type &state - ) const -> decltype(get_feature_extractor().get_features(state)); + const state_type &state, + const action_type &action + ) const -> decltype(get_feature_extractor().get_features(state, action)); + /*! + ensures + - returns get_feature_extractor().get_features(state, action); + !*/ + + // The new_state parameter is needed because the model doesn't have to be deterministic. + // Nonetheless for now we will suppose that the rewards are deterministic. + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const; /*! + requires + - action is available in state. + - new_state is a possible outcome when you do action on state. ensures - - returns get_feature_extractor().get_features(state); + - returns the reward obtained by going to new_state from state + doing action. + - the function is deterministic with respect to its arguments. !*/ state_type initial_state( @@ -102,27 +125,38 @@ namespace dlib from state. !*/ - // The new_state parameter is need because the model doesn't have to be deterministic. - // Nonetheless for now I'll suppose that the reward is deterministic. - reward_type reward( - const state_type &state, - const action_type &action, - const state_type &new_state + bool is_success( + const state_type &state ) const; /*! 
- requires - - is possible to be in new_state after doing action from state. ensures - - returns the reward obtained for reaching new_state from state after - doing action. + - returns whether state is a goal state (the agent has done its task properly). + !*/ + + bool is_failure( + const state_type &state + ) const; + /*! + ensures + - returns whether state is a failure state, i.e., a state where the agent has + failed his task. + !*/ + + bool is_final( + const state_type& state + ) const; + /*! + ensures + - returns whether state is a final state, i.e., it is a state where the agent can't + advance anymore. In another words, whether state is a success or failure state. !*/ }; - template < template typename feature_extractor > + template < template class feature_extractor > void serialize (const example_model& item, std::ostream& out); - template < template typename feature_extractor > + template < template class feature_extractor > void deserialize (example_model& item, std::istream& in); /*! provides serialization support. diff --git a/dlib/control/policy.h b/dlib/control/policy.h index 620f9b4f79..5ad8c58322 100644 --- a/dlib/control/policy.h +++ b/dlib/control/policy.h @@ -3,9 +3,10 @@ #ifndef DLIB_POLICY_Hh_ #define DLIB_POLICY_Hh_ -#include #include "../matrix.h" #include "policy_abstract.h" +#include +#include namespace dlib { @@ -22,15 +23,16 @@ namespace dlib typedef typename model_type::action_type action_type; greedy_policy ( - ) + const model_type &model_ + ) : model(model_) { w.set_size(model.states_size()); w = 0; } greedy_policy ( - const matrix& weights_, - const model_type& model_ = model_type() + const model_type &model_, + const matrix& weights_ ) : w(weights_), model(model_) {} action_type operator() ( @@ -51,7 +53,7 @@ namespace dlib private: matrix w; - model_type model; + const model_type &model; }; template < typename model_type > @@ -79,57 +81,40 @@ namespace dlib // ---------------------------------------------------------------------------------------- template < - typename model_type, + typename policy_type, typename generator = std::default_random_engine > class epsilon_policy { public: - - typedef model_type feature_extractor_type; - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; epsilon_policy ( double epsilon_, - const generator &gen_ = std::default_random_engine() - ) : epsilon(epsilon_), gen(gen_) - { - w.set_size(model.states_size()); - w = 0; - } - - epsilon_policy ( - double epsilon_, - const matrix& weights_, - const model_type& model_ = model_type(), - const generator gen_ = std::default_random_engine() - ) : w(weights_), model(model_), epsilon(epsilon_), gen(gen_) {} + const policy_type &policy_, + const generator &gen_ = generator() + ) : policy(policy_), epsilon(epsilon_), gen(gen_) {} action_type operator() ( const state_type& state ) const { std::bernoulli_distribution d(epsilon); - if(d(gen)){ - // std::cout << "random\n"; - return model.random_action(state); - } - else{ -// std::cout << "best\n"; - return model.find_best_action(state,w); - } - //return d(gen) ? model.random_action(state) : model.find_best_action(state,w); + return d(gen) ? 
get_model().random_action(state) : policy(state); } - const model_type& get_model ( - ) const { return model; } + policy_type get_policy( + ) const { return policy; } + + auto get_model ( + ) const -> decltype(get_policy().get_model()) { return policy.get_model(); } matrix& get_weights ( - ) { return w; } + ) { return policy.get_weights(); } const matrix& get_weights ( - ) const { return w; } + ) const { return policy.get_weights(); } double get_epsilon( ) const { return epsilon; } @@ -138,43 +123,44 @@ namespace dlib ) const { return gen; } private: - matrix w; - model_type model; + policy_type policy; double epsilon; mutable generator gen; }; - template < typename model_type, typename generator > - inline void serialize(const epsilon_policy& item, std::ostream& out) + template < typename policy_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out) { int version = 1; serialize(version, out); - serialize(item.get_model(), out); - serialize(item.get_weights(), out); + serialize(item.get_policy(), out); serialize(item.get_epsilon(), out); serialize(item.get_generator(), out); } - template < typename model_type, typename generator > - inline void deserialize(epsilon_policy& item, std::istream& in) + + template < typename policy_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in) { int version = 0; deserialize(version, in); if (version != 1) throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); - model_type model; - matrix w; + + policy_type policy; double epsilon; generator gen; - deserialize(model, in); - deserialize(w, in); + deserialize(policy, in); deserialize(epsilon, in); deserialize(gen, in); - item = epsilon_policy(w,model, epsilon, gen); + item = epsilon_policy(epsilon, policy, gen); } // ---------------------------------------------------------------------------------------- + // For backward compability with lspi + template < typename model_type > + using policy = greedy_policy; //template aliasing is possible post C++11 } #endif // DLIB_POLICY_Hh_ diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h index 41b0e70fd7..991bf96296 100644 --- a/dlib/control/policy_abstract.h +++ b/dlib/control/policy_abstract.h @@ -3,7 +3,6 @@ #undef DLIB_POLICY_ABSTRACT_Hh_ #ifdef DLIB_POLICY_ABSTRACT_Hh_ -#include #include "../matrix.h" #include "model_abstract.h" @@ -31,18 +30,18 @@ class example_policy typedef typename model_type::action_type action_type; example_policy ( + const model_type &model ); /*! ensures - - #get_model() == model_type() - (i.e. it will have its default value) + - #get_model() == model - #get_weights().size() == #get_model().states_size() - #get_weights() == 0 !*/ example_policy ( - const matrix& weights, - const model_type& model + const model_type& model, + const matrix& weights ); /*! requires @@ -111,18 +110,18 @@ class greedy_policy typedef typename model_type::action_type action_type; greedy_policy ( + const model_type &model ); /*! ensures - - #get_model() == model_type() - (i.e. it will have its default value) + - #get_model() == model - #get_weights().size() == #get_model().states_size() - #get_weights() == 0 !*/ greedy_policy ( - const matrix& weights, - const model_type& model + const model_type& model, + const matrix& weights ); /*! 
requires @@ -176,58 +175,42 @@ void deserialize(greedy_policy& item, std::istream& in); // ---------------------------------------------------------------------------------------- template < - typename model_type, + typename policy_type, typename generator > class epsilon_policy { /*! - REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. + REQUIREMENTS ON policy_type + policy_type should implement the example_policy interface defined at the + top of this file. REQUIREMENTS ON generator generator should be a PRNG type like the ones defined in std::random. WHAT THIS OBJECT REPRESENTS - This is an implementation of the policy interface that returns the best - action for the given state with probability 1-epsilon while it returns - an doable random action with probability epsilon. + This is a special policy that returns the best action (according to the + underlying policy) for the given state with probability 1-epsilon + while it returns a valid random action with probability epsilon. !*/ public: - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - epsilon_policy ( - double epsilon, - const generator &gen = std::default_random_engine() - ); - /*! - requires - - epsilon >= 0 and epsilon <= 1 - ensures - - #get_model() == model_type() - (i.e. it will have its default value) - - #get_weights().size() == #get_model().states_size() - - #get_weights() == 0 - - #get_epsilon() == epsilon - !*/ + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; epsilon_policy ( double epsilon, - const matrix& weights, - const model_type& model, - const generator &gen = std::default_random_engine() + const policy_type &policy, + const generator &gen = generator() ); /*! requires - - model.states_size() == weights.size() - epsilon >= 0 and epsilon <= 1 ensures - - #get_model() == model - - #get_weights() == weights - #get_epsilon() == epsilon + - #get_policy() == policy + - #get_generator() == gen !*/ action_type operator() ( @@ -235,15 +218,22 @@ class epsilon_policy ) const; /*! ensures - - returns get_model().find_best_action(state, w) with probability 1-epsilon + - returns get_policy()(state, w) with probability 1-epsilon and get_model().random_action(state) with probability epsilon. !*/ - const model_type& get_model ( + policy_type get_policy( ) const; /*! ensures - - returns the model used by this object + - returns the underlying policy used by the object. + !*/ + + auto get_model ( + ) const -> decltype(get_policy().get_model()); + /*! + ensures + - returns the model used by the underlying policy. !*/ matrix& get_weights ( @@ -278,10 +268,10 @@ class epsilon_policy }; -template < typename model_type > -void serialize(const epsilon_policy& item, std::ostream& out); -template < typename model_type > -void deserialize(epsilon_policy& item, std::istream& in); +template < typename policy_type, typename generator > +inline void serialize(const epsilon_policy& item, std::ostream& out); +template < typename policy_type, typename generator > +inline void deserialize(epsilon_policy& item, std::istream& in); /*! provides serialization support. 
!*/ diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 70a1fb0634..56e944ec74 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -4,12 +4,11 @@ #define DLIB_QLEARNING_Hh_ #include "policy.h" +#include +#include namespace dlib { - template < - typename model_type - > class qlearning { public: @@ -19,7 +18,7 @@ namespace dlib unsigned int miters = 100u, double eps = 0.1, bool v = false - ) : max_iterations(miters), verbose(v) { + ) : iterations(miters), verbose(v) { set_learning_rate(lr); set_discount(disc); set_epsilon(eps); @@ -55,12 +54,12 @@ namespace dlib discount = value; } - unsigned int get_max_iterations( - ) const { return max_iterations; } + unsigned int get_iterations( + ) const { return iterations; } - void set_max_iterations( + void set_iterations( unsigned int iterations - ) { max_iterations = iterations; } + ) { iterations = iterations; } double get_epsilon( ) const { return epsilon; } @@ -86,13 +85,20 @@ namespace dlib void be_quiet( ) { verbose = false; } - greedy_policy train( - const matrix &weights + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy ) const { - typedef typename model_type::reward_type reward_type; + typedef typename std::decay::type::reward_type reward_type; + + if(verbose) + std::cout << "Starting training..." << std::endl; - epsilon_policy eps_pol(epsilon, weights); + const auto &model = policy.get_model(); + epsilon_policy eps_pol(epsilon, policy); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), @@ -103,7 +109,8 @@ namespace dlib ); reward_type total_reward = static_cast(0); - for(auto iter = 0u; iter < max_iterations; ++iter){ + std::cout << "iterations: " << iterations << std::endl; + for(auto iter = 0u; iter < iterations; ++iter){ auto state = model.initial_state(); reward_type reward = static_cast(0); @@ -115,30 +122,9 @@ namespace dlib const auto feats = model.get_features(state, action); const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); - auto prev = w; - double correction = reward + discount * dot(w, feats_next_best) - dot(w, feats); - //std::cout << "correction " << correction << "\n"; w += learning_rate * correction * feats; - /*for(auto i = 0; i < model.states_size(); i++) - std::cout << w(i) << " "; - std::cout << std::endl; - - for(auto i = 0; i < model.states_size(); i++) - std::cout << feats(i) << " "; - std::cout << std::endl; - - - if(verbose && sum(abs(w-prev)) != 0){ - std::cout << "updated:\n"; - for(auto i = 0; i < model.states_size(); i++){ - if(prev(i) != w(i)) - std::cout << "(" << i/5 << "," << i%5 << ") from " << prev(i) << " to " << w(i) << "\n"; - } - } - */ - state = next_state; reward += next_reward; } @@ -149,25 +135,25 @@ namespace dlib << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; } - return greedy_policy(w); + if(verbose) + std::cout << "Training finished." 
<< std::endl; + + return eps_pol.get_policy(); } + template < + typename model_type + > greedy_policy train( - ) const - { - matrix weights; - weights = 0; - return train(weights); - } + const model_type &model + ) const { return train_policy(greedy_policy(model)); } private: double learning_rate; double discount; - unsigned int max_iterations; + unsigned int iterations; double epsilon; bool verbose; - - model_type model; }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index 9fb227d0ba..ccac305890 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -24,11 +24,15 @@ namespace dlib as input and outputs a policy that have learnt from that in order to take the better results. - Supposing we are in state s and action a the learning function has the form: - Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s, a')) + Supposing we are in state s and action a and we are going to a new state s' + the learning function has the form: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s', a')) where lr is the learning_rate and disc the discount. That formula means that it takes a convex combination of the current qvalue and the expected qvalue. + + Note that it is an off-policy reinforcement learning algorithm meaning + that it doesn't take the policy into account while learning. !*/ public: @@ -38,7 +42,7 @@ namespace dlib ensures - #get_learning_rate() == 0.2 - #get_discount() == 0.8 - - #get_max_iterations() == 100 + - #get_iterations() == 100 - #get_epsilon() == 0.1 - #is not verbose !*/ @@ -46,7 +50,7 @@ namespace dlib explicit qlearning( double learning_rate, double discount, - unsigned int max_iterations, + unsigned int iterations, double epsilon, bool verbose ); @@ -58,7 +62,7 @@ namespace dlib ensures - #get_learning_rate() == learning_rate - #get_discount() == discount - - #get_max_iterations() == max_iterations + - #get_iterations() == iterations - #get_epsilon() == epsilon - #is_verbose() == verbose !*/ @@ -97,7 +101,7 @@ namespace dlib - #get_discount() == discount !*/ - unsigned int get_max_iterations( + unsigned int get_iterations( ) const; /*! ensures @@ -105,12 +109,12 @@ namespace dlib perform during the training. !*/ - void set_max_iterations( + void set_iterations( unsigned int iterations ); /*! ensures - - #get_max_iterations() == iterations + - #get_iterations() == iterations !*/ double get_epsilon( @@ -151,16 +155,34 @@ namespace dlib - #is_verbose() == false !*/ + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy + ) const; + /*! + requires + - policy is of the form example_policy, i.e., an instance of + an implementation of the policy interface defined in policy_abstract.h. + ensures + - returns a policy of the type policy_type as the result of applying the + qlearning learning function over iterations runs over using the weight + matrix of the argument as the initial weights. + !*/ + + template < + typename model_type + > greedy_policy train( - const matrix &weights + const model_type &model ) const; /*! requires - - weights.size() == model_type.states_size() + - model_type is an implementation of the example_model interface defined + at model_abstract.h. ensures - - returns a greedy_policy resulting of doing max_iterations iterations - over the model while applying the learning function to the weights - matrix of the policy. 
+ - returns train_policy(greedy_policy(model)); !*/ }; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h new file mode 100644 index 0000000000..bcb978f7fa --- /dev/null +++ b/dlib/control/sarsa.h @@ -0,0 +1,164 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#ifndef DLIB_SARSA_Hh_ +#define DLIB_SARSA_Hh_ + +#include "policy.h" +#include +#include + +namespace dlib +{ + class sarsa + { + public: + explicit sarsa( + double lr = 0.2, + double disc = 0.8, + unsigned int miters = 100u, + double eps = 0.1, + bool v = false + ) : iterations(miters), verbose(v) { + set_learning_rate(lr); + set_discount(disc); + set_epsilon(eps); + } + + double get_learning_rate( + ) const { return learning_rate; } + + void set_learning_rate( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t sarsa::set_learning_rate(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + learning_rate = value; + } + + double get_discount( + ) const { return discount; } + + void set_discount( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t sarsa::set_discount(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + discount = value; + } + + unsigned int get_iterations( + ) const { return iterations; } + + void set_iterations( + unsigned int iterations + ) { iterations = iterations; } + + double get_epsilon( + ) const { return epsilon; } + + void set_epsilon( + double value + ) + { + DLIB_ASSERT(value >= 0. && value <= 1., + "\t sarsa::set_epsilon(value)" + "\n\t invalid inputs were given to this function" + "\n\t value: " << value + ); + epsilon = value; + } + + bool is_verbose( + ) const { return verbose; } + + void be_verbose( + ) { verbose = true; } + + void be_quiet( + ) { verbose = false; } + + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy + ) const + { + typedef typename std::decay::type::reward_type reward_type; + + if(verbose) + std::cout << "Starting training..." << std::endl; + + const auto &model = policy.get_model(); + epsilon_policy eps_pol(epsilon, policy); + auto& w = eps_pol.get_weights(); + + DLIB_ASSERT(weights.size() == model.states_size(), + "\t sarsa::train(weights)" + "\n\t invalid inputs were given to this function" + "\n\t weights.size: " << weights.size() << + "\n\t features size: " << model.states_size() + ); + + reward_type total_reward = static_cast(0); + for(auto iter = 0u; iter < iterations; ++iter){ + auto state = model.initial_state(); + auto action = eps_pol(state); + + reward_type reward = static_cast(0); + while(!model.is_final(state)){ + auto next_state = model.step(state, action); + auto next_action = eps_pol(next_state); + auto next_reward = model.reward(state, action, next_state); + + const auto feats = model.get_features(state, action); + const auto feats_next = model.get_features(next_state, next_action); + + double correction = reward + discount * dot(w, feats_next) - dot(w, feats); + w += learning_rate * correction * feats; + + state = next_state; + action = next_action; + reward += next_reward; + } + + total_reward += reward; + if(verbose) + std::cout << "iteration: " << iter << "\t reward: " << reward + << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + } + + if(verbose) + std::cout << "Training finished." 
<< std::endl; + + return eps_pol.get_policy(); } + + template < + typename model_type + > + greedy_policy train( + const model_type &model + ) const { return train_policy(greedy_policy(model)); } + + private: + double learning_rate; + double discount; + unsigned int iterations; + double epsilon; + bool verbose; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_SARSA_Hh_ diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h new file mode 100644 index 0000000000..f4d559a8d9 --- /dev/null +++ b/dlib/control/sarsa_abstract.h @@ -0,0 +1,195 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. +#undef DLIB_SARSA_ABSTRACT_Hh_ +#ifdef DLIB_SARSA_ABSTRACT_Hh_ + +#include "policy_abstract.h" +#include "model_abstract.h" + +namespace dlib +{ + template < + typename model_type + > + class sarsa + { + /*! + REQUIREMENTS ON model_type + model_type is an implementation of the model interface declared in + model_abstract.h. + + WHAT THIS OBJECT REPRESENTS + This object is an implementation of the well-known reinforcement learning + algorithm SARSA. This algorithm takes a bunch of process_samples + as input and outputs a policy that has learnt from them in order to obtain + better results. + + Supposing we are in state s, taking action a and moving to a new state s', + the learning function has the form: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a')) + where lr is the learning_rate, disc the discount and a' is the next action + the algorithm will perform after reaching s'. + That formula means that it takes a convex combination of the current qvalue + and the expected qvalue. + + Note that, unlike qlearning, sarsa is an on-policy reinforcement learning + algorithm meaning that it takes the policy into account while learning. + !*/ + + public: + sarsa( + ); + /*! + ensures + - #get_learning_rate() == 0.2 + - #get_discount() == 0.8 + - #get_iterations() == 100 + - #get_epsilon() == 0.1 + - #is not verbose + !*/ + + explicit sarsa( + double learning_rate, + double discount, + unsigned int iterations, + double epsilon, + bool verbose + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1 + - discount >= 0 and discount <= 1 + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_learning_rate() == learning_rate + - #get_discount() == discount + - #get_iterations() == iterations + - #get_epsilon() == epsilon + - #is_verbose() == verbose + !*/ + + double get_learning_rate( + ) const; + /*! + ensures + - returns the learning rate applied to the learning function. + !*/ + + void set_learning_rate( + double learning_rate + ); + /*! + requires + - learning_rate >= 0 and learning_rate <= 1. + ensures + - #get_learning_rate() == learning_rate + !*/ + + double get_discount( + ) const; + /*! + ensures + - returns the discount applied to the learning function. + !*/ + + void set_discount( + double discount + ); + /*! + requires + - discount >= 0 and discount <= 1. + ensures + - #get_discount() == discount + !*/ + + unsigned int get_iterations( + ) const; + /*! + ensures + - returns the maximum number of iterations that sarsa will + perform during training. + !*/ + + void set_iterations( + unsigned int iterations + ); + /*! + ensures + - #get_iterations() == iterations + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the probability of doing a non-optimal step while training.
+ !*/ + + void set_epsilon( + double epsilon + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1. + ensures + - #get_epsilon() == epsilon + !*/ + + bool is_verbose( + ) const; + /*! + ensures + - returns if the class is verbose or not. + !*/ + + void be_verbose( + ); + /*! + ensures + - #is_verbose() == true + !*/ + + void be_quiet( + ); + /*! + ensures + - #is_verbose() == false + !*/ + + template < + typename policy_type + > + policy_type train_policy( + const policy_type &policy + ) const; + /*! + requires + - policy is of the form example_policy, i.e., an instance of + an implementation of the policy interface defined in policy_abstract.h. + ensures + - returns a policy of the type policy_type as the result of applying the + sarsa learning function over iterations runs over using the weight + matrix of the argument as the initial weights. + !*/ + + template < + typename model_type + > + greedy_policy train( + const model_type &model + ) const; + /*! + requires + - model_type is an implementation of the example_model interface defined + at model_abstract.h. + ensures + - returns train_policy(greedy_policy(model)); + !*/ + }; + }; + +// ---------------------------------------------------------------------------------------- + +} + +#endif // DLIB_SARSA_ABSTRACT_Hh_ diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 44f8a13d34..612eb6c54a 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -1,5 +1,7 @@ -#include +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. #include +#include #include #include #include @@ -7,41 +9,18 @@ using namespace dlib; -template < - typename state_type, - typename action_type - > -class feature_extractor -{ -public: - feature_extractor( - unsigned int h, - unsigned int w, - unsigned int na - ) : height(h), width(w), num_actions(na) {} - - - inline long num_features( - ) const { return num_actions * height * width; } - - matrix get_features( - const state_type &state, - const action_type &action - ) const - { - matrix feats(num_features()); - feats = 0; - //for(auto i = 0u; i < num_actions; i++) - // feats(num_actions * state + i) = 1; - feats(num_actions*state + static_cast(action)) = 1; - - return feats; - } - -private: - int height, width, num_actions; -}; - +// This is the model the agent is going to work with. In particular this class +// represents the a grid with height rows and width cols where of the form +// .......... +// .......... +// IFFFFFFFFG +// where: - F are pits cells (if the agent falls there it fails) +// - I is the initial cell +// - G is the goal cell (the agent goal is to reach that spot) +// - . are free cell where the agent can go. +// the rewards are: -100 for reaching F, 100 for reaching G and -1 for the rest. +// it doesn't allow to go out of bounds, instead the agent will stay in the same cell +// (like if there was a wall there). 
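To make the grid description above concrete before the class itself, here is a minimal sketch (assuming the same row-major state indexing that cliff_model uses below; the helper name is made up for illustration and is not part of the patch) of how a state index maps to a cell and to the reward scheme just described:

    // Sketch only: state s denotes the cell (s / width, s % width), filled row by row.
    inline int example_cell_reward(int s, int height, int width)
    {
        const bool goal = (s == height*width - 1);                     // G, bottom-right corner
        const bool pit  = (s / width == height - 1)                    // bottom row...
                       && (s % width > 0) && (s % width < width - 1);  // ...between I and G
        return goal ? 100 : pit ? -100 : -1;                           // -1 for any other step
    }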
template < int height, int width, @@ -50,9 +29,13 @@ template < class cliff_model { public: + // constants and actions allowed enum class actions {up = 0, right, down, left}; constexpr static double EPS = 1e-16; + constexpr static int HEIGHT = height; + constexpr static int WIDTH = width; + // model types typedef int state_type; typedef actions action_type; typedef int reward_type; @@ -63,7 +46,7 @@ class cliff_model ) : fe(height, width, 4){} action_type random_action( - const state_type& state + const state_type& state // since all movements are always allowed we don't use state ) const { std::uniform_int_distribution dist(0,3); @@ -75,21 +58,23 @@ class cliff_model const matrix& w ) const { + // it looks for the best actions in state according to w auto best = std::numeric_limits::lowest(); auto best_indexes = std::vector(); for(auto i = 0; i < 4; i++){ auto feats = get_features(state, static_cast(i)); auto product = dot(w, feats); + if(product > best){ best = product; best_indexes.clear(); } - if(std::abs(product - best) < EPS) best_indexes.push_back(i); } + // returns a random action between the best ones. std::uniform_int_distribution dist(0, best_indexes.size()-1); return static_cast(best_indexes[dist(gen)]); } @@ -175,41 +160,115 @@ class cliff_model } feature_extractor fe; - mutable std::default_random_engine gen; + mutable std::default_random_engine gen; //mutable because it doesn't changes the model state }; -#include -int main(int argc, char** argv) +// This class is the feature representation of cliff_model states. +// It's just a basic one-shot representation where the feature vector for a point (a,b) doing action c +// is a zero vector of size width*height*num_actions with just a one on (a*width + b)*num_actions + c +template < + typename state_type, + typename action_type + > +class feature_extractor { - std::cout << "Hello." << std::endl; +public: + feature_extractor( + int h, + int w, + int na + ) : height(h), width(w), num_actions(na) {} - const auto height = 3u; - const auto width = 5u; + inline long num_features( + ) const { return num_actions * height * width; } - typedef cliff_model model_type; + matrix get_features( + const state_type &state, + const action_type &action + ) const + { + matrix feats(num_features()); + feats = 0; + //for(auto i = 0u; i < num_actions; i++) + // feats(num_actions * state + i) = 1; + feats(num_actions*state + static_cast(action)) = 1; - model_type model; - qlearning algorithm; - algorithm.be_verbose(); - algorithm.set_max_iterations(100); + return feats; + } - auto policy = algorithm.train(); +private: + int height, width, num_actions; +}; +// Just a helper function to pretty print the state of the agent. +template < + typename model_t + > +void print(std::ostream &os, const model_t &model, const typename model_t::state_type &state) +{ + for(auto i = 0; i < model_t::HEIGHT; i++){ + for(auto j = 0; j < model_t::WIDTH; j++){ + typename model_t::state_type s = model_t::WIDTH * i + j; + os << ( s == state ? 'X' : model.is_success(s) ? 'G' : model.is_failure(s) ? 'F' : '.'); + } + os << std::endl; + } + os << std::endl; +} + +// The function that runs the agent +template < + typename model_t, + typename algorithm_t // qlearning or sarsa + > +void run_example(const model_t &model, algorithm_t &&algorithm) +{ + algorithm.be_verbose(); // uncomment it if you want to see training info. + auto policy = algorithm.train(model); + + std::cout << "Starting final simulation..." 
<< std::endl; auto s = model.initial_state(); - int r = 0; //TODO - for(auto i = 0u; i < 100 && !model.is_final(s); i++){ + auto r = static_cast(0); + int i; + + for(i = 0; i < 100 && !model.is_final(s); i++){ + print(std::cout, model, s); + auto a = policy(s); auto new_s = model.step(s, a); r += model.reward(s,a,new_s); s = new_s; } + print(std::cout, model, s); + std::cout << "Simulation finished." << std::endl; if(!model.is_final(s)) std::cout << "Nothing reached after 100 steps." << std::endl; else if(model.is_failure(s)) - std::cout << "Failed." << std::endl; + std::cout << "Failed after " << i << " steps with reward " << r << "." << std::endl; + else + std::cout << "Success after " << i << " steps with reward " << r << "." << std::endl; +} + +int main(int argc, char** argv) +{ + std::cout << "Hello." << std::endl; + + const auto height = 4u; + const auto width = 5u; + typedef cliff_model model_type; + model_type model; + + char response; + std::cout << "Qlearning or SARSA? (q/s): "; + std::cin >> response; + + if(response == 'q') + run_example(model, qlearning()); + else if(response == 's') + run_example(model, sarsa()); else - std::cout << "Success." << std::endl; + std::cerr << "Invalid option." << std::endl; std::cout << "Good bye." << std::endl; From 8b241b291455568be679580107af068a364c389b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 9 Dec 2017 21:32:51 +0100 Subject: [PATCH 03/14] Added test for the reinforcement learning methods and checked backward compability with lspi. --- dlib/control/approximate_linear_models.h | 2 +- dlib/control/policy.h | 6 + dlib/control/qlearning.h | 11 +- dlib/control/sarsa.h | 4 +- dlib/test/CMakeLists.txt | 2 +- dlib/test/reinforcement_learning.cpp | 264 +++++++++++++++++++++++ examples/qlearning_sarsa_ex.cpp | 37 +++- 7 files changed, 308 insertions(+), 18 deletions(-) create mode 100644 dlib/test/reinforcement_learning.cpp diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index a0d9c01dcb..252b849e49 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -17,7 +17,7 @@ namespace dlib { typedef typename model_type::state_type state_type; typedef typename model_type::action_type action_type; - typedef typename model_type::reward_type reward_type; + typedef double reward_type; process_sample(){} diff --git a/dlib/control/policy.h b/dlib/control/policy.h index 5ad8c58322..f2c8f4855b 100644 --- a/dlib/control/policy.h +++ b/dlib/control/policy.h @@ -35,6 +35,12 @@ namespace dlib const matrix& weights_ ) : w(weights_), model(model_) {} + //backward compability + greedy_policy ( + const matrix& weights_, + const model_type &model_ + ) : w(weights_), model(model_) {} + action_type operator() ( const state_type& state ) const diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 56e944ec74..2f266cfbd8 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -58,8 +58,8 @@ namespace dlib ) const { return iterations; } void set_iterations( - unsigned int iterations - ) { iterations = iterations; } + unsigned int value + ) { iterations = value; } double get_epsilon( ) const { return epsilon; } @@ -109,10 +109,10 @@ namespace dlib ); reward_type total_reward = static_cast(0); - std::cout << "iterations: " << iterations << std::endl; for(auto iter = 0u; iter < iterations; ++iter){ auto state = model.initial_state(); + auto steps = 0u; reward_type reward = static_cast(0); while(!model.is_final(state)){ 
auto action = eps_pol(state); @@ -127,12 +127,15 @@ namespace dlib state = next_state; reward += next_reward; + steps++; } total_reward += reward; if(verbose) std::cout << "iteration: " << iter << "\t reward: " << reward - << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + << "\t mean: " << total_reward/static_cast(iter+1) + << "\t steps: " << steps + << std::endl; } if(verbose) diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index bcb978f7fa..4ff96195b6 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -58,8 +58,8 @@ namespace dlib ) const { return iterations; } void set_iterations( - unsigned int iterations - ) { iterations = iterations; } + unsigned int value + ) { iterations = value; } double get_epsilon( ) const { return epsilon; } diff --git a/dlib/test/CMakeLists.txt b/dlib/test/CMakeLists.txt index 8d05fdce5d..7474d658a8 100644 --- a/dlib/test/CMakeLists.txt +++ b/dlib/test/CMakeLists.txt @@ -118,6 +118,7 @@ set (tests ranking.cpp read_write_mutex.cpp reference_counter.cpp + reinforcement_learning.cpp rls.cpp random_forest.cpp sammon.cpp @@ -160,7 +161,6 @@ set (tests elastic_net.cpp ) - # add all the cpp files we want to compile to this list. This tells # cmake that they are part of our target (which is the executable named dtest) ADD_EXECUTABLE(${target_name} main.cpp tester.cpp ${tests}) diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp new file mode 100644 index 0000000000..d16a37638c --- /dev/null +++ b/dlib/test/reinforcement_learning.cpp @@ -0,0 +1,264 @@ +// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) +// License: Boost Software License See LICENSE.txt for the full license. + +#include "tester.h" +#include +#include +#include +#include + +namespace +{ + using namespace test; + using namespace dlib; + using namespace std; + dlib::logger dlog("test.rl"); + + template < + int height, + int width, + template class feature_extractor_type + > + class cliff_model + { + public: + // constants and actions allowed + enum class actions {up = 0, right, down, left}; + constexpr static double EPS = 1e-16; + constexpr static int HEIGHT = height; + constexpr static int WIDTH = width; + + // model types + typedef int state_type; + typedef actions action_type; + typedef int reward_type; + + typedef feature_extractor_type feature_extractor; + + explicit cliff_model( + ) : fe(height, width, 4){} + + action_type random_action( + const state_type& state // since all movements are always allowed we don't use state + ) const + { + std::uniform_int_distribution dist(0,3); + return static_cast(dist(gen)); + } + + action_type find_best_action( + const state_type& state, + const matrix& w + ) const + { + // it looks for the best actions in state according to w + auto best = std::numeric_limits::lowest(); + auto best_indexes = std::vector(); + + for(auto i = 0; i < 4; i++){ + auto feats = get_features(state, static_cast(i)); + auto product = dot(w, feats); + + if(product > best){ + best = product; + best_indexes.clear(); + } + if(std::abs(product - best) < EPS) + best_indexes.push_back(i); + } + + // returns a random action between the best ones. 
+ std::uniform_int_distribution dist(0, best_indexes.size()-1); + return static_cast(best_indexes[dist(gen)]); + } + + const feature_extractor& get_feature_extractor( + ) const { return fe; } + + auto states_size( + ) const -> decltype(get_feature_extractor().num_features()) + { + return get_feature_extractor().num_features(); + } + + auto get_features( + const state_type &state, + const action_type &action + ) const -> decltype(get_feature_extractor().get_features(state, action)) + { return get_feature_extractor().get_features(state, action); } + + reward_type reward( + const state_type &state, + const action_type &action, + const state_type &new_state + ) const + { + return !is_final(new_state) ? -1 : is_success(new_state) ? 100 : -100; + } + + state_type initial_state( + ) const { return static_cast((height-1) * width); } + + state_type step( + const state_type& state, + const action_type& action + ) const + { + if(out_of_bounds(state, action)) + return state; + + return action == actions::up ? state - width : + action == actions::down ? state + width : + action == actions::right ? state + 1 : + state - 1 ; + } + + bool is_success( + const state_type &state + ) const { return state == height*width - 1; } + + bool is_failure( + const state_type &state + ) const { return state/width == height-1 && state%width > 0 && state%width < width-1;} + + bool is_final( + const state_type& state + ) const { return is_success(state) || is_failure(state); } + + private: + bool out_of_bounds( + const state_type& state, + const action_type& action + ) const + { + bool result; + + switch(action){ + case actions::up: + result = state / width == 0; + break; + case actions::down: + result = (state / width == height-2 && state % width > 0 && state % width < width-1) + || state / width == height-1; + break; + case actions::left: + result = state % width == 0; // || state == height*width-1; <- is the goal condition + break; + case actions::right: + result = state % width == width-1 || state == (height-1)*width; + break; + } + + return result; + } + + feature_extractor fe; + mutable std::default_random_engine gen; //mutable because it doesn't changes the model state + }; + + template < + typename state_type, + typename action_type + > + class feature_extractor + { + public: + feature_extractor( + int h, + int w, + int na + ) : height(h), width(w), num_actions(na) {} + + inline long num_features( + ) const { return num_actions * height * width; } + + matrix get_features( + const state_type &state, + const action_type &action + ) const + { + matrix feats(num_features()); + feats = 0; + //for(auto i = 0u; i < num_actions; i++) + // feats(num_actions * state + i) = 1; + feats(num_actions*state + static_cast(action)) = 1; + + return feats; + } + + private: + int height, width, num_actions; + }; + + template < + int height, + int width, + typename algorithm_t + > + void test(unsigned int iterations) + { + typedef cliff_model model_t; + const int max_steps = 150; + + print_spinner(); + algorithm_t algorithm; + algorithm.set_iterations(iterations); + model_t model; + auto policy = algorithm.train(model); + + auto s = model.initial_state(); + auto r = static_cast(0); + int i; + + for(i = 0; i < max_steps && !model.is_final(s); i++){ + auto a = policy(s); + auto new_s = model.step(s, a); + r += model.reward(s,a,new_s); + s = new_s; + } + + dlog << LINFO << "height, width: " << height << "," << width; + dlog << LINFO << "steps: " << i; + dlog << LINFO << "state: (" << s/width << "," << s%width << ")"; + dlog << LINFO << 
"success: " << (model.is_success(s) ? "true" : "false"); + dlog << LINFO << "failure: " << (model.is_failure(s) ? "true" : "false"); + dlog << LINFO << "reward: " << r; + DLIB_TEST(i != max_steps); + DLIB_TEST(model.is_success(s)); + DLIB_TEST(r > 0); + } + + class rl_tester : public tester + { + public: + rl_tester ( + ) : + tester ( + "test_rl", // the command line argument name for this test + "Run tests on the qlearning and sarsa objects.", // the command line argument description + 0 // the number of command line arguments for this test + ) + { + } + + void perform_test ( + ) + { + // I have to hardcode the number of iterations + // since qlearning is off-policy it can get the wrong answer if it iterates too much + // this could be troublesome if convergence depends too much on randomness + test<4,5,qlearning>(100); + test<5,5,qlearning>(1000); + test<4,7,qlearning>(500); + test<5,10,qlearning>(2000); + + test<4,5,sarsa>(100); + test<5,5,sarsa>(200); + test<4,7,sarsa>(200); + test<5,10,sarsa>(300); + } + }; + + rl_tester a; +} + diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 612eb6c54a..3c18c6db5c 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -204,8 +204,21 @@ class feature_extractor template < typename model_t > -void print(std::ostream &os, const model_t &model, const typename model_t::state_type &state) +void print( + std::ostream &os, + const model_t &model, + const typename model_t::state_type &state, + const matrix &weights, + const typename model_t::action_type &action +) { + std::cout << "weights: "; + for(int i = 0; i < 4; i++) + std::cout << weights(state*4+i) << " "; + std::cout << std::endl; + + std::cout << "action: " << static_cast(action) << "\n"; + for(auto i = 0; i < model_t::HEIGHT; i++){ for(auto j = 0; j < model_t::WIDTH; j++){ typename model_t::state_type s = model_t::WIDTH * i + j; @@ -223,7 +236,7 @@ template < > void run_example(const model_t &model, algorithm_t &&algorithm) { - algorithm.be_verbose(); // uncomment it if you want to see training info. + //algorithm.be_verbose(); // uncomment it if you want to see training info. auto policy = algorithm.train(model); std::cout << "Starting final simulation..." << std::endl; @@ -232,14 +245,14 @@ void run_example(const model_t &model, algorithm_t &&algorithm) int i; for(i = 0; i < 100 && !model.is_final(s); i++){ - print(std::cout, model, s); - auto a = policy(s); auto new_s = model.step(s, a); r += model.reward(s,a,new_s); + + print(std::cout, model, s, policy.get_weights(), a); s = new_s; } - print(std::cout, model, s); + print(std::cout, model, s, policy.get_weights(), static_cast(0)); std::cout << "Simulation finished." << std::endl; if(!model.is_final(s)) @@ -255,7 +268,7 @@ int main(int argc, char** argv) std::cout << "Hello." << std::endl; const auto height = 4u; - const auto width = 5u; + const auto width = 7u; typedef cliff_model model_type; model_type model; @@ -263,10 +276,14 @@ int main(int argc, char** argv) std::cout << "Qlearning or SARSA? (q/s): "; std::cin >> response; - if(response == 'q') - run_example(model, qlearning()); - else if(response == 's') - run_example(model, sarsa()); + if(response == 'q'){ + qlearning algorithm; + algorithm.set_iterations(500); //for this size qlearning doesn't converge with 100 iterations + run_example(model, algorithm); + } + else if(response == 's'){ + run_example(model, sarsa()); //On the other side, sarsa does converge + } else std::cerr << "Invalid option." 
<< std::endl; From d20c510c0fb918606ae57ceeb17da1c787219322 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 11 Dec 2017 23:57:58 +0100 Subject: [PATCH 04/14] Fixed reward bug + model bug + prng support --- dlib/control/policy.h | 8 +++--- dlib/control/qlearning.h | 27 ++++++++++-------- dlib/control/sarsa.h | 33 +++++++++++++--------- dlib/test/reinforcement_learning.cpp | 42 +++++++++++++--------------- examples/qlearning_sarsa_ex.cpp | 26 ++++++++--------- 5 files changed, 71 insertions(+), 65 deletions(-) diff --git a/dlib/control/policy.h b/dlib/control/policy.h index f2c8f4855b..c72ea7c3cf 100644 --- a/dlib/control/policy.h +++ b/dlib/control/policy.h @@ -88,7 +88,7 @@ namespace dlib template < typename policy_type, - typename generator = std::default_random_engine + typename prng_engine = std::default_random_engine > class epsilon_policy { @@ -99,7 +99,7 @@ namespace dlib epsilon_policy ( double epsilon_, const policy_type &policy_, - const generator &gen_ = generator() + const prng_engine &gen_ = prng_engine() ) : policy(policy_), epsilon(epsilon_), gen(gen_) {} action_type operator() ( @@ -125,14 +125,14 @@ namespace dlib double get_epsilon( ) const { return epsilon; } - const generator& get_generator( + const prng_engine& get_generator( ) const { return gen; } private: policy_type policy; double epsilon; - mutable generator gen; + mutable prng_engine gen; }; template < typename policy_type, typename generator > diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 2f266cfbd8..94f6731d75 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -6,6 +6,7 @@ #include "policy.h" #include #include +#include namespace dlib { @@ -86,10 +87,12 @@ namespace dlib ) { verbose = false; } template < - typename policy_type + typename policy_type, + typename prng_engine = std::default_random_engine > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen = prng_engine() ) const { typedef typename std::decay::type::reward_type reward_type; @@ -98,7 +101,7 @@ namespace dlib std::cout << "Starting training..." 
<< std::endl; const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy); + epsilon_policy eps_pol(epsilon, policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), @@ -113,11 +116,11 @@ namespace dlib auto state = model.initial_state(); auto steps = 0u; - reward_type reward = static_cast(0); + reward_type iteration_reward = static_cast(0); while(!model.is_final(state)){ auto action = eps_pol(state); auto next_state = model.step(state, action); - auto next_reward = model.reward(state, action, next_state); + auto reward = model.reward(state, action, next_state); const auto feats = model.get_features(state, action); const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); @@ -126,13 +129,13 @@ namespace dlib w += learning_rate * correction * feats; state = next_state; - reward += next_reward; + iteration_reward += reward; steps++; } - total_reward += reward; + total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << reward + std::cout << "iteration: " << iter << "\t reward: " << iteration_reward << "\t mean: " << total_reward/static_cast(iter+1) << "\t steps: " << steps << std::endl; @@ -145,11 +148,13 @@ namespace dlib } template < - typename model_type + typename model_type, + typename prng_engine = std::default_random_engine > greedy_policy train( - const model_type &model - ) const { return train_policy(greedy_policy(model)); } + const model_type &model, + const prng_engine &gen = prng_engine() + ) const { return train_policy(greedy_policy(model), gen); } private: double learning_rate; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index 4ff96195b6..ba7e75099b 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -86,10 +86,12 @@ namespace dlib ) { verbose = false; } template < - typename policy_type - > + typename policy_type, + typename prng_engine = std::default_random_engine + > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen = prng_engine() ) const { typedef typename std::decay::type::reward_type reward_type; @@ -98,7 +100,7 @@ namespace dlib std::cout << "Starting training..." 
<< std::endl; const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy); + epsilon_policy eps_pol(epsilon, policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), @@ -113,11 +115,12 @@ namespace dlib auto state = model.initial_state(); auto action = eps_pol(state); - reward_type reward = static_cast(0); + auto steps = 0u; + reward_type iteration_reward = static_cast(0); while(!model.is_final(state)){ auto next_state = model.step(state, action); auto next_action = eps_pol(next_state); - auto next_reward = model.reward(state, action, next_state); + auto reward = model.reward(state, action, next_state); const auto feats = model.get_features(state, action); const auto feats_next = model.get_features(next_state, next_action); @@ -127,13 +130,15 @@ namespace dlib state = next_state; action = next_action; - reward += next_reward; + iteration_reward += reward; } - total_reward += reward; + total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << reward - << "\t mean: " << total_reward/static_cast(iter+1) << std::endl; + std::cout << "iteration: " << iter << "\t reward: " << iteration_reward + << "\t mean: " << total_reward/static_cast(iter+1) + << "\t steps: " << steps + << std::endl; } if(verbose) @@ -143,11 +148,13 @@ namespace dlib } template < - typename model_type + typename model_type, + typename prng_engine = std::default_random_engine > greedy_policy train( - const model_type &model - ) const { return train_policy(greedy_policy(model)); } + const model_type &model, + const prng_engine &gen = prng_engine() + ) const { return train_policy(greedy_policy(model), gen); } private: double learning_rate; diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index d16a37638c..4d79535440 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -36,7 +36,8 @@ namespace typedef feature_extractor_type feature_extractor; explicit cliff_model( - ) : fe(height, width, 4){} + int seed = 0 + ) : fe(height, width, 4), gen(seed) {} action_type random_action( const state_type& state // since all movements are always allowed we don't use state @@ -138,14 +139,13 @@ namespace result = state / width == 0; break; case actions::down: - result = (state / width == height-2 && state % width > 0 && state % width < width-1) - || state / width == height-1; + result = state / width == height-1; break; case actions::left: - result = state % width == 0; // || state == height*width-1; <- is the goal condition + result = state % width == 0; break; case actions::right: - result = state % width == width-1 || state == (height-1)*width; + result = state % width == width-1; break; } @@ -195,16 +195,17 @@ namespace int width, typename algorithm_t > - void test(unsigned int iterations) + void test() { + constexpr static int seed = 7; + typedef cliff_model model_t; - const int max_steps = 150; + const int max_steps = 100; print_spinner(); algorithm_t algorithm; - algorithm.set_iterations(iterations); - model_t model; - auto policy = algorithm.train(model); + model_t model(seed); + auto policy = algorithm.train(model, std::default_random_engine(seed)); auto s = model.initial_state(); auto r = static_cast(0); @@ -244,18 +245,15 @@ namespace void perform_test ( ) { - // I have to hardcode the number of iterations - // since qlearning is off-policy it can get the wrong answer if it iterates too much - // this could be troublesome if convergence depends 
too much on randomness - test<4,5,qlearning>(100); - test<5,5,qlearning>(1000); - test<4,7,qlearning>(500); - test<5,10,qlearning>(2000); - - test<4,5,sarsa>(100); - test<5,5,sarsa>(200); - test<4,7,sarsa>(200); - test<5,10,sarsa>(300); + test<4,5,qlearning>(); + test<5,5,qlearning>(); + test<4,7,qlearning>(); + test<5,10,qlearning>(); + + test<4,5,sarsa>(); + test<5,5,sarsa>(); + test<4,7,sarsa>(); + test<5,10,sarsa>(); } }; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 3c18c6db5c..826580aaae 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -43,7 +43,8 @@ class cliff_model typedef feature_extractor_type feature_extractor; explicit cliff_model( - ) : fe(height, width, 4){} + int seed = 0 + ) : fe(height, width, 4), gen(seed){} action_type random_action( const state_type& state // since all movements are always allowed we don't use state @@ -145,14 +146,13 @@ class cliff_model result = state / width == 0; break; case actions::down: - result = (state / width == height-2 && state % width > 0 && state % width < width-1) - || state / width == height-1; + result = state / width == height-1; break; case actions::left: - result = state % width == 0; // || state == height*width-1; <- is the goal condition + result = state % width == 0; break; case actions::right: - result = state % width == width-1 || state == (height-1)*width; + result = state % width == width-1; break; } @@ -267,8 +267,8 @@ int main(int argc, char** argv) { std::cout << "Hello." << std::endl; - const auto height = 4u; - const auto width = 7u; + const auto height = 5u; + const auto width = 10u; typedef cliff_model model_type; model_type model; @@ -276,14 +276,10 @@ int main(int argc, char** argv) std::cout << "Qlearning or SARSA? (q/s): "; std::cin >> response; - if(response == 'q'){ - qlearning algorithm; - algorithm.set_iterations(500); //for this size qlearning doesn't converge with 100 iterations - run_example(model, algorithm); - } - else if(response == 's'){ - run_example(model, sarsa()); //On the other side, sarsa does converge - } + if(response == 'q') + run_example(model, qlearning()); + else if(response == 's') + run_example(model, sarsa()); else std::cerr << "Invalid option." << std::endl; From 3e56eb5481a4999802e4f725956049c139108507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Tue, 12 Dec 2017 11:36:39 +0100 Subject: [PATCH 05/14] Commented example + updated abstracts --- dlib/control/policy_abstract.h | 11 +- dlib/control/qlearning_abstract.h | 24 +++- dlib/control/sarsa.h | 6 +- dlib/control/sarsa_abstract.h | 26 ++-- examples/qlearning_sarsa_ex.cpp | 194 +++++++++++++++++++++--------- 5 files changed, 181 insertions(+), 80 deletions(-) diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h index 991bf96296..6d96b3ebdc 100644 --- a/dlib/control/policy_abstract.h +++ b/dlib/control/policy_abstract.h @@ -5,6 +5,7 @@ #include "../matrix.h" #include "model_abstract.h" +#include namespace dlib { @@ -176,7 +177,7 @@ void deserialize(greedy_policy& item, std::istream& in); template < typename policy_type, - typename generator + typename prng_engine = std::default_random_engine() > class epsilon_policy { @@ -185,8 +186,8 @@ class epsilon_policy policy_type should implement the example_policy interface defined at the top of this file. - REQUIREMENTS ON generator - generator should be a PRNG type like the ones defined in std::random. 
+ REQUIREMENTS ON prng_engine + prng_engine should be a PRNG class like the ones defined in std::random. WHAT THIS OBJECT REPRESENTS This is a special policy that returns the best action (according to the @@ -202,7 +203,7 @@ class epsilon_policy epsilon_policy ( double epsilon, const policy_type &policy, - const generator &gen = generator() + const prng_engine &gen = prng_engine() ); /*! requires @@ -259,7 +260,7 @@ class epsilon_policy - returns the epsilon value used by the policy. !*/ - const generator& get_generator( + const prng_engine& get_generator( ) const; /*! ensures diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index ccac305890..182c80ca4f 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -5,6 +5,7 @@ #include "policy_abstract.h" #include "model_abstract.h" +#include namespace dlib { @@ -156,33 +157,44 @@ namespace dlib !*/ template < - typename policy_type + typename policy_type, + typename prng_engine = std::default_random_engine > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen = prng_engine() ) const; /*! requires - policy is of the form example_policy, i.e., an instance of an implementation of the policy interface defined in policy_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - returns a policy of the type policy_type as the result of applying the qlearning learning function over iterations runs over using the weight - matrix of the argument as the initial weights. + matrix of the argument as the initial weights. Besides that, the + exploration is done with an epsilon policy using the given prng. !*/ template < - typename model_type + typename model_type, + typename prng_engine = std::default_random_engine > greedy_policy train( - const model_type &model + const model_type &model, + const prng_engine &gen = prng_engine() ) const; /*! requires - model_type is an implementation of the example_model interface defined at model_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - - returns train_policy(greedy_policy(model)); + - returns train_policy(greedy_policy(model), gen); !*/ }; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index ba7e75099b..772945c048 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -148,9 +148,9 @@ namespace dlib } template < - typename model_type, - typename prng_engine = std::default_random_engine - > + typename model_type, + typename prng_engine = std::default_random_engine + > greedy_policy train( const model_type &model, const prng_engine &gen = prng_engine() diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h index f4d559a8d9..6acd6f06da 100644 --- a/dlib/control/sarsa_abstract.h +++ b/dlib/control/sarsa_abstract.h @@ -5,6 +5,7 @@ #include "policy_abstract.h" #include "model_abstract.h" +#include namespace dlib { @@ -157,33 +158,44 @@ namespace dlib !*/ template < - typename policy_type + typename policy_type, + typename prng_engine = std::default_random_engine > policy_type train_policy( - const policy_type &policy + const policy_type &policy, + const prng_engine &gen ) const; /*! 
requires - policy is of the form example_policy, i.e., an instance of an implementation of the policy interface defined in policy_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - returns a policy of the type policy_type as the result of applying the sarsa learning function over iterations runs over using the weight - matrix of the argument as the initial weights. + matrix of the argument as the initial weights. Besides that, the + exploration is done with an epsilon policy using the given prng. !*/ template < - typename model_type - > + typename model_type, + typename prng_engine = std::default_random_engine + > greedy_policy train( - const model_type &model + const model_type &model, + const prng_engine &gen = prng_engine() ) const; /*! requires - model_type is an implementation of the example_model interface defined at model_abstract.h. + - prng_engine is a pseudo-random number generator class like the ones + defined in std::random. By default it assumes it to be the standard + default_random_engine class. ensures - - returns train_policy(greedy_policy(model)); + - returns train_policy(greedy_policy(model), gen); !*/ }; }; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 826580aaae..431dcff48a 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -1,5 +1,10 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. +// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt +/* + This is an example showing how to use the dlib algorithms Q-learning and SARSA. + These are two simples reinforcement learning algorithms. In short, they take a model + and take steps over and over until they've learnt how to solve the given task properly. +*/ + #include #include #include @@ -8,19 +13,35 @@ #include using namespace dlib; - -// This is the model the agent is going to work with. In particular this class -// represents the a grid with height rows and width cols where of the form -// .......... -// .......... -// IFFFFFFFFG -// where: - F are pits cells (if the agent falls there it fails) -// - I is the initial cell -// - G is the goal cell (the agent goal is to reach that spot) -// - . are free cell where the agent can go. -// the rewards are: -100 for reaching F, 100 for reaching G and -1 for the rest. -// it doesn't allow to go out of bounds, instead the agent will stay in the same cell -// (like if there was a wall there). +using namespace std; + +/* + Both of these algorithms work by a reward system. That means that they assign to each + pair (state, action) an expected reward (Qvalue) and they update those values iteratively + taking steps on a model/simulation and observing the reward they obtain. Like so, they + need a model class that allow them to work in a interactive way. + + The algorithms/agents objective is to maximize the expected reward by taking the proper + steps. +*/ + +/* + This is the model the agent is going to work with in the example. In particular, + this class represents a grid with a given height and width of the form + .......... + .......... + IFFFFFFFFG + where: - F are pit cells (if the agent falls there it fails and simulation ends). + - I is the starting position. + - G is the goal cell (the agent goal is to reach that cell). + - . 
are free cells where the agent can go. + + The agent receives the following reward: -100 for reaching F, 100 for reaching G and a + reward of -1 otherwise. + + This model doesn't allow the agent to go out of bounds, instead it will stay in the same cell + he was before the action (like if there was a wall there) but receiving a reward of -1. +*/ template < int height, int width, @@ -29,41 +50,54 @@ template < class cliff_model { public: - // constants and actions allowed + // actions allowed in the model enum class actions {up = 0, right, down, left}; + constexpr static int num_actions = 4; + + // some constants that we need constexpr static double EPS = 1e-16; constexpr static int HEIGHT = height; constexpr static int WIDTH = width; - // model types + // we define the model's types typedef int state_type; typedef actions action_type; typedef int reward_type; + // this ensures that the feature extractor uses the same underlying types as our model typedef feature_extractor_type feature_extractor; + + // Constructor explicit cliff_model( int seed = 0 - ) : fe(height, width, 4), gen(seed){} + ) : fe(height, width, num_actions), gen(seed){} + + // Functions that will use the agent + + // It returns a random action. It's possible that the allowed actions differ from among states. + // In this case all movements are always allowed so we don't need to use state. action_type random_action( - const state_type& state // since all movements are always allowed we don't use state + const state_type& state ) const { - std::uniform_int_distribution dist(0,3); + uniform_int_distribution dist(0,num_actions-1); return static_cast(dist(gen)); } + // Returns the best action that maximizes the expected reward, that is, + // the action that maximizes dot_product(w, get_features(state, action)) + // w will be the weights assign by the agent to each feature action_type find_best_action( const state_type& state, const matrix& w ) const { - // it looks for the best actions in state according to w - auto best = std::numeric_limits::lowest(); + auto best = numeric_limits::lowest(); auto best_indexes = std::vector(); - for(auto i = 0; i < 4; i++){ + for(auto i = 0; i < num_actions; i++){ auto feats = get_features(state, static_cast(i)); auto product = dot(w, feats); @@ -71,15 +105,18 @@ class cliff_model best = product; best_indexes.clear(); } - if(std::abs(product - best) < EPS) + if(abs(product - best) < EPS) best_indexes.push_back(i); } // returns a random action between the best ones. 
- std::uniform_int_distribution dist(0, best_indexes.size()-1); + uniform_int_distribution dist(0, best_indexes.size()-1); return static_cast(best_indexes[dist(gen)]); } + + // This functions are delegated to the feature extractor + const feature_extractor& get_feature_extractor( ) const { return fe; } @@ -95,6 +132,8 @@ class cliff_model ) const -> decltype(get_feature_extractor().get_features(state, action)) { return get_feature_extractor().get_features(state, action); } + + // This functions gives the rewards, that is, tells the agent how good are its movements reward_type reward( const state_type &state, const action_type &action, @@ -107,6 +146,7 @@ class cliff_model state_type initial_state( ) const { return static_cast((height-1) * width); } + // This is an important function, basically it allows the agent to move in the model's world state_type step( const state_type& state, const action_type& action @@ -121,6 +161,8 @@ class cliff_model state - 1 ; } + // this functions allow the agent to know in which state of the simulation he is in + bool is_success( const state_type &state ) const { return state == height*width - 1; } @@ -160,12 +202,21 @@ class cliff_model } feature_extractor fe; - mutable std::default_random_engine gen; //mutable because it doesn't changes the model state + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; -// This class is the feature representation of cliff_model states. -// It's just a basic one-shot representation where the feature vector for a point (a,b) doing action c -// is a zero vector of size width*height*num_actions with just a one on (a*width + b)*num_actions + c +/* + Usually when we use these types of agents the state space of the model is huge. That could make + the Qfunction to be unmanageable and so we need to use what is known as function approximation. + + Basically it represents the states by a given features instead of the states themselves. That way + what usually was just a single value Q(state, action) now is codified as the linear combination of + learnt weights and the features, that is, Q(state, action) = dot_product(weights, features(state, action)). + + Our example is a toy example and so we don't need to use it. However, to show how it works I use a simple + one-shot representation of the states. That means that I have a vector of features where the feature in the + ith position is one if we provide a specific (state, action) and 0 otherwise. +*/ template < typename state_type, typename action_type @@ -179,6 +230,7 @@ class feature_extractor int na ) : height(h), width(w), num_actions(na) {} + //the size of the vector inline long num_features( ) const { return num_actions * height * width; } @@ -189,9 +241,7 @@ class feature_extractor { matrix feats(num_features()); feats = 0; - //for(auto i = 0u; i < num_actions; i++) - // feats(num_actions * state + i) = 1; - feats(num_actions*state + static_cast(action)) = 1; + feats(num_actions*state + static_cast(action)) = 1; //only this one is 1 return feats; } @@ -200,46 +250,72 @@ class feature_extractor int height, width, num_actions; }; -// Just a helper function to pretty print the state of the agent. +// This is just a helper function to pretty-print the agent's state. 
template < - typename model_t - > + typename model_t + > void print( - std::ostream &os, + ostream &os, const model_t &model, const typename model_t::state_type &state, const matrix &weights, const typename model_t::action_type &action ) { - std::cout << "weights: "; + cout << "weights: "; for(int i = 0; i < 4; i++) - std::cout << weights(state*4+i) << " "; - std::cout << std::endl; + cout << weights(state*4+i) << " "; + cout << endl; - std::cout << "action: " << static_cast(action) << "\n"; + cout << "action: " << static_cast(action) << "\n"; for(auto i = 0; i < model_t::HEIGHT; i++){ for(auto j = 0; j < model_t::WIDTH; j++){ typename model_t::state_type s = model_t::WIDTH * i + j; os << ( s == state ? 'X' : model.is_success(s) ? 'G' : model.is_failure(s) ? 'F' : '.'); } - os << std::endl; + os << endl; } - os << std::endl; + os << endl; } -// The function that runs the agent +/* + This is the function that runs the agent. The code to run both agents are identical so I + chose to use a templated function. + + The difference between executions comes in the way they train. Namely, the way they updated the Qvalue. + Let's suppose that we are in the pair (s, a) and we are going to be in (s', a') in the next step. + + Q-learning is an off-policy algorithm meaning that doesn't consider its trully next move but the best one, + that is, doesn't consider a'. Its update function is like this: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_c Q(s', c)) + That formula means that it takes a convex combination of the current qvalue and the expected qvalue, but + for doing so it considers the action c that maximizes Q(s', c) instead of the one he will take. + + On the other hand SARSA does exactly the same as Q-learning but it considers the action that he will do + in the next step instead of the optimal. So it's an on-policy algorithm. Its update formula is: + Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a')) + + This seems as a meaningless change, but what produces is that when training SARSA tends to be more conservative + in its movement while Q-learning tries to optimizes no matter what. In cases when you have to avoid failling + (usually a real world example) SARSA is a better option. + + In our example this difference is appreciated in the way they learn. Q-learning will try to go close to the pit + cells all the time (falling a lot in the training process) and SARSA will go one or two cells off the cliff. + + Usually, one decreases the learning ratio as the iterations go on and so SARSA would converge to the same solution + as Q-learning. This is not implemented yet and so the learning rate is constant always. +*/ template < typename model_t, - typename algorithm_t // qlearning or sarsa + typename algorithm_t // this can be qlearning or sarsa > void run_example(const model_t &model, algorithm_t &&algorithm) { - //algorithm.be_verbose(); // uncomment it if you want to see training info. + //algorithm.be_verbose(); // uncomment it if you want to see some training info. auto policy = algorithm.train(model); - std::cout << "Starting final simulation..." << std::endl; + cout << "Starting final simulation..." 
<< endl; auto s = model.initial_state(); auto r = static_cast(0); int i; @@ -249,41 +325,41 @@ void run_example(const model_t &model, algorithm_t &&algorithm) auto new_s = model.step(s, a); r += model.reward(s,a,new_s); - print(std::cout, model, s, policy.get_weights(), a); + print(cout, model, s, policy.get_weights(), a); s = new_s; } - print(std::cout, model, s, policy.get_weights(), static_cast(0)); - std::cout << "Simulation finished." << std::endl; + print(cout, model, s, policy.get_weights(), static_cast(0)); + cout << "Simulation finished." << endl; if(!model.is_final(s)) - std::cout << "Nothing reached after 100 steps." << std::endl; + cout << "Nothing reached after 100 steps." << endl; else if(model.is_failure(s)) - std::cout << "Failed after " << i << " steps with reward " << r << "." << std::endl; + cout << "Failed after " << i << " steps with reward " << r << "." << endl; else - std::cout << "Success after " << i << " steps with reward " << r << "." << std::endl; + cout << "Success after " << i << " steps with reward " << r << "." << endl; } int main(int argc, char** argv) { - std::cout << "Hello." << std::endl; + cout << "Hello." << endl; - const auto height = 5u; - const auto width = 10u; + const auto height = 3u; + const auto width = 7u; typedef cliff_model model_type; model_type model; char response; - std::cout << "Qlearning or SARSA? (q/s): "; - std::cin >> response; + cout << "Qlearning or SARSA? (q/s): "; + cin >> response; if(response == 'q') run_example(model, qlearning()); else if(response == 's') run_example(model, sarsa()); else - std::cerr << "Invalid option." << std::endl; + cerr << "Invalid option." << endl; - std::cout << "Good bye." << std::endl; + cout << "Good bye." << endl; return 0; } From 160db7d83a466416540e45ec59143559f2cd0720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sat, 17 Feb 2018 22:35:58 +0100 Subject: [PATCH 06/14] Applied review notes - Improved English notes - Merged feature_extractor and model (now the user can choose which one to implement, offline or online model) - Improved training function header in qlearning and sarsa --- dlib/control.h | 1 - dlib/control/approximate_linear_models.h | 153 ++++++- .../approximate_linear_models_abstract.h | 402 ++++++++++++++++-- dlib/control/lspi_abstract.h | 2 +- dlib/control/model_abstract.h | 169 -------- dlib/control/policy.h | 172 -------- dlib/control/policy_abstract.h | 284 ------------- dlib/control/qlearning.h | 63 ++- dlib/control/qlearning_abstract.h | 69 +-- dlib/control/sarsa.h | 63 ++- dlib/control/sarsa_abstract.h | 61 +-- dlib/test/reinforcement_learning.cpp | 124 +++--- examples/qlearning_sarsa_ex.cpp | 161 +++---- 13 files changed, 718 insertions(+), 1006 deletions(-) delete mode 100644 dlib/control/model_abstract.h delete mode 100644 dlib/control/policy.h delete mode 100644 dlib/control/policy_abstract.h diff --git a/dlib/control.h b/dlib/control.h index 9c0ec80781..4e9c02878e 100644 --- a/dlib/control.h +++ b/dlib/control.h @@ -3,7 +3,6 @@ #ifndef DLIB_CONTROL_ #define DLIB_CONTROL_ -#include "control/policy.h" #include "control/lspi.h" #include "control/mpc.h" #include "control/qlearning.h" diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 252b849e49..5fe025427c 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -4,6 +4,8 @@ #define DLIB_APPROXIMATE_LINEAR_MODELS_Hh_ #include "approximate_linear_models_abstract.h" +#include +#include namespace dlib { 
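The "merged feature_extractor and model" mentioned in this commit message means the model itself now exposes the feature map, so the trainers only ever talk to the model. As a rough sketch of the online variant, inferred from the calls that qlearning, sarsa and the policy classes make in this series (the exact names and signatures here are an assumption, not the final abstract):

    // Sketch of the members an online model is expected to provide, inferred from usage.
    struct sketch_online_model
    {
        typedef int    state_type;    // whatever identifies a state
        typedef int    action_type;   // whatever identifies an action
        typedef double reward_type;

        // linear function approximation part: Q(s,a) == dot(w, get_features(s,a))
        long num_features() const;
        dlib::matrix<double,0,1> get_features(const state_type&, const action_type&) const;
        action_type find_best_action(const state_type&, const dlib::matrix<double,0,1>&) const;
        action_type random_action(const state_type&) const;   // used by epsilon_policy

        // simulation part, used while training online
        state_type  initial_state() const;
        state_type  step(const state_type&, const action_type&) const;
        reward_type reward(const state_type&, const action_type&, const state_type&) const;
        bool        is_final(const state_type&) const;
    };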
@@ -17,7 +19,6 @@ namespace dlib { typedef typename model_type::state_type state_type; typedef typename model_type::action_type action_type; - typedef double reward_type; process_sample(){} @@ -25,13 +26,13 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const reward_type& r + const double& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - reward_type reward; + double reward; }; template < typename feature_extractor > @@ -52,6 +53,152 @@ namespace dlib deserialize(item.reward, in); } +// ---------------------------------------------------------------------------------------- + + template < + typename model_type + > + class policy + { + public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + policy ( + const model_type& model_ = model_type() + ) : model(model_) + { + weights.set_size(model.num_features()); + weights = 0; + } + + policy ( + const matrix& weights_, + const model_type &model_ + ) : weights(weights_), model(model_) {} + + action_type operator() ( + const state_type& state + ) const + { + return model.find_best_action(state,weights); + } + + const model_type& get_model ( + ) const { return model; } + + const matrix& get_weights ( + ) const { return weights; } + + matrix& get_weights ( + ) { return weights; } + + private: + matrix weights; + const model_type model; + }; + + template < typename model_type > + inline void serialize(const policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_model(), out); + serialize(item.get_weights(), out); + } + template < typename model_type > + inline void deserialize(policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::policy object."); + model_type model; + matrix w; + deserialize(model, in); + deserialize(w, in); + item = policy(w,model); + } + +// ---------------------------------------------------------------------------------------- + + template < + typename policy_type, + typename prng_engine = std::default_random_engine + > + class epsilon_policy + { + public: + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; + + epsilon_policy ( + double epsilon_, + policy_type &policy_, + const prng_engine &gen_ = prng_engine() + ) : underlying_policy(policy_), epsilon(epsilon_), gen(gen_) {} + + action_type operator() ( + const state_type& state + ) const + { + std::bernoulli_distribution d(epsilon); + return d(gen) ? 
get_model().random_action(state) : underlying_policy(state); + } + + const policy_type& get_policy( + ) const { return underlying_policy; } + + auto get_model ( + ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + + matrix& get_weights ( + ) { return underlying_policy.get_weights(); } + + const matrix& get_weights ( + ) const { return underlying_policy.get_weights(); } + + double get_epsilon( + ) const { return epsilon; } + + const prng_engine& get_generator( + ) const { return gen; } + + private: + policy_type& underlying_policy; + double epsilon; + + mutable prng_engine gen; + }; + + template < typename policy_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out) + { + int version = 1; + serialize(version, out); + serialize(item.get_policy(), out); + serialize(item.get_epsilon(), out); + serialize(item.get_generator(), out); + } + + template < typename policy_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in) + { + int version = 0; + deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing dlib::policy object."); + + policy_type policy; + double epsilon; + generator gen; + deserialize(policy, in); + deserialize(epsilon, in); + deserialize(gen, in); + item = epsilon_policy(epsilon, policy, gen); + } + // ---------------------------------------------------------------------------------------- } diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index 74f99da4ab..0f14432f92 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -3,35 +3,30 @@ #undef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ #ifdef DLIB_APPROXIMATE_LINEAR_MODELS_ABSTRACT_Hh_ -#include "model_abstract.h" +#include <../matrix_abstract.h> +#include namespace dlib { // ---------------------------------------------------------------------------------------- - template < - typename T, - typename U - > - struct example_feature_extractor - { + struct example_offline_model { /*! WHAT THIS OBJECT REPRESENTS - This object defines the interface a feature extractor must implement if it - is to be used with the process_sample and policy objects defined at - policy_abstract.h. Moreover, it is meant to represent the core part - of a model used in a reinforcement learning algorithm. - - In particular, this object models a Q(state,action) function where - Q(state,action) == dot(w, PSI(state,action)) - where PSI(state,action) is a feature vector and w is a parameter - vector. + This object defines the inferface that any model has to implement if it + is to be used in an offline fashion along with some method like the lspi + method defined in the file lspi_abstract.h. Being offline only means that + it already holds the data and will not interact with the environment to get + them. - Therefore, a feature extractor defines how the PSI(x,y) feature vector is - calculated. It also defines the types used to represent the state and - action objects. + In particular, this object models a Q(state, action) function where + Q(state, action) == dot(w, PSI(state, action)) + where PSI(state, action) is a feature vector and w is a parameter vector. + Therefore, an offline model defines how the PSI(x,y) feature vector is + calculated. It also defines the types used to represent the state and + action objects. 
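+
+            For illustration only (a sketch, not dlib API; my_offline_model is a hypothetical
+            type implementing this interface), evaluating Q for a given state s and action a
+            looks like:
+                my_offline_model model;
+                matrix<double,0,1> w(model.num_features());
+                w = 0;
+                matrix<double,0,1> psi;
+                model.get_features(s, a, psi);              // psi == PSI(s,a)
+                double q = dot(w, psi);                     // Q(s,a) == dot(w, PSI(s,a))
+                auto best_a = model.find_best_action(s, w); // action maximizing Q(s,*) under w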
THREAD SAFETY Instances of this object are required to be threadsafe, that is, it should @@ -39,13 +34,14 @@ namespace dlib functions of this object. !*/ - typedef T state_type; - typedef U action_type; - // We can also say that the last element in the weight vector w must be 1. This + // The states and actions can be any type as long as you provide typedefs for them. + typedef U state_type; + typedef V action_type; + // We can also say that the last element in the weights vector w must be 1. This // can be useful for including a prior into your model. const static bool force_last_weight_to_1 = false; - example_feature_extractor( + example_offline_model( ); /*! ensures @@ -56,20 +52,167 @@ namespace dlib ) const; /*! ensures - - returns the dimensionality of the PSI() feature vector. + - returns the dimensionality of the PSI() feature vector. + !*/ + + action_type find_best_action ( + const state_type& state, + const matrix& w + ) const; + /*! + ensures + - returns the action A that maximizes Q(state, A) = dot(w,PSI(state,A)). + That is, this function finds the best action to take in the given state + when our model is parameterized by the given weight vector w. !*/ - matrix get_features ( - const state_type &state, - const action_type &action + void get_features( + const state_type& state, + const action_type& action, + matrix& feats + ) const; + /*! + ensures + - #feats.size() == num_features() + - #feats == PSI(state, action) + */! + }; + +// ---------------------------------------------------------------------------------------- + + struct example_online_model + { + /*! + WHAT THIS OBJECT REPRESENTS + This object defines the inferface that any model has to implement if it + is to be used in an online fashion along with some method like the qlearning + method defined in the file qlearning_abstract.h. + + Being online means that the model doesn't hold prior data but it interacts + with the environment and performing actions from some given state turning + that state into a new one as well as getting some reward for doing so. + + In particular, this object models a Q(state, action) function where + Q(state, action) == dot(w, PSI(state, action)) + where PSI(state, action) is a feature vector and w is a parameter vector. + + Therefore, an online model defines how the PSI(x,y) feature vector is + calculated, the types used to represent the state, action and reward + objects as well as how to interact with the environment. + + THREAD SAFETY + Instances of this object are required to be threadsafe, that is, it should + be safe for multiple threads to make concurrent calls to the member + functions of this object. + !*/ + + // The states and actions can be any type as long as you provide typedefs for them. + typedef U state_type; + typedef V action_type; + + example_online_model( + ); + /*! + ensures + - this object is properly initialized. + !*/ + + unsigned long num_features( + ) const; + /*! + ensures + - returns the dimensionality of the PSI vector. + !*/ + + action_type find_best_action( + const state_type& state, + const matrix& w + ) const; + /*! + ensures + - returns the action A that maximizes Q(state, A) = dot(w,PSI(state,A)). + That is, this function finds the best action to take in the given state + when our model is parameterized by the given weight vector. + !*/ + + void get_features( + const state_type& state, + const action_type& action, + matrix& feats + ) const; + /*! 
+ ensures + - #feats.size() == num_features() + - #feats == PSI(state, action) + !*/ + + action_type random_action( + const state_type& state + ) const; + /*! + ensures + - returns a random plausible action assuming we are in the given state. + !*/ + + double reward( + const state_type& state, + const action_type& action, + const state_type& new_state + ) const; + /*! + requires + - action is a pausible action from state. + - new_state is a possible outcome when performing action on state. + ensures + - returns the reward obtained by reaching new_state from state + doing action. + !*/ + + state_type initial_state( + ) const; + /*! + ensures + - returns the initial state of the model. + !*/ + + state_type step( + const state_type& state, + const action_type& action ) const; /*! requires - - action is a valid option from state. + - action is a plausible action when we are in state. + ensures + - returns a new state result of being on the given state and doing the given + action. + !*/ + + bool is_success( + const state_type& state + ) const; + /*! + ensures + - returns whether state is a goal state (the agent has finished properly). + !*/ + + bool is_failure( + const state_type& state + ) const; + /*! + ensures + - returns whether state is a failure state, i.e., a state where the agent has + failed its task. + !*/ + + bool is_final( + const state_type& state + ) const; + /*! ensures - - returns PSI(state,action) + - #is_final(state) == is_success(state) || is_failure(state) !*/ + }; // ---------------------------------------------------------------------------------------- @@ -81,7 +224,7 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. + model_type should implement one of the interfaces defined above this file. WHAT THIS OBJECT REPRESENTS This object holds a training sample for a reinforcement learning algorithm. @@ -92,7 +235,6 @@ namespace dlib typedef typename model_type::state_type state_type; typedef typename model_type::action_type action_type; - typedef typename model_type::reward_type reward_type; process_sample(){} @@ -100,19 +242,205 @@ namespace dlib const state_type& s, const action_type& a, const state_type& n, - const reward_type& r + const double& r ) : state(s), action(a), next_state(n), reward(r) {} state_type state; action_type action; state_type next_state; - reward_type reward; + double reward; + }; + + template < typename model_type > + void serialize (const process_sample& item, std::ostream& out); + template < typename model_type > + void deserialize (process_sample& item, std::istream& in); + /*! + provides serialization support. + !*/ + +// ---------------------------------------------------------------------------------------- + + template < + typename model_type + > + class policy + { + /*! + REQUIREMENTS ON model_type + model_type should implement one of the interfaces defined above this file. + + WHAT THIS OBJECT REPRESENTS + This class represents a greedy policy, that is, it is a policy that given a + state returns the best possible action based on its weight matrix. + !*/ + + public: + + typedef typename model_type::state_type state_type; + typedef typename model_type::action_type action_type; + + policy ( + const model_type& model = model_type() + ); + /*! + ensures + - #get_model() == model + - #get_weights().size() == #get_model().num_features() + - #get_weights() == 0 + !*/ + + policy ( + const matrix& weights, + const model_type& model + ); + /*! 
+ requires + - model.num_features() == weights.size() + ensures + - #get_model() == model + - #get_weights() == weights + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_model().find_best_action(state, this->weights); + !*/ + + const model_type& get_model ( + ) const; + /*! + ensures + - returns the model used by this object + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the weights that the policy is using. + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the weights that the policy is using. + !*/ + }; + + template < typename model_type > + void serialize(const policy& item, std::ostream& out); + template < typename model_type > + void deserialize(policy& item, std::istream& in); + /*! + provides serialization support. + !*/ + + // ---------------------------------------------------------------------------------------- + + template < + typename policy_type, + typename prng_engine = std::default_random_engine() + > + class epsilon_policy + { + /*! + REQUIREMENTS ON policy_type + policy_type is an object with the same interface as the policy class defined + above. + + REQUIREMENTS ON prng_engine + prng_engine should be a PRNG interface like the ones defined in std::random. + + WHAT THIS OBJECT REPRESENTS + This is a special policy that returns the best action (according to the + underlying policy) for the given state with probability 1-epsilon + while it returns a valid random action with probability epsilon. + + It is mainly used to add some exploration in the training process of the + online reinforcement learning methods such as qlearning and sarsa. + !*/ + + public: + + typedef typename policy_type::state_type state_type; + typedef typename policy_type::action_type action_type; + + epsilon_policy ( + double epsilon, + const policy_type& policy, + const prng_engine& gen = prng_engine() + ); + /*! + requires + - epsilon >= 0 and epsilon <= 1 + ensures + - #get_epsilon() == epsilon + - #get_policy() == policy + - #get_generator() == gen + !*/ + + action_type operator() ( + const state_type& state + ) const; + /*! + ensures + - returns get_policy()(state, w) with probability 1-epsilon + and get_model().random_action(state) with probability epsilon. + !*/ + + const policy_type& get_policy( + ) const; + /*! + ensures + - returns the underlying policy used by the object. + !*/ + + model_type get_model ( + ) const; + /*! + ensures + - returns the model used by the underlying policy. + !*/ + + const matrix& get_weights ( + ) const; + /*! + ensures + - returns the weights that the policy is using. + !*/ + + matrix& get_weights ( + ); + /*! + ensures + - returns the weights that the policy is using. + !*/ + + double get_epsilon( + ) const; + /*! + ensures + - returns the epsilon value used by the policy. + !*/ + + const prng_engine& get_generator( + ) const; + /*! + ensures + - returns the generator used by the policy. + !*/ + }; - template < typename feature_extractor > - void serialize (const process_sample& item, std::ostream& out); - template < typename feature_extractor > - void deserialize (process_sample& item, std::istream& in); + template < typename policy_type, typename generator > + inline void serialize(const epsilon_policy& item, std::ostream& out); + template < typename policy_type, typename generator > + inline void deserialize(epsilon_policy& item, std::istream& in); /*! provides serialization support. 
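+
+        For example, either of the policy types above can be written to disk and read back
+        like this (sketch only; my_policy and the file name are hypothetical):
+            std::ofstream fout("my_policy.dat", std::ios::binary);
+            serialize(my_policy, fout);
+            fout.close();
+            std::ifstream fin("my_policy.dat", std::ios::binary);
+            deserialize(my_policy, fin);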
!*/ diff --git a/dlib/control/lspi_abstract.h b/dlib/control/lspi_abstract.h index f262d16f48..649ca54972 100644 --- a/dlib/control/lspi_abstract.h +++ b/dlib/control/lspi_abstract.h @@ -17,7 +17,7 @@ namespace dlib { /*! REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface + feature_extractor should implement the example_offline_model interface defined at the top of dlib/control/approximate_linear_models_abstract.h WHAT THIS OBJECT REPRESENTS diff --git a/dlib/control/model_abstract.h b/dlib/control/model_abstract.h deleted file mode 100644 index 8abb0f326e..0000000000 --- a/dlib/control/model_abstract.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_MODEL_ABSTRACT_Hh_ -#ifdef DLIB_MODEL_ABSTRACT_Hh_ - -#include "approximate_linear_models_abstract.h" -#include "../matrix.h" - -namespace dlib -{ - - template < - template class feature_extractor_type - > - class example_model - { - /*! - REQUIREMENTS ON feature_extractor - feature_extractor should implement the example_feature_extractor interface defined - at approximate_linear_models_abstract.h. - - WHAT THIS OBJECT REPRESENTS - This is an example interface of a model class. This class represents an environment - where an agent will be deployed at. In other words, it is an interface between the - simulated/real world and the agent that has to be there. In short this class: - - Holds information about the state, action and reward space. - - Delegates the state representation to the feature_extractor. - - Provides an initial state to start the agent. - - Offers an interface to move in the world (look for actions, make steps in it - and get a feedback/reward for them). - !*/ - public: - - // You have to define state, action and reward types. - typedef U state_type; - typedef V action_type; - typedef W reward_type; - - // The feature extractor uses the same types as the model. - typedef feature_extractor_type feature_extractor; - - example_model( - ); - /*! - ensures - - #get_feature_extractor() == feature_extractor() - !*/ - - action_type random_action( - const state_type &state - ) const; - /*! - ensures - - returns a random reachable action from state. - !*/ - - action_type find_best_action( - const state_type &state, - const matrix &w - ) const; - /*! - requires - - w.size() == states_size() - ensures - - returns the action that maximizes the product - dot(w, get_feature_extractor().get_features(state)). - !*/ - - const feature_extractor& get_feature_extractor( - ) const; - /*! - ensures - - returns the feature_extractor used by the model. - !*/ - - auto states_size( - ) const -> decltype(get_feature_extractor().num_features()); - /*! - ensures - - returns get_feature_extractor().num_features(). - !*/ - - auto get_features( - const state_type &state, - const action_type &action - ) const -> decltype(get_feature_extractor().get_features(state, action)); - /*! - ensures - - returns get_feature_extractor().get_features(state, action); - !*/ - - // The new_state parameter is needed because the model doesn't have to be deterministic. - // Nonetheless for now we will suppose that the rewards are deterministic. - reward_type reward( - const state_type &state, - const action_type &action, - const state_type &new_state - ) const; - /*! - requires - - action is available in state. - - new_state is a possible outcome when you do action on state. 
- ensures - - returns the reward obtained by going to new_state from state - doing action. - - the function is deterministic with respect to its arguments. - !*/ - - state_type initial_state( - ) const; - /*! - ensures - - returns the initial state of the model. - !*/ - - state_type step( - const state_type &state, - const action_type &action - ) const; - /*! - requires - - action is a valid action from state. - ensures - - returns a state that is possible to be in after doing action - from state. - !*/ - - bool is_success( - const state_type &state - ) const; - /*! - ensures - - returns whether state is a goal state (the agent has done its task properly). - !*/ - - bool is_failure( - const state_type &state - ) const; - /*! - ensures - - returns whether state is a failure state, i.e., a state where the agent has - failed his task. - !*/ - - bool is_final( - const state_type& state - ) const; - /*! - ensures - - returns whether state is a final state, i.e., it is a state where the agent can't - advance anymore. In another words, whether state is a success or failure state. - !*/ - - - }; - - template < template class feature_extractor > - void serialize (const example_model& item, std::ostream& out); - template < template class feature_extractor > - void deserialize (example_model& item, std::istream& in); - /*! - provides serialization support. - !*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif diff --git a/dlib/control/policy.h b/dlib/control/policy.h deleted file mode 100644 index c72ea7c3cf..0000000000 --- a/dlib/control/policy.h +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. 
-#ifndef DLIB_POLICY_Hh_ -#define DLIB_POLICY_Hh_ - -#include "../matrix.h" -#include "policy_abstract.h" -#include -#include - -namespace dlib -{ - - template < - typename model_type - > - class greedy_policy - { - public: - - typedef model_type feature_extractor_type; - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - greedy_policy ( - const model_type &model_ - ) : model(model_) - { - w.set_size(model.states_size()); - w = 0; - } - - greedy_policy ( - const model_type &model_, - const matrix& weights_ - ) : w(weights_), model(model_) {} - - //backward compability - greedy_policy ( - const matrix& weights_, - const model_type &model_ - ) : w(weights_), model(model_) {} - - action_type operator() ( - const state_type& state - ) const - { - return model.find_best_action(state,w); - } - - const model_type& get_model ( - ) const { return model; } - - matrix& get_weights ( - ) { return w; } - - const matrix& get_weights ( - ) const { return w; } - - private: - matrix w; - const model_type &model; - }; - - template < typename model_type > - inline void serialize(const greedy_policy& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.get_model(), out); - serialize(item.get_weights(), out); - } - template < typename model_type > - inline void deserialize(greedy_policy& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); - model_type model; - matrix w; - deserialize(model, in); - deserialize(w, in); - item = greedy_policy(w,model); - } - -// ---------------------------------------------------------------------------------------- - - template < - typename policy_type, - typename prng_engine = std::default_random_engine - > - class epsilon_policy - { - public: - typedef typename policy_type::state_type state_type; - typedef typename policy_type::action_type action_type; - - epsilon_policy ( - double epsilon_, - const policy_type &policy_, - const prng_engine &gen_ = prng_engine() - ) : policy(policy_), epsilon(epsilon_), gen(gen_) {} - - action_type operator() ( - const state_type& state - ) const - { - std::bernoulli_distribution d(epsilon); - return d(gen) ? 
get_model().random_action(state) : policy(state); - } - - policy_type get_policy( - ) const { return policy; } - - auto get_model ( - ) const -> decltype(get_policy().get_model()) { return policy.get_model(); } - - matrix& get_weights ( - ) { return policy.get_weights(); } - - const matrix& get_weights ( - ) const { return policy.get_weights(); } - - double get_epsilon( - ) const { return epsilon; } - - const prng_engine& get_generator( - ) const { return gen; } - - private: - policy_type policy; - double epsilon; - - mutable prng_engine gen; - }; - - template < typename policy_type, typename generator > - inline void serialize(const epsilon_policy& item, std::ostream& out) - { - int version = 1; - serialize(version, out); - serialize(item.get_policy(), out); - serialize(item.get_epsilon(), out); - serialize(item.get_generator(), out); - } - - template < typename policy_type, typename generator > - inline void deserialize(epsilon_policy& item, std::istream& in) - { - int version = 0; - deserialize(version, in); - if (version != 1) - throw serialization_error("Unexpected version found while deserializing dlib::greedy_policy object."); - - policy_type policy; - double epsilon; - generator gen; - deserialize(policy, in); - deserialize(epsilon, in); - deserialize(gen, in); - item = epsilon_policy(epsilon, policy, gen); - } - -// ---------------------------------------------------------------------------------------- - - // For backward compability with lspi - template < typename model_type > - using policy = greedy_policy; //template aliasing is possible post C++11 -} - -#endif // DLIB_POLICY_Hh_ diff --git a/dlib/control/policy_abstract.h b/dlib/control/policy_abstract.h deleted file mode 100644 index 6d96b3ebdc..0000000000 --- a/dlib/control/policy_abstract.h +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (C) 2017 Adrián Javaloy (adrian.javaloy@gmail.com) -// License: Boost Software License See LICENSE.txt for the full license. -#undef DLIB_POLICY_ABSTRACT_Hh_ -#ifdef DLIB_POLICY_ABSTRACT_Hh_ - -#include "../matrix.h" -#include "model_abstract.h" -#include - -namespace dlib -{ - -template < - typename model_type - > -class example_policy -{ - /*! - REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. - - WHAT THIS OBJECT REPRESENTS - This is a policy based on the supplied model_type model. In - particular, it maps from model_type::state_type to a model_type::action - to take in that state. - !*/ - -public: - - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - example_policy ( - const model_type &model - ); - /*! - ensures - - #get_model() == model - - #get_weights().size() == #get_model().states_size() - - #get_weights() == 0 - !*/ - - example_policy ( - const model_type& model, - const matrix& weights - ); - /*! - requires - - model.states_size() == weights.size() - ensures - - #get_model() == model - - #get_weights() == weights - !*/ - - action_type operator() ( - const state_type& state - ) const; - - const model_type& get_model ( - ) const; - /*! - ensures - - returns the model used by this object - !*/ - - matrix& get_weights ( - ); - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). 
- !*/ - -}; - -template < typename model_type > -void serialize(const example_policy& item, std::ostream& out); -template < typename model_type > -void deserialize(example_policy& item, std::istream& in); -/*! - provides serialization support. -!*/ - -// ---------------------------------------------------------------------------------------- - -template < - typename model_type - > -class greedy_policy -{ - /*! - REQUIREMENTS ON model_type - model_type should implement the interface defined at model_abstract.h. - - WHAT THIS OBJECT REPRESENTS - This is an implementation of the policy interface that returns the best action - based on the weights (i.e. it acts in a greedy fashion). - !*/ - -public: - - typedef typename model_type::state_type state_type; - typedef typename model_type::action_type action_type; - - greedy_policy ( - const model_type &model - ); - /*! - ensures - - #get_model() == model - - #get_weights().size() == #get_model().states_size() - - #get_weights() == 0 - !*/ - - greedy_policy ( - const model_type& model, - const matrix& weights - ); - /*! - requires - - model.states_size() == weights.size() - ensures - - #get_model() == model - - #get_weights() == weights - !*/ - - action_type operator() ( - const state_type& state - ) const; - /*! - ensures - - returns get_model().find_best_action(state, w); - !*/ - - const model_type& get_model ( - ) const; - /*! - ensures - - returns the model used by this object - !*/ - - matrix& get_weights ( - ); - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - -}; - -template < typename model_type > -void serialize(const greedy_policy& item, std::ostream& out); -template < typename model_type > -void deserialize(greedy_policy& item, std::istream& in); -/*! - provides serialization support. -!*/ - -// ---------------------------------------------------------------------------------------- - -template < - typename policy_type, - typename prng_engine = std::default_random_engine() - > -class epsilon_policy -{ - /*! - REQUIREMENTS ON policy_type - policy_type should implement the example_policy interface defined at the - top of this file. - - REQUIREMENTS ON prng_engine - prng_engine should be a PRNG class like the ones defined in std::random. - - WHAT THIS OBJECT REPRESENTS - This is a special policy that returns the best action (according to the - underlying policy) for the given state with probability 1-epsilon - while it returns a valid random action with probability epsilon. - !*/ - -public: - - typedef typename policy_type::state_type state_type; - typedef typename policy_type::action_type action_type; - - epsilon_policy ( - double epsilon, - const policy_type &policy, - const prng_engine &gen = prng_engine() - ); - /*! - requires - - epsilon >= 0 and epsilon <= 1 - ensures - - #get_epsilon() == epsilon - - #get_policy() == policy - - #get_generator() == gen - !*/ - - action_type operator() ( - const state_type& state - ) const; - /*! - ensures - - returns get_policy()(state, w) with probability 1-epsilon - and get_model().random_action(state) with probability epsilon. - !*/ - - policy_type get_policy( - ) const; - /*! - ensures - - returns the underlying policy used by the object. 
- !*/ - - auto get_model ( - ) const -> decltype(get_policy().get_model()); - /*! - ensures - - returns the model used by the underlying policy. - !*/ - - matrix& get_weights ( - ); - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - const matrix& get_weights ( - ) const; - /*! - ensures - - returns the parameter vector (w) associated with this object. The length - of the vector is get_model().states_size(). - !*/ - - double get_epsilon( - ) const; - /*! - ensures - - returns the epsilon value used by the policy. - !*/ - - const prng_engine& get_generator( - ) const; - /*! - ensures - - returns the generator used by the policy. - !*/ - -}; - -template < typename policy_type, typename generator > -inline void serialize(const epsilon_policy& item, std::ostream& out); -template < typename policy_type, typename generator > -inline void deserialize(epsilon_policy& item, std::istream& in); -/*! - provides serialization support. -!*/ - -// ---------------------------------------------------------------------------------------- - -} - -#endif // DLIB_POLICY_ABSTRACT_Hh_ diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index 94f6731d75..ace925d8b2 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -3,23 +3,27 @@ #ifndef DLIB_QLEARNING_Hh_ #define DLIB_QLEARNING_Hh_ -#include "policy.h" +#include "approximate_linear_models.h" #include #include #include namespace dlib { + template< + typename model_type + > class qlearning { public: + explicit qlearning( double lr = 0.2, double disc = 0.8, unsigned int miters = 100u, double eps = 0.1, bool v = false - ) : iterations(miters), verbose(v) { + ) : iters(miters), verbose(v) { set_learning_rate(lr); set_discount(disc); set_epsilon(eps); @@ -56,11 +60,11 @@ namespace dlib } unsigned int get_iterations( - ) const { return iterations; } + ) const { return iters; } void set_iterations( unsigned int value - ) { iterations = value; } + ) { iters = value; } double get_epsilon( ) const { return epsilon; } @@ -87,43 +91,44 @@ namespace dlib ) { verbose = false; } template < - typename policy_type, typename prng_engine = std::default_random_engine > - policy_type train_policy( - const policy_type &policy, + policy train( + policy training_policy = policy(), const prng_engine &gen = prng_engine() ) const { - typedef typename std::decay::type::reward_type reward_type; - if(verbose) std::cout << "Starting training..." 
<< std::endl; - const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy, gen); + const auto& model = training_policy.get_model(); + epsilon_policy, prng_engine> eps_pol(epsilon, training_policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), - "\t qlearning::train(weights)" + "\t qlearning::train(policy, gen)" "\n\t invalid inputs were given to this function" - "\n\t weights.size: " << weights.size() << - "\n\t features size: " << model.states_size() + "\n\t policy's weights.size: " << weights.size() << + "\n\t num of features: " << model.num_features() ); - reward_type total_reward = static_cast(0); - for(auto iter = 0u; iter < iterations; ++iter){ - auto state = model.initial_state(); + matrix feats(model.num_features()), feats_next_best(model.num_features()); + double total_reward = 0.; + for(auto iter = 0u; iter < iters; ++iter) + { + auto state = model.initial_state(); auto steps = 0u; - reward_type iteration_reward = static_cast(0); - while(!model.is_final(state)){ + double iteration_reward = 0.; + + while(!model.is_final(state)) + { auto action = eps_pol(state); auto next_state = model.step(state, action); auto reward = model.reward(state, action, next_state); - const auto feats = model.get_features(state, action); - const auto feats_next_best = model.get_features(next_state, model.find_best_action(next_state, w)); + model.get_features(state, action, feats); + model.get_features(next_state, model.find_best_action(next_state, w), feats_next_best); double correction = reward + discount * dot(w, feats_next_best) - dot(w, feats); w += learning_rate * correction * feats; @@ -135,7 +140,8 @@ namespace dlib total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << iteration_reward + std::cout << "iteration: " << iter + << "\t reward: " << iteration_reward << "\t mean: " << total_reward/static_cast(iter+1) << "\t steps: " << steps << std::endl; @@ -144,22 +150,13 @@ namespace dlib if(verbose) std::cout << "Training finished." << std::endl; - return eps_pol.get_policy(); + return training_policy; } - template < - typename model_type, - typename prng_engine = std::default_random_engine - > - greedy_policy train( - const model_type &model, - const prng_engine &gen = prng_engine() - ) const { return train_policy(greedy_policy(model), gen); } - private: double learning_rate; double discount; - unsigned int iterations; + unsigned int iters; double epsilon; bool verbose; }; diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index 182c80ca4f..c96a75654e 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -3,12 +3,14 @@ #undef DLIB_QLEARNING_ABSTRACT_Hh_ #ifdef DLIB_QLEARNING_ABSTRACT_Hh_ -#include "policy_abstract.h" -#include "model_abstract.h" +#include "approximate_linear_models_abstract.h" #include namespace dlib { + +// ---------------------------------------------------------------------------------------- + template < typename model_type > @@ -16,24 +18,23 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type is an implementation of the model interface declared in - model_abstract.h. + model_type should implement the example_online_model interface defined in + the approximate_linear_models_abstract.h file. WHAT THIS OBJECT REPRESENTS - This objects is an implementation of the well-known reinforcement learning - algorithm Q-learning. 
This algorithms takes a bunch of process_samples
-            as input and outputs a policy that have learnt from that in order to take
-            the better results.
+            This object is an implementation of the well-known reinforcement learning
+            algorithm Q-learning. It takes an online model and tries to learn the best
+            possible policy for the model's environment by interacting with it.

-            Supposing we are in state s and action a and we are going to a new state s'
-            the learning function has the form:
+            Supposing we are in state s and action a and we are going to a new state s',
+            then the learning function has the form:
                 Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s', a'))
-            where lr is the learning_rate and disc the discount.
-            That formula means that it takes a convex combination of the current qvalue
-            and the expected qvalue.
+            where lr is the learning_rate and disc is the discount factor.
+            That formula means that it takes a convex combination of the current qvalue,
+            that is, the current expected reward from there, and the new expected qvalue.

             Note that it is an off-policy reinforcement learning algorithm meaning
-            that it doesn't take the policy into account while learning.
+            that it doesn't take the policy it is using into account in the learning process.
        !*/

    public:

@@ -45,7 +46,7 @@ namespace dlib
            - #get_discount() == 0.8
            - #get_iterations() == 100
            - #get_epsilon() == 0.1
-            - #is not verbose
+            - #is_verbose() == false
        !*/

        explicit qlearning(
@@ -122,7 +123,7 @@ namespace dlib
        ) const;
        /*!
            ensures
-                - returns the probability of doing a non-optimal step while training.
+                - returns the probability of taking a random step while training.
        !*/

        void set_epsilon(
@@ -157,45 +158,21 @@ namespace dlib
        !*/

        template <
-            typename policy_type,
            typename prng_engine = std::default_random_engine
            >
-        policy_type train_policy(
-            const policy_type &policy,
-            const prng_engine &gen = prng_engine()
+        policy<model_type> train(
+            policy<model_type> policy = policy<model_type>(),
+            const prng_engine& gen = prng_engine()
        ) const;
        /*!
            requires
-                - policy is of the form example_policy, i.e., an instance of
-                  an implementation of the policy interface defined in policy_abstract.h.
                - prng_engine is a pseudo-random number generator class like the ones
-                  defined in std::random. By default it assumes it to be the standard
-                  default_random_engine class.
+                  defined in std::random. By default it is the standard one.
            ensures
-                - returns a policy of the type policy_type as the result of applying the
-                  qlearning learning function over iterations runs over using the weight
-                  matrix of the argument as the initial weights. Besides that, the
-                  exploration is done with an epsilon policy using the given prng.
+                - returns the policy resulting from applying the learning function over
+                  and over according to the parameters previously fed into this object.
        !*/

-        template <
-            typename model_type,
-            typename prng_engine = std::default_random_engine
-            >
-        greedy_policy train(
-            const model_type &model,
-            const prng_engine &gen = prng_engine()
-        ) const;
-        /*!
-            requires
-                - model_type is an implementation of the example_model interface defined
-                  at model_abstract.h.
-                - prng_engine is a pseudo-random number generator class like the ones
-                  defined in std::random. By default it assumes it to be the standard
-                  default_random_engine class.
- ensures - - returns train_policy(greedy_policy(model), gen); - !*/ }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index 772945c048..aab620f14f 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -3,12 +3,16 @@ #ifndef DLIB_SARSA_Hh_ #define DLIB_SARSA_Hh_ -#include "policy.h" +#include "approximate_linear_models.h" #include #include +#include namespace dlib { + template< + typename model_type + > class sarsa { public: @@ -18,7 +22,7 @@ namespace dlib unsigned int miters = 100u, double eps = 0.1, bool v = false - ) : iterations(miters), verbose(v) { + ) : iters(miters), verbose(v) { set_learning_rate(lr); set_discount(disc); set_epsilon(eps); @@ -55,11 +59,11 @@ namespace dlib } unsigned int get_iterations( - ) const { return iterations; } + ) const { return iters; } void set_iterations( unsigned int value - ) { iterations = value; } + ) { iters = value; } double get_epsilon( ) const { return epsilon; } @@ -86,44 +90,45 @@ namespace dlib ) { verbose = false; } template < - typename policy_type, typename prng_engine = std::default_random_engine > - policy_type train_policy( - const policy_type &policy, + policy train( + policy training_policy = policy(), const prng_engine &gen = prng_engine() ) const { - typedef typename std::decay::type::reward_type reward_type; - if(verbose) std::cout << "Starting training..." << std::endl; - const auto &model = policy.get_model(); - epsilon_policy eps_pol(epsilon, policy, gen); + const auto& model = training_policy.get_model(); + epsilon_policy, prng_engine> eps_pol(epsilon, training_policy, gen); auto& w = eps_pol.get_weights(); DLIB_ASSERT(weights.size() == model.states_size(), - "\t sarsa::train(weights)" + "\t sarsa::train(policy, gen)" "\n\t invalid inputs were given to this function" - "\n\t weights.size: " << weights.size() << - "\n\t features size: " << model.states_size() + "\n\t policy's weights.size: " << weights.size() << + "\n\t num of features: " << model.num_features() ); - reward_type total_reward = static_cast(0); - for(auto iter = 0u; iter < iterations; ++iter){ + matrix feats(model.num_features()), feats_next(model.num_features()); + + double total_reward = 0.; + for(auto iter = 0u; iter < iters; ++iter) + { auto state = model.initial_state(); auto action = eps_pol(state); - auto steps = 0u; - reward_type iteration_reward = static_cast(0); - while(!model.is_final(state)){ + double iteration_reward = 0.; + + while(!model.is_final(state)) + { auto next_state = model.step(state, action); auto next_action = eps_pol(next_state); auto reward = model.reward(state, action, next_state); - const auto feats = model.get_features(state, action); - const auto feats_next = model.get_features(next_state, next_action); + model.get_features(state, action, feats); + model.get_features(next_state, next_action, feats_next); double correction = reward + discount * dot(w, feats_next) - dot(w, feats); w += learning_rate * correction * feats; @@ -135,7 +140,8 @@ namespace dlib total_reward += iteration_reward; if(verbose) - std::cout << "iteration: " << iter << "\t reward: " << iteration_reward + std::cout << "iteration: " << iter + << "\t reward: " << iteration_reward << "\t mean: " << total_reward/static_cast(iter+1) << "\t steps: " << steps << std::endl; @@ -144,22 +150,13 @@ namespace dlib if(verbose) std::cout << "Training finished." 
<< std::endl; - return eps_pol.get_policy(); + return training_policy; } - template < - typename model_type, - typename prng_engine = std::default_random_engine - > - greedy_policy train( - const model_type &model, - const prng_engine &gen = prng_engine() - ) const { return train_policy(greedy_policy(model), gen); } - private: double learning_rate; double discount; - unsigned int iterations; + unsigned int iters; double epsilon; bool verbose; }; diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h index 6acd6f06da..a9ebbf522d 100644 --- a/dlib/control/sarsa_abstract.h +++ b/dlib/control/sarsa_abstract.h @@ -16,22 +16,20 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type is an implementation of the model interface declared in - model_abstract.h. + model_type should implement the example_online_model interface defined in + the approximate_linear_models_abstract.h file. WHAT THIS OBJECT REPRESENTS - This objects is an implementation of the well-known reinforcement learning - algorithm Q-learning. This algorithms takes a bunch of process_samples - as input and outputs a policy that have learnt from that in order to take - the better results. + This object is an implementation of the well-known reinforcement learning + algorithm SARSA. It takes an online model and tries to learn the best + possible policy for the model's environment by interacting with it. Supposing we are in state s and action a and we are going to a new state s' - the learning function has the form: + and taking the action a' in s', then the learning function has the form: Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a')) - where lr is the learning_rate, disc the discount and a' is the next action - the algorithm will perform after reaching s'. - That formula means that it takes a convex combination of the current qvalue - and the expected qvalue. + where lr is the learning_rate and disc is the discount factor. + That formula means that it takes a convex combination of the current qvalue, + that is, the current expected reward from there, and the new expected qvalue. Note that, unlike qlearning, sarsa is an on-policy reinforcement learning algorithm meaning that it takes the policy into account while learning. @@ -46,7 +44,7 @@ namespace dlib - #get_discount() == 0.8 - #get_iterations() == 100 - #get_epsilon() == 0.1 - - #is not verbose + - #is_verbose() == false !*/ explicit sarsa( @@ -123,7 +121,7 @@ namespace dlib ) const; /*! ensures - - returns the probability of doing a non-optimal step while training. + - returns the probability of taking a random step while training. !*/ void set_epsilon( @@ -158,46 +156,21 @@ namespace dlib !*/ template < - typename policy_type, typename prng_engine = std::default_random_engine > - policy_type train_policy( - const policy_type &policy, - const prng_engine &gen - ) const; - /*! - requires - - policy is of the form example_policy, i.e., an instance of - an implementation of the policy interface defined in policy_abstract.h. - - prng_engine is a pseudo-random number generator class like the ones - defined in std::random. By default it assumes it to be the standard - default_random_engine class. - ensures - - returns a policy of the type policy_type as the result of applying the - sarsa learning function over iterations runs over using the weight - matrix of the argument as the initial weights. Besides that, the - exploration is done with an epsilon policy using the given prng. 
- !*/ - - template < - typename model_type, - typename prng_engine = std::default_random_engine - > - greedy_policy train( - const model_type &model, + policy_type train( + policy policy = policy(), const prng_engine &gen = prng_engine() ) const; /*! requires - - model_type is an implementation of the example_model interface defined - at model_abstract.h. - prng_engine is a pseudo-random number generator class like the ones - defined in std::random. By default it assumes it to be the standard - default_random_engine class. + defined in std::random. By default it is the standard one. ensures - - returns train_policy(greedy_policy(model), gen); + - returns the policy resulting of applying the learning function over + and over according to the parameters previously fed into this object. !*/ - }; + }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 4d79535440..0db14d45c2 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -16,34 +16,52 @@ namespace template < int height, - int width, - template class feature_extractor_type + int width > class cliff_model { public: - // constants and actions allowed + // actions allowed in the model enum class actions {up = 0, right, down, left}; + constexpr static int num_actions = 4; + + // some constants we need constexpr static double EPS = 1e-16; constexpr static int HEIGHT = height; constexpr static int WIDTH = width; - // model types + // we define the model's types typedef int state_type; typedef actions action_type; - typedef int reward_type; - - typedef feature_extractor_type feature_extractor; + // Constructor explicit cliff_model( int seed = 0 - ) : fe(height, width, 4), gen(seed) {} + ) : gen(seed){} + + + // Functions that will use the agent + + unsigned int num_features( + ) const { return num_actions * height * width; } + void get_features( + const state_type &state, + const action_type &action, + matrix& feats + ) const + { + feats = 0; + feats(num_actions*state + static_cast(action)) = 1; //only this one is 1 + } + + // It's possible that the allowed actions differ among states. + // In this case all movements are always allowed so we don't need to use state. action_type random_action( - const state_type& state // since all movements are always allowed we don't use state + const state_type& state ) const { - std::uniform_int_distribution dist(0,3); + uniform_int_distribution dist(0,num_actions-1); return static_cast(dist(gen)); } @@ -52,43 +70,30 @@ namespace const matrix& w ) const { - // it looks for the best actions in state according to w - auto best = std::numeric_limits::lowest(); + auto best = numeric_limits::lowest(); auto best_indexes = std::vector(); - for(auto i = 0; i < 4; i++){ - auto feats = get_features(state, static_cast(i)); + for(auto i = 0; i < num_actions; i++) + { + matrix feats(num_features()); + get_features(state, static_cast(i), feats); auto product = dot(w, feats); if(product > best){ best = product; best_indexes.clear(); } - if(std::abs(product - best) < EPS) + if(abs(product - best) < EPS) best_indexes.push_back(i); } // returns a random action between the best ones. 
- std::uniform_int_distribution dist(0, best_indexes.size()-1); + uniform_int_distribution dist(0, best_indexes.size()-1); return static_cast(best_indexes[dist(gen)]); } - const feature_extractor& get_feature_extractor( - ) const { return fe; } - - auto states_size( - ) const -> decltype(get_feature_extractor().num_features()) - { - return get_feature_extractor().num_features(); - } - - auto get_features( - const state_type &state, - const action_type &action - ) const -> decltype(get_feature_extractor().get_features(state, action)) - { return get_feature_extractor().get_features(state, action); } - - reward_type reward( + // This functions gives the rewards, that is, tells the agent how good are its movements + double reward( const state_type &state, const action_type &action, const state_type &new_state @@ -100,6 +105,7 @@ namespace state_type initial_state( ) const { return static_cast((height-1) * width); } + // This is an important function, basically it allows the agent to move around the environment state_type step( const state_type& state, const action_type& action @@ -114,6 +120,7 @@ namespace state - 1 ; } + // this functions allow the agent to know in which state of the simulation it's in bool is_success( const state_type &state ) const { return state == height*width - 1; } @@ -127,6 +134,7 @@ namespace ) const { return is_success(state) || is_failure(state); } private: + bool out_of_bounds( const state_type& state, const action_type& action @@ -134,7 +142,8 @@ namespace { bool result; - switch(action){ + switch(action) + { case actions::up: result = state / width == 0; break; @@ -152,67 +161,32 @@ namespace return result; } - feature_extractor fe; - mutable std::default_random_engine gen; //mutable because it doesn't changes the model state - }; - - template < - typename state_type, - typename action_type - > - class feature_extractor - { - public: - feature_extractor( - int h, - int w, - int na - ) : height(h), width(w), num_actions(na) {} - - inline long num_features( - ) const { return num_actions * height * width; } - - matrix get_features( - const state_type &state, - const action_type &action - ) const - { - matrix feats(num_features()); - feats = 0; - //for(auto i = 0u; i < num_actions; i++) - // feats(num_actions * state + i) = 1; - feats(num_actions*state + static_cast(action)) = 1; - - return feats; - } - - private: - int height, width, num_actions; + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; template < int height, int width, - typename algorithm_t + template typename algorithm_t > void test() { constexpr static int seed = 7; - typedef cliff_model model_t; + typedef cliff_model model_t; const int max_steps = 100; print_spinner(); - algorithm_t algorithm; + algorithm_t algorithm; model_t model(seed); - auto policy = algorithm.train(model, std::default_random_engine(seed)); + auto my_policy = algorithm.train(policy(model), std::default_random_engine(seed)); auto s = model.initial_state(); - auto r = static_cast(0); + double r = 0.; int i; for(i = 0; i < max_steps && !model.is_final(s); i++){ - auto a = policy(s); + auto a = my_policy(s); auto new_s = model.step(s, a); r += model.reward(s,a,new_s); s = new_s; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 431dcff48a..53a10fdf1f 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -16,36 +16,41 @@ using namespace dlib; using namespace std; /* - Both of these algorithms work by a reward system. 
That means that they assign to each
-    pair (state, action) an expected reward (Qvalue) and they update those values iteratively
-    taking steps on a model/simulation and observing the reward they obtain. Like so, they
-    need a model class that allow them to work in a interactive way.
+    Both of these algorithms work with a reward system. This means that they assign to each
+    pair (state, action) an expected reward (qvalue) and they iteratively update those values
+    by taking steps on an online model/simulation and observing the reward obtained. Because
+    of this, they need a model class that allows them to work in an interactive way.

-    The algorithms/agents objective is to maximize the expected reward by taking the proper
+    The algorithms/agents' objective is to maximize the expected reward by taking the proper
     steps.
 */

 /*
-    This is the model the agent is going to work with in the example. In particular,
+    Let me now introduce the conceptual model the agent is going to use. Basically,
     this class represents a grid with a given height and width of the form
         ..........
         ..........
         IFFFFFFFFG
-    where: - F are pit cells (if the agent falls there it fails and simulation ends).
+    where: - Fs represent pit cells where the agent can fall and thus fail the simulation.
            - I is the starting position.
-           - G is the goal cell (the agent goal is to reach that cell).
-           - . are free cells where the agent can go.
+           - G is the goal cell that the agent aims to reach.
+           - dots (.) represent free cells that the agent can move through freely.

     The agent receives the following reward: -100 for reaching F, 100 for reaching G and a reward
     of -1 otherwise.

     This model doesn't allow the agent to go out of bounds, instead it will stay in the same cell
-    he was before the action (like if there was a wall there) but receiving a reward of -1.
+    it was in before taking the action (as if there were a wall) and receive a reward of -1.
+
+    Function approximation by feature extraction is a powerful tool for reducing the state space's size.
+    But ours is a toy example, so I use a one-hot representation, meaning that each feature
+    represents a single (state, action) pair and it will be 1 when that pair is active and 0 otherwise.
 */
+
+// This is an implementation of the example_online_model interface defined in approximate_linear_models_abstract.h
 template <
     int height,
-    int width,
-    template class feature_extractor_type
+    int width
     >
 class cliff_model
 {
@@ -54,7 +59,7 @@ class cliff_model
     enum class actions {up = 0, right, down, left};
     constexpr static int num_actions = 4;

-    // some constants that we need
+    // some constants we need
     constexpr static double EPS = 1e-16;
     constexpr static int HEIGHT = height;
     constexpr static int WIDTH = width;
@@ -62,21 +67,30 @@ class cliff_model
     // we define the model's types
     typedef int state_type;
     typedef actions action_type;
-    typedef int reward_type;
-
-    // this ensures that the feature extractor uses the same underlying types as our model
-    typedef feature_extractor_type feature_extractor;
-
     // Constructor
     explicit cliff_model(
         int seed = 0
-    ) : fe(height, width, num_actions), gen(seed){}
+    ) : gen(seed){}


     // Functions that will use the agent

-    // It returns a random action. It's possible that the allowed actions differ from among states.
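+    // A concrete illustration of the one-hot layout described above, assuming the 3x7 grid
+    // used in main(): num_features() == 4*3*7 == 84, the initial state is (3-1)*7 == 14,
+    // and the feature vector for (state 14, action up == 0) has a single 1 at index
+    // 4*14 + 0 == 56.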
+ unsigned int num_features( + ) const { return num_actions * height * width; } + + void get_features( + const state_type &state, + const action_type &action, + matrix& feats + ) const + { + feats.set_size(num_features()); + feats = 0; + feats(num_actions*state + static_cast(action)) = 1; //only this one is 1 + } + + // It's possible that the allowed actions differ among states. // In this case all movements are always allowed so we don't need to use state. action_type random_action( const state_type& state @@ -86,9 +100,6 @@ class cliff_model return static_cast(dist(gen)); } - // Returns the best action that maximizes the expected reward, that is, - // the action that maximizes dot_product(w, get_features(state, action)) - // w will be the weights assign by the agent to each feature action_type find_best_action( const state_type& state, const matrix& w @@ -97,8 +108,10 @@ class cliff_model auto best = numeric_limits::lowest(); auto best_indexes = std::vector(); - for(auto i = 0; i < num_actions; i++){ - auto feats = get_features(state, static_cast(i)); + for(auto i = 0; i < num_actions; i++) + { + matrix feats; + get_features(state, static_cast(i), feats); auto product = dot(w, feats); if(product > best){ @@ -114,27 +127,8 @@ class cliff_model return static_cast(best_indexes[dist(gen)]); } - - // This functions are delegated to the feature extractor - - const feature_extractor& get_feature_extractor( - ) const { return fe; } - - auto states_size( - ) const -> decltype(get_feature_extractor().num_features()) - { - return get_feature_extractor().num_features(); - } - - auto get_features( - const state_type &state, - const action_type &action - ) const -> decltype(get_feature_extractor().get_features(state, action)) - { return get_feature_extractor().get_features(state, action); } - - // This functions gives the rewards, that is, tells the agent how good are its movements - reward_type reward( + double reward( const state_type &state, const action_type &action, const state_type &new_state @@ -146,7 +140,7 @@ class cliff_model state_type initial_state( ) const { return static_cast((height-1) * width); } - // This is an important function, basically it allows the agent to move in the model's world + // This is an important function, basically it allows the agent to move around the environment state_type step( const state_type& state, const action_type& action @@ -161,8 +155,7 @@ class cliff_model state - 1 ; } - // this functions allow the agent to know in which state of the simulation he is in - + // this functions allow the agent to know in which state of the simulation it's in bool is_success( const state_type &state ) const { return state == height*width - 1; } @@ -176,6 +169,7 @@ class cliff_model ) const { return is_success(state) || is_failure(state); } private: + bool out_of_bounds( const state_type& state, const action_type& action @@ -183,7 +177,8 @@ class cliff_model { bool result; - switch(action){ + switch(action) + { case actions::up: result = state / width == 0; break; @@ -201,55 +196,9 @@ class cliff_model return result; } - feature_extractor fe; mutable default_random_engine gen; //mutable because it doesn't changes the model state }; -/* - Usually when we use these types of agents the state space of the model is huge. That could make - the Qfunction to be unmanageable and so we need to use what is known as function approximation. - - Basically it represents the states by a given features instead of the states themselves. 
That way
-    what usually was just a single value Q(state, action) now is codified as the linear combination of
-    learnt weights and the features, that is, Q(state, action) = dot_product(weights, features(state, action)).
-
-    Our example is a toy example and so we don't need to use it. However, to show how it works I use a simple
-    one-shot representation of the states. That means that I have a vector of features where the feature in the
-    ith position is one if we provide a specific (state, action) and 0 otherwise.
-*/
-template <
-    typename state_type,
-    typename action_type
-    >
-class feature_extractor
-{
-public:
-    feature_extractor(
-        int h,
-        int w,
-        int na
-    ) : height(h), width(w), num_actions(na) {}
-
-    //the size of the vector
-    inline long num_features(
-    ) const { return num_actions * height * width; }
-
-    matrix get_features(
-        const state_type &state,
-        const action_type &action
-    ) const
-    {
-        matrix feats(num_features());
-        feats = 0;
-        feats(num_actions*state + static_cast(action)) = 1; //only this one is 1
-
-        return feats;
-    }
-
-private:
-    int height, width, num_actions;
-};
-
 // This is just a helper function to pretty-print the agent's state.
 template <
     typename model_t
@@ -283,7 +232,7 @@ void print(
     This is the function that runs the agent. The code to run both agents are identical so I chose
     to use a templated function.
 
-    The difference between executions comes in the way they train. Namely, the way they updated the Qvalue.
+    The difference between the two algorithms lies in the way they train, namely in how they update the qvalue.
     Let's suppose that we are in the pair (s, a) and we are going to be in (s', a') in the next step.
 
     Q-learning is an off-policy algorithm meaning that doesn't consider its trully next move but the best one,
@@ -296,19 +245,16 @@ void print(
     in the next step instead of the optimal. So it's an on-policy algorithm. Its update formula is:
         Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * Q(s', a'))
 
-    This seems as a meaningless change, but what produces is that when training SARSA tends to be more conservative
-    in its movement while Q-learning tries to optimizes no matter what. In cases when you have to avoid failling
+    This looks like a minor change, but its effect is that, during training, SARSA tends to be more conservative
+    in its movements while Q-learning optimizes them no matter what. In cases where you have to avoid failure
     (usually a real world example) SARSA is a better option.
 
-    In our example this difference is appreciated in the way they learn. Q-learning will try to go close to the pit
+    In our example this difference shows up in the behavior they learn. Q-learning will try to go close to the pit
     cells all the time (falling a lot in the training process) and SARSA will go one or two cells off the cliff.
-
-    Usually, one decreases the learning ratio as the iterations go on and so SARSA would converge to the same solution
-    as Q-learning. This is not implemented yet and so the learning rate is constant always.
 */
 template <
     typename model_t,
-    typename algorithm_t // this can be qlearning or sarsa
+    typename algorithm_t // qlearning or sarsa
    >
 void run_example(const model_t &model, algorithm_t &&algorithm)
 {
@@ -317,7 +263,7 @@ void run_example(const model_t &model, algorithm_t &&algorithm)
     cout << "Starting final simulation..."
<< endl; auto s = model.initial_state(); - auto r = static_cast(0); + double r = 0.; int i; for(i = 0; i < 100 && !model.is_final(s); i++){ @@ -345,17 +291,16 @@ int main(int argc, char** argv) const auto height = 3u; const auto width = 7u; - typedef cliff_model model_type; - model_type model; + cliff_model model; char response; cout << "Qlearning or SARSA? (q/s): "; cin >> response; if(response == 'q') - run_example(model, qlearning()); + run_example(model, qlearning()); else if(response == 's') - run_example(model, sarsa()); + run_example(model, sarsa()); else cerr << "Invalid option." << endl; From 883075368b03568bd2ea71dad89f894c723d420a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 01:55:59 +0100 Subject: [PATCH 07/14] travis hotfix --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 5fe025427c..3666bb905a 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -151,7 +151,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(underlying_policy.get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From c5cf9b307924d27353d3573a1be763e7f3bbdfee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 01:55:59 +0100 Subject: [PATCH 08/14] Revert "travis hotfix" This reverts commit ee5428ada88f0c70247f51ab87f9753e18ee43a6. --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 3666bb905a..5fe025427c 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -151,7 +151,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(underlying_policy.get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From ebc5648e4403565f69455821579a52e9743675cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 12:05:23 +0100 Subject: [PATCH 09/14] real travis hotfix There was a compilation error that happened on gcc4.8.4 on travis but not in my local compiler --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 5fe025427c..8acaf18cf8 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -151,7 +151,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(this->get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From 9650cd8cf3e6f8e8d11544de0b176ed722b4fc78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Sun, 18 Feb 2018 12:36:44 +0100 Subject: [PATCH 10/14] templated template parameters must have 
class, not typename --- dlib/test/reinforcement_learning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 0db14d45c2..9933dfdeba 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -167,7 +167,7 @@ namespace template < int height, int width, - template typename algorithm_t + template class algorithm_t > void test() { From cade6fb5aaa185e39ed03672bb6f3bab4c8d9103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 4 Jun 2018 00:57:10 +0200 Subject: [PATCH 11/14] Applied the notes of the second review --- dlib/control/approximate_linear_models.h | 22 ++++- .../approximate_linear_models_abstract.h | 59 +++++------- dlib/control/qlearning.h | 6 +- dlib/control/qlearning_abstract.h | 13 ++- dlib/control/sarsa.h | 6 +- dlib/control/sarsa_abstract.h | 12 +-- dlib/serialize.h | 51 ++++++++++ dlib/test/reinforcement_learning.cpp | 93 ++++++++++++++++++- examples/qlearning_sarsa_ex.cpp | 29 +++++- 9 files changed, 226 insertions(+), 65 deletions(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 8acaf18cf8..5322c2735c 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -78,6 +78,12 @@ namespace dlib const model_type &model_ ) : weights(weights_), model(model_) {} + policy(const policy&) = default; + policy& operator=(const policy&) = default; + + policy(policy&&) = default; + policy& operator=(policy&&) = default; + action_type operator() ( const state_type& state ) const @@ -96,7 +102,7 @@ namespace dlib private: matrix weights; - const model_type model; + model_type model; }; template < typename model_type > @@ -135,23 +141,29 @@ namespace dlib epsilon_policy ( double epsilon_, - policy_type &policy_, + const policy_type& policy_, const prng_engine &gen_ = prng_engine() ) : underlying_policy(policy_), epsilon(epsilon_), gen(gen_) {} + epsilon_policy(const epsilon_policy&) = default; + epsilon_policy& operator=(const epsilon_policy&) = default; + + epsilon_policy(epsilon_policy&&) = default; + epsilon_policy& operator=(epsilon_policy&&) = default; + action_type operator() ( const state_type& state ) const { std::bernoulli_distribution d(epsilon); - return d(gen) ? get_model().random_action(state) : underlying_policy(state); + return d(gen) ? underlying_policy.get_model().random_action(state) : underlying_policy(state); } const policy_type& get_policy( ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(this->get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } @@ -166,7 +178,7 @@ namespace dlib ) const { return gen; } private: - policy_type& underlying_policy; + policy_type underlying_policy; double epsilon; mutable prng_engine gen; diff --git a/dlib/control/approximate_linear_models_abstract.h b/dlib/control/approximate_linear_models_abstract.h index 0f14432f92..7462a97552 100644 --- a/dlib/control/approximate_linear_models_abstract.h +++ b/dlib/control/approximate_linear_models_abstract.h @@ -15,23 +15,16 @@ namespace dlib /*! 
WHAT THIS OBJECT REPRESENTS This object defines the inferface that any model has to implement if it - is to be used in an offline fashion along with some method like the lspi - method defined in the file lspi_abstract.h. Being offline only means that - it already holds the data and will not interact with the environment to get - them. + is to be used in an offline fashion along with some class like the lspi + class defined in the file lspi_abstract.h. In particular, this object models a Q(state, action) function where Q(state, action) == dot(w, PSI(state, action)) where PSI(state, action) is a feature vector and w is a parameter vector. - Therefore, an offline model defines how the PSI(x,y) feature vector is - calculated. It also defines the types used to represent the state and - action objects. - - THREAD SAFETY - Instances of this object are required to be threadsafe, that is, it should - be safe for multiple threads to make concurrent calls to the member - functions of this object. + Therefore, an offline model object defines how a model is represented by + defining its actions, states, calculating the feature vectors. However, it + does not provide any way to interactively interact with it. !*/ // The states and actions can be any type as long as you provide typedefs for them. @@ -85,25 +78,22 @@ namespace dlib /*! WHAT THIS OBJECT REPRESENTS This object defines the inferface that any model has to implement if it - is to be used in an online fashion along with some method like the qlearning - method defined in the file qlearning_abstract.h. + is to be used by an object such as the qlearning class defined in the + file qlearning_abstract.h. - Being online means that the model doesn't hold prior data but it interacts - with the environment and performing actions from some given state turning - that state into a new one as well as getting some reward for doing so. + Instances of this object differ from the offline model in the way they + interact with the environment. This object expands the interface of the + offline model with methods that make it suitable for simulations: + going from one state to another, detecting special states and getting + the reward for performing those steps. In particular, this object models a Q(state, action) function where Q(state, action) == dot(w, PSI(state, action)) where PSI(state, action) is a feature vector and w is a parameter vector. - Therefore, an online model defines how the PSI(x,y) feature vector is - calculated, the types used to represent the state, action and reward - objects as well as how to interact with the environment. - - THREAD SAFETY - Instances of this object are required to be threadsafe, that is, it should - be safe for multiple threads to make concurrent calls to the member - functions of this object. + Therefore, an online model object defines how a model is represented by + defining its actions, states, calculating the feature vectors. Besides, it + provides methods to interact with that environment on the fly. !*/ // The states and actions can be any type as long as you provide typedefs for them. @@ -160,12 +150,9 @@ namespace dlib const state_type& new_state ) const; /*! - requires - - action is a pausible action from state. - - new_state is a possible outcome when performing action on state. ensures - returns the reward obtained by reaching new_state from state - doing action. + after you do action. !*/ state_type initial_state( @@ -180,11 +167,9 @@ namespace dlib const action_type& action ) const; /*! 
- requires - - action is a plausible action when we are in state. ensures - - returns a new state result of being on the given state and doing the given - action. + - returns a new state result of doing the given action over the + given state. !*/ bool is_success( @@ -224,7 +209,8 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type should implement one of the interfaces defined above this file. + model_type should implement one of the two interfaces defined above, that is, + example_offline_model or example_online_model. WHAT THIS OBJECT REPRESENTS This object holds a training sample for a reinforcement learning algorithm. @@ -268,7 +254,8 @@ namespace dlib { /*! REQUIREMENTS ON model_type - model_type should implement one of the interfaces defined above this file. + model_type should implement one of the two interfaces defined above, that is, + example_offline_model or example_online_model. WHAT THIS OBJECT REPRESENTS This class represents a greedy policy, that is, it is a policy that given a @@ -307,7 +294,7 @@ namespace dlib ) const; /*! ensures - - returns get_model().find_best_action(state, this->weights); + - returns get_model().find_best_action(state, get_weights()); !*/ const model_type& get_model ( diff --git a/dlib/control/qlearning.h b/dlib/control/qlearning.h index ace925d8b2..c6b7f7ac70 100644 --- a/dlib/control/qlearning.h +++ b/dlib/control/qlearning.h @@ -94,8 +94,8 @@ namespace dlib typename prng_engine = std::default_random_engine > policy train( - policy training_policy = policy(), - const prng_engine &gen = prng_engine() + const policy& training_policy = policy(), + const prng_engine& gen = prng_engine() ) const { if(verbose) @@ -150,7 +150,7 @@ namespace dlib if(verbose) std::cout << "Training finished." << std::endl; - return training_policy; + return eps_pol.get_policy(); } private: diff --git a/dlib/control/qlearning_abstract.h b/dlib/control/qlearning_abstract.h index c96a75654e..39e5c7225e 100644 --- a/dlib/control/qlearning_abstract.h +++ b/dlib/control/qlearning_abstract.h @@ -30,11 +30,9 @@ namespace dlib then the learning function has the form: Q(s, a) = (1 - lr) * Q(s,a) + lr * (reward + disc * max_a' Q(s', a')) where lr is the learning_rate and disc is the discount factor. - That formula means that it takes a convex combination of the current qvalue, - that is, the current expected reward from there, and the new expected qvalue. - Note that it is an off-policy reinforcement learning algorithm meaning - that it doesn't take the policy is using into account in the learning process. + The formula above means that it takes a convex combination of the current + qvalue, that is, the current expected reward, and the new expected qvalue. !*/ public: @@ -161,7 +159,7 @@ namespace dlib typename prng_engine = std::default_random_engine > policy train( - policy policy = policy() + const policy& policy = policy() const prng_engine& gen = prng_engine() ) const; /*! @@ -169,8 +167,9 @@ namespace dlib - prng_engine is a pseudo-random number generator class like the ones defined in std::random. By default it is the standard one. ensures - - returns the policy resulting of applying the learning function over - and over according to the parameters previously fed into this object. + - returns the policy obtained by applying to the given policy the learning + function several times according to the parameters previously fed + into this object. 
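+
+            For illustration, a rough usage sketch (my_model is a hypothetical type that
+            implements the example_online_model interface, not something provided by dlib):
+                my_model model;
+                qlearning<my_model> trainer;
+                auto learned_policy = trainer.train(policy<my_model>(model));
+                auto first_action = learned_policy(model.initial_state());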
!*/ }; diff --git a/dlib/control/sarsa.h b/dlib/control/sarsa.h index aab620f14f..69d87badc3 100644 --- a/dlib/control/sarsa.h +++ b/dlib/control/sarsa.h @@ -93,8 +93,8 @@ namespace dlib typename prng_engine = std::default_random_engine > policy train( - policy training_policy = policy(), - const prng_engine &gen = prng_engine() + const policy& training_policy = policy(), + const prng_engine& gen = prng_engine() ) const { if(verbose) @@ -150,7 +150,7 @@ namespace dlib if(verbose) std::cout << "Training finished." << std::endl; - return training_policy; + return eps_pol.get_policy(); } private: diff --git a/dlib/control/sarsa_abstract.h b/dlib/control/sarsa_abstract.h index a9ebbf522d..e21bb9b7ec 100644 --- a/dlib/control/sarsa_abstract.h +++ b/dlib/control/sarsa_abstract.h @@ -30,9 +30,6 @@ namespace dlib where lr is the learning_rate and disc is the discount factor. That formula means that it takes a convex combination of the current qvalue, that is, the current expected reward from there, and the new expected qvalue. - - Note that, unlike qlearning, sarsa is an on-policy reinforcement learning - algorithm meaning that it takes the policy into account while learning. !*/ public: @@ -159,16 +156,17 @@ namespace dlib typename prng_engine = std::default_random_engine > policy_type train( - policy policy = policy(), - const prng_engine &gen = prng_engine() + const policy& policy = policy(), + const prng_engine& gen = prng_engine() ) const; /*! requires - prng_engine is a pseudo-random number generator class like the ones defined in std::random. By default it is the standard one. ensures - - returns the policy resulting of applying the learning function over - and over according to the parameters previously fed into this object. + - returns the policy obtained by applying to the given policy the learning + function several times according to the parameters previously fed + into this object. !*/ }; diff --git a/dlib/serialize.h b/dlib/serialize.h index 16d0d15013..1dfaf9b713 100644 --- a/dlib/serialize.h +++ b/dlib/serialize.h @@ -74,6 +74,7 @@ - enumerable where T is a serializable type - map_pair where D and R are both serializable types. - C style arrays of serializable types + - the random devices defined in std::random like std::default_random_engine - Google protocol buffer objects. This file provides deserialization support to the following object types: @@ -91,6 +92,7 @@ - dlib::int64 - float_details - C style arrays of serializable types + - the random devices defined in std::random like std::default_random_engine - Google protocol buffer objects. 
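+
+        As an illustrative sketch (not part of the formal list above), one of these engines can be
+        round-tripped and will keep producing the same sequence afterwards:
+            std::mt19937 rng(42);
+            std::ostringstream sout;
+            dlib::serialize(rng, sout);
+            std::mt19937 rng2;
+            std::istringstream sin(sout.str());
+            dlib::deserialize(rng2, sin);
+            // rng and rng2 now generate identical values
+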
Support for deserialization of objects which implement the enumerable or @@ -156,6 +158,8 @@ #include #include #include +#include +#include #include "uintn.h" #include "interfaces/enumerable.h" #include "interfaces/map_pair.h" @@ -1541,6 +1545,53 @@ namespace dlib } } +// ---------------------------------------------------------------------------------------- + + #define USE_SERIALIZATION_THROUGH_IOSTREAM(T) \ + inline void serialize ( \ + const T& item, \ + std::ostream& out \ + ) \ + { \ + std::stringstream ss; \ + ss.setf(std::ios_base::dec, std::ios_base::basefield); \ + ss.setf(std::ios_base::left, std::ios_base::adjustfield); \ + ss.fill(' '); \ + ss << item; \ + \ + try{ serialize(ss.str(),out); } \ + catch (serialization_error& e) \ + { throw serialization_error(e.info + "\n while serializing object of type std::default_random_engine"); } \ + } \ + \ + inline void deserialize ( \ + T& item, \ + std::istream& in \ + ) \ + { \ + std::string str; \ + try { deserialize(str,in); } \ + catch (serialization_error& e) \ + { throw serialization_error(e.info + "\n while deserializing object of type std::default_random_engine"); } \ + \ + std::stringstream ss(str); \ + ss.setf(std::ios_base::dec, std::ios_base::basefield); \ + ss.setf(std::ios_base::left, std::ios_base::adjustfield); \ + ss.fill(' '); \ + ss >> item; \ + } + + //USE_SERIALIZATION_THROUGH_IOSTREAM(std::default_random_engine) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::minstd_rand) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::minstd_rand0) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::mt19937) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::mt19937_64) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux24_base) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux48_base) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux24) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::ranlux48) + USE_SERIALIZATION_THROUGH_IOSTREAM(std::knuth_b) + // ---------------------------------------------------------------------------------------- class proxy_serialize diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 9933dfdeba..7c708a49e7 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -3,6 +3,7 @@ #include "tester.h" #include +#include #include #include #include @@ -12,7 +13,7 @@ namespace using namespace test; using namespace dlib; using namespace std; - dlib::logger dlog("test.rl"); + logger dlog("test.rl"); template < int height, @@ -34,11 +35,13 @@ namespace typedef int state_type; typedef actions action_type; - // Constructor + // Constructors explicit cliff_model( int seed = 0 ) : gen(seed){} + cliff_model(const cliff_model&) = default; + cliff_model& operator=(const cliff_model&) = default; // Functions that will use the agent @@ -71,7 +74,7 @@ namespace ) const { auto best = numeric_limits::lowest(); - auto best_indexes = std::vector(); + std::vector best_indexes; for(auto i = 0; i < num_actions; i++) { @@ -133,6 +136,9 @@ namespace const state_type& state ) const { return is_success(state) || is_failure(state); } + const std::default_random_engine& get_generator( + ) const { return gen; } + private: bool out_of_bounds( @@ -161,9 +167,35 @@ namespace return result; } + template < int H, int W> + friend void serialize(const cliff_model& item, std::ostream& out); + + template < int H, int W> + friend void deserialize(cliff_model& item, std::istream& in); + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; + template < int height, int width > 
+ inline void serialize(const cliff_model& item, std::ostream& out) + { + int version = 1; + dlib::serialize(version, out); + dlib::serialize(item.gen, out); + } + + template < int height, int width > + inline void deserialize(cliff_model& item, std::istream& in) + { + int version = 0; + dlib::deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing reinforcement learning test model object."); + + item = cliff_model(); + dlib::deserialize(item.gen, in); + } + template < int height, int width, @@ -203,6 +235,58 @@ namespace DLIB_TEST(r > 0); } + void policy_serialization_test(){ + cliff_model<3, 5> model(8); + policy gp(model), gres; + + for(uint i = 0u; i < gp.get_weights().size(); i++) + gp.get_weights()(i) = i; + + ostringstream sout; + serialize(gp, sout); + istringstream sin(sout.str()); + + deserialize(gres, sin); + dlog << LINFO << "policy serializing: " << + (gp.get_weights() == gres.get_weights() && gp.get_model().get_generator() == gres.get_model().get_generator()); + DLIB_TEST(gp.get_weights() == gres.get_weights() && gp.get_model().get_generator() == gres.get_model().get_generator()); + } + + void epsilon_policy_serialization_test(){ + cliff_model<3, 5> model(11); + policy gp(model); + + for(uint i = 0u; i < gp.get_weights().size(); i++) + gp.get_weights()(i) = i; + + epsilon_policy ep(0.3, gp); + auto eres = ep; // epsilon_policy is not default constructible + + auto state = ep.get_model().initial_state(); + for(uint i = 0u; i < 3; i++) + state = ep.get_model().step(state, ep(state)); + + ostringstream sout; + serialize(ep, sout); + istringstream sin(sout.str()); + + auto cstate = state; + for(uint i = 0; i < 5; i++) + state = ep.get_model().step(state, ep(state)); + + deserialize(eres, sin); + for(uint i = 0; i < 5; i++) + cstate = eres.get_model().step(cstate, eres(cstate)); + + dlog << LINFO << "epsilon policy serializing: " << + (ep.get_weights() == eres.get_weights() && ep.get_generator() == eres.get_generator() && + ep.get_model().get_generator() == eres.get_model().get_generator() ? "True" : "False"); + dlog << LINFO << "same state stepping after serializing: " << (state == cstate ? 
"True" : "False"); + DLIB_TEST(state == cstate); + DLIB_TEST(ep.get_weights() == eres.get_weights() && ep.get_generator() == eres.get_generator() && + ep.get_model().get_generator() == eres.get_model().get_generator()); + } + class rl_tester : public tester { public: @@ -228,6 +312,9 @@ namespace test<5,5,sarsa>(); test<4,7,sarsa>(); test<5,10,sarsa>(); + + policy_serialization_test(); + epsilon_policy_serialization_test(); } }; diff --git a/examples/qlearning_sarsa_ex.cpp b/examples/qlearning_sarsa_ex.cpp index 53a10fdf1f..90ca0d3b0b 100644 --- a/examples/qlearning_sarsa_ex.cpp +++ b/examples/qlearning_sarsa_ex.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -73,6 +74,8 @@ class cliff_model int seed = 0 ) : gen(seed){} + cliff_model(const cliff_model&) = default; + cliff_model& operator=(const cliff_model&) = default; // Functions that will use the agent @@ -106,7 +109,7 @@ class cliff_model ) const { auto best = numeric_limits::lowest(); - auto best_indexes = std::vector(); + std::vector best_indexes; for(auto i = 0; i < num_actions; i++) { @@ -196,9 +199,33 @@ class cliff_model return result; } + // for accessing to gen on serialization functions (alternatively we could define a getter method) + template < int H, int W> friend void serialize(const cliff_model& item, std::ostream& out); + template < int H, int W> friend void deserialize(cliff_model& item, std::istream& in); + mutable default_random_engine gen; //mutable because it doesn't changes the model state }; +template < int height, int width > +inline void serialize(const cliff_model& item, std::ostream& out) +{ + int version = 1; + dlib::serialize(version, out); + dlib::serialize(item.gen, out); +} + +template < int height, int width > +inline void deserialize(cliff_model& item, std::istream& in) +{ + int version = 0; + dlib::deserialize(version, in); + if (version != 1) + throw serialization_error("Unexpected version found while deserializing reinforcement learning test model object."); + + item = cliff_model(); + dlib::deserialize(item.gen, in); +} + // This is just a helper function to pretty-print the agent's state. 
template < typename model_t From b6f1fde280a143981a7d83965ec3cf4c083d49f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 4 Jun 2018 01:40:57 +0200 Subject: [PATCH 12/14] changed uint type to int --- dlib/test/reinforcement_learning.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index 7c708a49e7..ffb9bd3b15 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -239,7 +239,7 @@ namespace cliff_model<3, 5> model(8); policy gp(model), gres; - for(uint i = 0u; i < gp.get_weights().size(); i++) + for(int i = 0; i < gp.get_weights().size(); i++) gp.get_weights()(i) = i; ostringstream sout; @@ -256,14 +256,14 @@ namespace cliff_model<3, 5> model(11); policy gp(model); - for(uint i = 0u; i < gp.get_weights().size(); i++) + for(int i = 0; i < gp.get_weights().size(); i++) gp.get_weights()(i) = i; epsilon_policy ep(0.3, gp); auto eres = ep; // epsilon_policy is not default constructible auto state = ep.get_model().initial_state(); - for(uint i = 0u; i < 3; i++) + for(int i = 0; i < 3; i++) state = ep.get_model().step(state, ep(state)); ostringstream sout; @@ -271,11 +271,11 @@ namespace istringstream sin(sout.str()); auto cstate = state; - for(uint i = 0; i < 5; i++) + for(int i = 0; i < 5; i++) state = ep.get_model().step(state, ep(state)); deserialize(eres, sin); - for(uint i = 0; i < 5; i++) + for(int i = 0; i < 5; i++) cstate = eres.get_model().step(cstate, eres(cstate)); dlog << LINFO << "epsilon policy serializing: " << From eaa621a1c2dfdf257b320bd3aa3b8b7aee53cd51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Javaloy?= Date: Mon, 4 Jun 2018 10:47:19 +0200 Subject: [PATCH 13/14] Fixed "cannot call member function without object" --- dlib/control/approximate_linear_models.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 5322c2735c..9799f4b9a5 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -163,7 +163,7 @@ namespace dlib ) const { return underlying_policy; } auto get_model ( - ) const -> decltype(get_policy().get_model()) { return underlying_policy.get_model(); } + ) const -> decltype(this->get_policy().get_model()) { return underlying_policy.get_model(); } matrix& get_weights ( ) { return underlying_policy.get_weights(); } From e3f2d28a37199365c083f0b2ede30063fc98043a Mon Sep 17 00:00:00 2001 From: Davis King Date: Sat, 14 Mar 2020 19:35:00 -0400 Subject: [PATCH 14/14] cleanup and add some tests --- dlib/control.h | 6 ++-- dlib/control/approximate_linear_models.h | 2 +- dlib/serialize.h | 4 +-- dlib/test/reinforcement_learning.cpp | 8 ++--- dlib/test/serialize.cpp | 38 ++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 10 deletions(-) diff --git a/dlib/control.h b/dlib/control.h index 4e9c02878e..8f941244f5 100644 --- a/dlib/control.h +++ b/dlib/control.h @@ -1,13 +1,13 @@ // Copyright (C) 2015 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. 
-#ifndef DLIB_CONTROL_ -#define DLIB_CONTROL_ +#ifndef DLIB_CONTRoL_ +#define DLIB_CONTRoL_ #include "control/lspi.h" #include "control/mpc.h" #include "control/qlearning.h" #include "control/sarsa.h" -#endif // DLIB_CONTROL_ +#endif // DLIB_CONTRoL_ diff --git a/dlib/control/approximate_linear_models.h b/dlib/control/approximate_linear_models.h index 9799f4b9a5..760ee25da6 100644 --- a/dlib/control/approximate_linear_models.h +++ b/dlib/control/approximate_linear_models.h @@ -4,7 +4,7 @@ #define DLIB_APPROXIMATE_LINEAR_MODELS_Hh_ #include "approximate_linear_models_abstract.h" -#include +#include "../matrix.h" #include namespace dlib diff --git a/dlib/serialize.h b/dlib/serialize.h index 1dfaf9b713..cc6e2c0ec3 100644 --- a/dlib/serialize.h +++ b/dlib/serialize.h @@ -1561,7 +1561,7 @@ namespace dlib \ try{ serialize(ss.str(),out); } \ catch (serialization_error& e) \ - { throw serialization_error(e.info + "\n while serializing object of type std::default_random_engine"); } \ + { throw serialization_error(e.info + "\n while serializing object of type " + #T); } \ } \ \ inline void deserialize ( \ @@ -1572,7 +1572,7 @@ namespace dlib std::string str; \ try { deserialize(str,in); } \ catch (serialization_error& e) \ - { throw serialization_error(e.info + "\n while deserializing object of type std::default_random_engine"); } \ + { throw serialization_error(e.info + "\n while deserializing object of type " + #T); } \ \ std::stringstream ss(str); \ ss.setf(std::ios_base::dec, std::ios_base::basefield); \ diff --git a/dlib/test/reinforcement_learning.cpp b/dlib/test/reinforcement_learning.cpp index ffb9bd3b15..85e9fca88e 100644 --- a/dlib/test/reinforcement_learning.cpp +++ b/dlib/test/reinforcement_learning.cpp @@ -61,7 +61,7 @@ namespace // It's possible that the allowed actions differ among states. // In this case all movements are always allowed so we don't need to use state. 
action_type random_action( - const state_type& state + const state_type& /*state*/ ) const { uniform_int_distribution dist(0,num_actions-1); @@ -97,8 +97,8 @@ namespace // This functions gives the rewards, that is, tells the agent how good are its movements double reward( - const state_type &state, - const action_type &action, + const state_type &/*state*/, + const action_type &/*action*/, const state_type &new_state ) const { @@ -146,7 +146,7 @@ namespace const action_type& action ) const { - bool result; + bool result = false; switch(action) { diff --git a/dlib/test/serialize.cpp b/dlib/test/serialize.cpp index f8b3384b98..b951b76a4e 100644 --- a/dlib/test/serialize.cpp +++ b/dlib/test/serialize.cpp @@ -1050,6 +1050,43 @@ namespace } } +// ---------------------------------------------------------------------------------------- + + template + void test_std_generator() + { + T rnd; + + for (int i = 0; i < 10; ++i) + rnd(); + + std::stringstream ss; + const int val1 = 123; + const int val2 = 456; + dlib::serialize(val1, ss); + dlib::serialize(rnd, ss); + dlib::serialize(val2, ss); + + T rnd2; + int val1_read, val2_read; + dlib::deserialize(val1_read, ss); + dlib::deserialize(rnd2, ss); + dlib::deserialize(val2_read, ss); + + DLIB_TEST(val1_read == val1); + DLIB_TEST(val2_read == val2); + + for (int i = 0; i < 100; ++i) + DLIB_TEST(rnd() == rnd2()); + } + + void random_generators() + { + test_std_generator(); + test_std_generator(); + test_std_generator(); + } + // ---------------------------------------------------------------------------------------- class serialize_tester : public tester @@ -1078,6 +1115,7 @@ namespace test_array2d_and_matrix_serialization(); test_strings(); test_std_array(); + random_generators(); } } a;