Skip to content

Commit b890ae5

Browse files
committed
Added continuous action variable length pendulum OpenAI Gym environment
TODO: * refine reward for the environments * refine simple learning algorithms
1 parent e7bda1d commit b890ae5

File tree

7 files changed

+491
-22
lines changed

7 files changed

+491
-22
lines changed

OpenAI Gym/openAI_variableLengthPendulum.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import matplotlib.pyplot as plt
2626

2727
import gym
28+
import time
2829
import variable_pendulum
2930

3031
env = gym.make('variable_pendulum-v0')
@@ -49,6 +50,8 @@
4950
print("L_dot (m/s).:".ljust(j,d), '{:+8.3f}'.format(observation[3]))
5051
print("Reward:".ljust(j,d), '{:+8.3f}'.format(reward))
5152

53+
# if episode finishes before full time range, notify
5254
if done:
53-
print("Episode finished after {} timesteps".format(t+1))
55+
print("\r\nEpisode finished after {} timesteps".format(t+1))
56+
time.sleep(1)
5457
break
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# DDPG (Deep Deterministic Policy Gradient) training script for the
# continuous-action variable length pendulum environment, using keras-rl.
# NOTE(review): reconstructed formatting — the original layout was lost in extraction.
import numpy as np
import gym
import variable_pendulum_continuous  # registers 'variable_pendulum_continuous-v0' with gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, merge
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


ENV_NAME = 'variable_pendulum_continuous-v0'

# ENV_NAME = 'Pendulum-v0'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
# Actor network: maps an observation window to a continuous action (linear output).
# Input shape is (window_length,) + obs shape; window_length=1 below in SequentialMemory.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
print(actor.summary())

# Critic network: Q(observation, action) -> scalar, built with the functional API.
# NOTE(review): `merge(..., mode='concat')` is the Keras 1 API; Keras 2 replaced it
# with the `Concatenate` layer — confirm the installed Keras version before running.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = merge([action_input, flattened_observation], mode='concat')
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(input=[action_input, observation_input], output=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
# Ornstein-Uhlenbeck noise gives temporally-correlated exploration for continuous actions.
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1.)
# agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
#                   memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
#                   random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#! /usr/bin/env python
2+
3+
###############################################################################
4+
# openAI_variableLengthPendulum_learning.py
5+
#
6+
# Simple Q-learning setup for the variable length pendulum
7+
#
8+
# Extended/reworked from:
9+
# https://keon.io/deep-q-learning/
10+
# https://github.com/keon/deep-q-learning
11+
#
12+
# Also see:
13+
# https://github.com/matthiasplappert/keras-rl
14+
#
15+
# NOTE: Any plotting is set up for output, not viewing on screen.
16+
# So, it will likely be ugly on screen. The saved PDFs should look
17+
# better.
18+
#
19+
# Created: 07/07/17
20+
# - Joshua Vaughan
21+
# - joshua.vaughan@louisiana.edu
22+
# - http://www.ucs.louisiana.edu/~jev9637
23+
#
24+
# Modified:
25+
# *
26+
#
27+
# TODO:
28+
# *
29+
###############################################################################
30+
31+
import numpy as np
32+
import matplotlib.pyplot as plt
33+
34+
import gym
35+
import random
36+
import time
37+
import variable_pendulum
38+
39+
from collections import deque
40+
from keras.models import Sequential
41+
from keras.layers import Dense
42+
from keras.optimizers import Adam
43+
44+
EPISODES = 1000
45+
46+
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    Follows the structure of https://keon.io/deep-q-learning/ (see file header).
    NOTE(review): indentation reconstructed from an unindented extraction — in
    particular the epsilon decay is placed after the replay loop, matching the
    referenced source; verify against the original file.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size      # dimension of the flat observation vector
        self.action_size = action_size    # number of discrete actions
        self.memory = deque(maxlen=2000)  # replay buffer; oldest transitions dropped
        self.gamma = 0.95                 # discount factor for future rewards
        self.epsilon = 1.0                # initial exploration rate (fully random)
        self.epsilon_min = 0.01           # exploration floor
        self.epsilon_decay = 0.995        # multiplicative decay applied per replay() call
        self.learning_rate = 0.001        # Adam learning rate
        self.model = self._build_model()

    def _build_model(self):
        """Build the Q-value approximator: two 24-unit ReLU layers, linear output per action."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one (s, a, r, s', done) transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index: random with probability epsilon, else argmax-Q."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the model on a random minibatch from memory, then decay epsilon.

        For each sampled transition the TD target is r (terminal) or
        r + gamma * max_a' Q(s', a') (non-terminal), written into the
        predicted Q-vector at the taken action before a one-epoch fit.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * \
                         np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
88+
89+
90+
if __name__ == "__main__":
    # Train a DQNAgent on the variable length pendulum environment.
    env = gym.make('variable_pendulum-v0')

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-master.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        # Keras expects a batch dimension: reshape to (1, state_size).
        # Fixed: was hard-coded [1, 4]; use the state_size computed above so the
        # script stays correct if the environment's observation size changes.
        state = np.reshape(state, [1, state_size])

        for time_t in range(500):
            if e % 100 == 0: # render every 100th episode - slow
                env.render()

            action = agent.act(state)

            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                print("episode: {}/{}, score: {}"
                      .format(e, EPISODES, time_t))
                break

        # Learn from a random minibatch once enough transitions are stored.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# DQN training script for the discrete-action variable length pendulum
# environment, using the keras-rl DQNAgent.
# NOTE(review): reconstructed formatting — the original layout was lost in extraction.
import numpy as np
import gym
import variable_pendulum  # registers 'variable_pendulum-v0' with gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'variable_pendulum-v0'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
# Input shape is (window_length,) + obs shape; window_length=1 below in SequentialMemory.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
# Boltzmann policy samples actions in proportion to exp(Q) rather than epsilon-greedy.
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)

OpenAI Gym/variable_pendulum/variable_pendulum.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
###############################################################################
2323

2424

25-
2625
import gym
2726
from gym import spaces
2827
from gym.utils import seeding
@@ -39,24 +38,22 @@ class VariablePendulumEnv(gym.Env):
3938
}
4039

4140
# actions available, hoist down, do nothing, hoist up
42-
MAX_CABLE_ACCEL = 1.0
41+
MAX_CABLE_ACCEL = 0.25
4342
AVAIL_CABLE_ACCEL = [-MAX_CABLE_ACCEL, 0, MAX_CABLE_ACCEL]
4443

4544
def __init__(self):
4645
self.gravity = 9.8 # accel. due to gravity (m/s^2)
4746
self.masspend = 1.0 # mass of the pendulum point mass (kg)
4847
self.max_cable_accel = 0.25 # maximum acceleration of cable (m/s^2)
48+
self.counter = 0 # counter to trial duration
4949
self.tau = 0.02 # seconds between state updates
5050

5151

5252
# Define thesholds for failing episode
53-
self.theta_threshold = 45 * np.pi / 360 # +/- 45 degree limit (rad)
53+
self.theta_threshold = 45 * np.pi / 180 # +/- 45 degree limit (rad)
5454
self.l_max_threshold = 3.0 # max cable length (m)
5555
self.l_min_threshold = 0.5 # min cable length (m)
5656

57-
# The action space is continuous inputs between
58-
#self.action_space = spaces.Box(-self.max_cable_accel, self.max_cable_accel, shape = (1,))
59-
6057
# This action space is just hoist down, do nothing, hoist up
6158
self.action_space = spaces.Discrete(3)
6259

@@ -84,12 +81,11 @@ def _seed(self, seed=None):
8481

8582
def _step(self, action):
8683
assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
87-
84+
self.counter = self.counter + 1
8885
state = self.state
8986
theta, theta_dot, l, l_dot = state
90-
9187
cable_accel = self.AVAIL_CABLE_ACCEL[action]
92-
88+
9389
theta_ddot = -l_dot/l * theta_dot - self.gravity/l * np.sin(theta)
9490
l_ddot = cable_accel
9591

@@ -103,24 +99,34 @@ def _step(self, action):
10399
done = l > self.l_max_threshold \
104100
or l < self.l_min_threshold \
105101
or theta < -self.theta_threshold \
106-
or theta > self.theta_threshold
102+
or theta > self.theta_threshold \
103+
or self.counter > 500 \
104+
or (np.abs(theta) < np.pi/180 and np.abs(theta_dot) < np.pi/180)
107105

108106
done = bool(done)
109107

110-
if not done:
111-
reward = -np.abs(theta) # a negative award for nonzero angles
112-
else:
113-
# if self.steps_beyond_done == 0:
114-
# logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
115-
# self.steps_beyond_done += 1
116-
reward = 0.0
108+
# if not done:
109+
# reward = 100.0 / (theta * 180/np.pi)**2
110+
#
111+
# if (np.abs(theta) < 2*np.pi/180):
112+
# reward = reward * 10
113+
# if (np.abs(theta_dot) < np.pi/180):
114+
# reward = reward * 10
115+
#
116+
# else:
117+
# reward = 0.0
118+
reward = -(theta**2 + 0.1 * theta_dot**2 + 0.001 * cable_accel**2)
117119

118120
return np.array(self.state), reward, done, {}
119121

120122
def _reset(self):
121123
# self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
122-
self.state = np.array([10*np.pi/180, 0, 2, 0]) + self.np_random.uniform(low=-0.1, high=0.1, size=(4,))
123-
self.steps_beyond_done = None
124+
# TODO: 07/07/17 - Probably need more randomness in initial conditions
125+
self.state = np.array([self.np_random.uniform(low=-np.pi/12, high=np.pi/12),
126+
0, #self.np_random.uniform(low=-0.5*np.pi/6, high=0.5*np.pi/6),
127+
self.np_random.uniform(low=1.5*self.l_min_threshold, high=0.5*self.l_max_threshold),
128+
0])#self.np_random.uniform(low=-0.5, high=0.5)])
129+
self.counter = 0
124130
return np.array(self.state)
125131

126132
def _render(self, mode='human', close=False):
@@ -152,14 +158,14 @@ def _render(self, mode='human', close=False):
152158
self.cable = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
153159
self.cabletrans = rendering.Transform(translation=(screen_width/2, cable_pin))
154160
self.cable.add_attr(self.cabletrans)
155-
self.cable.set_color(0.25,0.25,0.25) # darj gray
161+
self.cable.set_color(0.25,0.25,0.25) # dark gray
156162
self.viewer.add_geom(self.cable)
157163

158164
# the payload is a circle.
159165
self.payload = rendering.make_circle(payload_size)
160166
self.payloadtrans = rendering.Transform(translation=(screen_width/2, cable_pin-l*scale))
161167
self.payload.add_attr(self.payloadtrans)
162-
self.payload.set_color(0.5,0.5,0.5) # dark gray
168+
self.payload.set_color(0.5,0.5,0.5) # mid gray
163169
self.viewer.add_geom(self.payload)
164170

165171

0 commit comments

Comments
 (0)