Skip to content

Commit b890ae5

Browse files
committed
Added continuous action variable length pendulum OpenAI Gym environment
TODO: * refine reward for the environments * refine simple learning algorithms
1 parent e7bda1d commit b890ae5

File tree

7 files changed

+491
-22
lines changed

7 files changed

+491
-22
lines changed

OpenAI Gym/openAI_variableLengthPendulum.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import matplotlib.pyplot as plt
2626

2727
import gym
28+
import time
2829
import variable_pendulum
2930

3031
env = gym.make('variable_pendulum-v0')
@@ -49,6 +50,8 @@
4950
print("L_dot (m/s).:".ljust(j,d), '{:+8.3f}'.format(observation[3]))
5051
print("Reward:".ljust(j,d), '{:+8.3f}'.format(reward))
5152

53+
# if episode finishes before full time range, notify
5254
if done:
53-
print("Episode finished after {} timesteps".format(t+1))
55+
print("\r\nEpisode finished after {} timesteps".format(t+1))
56+
time.sleep(1)
5457
break
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# DDPG (Deep Deterministic Policy Gradient) training script for the
# continuous-action variable length pendulum environment, using keras-rl.
# NOTE(review): reconstructed formatting — the original layout was lost in extraction.
import numpy as np
import gym
import variable_pendulum_continuous  # registers 'variable_pendulum_continuous-v0' with gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, merge
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


ENV_NAME = 'variable_pendulum_continuous-v0'

# ENV_NAME = 'Pendulum-v0'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
# Actor network: maps an observation window to a continuous action (linear output).
# Input shape is (window_length,) + obs shape; window_length=1 below in SequentialMemory.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(nb_actions))
actor.add(Activation('linear'))
print(actor.summary())

# Critic network: Q(observation, action) -> scalar, built with the functional API.
# NOTE(review): `merge(..., mode='concat')` is the Keras 1 API; Keras 2 replaced it
# with the `Concatenate` layer — confirm the installed Keras version before running.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = merge([action_input, flattened_observation], mode='concat')
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(input=[action_input, observation_input], output=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
# Ornstein-Uhlenbeck noise gives temporally-correlated exploration for continuous actions.
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1.)
# agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
#                   memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
#                   random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#! /usr/bin/env python
2+
3+
###############################################################################
4+
# openAI_variableLengthPendulum_learning.py
5+
#
6+
# Simple Q-learning setup for the variable length pendulum
7+
#
8+
# Extended/reworked from:
9+
# https://keon.io/deep-q-learning/
10+
# https://github.com/keon/deep-q-learning
11+
#
12+
# Also see:
13+
# https://github.com/matthiasplappert/keras-rl
14+
#
15+
# NOTE: Any plotting is set up for output, not viewing on screen.
16+
# So, it will likely be ugly on screen. The saved PDFs should look
17+
# better.
18+
#
19+
# Created: 07/07/17
20+
# - Joshua Vaughan
21+
# - joshua.vaughan@louisiana.edu
22+
# - http://www.ucs.louisiana.edu/~jev9637
23+
#
24+
# Modified:
25+
# *
26+
#
27+
# TODO:
28+
# *
29+
###############################################################################
30+
31+
import numpy as np
32+
import matplotlib.pyplot as plt
33+
34+
import gym
35+
import random
36+
import time
37+
import variable_pendulum
38+
39+
from collections import deque
40+
from keras.models import Sequential
41+
from keras.layers import Dense
42+
from keras.optimizers import Adam
43+
44+
EPISODES = 1000
45+
46+
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    Follows the structure of https://keon.io/deep-q-learning/ (see file header).
    NOTE(review): indentation reconstructed from an unindented extraction — in
    particular the epsilon decay is placed after the replay loop, matching the
    referenced source; verify against the original file.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size      # dimension of the flat observation vector
        self.action_size = action_size    # number of discrete actions
        self.memory = deque(maxlen=2000)  # replay buffer; oldest transitions dropped
        self.gamma = 0.95                 # discount factor for future rewards
        self.epsilon = 1.0                # initial exploration rate (fully random)
        self.epsilon_min = 0.01           # exploration floor
        self.epsilon_decay = 0.995        # multiplicative decay applied per replay() call
        self.learning_rate = 0.001        # Adam learning rate
        self.model = self._build_model()

    def _build_model(self):
        """Build the Q-value approximator: two 24-unit ReLU layers, linear output per action."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one (s, a, r, s', done) transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index: random with probability epsilon, else argmax-Q."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the model on a random minibatch from memory, then decay epsilon.

        For each sampled transition the TD target is r (terminal) or
        r + gamma * max_a' Q(s', a') (non-terminal), written into the
        predicted Q-vector at the taken action before a one-epoch fit.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * \
                         np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
88+
89+
90+
if __name__ == "__main__":
    # Train a DQNAgent on the variable length pendulum environment.
    env = gym.make('variable_pendulum-v0')

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-master.h5")
    done = False
    batch_size = 32

    for e in range(EPISODES):
        state = env.reset()
        # Keras expects a batch dimension: reshape to (1, state_size).
        # Fixed: was hard-coded [1, 4]; use the state_size computed above so the
        # script stays correct if the environment's observation size changes.
        state = np.reshape(state, [1, state_size])

        for time_t in range(500):
            if e % 100 == 0: # render every 100th episode - slow
                env.render()

            action = agent.act(state)

            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.remember(state, action, reward, next_state, done)
            state = next_state

            if done:
                print("episode: {}/{}, score: {}"
                      .format(e, EPISODES, time_t))
                break

        # Learn from a random minibatch once enough transitions are stored.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# DQN training script for the discrete-action variable length pendulum
# environment, using the keras-rl DQNAgent.
# NOTE(review): reconstructed formatting — the original layout was lost in extraction.
import numpy as np
import gym
import variable_pendulum  # registers 'variable_pendulum-v0' with gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'variable_pendulum-v0'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
# Input shape is (window_length,) + obs shape; window_length=1 below in SequentialMemory.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
# Boltzmann policy samples actions in proportion to exp(Q) rather than epsilon-greedy.
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)

OpenAI Gym/variable_pendulum/variable_pendulum.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
###############################################################################
2323

2424

25-
2625
import gym
2726
from gym import spaces
2827
from gym.utils import seeding
@@ -39,24 +38,22 @@ class VariablePendulumEnv(gym.Env):
3938
}
4039

4140
# actions available, hoist down, do nothing, hoist up
42-
MAX_CABLE_ACCEL = 1.0
41+
MAX_CABLE_ACCEL = 0.25
4342
AVAIL_CABLE_ACCEL = [-MAX_CABLE_ACCEL, 0, MAX_CABLE_ACCEL]
4443

4544
def __init__(self):
4645
self.gravity = 9.8 # accel. due to gravity (m/s^2)
4746
self.masspend = 1.0 # mass of the pendulum point mass (kg)
4847
self.max_cable_accel = 0.25 # maximum acceleration of cable (m/s^2)
48+
self.counter = 0 # counter to trial duration
4949
self.tau = 0.02 # seconds between state updates
5050

5151

5252
# Define thesholds for failing episode
53-
self.theta_threshold = 45 * np.pi / 360 # +/- 45 degree limit (rad)
53+
self.theta_threshold = 45 * np.pi / 180 # +/- 45 degree limit (rad)
5454
self.l_max_threshold = 3.0 # max cable length (m)
5555
self.l_min_threshold = 0.5 # min cable length (m)
5656

57-
# The action space is continuous inputs between
58-
#self.action_space = spaces.Box(-self.max_cable_accel, self.max_cable_accel, shape = (1,))
59-
6057
# This action space is just hoist down, do nothing, hoist up
6158
self.action_space = spaces.Discrete(3)
6259

@@ -84,12 +81,11 @@ def _seed(self, seed=None):
8481

8582
def _step(self, action):
8683
assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
87-
84+
self.counter = self.counter + 1
8885
state = self.state
8986
theta, theta_dot, l, l_dot = state
90-
9187
cable_accel = self.AVAIL_CABLE_ACCEL[action]
92-
88+
9389
theta_ddot = -l_dot/l * theta_dot - self.gravity/l * np.sin(theta)
9490
l_ddot = cable_accel
9591

@@ -103,24 +99,34 @@ def _step(self, action):
10399
done = l > self.l_max_threshold \
104100
or l < self.l_min_threshold \
105101
or theta < -self.theta_threshold \
106-
or theta > self.theta_threshold
102+
or theta > self.theta_threshold \
103+
or self.counter > 500 \
104+
or (np.abs(theta) < np.pi/180 and np.abs(theta_dot) < np.pi/180)
107105

108106
done = bool(done)
109107

110-
if not done:
111-
reward = -np.abs(theta) # a negative award for nonzero angles
112-
else:
113-
# if self.steps_beyond_done == 0:
114-
# logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
115-
# self.steps_beyond_done += 1
116-
reward = 0.0
108+
# if not done:
109+
# reward = 100.0 / (theta * 180/np.pi)**2
110+
#
111+
# if (np.abs(theta) < 2*np.pi/180):
112+
# reward = reward * 10
113+
# if (np.abs(theta_dot) < np.pi/180):
114+
# reward = reward * 10
115+
#
116+
# else:
117+
# reward = 0.0
118+
reward = -(theta**2 + 0.1 * theta_dot**2 + 0.001 * cable_accel**2)
117119

118120
return np.array(self.state), reward, done, {}
119121

120122
def _reset(self):
121123
# self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
122-
self.state = np.array([10*np.pi/180, 0, 2, 0]) + self.np_random.uniform(low=-0.1, high=0.1, size=(4,))
123-
self.steps_beyond_done = None
124+
# TODO: 07/07/17 - Probably need more randomness in initial conditions
125+
self.state = np.array([self.np_random.uniform(low=-np.pi/12, high=np.pi/12),
126+
0, #self.np_random.uniform(low=-0.5*np.pi/6, high=0.5*np.pi/6),
127+
self.np_random.uniform(low=1.5*self.l_min_threshold, high=0.5*self.l_max_threshold),
128+
0])#self.np_random.uniform(low=-0.5, high=0.5)])
129+
self.counter = 0
124130
return np.array(self.state)
125131

126132
def _render(self, mode='human', close=False):
@@ -152,14 +158,14 @@ def _render(self, mode='human', close=False):
152158
self.cable = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
153159
self.cabletrans = rendering.Transform(translation=(screen_width/2, cable_pin))
154160
self.cable.add_attr(self.cabletrans)
155-
self.cable.set_color(0.25,0.25,0.25) # darj gray
161+
self.cable.set_color(0.25,0.25,0.25) # dark gray
156162
self.viewer.add_geom(self.cable)
157163

158164
# the payload is a circle.
159165
self.payload = rendering.make_circle(payload_size)
160166
self.payloadtrans = rendering.Transform(translation=(screen_width/2, cable_pin-l*scale))
161167
self.payload.add_attr(self.payloadtrans)
162-
self.payload.set_color(0.5,0.5,0.5) # dark gray
168+
self.payload.set_color(0.5,0.5,0.5) # mid gray
163169
self.viewer.add_geom(self.payload)
164170

165171

0 commit comments

Comments
 (0)