Skip to content

Commit e6f7635

Browse files
committed
Changed layers in the A3C algorithm to match those in the DQN-derived algorithms. Added TestAgent.py, a script used for testing DQN-derived algorithms. Added README.md with a description of how to use the repository. Added A3C_test.py and A3C_no_lstm_test.py scripts that can be used for testing A3C models.
1 parent 91b3cc7 commit e6f7635

File tree

9 files changed

+763
-337
lines changed

9 files changed

+763
-337
lines changed

Agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def __init__(self, setup_dict=None):
127127
if not 'gamma' in setup_dict:
128128
setup_dict['gamma'] = 0.99
129129
if not 'update_freq' in setup_dict:
130-
setup_dict['update_freq'] = 4
130+
setup_dict['update_freq'] = 1
131131
if not 'log_filename' in setup_dict:
132132
setup_dict['log_filename'] = 'log.txt'
133133
if not 'MemoryType' in setup_dict:

AgentRunner.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,19 @@
1010

1111
if os.path.isfile('config.json'):
1212
setup_dict = json.loads(open('config.json').read())
13-
else:
14-
setup_dict['observing_frames'] = 25000
15-
setup_dict['replay_memory_size'] = 25000
16-
setup_dict['learning_rate'] = 1e-5
17-
setup_dict['start_eps'] = 0.7
18-
setup_dict['exploring_frames'] = 2000000
19-
setup_dict['saving_dir'] = "DDQN_AgentPrioritizedForgettingEpsGreedy"
20-
setup_dict['log_freq'] = 5
21-
setup_dict['MemoryType'] = 'MemoryPrioritizedForgetting'
22-
#setup_dict['MemoryType'] = 'PrioritizedExperienceReplayMemory'
23-
setup_dict['ExplorationStrategy'] = 'EpsilonGreedyExplorationStrategy'
24-
setup_dict['Agent'] = 'DDQN_Agent'
13+
setup_dict['observing_frames'] = 25000
14+
setup_dict['replay_memory_size'] = 25000
15+
setup_dict['learning_rate'] = 1e-4
16+
setup_dict['start_eps'] = 0.7
17+
setup_dict['exploring_frames'] = 2000000
18+
setup_dict['saving_dir'] = "DuelingDDQN_AgentPrioritizedForgettingEpsGreedy_2018_06_14"
19+
setup_dict['log_freq'] = 5
20+
setup_dict['MemoryType'] = 'MemoryPrioritizedForgetting'
21+
#setup_dict['MemoryType'] = 'PrioritizedExperienceReplayMemory'
22+
setup_dict['ExplorationStrategy'] = 'EpsilonGreedyExplorationStrategy'
23+
setup_dict['Agent'] = 'Dueling_DDQN_Agent'
24+
setup_dict['update_freq'] = 2
25+
setup_dict['tau'] = 0.005
2526
#agent = DuelingDDQNAgentPER.Dueling_DDQN_PER_Agent(setup_dict)
2627
agent = None
2728
if setup_dict['Agent'] == 'DQN_Agent':

Asynchronous/A3C.ipynb

Lines changed: 44 additions & 258 deletions
Large diffs are not rendered by default.

Asynchronous/A3C.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,22 @@
55
import skimage
66
from skimage import color, exposure, transform
77
import threading
8+
import os
89

910
IMG_WIDTH = 105
1011
IMG_HEIGHT = 80
11-
CNT_FRAMES = 1
1212
GLOBAL_SCOPE = 'global'
1313
VALUE_MODIFIER = 0.5
1414
POLICY_MODIFIER = 1
15-
ENTROPY_MODIFIER = 5*1e-2
15+
ENTROPY_MODIFIER = 1
1616
MAX_STEPS = 30
1717
DISCOUNT = 0.99
1818
ENV_NAME = 'BreakoutDeterministic-v4'
19-
#ENV_NAME = 'PongDeterministic-v4'
20-
#MAX_ITERATIONS = 100000000
2119
MAX_EP_LENGTH = 100000
22-
#MAX_LEARNING_TIME = 7 * 60 * 60 # 7 hours
2320
LEARNING_RATE = 1e-4
24-
CLIP_VALUE = 2.0
21+
CLIP_VALUE = 10.0
22+
SAVE_DIR = 'AgentResults'
23+
CNT_THREADS = 24
2524

2625
def process_frame(x_t, img_rows, img_cols):
2726
x_t = skimage.color.rgb2gray(x_t)
@@ -66,20 +65,18 @@ def __build_model(self):
6665
with tf.variable_scope(self.scope_name):
6766
weights_initializer = tf.contrib.layers.xavier_initializer_conv2d()
6867
bias_initializer = tf.zeros_initializer()
69-
self.X = tf.placeholder(shape=[None, IMG_WIDTH, IMG_HEIGHT, CNT_FRAMES], dtype=tf.float32, name='input')
70-
conv1 = tf.contrib.layers.conv2d(self.X, 32, 3, stride=2, activation_fn=tf.nn.relu, padding='SAME', \
68+
self.X = tf.placeholder(shape=[None, IMG_WIDTH, IMG_HEIGHT, 1], dtype=tf.float32, name='input')
69+
conv1 = tf.contrib.layers.conv2d(self.X, 32, 8, stride=4, activation_fn=tf.nn.relu, padding='VALID', \
7170
weights_initializer=weights_initializer, biases_initializer = bias_initializer,\
7271
scope='first_conv')
73-
mp1 = tf.contrib.layers.max_pool2d(conv1, 2, scope='first_mp')
74-
conv2 = tf.contrib.layers.conv2d(mp1, 32, 3, stride=2, activation_fn=tf.nn.relu, padding='SAME', \
72+
conv2 = tf.contrib.layers.conv2d(conv1, 64, 4, stride=2, activation_fn=tf.nn.relu, padding='VALID', \
7573
weights_initializer=weights_initializer, biases_initializer = bias_initializer,\
7674
scope='second_conv')
77-
mp2 = tf.contrib.layers.max_pool2d(conv2, 2, scope='second_mp')
78-
conv3 = tf.contrib.layers.conv2d(mp2, 64, 3, stride=2, activation_fn=tf.nn.relu, padding='SAME', \
75+
conv3 = tf.contrib.layers.conv2d(conv2, 64, 3, stride=1, activation_fn=tf.nn.relu, padding='VALID', \
7976
weights_initializer=weights_initializer, biases_initializer = bias_initializer,\
8077
scope='third_conv')
8178
flattened = tf.contrib.layers.flatten(conv3, scope='flatten')
82-
embedding = tf.contrib.layers.fully_connected(flattened, 256, activation_fn=tf.nn.relu, weights_initializer=tf.random_normal_initializer(stddev=0.02), biases_initializer=bias_initializer,\
79+
embedding = tf.contrib.layers.fully_connected(flattened, 512, activation_fn=tf.nn.relu, weights_initializer=tf.random_normal_initializer(stddev=0.02), biases_initializer=bias_initializer,\
8380
scope='fc_embed')
8481

8582
step_size = tf.shape(self.X)[:1]
@@ -117,8 +114,6 @@ def __build_model(self):
117114
self.actions_oh = tf.one_hot(self.actions, depth=self.action_size, dtype=tf.float32, name='actions_oh')
118115
self.target_values = tf.placeholder(shape=[None], dtype=tf.float32, name='target_vals')
119116
self.advantages = tf.placeholder(shape=[None], dtype=tf.float32, name='advantages')
120-
#print('adv shape', self.advantages.shape)
121-
#self.advantages = tf.subtract(tf.stop_gradient(self.value), self.target_values, name='advantage')
122117

123118
MIN_POLICY = 1e-8
124119
MAX_POLICY = 1.0 - MIN_POLICY
@@ -128,23 +123,20 @@ def __build_model(self):
128123
self.log_policy_for_action = tf.reduce_sum(tf.multiply(self.log_policy, self.actions_oh), axis=1, name='log_policy_for_action')
129124
self.value_loss = tf.reduce_mean(tf.square(self.value - self.target_values), name='value_loss')
130125
self.value_loss = self.value_loss * VALUE_MODIFIER
131-
#self.value_loss = self.value_loss - self.value_loss
132126
self.policy_loss = -tf.reduce_mean(tf.multiply(self.log_policy_for_action, self.advantages), name='policy_loss')
133127
self.policy_loss = self.policy_loss * POLICY_MODIFIER
134128
#entropy is E[-log p(X)] = -sum(p(x) * log p(x))
135129
self.entropy_beta = tf.get_variable('entropy_beta', shape=[],
136130
initializer=tf.constant_initializer(ENTROPY_MODIFIER), trainable=False)
137131
self.entropy_loss = -tf.reduce_mean(self.policy * -self.log_policy, name='entropy_loss')
138132
self.entropy_loss = self.entropy_loss * self.entropy_beta
139-
#self.entropy_loss = self.entropy_loss - self.entropy_loss
140133
self.loss = self.value_loss + \
141134
self.policy_loss + \
142135
self.entropy_loss
143136
#get locals
144137
local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope_name)
145138
#update locals
146139
grads = tf.gradients(self.loss, local_vars)
147-
#grads = [tf.clip_by_average_norm(grad, CLIP_VALUE) for grad in grads]
148140
grads, grad_norms = tf.clip_by_global_norm(grads, CLIP_VALUE)
149141
self.update_ops = update_target_graph(GLOBAL_SCOPE, self.scope_name)
150142
global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, GLOBAL_SCOPE)
@@ -160,10 +152,7 @@ def predict(self, sess, state, initial_lstm_state):
160152
}\
161153
)
162154
policy = policy.flatten()
163-
#print('cur policy', policy)
164155
prediction = np.random.choice(self.action_size, p=policy)
165-
#prediction = np.argmax(policy)
166-
#print('prediction', prediction)
167156
return prediction, final_lstm_state
168157

169158
def act(self, sess, state, initial_lstm_state):
@@ -209,7 +198,7 @@ def update_to_global(self, sess):
209198
class Worker:
210199
def __init__(self, agent):
211200
self.agent = agent
212-
self.summary_writer = tf.summary.FileWriter(self.agent.scope_name)
201+
self.summary_writer = tf.summary.FileWriter(os.path.join(SAVE_DIR, 'Tensorboard/' + self.agent.scope_name))
213202
def work(self, sess, optimizer, thread_lock):
214203

215204
global global_counter
@@ -234,14 +223,12 @@ def work(self, sess, optimizer, thread_lock):
234223
elapsed_time = time.time() - start_time
235224

236225
with sess.as_default(), sess.graph.as_default():
237-
while True:#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
226+
while True:
238227
self.agent.update_to_global(sess)
239228
if done or timestep > MAX_EP_LENGTH:
240229
last_rewards.append(episode_reward)
241230
last_frames.append(timestep)
242231
if episode_counter > 0 and episode_counter % 5 == 0:
243-
#print('for agent:', self.agent.scope_name)
244-
#print('at episode', episode_counter, 'episode reward is', episode_reward)
245232
if len(value_losses) > 0:
246233
summary = tf.Summary()
247234

@@ -321,9 +308,7 @@ def work(self, sess, optimizer, thread_lock):
321308
for reward in reversed(rewards):
322309
target_value = reward + DISCOUNT * target_value
323310
target_values.append(target_value)
324-
#for i in range(len(rewards)-1):
325-
# idx = len(rewards) - i - 1
326-
# target_values[idx-1] = rewards[idx-1] + DISCOUNT * target_values[idx]
311+
327312
states = np.vstack(states)
328313
actions = np.vstack(actions).ravel()
329314
target_values = np.vstack(target_values).ravel()
@@ -351,22 +336,20 @@ def work(self, sess, optimizer, thread_lock):
351336
worker_threads = []
352337

353338
env_global = EnvWrapper(ENV_NAME)
354-
#global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.AdamOptimizer())
355339
global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.GradientDescentOptimizer(LEARNING_RATE))
356340

357-
config = tf.ConfigProto()#device_count = {'GPU': 0})
341+
config = tf.ConfigProto()
358342
config.gpu_options.allow_growth=True
359343

360344
sess = tf.Session(config=config)
361-
writer = tf.summary.FileWriter("graph", sess.graph)
345+
writer = tf.summary.FileWriter(os.path.join(SAVE_DIR, "Tensorboard/graph"), sess.graph)
362346

363347
print('saved graph')
364348

365349
def global_saving_thread(agent, sess):
366350

367351
global global_counter
368352

369-
MAX_MODELS = 1000
370353
cnt_model = 0
371354

372355
with sess.as_default(), sess.graph.as_default():
@@ -376,24 +359,24 @@ def global_saving_thread(agent, sess):
376359

377360
elapsed_time = time.time() - start_time
378361

379-
#save model every 15 minutes
380-
while True:#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
381-
print("Current model save name:", 'model_' + str(cnt_model % MAX_MODELS))
382-
save_path = saver.save(sess, "models/model_" + str(cnt_model % MAX_MODELS) + ".ckpt")
362+
cnt_minutes = 30
363+
#save model every 30 minutes
364+
while True:
365+
print("Current model save name:", 'model_' + str(cnt_model))
366+
save_path = saver.save(sess, os.path.join(SAVE_DIR, "models/model_" + str(cnt_model) + ".ckpt"))
383367
print("Current global iteration", global_counter)
384368
cnt_model += 1
385-
time.sleep(3 * 60 * 60)
369+
time.sleep(cnt_minutes * 60)
386370
print("Learning time was", int(elapsed_time/60/60), "hours", int((elapsed_time - int(elapsed_time/60/60)*60*60)/60), "minutes")
387371

388-
cnt_threads = 24
389372
thread_lock = threading.Lock()
390373

391374
def worker_fun(worker, sess, optimizer, thread_lock):
392375
worker.work(sess, optimizer, thread_lock)
393376

394377
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
395378

396-
for i in range(cnt_threads):
379+
for i in range(CNT_THREADS):
397380
env = EnvWrapper(ENV_NAME)
398381
worker = Worker(Agent(env, 'local' + str(i), optimizer))
399382
t = threading.Thread(target=worker_fun, args=(worker, sess, optimizer, thread_lock))

0 commit comments

Comments
 (0)