55import skimage
66from skimage import color , exposure , transform
77import threading
8+ import os
89
910IMG_WIDTH = 105
1011IMG_HEIGHT = 80
11- CNT_FRAMES = 1
1212GLOBAL_SCOPE = 'global'
1313VALUE_MODIFIER = 0.5
1414POLICY_MODIFIER = 1
15- ENTROPY_MODIFIER = 5 * 1e-2
15+ ENTROPY_MODIFIER = 1
1616MAX_STEPS = 30
1717DISCOUNT = 0.99
1818ENV_NAME = 'BreakoutDeterministic-v4'
19- #ENV_NAME = 'PongDeterministic-v4'
20- #MAX_ITERATIONS = 100000000
2119MAX_EP_LENGTH = 100000
22- #MAX_LEARNING_TIME = 7 * 60 * 60 # 7 hours
2320LEARNING_RATE = 1e-4
24- CLIP_VALUE = 2.0
21+ CLIP_VALUE = 10.0
22+ SAVE_DIR = 'AgentResults'
23+ CNT_THREADS = 24
2524
2625def process_frame (x_t , img_rows , img_cols ):
2726 x_t = skimage .color .rgb2gray (x_t )
@@ -66,20 +65,18 @@ def __build_model(self):
6665 with tf .variable_scope (self .scope_name ):
6766 weights_initializer = tf .contrib .layers .xavier_initializer_conv2d ()
6867 bias_initializer = tf .zeros_initializer ()
69- self .X = tf .placeholder (shape = [None , IMG_WIDTH , IMG_HEIGHT , CNT_FRAMES ], dtype = tf .float32 , name = 'input' )
70- conv1 = tf .contrib .layers .conv2d (self .X , 32 , 3 , stride = 2 , activation_fn = tf .nn .relu , padding = 'SAME ' , \
68+ self .X = tf .placeholder (shape = [None , IMG_WIDTH , IMG_HEIGHT , 1 ], dtype = tf .float32 , name = 'input' )
69+ conv1 = tf .contrib .layers .conv2d (self .X , 32 , 8 , stride = 4 , activation_fn = tf .nn .relu , padding = 'VALID ' , \
7170 weights_initializer = weights_initializer , biases_initializer = bias_initializer ,\
7271 scope = 'first_conv' )
73- mp1 = tf .contrib .layers .max_pool2d (conv1 , 2 , scope = 'first_mp' )
74- conv2 = tf .contrib .layers .conv2d (mp1 , 32 , 3 , stride = 2 , activation_fn = tf .nn .relu , padding = 'SAME' , \
72+ conv2 = tf .contrib .layers .conv2d (conv1 , 64 , 4 , stride = 2 , activation_fn = tf .nn .relu , padding = 'VALID' , \
7573 weights_initializer = weights_initializer , biases_initializer = bias_initializer ,\
7674 scope = 'second_conv' )
77- mp2 = tf .contrib .layers .max_pool2d (conv2 , 2 , scope = 'second_mp' )
78- conv3 = tf .contrib .layers .conv2d (mp2 , 64 , 3 , stride = 2 , activation_fn = tf .nn .relu , padding = 'SAME' , \
75+ conv3 = tf .contrib .layers .conv2d (conv2 , 64 , 3 , stride = 1 , activation_fn = tf .nn .relu , padding = 'VALID' , \
7976 weights_initializer = weights_initializer , biases_initializer = bias_initializer ,\
8077 scope = 'third_conv' )
8178 flattened = tf .contrib .layers .flatten (conv3 , scope = 'flatten' )
82- embedding = tf .contrib .layers .fully_connected (flattened , 256 , activation_fn = tf .nn .relu , weights_initializer = tf .random_normal_initializer (stddev = 0.02 ), biases_initializer = bias_initializer ,\
79+ embedding = tf .contrib .layers .fully_connected (flattened , 512 , activation_fn = tf .nn .relu , weights_initializer = tf .random_normal_initializer (stddev = 0.02 ), biases_initializer = bias_initializer ,\
8380 scope = 'fc_embed' )
8481
8582 step_size = tf .shape (self .X )[:1 ]
@@ -117,8 +114,6 @@ def __build_model(self):
117114 self .actions_oh = tf .one_hot (self .actions , depth = self .action_size , dtype = tf .float32 , name = 'actions_oh' )
118115 self .target_values = tf .placeholder (shape = [None ], dtype = tf .float32 , name = 'target_vals' )
119116 self .advantages = tf .placeholder (shape = [None ], dtype = tf .float32 , name = 'advantages' )
120- #print('adv shape', self.advantages.shape)
121- #self.advantages = tf.subtract(tf.stop_gradient(self.value), self.target_values, name='advantage')
122117
123118 MIN_POLICY = 1e-8
124119 MAX_POLICY = 1.0 - MIN_POLICY
@@ -128,23 +123,20 @@ def __build_model(self):
128123 self .log_policy_for_action = tf .reduce_sum (tf .multiply (self .log_policy , self .actions_oh ), axis = 1 , name = 'log_policy_for_action' )
129124 self .value_loss = tf .reduce_mean (tf .square (self .value - self .target_values ), name = 'value_loss' )
130125 self .value_loss = self .value_loss * VALUE_MODIFIER
131- #self.value_loss = self.value_loss - self.value_loss
132126 self .policy_loss = - tf .reduce_mean (tf .multiply (self .log_policy_for_action , self .advantages ), name = 'policy_loss' )
133127 self .policy_loss = self .policy_loss * POLICY_MODIFIER
134128 #entropy is H = E[-log p(X)] = -sum(p(x) * log(p(x)))
135129 self .entropy_beta = tf .get_variable ('entropy_beta' , shape = [],
136130 initializer = tf .constant_initializer (ENTROPY_MODIFIER ), trainable = False )
137131 self .entropy_loss = - tf .reduce_mean (self .policy * - self .log_policy , name = 'entropy_loss' )
138132 self .entropy_loss = self .entropy_loss * self .entropy_beta
139- #self.entropy_loss = self.entropy_loss - self.entropy_loss
140133 self .loss = self .value_loss + \
141134 self .policy_loss + \
142135 self .entropy_loss
143136 #get locals
144137 local_vars = tf .get_collection (tf .GraphKeys .TRAINABLE_VARIABLES , self .scope_name )
145138 #update locals
146139 grads = tf .gradients (self .loss , local_vars )
147- #grads = [tf.clip_by_average_norm(grad, CLIP_VALUE) for grad in grads]
148140 grads , grad_norms = tf .clip_by_global_norm (grads , CLIP_VALUE )
149141 self .update_ops = update_target_graph (GLOBAL_SCOPE , self .scope_name )
150142 global_vars = tf .get_collection (tf .GraphKeys .TRAINABLE_VARIABLES , GLOBAL_SCOPE )
@@ -160,10 +152,7 @@ def predict(self, sess, state, initial_lstm_state):
160152 }\
161153 )
162154 policy = policy .flatten ()
163- #print('cur policy', policy)
164155 prediction = np .random .choice (self .action_size , p = policy )
165- #prediction = np.argmax(policy)
166- #print('prediction', prediction)
167156 return prediction , final_lstm_state
168157
169158 def act (self , sess , state , initial_lstm_state ):
@@ -209,7 +198,7 @@ def update_to_global(self, sess):
209198class Worker :
210199 def __init__ (self , agent ):
211200 self .agent = agent
212- self .summary_writer = tf .summary .FileWriter (self .agent .scope_name )
201+ self .summary_writer = tf .summary .FileWriter (os . path . join ( SAVE_DIR , 'Tensorboard/' + self .agent .scope_name ) )
213202 def work (self , sess , optimizer , thread_lock ):
214203
215204 global global_counter
@@ -234,14 +223,12 @@ def work(self, sess, optimizer, thread_lock):
234223 elapsed_time = time .time () - start_time
235224
236225 with sess .as_default (), sess .graph .as_default ():
237- while True :#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
226+ while True :
238227 self .agent .update_to_global (sess )
239228 if done or timestep > MAX_EP_LENGTH :
240229 last_rewards .append (episode_reward )
241230 last_frames .append (timestep )
242231 if episode_counter > 0 and episode_counter % 5 == 0 :
243- #print('for agent:', self.agent.scope_name)
244- #print('at episode', episode_counter, 'episode reward is', episode_reward)
245232 if len (value_losses ) > 0 :
246233 summary = tf .Summary ()
247234
@@ -321,9 +308,7 @@ def work(self, sess, optimizer, thread_lock):
321308 for reward in reversed (rewards ):
322309 target_value = reward + DISCOUNT * target_value
323310 target_values .append (target_value )
324- #for i in range(len(rewards)-1):
325- # idx = len(rewards) - i - 1
326- # target_values[idx-1] = rewards[idx-1] + DISCOUNT * target_values[idx]
311+
327312 states = np .vstack (states )
328313 actions = np .vstack (actions ).ravel ()
329314 target_values = np .vstack (target_values ).ravel ()
@@ -351,22 +336,20 @@ def work(self, sess, optimizer, thread_lock):
351336worker_threads = []
352337
353338env_global = EnvWrapper (ENV_NAME )
354- #global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.AdamOptimizer())
355339global_agent = Agent (env_global , GLOBAL_SCOPE , tf .train .GradientDescentOptimizer (LEARNING_RATE ))
356340
357- config = tf .ConfigProto ()#device_count = {'GPU': 0})
341+ config = tf .ConfigProto ()
358342config .gpu_options .allow_growth = True
359343
360344sess = tf .Session (config = config )
361- writer = tf .summary .FileWriter (" graph" , sess .graph )
345+ writer = tf .summary .FileWriter (os . path . join ( SAVE_DIR , "Tensorboard/ graph") , sess .graph )
362346
363347print ('saved graph' )
364348
365349def global_saving_thread (agent , sess ):
366350
367351 global global_counter
368352
369- MAX_MODELS = 1000
370353 cnt_model = 0
371354
372355 with sess .as_default (), sess .graph .as_default ():
@@ -376,24 +359,24 @@ def global_saving_thread(agent, sess):
376359
377360 elapsed_time = time .time () - start_time
378361
379- #save model every 15 minutes
380- while True :#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
381- print ("Current model save name:" , 'model_' + str (cnt_model % MAX_MODELS ))
382- save_path = saver .save (sess , "models/model_" + str (cnt_model % MAX_MODELS ) + ".ckpt" )
362+ cnt_minutes = 30
363+ #save model every 30 minutes
364+ while True :
365+ print ("Current model save name:" , 'model_' + str (cnt_model ))
366+ save_path = saver .save (sess , os .path .join (SAVE_DIR , "models/model_" + str (cnt_model ) + ".ckpt" ))
383367 print ("Current global iteration" , global_counter )
384368 cnt_model += 1
385- time .sleep (3 * 60 * 60 )
369+ time .sleep (cnt_minutes * 60 )
386370 print ("Learning time was" , int (elapsed_time / 60 / 60 ), "hours" , int ((elapsed_time - int (elapsed_time / 60 / 60 )* 60 * 60 )/ 60 ), "minutes" )
387371
388- cnt_threads = 24
389372thread_lock = threading .Lock ()
390373
def worker_fun(worker, sess, optimizer, thread_lock):
    """Thread entry point: delegate to the worker's main training loop."""
    worker.work(sess, optimizer, thread_lock)
393376
394377optimizer = tf .train .AdamOptimizer (learning_rate = LEARNING_RATE )
395378
396- for i in range (cnt_threads ):
379+ for i in range (CNT_THREADS ):
397380 env = EnvWrapper (ENV_NAME )
398381 worker = Worker (Agent (env , 'local' + str (i ), optimizer ))
399382 t = threading .Thread (target = worker_fun , args = (worker , sess , optimizer , thread_lock ))