55import skimage
66from skimage import color , exposure , transform
77import threading
8+ import os
89
910IMG_WIDTH = 105
1011IMG_HEIGHT = 80
11- CNT_FRAMES = 1
1212GLOBAL_SCOPE = 'global'
1313VALUE_MODIFIER = 0.5
1414POLICY_MODIFIER = 1
15- ENTROPY_MODIFIER = 5 * 1e-2
15+ ENTROPY_MODIFIER = 1
1616MAX_STEPS = 30
1717DISCOUNT = 0.99
1818ENV_NAME = 'BreakoutDeterministic-v4'
19- #ENV_NAME = 'PongDeterministic-v4'
20- #MAX_ITERATIONS = 100000000
2119MAX_EP_LENGTH = 100000
22- #MAX_LEARNING_TIME = 7 * 60 * 60 # 7 hours
2320LEARNING_RATE = 1e-4
24- CLIP_VALUE = 2.0
21+ CLIP_VALUE = 10.0
22+ SAVE_DIR = 'AgentResults'
23+ CNT_THREADS = 24
2524
2625def process_frame (x_t , img_rows , img_cols ):
2726 x_t = skimage .color .rgb2gray (x_t )
@@ -66,20 +65,18 @@ def __build_model(self):
6665 with tf .variable_scope (self .scope_name ):
6766 weights_initializer = tf .contrib .layers .xavier_initializer_conv2d ()
6867 bias_initializer = tf .zeros_initializer ()
69- self .X = tf .placeholder (shape = [None , IMG_WIDTH , IMG_HEIGHT , CNT_FRAMES ], dtype = tf .float32 , name = 'input' )
70- conv1 = tf .contrib .layers .conv2d (self .X , 32 , 3 , stride = 2 , activation_fn = tf .nn .relu , padding = 'SAME ' , \
68+ self .X = tf .placeholder (shape = [None , IMG_WIDTH , IMG_HEIGHT , 1 ], dtype = tf .float32 , name = 'input' )
69+ conv1 = tf .contrib .layers .conv2d (self .X , 32 , 8 , stride = 4 , activation_fn = tf .nn .relu , padding = 'VALID ' , \
7170 weights_initializer = weights_initializer , biases_initializer = bias_initializer ,\
7271 scope = 'first_conv' )
73- mp1 = tf .contrib .layers .max_pool2d (conv1 , 2 , scope = 'first_mp' )
74- conv2 = tf .contrib .layers .conv2d (mp1 , 32 , 3 , stride = 2 , activation_fn = tf .nn .relu , padding = 'SAME' , \
72+ conv2 = tf .contrib .layers .conv2d (conv1 , 64 , 4 , stride = 2 , activation_fn = tf .nn .relu , padding = 'VALID' , \
7573 weights_initializer = weights_initializer , biases_initializer = bias_initializer ,\
7674 scope = 'second_conv' )
77- mp2 = tf .contrib .layers .max_pool2d (conv2 , 2 , scope = 'second_mp' )
78- conv3 = tf .contrib .layers .conv2d (mp2 , 64 , 3 , stride = 2 , activation_fn = tf .nn .relu , padding = 'SAME' , \
75+ conv3 = tf .contrib .layers .conv2d (conv2 , 64 , 3 , stride = 1 , activation_fn = tf .nn .relu , padding = 'VALID' , \
7976 weights_initializer = weights_initializer , biases_initializer = bias_initializer ,\
8077 scope = 'third_conv' )
8178 flattened = tf .contrib .layers .flatten (conv3 , scope = 'flatten' )
82- embedding = tf .contrib .layers .fully_connected (flattened , 256 , activation_fn = tf .nn .relu , weights_initializer = tf .random_normal_initializer (stddev = 0.02 ), biases_initializer = bias_initializer ,\
79+ embedding = tf .contrib .layers .fully_connected (flattened , 512 , activation_fn = tf .nn .relu , weights_initializer = tf .random_normal_initializer (stddev = 0.02 ), biases_initializer = bias_initializer ,\
8380 scope = 'fc_embed' )
8481
8582 step_size = tf .shape (self .X )[:1 ]
@@ -117,8 +114,6 @@ def __build_model(self):
117114 self .actions_oh = tf .one_hot (self .actions , depth = self .action_size , dtype = tf .float32 , name = 'actions_oh' )
118115 self .target_values = tf .placeholder (shape = [None ], dtype = tf .float32 , name = 'target_vals' )
119116 self .advantages = tf .placeholder (shape = [None ], dtype = tf .float32 , name = 'advantages' )
120- #print('adv shape', self.advantages.shape)
121- #self.advantages = tf.subtract(tf.stop_gradient(self.value), self.target_values, name='advantage')
122117
123118 MIN_POLICY = 1e-8
124119 MAX_POLICY = 1.0 - MIN_POLICY
@@ -128,23 +123,20 @@ def __build_model(self):
128123 self .log_policy_for_action = tf .reduce_sum (tf .multiply (self .log_policy , self .actions_oh ), axis = 1 , name = 'log_policy_for_action' )
129124 self .value_loss = tf .reduce_mean (tf .square (self .value - self .target_values ), name = 'value_loss' )
130125 self .value_loss = self .value_loss * VALUE_MODIFIER
131- #self.value_loss = self.value_loss - self.value_loss
132126 self .policy_loss = - tf .reduce_mean (tf .multiply (self .log_policy_for_action , self .advantages ), name = 'policy_loss' )
133127 self .policy_loss = self .policy_loss * POLICY_MODIFIER
134128 #entropy is H = E[-log p(X)] = -sum(p(x) * log(p(x)))
135129 self .entropy_beta = tf .get_variable ('entropy_beta' , shape = [],
136130 initializer = tf .constant_initializer (ENTROPY_MODIFIER ), trainable = False )
137131 self .entropy_loss = - tf .reduce_mean (self .policy * - self .log_policy , name = 'entropy_loss' )
138132 self .entropy_loss = self .entropy_loss * self .entropy_beta
139- #self.entropy_loss = self.entropy_loss - self.entropy_loss
140133 self .loss = self .value_loss + \
141134 self .policy_loss + \
142135 self .entropy_loss
143136 #get locals
144137 local_vars = tf .get_collection (tf .GraphKeys .TRAINABLE_VARIABLES , self .scope_name )
145138 #update locals
146139 grads = tf .gradients (self .loss , local_vars )
147- #grads = [tf.clip_by_average_norm(grad, CLIP_VALUE) for grad in grads]
148140 grads , grad_norms = tf .clip_by_global_norm (grads , CLIP_VALUE )
149141 self .update_ops = update_target_graph (GLOBAL_SCOPE , self .scope_name )
150142 global_vars = tf .get_collection (tf .GraphKeys .TRAINABLE_VARIABLES , GLOBAL_SCOPE )
@@ -160,10 +152,7 @@ def predict(self, sess, state, initial_lstm_state):
160152 }\
161153 )
162154 policy = policy .flatten ()
163- #print('cur policy', policy)
164155 prediction = np .random .choice (self .action_size , p = policy )
165- #prediction = np.argmax(policy)
166- #print('prediction', prediction)
167156 return prediction , final_lstm_state
168157
169158 def act (self , sess , state , initial_lstm_state ):
@@ -209,7 +198,7 @@ def update_to_global(self, sess):
209198class Worker :
210199 def __init__ (self , agent ):
211200 self .agent = agent
212- self .summary_writer = tf .summary .FileWriter (self .agent .scope_name )
201+ self .summary_writer = tf .summary .FileWriter (os . path . join ( SAVE_DIR , 'Tensorboard/' + self .agent .scope_name ) )
213202 def work (self , sess , optimizer , thread_lock ):
214203
215204 global global_counter
@@ -234,14 +223,12 @@ def work(self, sess, optimizer, thread_lock):
234223 elapsed_time = time .time () - start_time
235224
236225 with sess .as_default (), sess .graph .as_default ():
237- while True :#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
226+ while True :
238227 self .agent .update_to_global (sess )
239228 if done or timestep > MAX_EP_LENGTH :
240229 last_rewards .append (episode_reward )
241230 last_frames .append (timestep )
242231 if episode_counter > 0 and episode_counter % 5 == 0 :
243- #print('for agent:', self.agent.scope_name)
244- #print('at episode', episode_counter, 'episode reward is', episode_reward)
245232 if len (value_losses ) > 0 :
246233 summary = tf .Summary ()
247234
@@ -321,9 +308,7 @@ def work(self, sess, optimizer, thread_lock):
321308 for reward in reversed (rewards ):
322309 target_value = reward + DISCOUNT * target_value
323310 target_values .append (target_value )
324- #for i in range(len(rewards)-1):
325- # idx = len(rewards) - i - 1
326- # target_values[idx-1] = rewards[idx-1] + DISCOUNT * target_values[idx]
311+
327312 states = np .vstack (states )
328313 actions = np .vstack (actions ).ravel ()
329314 target_values = np .vstack (target_values ).ravel ()
@@ -351,22 +336,20 @@ def work(self, sess, optimizer, thread_lock):
351336worker_threads = []
352337
353338env_global = EnvWrapper (ENV_NAME )
354- #global_agent = Agent(env_global, GLOBAL_SCOPE, tf.train.AdamOptimizer())
355339global_agent = Agent (env_global , GLOBAL_SCOPE , tf .train .GradientDescentOptimizer (LEARNING_RATE ))
356340
357- config = tf .ConfigProto ()#device_count = {'GPU': 0})
341+ config = tf .ConfigProto ()
358342config .gpu_options .allow_growth = True
359343
360344sess = tf .Session (config = config )
361- writer = tf .summary .FileWriter (" graph" , sess .graph )
345+ writer = tf .summary .FileWriter (os . path . join ( SAVE_DIR , "Tensorboard/ graph") , sess .graph )
362346
363347print ('saved graph' )
364348
365349def global_saving_thread (agent , sess ):
366350
367351 global global_counter
368352
369- MAX_MODELS = 1000
370353 cnt_model = 0
371354
372355 with sess .as_default (), sess .graph .as_default ():
@@ -376,24 +359,24 @@ def global_saving_thread(agent, sess):
376359
377360 elapsed_time = time .time () - start_time
378361
379- #save model every 15 minutes
380- while True :#global_counter <= MAX_ITERATIONS and elapsed_time <= MAX_LEARNING_TIME:
381- print ("Current model save name:" , 'model_' + str (cnt_model % MAX_MODELS ))
382- save_path = saver .save (sess , "models/model_" + str (cnt_model % MAX_MODELS ) + ".ckpt" )
362+ cnt_minutes = 30
363+ #save model every 30 minutes
364+ while True :
365+ print ("Current model save name:" , 'model_' + str (cnt_model ))
366+ save_path = saver .save (sess , os .path .join (SAVE_DIR , "models/model_" + str (cnt_model ) + ".ckpt" ))
383367 print ("Current global iteration" , global_counter )
384368 cnt_model += 1
385- time .sleep (3 * 60 * 60 )
369+ time .sleep (cnt_minutes * 60 )
386370 print ("Learning time was" , int (elapsed_time / 60 / 60 ), "hours" , int ((elapsed_time - int (elapsed_time / 60 / 60 )* 60 * 60 )/ 60 ), "minutes" )
387371
388- cnt_threads = 24
389372thread_lock = threading .Lock ()
390373
def worker_fun(worker, sess, optimizer, thread_lock):
    """Thread entry point: delegate to the worker's main training loop."""
    worker.work(sess, optimizer, thread_lock)
393376
394377optimizer = tf .train .AdamOptimizer (learning_rate = LEARNING_RATE )
395378
396- for i in range (cnt_threads ):
379+ for i in range (CNT_THREADS ):
397380 env = EnvWrapper (ENV_NAME )
398381 worker = Worker (Agent (env , 'local' + str (i ), optimizer ))
399382 t = threading .Thread (target = worker_fun , args = (worker , sess , optimizer , thread_lock ))