diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..34ccaf4d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,35 @@ +absl-py==0.7.1 +animalai==1.0.3 +animalai-train==1.0.3 +astor==0.8.0 +atari-py==0.2.6 +cloudpickle==1.2.1 +cycler==0.10.0 +dopamine-rl==2.0.5 +future==0.17.1 +gast==0.2.2 +gin-config==0.2.0 +grpcio==1.11.1 +gym==0.13.1 +h5py==2.9.0 +jsonpickle==1.2 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.0 +kiwisolver==1.1.0 +Markdown==3.1.1 +matplotlib==3.1.1 +numpy==1.14.5 +opencv-python==4.1.0.25 +Pillow==5.4.1 +protobuf==3.6.1 +pyglet==1.3.2 +pyparsing==2.4.0 +python-dateutil==2.8.0 +PyYAML==5.1.1 +scipy==1.3.0 +six==1.12.0 +tensorboard==1.12.2 +tensorflow==1.12.2 +termcolor==1.1.0 +torch==1.2.0 +Werkzeug==0.15.5 diff --git a/trainers/a3c_src/env.py b/trainers/a3c_src/env.py new file mode 100644 index 00000000..dfa5a78c --- /dev/null +++ b/trainers/a3c_src/env.py @@ -0,0 +1,108 @@ +""" +@author: Viet Nguyen +""" + +import gym_super_mario_bros +from gym.spaces import Box +from gym import Wrapper +from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv +from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY +import cv2 +import numpy as np +import subprocess as sp + + +class Monitor: + def __init__(self, width, height, saved_path): + + self.command = ["ffmpeg", "-y", "-f", "rawvideo", "-vcodec", "rawvideo", "-s", "{}X{}".format(width, height), + "-pix_fmt", "rgb24", "-r", "80", "-i", "-", "-an", "-vcodec", "mpeg4", saved_path] + try: + self.pipe = sp.Popen(self.command, stdin=sp.PIPE, stderr=sp.PIPE) + except FileNotFoundError: + pass + + def record(self, image_array): + self.pipe.stdin.write(image_array.tostring()) + + +def process_frame(frame): + if frame is not None: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + frame = cv2.resize(frame, (84, 84))[None, :, :] / 255. + return frame + else: + return np.zeros((1, 84, 84)) + + +class CustomReward(Wrapper): + def __init__(self, env=None, monitor=None): + super(CustomReward, self).__init__(env) + self.observation_space = Box(low=0, high=255, shape=(1, 84, 84)) + self.curr_score = 0 + if monitor: + self.monitor = monitor + else: + self.monitor = None + + def step(self, action): + state, reward, done, info = self.env.step(action) + if self.monitor: + self.monitor.record(state) + state = process_frame(state) + reward += (info["score"] - self.curr_score) / 40. 
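+        # Reward shaping carried over from the Super Mario Bros A3C code: the agent also earns
+        # the change in the in-game score since the previous step, scaled down by 40 so it stays
+        # on the same order of magnitude as the environment's native reward.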
+ self.curr_score = info["score"] + if done: + if info["flag_get"]: + reward += 50 + else: + reward -= 50 + return state, reward / 10., done, info + + def reset(self): + self.curr_score = 0 + return process_frame(self.env.reset()) + + +class CustomSkipFrame(Wrapper): + def __init__(self, env, skip=4): + super(CustomSkipFrame, self).__init__(env) + self.observation_space = Box(low=0, high=255, shape=(4, 84, 84)) + self.skip = skip + + def step(self, action): + total_reward = 0 + states = [] + state, reward, done, info = self.env.step(action) + for i in range(self.skip): + if not done: + state, reward, done, info = self.env.step(action) + total_reward += reward + states.append(state) + else: + states.append(state) + states = np.concatenate(states, 0)[None, :, :, :] + return states.astype(np.float32), reward, done, info + + def reset(self): + state = self.env.reset() + states = np.concatenate([state for _ in range(self.skip)], 0)[None, :, :, :] + return states.astype(np.float32) + + +def create_train_env(world, stage, action_type, output_path=None): + env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage)) + if output_path: + monitor = Monitor(256, 240, output_path) + else: + monitor = None + if action_type == "right": + actions = RIGHT_ONLY + elif action_type == "simple": + actions = SIMPLE_MOVEMENT + else: + actions = COMPLEX_MOVEMENT + env = BinarySpaceToDiscreteSpaceEnv(env, actions) + env = CustomReward(env, monitor) + env = CustomSkipFrame(env) + return env, env.observation_space.shape[0], len(actions) diff --git a/trainers/a3c_src/model.py b/trainers/a3c_src/model.py new file mode 100644 index 00000000..50627d65 --- /dev/null +++ b/trainers/a3c_src/model.py @@ -0,0 +1,70 @@ +""" +@author: Viet Nguyen +""" + +import torch.nn as nn +import torch.nn.functional as F + + +class ActorCritic(nn.Module): + def __init__(self, num_inputs, num_actions): + super(ActorCritic, self).__init__() + self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1) + self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.lstm = nn.LSTMCell(32 * 6 * 6, 512) + self.critic_linear = nn.Linear(512, 1) + self.actor_linear = nn.Linear(512, num_actions) + self._initialize_weights() + + def _initialize_weights(self): + for module in self.modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight) + # nn.init.kaiming_uniform_(module.weight) + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.LSTMCell): + nn.init.constant_(module.bias_ih, 0) + nn.init.constant_(module.bias_hh, 0) + + def forward(self, x, hx, cx): + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + hx, cx = self.lstm(x.view(x.size(0), -1), (hx, cx)) + return self.actor_linear(hx), self.critic_linear(hx), hx, cx + + + +class Mapper(nn.Module): + def __init__(self, num_inputs): + super(Mapper, self).__init__() + self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1) + self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1) + self.lstm = nn.LSTMCell(32 * 6 * 6, 400) + self.map_final = nn.Linear(400, 1600) + self._initialize_weights() + + def _initialize_weights(self): + for module in self.modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, 
nn.Linear): + nn.init.xavier_uniform_(module.weight) + # nn.init.kaiming_uniform_(module.weight) + nn.init.constant_(module.bias, 0) + elif isinstance(module, nn.LSTMCell): + nn.init.constant_(module.bias_ih, 0) + nn.init.constant_(module.bias_hh, 0) + + def forward(self, x, hx, cx): + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + hx, cx = self.lstm(x.view(x.size(0), -1), (hx, cx)) + return self.map_final(hx), hx, cx + + diff --git a/trainers/a3c_src/optimizer.py b/trainers/a3c_src/optimizer.py new file mode 100644 index 00000000..385b5264 --- /dev/null +++ b/trainers/a3c_src/optimizer.py @@ -0,0 +1,18 @@ +""" +@author: Viet Nguyen +""" + +import torch + +class GlobalAdam(torch.optim.Adam): + def __init__(self, params, lr): + super(GlobalAdam, self).__init__(params, lr=lr) + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p.data) + state['exp_avg_sq'] = torch.zeros_like(p.data) + + state['exp_avg'].share_memory_() + state['exp_avg_sq'].share_memory_() diff --git a/trainers/a3c_src/process.py b/trainers/a3c_src/process.py new file mode 100644 index 00000000..fe01b07c --- /dev/null +++ b/trainers/a3c_src/process.py @@ -0,0 +1,240 @@ +""" +@author: Viet Nguyen +""" + +import torch +#from src.env import create_train_env +from a3c_src.model import ActorCritic +import torch.nn.functional as F +from torch.distributions import Categorical +from collections import deque +from tensorboardX import SummaryWriter +import timeit + +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + +from env_utils import * + +#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +device = torch.device("cpu") +actions_array = np.array([[0,0],[0,1],[0,2],[1,0],[2,0]]) +brain_name = 'Learner' + +def local_train(index, opt, global_model, optimizer, save=False): + torch.manual_seed(123 + index) + if save: + start_time = timeit.default_timer() + writer = SummaryWriter(opt.log_path) + + + # Unity + #env_path = '../env/AnimalAI' + #n_arenas=1 + #env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), play=False,inference=False) + + + + b_env = better_env(n_arenas = 1) + env = b_env.env + #arena_config_in = b_env.env_config + #start_positions, start_rotations = b_env.get_start_positions() + #ps = position_tracker(start_positions, start_rotations) + # end unity + num_states = 3 + num_actions = 5 + + #env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type) + local_model = ActorCritic(num_states, num_actions).to(device) + local_model.train() + + + + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=True) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + + + + done = True + curr_step = 0 + curr_episode = 0 + while True: + if save: + if curr_episode % opt.save_interval == 0 and curr_episode > 0: + torch.save(global_model.state_dict(), + "{}/{}_{}".format(opt.saved_path, opt.saved_filepath, curr_episode)) + #print("Process {}. Episode {}".format(index, curr_episode)) + if curr_episode > 0: + print("Process {}. 
Episode {}, total_loss = {}".format(index, curr_episode, total_loss.item())) + curr_episode += 1 + local_model.load_state_dict(global_model.state_dict()) + if done: + h_0 = torch.zeros((1, 512), dtype=torch.float) + c_0 = torch.zeros((1, 512), dtype=torch.float) + else: + h_0 = h_0.detach() + c_0 = c_0.detach() + + h_0 = h_0.to(device) + c_0 = c_0.to(device) + #if opt.use_gpu: + # h_0 = h_0.cuda() + # c_0 = c_0.cuda() + + log_policies = [] + values = [] + rewards = [] + entropies = [] + + for _ in range(opt.num_local_steps): + curr_step += 1 + logits, value, h_0, c_0 = local_model(state, h_0, c_0) + policy = F.softmax(logits, dim=1) + log_policy = F.log_softmax(logits, dim=1) + entropy = -(policy * log_policy).sum(1, keepdim=True) + + m = Categorical(policy) + action_idx = m.sample().item() + + action = actions_array[action_idx] + #action = actions_array[action.cpu().numpy().astype(int)] + #state, reward, done, _ = env.step(action) + action_info = env.step(vector_action=action) + + + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + velocity_obs = action_info[brain_name].vector_observations + b_env.position_tracker.position_step(velocity_obs, action) + #print("Distance to goal = {}".format(ps.distance_to_goal())) + #print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + reward = action_info[brain_name].rewards # list of rewards len = n_arenas + reward = reward[0] + + # reward based on visiting squares + total_unvisited = np.sum(b_env.position_tracker.visited) + reward -= total_unvisited/10000 + reward -= b_env.position_tracker.distance_to_goal()/500 + reward -= b_env.position_tracker.angle_to_goal()/1000 + #print("{} reward = {}".format(index, reward)) + + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + + + + + #state = torch.from_numpy(state) + + if opt.use_gpu: + state = state.cuda() + if curr_step > opt.num_global_steps: + done = True + + #if curr_step > 500: + # done = True + + if done: + curr_step = 0 + + #b_env = better_env(n_arenas = 1) + #arena_config_in = b_env.env_config + #start_positions, start_rotations = b_env.get_start_positions() + #ps = position_tracker(start_positions, start_rotations) + b_env.generate_new_config() + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=True) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + #state = torch.from_numpy(env.reset()) + #if opt.use_gpu: + # state = state.cuda() + + values.append(value) + log_policies.append(log_policy[0, action_idx]) + rewards.append(reward) + entropies.append(entropy) + + if done: + break + + + R = torch.zeros((1, 1), dtype=torch.float) + if opt.use_gpu: + R = R.cuda() + if not done: + _, R, _, _ = local_model(state, h_0, c_0) + + gae = torch.zeros((1, 1), dtype=torch.float) + if opt.use_gpu: + gae = gae.cuda() + actor_loss = 0 + critic_loss = 0 + entropy_loss = 0 + next_value = R + + for value, log_policy, reward, entropy in list(zip(values, log_policies, rewards, entropies))[::-1]: + gae = gae * opt.gamma * opt.tau + gae = gae + reward + opt.gamma * next_value.detach() - value.detach() + next_value = value + actor_loss = actor_loss + log_policy * gae + R = R * opt.gamma + reward + critic_loss = critic_loss + (R - value) ** 2 / 2 + entropy_loss = entropy_loss + entropy + + total_loss = -actor_loss + critic_loss - opt.beta * entropy_loss + #print("Loss = 
{}".format(total_loss)) + writer.add_scalar("Train_{}/Loss".format(index), total_loss, curr_episode) + optimizer.zero_grad() + total_loss.backward() + + for local_param, global_param in zip(local_model.parameters(), global_model.parameters()): + if global_param.grad is not None: + break + global_param._grad = local_param.grad + + optimizer.step() + + if curr_episode == int(opt.num_global_steps / opt.num_local_steps): + print("Training process {} terminated".format(index)) + if save: + end_time = timeit.default_timer() + print('The code runs for %.2f s ' % (end_time - start_time)) + return + + +def local_test(index, opt, global_model): + torch.manual_seed(123 + index) + env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type) + local_model = ActorCritic(num_states, num_actions) + local_model.eval() + state = torch.from_numpy(env.reset()) + done = True + curr_step = 0 + actions = deque(maxlen=opt.max_actions) + while True: + curr_step += 1 + if done: + local_model.load_state_dict(global_model.state_dict()) + with torch.no_grad(): + if done: + h_0 = torch.zeros((1, 512), dtype=torch.float) + c_0 = torch.zeros((1, 512), dtype=torch.float) + else: + h_0 = h_0.detach() + c_0 = c_0.detach() + + logits, value, h_0, c_0 = local_model(state, h_0, c_0) + policy = F.softmax(logits, dim=1) + action = torch.argmax(policy).item() + state, reward, done, _ = env.step(action) + env.render() + actions.append(action) + if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen: + done = True + if done: + curr_step = 0 + actions.clear() + state = env.reset() + state = torch.from_numpy(state) diff --git a/trainers/a3c_test.py b/trainers/a3c_test.py new file mode 100644 index 00000000..75a435ab --- /dev/null +++ b/trainers/a3c_test.py @@ -0,0 +1,124 @@ +""" +@author: Viet Nguyen +""" + +import os + +os.environ['OMP_NUM_THREADS'] = '1' +import argparse +import torch +from a3c_src.model import ActorCritic +import torch.nn.functional as F +import numpy as np +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + +from env_utils import * + + +def get_args(): + parser = argparse.ArgumentParser( + """Implementation of model described in the paper: Asynchronous Methods for Deep Reinforcement Learning for Super Mario Bros""") + parser.add_argument("--saved_filepath", type=str, default="trained_models/a3c_animalai") + args = parser.parse_args() + return args + + +def test(opt): + + + # AnimalAI + device = torch.device("cpu") + num_states = 3 + num_actions = 5 + actions_array = np.array([[0,0],[0,1],[0,2],[1,0],[2,0]]) + brain_name = 'Learner' + # AnimalAI + + torch.manual_seed(123) + + + #env=UnityEnvironment(file_name='../env/AnimalAI', n_arenas=1, worker_id=np.random.randint(1,100), play=False,inference=True) + b_env = better_env(n_arenas = 1, walls=1,t=100, inference=True) + env = b_env.env + #arena_config_in = b_env.env_config + #start_positions, start_rotations = b_env.get_start_positions() + #ps = position_tracker(start_positions, start_rotations) + + + + + model = ActorCritic(num_states, num_actions) + + basepath = opt.saved_filepath.split('/')[0] + basename = opt.saved_filepath.split('/')[1] + + found_models = [int(filenames.split('_')[-1]) for filenames in os.listdir(basepath) if basename in filenames] + if len(found_models) > 0: + latest = max(found_models) + model.load_state_dict(torch.load("{}_{}".format(opt.saved_filepath, latest))) + model = model.to(device) + print("Loaded saved model from 
{}_{}".format(opt.saved_filepath, latest)) + else: + print("Could not find model to load.") + raise + + + ''' + if torch.cuda.is_available(): + model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage))) + model.cuda() + else: + model.load_state_dict(torch.load("{}/a3c_super_mario_bros_{}_{}".format(opt.saved_path, opt.world, opt.stage), + map_location=lambda storage, loc: storage)) + ''' + + model.eval() + + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=False) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + done = True + while True: + if done: + h_0 = torch.zeros((1, 512), dtype=torch.float) + c_0 = torch.zeros((1, 512), dtype=torch.float) + #b_env = better_env(n_arenas = 1) + #arena_config_in = b_env.env_config + b_env.generate_new_config() + action_info = env.reset(arenas_configurations=b_env.env_config, train_mode=False) + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + else: + h_0 = h_0.detach() + c_0 = c_0.detach() + + h_0 = h_0.to(device) + c_0 = c_0.to(device) + state = state.to(device) + + logits, value, h_0, c_0 = model(state, h_0, c_0) + policy = F.softmax(logits, dim=1) + action_idx = torch.argmax(policy).item() + action_idx = int(action_idx) + action = actions_array[action_idx] + action_info = env.step(vector_action=action) + + state = action_info[brain_name].visual_observations[0] + state = torch.from_numpy(np.moveaxis(state, -1, 1)).float().to(device) + + velocity_obs = action_info[brain_name].vector_observations + b_env.position_tracker.position_step(velocity_obs, action) + + #print("{}__{}".format(b_env.position_tracker.current_rotation,b_env.position_tracker.angle_to_goal())) + print("Current position = {}".format(b_env.position_tracker.current_position)) + + + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + + +if __name__ == "__main__": + opt = get_args() + test(opt) diff --git a/trainers/a3c_train.py b/trainers/a3c_train.py new file mode 100644 index 00000000..9a20b1f2 --- /dev/null +++ b/trainers/a3c_train.py @@ -0,0 +1,93 @@ +""" +modified by: Lucas Tindall +@author: Viet Nguyen +""" + +import os +os.environ['OMP_NUM_THREADS'] = '1' +import argparse +import torch +#from src.env import create_train_env +from a3c_src.model import ActorCritic +from a3c_src.optimizer import GlobalAdam +from a3c_src.process import local_train, local_test +import torch.multiprocessing as _mp +import shutil + +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + + +from env_utils import * + +def get_args(): + parser = argparse.ArgumentParser( + """Implementation of model described in the paper: Asynchronous Methods for Deep Reinforcement Learning for Super Mario Bros""") + #parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file. Default: "configs/1-Food.yaml"') + #parser.add_argument('--load_model', type=str, default='saved_models/ppo.pth', help='Saved model to load. Default: "saved_models/ppo.pth"') + #parser.add_argument('--inference', default=False, action='store_true', help='Run in inference mode. 
Default: False') + #parser.add_argument("--world", type=int, default=1) + #parser.add_argument("--stage", type=int, default=1) + #parser.add_argument("--action_type", type=str, default="complex") + parser.add_argument('--lr', type=float, default=1e-4) + parser.add_argument('--gamma', type=float, default=0.9, help='discount factor for rewards') + parser.add_argument('--tau', type=float, default=1.0, help='parameter for GAE') + parser.add_argument('--beta', type=float, default=0.01, help='entropy coefficient') + parser.add_argument("--num_local_steps", type=int, default=50) + parser.add_argument("--num_global_steps", type=int, default=5e6) + parser.add_argument("--num_processes", type=int, default=4) + parser.add_argument("--save_interval", type=int, default=500, help="Number of steps between savings") + parser.add_argument("--max_actions", type=int, default=200, help="Maximum repetition steps in test phase") + parser.add_argument("--log_path", type=str, default="tensorboard/a3c_super_mario_bros") + parser.add_argument("--saved_path", type=str, default="trained_models") + parser.add_argument("--saved_filepath", type=str, default="a3c_animalai") + parser.add_argument("--load_model", type=str, default="") + #parser.add_argument("--load_from_previous_stage", type=bool, default=False, + # help="Load weight from previous trained stage") + parser.add_argument("--use_gpu", type=bool, default=False) + args = parser.parse_args() + return args + + +def train(opt): + torch.manual_seed(123) + if os.path.isdir(opt.log_path): + shutil.rmtree(opt.log_path) + os.makedirs(opt.log_path) + if not os.path.isdir(opt.saved_path): + os.makedirs(opt.saved_path) + mp = _mp.get_context("spawn") + #mp = _mp.get_context("fork") + + num_states = 3 + num_actions = 5 + global_model = ActorCritic(num_states, num_actions) + + if opt.use_gpu: + global_model.cuda() + global_model.share_memory() + + if os.path.isfile("{}/{}".format(opt.saved_path, opt.load_model)): + print("loaded global model from {}/{}".format(opt.saved_path, opt.load_model)) + global_model.load_state_dict(torch.load("{}/{}".format(opt.saved_path, opt.load_model))) + + optimizer = GlobalAdam(global_model.parameters(), lr=opt.lr) + processes = [] + for index in range(opt.num_processes): + print("local train {}".format(index)) + if index == 0: + process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer, True)) + else: + process = mp.Process(target=local_train, args=(index, opt, global_model, optimizer)) + process.start() + processes.append(process) + #process = mp.Process(target=local_test, args=(opt.num_processes, opt, global_model)) + #process.start() + #processes.append(process) + for process in processes: + process.join() + + +if __name__ == "__main__": + opt = get_args() + train(opt) diff --git a/trainers/configs/1-Food.yaml b/trainers/configs/1-Food.yaml new file mode 100644 index 00000000..c5992f16 --- /dev/null +++ b/trainers/configs/1-Food.yaml @@ -0,0 +1,22 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 250 + items: + - !Item + name: GoodGoal + 1: !Arena + t: 250 + items: + - !Item + name: GoodGoal + 2: !Arena + t: 250 + items: + - !Item + name: GoodGoal + 3: !Arena + t: 250 + items: + - !Item + name: GoodGoal diff --git a/trainers/configs/2-Preferences.yaml b/trainers/configs/2-Preferences.yaml new file mode 100644 index 00000000..2ea5a4cc --- /dev/null +++ b/trainers/configs/2-Preferences.yaml @@ -0,0 +1,11 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 250 + items: + - !Item + name: GoodGoal + - !Item + name: GoodGoalMulti + - 
!Item + name: BadGoal diff --git a/trainers/configs/3-Obstacles.yaml b/trainers/configs/3-Obstacles.yaml new file mode 100644 index 00000000..a9f686ad --- /dev/null +++ b/trainers/configs/3-Obstacles.yaml @@ -0,0 +1,41 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 500 + items: + - !Item + name: GoodGoal + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + 1: !Arena + t: 500 + items: + - !Item + name: GoodGoal + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + 2: !Arena + t: 500 + items: + - !Item + name: GoodGoal + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} diff --git a/trainers/configs/4-Avoidance.yaml b/trainers/configs/4-Avoidance.yaml new file mode 100644 index 00000000..f0c64551 --- /dev/null +++ b/trainers/configs/4-Avoidance.yaml @@ -0,0 +1,11 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoal + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: DeathZone diff --git a/trainers/configs/5-SpatialReasoning.yaml b/trainers/configs/5-SpatialReasoning.yaml new file mode 100644 index 00000000..d0e7c9ea --- /dev/null +++ b/trainers/configs/5-SpatialReasoning.yaml @@ -0,0 +1,22 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoalMulti + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Ramp + colors: + - !RGB {r: 255, g: 0, b: 255} + - !RGB {r: 255, g: 0, b: 255} + - !RGB {r: 255, g: 0, b: 255} + - !Item + name: Wall + colors: + - !RGB {r: 153, g: 153, b: 153} + - !RGB {r: 153, g: 153, b: 153} + - !RGB {r: 153, g: 153, b: 153} diff --git a/trainers/configs/6-Generalization.yaml b/trainers/configs/6-Generalization.yaml new file mode 100644 index 00000000..942058a0 --- /dev/null +++ b/trainers/configs/6-Generalization.yaml @@ -0,0 +1,22 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoalMulti + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Ramp + - !Item + name: Ramp + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: Wall + - !Item + name: Wall \ No newline at end of file diff --git a/trainers/configs/7-InternalMemory.yaml b/trainers/configs/7-InternalMemory.yaml new file mode 100644 index 00000000..c70c2a84 --- /dev/null +++ b/trainers/configs/7-InternalMemory.yaml @@ -0,0 +1,24 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + blackouts: [-20] + items: + - !Item + name: GoodGoal + 1: !Arena + t: 100 + blackouts: [-40] + items: + - !Item + name: Wall + - !Item + name: GoodGoal + 2: !Arena + t: 100 + blackouts: [50, 55, 75, 80, 100, 105, 125] + items: + - !Item + name: WallTranparent + - !Item + name: GoodGoal diff --git a/trainers/configs/allObjectsRandom.yaml b/trainers/configs/allObjectsRandom.yaml new file mode 100644 index 00000000..aaedf9c1 --- /dev/null +++ b/trainers/configs/allObjectsRandom.yaml @@ -0,0 +1,150 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + 
- !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone + 1: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone + 2: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone + 3: !Arena + t: 0 + items: + - !Item + name: CylinderTunnel + - !Item + name: Ramp + - !Item + name: Wall + - !Item + name: WallTransparent + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: Cylinder + - !Item + name: UObject + - !Item + name: LObject + - !Item + name: GoodGoal + - !Item + name: GoodGoalBounce + - !Item + name: BadGoal + - !Item + name: BadGoalBounce + - !Item + name: GoodGoalMulti + - !Item + name: GoodGoalMultiBounce + - !Item + name: DeathZone + - !Item + name: HotZone diff --git a/trainers/configs/exampleConfig.yaml b/trainers/configs/exampleConfig.yaml new file mode 100644 index 00000000..ce7e39a3 --- /dev/null +++ b/trainers/configs/exampleConfig.yaml @@ -0,0 +1,23 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: Cube + positions: + - !Vector3 {x: 10, y: 0, z: 10} + - !Vector3 {x: -1, y: 0, z: 30} + colors: + - !RGB {r: 204, g: 0, b: 204 } + rotations: [45] + sizes: + - !Vector3 {x: -1, y: 5, z: -1} + - !Item + name: Cylinder + colors: + - !RGB {r: 204, g: 0, b: 204 } + - !RGB {r: 204, g: 0, b: 204 } + - !RGB {r: 204, g: 0, b: 204 } + - !Item + name: GoodGoal \ No newline at end of file diff --git a/trainers/configs/exampleTraining.yaml b/trainers/configs/exampleTraining.yaml new file mode 100644 index 00000000..e4a952ad --- /dev/null +++ b/trainers/configs/exampleTraining.yaml @@ -0,0 +1,34 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 1000 + items: + - !Item + name: Wall + - !Item + name: CylinderTunnel + - !Item + name: GoodGoal + 1: !Arena + t: 1000 + items: + - !Item + name: TransparentWall + - !Item + name: GoodGoal + 2: !Arena + t: 1000 + items: + - !Item + name: Cardbox1 + - !Item + name: BadGoal + - !Item + name: GoodGoal + 3: !Arena + t: 1000 + items: + - !Item + name: DeathZone + - !Item + name: GoodGoal \ No newline at end of file diff --git a/trainers/configs/forcedChoice.yaml b/trainers/configs/forcedChoice.yaml new file mode 100644 index 00000000..245a235e --- /dev/null +++ b/trainers/configs/forcedChoice.yaml @@ -0,0 +1,43 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: Wall + positions: + - !Vector3 {x: 20, y: 0, z: 20} + - !Vector3 {x: 20, y: 0, z: 8.75} + - !Vector3 {x: 20, y: 0, z: 31.25} + - !Vector3 {x: 8.75, y: 0, z: 20} + - !Vector3 {x: 31.25, y: 0, z: 20} + rotations: [0,0,0,0,0] + sizes: + - !Vector3 {x: 5, y: 0.5, z: 5} + - !Vector3 {x: .1, y: 5, z: 17.5} + - !Vector3 {x: .1, 
y: 5, z: 17.5} + - !Vector3 {x: 17.5, y: 5, z: .1} + - !Vector3 {x: 17.5, y: 5, z: .1} + colors: + - !RGB {r: 0, g: 0, b: 255} + - !RGB {r: 200, g: 200, b: 200} + - !RGB {r: 200, g: 200, b: 200} + - !RGB {r: 200, g: 200, b: 200} + - !RGB {r: 200, g: 200, b: 200} + - !Item + name: Agent + positions: + - !Vector3 {x: 20, y: .5, z: 20} + rotations: [0] + - !Item + name: GoodGoal + positions: + - !Vector3 {x: 30, y: 0, z: 30} + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: BadGoal + positions: + - !Vector3 {x: 10, y: 0, z: 30} + sizes: + - !Vector3 {x: 1, y: 1, z: 1} \ No newline at end of file diff --git a/trainers/configs/internalMemory.yaml b/trainers/configs/internalMemory.yaml new file mode 100644 index 00000000..35007091 --- /dev/null +++ b/trainers/configs/internalMemory.yaml @@ -0,0 +1,32 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + blackouts: [-20] + items: + - !Item + name: GoodGoal + 1: !Arena + t: 100 + blackouts: [-40] + items: + - !Item + name: Wall + - !Item + name: GoodGoal + 2: !Arena + t: 100 + blackouts: [50, 55, 75, 80, 100, 105, 125] + items: + - !Item + name: WallTranparent + - !Item + name: GoodGoal + 3: !Arena + t: 100 + blackouts: [25, 30, 50, 55, 75] + items: + - !Item + name: CylinderTunnel + - !Item + name: GoodGoal diff --git a/trainers/configs/movingFood.yaml b/trainers/configs/movingFood.yaml new file mode 100644 index 00000000..6ebccd81 --- /dev/null +++ b/trainers/configs/movingFood.yaml @@ -0,0 +1,9 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoalBounce + - !Item + name: BadGoalBounce \ No newline at end of file diff --git a/trainers/configs/objectManipulation.yaml b/trainers/configs/objectManipulation.yaml new file mode 100644 index 00000000..8b957bd4 --- /dev/null +++ b/trainers/configs/objectManipulation.yaml @@ -0,0 +1,17 @@ +!ArenaConfig +arenas: + 0: !Arena + t: 0 + items: + - !Item + name: GoodGoal + sizes: + - !Vector3 {x: 1, y: 1, z: 1} + - !Item + name: Cardbox1 + - !Item + name: Cardbox2 + - !Item + name: UObject + - !Item + name: LObject diff --git a/trainers/configs/rainbow.gin b/trainers/configs/rainbow.gin new file mode 100644 index 00000000..1cc5e979 --- /dev/null +++ b/trainers/configs/rainbow.gin @@ -0,0 +1,34 @@ +# Hyperparameters follow Hessel et al. (2018). +import dopamine.agents.rainbow.rainbow_agent +import animalai_train.dopamine.animalai_lib +import dopamine.discrete_domains.run_experiment +import dopamine.replay_memory.prioritized_replay_buffer +import gin.tf.external_configurables + +RainbowAgent.num_atoms = 51 +RainbowAgent.vmax = 10. +RainbowAgent.gamma = 0.99 +RainbowAgent.update_horizon = 3 +RainbowAgent.min_replay_history = 20000 # agent steps +RainbowAgent.update_period = 4 +RainbowAgent.target_update_period = 8000 # agent steps +RainbowAgent.epsilon_train = 0.01 +RainbowAgent.epsilon_eval = 0.001 +RainbowAgent.epsilon_decay_period = 250000 # agent steps +RainbowAgent.replay_scheme = 'prioritized' +RainbowAgent.tf_device = '/gpu:0' # use '/cpu:*' for non-GPU version +RainbowAgent.optimizer = @tf.train.AdamOptimizer() +RainbowAgent.network = @animalai_lib.rainbow_network + +# Note these parameters are different from C51's. 
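+# (The Adam learning rate and epsilon below are the smaller values used for Rainbow,
+# not the C51 defaults.)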
+tf.train.AdamOptimizer.learning_rate = 0.0000625 +tf.train.AdamOptimizer.epsilon = 0.00015 + +create_agent.agent_name = 'rainbow' +Runner.num_iterations = 200 +Runner.training_steps = 250000 # agent steps +Runner.evaluation_steps = 125000 # agent steps +Runner.max_steps_per_episode = 27000 # agent steps + +WrappedPrioritizedReplayBuffer.replay_capacity = 1000000 +WrappedPrioritizedReplayBuffer.batch_size = 32 diff --git a/trainers/configs/trainer_config.yaml b/trainers/configs/trainer_config.yaml new file mode 100644 index 00000000..314ca567 --- /dev/null +++ b/trainers/configs/trainer_config.yaml @@ -0,0 +1,26 @@ +default: + trainer: ppo + +Learner: + trainer: ppo + epsilon: 0.2 + gamma: 0.99 + lambd: 0.95 + learning_rate: 3.0e-4 + memory_size: 256 + normalize: false + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + use_curiosity: true + curiosity_strength: 0.01 + curiosity_enc_size: 256 + time_horizon: 128 + batch_size: 64 + buffer_size: 2024 + hidden_units: 256 + num_layers: 1 + beta: 1.0e-2 + max_steps: 5.0e6 + num_epoch: 3 + diff --git a/trainers/configs/trainer_config2.yaml b/trainers/configs/trainer_config2.yaml new file mode 100644 index 00000000..b9a045f6 --- /dev/null +++ b/trainers/configs/trainer_config2.yaml @@ -0,0 +1,27 @@ +default: + trainer: ppo + model_path: /data1/AnimalAI-Olympics/examples/2daytrain.cptk + +Learner: + trainer: ppo + epsilon: 0.2 + gamma: 0.99 + lambd: 0.95 + learning_rate: 3.0e-4 + memory_size: 256 + normalize: false + sequence_length: 64 + summary_freq: 1000 + use_recurrent: false + use_curiosity: true + curiosity_strength: 0.01 + curiosity_enc_size: 256 + time_horizon: 128 + batch_size: 64 + buffer_size: 2024 + hidden_units: 256 + num_layers: 1 + beta: 1.0e-2 + max_steps: 5.0e6 + num_epoch: 3 + diff --git a/trainers/env_utils.py b/trainers/env_utils.py new file mode 100644 index 00000000..2a8cb5e1 --- /dev/null +++ b/trainers/env_utils.py @@ -0,0 +1,259 @@ +from animalai.envs.arena_config import Vector3, RGB, Item, Arena, ArenaConfig +from animalai.envs import UnityEnvironment +from collections import defaultdict +import numpy as np +import pprint + + +pp = pprint.PrettyPrinter(indent=4) + +np.set_printoptions(threshold=np.inf) + +class position_tracker(): + + def __init__(self, starting_positions, starting_rotations): + + self.agent_start = starting_positions['Agent'] + self.good_goal_start = np.array(starting_positions['GoodGoal']).astype('float64') + + + self.current_position = np.array(self.agent_start).astype('float64') + + self.current_rotation = np.array(starting_rotations['Agent']).astype('float64') + + self.visited = np.ones((40,40)) + + + + def position_step(self, velocity_obs, action): + + + action = np.array(action) + + if len(action.shape) > 1: + self.current_rotation[np.where(action[:,1] == 1)] -= 7 + self.current_rotation[np.where(action[:,1] == 2)] += 7 + else: + self.current_rotation[np.where(action[1] == 1)] -= 7 + self.current_rotation[np.where(action[1] == 2)] += 7 + + rot_mat = get_rot_mat(deg_to_rad(self.current_rotation[0][0])) + + velocity_obs = np.dot(rot_mat, np.array(velocity_obs).T).T + delta_distance = 0.0595 * velocity_obs + + self.current_position += delta_distance + + square_coord = np.floor(self.current_position[0]).astype(int)[[0,2]] + + if all(square_coord >= 0) and all(square_coord < 40): + self.visited[square_coord[1],square_coord[0]] = 0 + + + + + + + def distance_to_goal(self): + + + distance = 0 + for g_pos, a_pos in zip(self.good_goal_start[0], self.current_position[0]): + + distance += 
(g_pos - a_pos)**2 + distance = distance ** (0.5) + + return distance + + def angle_to_goal(self): + + + agent_to_goal_vec = self.good_goal_start - self.current_position + agent_to_goal_vec = np.delete(agent_to_goal_vec, 1, 1) + + agent_face_vec = np.array([-np.sin(deg_to_rad(self.current_rotation[0][0])), np.cos(deg_to_rad(self.current_rotation[0][0]))]) + + angle = np.arccos(np.dot(agent_to_goal_vec, agent_face_vec)/(np.linalg.norm(agent_to_goal_vec)*np.linalg.norm(agent_face_vec))) + + deg = rad_to_deg(angle) + + if np.isnan(deg): + return 0 + else: + return deg[0] + + + +def deg_to_rad(deg): + return deg * (np.pi/180) + +def rad_to_deg(rad): + return rad * (180/np.pi) + +def get_rot_mat(rad): + return np.array([[np.cos(rad), 0, -np.sin(rad)],[0, 1, 0],[np.sin(rad), 0, np.cos(rad)]]) + + + +class better_env(): + + def __init__(self, n_arenas=2, walls=2, t=250, play=False, inference=False): + print(n_arenas) + + self.n_arenas = n_arenas + self.walls = walls + self.t = t + #self.env_config = self.create_env() + self.generate_new_config() + self.env = UnityEnvironment(file_name='../env/AnimalAI', n_arenas=n_arenas, worker_id=np.random.randint(1,100), play=play,inference=inference) + + + start_positions, start_rotations = self.get_start_positions() + self.position_tracker = position_tracker(start_positions, start_rotations) + + def generate_new_config(self): + self.env_config = self.create_env() + start_positions, start_rotations = self.get_start_positions() + self.position_tracker = position_tracker(start_positions, start_rotations) + + + def create_env(self): + + #print("Creating {} arenas!!!".format(self.n_arenas)) + + #include_items = {'Agent':1}#, 'GoodGoal':1, 'Wall':2} + include_items = {'Agent':1, 'GoodGoal':1} + if self.walls > 0: + include_items['Wall'] = self.walls + + if True: + include_items['GoodGoalMulti'] = 1 + + if True: + include_items['BadGoal'] = 1 + + + env_config = ArenaConfig() + + # Loop over arenas + for i in range(self.n_arenas): + env_config.arenas[i] = Arena(t=self.t) + + #self.details[i] = {} + + + item_list = [] + # Loop over item types in each arena + for item_type, item_count in include_items.items(): + + #self.details[i][item_type] = defaultdict(list) + + name = item_type + colors = [] + positions = [] + rotations = [] + + # Loop over item counts + for j in range(item_count): + if item_type == 'Wall': + colors.append(RGB(r=153, g=153, b=153)) + #self.details[i][item_type]['colors'].append((153,153,153)) + + + elif item_type in ['GoodGoal', 'GoodGoalMulti', 'BadGoal']: + x = np.random.randint(1,39) + #y = np.random.randint(1,39) + y = 1 + + z = np.random.randint(1,39) + #self.details[i][item_type]['positions'].append((x,y,z)) + + positions.append(Vector3(x=x, y=y, z=z)) + + elif item_type == 'Agent': + x = np.random.randint(1,39) + #y = np.random.randint(1,39) + y = 1 + z = np.random.randint(1,39) + #x = 0.5 + #y = 0.5 + #z = 0.5 + #self.details[i][item_type]['positions'].append((x,y,z)) + + positions.append(Vector3(x=x, y=y, z=z)) + rotations.append(0) + + item_list.append(Item(name=name, positions=positions, rotations=rotations, colors=colors)) + env_config.arenas[i].items = item_list + + return env_config + + def get_details(self): + + details = {} + + for i, arena in self.env_config.arenas.items(): + details[i] = {} + + for j, item in enumerate(arena.items): + details[i][item.name] = {} + details[i][item.name]['positions'] = [] + details[i][item.name]['rotations'] = [] + details[i][item.name]['sizes'] = [] + details[i][item.name]['colors'] = [] + + for 
position in item.positions: + details[i][item.name]['positions'].append((position.x, position.y, position.z)) + for rotation in item.rotations: + details[i][item.name]['rotations'].append(rotation) + for size in item.sizes: + details[i][item.name]['sizes'].append((size.x, size.y, size.z)) + for color in item.colors: + details[i][item.name]['colors'].append((color.r, color.g, color.b)) + + return details + + def get_start_positions(self): + + start_positions = {'Agent': [], 'GoodGoal': []} + start_rotations = {'Agent':[]} + + for arena_idx, arena in self.env_config.arenas.items(): + + for item_idx, item in enumerate(arena.items): + if item.name == 'Agent' or item.name == 'GoodGoal': + for position in item.positions: + start_positions[item.name].append([position.x, position.y, position.z]) + if item.name == 'Agent': + for rotation in item.rotations: + start_rotations[item.name].append([rotation]) + + return start_positions, start_rotations + + + + + +def env_info(env_config): + + for i, arena in env_config.arenas.items(): + print("Arena Config #{}".format(i)) + print("max time steps = {}".format(arena.t)) + for j, item in enumerate(arena.items): + print("{:4s}Item name: {}".format('',item.name)) + print("{:8s}Item positions: {}".format('',item.positions)) + print("{:8s}Item rotations: {}".format('',item.rotations)) + print("{:8s}Item sizes: {}".format('',item.sizes)) + print("{:8s}Item colors: {}".format('',item.colors)) + +#env = better_env() +#env_config = env.env_config +#env_info(env_config) +#pp.pprint(env.details) +#pp.pprint(env.details2) + +#pp.pprint(env.get_start_positions()) + +#start_pos, start_rot = env.get_start_positions() +#ps = position_tracker(start_pos, start_rot) +#print(ps.current_position) diff --git a/trainers/ppo.py b/trainers/ppo.py new file mode 100644 index 00000000..ca69b400 --- /dev/null +++ b/trainers/ppo.py @@ -0,0 +1,373 @@ +import gym +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.distributions import Categorical +import numpy as np +import time +import argparse +import os +import datetime + +from animalai.envs import UnityEnvironment +from animalai.envs.arena_config import ArenaConfig + + +from env_utils import * + +parser = argparse.ArgumentParser(description="Train ppo agent for AnimalAI.") +parser.add_argument('--train_name', type=str, help='Will save model with this name. Default: random') +parser.add_argument('--config', type=str, default='configs/1-Food.yaml', help='Environment config file. Default: "configs/1-Food.yaml"') +parser.add_argument('--load_model', type=str, default='saved_models/ppo.pth', help='Saved model to load. Default: "saved_models/ppo.pth"') +parser.add_argument('--inference', default=False, action='store_true', help='Run in inference mode. 
Default: False') + +args = parser.parse_args() + + +if not args.inference: + if args.train_name is not None: + train_filename = '{}.pth'.format(args.train_name) + else: + train_filename = 'ppo_{}.pth'.format(np.random.randint(100000,999999)) +# my params +env_path = '../env/AnimalAI' +brain_name = 'Learner' +train_mode = True +color_channels = 3 +env_field = args.config +n_episodes = 20000 +#max_t = 100 +#num_actions = 9 +#actions_array = np.array([[0,0],[0,1],[0,2],[1,0], [1,1],[1,2], [2,0],[2,1],[2,2]]) +num_actions = 5 +actions_array = np.array([[0,0],[0,1],[0,2],[1,0],[2,0]]) + +n_arenas = 1 +print_interval = 1 +save_interval = 10 +save_path = 'saved_models/' + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + +#Hyperparameters +learning_rate = 0.0005 +gamma = 0.98 +lmbda = 0.95 +eps_clip = 0.1 +K_epoch = 4 +T_horizon = 500 + + + +class PPO(nn.Module): + def __init__(self): + super(PPO, self).__init__() + self.data = [] + + self.conv1 = nn.Conv2d(color_channels, 32, kernel_size=8, stride=4) + self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2) + self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) + self.fc4 = nn.Linear(7 * 7 * 64, 512) + self.fc_pi = nn.Linear(512, num_actions) + self.fc_v = nn.Linear(512, 1) + + #self.fc1 = nn.Linear(4,256) + #self.fc_pi = nn.Linear(256,2) + #self.fc_v = nn.Linear(256,1) + self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) + + def pi(self, x, softmax_dim = 1): + #x = x.permute(2,0,1) + if x.ndim == 3: + x = x.unsqueeze(0) + #x = x.transpose(1,3) + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.fc4(x.view(x.size(0), -1))) + x = self.fc_pi(x) + + #x = F.relu(self.fc1(x)) + #x = self.fc_pi(x) + + prob = F.softmax(x, dim=softmax_dim) + + + return prob + + def v(self, x): + + #x = x.transpose(1,3) + #print(x.shape) + #x = x.permute(2,0,1) + #x = x.unsqueeze(0) + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.fc4(x.view(x.size(0), -1))) + + #x = F.relu(self.fc1(x)) + v = self.fc_v(x) + return v + + def put_data(self, transition): + + self.data.append(transition) + + def make_batch(self): + s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] + for transition in self.data: + s, a, r, s_prime, prob_a, done = transition + + + s_lst.append(s) + a_lst.append([a]) + r_lst.append([r]) + s_prime_lst.append(s_prime) + prob_a_lst.append([prob_a]) + done_mask = 0 if done else 1 + done_lst.append([done_mask]) + + s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ + torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ + torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) + + prob_a = prob_a.to(device) + a = a.to(device) + s_prime = s_prime.to(device) + r = r.to(device) + done_mask = done_mask.to(device) + s = s.to(device) + + self.data = [] + return s, a, r, s_prime, done_mask, prob_a + + def train_net(self): + s, a, r, s_prime, done_mask, prob_a = self.make_batch() + + + for i in range(K_epoch): + td_target = r + gamma * self.v(s_prime) * done_mask + delta = td_target - self.v(s) + delta = delta.detach().cpu().numpy() + + advantage_lst = [] + advantage = 0.0 + for delta_t in delta[::-1]: + advantage = gamma * lmbda * advantage + delta_t[0] + advantage_lst.append([advantage]) + advantage_lst.reverse() + advantage = torch.tensor(advantage_lst, dtype=torch.float).to(device) + + pi = self.pi(s, 
softmax_dim=1) + + + pi_a = pi.gather(1,a) + ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) + + surr1 = ratio * advantage + surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage + loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach()) + + self.optimizer.zero_grad() + loss.mean().backward() + self.optimizer.step() + +def train(): + env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), inference=args.inference) + #arena_config_in = ArenaConfig(env_field) + #print(arena_config_in.arenas) + + + model = PPO() + if os.path.exists(args.load_model): + model.load_state_dict(torch.load(args.load_model)) + print("Successfully loaded saved model from {}".format(args.load_model)) + + model = model.to(device) + + + total_obs = 0 + + for n_epi in range(1, n_episodes+1): + b_env = better_env(n_arenas = 1) + arena_config_in = b_env.env_config + start_positions, start_rotations = b_env.get_start_positions() + ps = position_tracker(start_positions, start_rotations) + + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=train_mode) + state = action_info[brain_name].visual_observations[0] + + #state = np.moveaxis(state, -1, 0) + state = np.moveaxis(state, -1, 1) + done = False + score = 0.0 + scores = [] + + start_episode = time.time() + n_obs = 0 + while not done: + for t in range(T_horizon): + n_obs += n_arenas + + prob = model.pi(torch.from_numpy(state).float().to(device)) + m = Categorical(prob) + + #a = m.sample().item() + a = m.sample() + action = actions_array[a.cpu().numpy().astype(int)] + #s_prime, reward, done, info = + action_info = env.step(vector_action=action) + next_state = action_info[brain_name].visual_observations[0] + velocity_obs = action_info[brain_name].vector_observations + + ps.position_step(velocity_obs, action) + #print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + #print('Distance to goal = {}'.format(ps.distance_to_goal())) + + #next_state = np.moveaxis(next_state, -1, 0) + next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] + reward = action_info[brain_name].rewards # list of rewards len = n_arenas + reward -= ps.distance_to_goal()/100 + #print(reward) + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + prob_a = prob[np.arange(prob.shape[0])[:,None], a.cpu().numpy().astype(int)[:,None]] + + for (s, a, r, n_s, p_a, d) in zip (state, a, reward, next_state, prob_a, arenas_done): + model.put_data((s, a, r, n_s, p_a, d)) + scores.append(r) + #model.put_data((state, a, reward, next_state, prob[0][a].item(), done)) + #model.put_data((state, a, reward, next_state, prob_a, done)) + state = next_state + + #score += reward + if done: + break + + start_train = time.time() + model.train_net() + end_train = time.time() + #print('time to train: ',end_train - start_train) + + end_episode = time.time() + + #print('{} observations/second'.format(n_obs/(end_episode - start_episode))) + + #scores.append(score) + + if n_epi%print_interval==0 and n_epi!=0: + print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, np.mean(scores)/n_arenas, n_obs/(end_episode - start_episode))) + + if n_epi%save_interval==0 and n_epi!=0: + print("Saving model to {} at {}".format(save_path+train_filename, datetime.datetime.now())) + torch.save(model.state_dict(), save_path+train_filename) + + + env.close() + +def env_info(env_config): + + for i, arena in 
env_config.arenas.items(): + print("Arena Config #{}".format(i)) + print("max time steps = {}".format(arena.t)) + for j, item in enumerate(arena.items): + print("Item name: {}".format(item.name)) + print("Item positions: {}".format(item.positions)) + print("Item rotations: {}".format(item.rotations)) + print("Item sizes: {}".format(item.sizes)) + print("Item colors: {}".format(item.colors)) + +def inference(): + env=UnityEnvironment(file_name=env_path, n_arenas=n_arenas, worker_id=np.random.randint(1,100), play=False,inference=args.inference) + #arena_config_in = ArenaConfig(env_field) + + + + + model = PPO() + if os.path.exists(args.load_model): + model.load_state_dict(torch.load(args.load_model)) + print("Successfully loaded saved model from {}".format(args.load_model)) + + model = model.to(device) + + + total_obs = 0 + + for n_epi in range(1, n_episodes+1): + + b_env = better_env(n_arenas = 1) + arena_config_in = b_env.env_config + start_positions, start_rotations = b_env.get_start_positions() + ps = position_tracker(start_positions, start_rotations) + + action_info = env.reset(arenas_configurations=arena_config_in, train_mode=False) + state = action_info[brain_name].visual_observations[0] + + state = np.moveaxis(state, -1, 1) + done = False + score = 0.0 + + start_episode = time.time() + n_obs = 0 + action = [[0,1]] + action_info = env.step(vector_action=action) + velocity_obs = action_info[brain_name].vector_observations + ps.position_step(velocity_obs, action) + print('Start position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + + while not done: + for t in range(T_horizon): + n_obs += n_arenas + + prob = model.pi(torch.from_numpy(state).float().to(device)) + m = Categorical(prob) + + a = m.sample() + action = actions_array[a.cpu().numpy().astype(int)] + #if np.random.randint(0,2): + # action = [0,1] + #else: + # action = [0,2] + action_info = env.step(vector_action=action) + #action = [[1,0]] + next_state = action_info[brain_name].visual_observations[0] + velocity_obs = action_info[brain_name].vector_observations + + ps.position_step(velocity_obs, action) + #print('Current position = {}, velocity = {}'.format(ps.current_position, velocity_obs)) + print('Distance to goal = {}'.format(ps.distance_to_goal())) + + next_state = np.moveaxis(next_state, -1, 1) # next state shape = [n_arenas, 3, 84, 84] + reward = action_info[brain_name].rewards # list of rewards len = n_arenas + arenas_done = action_info[brain_name].local_done + done = any(arenas_done) + + + state = next_state + + score += reward[0] + if done: + break + + + end_episode = time.time() + + + if n_epi%print_interval==0 and n_epi!=0: + print("Episode: {}, avg score: {:.4f}, [{:.0f}] observations/second".format(n_epi, score/n_obs, n_obs/(end_episode - start_episode))) + + + + env.close() +if __name__ == '__main__': + + if not args.inference: + print("Starting agent in train mode...") + train() + else: + print("Starting agent in inference mode...") + inference()
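+
+# Example invocations (a sketch; assumes this script is run from the trainers/ directory so the
+# hard-coded env_path of '../env/AnimalAI' resolves to the AnimalAI environment binary, and
+# 'food_run' is just a placeholder checkpoint name):
+#   python ppo.py --train_name food_run                          # train; checkpoints to saved_models/food_run.pth
+#   python ppo.py --inference --load_model saved_models/ppo.pth  # run a saved model in inference mode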