Commit e9c2b52 (initial commit, 0 parents)

Torch version of REINFORCE algorithm / policy gradient

34 files changed: +307 -0 lines

.spyproject/codestyle.ini (+6 lines)

```ini
[codestyle]
indentation = True

[main]
version = 0.1.0
```
.spyproject/encoding.ini (+6 lines)

```ini
[encoding]
text_encoding = utf-8

[main]
version = 0.1.0
```
.spyproject/vcs.ini (+7 lines)

```ini
[vcs]
version_control_system =
use_version_control = False

[main]
version = 0.1.0
```
.spyproject/workspace.ini (+10 lines)

```ini
[workspace]
save_non_project_files = False
restore_data_on_startup = True
save_history = True
save_data_on_exit = True

[main]
version = 0.1.0
recent_files = ['C:\\Users\\wmin_\\Desktop\\pytorch-REINFORCE-master\\main.py', 'C:\\Users\\wmin_\\Desktop\\pytorch-REINFORCE-master\\reinforce_discrete.py']
```
README.md (+33 lines)

# PyTorch REINFORCE

<img src="assets/algo.png" width="800">

PyTorch implementation of REINFORCE.
This repo supports both **continuous** and **discrete** environments in OpenAI Gym.


## Requirements
- Python 2.7
- PyTorch
- OpenAI Gym
- MuJoCo (optional)


## Run
Train with the default hyperparameters. *(The program detects whether the environment is continuous or discrete.)*

```
python main.py --env_name [name of environment]
```
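For example, to train on the discrete CartPole-v0 task used in the experiments below (the checkpoint directory `ckpt_CartPole-v0` is created automatically by `main.py`):

```
python main.py --env_name CartPole-v0
```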
## Experiment results
### Continuous: InvertedPendulum-v1

<img src="assets/InvertedPendulum-v1.png" width="800">

### Discrete: CartPole-v0

<img src="assets/CartPole-v0.png" width="800">

## Reference
- [PyTorch example](https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py)
Two additional binary files (828 Bytes and 2.61 KB) are not shown.

assets/CartPole-v0.png (89.3 KB, image)

assets/InvertedPendulum-v1.png (111 KB, image)

assets/algo.png (158 KB, image)

ckpt_CartPole-v0/reinforce-0.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-100.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1000.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1100.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1200.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1300.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1400.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1500.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1600.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1700.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1800.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1900.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-200.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-300.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-400.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-500.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-600.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-700.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-800.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-900.pkl (4.31 KB, binary file not shown)

main.py (+82 lines)

```python
import argparse, math, os
import numpy as np
import gym
from gym import wrappers

import torch
from torch.autograd import Variable
import torch.nn.utils as utils

from normalized_actions import NormalizedActions

parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='CartPole-v0')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                    help='number of episodes with noise (default: 100)')
parser.add_argument('--seed', type=int, default=123, metavar='N',
                    help='random seed (default: 123)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',
                    help='number of episodes (default: 2000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='hidden layer size (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
parser.add_argument('--display', type=bool, default=False,
                    help='display or not')
args = parser.parse_args()

env_name = args.env_name
env = gym.make(env_name)

# Pick the continuous or discrete agent based on the environment's action space.
if type(env.action_space) != gym.spaces.discrete.Discrete:
    from reinforce_continuous import REINFORCE
    env = NormalizedActions(gym.make(env_name))
else:
    from reinforce_discrete import REINFORCE

if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):
    os.mkdir(dir)

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    for t in range(args.num_steps):
        action, log_prob, entropy = agent.select_action(state)
        action = action.cpu()

        next_state, reward, done, _ = env.step(action.numpy()[0])

        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.Tensor([next_state])

        if done:
            break

    # One policy-gradient update per finished episode.
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    if i_episode % args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-'+str(i_episode)+'.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))

env.close()
```
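The reward list collected above is turned into discounted returns inside the agent's `update_parameters` (defined in `reinforce_continuous.py` / `reinforce_discrete.py` below). A standalone worked example of that backward recursion, not part of the repo:

```python
# Illustrative only: the same backward recursion R = gamma * R + r
# used in update_parameters, on a 3-step episode with reward 1 per step.
gamma = 0.99
rewards = [1.0, 1.0, 1.0]
R = 0.0
returns = []
for r in reversed(rewards):
    R = gamma * R + r
    returns.insert(0, R)   # prepend so returns[i] lines up with rewards[i]
print(returns)             # [2.9701, 1.99, 1.0]
```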

normalized_actions.py (+16 lines)

```python
import gym


class NormalizedActions(gym.ActionWrapper):

    def _action(self, action):
        # Rescale from the policy's [-1, 1] range to [low, high].
        action = (action + 1) / 2  # [-1, 1] => [0, 1]
        action *= (self.action_space.high - self.action_space.low)
        action += self.action_space.low
        return action

    def _reverse_action(self, action):
        # Inverse mapping: [low, high] => [-1, 1].
        action -= self.action_space.low
        action /= (self.action_space.high - self.action_space.low)
        action = action * 2 - 1
        return action
```
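A quick standalone check of the `_action` rescaling; the bounds here are hypothetical (a Box action space with low = -2 and high = 2), not tied to any particular environment in the repo:

```python
import numpy as np

# Hypothetical action bounds for illustration only.
low, high = np.array([-2.0]), np.array([2.0])

a = np.array([0.5])                       # policy output in [-1, 1]
rescaled = (a + 1) / 2 * (high - low) + low
print(rescaled)                           # [1.] -- 0.5 in [-1, 1] maps to 1.0 in [-2, 2]
```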

reinforce_continuous.py (+72 lines)

```python
import sys
import math

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
import torchvision.transforms as T
from torch.autograd import Variable

pi = Variable(torch.FloatTensor([math.pi])).cuda()

def normal(x, mu, sigma_sq):
    # Gaussian density N(x; mu, sigma_sq).
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi.expand_as(sigma_sq)).sqrt()
    return a*b


class Policy(nn.Module):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)   # mean head
        self.linear2_ = nn.Linear(hidden_size, num_outputs)  # variance head (pre-softplus)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2(x)
        sigma_sq = self.linear2_(x)

        return mu, sigma_sq


class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        mu, sigma_sq = self.model(Variable(state).cuda())
        sigma_sq = F.softplus(sigma_sq)

        # Sample an action from N(mu, sigma_sq) via the reparameterized form,
        # then calculate its probability under the current policy.
        eps = torch.randn(mu.size())
        action = (mu + sigma_sq.sqrt()*Variable(eps).cuda()).data
        prob = normal(action, mu, sigma_sq)
        # Differential entropy of a Gaussian: 0.5 * (log(2*pi*sigma_sq) + 1).
        entropy = 0.5*((2*pi.expand_as(sigma_sq)*sigma_sq).log()+1)

        log_prob = prob.log()
        return action, log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            # Discounted return, computed backwards through the episode.
            R = gamma * R + rewards[i]
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - (0.0001*entropies[i].cuda()).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm(self.model.parameters(), 40)
        self.optimizer.step()
```
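A standalone sanity check (not part of the repo) of the density that `normal()` implements; it recomputes the closed form with plain `math`, so it runs without the CUDA dependency of the module above:

```python
import math

def normal_density(x, mu, sigma_sq):
    # N(x; mu, sigma_sq) = exp(-(x - mu)^2 / (2*sigma_sq)) / sqrt(2*pi*sigma_sq)
    return math.exp(-(x - mu) ** 2 / (2 * sigma_sq)) / math.sqrt(2 * math.pi * sigma_sq)

print(normal_density(0.0, 0.0, 1.0))   # 0.3989... = 1/sqrt(2*pi)
```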

reinforce_discrete.py (+75 lines)

```python
import sys
import math

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
import torchvision.transforms as T
from torch.autograd import Variable
import pdb

class Policy(nn.Module):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.n

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        action_scores = self.linear2(x)
        return F.softmax(action_scores)


class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        probs = self.model(Variable(state).cuda())
        # Sample an action from the categorical distribution over actions.
        action = probs.multinomial(1).data
        prob = probs[:, action[0,0]].view(1, -1)
        log_prob = prob.log()
        entropy = - (probs*probs.log()).sum()

        return action[0], log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            # Discounted return, computed backwards through the episode.
            R = gamma * R + rewards[i]
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - \
                (0.0001*entropies[i].cuda()).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm(self.model.parameters(), 40)
        self.optimizer.step()
```
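A minimal sketch (not part of the repo) of loading one of the saved CartPole-v0 checkpoints above for evaluation, assuming the repo's modules are importable, the old gym API used here, and a CUDA device (the training code calls `.cuda()`); `hidden_size` must match the value used at training time (default 128):

```python
import gym
import torch

from reinforce_discrete import REINFORCE

# Hypothetical evaluation setup: rebuild the agent, then restore the state_dict
# that main.py saved with torch.save(agent.model.state_dict(), ...).
env = gym.make('CartPole-v0')
agent = REINFORCE(128, env.observation_space.shape[0], env.action_space)
agent.model.load_state_dict(torch.load('ckpt_CartPole-v0/reinforce-1900.pkl'))
agent.model.eval()

state = torch.Tensor([env.reset()])
action, _, _ = agent.select_action(state)
print(action)
```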
