Commit e9c2b52 (initial commit, 0 parents)

Torch version of REINFORCE algorithm / policy gradient

34 files changed: +307 -0 lines

.spyproject/codestyle.ini (+6 lines)

```ini
[codestyle]
indentation = True

[main]
version = 0.1.0
```
.spyproject/encoding.ini (+6 lines)

```ini
[encoding]
text_encoding = utf-8

[main]
version = 0.1.0
```
.spyproject/vcs.ini (+7 lines)

```ini
[vcs]
version_control_system =
use_version_control = False

[main]
version = 0.1.0
```
.spyproject/workspace.ini (+10 lines)

```ini
[workspace]
save_non_project_files = False
restore_data_on_startup = True
save_history = True
save_data_on_exit = True

[main]
version = 0.1.0
recent_files = ['C:\\Users\\wmin_\\Desktop\\pytorch-REINFORCE-master\\main.py', 'C:\\Users\\wmin_\\Desktop\\pytorch-REINFORCE-master\\reinforce_discrete.py']
```
README.md (+33 lines)

# PyTorch REINFORCE

<img src="assets/algo.png" width="800">

PyTorch implementation of REINFORCE.
This repo supports both **continuous** and **discrete** environments in OpenAI Gym.


## Requirements
- Python 2.7
- PyTorch
- OpenAI Gym
- MuJoCo (optional)


## Run
Train with the default hyperparameters. *(The program detects whether the environment is continuous or discrete.)*

```
python main.py --env_name [name of environment]
```
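For example, to train on the discrete CartPole-v0 task used in the experiments below (the checkpoint directory `ckpt_CartPole-v0` is created automatically by `main.py`):

```
python main.py --env_name CartPole-v0
```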
## Experiment results
### Continuous: InvertedPendulum-v1

<img src="assets/InvertedPendulum-v1.png" width="800">

### Discrete: CartPole-v0

<img src="assets/CartPole-v0.png" width="800">

## Reference
- [PyTorch example](https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py)
Two additional binary files (828 Bytes and 2.61 KB) are not shown.

assets/CartPole-v0.png (89.3 KB, image)

assets/InvertedPendulum-v1.png (111 KB, image)

assets/algo.png (158 KB, image)

ckpt_CartPole-v0/reinforce-0.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-100.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1000.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1100.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1200.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1300.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1400.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1500.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1600.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1700.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1800.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-1900.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-200.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-300.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-400.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-500.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-600.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-700.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-800.pkl (4.31 KB, binary file not shown)
ckpt_CartPole-v0/reinforce-900.pkl (4.31 KB, binary file not shown)

main.py (+82 lines)

```python
import argparse, math, os
import numpy as np
import gym
from gym import wrappers

import torch
from torch.autograd import Variable
import torch.nn.utils as utils

from normalized_actions import NormalizedActions

parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='CartPole-v0')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                    help='number of episodes with noise (default: 100)')
parser.add_argument('--seed', type=int, default=123, metavar='N',
                    help='random seed (default: 123)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',
                    help='number of episodes (default: 2000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='hidden layer size (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
parser.add_argument('--display', type=bool, default=False,
                    help='display or not')
args = parser.parse_args()

env_name = args.env_name
env = gym.make(env_name)

# Pick the continuous or discrete agent based on the environment's action space.
if type(env.action_space) != gym.spaces.discrete.Discrete:
    from reinforce_continuous import REINFORCE
    env = NormalizedActions(gym.make(env_name))
else:
    from reinforce_discrete import REINFORCE

if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):
    os.mkdir(dir)

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    for t in range(args.num_steps):
        action, log_prob, entropy = agent.select_action(state)
        action = action.cpu()

        next_state, reward, done, _ = env.step(action.numpy()[0])

        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.Tensor([next_state])

        if done:
            break

    # One policy-gradient update per finished episode.
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    if i_episode % args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-'+str(i_episode)+'.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))

env.close()
```
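The reward list collected above is turned into discounted returns inside the agent's `update_parameters` (defined in `reinforce_continuous.py` / `reinforce_discrete.py` below). A standalone worked example of that backward recursion, not part of the repo:

```python
# Illustrative only: the same backward recursion R = gamma * R + r
# used in update_parameters, on a 3-step episode with reward 1 per step.
gamma = 0.99
rewards = [1.0, 1.0, 1.0]
R = 0.0
returns = []
for r in reversed(rewards):
    R = gamma * R + r
    returns.insert(0, R)   # prepend so returns[i] lines up with rewards[i]
print(returns)             # [2.9701, 1.99, 1.0]
```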

normalized_actions.py (+16 lines)

```python
import gym


class NormalizedActions(gym.ActionWrapper):

    def _action(self, action):
        # Rescale from the policy's [-1, 1] range to [low, high].
        action = (action + 1) / 2  # [-1, 1] => [0, 1]
        action *= (self.action_space.high - self.action_space.low)
        action += self.action_space.low
        return action

    def _reverse_action(self, action):
        # Inverse mapping: [low, high] => [-1, 1].
        action -= self.action_space.low
        action /= (self.action_space.high - self.action_space.low)
        action = action * 2 - 1
        return action
```
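A quick standalone check of the `_action` rescaling; the bounds here are hypothetical (a Box action space with low = -2 and high = 2), not tied to any particular environment in the repo:

```python
import numpy as np

# Hypothetical action bounds for illustration only.
low, high = np.array([-2.0]), np.array([2.0])

a = np.array([0.5])                       # policy output in [-1, 1]
rescaled = (a + 1) / 2 * (high - low) + low
print(rescaled)                           # [1.] -- 0.5 in [-1, 1] maps to 1.0 in [-2, 2]
```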

reinforce_continuous.py (+72 lines)

```python
import sys
import math

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
import torchvision.transforms as T
from torch.autograd import Variable

pi = Variable(torch.FloatTensor([math.pi])).cuda()

def normal(x, mu, sigma_sq):
    # Gaussian density N(x; mu, sigma_sq).
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi.expand_as(sigma_sq)).sqrt()
    return a*b


class Policy(nn.Module):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)   # mean head
        self.linear2_ = nn.Linear(hidden_size, num_outputs)  # variance head (pre-softplus)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2(x)
        sigma_sq = self.linear2_(x)

        return mu, sigma_sq


class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        mu, sigma_sq = self.model(Variable(state).cuda())
        sigma_sq = F.softplus(sigma_sq)

        # Sample an action from N(mu, sigma_sq) via the reparameterized form,
        # then calculate its probability under the current policy.
        eps = torch.randn(mu.size())
        action = (mu + sigma_sq.sqrt()*Variable(eps).cuda()).data
        prob = normal(action, mu, sigma_sq)
        # Differential entropy of a Gaussian: 0.5 * (log(2*pi*sigma_sq) + 1).
        entropy = 0.5*((2*pi.expand_as(sigma_sq)*sigma_sq).log()+1)

        log_prob = prob.log()
        return action, log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            # Discounted return, computed backwards through the episode.
            R = gamma * R + rewards[i]
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - (0.0001*entropies[i].cuda()).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm(self.model.parameters(), 40)
        self.optimizer.step()
```
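A standalone sanity check (not part of the repo) of the density that `normal()` implements; it recomputes the closed form with plain `math`, so it runs without the CUDA dependency of the module above:

```python
import math

def normal_density(x, mu, sigma_sq):
    # N(x; mu, sigma_sq) = exp(-(x - mu)^2 / (2*sigma_sq)) / sqrt(2*pi*sigma_sq)
    return math.exp(-(x - mu) ** 2 / (2 * sigma_sq)) / math.sqrt(2 * math.pi * sigma_sq)

print(normal_density(0.0, 0.0, 1.0))   # 0.3989... = 1/sqrt(2*pi)
```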

reinforce_discrete.py (+75 lines)

```python
import sys
import math

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
import torchvision.transforms as T
from torch.autograd import Variable
import pdb

class Policy(nn.Module):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.n

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        action_scores = self.linear2(x)
        return F.softmax(action_scores)


class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        probs = self.model(Variable(state).cuda())
        # Sample an action from the categorical distribution over actions.
        action = probs.multinomial(1).data
        prob = probs[:, action[0,0]].view(1, -1)
        log_prob = prob.log()
        entropy = - (probs*probs.log()).sum()

        return action[0], log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            # Discounted return, computed backwards through the episode.
            R = gamma * R + rewards[i]
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - \
                (0.0001*entropies[i].cuda()).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm(self.model.parameters(), 40)
        self.optimizer.step()
```
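A minimal sketch (not part of the repo) of loading one of the saved CartPole-v0 checkpoints above for evaluation, assuming the repo's modules are importable, the old gym API used here, and a CUDA device (the training code calls `.cuda()`); `hidden_size` must match the value used at training time (default 128):

```python
import gym
import torch

from reinforce_discrete import REINFORCE

# Hypothetical evaluation setup: rebuild the agent, then restore the state_dict
# that main.py saved with torch.save(agent.model.state_dict(), ...).
env = gym.make('CartPole-v0')
agent = REINFORCE(128, env.observation_space.shape[0], env.action_space)
agent.model.load_state_dict(torch.load('ckpt_CartPole-v0/reinforce-1900.pkl'))
agent.model.eval()

state = torch.Tensor([env.reset()])
action, _, _ = agent.select_action(state)
print(action)
```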
