tools.py
from __future__ import division
from math import exp, log
from random import random

import numpy as np

def sig(x):
    # Numerically stable logistic sigmoid; clamp very negative inputs to 0
    # so that np.exp(-x) cannot overflow.
    if x < -700:
        return 0.0
    return 1 / (1 + np.exp(-x))

sig_vec = np.vectorize(sig)

def samp(p):
    # Draw a single Bernoulli sample with success probability p.
    if random() < p:
        return 1
    return 0

samp_vec = np.vectorize(samp)

def logexp(x):
    # Softplus log(1 + exp(x)); for large x it is approximately x, which
    # also avoids overflow in exp().
    if x > 700:
        return x
    return log(1 + exp(x))

logexp_vec = np.vectorize(logexp)

def safe_log(x):
    # Logarithm clipped away from zero to avoid -inf.
    if x < 1e-32:
        x = 1e-32
    return log(x)

log_vec = np.vectorize(safe_log)
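
# Usage sketch (illustrative inputs): sig_vec maps pre-activations to
# probabilities and samp_vec draws binary samples from them; this is the
# composition used for the Gibbs steps in RBM.play below.
# >>> probs = sig_vec(np.array([-2.0, 0.0, 2.0]))  # approx. [0.12, 0.50, 0.88]
# >>> samp_vec(probs)                              # e.g. array([0, 1, 1])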

def similarity(m, y):
    """
    Input
    -----
    m: binary matrix, shape = (data_len, n_feature)
    y: binary vector, shape = (n_feature,)

    Output
    ------
    The similarity between each row of m and y: the number of entries in
    which that row and y agree.
    """
    return np.sum((m + y + 1) % 2, axis=1)
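
# Doctest-style sketch: the two rows below agree with y in 3 and 1 positions.
# >>> similarity(np.array([[1, 0, 1], [0, 0, 0]]), np.array([1, 0, 1]))
# array([3, 1])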

class LargeActionTask:
    """
    Task with binary states and actions. Each of n_key_states random key
    states is paired with a random key action; the reward for an action is
    the number of bits it shares with the key action of the key state most
    similar to the current state.
    """
    def __init__(self, n_key_states, dim_state, dim_action):
        self.n_key_states = n_key_states
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.key_states = np.random.randint(0, 2, size=(n_key_states, dim_state))
        self.key_actions = np.random.randint(0, 2, size=(n_key_states, dim_action))
        self.current_state = np.random.randint(0, 2, self.dim_state)

    def next_state(self):
        # Draw a uniformly random binary state.
        self.current_state = np.random.randint(0, 2, self.dim_state)
        return self.current_state

    def next_key_state(self):
        # Draw the next state uniformly from the key states.
        ind = np.random.randint(0, self.n_key_states)
        self.current_state = self.key_states[ind, :]
        return self.current_state

    def reward(self, action):
        # Number of bits the action shares with the key action of the
        # key state closest to the current state.
        ind = np.argmax(similarity(self.key_states, self.current_state))
        return np.sum(action == self.key_actions[ind, :])

    def optimal_action(self):
        ind = np.argmax(similarity(self.key_states, self.current_state))
        return self.key_actions[ind, :]
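
# A minimal usage sketch for the task alone; the sizes are illustrative and
# the helper name is hypothetical. By construction, optimal_action() always
# earns the maximum reward, dim_action.
def _task_example():
    task = LargeActionTask(n_key_states=4, dim_state=8, dim_action=8)
    s = task.next_key_state()
    a_star = task.optimal_action()
    return s, a_star, task.reward(a_star)  # the last value equals dim_action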

class RBM:
    """Restricted Boltzmann machine whose negative free energy approximates Q(s, a)."""
    def __init__(self, n_hidden, dim_state, dim_action, scale=None):
        self.n_hidden = n_hidden
        self.dim_state = dim_state
        self.dim_action = dim_action
        self.n_visible = dim_state + dim_action
        if scale is None:
            # Assumed fallback width for the uniform weight initialization.
            scale = 1.0 / np.sqrt(self.n_visible)
        self.scale = scale
        self.w = np.random.uniform(low=-self.scale, high=self.scale, size=(n_hidden, dim_state))
        self.u = np.random.uniform(low=-self.scale, high=self.scale, size=(n_hidden, dim_action))

    def tau(self, s, a):
        # Hidden-unit pre-activations for the visible configuration (s, a).
        return np.dot(self.w, s) + np.dot(self.u, a)

    def lam(self, s, a):
        # Per-hidden-unit free-energy terms -log(1 + exp(tau)).
        return -logexp_vec(self.tau(s, a))

    def q(self, s, a):
        # Q(s, a) as the negative free energy of the visible configuration.
        return -np.sum(self.lam(s, a))

    def play(self, s, n_sample, beta):
        # Initialize hidden and action units from the state alone, then run
        # Gibbs sampling between them at inverse temperature beta.
        h = samp_vec(sig_vec(beta * np.dot(self.w, s)))
        a = samp_vec(sig_vec(beta * np.dot(self.u.T, h)))
        for i in range(n_sample):
            h = samp_vec(sig_vec(beta * self.tau(s, a)))
            a = samp_vec(sig_vec(beta * np.dot(self.u.T, h)))
        return a

    def qlearn(self, s, a, r, lr):
        # Q-learning with gamma = 0: compute the TD error r - Q(s, a) once so
        # both weight matrices see the same target, then step along the
        # gradient of Q, which is p(h = 1 | s, a) outer the visible units.
        ph = sig_vec(self.tau(s, a))
        td = r - self.q(s, a)
        self.w += lr * td * np.outer(ph, s)
        self.u += lr * td * np.outer(ph, a)
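
# A minimal end-to-end sketch of how the task and the RBM are meant to fit
# together; all hyperparameters below are illustrative assumptions, not
# values taken from any experiment. The RBM picks actions by Gibbs sampling
# (play) and is trained towards the immediate reward (qlearn, gamma = 0).
if __name__ == "__main__":
    task = LargeActionTask(n_key_states=4, dim_state=12, dim_action=12)
    agent = RBM(n_hidden=16, dim_state=12, dim_action=12, scale=0.1)
    for step in range(2000):
        s = task.next_key_state()
        a = agent.play(s, n_sample=10, beta=2.0)
        r = task.reward(a)
        agent.qlearn(s, a, r, lr=0.01)
    # Compare the learned policy with the optimal action on a fresh key state.
    s = task.next_key_state()
    print("reward of sampled action:", task.reward(agent.play(s, n_sample=10, beta=2.0)))
    print("reward of optimal action:", task.reward(task.optimal_action()))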