diff --git a/theta/costfunctions.py b/theta/costfunctions.py
index 279a8e3..3b66b25 100644
--- a/theta/costfunctions.py
+++ b/theta/costfunctions.py
@@ -15,14 +15,27 @@ def cost(self, X, *Y):
     @abstractmethod
     def gradient(self, X):
         pass
+
+
+class kullbackLeibler(costfunction):
+
+    @staticmethod
+    def cost(x, y):
+        if (x < 0).any():
+            raise ValueError("kullbackLeibler cost received negative probabilities")
+        return -np.sum(y*np.log(x))  # + np.sum(y*np.log(y))
+
+    @staticmethod
+    def gradient(x, y):
+        # Elementwise gradient of -sum(y*log(x)) with respect to x
+        return -y/x
+

 class mse(costfunction):
     """ Mean squared error """

     @staticmethod
     def cost(x, y):
-        return np.sum(np.mean((x-y)**2,axis=1))
+        return np.sum(np.mean((x-y)**2,axis=-1))

     @staticmethod
     def gradient(x, y):
@@ -49,7 +62,7 @@ def cost(x, *y):
         return -np.sum(x)

     @staticmethod
-    def gradient(x):
+    def gradient(x, y):
         sys.exit("Gradient not implemented!")


@@ -61,7 +74,7 @@ def cost(x, y):
         return np.sqrt(np.sum(np.mean((y-x)**2,axis=1)))

     @staticmethod
-    def gradient(x):
+    def gradient(x, y):
         return 1.0/np.sqrt(np.sum(np.mean((y-x)**2,axis=1)))*(x-y)


@@ -74,6 +87,5 @@ def cost(x, y):
         return -np.sum(np.mean(np.multiply(y,lx),axis=1))

     @staticmethod
-    def gradient(x):
+    def gradient(x, y):
         return -1.0/y.shape[1]*y/x
-
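A quick sanity check of the new `kullbackLeibler` cost: since the constant target term is dropped (the commented-out `+ np.sum(y*np.log(y))`), the returned value is the cross-entropy, which differs from the true KL divergence only by a term independent of the model output. A minimal sketch with made-up normalized arrays:

```python
import numpy as np
from theta.costfunctions import kullbackLeibler, mse

# Two toy normalized distributions over four bins
x = np.array([0.2, 0.3, 0.4, 0.1])      # model output, must be strictly positive
y = np.array([0.25, 0.25, 0.25, 0.25])  # target distribution

ce = kullbackLeibler.cost(x, y)   # -sum(y*log(x)), i.e. the cross-entropy
kl = ce + np.sum(y * np.log(y))   # adding back the constant gives the true KL
print(ce, kl, mse.cost(x, y))
```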
""" - def __init__(self, parallel=False, ncores=0): + def __init__(self, parallel=False, ncores=0, verbose=True): + self._verbose = verbose super(CMA, self).__init__() if parallel: if(ncores==0): @@ -77,11 +81,20 @@ def train(self, cost, model, x_data, y_data=None, tolfun=1e-11, popsize=None, ma Note: The parameters of the model are changed by this algorithm. """ + if self._verbose: + vb = 0 + else: + vb = -9 initsol = np.real(model.get_parameters()) - args = {'bounds': model.get_bounds(), + + # Prepare the bounds + bounds = model.get_bounds() + + args = {'bounds': bounds, 'tolfun': tolfun, - 'verb_log': 0} + 'verbose': vb, + 'verb_log': vb} sigma = np.max(model.get_bounds()[1])*0.1 if popsize is not None: @@ -112,18 +125,12 @@ def train(self, cost, model, x_data, y_data=None, tolfun=1e-11, popsize=None, ma pool.terminate() else: worker_initialize(cost, model, x_data, y_data) - while not es.stop(): - f_values, solutions = [], [] - while len(solutions) < es.popsize: - curr_fit = x = np.NaN - while np.isnan(curr_fit): - x = es.ask(1, gradf=grad)[0] - curr_fit = worker_compute(x) - solutions.append(x) - f_values.append(curr_fit) - es.tell(solutions, f_values) - es.disp() - print(es.result) + # By using ask_and_eval we let the cma decide what to do with NaNs + solutions, f_values = es.ask_and_eval(worker_compute) + es.tell(solutions, f_values) + es.disp() + if self._verbose: + print(es.result) model.set_parameters(es.result[0]) return es.result[0] diff --git a/theta/rtbm.py b/theta/rtbm.py index f5b9534..31b3e42 100644 --- a/theta/rtbm.py +++ b/theta/rtbm.py @@ -1,13 +1,64 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import import numpy as np -from theta.mathtools import rtbm_probability, hidden_expectations, rtbm_log_probability, \ - check_normalization_consistency, check_pos_def, rtbm_ph, RTBM_precision +from scipy.special import expit, logit +from theta.mathtools import ( + rtbm_probability, + hidden_expectations, + rtbm_log_probability, + check_normalization_consistency, + check_pos_def, + rtbm_ph, + RTBM_precision, +) from theta.riemann_theta.riemann_theta import RiemannTheta, radius, integer_points_python +#### Activations +class Nope: + def __init__(self): + print("WARNING: using no sampling") + def __call__(self, x): + return x + def grad(self, x): + return np.ones_like(x) + def inv(self, y): + return y + +class Tanh: + # The image of tanh is (-1, 1) + # factor of 2 to move it to (0, 1) + def __call__(self, x): + return (np.tanh(x)+1.0)/2.0 + def grad(self, x): + return 2.0*np.cosh(x)**2 + def inv(self, y): + return np.arctanh(2.0*y - 1.0) + +class Sigmoid: + def __call__(self, x): + return expit(x) + def grad(self, x): + return 1.0/(self(x)-self(x)**2) + def inv(self, y): + return logit(y) + +class Softsign: + def __call__(self, x): + return (x/(1+np.abs(x))+1.0)/2.0 + def grad(self, x): + return 2.0*np.abs(x)**2 + def inv(self, y): + ay = np.where(y > 0.0, y, -y) + return y/(1 - ay) + +activation_dict = { + "tanh": Tanh, + "sigmoid": Sigmoid, + "softsign": Softsign + } class AssignError(Exception): """Custom exception for Shur complement test""" + pass @@ -19,7 +70,7 @@ class RTBM(object): hidden_units (int): number of hidden units. mode (theta.rtbm.RTBM.Mode): set the working mode among: `probability mode` (``Mode.Probability``), `log of probability` (``Mode.LogProbability``) and expectation (``Mode.Expectation``), see :class:`theta.rtbm.RTBM.Mode`. - init_max_param_bound (float): maximum value allowed for all parameters during the CMA-ES minimization. 
diff --git a/theta/rtbm.py b/theta/rtbm.py
index f5b9534..31b3e42 100644
--- a/theta/rtbm.py
+++ b/theta/rtbm.py
@@ -1,13 +1,64 @@
 # -*- coding: utf-8 -*-
-from __future__ import absolute_import
 import numpy as np
-from theta.mathtools import rtbm_probability, hidden_expectations, rtbm_log_probability, \
-    check_normalization_consistency, check_pos_def, rtbm_ph, RTBM_precision
+from scipy.special import expit, logit
+from theta.mathtools import (
+    rtbm_probability,
+    hidden_expectations,
+    rtbm_log_probability,
+    check_normalization_consistency,
+    check_pos_def,
+    rtbm_ph,
+    RTBM_precision,
+)
 from theta.riemann_theta.riemann_theta import RiemannTheta, radius, integer_points_python


+#### Activations
+# NB: `grad` returns 1/f'(x), i.e. the Jacobian |dx/dr| by which
+# `get_transformation` multiplies the probability density
+class Nope:
+    def __init__(self):
+        print("WARNING: using no sampling activation")
+    def __call__(self, x):
+        return x
+    def grad(self, x):
+        return np.ones_like(x)
+    def inv(self, y):
+        return y
+
+class Tanh:
+    # The image of tanh is (-1, 1), shifted and rescaled to (0, 1)
+    def __call__(self, x):
+        return (np.tanh(x)+1.0)/2.0
+    def grad(self, x):
+        return 2.0*np.cosh(x)**2
+    def inv(self, y):
+        return np.arctanh(2.0*y - 1.0)
+
+class Sigmoid:
+    def __call__(self, x):
+        return expit(x)
+    def grad(self, x):
+        s = self(x)
+        return 1.0/(s - s**2)
+    def inv(self, y):
+        return logit(y)
+
+class Softsign:
+    def __call__(self, x):
+        return (x/(1+np.abs(x))+1.0)/2.0
+    def grad(self, x):
+        return 2.0*(1.0 + np.abs(x))**2
+    def inv(self, y):
+        # Undo the (0, 1) shift before inverting the softsign itself
+        s = 2.0*y - 1.0
+        return s/(1.0 - np.abs(s))
+
+activation_dict = {
+    "tanh": Tanh,
+    "sigmoid": Sigmoid,
+    "softsign": Softsign
+}


 class AssignError(Exception):
     """Custom exception for Schur complement test"""
+    pass
@@ -19,7 +70,7 @@ class RTBM(object):
         hidden_units (int): number of hidden units.
         mode (theta.rtbm.RTBM.Mode): set the working mode among: `probability mode` (``Mode.Probability``), `log of probability` (``Mode.LogProbability``) and expectation (``Mode.Expectation``), see :class:`theta.rtbm.RTBM.Mode`.
-        init_max_param_bound (float): maximum value allowed for all parameters during the CMA-ES minimization.
+        minimization_bound (float): maximum value allowed for all parameters during the CMA-ES minimization.
         random_bound (float): selects the maximum random value for the Schur complement initialization.
         phase (complex): number which multiplies w and bh ``phase=1`` for Phase I and ``phase=1j`` for Phase II.
         diagonal_T (bool): force T diagonal, by default T is symmetric.
+        positive_T (bool): force T positive definite by optimizing its Cholesky factor.
+        positive_Q (bool): force Q positive definite by optimizing its Cholesky factor.
+        gaussian_init (bool): initialize the model from the moments of a multivariate Gaussian.
+        gaussian_parameters (dict): ``mean`` and ``std`` of the Gaussian used when ``gaussian_init=True``.
+        sampling_activations (str or list): activation used by the sampling transformation, one per visible unit.
@@ -51,12 +102,26 @@ class Mode:
         * ``Mode.LogProbability``: set the output to log probability mode.
         * ``Mode.Expectation``: set the output to expectation mode.
         """
+
         Probability = 0
         LogProbability = 1
         Expectation = 2

-    def __init__(self, visible_units, hidden_units, mode=Mode.Probability,
-                 init_max_param_bound=2, random_bound=1, phase=1, diagonal_T=False):
+    def __init__(
+        self,
+        visible_units,
+        hidden_units,
+        mode=Mode.Probability,
+        minimization_bound=2,
+        random_bound=1,
+        phase=1,
+        diagonal_T=False,
+        positive_T=False,
+        positive_Q=False,
+        gaussian_init=False,
+        gaussian_parameters=None,
+        sampling_activations="tanh"
+    ):
         self._Nv = visible_units
         self._Nh = hidden_units
         self._bv = np.zeros([visible_units, 1])
@@ -64,27 +129,52 @@ def __init__(self, visible_units, hidden_units, mode=Mode.Probability,
         self._bh = np.zeros([hidden_units, 1])
         self._w = np.zeros([visible_units, hidden_units])
         self._q = np.zeros([hidden_units, hidden_units])
-        self._diagonal_T = diagonal_T
+        self._diagonal_T = diagonal_T or (visible_units == 1 and not gaussian_init)
         self._mode = None
         self._call = None
         self._parameters = None
         self._X = None
         self._phase = phase
         self._check_positivity = True
+        # Get the activation per dimension (TODO: validate the input list)
+        if isinstance(sampling_activations, str):
+            sampling_activations = visible_units*[sampling_activations]
+        self._activations = [activation_dict.get(i.lower(), Nope)() for i in sampling_activations]
+
         if diagonal_T:
-            self._size = 2 * self._Nv + self._Nh + (self._Nh**2+self._Nh)//2 + self._Nv*self._Nh
+            if gaussian_init:
+                raise ValueError("Gaussian initialization doesn't allow for diagonal_T")
+            self._size = (
+                2 * self._Nv + self._Nh + (self._Nh ** 2 + self._Nh) // 2 + self._Nv * self._Nh
+            )
         else:
-            self._size = self._Nv + self._Nh + (self._Nv**2+self._Nv+self._Nh**2+self._Nh)//2+self._Nv*self._Nh
+            self._size = (
+                self._Nv
+                + self._Nh
+                + (self._Nv ** 2 + self._Nv + self._Nh ** 2 + self._Nh) // 2
+                + self._Nv * self._Nh
+            )
+
+        # Note that the list of parameters always keeps the following structure:
+        # [biases_visible, biases_hidden, W, upper_triangular_T (or diagonal), upper_triangular_Q]
+        # therefore, the indices for T start at: (nv + nh + nv*nh)

         # set operation mode
         self.mode = mode

         # set boundaries
-        self.set_bounds(init_max_param_bound)
-
-        # Populate with random parameters using Schur complement
-        # This guarantees an acceptable and instantaneous initial solution
-        self.random_init(random_bound)
+        self._positive_T = positive_T
+        self._positive_Q = positive_Q
+        self.set_bounds(minimization_bound)
+
+        if gaussian_init:
+            if gaussian_parameters is None:
+                gaussian_parameters = {"mean": 0.0, "std": 2.0}
+            self.gaussian_initialize(**gaussian_parameters)
+        else:
+            # Populate with random parameters using Schur complement
+            # This guarantees an acceptable and instantaneous initial solution
+            self.random_init(random_bound)

         # Generate vector for gradient calc call
         self._D1 = []
@@ -100,15 +190,20 @@ def __init__(self, visible_units, hidden_units, mode=Mode.Probability,
         if hidden_units > 1:
             for i in range(0, hidden_units):
                 for j in range(0, hidden_units):
-                    tmp = [0] * hidden_units**2
+                    tmp = [0] * hidden_units ** 2
                     tmp[i] = 1
-                    tmp[j+hidden_units] = 1
+                    tmp[j + hidden_units] = 1

-                    self._D2.append([tmp[k:k+hidden_units] for k in range(0, hidden_units+1, hidden_units)])
+                    self._D2.append(
+                        [
+                            tmp[k : k + hidden_units]
+                            for k in range(0, hidden_units + 1, hidden_units)
+                        ]
+                    )

             self._D2 = np.array(self._D2)
         else:
-            self._D2.append([1,1])
+            self._D2.append([1, 1])

     def __call__(self, data, grad_calc=False):
         """Evaluates the RTBM for a given data array"""
@@ -122,7 +217,6 @@ def __call__(self, data, grad_calc=False):
             self._check_positivity = False
         else:
             self._check_positivity = True
-
         return P

     def feed_through(self, X, grad_calc=False):
@@ -155,36 +249,101 @@ def random_init(self, bound):
         """Random initializer which satisfies the Schur complement positivity condition.
         If ``diagonal_T=True`` the initial Q and T are diagonal and W is set to zero.

+        Note that if T or Q are forced to be positive, the parameters array holds
+        their Cholesky factors instead of the matrices themselves.
+
        Args:
            bound (float): the maximum value for the random matrix X used by the Schur complement.
        """
-        a_shape = ((self._Nv+self._Nh), (self._Nv+self._Nh))
-        a_size = (self._Nv+self._Nh)**2
+        a_shape = ((self._Nv + self._Nh), (self._Nv + self._Nh))
+        a_size = (self._Nv + self._Nh) ** 2
+
+        params = np.random.uniform(-bound, bound, a_size + self._Nv + self._Nh)

-        params = np.random.uniform(-bound, bound, a_size+self._Nv+self._Nh)
         if self._diagonal_T:
             x = np.eye(a_shape[0])
-            np.fill_diagonal(x, params[:self._Nv+self._Nh])
+            np.fill_diagonal(x, params[: self._Nv + self._Nh])
         else:
             x = params[:a_size].reshape(a_shape)

         a = np.transpose(x).dot(x)

-        self._q = a[:self._Nh, :self._Nh]
-        self._t = a[self._Nh:self._Nh + self._Nv, self._Nh:]
-        self._w = self._phase * a[self._Nh:, :self._Nh]
-        self._bv = params[a_size:a_size + self._Nv].reshape(self._bv.shape)
-        self._bh = self._phase * params[-self._Nh:].reshape(self._bh.shape)
+        self._q = a[: self._Nh, : self._Nh]
+        self._t = a[self._Nh : self._Nh + self._Nv, self._Nh :]
+        self._w = self._phase * a[self._Nh :, : self._Nh]
+        self._bv = params[a_size : a_size + self._Nv].reshape(self._bv.shape)
+        self._bh = self._phase * params[-self._Nh :].reshape(self._bh.shape)
+
+        self._store_parameters()

-        # store parameters having in mind that Q and T are symmetric.
+    def gaussian_initialize(self, mean=0, std=2.0):
+        """Reset the parameters of the RTBM to a multivariate Gaussian
+        centered around ``mean`` with covariance ``std * eye(Nv)``
+        """
+        multi_mean = np.ones(self._Nv) * mean
+        covmat = np.eye(self._Nv)
+        np.fill_diagonal(covmat, std)
+        n = min(pow(10, self._Nv + 2), 1e7)
+        fake_data = np.random.multivariate_normal(multi_mean, covmat, size=int(n))

+        # Get the bounds for the matrix q, which correspond to the last value in the bounds
+        q_min = self._bounds[0][-1]
+        q_max = self._bounds[1][-1]
+        # The biases instead all have the same bounds and are set at the beginning
+        q_bias_min = self._bounds[0][0]
+        q_bias_max = self._bounds[1][0]
+
+        # Create a positive Q
+        self._bh = np.random.uniform(q_bias_min, q_bias_max, self._Nh)
+        params_q = np.random.uniform(q_min, q_max, (self._Nh ** 2 + self._Nh) // 2)
+        if self._Nh == 1:
+            self._q = params_q.reshape((1, 1))
+        else:
+            chol_q = np.zeros((self._Nh, self._Nh))
+            chol_q[np.triu_indices(self._Nh)] = params_q
+            self._q = chol_q.T.dot(chol_q)
+
+        self._gaussian_init(fake_data.T)
+
+        # TODO: should the bounds of T be adapted to the gaussian initialization?
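The moment matching performed by `_gaussian_init` just below follows from the fact that with `w = 0` and `bh = 0` the RTBM density reduces to a Gaussian P(v) ∝ exp(-½vᵀTv - bvᵀv), so T is the inverse covariance and bv = -T·mean. A standalone numpy check of that relation (the data values are arbitrary):

```python
import numpy as np

# With w = 0 and bh = 0 the RTBM density is Gaussian:
#   P(v) ~ exp(-0.5 v^T T v - bv^T v)  =>  Cov = T^{-1},  mean = -T^{-1} bv
data = np.random.multivariate_normal([1.0, -2.0], np.diag([2.0, 0.5]), size=100000).T

vi = np.mean(data, axis=1, keepdims=True)              # sample mean, shape (Nv, 1)
cov = data.dot(data.T) / data.shape[1] - vi.dot(vi.T)  # sample covariance
t = np.linalg.inv(cov)                                 # same formula as _gaussian_init
bv = -t.dot(vi)

# The model mean -T^{-1} bv recovers the sample mean
print(np.allclose(-np.linalg.inv(t).dot(bv), vi))
```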
+
+    def _gaussian_init(self, data):
+        """Reset the parametrization with a gaussian on top of the data"""
+        # Set W and the biases to 0
+        self._w = np.zeros_like(self._w)
+        self._bh = np.zeros_like(self._bh)
+
+        # Solve the moment equations for the gaussian
+        vi = np.mean(data, axis=1, keepdims=True)
+        vivj = data.dot(data.T) / data.shape[1]
+        invT = vivj - vi.dot(vi.T)
+        self._t = np.linalg.inv(invT)
+        self._bv = -1.0 * np.dot(vi.T, self._t).T
+
+        self._store_parameters()
+
+    def _store_parameters(self):
+        """Store the parameters in the all_parameters array"""
+        all_parameters = [self._bv.flatten(), self._bh.flatten(), self._w.flatten()]
+
+        # store parameters having in mind that Q and T are symmetric
         if self._diagonal_T:
-            self._parameters = np.concatenate([self._bv.flatten(), self._bh.flatten(),
-                                               self._w.flatten(), self._t.diagonal(),
-                                               self._q[np.triu_indices(self._Nh)]])
+            all_parameters.append(self._t.diagonal())
+        else:
+            if self._positive_T and self._Nv > 1:
+                t_target = np.linalg.cholesky(self._t).T
+            else:
+                t_target = self._t
+            all_parameters.append(t_target[np.triu_indices(self._Nv)])
+
+        if self._positive_Q and self._Nh > 1:
+            q_target = np.linalg.cholesky(self._q).T
         else:
-            self._parameters = np.concatenate([self._bv.flatten(), self._bh.flatten(),
-                                               self._w.flatten(), self._t[np.triu_indices(self._Nv)],
-                                               self._q[np.triu_indices(self._Nh)]])
+            q_target = self._q
+
+        all_parameters.append(q_target[np.triu_indices(self._Nh)])
+
+        self._parameters = np.concatenate(all_parameters)

     def mean(self):
         """Computes the first moment estimator (mean).
@@ -202,11 +361,22 @@ def mean(self):
             BhT = self._bh.T
             BtiTW = np.dot(np.dot(BvT, invT), self._w)
             WtiTW = np.dot(np.dot(self._w.T, invT), self._w)
-            return np.real(-np.dot(invT, self._bv) + 1.0 / (2j * np.pi) *
-                           np.dot(np.dot(invT, self._w), RiemannTheta.normalized_eval((BhT - BtiTW) / (2.0j * np.pi),
-                                                                                      (-self._q + WtiTW) / (2.0j * np.pi), mode=self._mode, derivs=self._D1)))
+            return np.real(
+                -np.dot(invT, self._bv)
+                + 1.0
+                / (2j * np.pi)
+                * np.dot(
+                    np.dot(invT, self._w),
+                    RiemannTheta.normalized_eval(
+                        (BhT - BtiTW) / (2.0j * np.pi),
+                        (-self._q + WtiTW) / (2.0j * np.pi),
+                        mode=self._mode,
+                        derivs=self._D1,
+                    ),
+                )
+            )
         else:
-            assert AssertionError('Mean for mode %s not implemented' % self._mode)
+            raise AssertionError("Mean for mode %s not implemented" % self._mode)

     def backprop(self, E):
         """Evaluates and stores the gradients for backpropagation.
@@ -248,27 +418,35 @@ def backprop(self, E):
             np.fill_diagonal(Hb, Hb.diagonal() * 0.5)

             # Grad Bv
-            self._gradBv = np.mean(E * (self._P * (-self._X - 2.0 * iT.dot(self._bv) + iTW.dot(Db))), axis=1)
+            self._gradBv = np.mean(
+                E * (self._P * (-self._X - 2.0 * iT.dot(self._bv) + iTW.dot(Db))), axis=1
+            )

             # Grad Bh
             self._gradBh = np.mean(E * self._P * (Da - Db), axis=1)

             # Grad W
-            self._gradW = (E * self._P * self._X).dot(Da.T) / self._X.shape[1] + np.mean(E * self._P, axis=1) * (
-                self._bv.T.dot(iT).T.dot(Db.T) - 2 * iTW.dot(Hb))
+            self._gradW = (E * self._P * self._X).dot(Da.T) / self._X.shape[1] + np.mean(
+                E * self._P, axis=1
+            ) * (self._bv.T.dot(iT).T.dot(Db.T) - 2 * iTW.dot(Hb))

             # Grad T
             iT2 = np.square(iT)
-            self._gradT = np.diag(np.mean(-0.5 * self._P * self._X ** 2 * E, axis=1)) + np.mean(E * self._P, axis=1) * (
-                0.5 * iT + self._bv ** 2 * iT2 - self._bv * iT2 * self._w.dot(Db) + iT2 * self._w.dot(Hb).dot(
-                    self._w.T))
+            self._gradT = np.diag(np.mean(-0.5 * self._P * self._X ** 2 * E, axis=1)) + np.mean(
+                E * self._P, axis=1
+            ) * (
+                0.5 * iT
+                + self._bv ** 2 * iT2
+                - self._bv * iT2 * self._w.dot(Db)
+                + iT2 * self._w.dot(Hb).dot(self._w.T)
+            )

             # Grad Q
             self._gradQ = np.mean(-self._P * (DDa - DDb) * E, axis=1).reshape(self._q.shape)
             np.fill_diagonal(self._gradQ, self._gradQ.diagonal() * 0.5)

         else:
-            raise AssertionError('Gradients for non-diagonal T not implemented.')
+            raise AssertionError("Gradients for non-diagonal T not implemented.")

     def size(self):
         """
@@ -286,37 +464,57 @@ def set_parameters(self, params):
         Returns:
             bool: True if Q, T and Q-WtTW are positive, False otherwise.
         """
-
         if len(params) != self._size:
-            raise Exception('Size does no match.')
+            raise Exception("Size does not match.")

         self._parameters = params

-        self._bv = params[:self._Nv].reshape(self._bv.shape)
+        self._bv = params[: self._Nv].reshape(self._bv.shape)
         index = self._Nv

-        self._bh = self._phase*params[index:index+self._Nh].reshape(self._bh.shape)
+        self._bh = self._phase * params[index : index + self._Nh].reshape(self._bh.shape)
         index += self._Nh

-        self._w = self._phase*params[index:index+self._Nv*self._Nh].reshape(self._w.shape)
+        self._w = self._phase * params[index : index + self._Nv * self._Nh].reshape(self._w.shape)
         index += self._w.size

         if self._diagonal_T:
-            np.fill_diagonal(self._t, params[index:index+self._Nv])
+            np.fill_diagonal(self._t, params[index : index + self._Nv])
             index += self._Nv
         else:
             inds = np.triu_indices_from(self._t)
-            self._t[inds] = params[index:index+(self._Nv**2+self._Nv)//2]
-            self._t[(inds[1], inds[0])] = params[index:index+(self._Nv**2+self._Nv)//2]
-            index += (self._Nv**2+self._Nv)//2
+            matrix_t = np.zeros((self._Nv, self._Nv))
+            matrix_t[inds] = params[index : index + (self._Nv ** 2 + self._Nv) // 2]
+            if self._positive_T:
+                # The parameters hold the Cholesky factor of T
+                self._t = matrix_t.T.dot(matrix_t)
+            else:
+                matrix_t[(inds[1], inds[0])] = params[
+                    index : index + (self._Nv ** 2 + self._Nv) // 2
+                ]
+                self._t = matrix_t
+            index += (self._Nv ** 2 + self._Nv) // 2

         inds = np.triu_indices_from(self._q)
-        self._q[inds] = params[index:index+(self._Nh**2+self._Nh)//2]
-        self._q[(inds[1], inds[0])] = params[index:index+(self._Nh**2+self._Nh)//2]
+        # Note: filling a fresh matrix before assigning it is purely aesthetic
+        matrix_q = np.zeros((self._Nh, self._Nh))
+        matrix_q[inds] = params[index : index + (self._Nh ** 2 + self._Nh) // 2]
+        if self._positive_Q:
+            if self._Nh == 1:
+                self._q = matrix_q
+            else:
+                self._q = matrix_q.T.dot(matrix_q)
+        else:
+            matrix_q[(inds[1], inds[0])] = params[index : index + (self._Nh ** 2 + self._Nh) // 2]
+            self._q = matrix_q

+        # At this point we can already check the positivity
         if self._check_positivity:
-            if not check_normalization_consistency(self._t, self._q, self._w) or \
-                    not check_pos_def(self._q) or not check_pos_def(self._t):
+            if not check_normalization_consistency(self._t, self._q, self._w):
+                return False
+            if not self._positive_T and not check_pos_def(self._t):
+                return False
+            if not self._positive_Q and not check_pos_def(self._q):
                 return False

         return True
@@ -336,18 +534,68 @@ def get_gradients(self):
         inds = np.triu_indices_from(self._gradQ)

         if self._diagonal_T:
-            return np.real(np.concatenate((self._gradBv.flatten(),self._gradBh.flatten(),self._gradW.flatten(), self._gradT.diagonal(), self._gradQ[inds].flatten())))
+            return np.real(
+                np.concatenate(
+                    (
+                        self._gradBv.flatten(),
+                        self._gradBh.flatten(),
+                        self._gradW.flatten(),
+                        self._gradT.diagonal(),
+                        self._gradQ[inds].flatten(),
+                    )
+                )
+            )
         else:
-            return np.real(np.concatenate((self._gradBv.flatten(),self._gradBh.flatten(),self._gradW.flatten(), self._gradT.flatten(), self._gradQ[inds].flatten())))
+            return np.real(
+                np.concatenate(
+                    (
+                        self._gradBv.flatten(),
+                        self._gradBh.flatten(),
+                        self._gradW.flatten(),
+                        self._gradT.flatten(),
+                        self._gradQ[inds].flatten(),
+                    )
+                )
+            )
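The positivity guarantee behind `positive_T`/`positive_Q` rests on the fact that CᵀC is symmetric positive semi-definite for any upper-triangular factor C, so the optimizer can roam freely over the factor entries. A self-contained check of the idea (the bound value is illustrative, mirroring the limits set by `set_bounds` below):

```python
import numpy as np

nh, bound = 3, 2.0
# Draw the upper-triangular factor inside (-sqrt(bound), sqrt(bound))
c = np.zeros((nh, nh))
c[np.triu_indices(nh)] = np.random.uniform(-np.sqrt(bound), np.sqrt(bound), (nh * nh + nh) // 2)

q = c.T.dot(c)                                  # reconstruction used by set_parameters
print(np.all(np.linalg.eigvalsh(q) >= -1e-12))  # positive semi-definite up to round-off
```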

     def set_bounds(self, param_bound):
-        """Sets the parameter bound for each parameter.
+        """Sets the parameter bounds for each parameter as (-param_bound, param_bound).
+        If T and/or Q are forced positive, the bounds depend on their shape: when T is
+        diagonal (or 1x1) the bounds are (1/param_bound, param_bound); otherwise, since
+        we optimize the Cholesky factor, the bounds are (-sqrt(param_bound), sqrt(param_bound)).

         Args:
             param_bound (float): the maximum absolute value for parameter variation.
         """
-        upper_bounds = [param_bound] * self._size
-        lower_bounds = [-param_bound] * self._size
+        upper_bounds = np.ones(self._size) * param_bound
+        lower_bounds = -upper_bounds
+
+        # T parameters start right after the biases and W (see the layout note in __init__)
+        t_idx = self._Nv + self._Nh + self._Nv * self._Nh
+        t_size = self._Nv if self._diagonal_T else (self._Nv ** 2 + self._Nv) // 2
+
+        if self._positive_T:
+            if self._Nv == 1 or self._diagonal_T:
+                lower_bounds[t_idx : t_idx + t_size] = 1.0 / param_bound
+                upper_bounds[t_idx : t_idx + t_size] = param_bound
+            else:
+                sqr_bound = np.sqrt(param_bound)
+                lower_bounds[t_idx : t_idx + t_size] = -sqr_bound
+                upper_bounds[t_idx : t_idx + t_size] = sqr_bound
+
+        # We do the same for Q, only Q is not allowed to be diagonal
+        q_idx = t_idx + t_size
+        if self._positive_Q:
+            # if nh = 1 the parameter is q itself, so keep it positive
+            if self._Nh == 1:
+                lower_bounds[q_idx] = 1.0 / param_bound
+                upper_bounds[q_idx] = param_bound
+            else:
+                # positive but not diagonal: constrain the Cholesky factor instead
+                sqr_bound = np.sqrt(param_bound)
+                lower_bounds[q_idx:] = -sqr_bound
+                upper_bounds[q_idx:] = sqr_bound
+
         self._bounds = [lower_bounds, upper_bounds]

     def get_bounds(self):
@@ -357,6 +605,45 @@ def get_bounds(self):
         """
         return self._bounds

+    def get_transformation(self, xarr):
+        """Receives an array (nevents, ndim) and returns the transformed
+        array together with the probability density transformed accordingly"""
+        # Get first the density from the raw rtbm
+        px = self(xarr.T)[0]
+        # Then transform the input array and the density according to the activation
+        r_list = []
+        for x, act in zip(xarr.T, self._activations):
+            r_list.append(act(x))
+            px *= act.grad(x)
+        r = np.stack(r_list, axis=-1)
+        return r, px
+
+    def undo_transformation(self, rand):
+        """Receives an array of random numbers (the product of `get_transformation`)
+        and inverts the transformation, returning `xarr`"""
+        x_list = []
+        for r, act in zip(rand.T, self._activations):
+            x_list.append(act.inv(r))
+        return np.stack(x_list, axis=-1)
+
+    def make_sample_rho(self, size):
+        r"""Produces a probability density between 0 and 1 such that
+
+        \int_0^1 p(x) dx = 1
+
+        Returns:
+            r: np.array (nevents, ndim)
+                sampling of P(v) mapped to (0, 1)
+            p(x): np.array (nevents,)
+                the transformed density evaluated at the sampled points
+            r_raw: np.array (nevents, ndim)
+                original sampling of P(v) on (-inf, inf)
+        """
+        r_raw, _ = self.make_sample(size)
+        r, px = self.get_transformation(r_raw)
+        return r, px, r_raw
+
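The activation machinery used by `get_transformation` can be checked in isolation. Note the convention stated at the top of this file: `grad` returns 1/f'(x), i.e. the Jacobian |dx/dr| by which the density is multiplied. A small sketch with the `Tanh` activation from this diff:

```python
import numpy as np
from theta.rtbm import Tanh

act = Tanh()
x = np.linspace(-3.0, 3.0, 7)

r = act(x)                          # maps (-inf, inf) onto (0, 1)
print(np.allclose(act.inv(r), x))   # the inverse recovers x

# p_r(r) = p_x(x) * |dx/dr| and grad(x) is exactly 1/f'(x)
fprime = 0.5 * (1.0 - np.tanh(x) ** 2)
print(np.allclose(act.grad(x), 1.0 / fprime))
```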
     def make_sample(self, size, epsilon=RTBM_precision):
         """Produces P(v) and P(h) samples for the current RTBM architecture.

@@ -369,19 +656,19 @@ def make_sample(self, size, epsilon=RTBM_precision):
             list of numpy.array: sampling of P(h)
         """
         invT = np.linalg.inv(self._t)
-        WTiW  = self._w.T.dot(invT.dot(self._w))
+        WTiW = self._w.T.dot(invT.dot(self._w))
         BvTiW = self._bv.T.dot(invT.dot(self._w))

-        O = (self._q - WTiW)
-        Z = (self._bh.T - BvTiW)
+        O = self._q - WTiW
+        Z = self._bh.T - BvTiW
         Z = Z.flatten()

-        Omega = np.array(-O/(2.0j*np.pi), dtype=np.complex)
+        Omega = np.array(-O / (2.0j * np.pi), dtype=complex)
         Y = Omega.imag

-        RT = RiemannTheta.eval(Z/(2.0j*np.pi),-O/(2.0j*np.pi) )
+        RT = RiemannTheta.eval(Z / (2.0j * np.pi), -O / (2.0j * np.pi))

-        if(Y.shape[0]!=1):
+        if Y.shape[0] != 1:
             _T = np.linalg.cholesky(Y).T
         else:
             _T = np.sqrt(Y)

@@ -389,66 +676,67 @@ def make_sample(self, size, epsilon=RTBM_precision):
         T = np.ascontiguousarray(_T)
         g = len(Z)
-        R = radius(epsilon, _T, derivs=[], accuracy_radius=5.)
-        S = np.ascontiguousarray(integer_points_python(g,R,_T))
+        R = radius(epsilon, _T, derivs=[], accuracy_radius=5.0)
+        S = np.ascontiguousarray(integer_points_python(g, R, _T))

         pmax = 0
         for s in S:
             v = rtbm_ph(self, s)
-            if v > pmax: pmax = v
+            if v > pmax:
+                pmax = v

         # Rejection sampling
         ph = []
         while len(ph) < size:
-            U = np.random.randint(0,len(S))
-            X = (np.exp(-0.5*S[U].T.dot(O).dot(S[U])-Z.dot(S[U]))/RT).real
+            U = np.random.randint(0, len(S))
+            X = (np.exp(-0.5 * S[U].T.dot(O).dot(S[U]) - Z.dot(S[U])) / RT).real
             J = np.random.uniform()

-            if(X/pmax > J):
+            if X / pmax > J:
                 ph.append(S[U])

         # Draw samples from P(v|h)
         pv = np.zeros(shape=(len(ph), self._bv.shape[0]))

-        for i in range(0,len(ph)):
-            muh = -np.dot(invT, np.dot(self._w, ph[i].reshape(g,1))+ self._bv)
-            pv[i] = np.random.multivariate_normal(mean=muh.flatten(),cov=invT, size=1).flatten()
+        for i in range(0, len(ph)):
+            muh = -np.dot(invT, np.dot(self._w, ph[i].reshape(g, 1)) + self._bv)
+            pv[i] = np.random.multivariate_normal(mean=muh.flatten(), cov=invT, size=1).flatten()

         return pv, ph

     def conditional(self, d):
-        """Generates the conditional RTBM. 
-
+        """Generates the conditional RTBM.
+
         Args:
             d (numpy.array): column vector containing the values for the conditional

         Returns:
             theta.rtbm.RTBM: RTBM modelling the conditional probability P(y|d)
         """
-        assert (self._Nv > 1), "cannot do the conditional probability of a 1d distribution"
-
+        assert self._Nv > 1, "cannot do the conditional probability of a 1d distribution"
+
         nh = self._Nh
-        nv = self._Nv 
+        nv = self._Nv

-        assert (d.size < nv), "d larger than Nv"
+        assert d.size < nv, "d larger than Nv"
+
+        k = int(nv - d.size)

-        k = int(nv-d.size)
-
         cmodel = RTBM(k, nh)

         # Matrix A
-        t = self.t[:k,:k]
+        t = self.t[:k, :k]
         t = t[np.triu_indices(k)]
         q = self.q[np.triu_indices(nh)]
         w = self.w[:k]

         # Biases
         bh = self.bh + np.dot(self.w[k:].T, d)
-        bv = self.bv[:k] + np.dot(self.t[:k,k:], d)
+        bv = self.bv[:k] + np.dot(self.t[:k, k:], d)

-        cparams = np.concatenate((bv, bh, w, t, q), axis = None)
+        cparams = np.concatenate((bv, bh, w, t, q), axis=None)
         cmodel.set_parameters(cparams)

         return cmodel
-
+
     @property
     def mode(self):
         """Sets and returns the RTBM mode."""
@@ -464,14 +752,18 @@ def mode(self, value):

         if value is self.Mode.Probability:
             self._call = lambda data: np.real(
-                rtbm_probability(data, self._bv, self._bh, self._t, self._w, self._q, mode))
+                rtbm_probability(data, self._bv, self._bh, self._t, self._w, self._q, mode)
+            )
         elif value is self.Mode.LogProbability:
             self._call = lambda data: np.real(
-                rtbm_log_probability(data, self._bv, self._bh, self._t, self._w, self._q, mode))
+                rtbm_log_probability(data, self._bv, self._bh, self._t, self._w, self._q, mode)
+            )
         elif value is self.Mode.Expectation:
-            self._call = lambda data: np.real(self._phase * hidden_expectations(data, self._bh, self._w, self._q))
+            self._call = lambda data: np.real(
+                self._phase * hidden_expectations(data, self._bh, self._w, self._q)
+            )
         else:
-            raise AssertionError('Mode %s not implemented.' % value)
+            raise AssertionError("Mode %s not implemented." % value)

         self._mode = value

@@ -483,7 +775,7 @@ def bv(self):
     @bv.setter
     def bv(self, value):
         if value.shape != self._bv.shape:
-            raise AssertionError('Setting bv with wrong shape.')
+            raise AssertionError("Setting bv with wrong shape.")
         self._bv = value

     @property
@@ -494,7 +786,7 @@ def t(self):
     @t.setter
     def t(self, value):
         if value.shape != self._t.shape:
-            raise AssertionError('Setting t with wrong shape.')
+            raise AssertionError("Setting t with wrong shape.")
         self._t = value

     @property
@@ -505,7 +797,7 @@ def bh(self):
     @bh.setter
     def bh(self, value):
         if value.shape != self._bh.shape:
-            raise AssertionError('Setting bh with wrong shape.')
+            raise AssertionError("Setting bh with wrong shape.")
         self._bh = value

     @property
@@ -516,7 +808,7 @@ def w(self):
     @w.setter
     def w(self, value):
         if value.shape != self._w.shape:
-            raise AssertionError('Setting w with wrong shape.')
+            raise AssertionError("Setting w with wrong shape.")
         self._w = value

     @property
@@ -527,5 +819,5 @@ def q(self):
     @q.setter
     def q(self, value):
         if value.shape != self._q.shape:
-            raise AssertionError('Setting q with wrong shape.')
+            raise AssertionError("Setting q with wrong shape.")
         self._q = value
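Putting the new pieces together, a hedged end-to-end sketch; all parameter values are illustrative, and `make_sample_rho` can be slow for large sizes since it relies on rejection sampling:

```python
import numpy as np
from theta.rtbm import RTBM

# Positive-definite T and Q via the Cholesky parametrization, Gaussian start,
# and samples mapped to the unit interval through the sigmoid activation
model = RTBM(2, 1,
             positive_T=True, positive_Q=True,
             gaussian_init=True, gaussian_parameters={"mean": 0.0, "std": 1.0},
             sampling_activations="sigmoid")

r, px, r_raw = model.make_sample_rho(100)
print(r.shape, px.shape, r_raw.shape)  # (100, 2), (100,), (100, 2)
print(np.allclose(model.undo_transformation(r), r_raw))
```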