diff --git a/doc/example/sampler_study.ipynb b/doc/example/sampler_study.ipynb index 002c71491..7101f2ec7 100644 --- a/doc/example/sampler_study.ipynb +++ b/doc/example/sampler_study.ipynb @@ -249,7 +249,7 @@ "outputs": [], "source": [ "import seaborn as sns\n", - "from scipy.stats import multivariate_normal\n", + "from scipy.stats import multivariate_normal, norm\n", "\n", "import pypesto\n", "import pypesto.sample as sample\n", @@ -266,7 +266,26 @@ " return -np.log(density(x))\n", "\n", "\n", - "objective = pypesto.Objective(fun=nllh)\n", + "def nllh_grad(x):\n", + " w = np.array([0.3, 0.7])\n", + " mus = np.array([-1.5, 2.5])\n", + " sigmas2 = np.array([0.1, 0.2])\n", + " sigmas = np.sqrt(sigmas2)\n", + "\n", + " pdfs = np.array([w[i] * norm.pdf(x, mus[i], sigmas[i]) for i in range(2)])\n", + " dpdx = np.sum(\n", + " [\n", + " w[i] * norm.pdf(x, mus[i], sigmas[i]) * (mus[i] - x) / sigmas2[i]\n", + " for i in range(2)\n", + " ],\n", + " axis=0,\n", + " )\n", + "\n", + " grad = -dpdx / np.sum(pdfs, axis=0)\n", + " return grad\n", + "\n", + "\n", + "objective = pypesto.Objective(fun=nllh, grad=nllh_grad)\n", "problem = pypesto.Problem(objective=objective, lb=-4, ub=5, x_names=[\"x\"])" ] }, @@ -498,6 +517,43 @@ "result.sample_result.betas" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "### Metropolis-Adjusted Langevin Algorithm\n" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "So far, we have used only the function values to guide the sampling. If gradients are available, they can be used to guide the sampling more effectively. The `pypesto.sample.MetropolisAdjustedLangevinAlgorithm` implements this idea. It can be used within the parallel tempering framework as well." + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sampler = sample.AdaptiveParallelTemperingSampler(\n", + " internal_sampler=sample.MalaSampler(), n_chains=3\n", + ")\n", + "result = sample.sample(\n", + " problem, 1e3, sampler, x0=np.array([0.5]), filename=None\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample.geweke_test(result)\n", + "for i_chain in range(len(result.sample_result.betas)):\n", + " visualize.sampling_1d_marginals(\n", + " result, i_chain=i_chain, suptitle=f\"Chain: {i_chain}\"\n", + " )" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/pypesto/sample/__init__.py b/pypesto/sample/__init__.py index a55740308..0d9f90a22 100644 --- a/pypesto/sample/__init__.py +++ b/pypesto/sample/__init__.py @@ -17,6 +17,7 @@ laplace_approximation_log_evidence, parallel_tempering_log_evidence, ) +from .mala import MalaSampler from .metropolis import MetropolisSampler from .parallel_tempering import ParallelTemperingSampler from .pymc import PymcSampler diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py index c0b9e13dd..00e73c473 100644 --- a/pypesto/sample/adaptive_metropolis.py +++ b/pypesto/sample/adaptive_metropolis.py @@ -1,3 +1,4 @@ +import logging import numbers import numpy as np @@ -62,6 +63,7 @@ def __init__(self, options: dict = None): self._mean_hist = None self._cov_hist = None self._cov_scale = None + self._cov_chol = None @classmethod def default_options(cls): @@ -75,11 +77,10 @@ def default_options(cls): # a higher value reduces the impact of early adaptation "threshold_sample": 1, # regularization factor for ill-conditioned cov matrices of - # the adapted proposal density. regularization might happen if the - # eigenvalues of the cov matrix strongly differ in order - # of magnitude. in this case, the algorithm adds a small - # diag matrix to the cov matrix with elements of this factor - "reg_factor": 1e-6, + # the adapted proposal density. + "reg_factor": 1e-8, + # maximum number of attempts to regularize the covariance matrix + "max_tries": 10, # initial covariance matrix. defaults to a unit matrix "cov0": None, # target acceptance rate @@ -98,14 +99,18 @@ def initialize(self, problem: Problem, x0: np.ndarray): cov0 = float(cov0) * np.eye(len(x0)) else: cov0 = np.eye(len(x0)) - self._cov = regularize_covariance(cov0, self.options["reg_factor"]) + self._cov_chol, self._cov = regularize_covariance( + cov0, + reg_factor=self.options["reg_factor"], + max_tries=self.options["max_tries"], + ) self._mean_hist = self.trace_x[-1] self._cov_hist = self._cov self._cov_scale = 1.0 - def _propose_parameter(self, x: np.ndarray): - x_new = np.random.multivariate_normal(x, self._cov) - return x_new + def _propose_parameter(self, x: np.ndarray, beta: float): + x_new: np.ndarray = np.random.multivariate_normal(x, self._cov) + return x_new, np.nan # no gradient needed def _update_proposal( self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int @@ -114,6 +119,7 @@ def _update_proposal( decay_constant = self.options["decay_constant"] threshold_sample = self.options["threshold_sample"] reg_factor = self.options["reg_factor"] + max_tries = self.options["max_tries"] target_acceptance_rate = self.options["target_acceptance_rate"] # compute historical mean and covariance @@ -136,7 +142,9 @@ def _update_proposal( self._cov = self._cov_scale * self._cov_hist # regularize proposal covariance - self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor) + self._cov_chol, self._cov = regularize_covariance( + cov=self._cov, reg_factor=reg_factor, max_tries=max_tries + ) def update_history_statistics( @@ -186,7 +194,11 @@ def update_history_statistics( return mean, cov -def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray: +def regularize_covariance( + cov: np.ndarray, + reg_factor: float, + max_tries: int, +) -> tuple[np.ndarray, np.ndarray]: """ Regularize the estimated covariance matrix of the sample. @@ -199,14 +211,49 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray: Estimate of the covariance matrix of the sample. reg_factor: Regularization factor. Larger values result in stronger regularization. + max_tries: + Maximum number of attempts to regularize the covariance matrix. Returns ------- + L: + Cholesky decomposition of the regularized covariance matrix. cov: Regularized estimate of the covariance matrix of the sample. """ - eig = np.linalg.eigvals(cov) - eig_min = min(eig) - if eig_min <= 0: - cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0]) - return cov + # we symmetrize cov first + cov = 0.5 * (cov + cov.T) + + d = cov.shape[0] + eye = np.eye(d) + if not np.all(np.isfinite(cov)): + s = np.nanmean(np.diag(cov)) + s = 1.0 if not np.isfinite(s) or s <= 0 else s + return np.sqrt(s) * eye, s * eye + + # Try Cholesky on original matrix first + try: + L = np.linalg.cholesky(cov) + return L, cov + except np.linalg.LinAlgError: + pass # Need regularization + + # scale for the initial jitter + s = np.mean(np.diag(cov)) + s = 1.0 if not np.isfinite(s) or s <= 0 else s + + jitter = reg_factor * s + for _ in range(max_tries): + try: + new_cov = cov + jitter * eye + L = np.linalg.cholesky(new_cov) + return L, new_cov + except np.linalg.LinAlgError: + jitter *= 2.0 + logging.warning( + "Could not regularize covariance matrix. Falling back to " + "scaled identity." + ) + + # final safe fallback + return np.sqrt(s) * eye, s * eye diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py new file mode 100644 index 000000000..a74ebbd43 --- /dev/null +++ b/pypesto/sample/mala.py @@ -0,0 +1,170 @@ +import logging + +import numpy as np + +from ..problem import Problem +from .adaptive_metropolis import AdaptiveMetropolisSampler + +logger = logging.getLogger(__name__) + + +class MalaSampler(AdaptiveMetropolisSampler): + """Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning. + + MALA is a gradient-based MCMC method that uses the gradient of the + log-posterior to guide the proposal distribution. This allows for + more efficient exploration of the parameter space compared to + standard random-walk Metropolis-Hastings. + + The proposal distribution is: + x_new = x + (step_size / 2) * M * grad_log_p(x) + sqrt(step_size) * sqrt(M) * noise + + where grad_log_p(x) is the gradient of the log-posterior at x, + M is the preconditioning matrix, step_size is the discretization step size, + and noise is standard Gaussian noise. + + Preconditioning can significantly improve convergence for poorly conditioned + problems by rescaling the parameter space. Here, we use an adaptive covariance + matrix as the preconditioning matrix M, which is updated during sampling. + We aim to converge to a fixed target acceptance rate of 0.574, as suggested + by theoretical results for MALA. + + By default, the step size is set to sqrt((1/dim)^(1/3)), where dim is the + dimension of the target distribution, following Boisvert-Beaudry and Bedard (2022). + This scaling helps maintain efficient exploration as the dimensionality increases, + and leads to an asymptotically performant sampler, in the sense that exploring the + parameter space requires O(dim^(1/3)) steps. + + For reference, see: + + * Roberts et al. 1996. + Exponential convergence of Langevin distributions and their + discrete approximations + (https://doi.org/10.2307/3318418) + * Girolami & Calderhead 2011. + Riemann manifold Langevin and Hamiltonian Monte Carlo methods + (https://doi.org/10.1111/j.1467-9868.2010.00765.x) + * Boisvert-Beaudry and Bedard 2022. + MALA with annealed proposals: a generalization of locally and globally balanced proposal distributions + """ + + def __init__(self, step_size: float = None, options: dict = None): + """Initialize the sampler. + + Parameters + ---------- + step_size: + Step size for the Langevin dynamics. If None, it will be set to + sqrt((1/n_parameter)^(1/3)) during initialization. + options: + Options for the sampler. + """ + super().__init__(options) + self.step_size = step_size + self._dimension_of_target_weight = 1.0 + + @classmethod + def default_options(cls): + """Return the default options for the sampler.""" + return { + # controls adaptation degeneration velocity of the proposals + # in [0, 1], with 0 -> no adaptation, i.e. classical + # Metropolis-Hastings + "decay_constant": 0.51, + # number of samples before adaptation decreases significantly. + # a higher value reduces the impact of early adaptation + "threshold_sample": 1, + # regularization factor for ill-conditioned cov matrices of + # the adapted proposal density. + "reg_factor": 1e-8, + # maximum number of attempts to regularize the covariance matrix + "max_tries": 10, + # initial covariance matrix. defaults to a unit matrix + "cov0": None, + # target acceptance rate + "target_acceptance_rate": 0.574, + # show progress + "show_progress": None, + } + + def initialize(self, problem: Problem, x0: np.ndarray): + """Initialize the sampler.""" + # Check if gradient is available + if not problem.objective.has_grad: + raise ValueError("MALA sampler requires gradient information.") + + # Boisvert-Beaudry and Bedard (2022) + # Set step size to problem dimension if not specified + if self.step_size is None: + self.step_size = np.sqrt((1 / problem.dim) ** (1 / 3)) + logger.info(f"Step size set to {self.step_size}") + self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3) + + super().initialize(problem, x0) + self._current_x_grad = -self.neglogpost.get_grad(x0) + + def _propose_parameter(self, x: np.ndarray, beta: float): + """Propose a parameter using preconditioned MALA dynamics.""" + # Get gradient of log-posterior at current position + grad = -self.neglogpost.get_grad(x) + # todo: in parallel tempering, this should be the tempered gradient, but only the likelihood is tempered + + # Apply preconditioning to gradient + precond_grad = self._cov @ (beta * grad) + + # Generate standard Gaussian noise + noise = np.random.randn(len(x)) + + # Apply preconditioning to noise (via Cholesky decomposition) + precond_noise = self._cov_chol @ noise + + # Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise + drift = (self.step_size / 2.0) * precond_grad + diffusion = np.sqrt(self.step_size) * precond_noise + + x_new: np.ndarray = ( + x + self._dimension_of_target_weight * drift + diffusion + ) + return x_new, grad + + def _compute_transition_log_prob( + self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray + ): + """Compute the log probability of transitioning from x_from to x_to with preconditioning.""" + # Apply preconditioning to gradient + precond_grad_from = self._cov @ x_from_grad + + # Mean of the preconditioned proposal distribution + mean = ( + x_from + + self._dimension_of_target_weight + * (self.step_size / 2.0) + * precond_grad_from + ) + + # For preconditioned MALA, the covariance is step_size * M + # We need to compute the log probability under N(mean, step_size * M) + diff = x_to - mean + + # Use Cholesky decomposition for efficient computation + # We need to solve L @ L^T @ z = diff, i.e., z = M^{-1} @ diff + # This is done by first solving L @ y = diff, then L^T @ z = y + try: + # Forward substitution: L @ y = diff + y = np.linalg.solve(self._cov_chol, diff) + # Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y + log_prob = -0.5 * np.dot(y, y) / self.step_size + except np.linalg.LinAlgError: + # If matrix is singular, return -inf + return -np.inf + + # Add normalization constant: -0.5 * log|2π * step_size * M| + # = -0.5 * (d * log(2π * step_size) + log|M|) + # log|M| = 2 * log|L| = 2 * sum(log(diag(L))) + d = len(x_from) + log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol))) + log_prob -= 0.5 * ( + d * np.log(2 * np.pi * self.step_size) + log_det_cov + ) + + return log_prob diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py index 7213dfcc3..0fc13ffb5 100644 --- a/pypesto/sample/metropolis.py +++ b/pypesto/sample/metropolis.py @@ -46,6 +46,7 @@ def __init__(self, options: dict = None): self.trace_neglogpost: Union[Sequence[float], None] = None self.trace_neglogprior: Union[Sequence[float], None] = None self.temper_lpost: bool = False + self._current_x_grad: Union[np.ndarray] = np.nan @classmethod def default_options(cls): @@ -111,7 +112,7 @@ def _perform_step( Propose new parameter, evaluate and check whether to accept. """ # propose step - x_new: np.ndarray = self._propose_parameter(x) + x_new, x_new_grad = self._propose_parameter(x, beta=beta) # check if step lies within bounds if any(x_new < self.problem.lb) or any(x_new > self.problem.ub): @@ -151,6 +152,16 @@ def _perform_step( # log acceptance probability (temper log posterior) log_p_acc = min(beta * (lpost_new - lpost), 0) + # This accounts for an asymmetric proposal distribution + log_q_forward = self._compute_transition_log_prob( + x, x_new, beta * self._current_x_grad + ) + log_q_backward = self._compute_transition_log_prob( + x_new, x, beta * x_new_grad + ) + proposal_correction = log_q_backward - log_q_forward + log_p_acc = min(log_p_acc + proposal_correction, 0) + # flip a coin u = np.random.uniform(0, 1) @@ -160,6 +171,7 @@ def _perform_step( x = x_new lpost = lpost_new lprior = lprior_new + self._current_x_grad = x_new_grad # update proposal self._update_proposal( @@ -168,16 +180,28 @@ def _perform_step( return x, lpost, lprior - def _propose_parameter(self, x: np.ndarray): + def _propose_parameter(self, x: np.ndarray, beta: float): """Propose a step.""" x_new: np.ndarray = x + self.options["std"] * np.random.randn(len(x)) - return x_new + return x_new, np.nan # no gradient needed def _update_proposal( self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int ): """Update the proposal density. Default: Do nothing.""" + def _compute_transition_log_prob( + self, + x_from: np.ndarray, + x_to: np.ndarray, + x_from_grad: Union[np.ndarray, None], + ): + """Compute the transition log probability for symmetric proposal. + + For a symmetric proposal distribution, this is zero. + """ + return 0.0 + def get_last_sample(self) -> InternalSample: """Get the last sample in the chain. diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py index 7d9f8e4dd..6d78e7651 100644 --- a/test/sample/test_sample.py +++ b/test/sample/test_sample.py @@ -32,6 +32,7 @@ gaussian_nllh_grad, gaussian_nllh_hess, gaussian_problem, + gaussian_problem_with_grad, negative_log_posterior, negative_log_prior, rosenbrock_problem, @@ -47,6 +48,7 @@ "Pymc", "Emcee", "Dynesty", + "Mala", ] ) def sampler(request): @@ -62,6 +64,12 @@ def sampler(request): "show_progress": False, }, ) + elif request.param == "Mala": + return sample.MalaSampler( + options={ + "show_progress": False, + }, + ) elif request.param == "ParallelTempering": return sample.ParallelTemperingSampler( internal_sampler=sample.MetropolisSampler(), @@ -91,11 +99,15 @@ def sampler(request): ) -@pytest.fixture(params=["gaussian", "gaussian_mixture", "rosenbrock"]) +@pytest.fixture( + params=["gaussian", "gaussian_with_grad", "gaussian_mixture", "rosenbrock"] +) def problem(request): if request.param == "gaussian": return gaussian_problem() - if request.param == "gaussian_mixture": + elif request.param == "gaussian_with_grad": + return gaussian_problem_with_grad() + elif request.param == "gaussian_mixture": return gaussian_mixture_problem() elif request.param == "rosenbrock": return rosenbrock_problem() @@ -103,6 +115,14 @@ def problem(request): def test_pipeline(sampler, problem): """Check that a typical pipeline runs through.""" + if isinstance(sampler, sample.MalaSampler): + if not problem.objective.has_grad: + pytest.skip("MALA requires gradient information.") + if isinstance(sampler, sample.EmceeSampler): + if problem.objective.has_grad: + pytest.skip( + "EMCEE fails to converge when started in optimal point with gradient." + ) # optimization optimizer = optimize.ScipyOptimizer(options={"maxiter": 10}) result = optimize.minimize( @@ -202,7 +222,9 @@ def test_regularize_covariance(): """ matrix = np.array([[-1.0, -4.0], [-4.0, 1.0]]) assert np.any(np.linalg.eigvals(matrix) < 0) - reg = sample.adaptive_metropolis.regularize_covariance(matrix, 1e-6) + reg = sample.adaptive_metropolis.regularize_covariance( + matrix, 1e-6, max_tries=1 + ) assert np.all(np.linalg.eigvals(reg) >= 0) diff --git a/test/sample/util.py b/test/sample/util.py index b191962bd..268f472bc 100644 --- a/test/sample/util.py +++ b/test/sample/util.py @@ -35,7 +35,7 @@ def gaussian_llh(x): def gaussian_nllh_grad(x): """Negative log-likelihood gradient for Gaussian.""" - return np.array([((x - MU) / (SIGMA**2))]) + return (x - MU) / (SIGMA**2) def gaussian_nllh_hess(x): @@ -56,6 +56,19 @@ def nllh(x): return problem +def gaussian_problem_with_grad(): + """Defines a simple Gaussian problem.""" + + def nllh(x): + return -gaussian_llh(x) + + objective = pypesto.Objective(fun=nllh, grad=gaussian_nllh_grad) + problem = pypesto.Problem( + objective=objective, lb=LB_GAUSSIAN, ub=UB_GAUSSIAN + ) + return problem + + def gaussian_mixture_llh(x): """Log-likelihood for Gaussian mixture model.""" return np.log(