60 changes: 58 additions & 2 deletions doc/example/sampler_study.ipynb
@@ -249,7 +249,7 @@
"outputs": [],
"source": [
"import seaborn as sns\n",
"from scipy.stats import multivariate_normal\n",
"from scipy.stats import multivariate_normal, norm\n",
"\n",
"import pypesto\n",
"import pypesto.sample as sample\n",
@@ -266,7 +266,26 @@
" return -np.log(density(x))\n",
"\n",
"\n",
"objective = pypesto.Objective(fun=nllh)\n",
"def nllh_grad(x):\n",
" w = np.array([0.3, 0.7])\n",
" mus = np.array([-1.5, 2.5])\n",
" sigmas2 = np.array([0.1, 0.2])\n",
" sigmas = np.sqrt(sigmas2)\n",
"\n",
" pdfs = np.array([w[i] * norm.pdf(x, mus[i], sigmas[i]) for i in range(2)])\n",
" dpdx = np.sum(\n",
" [\n",
" w[i] * norm.pdf(x, mus[i], sigmas[i]) * (mus[i] - x) / sigmas2[i]\n",
" for i in range(2)\n",
" ],\n",
" axis=0,\n",
" )\n",
"\n",
" grad = -dpdx / np.sum(pdfs, axis=0)\n",
" return grad\n",
"\n",
"\n",
"objective = pypesto.Objective(fun=nllh, grad=nllh_grad)\n",
"problem = pypesto.Problem(objective=objective, lb=-4, ub=5, x_names=[\"x\"])"
]
},
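A quick way to sanity-check the analytic gradient added above (not part of the diff; this sketch assumes the notebook's `nllh` and `nllh_grad` are already defined in the session) is to compare it against central finite differences at a few points:

```python
import numpy as np

# central finite-difference check of the analytic gradient of the
# negative log-likelihood of the two-component Gaussian mixture
eps = 1e-6
for x in [-2.0, 0.0, 1.0, 3.0]:
    fd = (nllh(x + eps) - nllh(x - eps)) / (2 * eps)
    print(f"x={x:+.1f}  analytic={nllh_grad(x):+.5f}  finite-diff={fd:+.5f}")
```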
@@ -498,6 +517,43 @@
"result.sample_result.betas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": "### Metropolis-Adjusted Langevin Algorithm\n"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "So far, we have used only the function values to guide the sampling. If gradients are available, they can be used to guide the sampling more effectively. The `pypesto.sample.MetropolisAdjustedLangevinAlgorithm` implements this idea. It can be used within the parallel tempering framework as well."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sampler = sample.AdaptiveParallelTemperingSampler(\n",
" internal_sampler=sample.MalaSampler(), n_chains=3\n",
")\n",
"result = sample.sample(\n",
" problem, 1e3, sampler, x0=np.array([0.5]), filename=None\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample.geweke_test(result)\n",
"for i_chain in range(len(result.sample_result.betas)):\n",
" visualize.sampling_1d_marginals(\n",
" result, i_chain=i_chain, suptitle=f\"Chain: {i_chain}\"\n",
" )"
]
},
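The cells above run MALA as the internal sampler of the adaptive parallel tempering scheme. As a minimal sketch (an assumption not shown in the diff: like the notebook's other samplers, `MalaSampler` should also work standalone), it could be run on its own as well:

```python
sampler = sample.MalaSampler()
result = sample.sample(problem, 1e3, sampler, x0=np.array([0.5]), filename=None)
sample.geweke_test(result)
visualize.sampling_1d_marginals(result)
```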
{
"cell_type": "markdown",
"metadata": {},
1 change: 1 addition & 0 deletions pypesto/sample/__init__.py
@@ -17,6 +17,7 @@
laplace_approximation_log_evidence,
parallel_tempering_log_evidence,
)
from .mala import MalaSampler
from .metropolis import MetropolisSampler
from .parallel_tempering import ParallelTemperingSampler
from .pymc import PymcSampler
79 changes: 63 additions & 16 deletions pypesto/sample/adaptive_metropolis.py
@@ -1,3 +1,4 @@
import logging
import numbers

import numpy as np
@@ -62,6 +63,7 @@ def __init__(self, options: dict = None):
self._mean_hist = None
self._cov_hist = None
self._cov_scale = None
self._cov_chol = None

@classmethod
def default_options(cls):
@@ -75,11 +77,10 @@ def default_options(cls):
# a higher value reduces the impact of early adaptation
"threshold_sample": 1,
# regularization factor for ill-conditioned cov matrices of
# the adapted proposal density. regularization might happen if the
# eigenvalues of the cov matrix strongly differ in order
# of magnitude. in this case, the algorithm adds a small
# diag matrix to the cov matrix with elements of this factor
"reg_factor": 1e-6,
# the adapted proposal density.
"reg_factor": 1e-8,
# maximum number of attempts to regularize the covariance matrix
"max_tries": 10,
# initial covariance matrix. defaults to a unit matrix
"cov0": None,
# target acceptance rate
@@ -98,14 +99,18 @@ def initialize(self, problem: Problem, x0: np.ndarray):
cov0 = float(cov0) * np.eye(len(x0))
else:
cov0 = np.eye(len(x0))
self._cov = regularize_covariance(cov0, self.options["reg_factor"])
self._cov_chol, self._cov = regularize_covariance(
cov0,
reg_factor=self.options["reg_factor"],
max_tries=self.options["max_tries"],
)
self._mean_hist = self.trace_x[-1]
self._cov_hist = self._cov
self._cov_scale = 1.0

def _propose_parameter(self, x: np.ndarray):
x_new = np.random.multivariate_normal(x, self._cov)
return x_new
def _propose_parameter(self, x: np.ndarray, beta: float):
x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
return x_new, np.nan # no gradient needed

def _update_proposal(
self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int
@@ -114,6 +119,7 @@ def _update_proposal(
decay_constant = self.options["decay_constant"]
threshold_sample = self.options["threshold_sample"]
reg_factor = self.options["reg_factor"]
max_tries = self.options["max_tries"]
target_acceptance_rate = self.options["target_acceptance_rate"]

# compute historical mean and covariance
Expand All @@ -136,7 +142,9 @@ def _update_proposal(
self._cov = self._cov_scale * self._cov_hist

# regularize proposal covariance
self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor)
self._cov_chol, self._cov = regularize_covariance(
cov=self._cov, reg_factor=reg_factor, max_tries=max_tries
)


def update_history_statistics(
@@ -186,7 +194,11 @@ def update_history_statistics(
return mean, cov


def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
def regularize_covariance(
cov: np.ndarray,
reg_factor: float,
max_tries: int,
) -> tuple[np.ndarray, np.ndarray]:
"""
Regularize the estimated covariance matrix of the sample.

@@ -199,14 +211,49 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
Estimate of the covariance matrix of the sample.
reg_factor:
Regularization factor. Larger values result in stronger regularization.
max_tries:
Maximum number of attempts to regularize the covariance matrix.

Returns
-------
L:
Cholesky decomposition of the regularized covariance matrix.
cov:
Regularized estimate of the covariance matrix of the sample.
"""
eig = np.linalg.eigvals(cov)
eig_min = min(eig)
if eig_min <= 0:
cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
return cov
# we symmetrize cov first
cov = 0.5 * (cov + cov.T)

d = cov.shape[0]
eye = np.eye(d)
if not np.all(np.isfinite(cov)):
s = np.nanmean(np.diag(cov))
s = 1.0 if not np.isfinite(s) or s <= 0 else s
return np.sqrt(s) * eye, s * eye

# Try Cholesky on original matrix first
try:
L = np.linalg.cholesky(cov)
return L, cov
except np.linalg.LinAlgError:
pass # Need regularization

# scale for the initial jitter
s = np.mean(np.diag(cov))
s = 1.0 if not np.isfinite(s) or s <= 0 else s

jitter = reg_factor * s
for _ in range(max_tries):
try:
new_cov = cov + jitter * eye
L = np.linalg.cholesky(new_cov)
return L, new_cov
except np.linalg.LinAlgError:
jitter *= 2.0
logging.warning(
"Could not regularize covariance matrix. Falling back to "
"scaled identity."
)

# final safe fallback
return np.sqrt(s) * eye, s * eye
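As a small usage sketch of the reworked `regularize_covariance` (not part of the diff; it only assumes the function is importable from the module shown above): a rank-deficient covariance cannot be Cholesky-factorized directly, whereas the returned pair is consistent and positive definite.

```python
import numpy as np

from pypesto.sample.adaptive_metropolis import regularize_covariance

# rank-deficient 2x2 covariance: np.linalg.cholesky(cov) would raise LinAlgError
cov = np.array([[1.0, 1.0], [1.0, 1.0]])

L, cov_reg = regularize_covariance(cov, reg_factor=1e-8, max_tries=10)
assert np.allclose(L @ L.T, cov_reg)             # returned factor matches returned matrix
assert np.all(np.linalg.eigvalsh(cov_reg) > 0)   # regularized matrix is positive definite
```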
170 changes: 170 additions & 0 deletions pypesto/sample/mala.py
@@ -0,0 +1,170 @@
import logging

import numpy as np

from ..problem import Problem
from .adaptive_metropolis import AdaptiveMetropolisSampler

logger = logging.getLogger(__name__)


class MalaSampler(AdaptiveMetropolisSampler):
"""Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.

MALA is a gradient-based MCMC method that uses the gradient of the
log-posterior to guide the proposal distribution. This allows for
more efficient exploration of the parameter space compared to
standard random-walk Metropolis-Hastings.

The proposal distribution is:
x_new = x + (step_size / 2) * M * grad_log_p(x) + sqrt(step_size) * sqrt(M) * noise

where grad_log_p(x) is the gradient of the log-posterior at x,
M is the preconditioning matrix, step_size is the discretization step size,
and noise is standard Gaussian noise.

Preconditioning can significantly improve convergence for poorly conditioned
problems by rescaling the parameter space. Here, we use an adaptive covariance
matrix as the preconditioning matrix M, which is updated during sampling.
We aim to converge to a fixed target acceptance rate of 0.574, as suggested
by theoretical results for MALA.

By default, the step size is set to sqrt((1/dim)^(1/3)), where dim is the
dimension of the target distribution, following Boisvert-Beaudry and Bedard (2022).
This scaling helps maintain efficient exploration as the dimensionality increases,
and leads to an asymptotically performant sampler, in the sense that exploring the
parameter space requires O(dim^(1/3)) steps.

For reference, see:

    * Roberts & Tweedie 1996.
Exponential convergence of Langevin distributions and their
discrete approximations
(https://doi.org/10.2307/3318418)
* Girolami & Calderhead 2011.
Riemann manifold Langevin and Hamiltonian Monte Carlo methods
(https://doi.org/10.1111/j.1467-9868.2010.00765.x)
* Boisvert-Beaudry and Bedard 2022.
MALA with annealed proposals: a generalization of locally and globally balanced proposal distributions
"""

def __init__(self, step_size: float = None, options: dict = None):
"""Initialize the sampler.

Parameters
----------
step_size:
Step size for the Langevin dynamics. If None, it will be set to
sqrt((1/n_parameter)^(1/3)) during initialization.
options:
Options for the sampler.
"""
super().__init__(options)
self.step_size = step_size
self._dimension_of_target_weight = 1.0

@classmethod
def default_options(cls):
"""Return the default options for the sampler."""
return {
# controls adaptation degeneration velocity of the proposals
# in [0, 1], with 0 -> no adaptation, i.e. classical
# Metropolis-Hastings
"decay_constant": 0.51,
# number of samples before adaptation decreases significantly.
# a higher value reduces the impact of early adaptation
"threshold_sample": 1,
# regularization factor for ill-conditioned cov matrices of
# the adapted proposal density.
"reg_factor": 1e-8,
# maximum number of attempts to regularize the covariance matrix
"max_tries": 10,
# initial covariance matrix. defaults to a unit matrix
"cov0": None,
# target acceptance rate
"target_acceptance_rate": 0.574,
# show progress
"show_progress": None,
}

def initialize(self, problem: Problem, x0: np.ndarray):
"""Initialize the sampler."""
# Check if gradient is available
if not problem.objective.has_grad:
raise ValueError("MALA sampler requires gradient information.")

# Boisvert-Beaudry and Bedard (2022)
# Set step size to problem dimension if not specified
if self.step_size is None:
self.step_size = np.sqrt((1 / problem.dim) ** (1 / 3))
logger.info(f"Step size set to {self.step_size}")
self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3)

super().initialize(problem, x0)
self._current_x_grad = -self.neglogpost.get_grad(x0)

def _propose_parameter(self, x: np.ndarray, beta: float):
"""Propose a parameter using preconditioned MALA dynamics."""
# Get gradient of log-posterior at current position
grad = -self.neglogpost.get_grad(x)
# todo: in parallel tempering, this should be the tempered gradient, but only the likelihood is tempered

# Apply preconditioning to gradient
precond_grad = self._cov @ (beta * grad)

# Generate standard Gaussian noise
noise = np.random.randn(len(x))

# Apply preconditioning to noise (via Cholesky decomposition)
precond_noise = self._cov_chol @ noise

# Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise
drift = (self.step_size / 2.0) * precond_grad
diffusion = np.sqrt(self.step_size) * precond_noise

x_new: np.ndarray = (
x + self._dimension_of_target_weight * drift + diffusion
)
return x_new, grad

def _compute_transition_log_prob(
self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray
):
"""Compute the log probability of transitioning from x_from to x_to with preconditioning."""
# Apply preconditioning to gradient
precond_grad_from = self._cov @ x_from_grad

# Mean of the preconditioned proposal distribution
mean = (
x_from
+ self._dimension_of_target_weight
* (self.step_size / 2.0)
* precond_grad_from
)

# For preconditioned MALA, the covariance is step_size * M
# We need to compute the log probability under N(mean, step_size * M)
diff = x_to - mean

# Use Cholesky decomposition for efficient computation
# We need to solve L @ L^T @ z = diff, i.e., z = M^{-1} @ diff
# This is done by first solving L @ y = diff, then L^T @ z = y
try:
# Forward substitution: L @ y = diff
y = np.linalg.solve(self._cov_chol, diff)
# Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y
log_prob = -0.5 * np.dot(y, y) / self.step_size
except np.linalg.LinAlgError:
# If matrix is singular, return -inf
return -np.inf

# Add normalization constant: -0.5 * log|2π * step_size * M|
# = -0.5 * (d * log(2π * step_size) + log|M|)
# log|M| = 2 * log|L| = 2 * sum(log(diag(L)))
d = len(x_from)
log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol)))
log_prob -= 0.5 * (
d * np.log(2 * np.pi * self.step_size) + log_det_cov
)

return log_prob
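To make the docstring's proposal and the Metropolis-Hastings correction concrete, here is a self-contained sketch of one MALA step on a standard-normal target (independent of pypesto; fixed step size, identity preconditioner `M`, no adaptation). In the sampler above, the identity is replaced by the adaptive covariance `self._cov` (applied to the noise via its Cholesky factor `self._cov_chol`), and the proposal density is evaluated by `_compute_transition_log_prob`.

```python
import numpy as np

def log_p(x):
    """Unnormalized log-density of the target, here a standard normal."""
    return -0.5 * np.dot(x, x)

def grad_log_p(x):
    """Gradient of the log-density."""
    return -x

def log_q(x_to, x_from, h):
    # Log-density (up to a constant) of the proposal
    # N(x_from + (h/2) * grad_log_p(x_from), h * I); the normalization
    # cancels in the acceptance ratio because the same step size and
    # preconditioner are used in both directions.
    diff = x_to - (x_from + 0.5 * h * grad_log_p(x_from))
    return -0.5 * np.dot(diff, diff) / h

def mala_step(x, h, rng):
    # proposal: x + (h/2) * grad_log_p(x) + sqrt(h) * noise
    x_new = x + 0.5 * h * grad_log_p(x) + np.sqrt(h) * rng.standard_normal(x.shape)
    # Metropolis-Hastings correction for the asymmetric proposal
    log_alpha = (log_p(x_new) - log_p(x)) + (log_q(x, x_new, h) - log_q(x_new, x, h))
    if np.log(rng.uniform()) < log_alpha:
        return x_new, True
    return x, False

rng = np.random.default_rng(0)
x, h, accepted = np.array([3.0]), 0.5, 0
for _ in range(5000):
    x, acc = mala_step(x, h, rng)
    accepted += acc
print("acceptance rate:", accepted / 5000)
```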