diff --git a/doc/example/sampler_study.ipynb b/doc/example/sampler_study.ipynb
index 002c71491..7101f2ec7 100644
--- a/doc/example/sampler_study.ipynb
+++ b/doc/example/sampler_study.ipynb
@@ -249,7 +249,7 @@
    "outputs": [],
    "source": [
     "import seaborn as sns\n",
-    "from scipy.stats import multivariate_normal\n",
+    "from scipy.stats import multivariate_normal, norm\n",
     "\n",
     "import pypesto\n",
     "import pypesto.sample as sample\n",
@@ -266,7 +266,26 @@
     "    return -np.log(density(x))\n",
     "\n",
     "\n",
-    "objective = pypesto.Objective(fun=nllh)\n",
+    "def nllh_grad(x):\n",
+    "    w = np.array([0.3, 0.7])\n",
+    "    mus = np.array([-1.5, 2.5])\n",
+    "    sigmas2 = np.array([0.1, 0.2])\n",
+    "    sigmas = np.sqrt(sigmas2)\n",
+    "\n",
+    "    pdfs = np.array([w[i] * norm.pdf(x, mus[i], sigmas[i]) for i in range(2)])\n",
+    "    dpdx = np.sum(\n",
+    "        [\n",
+    "            w[i] * norm.pdf(x, mus[i], sigmas[i]) * (mus[i] - x) / sigmas2[i]\n",
+    "            for i in range(2)\n",
+    "        ],\n",
+    "        axis=0,\n",
+    "    )\n",
+    "\n",
+    "    grad = -dpdx / np.sum(pdfs, axis=0)\n",
+    "    return grad\n",
+    "\n",
+    "\n",
+    "objective = pypesto.Objective(fun=nllh, grad=nllh_grad)\n",
     "problem = pypesto.Problem(objective=objective, lb=-4, ub=5, x_names=[\"x\"])"
    ]
   },
@@ -498,6 +517,43 @@
     "result.sample_result.betas"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "### Metropolis-Adjusted Langevin Algorithm\n"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "So far, we have used only the function values to guide the sampling. If gradients are available, they can be used to guide the sampling more effectively. The `pypesto.sample.MetropolisAdjustedLangevinAlgorithm` implements this idea. It can be used within the parallel tempering framework as well."
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sampler = sample.AdaptiveParallelTemperingSampler(\n",
+    "    internal_sampler=sample.MalaSampler(), n_chains=3\n",
+    ")\n",
+    "result = sample.sample(\n",
+    "    problem, 1e3, sampler, x0=np.array([0.5]), filename=None\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample.geweke_test(result)\n",
+    "for i_chain in range(len(result.sample_result.betas)):\n",
+    "    visualize.sampling_1d_marginals(\n",
+    "        result, i_chain=i_chain, suptitle=f\"Chain: {i_chain}\"\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/pypesto/sample/__init__.py b/pypesto/sample/__init__.py
index a55740308..0d9f90a22 100644
--- a/pypesto/sample/__init__.py
+++ b/pypesto/sample/__init__.py
@@ -17,6 +17,7 @@
     laplace_approximation_log_evidence,
     parallel_tempering_log_evidence,
 )
+from .mala import MalaSampler
 from .metropolis import MetropolisSampler
 from .parallel_tempering import ParallelTemperingSampler
 from .pymc import PymcSampler
diff --git a/pypesto/sample/adaptive_metropolis.py b/pypesto/sample/adaptive_metropolis.py
index c0b9e13dd..00e73c473 100644
--- a/pypesto/sample/adaptive_metropolis.py
+++ b/pypesto/sample/adaptive_metropolis.py
@@ -1,3 +1,4 @@
+import logging
 import numbers
 
 import numpy as np
@@ -62,6 +63,7 @@ def __init__(self, options: dict = None):
         self._mean_hist = None
         self._cov_hist = None
         self._cov_scale = None
+        self._cov_chol = None
 
     @classmethod
     def default_options(cls):
@@ -75,11 +77,10 @@ def default_options(cls):
             # a higher value reduces the impact of early adaptation
             "threshold_sample": 1,
             # regularization factor for ill-conditioned cov matrices of
-            # the adapted proposal density. regularization might happen if the
-            # eigenvalues of the cov matrix strongly differ in order
-            # of magnitude. in this case, the algorithm adds a small
-            # diag matrix to the cov matrix with elements of this factor
-            "reg_factor": 1e-6,
+            # the adapted proposal density.
+            "reg_factor": 1e-8,
+            # maximum number of attempts to regularize the covariance matrix
+            "max_tries": 10,
             # initial covariance matrix. defaults to a unit matrix
             "cov0": None,
             # target acceptance rate
@@ -98,14 +99,18 @@ def initialize(self, problem: Problem, x0: np.ndarray):
                 cov0 = float(cov0) * np.eye(len(x0))
         else:
             cov0 = np.eye(len(x0))
-        self._cov = regularize_covariance(cov0, self.options["reg_factor"])
+        self._cov_chol, self._cov = regularize_covariance(
+            cov0,
+            reg_factor=self.options["reg_factor"],
+            max_tries=self.options["max_tries"],
+        )
         self._mean_hist = self.trace_x[-1]
         self._cov_hist = self._cov
         self._cov_scale = 1.0
 
-    def _propose_parameter(self, x: np.ndarray):
-        x_new = np.random.multivariate_normal(x, self._cov)
-        return x_new
+    def _propose_parameter(self, x: np.ndarray, beta: float):
+        x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
+        return x_new, np.nan  # no gradient needed
 
     def _update_proposal(
         self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int
@@ -114,6 +119,7 @@ def _update_proposal(
         decay_constant = self.options["decay_constant"]
         threshold_sample = self.options["threshold_sample"]
         reg_factor = self.options["reg_factor"]
+        max_tries = self.options["max_tries"]
         target_acceptance_rate = self.options["target_acceptance_rate"]
 
         # compute historical mean and covariance
@@ -136,7 +142,9 @@ def _update_proposal(
         self._cov = self._cov_scale * self._cov_hist
 
         # regularize proposal covariance
-        self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor)
+        self._cov_chol, self._cov = regularize_covariance(
+            cov=self._cov, reg_factor=reg_factor, max_tries=max_tries
+        )
 
 
 def update_history_statistics(
@@ -186,7 +194,11 @@ def update_history_statistics(
     return mean, cov
 
 
-def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
+def regularize_covariance(
+    cov: np.ndarray,
+    reg_factor: float,
+    max_tries: int,
+) -> tuple[np.ndarray, np.ndarray]:
     """
     Regularize the estimated covariance matrix of the sample.
 
@@ -199,14 +211,49 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
         Estimate of the covariance matrix of the sample.
     reg_factor:
         Regularization factor. Larger values result in stronger regularization.
+    max_tries:
+        Maximum number of attempts to regularize the covariance matrix.
 
     Returns
     -------
+    L:
+        Cholesky decomposition of the regularized covariance matrix.
     cov:
         Regularized estimate of the covariance matrix of the sample.
     """
-    eig = np.linalg.eigvals(cov)
-    eig_min = min(eig)
-    if eig_min <= 0:
-        cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
-    return cov
+    # we symmetrize cov first
+    cov = 0.5 * (cov + cov.T)
+
+    d = cov.shape[0]
+    eye = np.eye(d)
+    if not np.all(np.isfinite(cov)):
+        s = np.nanmean(np.diag(cov))
+        s = 1.0 if not np.isfinite(s) or s <= 0 else s
+        return np.sqrt(s) * eye, s * eye
+
+    # Try Cholesky on original matrix first
+    try:
+        L = np.linalg.cholesky(cov)
+        return L, cov
+    except np.linalg.LinAlgError:
+        pass  # Need regularization
+
+    # scale for the initial jitter
+    s = np.mean(np.diag(cov))
+    s = 1.0 if not np.isfinite(s) or s <= 0 else s
+
+    jitter = reg_factor * s
+    for _ in range(max_tries):
+        try:
+            new_cov = cov + jitter * eye
+            L = np.linalg.cholesky(new_cov)
+            return L, new_cov
+        except np.linalg.LinAlgError:
+            jitter *= 2.0
+    logging.warning(
+        "Could not regularize covariance matrix. Falling back to "
+        "scaled identity."
+    )
+
+    # final safe fallback
+    return np.sqrt(s) * eye, s * eye
diff --git a/pypesto/sample/mala.py b/pypesto/sample/mala.py
new file mode 100644
index 000000000..a74ebbd43
--- /dev/null
+++ b/pypesto/sample/mala.py
@@ -0,0 +1,170 @@
+import logging
+
+import numpy as np
+
+from ..problem import Problem
+from .adaptive_metropolis import AdaptiveMetropolisSampler
+
+logger = logging.getLogger(__name__)
+
+
+class MalaSampler(AdaptiveMetropolisSampler):
+    """Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.
+
+    MALA is a gradient-based MCMC method that uses the gradient of the
+    log-posterior to guide the proposal distribution. This allows for
+    more efficient exploration of the parameter space compared to
+    standard random-walk Metropolis-Hastings.
+
+    The proposal distribution is:
+    x_new = x + (step_size / 2) * M * grad_log_p(x) + sqrt(step_size) * sqrt(M) * noise
+
+    where grad_log_p(x) is the gradient of the log-posterior at x,
+    M is the preconditioning matrix, step_size is the discretization step size,
+    and noise is standard Gaussian noise.
+
+    Preconditioning can significantly improve convergence for poorly conditioned
+    problems by rescaling the parameter space. Here, we use an adaptive covariance
+    matrix as the preconditioning matrix M, which is updated during sampling.
+    We aim to converge to a fixed target acceptance rate of 0.574, as suggested
+    by theoretical results for MALA.
+
+    By default, the step size is set to sqrt((1/dim)^(1/3)), where dim is the
+    dimension of the target distribution, following Boisvert-Beaudry and Bedard (2022).
+    This scaling helps maintain efficient exploration as the dimensionality increases,
+    and leads to an asymptotically performant sampler, in the sense that exploring the
+    parameter space requires O(dim^(1/3)) steps.
+
+    For reference, see:
+
+    * Roberts et al. 1996.
+      Exponential convergence of Langevin distributions and their
+      discrete approximations
+      (https://doi.org/10.2307/3318418)
+    * Girolami & Calderhead 2011.
+      Riemann manifold Langevin and Hamiltonian Monte Carlo methods
+      (https://doi.org/10.1111/j.1467-9868.2010.00765.x)
+    * Boisvert-Beaudry and Bedard 2022.
+      MALA with annealed proposals: a generalization of locally and globally balanced proposal distributions
+    """
+
+    def __init__(self, step_size: float = None, options: dict = None):
+        """Initialize the sampler.
+
+        Parameters
+        ----------
+        step_size:
+            Step size for the Langevin dynamics. If None, it will be set to
+            sqrt((1/n_parameter)^(1/3)) during initialization.
+        options:
+            Options for the sampler.
+        """
+        super().__init__(options)
+        self.step_size = step_size
+        self._dimension_of_target_weight = 1.0
+
+    @classmethod
+    def default_options(cls):
+        """Return the default options for the sampler."""
+        return {
+            # controls adaptation degeneration velocity of the proposals
+            # in [0, 1], with 0 -> no adaptation, i.e. classical
+            # Metropolis-Hastings
+            "decay_constant": 0.51,
+            # number of samples before adaptation decreases significantly.
+            # a higher value reduces the impact of early adaptation
+            "threshold_sample": 1,
+            # regularization factor for ill-conditioned cov matrices of
+            # the adapted proposal density.
+            "reg_factor": 1e-8,
+            # maximum number of attempts to regularize the covariance matrix
+            "max_tries": 10,
+            # initial covariance matrix. defaults to a unit matrix
+            "cov0": None,
+            # target acceptance rate
+            "target_acceptance_rate": 0.574,
+            # show progress
+            "show_progress": None,
+        }
+
+    def initialize(self, problem: Problem, x0: np.ndarray):
+        """Initialize the sampler."""
+        # Check if gradient is available
+        if not problem.objective.has_grad:
+            raise ValueError("MALA sampler requires gradient information.")
+
+        # Boisvert-Beaudry and Bedard (2022)
+        # Set step size to problem dimension if not specified
+        if self.step_size is None:
+            self.step_size = np.sqrt((1 / problem.dim) ** (1 / 3))
+            logger.info(f"Step size set to {self.step_size}")
+        self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3)
+
+        super().initialize(problem, x0)
+        self._current_x_grad = -self.neglogpost.get_grad(x0)
+
+    def _propose_parameter(self, x: np.ndarray, beta: float):
+        """Propose a parameter using preconditioned MALA dynamics."""
+        # Get gradient of log-posterior at current position
+        grad = -self.neglogpost.get_grad(x)
+        # todo: in parallel tempering, this should be the tempered gradient, but only the likelihood is tempered
+
+        # Apply preconditioning to gradient
+        precond_grad = self._cov @ (beta * grad)
+
+        # Generate standard Gaussian noise
+        noise = np.random.randn(len(x))
+
+        # Apply preconditioning to noise (via Cholesky decomposition)
+        precond_noise = self._cov_chol @ noise
+
+        # Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise
+        drift = (self.step_size / 2.0) * precond_grad
+        diffusion = np.sqrt(self.step_size) * precond_noise
+
+        x_new: np.ndarray = (
+            x + self._dimension_of_target_weight * drift + diffusion
+        )
+        return x_new, grad
+
+    def _compute_transition_log_prob(
+        self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray
+    ):
+        """Compute the log probability of transitioning from x_from to x_to with preconditioning."""
+        # Apply preconditioning to gradient
+        precond_grad_from = self._cov @ x_from_grad
+
+        # Mean of the preconditioned proposal distribution
+        mean = (
+            x_from
+            + self._dimension_of_target_weight
+            * (self.step_size / 2.0)
+            * precond_grad_from
+        )
+
+        # For preconditioned MALA, the covariance is step_size * M
+        # We need to compute the log probability under N(mean, step_size * M)
+        diff = x_to - mean
+
+        # Use Cholesky decomposition for efficient computation
+        # We need to solve L @ L^T @ z = diff, i.e., z = M^{-1} @ diff
+        # This is done by first solving L @ y = diff, then L^T @ z = y
+        try:
+            # Forward substitution: L @ y = diff
+            y = np.linalg.solve(self._cov_chol, diff)
+            # Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y
+            log_prob = -0.5 * np.dot(y, y) / self.step_size
+        except np.linalg.LinAlgError:
+            # If matrix is singular, return -inf
+            return -np.inf
+
+        # Add normalization constant: -0.5 * log|2π * step_size * M|
+        # = -0.5 * (d * log(2π * step_size) + log|M|)
+        # log|M| = 2 * log|L| = 2 * sum(log(diag(L)))
+        d = len(x_from)
+        log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol)))
+        log_prob -= 0.5 * (
+            d * np.log(2 * np.pi * self.step_size) + log_det_cov
+        )
+
+        return log_prob
diff --git a/pypesto/sample/metropolis.py b/pypesto/sample/metropolis.py
index 7213dfcc3..0fc13ffb5 100644
--- a/pypesto/sample/metropolis.py
+++ b/pypesto/sample/metropolis.py
@@ -46,6 +46,7 @@ def __init__(self, options: dict = None):
         self.trace_neglogpost: Union[Sequence[float], None] = None
         self.trace_neglogprior: Union[Sequence[float], None] = None
         self.temper_lpost: bool = False
+        self._current_x_grad: Union[np.ndarray] = np.nan
 
     @classmethod
     def default_options(cls):
@@ -111,7 +112,7 @@ def _perform_step(
         Propose new parameter, evaluate and check whether to accept.
         """
         # propose step
-        x_new: np.ndarray = self._propose_parameter(x)
+        x_new, x_new_grad = self._propose_parameter(x, beta=beta)
 
         # check if step lies within bounds
         if any(x_new < self.problem.lb) or any(x_new > self.problem.ub):
@@ -151,6 +152,16 @@ def _perform_step(
             # log acceptance probability (temper log posterior)
             log_p_acc = min(beta * (lpost_new - lpost), 0)
 
+        # This accounts for an asymmetric proposal distribution
+        log_q_forward = self._compute_transition_log_prob(
+            x, x_new, beta * self._current_x_grad
+        )
+        log_q_backward = self._compute_transition_log_prob(
+            x_new, x, beta * x_new_grad
+        )
+        proposal_correction = log_q_backward - log_q_forward
+        log_p_acc = min(log_p_acc + proposal_correction, 0)
+
         # flip a coin
         u = np.random.uniform(0, 1)
 
@@ -160,6 +171,7 @@ def _perform_step(
             x = x_new
             lpost = lpost_new
             lprior = lprior_new
+            self._current_x_grad = x_new_grad
 
         # update proposal
         self._update_proposal(
@@ -168,16 +180,28 @@ def _perform_step(
 
         return x, lpost, lprior
 
-    def _propose_parameter(self, x: np.ndarray):
+    def _propose_parameter(self, x: np.ndarray, beta: float):
         """Propose a step."""
         x_new: np.ndarray = x + self.options["std"] * np.random.randn(len(x))
-        return x_new
+        return x_new, np.nan  # no gradient needed
 
     def _update_proposal(
         self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int
     ):
         """Update the proposal density. Default: Do nothing."""
 
+    def _compute_transition_log_prob(
+        self,
+        x_from: np.ndarray,
+        x_to: np.ndarray,
+        x_from_grad: Union[np.ndarray, None],
+    ):
+        """Compute the transition log probability for symmetric proposal.
+
+        For a symmetric proposal distribution, this is zero.
+        """
+        return 0.0
+
     def get_last_sample(self) -> InternalSample:
         """Get the last sample in the chain.
 
diff --git a/test/sample/test_sample.py b/test/sample/test_sample.py
index 7d9f8e4dd..6d78e7651 100644
--- a/test/sample/test_sample.py
+++ b/test/sample/test_sample.py
@@ -32,6 +32,7 @@
     gaussian_nllh_grad,
     gaussian_nllh_hess,
     gaussian_problem,
+    gaussian_problem_with_grad,
     negative_log_posterior,
     negative_log_prior,
     rosenbrock_problem,
@@ -47,6 +48,7 @@
         "Pymc",
         "Emcee",
         "Dynesty",
+        "Mala",
     ]
 )
 def sampler(request):
@@ -62,6 +64,12 @@ def sampler(request):
                 "show_progress": False,
             },
         )
+    elif request.param == "Mala":
+        return sample.MalaSampler(
+            options={
+                "show_progress": False,
+            },
+        )
     elif request.param == "ParallelTempering":
         return sample.ParallelTemperingSampler(
             internal_sampler=sample.MetropolisSampler(),
@@ -91,11 +99,15 @@ def sampler(request):
         )
 
 
-@pytest.fixture(params=["gaussian", "gaussian_mixture", "rosenbrock"])
+@pytest.fixture(
+    params=["gaussian", "gaussian_with_grad", "gaussian_mixture", "rosenbrock"]
+)
 def problem(request):
     if request.param == "gaussian":
         return gaussian_problem()
-    if request.param == "gaussian_mixture":
+    elif request.param == "gaussian_with_grad":
+        return gaussian_problem_with_grad()
+    elif request.param == "gaussian_mixture":
         return gaussian_mixture_problem()
     elif request.param == "rosenbrock":
         return rosenbrock_problem()
@@ -103,6 +115,14 @@ def problem(request):
 
 def test_pipeline(sampler, problem):
     """Check that a typical pipeline runs through."""
+    if isinstance(sampler, sample.MalaSampler):
+        if not problem.objective.has_grad:
+            pytest.skip("MALA requires gradient information.")
+    if isinstance(sampler, sample.EmceeSampler):
+        if problem.objective.has_grad:
+            pytest.skip(
+                "EMCEE fails to converge when started in optimal point with gradient."
+            )
     # optimization
     optimizer = optimize.ScipyOptimizer(options={"maxiter": 10})
     result = optimize.minimize(
@@ -202,7 +222,9 @@ def test_regularize_covariance():
     """
     matrix = np.array([[-1.0, -4.0], [-4.0, 1.0]])
     assert np.any(np.linalg.eigvals(matrix) < 0)
-    reg = sample.adaptive_metropolis.regularize_covariance(matrix, 1e-6)
+    reg = sample.adaptive_metropolis.regularize_covariance(
+        matrix, 1e-6, max_tries=1
+    )
     assert np.all(np.linalg.eigvals(reg) >= 0)
 
 
diff --git a/test/sample/util.py b/test/sample/util.py
index b191962bd..268f472bc 100644
--- a/test/sample/util.py
+++ b/test/sample/util.py
@@ -35,7 +35,7 @@ def gaussian_llh(x):
 
 def gaussian_nllh_grad(x):
     """Negative log-likelihood gradient for Gaussian."""
-    return np.array([((x - MU) / (SIGMA**2))])
+    return (x - MU) / (SIGMA**2)
 
 
 def gaussian_nllh_hess(x):
@@ -56,6 +56,19 @@ def nllh(x):
     return problem
 
 
+def gaussian_problem_with_grad():
+    """Defines a simple Gaussian problem."""
+
+    def nllh(x):
+        return -gaussian_llh(x)
+
+    objective = pypesto.Objective(fun=nllh, grad=gaussian_nllh_grad)
+    problem = pypesto.Problem(
+        objective=objective, lb=LB_GAUSSIAN, ub=UB_GAUSSIAN
+    )
+    return problem
+
+
 def gaussian_mixture_llh(x):
     """Log-likelihood for Gaussian mixture model."""
     return np.log(