summary (#91)
* Implement Summary class.

* Need to apply the test to zero

* More tests.

* Raise if column names include the word 'intercept'

* Test Summary.__format__ and __str__.

* Changelog.

* Space after colon

* One last space after colon

* Reverse that (undo the previous change).

* E231 also applies to ','; use noqas.

* Deal with it later.
mlondschien authored Jul 9, 2024
1 parent 549c071 commit f483762
Showing 11 changed files with 303 additions and 29 deletions.
1 change: 0 additions & 1 deletion .flake8
@@ -1,4 +1,3 @@
-# Taken directly from https://github.com/ambv/black/blob/master/.flake8
[flake8]
ignore = E203, E266, E501, W503, C901, D104, D100, D301, W604
max-line-length = 88
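For context on the commit message above ("E231 is also for ',', use noqas"): rather than adding E231 to the global ignore list, an offending line can be silenced in place. A minimal sketch — the variable name is hypothetical, not from this commit:

    # E231 flags missing whitespace after ',' (and after ':'); a targeted
    # suppression keeps the global flake8 config clean:
    matrix[1,2] = 0  # noqa: E231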
7 changes: 7 additions & 0 deletions CHANGELOG.rst
@@ -12,10 +12,17 @@ Changelog

- New class :class:`~ivmodels.confidence_sets.ConfidenceSet`.

+- New class :class:`~ivmodels.summary.Summary` holding information about the model fit.
+
+- New method :func:`~ivmodels.models.kclass.KClass.summary` to create a summary of the
+  model fit.
+
+- The :class:`~ivmodels.models.kclass.KClass` gets new attributes after fitting a model:
+  `endogenous_names_`, `exogenous_names_`, and `instrument_names_`. If pandas is
+  installed, there is also `named_coefs_`.

**Other changes:**

- The function :func:`~ivmodels.tests.lagrange_multiplier_test` is now slightly faster.
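A hedged usage sketch of the new API described in the changelog above — the data is synthetic and the choices of `kappa`, `test`, and `alpha` are illustrative, not prescribed by the commit:

    import numpy as np
    from ivmodels import KClass

    rng = np.random.RandomState(0)
    Z = rng.normal(size=(100, 3))                                 # instruments
    X = Z @ rng.normal(size=(3, 2)) + rng.normal(size=(100, 2))   # endogenous regressors
    y = X @ np.array([1.0, -1.0]) + rng.normal(size=100)

    model = KClass(kappa="liml")
    summary = model.summary(X, y, Z=Z, test="anderson-rubin", alpha=0.05)
    print(summary)  # per-coefficient estimates, statistics, p-values, confidence sets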
38 changes: 38 additions & 0 deletions ivmodels/models/kclass.py
@@ -5,6 +5,7 @@
import scipy
from glum import GeneralizedLinearRegressor

+from ivmodels.summary import Summary
from ivmodels.utils import oproj, proj, to_numpy

try:
@@ -357,6 +358,11 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
        (X, Z, C), names = self._X_Z_C(X, Z, C, predict=False)
        self.endogenous_names_, self.instrument_names_, self.exogenous_names_ = names

+        if self.fit_intercept and any("intercept" in n for n in names):
+            raise ValueError(
+                "Variable names must not contain 'intercept' when fit_intercept=True."
+            )
+
        n, mx = X.shape

        # Including an intercept is equivalent to replacing y <- M_1 y, X <- M_1 X,
@@ -476,6 +482,38 @@ def predict(self, X, C=None, *args, **kwargs):  # noqa D
        (X, _, C), _ = self._X_Z_C(X, C=C, Z=None, predict=True)
        return super().predict(np.hstack([X, C]), *args, **kwargs)

+    def summary(self, X, y, Z=None, C=None, test="wald", alpha=0.05, **kwargs):
+        """
+        Create a Summary object for the fitted model.
+
+        This contains the fitted values (estimates), subvector test statistics for
+        each parameter, corresponding p-values, and confidence sets.
+
+        Parameters
+        ----------
+        X: array-like, shape (n_samples, n_features)
+            The input data.
+        y: array-like, shape (n_samples,)
+            The target values.
+        Z: array-like, shape (n_samples, n_instruments), optional
+            The instrument values. If ``instrument_names`` or ``instrument_regex`` are
+            specified, ``Z`` must be ``None``. If ``Z`` is specified,
+            ``instrument_names`` and ``instrument_regex`` must be ``None``.
+        C: array-like, shape (n_samples, n_exogenous), optional
+            The exogenous regressors. If ``exogenous_names`` or ``exogenous_regex``
+            are specified, ``C`` must be ``None``. If ``C`` is specified,
+            ``exogenous_names`` and ``exogenous_regex`` must be ``None``.
+        test: str, optional, default = "wald"
+            The test to use. Must be one of ``"wald"``, ``"anderson-rubin"``,
+            ``"lagrange multiplier"``, ``"likelihood-ratio"``, or
+            ``"conditional likelihood-ratio"``.
+        alpha: float, optional, default = 0.05
+            The significance level.
+        **kwargs
+            Additional keyword arguments to pass to the test and its inversion.
+        """
+        summary = Summary(kclass=self, test=test, alpha=alpha)
+        return summary.fit(X, y, Z=Z, C=C, **kwargs)


class KClass(KClassMixin, GeneralizedLinearRegressor):
"""K-class estimator for instrumental variable regression.
Expand Down
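A sketch of the new guard in ``fit``, assuming a pandas DataFrame is used so that column names reach the estimator; the frame, column names, and sizes here are hypothetical:

    import numpy as np
    import pandas as pd
    from ivmodels import KClass

    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["intercept", "x1", "z1"])
    y = rng.normal(size=50)

    model = KClass(kappa="tsls", instrument_names=["z1"])  # fit_intercept=True by default
    try:
        model.fit(df, y)  # raises: a column is named "intercept"
    except ValueError as e:
        print(e)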
29 changes: 12 additions & 17 deletions ivmodels/simulate.py
@@ -163,22 +163,19 @@ def simulate_gaussian_iv(
        True gamma. Only returned if ``return_gamma`` is True.
    """
    rng = np.random.RandomState(seed)
-    beta = rng.normal(0, 1, (mx, 1))

+    m = mx + mw
    if u is None:
-        u = mx
+        u = m

-    ux = rng.normal(0, 1, (u, mx))
+    ux = rng.normal(0, 1, (u, m))
    uy = rng.normal(0, 1, (u, 1))
-    uw = rng.normal(0, 1, (u, mw))

    alpha = rng.normal(0, 1, (mc, 1))
-    gamma = rng.normal(0, 1, (mw, 1))
+    beta = rng.normal(0, 1, (m, 1))

-    Pi_ZX = rng.normal(0, 1, (k, mx))
-    Pi_ZW = rng.normal(0, 1, (k, mw))
-    Pi_CX = rng.normal(0, 1, (mc, mx))
-    Pi_CW = rng.normal(0, 1, (mc, mw))
+    Pi_ZX = rng.normal(0, 1, (k, m))
+    Pi_CX = rng.normal(0, 1, (mc, m))
    Pi_CZ = rng.normal(0, 1, (mc, k))

    U = rng.normal(0, 1, (n, u))
@@ -191,17 +188,15 @@
    )

    X = Z @ Pi_ZX + C @ Pi_CX + U @ ux
-    X += rng.normal(0, 1, (n, mx)) + include_intercept * rng.normal(0, 1, (1, mx))
-    W = Z @ Pi_ZW + C @ Pi_CW + U @ uw
-    W += rng.normal(0, 1, (n, mw)) + include_intercept * rng.normal(0, 1, (1, mw))
-    y = C @ alpha + X @ beta + W @ gamma + U @ uy
+    X += rng.normal(0, 1, (n, m)) + include_intercept * rng.normal(0, 1, (1, m))
+    y = C @ alpha + X @ beta + U @ uy
    y += rng.normal(0, 1, (n, 1)) + include_intercept * rng.normal(0, 1, (1, 1))

    if return_beta and return_gamma:
-        return Z, X, y.flatten(), C, W, beta.flatten(), gamma.flatten()
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[:mx], beta[mx:]
    elif return_beta:
-        return Z, X, y.flatten(), C, W, beta.flatten()
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[:mx]
    elif return_gamma:
-        return Z, X, y.flatten(), C, W, gamma.flatten()
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[mx:]
    else:
-        return Z, X, y.flatten(), C, W
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:]
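The refactor above draws the regressors of interest X and the nuisance endogenous regressors W from a single design of width m = mx + mw, then splits columns and coefficients on return. A small sketch of the invariant this relies on, with made-up dimensions:

    import numpy as np

    n, mx, mw = 100, 2, 1
    m = mx + mw
    X_full = np.random.default_rng(0).normal(size=(n, m))
    beta_full = np.ones((m, 1))

    X, W = X_full[:, :mx], X_full[:, mx:]          # columns split as in the return values
    beta, gamma = beta_full[:mx], beta_full[mx:]   # coefficients split the same way
    assert np.allclose(X @ beta + W @ gamma, X_full @ beta_full)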
207 changes: 207 additions & 0 deletions ivmodels/summary.py
@@ -0,0 +1,207 @@
from functools import partial

import numpy as np

from ivmodels.confidence_set import ConfidenceSet
from ivmodels.quadric import Quadric


class Summary:
    """Class containing summary statistics for a fitted model.

    Parameters
    ----------
    kclass: ivmodels.KClass or child class of ivmodels.models.kclass.KClassMixin
        Fitted model.
    test: str
        Name of the test to be used. One of ``"wald"``, ``"anderson-rubin"``,
        ``"lagrange multiplier"``, ``"likelihood-ratio"``, or
        ``"conditional likelihood-ratio"``.
    alpha: float
        Significance level :math:`\\alpha` for the confidence sets, e.g., 0.05. The
        confidence level of the confidence sets will be :math:`1 - \\alpha`.
    """

    def __init__(self, kclass, test, alpha):
        from ivmodels import KClass  # avoid circular imports

        if not isinstance(kclass, KClass):
            raise ValueError("kclass must be an instance of ivmodels.KClass")

        self.kclass = kclass
        self.test = test
        self.alpha = alpha

    def fit(self, X, y, Z=None, C=None, *args, **kwargs):
        """
        Fit a summary.

        If ``instrument_names`` or ``instrument_regex`` are specified, ``X`` must be a
        pandas DataFrame containing columns ``instrument_names`` and ``Z`` must be
        ``None``. At least one of ``Z``, ``instrument_names``, and
        ``instrument_regex`` must be specified.

        If ``exogenous_names`` or ``exogenous_regex`` are specified, ``X`` must be a
        pandas DataFrame containing columns ``exogenous_names`` and ``C`` must be
        ``None``.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            The training input samples. If ``instrument_names`` or
            ``instrument_regex`` are specified, ``X`` must be a pandas DataFrame
            containing columns ``instrument_names``.
        y: array-like, shape (n_samples,)
            The target values.
        Z: array-like, shape (n_samples, n_instruments), optional
            The instrument values. If ``instrument_names`` or ``instrument_regex`` are
            specified, ``Z`` must be ``None``. If ``Z`` is specified,
            ``instrument_names`` and ``instrument_regex`` must be ``None``.
        C: array-like, shape (n_samples, n_exogenous), optional
            The exogenous regressors. If ``exogenous_names`` or ``exogenous_regex``
            are specified, ``C`` must be ``None``. If ``C`` is specified,
            ``exogenous_names`` and ``exogenous_regex`` must be ``None``.
        """
        if not hasattr(self.kclass, "named_coefs_"):
            self.kclass.fit(X, y, Z=Z, C=C)

        import ivmodels.tests as tests  # avoid circular imports

        _TESTS = {
            "wald": (
                partial(tests.wald_test, estimator=self.kclass.kappa),
                partial(tests.inverse_wald_test, estimator=self.kclass.kappa),
            ),
            "anderson-rubin": (
                tests.anderson_rubin_test,
                tests.inverse_anderson_rubin_test,
            ),
            "lagrange multiplier": (
                tests.lagrange_multiplier_test,
                tests.inverse_lagrange_multiplier_test,
            ),
            "likelihood-ratio": (
                tests.likelihood_ratio_test,
                tests.inverse_likelihood_ratio_test,
            ),
            "conditional likelihood-ratio": (
                tests.conditional_likelihood_ratio_test,
                tests.inverse_conditional_likelihood_ratio_test,
            ),
        }

        if str(self.test).lower() not in _TESTS:
            raise ValueError(f"Test {self.test} not recognized.")

        test_, inverse_test_ = _TESTS.get(str(self.test).lower())

        self.estimates_ = self.kclass.coef_.tolist()
        self.statistics_ = list()
        self.p_values_ = list()
        self.confidence_sets_ = list()

        (X, Z, C), _ = self.kclass._X_Z_C(X, Z, C, predict=False)

        self.feature_names_ = self.kclass.endogenous_names_
        # + self.kclass.exogenous_names_

        idx = 0
        # if self.kclass.fit_intercept:
        #     self.feature_names_ = ["intercept"] + self.feature_names_
        #     self.estimates_ = [self.kclass.intercept_] + self.estimates_
        #     idx -= 1
        for name in self.feature_names_:
            if name in self.kclass.endogenous_names_:
                mask = np.zeros(len(self.kclass.endogenous_names_), dtype=bool)
                mask[idx] = True

                X_, W_, C_, Z_ = X[:, mask], X[:, ~mask], C, Z
                fit_intercept_ = True

            elif name in self.kclass.exogenous_names_:
                mask = np.zeros(len(self.kclass.exogenous_names_), dtype=bool)
                mask[idx - len(self.kclass.endogenous_names_)] = True

                X_, W_, C_, Z_ = C[:, mask], X, C[:, ~mask], np.hstack([Z, C[:, mask]])
                fit_intercept_ = True

            elif name == "intercept":
                ones = np.ones((X.shape[0], 1))
                X_, W_, C_, Z_ = ones, X, C, np.hstack([Z, ones])
                fit_intercept_ = False

            test_result = test_(
                Z=Z_,
                X=X_,
                W=W_,
                y=y,
                C=C_,
                beta=np.array([0]),
                fit_intercept=fit_intercept_,
                **kwargs,
            )
            self.statistics_.append(test_result[0])
            self.p_values_.append(test_result[1])

            confidence_set = inverse_test_(
                Z=Z_,
                X=X_,
                W=W_,
                y=y,
                C=C_,
                alpha=self.alpha,
                fit_intercept=fit_intercept_,
                **kwargs,
            )

            if isinstance(confidence_set, Quadric):
                confidence_set = ConfidenceSet.from_quadric(confidence_set)

            self.confidence_sets_.append(confidence_set)
            idx += 1

        self.statistic_, self.p_value_ = test_(
            Z=Z, X=X, y=y, C=C, beta=np.zeros(X.shape[1]), fit_intercept=True
        )
        self.f_statistic_, self.f_p_value_ = tests.rank_test(
            Z, X, C=C, fit_intercept=True
        )
        return self

    def __format__(self, format_spec: str) -> str:  # noqa D
        if not hasattr(self, "estimates_"):
            return "Summary not fitted yet."

        def format_p_value(x):
            return f"{x:{format_spec}}" if x > 1e-16 else "<1e-16"

        estimate_str = [f"{e:{format_spec}}" for e in self.estimates_]
        statistics_str = [f"{s:{format_spec}}" for s in self.statistics_]
        p_values_str = [format_p_value(p) for p in self.p_values_]
        cis_str = [f"{cs:{format_spec}}" for cs in self.confidence_sets_]

        names_len = max(len(name) for name in self.feature_names_)
        coefs_len = max(max(len(e) for e in estimate_str), len("estimate"))
        statistics_len = max(max(len(s) for s in statistics_str), len("statistic"))
        p_values_len = max(max(len(p) for p in p_values_str), len("p-value"))
        cis_len = max(max(len(ci) for ci in cis_str), len("conf. set"))

        string = f"""Summary based on the {self.test} test.
{'': <{names_len}} {'estimate': >{coefs_len}} {'statistic': >{statistics_len}} {'p-value': >{p_values_len}} {'conf. set': >{cis_len}}
"""
        # total_len = names_len + statistics_len + coefs_len + p_values_len + cis_len + 4
        # string += "-" * total_len + "\n"

        for name, estimate, statistic, p_value, ci in zip(
            self.feature_names_, estimate_str, statistics_str, p_values_str, cis_str
        ):
            string += f"{name: <{names_len}} {estimate: >{coefs_len}} {statistic: >{statistics_len}} {p_value: >{p_values_len}} {ci: >{cis_len}}\n"

        string += f"""
Endogenous model statistic: {self.statistic_:{format_spec}}, p-value: {format_p_value(self.p_value_)}
(Multivariate) F-statistic: {self.f_statistic_:{format_spec}}, p-value: {format_p_value(self.f_p_value_)}"""
        return string

    def __str__(self):  # noqa D
        return f"{self:.4g}"
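Since ``__str__`` delegates to ``__format__`` with spec ``".4g"``, any standard format spec can be applied to a fitted summary. A sketch, reusing ``model``, ``X``, ``y``, and ``Z`` from the synthetic-data example after the changelog above; the spec values are illustrative:

    from ivmodels.summary import Summary

    summary = Summary(kclass=model, test="lagrange multiplier", alpha=0.05).fit(X, y, Z=Z)
    print(f"{summary:.2f}")  # two decimal places everywhere
    print(summary)           # equivalent to f"{summary:.4g}"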
6 changes: 3 additions & 3 deletions ivmodels/tests/lagrange_multiplier.py
@@ -213,16 +213,16 @@ def _derivative(gamma):

    objective = MemoizeJacHess(_derivative)
    jac = objective.derivative
-    hess = objective.hessian
+    # hess = objective.hessian

    res1 = scipy.optimize.minimize(
-        objective, jac=jac, hess=hess, x0=gamma_0, method="newton-cg"
+        objective, jac=jac, hess=None, x0=gamma_0, method="newton-cg"
    )

    res2 = scipy.optimize.minimize(
        objective,
        jac=jac,
-        hess=hess,
+        hess=None,
        method="newton-cg",
        x0=np.zeros_like(gamma_0),
    )
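The change above stops passing an explicit Hessian: with ``hess=None``, SciPy's Newton-CG builds Hessian-vector products from finite differences of the gradient, so only ``jac`` is required. A self-contained sketch of that behavior on a toy quadratic (not code from this commit):

    import numpy as np
    import scipy.optimize

    def f(x):
        return ((x - 1.0) ** 2).sum()

    def grad(x):
        return 2.0 * (x - 1.0)

    # Newton-CG requires jac; hess is optional and approximated when None.
    res = scipy.optimize.minimize(f, x0=np.zeros(3), jac=grad, hess=None, method="newton-cg")
    assert np.allclose(res.x, 1.0)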
2 changes: 1 addition & 1 deletion ivmodels/tests/utils.py
@@ -101,7 +101,7 @@ def _find_roots(f, a, b, tol, max_value, max_eval, n_points=50):
    closest to ``b``. Note that this is also not strictly ensured by this function.
    """
    if np.abs(b - a) < tol or max_eval < 0:
-        return b  # convervative
+        return b  # conservative
    if np.isinf(a):
        return a

4 changes: 2 additions & 2 deletions ivmodels/tests/wald.py
@@ -49,8 +49,8 @@ def wald_test(Z, X, y, beta, W=None, C=None, estimator="tsls", fit_intercept=Tru
        Exogenous regressors not of interest.
    beta: np.ndarray of dimension (mx,)
        Coefficients to test.
-    estimator: str, optional, default = "tsls"
-        Estimator to use. Must be one of ``"tsls"`` or ``"liml"``.
+    estimator: str or float, optional, default = "liml"
+        Estimator to use. Passed to ``kappa`` argument of ``KClass``.
    fit_intercept: bool, optional, default = True
        Whether to include an intercept. The intercept will be included both in the
        complete and the (restricted) model. Including an intercept is equivalent to
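With the docstring change above, ``estimator`` is forwarded to ``KClass``'s ``kappa``, so besides ``"tsls"`` and ``"liml"`` a numeric kappa should also be accepted. A hedged sketch, reusing ``Z``, ``X``, and ``y`` from the synthetic-data example after the changelog; not an official usage guarantee:

    import numpy as np
    from ivmodels.tests import wald_test

    # named estimators and raw kappa values should both work now
    stat, p_value = wald_test(Z, X, y, beta=np.zeros(2), estimator="liml")
    stat, p_value = wald_test(Z, X, y, beta=np.zeros(2), estimator=1.0)  # kappa = 1 is TSLS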
8 changes: 4 additions & 4 deletions tests/models/test_anchor_regression.py
@@ -76,10 +76,10 @@ def test_anchor_solution_minimizes_loss(n, mx, k, u, mc, gamma):
    ar = AnchorRegression(gamma=gamma, fit_intercept=False).fit(X, y, Z, C=C)

    def loss(beta):
-        return np.mean((y - np.hstack([X, C]) @ beta - ar.intercept_) ** 2) + (
-            gamma - 1
-        ) * np.mean(
+        return 1 / gamma * np.mean(
+            (y - np.hstack([X, C]) @ beta - ar.intercept_) ** 2
+        ) + (gamma - 1) / gamma * np.mean(
            proj(np.hstack([Z, C]), y - np.hstack([X, C]) @ beta - ar.intercept_) ** 2
        )

-    assert np.allclose(scipy.optimize.approx_fprime(ar.coef_, loss), 0, atol=1e-6)
+    assert np.allclose(scipy.optimize.approx_fprime(ar.coef_, loss), 0, atol=1e-5)
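The rewritten loss divides the anchor objective by gamma; scaling by a positive constant leaves the minimizer (and the zero-gradient check) unchanged while keeping the two terms' weights, 1/gamma and (gamma - 1)/gamma, bounded for large gamma. A quick numeric check of the scale-invariance claim, on a toy one-dimensional objective:

    import numpy as np
    import scipy.optimize

    gamma = 5.0
    f = lambda b: (b - 2.0) ** 2 + (gamma - 1.0) * (b - 3.0) ** 2
    g = lambda b: f(b) / gamma  # same minimizer, rescaled objective

    assert np.isclose(
        scipy.optimize.minimize_scalar(f).x, scipy.optimize.minimize_scalar(g).x
    )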
