summary (#91)
* Implement Summary class.

* Need to apply the test to zero

* More tests.

* Raise if column names include the word 'intercept'

* Test Summary.__format__ and __str__.

* Changelog.

* Space after colon

* One last space after colon

* Reverse that (undo the previous change).

* E231 also applies to ','; use noqas.

* Deal with it later.
mlondschien authored Jul 9, 2024
1 parent 549c071 commit f483762
Showing 11 changed files with 303 additions and 29 deletions.
1 change: 0 additions & 1 deletion .flake8
@@ -1,4 +1,3 @@
-# Taken directly from https://github.com/ambv/black/blob/master/.flake8
[flake8]
ignore = E203, E266, E501, W503, C901, D104, D100, D301, W604
max-line-length = 88
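For context on the commit message above ("E231 is also for ',', use noqas"): rather than adding E231 to the global ignore list, an offending line can be silenced in place. A minimal sketch — the variable name is hypothetical, not from this commit:

    # E231 flags missing whitespace after ',' (and after ':'); a targeted
    # suppression keeps the global flake8 config clean:
    matrix[1,2] = 0  # noqa: E231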
7 changes: 7 additions & 0 deletions CHANGELOG.rst
@@ -12,10 +12,17 @@ Changelog

- New class :class:`~ivmodels.confidence_sets.ConfidenceSet`.

+- New class :class:`~ivmodels.summary.Summary` holding information about the model fit.
+
+- New method :func:`~ivmodels.models.kclass.KClass.summary` to create a summary of the
+  model fit.
+
+- The :class:`~ivmodels.models.kclass.KClass` gets new attributes after fitting a model:
+  `endogenous_names_`, `exogenous_names_`, and `instrument_names_`. If pandas is
+  installed, there is also `named_coefs_`.

**Other changes:**

- The function :func:`~ivmodels.tests.lagrange_multiplier_test` is now slightly faster.
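A hedged usage sketch of the new API described in the changelog above — the data is synthetic and the choices of `kappa`, `test`, and `alpha` are illustrative, not prescribed by the commit:

    import numpy as np
    from ivmodels import KClass

    rng = np.random.RandomState(0)
    Z = rng.normal(size=(100, 3))                                 # instruments
    X = Z @ rng.normal(size=(3, 2)) + rng.normal(size=(100, 2))   # endogenous regressors
    y = X @ np.array([1.0, -1.0]) + rng.normal(size=100)

    model = KClass(kappa="liml")
    summary = model.summary(X, y, Z=Z, test="anderson-rubin", alpha=0.05)
    print(summary)  # per-coefficient estimates, statistics, p-values, confidence sets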
38 changes: 38 additions & 0 deletions ivmodels/models/kclass.py
@@ -5,6 +5,7 @@
import scipy
from glum import GeneralizedLinearRegressor

+from ivmodels.summary import Summary
from ivmodels.utils import oproj, proj, to_numpy

try:
@@ -357,6 +358,11 @@ def fit(self, X, y, Z=None, C=None, *args, **kwargs):
        (X, Z, C), names = self._X_Z_C(X, Z, C, predict=False)
        self.endogenous_names_, self.instrument_names_, self.exogenous_names_ = names

+        if self.fit_intercept and any("intercept" in n for n in names):
+            raise ValueError(
+                "Variable names must not contain 'intercept' when fit_intercept=True."
+            )
+
        n, mx = X.shape

        # Including an intercept is equivalent to replacing y <- M_1 y, X <- M_1 X,
@@ -476,6 +482,38 @@ def predict(self, X, C=None, *args, **kwargs):  # noqa D
        (X, _, C), _ = self._X_Z_C(X, C=C, Z=None, predict=True)
        return super().predict(np.hstack([X, C]), *args, **kwargs)

+    def summary(self, X, y, Z=None, C=None, test="wald", alpha=0.05, **kwargs):
+        """
+        Create a Summary object for the fitted model.
+
+        This contains the fitted values (estimates), subvector test statistics for
+        each parameter, corresponding p-values, and confidence sets.
+
+        Parameters
+        ----------
+        X: array-like, shape (n_samples, n_features)
+            The input data.
+        y: array-like, shape (n_samples,)
+            The target values.
+        Z: array-like, shape (n_samples, n_instruments), optional
+            The instrument values. If ``instrument_names`` or ``instrument_regex`` are
+            specified, ``Z`` must be ``None``. If ``Z`` is specified,
+            ``instrument_names`` and ``instrument_regex`` must be ``None``.
+        C: array-like, shape (n_samples, n_exogenous), optional
+            The exogenous regressors. If ``exogenous_names`` or ``exogenous_regex``
+            are specified, ``C`` must be ``None``. If ``C`` is specified,
+            ``exogenous_names`` and ``exogenous_regex`` must be ``None``.
+        test: str, optional, default = "wald"
+            The test to use. Must be one of ``"wald"``, ``"anderson-rubin"``,
+            ``"lagrange multiplier"``, ``"likelihood-ratio"``, or
+            ``"conditional likelihood-ratio"``.
+        alpha: float, optional, default = 0.05
+            The significance level.
+        **kwargs
+            Additional keyword arguments to pass to the test and its inversion.
+        """
+        summary = Summary(kclass=self, test=test, alpha=alpha)
+        return summary.fit(X, y, Z=Z, C=C, **kwargs)


class KClass(KClassMixin, GeneralizedLinearRegressor):
"""K-class estimator for instrumental variable regression.
Expand Down
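A sketch of the new guard in ``fit``, assuming a pandas DataFrame is used so that column names reach the estimator; the frame, column names, and sizes here are hypothetical:

    import numpy as np
    import pandas as pd
    from ivmodels import KClass

    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.normal(size=(50, 3)), columns=["intercept", "x1", "z1"])
    y = rng.normal(size=50)

    model = KClass(kappa="tsls", instrument_names=["z1"])  # fit_intercept=True by default
    try:
        model.fit(df, y)  # raises: a column is named "intercept"
    except ValueError as e:
        print(e)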
29 changes: 12 additions & 17 deletions ivmodels/simulate.py
@@ -163,22 +163,19 @@ def simulate_gaussian_iv(
        True gamma. Only returned if ``return_gamma`` is True.
    """
    rng = np.random.RandomState(seed)
-    beta = rng.normal(0, 1, (mx, 1))

+    m = mx + mw
    if u is None:
-        u = mx
+        u = m

-    ux = rng.normal(0, 1, (u, mx))
+    ux = rng.normal(0, 1, (u, m))
    uy = rng.normal(0, 1, (u, 1))
-    uw = rng.normal(0, 1, (u, mw))

    alpha = rng.normal(0, 1, (mc, 1))
-    gamma = rng.normal(0, 1, (mw, 1))
+    beta = rng.normal(0, 1, (m, 1))

-    Pi_ZX = rng.normal(0, 1, (k, mx))
-    Pi_ZW = rng.normal(0, 1, (k, mw))
-    Pi_CX = rng.normal(0, 1, (mc, mx))
-    Pi_CW = rng.normal(0, 1, (mc, mw))
+    Pi_ZX = rng.normal(0, 1, (k, m))
+    Pi_CX = rng.normal(0, 1, (mc, m))
    Pi_CZ = rng.normal(0, 1, (mc, k))

    U = rng.normal(0, 1, (n, u))
@@ -191,17 +188,15 @@
    )

    X = Z @ Pi_ZX + C @ Pi_CX + U @ ux
-    X += rng.normal(0, 1, (n, mx)) + include_intercept * rng.normal(0, 1, (1, mx))
-    W = Z @ Pi_ZW + C @ Pi_CW + U @ uw
-    W += rng.normal(0, 1, (n, mw)) + include_intercept * rng.normal(0, 1, (1, mw))
-    y = C @ alpha + X @ beta + W @ gamma + U @ uy
+    X += rng.normal(0, 1, (n, m)) + include_intercept * rng.normal(0, 1, (1, m))
+    y = C @ alpha + X @ beta + U @ uy
    y += rng.normal(0, 1, (n, 1)) + include_intercept * rng.normal(0, 1, (1, 1))

    if return_beta and return_gamma:
-        return Z, X, y.flatten(), C, W, beta.flatten(), gamma.flatten()
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[:mx], beta[mx:]
    elif return_beta:
-        return Z, X, y.flatten(), C, W, beta.flatten()
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[:mx]
    elif return_gamma:
-        return Z, X, y.flatten(), C, W, gamma.flatten()
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:], beta[mx:]
    else:
-        return Z, X, y.flatten(), C, W
+        return Z, X[:, :mx], y.flatten(), C, X[:, mx:]
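The refactor above draws the regressors of interest X and the nuisance endogenous regressors W from a single design of width m = mx + mw, then splits columns and coefficients on return. A small sketch of the invariant this relies on, with made-up dimensions:

    import numpy as np

    n, mx, mw = 100, 2, 1
    m = mx + mw
    X_full = np.random.default_rng(0).normal(size=(n, m))
    beta_full = np.ones((m, 1))

    X, W = X_full[:, :mx], X_full[:, mx:]          # columns split as in the return values
    beta, gamma = beta_full[:mx], beta_full[mx:]   # coefficients split the same way
    assert np.allclose(X @ beta + W @ gamma, X_full @ beta_full)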
207 changes: 207 additions & 0 deletions ivmodels/summary.py
@@ -0,0 +1,207 @@
from functools import partial

import numpy as np

from ivmodels.confidence_set import ConfidenceSet
from ivmodels.quadric import Quadric


class Summary:
    """Class containing summary statistics for a fitted model.

    Parameters
    ----------
    kclass: ivmodels.KClass or child class of ivmodels.models.kclass.KClassMixin
        Fitted model.
    test: str
        Name of the test to be used. One of ``"wald"``, ``"anderson-rubin"``,
        ``"lagrange multiplier"``, ``"likelihood-ratio"``, or
        ``"conditional likelihood-ratio"``.
    alpha: float
        Significance level :math:`\\alpha` for the confidence sets, e.g., 0.05. The
        confidence level of the confidence sets will be :math:`1 - \\alpha`.
    """

    def __init__(self, kclass, test, alpha):
        from ivmodels import KClass  # avoid circular imports

        if not isinstance(kclass, KClass):
            raise ValueError("kclass must be an instance of ivmodels.KClass")

        self.kclass = kclass
        self.test = test
        self.alpha = alpha

    def fit(self, X, y, Z=None, C=None, *args, **kwargs):
        """
        Fit a summary.

        If ``instrument_names`` or ``instrument_regex`` are specified, ``X`` must be a
        pandas DataFrame containing columns ``instrument_names`` and ``Z`` must be
        ``None``. At least one of ``Z``, ``instrument_names``, and
        ``instrument_regex`` must be specified.

        If ``exogenous_names`` or ``exogenous_regex`` are specified, ``X`` must be a
        pandas DataFrame containing columns ``exogenous_names`` and ``C`` must be
        ``None``.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            The training input samples. If ``instrument_names`` or
            ``instrument_regex`` are specified, ``X`` must be a pandas DataFrame
            containing columns ``instrument_names``.
        y: array-like, shape (n_samples,)
            The target values.
        Z: array-like, shape (n_samples, n_instruments), optional
            The instrument values. If ``instrument_names`` or ``instrument_regex`` are
            specified, ``Z`` must be ``None``. If ``Z`` is specified,
            ``instrument_names`` and ``instrument_regex`` must be ``None``.
        C: array-like, shape (n_samples, n_exogenous), optional
            The exogenous regressors. If ``exogenous_names`` or ``exogenous_regex``
            are specified, ``C`` must be ``None``. If ``C`` is specified,
            ``exogenous_names`` and ``exogenous_regex`` must be ``None``.
        """
        if not hasattr(self.kclass, "named_coefs_"):
            self.kclass.fit(X, y, Z=Z, C=C)

        import ivmodels.tests as tests  # avoid circular imports

        _TESTS = {
            "wald": (
                partial(tests.wald_test, estimator=self.kclass.kappa),
                partial(tests.inverse_wald_test, estimator=self.kclass.kappa),
            ),
            "anderson-rubin": (
                tests.anderson_rubin_test,
                tests.inverse_anderson_rubin_test,
            ),
            "lagrange multiplier": (
                tests.lagrange_multiplier_test,
                tests.inverse_lagrange_multiplier_test,
            ),
            "likelihood-ratio": (
                tests.likelihood_ratio_test,
                tests.inverse_likelihood_ratio_test,
            ),
            "conditional likelihood-ratio": (
                tests.conditional_likelihood_ratio_test,
                tests.inverse_conditional_likelihood_ratio_test,
            ),
        }

        if str(self.test).lower() not in _TESTS:
            raise ValueError(f"Test {self.test} not recognized.")

        test_, inverse_test_ = _TESTS.get(str(self.test).lower())

        self.estimates_ = self.kclass.coef_.tolist()
        self.statistics_ = list()
        self.p_values_ = list()
        self.confidence_sets_ = list()

        (X, Z, C), _ = self.kclass._X_Z_C(X, Z, C, predict=False)

        self.feature_names_ = self.kclass.endogenous_names_
        # + self.kclass.exogenous_names_

        idx = 0
        # if self.kclass.fit_intercept:
        #     self.feature_names_ = ["intercept"] + self.feature_names_
        #     self.estimates_ = [self.kclass.intercept_] + self.estimates_
        #     idx -= 1
        for name in self.feature_names_:
            if name in self.kclass.endogenous_names_:
                mask = np.zeros(len(self.kclass.endogenous_names_), dtype=bool)
                mask[idx] = True

                X_, W_, C_, Z_ = X[:, mask], X[:, ~mask], C, Z
                fit_intercept_ = True

            elif name in self.kclass.exogenous_names_:
                mask = np.zeros(len(self.kclass.exogenous_names_), dtype=bool)
                mask[idx - len(self.kclass.endogenous_names_)] = True

                X_, W_, C_, Z_ = C[:, mask], X, C[:, ~mask], np.hstack([Z, C[:, mask]])
                fit_intercept_ = True

            elif name == "intercept":
                ones = np.ones((X.shape[0], 1))
                X_, W_, C_, Z_ = ones, X, C, np.hstack([Z, ones])
                fit_intercept_ = False

            test_result = test_(
                Z=Z_,
                X=X_,
                W=W_,
                y=y,
                C=C_,
                beta=np.array([0]),
                fit_intercept=fit_intercept_,
                **kwargs,
            )
            self.statistics_.append(test_result[0])
            self.p_values_.append(test_result[1])

            confidence_set = inverse_test_(
                Z=Z_,
                X=X_,
                W=W_,
                y=y,
                C=C_,
                alpha=self.alpha,
                fit_intercept=fit_intercept_,
                **kwargs,
            )

            if isinstance(confidence_set, Quadric):
                confidence_set = ConfidenceSet.from_quadric(confidence_set)

            self.confidence_sets_.append(confidence_set)
            idx += 1

        self.statistic_, self.p_value_ = test_(
            Z=Z, X=X, y=y, C=C, beta=np.zeros(X.shape[1]), fit_intercept=True
        )
        self.f_statistic_, self.f_p_value_ = tests.rank_test(
            Z, X, C=C, fit_intercept=True
        )
        return self

    def __format__(self, format_spec: str) -> str:  # noqa D
        if not hasattr(self, "estimates_"):
            return "Summary not fitted yet."

        def format_p_value(x):
            return f"{x:{format_spec}}" if x > 1e-16 else "<1e-16"

        estimate_str = [f"{e:{format_spec}}" for e in self.estimates_]
        statistics_str = [f"{s:{format_spec}}" for s in self.statistics_]
        p_values_str = [format_p_value(p) for p in self.p_values_]
        cis_str = [f"{cs:{format_spec}}" for cs in self.confidence_sets_]

        names_len = max(len(name) for name in self.feature_names_)
        coefs_len = max(max(len(e) for e in estimate_str), len("estimate"))
        statistics_len = max(max(len(s) for s in statistics_str), len("statistic"))
        p_values_len = max(max(len(p) for p in p_values_str), len("p-value"))
        cis_len = max(max(len(ci) for ci in cis_str), len("conf. set"))

        string = f"""Summary based on the {self.test} test.
{'': <{names_len}} {'estimate': >{coefs_len}} {'statistic': >{statistics_len}} {'p-value': >{p_values_len}} {'conf. set': >{cis_len}}
"""
        # total_len = names_len + statistics_len + coefs_len + p_values_len + cis_len + 4
        # string += "-" * total_len + "\n"

        for name, estimate, statistic, p_value, ci in zip(
            self.feature_names_, estimate_str, statistics_str, p_values_str, cis_str
        ):
            string += f"{name: <{names_len}} {estimate: >{coefs_len}} {statistic: >{statistics_len}} {p_value: >{p_values_len}} {ci: >{cis_len}}\n"

        string += f"""
Endogenous model statistic: {self.statistic_:{format_spec}}, p-value: {format_p_value(self.p_value_)}
(Multivariate) F-statistic: {self.f_statistic_:{format_spec}}, p-value: {format_p_value(self.f_p_value_)}"""
        return string

    def __str__(self):  # noqa D
        return f"{self:.4g}"
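Since ``__str__`` delegates to ``__format__`` with spec ``".4g"``, any standard format spec can be applied to a fitted summary. A sketch, reusing ``model``, ``X``, ``y``, and ``Z`` from the synthetic-data example after the changelog above; the spec values are illustrative:

    from ivmodels.summary import Summary

    summary = Summary(kclass=model, test="lagrange multiplier", alpha=0.05).fit(X, y, Z=Z)
    print(f"{summary:.2f}")  # two decimal places everywhere
    print(summary)           # equivalent to f"{summary:.4g}"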
6 changes: 3 additions & 3 deletions ivmodels/tests/lagrange_multiplier.py
@@ -213,16 +213,16 @@ def _derivative(gamma):

    objective = MemoizeJacHess(_derivative)
    jac = objective.derivative
-    hess = objective.hessian
+    # hess = objective.hessian

    res1 = scipy.optimize.minimize(
-        objective, jac=jac, hess=hess, x0=gamma_0, method="newton-cg"
+        objective, jac=jac, hess=None, x0=gamma_0, method="newton-cg"
    )

    res2 = scipy.optimize.minimize(
        objective,
        jac=jac,
-        hess=hess,
+        hess=None,
        method="newton-cg",
        x0=np.zeros_like(gamma_0),
    )
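The change above stops passing an explicit Hessian: with ``hess=None``, SciPy's Newton-CG builds Hessian-vector products from finite differences of the gradient, so only ``jac`` is required. A self-contained sketch of that behavior on a toy quadratic (not code from this commit):

    import numpy as np
    import scipy.optimize

    def f(x):
        return ((x - 1.0) ** 2).sum()

    def grad(x):
        return 2.0 * (x - 1.0)

    # Newton-CG requires jac; hess is optional and approximated when None.
    res = scipy.optimize.minimize(f, x0=np.zeros(3), jac=grad, hess=None, method="newton-cg")
    assert np.allclose(res.x, 1.0)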
2 changes: 1 addition & 1 deletion ivmodels/tests/utils.py
@@ -101,7 +101,7 @@ def _find_roots(f, a, b, tol, max_value, max_eval, n_points=50):
    closest to ``b``. Note that this is also not strictly ensured by this function.
    """
    if np.abs(b - a) < tol or max_eval < 0:
-        return b  # convervative
+        return b  # conservative
    if np.isinf(a):
        return a

4 changes: 2 additions & 2 deletions ivmodels/tests/wald.py
@@ -49,8 +49,8 @@ def wald_test(Z, X, y, beta, W=None, C=None, estimator="tsls", fit_intercept=Tru
        Exogenous regressors not of interest.
    beta: np.ndarray of dimension (mx,)
        Coefficients to test.
-    estimator: str, optional, default = "tsls"
-        Estimator to use. Must be one of ``"tsls"`` or ``"liml"``.
+    estimator: str or float, optional, default = "liml"
+        Estimator to use. Passed to ``kappa`` argument of ``KClass``.
    fit_intercept: bool, optional, default = True
        Whether to include an intercept. The intercept will be included both in the
        complete and the (restricted) model. Including an intercept is equivalent to
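With the docstring change above, ``estimator`` is forwarded to ``KClass``'s ``kappa``, so besides ``"tsls"`` and ``"liml"`` a numeric kappa should also be accepted. A hedged sketch, reusing ``Z``, ``X``, and ``y`` from the synthetic-data example after the changelog; not an official usage guarantee:

    import numpy as np
    from ivmodels.tests import wald_test

    # named estimators and raw kappa values should both work now
    stat, p_value = wald_test(Z, X, y, beta=np.zeros(2), estimator="liml")
    stat, p_value = wald_test(Z, X, y, beta=np.zeros(2), estimator=1.0)  # kappa = 1 is TSLS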
8 changes: 4 additions & 4 deletions tests/models/test_anchor_regression.py
@@ -76,10 +76,10 @@ def test_anchor_solution_minimizes_loss(n, mx, k, u, mc, gamma):
    ar = AnchorRegression(gamma=gamma, fit_intercept=False).fit(X, y, Z, C=C)

    def loss(beta):
-        return np.mean((y - np.hstack([X, C]) @ beta - ar.intercept_) ** 2) + (
-            gamma - 1
-        ) * np.mean(
+        return 1 / gamma * np.mean(
+            (y - np.hstack([X, C]) @ beta - ar.intercept_) ** 2
+        ) + (gamma - 1) / gamma * np.mean(
            proj(np.hstack([Z, C]), y - np.hstack([X, C]) @ beta - ar.intercept_) ** 2
        )

-    assert np.allclose(scipy.optimize.approx_fprime(ar.coef_, loss), 0, atol=1e-6)
+    assert np.allclose(scipy.optimize.approx_fprime(ar.coef_, loss), 0, atol=1e-5)
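The rewritten loss divides the anchor objective by gamma; scaling by a positive constant leaves the minimizer (and the zero-gradient check) unchanged while keeping the two terms' weights, 1/gamma and (gamma - 1)/gamma, bounded for large gamma. A quick numeric check of the scale-invariance claim, on a toy one-dimensional objective:

    import numpy as np
    import scipy.optimize

    gamma = 5.0
    f = lambda b: (b - 2.0) ** 2 + (gamma - 1.0) * (b - 3.0) ** 2
    g = lambda b: f(b) / gamma  # same minimizer, rescaled objective

    assert np.isclose(
        scipy.optimize.minimize_scalar(f).x, scipy.optimize.minimize_scalar(g).x
    )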
