Implement a better confidence interval computation

markusReinert · web-flow · commit 30301217d974 · 2022-03-07T11:40:45.000+01:00
This commit introduces a new way of computing the confidence interval
for a non-time-dependent GEV distribution.  The new functions follow
the book by Coles (2001).  The function that was previously used to
compute the confidence interval is removed, as it is not needed in any
of the sample scripts.  The function to print the GEV parameters is
moved from tools_errors to advanced_GEV_analysis.  The former file is
removed because it became empty.

The new way to calculate confidence intervals propagates the
variance-covariance matrix of the fitted parameters to the return level
(Equation 3.11 of Coles, 2001).  It is mathematically more sound and
generally leads to narrower confidence intervals, thus it improves the
results.  The old way of computing confidence intervals (now removed)
gave a more conservative estimate.

Note that this change does not affect the time-dependent GEV
distributions that are discussed in the paper.

This modification has been proposed by Marvin Lorenz (IOW), who also
implemented a first version of the new methods.

A new sample script that uses annual maxima instead of monthly maxima is
added with this commit.  Since no conversion between return periods in
years and months is necessary for annual maxima, this code is slightly
cleaner and makes the principles of this new method clearer.
diff --git a/README.md b/README.md
@@ -46,9 +46,10 @@ the script [advanced_GEV_analysis.py](advanced_GEV_analysis.py) may be
 useful.  Its main part is an implementation of the methods described in
 the book “An Introduction to Statistical Modeling of Extreme Values” by
 Stuart Coles (2001).  Example usage of this library is shown for
-[time-independent GEV models](Time-independent_GEV_fit.py) and for
-[time-dependent GEV models](Time-dependent_GEV_fit.py).  With the surge
-levels for Brest from the GESLA-2 dataset, the time-independent GEV
-model looks like this:
+[time-independent GEV models](Time-independent_GEV_fit.py), for
+[time-dependent GEV models](Time-dependent_GEV_fit.py), and for
+[GEV models of annual maxima](Time-independent_GEV_fit_for_annual_maxima.py).
+With the surge levels for Brest from the GESLA-2 dataset, the
+time-independent GEV model of monthly maxima looks like this:
 
-![Figure of a time-independent GEV fit to extreme surge levels in Brest](results/GEV_fit_Brest.png)
+![Figure of a time-independent GEV fit to extreme surge levels (monthly maxima) in Brest](results/GEV_fit_Brest.png)
diff --git a/Time-independent_GEV_fit.py b/Time-independent_GEV_fit.py
@@ -1,16 +1,15 @@
 """Make a time-independent GEV fit to monthly maxima (MM).
 
-Written by Markus Reinert, June 2020–July 2021.
+Written by Markus Reinert, June 2020–March 2022.
 """
 
 import numpy as np
 from scipy import optimize
 from matplotlib import pyplot as plt
 from matplotlib.ticker import ScalarFormatter
 
-from advanced_GEV_analysis import negative_log_likelihood, GEV
-from advanced_GEV_analysis import get_month_selection, get_year_selection
-from tools_errors import format_GEV_parameters, get_error_bounds
+from advanced_GEV_analysis import negative_log_likelihood, get_year_selection, get_month_selection
+from advanced_GEV_analysis import GEV_return_level, GEV_standard_error, format_GEV_parameters
 from tools_surge import load_data, Timeseries
 
 
@@ -36,30 +35,23 @@
 h_MM = np.array(h_MM)
 n_months = len(h_MM)
 
-# Define a function to compute the return period from a CDF
-return_period = lambda P: 1 / (1 - P**(n_months / n_years))
-
 # Fit a GEV to the extreme values
 result = optimize.minimize(negative_log_likelihood, [10, 15, -0.1], args=(h_MM,))
 if not result.success:
     print("Warning:", result.message)
 params = result.x
-errors = np.sqrt(np.diag(result.hess_inv))
-
-# Calculate the CDF and the return periods of the fit
-x_axis = np.linspace(min(h_MM), max(h_MM), 1000)
-P_MM = GEV(x_axis, *params)
-T_MM = return_period(P_MM)
+covars = result.hess_inv
+errors = np.sqrt(np.diag(covars))
 
-# Calculate the 95 % confidence interval of the fit
-n_sigma = 1.96
-P_MM_bounds = get_error_bounds(GEV, x_axis, params, errors, n_sigma)
-T_MM_bounds = [return_period(P_bound) for P_bound in P_MM_bounds]
+# Calculate the graph and the standard error of the fitted GEV model
+t_axis = np.logspace(0, 3, 10_000)[1:]  # exclude t = 1 to avoid value -infinity
+h_model = GEV_return_level(t_axis, *params, values_per_year=n_months/n_years)
+h_std_error = GEV_standard_error(t_axis, *params, covars, values_per_year=n_months/n_years)
 
 # Calculate the CDF and the return periods of the empirical data
 h_empirical = sorted(h_MM)
 P_empirical = (1 + np.arange(n_months)) / (1 + n_months)
-T_empirical = return_period(P_empirical)
+T_empirical = 1 / (1 - P_empirical**(n_months / n_years))
 
 
 fig, ax = plt.subplots()
@@ -77,9 +69,14 @@
     T_empirical, h_empirical, "k.",
     label="Empirical return periods ({} data points)".format(n_months),
 )
-ax.semilogx(T_MM, x_axis, label="GEV fit: " + format_GEV_parameters(params, errors))
-assert n_sigma == 1.96, "label 95 % CI is not correct"
-ax.fill_betweenx(x_axis, *T_MM_bounds, alpha=0.3, label="95 % confidence interval")
+ax.semilogx(t_axis, h_model, label="GEV fit: " + format_GEV_parameters(params, errors))
+ax.fill_between(
+    t_axis,
+    h_model + 1.96 * h_std_error,
+    h_model - 1.96 * h_std_error,
+    alpha=0.3,
+    label="95 % confidence interval",
+)
 
 ax.legend()
 ax.set_xlim(0.9, 200)
diff --git a/Time-independent_GEV_fit_for_annual_maxima.py b/Time-independent_GEV_fit_for_annual_maxima.py
@@ -0,0 +1,79 @@
+"""Fit a time-independent GEV distribution to annual maxima (AM).
+
+Written by Markus Reinert, June 2020–March 2022.
+"""
+
+import numpy as np
+from scipy import optimize
+from matplotlib import pyplot as plt
+from matplotlib.ticker import ScalarFormatter
+
+from advanced_GEV_analysis import negative_log_likelihood, get_year_selection
+from advanced_GEV_analysis import GEV_return_level, GEV_standard_error, format_GEV_parameters
+from tools_surge import load_data, Timeseries
+
+
+data = load_data("Brest", Timeseries.SKEW_SURGE_GESLA)
+
+# Collect the highest value of each calendar year, skipping empty selections
+yearly_selections = (
+    get_year_selection(year, data["t"])
+    for year in range(data["year_start"], data["year_end"] + 1)
+)
+h_AM = np.array([max(data["h"][sel]) for sel in yearly_selections if any(sel)])
+n_years = len(h_AM)
+
+# Estimate the GEV parameters by minimizing the negative log-likelihood
+result = optimize.minimize(negative_log_likelihood, [40, 15, -0.1], args=(h_AM,))
+if not result.success:
+    print("Warning:", result.message)
+params, covars = result.x, result.hess_inv
+errors = np.sqrt(np.diag(covars))
+
+# Evaluate the fitted model and its standard error on a logarithmic time axis
+t_axis = np.logspace(0, 3, 100_000)[1:]  # exclude t = 1 to avoid value -infinity
+h_model = GEV_return_level(t_axis, *params)
+h_std_error = GEV_standard_error(t_axis, *params, covars)
+ci_half_width = 1.96 * h_std_error
+
+# Empirical return periods: the k-th largest of n values gets T = (n + 1) / k
+h_empirical = sorted(h_AM, reverse=True)
+P_empirical = np.arange(1, n_years + 1) / (n_years + 1)
+T_empirical = 1 / P_empirical
+
+
+fig, ax = plt.subplots()
+
+title = "GEV fit to annual surge maxima in {} from {} to {}".format(
+    data["city"], data["year_start"], data["year_end"]
+)
+ax.set_title(title, weight="bold")
+ax.set_xlabel("Return period in years")
+ax.set_ylabel("Return level in cm")
+
+ax.semilogx(
+    T_empirical,
+    h_empirical,
+    "k.",
+    label="Empirical return periods ({} data points)".format(n_years),
+)
+ax.semilogx(t_axis, h_model, label="GEV fit: " + format_GEV_parameters(params, errors))
+ax.fill_between(
+    t_axis,
+    h_model + ci_half_width,
+    h_model - ci_half_width,
+    alpha=0.3,
+    label="95 % confidence interval",
+)
+
+ax.legend()
+ax.set_xlim(0.9, 200)
+# Pad the y-axis by 5% of the observed range on either side
+h_min, h_max = h_AM.min(), h_AM.max()
+delta_h = (h_max - h_min) * 0.05
+ax.set_ylim(h_min - delta_h, h_max + delta_h)
+ax.grid(linestyle=":")
+ax.xaxis.set_major_formatter(ScalarFormatter())
+
+plt.savefig("results/GEV_fit_{}_annual.png".format(data["city"]))
+plt.show()
diff --git a/advanced_GEV_analysis.py b/advanced_GEV_analysis.py
@@ -93,6 +93,7 @@
     oscillation with a phase.
 
 Written by Markus Reinert, August 2020, February 2021.
+Extended by Marvin Lorenz and Markus Reinert, March 2022.
 """
 
 import calendar
@@ -295,6 +296,89 @@ def GEV(x, mu, sigma, xi):
         return y
 
 
+def GEV_return_level(t, mu, sigma, xi, values_per_year=1.0):
+    """Compute return level z for return period t of a GEV distribution.
+
+    A random variable with the distribution GEV(mu, sigma, xi) will
+    typically reach a value as high as z once every t years, where z is
+    the value returned by this function.
+
+    That means, for any t >= 1, the following call returns t:
+    return_period(GEV(GEV_return_level(t, mu, sigma, xi), mu, sigma, xi))
+    where return_period(p) = 1 / (1 - p).
+
+    If there is more than one value per year, e.g., monthly instead of
+    annual maxima, the optional argument values_per_year specifies how
+    many values there are on average per year (12 for monthly maxima,
+    or less if individual months are missing).  In this case,
+    return_period(p) = 1 / (1 - p**values_per_year).
+
+    The return period t is given in years and can be a number or a
+    NumPy array, in which case an array of the same shape is returned.
+
+    Reference: Equations (3.4) and (3.10) of Coles (2001)
+    """
+    # y_p = -log(1 - p') with 1 - p' = (1 - 1/t)**(1/values_per_year);
+    # log1p keeps full precision where 1 - 1/t would round for large t.
+    y_p = -np.log1p(-1 / t) / values_per_year
+    if abs(xi) < XI_THRESHOLD:
+        # Gumbel distribution
+        return mu - sigma * np.log(y_p)
+    # non-Gumbel GEV distribution
+    return mu - sigma / xi * (1 - y_p ** (-xi))
+
+
+def GEV_standard_error(t, mu, sigma, xi, V, values_per_year=1.0):
+    """Compute the standard error of the return level for return period t.
+
+    When the distribution of a random variable is estimated to be
+    GEV(mu, sigma, xi) with the variance-covariance matrix V, then the
+    uncertainty associated with the return level z that belongs to the
+    return period t (see function GEV_return_level) is the value
+    returned by this function.  The upper (resp. lower) bound of the 95%
+    confidence interval can be approximated by adding (subtracting) 1.96
+    times the value returned by this function to z.
+
+    The result is the square root of <grad_z_p|V|grad_z_p> (in bra-ket
+    notation), i.e., of the scalar product between grad_z_p and
+    V * grad_z_p, where grad_z_p is the gradient of z with respect to
+    (mu, sigma, xi).  The square root turns the variance into the
+    standard error.  If t is a vector, grad_z_p is a matrix with one
+    column per element of t, the product is evaluated for each column,
+    and a vector of the same size as t is returned.
+
+    For abs(xi) < XI_THRESHOLD (Gumbel case, see GEV_return_level), the
+    limit of grad_z_p for xi -> 0 is used, since the general expression
+    divides by xi and is not defined there.
+
+    Reference: Equation (3.11) of Coles (2001)
+    """
+    # y_p = -log(1 - p) as in GEV_return_level; log1p is precise for large t
+    y_p = -np.log1p(-1 / t) / values_per_year
+    log_y_p = np.log(y_p)
+    if abs(xi) < XI_THRESHOLD:
+        # Limits for xi -> 0 of the two expressions in the else-branch
+        dz_dsigma = -log_y_p
+        dz_dxi = sigma * log_y_p**2 / 2
+    else:
+        dz_dsigma = -(1 - y_p**(-xi)) / xi
+        dz_dxi = -sigma / xi * (dz_dsigma + y_p**(-xi) * log_y_p)
+    grad_z_p = np.array([np.ones_like(y_p), dz_dsigma, dz_dxi])
+    return np.sqrt(np.sum(grad_z_p * np.dot(V, grad_z_p), axis=0))
+
+
+def format_GEV_parameters(parameters, errors, join_str=", "):
+    """Create a string that contains the GEV parameters in a nice format."""
+    names = ["\\mu", "\\sigma", "\\xi"]
+    # One significant digit of the error; no decimals unless 0 < error < 1
+    digits = [int(-np.floor(np.log10(e))) if 0 < e < 1 else 0 for e in errors]
+    return join_str.join(
+        "${name} = {param:.{digits}f} \\pm {std:.{digits}f}$".format(
+            name=n, param=p, std=e, digits=d
+        ) for n, p, e, d in zip(names, parameters, errors, digits)
+    )
+
+
 def get_year_selection(year, time_array):
     """Get the Boolean array that selects ‘year’ from ‘time_array’.
 
diff --git a/results/GEV_fit_Brest.png b/results/GEV_fit_Brest.png
diff --git a/tools_errors.py b/tools_errors.py