"""
Interval estimation tools for confidence and prediction intervals.
This module provides functions to compute interval estimates from empirical
data, including exact and approximate confidence intervals and prediction
intervals. It supports methods based on the normal approximation,
binomial distribution, beta distribution, and quantile-based approaches.
Classes
-------
IntervalResults
A results container for interval estimates.
QuantilesIntervalResults
A specialised results class for quantile-based confidence intervals,
including index tracking and coverage matrices.
Functions
---------
univariable_quantiles_exact(values, quantile, alpha=0.05, window=2)
Computes exact nonparametric confidence intervals for an arbitrary quantile.
univariable_poisson_standard_normal(counts, alpha=0.05)
Computes Poisson confidence intervals for count data based on a normal
approximation.
wald_confidence_interval(point, se, alpha=0.05)
Computes confidence intervals based on the normal distribution.
beta_confidence_interval(proportion, total_sample, alpha=0.05, integer=True)
Computes confidence intervals for proportions based on the beta
distribution.
"""
import numpy as np
import pandas as pd
import warnings
from scipy.stats import (
norm,
binom,
beta,
)
from stats_misc.constants import (
NamesIntervals as NamesI,
CLASS_NAME,
)
from stats_misc.errors import (
is_type,
same_len,
check_limits,
)
from stats_misc.utils.helpers import (
Results,
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class IntervalResults(Results):
"""
A results objects containing interval estimates.
Attributes
----------
interval_values : `tuple` [`float`, `float`]
The lower and upper bounds of the interval.
point_estimate : `float`
The estimate with the highest data-support.
standard_error : `float`
The estimated population variability of the point estimate.
coverage : `float`
The interval coverage.
"""
SET_ARGS = [
CLASS_NAME,
NamesI.POINT,
NamesI.COVERAGE,
NamesI.VALUES,
NamesI.SE,
]
def __init__(self, **kwargs) -> None:
super().__init__(set_args=self.SET_ARGS, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class QuantilesIntervalResults(IntervalResults):
"""
The results class for `univariable_quantiles_exact`.
Attributes
----------
interval_indices : `list` [`int`]
The list indices of the interval lower and upper bounds.
interval_values : `list` [`float`]
The lower and upper bounds of the interval.
point_estimate : `float`
The estimate with the highest data-support.
coverage : `float`
The interval coverage.
matrix_coverage : `numpy.ndarray`
A 2D array of coverage probabilities.
matrix_columns : `list` [`float`]
The column names of `matrix_coverage` representing the indices of
the upper limit.
matrix_rows : `list` [`float`]
The row names of `matrix_coverage` representing the indices of
the lower limit.
"""
SET_ARGS = [
CLASS_NAME,
NamesI.POINT,
NamesI.COVERAGE,
NamesI.VALUES,
NamesI.INDICES,
NamesI.MATRIX_COVERAGE,
NamesI.MATRIX_COLUMNS,
NamesI.MATRIX_ROWS,
]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def univariable_quantiles_exact(values:list[float], quantile:float,
alpha:float=0.05, window:int=2,
) -> QuantilesIntervalResults:
"""
Compute an exact confidence interval for a univariate quantile.
Estimates the lower and upper bounds of a 100(1 - alpha)% confidence
interval around a specified quantile of the input data. The function
returns both the values and indices of the bounds, as well as the
exact coverage probability.
Parameters
----------
values : `list` [`float`]
A list of floats representing the available data.
quantile : `float`
The target quantile (between 0 and 1); for example,
0.5 for the median, 0.25 for the first quartile, or 0.025
for the 2.5th percentile.
alpha : `float`, default 0.05
The 100(1-alpha)% confidence level.
window : `int`, default 2
The number of points allowed around the exact quantile
index when searching for valid intervals.
Returns
-------
QuantilesIntervalResults
An object containing the interval indices, interval bounds, the
point estimate, and the exact coverage probability.
Notes
-----
This method uses the binomial distribution to derive the exact probability
of a value being less than or equal to the desired quantile. It does not
make assumptions about the underlying distribution of the data and thus
produces nonparametric, exact confidence intervals. The returned interval
is guaranteed to have coverage greater than or equal to the nominal level.
The code is based on the
`stackexchange <https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile>`_ answer by `whuber`.
Which in turn is based on Chapter 5 of Meeker, Hahn, and Escobar's
book [1]_.
References
----------
.. [1] Meeker, W.Q., Hahn, G.J., & Escobar, L.A. (2017). *Statistical Intervals:
A Guide for Practitioners and Researchers* (2nd ed.). Wiley.
"""
# ### test input
is_type(values, list, 'values')
is_type(quantile, float, 'quantile')
is_type(alpha, float, 'quantile')
is_type(window, (int, float), 'window')
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
# ### calculate the interval
n = len(values)
# ensure the values are sorted
values = values.copy()
values.sort()
# get the quantile - only needed to for the results objects, not for calcs
point = np.quantile(values, quantile)
# calculate the interval positions
upper_temp = int(binom.ppf(1-alpha/2, n, quantile))
lower_temp = int(binom.ppf(alpha/2, n, quantile))
# adding integer values around the true positions to form a search_grid
w_ints = list(range(0,window+1))
w_ints = [-1*w for w in w_ints[1:]] + w_ints
upper = [upper_temp + w for w in w_ints]
lower = [lower_temp + w for w in w_ints]
# check if these values are plausible, otherwise replace by inf
upper = [np.inf if u > n else u for u in upper]
lower = [np.inf if l < 0 else l for l in lower]
# finding the coverage probabilities
coverage = np.empty((len(lower), len(upper)))
for i, a in enumerate(lower):
for j, b in enumerate(upper):
coverage[i, j] = \
binom.cdf(b - 1, n, quantile) - binom.cdf(a - 1, n, quantile)
# the lower and upper bounds with a coverage equal to or larger than requested
coverage_diff = (coverage - (1-alpha))
coverage_exact = np.min(coverage[coverage_diff > 0])
row, col = np.where(coverage_exact == coverage)
# if there are multiple cols, simply select the first entry, which is
# the smaller value, hence resulting in a smaller CI
limit_u = [upper[k] for k in col][0]
limit_l = [lower[k] for k in row][0]
limit_indices = [limit_l, limit_u]
if any(np.isinf(x) for x in limit_indices):
raise TypeError('`limit_indices` contains `inf`, please use different '
'combinations of the alpha and quantile arguments.')
# get the actual confidence interval values
try:
confidence_interval = [values[t] for t in limit_indices]
except IndexError:
raise IndexError('Object values has indices: `{}`, while the '
'confidence interval limit incides are `{}`. '
'Please request a different alpha, or quantile.'.\
format([0,len(values)-1], limit_indices)
)
# return
return QuantilesIntervalResults(**{
CLASS_NAME : 'univariable_quantiles_exact',
NamesI.VALUES : confidence_interval,
NamesI.POINT : point,
NamesI.INDICES : limit_indices,
NamesI.COVERAGE : coverage_exact,
NamesI.MATRIX_COVERAGE : coverage,
NamesI.MATRIX_COLUMNS : upper,
NamesI.MATRIX_ROWS : lower,
})
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def univariable_poisson_standard_normal(counts:list[int], alpha:float=0.05,
) -> IntervalResults:
"""
Compute Poisson confidence intervals based on the standard normal
approximation to the Poisson distribution.
Parameters
----------
counts : `list` [`int`]
A list of counts.
alpha : `float`, default 0.05
The 100(1-alpha)% confidence level.
Returns
-------
IntervalResults
A results object containing the lower and upper confidence bounds,
the point estimate, and the interval coverage.
"""
# Test input
is_type(counts, list, 'counts')
is_type(alpha, float, 'alpha')
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
# Get constants
xbar = np.mean(counts)
xsd = np.sqrt(xbar/len(counts))
z = norm.ppf(1-alpha/2)
# get the confidence intervals
lb = xbar-xsd*z
ub = xbar+xsd*z
# return
return IntervalResults(**{
CLASS_NAME : 'univariable_poisson_standard_normal',
NamesI.VALUES : (lb, ub),
NamesI.POINT : xbar,
NamesI.SE : xsd,
NamesI.COVERAGE : 1-alpha,
})
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def wald_confidence_interval(point:float, se:float,
alpha:float=0.05,
) -> IntervalResults:
"""
Compute a 100(1-alpha)% confidence interval using the normal distribution.
Parameters
----------
point : `float`
The point estimate.
se : `float`
The standard error of the point estimate.
alpha : `float`, default 0.05
The 100(1-alpha)% confidence level.
Returns
-------
IntervalResults
A results object containing the lower and upper confidence bounds,
the point estimate, and the interval coverage.
"""
# test input
is_type(point, (float, int), 'point')
is_type(se, (float, int), 'se')
is_type(alpha, float, 'alpha')
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
# getting
z = norm.ppf(1-alpha/2)
# lower and upper
lb, ub = point - se * z, point + se * z
# return
return IntervalResults(**{
CLASS_NAME : 'wald_confidence_interval',
NamesI.VALUES : (lb, ub),
NamesI.POINT : point,
NamesI.SE : se,
NamesI.COVERAGE : 1-alpha,
})
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def beta_confidence_interval(proportion:float, total_sample:int,
alpha:float=0.05,
integer:bool=True,
) -> IntervalResults:
"""
Compute beta confidence intervals for proportions, based on a classic
contribution by SA Julious [1]_.
Parameters
----------
proportion : `float`
A proportion between 0 and 1.
total_sample : `int`
The total sample size from which the proportion was derived.
alpha : `float`, default 0.05
The 100(1-alpha)% confidence level.
integer : `bool`, default True
Whether the function should raise an error if the product of
`proportion * total_sample` is not an integer. If set to `False`,
the function will issue a warning and round the product to the
nearest integer.
Returns
-------
IntervalResults
A results class returning the lower and upper confidence interval
bounds.
References
----------
.. [1] Steven A Julious. Two-sided confidence intervals for the single
proportion: comparison of seven methods by Robert G. Newcombe,
Statistics in Medicine 1998; 17:857-872.
"""
# check input
is_type(proportion, (float, int), 'proportion')
is_type(total_sample, int, 'total_sample')
is_type(alpha, float, 'alpha')
is_type(integer, bool, 'integer')
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
# get the number of events
no_events = proportion * total_sample
if (not no_events.is_integer()) and (integer==True):
raise ValueError(
"The product of `proportion` and `total_sample` is not an integer. "
"Either correct the input or set `integer=False` to allow rounding. "
f"Current input: proportion = {proportion}, total_sample = "
f"{total_sample}, product = {no_events}."
)
elif not no_events.is_integer():
no_events = round(no_events)
# lower bound
lb = 1-beta.ppf(1-alpha/2, total_sample - no_events + 1, no_events)
# upper bound
ub = beta.ppf(1-alpha/2, no_events + 1, total_sample - no_events)
# suppressed the warnings because SE is missing.
warnings.filterwarnings("ignore", category=UserWarning)
results = IntervalResults(**{
CLASS_NAME : 'beta_confidence_interval',
NamesI.VALUES : (lb, ub),
NamesI.POINT : proportion,
NamesI.COVERAGE : 1-alpha,
})
warnings.resetwarnings()
# returns
return results
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def linear_contrast(
d_mat: np.ndarray,
v_mat: np.ndarray,
c_vec: np.ndarray,
contrast_names: list[str] | None = None,
alpha: float = 0.05,
) -> pd.DataFrame:
"""
Compute exact variance propagation for linear contrasts of regression
coefficients.
For any linear contrast L = D' * beta, the variance is:
Var(L) = D' * V * D
This is exact and holds for any regression model with a
coefficient vector and variance-covariance matrix.
Parameters
----------
d_mat : `array_like`, shape (n_coef, n_contrasts)
Contrast matrix. Each column defines one contrast, each
row corresponds to one coefficient. Set a row to zero to
exclude that coefficient from the contrast.
v_mat : `array_like`, shape (n_coef, n_coef)
Variance-covariance matrix of the coefficients.
c_vec : `array_like`, shape (n_coef,)
Point estimates of the model coefficients.
contrast_names : `list` [`str`], default `None`
Labels for each contrast. Defaults to
["contrast_1", "contrast_2", ...].
alpha : `float`, default 0.05
The 100(1-alpha)% confidence level.
Returns
-------
pd.DataFrame
One row per contrast with columns: contrast, estimate,
se, lower, upper. All values are on the linear predictor
scale; any back-transformation is the caller's
responsibility.
Raises
------
ValueError
If dimensions of d_mat, v_mat, c_vec are inconsistent,
alpha is outside (0, 1), or contrast_names length
does not match ncol(d_mat).
Notes
-----
**Generality across model types**
Works for OLS, GLM, Cox, mixed-effects, penalised regression,
or any model yielding a coefficient vector and covariance
matrix. For spline or polynomial terms, populate the
corresponding rows of d_mat with the evaluated basis values
at the target point (e.g. from a fitted spline object).
**Link scale**
For models with a nonlinear link (logistic, Poisson, Cox),
estimates are returned on the link scale. Back-transform and
propagate uncertainty externally as needed.
**Interactions and piecewise equations**
Include the product (or indicator) terms explicitly in d_mat.
For example, for treatment effect in females under a
treatment * gender interaction, set both the treatment row
and the interaction row to 1.
Examples
--------
Linear model: y ~ gender * treatment
Coefficients: intercept, gender_female, treatment,
treatment:gender_female
Treatment effect in males: [0, 0, 1, 0]
Treatment effect in females: [0, 0, 1, 1]
>>> d_mat = np.array([[0, 0],
... [0, 0],
... [1, 1],
... [0, 1]])
>>> c_vec = np.array([5.0, 1.2, 2.3, 0.8])
>>> v_mat = np.eye(4) * 0.1
>>> linear_contrast(d_mat, v_mat, c_vec,
... contrast_names=["males", "females"])
"""
# ### confirm input and map to array - e.g. if list of list
d_mat = np.asarray(d_mat, dtype=float)
v_mat = np.asarray(v_mat, dtype=float)
# ravel enusre the coef vec is a 1d matrix
c_vec = np.asarray(c_vec, dtype=float).ravel()
# reshape if needed
if d_mat.ndim == 1:
d_mat = d_mat.reshape(-1, 1)
# confirm length
same_len(c_vec, d_mat, ['c_vec', 'd_mat'])
same_len(c_vec, v_mat, ['c_vec', 'v_mat'])
same_len(v_mat, v_mat.T, ['rows v_mat', 'columns v_mat'])
# check alpha
is_type(alpha, float)
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
if contrast_names is not None:
same_len(contrast_names, d_mat.T, ['contrast_names', 'columns d_mat'])
else:
contrast_names = [
f"contrast_{i + 1}" for i in range(d_mat.shape[1])
]
# Get the point estimates
estimates = c_vec @ d_mat
# the standard erroris
se = np.sqrt(np.diag(d_mat.T @ v_mat @ d_mat))
# the z-statistics
z = norm.ppf(1-alpha/2)
# the confidence_intervals
return pd.DataFrame({
"contrast": contrast_names,
"estimate": estimates,
"se": se,
"lower": estimates - z * se,
"upper": estimates + z * se,
})