Source code for stats_misc.intervals

"""
Interval estimation tools for confidence and prediction intervals.

This module provides functions to compute interval estimates from empirical
data, including exact and approximate confidence intervals and prediction
intervals. It supports methods based on the normal approximation,
binomial distribution, beta distribution, and quantile-based approaches.

Classes
-------
IntervalResults
    A results container for interval estimates.

QuantilesIntervalResults
    A specialised results class for quantile-based confidence intervals,
    including index tracking and coverage matrices.

Functions
---------
univariable_quantiles_exact(values, quantile, alpha=0.05, window=2)
    Computes exact nonparametric confidence intervals for an arbitrary quantile.

univariable_poisson_standard_normal(counts, alpha=0.05)
    Computes Poisson confidence intervals for count data based on a normal
    approximation.

wald_confidence_interval(point, se, alpha=0.05)
    Computes confidence intervals based on the normal distribution.

beta_confidence_interval(proportion, total_sample, alpha=0.05, integer=True)
    Computes confidence intervals for proportions based on the beta
    distribution.

"""

import numpy as np
import pandas as pd
import warnings
from scipy.stats import (
    norm,
    binom,
    beta,
)
from stats_misc.constants import (
    NamesIntervals as NamesI,
    CLASS_NAME,
)
from stats_misc.errors import (
    is_type,
    same_len,
    check_limits,
)
from stats_misc.utils.helpers import (
    Results,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class IntervalResults(Results): """ A results objects containing interval estimates. Attributes ---------- interval_values : `tuple` [`float`, `float`] The lower and upper bounds of the interval. point_estimate : `float` The estimate with the highest data-support. standard_error : `float` The estimated population variability of the point estimate. coverage : `float` The interval coverage. """ SET_ARGS = [ CLASS_NAME, NamesI.POINT, NamesI.COVERAGE, NamesI.VALUES, NamesI.SE, ] def __init__(self, **kwargs) -> None: super().__init__(set_args=self.SET_ARGS, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class QuantilesIntervalResults(IntervalResults): """ The results class for `univariable_quantiles_exact`. Attributes ---------- interval_indices : `list` [`int`] The list indices of the interval lower and upper bounds. interval_values : `list` [`float`] The lower and upper bounds of the interval. point_estimate : `float` The estimate with the highest data-support. coverage : `float` The interval coverage. matrix_coverage : `numpy.ndarray` A 2D array of coverage probabilities. matrix_columns : `list` [`float`] The column names of `matrix_coverage` representing the indices of the upper limit. matrix_rows : `list` [`float`] The row names of `matrix_coverage` representing the indices of the lower limit. """ SET_ARGS = [ CLASS_NAME, NamesI.POINT, NamesI.COVERAGE, NamesI.VALUES, NamesI.INDICES, NamesI.MATRIX_COVERAGE, NamesI.MATRIX_COLUMNS, NamesI.MATRIX_ROWS, ]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def univariable_quantiles_exact(values:list[float], quantile:float, alpha:float=0.05, window:int=2, ) -> QuantilesIntervalResults: """ Compute an exact confidence interval for a univariate quantile. Estimates the lower and upper bounds of a 100(1 - alpha)% confidence interval around a specified quantile of the input data. The function returns both the values and indices of the bounds, as well as the exact coverage probability. Parameters ---------- values : `list` [`float`] A list of floats representing the available data. quantile : `float` The target quantile (between 0 and 1); for example, 0.5 for the median, 0.25 for the first quartile, or 0.025 for the 2.5th percentile. alpha : `float`, default 0.05 The 100(1-alpha)% confidence level. window : `int`, default 2 The number of points allowed around the exact quantile index when searching for valid intervals. Returns ------- QuantilesIntervalResults An object containing the interval indices, interval bounds, the point estimate, and the exact coverage probability. Notes ----- This method uses the binomial distribution to derive the exact probability of a value being less than or equal to the desired quantile. It does not make assumptions about the underlying distribution of the data and thus produces nonparametric, exact confidence intervals. The returned interval is guaranteed to have coverage greater than or equal to the nominal level. The code is based on the `stackexchange <https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile>`_ answer by `whuber`. Which in turn is based on Chapter 5 of Meeker, Hahn, and Escobar's book [1]_. References ---------- .. [1] Meeker, W.Q., Hahn, G.J., & Escobar, L.A. (2017). *Statistical Intervals: A Guide for Practitioners and Researchers* (2nd ed.). Wiley. """ # ### test input is_type(values, list, 'values') is_type(quantile, float, 'quantile') is_type(alpha, float, 'quantile') is_type(window, (int, float), 'window') # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) # ### calculate the interval n = len(values) # ensure the values are sorted values = values.copy() values.sort() # get the quantile - only needed to for the results objects, not for calcs point = np.quantile(values, quantile) # calculate the interval positions upper_temp = int(binom.ppf(1-alpha/2, n, quantile)) lower_temp = int(binom.ppf(alpha/2, n, quantile)) # adding integer values around the true positions to form a search_grid w_ints = list(range(0,window+1)) w_ints = [-1*w for w in w_ints[1:]] + w_ints upper = [upper_temp + w for w in w_ints] lower = [lower_temp + w for w in w_ints] # check if these values are plausible, otherwise replace by inf upper = [np.inf if u > n else u for u in upper] lower = [np.inf if l < 0 else l for l in lower] # finding the coverage probabilities coverage = np.empty((len(lower), len(upper))) for i, a in enumerate(lower): for j, b in enumerate(upper): coverage[i, j] = \ binom.cdf(b - 1, n, quantile) - binom.cdf(a - 1, n, quantile) # the lower and upper bounds with a coverage equal to or larger than requested coverage_diff = (coverage - (1-alpha)) coverage_exact = np.min(coverage[coverage_diff > 0]) row, col = np.where(coverage_exact == coverage) # if there are multiple cols, simply select the first entry, which is # the smaller value, hence resulting in a smaller CI limit_u = [upper[k] for k in col][0] limit_l = [lower[k] for k in row][0] limit_indices = [limit_l, limit_u] if any(np.isinf(x) for x in limit_indices): raise TypeError('`limit_indices` contains `inf`, please use different ' 'combinations of the alpha and quantile arguments.') # get the actual confidence interval values try: confidence_interval = [values[t] for t in limit_indices] except IndexError: raise IndexError('Object values has indices: `{}`, while the ' 'confidence interval limit incides are `{}`. ' 'Please request a different alpha, or quantile.'.\ format([0,len(values)-1], limit_indices) ) # return return QuantilesIntervalResults(**{ CLASS_NAME : 'univariable_quantiles_exact', NamesI.VALUES : confidence_interval, NamesI.POINT : point, NamesI.INDICES : limit_indices, NamesI.COVERAGE : coverage_exact, NamesI.MATRIX_COVERAGE : coverage, NamesI.MATRIX_COLUMNS : upper, NamesI.MATRIX_ROWS : lower, })
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def univariable_poisson_standard_normal(counts:list[int], alpha:float=0.05, ) -> IntervalResults: """ Compute Poisson confidence intervals based on the standard normal approximation to the Poisson distribution. Parameters ---------- counts : `list` [`int`] A list of counts. alpha : `float`, default 0.05 The 100(1-alpha)% confidence level. Returns ------- IntervalResults A results object containing the lower and upper confidence bounds, the point estimate, and the interval coverage. """ # Test input is_type(counts, list, 'counts') is_type(alpha, float, 'alpha') # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) # Get constants xbar = np.mean(counts) xsd = np.sqrt(xbar/len(counts)) z = norm.ppf(1-alpha/2) # get the confidence intervals lb = xbar-xsd*z ub = xbar+xsd*z # return return IntervalResults(**{ CLASS_NAME : 'univariable_poisson_standard_normal', NamesI.VALUES : (lb, ub), NamesI.POINT : xbar, NamesI.SE : xsd, NamesI.COVERAGE : 1-alpha, })
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def wald_confidence_interval(point:float, se:float, alpha:float=0.05, ) -> IntervalResults: """ Compute a 100(1-alpha)% confidence interval using the normal distribution. Parameters ---------- point : `float` The point estimate. se : `float` The standard error of the point estimate. alpha : `float`, default 0.05 The 100(1-alpha)% confidence level. Returns ------- IntervalResults A results object containing the lower and upper confidence bounds, the point estimate, and the interval coverage. """ # test input is_type(point, (float, int), 'point') is_type(se, (float, int), 'se') is_type(alpha, float, 'alpha') # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) # getting z = norm.ppf(1-alpha/2) # lower and upper lb, ub = point - se * z, point + se * z # return return IntervalResults(**{ CLASS_NAME : 'wald_confidence_interval', NamesI.VALUES : (lb, ub), NamesI.POINT : point, NamesI.SE : se, NamesI.COVERAGE : 1-alpha, })
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def beta_confidence_interval(proportion:float, total_sample:int, alpha:float=0.05, integer:bool=True, ) -> IntervalResults: """ Compute beta confidence intervals for proportions, based on a classic contribution by SA Julious [1]_. Parameters ---------- proportion : `float` A proportion between 0 and 1. total_sample : `int` The total sample size from which the proportion was derived. alpha : `float`, default 0.05 The 100(1-alpha)% confidence level. integer : `bool`, default True Whether the function should raise an error if the product of `proportion * total_sample` is not an integer. If set to `False`, the function will issue a warning and round the product to the nearest integer. Returns ------- IntervalResults A results class returning the lower and upper confidence interval bounds. References ---------- .. [1] Steven A Julious. Two-sided confidence intervals for the single proportion: comparison of seven methods by Robert G. Newcombe, Statistics in Medicine 1998; 17:857-872. """ # check input is_type(proportion, (float, int), 'proportion') is_type(total_sample, int, 'total_sample') is_type(alpha, float, 'alpha') is_type(integer, bool, 'integer') # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) # get the number of events no_events = proportion * total_sample if (not no_events.is_integer()) and (integer==True): raise ValueError( "The product of `proportion` and `total_sample` is not an integer. " "Either correct the input or set `integer=False` to allow rounding. " f"Current input: proportion = {proportion}, total_sample = " f"{total_sample}, product = {no_events}." ) elif not no_events.is_integer(): no_events = round(no_events) # lower bound lb = 1-beta.ppf(1-alpha/2, total_sample - no_events + 1, no_events) # upper bound ub = beta.ppf(1-alpha/2, no_events + 1, total_sample - no_events) # suppressed the warnings because SE is missing. warnings.filterwarnings("ignore", category=UserWarning) results = IntervalResults(**{ CLASS_NAME : 'beta_confidence_interval', NamesI.VALUES : (lb, ub), NamesI.POINT : proportion, NamesI.COVERAGE : 1-alpha, }) warnings.resetwarnings() # returns return results
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def linear_contrast( d_mat: np.ndarray, v_mat: np.ndarray, c_vec: np.ndarray, contrast_names: list[str] | None = None, alpha: float = 0.05, ) -> pd.DataFrame: """ Compute exact variance propagation for linear contrasts of regression coefficients. For any linear contrast L = D' * beta, the variance is: Var(L) = D' * V * D This is exact and holds for any regression model with a coefficient vector and variance-covariance matrix. Parameters ---------- d_mat : `array_like`, shape (n_coef, n_contrasts) Contrast matrix. Each column defines one contrast, each row corresponds to one coefficient. Set a row to zero to exclude that coefficient from the contrast. v_mat : `array_like`, shape (n_coef, n_coef) Variance-covariance matrix of the coefficients. c_vec : `array_like`, shape (n_coef,) Point estimates of the model coefficients. contrast_names : `list` [`str`], default `None` Labels for each contrast. Defaults to ["contrast_1", "contrast_2", ...]. alpha : `float`, default 0.05 The 100(1-alpha)% confidence level. Returns ------- pd.DataFrame One row per contrast with columns: contrast, estimate, se, lower, upper. All values are on the linear predictor scale; any back-transformation is the caller's responsibility. Raises ------ ValueError If dimensions of d_mat, v_mat, c_vec are inconsistent, alpha is outside (0, 1), or contrast_names length does not match ncol(d_mat). Notes ----- **Generality across model types** Works for OLS, GLM, Cox, mixed-effects, penalised regression, or any model yielding a coefficient vector and covariance matrix. For spline or polynomial terms, populate the corresponding rows of d_mat with the evaluated basis values at the target point (e.g. from a fitted spline object). **Link scale** For models with a nonlinear link (logistic, Poisson, Cox), estimates are returned on the link scale. Back-transform and propagate uncertainty externally as needed. **Interactions and piecewise equations** Include the product (or indicator) terms explicitly in d_mat. For example, for treatment effect in females under a treatment * gender interaction, set both the treatment row and the interaction row to 1. Examples -------- Linear model: y ~ gender * treatment Coefficients: intercept, gender_female, treatment, treatment:gender_female Treatment effect in males: [0, 0, 1, 0] Treatment effect in females: [0, 0, 1, 1] >>> d_mat = np.array([[0, 0], ... [0, 0], ... [1, 1], ... [0, 1]]) >>> c_vec = np.array([5.0, 1.2, 2.3, 0.8]) >>> v_mat = np.eye(4) * 0.1 >>> linear_contrast(d_mat, v_mat, c_vec, ... contrast_names=["males", "females"]) """ # ### confirm input and map to array - e.g. if list of list d_mat = np.asarray(d_mat, dtype=float) v_mat = np.asarray(v_mat, dtype=float) # ravel enusre the coef vec is a 1d matrix c_vec = np.asarray(c_vec, dtype=float).ravel() # reshape if needed if d_mat.ndim == 1: d_mat = d_mat.reshape(-1, 1) # confirm length same_len(c_vec, d_mat, ['c_vec', 'd_mat']) same_len(c_vec, v_mat, ['c_vec', 'v_mat']) same_len(v_mat, v_mat.T, ['rows v_mat', 'columns v_mat']) # check alpha is_type(alpha, float) # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) if contrast_names is not None: same_len(contrast_names, d_mat.T, ['contrast_names', 'columns d_mat']) else: contrast_names = [ f"contrast_{i + 1}" for i in range(d_mat.shape[1]) ] # Get the point estimates estimates = c_vec @ d_mat # the standard erroris se = np.sqrt(np.diag(d_mat.T @ v_mat @ d_mat)) # the z-statistics z = norm.ppf(1-alpha/2) # the confidence_intervals return pd.DataFrame({ "contrast": contrast_names, "estimate": estimates, "se": se, "lower": estimates - z * se, "upper": estimates + z * se, })