Source code for stats_misc.intervals

"""
Interval estimation tools for confidence and prediction intervals.

This module provides functions to compute interval estimates from empirical
data, including exact and approximate confidence intervals and prediction
intervals. It supports methods based on the normal approximation,
binomial distribution, beta distribution, and quantile-based approaches.

Classes
-------
IntervalResults
    A results container for interval estimates.

QuantilesIntervalResults
    A specialised results class for quantile-based confidence intervals,
    including index tracking and coverage matrices.

Functions
---------
univariable_quantiles_exact(values, quantile, alpha=0.05, window=2)
    Computes exact nonparametric confidence intervals for an arbitrary quantile.

univariable_poisson_standard_normal(counts, alpha=0.05)
    Computes Poisson confidence intervals for count data based on a normal
    approximation.

wald_confidence_interval(point, se, alpha=0.05)
    Computes confidence intervals based on the normal distribution.

beta_confidence_interval(proportion, total_sample, alpha=0.05, integer=True)
    Computes confidence intervals for proportions based on the beta
    distribution.

"""

import numpy as np
import pandas as pd
import warnings
from scipy.stats import (
    norm,
    binom,
    beta,
)
from stats_misc.constants import (
    NamesIntervals as NamesI,
    CLASS_NAME,
)
from stats_misc.errors import (
    is_type,
    same_len,
    check_limits,
)
from stats_misc.utils.helpers import (
    Results,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
class IntervalResults(Results):
    """
    A results objects containing interval estimates.
    
    Attributes
    ----------
    interval_values : `tuple` [`float`, `float`]
        The lower and upper bounds of the interval.
    point_estimate : `float`
        The estimate with the highest data-support.
    standard_error : `float`
        The estimated population variability of the point estimate.
    coverage : `float`
        The interval coverage.
    """
    SET_ARGS = [
        CLASS_NAME,
        NamesI.POINT,
        NamesI.COVERAGE,
        NamesI.VALUES,
        NamesI.SE,
    ]
    def __init__(self, **kwargs) -> None:
        super().__init__(set_args=self.SET_ARGS, **kwargs)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
class QuantilesIntervalResults(IntervalResults):
    """
    The results class for `univariable_quantiles_exact`.
    
    Attributes
    ----------
    interval_indices : `list` [`int`]
        The list indices of the interval lower and upper bounds.
    interval_values : `list` [`float`]
        The lower and upper bounds of the interval.
    point_estimate : `float`
        The estimate with the highest data-support.
    coverage : `float`
        The interval coverage.
    matrix_coverage : `numpy.ndarray`
        A 2D array of coverage probabilities.
    matrix_columns : `list` [`float`]
        The column names of `matrix_coverage` representing the indices of
        the upper limit.
    matrix_rows : `list` [`float`]
        The row names of `matrix_coverage` representing the indices of
        the lower limit.
    """
    SET_ARGS = [
        CLASS_NAME,
        NamesI.POINT,
        NamesI.COVERAGE,
        NamesI.VALUES,
        NamesI.INDICES,
        NamesI.MATRIX_COVERAGE,
        NamesI.MATRIX_COLUMNS,
        NamesI.MATRIX_ROWS,
    ]


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def univariable_quantiles_exact(values:list[float], quantile:float,
                                alpha:float=0.05, window:int=2,
                                ) -> QuantilesIntervalResults:
    """
    Compute an exact confidence interval for a univariate quantile.
    
    Estimates the lower and upper bounds of a 100(1 - alpha)% confidence
    interval around a specified quantile of the input data. The function
    returns both the values and indices of the bounds, as well as the
    exact coverage probability.
    
    Parameters
    ----------
    values : `list` [`float`]
        A list of floats representing the available data.
    quantile : `float`
        The target quantile (between 0 and 1); for example,
        0.5 for the median, 0.25 for the first quartile, or 0.025
        for the 2.5th percentile.
    alpha : `float`, default 0.05
        The 100(1-alpha)% confidence level.
    window : `int`, default 2
        The number of points allowed around the exact quantile
        index when searching for valid intervals.
    
    Returns
    -------
    QuantilesIntervalResults
        An object containing the interval indices, interval bounds, the
        point estimate, and the exact coverage probability.
    
    Notes
    -----
    This method uses the binomial distribution to derive the exact probability
    of a value being less than or equal to the desired quantile. It does not
    make assumptions about the underlying distribution of the data and thus
    produces nonparametric, exact confidence intervals. The returned interval
    is guaranteed to have coverage greater than or equal to the nominal level.
    
    The code is based on the
    `stackexchange <https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile>`_ answer by `whuber`.
    Which in turn is based on Chapter 5 of Meeker, Hahn, and Escobar's
    book [1]_.
    
    References
    ----------
    .. [1] Meeker, W.Q., Hahn, G.J., & Escobar, L.A. (2017). *Statistical Intervals:
           A Guide for Practitioners and Researchers* (2nd ed.). Wiley.
    """
    
    # ### test input
    is_type(values, list, 'values')
    is_type(quantile, float, 'quantile')
    is_type(alpha, float, 'quantile')
    is_type(window, (int, float), 'window')
    # is alpha between 0 and 1
    check_limits(alpha, 0.0, 1.0)
    # ### calculate the interval
    n = len(values)
    # ensure the values are sorted
    values = values.copy()
    values.sort()
    # get the quantile - only needed to for the results objects, not for calcs
    point = np.quantile(values, quantile)
    # calculate the interval positions
    upper_temp = int(binom.ppf(1-alpha/2, n, quantile))
    lower_temp = int(binom.ppf(alpha/2, n, quantile))
    # adding integer values around the true positions to form a search_grid
    w_ints = list(range(0,window+1))
    w_ints = [-1*w for w in w_ints[1:]] + w_ints
    upper = [upper_temp + w for w in w_ints]
    lower = [lower_temp + w for w in w_ints]
    # check if these values are plausible, otherwise replace by inf
    upper = [np.inf if u > n else u for u in upper]
    lower = [np.inf if l < 0 else l for l in lower]
    # finding the coverage probabilities
    coverage = np.empty((len(lower), len(upper)))
    for i, a in enumerate(lower):
        for j, b in enumerate(upper):
            coverage[i, j] = \
                binom.cdf(b - 1, n, quantile) - binom.cdf(a - 1, n, quantile)
    # the lower and upper bounds with a coverage equal to or larger than requested
    coverage_diff = (coverage - (1-alpha))
    coverage_exact = np.min(coverage[coverage_diff > 0])
    row, col = np.where(coverage_exact == coverage)
    # if there are multiple cols, simply select the first entry, which is
    # the smaller value, hence resulting in a smaller CI
    limit_u = [upper[k] for k in col][0]
    limit_l = [lower[k] for k in row][0]
    limit_indices = [limit_l, limit_u]
    if any(np.isinf(x) for x in limit_indices):
        raise TypeError('`limit_indices` contains `inf`, please use different '
                        'combinations of the alpha and quantile arguments.')
    # get the actual confidence interval values
    try:
        confidence_interval = [values[t] for t in limit_indices]
    except IndexError:
        raise IndexError('Object values has indices: `{}`, while the '
                         'confidence interval limit incides are `{}`. '
                         'Please request a different alpha, or quantile.'.\
                         format([0,len(values)-1], limit_indices)
                         )
    # return
    return QuantilesIntervalResults(**{
        CLASS_NAME             : 'univariable_quantiles_exact',
        NamesI.VALUES          : confidence_interval,
        NamesI.POINT           : point,
        NamesI.INDICES         : limit_indices,
        NamesI.COVERAGE        : coverage_exact,
        NamesI.MATRIX_COVERAGE : coverage,
        NamesI.MATRIX_COLUMNS  : upper,
        NamesI.MATRIX_ROWS     : lower,
    })


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def univariable_poisson_standard_normal(counts:list[int], alpha:float=0.05,
                                        ) -> IntervalResults:
    """
    Compute Poisson confidence intervals based on the standard normal
    approximation to the Poisson distribution.
    
    Parameters
    ----------
    counts : `list` [`int`]
        A list of counts.
    alpha : `float`, default 0.05
        The 100(1-alpha)% confidence level.
    
    Returns
    -------
    IntervalResults
        A results object containing the lower and upper confidence bounds,
        the point estimate, and the interval coverage.
    """
    
    # Test input
    is_type(counts, list, 'counts')
    is_type(alpha, float, 'alpha')
    # is alpha between 0 and 1
    check_limits(alpha, 0.0, 1.0)
    # Get constants
    xbar = np.mean(counts)
    xsd = np.sqrt(xbar/len(counts))
    z = norm.ppf(1-alpha/2)
    # get the confidence intervals
    lb = xbar-xsd*z
    ub = xbar+xsd*z
    # return
    return IntervalResults(**{
        CLASS_NAME             : 'univariable_poisson_standard_normal',
        NamesI.VALUES          : (lb, ub),
        NamesI.POINT           : xbar,
        NamesI.SE              : xsd,
        NamesI.COVERAGE        : 1-alpha,
    })


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def wald_confidence_interval(point:float, se:float,
                             alpha:float=0.05,
                             ) -> IntervalResults:
    """
    Compute a 100(1-alpha)% confidence interval using the normal distribution.
    
    Parameters
    ----------
    point : `float`
        The point estimate.
    se : `float`
        The standard error of the point estimate.
    alpha : `float`, default 0.05
        The 100(1-alpha)% confidence level.
    
    Returns
    -------
    IntervalResults
        A results object containing the lower and upper confidence bounds,
        the point estimate, and the interval coverage.
    """
    # test input
    is_type(point, (float, int), 'point')
    is_type(se, (float, int), 'se')
    is_type(alpha, float, 'alpha')
    # is alpha between 0 and 1
    check_limits(alpha, 0.0, 1.0)
    # getting
    z = norm.ppf(1-alpha/2)
    # lower and upper
    lb, ub = point - se * z, point + se * z
    # return
    return IntervalResults(**{
        CLASS_NAME             : 'wald_confidence_interval',
        NamesI.VALUES          : (lb, ub),
        NamesI.POINT           : point,
        NamesI.SE              : se,
        NamesI.COVERAGE        : 1-alpha,
    })


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def beta_confidence_interval(proportion:float, total_sample:int,
                             alpha:float=0.05,
                             integer:bool=True,
                             ) -> IntervalResults:
    """
    Compute beta confidence intervals for proportions, based on a classic
    contribution by SA Julious [1]_.
    
    Parameters
    ----------
    proportion : `float`
        A proportion between 0 and 1.
    total_sample : `int`
        The total sample size from which the proportion was derived.
    alpha : `float`, default 0.05
        The 100(1-alpha)% confidence level.
    integer : `bool`, default True
        Whether the function should raise an error if the product of
        `proportion * total_sample` is not an integer. If set to `False`,
        the function will issue a warning and round the product to the
        nearest integer.
    
    Returns
    -------
    IntervalResults
        A results class returning the lower and upper confidence interval
        bounds.
    
    References
    ----------
    .. [1] Steven A Julious. Two-sided confidence intervals for the single
           proportion: comparison of seven methods by Robert G. Newcombe,
           Statistics in Medicine 1998; 17:857-872.
    """
    # check input
    is_type(proportion, (float, int), 'proportion')
    is_type(total_sample, int, 'total_sample')
    is_type(alpha, float, 'alpha')
    is_type(integer, bool, 'integer')
    # is alpha between 0 and 1
    check_limits(alpha, 0.0, 1.0)
    # get the number of events
    no_events = proportion * total_sample
    if (not no_events.is_integer()) and (integer==True):
        raise ValueError(
            "The product of `proportion` and `total_sample` is not an integer. "
            "Either correct the input or set `integer=False` to allow rounding. "
            f"Current input: proportion = {proportion}, total_sample = "
            f"{total_sample}, product = {no_events}."
        )
    elif not no_events.is_integer():
        no_events = round(no_events)
    # lower bound
    lb = 1-beta.ppf(1-alpha/2, total_sample - no_events + 1, no_events)
    # upper bound
    ub = beta.ppf(1-alpha/2, no_events + 1, total_sample - no_events)
    # suppressed the warnings because SE is missing.
    warnings.filterwarnings("ignore", category=UserWarning)
    results = IntervalResults(**{
        CLASS_NAME             : 'beta_confidence_interval',
        NamesI.VALUES          : (lb, ub),
        NamesI.POINT           : proportion,
        NamesI.COVERAGE        : 1-alpha,
    })
    warnings.resetwarnings()
    # returns
    return results


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def linear_contrast(
    d_mat: np.ndarray,
    v_mat: np.ndarray,
    c_vec: np.ndarray,
    contrast_names: list[str] | None = None,
    alpha: float = 0.05,
    ) -> pd.DataFrame:
    """
    Compute exact variance propagation for linear contrasts of regression
    coefficients.
    
    For any linear contrast L = D' * beta, the variance is:
    
        Var(L) = D' * V * D
    
    This is exact and holds for any regression model with a
    coefficient vector and variance-covariance matrix.
    
    Parameters
    ----------
    d_mat : `array_like`, shape (n_coef, n_contrasts)
        Contrast matrix. Each column defines one contrast, each
        row corresponds to one coefficient. Set a row to zero to
        exclude that coefficient from the contrast.
    v_mat : `array_like`, shape (n_coef, n_coef)
        Variance-covariance matrix of the coefficients.
    c_vec : `array_like`, shape (n_coef,)
        Point estimates of the model coefficients.
    contrast_names : `list` [`str`], default `None`
        Labels for each contrast. Defaults to
        ["contrast_1", "contrast_2", ...].
    alpha : `float`, default 0.05
        The 100(1-alpha)% confidence level.
    
    Returns
    -------
    pd.DataFrame
        One row per contrast with columns: contrast, estimate,
        se, lower, upper. All values are on the linear predictor
        scale; any back-transformation is the caller's
        responsibility.
    
    Raises
    ------
    ValueError
        If dimensions of d_mat, v_mat, c_vec are inconsistent,
        alpha is outside (0, 1), or contrast_names length
        does not match ncol(d_mat).
    
    Notes
    -----
    **Generality across model types**
    Works for OLS, GLM, Cox, mixed-effects, penalised regression,
    or any model yielding a coefficient vector and covariance
    matrix. For spline or polynomial terms, populate the
    corresponding rows of d_mat with the evaluated basis values
    at the target point (e.g. from a fitted spline object).
    
    **Link scale**
    For models with a nonlinear link (logistic, Poisson, Cox),
    estimates are returned on the link scale. Back-transform and
    propagate uncertainty externally as needed.
    
    **Interactions and piecewise equations**
    Include the product (or indicator) terms explicitly in d_mat.
    For example, for treatment effect in females under a
    treatment * gender interaction, set both the treatment row
    and the interaction row to 1.
    
    Examples
    --------
    Linear model: y ~ gender * treatment
    Coefficients: intercept, gender_female, treatment,
                  treatment:gender_female
    
    Treatment effect in males:  [0, 0, 1, 0]
    Treatment effect in females: [0, 0, 1, 1]
    
    >>> d_mat = np.array([[0, 0],
    ...                   [0, 0],
    ...                   [1, 1],
    ...                   [0, 1]])
    >>> c_vec = np.array([5.0, 1.2, 2.3, 0.8])
    >>> v_mat = np.eye(4) * 0.1
    >>> linear_contrast(d_mat, v_mat, c_vec,
    ...                 contrast_names=["males", "females"])
    """
    # ### confirm input and map to array - e.g. if list of list
    d_mat = np.asarray(d_mat, dtype=float)
    v_mat = np.asarray(v_mat, dtype=float)
    # ravel enusre the coef vec is a 1d matrix
    c_vec = np.asarray(c_vec, dtype=float).ravel()
    # reshape if needed
    if d_mat.ndim == 1:
        d_mat = d_mat.reshape(-1, 1)
    # confirm length
    same_len(c_vec, d_mat, ['c_vec', 'd_mat'])
    same_len(c_vec, v_mat, ['c_vec', 'v_mat'])
    same_len(v_mat, v_mat.T, ['rows v_mat', 'columns v_mat'])
    # check alpha
    is_type(alpha, float)
    # is alpha between 0 and 1
    check_limits(alpha, 0.0, 1.0)
    if contrast_names is not None:
        same_len(contrast_names, d_mat.T, ['contrast_names', 'columns d_mat'])
    else:
        contrast_names = [
            f"contrast_{i + 1}" for i in range(d_mat.shape[1])
        ]
    # Get the point estimates
    estimates = c_vec @ d_mat
    # the standard erroris
    se = np.sqrt(np.diag(d_mat.T @ v_mat @ d_mat))
    # the z-statistics
    z = norm.ppf(1-alpha/2)
    # the confidence_intervals
    return pd.DataFrame({
        "contrast": contrast_names,
        "estimate": estimates,
        "se": se,
        "lower": estimates - z * se,
        "upper": estimates + z * se,
    })