Source code for stats_misc.intervals

'''
Includes a mixture of functions to estimates intervals from a collection of
data. Type of intervals include confidence intervals, as well as
prediction intervals.
'''

import numpy as np
import warnings
from typing import Any, List, Type, Union, Tuple, Optional, Dict
from scipy.stats import (
    norm,
    binom,
    beta,
)
from stats_misc.constants import (
    is_type,
    NamesIntervals as NamesI,
    Error_MSG,
)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class IntervalResults(object): ''' A results objects containing interval estimates. Attributes ---------- interval_values : `tuple` [`float`] The lower and upper bounds of the interval. point_estimate : `float` The estimate with the highest data-support. standard_errors : `float` The estimated population variability of the point estimate. coverage : `float` The interval coverage. ''' SET_ARGS = [ NamesI.VALUES, NamesI.POINT, NamesI.SE, NamesI.COVERAGE, ] # Initiation the class def __init__(self, **kwargs): """ Initialise """ for k in kwargs.keys(): if k not in self.__class__.SET_ARGS: raise AttributeError("unrecognised argument '{0}'".format(k)) # Loops over `SET_ARGS`, assigns the kwargs content to name `s`. # if argument is missing in kwargs, print a warning. for s in self.__class__.SET_ARGS: try: setattr(self, s, kwargs[s]) except KeyError: warnings.warn("argument '{0}' is set to 'None'".format(s)) setattr(self, s, None) # ///////////////////////////////////////////////////////////////////////// def __str__(self): return f"An `Interval` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class QuantilesIntervalResults(IntervalResults): ''' The results class for `univariable_quantiles_exact`. Attributes ---------- interval_indices : `list` [`int`] The list indices of the interval lower and upper bounds. interval_values : `list` [`float`] The lower and upper bounds of the interval. point_estimate : `float` The estimate with the highest data-support. coverage : `float` The interval coverage. matrix_coverage : `numpy.array` A 2d array of coverage probabilities. matrix_columns : `list` [`float`] The column names of `matrix_coverage` representing the indices of the upper limit. matrix_rows : `list` [`float`] The row names of `matrix_coverage` representing the indices of the lower limit. ''' SET_ARGS = [ NamesI.INDICES, NamesI.VALUES, NamesI.POINT, NamesI.COVERAGE, NamesI.MATRIX_COVERAGE, NamesI.MATRIX_COLUMNS, NamesI.MATRIX_ROWS, ] # ///////////////////////////////////////////////////////////////////////// def __str__(self): return f"A `Quantiles interval` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def univariable_quantiles_exact(values:List[float], quantile:float, alpha:float=0.05, window:int=2, ) -> QuantilesIntervalResults: ''' Estimates the locations of the 100(1-alpha)% lower bound and upper bound for any desired quantile, and will return the `values` of these lower and upper bounds. Parameters ---------- values : `list` [`float`] A list of floats representing the available data. quantile : `float` A float between 0 and 1, e.g., use 0.5 for the median, 0.25 for quartile 1, or 0.025 for the 2.5% percentile. alpha : `float`, default 0.05 The 100(1-alpha)% confidence level. Returns ------- results : `QuantilesIntervalResults` An interval class results object, including the interval indices, bounds, and exact coverage. Notes ----- The algorithm will attempt to find a near symmetric confidence interval, which is guaranteed to have a coverage equal or larger than requested. Internally this uses the binomial function to calculate the probability of a value being smaller or equal than the requested quantile, hence resulting in exact confidence interval limits, irrespective of the underlying distribution. These intervals will typically be larger than intervals based on (semi)-parametric solutions. The codes is based on the `stackexchange <https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile>`_ answer by `whuber`. Which in turn is based on Chapter five of this `book <https://www.wiley.com/en-us/Statistical+Intervals%3A+A+Guide+for+Practitioners+and+Researchers%2C+2nd+Edition-p-9780471687177>`_ by Meeker, Hahn, and Escobar. ''' # ### test input is_type(values, list, 'values') is_type(quantile, float, 'quantile') is_type(alpha, float, 'quantile') is_type(window, (int, float), 'window') # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) # ### calculate the interval n = len(values) # ensure the values are sorted values = values.copy() values.sort() # get the quantile - only needed to for the results objects, not for calcs point = np.quantile(values, quantile) # calculate the interval positions upper_temp = int(binom.ppf(1-alpha/2, n, quantile)) lower_temp = int(binom.ppf(alpha/2, n, quantile)) # adding integer values around the true positions to form a search_grid w_ints = list(range(0,window+1)) w_ints = [-1*w for w in w_ints[1:]] + w_ints upper = [upper_temp + w for w in w_ints] lower = [lower_temp + w for w in w_ints] # check if these values are plausible, otherwise replace by inf upper = [np.inf if u > n else u for u in upper] lower = [np.inf if l < 0 else l for l in lower] # finding the coverage probabilities coverage = np.empty((len(lower), len(upper))) for i, a in enumerate(lower): for j, b in enumerate(upper): coverage[i, j] = \ binom.cdf(b - 1, n, quantile) - binom.cdf(a - 1, n, quantile) # the lower and upper bounds with a coverage equal to or larger than requested coverage_diff = (coverage - (1-alpha)) coverage_exact = np.min(coverage[coverage_diff > 0]) row, col = np.where(coverage_exact == coverage) # if there are multiple cols, simply select the first entry, which is # the smaller value, hence resulting in a smaller CI limit_u = [upper[k] for k in col][0] limit_l = [lower[k] for k in row][0] limit_indices = [limit_l, limit_u] if any(np.isinf(x) for x in limit_indices): raise TypeError('`limit_indices` contains `inf`, please use different ' 'combinations of the alpha and quantile arguments.') # get the actual confidence interval values try: confidence_interval = [values[t] for t in limit_indices] except IndexError: raise IndexError('Object values has indices: `{}`, while the ' 'confidence interval limit incides are `{}`. ' 'Please request a different alpha, or quantile.'.\ format([0,len(values)-1], limit_indices) ) # return return QuantilesIntervalResults(**{ NamesI.VALUES : confidence_interval, NamesI.POINT : point, NamesI.INDICES : limit_indices, NamesI.COVERAGE : coverage_exact, NamesI.MATRIX_COVERAGE : coverage, NamesI.MATRIX_COLUMNS : upper, NamesI.MATRIX_ROWS : lower, })
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def univariable_poisson_standard_normal(counts:List[int], alpha:float=0.05, ) -> IntervalResults: ''' Calculates Poisson confidence intervals based on the standard normal approximation to the Poisson distribution. Parameters ---------- counts : `list` [`int`] A list of counts. alpha : `float` The (1-alpha/2)% confidence interval coverage. Returns --------- results : `IntervalResults` A results class returning the lower and upper confidence interval bounds. ''' # Test input is_type(counts, list, 'counts') is_type(alpha, float, 'alpha') # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) # Get constants xbar = np.mean(counts) xsd = np.sqrt(xbar/len(counts)) z = norm.ppf(1-alpha/2) # get the confidence intervals lb = xbar-xsd*z ub = xbar+xsd*z # return return IntervalResults(**{ NamesI.VALUES : (lb, ub), NamesI.POINT : xbar, NamesI.SE : xsd, NamesI.COVERAGE : 1-alpha, })
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def wald_confidence_interval(point:float, se:float, alpha:float=0.05, ) -> IntervalResults: ''' Calculates a (1-a/2)*100 confidence interval based on the quantile from a normal distribution. Parameters ---------- point : `float` The point estimate se : `float` The standard error of the point estimate alpha: `float` The (1-alpha/2)% confidence interval coverage. Returns ------- results : `IntervalResults` A results class returning the lower and upper confidence interval bounds. ''' # test input is_type(point, (float, int), 'point') is_type(se, (float, int), 'se') is_type(alpha, float, 'alpha') # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) # getting z = norm.ppf(1-alpha/2) # lower and upper lb, ub = point - se * z, point + se * z # return return IntervalResults(**{ NamesI.VALUES : (lb, ub), NamesI.POINT : point, NamesI.SE : se, NamesI.COVERAGE : 1-alpha, })
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def beta_confidence_interval(proportion:float, total_sample:int, alpha:float=0.05, integer:bool=True, ) -> IntervalResults: """ Better confidence intervals for proportions, based on a classic contribution by SA Julious [1]_. Parameters ---------- proportion : `float` A proportion between 0 and 1. total_sample : `int` The total sample size the proportion was derived from. alpha : `float`, default 0.05 A float between 0 and 1, representing the type 1 error rate. Is used to define the confidence interval coverage: (1-alpha)*100. integer : `bool`, default True whether the function should fail when the proportion times total_sample does not result in an integer number of events. Set to `False` to ignore the ValueError, which will replace the produt with its closest integer value. Returns ------- results : `IntervalResults` A results class returning the lower and upper confidence interval bounds. References ---------- .. [1] Steven A Julious. Two-sided confidence intervals for the single proportion: comparison of seven methods by Robert G. Newcombe, Statistics in Medicine 1998; 17:857-872 """ # check input is_type(proportion, (float, int), 'proportion') is_type(total_sample, int, 'total_sample') is_type(alpha, float, 'alpha') is_type(integer, bool, 'integer') # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) # get the number of events no_events = proportion * total_sample if (not no_events.is_integer()) and (integer==True): raise ValueError("The product of proportion times total_sample is not " "an integer, either correct the input or set `integer`" "to `False`. The current input: proportion " f"`{proportion}`, total_sample `{total_sample}`, and " f"their product `{no_events}`.") elif not no_events.is_integer(): no_events = round(no_events) # lower bound lb = 1-beta.ppf(1-alpha/2, total_sample - no_events + 1, no_events) # upper bound ub = beta.ppf(1-alpha/2, no_events + 1, total_sample - no_events) # suppressed the warnings because SE is missing. warnings.filterwarnings("ignore", category=UserWarning) results = IntervalResults(**{ NamesI.VALUES : (lb, ub), NamesI.POINT : proportion, NamesI.COVERAGE : 1-alpha, }) warnings.resetwarnings() # returns return results