'''
Includes a mixture of functions to estimates intervals from a collection of
data. Type of intervals include confidence intervals, as well as
prediction intervals.
'''
import numpy as np
import warnings
from typing import Any, List, Type, Union, Tuple, Optional, Dict
from scipy.stats import (
norm,
binom,
beta,
)
from stats_misc.constants import (
is_type,
NamesIntervals as NamesI,
Error_MSG,
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class IntervalResults(object):
'''
A results objects containing interval estimates.
Attributes
----------
interval_values : `tuple` [`float`] The lower and upper bounds of the interval.
point_estimate : `float`
The estimate with the highest data-support.
standard_errors : `float`
The estimated population variability of the point estimate.
coverage : `float`
The interval coverage.
'''
SET_ARGS = [
NamesI.VALUES,
NamesI.POINT,
NamesI.SE,
NamesI.COVERAGE,
]
# Initiation the class
def __init__(self, **kwargs):
"""
Initialise
"""
for k in kwargs.keys():
if k not in self.__class__.SET_ARGS:
raise AttributeError("unrecognised argument '{0}'".format(k))
# Loops over `SET_ARGS`, assigns the kwargs content to name `s`.
# if argument is missing in kwargs, print a warning.
for s in self.__class__.SET_ARGS:
try:
setattr(self, s, kwargs[s])
except KeyError:
warnings.warn("argument '{0}' is set to 'None'".format(s))
setattr(self, s, None)
# /////////////////////////////////////////////////////////////////////////
def __str__(self):
return f"An `Interval` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class QuantilesIntervalResults(IntervalResults):
'''
The results class for `univariable_quantiles_exact`.
Attributes
----------
interval_indices : `list` [`int`]
The list indices of the interval lower and upper bounds.
interval_values : `list` [`float`]
The lower and upper bounds of the interval.
point_estimate : `float`
The estimate with the highest data-support.
coverage : `float`
The interval coverage.
matrix_coverage : `numpy.array`
A 2d array of coverage probabilities.
matrix_columns : `list` [`float`]
The column names of `matrix_coverage` representing the indices of
the upper limit.
matrix_rows : `list` [`float`]
The row names of `matrix_coverage` representing the indices of
the lower limit.
'''
SET_ARGS = [
NamesI.INDICES,
NamesI.VALUES,
NamesI.POINT,
NamesI.COVERAGE,
NamesI.MATRIX_COVERAGE,
NamesI.MATRIX_COLUMNS,
NamesI.MATRIX_ROWS,
]
# /////////////////////////////////////////////////////////////////////////
def __str__(self):
return f"A `Quantiles interval` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def univariable_quantiles_exact(values:List[float], quantile:float,
alpha:float=0.05, window:int=2,
) -> QuantilesIntervalResults:
'''
Estimates the locations of the 100(1-alpha)% lower bound and upper bound
for any desired quantile, and will return the `values` of these lower and
upper bounds.
Parameters
----------
values : `list` [`float`]
A list of floats representing the available data.
quantile : `float`
A float between 0 and 1, e.g., use 0.5 for the median, 0.25 for
quartile 1, or 0.025 for the 2.5% percentile.
alpha : `float`, default 0.05
The 100(1-alpha)% confidence level.
Returns
-------
results : `QuantilesIntervalResults`
An interval class results object, including the interval indices,
bounds, and exact coverage.
Notes
-----
The algorithm will attempt to find a near symmetric confidence interval,
which is guaranteed to have a coverage equal or larger than requested.
Internally this uses the binomial function to calculate the probability of
a value being smaller or equal than the requested quantile, hence resulting
in exact confidence interval limits, irrespective of the underlying
distribution. These intervals will typically be larger than intervals based
on (semi)-parametric solutions.
The codes is based on the
`stackexchange <https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile>`_ answer by `whuber`.
Which in turn is based on Chapter five of this
`book <https://www.wiley.com/en-us/Statistical+Intervals%3A+A+Guide+for+Practitioners+and+Researchers%2C+2nd+Edition-p-9780471687177>`_ by Meeker, Hahn, and Escobar.
'''
# ### test input
is_type(values, list, 'values')
is_type(quantile, float, 'quantile')
is_type(alpha, float, 'quantile')
is_type(window, (int, float), 'window')
# is alpha between 0 and 1
if alpha <= 0 or alpha >= 1:
raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
# ### calculate the interval
n = len(values)
# ensure the values are sorted
values = values.copy()
values.sort()
# get the quantile - only needed to for the results objects, not for calcs
point = np.quantile(values, quantile)
# calculate the interval positions
upper_temp = int(binom.ppf(1-alpha/2, n, quantile))
lower_temp = int(binom.ppf(alpha/2, n, quantile))
# adding integer values around the true positions to form a search_grid
w_ints = list(range(0,window+1))
w_ints = [-1*w for w in w_ints[1:]] + w_ints
upper = [upper_temp + w for w in w_ints]
lower = [lower_temp + w for w in w_ints]
# check if these values are plausible, otherwise replace by inf
upper = [np.inf if u > n else u for u in upper]
lower = [np.inf if l < 0 else l for l in lower]
# finding the coverage probabilities
coverage = np.empty((len(lower), len(upper)))
for i, a in enumerate(lower):
for j, b in enumerate(upper):
coverage[i, j] = \
binom.cdf(b - 1, n, quantile) - binom.cdf(a - 1, n, quantile)
# the lower and upper bounds with a coverage equal to or larger than requested
coverage_diff = (coverage - (1-alpha))
coverage_exact = np.min(coverage[coverage_diff > 0])
row, col = np.where(coverage_exact == coverage)
# if there are multiple cols, simply select the first entry, which is
# the smaller value, hence resulting in a smaller CI
limit_u = [upper[k] for k in col][0]
limit_l = [lower[k] for k in row][0]
limit_indices = [limit_l, limit_u]
if any(np.isinf(x) for x in limit_indices):
raise TypeError('`limit_indices` contains `inf`, please use different '
'combinations of the alpha and quantile arguments.')
# get the actual confidence interval values
try:
confidence_interval = [values[t] for t in limit_indices]
except IndexError:
raise IndexError('Object values has indices: `{}`, while the '
'confidence interval limit incides are `{}`. '
'Please request a different alpha, or quantile.'.\
format([0,len(values)-1], limit_indices)
)
# return
return QuantilesIntervalResults(**{
NamesI.VALUES : confidence_interval,
NamesI.POINT : point,
NamesI.INDICES : limit_indices,
NamesI.COVERAGE : coverage_exact,
NamesI.MATRIX_COVERAGE : coverage,
NamesI.MATRIX_COLUMNS : upper,
NamesI.MATRIX_ROWS : lower,
})
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def univariable_poisson_standard_normal(counts:List[int], alpha:float=0.05,
) -> IntervalResults:
'''
Calculates Poisson confidence intervals based on the standard normal
approximation to the Poisson distribution.
Parameters
----------
counts : `list` [`int`]
A list of counts.
alpha : `float`
The (1-alpha/2)% confidence interval coverage.
Returns
---------
results : `IntervalResults`
A results class returning the lower and upper confidence interval
bounds.
'''
# Test input
is_type(counts, list, 'counts')
is_type(alpha, float, 'alpha')
# is alpha between 0 and 1
if alpha <= 0 or alpha >= 1:
raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
# Get constants
xbar = np.mean(counts)
xsd = np.sqrt(xbar/len(counts))
z = norm.ppf(1-alpha/2)
# get the confidence intervals
lb = xbar-xsd*z
ub = xbar+xsd*z
# return
return IntervalResults(**{
NamesI.VALUES : (lb, ub),
NamesI.POINT : xbar,
NamesI.SE : xsd,
NamesI.COVERAGE : 1-alpha,
})
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def wald_confidence_interval(point:float, se:float,
alpha:float=0.05,
) -> IntervalResults:
'''
Calculates a (1-a/2)*100 confidence interval based on the quantile from
a normal distribution.
Parameters
----------
point : `float`
The point estimate
se : `float`
The standard error of the point estimate
alpha: `float`
The (1-alpha/2)% confidence interval coverage.
Returns
-------
results : `IntervalResults`
A results class returning the lower and upper confidence interval
bounds.
'''
# test input
is_type(point, (float, int), 'point')
is_type(se, (float, int), 'se')
is_type(alpha, float, 'alpha')
# is alpha between 0 and 1
if alpha <= 0 or alpha >= 1:
raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
# getting
z = norm.ppf(1-alpha/2)
# lower and upper
lb, ub = point - se * z, point + se * z
# return
return IntervalResults(**{
NamesI.VALUES : (lb, ub),
NamesI.POINT : point,
NamesI.SE : se,
NamesI.COVERAGE : 1-alpha,
})
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def beta_confidence_interval(proportion:float, total_sample:int,
alpha:float=0.05,
integer:bool=True,
) -> IntervalResults:
"""
Better confidence intervals for proportions, based on a classic
contribution by SA Julious [1]_.
Parameters
----------
proportion : `float`
A proportion between 0 and 1.
total_sample : `int`
The total sample size the proportion was derived from.
alpha : `float`, default 0.05
A float between 0 and 1, representing the type 1 error rate. Is used
to define the confidence interval coverage: (1-alpha)*100.
integer : `bool`, default True
whether the function should fail when the proportion times total_sample
does not result in an integer number of events. Set to `False` to
ignore the ValueError, which will replace the produt with its closest
integer value.
Returns
-------
results : `IntervalResults`
A results class returning the lower and upper confidence interval
bounds.
References
----------
.. [1] Steven A Julious. Two-sided confidence intervals for the single
proportion: comparison of seven methods by Robert G. Newcombe, Statistics
in Medicine 1998; 17:857-872
"""
# check input
is_type(proportion, (float, int), 'proportion')
is_type(total_sample, int, 'total_sample')
is_type(alpha, float, 'alpha')
is_type(integer, bool, 'integer')
# is alpha between 0 and 1
if alpha <= 0 or alpha >= 1:
raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
# get the number of events
no_events = proportion * total_sample
if (not no_events.is_integer()) and (integer==True):
raise ValueError("The product of proportion times total_sample is not "
"an integer, either correct the input or set `integer`"
"to `False`. The current input: proportion "
f"`{proportion}`, total_sample `{total_sample}`, and "
f"their product `{no_events}`.")
elif not no_events.is_integer():
no_events = round(no_events)
# lower bound
lb = 1-beta.ppf(1-alpha/2, total_sample - no_events + 1, no_events)
# upper bound
ub = beta.ppf(1-alpha/2, no_events + 1, total_sample - no_events)
# suppressed the warnings because SE is missing.
warnings.filterwarnings("ignore", category=UserWarning)
results = IntervalResults(**{
NamesI.VALUES : (lb, ub),
NamesI.POINT : proportion,
NamesI.COVERAGE : 1-alpha,
})
warnings.resetwarnings()
# returns
return results