'''
A module for resampling techniques such as bootstrap, jackknife, or
permutation. Currently focused on confidence interval estimation using
canonical bootstrap algorithms.
'''
# main function
import warnings
import numpy as np
import numba as nb
import scipy.stats as st
# from numbers import Real
from numpy.typing import NDArray
from typing import (
Any, Annotated, Callable, Optional,
)
from stats_misc.constants import (
is_type,
NamesResampling as NamesRS,
NamesIntervals as NamesI,
Error_MSG,
)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# constants
# warning template used when NaN rows are dropped from a resample
MSG_NAN='There are {} `nan` values in `{}`. These will be omitted.'
# error message templates for input validation
ERR1 = "All arrays in `data` must have the same number of rows."
ERR2 = "nrows ({}) exceeds the number of rows in data ({})."
ERR3 = "`{}` is not callable."
# attribute name under which results classes store their display name
CLASS_NAME = '__CLASS_NAME'
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
class BootCIResults(object):
    '''
    The results class for bootstrapped confidence intervals.

    Attributes
    ----------
    interval_indices : `list` [`int`]
        The bootstrap indices of the interval lower and upper bounds.
    interval_values : `list` [`float`]
        The lower and upper bounds of the interval.
    coverage : `float`
        The interval coverage.
    '''
    # the default set of attributes an instance will expose
    DEFAULT_ARGS = [
        CLASS_NAME,
        NamesI.INDICES,
        NamesI.VALUES,
        NamesI.COVERAGE,
        NamesRS.BOOT_SAMPLE,
    ]
    # NOTE the `*` forces all arguments to be supplied by keyword (no
    # positional args when calling __init__).
    def __init__(self, *, set_args: Optional[list[str]] = None, **kwargs: Any):
        """
        Initialise a `BootCIResults` instance.

        Parameters
        ----------
        set_args : `list` [`str`], optional
            The attribute names this instance should define; defaults to
            `DEFAULT_ARGS`.
        **kwargs : `any`
            The attribute values, keyed by the names in `set_args`.

        Raises
        ------
        AttributeError
            If an unrecognised keyword is provided.
        """
        # the attributes we want to define
        set_args = set_args if set_args is not None else self.DEFAULT_ARGS
        # reject any keyword which is not an expected attribute name
        for k in kwargs.keys():
            if k not in set_args:
                raise AttributeError("unrecognised argument '{0}'".format(k))
        # assign each expected attribute; if a value is missing from
        # `kwargs`, warn and fall back to `None`.
        for s in set_args:
            try:
                setattr(self, s, kwargs[s])
            except KeyError:
                warnings.warn("argument '{0}' is set to 'None'".format(s))
                setattr(self, s, None)
    # /////////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        # fall back to 'BootCIResults' if no class name was provided
        CLASS_NAME_ = getattr(self, CLASS_NAME, "BootCIResults")
        return f"A `{CLASS_NAME_}` results class."
    # /////////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        # fall back to 'BootCIResults' if no class name was provided
        CLASS_NAME_ = getattr(self, CLASS_NAME, "BootCIResults")
        return (f"{CLASS_NAME_}(interval_indices={getattr(self, NamesI.INDICES)}, "
                f"interval_values={getattr(self, NamesI.VALUES)}, "
                f"coverage={getattr(self, NamesI.COVERAGE)})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BCaCIResults(BootCIResults):
    """
    The results class for `BCaCI`.

    Attributes
    ----------
    interval_indices : `list` [`int`] or `np.ndarray`
        The bootstrap indices of the interval lower and upper bounds.
    interval_values : `list` [`float`] or `np.ndarray`
        The lower and upper bounds of the interval.
    coverage : `float`
        The confidence interval coverage.
    """
    # /////////////////////////////////////////////////////////////////////////
    def __init__(self, **kwargs: Any) -> None:
        """
        Initialise a `BCaCIResults` instance.

        Raises
        ------
        AttributeError
            If an unrecognised keyword is provided.
        """
        # in addition to the base-class attributes, a BCa result also
        # carries the jackknife sample and the two BCa constants
        set_args = [
            CLASS_NAME,
            NamesI.INDICES,
            NamesI.VALUES,
            NamesI.COVERAGE,
            NamesRS.BOOT_SAMPLE,
            NamesRS.JACK_SAMPLE,
            NamesRS.BCA_ACCELERATION,
            NamesRS.BCA_BIAS,
        ]
        # default display name unless the caller provides one
        kwargs.setdefault(CLASS_NAME, "BCaCIResults")
        super().__init__(set_args=set_args, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _random_row_indices(nrows:int) -> Annotated[NDArray[np.integer],(Any, 1)]:
    '''
    Draws a random sample of row indices with replacement.

    Parameters
    ----------
    nrows : `int`
        The number of rows.

    Returns
    -------
    rows : `np.ndarray`
        A column vector (shape ``(nrows, 1)``) of resampled row indices.
    '''
    # numba requires the result to be a 2d column vector
    return np.random.choice(nrows, nrows).reshape(nrows, 1)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _sample_rows(arr: NDArray[np.generic], rows: NDArray[np.generic],
                 ) -> NDArray[np.generic]:
    '''
    Extracts rows from a numpy array.

    Parameters
    ----------
    arr : `np.ndarray`
        A numpy array of any dimension.
    rows : `np.ndarray`
        A numpy array with shape (x, 1) containing the row indices to take
        from `arr`.

    Returns
    -------
    arr_resampled : `np.ndarray`
        A version of `arr` with a subset of the original array and certain
        rows repeated.

    Notes
    -----
    This is a work around because numba does not allow for multi-dimensional
    array indexing such as `arr[rows, :]`.
    '''
    # number of requested rows and the shape of the output
    n_sel = rows.shape[0]
    resampled = np.empty((n_sel,) + arr.shape[1:], dtype=arr.dtype)
    # copy each requested row; assigning whole rows works for arrays of
    # any dimensionality
    for idx in range(n_sel):
        resampled[idx] = arr[rows[idx, 0]]
    return resampled
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _draw_bs_sample(data: tuple[NDArray[Any], ...],
                    ) -> tuple[NDArray[Any], ...]:
    """
    Samples with replacement from a tuple of np.ndarrays of the same length.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Returns
    -------
    resample : `tuple` [`np.ndarray`]
        A tuple of resampled numpy arrays. For a proper bootstrap this
        equals the number of rows of the input data.

    Raises
    ------
    ValueError
        If the arrays in `data` do not all have the same number of rows.
    """
    # ### confirm input
    is_type(data, tuple)
    # ### check that all arrays share the same number of rows
    nrows = data[0].shape[0]
    for arr in data:
        if arr.shape[0] != nrows:
            raise ValueError(ERR1)
    # ### perform the resampling: a single shared set of row indices so
    # that rows stay aligned across all arrays
    sel_r = _random_row_indices(nrows)
    data_r = [_sample_rows(obj, rows=sel_r) for obj in data]
    return tuple(data_r)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _remove_row(data:tuple[NDArray[Any]], row_index:int,
) -> list[NDArray[Any]]:
'''
A jackknife helper function to iterativly remove a row from a tuple of
arrays.
Parameters
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
row_index : `int`
The row index which should be dropped.
Returns
-------
results : `list` [`np.ndarray`]
A list of numpy arrays with `row_index` removed.
Notes
-----
The arrays should be at least 2d, the absence of a shape[1] values will
throw an error. Have not yet found a way to test for this with the numba
more limited functionality.
A non-numba implementation of this code would be
`tuple(np.delete(arr, 1, axis=0) for arr in data)`
'''
# get dims and list
num_rows = data[0].shape[0]
new_arrays = []
# Iterate over each array in the tuple
for arr in data:
num_cols = arr.shape[1]
new_arr = np.empty((num_rows - 1, num_cols), dtype=arr.dtype)
# Fill the new array excluding the specified row
row_counter = 0
for i in range(num_rows):
if i != row_index:
new_arr[row_counter] = arr[i]
row_counter += 1
# add array
new_arrays.append(new_arr)
# return list
return new_arrays
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def bootstrap_replicates(data: tuple[NDArray[Any], ...], statsfunction: Callable,
                         n_estimates: int, n_reps: int = 999,
                         **kwargs: Optional[Any],
                         ) -> NDArray[Any]:
    '''
    Draws `n_reps` bootstrap samples and applies `statsfunction` on each bth
    sample, returning the results as a numpy array.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    statsfunction : `Callable`
        A function which can unpack a tuple of arrays and perform analyses on
        these: `statsfunction(*data)`. The function can return the estimate(s)
        as a single float/int, list, tuple or numpy array.
    n_estimates : `int`
        The number of estimates `statsfunction` will return.
    n_reps : `int`, default 999
        The number of bootstrap samples.
    **kwargs : `any`
        Any keyword arguments supplied to `statsfunction`.

    Returns
    -------
    boots : `np.ndarray`
        A 2d numpy array with dims equal to `n_reps` by `n_estimates`.

    Notes
    -----
    The helper functions have been optimised through numba.njit.
    Will internally map 1d arrays to 2d to deal with numba requirements.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(42)
    >>>
    >>> # Example statsfunction that returns a list of statistics
    >>> def stats_list(*data):
    ...     return [np.mean(data[0]), np.std(data[0])]
    >>>
    >>> result = bootstrap_replicates(
    ...     (np.random.randn(10), np.random.randn(10)),
    ...     stats_list, n_estimates=2, n_reps=10
    ... )
    >>> print(result)
    '''
    # ### check input
    is_type(data, tuple)
    is_type(n_estimates, (float, int))
    is_type(n_reps, (float, int))
    if not callable(statsfunction):
        raise TypeError(ERR3.format(NamesRS.STATSFUNC))
    # ### performing the bootstrap
    # map any 1d arrays to 2d column vectors for the numba helpers
    n_data = []
    for arr in data:
        if arr.ndim == 1:
            n_data.append(arr.reshape(arr.shape[0], 1))
        else:
            n_data.append(arr)
    data = tuple(n_data)
    # initiate an all-nan results array; failed repetitions stay nan
    boot_reps = np.full((n_reps, n_estimates), np.nan)
    # draw a bootstrap sample and apply `statsfunction` per repetition
    for i in range(n_reps):
        boot_reps[i] = statsfunction(*_draw_bs_sample(data), **kwargs)
    return boot_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def jackknife_replicates(data: tuple[NDArray[Any], ...], statsfunction: Callable,
                         n_estimates: int,
                         **kwargs: Optional[Any],
                         ) -> NDArray[Any]:
    '''
    Performs a jackknife resampling procedure which systematically leaves out
    one observation at a time and applies `statsfunction`.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    statsfunction : `Callable`
        A function which can unpack a tuple of arrays and perform analyses on
        these: `statsfunction(*data)`. The function can return the estimate(s)
        as a single float/int, list, tuple or numpy array.
    n_estimates : `int`
        The number of estimates `statsfunction` will return.
    **kwargs : `any`
        Any keyword arguments supplied to `statsfunction`.

    Returns
    -------
    jacked : `np.ndarray`
        A 2d numpy array with dims equal to `data[0].shape[0]` by
        `n_estimates`.

    Notes
    -----
    The helper functions have been optimised through numba.njit.
    Will internally map 1d arrays to 2d to deal with numba requirements.
    '''
    # ### test input
    is_type(data, tuple)
    is_type(n_estimates, (float, int))
    if not callable(statsfunction):
        raise TypeError(ERR3.format(NamesRS.STATSFUNC))
    # ### the actual work
    # map any 1d arrays to 2d column vectors for the numba helpers
    n_data = []
    for arr in data:
        if arr.ndim == 1:
            n_data.append(arr.reshape(arr.shape[0], 1))
        else:
            n_data.append(arr)
    data = tuple(n_data)
    # set up an all-nan array to store the jackknife replicates
    nrows = len(data[0])
    jack_reps = np.full((nrows, n_estimates), np.nan)
    # compute the statistical function on the sample with each observation
    # removed in turn
    for i in range(nrows):
        jack_sample = _remove_row(data, row_index=i)
        jack_reps[i] = statsfunction(*jack_sample, **kwargs)
    return jack_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BCaCI(object):
    """
    A Bias-Corrected and Accelerated (BCa) bootstrap confidence interval.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Attributes
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Notes
    -----
    The BCa interval generally has a better coverage (i.e. the smallest
    confidence interval while retaining advertised coverage) than most
    alternative bootstrap confidence interval methods.
    """
    # /////////////////////////////////////////////////////////////////////////
    def __init__(self, data: tuple[NDArray[Any], ...],) -> None:
        """
        Initialises a `BCaCI` instance.
        """
        # #### testing input
        is_type(data, tuple)
        # #### setting attributes
        setattr(self, NamesRS.DATA_PRIV, data)
    # /////////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        cls_name = type(self).__name__
        return (f"{cls_name} instance with data=\n"
                f"{getattr(self, NamesRS.DATA_PRIV).__str__()}."
                )
    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    def __repr__(self) -> str:
        cls_name = type(self).__name__
        return (f"{cls_name}(data=\n"
                f"{getattr(self, NamesRS.DATA_PRIV).__str__()})"
                )
    # /////////////////////////////////////////////////////////////////////////
    @property
    def boot_sample(self) -> tuple[NDArray[Any]]:
        """Getter function for boot_sample"""
        return getattr(self, NamesRS.BOOT_SAMPLE)
    # /////////////////////////////////////////////////////////////////////////
    @property
    def jack_sample(self) -> tuple[NDArray[Any]]:
        """Getter function for jack_sample"""
        return getattr(self, NamesRS.JACK_SAMPLE)
    # /////////////////////////////////////////////////////////////////////////
    @property
    def data(self) -> tuple[NDArray[Any]]:
        """Getter function for data"""
        return getattr(self, NamesRS.DATA_PRIV)
    # /////////////////////////////////////////////////////////////////////////
    def __call__(self,
                 n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05, n_reps: int = 999,
                 verbose: bool = True,
                 **kwargs: Optional[Any],
                 ) -> 'BCaCIResults':
        """
        Calculates the BCa confidence interval, as well as generating the
        bootstrap and jackknife re-samples needed to obtain the BCa estimate.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates `statsfunction` will return.
        statsfunction : `Callable`
            A function which can unpack a tuple of arrays and perform
            analyses on these: `statsfunction(*data)`. The function can
            return the estimate(s) as a single float/int, list, tuple or
            numpy array.
        alpha : `float`, default 0.05
            The desired coverage of the confidence interval `(1-alpha)*100`.
        n_reps : `int`, default 999
            The number of bootstrap samples.
        verbose : `bool`, default `True`
            Whether to warn when `nan` estimates are dropped.
        **kwargs : `any`
            Any keyword arguments supplied to `statsfunction`.

        Returns
        -------
        results : `BCaCIResults` instance
            A results class instance returning the confidence intervals,
            coverage, and confidence interval bootstrap indices.

        Raises
        ------
        TypeError
            If `statsfunction` is not callable.
        ValueError
            If `alpha` does not lie strictly between 0 and 1.
        """
        # #### check input
        is_type(n_estimates, int)
        is_type(n_reps, int)
        is_type(alpha, float)
        if not callable(statsfunction):
            raise TypeError(ERR3.format(NamesRS.STATSFUNC))
        # alpha must lie strictly between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        setattr(self, NamesRS.N_ESTIMATES, n_estimates)
        setattr(self, NamesRS.N_REPS, n_reps)
        setattr(self, NamesRS.ALPHA, alpha)
        setattr(self, NamesRS.CI_COVERAGE, 1-alpha)
        # #### perform the calculations
        # #### get bootstrap replicates
        boot_samp = bootstrap_replicates(
            getattr(self, NamesRS.DATA_PRIV),
            statsfunction=statsfunction,
            n_estimates=getattr(self, NamesRS.N_ESTIMATES),
            n_reps=getattr(self, NamesRS.N_REPS),
            **kwargs,
        )
        setattr(self, NamesRS.BOOT_SAMPLE, boot_samp)
        # #### get jackknife replicates
        jack_samp = jackknife_replicates(
            getattr(self, NamesRS.DATA_PRIV),
            statsfunction=statsfunction,
            n_estimates=getattr(self, NamesRS.N_ESTIMATES),
            **kwargs,
        )
        setattr(self, NamesRS.JACK_SAMPLE, jack_samp)
        # #### drop rows with missing estimates, warning if requested
        missings_bs = np.isnan(boot_samp).any(axis=1)
        missings_jk = np.isnan(jack_samp).any(axis=1)
        if any(missings_bs):
            boot_samp = boot_samp[~missings_bs]
            if verbose:
                warnings.warn(MSG_NAN.format(sum(missings_bs), 'boot_samp'))
        if any(missings_jk):
            jack_samp = jack_samp[~missings_jk]
            if verbose:
                warnings.warn(MSG_NAN.format(sum(missings_jk), 'jack_samp'))
        # #### computing the acceleration and bias constants.
        setattr(self, NamesRS.BCA_ACCELERATION,
                type(self)._compute_acceleration(jack_samp))
        setattr(self, NamesRS.BCA_BIAS,
                type(self)._compute_bias(
                    data=getattr(self, NamesRS.DATA_PRIV),
                    boot_samp=getattr(self, NamesRS.BOOT_SAMPLE),
                    statsfunction=statsfunction,
                    **kwargs,
                )
                )
        # #### get lower and upper limits
        # TODO the below, until return, should really be a separate function
        alphas = np.array([alpha/2., 1-alpha/2.])
        # loop over the estimates (per column entry)
        ci_array = np.full((len(boot_samp.T), 2), np.nan)
        int_array = np.full((len(boot_samp.T), 2), np.nan)
        for k in range(len(boot_samp.T)):
            a_k = getattr(self, NamesRS.BCA_ACCELERATION)[k]
            z0_k = getattr(self, NamesRS.BCA_BIAS)[k]
            # BCa-adjusted percentile positions
            zs = z0_k + st.norm.ppf(alphas)
            avals = st.norm.cdf(z0_k + zs/(1-a_k*zs))
            ints = np.round((len(boot_samp)-1)*avals)
            ints = np.nan_to_num(ints).astype('int')
            int_array[k] = ints
            # compute the confidence interval from the sorted bootstrap
            # sample
            boot_samp_sort = np.sort(boot_samp.T[k])
            ci_low = boot_samp_sort[ints[0]]
            ci_high = boot_samp_sort[ints[1]]
            ci_array[k] = ci_low, ci_high
        # return results
        return BCaCIResults(
            **{
                NamesI.INDICES : int_array,
                NamesI.VALUES : ci_array,
                NamesI.COVERAGE : getattr(self, NamesRS.CI_COVERAGE),
                NamesRS.BOOT_SAMPLE : getattr(self, NamesRS.BOOT_SAMPLE),
                NamesRS.JACK_SAMPLE : getattr(self, NamesRS.JACK_SAMPLE),
                NamesRS.BCA_ACCELERATION : getattr(self, NamesRS.BCA_ACCELERATION),
                NamesRS.BCA_BIAS : getattr(self, NamesRS.BCA_BIAS),
            }
        )
    # /////////////////////////////////////////////////////////////////////////
    @staticmethod
    def _compute_acceleration(jack_samp: NDArray[Any]) -> list[float]:
        """
        Returns the acceleration constant `a` for a BCa confidence interval.

        Parameters
        ----------
        jack_samp : `np.ndarray`
            An array of jack knife estimates.

        Returns
        -------
        accel : `list` [`float`]
            Acceleration values for each column of `jack_samp`.

        Notes
        -----
        Uses the standard jackknife estimate
        ``a = sum(d**3) / (6 * sum(d**2)**1.5)`` with ``d`` the deviations
        of the jackknife replicates from their mean.
        """
        # #### make sure input is at least a 2d array
        if jack_samp.ndim == 1:
            jack_samp = jack_samp.reshape(jack_samp.shape[0], 1)
        # #### calculations
        # column means
        means = np.mean(jack_samp, axis=0)
        if not isinstance(means, np.ndarray):
            means = np.array([means], dtype=np.float64)
        # make sure means only contains 1 dim
        if means.ndim != 1:
            if not (means.shape[0] == 1 or means.shape[1] == 1):
                raise IndexError(
                    f"`means` shape should contain at least a 1, not "
                    f"{means.shape}."
                )
        # loop over columns
        accel = []
        for k, jc in enumerate(jack_samp.T):
            d = means[k] - jc
            # NOTE the sums are taken over the cubed/squared deviations.
            # Summing the deviations first (the previous implementation)
            # always yields zero, because deviations from the mean sum to
            # zero by construction.
            a = (1/6) * np.divide(
                np.sum(d**3),
                np.sum(d**2)**(3/2),
            )
            accel.append(a)
        # return list
        return accel
    # /////////////////////////////////////////////////////////////////////////
    @staticmethod
    def _compute_bias(data: tuple[NDArray[Any]], boot_samp: NDArray[Any],
                      statsfunction: Callable, **kwargs: Optional[Any],
                      ) -> list[float]:
        """
        Computes z0, the bias correction constant, using the original
        estimates and the bootstrapped estimates.

        Parameters
        ----------
        data : `tuple` [`np.ndarray`]
            A tuple of numpy arrays with the same number of rows.
        boot_samp : `np.ndarray`
            An array of bootstrap samples, can be either 1d or 2d.
        statsfunction : `Callable`
            A function which can unpack a tuple of arrays and perform
            analyses on these: `statsfunction(*data)`.
        **kwargs : `any`
            Any keyword arguments supplied to `statsfunction`.

        Returns
        -------
        z0 : `list` [`float`]
            Values of `z0` per column of `boot_samp`.
        """
        # ### calculations
        # apply the statsfunction on the original (non-resampled) data
        s = statsfunction(*data, **kwargs)
        # map float/list/tuple/array returns to a flat, indexable array
        # (wrapping a returned array in a one-element list, as previously
        # done, broke indexing for multi-estimate functions)
        s = np.ravel(np.asarray(s))
        # get z0
        boot_n = boot_samp.shape[0]
        z0 = []
        # loop over columns; z0 is zero when exactly half of the bootstrap
        # sample lies below the original estimate
        for b, boot_b in enumerate(boot_samp.T):
            z0.append(st.norm.ppf(np.sum(boot_b < s[b]) / boot_n))
        # return
        return z0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BaseBootstrap(object):
    """
    Base class for generating bootstrap estimates.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`, ...]
        A tuple containing one or more numpy arrays representing the input
        data to be used for bootstrapping.
    verbose : `bool`, default `True`
        Whether warnings should be returned.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.
    _verbose : `bool`
        Whether warnings are emitted.

    Methods
    -------
    bootstrap_samples(n_estimates, statsfunction, n_reps, **kwargs)
        Generate bootstrap replicates and return the resulting statistics.
    original_estimate(statsfunction, **kwargs)
        Compute the statistic(s) on the original (non-resampled) data.

    Notes
    -----
    This class handles the generation of bootstrap replicates and
    calculation of original estimates from the provided dataset using a
    specified statistical function.
    """
    # /////////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        """String representation of the class instance."""
        return (f"{type(self).__name__}(data=tuple of "
                f"{len(self._data)} numpy arrays)")
    # /////////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        """String representation of the class instance."""
        return (f"{type(self).__name__} with {len(self._data)} data "
                "arrays.")
    # /////////////////////////////////////////////////////////////////////////
    def __init__(self, data: tuple[NDArray[Any], ...], verbose: bool = True,
                 ):
        """Initialise the BaseBootstrap instance."""
        is_type(data, tuple)
        is_type(verbose, bool)
        self._data = data
        self._verbose = verbose
    # /////////////////////////////////////////////////////////////////////////
    def bootstrap_samples(self, n_estimates: int, statsfunction: Callable,
                          n_reps: int = 999, verbose: bool = True,
                          **kwargs: Optional[Any],
                          ) -> NDArray[Any]:
        """
        Generates a bootstrap sample, applies a function, and removes
        missing values.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            bootstrap sample.
        statsfunction : `Callable`
            The statistical function to apply to each bootstrap sample.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        verbose : `bool`, default `True`
            NOTE(review): currently unused and kept for backward
            compatibility; warnings are controlled by the `verbose` flag
            supplied at construction time (`self._verbose`).
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        np.ndarray
            An array of bootstrap estimates, with any samples containing
            missing values removed.
        """
        # check input
        is_type(n_estimates, int)
        is_type(n_reps, (int, float))
        if not callable(statsfunction):
            raise TypeError(ERR3.format(NamesRS.STATSFUNC))
        # perform the actual bootstrap
        boot_samp = bootstrap_replicates(
            self._data, statsfunction, n_estimates, n_reps, **kwargs,
        )
        # remove repetitions containing missing values
        missings_bs = np.isnan(boot_samp).any(axis=1)
        if any(missings_bs):
            boot_samp = boot_samp[~missings_bs]
            if self._verbose:
                warnings.warn(MSG_NAN.format(
                    sum(missings_bs), 'the bootstraped esimates [boot_samp]'))
        # return
        return boot_samp
    # /////////////////////////////////////////////////////////////////////////
    def original_estimate(self, statsfunction: Callable,
                          **kwargs: Optional[Any],
                          ) -> np.ndarray:
        """
        Calculate the estimate based on the non-resampled data.

        Parameters
        ----------
        statsfunction : `Callable`
            The statistical function which will be applied to `data`.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        np.ndarray
            An array containing the original statistic(s).
        """
        # get estimate(s)
        result = statsfunction(*self._data, **kwargs)
        # ensure we return a numpy array regardless of whether the
        # statsfunction returned a scalar, list, or array
        if not isinstance(result, (list, np.ndarray)):
            result = [result]
        if not isinstance(result, np.ndarray):
            result = np.array(result)
        # return
        return result
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class PercentileCI(BaseBootstrap):
    """
    Percentile bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.
    """
    # /////////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05, n_reps: int = 999,
                 **kwargs: Optional[Any]) -> 'BootCIResults':
        """
        Calculate the percentile bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results instance containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # check input
        is_type(alpha, float)
        # alpha must lie strictly between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        # the bootstrap
        boot_samp = self.bootstrap_samples(
            n_estimates, statsfunction, n_reps, **kwargs
        )
        # getting the confidence intervals: the percentile method reads the
        # alpha/2 and 1-alpha/2 percentiles of the bootstrap distribution
        # directly (unlike `BasicCI`, no reflection/inversion is applied)
        lower_pct = (alpha/2) * 100
        upper_pct = (1 - alpha/2) * 100
        ind_array = np.atleast_2d(np.array([lower_pct, upper_pct]))
        # NOTE np.percentile interpolates if the percentiles do not exactly
        # match elements in `boot_samp`
        ci_array = np.percentile(boot_samp, [lower_pct, upper_pct], axis=0,
                                 method='linear').T
        # return results
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: boot_samp,
            }
        )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BasicCI(BaseBootstrap):
    """
    Basic bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.

    Notes
    -----
    The basic bootstrap confidence interval is calculated by reflecting
    the standard percentile intervals around the original estimate.
    """
    # /////////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05, n_reps: int = 999,
                 **kwargs: Optional[Any]) -> 'BootCIResults':
        """
        Calculate the bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`], optional
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results instance containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # check input
        is_type(alpha, float)
        # alpha must lie strictly between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        # the bootstrap samples
        boot_samp = self.bootstrap_samples(
            n_estimates, statsfunction, n_reps, **kwargs
        )
        # the original (non-resampled) estimate
        orig_estimate = self.original_estimate(
            statsfunction, **kwargs
        )
        # NOTE the alpha/2 and (1-alpha/2) percentiles are deliberately
        # inverted: the basic interval reflects the percentile bounds
        # around the original estimate (2*theta_hat - percentile), which
        # swaps lower and upper.
        lower_pct = (1 - alpha / 2) * 100
        upper_pct = (alpha / 2) * 100
        ind_array = np.atleast_2d(np.array([lower_pct, upper_pct]))
        # NOTE np.percentile will interpolate if needed
        # calculate the basic confidence intervals
        lower, upper = np.percentile(
            boot_samp, [lower_pct, upper_pct],
            axis=0, method='linear'
        )
        ci_array = np.stack([
            2 * orig_estimate - lower,
            2 * orig_estimate - upper,
        ], axis=1)
        # return results
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: boot_samp,
            }
        )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class StudentisedCI(BaseBootstrap):
    """
    Studentised bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.

    Notes
    -----
    This class performs a nested bootstrap to estimate the standard error
    of the bootstrapped statistic(s).
    """
    # /////////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05, n_reps: int = 999,
                 n_nested: int = 100,
                 **kwargs: Optional[Any]) -> 'BootCIResults':
        """
        Calculate the studentised bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        n_nested : `int`, default 100
            The number of inner bootstrap repetitions for standard error
            estimation.
        **kwargs : `dict`, [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results instance containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # ### check input
        is_type(alpha, float)
        # alpha must lie strictly between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        is_type(n_reps, (int, float))
        is_type(n_nested, (int, float))
        # ### the bootstrap
        # pre-allocate memory for the outer estimates and their standard
        # errors
        boot_point = np.full((n_reps, n_estimates), np.nan)
        boot_se = np.full((n_reps, n_estimates), np.nan)
        for i in range(n_reps):
            boot_samp = _draw_bs_sample(self._data)
            # the outer estimates
            outer_est = self._map_sequence(
                statsfunction(*boot_samp, **kwargs), array=True)
            boot_point[i] = outer_est
            # the inner (nested) loop: re-bootstrap the outer sample to
            # estimate the standard error of the outer estimate
            inner_point = np.full((n_nested, n_estimates), np.nan)
            for j in range(n_nested):
                inner_samp = _draw_bs_sample(boot_samp)
                inner_est = self._map_sequence(
                    statsfunction(*inner_samp, **kwargs), array=False)
                inner_point[j] = inner_est
            # getting the sample sd of the inner estimates
            inner_point = np.array(inner_point)
            inner_point, _ = self._filter_missing(inner_point)
            if np.isnan(inner_point).all():
                if self._verbose:
                    warnings.warn("Warning: inner_point is entirely NaN")
                se = np.nan
            else:
                se = np.std(inner_point, axis=0, ddof=1)
            boot_se[i] = se
        # point estimate on the non-resampled data
        orig_mean = self.original_estimate(statsfunction, **kwargs)
        if not isinstance(orig_mean, np.ndarray):
            orig_mean = np.array(orig_mean)
        # remove bootstrap repetitions with missing estimates or missing
        # standard errors, keeping the two arrays aligned
        boot_point, keep_point = self._filter_missing(boot_point)
        boot_se, keep_se = self._filter_missing(boot_se[keep_point])
        boot_point = boot_point[keep_se]
        # standard error of the outer bootstrap estimates
        if np.isnan(boot_point).all():
            if self._verbose:
                warnings.warn("Warning: boot_point is entirely NaN")
            boot_point_se = np.nan
        else:
            boot_point_se = np.std(boot_point, axis=0, ddof=1)
        # studentised t-statistics
        t_stats = (boot_point - orig_mean) / boot_se
        # the t-statistic based interval positions
        percentiles = np.percentile(
            t_stats, [100 * (1 - alpha/2), 100 * (alpha/2)],
            method="linear", axis=0,
        )
        lower_t, upper_t = percentiles
        ind_array = np.atleast_2d(np.array([lower_t, upper_t]))
        # finally get the confidence interval
        ci_array = np.stack([
            orig_mean - lower_t * boot_point_se,  # lower bound
            orig_mean - upper_t * boot_point_se,  # upper bound
        ], axis=1)
        # return results
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: boot_point,
            }
        )
    # /////////////////////////////////////////////////////////////////////////
    def _map_sequence(self, obj: Any, array: bool = False) -> list | np.ndarray:
        """
        Maps an object to a mutable sequence (list), optionally converting
        it to a numpy array.
        """
        # wrap scalars in a list
        if not isinstance(obj, (list, np.ndarray)):
            obj = [obj]
        # optionally map to a numpy array
        if array:
            if not isinstance(obj, np.ndarray):
                obj = np.array(obj)
        # return
        return obj
    # /////////////////////////////////////////////////////////////////////////
    def _filter_missing(self, array: np.ndarray,
                        ) -> tuple[np.ndarray, np.ndarray]:
        """
        Remove rows containing NaNs from the array and return a boolean mask
        indicating non-missing rows.

        Parameters
        ----------
        array : np.ndarray
            The array to check for missing values (NaNs).

        Returns
        -------
        filtered_array : np.ndarray
            The input array with rows containing NaNs removed.
        non_missing_mask : np.ndarray
            Boolean array where True indicates rows without any NaNs.
        """
        missings = np.isnan(array).any(axis=1)
        non_missing = ~missings
        if missings.any():
            if self._verbose:
                warnings.warn('The bootstrap sample contained {} `nan` values '
                              'which have been removed.'.format(sum(missings)))
        return array[non_missing], non_missing
# # Toy example
# from stats_misc.machine_learning.validation import (
# cstatistic,
# )
# # create a function which returns a single float
# def cstat_boot(*data):
# '''
# Function to use for bootstrapping
# '''
# cstat = cstatistic(data[0], data[1])
# point = cstat.c_statistic
# if np.isnan(point):
# return np.nan
# return point
# # example data
# x = np.array([0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0])
# y = np.array([0.1, 0.7, 0.3, 0.6, 0.9, 0.1, 0.2, 0.3, 0.4, 0.01, 0.9, 0.8, 0.15, 0.71, 0.8])
# data = (x, y)
# n_reps=999
# n_estimates = 1
# statsfunction = cstat_boot
# cstat_boot(*(x,y))
# res1 = PercentileCI(data)
# res2 = res1(1, cstat_boot)