"""
Resampling methods.
This module provides tools for non-parametric resampling procedures, including
bootstrap and jackknife methods. It implements several widely used approaches
for confidence interval estimation, such as percentile intervals, basic
bootstrap, bias-corrected and accelerated (BCa), and studentised intervals.
These methods operate on raw data supplied as tuples of NumPy arrays, and are
designed to work with user-defined statistical functions. Results are returned
as structured objects that encapsulate interval estimates, coverage
probabilities, and resampling metadata.
Classes
-------
BootCIResults
A container for standard bootstrap confidence interval output, including
interval bounds, coverage, and bootstrap samples.
BCaCIResults
A subclass of `BootCIResults` including additional information for
BCa intervals, such as acceleration and bias correction terms.
BaseBootstrap
A general base class for generating bootstrap replicates and original
(non-resampled) estimates.
PercentileCI
Implements the percentile bootstrap confidence interval.
BasicCI
Implements the basic (reverse) bootstrap confidence interval.
StudentisedCI
Implements the studentised bootstrap confidence interval using nested
resampling to estimate variability of the bootstrap estimates.
BCaCI
A callable class that computes BCa confidence intervals and stores the
corresponding bootstrap and jackknife samples.
Functions
---------
bootstrap_replicates(data, statsfunction, n_estimates, n_reps=999, **kwargs)
Generates bootstrap samples and computes statistics on each.
jackknife_replicates(data, statsfunction, n_estimates, **kwargs)
Computes jackknife replicates by systematically omitting each observation.
Notes
-----
All core resampling functions have been optimised using Numba for performance.
Input arrays must have consistent row dimensions. Output is typically returned
as NumPy arrays or results class instances that support structured inspection
and downstream usage.
"""
import warnings
import numpy as np
import numba as nb
import scipy.stats as st
from numpy.typing import NDArray
from typing import (
Any, Annotated, Callable,
)
from stats_misc.constants import (
NamesResampling as NamesRS,
NamesIntervals as NamesI,
CLASS_NAME,
)
from stats_misc.errors import (
is_type,
check_limits,
)
from stats_misc.utils.helpers import (
Results,
ManagedProperty,
)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# constants
MSG_NAN='There are {} `nan` values in `{}`. These will be omitted.'
ERR1 = "All arrays in `data` must have the same number of rows."
ERR2 = "nrows ({}) exceeds the number of rows in data ({})."
ERR3 = "`{}` is not callable."
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs]
class BootCIResults(Results):
"""
The results class for bootstrapped confidence intervals.
Attributes
----------
interval_indices: `list` [`int`] or `np.ndarray`
The indices corresponding to the lower and upper CI bounds.
interval_values: `list` [`float`] or `np.ndarray`
The computed lower and upper confidence interval bounds.
coverage : `float`
The confidence interval coverage.
"""
SET_ARGS = [
CLASS_NAME,
NamesI.INDICES,
NamesI.VALUES,
NamesI.COVERAGE,
NamesRS.BOOT_SAMPLE,
]
def __init__(self, **kwargs) -> None:
super().__init__(set_args=self.SET_ARGS, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class BCaCIResults(BootCIResults):
"""
The results class for Bias-Corrected and Accelerated (BCa) intervals.
Attributes
----------
interval_indices : `list` [`int`] or `np.ndarray`
The indices corresponding to the lower and upper CI bounds.
interval_values : `list` [`float`] or `np.ndarray`
The computed lower and upper confidence interval bounds.
coverage : `float`
The confidence interval coverage.
"""
SET_ARGS = [
CLASS_NAME,
NamesI.INDICES,
NamesI.VALUES,
NamesI.COVERAGE,
NamesRS.BOOT_SAMPLE,
NamesRS.JACK_SAMPLE,
NamesRS.BCA_ACCELERATION,
NamesRS.BCA_BIAS,
]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _random_row_indices_jit(nrows: int,
) -> Annotated[NDArray[np.integer], (Any, 1)]:
"""
Takes a random sample of row indices with replacement.
Parameters
----------
nrows : `int`
The number of rows.
Returns
-------
np.ndarray
A vector of resampled row indices, with the number of rows equal to
`nrows`.
"""
# resample - numba needs this to be a 2d array
sel_r = np.random.choice(nrows, nrows).reshape(nrows, 1)
return sel_r
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _random_row_indices(nrows: int | float,
) -> Annotated[NDArray[np.integer], (Any, 1)]:
"""
Takes a random sample of row indices with replacement.
Parameters
----------
nrows : `int` or `float`
The number of rows. Float values are cast to `int`.
Returns
-------
np.ndarray
A vector of resampled row indices, with the number of rows equal to
`nrows`.
"""
is_type(nrows, (int, float, np.integer))
nrows = int(nrows)
return _random_row_indices_jit(nrows)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _sample_rows(arr: NDArray[np.generic], rows: NDArray[np.generic],
) -> NDArray[np.generic]:
"""
Extracts rows from a numpy array.
Parameters
----------
arr : `np.ndarray`
A numpy array of any dimension.
rows: `np.ndarray`
An numpy array with shape (x,1) containing the row elements of `arr`.
Returns
-------
arr_resampled : `np.ndarray`
A version of `arr` with a subset of the original array and certain
rows repeated.
Notes
-----
This is a work around because numba does not allow for multi-dimensional
array indexing such as `arr[rows, :]`.
"""
# ### getting the array dimensions
nrows = rows.shape[0]
arr_shape = arr.shape
# empty result array
output_shape = (nrows,) + arr_shape[1:]
data_r = np.empty(output_shape, dtype=arr.dtype)
# Loop over the row indices and copy the corresponding rows
for i in range(nrows):
data_r[i] = arr[rows[i, 0]] # This will work for any number of dimensions
# return
return data_r
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _draw_bs_sample(data:tuple[NDArray[Any]], ) -> tuple[NDArray[Any]]:
"""
Draws a bootstrap sample with replacement from a tuple of arrays.
Parameters
----------
data : `tuple` [`np.ndarray`]
Tuple of arrays with equal number of rows.
Returns
-------
tuple [`np.ndarray`]
A tuple of resampled numpy arrays. For a proper bootstrap this should
be equal to the number of rows of the input data
"""
# ### confirm input
is_type(data, tuple)
# ### check number of rows
nrows = data[0].shape[0]
# Check that all arrays have the same number of rows
for arr in data:
if arr.shape[0] != nrows:
raise ValueError(ERR1)
# ### performing the resampling
sel_r = _random_row_indices(nrows)
# random draws with replacement for each list object
data_r = []
for obj in data:
# NOTE commented this out, looks redudant, but may cause some
# downstream issues which should be addressed individually.
# # we want to return at least a 2d array
# if len(obj.shape) == 1:
# obj = obj.reshape(-1,1)
# subset
data_r.append(_sample_rows(obj, rows=sel_r))
# return
return tuple(data_r)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _remove_row(data:tuple[NDArray[Any]], row_index:int,
) -> list[NDArray[Any]]:
"""
A jackknife helper function to iterativly remove a row from a tuple of
arrays.
Parameters
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
row_index : `int`
The row index which should be dropped.
Returns
-------
list [`np.ndarray`]
A list of numpy arrays with `row_index` removed.
Notes
-----
The arrays should be at least 2d, the absence of a shape[1] values will
throw an error. Have not yet found a way to test for this with the numba
more limited functionality.
A non-numba implementation of this code would be
`tuple(np.delete(arr, 1, axis=0) for arr in data)`
"""
# get dims and list
num_rows = data[0].shape[0]
new_arrays = []
# Iterate over each array in the tuple
for arr in data:
num_cols = arr.shape[1]
new_arr = np.empty((num_rows - 1, num_cols), dtype=arr.dtype)
# Fill the new array excluding the specified row
row_counter = 0
for i in range(num_rows):
if i != row_index:
new_arr[row_counter] = arr[i]
row_counter += 1
# add array
new_arrays.append(new_arr)
# return list
return new_arrays
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def bootstrap_replicates(data:tuple[NDArray[Any], ...], statsfunction:Callable,
n_estimates:int, n_reps:int=999,
**kwargs: Any,
) -> NDArray[Any]:
"""
Draws `n_reps` bootstrap samples and applies `statsfunction` on each bth
sample, return the results as an numpy array.
Parameters
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
statsfunction : `Callable`
A function which can unpack a tuple of arrays and perform analyses on
these: `statsfunction(*data)`. The function can return the estimate(s)
as a single float/int, list, tuple or numpy array.
n_estimates : `int`
The number of estimates `statsfunction` will return.
n_reps : `int`, default 999
The number of bootstrap samples.
**kwargs : `any`
Any keyword arguments supplied to `statsfunctions`.
Returns
-------
np.ndarray
A 2d numpy array with dims equal to `n_reps` by `n_estimates`.
Notes
-----
The helper functions have been optimised through numba.njit.
Will internally map 1d arrays to 2d to deal with numba requirements.
Examples
--------
>>> import numpy as np
>>> np.random.seed(42)
>>>
>>> # Example statsfunction that returns a list of statistics
>>> def stats_list(*data):
... return [np.mean(data[0]), np.std(data[0])]
>>>
>>> result = bootstrap_replicates(
... (np.random.randn(10), np.random.randn(10)),
... stats_list, n_estimates=2, n_reps=10
... )
>>> print(result)
"""
# ### check input
is_type(data, tuple)
is_type(n_estimates, (float, int))
is_type(n_reps, (float, int))
if not callable(statsfunction):
raise TypeError(ERR3.format(NamesRS.STATSFUNC))
# ### performing the bootstrap
# Make sure there are now 1d arrays for the bloody numba call
n_data = []
for arr in data:
if arr.ndim == 1:
n_data.append(arr.reshape(arr.shape[0], 1))
else:
n_data.append(arr)
data = tuple(n_data)
# initiating an nan array
boot_reps = np.full((n_reps, n_estimates), np.nan)
# looping over to bootstrap
for i in range(n_reps):
boot_reps[i] = statsfunction(*_draw_bs_sample(data),
**kwargs,
)
# returns
return boot_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def jackknife_replicates(data:tuple[NDArray[Any], ...], statsfunction:Callable,
n_estimates:int,
**kwargs: Any,
) -> NDArray[Any]:
"""
Performs jackknife resampling procedure which systematically leaves out
one observation at a time and applies `statsfunction`.
Parameters
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
statsfunction : `Callable`
A function which can unpack a tuple of arrays and perform analyses on
these: `statsfunction(*data)`. The function can return the estimate(s)
as a single float/int, list, tuple or numpy array.
n_estimates : `int`
The number of estimates `statsfunction` will return.
**kwargs : `any`
Any keyword arguments supplied to `statsfunctions`.
Returns
-------
np.ndarray
A 2d numpy array with dims equal to `data[0].shape[0]` by `n_estimates`.
Notes
-----
The helper functions have been optimised through numba.njit.
Will internally map 1d arrays to 2d to deal with numba requirements.
"""
### test input
is_type(data, tuple)
is_type(n_estimates, (float, int))
if not callable(statsfunction):
raise TypeError(ERR3.format(NamesRS.STATSFUNC))
### the actual work
# Make sure there are now 1d arrays for the bloody numba call
n_data = []
for arr in data:
if arr.ndim == 1:
n_data.append(arr.reshape(arr.shape[0], 1))
else:
n_data.append(arr)
data = tuple(n_data)
# Set up empty array to store jackknife replicates
nrows = len(data[0])
jack_reps = np.full((nrows, n_estimates), np.nan)
# For each observation in the dataset, compute the statistical function on
# the sample with that observation removed
for i in range(nrows):
jack_sample = _remove_row(data, row_index=i)
jack_reps[i] = statsfunction(*jack_sample, **kwargs)
return jack_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class BCaCI(object):
"""
A Bias-Corrected and Accelerated (BCa) bootstrap confidence interval.
Parameters
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
Methods
-------
Attributes
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
boot_sample : `np.ndarray` or `None`
Bootstrap estimates of `statsfunction`. `None` until `__call__` is invoked.
jack_sample : `np.ndarray` or `None`
Jackknife estimates of `statsfunction`. `None` until `__call__` is invoked.
Notes
-----
The BCa interval generally has a better coverage (i.e. the smallest
confidence interval while retaining advertised coverage) than most
alternative boostrap confidence interval methods.
"""
data = ManagedProperty('data', types=(tuple,))
boot_sample = ManagedProperty('boot_sample', types=(np.ndarray, type(None)))
jack_sample = ManagedProperty('jack_sample', types=(np.ndarray, type(None)))
# /////////////////////////////////////////////////////////////////////////
def __init__(self, data:tuple[NDArray[Any], ...],) -> None:
"""
Initialises a `BCaCI` instance.
"""
is_type(data, tuple)
self.data = data
self.boot_sample = None
self.jack_sample = None
# /////////////////////////////////////////////////////////////////////////
def __str__(self) -> str:
CLASS_NAME = type(self).__name__
return (f"{CLASS_NAME} instance with data=\n"
f"{self.data.__str__()}."
)
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
def __repr__(self) -> str:
CLASS_NAME = type(self).__name__
return (f"{CLASS_NAME}(data=\n"
f"{self.data.__str__()})"
)
# /////////////////////////////////////////////////////////////////////////
def __call__(self,
n_estimates:int, statsfunction:Callable,
alpha:float=0.05, n_reps:int=999,
verbose:bool = True,
**kwargs: Any,
) -> BCaCIResults:
"""
Calculates and BCa confidence interval, as well as generating the
bootstrap and jakknife re-sampling to obtain the BCa estimate.
Parameters
----------
statsfunction : `Callable`
A function which can unpack a tuple of arrays and perform analyses on
these: `statsfunction(*data)`. The function can return the estimate(s)
as a single float/int, list, tuple or numpy array.
n_estimates : `int`
The number of estimates `statsfunction` will return.
n_reps : `int`, default 999
The number of bootstrap samples.
alpha : `float`, default 0.05
The desired coverage of the confidence interval `(1-alpha)*100`.
verbose : `bool`, default `True`
Whether to emit warnings when bootstrap or jackknife samples
contain `nan` values.
**kwargs : `any`
Any keyword arguments supplied to `statsfunction`.
Returns
-------
BCaCIResults
A results class instance returning the confidence intervals,
coverage, and confidence interval bootstrap indices.
"""
# #### check input
is_type(n_estimates, int)
is_type(n_reps, int)
is_type(alpha, float)
if not callable(statsfunction):
raise TypeError(ERR3.format(NamesRS.STATSFUNC))
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
setattr(self, NamesRS.N_ESTIMATES, n_estimates)
setattr(self, NamesRS.N_REPS, n_reps)
setattr(self, NamesRS.ALPHA, alpha)
setattr(self, NamesRS.CI_COVERAGE, 1-alpha)
# #### perform the calculations
# #### get bootstrap
boot_samp = bootstrap_replicates(
self.data,
statsfunction=statsfunction,
n_estimates=getattr(self, NamesRS.N_ESTIMATES),
n_reps=getattr(self, NamesRS.N_REPS),
**kwargs,
)
self.boot_sample = boot_samp
# #### get jack knives
jack_samp = jackknife_replicates(
self.data,
statsfunction=statsfunction,
n_estimates=getattr(self, NamesRS.N_ESTIMATES),
**kwargs,
)
self.jack_sample = jack_samp
# #### see if there are any missings and raise a warning
missings_bs = np.isnan(boot_samp).any(axis=1)
missings_jk = np.isnan(jack_samp).any(axis=1)
if any(missings_bs):
boot_samp=boot_samp[~missings_bs]
if verbose == True:
warnings.warn(MSG_NAN.format(sum(missings_bs), 'boot_samp'))
if any(missings_jk):
jack_samp = jack_samp[~missings_jk]
if verbose == True:
warnings.warn(MSG_NAN.format(sum(missings_jk), 'jack_samp'))
# #### computing the acceleration and bias constants.
setattr(self, NamesRS.BCA_ACCELERATION,
type(self)._compute_acceleration(jack_samp))
setattr(self, NamesRS.BCA_BIAS,
type(self)._compute_bias(
data=self.data,
boot_samp=self.boot_sample,
statsfunction=statsfunction,
**kwargs,
)
)
# #### get lower and upper limits
# TODO the below, until return, should realy by a seperate function...
alphas = np.array([alpha/2., 1-alpha/2.])
# loop over the estimates (per column entry)
ci_array = np.full((len(boot_samp.T), 2), np.nan)
int_array = np.full((len(boot_samp.T), 2), np.nan)
for k in range(len(boot_samp.T)):
a_k = getattr(self, NamesRS.BCA_ACCELERATION)[k]
z0_k = getattr(self, NamesRS.BCA_BIAS)[k]
zs = z0_k + st.norm.ppf(alphas)
avals = st.norm.cdf(z0_k + zs/(1-a_k*zs))
ints = np.round((len(boot_samp)-1)*avals)
ints = np.nan_to_num(ints).astype('int')
int_array[k] = ints
# Compute confidence interval
boot_samp_sort = np.sort(boot_samp.T[k])
ci_low = boot_samp_sort[ints[0]]
ci_high = boot_samp_sort[ints[1]]
ci_array[k] = ci_low, ci_high
# return stuff
return BCaCIResults(
**{
CLASS_NAME : type(self).__name__,
NamesI.INDICES : int_array,
NamesI.VALUES : ci_array,
NamesI.COVERAGE : getattr(self, NamesRS.CI_COVERAGE),
NamesRS.BOOT_SAMPLE : self.boot_sample,
NamesRS.JACK_SAMPLE : self.jack_sample,
NamesRS.BCA_ACCELERATION : getattr(self, NamesRS.BCA_ACCELERATION),
NamesRS.BCA_BIAS : getattr(self, NamesRS.BCA_BIAS),
}
)
# /////////////////////////////////////////////////////////////////////////
@staticmethod
def _compute_acceleration(jack_samp:NDArray[Any]) -> list[float]:
"""
Returns the acceleration constant a for a BCa confidence interval.
Parameters
----------
jack_samp : `np.ndarray`
An array of jack knife estimates.
Return
------
accel : `list` [`float`]
Acceleration values for each column of `jack_samp`.
"""
# #### make sure input is at least a 2d array
if jack_samp.ndim == 1:
jack_samp = jack_samp.reshape(jack_samp.shape[0], 1)
# #### calculations
# column means
means = np.mean(jack_samp, axis=0)
if not isinstance(means, np.ndarray):
means = np.array([means], dtype=np.float64)
# make sure means only contains 1 dim
if means.ndim != 1:
if not (means.shape[0] == 1 or means.shape[1] == 1):
raise IndexError(
f"`means` shape should contain at least a 1, not "
f"{means.shape}."
)
# loop over column
accel = []
for k, jc in enumerate(jack_samp.T):
a = (1/6) * np.divide(
np.sum(means[k] - jc)**3,
(np.sum(means[k] - jc)**2)**(3/2)
)
accel.append(a)
# return list
return accel
# /////////////////////////////////////////////////////////////////////////
@staticmethod
def _compute_bias(data:tuple[NDArray[Any]], boot_samp:NDArray[Any],
statsfunction:Callable, **kwargs: Any,
) -> list[float]:
"""
Computes z0, the bias correction constant, using the original estimates
and the boostrapped estimates.
Parameters
----------
data : `tuple` [`np.ndarray`]
A tuple of numpy arrays with the same number of rows.
boot_samp : `np.ndarray`
A array of bootstrap samples, can be either 1d or 2d.
statsfunction : `Callable`
A function which can unpack a tuple of arrays and perform analyses on
these: `statsfunction(*data)`.
**kwargs : `any`
Any keyword arguments supplied to `statsfunctions`.
Returns
-------
z0 : `list` [`float`]
Values of `z0` per column of `boot_samp`.
"""
# ### calculations
# Apply the statsfunction on the data
s = statsfunction(*data, **kwargs)
# make sure it is a list
if not isinstance(s, list):
s = [s]
# get z0
boot_n = boot_samp.shape[0]
z0 = []
# loop over columns
for b, boot_b in enumerate(boot_samp.T):
# so this will be zero when exactly halve of the sample is less
# than s
z0.append(st.norm.ppf(np.sum(boot_b < s[b]) / boot_n))
# return
return z0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class BaseBootstrap(object):
"""
Base class for generating bootstrap estimates.
Parameters
----------
data : `tuple` [`np.ndarray`, ...]
A tuple containing one or more numpy arrays representing the input
data to be used for bootstrapping.
verbose : `bool`, default `True`
Whether warnings should be returned.
Attributes
----------
_data : `tuple` [`np.ndarray`, ...]
The input data arrays from which bootstrap samples are drawn.
Methods
-------
bootstrap_samples(n_estimates, statsfunction, n_reps, **kwargs)
Generate bootstrap replicates and return the resulting statistics.
original_estimate(statsfunction, **kwargs)
Compute the statistic(s) on the original (non-resampled) data.
Notes
-----
This class handles the generation of bootstrap replicates and calculation
of original estimates from the provided dataset using a specified
statistical function.
"""
# /////////////////////////////////////////////////////////////////////////
def __repr__(self) -> str:
"""
String representation of the class instance.
"""
return (f"{type(self).__name__}(data=tuple of "
f"{len(self._data)} numpy arrays)")
# /////////////////////////////////////////////////////////////////////////
def __str__(self) -> str:
"""
String representation of the class instance.
"""
return (f"{type(self).__name__} with {len(self._data)} data "
"arrays.")
# /////////////////////////////////////////////////////////////////////////
def __init__(self, data: tuple[NDArray[Any], ...], verbose:bool=True,
):
"""Initialise the BaseBootstrapCI instance. """
is_type(data, tuple)
is_type(verbose, bool)
self._data = data
self._verbose = verbose
# self.__boot_sample = None
# /////////////////////////////////////////////////////////////////////////
[docs]
def bootstrap_samples(self, n_estimates: int, statsfunction: Callable,
n_reps: int = 999,
# verbose:bool=True,
**kwargs: Any,
) -> NDArray[Any]:
"""
Generates a bootstrap sample, applies a function, and removes missing
values.
Parameters
----------
n_estimates : `int`
The number of estimates (statistics) to compute for each bootstrap
sample.
statsfunction : `Callable`
The statistical function to apply to each bootstrap sample.
n_reps : `int`, default 999
The number of bootstrap repetitions (default is 999).
**kwargs : `dict` [`str`, `any`]
Additional keyword arguments passed to `statsfunction`.
Returns
-------
np.ndarray
An array of bootstrap estimates, with any samples containing
missing values removed.
"""
# check input
is_type(n_estimates, int)
is_type(n_reps, (int, float))
if not callable(statsfunction):
raise TypeError(ERR3.format(NamesRS.STATSFUNC))
# perform the acual boostrap
boot_samp = bootstrap_replicates(
self._data, statsfunction, n_estimates, n_reps, **kwargs,
)
# Remove missing values
missings_bs = np.isnan(boot_samp).any(axis=1)
if any(missings_bs):
boot_samp = boot_samp[~missings_bs]
if self._verbose == True:
warnings.warn(MSG_NAN.format(
sum(missings_bs), 'the bootstraped esimates [boot_samp]'))
# assign to self
# self.__boot_sample = boot_samp
# return
return boot_samp
# /////////////////////////////////////////////////////////////////////////
[docs]
def original_estimate(self, statsfunction: Callable,
**kwargs: Any,
) -> np.ndarray:
"""
Calculate the estimate based on the non-resampled data.
Parameters
----------
statsfunction : `Callable`
The statistical function which will be applied to `data`.
**kwargs : `dict` [`str`, `any`]
Additional keyword arguments passed to `statsfunction`.
Returns
-------
np.ndarray
An array containing the original statistic(s).
"""
# get estimate(s)
result = statsfunction(*self._data, **kwargs)
# ensure we get an array
if not isinstance(result, list) and not isinstance(result, np.ndarray):
result = [result]
if not isinstance(result, np.ndarray):
result = np.array(result)
# return
return result
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class PercentileCI(BaseBootstrap):
"""
Percentile bootstrap confidence interval estimator.
Attributes
----------
_data : `tuple` [`np.ndarray`, ...]
The input data arrays from which bootstrap samples are drawn.
"""
# /////////////////////////////////////////////////////////////////////////
def __call__(self, n_estimates: int, statsfunction: Callable,
alpha: float = 0.05, n_reps: int = 999,
**kwargs: Any) -> BootCIResults:
"""
Calculate the percentile bootstrap confidence interval.
Parameters
----------
n_estimates : `int`
The number of estimates (statistics) to compute for each sample.
statsfunction : `Callable`
The statistical function applied to the bootstrap samples.
alpha : `float`, default 0.05
The significance level for the confidence interval.
n_reps : `int`, default 999
The number of bootstrap repetitions (default is 999).
**kwargs : `dict` [`str`, `any`]
Additional keyword arguments passed to `statsfunction`.
Returns
-------
BootCIResults
A dataclass containing the confidence interval bounds, the bootstrap
samples, and metadata.
"""
# check input
is_type(alpha, float)
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
# the bootstrap
boot_samp = self.bootstrap_samples(
n_estimates, statsfunction, n_reps, **kwargs
)
# getting the confidence intervals
# NOTE the alpha/2 and (1-alpha/2) are inverted - which is intended.
lower_pct = (alpha/2) * 100
upper_pct = (1 - alpha/2) * 100
ind_array = np.atleast_2d(np.array([lower_pct, upper_pct]))
# NOTE np.percentile intrapolates - if the pct do not exactly match
# elements in `boot_samp`
ci_array = np.percentile(boot_samp, [lower_pct, upper_pct], axis=0,
method='linear').T
# return results
return BootCIResults(
**{
CLASS_NAME :type(self).__name__,
NamesI.INDICES: ind_array,
NamesI.VALUES: ci_array,
NamesI.COVERAGE: 1 - alpha,
NamesRS.BOOT_SAMPLE: boot_samp,
}
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class BasicCI(BaseBootstrap):
"""
Basic bootstrap confidence interval estimator.
Attributes
----------
_data : `tuple` [`np.ndarray`, ...]
The input data arrays from which bootstrap samples are drawn.
Notes
-----
The basic bootstrap confidence interval is calculated by reflecting
the standard percentile intervals around the original estimate.
"""
# /////////////////////////////////////////////////////////////////////////
def __call__(self, n_estimates: int, statsfunction: Callable,
alpha: float = 0.05, n_reps: int = 999,
**kwargs: Any) -> BootCIResults:
"""
Calculate the bootstrap confidence interval.
Parameters
----------
n_estimates : `int`
The number of estimates (statistics) to compute for each sample.
statsfunction : `Callable`
The statistical function applied to the bootstrap samples.
alpha : `float`, default 0.05
The significance level for the confidence interval.
n_reps : `int`, default 999
The number of bootstrap repetitions (default is 999).
**kwargs : `dict` [`str`, `any`], optional
Additional keyword arguments passed to `statsfunction`.
Returns
-------
BootCIResults
A dataclass containing the confidence interval bounds, the bootstrap
samples, and metadata.
"""
# check input
is_type(alpha, float)
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
# the bootstrap samples
boot_samp = self.bootstrap_samples(
n_estimates, statsfunction, n_reps, **kwargs
)
# the original (non-resampled) estimate
orig_estimate = self.original_estimate(
statsfunction, **kwargs
)
# calculate percentiles
lower_pct = (1 - alpha / 2) * 100
upper_pct = (alpha / 2) * 100
ind_array = np.atleast_2d(np.array([lower_pct, upper_pct]))
# NOTE np.percentile will interpolate if needed
# calculate the basic confidence intervals
lower, upper = np.percentile(
boot_samp, [lower_pct, upper_pct],
axis=0, method='linear'
)
ci_array = np.stack([
2 * orig_estimate - lower,
2 * orig_estimate - upper,
], axis=1)
# return results
return BootCIResults(
**{
CLASS_NAME :type(self).__name__,
NamesI.INDICES: ind_array,
NamesI.VALUES: ci_array,
NamesI.COVERAGE: 1 - alpha,
NamesRS.BOOT_SAMPLE: boot_samp,
}
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class StudentisedCI(BaseBootstrap):
"""
Studentised bootstrap confidence interval estimator.
Attributes
----------
_data : tuple of NDArray
The input data arrays from which bootstrap samples are drawn.
Notes
-----
This class performs a nested bootstrap to estimate the standard error of
the bootstrapped statistic(s)
"""
# /////////////////////////////////////////////////////////////////////////
def __call__(self, n_estimates: int, statsfunction: Callable,
alpha: float = 0.05, n_reps: int = 999,
n_nested: int = 100,
**kwargs: Any) -> BootCIResults:
"""
Calculate the studentised bootstrap confidence interval.
Parameters
----------
n_estimates : `int`
The number of estimates (statistics) to compute for each sample.
statsfunction : `Callable`
The statistical function applied to the bootstrap samples.
alpha : `float`, default 0.05
The significance level for the confidence interval.
n_reps : `int`, default 999
The number of bootstrap repetitions.
n_nested : `int`, default 100
The number of inner bootstrap repetitions for standard error
estimation.
**kwargs : `dict`, [`str`, `any`]
Additional keyword arguments passed to `statsfunction`.
Returns
-------
BootCIResults
A dataclass containing the confidence interval bounds, the bootstrap
samples, and metadata.
"""
# ### check input
is_type(alpha, float)
# is alpha between 0 and 1
check_limits(alpha, 0.0, 1.0)
is_type(n_reps, (int, float))
is_type(n_nested, (int, float))
# ### the bootstrap
# ### pre-allocate memory
boot_point = np.full((n_reps, n_estimates), np.nan)
boot_se = np.full((n_reps, n_estimates), np.nan)
# boot_point = []
# boot_se = []
for i in range(n_reps):
boot_samp = _draw_bs_sample(self._data)
# the outer estimates
outer_est = statsfunction(*boot_samp, **kwargs,
)
# map to array
outer_est = self._map_sequence(outer_est, array = True)
# boot_point.append(outer_est)
boot_point[i] = outer_est
# the inner loop
inner_point = np.full((n_nested, n_estimates), np.nan)
for j in range(n_nested):
inner_samp = _draw_bs_sample(boot_samp)
inner_est = statsfunction(*inner_samp, **kwargs)
# make sure it is a list
inner_est = self._map_sequence(inner_est, array = False)
# inner_point.append(inner_est)
inner_point[j] = inner_est
# getting the standard error
inner_point = np.array(inner_point)
# Getting the sample sd
inner_point, _ = self._filter_missing(inner_point)
if np.isnan(inner_point).all():
if self._verbose == True:
warnings.warn("Warning: inner_point is entirely NaN")
se = np.nan
else:
se = np.std(inner_point, axis=0, ddof=1)
# boot_se.append(se)
boot_se[i] = se
# point estimate on the non-resampled data
orig_mean = self.original_estimate(statsfunction, **kwargs)
if not isinstance(orig_mean, np.ndarray):
orig_mean = np.array(orig_mean)
# NOTE perhaps check if there are the same length?
# removing possible missing data
boot_point, keep_point = self._filter_missing(boot_point)
boot_se, keep_se = self._filter_missing(boot_se[keep_point])
boot_point = boot_point[keep_se]
# standard error of the outer boot
if np.isnan(boot_point).all():
if self._verbose == True:
warnings.warn("Warning: boot_point is entirely NaN")
boot_point_se = np.nan
else:
boot_point_se = np.std(boot_point, axis=0, ddof=1)
# Studentised t-statistics
t_stats = (boot_point - orig_mean) / boot_se
# The t-statistic based interval positions
percentiles = np.percentile(
t_stats, [100 * (1 - alpha/2), 100 * (alpha/2)],
method="linear", axis=0,
)
lower_t, upper_t = percentiles
ind_array = np.atleast_2d(np.array([lower_t, upper_t]))
# finally get the confidence interval
ci_array = np.stack([
orig_mean - lower_t * boot_point_se, # lower bound
orig_mean - upper_t * boot_point_se, # upper bound
], axis=1)
# return
return BootCIResults(
**{
CLASS_NAME :type(self).__name__,
NamesI.INDICES: ind_array,
NamesI.VALUES: ci_array,
NamesI.COVERAGE: 1 - alpha,
NamesRS.BOOT_SAMPLE: boot_point,
}
)
# /////////////////////////////////////////////////////////////////////////
def _map_sequence(self, obj:Any, array:bool=False) -> list | np.ndarray:
"""
Map an object to a mutable sequence, optionally as a numpy array.
Parameters
----------
obj : `any`
The object to convert. Scalars and non-list/ndarray types are
wrapped in a list first.
array : `bool`, default `False`
If `True`, convert the result to a `np.ndarray`.
Returns
-------
obj : `list` or `np.ndarray`
The converted object.
"""
# map to list
if not isinstance(obj, list) and not isinstance(obj, np.ndarray):
obj = [obj]
# check if we also want to map to array
if array:
if not isinstance(obj, np.ndarray):
obj = np.array(obj)
# return
return obj
# /////////////////////////////////////////////////////////////////////////
def _filter_missing(self, array: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
"""
Remove rows containing NaNs from the array and return a boolean mask
indicating non-missing rows.
Parameters
----------
array : np.ndarray
The array to check for missing values (NaNs).
Returns
-------
filtered_array : np.ndarray
The input array with rows containing NaNs removed.
non_missing_mask : np.ndarray
Boolean array where True indicates rows without any NaNs.
"""
missings = np.isnan(array).any(axis=1)
non_missing = ~missings
if missings.any():
if self._verbose:
warnings.warn('The bootstrap sample contained {} `nan` values '
'which have been removed.'.format(sum(missings)))
return array[non_missing], non_missing