Source code for stats_misc.resampling

"""
Resampling methods.

This module provides tools for non-parametric resampling procedures, including
bootstrap and jackknife methods. It implements several widely used approaches
for confidence interval estimation, such as percentile intervals, basic
bootstrap, bias-corrected and accelerated (BCa), and studentised intervals.

These methods operate on raw data supplied as tuples of NumPy arrays, and are
designed to work with user-defined statistical functions. Results are returned
as structured objects that encapsulate interval estimates, coverage
probabilities, and resampling metadata.

Classes
-------
BootCIResults
    A container for standard bootstrap confidence interval output, including
    interval bounds, coverage, and bootstrap samples.

BCaCIResults
    A subclass of `BootCIResults` including additional information for
    BCa intervals, such as acceleration and bias correction terms.

BaseBootstrap
    A general base class for generating bootstrap replicates and original
    (non-resampled) estimates.

PercentileCI
    Implements the percentile bootstrap confidence interval.

BasicCI
    Implements the basic (reverse) bootstrap confidence interval.

StudentisedCI
    Implements the studentised bootstrap confidence interval using nested
    resampling to estimate variability of the bootstrap estimates.

BCaCI
    A callable class that computes BCa confidence intervals and stores the
    corresponding bootstrap and jackknife samples.

Functions
---------
bootstrap_replicates(data, statsfunction, n_estimates, n_reps=999, **kwargs)
    Generates bootstrap samples and computes statistics on each.

jackknife_replicates(data, statsfunction, n_estimates, **kwargs)
    Computes jackknife replicates by systematically omitting each observation.

Notes
-----
All core resampling functions have been optimised using Numba for performance.
Input arrays must have consistent row dimensions. Output is typically returned
as NumPy arrays or results class instances that support structured inspection
and downstream usage.
"""

import warnings
import numpy as np
import numba as nb
import scipy.stats as st
from numpy.typing import NDArray
from typing import (
    Any, Annotated, Callable,
)
from stats_misc.constants import (
    NamesResampling as NamesRS,
    NamesIntervals as NamesI,
    CLASS_NAME,
)
from stats_misc.errors import (
    is_type,
    check_limits,
)
from stats_misc.utils.helpers import (
    Results,
    ManagedProperty,
)

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# constants
MSG_NAN='There are {} `nan` values in `{}`. These will be omitted.'
ERR1 = "All arrays in `data` must have the same number of rows."
ERR2 = "nrows ({}) exceeds the number of rows in data ({})."
ERR3 = "`{}` is not callable."

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
[docs] class BootCIResults(Results): """ The results class for bootstrapped confidence intervals. Attributes ---------- interval_indices: `list` [`int`] or `np.ndarray` The indices corresponding to the lower and upper CI bounds. interval_values: `list` [`float`] or `np.ndarray` The computed lower and upper confidence interval bounds. coverage : `float` The confidence interval coverage. """ SET_ARGS = [ CLASS_NAME, NamesI.INDICES, NamesI.VALUES, NamesI.COVERAGE, NamesRS.BOOT_SAMPLE, ] def __init__(self, **kwargs) -> None: super().__init__(set_args=self.SET_ARGS, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class BCaCIResults(BootCIResults): """ The results class for Bias-Corrected and Accelerated (BCa) intervals. Attributes ---------- interval_indices : `list` [`int`] or `np.ndarray` The indices corresponding to the lower and upper CI bounds. interval_values : `list` [`float`] or `np.ndarray` The computed lower and upper confidence interval bounds. coverage : `float` The confidence interval coverage. """ SET_ARGS = [ CLASS_NAME, NamesI.INDICES, NamesI.VALUES, NamesI.COVERAGE, NamesRS.BOOT_SAMPLE, NamesRS.JACK_SAMPLE, NamesRS.BCA_ACCELERATION, NamesRS.BCA_BIAS, ]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @nb.njit def _random_row_indices_jit(nrows: int, ) -> Annotated[NDArray[np.integer], (Any, 1)]: """ Takes a random sample of row indices with replacement. Parameters ---------- nrows : `int` The number of rows. Returns ------- np.ndarray A vector of resampled row indices, with the number of rows equal to `nrows`. """ # resample - numba needs this to be a 2d array sel_r = np.random.choice(nrows, nrows).reshape(nrows, 1) return sel_r # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def _random_row_indices(nrows: int | float, ) -> Annotated[NDArray[np.integer], (Any, 1)]: """ Takes a random sample of row indices with replacement. Parameters ---------- nrows : `int` or `float` The number of rows. Float values are cast to `int`. Returns ------- np.ndarray A vector of resampled row indices, with the number of rows equal to `nrows`. """ is_type(nrows, (int, float, np.integer)) nrows = int(nrows) return _random_row_indices_jit(nrows) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @nb.njit def _sample_rows(arr: NDArray[np.generic], rows: NDArray[np.generic], ) -> NDArray[np.generic]: """ Extracts rows from a numpy array. Parameters ---------- arr : `np.ndarray` A numpy array of any dimension. rows: `np.ndarray` An numpy array with shape (x,1) containing the row elements of `arr`. Returns ------- arr_resampled : `np.ndarray` A version of `arr` with a subset of the original array and certain rows repeated. Notes ----- This is a work around because numba does not allow for multi-dimensional array indexing such as `arr[rows, :]`. """ # ### getting the array dimensions nrows = rows.shape[0] arr_shape = arr.shape # empty result array output_shape = (nrows,) + arr_shape[1:] data_r = np.empty(output_shape, dtype=arr.dtype) # Loop over the row indices and copy the corresponding rows for i in range(nrows): data_r[i] = arr[rows[i, 0]] # This will work for any number of dimensions # return return data_r # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def _draw_bs_sample(data:tuple[NDArray[Any]], ) -> tuple[NDArray[Any]]: """ Draws a bootstrap sample with replacement from a tuple of arrays. Parameters ---------- data : `tuple` [`np.ndarray`] Tuple of arrays with equal number of rows. Returns ------- tuple [`np.ndarray`] A tuple of resampled numpy arrays. For a proper bootstrap this should be equal to the number of rows of the input data """ # ### confirm input is_type(data, tuple) # ### check number of rows nrows = data[0].shape[0] # Check that all arrays have the same number of rows for arr in data: if arr.shape[0] != nrows: raise ValueError(ERR1) # ### performing the resampling sel_r = _random_row_indices(nrows) # random draws with replacement for each list object data_r = [] for obj in data: # NOTE commented this out, looks redudant, but may cause some # downstream issues which should be addressed individually. # # we want to return at least a 2d array # if len(obj.shape) == 1: # obj = obj.reshape(-1,1) # subset data_r.append(_sample_rows(obj, rows=sel_r)) # return return tuple(data_r) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def _remove_row(data:tuple[NDArray[Any]], row_index:int, ) -> list[NDArray[Any]]: """ A jackknife helper function to iterativly remove a row from a tuple of arrays. Parameters ---------- data : `tuple` [`np.ndarray`] A tuple of numpy arrays with the same number of rows. row_index : `int` The row index which should be dropped. Returns ------- list [`np.ndarray`] A list of numpy arrays with `row_index` removed. Notes ----- The arrays should be at least 2d, the absence of a shape[1] values will throw an error. Have not yet found a way to test for this with the numba more limited functionality. A non-numba implementation of this code would be `tuple(np.delete(arr, 1, axis=0) for arr in data)` """ # get dims and list num_rows = data[0].shape[0] new_arrays = [] # Iterate over each array in the tuple for arr in data: num_cols = arr.shape[1] new_arr = np.empty((num_rows - 1, num_cols), dtype=arr.dtype) # Fill the new array excluding the specified row row_counter = 0 for i in range(num_rows): if i != row_index: new_arr[row_counter] = arr[i] row_counter += 1 # add array new_arrays.append(new_arr) # return list return new_arrays # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def bootstrap_replicates(data:tuple[NDArray[Any], ...], statsfunction:Callable, n_estimates:int, n_reps:int=999, **kwargs: Any, ) -> NDArray[Any]: """ Draws `n_reps` bootstrap samples and applies `statsfunction` on each bth sample, return the results as an numpy array. Parameters ---------- data : `tuple` [`np.ndarray`] A tuple of numpy arrays with the same number of rows. statsfunction : `Callable` A function which can unpack a tuple of arrays and perform analyses on these: `statsfunction(*data)`. The function can return the estimate(s) as a single float/int, list, tuple or numpy array. n_estimates : `int` The number of estimates `statsfunction` will return. n_reps : `int`, default 999 The number of bootstrap samples. **kwargs : `any` Any keyword arguments supplied to `statsfunctions`. Returns ------- np.ndarray A 2d numpy array with dims equal to `n_reps` by `n_estimates`. Notes ----- The helper functions have been optimised through numba.njit. Will internally map 1d arrays to 2d to deal with numba requirements. Examples -------- >>> import numpy as np >>> np.random.seed(42) >>> >>> # Example statsfunction that returns a list of statistics >>> def stats_list(*data): ... return [np.mean(data[0]), np.std(data[0])] >>> >>> result = bootstrap_replicates( ... (np.random.randn(10), np.random.randn(10)), ... stats_list, n_estimates=2, n_reps=10 ... ) >>> print(result) """ # ### check input is_type(data, tuple) is_type(n_estimates, (float, int)) is_type(n_reps, (float, int)) if not callable(statsfunction): raise TypeError(ERR3.format(NamesRS.STATSFUNC)) # ### performing the bootstrap # Make sure there are now 1d arrays for the bloody numba call n_data = [] for arr in data: if arr.ndim == 1: n_data.append(arr.reshape(arr.shape[0], 1)) else: n_data.append(arr) data = tuple(n_data) # initiating an nan array boot_reps = np.full((n_reps, n_estimates), np.nan) # looping over to bootstrap for i in range(n_reps): boot_reps[i] = statsfunction(*_draw_bs_sample(data), **kwargs, ) # returns return boot_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def jackknife_replicates(data:tuple[NDArray[Any], ...], statsfunction:Callable, n_estimates:int, **kwargs: Any, ) -> NDArray[Any]: """ Performs jackknife resampling procedure which systematically leaves out one observation at a time and applies `statsfunction`. Parameters ---------- data : `tuple` [`np.ndarray`] A tuple of numpy arrays with the same number of rows. statsfunction : `Callable` A function which can unpack a tuple of arrays and perform analyses on these: `statsfunction(*data)`. The function can return the estimate(s) as a single float/int, list, tuple or numpy array. n_estimates : `int` The number of estimates `statsfunction` will return. **kwargs : `any` Any keyword arguments supplied to `statsfunctions`. Returns ------- np.ndarray A 2d numpy array with dims equal to `data[0].shape[0]` by `n_estimates`. Notes ----- The helper functions have been optimised through numba.njit. Will internally map 1d arrays to 2d to deal with numba requirements. """ ### test input is_type(data, tuple) is_type(n_estimates, (float, int)) if not callable(statsfunction): raise TypeError(ERR3.format(NamesRS.STATSFUNC)) ### the actual work # Make sure there are now 1d arrays for the bloody numba call n_data = [] for arr in data: if arr.ndim == 1: n_data.append(arr.reshape(arr.shape[0], 1)) else: n_data.append(arr) data = tuple(n_data) # Set up empty array to store jackknife replicates nrows = len(data[0]) jack_reps = np.full((nrows, n_estimates), np.nan) # For each observation in the dataset, compute the statistical function on # the sample with that observation removed for i in range(nrows): jack_sample = _remove_row(data, row_index=i) jack_reps[i] = statsfunction(*jack_sample, **kwargs) return jack_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class BCaCI(object): """ A Bias-Corrected and Accelerated (BCa) bootstrap confidence interval. Parameters ---------- data : `tuple` [`np.ndarray`] A tuple of numpy arrays with the same number of rows. Methods ------- Attributes ---------- data : `tuple` [`np.ndarray`] A tuple of numpy arrays with the same number of rows. boot_sample : `np.ndarray` or `None` Bootstrap estimates of `statsfunction`. `None` until `__call__` is invoked. jack_sample : `np.ndarray` or `None` Jackknife estimates of `statsfunction`. `None` until `__call__` is invoked. Notes ----- The BCa interval generally has a better coverage (i.e. the smallest confidence interval while retaining advertised coverage) than most alternative boostrap confidence interval methods. """ data = ManagedProperty('data', types=(tuple,)) boot_sample = ManagedProperty('boot_sample', types=(np.ndarray, type(None))) jack_sample = ManagedProperty('jack_sample', types=(np.ndarray, type(None))) # ///////////////////////////////////////////////////////////////////////// def __init__(self, data:tuple[NDArray[Any], ...],) -> None: """ Initialises a `BCaCI` instance. """ is_type(data, tuple) self.data = data self.boot_sample = None self.jack_sample = None # ///////////////////////////////////////////////////////////////////////// def __str__(self) -> str: CLASS_NAME = type(self).__name__ return (f"{CLASS_NAME} instance with data=\n" f"{self.data.__str__()}." ) # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ def __repr__(self) -> str: CLASS_NAME = type(self).__name__ return (f"{CLASS_NAME}(data=\n" f"{self.data.__str__()})" ) # ///////////////////////////////////////////////////////////////////////// def __call__(self, n_estimates:int, statsfunction:Callable, alpha:float=0.05, n_reps:int=999, verbose:bool = True, **kwargs: Any, ) -> BCaCIResults: """ Calculates and BCa confidence interval, as well as generating the bootstrap and jakknife re-sampling to obtain the BCa estimate. Parameters ---------- statsfunction : `Callable` A function which can unpack a tuple of arrays and perform analyses on these: `statsfunction(*data)`. The function can return the estimate(s) as a single float/int, list, tuple or numpy array. n_estimates : `int` The number of estimates `statsfunction` will return. n_reps : `int`, default 999 The number of bootstrap samples. alpha : `float`, default 0.05 The desired coverage of the confidence interval `(1-alpha)*100`. verbose : `bool`, default `True` Whether to emit warnings when bootstrap or jackknife samples contain `nan` values. **kwargs : `any` Any keyword arguments supplied to `statsfunction`. Returns ------- BCaCIResults A results class instance returning the confidence intervals, coverage, and confidence interval bootstrap indices. """ # #### check input is_type(n_estimates, int) is_type(n_reps, int) is_type(alpha, float) if not callable(statsfunction): raise TypeError(ERR3.format(NamesRS.STATSFUNC)) # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) setattr(self, NamesRS.N_ESTIMATES, n_estimates) setattr(self, NamesRS.N_REPS, n_reps) setattr(self, NamesRS.ALPHA, alpha) setattr(self, NamesRS.CI_COVERAGE, 1-alpha) # #### perform the calculations # #### get bootstrap boot_samp = bootstrap_replicates( self.data, statsfunction=statsfunction, n_estimates=getattr(self, NamesRS.N_ESTIMATES), n_reps=getattr(self, NamesRS.N_REPS), **kwargs, ) self.boot_sample = boot_samp # #### get jack knives jack_samp = jackknife_replicates( self.data, statsfunction=statsfunction, n_estimates=getattr(self, NamesRS.N_ESTIMATES), **kwargs, ) self.jack_sample = jack_samp # #### see if there are any missings and raise a warning missings_bs = np.isnan(boot_samp).any(axis=1) missings_jk = np.isnan(jack_samp).any(axis=1) if any(missings_bs): boot_samp=boot_samp[~missings_bs] if verbose == True: warnings.warn(MSG_NAN.format(sum(missings_bs), 'boot_samp')) if any(missings_jk): jack_samp = jack_samp[~missings_jk] if verbose == True: warnings.warn(MSG_NAN.format(sum(missings_jk), 'jack_samp')) # #### computing the acceleration and bias constants. setattr(self, NamesRS.BCA_ACCELERATION, type(self)._compute_acceleration(jack_samp)) setattr(self, NamesRS.BCA_BIAS, type(self)._compute_bias( data=self.data, boot_samp=self.boot_sample, statsfunction=statsfunction, **kwargs, ) ) # #### get lower and upper limits # TODO the below, until return, should realy by a seperate function... alphas = np.array([alpha/2., 1-alpha/2.]) # loop over the estimates (per column entry) ci_array = np.full((len(boot_samp.T), 2), np.nan) int_array = np.full((len(boot_samp.T), 2), np.nan) for k in range(len(boot_samp.T)): a_k = getattr(self, NamesRS.BCA_ACCELERATION)[k] z0_k = getattr(self, NamesRS.BCA_BIAS)[k] zs = z0_k + st.norm.ppf(alphas) avals = st.norm.cdf(z0_k + zs/(1-a_k*zs)) ints = np.round((len(boot_samp)-1)*avals) ints = np.nan_to_num(ints).astype('int') int_array[k] = ints # Compute confidence interval boot_samp_sort = np.sort(boot_samp.T[k]) ci_low = boot_samp_sort[ints[0]] ci_high = boot_samp_sort[ints[1]] ci_array[k] = ci_low, ci_high # return stuff return BCaCIResults( **{ CLASS_NAME : type(self).__name__, NamesI.INDICES : int_array, NamesI.VALUES : ci_array, NamesI.COVERAGE : getattr(self, NamesRS.CI_COVERAGE), NamesRS.BOOT_SAMPLE : self.boot_sample, NamesRS.JACK_SAMPLE : self.jack_sample, NamesRS.BCA_ACCELERATION : getattr(self, NamesRS.BCA_ACCELERATION), NamesRS.BCA_BIAS : getattr(self, NamesRS.BCA_BIAS), } ) # ///////////////////////////////////////////////////////////////////////// @staticmethod def _compute_acceleration(jack_samp:NDArray[Any]) -> list[float]: """ Returns the acceleration constant a for a BCa confidence interval. Parameters ---------- jack_samp : `np.ndarray` An array of jack knife estimates. Return ------ accel : `list` [`float`] Acceleration values for each column of `jack_samp`. """ # #### make sure input is at least a 2d array if jack_samp.ndim == 1: jack_samp = jack_samp.reshape(jack_samp.shape[0], 1) # #### calculations # column means means = np.mean(jack_samp, axis=0) if not isinstance(means, np.ndarray): means = np.array([means], dtype=np.float64) # make sure means only contains 1 dim if means.ndim != 1: if not (means.shape[0] == 1 or means.shape[1] == 1): raise IndexError( f"`means` shape should contain at least a 1, not " f"{means.shape}." ) # loop over column accel = [] for k, jc in enumerate(jack_samp.T): a = (1/6) * np.divide( np.sum(means[k] - jc)**3, (np.sum(means[k] - jc)**2)**(3/2) ) accel.append(a) # return list return accel # ///////////////////////////////////////////////////////////////////////// @staticmethod def _compute_bias(data:tuple[NDArray[Any]], boot_samp:NDArray[Any], statsfunction:Callable, **kwargs: Any, ) -> list[float]: """ Computes z0, the bias correction constant, using the original estimates and the boostrapped estimates. Parameters ---------- data : `tuple` [`np.ndarray`] A tuple of numpy arrays with the same number of rows. boot_samp : `np.ndarray` A array of bootstrap samples, can be either 1d or 2d. statsfunction : `Callable` A function which can unpack a tuple of arrays and perform analyses on these: `statsfunction(*data)`. **kwargs : `any` Any keyword arguments supplied to `statsfunctions`. Returns ------- z0 : `list` [`float`] Values of `z0` per column of `boot_samp`. """ # ### calculations # Apply the statsfunction on the data s = statsfunction(*data, **kwargs) # make sure it is a list if not isinstance(s, list): s = [s] # get z0 boot_n = boot_samp.shape[0] z0 = [] # loop over columns for b, boot_b in enumerate(boot_samp.T): # so this will be zero when exactly halve of the sample is less # than s z0.append(st.norm.ppf(np.sum(boot_b < s[b]) / boot_n)) # return return z0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class BaseBootstrap(object): """ Base class for generating bootstrap estimates. Parameters ---------- data : `tuple` [`np.ndarray`, ...] A tuple containing one or more numpy arrays representing the input data to be used for bootstrapping. verbose : `bool`, default `True` Whether warnings should be returned. Attributes ---------- _data : `tuple` [`np.ndarray`, ...] The input data arrays from which bootstrap samples are drawn. Methods ------- bootstrap_samples(n_estimates, statsfunction, n_reps, **kwargs) Generate bootstrap replicates and return the resulting statistics. original_estimate(statsfunction, **kwargs) Compute the statistic(s) on the original (non-resampled) data. Notes ----- This class handles the generation of bootstrap replicates and calculation of original estimates from the provided dataset using a specified statistical function. """ # ///////////////////////////////////////////////////////////////////////// def __repr__(self) -> str: """ String representation of the class instance. """ return (f"{type(self).__name__}(data=tuple of " f"{len(self._data)} numpy arrays)") # ///////////////////////////////////////////////////////////////////////// def __str__(self) -> str: """ String representation of the class instance. """ return (f"{type(self).__name__} with {len(self._data)} data " "arrays.") # ///////////////////////////////////////////////////////////////////////// def __init__(self, data: tuple[NDArray[Any], ...], verbose:bool=True, ): """Initialise the BaseBootstrapCI instance. """ is_type(data, tuple) is_type(verbose, bool) self._data = data self._verbose = verbose # self.__boot_sample = None # /////////////////////////////////////////////////////////////////////////
[docs] def bootstrap_samples(self, n_estimates: int, statsfunction: Callable, n_reps: int = 999, # verbose:bool=True, **kwargs: Any, ) -> NDArray[Any]: """ Generates a bootstrap sample, applies a function, and removes missing values. Parameters ---------- n_estimates : `int` The number of estimates (statistics) to compute for each bootstrap sample. statsfunction : `Callable` The statistical function to apply to each bootstrap sample. n_reps : `int`, default 999 The number of bootstrap repetitions (default is 999). **kwargs : `dict` [`str`, `any`] Additional keyword arguments passed to `statsfunction`. Returns ------- np.ndarray An array of bootstrap estimates, with any samples containing missing values removed. """ # check input is_type(n_estimates, int) is_type(n_reps, (int, float)) if not callable(statsfunction): raise TypeError(ERR3.format(NamesRS.STATSFUNC)) # perform the acual boostrap boot_samp = bootstrap_replicates( self._data, statsfunction, n_estimates, n_reps, **kwargs, ) # Remove missing values missings_bs = np.isnan(boot_samp).any(axis=1) if any(missings_bs): boot_samp = boot_samp[~missings_bs] if self._verbose == True: warnings.warn(MSG_NAN.format( sum(missings_bs), 'the bootstraped esimates [boot_samp]')) # assign to self # self.__boot_sample = boot_samp # return return boot_samp
# /////////////////////////////////////////////////////////////////////////
[docs] def original_estimate(self, statsfunction: Callable, **kwargs: Any, ) -> np.ndarray: """ Calculate the estimate based on the non-resampled data. Parameters ---------- statsfunction : `Callable` The statistical function which will be applied to `data`. **kwargs : `dict` [`str`, `any`] Additional keyword arguments passed to `statsfunction`. Returns ------- np.ndarray An array containing the original statistic(s). """ # get estimate(s) result = statsfunction(*self._data, **kwargs) # ensure we get an array if not isinstance(result, list) and not isinstance(result, np.ndarray): result = [result] if not isinstance(result, np.ndarray): result = np.array(result) # return return result
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class PercentileCI(BaseBootstrap): """ Percentile bootstrap confidence interval estimator. Attributes ---------- _data : `tuple` [`np.ndarray`, ...] The input data arrays from which bootstrap samples are drawn. """ # ///////////////////////////////////////////////////////////////////////// def __call__(self, n_estimates: int, statsfunction: Callable, alpha: float = 0.05, n_reps: int = 999, **kwargs: Any) -> BootCIResults: """ Calculate the percentile bootstrap confidence interval. Parameters ---------- n_estimates : `int` The number of estimates (statistics) to compute for each sample. statsfunction : `Callable` The statistical function applied to the bootstrap samples. alpha : `float`, default 0.05 The significance level for the confidence interval. n_reps : `int`, default 999 The number of bootstrap repetitions (default is 999). **kwargs : `dict` [`str`, `any`] Additional keyword arguments passed to `statsfunction`. Returns ------- BootCIResults A dataclass containing the confidence interval bounds, the bootstrap samples, and metadata. """ # check input is_type(alpha, float) # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) # the bootstrap boot_samp = self.bootstrap_samples( n_estimates, statsfunction, n_reps, **kwargs ) # getting the confidence intervals # NOTE the alpha/2 and (1-alpha/2) are inverted - which is intended. lower_pct = (alpha/2) * 100 upper_pct = (1 - alpha/2) * 100 ind_array = np.atleast_2d(np.array([lower_pct, upper_pct])) # NOTE np.percentile intrapolates - if the pct do not exactly match # elements in `boot_samp` ci_array = np.percentile(boot_samp, [lower_pct, upper_pct], axis=0, method='linear').T # return results return BootCIResults( **{ CLASS_NAME :type(self).__name__, NamesI.INDICES: ind_array, NamesI.VALUES: ci_array, NamesI.COVERAGE: 1 - alpha, NamesRS.BOOT_SAMPLE: boot_samp, } )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class BasicCI(BaseBootstrap): """ Basic bootstrap confidence interval estimator. Attributes ---------- _data : `tuple` [`np.ndarray`, ...] The input data arrays from which bootstrap samples are drawn. Notes ----- The basic bootstrap confidence interval is calculated by reflecting the standard percentile intervals around the original estimate. """ # ///////////////////////////////////////////////////////////////////////// def __call__(self, n_estimates: int, statsfunction: Callable, alpha: float = 0.05, n_reps: int = 999, **kwargs: Any) -> BootCIResults: """ Calculate the bootstrap confidence interval. Parameters ---------- n_estimates : `int` The number of estimates (statistics) to compute for each sample. statsfunction : `Callable` The statistical function applied to the bootstrap samples. alpha : `float`, default 0.05 The significance level for the confidence interval. n_reps : `int`, default 999 The number of bootstrap repetitions (default is 999). **kwargs : `dict` [`str`, `any`], optional Additional keyword arguments passed to `statsfunction`. Returns ------- BootCIResults A dataclass containing the confidence interval bounds, the bootstrap samples, and metadata. """ # check input is_type(alpha, float) # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) # the bootstrap samples boot_samp = self.bootstrap_samples( n_estimates, statsfunction, n_reps, **kwargs ) # the original (non-resampled) estimate orig_estimate = self.original_estimate( statsfunction, **kwargs ) # calculate percentiles lower_pct = (1 - alpha / 2) * 100 upper_pct = (alpha / 2) * 100 ind_array = np.atleast_2d(np.array([lower_pct, upper_pct])) # NOTE np.percentile will interpolate if needed # calculate the basic confidence intervals lower, upper = np.percentile( boot_samp, [lower_pct, upper_pct], axis=0, method='linear' ) ci_array = np.stack([ 2 * orig_estimate - lower, 2 * orig_estimate - upper, ], axis=1) # return results return BootCIResults( **{ CLASS_NAME :type(self).__name__, NamesI.INDICES: ind_array, NamesI.VALUES: ci_array, NamesI.COVERAGE: 1 - alpha, NamesRS.BOOT_SAMPLE: boot_samp, } )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class StudentisedCI(BaseBootstrap): """ Studentised bootstrap confidence interval estimator. Attributes ---------- _data : tuple of NDArray The input data arrays from which bootstrap samples are drawn. Notes ----- This class performs a nested bootstrap to estimate the standard error of the bootstrapped statistic(s) """ # ///////////////////////////////////////////////////////////////////////// def __call__(self, n_estimates: int, statsfunction: Callable, alpha: float = 0.05, n_reps: int = 999, n_nested: int = 100, **kwargs: Any) -> BootCIResults: """ Calculate the studentised bootstrap confidence interval. Parameters ---------- n_estimates : `int` The number of estimates (statistics) to compute for each sample. statsfunction : `Callable` The statistical function applied to the bootstrap samples. alpha : `float`, default 0.05 The significance level for the confidence interval. n_reps : `int`, default 999 The number of bootstrap repetitions. n_nested : `int`, default 100 The number of inner bootstrap repetitions for standard error estimation. **kwargs : `dict`, [`str`, `any`] Additional keyword arguments passed to `statsfunction`. Returns ------- BootCIResults A dataclass containing the confidence interval bounds, the bootstrap samples, and metadata. """ # ### check input is_type(alpha, float) # is alpha between 0 and 1 check_limits(alpha, 0.0, 1.0) is_type(n_reps, (int, float)) is_type(n_nested, (int, float)) # ### the bootstrap # ### pre-allocate memory boot_point = np.full((n_reps, n_estimates), np.nan) boot_se = np.full((n_reps, n_estimates), np.nan) # boot_point = [] # boot_se = [] for i in range(n_reps): boot_samp = _draw_bs_sample(self._data) # the outer estimates outer_est = statsfunction(*boot_samp, **kwargs, ) # map to array outer_est = self._map_sequence(outer_est, array = True) # boot_point.append(outer_est) boot_point[i] = outer_est # the inner loop inner_point = np.full((n_nested, n_estimates), np.nan) for j in range(n_nested): inner_samp = _draw_bs_sample(boot_samp) inner_est = statsfunction(*inner_samp, **kwargs) # make sure it is a list inner_est = self._map_sequence(inner_est, array = False) # inner_point.append(inner_est) inner_point[j] = inner_est # getting the standard error inner_point = np.array(inner_point) # Getting the sample sd inner_point, _ = self._filter_missing(inner_point) if np.isnan(inner_point).all(): if self._verbose == True: warnings.warn("Warning: inner_point is entirely NaN") se = np.nan else: se = np.std(inner_point, axis=0, ddof=1) # boot_se.append(se) boot_se[i] = se # point estimate on the non-resampled data orig_mean = self.original_estimate(statsfunction, **kwargs) if not isinstance(orig_mean, np.ndarray): orig_mean = np.array(orig_mean) # NOTE perhaps check if there are the same length? # removing possible missing data boot_point, keep_point = self._filter_missing(boot_point) boot_se, keep_se = self._filter_missing(boot_se[keep_point]) boot_point = boot_point[keep_se] # standard error of the outer boot if np.isnan(boot_point).all(): if self._verbose == True: warnings.warn("Warning: boot_point is entirely NaN") boot_point_se = np.nan else: boot_point_se = np.std(boot_point, axis=0, ddof=1) # Studentised t-statistics t_stats = (boot_point - orig_mean) / boot_se # The t-statistic based interval positions percentiles = np.percentile( t_stats, [100 * (1 - alpha/2), 100 * (alpha/2)], method="linear", axis=0, ) lower_t, upper_t = percentiles ind_array = np.atleast_2d(np.array([lower_t, upper_t])) # finally get the confidence interval ci_array = np.stack([ orig_mean - lower_t * boot_point_se, # lower bound orig_mean - upper_t * boot_point_se, # upper bound ], axis=1) # return return BootCIResults( **{ CLASS_NAME :type(self).__name__, NamesI.INDICES: ind_array, NamesI.VALUES: ci_array, NamesI.COVERAGE: 1 - alpha, NamesRS.BOOT_SAMPLE: boot_point, } ) # ///////////////////////////////////////////////////////////////////////// def _map_sequence(self, obj:Any, array:bool=False) -> list | np.ndarray: """ Map an object to a mutable sequence, optionally as a numpy array. Parameters ---------- obj : `any` The object to convert. Scalars and non-list/ndarray types are wrapped in a list first. array : `bool`, default `False` If `True`, convert the result to a `np.ndarray`. Returns ------- obj : `list` or `np.ndarray` The converted object. """ # map to list if not isinstance(obj, list) and not isinstance(obj, np.ndarray): obj = [obj] # check if we also want to map to array if array: if not isinstance(obj, np.ndarray): obj = np.array(obj) # return return obj # ///////////////////////////////////////////////////////////////////////// def _filter_missing(self, array: np.ndarray, ) -> tuple[np.ndarray, np.ndarray]: """ Remove rows containing NaNs from the array and return a boolean mask indicating non-missing rows. Parameters ---------- array : np.ndarray The array to check for missing values (NaNs). Returns ------- filtered_array : np.ndarray The input array with rows containing NaNs removed. non_missing_mask : np.ndarray Boolean array where True indicates rows without any NaNs. """ missings = np.isnan(array).any(axis=1) non_missing = ~missings if missings.any(): if self._verbose: warnings.warn('The bootstrap sample contained {} `nan` values ' 'which have been removed.'.format(sum(missings))) return array[non_missing], non_missing