# Source code for stats_misc.resampling

'''
A module for resampling techniques such as bootstrap, jackknife, or
permutation. Currently focused on confidence interval estimation using
canonical bootstrap algorithms.

'''
# main function
import warnings
import numpy as np
import numba as nb
import scipy.stats as st
# from numbers import Real
from numpy.typing import NDArray
from typing import (
    Any, Annotated, Callable, Optional,
)
from stats_misc.constants import (
    is_type,
    NamesResampling as NamesRS,
    NamesIntervals as NamesI,
    Error_MSG,
)


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# constants
# warning template used when `nan` estimates are dropped from a sample
MSG_NAN='There are {} `nan` values in `{}`. These will be omitted.'
# error-message templates shared by the resampling helpers below
ERR1 = "All arrays in `data` must have the same number of rows."
ERR2 = "nrows ({}) exceeds the number of rows in data ({})."
ERR3 = "`{}` is not callable."
# attribute name under which results classes store their display name
CLASS_NAME = '__CLASS_NAME'

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
class BootCIResults(object):
    '''
    The results class for bootstrapped confidence intervals.

    Attributes
    ----------
    interval_indices : `list` [`int`]
        The bootstrap indices of the interval lower and upper bounds.
    interval_values : `list` [`float`]
        The lower and upper bounds of the interval.
    coverage : `float`
        The interval coverage.
    '''

    # attribute names assigned when `set_args` is not supplied
    DEFAULT_ARGS = [
        CLASS_NAME,
        NamesI.INDICES,
        NamesI.VALUES,
        NamesI.COVERAGE,
        NamesRS.BOOT_SAMPLE,
    ]

    # NOTE the bare `*` forces every argument to be passed by keyword.
    def __init__(self, *, set_args: Optional[list[str]] = None,
                 **kwargs: Any):
        """
        Initialise a `BootCIResults` instance.

        Raises
        ------
        AttributeError
            If an unrecognised keyword is provided.
        """
        # fall back to the class-level defaults
        target_names = self.DEFAULT_ARGS if set_args is None else set_args
        # reject any keyword that is not an expected attribute name
        for key in kwargs:
            if key not in target_names:
                raise AttributeError(
                    "unrecognised argument '{0}'".format(key))
        # assign each expected attribute; a missing one is set to `None`
        # (with a warning) so downstream attribute access never fails
        for name in target_names:
            if name in kwargs:
                setattr(self, name, kwargs[name])
            else:
                warnings.warn(
                    "argument '{0}' is set to 'None'".format(name))
                setattr(self, name, None)

    # /////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        # fall back to 'BootCIResults' when no class name was stored
        label = getattr(self, CLASS_NAME, "BootCIResults")
        return f"A `{label}` results class."

    # /////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        # fall back to 'BootCIResults' when no class name was stored
        label = getattr(self, CLASS_NAME, "BootCIResults")
        return (f"{label}(interval_indices={getattr(self, NamesI.INDICES)}, "
                f"interval_values={getattr(self, NamesI.VALUES)}, "
                f"coverage={getattr(self, NamesI.COVERAGE)})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BCaCIResults(BootCIResults):
    """
    The results class for `BCaCI`.

    Attributes
    ----------
    interval_indices : `list` [`int`] or `np.ndarray`
        The bootstrap indices of the interval lower and upper bounds.
    interval_values : `list` [`float`] or `np.ndarray`
        The lower and upper bounds of the interval.
    coverage : `float`
        The confidence interval coverage.
    """

    # /////////////////////////////////////////////////////////////////////
    def __init__(self, **kwargs: Any) -> None:
        """
        Initialise a `BCaCIResults` instance.

        Raises
        ------
        AttributeError
            If an unrecognised keyword is provided.
        """
        # the full attribute set for a BCa interval: the base-class
        # attributes plus the jackknife sample and the BCa constants
        expected = [
            CLASS_NAME,
            NamesI.INDICES,
            NamesI.VALUES,
            NamesI.COVERAGE,
            NamesRS.BOOT_SAMPLE,
            NamesRS.JACK_SAMPLE,
            NamesRS.BCA_ACCELERATION,
            NamesRS.BCA_BIAS,
        ]
        # default display name unless the caller supplied one
        kwargs.setdefault(CLASS_NAME, "BCaCIResults")
        super().__init__(set_args=expected, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _random_row_indices(nrows:int) -> Annotated[NDArray[np.integer],(Any, 1)]:
    '''
    Takes a random sample of row indices with replacement.

    Parameters
    ----------
    nrows : `int`
        The number of rows.

    Returns
    -------
    rows : `np.ndarray`
        A (nrows, 1) column vector of resampled row indices.
    '''
    # numba requires a 2d array here, hence the reshape
    drawn = np.random.choice(nrows, nrows).reshape(nrows, 1)
    return drawn

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _sample_rows(arr: NDArray[np.generic],
                 rows: NDArray[np.generic],
                 ) -> NDArray[np.generic]:
    '''
    Extracts rows from a numpy array.

    Parameters
    ----------
    arr : `np.ndarray`
        A numpy array of any dimension.
    rows : `np.ndarray`
        A numpy array with shape (x, 1) containing the row indices to
        extract from `arr` (repeats allowed).

    Returns
    -------
    arr_resampled : `np.ndarray`
        A version of `arr` restricted to the requested rows, with rows
        possibly repeated.

    Notes
    -----
    This is a workaround because numba does not allow multi-dimensional
    array indexing such as `arr[rows, :]`.
    '''
    # number of rows requested and the shape of the source array
    n_out = rows.shape[0]
    src_shape = arr.shape
    # pre-allocate the output with the same trailing dimensions
    result = np.empty((n_out,) + src_shape[1:], dtype=arr.dtype)
    # copy the requested rows one by one; works for any dimensionality
    for i in range(n_out):
        result[i] = arr[rows[i, 0]]
    return result

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _draw_bs_sample(data:tuple[NDArray[Any]],
                    ) -> tuple[NDArray[Any]]:
    """
    Samples with replacement from a tuple of np.ndarrays of the same
    length.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Returns
    -------
    resample : `tuple` [`np.ndarray`]
        A tuple of resampled numpy arrays. For a proper bootstrap the
        number of resampled rows equals the number of rows of the input
        data.
    """
    # ### confirm input
    is_type(data, tuple)
    # ### all arrays must share the same number of rows
    n_obs = data[0].shape[0]
    for arr in data:
        if arr.shape[0] != n_obs:
            raise ValueError(ERR1)
    # ### one shared draw of row indices, applied to every array so that
    # rows stay aligned across the tuple
    drawn_rows = _random_row_indices(n_obs)
    return tuple(_sample_rows(arr, rows=drawn_rows) for arr in data)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _remove_row(data:tuple[NDArray[Any]], row_index:int,
                ) -> list[NDArray[Any]]:
    '''
    A jackknife helper function to iteratively remove a row from a tuple
    of arrays.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    row_index : `int`
        The row index which should be dropped.

    Returns
    -------
    results : `list` [`np.ndarray`]
        A list of numpy arrays with `row_index` removed.

    Notes
    -----
    The arrays should be at least 2d; the absence of a shape[1] value will
    throw an error.

    A non-numba implementation of this code would be
    `tuple(np.delete(arr, row_index, axis=0) for arr in data)`.
    '''
    # shared row count across the tuple
    n_rows = data[0].shape[0]
    reduced = []
    # build each output array by copying every row except `row_index`
    for arr in data:
        n_cols = arr.shape[1]
        kept = np.empty((n_rows - 1, n_cols), dtype=arr.dtype)
        dst = 0
        for src in range(n_rows):
            if src == row_index:
                continue
            kept[dst] = arr[src]
            dst += 1
        reduced.append(kept)
    return reduced

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def bootstrap_replicates(data:tuple[NDArray[Any], ...],
                         statsfunction:Callable,
                         n_estimates:int,
                         n_reps:int=999,
                         **kwargs:Optional[Any],
                         ) -> NDArray[Any]:
    '''
    Draws `n_reps` bootstrap samples and applies `statsfunction` on each
    bth sample, returning the results as a numpy array.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    statsfunction : `Callable`
        A function which can unpack a tuple of arrays and perform analyses
        on these: `statsfunction(*data)`. The function can return the
        estimate(s) as a single float/int, list, tuple or numpy array.
    n_estimates : `int`
        The number of estimates `statsfunction` will return.
    n_reps : `int`, default 999
        The number of bootstrap samples.
    **kwargs : `any`
        Any keyword arguments supplied to `statsfunction`.

    Returns
    -------
    boots : `np.ndarray`
        A 2d numpy array with dims equal to `n_reps` by `n_estimates`.

    Notes
    -----
    The helper functions have been optimised through numba.njit. Will
    internally map 1d arrays to 2d to deal with numba requirements.
    '''
    # ### input validation
    is_type(data, tuple)
    is_type(n_estimates, (float, int))
    is_type(n_reps, (float, int))
    if not callable(statsfunction):
        raise TypeError(ERR3.format(NamesRS.STATSFUNC))
    # ### promote any 1d array to 2d: the numba helpers require 2d input
    data = tuple(
        arr.reshape(arr.shape[0], 1) if arr.ndim == 1 else arr
        for arr in data
    )
    # one row of estimates per bootstrap repetition; initialised to nan so
    # failed or missing estimates remain identifiable downstream
    boot_reps = np.full((n_reps, n_estimates), np.nan)
    for rep in range(n_reps):
        boot_reps[rep] = statsfunction(*_draw_bs_sample(data), **kwargs)
    return boot_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def jackknife_replicates(data:tuple[NDArray[Any], ...],
                         statsfunction:Callable,
                         n_estimates:int,
                         **kwargs:Optional[Any],
                         ) -> NDArray[Any]:
    '''
    Performs a jackknife resampling procedure which systematically leaves
    out one observation at a time and applies `statsfunction`.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    statsfunction : `Callable`
        A function which can unpack a tuple of arrays and perform analyses
        on these: `statsfunction(*data)`. The function can return the
        estimate(s) as a single float/int, list, tuple or numpy array.
    n_estimates : `int`
        The number of estimates `statsfunction` will return.
    **kwargs : `any`
        Any keyword arguments supplied to `statsfunction`.

    Returns
    -------
    jacked : `np.ndarray`
        A 2d numpy array with dims equal to `data[0].shape[0]` by
        `n_estimates`.

    Notes
    -----
    The helper functions have been optimised through numba.njit. Will
    internally map 1d arrays to 2d to deal with numba requirements.
    '''
    # ### input validation
    is_type(data, tuple)
    is_type(n_estimates, (float, int))
    if not callable(statsfunction):
        raise TypeError(ERR3.format(NamesRS.STATSFUNC))
    # ### promote any 1d array to 2d for the numba helpers
    data = tuple(
        arr.reshape(arr.shape[0], 1) if arr.ndim == 1 else arr
        for arr in data
    )
    # one row of estimates per left-out observation
    n_obs = len(data[0])
    jack_reps = np.full((n_obs, n_estimates), np.nan)
    # apply the statistic to the sample with observation `left_out` removed
    for left_out in range(n_obs):
        jack_reps[left_out] = statsfunction(
            *_remove_row(data, row_index=left_out), **kwargs)
    return jack_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BCaCI(object):
    """
    A Bias-Corrected and Accelerated (BCa) bootstrap confidence interval.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Attributes
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Notes
    -----
    The BCa interval generally has a better coverage (i.e. the smallest
    confidence interval while retaining advertised coverage) than most
    alternative bootstrap confidence interval methods.
    """
    # /////////////////////////////////////////////////////////////////////
    def __init__(self, data:tuple[NDArray[Any], ...],) -> None:
        """
        Initialises a `BCaCI` instance.
        """
        # #### testing input
        is_type(data, tuple)
        # #### setting attributes
        setattr(self, NamesRS.DATA_PRIV, data)

    # /////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        CLASS_NAME = type(self).__name__
        return (f"{CLASS_NAME} instance with data=\n"
                f"{getattr(self, NamesRS.DATA_PRIV).__str__()}."
                )

    # /////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        CLASS_NAME = type(self).__name__
        return (f"{CLASS_NAME}(data=\n"
                f"{getattr(self, NamesRS.DATA_PRIV).__str__()})"
                )

    # /////////////////////////////////////////////////////////////////////
    @property
    def boot_sample(self) -> tuple[NDArray[Any]]:
        """Getter function for boot_sample"""
        return getattr(self, NamesRS.BOOT_SAMPLE)

    # /////////////////////////////////////////////////////////////////////
    @property
    def jack_sample(self) -> tuple[NDArray[Any]]:
        """Getter function for jack_sample"""
        return getattr(self, NamesRS.JACK_SAMPLE)

    # /////////////////////////////////////////////////////////////////////
    @property
    def data(self) -> tuple[NDArray[Any]]:
        """Getter function for data"""
        return getattr(self, NamesRS.DATA_PRIV)

    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates:int, statsfunction:Callable,
                 alpha:float=0.05, n_reps:int=999,
                 verbose:bool = True,
                 **kwargs:Optional[Any],
                 ) -> "BCaCIResults":
        """
        Calculates a BCa confidence interval, generating the bootstrap and
        jackknife re-samples needed to obtain the BCa estimate.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates `statsfunction` will return.
        statsfunction : `Callable`
            A function which can unpack a tuple of arrays and perform
            analyses on these: `statsfunction(*data)`. The function can
            return the estimate(s) as a single float/int, list, tuple or
            numpy array.
        alpha : `float`, default 0.05
            The desired coverage of the confidence interval
            `(1-alpha)*100`.
        n_reps : `int`, default 999
            The number of bootstrap samples.
        verbose : `bool`, default True
            Whether warnings about omitted `nan` estimates are emitted.
        **kwargs : `any`
            Any keyword arguments supplied to `statsfunction`.

        Returns
        -------
        results : `BCaCIResults` instance
            A results class instance returning the confidence intervals,
            coverage, and confidence interval bootstrap indices.
        """
        # #### check input
        is_type(n_estimates, int)
        is_type(n_reps, int)
        is_type(alpha, float)
        if not callable(statsfunction):
            raise TypeError(ERR3.format(NamesRS.STATSFUNC))
        # is alpha between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        setattr(self, NamesRS.N_ESTIMATES, n_estimates)
        setattr(self, NamesRS.N_REPS, n_reps)
        setattr(self, NamesRS.ALPHA, alpha)
        setattr(self, NamesRS.CI_COVERAGE, 1-alpha)
        # #### get bootstrap replicates
        boot_samp = bootstrap_replicates(
            getattr(self, NamesRS.DATA_PRIV),
            statsfunction=statsfunction,
            n_estimates=getattr(self, NamesRS.N_ESTIMATES),
            n_reps=getattr(self, NamesRS.N_REPS),
            **kwargs,
        )
        setattr(self, NamesRS.BOOT_SAMPLE, boot_samp)
        # #### get jackknife replicates
        jack_samp = jackknife_replicates(
            getattr(self, NamesRS.DATA_PRIV),
            statsfunction=statsfunction,
            n_estimates=getattr(self, NamesRS.N_ESTIMATES),
            **kwargs,
        )
        setattr(self, NamesRS.JACK_SAMPLE, jack_samp)
        # #### drop rows containing `nan` estimates, warning if requested
        missings_bs = np.isnan(boot_samp).any(axis=1)
        missings_jk = np.isnan(jack_samp).any(axis=1)
        if any(missings_bs):
            boot_samp = boot_samp[~missings_bs]
            if verbose:
                warnings.warn(MSG_NAN.format(sum(missings_bs), 'boot_samp'))
        if any(missings_jk):
            jack_samp = jack_samp[~missings_jk]
            if verbose:
                warnings.warn(MSG_NAN.format(sum(missings_jk), 'jack_samp'))
        # #### computing the acceleration and bias constants
        setattr(self, NamesRS.BCA_ACCELERATION,
                type(self)._compute_acceleration(jack_samp))
        setattr(self, NamesRS.BCA_BIAS,
                type(self)._compute_bias(
                    data=getattr(self, NamesRS.DATA_PRIV),
                    boot_samp=getattr(self, NamesRS.BOOT_SAMPLE),
                    statsfunction=statsfunction,
                    **kwargs,
                )
                )
        # #### get lower and upper limits
        # TODO the below, until return, should really be a separate function
        alphas = np.array([alpha/2., 1-alpha/2.])
        # loop over the estimates (per column entry)
        ci_array = np.full((len(boot_samp.T), 2), np.nan)
        int_array = np.full((len(boot_samp.T), 2), np.nan)
        for k in range(len(boot_samp.T)):
            a_k = getattr(self, NamesRS.BCA_ACCELERATION)[k]
            z0_k = getattr(self, NamesRS.BCA_BIAS)[k]
            # BCa adjusted quantile levels:
            # Phi(z0 + (z0 + z_alpha) / (1 - a*(z0 + z_alpha)))
            zs = z0_k + st.norm.ppf(alphas)
            avals = st.norm.cdf(z0_k + zs/(1-a_k*zs))
            ints = np.round((len(boot_samp)-1)*avals)
            ints = np.nan_to_num(ints).astype('int')
            int_array[k] = ints
            # Compute confidence interval from the sorted bootstrap column
            boot_samp_sort = np.sort(boot_samp.T[k])
            ci_low = boot_samp_sort[ints[0]]
            ci_high = boot_samp_sort[ints[1]]
            ci_array[k] = ci_low, ci_high
        # return a results instance
        return BCaCIResults(
            **{
                NamesI.INDICES : int_array,
                NamesI.VALUES : ci_array,
                NamesI.COVERAGE : getattr(self, NamesRS.CI_COVERAGE),
                NamesRS.BOOT_SAMPLE : getattr(self, NamesRS.BOOT_SAMPLE),
                NamesRS.JACK_SAMPLE : getattr(self, NamesRS.JACK_SAMPLE),
                NamesRS.BCA_ACCELERATION :
                    getattr(self, NamesRS.BCA_ACCELERATION),
                NamesRS.BCA_BIAS : getattr(self, NamesRS.BCA_BIAS),
            }
        )

    # /////////////////////////////////////////////////////////////////////
    @staticmethod
    def _compute_acceleration(jack_samp:NDArray[Any]) -> list[float]:
        """
        Returns the acceleration constant a for a BCa confidence interval.

        Parameters
        ----------
        jack_samp : `np.ndarray`
            An array of jack knife estimates.

        Return
        ------
        accel : `list` [`float`]
            Acceleration values for each column of `jack_samp`.

        Notes
        -----
        Uses the standard jackknife estimate
        ``a = sum(d**3) / (6 * sum(d**2)**1.5)`` with
        ``d = mean(jack) - jack``.  The previous implementation summed the
        deviations *before* raising them to a power; since deviations
        around the mean sum to (numerically) zero, that collapsed both the
        numerator and the denominator to ~0.
        """
        # #### make sure input is at least a 2d array
        if jack_samp.ndim == 1:
            jack_samp = jack_samp.reshape(jack_samp.shape[0], 1)
        # #### column means
        means = np.mean(jack_samp, axis=0)
        if not isinstance(means, np.ndarray):
            means = np.array([means], dtype=np.float64)
        # make sure means only contains 1 dim
        if means.ndim != 1:
            if not (means.shape[0] == 1 or means.shape[1] == 1):
                raise IndexError(
                    f"`means` shape should contain at least a 1, not "
                    f"{means.shape}."
                )
        # #### loop over columns
        accel = []
        for k, jc in enumerate(jack_samp.T):
            # deviations of the jackknife estimates from their mean
            d = means[k] - jc
            # NOTE if every jackknife estimate is identical the denominator
            # is zero and `a` becomes nan — callers should be aware.
            a = (1/6) * np.divide(
                np.sum(d**3),
                np.sum(d**2)**(3/2),
            )
            accel.append(a)
        # return list
        return accel

    # /////////////////////////////////////////////////////////////////////
    @staticmethod
    def _compute_bias(data:tuple[NDArray[Any]],
                      boot_samp:NDArray[Any],
                      statsfunction:Callable,
                      **kwargs:Optional[Any],
                      ) -> list[float]:
        """
        Computes z0, the bias correction constant, using the original
        estimates and the bootstrapped estimates.

        Parameters
        ----------
        data : `tuple` [`np.ndarray`]
            A tuple of numpy arrays with the same number of rows.
        boot_samp : `np.ndarray`
            An array of bootstrap samples, can be either 1d or 2d.
        statsfunction : `Callable`
            A function which can unpack a tuple of arrays and perform
            analyses on these: `statsfunction(*data)`.
        **kwargs : `any`
            Any keyword arguments supplied to `statsfunction`.

        Returns
        -------
        z0 : `list` [`float`]
            Values of `z0` per column of `boot_samp`.
        """
        # #### apply the statsfunction on the original (non-resampled) data
        s = statsfunction(*data, **kwargs)
        # make sure it is a list
        if not isinstance(s, list):
            s = [s]
        # #### get z0 per column
        boot_n = boot_samp.shape[0]
        z0 = []
        for b, boot_b in enumerate(boot_samp.T):
            # z0 is zero when exactly half of the bootstrap sample is less
            # than the original estimate
            z0.append(st.norm.ppf(np.sum(boot_b < s[b]) / boot_n))
        # return
        return z0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BaseBootstrap(object):
    """
    Base class for generating bootstrap estimates.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`, ...]
        A tuple containing one or more numpy arrays representing the input
        data to be used for bootstrapping.
    verbose : `bool`, default `True`
        Whether warnings should be returned.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.

    Methods
    -------
    bootstrap_samples(n_estimates, statsfunction, n_reps, **kwargs)
        Generate bootstrap replicates and return the resulting statistics.
    original_estimate(statsfunction, **kwargs)
        Compute the statistic(s) on the original (non-resampled) data.

    Notes
    -----
    This class handles the generation of bootstrap replicates and the
    calculation of original estimates from the provided dataset using a
    specified statistical function.
    """
    # /////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        """ String representation of the class instance. """
        return (f"{type(self).__name__}(data=tuple of "
                f"{len(self._data)} numpy arrays)")

    # /////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        """ String representation of the class instance. """
        return (f"{type(self).__name__} with {len(self._data)} data "
                "arrays.")

    # /////////////////////////////////////////////////////////////////////
    def __init__(self, data: tuple[NDArray[Any], ...],
                 verbose:bool=True,
                 ):
        """Initialise the BaseBootstrapCI instance. """
        is_type(data, tuple)
        is_type(verbose, bool)
        self._data = data
        self._verbose = verbose

    # /////////////////////////////////////////////////////////////////////
    def bootstrap_samples(self, n_estimates: int,
                          statsfunction: Callable,
                          n_reps: int = 999,
                          verbose:bool=True,
                          **kwargs: Optional[Any],
                          ) -> NDArray[Any]:
        """
        Generates a bootstrap sample, applies a function, and removes
        missing values.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            bootstrap sample.
        statsfunction : `Callable`
            The statistical function to apply to each bootstrap sample.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        np.ndarray
            An array of bootstrap estimates, with any samples containing
            missing values removed.
        """
        # validate input
        is_type(n_estimates, int)
        is_type(n_reps, (int, float))
        if not callable(statsfunction):
            raise TypeError(ERR3.format(NamesRS.STATSFUNC))
        # delegate the resampling itself to `bootstrap_replicates`
        replicates = bootstrap_replicates(
            self._data, statsfunction, n_estimates, n_reps, **kwargs,
        )
        # drop any replicate row containing a `nan`, warning if verbose
        mask_nan = np.isnan(replicates).any(axis=1)
        if mask_nan.any():
            replicates = replicates[~mask_nan]
            if self._verbose:
                warnings.warn(MSG_NAN.format(
                    mask_nan.sum(),
                    'the bootstraped esimates [boot_samp]'))
        return replicates

    # /////////////////////////////////////////////////////////////////////
    def original_estimate(self, statsfunction: Callable,
                          **kwargs:Optional[Any],
                          ) -> np.ndarray:
        """
        Calculate the estimate based on the non-resampled data.

        Parameters
        ----------
        statsfunction : `Callable`
            The statistical function which will be applied to `data`.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        np.ndarray
            An array containing the original statistic(s).
        """
        # apply the statistic to the original data
        estimate = statsfunction(*self._data, **kwargs)
        # already an array: hand it back unchanged
        if isinstance(estimate, np.ndarray):
            return estimate
        # wrap a bare scalar so np.array produces a 1d array
        if not isinstance(estimate, list):
            estimate = [estimate]
        return np.array(estimate)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class PercentileCI(BaseBootstrap):
    """
    Percentile bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.
    """
    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05,
                 n_reps: int = 999,
                 **kwargs: Optional[Any]) -> BootCIResults:
        """
        Calculate the percentile bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results class containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # validate alpha
        is_type(alpha, float)
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        # draw the bootstrap replicates (missing rows already removed)
        replicates = self.bootstrap_samples(
            n_estimates, statsfunction, n_reps, **kwargs
        )
        # the percentile bounds of the interval
        pct_bounds = [(alpha/2) * 100, (1 - alpha/2) * 100]
        ind_array = np.atleast_2d(np.array(pct_bounds))
        # NOTE np.percentile interpolates when the requested percentiles do
        # not exactly match elements in the bootstrap sample
        ci_array = np.percentile(replicates, pct_bounds,
                                 axis=0, method='linear').T
        # package everything into a results instance
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: replicates,
            }
        )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BasicCI(BaseBootstrap):
    """
    Basic bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.

    Notes
    -----
    The basic bootstrap confidence interval is calculated by reflecting
    the standard percentile intervals around the original estimate.
    """
    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05,
                 n_reps: int = 999,
                 **kwargs: Optional[Any]) -> BootCIResults:
        """
        Calculate the bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`], optional
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results class containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # validate alpha
        is_type(alpha, float)
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        # draw the bootstrap replicates (missing rows already removed)
        replicates = self.bootstrap_samples(
            n_estimates, statsfunction, n_reps, **kwargs
        )
        # the original (non-resampled) estimate to reflect around
        point = self.original_estimate(statsfunction, **kwargs)
        # NOTE the percentile order is deliberately inverted: reflecting
        # the bounds around `point` swaps lower and upper
        pct_hi = (1 - alpha / 2) * 100
        pct_lo = (alpha / 2) * 100
        ind_array = np.atleast_2d(np.array([pct_hi, pct_lo]))
        # NOTE np.percentile will interpolate if needed
        q_hi, q_lo = np.percentile(
            replicates, [pct_hi, pct_lo], axis=0, method='linear'
        )
        # the basic (reflected) confidence interval
        ci_array = np.stack([
            2 * point - q_hi,
            2 * point - q_lo,
        ], axis=1)
        # package everything into a results instance
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: replicates,
            }
        )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class StudentisedCI(BaseBootstrap):
    """
    Studentised bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : tuple of NDArray
        The input data arrays from which bootstrap samples are drawn.

    Notes
    -----
    This class performs a nested bootstrap to estimate the standard error
    of the bootstrapped statistic(s).
    """
    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05,
                 n_reps: int = 999,
                 n_nested: int = 100,
                 **kwargs: Optional[Any]) -> BootCIResults:
        """
        Calculate the studentised bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        n_nested : `int`, default 100
            The number of inner bootstrap repetitions for standard error
            estimation.
        **kwargs : `dict`, [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results class containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # ### check input
        is_type(alpha, float)
        # is alpha between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        is_type(n_reps, (int, float))
        is_type(n_nested, (int, float))
        # ### pre-allocate memory for the outer estimates and their SEs
        boot_point = np.full((n_reps, n_estimates), np.nan)
        boot_se = np.full((n_reps, n_estimates), np.nan)
        for i in range(n_reps):
            boot_samp = _draw_bs_sample(self._data)
            # the outer estimate for this bootstrap sample
            outer_est = statsfunction(*boot_samp, **kwargs, )
            outer_est = self._map_sequence(outer_est, array = True)
            boot_point[i] = outer_est
            # the inner (nested) bootstrap used for the standard error
            inner_point = np.full((n_nested, n_estimates), np.nan)
            for j in range(n_nested):
                inner_samp = _draw_bs_sample(boot_samp)
                inner_est = statsfunction(*inner_samp, **kwargs)
                # make sure it is a (mutable) sequence before row-assign
                inner_est = self._map_sequence(inner_est, array = False)
                inner_point[j] = inner_est
            # sample sd of the inner estimates, ignoring missing rows
            inner_point, _ = self._filter_missing(inner_point)
            if np.isnan(inner_point).all():
                if self._verbose:
                    warnings.warn("Warning: inner_point is entirely NaN")
                se = np.nan
            else:
                se = np.std(inner_point, axis=0, ddof=1)
            boot_se[i] = se
        # point estimate on the non-resampled data
        orig_mean = self.original_estimate(statsfunction, **kwargs)
        # defensive: `original_estimate` returns an array for the base
        # implementation, but a subclass override might not
        if not isinstance(orig_mean, np.ndarray):
            orig_mean = np.array(orig_mean)
        # removing possible missing data; keep point and SE rows aligned
        boot_point, keep_point = self._filter_missing(boot_point)
        boot_se, keep_se = self._filter_missing(boot_se[keep_point])
        boot_point = boot_point[keep_se]
        # standard error of the outer boot
        if np.isnan(boot_point).all():
            if self._verbose:
                warnings.warn("Warning: boot_point is entirely NaN")
            boot_point_se = np.nan
        else:
            boot_point_se = np.std(boot_point, axis=0, ddof=1)
        # studentised t-statistics
        t_stats = (boot_point - orig_mean) / boot_se
        # the t-statistic based interval positions; note the high quantile
        # of t maps to the LOWER confidence bound (and vice versa)
        percentiles = np.percentile(
            t_stats,
            [100 * (1 - alpha/2), 100 * (alpha/2)],
            method="linear",
            axis=0,
        )
        lower_t, upper_t = percentiles
        ind_array = np.atleast_2d(np.array([lower_t, upper_t]))
        # finally get the confidence interval
        ci_array = np.stack([
            orig_mean - lower_t * boot_point_se,   # lower bound
            orig_mean - upper_t * boot_point_se,   # upper bound
        ], axis=1)
        # return
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: boot_point,
            }
        )

    # /////////////////////////////////////////////////////////////////////
    def _map_sequence(self, obj:Any, array:bool=False) -> list | np.ndarray:
        """
        Maps an object to a mutable sequence.

        Parameters
        ----------
        obj : `any`
            A scalar, list, or numpy array.
        array : `bool`, default False
            If True the result is additionally converted to a numpy array.

        Returns
        -------
        `list` or `np.ndarray`
            `obj` wrapped in a list if it was a bare scalar, optionally
            converted to an array.
        """
        # wrap bare scalars so the result supports row assignment
        if not isinstance(obj, list) and not isinstance(obj, np.ndarray):
            obj = [obj]
        # optionally promote to a numpy array
        if array and not isinstance(obj, np.ndarray):
            obj = np.array(obj)
        return obj

    # /////////////////////////////////////////////////////////////////////
    def _filter_missing(self, array: np.ndarray,
                        ) -> tuple[np.ndarray, np.ndarray]:
        """
        Remove rows containing NaNs from the array and return a boolean
        mask indicating non-missing rows.

        Parameters
        ----------
        array : np.ndarray
            The array to check for missing values (NaNs).

        Returns
        -------
        filtered_array : np.ndarray
            The input array with rows containing NaNs removed.
        non_missing_mask : np.ndarray
            Boolean array where True indicates rows without any NaNs.
        """
        missings = np.isnan(array).any(axis=1)
        non_missing = ~missings
        if missings.any():
            if self._verbose:
                warnings.warn('The bootstrap sample contained {} `nan` values '
                              'which have been removed.'.format(sum(missings)))
        return array[non_missing], non_missing
# # Toy example # from stats_misc.machine_learning.validation import ( # cstatistic, # ) # # create a function which returns a single float # def cstat_boot(*data): # ''' # Function to use for bootstrapping # ''' # cstat = cstatistic(data[0], data[1]) # point = cstat.c_statistic # if np.isnan(point): # return np.nan # return point # # example data # x = np.array([0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0]) # y = np.array([0.1, 0.7, 0.3, 0.6, 0.9, 0.1, 0.2, 0.3, 0.4, 0.01, 0.9, 0.8, 0.15, 0.71, 0.8]) # data = (x, y) # n_reps=999 # n_estimates = 1 # statsfunction = cstat_boot # cstat_boot(*(x,y)) # res1 = PercentileCI(data) # res2 = res1(1, cstat_boot)