# Source code for stats_misc.resampling

'''
A module for resampling techniques such as bootstrap, jackknife, or
permutation. Currently focused on confidence interval estimation using
canonical bootstrap algorithms.

'''
# main function
import warnings
import numpy as np
import numba as nb
import scipy.stats as st
# from numbers import Real
from numpy.typing import NDArray
from typing import (
    Any, Annotated, Callable, Optional,
)
from stats_misc.constants import (
    is_type,
    NamesResampling as NamesRS,
    NamesIntervals as NamesI,
    Error_MSG,
)


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# constants
# warning template used when `nan` estimates are dropped from a sample
MSG_NAN='There are {} `nan` values in `{}`. These will be omitted.'
# error-message templates shared by the resampling helpers below
ERR1 = "All arrays in `data` must have the same number of rows."
ERR2 = "nrows ({}) exceeds the number of rows in data ({})."
ERR3 = "`{}` is not callable."
# attribute name under which results classes store their display name
CLASS_NAME = '__CLASS_NAME'

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
class BootCIResults(object):
    '''
    The results class for bootstrapped confidence intervals.

    Attributes
    ----------
    interval_indices : `list` [`int`]
        The bootstrap indices of the interval lower and upper bounds.
    interval_values : `list` [`float`]
        The lower and upper bounds of the interval.
    coverage : `float`
        The interval coverage.
    '''

    # attribute names assigned when `set_args` is not supplied
    DEFAULT_ARGS = [
        CLASS_NAME,
        NamesI.INDICES,
        NamesI.VALUES,
        NamesI.COVERAGE,
        NamesRS.BOOT_SAMPLE,
    ]

    # NOTE the bare `*` forces every argument to be passed by keyword.
    def __init__(self, *, set_args: Optional[list[str]] = None,
                 **kwargs: Any):
        """
        Initialise a `BootCIResults` instance.

        Raises
        ------
        AttributeError
            If an unrecognised keyword is provided.
        """
        # fall back to the class-level defaults
        target_names = self.DEFAULT_ARGS if set_args is None else set_args
        # reject any keyword that is not an expected attribute name
        for key in kwargs:
            if key not in target_names:
                raise AttributeError(
                    "unrecognised argument '{0}'".format(key))
        # assign each expected attribute; a missing one is set to `None`
        # (with a warning) so downstream attribute access never fails
        for name in target_names:
            if name in kwargs:
                setattr(self, name, kwargs[name])
            else:
                warnings.warn(
                    "argument '{0}' is set to 'None'".format(name))
                setattr(self, name, None)

    # /////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        # fall back to 'BootCIResults' when no class name was stored
        label = getattr(self, CLASS_NAME, "BootCIResults")
        return f"A `{label}` results class."

    # /////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        # fall back to 'BootCIResults' when no class name was stored
        label = getattr(self, CLASS_NAME, "BootCIResults")
        return (f"{label}(interval_indices={getattr(self, NamesI.INDICES)}, "
                f"interval_values={getattr(self, NamesI.VALUES)}, "
                f"coverage={getattr(self, NamesI.COVERAGE)})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BCaCIResults(BootCIResults):
    """
    The results class for `BCaCI`.

    Attributes
    ----------
    interval_indices : `list` [`int`] or `np.ndarray`
        The bootstrap indices of the interval lower and upper bounds.
    interval_values : `list` [`float`] or `np.ndarray`
        The lower and upper bounds of the interval.
    coverage : `float`
        The confidence interval coverage.
    """

    # /////////////////////////////////////////////////////////////////////
    def __init__(self, **kwargs: Any) -> None:
        """
        Initialise a `BCaCIResults` instance.

        Raises
        ------
        AttributeError
            If an unrecognised keyword is provided.
        """
        # the full attribute set for a BCa interval: the base-class
        # attributes plus the jackknife sample and the BCa constants
        expected = [
            CLASS_NAME,
            NamesI.INDICES,
            NamesI.VALUES,
            NamesI.COVERAGE,
            NamesRS.BOOT_SAMPLE,
            NamesRS.JACK_SAMPLE,
            NamesRS.BCA_ACCELERATION,
            NamesRS.BCA_BIAS,
        ]
        # default display name unless the caller supplied one
        kwargs.setdefault(CLASS_NAME, "BCaCIResults")
        super().__init__(set_args=expected, **kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _random_row_indices(nrows:int) -> Annotated[NDArray[np.integer],(Any, 1)]:
    '''
    Takes a random sample of row indices with replacement.

    Parameters
    ----------
    nrows : `int`
        The number of rows.

    Returns
    -------
    rows : `np.ndarray`
        A (nrows, 1) column vector of resampled row indices.
    '''
    # numba requires a 2d array here, hence the reshape
    drawn = np.random.choice(nrows, nrows).reshape(nrows, 1)
    return drawn

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nb.njit
def _sample_rows(arr: NDArray[np.generic],
                 rows: NDArray[np.generic],
                 ) -> NDArray[np.generic]:
    '''
    Extracts rows from a numpy array.

    Parameters
    ----------
    arr : `np.ndarray`
        A numpy array of any dimension.
    rows : `np.ndarray`
        A numpy array with shape (x, 1) containing the row indices to
        extract from `arr` (repeats allowed).

    Returns
    -------
    arr_resampled : `np.ndarray`
        A version of `arr` restricted to the requested rows, with rows
        possibly repeated.

    Notes
    -----
    This is a workaround because numba does not allow multi-dimensional
    array indexing such as `arr[rows, :]`.
    '''
    # number of rows requested and the shape of the source array
    n_out = rows.shape[0]
    src_shape = arr.shape
    # pre-allocate the output with the same trailing dimensions
    result = np.empty((n_out,) + src_shape[1:], dtype=arr.dtype)
    # copy the requested rows one by one; works for any dimensionality
    for i in range(n_out):
        result[i] = arr[rows[i, 0]]
    return result

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _draw_bs_sample(data:tuple[NDArray[Any]],
                    ) -> tuple[NDArray[Any]]:
    """
    Samples with replacement from a tuple of np.ndarrays of the same
    length.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Returns
    -------
    resample : `tuple` [`np.ndarray`]
        A tuple of resampled numpy arrays. For a proper bootstrap the
        number of resampled rows equals the number of rows of the input
        data.
    """
    # ### confirm input
    is_type(data, tuple)
    # ### all arrays must share the same number of rows
    n_obs = data[0].shape[0]
    for arr in data:
        if arr.shape[0] != n_obs:
            raise ValueError(ERR1)
    # ### one shared draw of row indices, applied to every array so that
    # rows stay aligned across the tuple
    drawn_rows = _random_row_indices(n_obs)
    return tuple(_sample_rows(arr, rows=drawn_rows) for arr in data)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _remove_row(data:tuple[NDArray[Any]], row_index:int,
                ) -> list[NDArray[Any]]:
    '''
    A jackknife helper function to iteratively remove a row from a tuple
    of arrays.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    row_index : `int`
        The row index which should be dropped.

    Returns
    -------
    results : `list` [`np.ndarray`]
        A list of numpy arrays with `row_index` removed.

    Notes
    -----
    The arrays should be at least 2d; the absence of a shape[1] value will
    throw an error.

    A non-numba implementation of this code would be
    `tuple(np.delete(arr, row_index, axis=0) for arr in data)`.
    '''
    # shared row count across the tuple
    n_rows = data[0].shape[0]
    reduced = []
    # build each output array by copying every row except `row_index`
    for arr in data:
        n_cols = arr.shape[1]
        kept = np.empty((n_rows - 1, n_cols), dtype=arr.dtype)
        dst = 0
        for src in range(n_rows):
            if src == row_index:
                continue
            kept[dst] = arr[src]
            dst += 1
        reduced.append(kept)
    return reduced

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def bootstrap_replicates(data:tuple[NDArray[Any], ...],
                         statsfunction:Callable,
                         n_estimates:int,
                         n_reps:int=999,
                         **kwargs:Optional[Any],
                         ) -> NDArray[Any]:
    '''
    Draws `n_reps` bootstrap samples and applies `statsfunction` on each
    bth sample, returning the results as a numpy array.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    statsfunction : `Callable`
        A function which can unpack a tuple of arrays and perform analyses
        on these: `statsfunction(*data)`. The function can return the
        estimate(s) as a single float/int, list, tuple or numpy array.
    n_estimates : `int`
        The number of estimates `statsfunction` will return.
    n_reps : `int`, default 999
        The number of bootstrap samples.
    **kwargs : `any`
        Any keyword arguments supplied to `statsfunction`.

    Returns
    -------
    boots : `np.ndarray`
        A 2d numpy array with dims equal to `n_reps` by `n_estimates`.

    Notes
    -----
    The helper functions have been optimised through numba.njit. Will
    internally map 1d arrays to 2d to deal with numba requirements.
    '''
    # ### input validation
    is_type(data, tuple)
    is_type(n_estimates, (float, int))
    is_type(n_reps, (float, int))
    if not callable(statsfunction):
        raise TypeError(ERR3.format(NamesRS.STATSFUNC))
    # ### promote any 1d array to 2d: the numba helpers require 2d input
    data = tuple(
        arr.reshape(arr.shape[0], 1) if arr.ndim == 1 else arr
        for arr in data
    )
    # one row of estimates per bootstrap repetition; initialised to nan so
    # failed or missing estimates remain identifiable downstream
    boot_reps = np.full((n_reps, n_estimates), np.nan)
    for rep in range(n_reps):
        boot_reps[rep] = statsfunction(*_draw_bs_sample(data), **kwargs)
    return boot_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def jackknife_replicates(data:tuple[NDArray[Any], ...],
                         statsfunction:Callable,
                         n_estimates:int,
                         **kwargs:Optional[Any],
                         ) -> NDArray[Any]:
    '''
    Performs a jackknife resampling procedure which systematically leaves
    out one observation at a time and applies `statsfunction`.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.
    statsfunction : `Callable`
        A function which can unpack a tuple of arrays and perform analyses
        on these: `statsfunction(*data)`. The function can return the
        estimate(s) as a single float/int, list, tuple or numpy array.
    n_estimates : `int`
        The number of estimates `statsfunction` will return.
    **kwargs : `any`
        Any keyword arguments supplied to `statsfunction`.

    Returns
    -------
    jacked : `np.ndarray`
        A 2d numpy array with dims equal to `data[0].shape[0]` by
        `n_estimates`.

    Notes
    -----
    The helper functions have been optimised through numba.njit. Will
    internally map 1d arrays to 2d to deal with numba requirements.
    '''
    # ### input validation
    is_type(data, tuple)
    is_type(n_estimates, (float, int))
    if not callable(statsfunction):
        raise TypeError(ERR3.format(NamesRS.STATSFUNC))
    # ### promote any 1d array to 2d for the numba helpers
    data = tuple(
        arr.reshape(arr.shape[0], 1) if arr.ndim == 1 else arr
        for arr in data
    )
    # one row of estimates per left-out observation
    n_obs = len(data[0])
    jack_reps = np.full((n_obs, n_estimates), np.nan)
    # apply the statistic to the sample with observation `left_out` removed
    for left_out in range(n_obs):
        jack_reps[left_out] = statsfunction(
            *_remove_row(data, row_index=left_out), **kwargs)
    return jack_reps
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BCaCI(object):
    """
    A Bias-Corrected and Accelerated (BCa) bootstrap confidence interval.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Attributes
    ----------
    data : `tuple` [`np.ndarray`]
        A tuple of numpy arrays with the same number of rows.

    Notes
    -----
    The BCa interval generally has a better coverage (i.e. the smallest
    confidence interval while retaining advertised coverage) than most
    alternative bootstrap confidence interval methods.
    """
    # /////////////////////////////////////////////////////////////////////
    def __init__(self, data:tuple[NDArray[Any], ...],) -> None:
        """
        Initialises a `BCaCI` instance.
        """
        # #### testing input
        is_type(data, tuple)
        # #### setting attributes
        setattr(self, NamesRS.DATA_PRIV, data)

    # /////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        CLASS_NAME = type(self).__name__
        return (f"{CLASS_NAME} instance with data=\n"
                f"{getattr(self, NamesRS.DATA_PRIV).__str__()}."
                )

    # /////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        CLASS_NAME = type(self).__name__
        return (f"{CLASS_NAME}(data=\n"
                f"{getattr(self, NamesRS.DATA_PRIV).__str__()})"
                )

    # /////////////////////////////////////////////////////////////////////
    @property
    def boot_sample(self) -> tuple[NDArray[Any]]:
        """Getter function for boot_sample"""
        return getattr(self, NamesRS.BOOT_SAMPLE)

    # /////////////////////////////////////////////////////////////////////
    @property
    def jack_sample(self) -> tuple[NDArray[Any]]:
        """Getter function for jack_sample"""
        return getattr(self, NamesRS.JACK_SAMPLE)

    # /////////////////////////////////////////////////////////////////////
    @property
    def data(self) -> tuple[NDArray[Any]]:
        """Getter function for data"""
        return getattr(self, NamesRS.DATA_PRIV)

    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates:int, statsfunction:Callable,
                 alpha:float=0.05, n_reps:int=999,
                 verbose:bool = True,
                 **kwargs:Optional[Any],
                 ) -> "BCaCIResults":
        """
        Calculates a BCa confidence interval, generating the bootstrap and
        jackknife re-samples needed to obtain the BCa estimate.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates `statsfunction` will return.
        statsfunction : `Callable`
            A function which can unpack a tuple of arrays and perform
            analyses on these: `statsfunction(*data)`. The function can
            return the estimate(s) as a single float/int, list, tuple or
            numpy array.
        alpha : `float`, default 0.05
            The desired coverage of the confidence interval
            `(1-alpha)*100`.
        n_reps : `int`, default 999
            The number of bootstrap samples.
        verbose : `bool`, default True
            Whether warnings about omitted `nan` estimates are emitted.
        **kwargs : `any`
            Any keyword arguments supplied to `statsfunction`.

        Returns
        -------
        results : `BCaCIResults` instance
            A results class instance returning the confidence intervals,
            coverage, and confidence interval bootstrap indices.
        """
        # #### check input
        is_type(n_estimates, int)
        is_type(n_reps, int)
        is_type(alpha, float)
        if not callable(statsfunction):
            raise TypeError(ERR3.format(NamesRS.STATSFUNC))
        # is alpha between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        setattr(self, NamesRS.N_ESTIMATES, n_estimates)
        setattr(self, NamesRS.N_REPS, n_reps)
        setattr(self, NamesRS.ALPHA, alpha)
        setattr(self, NamesRS.CI_COVERAGE, 1-alpha)
        # #### get bootstrap replicates
        boot_samp = bootstrap_replicates(
            getattr(self, NamesRS.DATA_PRIV),
            statsfunction=statsfunction,
            n_estimates=getattr(self, NamesRS.N_ESTIMATES),
            n_reps=getattr(self, NamesRS.N_REPS),
            **kwargs,
        )
        setattr(self, NamesRS.BOOT_SAMPLE, boot_samp)
        # #### get jackknife replicates
        jack_samp = jackknife_replicates(
            getattr(self, NamesRS.DATA_PRIV),
            statsfunction=statsfunction,
            n_estimates=getattr(self, NamesRS.N_ESTIMATES),
            **kwargs,
        )
        setattr(self, NamesRS.JACK_SAMPLE, jack_samp)
        # #### drop rows containing `nan` estimates, warning if requested
        missings_bs = np.isnan(boot_samp).any(axis=1)
        missings_jk = np.isnan(jack_samp).any(axis=1)
        if any(missings_bs):
            boot_samp = boot_samp[~missings_bs]
            if verbose:
                warnings.warn(MSG_NAN.format(sum(missings_bs), 'boot_samp'))
        if any(missings_jk):
            jack_samp = jack_samp[~missings_jk]
            if verbose:
                warnings.warn(MSG_NAN.format(sum(missings_jk), 'jack_samp'))
        # #### computing the acceleration and bias constants
        setattr(self, NamesRS.BCA_ACCELERATION,
                type(self)._compute_acceleration(jack_samp))
        setattr(self, NamesRS.BCA_BIAS,
                type(self)._compute_bias(
                    data=getattr(self, NamesRS.DATA_PRIV),
                    boot_samp=getattr(self, NamesRS.BOOT_SAMPLE),
                    statsfunction=statsfunction,
                    **kwargs,
                )
                )
        # #### get lower and upper limits
        # TODO the below, until return, should really be a separate function
        alphas = np.array([alpha/2., 1-alpha/2.])
        # loop over the estimates (per column entry)
        ci_array = np.full((len(boot_samp.T), 2), np.nan)
        int_array = np.full((len(boot_samp.T), 2), np.nan)
        for k in range(len(boot_samp.T)):
            a_k = getattr(self, NamesRS.BCA_ACCELERATION)[k]
            z0_k = getattr(self, NamesRS.BCA_BIAS)[k]
            # BCa adjusted quantile levels:
            # Phi(z0 + (z0 + z_alpha) / (1 - a*(z0 + z_alpha)))
            zs = z0_k + st.norm.ppf(alphas)
            avals = st.norm.cdf(z0_k + zs/(1-a_k*zs))
            ints = np.round((len(boot_samp)-1)*avals)
            ints = np.nan_to_num(ints).astype('int')
            int_array[k] = ints
            # Compute confidence interval from the sorted bootstrap column
            boot_samp_sort = np.sort(boot_samp.T[k])
            ci_low = boot_samp_sort[ints[0]]
            ci_high = boot_samp_sort[ints[1]]
            ci_array[k] = ci_low, ci_high
        # return a results instance
        return BCaCIResults(
            **{
                NamesI.INDICES : int_array,
                NamesI.VALUES : ci_array,
                NamesI.COVERAGE : getattr(self, NamesRS.CI_COVERAGE),
                NamesRS.BOOT_SAMPLE : getattr(self, NamesRS.BOOT_SAMPLE),
                NamesRS.JACK_SAMPLE : getattr(self, NamesRS.JACK_SAMPLE),
                NamesRS.BCA_ACCELERATION :
                    getattr(self, NamesRS.BCA_ACCELERATION),
                NamesRS.BCA_BIAS : getattr(self, NamesRS.BCA_BIAS),
            }
        )

    # /////////////////////////////////////////////////////////////////////
    @staticmethod
    def _compute_acceleration(jack_samp:NDArray[Any]) -> list[float]:
        """
        Returns the acceleration constant a for a BCa confidence interval.

        Parameters
        ----------
        jack_samp : `np.ndarray`
            An array of jack knife estimates.

        Return
        ------
        accel : `list` [`float`]
            Acceleration values for each column of `jack_samp`.

        Notes
        -----
        Uses the standard jackknife estimate
        ``a = sum(d**3) / (6 * sum(d**2)**1.5)`` with
        ``d = mean(jack) - jack``.  The previous implementation summed the
        deviations *before* raising them to a power; since deviations
        around the mean sum to (numerically) zero, that collapsed both the
        numerator and the denominator to ~0.
        """
        # #### make sure input is at least a 2d array
        if jack_samp.ndim == 1:
            jack_samp = jack_samp.reshape(jack_samp.shape[0], 1)
        # #### column means
        means = np.mean(jack_samp, axis=0)
        if not isinstance(means, np.ndarray):
            means = np.array([means], dtype=np.float64)
        # make sure means only contains 1 dim
        if means.ndim != 1:
            if not (means.shape[0] == 1 or means.shape[1] == 1):
                raise IndexError(
                    f"`means` shape should contain at least a 1, not "
                    f"{means.shape}."
                )
        # #### loop over columns
        accel = []
        for k, jc in enumerate(jack_samp.T):
            # deviations of the jackknife estimates from their mean
            d = means[k] - jc
            # NOTE if every jackknife estimate is identical the denominator
            # is zero and `a` becomes nan — callers should be aware.
            a = (1/6) * np.divide(
                np.sum(d**3),
                np.sum(d**2)**(3/2),
            )
            accel.append(a)
        # return list
        return accel

    # /////////////////////////////////////////////////////////////////////
    @staticmethod
    def _compute_bias(data:tuple[NDArray[Any]],
                      boot_samp:NDArray[Any],
                      statsfunction:Callable,
                      **kwargs:Optional[Any],
                      ) -> list[float]:
        """
        Computes z0, the bias correction constant, using the original
        estimates and the bootstrapped estimates.

        Parameters
        ----------
        data : `tuple` [`np.ndarray`]
            A tuple of numpy arrays with the same number of rows.
        boot_samp : `np.ndarray`
            An array of bootstrap samples, can be either 1d or 2d.
        statsfunction : `Callable`
            A function which can unpack a tuple of arrays and perform
            analyses on these: `statsfunction(*data)`.
        **kwargs : `any`
            Any keyword arguments supplied to `statsfunction`.

        Returns
        -------
        z0 : `list` [`float`]
            Values of `z0` per column of `boot_samp`.
        """
        # #### apply the statsfunction on the original (non-resampled) data
        s = statsfunction(*data, **kwargs)
        # make sure it is a list
        if not isinstance(s, list):
            s = [s]
        # #### get z0 per column
        boot_n = boot_samp.shape[0]
        z0 = []
        for b, boot_b in enumerate(boot_samp.T):
            # z0 is zero when exactly half of the bootstrap sample is less
            # than the original estimate
            z0.append(st.norm.ppf(np.sum(boot_b < s[b]) / boot_n))
        # return
        return z0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BaseBootstrap(object):
    """
    Base class for generating bootstrap estimates.

    Parameters
    ----------
    data : `tuple` [`np.ndarray`, ...]
        A tuple containing one or more numpy arrays representing the input
        data to be used for bootstrapping.
    verbose : `bool`, default `True`
        Whether warnings should be returned.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.

    Methods
    -------
    bootstrap_samples(n_estimates, statsfunction, n_reps, **kwargs)
        Generate bootstrap replicates and return the resulting statistics.
    original_estimate(statsfunction, **kwargs)
        Compute the statistic(s) on the original (non-resampled) data.

    Notes
    -----
    This class handles the generation of bootstrap replicates and the
    calculation of original estimates from the provided dataset using a
    specified statistical function.
    """
    # /////////////////////////////////////////////////////////////////////
    def __repr__(self) -> str:
        """ String representation of the class instance. """
        return (f"{type(self).__name__}(data=tuple of "
                f"{len(self._data)} numpy arrays)")

    # /////////////////////////////////////////////////////////////////////
    def __str__(self) -> str:
        """ String representation of the class instance. """
        return (f"{type(self).__name__} with {len(self._data)} data "
                "arrays.")

    # /////////////////////////////////////////////////////////////////////
    def __init__(self, data: tuple[NDArray[Any], ...],
                 verbose:bool=True,
                 ):
        """Initialise the BaseBootstrapCI instance. """
        is_type(data, tuple)
        is_type(verbose, bool)
        self._data = data
        self._verbose = verbose

    # /////////////////////////////////////////////////////////////////////
    def bootstrap_samples(self, n_estimates: int,
                          statsfunction: Callable,
                          n_reps: int = 999,
                          verbose:bool=True,
                          **kwargs: Optional[Any],
                          ) -> NDArray[Any]:
        """
        Generates a bootstrap sample, applies a function, and removes
        missing values.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            bootstrap sample.
        statsfunction : `Callable`
            The statistical function to apply to each bootstrap sample.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        np.ndarray
            An array of bootstrap estimates, with any samples containing
            missing values removed.
        """
        # validate input
        is_type(n_estimates, int)
        is_type(n_reps, (int, float))
        if not callable(statsfunction):
            raise TypeError(ERR3.format(NamesRS.STATSFUNC))
        # delegate the resampling itself to `bootstrap_replicates`
        replicates = bootstrap_replicates(
            self._data, statsfunction, n_estimates, n_reps, **kwargs,
        )
        # drop any replicate row containing a `nan`, warning if verbose
        mask_nan = np.isnan(replicates).any(axis=1)
        if mask_nan.any():
            replicates = replicates[~mask_nan]
            if self._verbose:
                warnings.warn(MSG_NAN.format(
                    mask_nan.sum(),
                    'the bootstraped esimates [boot_samp]'))
        return replicates

    # /////////////////////////////////////////////////////////////////////
    def original_estimate(self, statsfunction: Callable,
                          **kwargs:Optional[Any],
                          ) -> np.ndarray:
        """
        Calculate the estimate based on the non-resampled data.

        Parameters
        ----------
        statsfunction : `Callable`
            The statistical function which will be applied to `data`.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        np.ndarray
            An array containing the original statistic(s).
        """
        # apply the statistic to the original data
        estimate = statsfunction(*self._data, **kwargs)
        # already an array: hand it back unchanged
        if isinstance(estimate, np.ndarray):
            return estimate
        # wrap a bare scalar so np.array produces a 1d array
        if not isinstance(estimate, list):
            estimate = [estimate]
        return np.array(estimate)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class PercentileCI(BaseBootstrap):
    """
    Percentile bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.
    """
    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05,
                 n_reps: int = 999,
                 **kwargs: Optional[Any]) -> BootCIResults:
        """
        Calculate the percentile bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results class containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # validate alpha
        is_type(alpha, float)
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        # draw the bootstrap replicates (missing rows already removed)
        replicates = self.bootstrap_samples(
            n_estimates, statsfunction, n_reps, **kwargs
        )
        # the percentile bounds of the interval
        pct_bounds = [(alpha/2) * 100, (1 - alpha/2) * 100]
        ind_array = np.atleast_2d(np.array(pct_bounds))
        # NOTE np.percentile interpolates when the requested percentiles do
        # not exactly match elements in the bootstrap sample
        ci_array = np.percentile(replicates, pct_bounds,
                                 axis=0, method='linear').T
        # package everything into a results instance
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: replicates,
            }
        )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class BasicCI(BaseBootstrap):
    """
    Basic bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : `tuple` [`np.ndarray`, ...]
        The input data arrays from which bootstrap samples are drawn.

    Notes
    -----
    The basic bootstrap confidence interval is calculated by reflecting
    the standard percentile intervals around the original estimate.
    """
    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05,
                 n_reps: int = 999,
                 **kwargs: Optional[Any]) -> BootCIResults:
        """
        Calculate the bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        **kwargs : `dict` [`str`, `any`], optional
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results class containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # validate alpha
        is_type(alpha, float)
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        # draw the bootstrap replicates (missing rows already removed)
        replicates = self.bootstrap_samples(
            n_estimates, statsfunction, n_reps, **kwargs
        )
        # the original (non-resampled) estimate to reflect around
        point = self.original_estimate(statsfunction, **kwargs)
        # NOTE the percentile order is deliberately inverted: reflecting
        # the bounds around `point` swaps lower and upper
        pct_hi = (1 - alpha / 2) * 100
        pct_lo = (alpha / 2) * 100
        ind_array = np.atleast_2d(np.array([pct_hi, pct_lo]))
        # NOTE np.percentile will interpolate if needed
        q_hi, q_lo = np.percentile(
            replicates, [pct_hi, pct_lo], axis=0, method='linear'
        )
        # the basic (reflected) confidence interval
        ci_array = np.stack([
            2 * point - q_hi,
            2 * point - q_lo,
        ], axis=1)
        # package everything into a results instance
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: replicates,
            }
        )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class StudentisedCI(BaseBootstrap):
    """
    Studentised bootstrap confidence interval estimator.

    Attributes
    ----------
    _data : tuple of NDArray
        The input data arrays from which bootstrap samples are drawn.

    Notes
    -----
    This class performs a nested bootstrap to estimate the standard error
    of the bootstrapped statistic(s).
    """
    # /////////////////////////////////////////////////////////////////////
    def __call__(self, n_estimates: int, statsfunction: Callable,
                 alpha: float = 0.05,
                 n_reps: int = 999,
                 n_nested: int = 100,
                 **kwargs: Optional[Any]) -> BootCIResults:
        """
        Calculate the studentised bootstrap confidence interval.

        Parameters
        ----------
        n_estimates : `int`
            The number of estimates (statistics) to compute for each
            sample.
        statsfunction : `Callable`
            The statistical function applied to the bootstrap samples.
        alpha : `float`, default 0.05
            The significance level for the confidence interval.
        n_reps : `int`, default 999
            The number of bootstrap repetitions.
        n_nested : `int`, default 100
            The number of inner bootstrap repetitions for standard error
            estimation.
        **kwargs : `dict`, [`str`, `any`]
            Additional keyword arguments passed to `statsfunction`.

        Returns
        -------
        BootCIResults
            A results class containing the confidence interval bounds,
            the bootstrap samples, and metadata.
        """
        # ### check input
        is_type(alpha, float)
        # is alpha between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        is_type(n_reps, (int, float))
        is_type(n_nested, (int, float))
        # ### pre-allocate memory for the outer estimates and their SEs
        boot_point = np.full((n_reps, n_estimates), np.nan)
        boot_se = np.full((n_reps, n_estimates), np.nan)
        for i in range(n_reps):
            boot_samp = _draw_bs_sample(self._data)
            # the outer estimate for this bootstrap sample
            outer_est = statsfunction(*boot_samp, **kwargs, )
            outer_est = self._map_sequence(outer_est, array = True)
            boot_point[i] = outer_est
            # the inner (nested) bootstrap used for the standard error
            inner_point = np.full((n_nested, n_estimates), np.nan)
            for j in range(n_nested):
                inner_samp = _draw_bs_sample(boot_samp)
                inner_est = statsfunction(*inner_samp, **kwargs)
                # make sure it is a (mutable) sequence before row-assign
                inner_est = self._map_sequence(inner_est, array = False)
                inner_point[j] = inner_est
            # sample sd of the inner estimates, ignoring missing rows
            inner_point, _ = self._filter_missing(inner_point)
            if np.isnan(inner_point).all():
                if self._verbose:
                    warnings.warn("Warning: inner_point is entirely NaN")
                se = np.nan
            else:
                se = np.std(inner_point, axis=0, ddof=1)
            boot_se[i] = se
        # point estimate on the non-resampled data
        orig_mean = self.original_estimate(statsfunction, **kwargs)
        # defensive: `original_estimate` returns an array for the base
        # implementation, but a subclass override might not
        if not isinstance(orig_mean, np.ndarray):
            orig_mean = np.array(orig_mean)
        # removing possible missing data; keep point and SE rows aligned
        boot_point, keep_point = self._filter_missing(boot_point)
        boot_se, keep_se = self._filter_missing(boot_se[keep_point])
        boot_point = boot_point[keep_se]
        # standard error of the outer boot
        if np.isnan(boot_point).all():
            if self._verbose:
                warnings.warn("Warning: boot_point is entirely NaN")
            boot_point_se = np.nan
        else:
            boot_point_se = np.std(boot_point, axis=0, ddof=1)
        # studentised t-statistics
        t_stats = (boot_point - orig_mean) / boot_se
        # the t-statistic based interval positions; note the high quantile
        # of t maps to the LOWER confidence bound (and vice versa)
        percentiles = np.percentile(
            t_stats,
            [100 * (1 - alpha/2), 100 * (alpha/2)],
            method="linear",
            axis=0,
        )
        lower_t, upper_t = percentiles
        ind_array = np.atleast_2d(np.array([lower_t, upper_t]))
        # finally get the confidence interval
        ci_array = np.stack([
            orig_mean - lower_t * boot_point_se,   # lower bound
            orig_mean - upper_t * boot_point_se,   # upper bound
        ], axis=1)
        # return
        return BootCIResults(
            **{
                CLASS_NAME :type(self).__name__,
                NamesI.INDICES: ind_array,
                NamesI.VALUES: ci_array,
                NamesI.COVERAGE: 1 - alpha,
                NamesRS.BOOT_SAMPLE: boot_point,
            }
        )

    # /////////////////////////////////////////////////////////////////////
    def _map_sequence(self, obj:Any, array:bool=False) -> list | np.ndarray:
        """
        Maps an object to a mutable sequence.

        Parameters
        ----------
        obj : `any`
            A scalar, list, or numpy array.
        array : `bool`, default False
            If True the result is additionally converted to a numpy array.

        Returns
        -------
        `list` or `np.ndarray`
            `obj` wrapped in a list if it was a bare scalar, optionally
            converted to an array.
        """
        # wrap bare scalars so the result supports row assignment
        if not isinstance(obj, list) and not isinstance(obj, np.ndarray):
            obj = [obj]
        # optionally promote to a numpy array
        if array and not isinstance(obj, np.ndarray):
            obj = np.array(obj)
        return obj

    # /////////////////////////////////////////////////////////////////////
    def _filter_missing(self, array: np.ndarray,
                        ) -> tuple[np.ndarray, np.ndarray]:
        """
        Remove rows containing NaNs from the array and return a boolean
        mask indicating non-missing rows.

        Parameters
        ----------
        array : np.ndarray
            The array to check for missing values (NaNs).

        Returns
        -------
        filtered_array : np.ndarray
            The input array with rows containing NaNs removed.
        non_missing_mask : np.ndarray
            Boolean array where True indicates rows without any NaNs.
        """
        missings = np.isnan(array).any(axis=1)
        non_missing = ~missings
        if missings.any():
            if self._verbose:
                warnings.warn('The bootstrap sample contained {} `nan` values '
                              'which have been removed.'.format(sum(missings)))
        return array[non_missing], non_missing
# # Toy example # from stats_misc.machine_learning.validation import ( # cstatistic, # ) # # create a function which returns a single float # def cstat_boot(*data): # ''' # Function to use for bootstrapping # ''' # cstat = cstatistic(data[0], data[1]) # point = cstat.c_statistic # if np.isnan(point): # return np.nan # return point # # example data # x = np.array([0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0]) # y = np.array([0.1, 0.7, 0.3, 0.6, 0.9, 0.1, 0.2, 0.3, 0.4, 0.01, 0.9, 0.8, 0.15, 0.71, 0.8]) # data = (x, y) # n_reps=999 # n_estimates = 1 # statsfunction = cstat_boot # cstat_boot(*(x,y)) # res1 = PercentileCI(data) # res2 = res1(1, cstat_boot)