# Source code for stats_misc.utils.general

"""
Functions used in multiple stat-miscs modules which may be of general utility.
"""

import warnings
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union

import numpy as np
from scipy.stats import (
    norm,
    t,
    ncx2,
)

from stats_misc.constants import (
    is_type,
    NamesUtilsGeneral as NamesGen,
    Error_MSG,
    InputValidationError,
    string_join_delimiter,
)
from stats_misc.utils.helpers import (
    ManagedProperty,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def invlogit(l:np.ndarray): ''' The inverse of the logit function. Parameters ---------- l : `np.array`, `float` or `int` array of probabilities. Returns ------- np.array A np.array of length l ''' is_type(l, (np.ndarray, float, int)) # inverse logit p = 1/(1+np.exp(-l)) return p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # NOTE add pytests
[docs] def logit(p:np.ndarray, add:float=0.0001, sub:float=0.0001): ''' The logit function. Parameters ---------- p : `np.array` array of probabilities. add : `float`, default 0.0001 constant to be added if p is equal to 0. sub : `float`, default 0.0001 constant to be subtracted if p is equal to 1. Returns ------- np.array A np.array of length p ''' is_type(p, (np.ndarray, float, int)) if not isinstance(p, np.ndarray): p=np.array([p]) if not ( all(i <= 1 for i in p) and all(i >= 0 for i in p)): raise ValueError("Error: p should be between 0 and 1") if not ( all(i < 1 for i in p) and all(i > 0 for i in p)): raise Warning("Warning: Some values of p are 1 or 0. Adding or " "subtracting a small value, based on `add` and `sub`. " "Values of 1 or 0 may indicate the model is invalid " "for some subjects.") p[p == 0] = p[p == 0] + add p[p == 1] = p[p == 1] - add # logit logit_p = np.log(p/(1-p)) return logit_p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue(z_statistic, side:Literal['two', 'left', 'right']='two', ) -> float: r""" Calculate the P-value based on the Z-statistic and the standard normal distribution. Parameters ---------- z_statistic : `float` Typically the ratio of the point estimate and the standard error, representing the standardized difference from the value under the null-hypothesis. side : {`two`, `left`, `right`, 'below', 'above'}, default `two` `left` will perform a left-sided, one-sided, z-test. `right` will perform a right-sided, one-sided, z-test. `below` is a synonym for `left`, and 'above` is a synonym for `right`. Returns ------- float P-value. Notes ----- Use `left` if your null-hypothesis is .. math:: \mu \geq \mu_0, and the alternative hypothesis is .. math:: \mu < \mu_0. Use `right` if your null-hypothesis is .. math:: \mu \leq \mu_0, and the alternative hypothesis is .. math:: \mu > \mu_0. So for `left` the alternative hypothesis supposes is to the left of null-hypothesis, while for `right` we assume this is to the right. """ # check input is_type(side, str) SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO, NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE, ] # confirm the choice is correct if not side in SIDES: raise ValueError(Error_MSG.INVALID_STRING.\ format('side', ', '.join(SIDES))) # the left-side p-value pvalue = norm.cdf(z_statistic) # the right-side if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_BELOW: pvalue = 1-pvalue elif side == NamesGen.SIDE_TWO: # two-sided pvalue = min(pvalue, 1-pvalue)*2 return pvalue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue_tdist(t_statistic, degrees, side:Literal['two', 'left', 'right']='two', ): """ Calculate the P-value based on the t-statistic and the t distribution. The function assumes one want to perform a two-sided test. Parameters ---------- t_statistic : `float` Typically the ratio of the point estimate and the standard error, representing the standardized difference from the value under the null-hypothesis. degrees : `int` The degrees of freedom. side : {`two`, `left`, `right`, 'below', 'above'}, default `two` `left` will perform a left-sided, one-sided, z-test. `right` will perform a right-sided, one-sided, z-test. `below` is a synonym for `left`, and 'above` is a synonym for `right`. Returns ------- float P-value. """ # check input is_type(side, str) SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO, NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE, ] # confirm the choice is correct if not side in SIDES: raise ValueError(Error_MSG.INVALID_STRING.\ format('side', ', '.join(SIDES))) # the left-side p-value pvalue = t.cdf(t_statistic, degrees) # the right-side if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_BELOW: pvalue = 1-pvalue elif side == NamesGen.SIDE_TWO: # two-sided pvalue = min(pvalue, 1-pvalue)*2 return(pvalue)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class ObservedPower(object): """ Computes observed power (two-sided, above, below) for a given z-statistic, based on the normal distribution. Parameters ---------- statistic : `float` The z-statistic (e.g., estimate / standard error). alpha : `float`, default 0.05 The type I error rate. **kwargs Optional keyword arguments passed to the distribution. Methods ------- compute_power() Compute two-sided observed power using the configured distribution. compute_power_above() Compute one-sided observed power for the 'above' (right) side. compute_power_below() Compute one-sided observed power for the 'below' (left) side. Notes ----- Observed power is equivalent to the p-value evaluated under the alternative hypothesis instead of the null-hypothesis. For example, if the p-value is 0.05 and one is performing a test against an alpha of 0.05, the observed power is 50% (i.e., if we would repeat the exact same experiment, under the alternative hypothesis, there would be a 50% probability of observing a p-value smaller than 0.05. Because, of the equivalence between the p-value and observed power, it has very limited application. """ # class-level variable _distribution = norm # Default to the normal distribution # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __init__(self, statistic: float, alpha: float = 0.05, **kwargs: Any, ) -> None: """ The class init. """ is_type(statistic, (float, int)) is_type(alpha, (float, int)) # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) self.statistic = float(statistic) self.alpha = float(alpha) self._dist_kwargs = kwargs # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power(self) -> float: """ Compute two-sided observed power using the configured distribution. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ self._distribution.ppf(1 - self.alpha / 2, **self._dist_kwargs) # Sums probabilities in both tails. return ( self._distribution.cdf( -self.statistic - self.crit, **self._dist_kwargs) + self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs) )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_above(self) -> float: """ Compute one-sided observed power for the 'above' (right) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ self._distribution.ppf(1 - self.alpha, **self._dist_kwargs) return self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_below(self) -> float: """ Compute one-sided observed power for the 'below' (left) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ -self._distribution.ppf(1 - self.alpha, **self._dist_kwargs) return self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __str__(self) -> str: """ Returns a human-readable string representation of the instance. """ return (f"ObservedPower with statistic={self.statistic}, " f"alpha={self.alpha}") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __repr__(self) -> str: """ Returns an unambiguous string representation of the instance. """ cname = self.__class__.__name__ return (f"{cname}(statistic={self.statistic}, alpha={self.alpha}, " f"dist_kwargs={self._dist_kwargs})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPowerT(ObservedPower):
    """
    Observed power (two-sided, above, below) for a test-statistic,
    evaluated under the t-distribution.

    Parameters
    ----------
    statistic : `float`
        The t-statistic (e.g., estimate / standard error).
    df : `float`
        The degrees of freedom for the t-distribution.
    alpha : `float`, default 0.05
        The type I error rate.

    Methods
    -------
    compute_power()
        Compute two-sided observed power using the configured distribution.
    compute_power_above()
        Compute one-sided observed power for the 'above' (right) side.
    compute_power_below()
        Compute one-sided observed power for the 'below' (left) side.

    Notes
    -----
    All power computations are inherited from ``ObservedPower``; this
    subclass only swaps the distribution for `t` and forwards the degrees
    of freedom through the parent's keyword arguments.
    """

    # swap the parent's normal distribution for the t-distribution
    _distribution = t

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, df: float,
                 alpha: float = 0.05) -> None:
        """
        The class init; forwards `df` to the parent via **kwargs so every
        distribution call receives it.
        """
        super().__init__(statistic, alpha, df=df)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPowerChiSquare(ObservedPower):
    """
    Computes observed power for a given statistic, using the non-central
    chi-square distribution.

    Parameters
    ----------
    statistic : `float`
        The chi-square statistic (e.g., sum of squares of standardised
        residuals).
    df : `float`
        Degrees of freedom for the chi-square distribution.
    alpha : `float`, default 0.05
        The type I error rate.

    Notes
    -----
    Inherits from ObservedPower but overrides the distribution to
    accommodate its strictly positive domain. In a chi-square context, one
    generally considers whether the observed statistic is 'above' a
    critical threshold. Hence, `compute_power()` internally calls
    `compute_power_above()` rather than summing two tails. Because
    chi-square is not defined below zero, `compute_power_below()` is not
    meaningful and thus raises a NotImplementedError.
    """

    # class-level variable
    _distribution = ncx2

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, df: float,
                 alpha: float = 0.05) -> None:
        """
        The class init.
        """
        # Pass df to the parent class's kwargs so it's available to every
        # distribution call.
        super().__init__(statistic, alpha, df=df)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power(self) -> float:
        """
        Compute observed power; delegates to `compute_power_above()`
        because a chi-square test is inherently one-sided (right tail).

        Attributes
        ----------
        crit : `float`
            The critical value beyond which the null-hypothesis would be
            rejected (set by `compute_power_above()`).

        Returns
        -------
        float
            The observed power.
        """
        return self.compute_power_above()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_above(self) -> float:
        """
        Compute observed power for the right tail using the non-central
        chi-square distribution.

        Attributes
        ----------
        crit : float
            The critical value (from the central distribution) beyond
            which the null hypothesis would be rejected.

        Returns
        -------
        float
            The observed power.

        Notes
        -----
        The procedure:

        - First, compute the critical value (ppf) under the central
          chi-square (nc=0).
        - Then, evaluate the cdf at x = self.crit under the non-central
          chi-square with nc = self.statistic.
        - Subtract the cdf value from 1 to obtain the observed power.

        This approach can be easily verified using the normal distribution

        >>> from scipy.stats import norm
        >>> # use a test statistic of 4, and critical value of 1.96
        >>> ncnorm = norm(loc=4, scale=1.0)
        >>> 1-ncnorm.cdf(1.96)
        0.9793
        >>> norm.cdf(4-1.96)
        0.9793
        """
        df = self._dist_kwargs["df"]
        # Compute critical value with central chi-square (nc=0)
        self.crit = self._distribution.ppf(1 - self.alpha, df=df, nc=0)
        # Right-tail mass of the non-central distribution beyond `crit`;
        # DOC FIX: the original docstring claimed this returns the
        # critical value, but it returns the observed power.
        return 1 - self._distribution.cdf(self.crit, df=df,
                                          nc=self.statistic)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_below(self) -> float:
        """
        Raises a NotImplementedError because a chi-square distribution is
        only defined on [0, ∞), so a 'below' tail does not correspond to a
        meaningful test.

        Raises
        ------
        NotImplementedError
            Because 'below' power is not defined for chi-square.
        """
        raise NotImplementedError(
            "compute_power_below() is not meaningful for a chi-square "
            "distribution, which is only defined on [0, ∞)."
        )