Source code for stats_misc.utils.general

"""
Functions used in multiple stat-miscs modules which may be of general utility.
"""

import warnings
import numpy as np
from typing import Any, Literal
from stats_misc.constants import (
    NamesUtilsGeneral as NamesGen,
    Error_MSG,
)
from stats_misc.errors import (
    is_type,
)
from scipy.stats import (
    norm,
    t,
    f,
    ncx2,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def invlogit(l: np.ndarray | float | int) -> np.ndarray: """ The inverse of the logit function. Parameters ---------- l : `np.ndarray`, `float`, or `int` Log-odds value(s) to transform. Returns ------- np.ndarray Probability value(s) in (0, 1). """ is_type(l, (np.ndarray, float, int)) # inverse logit p = 1/(1+np.exp(-l)) return p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def logit( p: np.ndarray | float | int, add: float = 0.0001, sub: float = 0.0001, ) -> np.ndarray: """ The logit function. Parameters ---------- p : `np.ndarray`, `float`, or `int` Probability value(s); must be in [0, 1]. add : `float`, default 0.0001 Constant added to p when p equals 0. sub : `float`, default 0.0001 Constant subtracted from p when p equals 1. Returns ------- np.ndarray Log-odds of p. Warns ----- UserWarning When any value of p is exactly 0 or 1. Raises ------ ValueError When any value of p is outside [0, 1]. """ is_type(p, (np.ndarray, float, int)) if not isinstance(p, np.ndarray): p=np.array([p]) if not ( all(i <= 1 for i in p) and all(i >= 0 for i in p)): raise ValueError("Error: p should be between 0 and 1") if not ( all(i < 1 for i in p) and all(i > 0 for i in p)): warnings.warn( "Some values of p are 1 or 0. Adding or subtracting a small " "value, based on `add` and `sub`. Values of 1 or 0 may indicate " "the model is invalid for some subjects.", UserWarning, stacklevel=2, ) p[p == 0] = p[p == 0] + add p[p == 1] = p[p == 1] - sub # logit logit_p = np.log(p/(1-p)) return logit_p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue( z_statistic: float | int, side: Literal['two', 'left', 'right', 'below', 'above'] = 'two', ) -> float: r""" Calculate the P-value based on the Z-statistic and the standard normal distribution. Parameters ---------- z_statistic : `float` Typically the ratio of the point estimate and the standard error, representing the standardized difference from the value under the null-hypothesis. side : {`two`, `left`, `right`, 'below', 'above'}, default `two` `left` will perform a left-sided, one-sided, z-test. `right` will perform a right-sided, one-sided, z-test. `below` is a synonym for `left`, and 'above` is a synonym for `right`. Returns ------- float P-value. Notes ----- Use `left` if your null-hypothesis is .. math:: \mu \geq \mu_0, and the alternative hypothesis is .. math:: \mu < \mu_0. Use `right` if your null-hypothesis is .. math:: \mu \leq \mu_0, and the alternative hypothesis is .. math:: \mu > \mu_0. So for `left` the alternative hypothesis supposes is to the left of null-hypothesis, while for `right` we assume this is to the right. """ # check input is_type(z_statistic, (int, float)) is_type(side, str) SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO, NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE, ] # confirm the choice is correct if not side in SIDES: raise ValueError(Error_MSG.INVALID_STRING.\ format('side', ', '.join(SIDES))) # the left-side p-value pvalue = norm.cdf(z_statistic) # the right-side if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_ABOVE: pvalue = 1-pvalue elif side == NamesGen.SIDE_TWO: # two-sided pvalue = min(pvalue, 1-pvalue)*2 return pvalue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue_tdist( t_statistic: float | int, degrees: int, side: Literal['two', 'left', 'right', 'below', 'above'] = 'two', ) -> float: """ Calculate the P-value based on the t-statistic and the t distribution. Parameters ---------- t_statistic : `float` Typically the ratio of the point estimate and the standard error, representing the standardized difference from the value under the null-hypothesis. degrees : `int` The degrees of freedom. side : {`two`, `left`, `right`, 'below', 'above'}, default `two` `left` will perform a left-sided, one-sided, z-test. `right` will perform a right-sided, one-sided, z-test. `below` is a synonym for `left`, and 'above` is a synonym for `right`. Returns ------- float P-value. """ # check input is_type(t_statistic, (int, float)) is_type(side, str) SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO, NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE, ] # confirm the choice is correct if not side in SIDES: raise ValueError(Error_MSG.INVALID_STRING.\ format('side', ', '.join(SIDES))) # the left-side p-value pvalue = t.cdf(t_statistic, degrees) # the right-side if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_ABOVE: pvalue = 1-pvalue elif side == NamesGen.SIDE_TWO: # two-sided pvalue = min(pvalue, 1-pvalue)*2 return pvalue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue_fdist( f_statistic: float | int, degrees_numerator: int, degrees_denominator: int, ) -> float: """ Calculate the P-value based on the F-statistic and the F distribution. Parameters ---------- f_statistic : `float` The F test-statistic. degrees_numerator : `int` Numerator degrees of freedom. degrees_denominator : `int` Denominator degrees of freedom. Returns ------- float P-value. """ is_type(f_statistic, (int, float)) # the p-value return 1-f.cdf(f_statistic, degrees_numerator, degrees_denominator)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class ObservedPower(object): """ Computes observed power (two-sided, above, below) for a given z-statistic, based on the normal distribution. Parameters ---------- statistic : `float` The z-statistic (e.g., estimate / standard error). alpha : `float`, default 0.05 The type I error rate. **kwargs Optional keyword arguments passed to the distribution. Methods ------- compute_power() Compute two-sided observed power using the configured distribution. compute_power_above() Compute one-sided observed power for the 'above' (right) side. compute_power_below() Compute one-sided observed power for the 'below' (left) side. Notes ----- Observed power is equivalent to the p-value evaluated under the alternative hypothesis instead of the null-hypothesis. For example, if the p-value is 0.05 and one is performing a test against an alpha of 0.05, the observed power is 50% (i.e., if we would repeat the exact same experiment, under the alternative hypothesis, there would be a 50% probability of observing a p-value smaller than 0.05. Because, of the equivalence between the p-value and observed power, it has very limited application. """ # class-level variable _distribution = norm # Default to the normal distribution # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __init__(self, statistic: float, alpha: float = 0.05, **kwargs: Any, ) -> None: """ The class init. """ is_type(statistic, (float, int)) is_type(alpha, (float, int)) # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) self.statistic = float(statistic) self.alpha = float(alpha) self._dist_kwargs = kwargs # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power(self) -> float: """ Compute two-sided observed power using the configured distribution. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ self._distribution.ppf(1 - self.alpha / 2, **self._dist_kwargs) # Sums probabilities in both tails. return ( self._distribution.cdf( -self.statistic - self.crit, **self._dist_kwargs) + self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs) )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_above(self) -> float: """ Compute one-sided observed power for the 'above' (right) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ self._distribution.ppf(1 - self.alpha, **self._dist_kwargs) return self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_below(self) -> float: """ Compute one-sided observed power for the 'below' (left) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ -self._distribution.ppf(1 - self.alpha, **self._dist_kwargs) return self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __str__(self) -> str: """ Returns a human-readable string representation of the instance. """ return (f"ObservedPower with statistic={self.statistic}, " f"alpha={self.alpha}") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __repr__(self) -> str: """ Returns an unambiguous string representation of the instance. """ cname = self.__class__.__name__ return (f"{cname}(statistic={self.statistic}, alpha={self.alpha}, " f"dist_kwargs={self._dist_kwargs})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class ObservedPowerT(ObservedPower): """ Computes observed power (two-sided, above, below) for a given test-statistic, using the t-distribution. Parameters ---------- statistic : `float` The t-statistic (e.g., estimate / standard error). df : `float` The degrees of freedom for the t-distribution. alpha : `float`, default 0.05 The type I error rate. Methods ------- compute_power() Compute two-sided observed power using the configured distribution. compute_power_above() Compute one-sided observed power for the 'above' (right) side. compute_power_below() Compute one-sided observed power for the 'below' (left) side. Notes ----- Inherits from ObservedPower and adds degrees of freedom via keyword arguments. """ # class-level variable _distribution = t # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __init__(self, statistic: float, df: float, alpha: float = 0.05, ) -> None: # Pass the df to the parent class's **kwargs. super().__init__(statistic, alpha, df=df)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class ObservedPowerChiSquare(ObservedPower): """ Computes observed power for a given statistic, using the non-central chi-square distribution. Parameters ---------- statistic : `float` The chi-square statistic (e.g., sum of squares of standardised residuals). df : `float` Degrees of freedom for the chi-square distribution. alpha : `float`, default 0.05 The type I error rate. Notes ----- Inherits from ObservedPower but overrides the distribution to accommodate its strictly positive domain. In a chi-square context, one generally considers whether the observed statistic is 'above' a critical threshold. Hence, `compute_power()` internally calls `compute_power_above()` rather than summing two tails. Because chi-square is not defined below zero, `compute_power_below()` is not meaningful and thus raises a NotImplementedError. """ # class-level variable _distribution = ncx2 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __init__(self, statistic: float, df: float, alpha: float = 0.05) -> None: # Pass df to the parent class's kwargs so it's included in distribution calls super().__init__(statistic, alpha, df=df) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power(self) -> float: """ Compute one-sided observed power for the 'above' (right) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ return self.compute_power_above()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_above(self) -> float: """ Compute observed power for the right tail using the non-central chi-square distribution. Attributes ---------- crit : float The critical value (from the central distribution) beyond which the null hypothesis would be rejected. Returns ------- float The observed power. Notes ----- The procedure: - First, compute the critical value (ppf) under the central chi-square (nc=0). - Then, evaluate the cdf at x = self.crit under the non-central chi-square with nc = self.statistic. - Subtract the density from 1 to calculate the observed power. This approach can be easily verified using the normal distribution >>> from scipy.stats import norm >>> # use a test statistic of 4, and critical value of 1.96 >>> ncnorm = norm(loc=4, scale=1.0) >>> 1-ncnorm.cdf(1.96) 0.9793 >>> norm.cdf(4-1.96) 0.9793 """ df = self._dist_kwargs["df"] # Compute critical value with central chi-square (nc=0) self.crit = self._distribution.ppf(1 - self.alpha, df=df, nc=0) # Now compute power as cdf at x = self.statistic, nc = self.crit # return self._distribution.cdf(self.statistic, df=df, nc=self.crit) return 1-self._distribution.cdf(self.crit, df=df, nc=self.statistic)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_below(self) -> float: """ Raises a NotImplementedError because a chi-square distribution is only defined on [0, ∞), so a 'below' tail does not correspond to a meaningful test. Raises ------ NotImplementedError Because 'below' power is not defined for chi-square. """ raise NotImplementedError( "compute_power_below() is not meaningful for a chi-square " "distribution, which is only defined on [0, ∞)." )