"""
Functions used in multiple stats_misc modules which may be of general utility.
"""
import numpy as np
from typing import Literal
from stats_misc.constants import (
is_type,
NamesUtilsGeneral as NamesGen,
Error_MSG,
)
from scipy.stats import (
norm,
t,
ncx2,
)
from typing import Any, List, Type, Union, Tuple, Optional, Dict
from stats_misc.constants import (
is_type,
Error_MSG,
InputValidationError,
string_join_delimiter,
)
from stats_misc.utils.helpers import(
ManagedProperty,
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def invlogit(l: np.ndarray):
    '''
    The inverse of the logit function, mapping log-odds to probabilities.

    Parameters
    ----------
    l : `np.array`, `float` or `int`
        array of log-odds (logits); any real value is valid.

    Returns
    -------
    np.array
        A np.array of the same length as `l`, with values in (0, 1).
    '''
    is_type(l, (np.ndarray, float, int))
    # inverse logit: p = 1 / (1 + exp(-l))
    p = 1/(1+np.exp(-l))
    return p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE add pytests
def logit(p: np.ndarray, add: float = 0.0001, sub: float = 0.0001):
    '''
    The logit function, mapping probabilities to log-odds.

    Parameters
    ----------
    p : `np.array`
        array of probabilities.
    add : `float`, default 0.0001
        constant to be added if p is equal to 0.
    sub : `float`, default 0.0001
        constant to be subtracted if p is equal to 1.

    Returns
    -------
    np.array
        A np.array of the same length as `p` (scalar input yields a
        length-1 array).

    Raises
    ------
    ValueError
        If any value of `p` lies outside [0, 1].

    Warns
    -----
    UserWarning
        If any value of `p` equals 0 or 1 exactly; these values are
        nudged by `add`/`sub` so the log-odds remain finite.
    '''
    import warnings
    is_type(p, (np.ndarray, float, int))
    if not isinstance(p, np.ndarray):
        p = np.array([p])
    # work on a float copy: this avoids mutating the caller's array in
    # place, and prevents integer arrays from truncating the small
    # add/sub adjustments back to 0/1
    p = p.astype(float)
    if not (all(i <= 1 for i in p) and all(i >= 0 for i in p)):
        raise ValueError("Error: p should be between 0 and 1")
    if not (all(i < 1 for i in p) and all(i > 0 for i in p)):
        # NOTE bug fix: previously this *raised* a Warning, which aborted
        # the function and made the adjustment below unreachable; the
        # docstring clearly intends warn-and-continue.
        warnings.warn("Warning: Some values of p are 1 or 0. Adding or "
                      "subtracting a small value, based on `add` and `sub`. "
                      "Values of 1 or 0 may indicate the model is invalid "
                      "for some subjects.")
        p[p == 0] = p[p == 0] + add
        # NOTE bug fix: subtract `sub` (not `add`) from values equal to 1
        p[p == 1] = p[p == 1] - sub
    # logit
    logit_p = np.log(p/(1-p))
    return logit_p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def calculate_pvalue(z_statistic,
                     side:Literal['two', 'left', 'right', 'below', 'above']='two',
                     ) -> float:
    r"""
    Calculate the P-value based on the Z-statistic and the standard normal
    distribution.

    Parameters
    ----------
    z_statistic : `float`
        Typically the ratio of the point estimate and the standard error,
        representing the standardized difference from the value under the
        null-hypothesis.
    side : {`two`, `left`, `right`, `below`, `above`}, default `two`
        `left` will perform a left-sided, one-sided, z-test. `right` will
        perform a right-sided, one-sided, z-test. `below` is a synonym for
        `left`, and `above` is a synonym for `right`.

    Returns
    -------
    float
        P-value.

    Notes
    -----
    Use `left` if your null-hypothesis is

    .. math:: \mu \geq \mu_0,

    and the alternative hypothesis is

    .. math:: \mu < \mu_0.

    Use `right` if your null-hypothesis is

    .. math:: \mu \leq \mu_0,

    and the alternative hypothesis is

    .. math:: \mu > \mu_0.

    So for `left` the alternative hypothesis supposes is to the left of
    null-hypothesis, while for `right` we assume this is to the right.
    """
    # check input
    is_type(side, str)
    SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO,
             NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE,
             ]
    # confirm the choice is correct
    if not side in SIDES:
        raise ValueError(Error_MSG.INVALID_STRING.\
                         format('side', ', '.join(SIDES)))
    # the left-side p-value
    pvalue = norm.cdf(z_statistic)
    # the right-side tail. NOTE bug fix: `above` is the synonym of `right`
    # (previously `below` was treated as right-sided, inverting both
    # synonyms relative to the documented behaviour).
    if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_ABOVE:
        pvalue = 1-pvalue
    elif side == NamesGen.SIDE_TWO:
        # two-sided: double the smaller of the two tail probabilities
        pvalue = min(pvalue, 1-pvalue)*2
    return pvalue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def calculate_pvalue_tdist(t_statistic, degrees,
                           side:Literal['two', 'left', 'right', 'below', 'above']='two',
                           ):
    """
    Calculate the P-value based on the t-statistic and the t distribution.

    By default the function performs a two-sided test.

    Parameters
    ----------
    t_statistic : `float`
        Typically the ratio of the point estimate and the standard error,
        representing the standardized difference from the value under the
        null-hypothesis.
    degrees : `int`
        The degrees of freedom.
    side : {`two`, `left`, `right`, `below`, `above`}, default `two`
        `left` will perform a left-sided, one-sided, t-test. `right` will
        perform a right-sided, one-sided, t-test. `below` is a synonym for
        `left`, and `above` is a synonym for `right`.

    Returns
    -------
    float
        P-value.
    """
    # check input
    is_type(side, str)
    SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO,
             NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE,
             ]
    # confirm the choice is correct
    if not side in SIDES:
        raise ValueError(Error_MSG.INVALID_STRING.\
                         format('side', ', '.join(SIDES)))
    # the left-side p-value
    pvalue = t.cdf(t_statistic, degrees)
    # the right-side tail. NOTE bug fix: `above` is the synonym of `right`
    # (previously `below` was treated as right-sided, inverting both
    # synonyms relative to the documented behaviour).
    if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_ABOVE:
        pvalue = 1-pvalue
    elif side == NamesGen.SIDE_TWO:
        # two-sided: double the smaller of the two tail probabilities
        pvalue = min(pvalue, 1-pvalue)*2
    return pvalue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPower(object):
    """
    Computes observed power (two-sided, above, below) for a given z-statistic,
    based on the normal distribution.

    Parameters
    ----------
    statistic : `float`
        The z-statistic (e.g., estimate / standard error).
    alpha : `float`, default 0.05
        The type I error rate.
    **kwargs
        Optional keyword arguments passed to the distribution.

    Methods
    -------
    compute_power()
        Compute two-sided observed power using the configured distribution.
    compute_power_above()
        Compute one-sided observed power for the 'above' (right) side.
    compute_power_below()
        Compute one-sided observed power for the 'below' (left) side.

    Notes
    -----
    Observed power is equivalent to the p-value evaluated under the
    alternative hypothesis instead of the null-hypothesis. For example, if the
    p-value is 0.05 and one is performing a test against an alpha of 0.05, the
    observed power is 50% (i.e., if we would repeat the exact same experiment,
    under the alternative hypothesis, there would be a 50% probability of
    observing a p-value smaller than 0.05). Because of the equivalence between
    the p-value and observed power, it has very limited application.
    """
    # class-level variable: the scipy distribution used by all power methods
    _distribution = norm  # Default to the normal distribution
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, alpha: float = 0.05, **kwargs: Any,
                 ) -> None:
        """
        The class init.

        Raises
        ------
        ValueError
            If `alpha` is not strictly between 0 and 1.
        """
        is_type(statistic, (float, int))
        is_type(alpha, (float, int))
        # is alpha between 0 and 1
        if alpha <= 0 or alpha >= 1:
            raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1))
        self.statistic = float(statistic)
        self.alpha = float(alpha)
        self._dist_kwargs = kwargs
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power(self) -> float:
        """
        Compute two-sided observed power using the configured distribution.

        Attributes
        ----------
        crit : `float`
            The critical value beyond which the null-hypothesis would be
            rejected.

        Returns
        -------
        float
            The observed power.
        """
        self.crit =\
            self._distribution.ppf(1 - self.alpha / 2, **self._dist_kwargs)
        # Sum the rejection probabilities in both tails, with the
        # distribution re-centred at the observed statistic.
        return (
            self._distribution.cdf(
                -self.statistic - self.crit, **self._dist_kwargs)
            + self._distribution.cdf(
                self.statistic - self.crit, **self._dist_kwargs)
        )
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_above(self) -> float:
        """
        Compute one-sided observed power for the 'above' (right) side.

        Attributes
        ----------
        crit : `float`
            The critical value beyond which the null-hypothesis would be
            rejected.

        Returns
        -------
        float
            The observed power.
        """
        self.crit =\
            self._distribution.ppf(1 - self.alpha, **self._dist_kwargs)
        # P(X > crit) with the distribution centred at `statistic`, i.e.
        # 1 - cdf(crit - statistic) == cdf(statistic - crit) for the
        # symmetric distributions used here.
        return self._distribution.cdf(
            self.statistic - self.crit, **self._dist_kwargs)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_below(self) -> float:
        """
        Compute one-sided observed power for the 'below' (left) side.

        Attributes
        ----------
        crit : `float`
            The critical value beyond which the null-hypothesis would be
            rejected.

        Returns
        -------
        float
            The observed power.
        """
        self.crit =\
            -self._distribution.ppf(1 - self.alpha, **self._dist_kwargs)
        # NOTE bug fix: the power is P(X < crit) with the distribution
        # centred at `statistic`, i.e. cdf(crit - statistic). The previous
        # cdf(statistic - crit) had the sign flipped and returned
        # 1 - power for symmetric distributions (e.g. 0.95 instead of
        # alpha = 0.05 at statistic == 0).
        return self._distribution.cdf(
            self.crit - self.statistic, **self._dist_kwargs)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __str__(self) -> str:
        """
        Returns a human-readable string representation of the instance.
        """
        return (f"ObservedPower with statistic={self.statistic}, "
                f"alpha={self.alpha}")
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __repr__(self) -> str:
        """
        Returns an unambiguous string representation of the instance.
        """
        cname = self.__class__.__name__
        return (f"{cname}(statistic={self.statistic}, alpha={self.alpha}, "
                f"dist_kwargs={self._dist_kwargs})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPowerT(ObservedPower):
    """
    Computes observed power (two-sided, above, below) for a given
    test-statistic, using the t-distribution.

    Parameters
    ----------
    statistic : `float`
        The t-statistic (e.g., estimate / standard error).
    df : `float`
        The degrees of freedom for the t-distribution.
    alpha : `float`, default 0.05
        The type I error rate.

    Methods
    -------
    compute_power()
        Compute two-sided observed power using the configured distribution.
    compute_power_above()
        Compute one-sided observed power for the 'above' (right) side.
    compute_power_below()
        Compute one-sided observed power for the 'below' (left) side.

    Notes
    -----
    Inherits from ObservedPower and adds degrees of freedom via keyword
    arguments.
    """
    # class-level variable: swap the parent's normal for the t-distribution
    _distribution = t
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, df: float, alpha: float = 0.05,
                 ) -> None:
        """
        The class init; forwards `df` through the parent's **kwargs so every
        distribution call (ppf/cdf) includes the degrees of freedom.
        """
        # Pass the df to the parent class's **kwargs.
        super().__init__(statistic, alpha, df=df)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPowerChiSquare(ObservedPower):
    """
    Computes observed power for a given statistic, using the non-central
    chi-square distribution.

    Parameters
    ----------
    statistic : `float`
        The chi-square statistic (e.g., sum of squares of standardised
        residuals).
    df : `float`
        Degrees of freedom for the chi-square distribution.
    alpha : `float`, default 0.05
        The type I error rate.

    Notes
    -----
    Inherits from ObservedPower but overrides the distribution
    to accommodate its strictly positive domain.

    In a chi-square context, one generally considers whether the observed
    statistic is 'above' a critical threshold. Hence, `compute_power()`
    internally calls `compute_power_above()` rather than summing two tails.
    Because chi-square is not defined below zero, `compute_power_below()`
    is not meaningful and thus raises a NotImplementedError.
    """
    # class-level variable: non-central chi-square distribution
    _distribution = ncx2
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, df: float, alpha: float = 0.05) -> None:
        """
        The class init; forwards `df` through the parent's **kwargs so every
        distribution call (ppf/cdf) includes the degrees of freedom.
        """
        # Pass df to the parent class's kwargs so it's included in
        # distribution calls.
        super().__init__(statistic, alpha, df=df)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power(self) -> float:
        """
        Compute one-sided observed power for the 'above' (right) side.

        Attributes
        ----------
        crit : `float`
            The critical value beyond which the null-hypothesis would be
            rejected.

        Returns
        -------
        float
            The observed power.
        """
        # chi-square tests are one-sided by construction
        return self.compute_power_above()
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_above(self) -> float:
        """
        Compute observed power for the right tail using the non-central
        chi-square distribution.

        Attributes
        ----------
        crit : float
            The critical value (from the central distribution) beyond which
            the null hypothesis would be rejected.

        Returns
        -------
        float
            The observed power.

        Notes
        -----
        The procedure:

        - First, compute the critical value (ppf) under the central chi-square
          (nc=0).
        - Then, evaluate the cdf at x = self.crit under the non-central
          chi-square with nc = self.statistic.
        - Subtract the cdf value from 1 to calculate the observed power.

        This approach can be easily verified using the normal distribution

        >>> from scipy.stats import norm
        >>> # use a test statistic of 4, and critical value of 1.96
        >>> ncnorm = norm(loc=4, scale=1.0)
        >>> round(1-ncnorm.cdf(1.96), 4)
        0.9793
        >>> round(norm.cdf(4-1.96), 4)
        0.9793
        """
        df = self._dist_kwargs["df"]
        # Compute critical value with central chi-square (nc=0)
        self.crit = self._distribution.ppf(1 - self.alpha, df=df, nc=0)
        # Observed power: right-tail mass of the non-central chi-square
        # (nc = observed statistic) beyond the central critical value.
        return 1-self._distribution.cdf(self.crit, df=df, nc=self.statistic)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_below(self) -> float:
        """
        Raises a NotImplementedError because a chi-square distribution
        is only defined on [0, ∞), so a 'below' tail does not correspond
        to a meaningful test.

        Raises
        ------
        NotImplementedError
            Because 'below' power is not defined for chi-square.
        """
        raise NotImplementedError(
            "compute_power_below() is not meaningful for a chi-square "
            "distribution, which is only defined on [0, ∞)."
        )