# Source code for stats_misc.utils.general

"""
Functions used in multiple stat-miscs modules which may be of general utility.
"""

import warnings
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union

import numpy as np
from scipy.stats import (
    norm,
    t,
    ncx2,
)

from stats_misc.constants import (
    is_type,
    NamesUtilsGeneral as NamesGen,
    Error_MSG,
    InputValidationError,
    string_join_delimiter,
)
from stats_misc.utils.helpers import (
    ManagedProperty,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def invlogit(l:np.ndarray): ''' The inverse of the logit function. Parameters ---------- l : `np.array`, `float` or `int` array of probabilities. Returns ------- np.array A np.array of length l ''' is_type(l, (np.ndarray, float, int)) # inverse logit p = 1/(1+np.exp(-l)) return p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # NOTE add pytests
[docs] def logit(p:np.ndarray, add:float=0.0001, sub:float=0.0001): ''' The logit function. Parameters ---------- p : `np.array` array of probabilities. add : `float`, default 0.0001 constant to be added if p is equal to 0. sub : `float`, default 0.0001 constant to be subtracted if p is equal to 1. Returns ------- np.array A np.array of length p ''' is_type(p, (np.ndarray, float, int)) if not isinstance(p, np.ndarray): p=np.array([p]) if not ( all(i <= 1 for i in p) and all(i >= 0 for i in p)): raise ValueError("Error: p should be between 0 and 1") if not ( all(i < 1 for i in p) and all(i > 0 for i in p)): raise Warning("Warning: Some values of p are 1 or 0. Adding or " "subtracting a small value, based on `add` and `sub`. " "Values of 1 or 0 may indicate the model is invalid " "for some subjects.") p[p == 0] = p[p == 0] + add p[p == 1] = p[p == 1] - add # logit logit_p = np.log(p/(1-p)) return logit_p
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue(z_statistic, side:Literal['two', 'left', 'right']='two', ) -> float: r""" Calculate the P-value based on the Z-statistic and the standard normal distribution. Parameters ---------- z_statistic : `float` Typically the ratio of the point estimate and the standard error, representing the standardized difference from the value under the null-hypothesis. side : {`two`, `left`, `right`, 'below', 'above'}, default `two` `left` will perform a left-sided, one-sided, z-test. `right` will perform a right-sided, one-sided, z-test. `below` is a synonym for `left`, and 'above` is a synonym for `right`. Returns ------- float P-value. Notes ----- Use `left` if your null-hypothesis is .. math:: \mu \geq \mu_0, and the alternative hypothesis is .. math:: \mu < \mu_0. Use `right` if your null-hypothesis is .. math:: \mu \leq \mu_0, and the alternative hypothesis is .. math:: \mu > \mu_0. So for `left` the alternative hypothesis supposes is to the left of null-hypothesis, while for `right` we assume this is to the right. """ # check input is_type(side, str) SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO, NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE, ] # confirm the choice is correct if not side in SIDES: raise ValueError(Error_MSG.INVALID_STRING.\ format('side', ', '.join(SIDES))) # the left-side p-value pvalue = norm.cdf(z_statistic) # the right-side if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_BELOW: pvalue = 1-pvalue elif side == NamesGen.SIDE_TWO: # two-sided pvalue = min(pvalue, 1-pvalue)*2 return pvalue
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def calculate_pvalue_tdist(t_statistic, degrees, side:Literal['two', 'left', 'right']='two', ): """ Calculate the P-value based on the t-statistic and the t distribution. The function assumes one want to perform a two-sided test. Parameters ---------- t_statistic : `float` Typically the ratio of the point estimate and the standard error, representing the standardized difference from the value under the null-hypothesis. degrees : `int` The degrees of freedom. side : {`two`, `left`, `right`, 'below', 'above'}, default `two` `left` will perform a left-sided, one-sided, z-test. `right` will perform a right-sided, one-sided, z-test. `below` is a synonym for `left`, and 'above` is a synonym for `right`. Returns ------- float P-value. """ # check input is_type(side, str) SIDES = [NamesGen.SIDE_LEFT, NamesGen.SIDE_RIGHT, NamesGen.SIDE_TWO, NamesGen.SIDE_BELOW, NamesGen.SIDE_ABOVE, ] # confirm the choice is correct if not side in SIDES: raise ValueError(Error_MSG.INVALID_STRING.\ format('side', ', '.join(SIDES))) # the left-side p-value pvalue = t.cdf(t_statistic, degrees) # the right-side if side == NamesGen.SIDE_RIGHT or side == NamesGen.SIDE_BELOW: pvalue = 1-pvalue elif side == NamesGen.SIDE_TWO: # two-sided pvalue = min(pvalue, 1-pvalue)*2 return(pvalue)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] class ObservedPower(object): """ Computes observed power (two-sided, above, below) for a given z-statistic, based on the normal distribution. Parameters ---------- statistic : `float` The z-statistic (e.g., estimate / standard error). alpha : `float`, default 0.05 The type I error rate. **kwargs Optional keyword arguments passed to the distribution. Methods ------- compute_power() Compute two-sided observed power using the configured distribution. compute_power_above() Compute one-sided observed power for the 'above' (right) side. compute_power_below() Compute one-sided observed power for the 'below' (left) side. Notes ----- Observed power is equivalent to the p-value evaluated under the alternative hypothesis instead of the null-hypothesis. For example, if the p-value is 0.05 and one is performing a test against an alpha of 0.05, the observed power is 50% (i.e., if we would repeat the exact same experiment, under the alternative hypothesis, there would be a 50% probability of observing a p-value smaller than 0.05. Because, of the equivalence between the p-value and observed power, it has very limited application. """ # class-level variable _distribution = norm # Default to the normal distribution # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __init__(self, statistic: float, alpha: float = 0.05, **kwargs: Any, ) -> None: """ The class init. """ is_type(statistic, (float, int)) is_type(alpha, (float, int)) # is alpha between 0 and 1 if alpha <= 0 or alpha >= 1: raise ValueError(Error_MSG.FLOAT_LIMITS.format('alpha', 0, 1)) self.statistic = float(statistic) self.alpha = float(alpha) self._dist_kwargs = kwargs # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power(self) -> float: """ Compute two-sided observed power using the configured distribution. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ self._distribution.ppf(1 - self.alpha / 2, **self._dist_kwargs) # Sums probabilities in both tails. return ( self._distribution.cdf( -self.statistic - self.crit, **self._dist_kwargs) + self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs) )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_above(self) -> float: """ Compute one-sided observed power for the 'above' (right) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ self._distribution.ppf(1 - self.alpha, **self._dist_kwargs) return self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs] def compute_power_below(self) -> float: """ Compute one-sided observed power for the 'below' (left) side. Attributes ---------- crit : `float` The critical value beyond which the null-hypothesis would be rejected. Returns ------- float The observed power. """ self.crit =\ -self._distribution.ppf(1 - self.alpha, **self._dist_kwargs) return self._distribution.cdf( self.statistic - self.crit, **self._dist_kwargs)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __str__(self) -> str: """ Returns a human-readable string representation of the instance. """ return (f"ObservedPower with statistic={self.statistic}, " f"alpha={self.alpha}") # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def __repr__(self) -> str: """ Returns an unambiguous string representation of the instance. """ cname = self.__class__.__name__ return (f"{cname}(statistic={self.statistic}, alpha={self.alpha}, " f"dist_kwargs={self._dist_kwargs})")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPowerT(ObservedPower):
    """
    Observed power (two-sided, above, below) for a test-statistic,
    evaluated under the t-distribution.

    Parameters
    ----------
    statistic : `float`
        The t-statistic (e.g., estimate / standard error).
    df : `float`
        The degrees of freedom for the t-distribution.
    alpha : `float`, default 0.05
        The type I error rate.

    Methods
    -------
    compute_power()
        Compute two-sided observed power using the configured distribution.
    compute_power_above()
        Compute one-sided observed power for the 'above' (right) side.
    compute_power_below()
        Compute one-sided observed power for the 'below' (left) side.

    Notes
    -----
    All power computations are inherited from ``ObservedPower``; this
    subclass only swaps the distribution for `t` and forwards the degrees
    of freedom through the parent's keyword arguments.
    """

    # swap the parent's normal distribution for the t-distribution
    _distribution = t

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, df: float,
                 alpha: float = 0.05) -> None:
        """
        The class init; forwards `df` to the parent via **kwargs so every
        distribution call receives it.
        """
        super().__init__(statistic, alpha, df=df)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class ObservedPowerChiSquare(ObservedPower):
    """
    Computes observed power for a given statistic, using the non-central
    chi-square distribution.

    Parameters
    ----------
    statistic : `float`
        The chi-square statistic (e.g., sum of squares of standardised
        residuals).
    df : `float`
        Degrees of freedom for the chi-square distribution.
    alpha : `float`, default 0.05
        The type I error rate.

    Notes
    -----
    Inherits from ObservedPower but overrides the distribution to
    accommodate its strictly positive domain. In a chi-square context, one
    generally considers whether the observed statistic is 'above' a
    critical threshold. Hence, `compute_power()` internally calls
    `compute_power_above()` rather than summing two tails. Because
    chi-square is not defined below zero, `compute_power_below()` is not
    meaningful and thus raises a NotImplementedError.
    """

    # class-level variable
    _distribution = ncx2

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def __init__(self, statistic: float, df: float,
                 alpha: float = 0.05) -> None:
        """
        The class init.
        """
        # Pass df to the parent class's kwargs so it's available to every
        # distribution call.
        super().__init__(statistic, alpha, df=df)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power(self) -> float:
        """
        Compute observed power; delegates to `compute_power_above()`
        because a chi-square test is inherently one-sided (right tail).

        Attributes
        ----------
        crit : `float`
            The critical value beyond which the null-hypothesis would be
            rejected (set by `compute_power_above()`).

        Returns
        -------
        float
            The observed power.
        """
        return self.compute_power_above()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_above(self) -> float:
        """
        Compute observed power for the right tail using the non-central
        chi-square distribution.

        Attributes
        ----------
        crit : float
            The critical value (from the central distribution) beyond
            which the null hypothesis would be rejected.

        Returns
        -------
        float
            The observed power.

        Notes
        -----
        The procedure:

        - First, compute the critical value (ppf) under the central
          chi-square (nc=0).
        - Then, evaluate the cdf at x = self.crit under the non-central
          chi-square with nc = self.statistic.
        - Subtract the cdf value from 1 to obtain the observed power.

        This approach can be easily verified using the normal distribution

        >>> from scipy.stats import norm
        >>> # use a test statistic of 4, and critical value of 1.96
        >>> ncnorm = norm(loc=4, scale=1.0)
        >>> 1-ncnorm.cdf(1.96)
        0.9793
        >>> norm.cdf(4-1.96)
        0.9793
        """
        df = self._dist_kwargs["df"]
        # Compute critical value with central chi-square (nc=0)
        self.crit = self._distribution.ppf(1 - self.alpha, df=df, nc=0)
        # Right-tail mass of the non-central distribution beyond `crit`;
        # DOC FIX: the original docstring claimed this returns the
        # critical value, but it returns the observed power.
        return 1 - self._distribution.cdf(self.crit, df=df,
                                          nc=self.statistic)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    def compute_power_below(self) -> float:
        """
        Raises a NotImplementedError because a chi-square distribution is
        only defined on [0, ∞), so a 'below' tail does not correspond to a
        meaningful test.

        Raises
        ------
        NotImplementedError
            Because 'below' power is not defined for chi-square.
        """
        raise NotImplementedError(
            "compute_power_below() is not meaningful for a chi-square "
            "distribution, which is only defined on [0, ∞)."
        )