Source code for stats_misc.tests

"""
A collection of null-hypothesis tests.
"""
import warnings
from typing import Any, List, Type, Union, Tuple, Dict, ClassVar, Optional

import numpy as np
import pandas as pd
from scipy import stats as ss
from scipy.stats import f

from stats_misc.constants import (
    is_type,
    Error_MSG,
    NamesTest,
    same_len,
)
from stats_misc.utils.general import (
    calculate_pvalue,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class TestResults(object):
    """
    A results object containing test values.

    Attributes
    ----------
    point_estimate : `float`
        The point estimate.
    standard_error : `float`
        The standard error of the point estimate.
    test_statistic : `float`
        The test statistic, evaluated against the null hypothesis value.
    p_value : `float`
        The p-value of the test evaluated against the null hypothesis value.
    null_value : `float`
        The null hypothesis value the point estimate is evaluated against.
    """
    # The keyword names accepted by `__init__`; subclasses override this list
    # to accept a different set of result fields.
    SET_ARGS = [
        NamesTest.POINT,
        NamesTest.POINT_SE,
        NamesTest.STATISTIC,
        NamesTest.PVALUE,
        NamesTest.NULL_VALUE,
    ]

    def __init__(self, **kwargs):
        """
        Store the supplied test values as instance attributes.

        Parameters
        ----------
        **kwargs
            Values keyed by the names listed in the class' `SET_ARGS`.

        Raises
        ------
        AttributeError
            If a keyword is supplied that is not listed in `SET_ARGS`.

        Warns
        -----
        UserWarning
            Once for every `SET_ARGS` name missing from `kwargs`; the
            corresponding attribute is set to `None`.
        """
        # Resolved through the instance's class so subclasses are validated
        # against their own `SET_ARGS`.
        allowed = self.__class__.SET_ARGS
        for key in kwargs:
            if key not in allowed:
                raise AttributeError(
                    "unrecognised argument '{0}'".format(key))
        # Every expected attribute always exists on the instance; the ones
        # that were not supplied default to None (with a warning).
        for name in allowed:
            if name in kwargs:
                setattr(self, name, kwargs[name])
            else:
                warnings.warn(
                    "argument '{0}' is set to 'None'".format(name))
                setattr(self, name, None)

    def __str__(self):
        """Return a short, fixed description of the object."""
        return "An `Test` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class AnovaTestResults(TestResults):
    """
    A results object specifically for ANOVA tests.

    Attributes
    ----------
    test_statistic : float
        The F-statistic.
    p_value : float
        The p-value associated with the F-statistic.
    explained_sum_squares : float
        The sum of squares explained by the model (between-groups sum of
        squares).
    residual_sum_squares : float
        The sum of squares of residuals (within-groups sum of squares).
    df_numerator : int
        The numerator degrees of freedom.
    df_denominator : int
        The denominator degrees of freedom.
    """
    SET_ARGS = [
        NamesTest.STATISTIC,
        NamesTest.PVALUE,
        NamesTest.ESS,
        NamesTest.RSS,
        NamesTest.DF_NUM,
        NamesTest.DF_DENUM,
    ]

    def __init__(self, **kwargs):
        """
        Initialise the ANOVA results object.

        The `null_value`, point estimate and standard error keywords are
        silently discarded when present, because they are not part of this
        class' `SET_ARGS` and the parent initialiser would reject them.
        """
        not_applicable = (
            NamesTest.NULL_VALUE,
            NamesTest.POINT,
            NamesTest.POINT_SE,
        )
        for unsupported in not_applicable:
            kwargs.pop(unsupported, None)
        # The parent initialiser validates against self.__class__.SET_ARGS,
        # i.e. the ANOVA-specific list defined above.
        super().__init__(**kwargs)

    def __str__(self):
        """Return a short, fixed description of the object."""
        return "An `ANOVA Test` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def ks_test(data:pd.DataFrame, group:str, values:str,
            nulldistribution:str='uniform') -> Dict[Any, Any]:
    """
    Will loop over the unique `group` values to perform overall
    null-hypothesis tests comparing sets of values against a
    null-distribution using the Kolmogorov-Smirnov test.

    Parameters
    ----------
    data : `pd.DataFrame`
        A data table.
    group : `str`
        A column name in `data` which will be used to group the `values`.
    values : `str`
        A column name in `data` to which you want to apply the
        Kolmogorov-Smirnov test.
    nulldistribution : `str`, default `uniform`
        The null-distribution the `values` should be compared against. This
        is passed unchanged as the second argument of `scipy.stats.kstest`.

    Returns
    -------
    results : `dict`
        A dictionary with the unique `group` values as keys (in order of
        first appearance) and the object returned by `scipy.stats.kstest`
        for that group as items.

    Notes
    -----
    Missing entries in the `values` column are dropped per group before the
    test is applied.
    """
    ks_res = {}
    for label in data[group].unique():
        sample = data.loc[data[group] == label, values]
        # dropna() replaces the former `np.isnan(...) == False` mask; it
        # keeps the same rows for numeric columns and does not raise on
        # object-dtype columns.
        ks_res[label] = ss.kstest(sample.dropna(), nulldistribution)
    return ks_res
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def wald_interaction_test(point:Union[Tuple[float, float], List],
                          se:Union[Tuple[float, float], List],
                          null_value:float=0.0,
                          ) -> TestResults:
    """
    Test whether the difference between two point estimates is distinct from
    the null hypothesis value (`null_value`).

    The difference between the two point estimates is divided by its
    standard error, which is the square root of the sum of the squared
    standard errors of the individual estimates. The resulting statistic is
    passed to `calculate_pvalue` to obtain the p-value.

    Parameters
    ----------
    point : tuple [`float`, `float`]
        Two point estimates, for example the mean difference or log odds
        ratio.
    se : tuple [`float`, `float`]
        Two standard errors of the point estimates.
    null_value : `float`, default 0.0
        The null-hypothesis value of the difference between the point
        estimates.

    Returns
    -------
    results : `TestResults`
        A results class holding the difference, its standard error, the test
        statistic, the p-value and the null value.

    Raises
    ------
    ValueError
        If `point` or `se` does not contain exactly two elements.
    """
    pairs = (('point', point), ('se', se))
    # Type checks come first, for all three arguments, before any length
    # check is made.
    for label, pair in pairs:
        is_type(pair, (tuple, list, np.ndarray), label)
    is_type(null_value, (int, float), 'null_value')
    for label, pair in pairs:
        if len(pair) != 2:
            raise ValueError(
                Error_MSG.INVALID_EXACT_LENGTH.format(label, 2, len(pair)))
    # Difference of the estimates and the standard error of that difference.
    first_se, second_se = se[0], se[1]
    delta = point[0] - point[1]
    delta_se = np.sqrt(first_se**2 + second_se**2)
    z_score = (delta - null_value)/delta_se
    results = {
        NamesTest.POINT      : delta,
        NamesTest.POINT_SE   : delta_se,
        NamesTest.STATISTIC  : z_score,
        NamesTest.PVALUE     : calculate_pvalue(z_score),
        NamesTest.NULL_VALUE : null_value,
    }
    return TestResults(**results)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def anova_one_way(means:list[int|float], variances:list[int|float],
                  sizes:list[int],
                  ) -> AnovaTestResults:
    """
    Performs a one-way ANOVA based on aggregate data.

    Parameters
    ----------
    means : `list` [`int` | `float`]
        The group means.
    variances : `list` [`int` | `float`]
        The group variances.
    sizes : `list` [`int`]
        The group sample sizes.

    Returns
    -------
    AnovaTestResults
        A results class holding the F-statistic, its p-value, the explained
        and residual sums of squares, and the numerator and denominator
        degrees of freedom.

    Notes
    -----
    This is an alternative to scipy's one-way ANOVA implementation which
    requires access to the individual observations (i.e. for each row in a
    table).
    """
    # ### confirm input
    is_type(means, list)
    is_type(variances, list)
    is_type(sizes, list)
    # check length; the labels name the two arguments actually compared
    same_len(means, variances, ['means', 'variances'])
    same_len(means, sizes, ['means', 'sizes'])
    # ### get derived information
    n_total = sum(sizes)
    n_groups = len(sizes)
    # size-weighted mean over all groups
    grand_mean = sum(m*s for m, s in zip(means, sizes))/n_total
    df_num = n_groups - 1
    df_den = n_total - n_groups
    # explained (between-groups) sum of squares
    ess = sum(s*(m-grand_mean)**2 for m, s in zip(means, sizes))
    # residual (within-groups) sum of squares
    rss = sum(v*(s-1) for v, s in zip(variances, sizes))
    # ### calculate the test statistic and p-value
    fstat = (ess/df_num)/(rss/df_den)
    # The survival function is the upper tail 1 - cdf, evaluated directly so
    # that very small p-values are not lost to floating-point cancellation.
    pval = f.sf(fstat, df_num, df_den)
    return AnovaTestResults(**{
        NamesTest.STATISTIC : fstat,
        NamesTest.PVALUE    : pval,
        NamesTest.ESS       : ess,
        NamesTest.RSS       : rss,
        NamesTest.DF_NUM    : df_num,
        NamesTest.DF_DENUM  : df_den,
    })