"""
A collection of null-hypothesis tests.
"""
import warnings
from typing import Any, List, Type, Union, Tuple, Dict, ClassVar, Optional

import numpy as np
import pandas as pd
from scipy import stats as ss
from scipy.stats import f

from stats_misc.utils.general import (
    calculate_pvalue,
)
from stats_misc.constants import (
    is_type,
    Error_MSG,
    NamesTest,
    same_len,
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class TestResults:
    """
    A results object containing test values.

    Attributes
    ----------
    point_estimate : `float`
        The point estimate.
    standard_error : `float`
        The standard error of the point estimate.
    test_statistic : `float`
        The test statistic, evaluated against the null hypothesis value.
    p_value : `float`
        The p-value of the test evaluated against the null hypothesis value.
    null_value : `float`
        The null hypothesis value the point estimate is evaluated against.

    Notes
    -----
    The accepted keyword arguments, and the attribute names set on the
    instance, are the entries of the class attribute `SET_ARGS`. Subclasses
    can override `SET_ARGS` to accept a different set of arguments.
    """
    SET_ARGS = [
        NamesTest.POINT,
        NamesTest.POINT_SE,
        NamesTest.STATISTIC,
        NamesTest.PVALUE,
        NamesTest.NULL_VALUE,
    ]

    def __init__(self, **kwargs):
        """
        Initialise the results object from keyword arguments.

        Parameters
        ----------
        **kwargs
            Values keyed by the names listed in `SET_ARGS`.

        Raises
        ------
        AttributeError
            If a keyword argument is not listed in `SET_ARGS`.

        Warns
        -----
        UserWarning
            For every name in `SET_ARGS` that was not supplied; the
            corresponding attribute is then set to `None`.
        """
        # Resolve through the instance's class so subclasses overriding
        # `SET_ARGS` are validated against their own list.
        allowed = type(self).SET_ARGS
        for key in kwargs:
            if key not in allowed:
                raise AttributeError(
                    "unrecognised argument '{0}'".format(key))
        # Assign every expected attribute; missing ones default to None and
        # trigger a warning so the omission does not go unnoticed.
        for name in allowed:
            if name in kwargs:
                setattr(self, name, kwargs[name])
            else:
                warnings.warn(
                    "argument '{0}' is set to 'None'".format(name))
                setattr(self, name, None)

    def __str__(self):
        return "An `Test` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
class AnovaTestResults(TestResults):
    """
    Results container tailored to ANOVA tests.

    Attributes
    ----------
    test_statistic : float
        The F-statistic.
    p_value : float
        The p-value associated with the F-statistic.
    explained_sum_squares : float
        The sum of squares explained by the model (between-groups sum of
        squares).
    residual_sum_squares : float
        The sum of squares of residuals (within-groups sum of squares).
    df_numerator : int
        The numerator degrees of freedom.
    df_denominator : int
        The denominator degrees of freedom.
    """
    SET_ARGS = [
        NamesTest.STATISTIC,
        NamesTest.PVALUE,
        NamesTest.ESS,
        NamesTest.RSS,
        NamesTest.DF_NUM,
        NamesTest.DF_DENUM,
    ]

    def __init__(self, **kwargs):
        """
        Initialise the ANOVA results object.

        The null value, point estimate and point-estimate standard error
        are not part of an ANOVA result. If any of them is supplied it is
        silently discarded, so the parent initialiser does not reject it as
        an unrecognised argument.
        """
        not_applicable = (
            NamesTest.NULL_VALUE,
            NamesTest.POINT,
            NamesTest.POINT_SE,
        )
        for key in not_applicable:
            kwargs.pop(key, None)
        # The parent validates and assigns against this class's SET_ARGS.
        super().__init__(**kwargs)

    def __str__(self):
        return "An `ANOVA Test` results class."
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def ks_test(data:pd.DataFrame, group:str, values:str,
            nulldistribution:str='uniform') -> Dict[Any, Any]:
    """
    Will loop over the unique `group` values to perform overall
    null-hypothesis tests comparing sets of values against a
    null-distribution using the Kolmogorov-Smirnov test.

    Parameters
    ----------
    data : `pd.DataFrame`
        A data table.
    group : `str`
        A column name in `data` which will be used to group the `values`.
    values : `str`
        A column name in `data` to which you want to apply the
        Kolmogorov-Smirnov test to.
    nulldistribution : str, default `uniform`
        The null-distribution the `values` should be compared against. This
        is passed on to `scipy.stats.kstest`.

    Returns
    -------
    results : `dict`
        A dictionary with the `group` values as keys and the result object
        returned by `scipy.stats.kstest` as items. Keys appear in order of
        first occurrence in `data`.

    Notes
    -----
    Missing entries in `values` are dropped before testing. Rows with a
    missing `group` label are skipped: a missing label never compares equal
    to itself, so it could only ever select an empty sample.
    """
    labels = data[group]
    ks_res = {}
    for label in labels.dropna().unique():
        # `dropna` works for any dtype, unlike `np.isnan`.
        sample = data.loc[labels == label, values].dropna()
        ks_res[label] = ss.kstest(sample, nulldistribution)
    # return
    return ks_res
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def wald_interaction_test(point:Union[Tuple[float, float], List],
                          se:Union[Tuple[float, float], List],
                          null_value:float=0.0,
                          ) -> TestResults:
    """
    Test whether the difference between two point estimates is distinct
    from the null hypothesis value (`null_value`).

    The difference between the two point estimates is divided by its
    standard error, which is the square root of the sum of the squared
    standard errors of the individual estimates. The resulting statistic is
    handed to `calculate_pvalue` to obtain the p-value.

    Parameters
    ----------
    point : tuple [`float`, `float`]
        Two point estimates, for example the mean difference or log odds
        ratio.
    se : tuple [`float`, `float`]
        Two standard errors of the point estimates.
    null_value : `float`, default 0.0
        The null-hypothesis value of the difference between the point
        estimates.

    Returns
    -------
    results : `TestResults`
        A results class holding the difference, its standard error, the
        test statistic, the p-value and the null value.

    Raises
    ------
    ValueError
        If `point` or `se` does not contain exactly two elements.
    """
    # validate the input types
    is_type(point, (tuple, list, np.ndarray), 'point')
    is_type(se, (tuple, list, np.ndarray), 'se')
    is_type(null_value, (int, float), 'null_value')
    # both inputs must be pairs
    for label, pair in (('point', point), ('se', se)):
        n_items = len(pair)
        if n_items != 2:
            raise ValueError(
                Error_MSG.INVALID_EXACT_LENGTH.format(label, 2, n_items))
    # difference and its standard error
    estimate_a, estimate_b = point
    se_a, se_b = se
    difference = estimate_a - estimate_b
    se_difference = np.sqrt(se_a**2 + se_b**2)
    # z-statistic relative to the null value, and its p-value
    z_statistic = (difference - null_value)/se_difference
    p_value = calculate_pvalue(z_statistic)
    # collect and return
    results = {
        NamesTest.POINT : difference,
        NamesTest.POINT_SE : se_difference,
        NamesTest.STATISTIC : z_statistic,
        NamesTest.PVALUE : p_value,
        NamesTest.NULL_VALUE : null_value,
    }
    return TestResults(**results)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def anova_one_way(means:list[int|float], variances:list[int|float],
                  sizes:list[int],
                  ) -> AnovaTestResults:
    """
    Performs a one-way ANOVA based on aggregate data.

    Parameters
    ----------
    means : `list` [`int` | `float`]
        The group means.
    variances : `list` [`int` | `float`]
        The group variances.
    sizes : `list` [`int`]
        The group sample sizes.

    Returns
    -------
    AnovaTestResults
        A results class holding the F-statistic, its p-value, the explained
        and residual sums of squares, and the numerator and denominator
        degrees of freedom.

    Notes
    -----
    This is an alternative to scipy's one-way ANOVA implementation which
    requires access to the individual observations (i.e. for each row in
    a table).

    The numerator degrees of freedom are `len(sizes) - 1` and the
    denominator degrees of freedom are `sum(sizes) - len(sizes)`. Both are
    used as divisors, so at least two groups and more observations than
    groups are needed for the F-statistic to be defined.
    """
    # ### confirm input
    is_type(means, list)
    is_type(variances, list)
    is_type(sizes, list)
    # check length; the labels must name the arguments actually compared
    same_len(means, variances, ['means', 'variances'])
    same_len(means, sizes, ['means', 'sizes'])
    # ### get derived information
    n_total = sum(sizes)
    n_groups = len(sizes)
    grand_mean = sum(m*s for m, s in zip(means, sizes))/n_total
    df_num = n_groups - 1
    df_den = n_total - n_groups
    # explained (between-groups) sum of squares
    ess = sum(s*(m-grand_mean)**2 for m, s in zip(means, sizes))
    # residual (within-groups) sum of squares
    rss = sum(v*(s-1) for v, s in zip(variances, sizes))
    # ### calculate the test statistic and p-value
    fstat = (ess/df_num)/(rss/df_den)
    # The survival function gives the upper tail directly; `1 - cdf` rounds
    # to zero once the p-value drops below floating-point resolution.
    pval = f.sf(fstat, df_num, df_den)
    # return
    return AnovaTestResults(**{
        NamesTest.STATISTIC : fstat,
        NamesTest.PVALUE : pval,
        NamesTest.ESS : ess,
        NamesTest.RSS : rss,
        NamesTest.DF_NUM : df_num,
        NamesTest.DF_DENUM : df_den,
    })