Source code for stats_misc.meta_analysis

"""
Meta-analysis estimators for weighted averages and heterogeneity assessment.

This module provides methods for calculating weighted averages of point
estimates using fixed-effect and random-effects models. It includes
implementations of standard estimators for between-study variance
(tau-squared), Q-tests for heterogeneity, and I-squared statistics.
Iterative and non-iterative estimators are supported, including
DerSimonian-Laird, Cochrane ANOVA, Paule-Mandel, and method of moments.

Classes
-------
HeterogeneityResults
    A results container for heterogeneity estimates including Q-statistic,
    I-squared, and tau-squared values.

Functions
---------
fixed_effect(estimates, standard_errors)
    Computes a fixed-effect meta-analytic estimate and standard error.

random_effects(estimates, standard_errors, between_estimate_variance)
    Computes a random-effects meta-analytic estimate and standard error.

heterogeneity(estimates, standard_errors, overall_estimate, alpha=0.05,
              tau2=None)
    Computes heterogeneity measures: Q-test, I-squared, and tau-squared.

estimate_tau(estimates, standard_errors, method='mm-it', **kwargs)
    Estimates the between-study variance (tau-squared) using various methods.

Notes
-----
These implementations are partially adapted from `statsmodels`, with
adjustments for flexibility and reproducibility.
"""

import warnings
import numpy as np
from typing import Any, Literal
from scipy.stats import (
    chi2,
    norm,
)
from stats_misc.constants import (
    Error_MSG,
    NamesMetaAnalysis as NamesMA,
    CLASS_NAME,
)
from stats_misc.errors import (
    is_type,
    string_join_delimiter,
    same_len,
    check_limits,
)
from stats_misc.utils.helpers import (
    Results,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# constants
EXPECTED_TAU_METHODS = [NamesMA.TAU_METHOD_DL, NamesMA.TAU_METHOD_CA,
                        NamesMA.TAU_METHOD_MM_IT, NamesMA.TAU_METHOD_CA2,
                        NamesMA.TAU_METHOD_DL2, NamesMA.TAU_METHOD_PM_IT,
                        ]
MSG1 = Error_MSG.DIFF_LENGTHS
MSG2 = Error_MSG.INCORRECT_STRING_INPUT

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
class HeterogeneityResults(Results):
    """
    A results container for heterogeneity estimates.
    
    Attributes
    ----------
    q_statistic : `float`
        The Q-statistic.
    q_pvalue : `float`
        The Q-statistic p-value.
    i_squared : `float`
        The I-squared heterogeneity statistic.
    i_squared_ci : `tuple` [`float`]
        The I-squared confidence interval.
    i_squared_ci_coverage : `float`
        The confidence level of the I-squared interval.
    tau_squared : `float`
        The tau-squared heterogeneity statistic.
    """
    SET_ARGS = [
        CLASS_NAME,
        NamesMA.QSTAT,
        NamesMA.QPVAL,
        NamesMA.ISQR,
        NamesMA.ISQR_CI,
        NamesMA.ISQR_CI_COV,
        NamesMA.TSQR,
    ]
    def __init__(self, **kwargs) -> None:
        super().__init__(set_args=self.SET_ARGS, **kwargs)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Tau-square estimates
def _tau_iter_pm(estimates:list[float], standard_errors:list[float],
                 tau_start:float=0, atol:float=1e-5, maxiter:int=50,
                 ) -> tuple[float, bool]:
    """
    The Paule-Mandel (PM) iterative estimator of the between-study variance.
    
    This method estimates tau-squared by iteratively solving the estimating
    equation until convergence.
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    tau_start : `float`, default 0
        Starting value of the tau-squared.
    atol : `float`, default 1e-5
        Convergence tolerance. Itteration will terminate when the change is
        smaller than `atol`.
    maxiter : `int`, default 50
        Maximum number of iterations.
    
    Returns
    -------
    float
        Estimated tau-squared.
    bool
        Whether the algorithm converged.
    
    Notes
    -----
    See DerSimonian and Kacker 2007 [1]_.
    
    This code was addapted from
    `statsmodels <https://github.com/statsmodels/statsmodels/blob/main/statsmodels/stats/meta_analysis.py#L628>`_.
    
    Hash: d5b11ca
    
    References
    ----------
    .. [1] DerSimonian R, Kacker R. Random-effects model for meta-analysis of
        clinical trials: an update. Contemp Clin Trials. 2007;28(2):105-114.
        doi:10.1016/j.cct.2006.04.004
    """
    # #### check input
    is_type(estimates, list)
    is_type(standard_errors, list)
    is_type(tau_start, (int, float))
    is_type(atol, (int, float))
    is_type(maxiter, int)
    same_len(estimates, standard_errors, ['estimates', 'standard_errors'])
    tausquared = tau_start
    v = [s**2 for s in standard_errors]
    # #### iteration
    converged = False
    for _ in range(maxiter):
        w = np.array([1/(i + tausquared) for i in v])
        m = w.dot(estimates) / w.sum(0)
        squared_res = [(e-m)**2 for e in estimates]
        q_w = w.dot(squared_res)
        # F(tausquard_[-i]) estimating equation
        ee = q_w - (len(estimates) - 1)
        # if negative set tau to zero
        if ee < 0:
            tausquared = 0
            converged = True
            break
        # if close enough to 0 stop
        if np.allclose(ee, 0, atol=atol):
            converged = True
            break
        # update tausquared
        delta = ee / (w**2).dot(squared_res)
        tausquared += delta
    # return
    return tausquared, converged

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _tau_mm(estimates:list[float], standard_errors:list[float],
            weights:list[float]
            ) -> float:
    """
    Method of moments estimator for the tau-squared (i.e. the between study
    variance)
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    weights : `list` [`float`]
        The weights used to calculate the overall point estimates using
        a weighted average.
    
    Returns
    -------
    float
        Estimated tau-squared.
    
    Notes
    -----
    This code was addapted from
    `statsmodels <https://github.com/statsmodels/statsmodels/blob/main/statsmodels/stats/meta_analysis.py#L713C1-L754C1>`_.
    
    Hash: d5b11ca
    
    The code is based on [1]_ and implements equation 6, where depending on the
    weights on can use the following tau estimators: Cochrane ANOVA (CA),
    DerSimonian and Laird (DL), CA2 (the two-stage CA estimator), and the DL2
    (the two stage DL estimator).
    
    | Estimator | Weights                    | Notes                |
    | --------- | -------------------------- | -------------------- |
    | CA        | 1/k                        | for k studies        |
    | DL        | 1/(s_i)**2                 | for standard error s |
    | CA2       | 1/(_tau_mm(CA) + (s_i)**2) |           -          |
    | DL2       | 1/(_tau_mm(DL) + (s_i)**2) |           -          |
    | --------- | -------------------------- | -------------------- |
    
    References
    ----------
    .. [1] DerSimonian R, Kacker R. Random-effects model for meta-analysis of
        clinical trials: an update. Contemp Clin Trials. 2007;28(2):105-114.
        doi:10.1016/j.cct.2006.04.004
    
    """
    
    # #### test input
    is_type(estimates, list)
    is_type(standard_errors, list)
    is_type(weights, list)
    same_len(estimates, standard_errors, ['estimates', 'standard_errors'])
    same_len(weights, standard_errors, ['weights', 'standard_errors'])
    # ### calculations
    v = [s**2 for s in standard_errors]
    w = np.array(weights)
    # weighted estimate
    m = w.dot(estimates)/w.sum(0)
    # squared residuals
    resid_sq = [(e - m)**2 for e in estimates]
    q_w = w.dot(resid_sq)
    w_t = w.sum()
    expect = w.dot(v) - (w**2).dot(v)/w_t
    denom = w_t - (w**2).sum()/w_t
    # moment estimate from estimating equation
    tau2 = (q_w - expect)/denom
    # return tau
    return tau2

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _tau_iter_mm(estimates:list[float], standard_errors:list[float],
                 tau_start:float=0, atol:float=1e-5, maxiter:int=50,
                 ) -> tuple[float, bool]:
    """
    General method-of-moments iterative estimator of tau-squared.
    
    This approach updates tau-squared until convergence using MM equations.
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    tau_start : `float`, default 0
        Starting value of the tau-squared.
    atol : `float`, default 1e-5
        Convergence tolerance for change in tau-squared estimate between
        iterations.
    maxiter : `int`, default 50
        Maximum number of iterations.
    
    Returns
    -------
    float
        Estimated tau-squared.
    bool
        Whether the algorithm converged.
    
    Notes
    -----
    See DerSimonian and Kacker 2007 [1]_.
    
    This code was addapted from
    `statsmodels <https://github.com/statsmodels/statsmodels/blob/main/statsmodels/stats/meta_analysis.py#L713>`_.
    
    Hash: d5b11ca
    
    References
    ----------
    .. [1] DerSimonian R, Kacker R. Random-effects model for meta-analysis of
        clinical trials: an update. Contemp Clin Trials. 2007;28(2):105-114.
        doi:10.1016/j.cct.2006.04.004
    
    """
    # #### check input
    is_type(estimates, list)
    is_type(standard_errors, list)
    is_type(tau_start, (int, float))
    is_type(atol, (int, float))
    is_type(maxiter, int)
    same_len(estimates, standard_errors, ['estimates', 'standard_errors'])
    tausquared = tau_start
    v = [s**2 for s in standard_errors]
    # #### iterating
    converged = False
    for _ in range(maxiter):
        w = [1/(i + tausquared) for i in v]
        tau_new = _tau_mm(estimates, standard_errors, w)
        tau_new = max(0, tau_new)
        delta = tau_new - tausquared
        # if close enough to 0 stop
        if np.allclose(delta, 0, atol=atol):
            converged = True
            break
        # final result
        tausquared = tau_new
    # return
    return tausquared, converged

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def estimate_tau(
    estimates:list[float], standard_errors:list[float],
    method:Literal['dl', 'ca', 'dl2', 'ca2', 'mm-it', 'pm-it']='mm-it',
    **kwargs: Any,
) -> float:
    """
    Calculate the between study/estimate variance, see DerSimonian and Kacker
    2007 [1]_.
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    method : {'dl', 'ca', 'dl2', 'ca2', 'mm-it', 'pm-it'}, default `dl2`
        Which methods of moments (MM) estimator to use:
        
        - dl    : DerSimonian and Laird (DL) estimator,
        - ca    : Cochrane ANOVA (CA) estimator,
        - dl2   : The two-stage DL estimator,
        - ca2   : The two-stage CA estimator,
        - mm-it : The general MM iterative estimator,
        - pm-it : The Paule Mandel iterative estimator.
    **kwargs : `any`
        Keyword arguments supplied to `_tau_iter_mm` or `_tau_iter_pm`.
    
    Returns
    -------
    float
        Estimated tau-squared.
    
    References
    ----------
    .. [1] DerSimonian R, Kacker R. Random-effects model for meta-analysis of
        clinical trials: an update. Contemp Clin Trials. 2007;28(2):105-114.
        doi:10.1016/j.cct.2006.04.004
    
    """
    
    # #### confirm input
    is_type(estimates, list)
    is_type(standard_errors, list)
    is_type(method, str)
    same_len(estimates, standard_errors, ['estimates', 'standard_errors'])
    # ensure the same format
    method = method.lower()
    tau2 = None
    if method not in EXPECTED_TAU_METHODS:
        raise ValueError(Error_MSG.INCORRECT_STRING_INPUT.method('method',
            string_join_delimiter(EXPECTED_TAU_METHODS)))
    # #### perform calculations
    if method == NamesMA.TAU_METHOD_CA:
        w = [1/len(estimates)] * len(estimates)
        tau2 = _tau_mm(estimates=estimates, standard_errors=standard_errors,
                       weights=w)
    elif method == NamesMA.TAU_METHOD_DL:
        w = [1/s**2 for s in standard_errors]
        tau2 = _tau_mm(estimates=estimates, standard_errors=standard_errors,
                       weights=w)
    elif method == NamesMA.TAU_METHOD_CA2:
        w = [1/len(estimates)] * len(estimates)
        tau2_temp = _tau_mm(estimates=estimates, standard_errors=standard_errors,
                            weights=w)
        w2 = [1/(tau2_temp + s**2) for s in standard_errors]
        # the final tau estimate
        tau2 = _tau_mm(estimates=estimates, standard_errors=standard_errors,
                       weights=w2)
    elif method == NamesMA.TAU_METHOD_DL2:
        w = [1/s**2 for s in standard_errors]
        tau2_temp = _tau_mm(estimates=estimates, standard_errors=standard_errors,
                            weights=w)
        w2 = [1/(tau2_temp + s**2) for s in standard_errors]
        # the final tau estimate
        tau2 = _tau_mm(estimates=estimates, standard_errors=standard_errors,
                       weights=w2)
    elif method == NamesMA.TAU_METHOD_MM_IT:
        tau2, converged = _tau_iter_mm(estimates=estimates,
                               standard_errors=standard_errors,
                               **kwargs)
        if converged == False:
            warnings.warn(
                Error_MSG.NON_CONVERGENCE.format(NamesMA.TAU_METHOD_MM_IT))
    elif method == NamesMA.TAU_METHOD_PM_IT:
        tau2, converged = _tau_iter_pm(estimates=estimates,
                               standard_errors=standard_errors,
                               **kwargs)
        if converged == False:
            warnings.warn(
                Error_MSG.NON_CONVERGENCE.format(NamesMA.TAU_METHOD_PM_IT))
    # #### return
    return tau2


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Q-test

[docs]
def heterogeneity(estimates:list[float], standard_errors:list[float],
                  overall_estimate:float | int,
                  alpha:float=0.05,
                  tau2:float | None=None,
                  ) -> HeterogeneityResults:
    """
    Provides heterogeneity estimates (Q-test, I-squared, and Tau-square) [2]_,
    determining too what extent the individual `estimates` are distinct from
    the `overall_estimate`, accounting for difference in precision based on
    the supplied estimate specific standard_errors.
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    overall_estimate : `float` or `int`
        The overall estimate, for example based on a meta-analysis of the
        point estimates.
    alpha : `float`, default 0.05
        The alpha for the (1-alpha/2)*100% confidence interval.
    tau2 : `float`, optional
        A possible external estimate of the tausquard used in the random effects
        estimator. If `NoneType` a non-iterative MM estimate will be used.
    
    Returns
    -------
    HeterogeneityResults
        Object containing heterogeneity statistics.
    
    References
    ----------
    .. [2] Higgins JP, Thompson SG. Quantifying heterogeneity in a
        meta-analysis. Stat Med. 2002;21(11):1539-1558. doi:10.1002/sim.1186
    
    Notes
    -----
    Heterogeneity will be internally evaluated using a Chi-square distribution
    with `len(estimates) - 1` degrees of freedom.
    
    The heterogeneity estimates are based on [2]_, specifically the tau-squared
    is estimated using the DerSimonian and Laird method of moments method
    (without iteration).
    """
    
    # #### check input
    is_type(estimates, list)
    is_type(standard_errors, list)
    is_type(overall_estimate, (int, float))
    is_type(alpha, float)
    same_len(estimates, standard_errors, ['estimates', 'standard_errors'])
    # is alpha between 0 and 1
    check_limits(alpha, 0.0, 1.0)
    # #### Calculate the Q-test
    k = len(estimates) - 1
    weights = [1/s**2 for s in standard_errors]
    qstatistic = sum(
        [w * (b-overall_estimate)**2 for w,b in zip(weights, estimates)]
    )
    # if there is only one study set to 1
    if k == 0:
        qpvalue = 1
    else:
        qpvalue = 1-chi2.cdf(qstatistic, k)
    # #### Calcualte the I-squared
    if qstatistic <= k:
        isquared = 0
    else:
        isquared = 100*(qstatistic-k)/qstatistic
    # get confidence intervals
    H = qstatistic/k
    H_se = 0.5*(np.log(qstatistic) - np.log(k))/\
        (np.sqrt(2*qstatistic) - np.sqrt(2*k-2))
    H_ub = np.exp(np.log(H) + norm.ppf(1-alpha/2)*H_se)
    H_lb = np.exp(np.log(H) - norm.ppf(1-alpha/2)*H_se)
    isquared_ub = 100*(H_ub-1)/H_ub
    isquared_lb = 100*(H_lb-1)/H_lb
    # truncate if need - this is really poor, but apparently standard
    if isquared_ub < 0:
        isquared_ub = 0
    if isquared_lb < 0:
        isquared_lb = 0
    if isquared_ub > 100:
        isquared_ub = 100
    if isquared_lb > 100:
        isquared_lb = 100
    # #### Calcualte the Tau-squared
    tausquared = tau2
    if tau2 is None:
        ctau = sum(weights) - sum(w**2 for w in weights)/sum(weights)
        tausquared = (qstatistic - k)/ctau
        if tausquared < 0:
            tausquared = 0
    # ### results
    results = {
        CLASS_NAME : 'heterogeneity',
        NamesMA.QSTAT: qstatistic, NamesMA.QPVAL: qpvalue,
        NamesMA.ISQR: isquared,
        NamesMA.ISQR_CI: (isquared_lb, isquared_ub),
        NamesMA.ISQR_CI_COV: 1-alpha,
        NamesMA.TSQR: tausquared,
    }
    return HeterogeneityResults(**results)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def fixed_effect(estimates:list[float], standard_errors:list[float],
                  ) -> tuple[float, float]:
    """
    Compute fixed-effect meta-analysis estimate.
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    
    Returns
    -------
    estimate : `float`
        The average estimate,
    standard_error : `float`
        The standard error of the average estimate.
    """
    # #### check input
    is_type(estimates, list)
    is_type(standard_errors, list)
    # #### Calculations
    w = [1/s**2 for s in standard_errors]
    # weighted average
    b_pool = sum(e*w for e,w in zip(estimates, w))/np.sum(w)
    # weighted standard error
    se_pool = np.sqrt(1/np.sum(w))
    # return stuff
    return b_pool, se_pool


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def random_effects(estimates:list[float], standard_errors:list[float],
                   between_estimate_variance: float,
                   ) -> tuple[float, float]:
    """
    Compute random-effects meta-analysis estimate.
    
    Parameters
    ----------
    estimates : `list` [`float`]
        A list of point estimates (e.g., mean differences or log odds ratios).
    standard_errors : `list` [`float`]
        Corresponding standard errors for each estimate.
    between_estimate_variance : `float`
        The between estimate variance, often referred to as the tau-squared.
        Can be estimated using the `Heterogeneity` function.
    
    Returns
    -------
    estimate : `float`
        The average estimate,
    standard_error : `float`
        The standard error of the average estimate.
    
    Notes
    -----
    Setting `between_estimate_variance` to zero will results in a fixed effect
    estimate.
    """
    # #### check input
    is_type(estimates, list)
    is_type(standard_errors, list)
    is_type(between_estimate_variance, (int, float))
    # #### Calculations
    # updates the standard error to include the between_estimate_variance
    s_random = [np.sqrt(s**2 + between_estimate_variance)\
                for s in standard_errors]
    # supply this to the regular fixed effect function
    estimate, standard_error = fixed_effect(estimates=estimates,
                                            standard_errors=s_random,
                                            )
    # return
    return estimate, standard_error