Source code for stats_misc.genetics.general

#!/usr/bin/env python3
"""Genetics utility functions for the stats-misc module."""
from __future__ import annotations

import numpy as np

from stats_misc.errors import InputValidationError

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def minor_allele_frequency(genotypes: list[int | float]) -> float:
    """
    Compute the minor allele frequency (MAF) of a variant from its dosages.

    Parameters
    ----------
    genotypes : `list` [`int` | `float`]
        Genotype dosages, one per individual, giving the number of copies
        of the effect allele carried (0, 1, or 2). Non-integer dosages in
        [0, 2], such as those produced by imputation, are also valid. The
        values are converted to a float array before any check is applied.

    Returns
    -------
    maf : `float`
        The frequency of the rarer of the two alleles, which lies in the
        interval [0, 0.5].

    Raises
    ------
    InputValidationError
        If no genotypes are supplied, if any genotype is missing (NaN), or
        if any dosage lies outside the range [0, 2].

    Notes
    -----
    With ``p`` the effect allele frequency, estimated as half of the mean
    dosage, the value returned is the smaller of ``p`` and ``1 - p``.

    Examples
    --------
    >>> minor_allele_frequency([0, 1, 1, 2, 0])
    0.4
    """
    dosages = np.asarray(genotypes, dtype=float)

    # An empty array holds no NaN, so checking emptiness before missingness
    # yields the same error for every input.
    if dosages.size == 0:
        raise InputValidationError("no genotypes provided")

    # Missing data is never imputed or dropped here; the caller must deal
    # with it before calling.
    if np.isnan(dosages).any():
        raise InputValidationError("genotypes contain missing values (NaN)")

    # With NaN excluded, the extremes are enough to test the whole array
    # against the valid dosage range.
    if dosages.min() < 0 or dosages.max() > 2:
        raise InputValidationError("genotype dosages must be between 0 and 2")

    # Each individual carries two alleles, so half the mean dosage is the
    # frequency of the effect allele.
    effect_freq = dosages.mean() / 2.0
    other_freq = 1.0 - effect_freq

    # Report whichever allele is the rarer one (ties resolve to 0.5).
    minor_freq = effect_freq if effect_freq <= other_freq else other_freq
    return float(minor_freq)