#!/usr/bin/env python3
"""Genetics utility functions for the stats-misc module."""
from __future__ import annotations
import numpy as np
from stats_misc.errors import InputValidationError
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def minor_allele_frequency(genotypes: list[int | float]) -> float:
    """
    Return the minor allele frequency (MAF) implied by genotype dosages.

    Parameters
    ----------
    genotypes : `list` [`int` | `float`]
        Genotype dosages, one per individual, giving the number of copies
        of the effect allele (0, 1, or 2). Non-integer dosages within
        [0, 2], such as those produced by imputation, are also valid.
        The values are converted to a float array before any checks.

    Returns
    -------
    maf : `float`
        The smaller of the effect-allele frequency and its complement,
        hence a value in the interval [0, 0.5].

    Raises
    ------
    InputValidationError
        If no genotypes are supplied, if any value is missing (NaN), or
        if any dosage lies outside [0, 2].

    Notes
    -----
    With ``p`` the effect-allele frequency, estimated as half of the mean
    dosage, the MAF is ``min(p, 1 - p)``.

    Examples
    --------
    >>> minor_allele_frequency([0, 1, 1, 2, 0])
    0.4
    """
    dosages = np.asarray(genotypes, dtype=float)

    # An empty array and an array holding NaN are mutually exclusive, so
    # the order of these two guards does not change which error is raised.
    if dosages.size == 0:
        raise InputValidationError("no genotypes provided")

    # Missing data is never imputed or dropped here; callers must resolve
    # it before asking for a frequency.
    if np.isnan(dosages).any():
        raise InputValidationError("genotypes contain missing values (NaN)")

    # The array is now non-empty and NaN-free, so its extremes are enough
    # to decide whether every dosage lies within [0, 2].
    if dosages.min() < 0 or dosages.max() > 2:
        raise InputValidationError("genotype dosages must be between 0 and 2")

    # Each individual carries two alleles, so half the mean dosage is the
    # effect-allele frequency.
    effect_freq = dosages.mean() / 2.0
    other_freq = 1.0 - effect_freq

    # Report whichever allele is the rarer one.
    if other_freq < effect_freq:
        return float(other_freq)
    return float(effect_freq)