# Source code for stats_misc.machine_learning.sklearn_utils

"""
A module with helper functions to support machine learning using sklearn.
"""
import pandas as pd
import numpy as np
import sklearn
from dataclasses import field
from typing import Any, Callable
from stats_misc.constants import (
    Error_MSG,
    NamesSklearnUtils as NamesSK,
)
from stats_misc.errors import (
    is_type,
    same_len,
    SklearnClass,
    EmptyError,
    assign_empty_default,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[docs]
def report(results: dict[str, Any], n_top: int = 3, verbose: bool = True,
           sort: str | None = None,
           ) -> pd.DataFrame:
    """
    Tabulates, and optionally prints, cross-validation results from a
    `sklearn` based hyper-parameter search. Adapted from `here <https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py>`_.
    
    Parameters
    ----------
    results : `dict`
        A dictionary with search results, e.g., from a
        `sklearn.model_selection._search.GridSearchCV.cv_results_` object.
        For printing, the dictionary should contain the keys
        `mean_test_score`, `std_test_score`, `params`, and the rank key
        named by `NamesSklearnUtils.REPORT_RANK`. The values may be any
        equal-length sequences (arrays or plain lists).
    n_top : `int`, default 3
        The top `n` ranks that should be printed - based on the rank column
        of `results`, so it automatically adjusts for criteria (`scores`)
        that are looking for a minimum, or maximum. All candidates tied at a
        printed rank are shown. Will be ignored if `verbose` is `False`.
    verbose : `bool`, default `True`
        If the top `n_top` ranks should be printed.
    sort : `str` or `None`, default `None`
        Either `NamesSklearnUtils.REPORT_SORT_ASC` or
        `NamesSklearnUtils.REPORT_SORT_DESC`, which is internally parsed to
        the boolean `ascending` argument of `pandas.DataFrame.sort_values`
        and applied to the rank column. With `None` the rows keep the order
        of `results`.
    
    Returns
    -------
    `pd.DataFrame`
        A table with one row per evaluated hyper-parameter set, built
        directly from `results`. For the exact meaning of the columns please
        refer to the documentation of the supplied `results` object.
    
    Raises
    ------
    ValueError
        If `sort` is a string other than the two recognised sort constants.
    """
    # check input
    is_type(results, dict, 'results')
    is_type(n_top, int, 'n_top')
    is_type(verbose, bool, 'verbose')
    is_type(sort, (type(None), str), 'sort')
    # create DataFrame; this is a fresh object so sorting it in place below
    # never touches the caller's `results`
    results_frame = pd.DataFrame.from_dict(results)
    if sort is not None:
        if sort == NamesSK.REPORT_SORT_ASC:
            sort_b = True
        elif sort == NamesSK.REPORT_SORT_DESC:
            sort_b = False
        else:
            raise ValueError(Error_MSG.INCORRECT_STRING_INPUT.format(
                'sort',[NamesSK.REPORT_SORT_ASC, NamesSK.REPORT_SORT_DESC]))
        # sort
        results_frame.sort_values(by=[NamesSK.REPORT_RANK], ascending=sort_b,
                                  inplace=True)
    # do we want to print a top `n`
    if verbose:
        # Coerce once to an array: comparing a plain list with an int gives a
        # single `False`, which would silently suppress all output.
        ranks = np.asarray(results[NamesSK.REPORT_RANK])
        for i in range(1, n_top + 1):
            # positions index into the unsorted `results`, not the frame
            for candidate in np.flatnonzero(ranks == i):
                print("Model with rank: {0}".format(i))
                print(
                    "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                        results["mean_test_score"][candidate],
                        results["std_test_score"][candidate],
                    )
                )
                print("Parameters: {0}".format(results["params"][candidate]))
                print("")
    # return stuff
    return results_frame


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TODO: add an option to use a weighted mean instead of the unweighted mean;
# the meta-analysis functions and the SD score information can be used for this.

[docs]
def tune_hyper_params(
    learner: Callable,
    searcher: Callable,
    X: np.ndarray,
    Y: np.ndarray,
    fixed_learner_dict: dict[str, Any] | None = None,
    searcher_params_dict: dict[str, Any] | None = None,
    fit_dict: dict[str, Any] | None = None,
    strata: np.ndarray | None = None,
    strata_ascending: bool = False,
    ) -> tuple[pd.DataFrame, SklearnClass]:
    """
    Will take a supervised learning algorithm `learner` and tune the
    algorithm's hyper-parameters using a `searcher`.
    
    Common sklearn searching algorithms include:
        * GridSearchCV, to perform an exhaustive search,
        * RandomizedSearchCV, which samples candidate parameters from supplied
            statistical distributions.
        Have a look at the sklearn `documentation <https://scikit-learn.org/stable/modules/grid_search.html>`_ for further options and discussion.
    
    When `strata` is supplied, the `searcher` will be applied to each
    individual stratum value. Per hyper-parameter set the performance metric
    will be averaged over the strata, the best set (with rank 1) will be
    selected, and the `learner` will be refitted on the full data with that
    set. The refit is run through the `searcher` with an empty parameter
    specification, so the returned `searcher` object holds a single (best)
    model; the entries of `searcher_params_dict` are not applied to this
    final refit.
    
    Parameters
    ----------
    learner : `callable`
        An sklearn supervised algorithm, such as `NuSVC` or
        `RandomForestClassifier`. Called as `learner(**fixed_learner_dict)`.
    searcher : `callable`
        A searching algorithm to tune the hyper-parameters of `learner`. The
        initialised learner is passed as its first positional argument.
    X : `np.ndarray`
        The design matrix containing all the features as columns and the
        observations/subjects as rows.
    Y : `np.ndarray`
        A matrix with the dependent variable(s), i.e., the labelled data.
    fixed_learner_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `learner`, specifying parameters
        which do not need to be tuned. The supplied dictionary is not
        modified.
    searcher_params_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `searcher` algorithm. When empty,
        an empty dictionary is passed to `searcher` as its second positional
        argument.
    fit_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `searcher.fit` callable.
    strata : `np.ndarray` or `None`, default `None`
        An optional vector (or r by 1 matrix, which is flattened) with
        discrete values to stratify the tuning algorithm on. Will perform
        separate tuning on each stratum and return the hyper-parameter set
        with the average (`mean`) optimised performance for the `learner`.
        Will be ignored if set to `None`.
    strata_ascending : `bool`, default `False`
        Whether the smallest (`True`) or largest (`False`) average metric
        should receive rank 1 and be used to identify the best model. For
        example, for c-statistic one would want the largest value, whereas
        for the mean squared error one would prefer the hyper-parameters
        with the smallest value.
    
    Returns
    -------
    `pd.DataFrame`
        A table of search results. Without `strata` this is the output of
        `report`; with `strata` it holds one column of mean scores per
        stratum value plus the parameter, average, and rank columns named by
        the `NamesSklearnUtils.SEARCH_*` constants.
    `SklearnClass`
        The searcher object with either a single best model (when strata is
        supplied), or alternatively all the models evaluated by the searcher.
    
    Raises
    ------
    EmptyError
        If, with `strata` supplied, no hyper-parameter set obtains rank 1,
        which happens when every average score is missing.
    
    Notes
    -----
    The per-stratum scores are combined by row position and the parameter
    sets are taken from the last stratum. NOTE(review): this presumes the
    `searcher` evaluates the same candidates in the same order for every
    stratum - confirm when using a randomised searcher.
    """
    # constants
    MEAN_COL = NamesSK.SEARCH_MEAN
    PARAMS   = NamesSK.SEARCH_PARAMS
    AVERAGE  = NamesSK.SEARCH_AVERAGE
    RANK     = NamesSK.SEARCH_RANK
    # check input
    is_type(Y, np.ndarray, 'Y')
    is_type(X, np.ndarray, 'X')
    is_type(searcher_params_dict, (type(None), dict), 'searcher_params_dict')
    is_type(fixed_learner_dict, (type(None), dict), 'fixed_learner_dict')
    is_type(fit_dict, (type(None), dict), 'fit_dict')
    is_type(strata, (type(None), np.ndarray), 'strata')
    is_type(strata_ascending, bool, 'strata_ascending')
    same_len(Y, X, ['Y', 'X'])
    if strata is not None:
        same_len(Y, strata, ['Y', 'strata'])
    # replace None by empty dicts
    searcher_params_dict, fixed_learner_dict, fit_dict = assign_empty_default(
        [searcher_params_dict, fixed_learner_dict, fit_dict],
        empty_object=dict,
    )
    # ##### start internal function
    def _internal(learner=learner, searcher=searcher, X=X, Y=Y,
                  searcher_params_dict=searcher_params_dict,
                  fixed_learner_dict=fixed_learner_dict,
                  fit_dict=fit_dict,
                  ) -> tuple[pd.DataFrame, SklearnClass]:
        """Initialise, fit, and report a single search on the given data."""
        # init learner
        learner_init = learner(**fixed_learner_dict)
        # The learner is assumed to be the first positional parameter of the
        # searcher. An empty parameter dict is passed positionally, so the
        # searcher still receives its (empty) parameter specification.
        if len(searcher_params_dict) != 0:
            searcher_init = searcher(learner_init, **searcher_params_dict)
        else:
            searcher_init = searcher(learner_init, searcher_params_dict)
        # fit the model
        searcher_init.fit(X=X, y=Y, **fit_dict)
        # extract report, unsorted so rows stay in candidate order
        results = report(
            searcher_init.cv_results_, verbose=False, sort=None,
        )
        return results, searcher_init
    # ##### start main function
    if strata is None:
        return _internal()
    # Flatten so an r by 1 `strata` yields a 1-D mask that can index the rows
    # of `X` and `Y`; a 1-D `strata` is unaffected.
    strata_flat = np.ravel(strata)
    S_uniq = np.unique(strata_flat)
    # loop over unique strata, keeping the mean score per candidate
    S_dict = {}
    for S in S_uniq:
        in_stratum = strata_flat == S
        results_S, _ = _internal(X=X[in_stratum], Y=Y[in_stratum])
        S_dict[S] = results_S[MEAN_COL]
    # get average performance, NOTE implement a weighted mean.
    # The mean is taken while the frame holds only the per-stratum columns,
    # which avoids selecting columns by label (boolean stratum labels would
    # otherwise be interpreted as a row mask).
    M_average = pd.DataFrame.from_dict(S_dict)
    average = M_average.mean(axis=1, skipna=False)
    M_average[PARAMS] = results_S[PARAMS]
    M_average[AVERAGE] = average
    # `method='min'` gives every candidate tied for best rank 1; the default
    # 'average' would assign e.g. 1.5 and leave no rank-1 row at all.
    M_average[RANK] = M_average[AVERAGE].rank(
        method='min', ascending=strata_ascending)
    # get best params
    best = M_average[M_average[RANK] == 1]
    if best.empty:
        raise EmptyError('`best` is empty. Most likely one or more values '
                         'of strata do not occur frequently enough. '
                         'Potentially, you would be better to simply '
                         'remove these values.')
    # on ties simply take the first; could in future consider the average SD
    best_params = best[PARAMS].iloc[0]
    # ##### refit model
    # build a new dict so the caller's `fixed_learner_dict` is left untouched
    refit_learner_dict = {**fixed_learner_dict, **best_params}
    _, searcher_init = _internal(fixed_learner_dict=refit_learner_dict,
                                 searcher_params_dict={},
                                 )
    # return
    return M_average, searcher_init