Source code for stats_misc.machine_learning.sklearn_utils

"""
A module with helper functions to support machine learning using sklearn.
"""
import pandas as pd
import numpy as np
import sklearn
from dataclasses import field
from typing import Any, Callable
from stats_misc.constants import (
    Error_MSG,
    NamesSklearnUtils as NamesSK,
)
from stats_misc.errors import (
    is_type,
    same_len,
    SklearnClass,
    EmptyError,
    assign_empty_default,
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def report(results: dict[str, Any], n_top: int = 3, verbose: bool = True,
           sort: str | None = None,
           ) -> pd.DataFrame:
    """
    Prints and extracts cross-validation results from a `sklearn` based
    hyper-parameter search.

    Copied from
    `here <https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py>`_.

    Parameters
    ----------
    results : `dict`
        A dictionary with grid search results, e.g., from a
        `sklearn.model_selection._search.GridSearchCV.cv_results_` object.
        The dictionary should contain the keys `rank_test_score`,
        `mean_test_score`, `std_test_score`, and `params`.
    n_top : `int`, default 3
        The top `n` results that should be printed - based on the
        `rank_test_score` column of `results`, so it automatically adjusts
        for criteria (`scores`) that look for a minimum, or a maximum.
        Will be ignored if `verbose` is `False`.
    verbose : `bool`, default `True`
        If something should be printed.
    sort : `str` or `None`, default `None`
        Either `NamesSklearnUtils.REPORT_SORT_ASC` or
        `NamesSklearnUtils.REPORT_SORT_DESC`; internally parsed to the
        boolean `ascending` argument of `pandas.DataFrame.sort_values`,
        applied to the rank column. With `None` the rows keep the order of
        `results`.

    Returns
    -------
    `pd.DataFrame`
        A table built from `results` (optionally sorted on rank) with
        information on the used hyper-parameters, the run time and the
        performance. For the exact meaning please refer to the
        documentation of the supplied `results` object.

    Raises
    ------
    ValueError
        If `sort` is a string other than the two accepted values.
    """
    # check input
    is_type(results, dict, 'results')
    is_type(n_top, int, 'n_top')
    is_type(verbose, bool, 'verbose')
    is_type(sort, (type(None), str), 'sort')
    # create DataFrame
    results_frame = pd.DataFrame.from_dict(results)
    if sort is not None:
        if sort == NamesSK.REPORT_SORT_ASC:
            ascending = True
        elif sort == NamesSK.REPORT_SORT_DESC:
            ascending = False
        else:
            raise ValueError(Error_MSG.INCORRECT_STRING_INPUT.format(
                'sort', [NamesSK.REPORT_SORT_ASC, NamesSK.REPORT_SORT_DESC]))
        # sort on rank; kept inside this branch so `ascending` is always
        # defined when it is used
        results_frame.sort_values(by=[NamesSK.REPORT_RANK],
                                  ascending=ascending, inplace=True)
    # do we want to print a top `n`
    if verbose:
        # Coerce to an array: with a plain list `ranks == rank` would be a
        # single `False` and no candidate would ever be printed.
        ranks = np.asarray(results[NamesSK.REPORT_RANK])
        for rank in range(1, n_top + 1):
            # several candidates can share the same rank
            for candidate in np.flatnonzero(ranks == rank):
                print("Model with rank: {0}".format(rank))
                print(
                    "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                        results["mean_test_score"][candidate],
                        results["std_test_score"][candidate],
                    )
                )
                print("Parameters: {0}".format(results["params"][candidate]))
                print("")
    # return stuff
    return results_frame
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TODO add an option to use the weighted mean instead of the unweighted mean;
# the meta-analysis functions and the SD score info can be used for this.
def tune_hyper_params(
        learner: Callable, searcher: Callable,
        X: np.ndarray, Y: np.ndarray,
        fixed_learner_dict: dict[str, Any] | None = None,
        searcher_params_dict: dict[str, Any] | None = None,
        fit_dict: dict[str, Any] | None = None,
        strata: np.ndarray | None = None,
        strata_ascending: bool = False,
) -> tuple[pd.DataFrame, SklearnClass]:
    """
    Will take a supervised learning algorithm `learner` and tune the
    algorithm's hyper-parameters using a `searcher`.

    Common sklearn searching algorithms include:

    * GridSearchCV, to perform an exhaustive search,
    * RandomizedSearchCV, which samples candidate parameters from supplied
      statistical distributions.

    Have a look at the sklearn
    `documentation <https://scikit-learn.org/stable/modules/grid_search.html>`_
    for further options and discussion.

    When `strata` is supplied, the `searcher` will be applied to each
    individual stratum value. Per hyper-parameter set the performance metric
    will be averaged, and internally the best set (with rank 1) will be
    selected and returned as a `searcher` object with a single (best) model.

    Parameters
    ----------
    learner : `callable`
        An sklearn supervised algorithm, such as `NuSVC` or
        `RandomForestClassifier`.
    searcher : `callable`
        A searching algorithm to tune the hyper-parameters of `learner`. It
        is called with the initialised learner as its first positional
        argument.
    X : `np.ndarray`
        The design matrix containing all the features as columns and the
        observations/subjects as rows.
    Y : `np.ndarray`
        A matrix with the dependent variable(s), i.e., the labelled data.
    fixed_learner_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `learner`, specifying parameters
        which do not need to be tuned. The supplied dictionary is not
        modified.
    searcher_params_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `searcher` algorithm.
    fit_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `searcher.fit` callable.
    strata : `np.ndarray` or `None`, default `None`
        An optional r by 1 matrix (or a vector of length r) with discrete
        values to stratify the tuning algorithm on. Will perform separate
        tuning on each stratum and return the hyper-parameter set with the
        average (`mean`) optimised performance for the `learner`. Will be
        ignored if set to `None`.
    strata_ascending : `bool`, default `False`
        Whether the largest or smallest optimisation metric should be used
        to identify the best model. For example, for the c-statistic one
        would want the largest value (`False`), whereas for the mean squared
        error one would prefer the hyper-parameters with the smallest value
        (`True`).

    Returns
    -------
    `pd.DataFrame`
        A table of search results. When `strata` is supplied this is the
        table with one score column per stratum, the hyper-parameters, the
        average score and its rank.
    `SklearnClass`
        The searcher object with either a single best model (when strata is
        supplied), or alternatively all the models evaluated by the
        searcher.

    Raises
    ------
    EmptyError
        If no hyper-parameter set obtains rank 1, e.g., because the average
        score is missing for every set.

    Notes
    -----
    The per-stratum scores are combined row by row, and the
    hyper-parameters are taken from the last stratum. NOTE(review): this
    presumably requires a searcher which evaluates the same candidates in
    the same order for every stratum - confirm for randomised searchers.
    """
    # constants
    MEAN_COL = NamesSK.SEARCH_MEAN
    PARAMS = NamesSK.SEARCH_PARAMS
    AVERAGE = NamesSK.SEARCH_AVERAGE
    RANK = NamesSK.SEARCH_RANK
    # check input
    is_type(Y, np.ndarray, 'Y')
    is_type(X, np.ndarray, 'X')
    is_type(searcher_params_dict, (type(None), dict), 'searcher_params_dict')
    is_type(fixed_learner_dict, (type(None), dict), 'fixed_learner_dict')
    is_type(fit_dict, (type(None), dict), 'fit_dict')
    is_type(strata, (type(None), np.ndarray), 'strata')
    is_type(strata_ascending, bool, 'strata_ascending')
    same_len(Y, X, ['Y', 'X'])
    if strata is not None:
        same_len(Y, strata, ['Y', 'strata'])
    # replace None by empty dicts
    searcher_params_dict, fixed_learner_dict, fit_dict = assign_empty_default(
        [searcher_params_dict, fixed_learner_dict, fit_dict],
        empty_object=dict,
    )

    # ##### start internal function
    def _search(X_fit: np.ndarray, Y_fit: np.ndarray,
                learner_kwargs: dict[str, Any],
                searcher_kwargs: dict[str, Any],
                ) -> tuple[pd.DataFrame, SklearnClass]:
        """Initialise the learner, run the searcher and report its results."""
        learner_init = learner(**learner_kwargs)
        # the learner is assumed to be the first parameter of the searcher
        if searcher_kwargs:
            searcher_init = searcher(learner_init, **searcher_kwargs)
        else:
            # without keyword arguments the empty dict itself is supplied as
            # the second positional argument of the searcher
            searcher_init = searcher(learner_init, searcher_kwargs)
        # fit the model
        searcher_init.fit(X=X_fit, y=Y_fit, **fit_dict)
        # extract report, keeping the original candidate order
        results = report(searcher_init.cv_results_, verbose=False, sort=None)
        return results, searcher_init

    # ##### start main function
    if strata is None:
        return _search(X, Y, fixed_learner_dict, searcher_params_dict)
    # Flatten so that an r by 1 matrix yields a one-dimensional row mask.
    strata_flat = np.ravel(strata)
    S_uniq = np.unique(strata_flat)
    # loop over unique strata
    S_dict = {}
    params = None
    for S in S_uniq:
        in_stratum = strata_flat == S
        # get CV results
        results_S, _ = _search(
            X[in_stratum], Y[in_stratum],
            fixed_learner_dict, searcher_params_dict,
        )
        # store mean results
        S_dict[S] = results_S[MEAN_COL]
        params = results_S[PARAMS]
    # get average performance, NOTE implement a weighted mean.
    # The average is taken before the parameter column is added, so only
    # the per-stratum score columns contribute and no column selection on
    # the strata values is needed.
    M_average = pd.DataFrame.from_dict(S_dict)
    average = M_average.mean(axis=1, skipna=False)
    M_average[PARAMS] = params
    M_average[AVERAGE] = average
    # `method='min'` gives tied sets the same (lowest) rank, so a tie for the
    # best average still results in rank 1 instead of e.g. 1.5.
    M_average[RANK] = M_average[AVERAGE].rank(
        method='min', ascending=strata_ascending)
    # get best params
    best = M_average[M_average[RANK] == 1]
    if best.empty:
        raise EmptyError('`best` is empty. Most likely one or more values '
                         'of strata do not occur frequently enough. '
                         'Potentially, you would be better to simply '
                         'remove these values.')
    # with ties simply take the first, could in future consider the average
    # SD: (sqrt(SD^2 + SD^2))
    best_params = best[PARAMS].iloc[0]
    # ##### refit model
    # merge into a new dict so the caller's `fixed_learner_dict` is untouched
    refit_learner_dict = {**fixed_learner_dict, **best_params}
    _, searcher_init = _search(X, Y, refit_learner_dict, {})
    # return
    return M_average, searcher_init