"""
A module with helper functions to support machine learning using sklearn.
"""
import pandas as pd
import numpy as np
import sklearn
from dataclasses import field
from typing import Any, Callable
from stats_misc.constants import (
Error_MSG,
NamesSklearnUtils as NamesSK,
)
from stats_misc.errors import (
is_type,
same_len,
SklearnClass,
EmptyError,
assign_empty_default,
)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[docs]
def report(results: dict[str, Any], n_top: int = 3, verbose: bool = True,
           sort: str | None = None,
           ) -> pd.DataFrame:
    """
    Prints and extracts cross-validation results from a `sklearn` based
    hyper-parameter search. Copied from `here <https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py>`_.

    Parameters
    ----------
    results : `dict`
        A dictionary with grid search results, e.g., from a
        `sklearn.model_selection._search.GridSearchCV.cv_results_` object.
        The dictionary should contain the keys `rank_test_score`,
        `mean_test_score`, `std_test_score`, and `params`.
    n_top : `int`, default 3
        The top `n` results that should be printed - based on the
        `rank_test_score` column of `results`, so it automatically adjusts
        for criteria (`scores`) that are looking for a minimum, or a
        maximum. Will be ignored if `verbose` is `False`.
    verbose : `bool`, default `True`
        If something should be printed.
    sort : `str` or `None`, default `None`
        Set to ascending or descending, which will internally be parsed as
        a boolean argument to `ascending` in pandas.DataFrame.sort_values.
        The table is sorted on the rank column. When `None` the table is
        returned in the order of `results`.

    Returns
    -------
    `pd.DataFrame`
        A ranked table with information on the used hyper-parameters, the
        run time and the performance. For the exact meaning please refer to
        the documentation of the supplied `results` object.

    Raises
    ------
    ValueError
        If `sort` is a string other than the two accepted values.
    """
    # check input
    is_type(results, dict, 'results')
    is_type(n_top, int, 'n_top')
    is_type(verbose, bool, 'verbose')
    is_type(sort, (type(None), str), 'sort')
    # create DataFrame
    results_frame = pd.DataFrame.from_dict(results)
    if sort is not None:
        # map the string onto the boolean expected by `sort_values`
        if sort == NamesSK.REPORT_SORT_ASC:
            ascending = True
        elif sort == NamesSK.REPORT_SORT_DESC:
            ascending = False
        else:
            raise ValueError(Error_MSG.INCORRECT_STRING_INPUT.format(
                'sort', [NamesSK.REPORT_SORT_ASC, NamesSK.REPORT_SORT_DESC]))
        # sorting only happens when explicitly requested
        results_frame.sort_values(
            by=[NamesSK.REPORT_RANK], ascending=ascending, inplace=True,
        )
    # do we want to print a top `n`
    if verbose:
        # Coerce to an array so the element-wise comparison below also works
        # when the rank column is a plain list; comparing a list to an int
        # yields a single `False` and would silently print nothing.
        ranks = np.asarray(results[NamesSK.REPORT_RANK])
        for rank in range(1, n_top + 1):
            # tied candidates share a rank, hence the inner loop
            for candidate in np.flatnonzero(ranks == rank):
                print(f"Model with rank: {rank}")
                print(
                    "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                        results["mean_test_score"][candidate],
                        results["std_test_score"][candidate],
                    )
                )
                print("Parameters: {0}".format(results["params"][candidate]))
                print("")
    # return stuff
    return results_frame
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TODO add an option to use the weighted mean instead of the unweighted mean;
# the meta-analysis functions and the SD score info can be used for this.
[docs]
def tune_hyper_params(
        learner: Callable,
        searcher: Callable,
        X: np.ndarray,
        Y: np.ndarray,
        fixed_learner_dict: dict[str, Any] | None = None,
        searcher_params_dict: dict[str, Any] | None = None,
        fit_dict: dict[str, Any] | None = None,
        strata: np.ndarray | None = None,
        strata_ascending: bool = False,
) -> tuple[pd.DataFrame, SklearnClass]:
    """
    Will take a supervised learning algorithm `learner` and tune the
    algorithm's hyper-parameters using a `searcher`.

    Common sklearn searching algorithms include:

    * GridSearchCV, to perform an exhaustive search,
    * RandomizedSearchCV, which samples candidate parameters from supplied
      statistical distributions.

    Have a look at the sklearn `documentation <https://scikit-learn.org/stable/modules/grid_search.html>`_ for further options and discussion.

    When `strata` is supplied, the `searcher` will be applied to each
    individual stratum value. Per hyper-parameter set the performance metric
    will be averaged, and internally the best set (with rank 1) will be
    selected and returned as a `searcher` object with a single (best) model.

    Parameters
    ----------
    learner : `callable`
        An sklearn supervised learning algorithm, such as `NuSVC` or
        `RandomForestClassifier`. It is called as
        `learner(**fixed_learner_dict)`.
    searcher : `callable`
        A searching algorithm to tune the hyper-parameters of `learner`.
        The initialised learner is passed as the first positional argument.
    X : `np.ndarray`
        The design matrix containing all the features as columns and the
        observations/subjects as rows.
    Y : `np.ndarray`
        A matrix with the dependent variable(s), i.e., the labelled data.
    fixed_learner_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `learner`, specifying parameters
        which do not need to be tuned. The supplied dictionary is not
        modified.
    searcher_params_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `searcher` algorithm. When empty
        or `None`, an empty dictionary is passed to `searcher` as its second
        positional argument instead.
    fit_dict : `dict` or `None`, default `None`
        Optional keyword arguments for the `searcher.fit` callable.
    strata : `np.ndarray` or `None`, default `None`
        An optional r by 1 matrix (or a vector of length r) with discrete
        values to stratify the tuning algorithm on. Will perform separate
        tuning on each stratum and return the hyper-parameter set with the
        average (`mean`) optimised performance for the `learner`. Will be
        ignored if set to `None`.
    strata_ascending : `bool`, default `False`
        Whether the largest or smallest optimisation metric should be used
        to identify the best model. For example, for c-statistic one would
        want the largest value, whereas for the mean squared error one would
        prefer the hyper-parameters with the smallest value.

    Returns
    -------
    `pd.DataFrame`
        A table of search results. Without `strata` this is the table
        returned by `report`. With `strata` it holds one column of mean
        scores per stratum value, the hyper-parameter sets, the
        across-strata average and its rank.
    `SklearnClass`
        The searcher object with either a single best model (when strata is
        supplied), or alternatively all the models evaluated by the
        searcher.

    Raises
    ------
    EmptyError
        If `strata` is supplied and no hyper-parameter set attains rank 1,
        for example because the average score is missing for every set.

    Notes
    -----
    With `strata`, the scores of the different strata are combined by row
    position and labelled with the hyper-parameter sets of the last stratum.
    The `searcher` should therefore evaluate the same candidates, in the
    same order, for every stratum.

    Sets that tie on the average score share the lowest rank; when several
    sets tie for rank 1 the first one is used for the refit.
    """
    # constants
    MEAN_COL = NamesSK.SEARCH_MEAN
    PARAMS = NamesSK.SEARCH_PARAMS
    AVERAGE = NamesSK.SEARCH_AVERAGE
    RANK = NamesSK.SEARCH_RANK
    # check input
    is_type(Y, np.ndarray, 'Y')
    is_type(X, np.ndarray, 'X')
    is_type(searcher_params_dict, (type(None), dict), 'searcher_params_dict')
    is_type(fixed_learner_dict, (type(None), dict), 'fixed_learner_dict')
    is_type(fit_dict, (type(None), dict), 'fit_dict')
    is_type(strata, (type(None), np.ndarray), 'strata')
    is_type(strata_ascending, bool, 'strata_ascending')
    same_len(Y, X, ['Y', 'X'])
    if strata is not None:
        same_len(Y, strata, ['Y', 'strata'])
    # replace None by empty dicts
    searcher_params_dict, fixed_learner_dict, fit_dict = assign_empty_default(
        [searcher_params_dict, fixed_learner_dict, fit_dict],
        empty_object=dict,
    )

    # ##### start internal function
    def _internal(X=X, Y=Y,
                  searcher_params_dict=searcher_params_dict,
                  fixed_learner_dict=fixed_learner_dict,
                  ) -> tuple[pd.DataFrame, SklearnClass]:
        """Initialise the learner and searcher, fit, and tabulate results."""
        # init learner
        learner_init = learner(**fixed_learner_dict)
        # The learner is assumed to always be the first positional parameter
        # of the searcher.
        if searcher_params_dict:
            searcher_init = searcher(learner_init, **searcher_params_dict)
        else:
            # Pass the empty dict positionally; presumably this fills a
            # required grid argument (e.g. `param_grid` of GridSearchCV) so
            # only the fixed learner settings are evaluated -- TODO confirm
            # for other searchers.
            searcher_init = searcher(learner_init, searcher_params_dict)
        # fit the model
        searcher_init.fit(X=X, y=Y, **fit_dict)
        # extract report
        results = report(
            searcher_init.cv_results_, verbose=False, sort=None,
        )
        return results, searcher_init

    # ##### start main function
    if strata is None:
        return _internal()
    # Flatten so an r by 1 matrix yields a 1-D row mask; a 2-D boolean mask
    # would not match the shape of `X` or `Y`.
    strata_flat = np.ravel(strata)
    # loop over unique strata
    S_dict = {}
    S_uniq = np.unique(strata_flat)
    for S in S_uniq:
        in_stratum = strata_flat == S
        # get CV results
        results_S, _ = _internal(X=X[in_stratum], Y=Y[in_stratum])
        # store mean results
        S_dict[S] = results_S[MEAN_COL]
    # label the rows with the parameter sets of the last stratum
    S_dict[PARAMS] = results_S[PARAMS]
    # get average performance, NOTE implement a weighted mean
    M_average = pd.DataFrame.from_dict(S_dict)
    # skipna=False: a set lacking a score in any stratum gets no average
    M_average[AVERAGE] = M_average[S_uniq].mean(axis=1, skipna=False)
    # method='min' lets tied sets share the lowest rank, so ties for the best
    # average still produce rank 1 instead of a fractional rank.
    M_average[RANK] = M_average[AVERAGE].rank(
        ascending=strata_ascending, method='min',
    )
    # get best params
    best = M_average[M_average[RANK] == 1]
    if best.empty:
        raise EmptyError('`best` is empty. Most likely one or more values '
                         'of strata do not occur frequently enough. '
                         'Potentially, you would be better to simply '
                         'remove these values.')
    # Simply take the first of any tied sets; could in future consider the
    # average SD: (sqrt(SD^2 + SD^2)).
    best_params = best.iloc[0][PARAMS]
    # ##### refit model
    # Merge into a new dict so the caller's `fixed_learner_dict` is left
    # untouched.
    refit_learner_dict = {**fixed_learner_dict, **best_params}
    # now actually refit the model on all data with the best parameters
    _, searcher_init = _internal(
        fixed_learner_dict=refit_learner_dict,
        searcher_params_dict={},
    )
    # return
    return M_average, searcher_init