Source code for spare_scores.spare

import logging
from typing import Any, Tuple, Union

import numpy as np
import pandas as pd

from .classes import MetaData, SpareModel
from .data_prep import (
    check_test,
    check_train,
    convert_cat_variables,
    logging_basic_config,
)
from .util import (
    check_file_exists,
    is_unique_identifier,
    load_df,
    load_model,
    save_file,
)


def spare_train(
    df: Union[pd.DataFrame, str],
    to_predict: str,
    model_type: str = "SVM",
    pos_group: str = "",
    key_var: str = "",
    data_vars: list = [],
    ignore_vars: list = [],
    kernel: str = "linear",
    output: str = "",
    verbose: int = 1,
    logs: str = "",
    **kwargs: Any,
) -> dict:
    """
    Trains a SPARE model, either classification or regression

    :param df: either a pandas dataframe or a path to a saved csv
        containing training data.
    :type df: pandas.DataFrame
    :param to_predict: variable to predict. Binary for classification and
        continuous for regression. Must be one of the columns in df.
    :type to_predict: str
    :param model_type: name of the model to train; forwarded to SpareModel.
        If SpareModel raises NotImplementedError for it, an error result is
        returned.
    :type model_type: str
    :param pos_group: group to assign a positive SPARE score (only for
        classification).
    :type pos_group: str
    :param key_var: The key variable to be used for training. If not given,
        the first column of the dataset is considered the primary key of
        the dataset.
    :type key_var: str
    :param data_vars: a list of predictors for the training. All must be
        present in columns of df. If empty, every column except key_var,
        to_predict and ignore_vars is used, in dataframe column order.
    :type data_vars: list
    :param ignore_vars: The list of predictors to be ignored for training.
        Can be a list, or empty. Only consulted when data_vars is empty.
    :type ignore_vars: list
    :param kernel: 'linear' or 'rbf' (only linear is supported currently in
        regression).
    :type kernel: str
    :param output: path to save the trained model. '.pkl.gz' file extension
        optional. If None is given, no model will be saved.
    :type output: str
    :param verbose: Verbosity. Int, higher is more verbose. [0,1,2]
    :type verbose: int
    :param logs: Where to save log file. If not given, logs will only be
        printed out.
    :type logs: str
    :param kwargs: extra keyword arguments forwarded unchanged to SpareModel.

    :return: A dictionary with three keys, 'status_code', 'status' and
        'data'. 'status' is either 'OK' or the error (message or exception
        object). 'data' is a tuple ``(model, meta_data_dict)`` if
        successful, or None if unsuccessful. 'status_code' is either 0, 1
        or 2. 0 is success, 1 is warning, 2 is error.
    :rtype: dict
    """
    # NOTE: the list defaults in the signature are kept for interface
    # compatibility; they are only read or rebound here, never mutated.

    # Start from None placeholders so every early-return path reports
    # data=None, as documented, rather than leaking a typing object.
    res: dict = {"status_code": None, "status": None, "data": None}

    logger = logging_basic_config(verbose=verbose, filename=logs)

    # Make sure that no overwrites happen. The check is evaluated once and
    # its result reused as the status.
    file_exists_msg = check_file_exists(output, logger)
    if file_exists_msg:
        res["status"] = file_exists_msg
        res["status_code"] = 2
        return res

    # Load the data
    df = load_df(df)

    # Assume key_variable (if not given)
    if key_var == "" or key_var is None:
        key_var = df.columns[0]
        if not is_unique_identifier(df, [key_var]):
            logger.info(
                "Assumed primary key is not capable of uniquely "
                + "identifying each row of the dataset. Assumed pkey: "
                + key_var
            )

    # Assume predictors (if not given)
    if data_vars == [] or data_vars is None:
        # Predictors = all_vars - key_var - ignore_vars - to_predict.
        # Dataframe column order is kept (duplicates dropped) so the
        # inferred predictor list is identical from run to run.
        excluded = {key_var, to_predict}
        if ignore_vars is not None:
            excluded.update(ignore_vars)
        data_vars = [col for col in dict.fromkeys(df.columns) if col not in excluded]
    predictors = data_vars

    # Check if it contains any errors.
    try:
        df, predictors, mdl_task = check_train(  # type: ignore
            df, predictors, to_predict, verbose, pos_group
        )
    except Exception as e:
        err = "Dataset check failed before training was initiated."
        logger.error(err)
        print(e)
        res["status"] = err
        res["status_code"] = 2
        return res

    # Create meta data
    meta_data = MetaData(model_type, mdl_task, kernel, predictors, to_predict, key_var)
    meta_data.key_var = key_var

    # Convert categorical variables. A two-valued target that is not
    # already coded as 0/1 is recoded so that pos_group becomes 1.
    target_values = set(df[to_predict].value_counts().keys())
    if len(target_values) == 2 and target_values != {0, 1}:
        df[to_predict] = df[to_predict].apply(lambda x: 1 if x == pos_group else 0)

    try:
        df, meta_data = convert_cat_variables(df, predictors + [to_predict], meta_data)
    except ValueError:
        err = (
            "Categorical variables could not be converted, because "
            + "they were not binary."
        )
        logger.error(err)
        res["status"] = err
        res["status_code"] = 2
        return res

    # Create the model. The caller's verbosity is forwarded, matching what
    # spare_test does when it rebuilds the model.
    try:
        spare_model = SpareModel(
            model_type,
            predictors,
            to_predict,
            key_var,
            verbose=verbose,
            parameters={
                "kernel": kernel,
                "k": 5,
                "n_repeats": 1,
                "task": mdl_task,
                "param_grid": None,
            },
            **kwargs,
        )
    except NotImplementedError:
        err = "SPARE model " + model_type + " is not implemented yet."
        logger.error(err)
        res["status"] = err
        res["status_code"] = 2
        return res
    except ValueError as e:
        logger.error(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Train the model
    try:
        trained = spare_model.train_model(df, pos_group=pos_group)
    except Exception as e:
        logger.critical(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Save the results
    if trained is None:
        err = "No training output was produced."
        logger.critical(err)
        res["status"] = err
        res["status_code"] = 2
        return res

    df["predicted"] = trained["predicted"]
    model = trained["model"]
    meta_data.params = trained["best_params"]
    meta_data.stats = trained["stats"]
    meta_data.cv_folds = trained["CV_folds"]
    meta_data.scaler = trained["scaler"] if "scaler" in trained.keys() else None
    # dict.fromkeys drops a repeated column name (e.g. key_var equal to
    # to_predict) while keeping the column order.
    meta_data.cv_results = df[list(dict.fromkeys([key_var, to_predict, "predicted"]))]

    result = model, vars(meta_data)

    # Save model
    if output != "" and output is not None:
        save_file(result, output, "train", logger)

    res["status"] = "OK"
    res["data"] = result
    res["status_code"] = 0
    return res
def spare_test(
    df: Union[pd.DataFrame, str],
    mdl_path: Union[str, Tuple[dict, dict]],
    key_var: str = "",
    output: str = "",
    spare_var: str = "SPARE_score",
    verbose: int = 1,
    logs: str = "",
) -> dict:
    """
    Applies a trained SPARE model on a test dataset

    :param df: either a pandas dataframe or a path to a saved csv
        containing the test sample.
    :type df: pandas.DataFrame
    :param mdl_path: either a path to a saved SPARE model ('.pkl.gz' file
        extension expected) or a tuple of SPARE model and meta_data.
    :type mdl_path: str
    :param key_var: The key variable identifying each row; it is copied
        into the output next to the scores. If not given, the first column
        of the dataset is considered the primary key of the dataset.
    :type key_var: str
    :param output: path to save the calculated scores. '.csv' file
        extension optional. If None is given, no data will be saved.
    :type output: str
    :param spare_var: The name of the variable to be predicted. If not
        given, the name 'SPARE_score' will be used.
    :type spare_var: str
    :param verbose: Verbosity. Int, higher is more verbose. [0,1,2]
    :type verbose: int
    :param logs: Where to save log file. If not given, logs will only be
        printed out.
    :type logs: str

    :return: A dictionary with three keys, 'status_code', 'status' and
        'data'. 'status' is either 'OK' or the error (message or exception
        object). 'data' is the pandas dataframe containing predicted SPARE
        scores if successful; for warnings it holds the offending columns
        or values; otherwise it is None. 'status_code' is either 0, 1 or
        2. 0 is success, 1 is warning, 2 is error.
    :rtype: dict
    """
    # Start from None placeholders so every early-return path reports
    # data=None, as documented, rather than leaking a typing object.
    res: dict = {"status_code": None, "status": None, "data": None}

    logger = logging_basic_config(verbose=verbose, filename=logs)

    # Make sure that no overwrites happen. The check is evaluated once and
    # its result reused as the status.
    file_exists_msg = check_file_exists(output, logger)
    if file_exists_msg:
        res["status"] = file_exists_msg
        res["status_code"] = 2
        return res

    df = load_df(df)

    # Load & check for errors / compatibility the trained SPARE model
    mdl, meta_data = load_model(mdl_path) if isinstance(mdl_path, str) else mdl_path

    try:
        check, cols = check_test(df, meta_data)
    except Exception as e:
        logger.error(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # A non-empty `cols` is reported as a warning together with the message
    # returned by check_test.
    if cols is not None and cols != []:
        print(check)
        logger.error(check)
        res["status"] = check
        res["data"] = cols
        res["status_code"] = 1
        return res

    # Assume key_variable (if not given)
    # NOTE(review): the key stored in the model's meta_data is not
    # consulted here; only the first dataframe column is used as fallback.
    if key_var == "" or key_var is None:
        key_var = df.columns[0]
        if not is_unique_identifier(df, [key_var]):
            logger.info(
                "Assumed primary key(s) are not capable of uniquely "
                + "identifying each row of the dataset. Assumed "
                + "primary key(s) are: "
                + key_var
            )

    # Convert categorical variables
    for var, map_dict in meta_data.get("categorical_var_map", {}).items():
        if not isinstance(map_dict, dict):
            continue
        if df[var].isin(map_dict.keys()).any():
            df[var] = df[var].map(map_dict)
        else:
            # None of the observed values is a known category.
            expected_var = list(map_dict.keys())
            err = (
                f'Column "{var}" expected {expected_var}, but '
                + f"received {list(df[var].unique())}"
            )
            logger.error(err)
            res["status"] = err
            res["data"] = list(df[var].unique())
            res["status_code"] = 1
            return res

    # TODO: Output model description
    n = len(meta_data["cv_results"].index)
    if "Age" in meta_data["cv_results"].keys():
        a1 = int(np.floor(np.min((meta_data["cv_results"]["Age"]))))
        a2 = int(np.ceil(np.max((meta_data["cv_results"]["Age"]))))
    else:
        a1 = None
        a2 = None
    # Only the first stored metric is summarised in the log line.
    stats_metric = list(meta_data["stats"].keys())[0]
    stats = "{:.3f}".format(np.mean(meta_data["stats"][stats_metric]))
    logger.info(
        f"Model Info: training N = {n} / ages = {a1} - {a2} / "
        + f"expected {stats_metric} = {stats}"
    )

    # Figure out model type and task:
    if "mdl_task" not in meta_data.keys():
        # Backwards compatibility: older meta data carries the task inside
        # the mdl_type string.
        model_task = (
            "Classification"
            if "Classification" in meta_data["mdl_type"]
            else "Regression"
        )
        if "SVM" in meta_data["mdl_type"]:
            model_type = "SVM"
        elif "MLPTorch" in meta_data["mdl_type"]:
            # Must be tested before "MLP", which is a substring of it.
            model_type = "MLPTorch"
        elif "MLP" in meta_data["mdl_type"]:
            model_type = "MLP"
        else:
            model_type = "MLPTorch"
    else:
        model_task = meta_data["mdl_task"]
        model_type = meta_data["mdl_type"]

    # Create model instance based on saved model:
    predictors = meta_data["predictors"]
    target = meta_data["to_predict"]
    params = meta_data["params"]
    spare_model = SpareModel(model_type, predictors, target, key_var, verbose=verbose)

    # Set the model attributes to the ones that were saved to the instance
    # during training:
    try:
        spare_model.set_parameters(**params)
        # The explicit "mdl"/"task" entries take precedence: same-named
        # keys in meta_data are filtered out of the merged mapping.
        spare_model.set_parameters(
            **{
                "mdl": mdl,
                "task": model_task,
                **{
                    key: meta_data[key]
                    for key in meta_data.keys()
                    if key not in ["mdl", "task"]
                },
            }
        )
    except Exception as e:
        logger.critical(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Predict
    try:
        predicted = spare_model.apply_model(df)
    except Exception as e:
        logger.critical(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Save the results
    if predicted is None:
        err = "No testing output was produced."
        logger.critical(err)
        res["status"] = err
        res["status_code"] = 2
        return res

    d = {}
    d[key_var] = df[key_var]
    d[spare_var] = predicted
    out_df = pd.DataFrame(data=d)

    if output != "" and output is not None:
        save_file(out_df, output, "test", logger)

    res["status"] = "OK"
    res["data"] = out_df
    res["status_code"] = 0
    return res