Source code for spare_scores.spare

import logging
from typing import Any, Tuple, Union

import numpy as np
import pandas as pd

from .classes import MetaData, SpareModel
from .data_prep import (
    check_test,
    check_train,
    convert_cat_variables,
    logging_basic_config,
)
from .util import (
    check_file_exists,
    is_unique_identifier,
    load_df,
    load_model,
    save_file,
)



[docs]
def spare_train(
    df: Union[pd.DataFrame, str],
    to_predict: str,
    model_type: str = "SVM",
    pos_group: str = "",
    key_var: str = "",
    data_vars: list = [],
    ignore_vars: list = [],
    kernel: str = "linear",
    output: str = "",
    verbose: int = 1,
    logs: str = "",
    **kwargs: Any,
) -> dict:
    """
    Trains a SPARE model, either classification or regression

    :param df: either a pandas dataframe or a path to a saved csv
                containing training data.
    :type df: pandas.DataFrame
    :param to_predict: variable to predict. Binary for classification and
                continuous for regression. Must be one of the columnes in
                df.
    :type to_predict: str
    :param pos_group: group to assign a positive SPARE score (only for
                classification).
    :type pos_group: str
    :param key_var: The key variable to be used for training. If not
                given, the first column of the dataset is considered the
                primary key of the dataset.
    :type key_var: str
    :param data_vars:  a list of predictors for the training. All must be present
                in columns of df.
    :type data_vars: list
    :param ignore_vars:The list of predictors to be ignored for training. Can be
                a listkey_var, or empty.
    :type ignore_vars: list
    :param kernel: 'linear' or 'rbf' (only linear is supported currently in
                regression).
    :type kernel: str
    :param output: path to save the trained model. '.pkl.gz' file extension
                optional. If None is given, no model will be saved.
    :type output: str
    :param verbose:    Verbosity. Int, higher is more verbose. [0,1,2]
    :type verbose: int
    :param logs: Where to save log file. If not given, logs will only be printed out.
    :type logs: str

    :return: A dictionary with three keys, 'status_code', 'status' and 'data'.
             'status' is either'OK' or the error message. 'data' is a dictionary
             containing the trained model and metadata if successful, or
             None / error object if unsuccessful. 'status_code' is either 0, 1 or 2.
             0 is success, 1 is warning, 2 is error.
    :rtype: dict

    """
    res = {"status_code": int, "status": Any, "data": Any}

    logger = logging_basic_config(verbose=verbose, filename=logs)

    # Make sure that no overwrites happen:
    if check_file_exists(output, logger):
        res["status"] = check_file_exists(output, logger)
        res["status_code"] = 2
        return res

    # Load the data
    df = load_df(df)

    # Assume key_variable (if not given)
    if key_var == "" or key_var is None:
        key_var = df.columns[0]
        if not is_unique_identifier(df, [key_var]):
            logging.info(
                "Assumed primary key is not capable of uniquely "
                + "identifying each row of the dataset. Assumed pkey: "
                + key_var
            )

    # Assume predictors (if not given)
    if data_vars == [] or data_vars is None:
        # Predictors = all_vars - key_var - ignore_vars - to_predict
        if ignore_vars == [] or ignore_vars is None:
            data_vars = list(set(list(df)) - set([key_var]) - set([to_predict]))
        else:
            data_vars = list(
                set(list(df)) - set([key_var]) - set(ignore_vars) - set([to_predict])
            )
    predictors = data_vars

    # Check if it contains any errors.
    try:
        df, predictors, mdl_task = check_train(  # type: ignore
            df, predictors, to_predict, verbose, pos_group
        )
    except Exception as e:
        err = "Dataset check failed before training was initiated."
        logger.error(err)
        print(e)
        res["status"] = err
        res["status_code"] = 2
        return res

    # Create meta data
    meta_data = MetaData(model_type, mdl_task, kernel, predictors, to_predict, key_var)
    meta_data.key_var = key_var

    # Convert categorical variables

    if len(df[to_predict].value_counts().keys()) == 2:
        if set(df[to_predict].value_counts().keys()) != set([0, 1]):
            df[to_predict] = df[to_predict].apply(lambda x: 1 if x == pos_group else 0)

    try:
        df, meta_data = convert_cat_variables(df, predictors + [to_predict], meta_data)
    except ValueError:
        err = (
            "Categorical variables could not be converted, because "
            + "they were not binary."
        )
        logger.error(err)
        res["status"] = err
        res["status_code"] = 2
        return res

    # Create the model
    try:
        spare_model = SpareModel(
            model_type,
            predictors,
            to_predict,
            key_var,
            verbose=1,
            parameters={
                "kernel": kernel,
                "k": 5,
                "n_repeats": 1,
                "task": mdl_task,
                "param_grid": None,
            },
            **kwargs,
        )
    except NotImplementedError:
        err = "SPARE model " + model_type + " is not implemented yet."
        logger.error(err)
        res["status"] = err
        res["status_code"] = 2
        return res
    except ValueError as e:
        logger.error(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Train the model
    try:
        trained = spare_model.train_model(df, pos_group=pos_group)
    except Exception as e:
        logger.critical(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Save the results
    if trained is None:
        err = "No training output was produced."
        logger.critical(err)
        res["status"] = err
        res["status_code"] = 2
        return res

    df["predicted"] = trained["predicted"]
    model = trained["model"]
    meta_data.params = trained["best_params"]
    meta_data.stats = trained["stats"]
    meta_data.cv_folds = trained["CV_folds"]
    meta_data.scaler = trained["scaler"] if "scaler" in trained.keys() else None

    meta_data.cv_results = df[list(dict.fromkeys([key_var, to_predict, "predicted"]))]
    result = model, vars(meta_data)

    # Save model
    if output != "" and output is not None:
        save_file(result, output, "train", logger)

    res["status"] = "OK"
    res["data"] = result
    res["status_code"] = 0
    return res




[docs]
def spare_test(
    df: Union[pd.DataFrame, str],
    mdl_path: Union[str, Tuple[dict, dict]],
    key_var: str = "",
    output: str = "",
    spare_var: str = "SPARE_score",
    verbose: int = 1,
    logs: str = "",
) -> pd.DataFrame:
    """
    Applies a trained SPARE model on a test dataset

    :param df:  either a pandas dataframe or a path to a saved csv
                containing the test sample.
    :type df: pandas.DataFrame
    :param mdl_path: either a path to a saved SPARE model ('.pkl.gz' file
                extension expected) or a tuple of SPARE model and
                meta_data.
    :type mdl_path: str
    :param key_var: The of key variable to be used for training. If not
                given, and the saved model does not contain it,the first
                column of the dataset is considered the primary key of the
                dataset.
    :type key_var: str
    :param output: path to save the calculated scores. '.csv' file extension
                optional. If None is given, no data will be saved.
    :type output: str
    :param spare_var: The name of the variable to be predicted. If not given,
                the name 'SPARE_score' will be used.
    :type spare_var: str
    :param verbose: Verbosity. Int, higher is more verbose. [0,1,2]
    :type verbose: int
    :param logs: Where to save log file. If not given, logs will only be
                printed out.
    :type logs: str

    :return: A dictionary with three keys, 'status_code', 'status' and 'data'.
             'status' is either 'OK' or the error message. 'data' is the pandas
             dataframe  containing predicted SPARE scores, or  None / error object
             if  unsuccessful. 'status_code' is either 0, 1 or 2.
             0 is success, 1 is warning, 2 is error.
    :rtype: dict

    """
    res = {"status_code": int, "status": Any, "data": Any}

    logger = logging_basic_config(verbose=verbose, filename=logs)

    # Make sure that no overwrites happen:
    if check_file_exists(output, logger):
        res["status"] = check_file_exists(output, logger)
        res["status_code"] = 2
        return res

    df = load_df(df)

    # Load & check for errors / compatibility the trained SPARE model
    mdl, meta_data = load_model(mdl_path) if isinstance(mdl_path, str) else mdl_path

    try:
        check, cols = check_test(df, meta_data)
    except Exception as e:
        logger.error(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    if cols is not None and cols != []:
        print(check)
        logger.error(check)
        res["status"] = check
        res["data"] = cols
        res["status_code"] = 1
        return res

    # Assume key_variable (if not given)
    if key_var == "" or key_var is None:
        key_var = df.columns[0]
        if not is_unique_identifier(df, [key_var]):
            logging.info(
                "Assumed primary key(s) are not capable of uniquely "
                + "identifying each row of the dataset. Assumed "
                + "primary key(s) are: "
                + key_var
            )

    # Convert categorical variables
    for var, map_dict in meta_data.get("categorical_var_map", {}).items():
        if not isinstance(map_dict, dict):
            continue
        if df[var].isin(map_dict.keys()).any():
            df[var] = df[var].map(map_dict)
        else:
            expected_var = list(map_dict.keys())
            err = (
                f'Column "{var}" expected {expected_var}, but '
                + f"received {list(df[var].unique())}"
            )
            logger.error(err)
            res["status"] = err
            res["data"] = list(df[var].unique())
            res["status_code"] = 1
            return res

    # TODO: Output model description
    n = len(meta_data["cv_results"].index)
    if "Age" in meta_data["cv_results"].keys():
        a1 = int(np.floor(np.min((meta_data["cv_results"]["Age"]))))
        a2 = int(np.ceil(np.max((meta_data["cv_results"]["Age"]))))
    else:
        a1 = None
        a2 = None
    stats_metric = list(meta_data["stats"].keys())[0]
    stats = "{:.3f}".format(np.mean(meta_data["stats"][stats_metric]))
    logger.info(
        f"Model Info: training N = {n} / ages = {a1} - {a2} / "
        + f"expected {stats_metric} = {stats}"
    )

    # Figure out model type and task:
    if "mdl_task" not in meta_data.keys():  # Backwards compatibility
        model_task = (
            "Classification"
            if "Classification" in meta_data["mdl_type"]
            else "Regression"
        )

        if "SVM" in meta_data["mdl_type"]:
            model_type = "SVM"
        elif "MLP" in meta_data["mdl_type"]:
            model_type = "MLP"
        else:
            model_type = "MLPTorch"
    else:
        model_task = meta_data["mdl_task"]
        model_type = meta_data["mdl_type"]

    # Create model instance based on saved model:
    predictors = meta_data["predictors"]
    target = meta_data["to_predict"]
    params = meta_data["params"]
    spare_model = SpareModel(model_type, predictors, target, key_var, verbose=verbose)

    # Set the model attributes to the ones that were saved to the instance
    # during training:
    try:
        spare_model.set_parameters(**params)
        spare_model.set_parameters(
            **{
                "mdl": mdl,
                "task": model_task,
                **{
                    key: meta_data[key]
                    for key in meta_data.keys()
                    if key not in ["mdl", "task"]
                },
            }
        )
    except Exception as e:
        logger.critical(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Predict
    try:
        predicted = spare_model.apply_model(df)
    except Exception as e:
        logger.critical(e)
        print(e)
        res["status"] = e
        res["status_code"] = 2
        return res

    # Save the results
    if predicted is None:
        err = "No testing output was produced."
        logger.critical(err)
        res["status"] = err
        res["status_code"] = 2
        return res

    d = {}
    d[key_var] = df[key_var]
    d[spare_var] = predicted
    out_df = pd.DataFrame(data=d)

    if output != "" and output is not None:
        save_file(out_df, output, "test", logger)

    res["status"] = "OK"
    res["data"] = out_df
    res["status_code"] = 0
    return res