Source code for spare_scores.util

import gzip
import logging
import os
import pickle
from typing import Any, Union

import numpy as np
import pandas as pd
import pkg_resources  # type: ignore


def expspace(span: list) -> np.ndarray:
    """
    Builds an exponentially spaced grid between two exponents

    :param span: Sequence whose first two items are the lower and upper
        exponent of the grid
    :type span: list

    :return: ``exp`` applied to evenly spaced exponents running from
        ``span[0]`` to ``span[1]``; the number of points is
        ``int(span[1]) - int(span[0]) + 1``
    :rtype: np.ndarray
    """
    lower, upper = span[0], span[1]
    # One sample per integer step between the (truncated) bounds, inclusive.
    n_points = int(upper) - int(lower) + 1
    exponents = np.linspace(lower, upper, num=n_points)
    return np.exp(exponents)
def load_df(df: Union[pd.DataFrame, str]) -> pd.DataFrame:
    """
    Returns a dataframe from either an in-memory frame or a .csv path

    :param df: Either a pd.DataFrame or the path to a .csv file
    :type df: Union[pd.DataFrame, str]

    :return: A dataframe read from the path, or a copy of the passed
        dataframe (the caller's object is never returned as-is)
    :rtype: pd.DataFrame
    """
    if isinstance(df, str):
        # A string is treated as a path to a .csv file.
        return pd.read_csv(df, low_memory=False)
    return df.copy()
def add_file_extension(filename: str, extension: str) -> str:
    """
    Appends an extension to a filename unless it is already there

    :param filename: The path to the file
    :type filename: str
    :param extension: The wanted extension (e.g. .txt, .csv, etc)
    :type extension: str

    :return: The filename, guaranteed to end with ``extension``
    :rtype: str
    """
    if filename.endswith(extension):
        return filename
    return filename + extension
def check_file_exists(filename: str, logger: Any) -> Any:
    """
    Checks whether the output path already exists, to prevent overwrites

    :param filename: The path that will be checked
    :type filename: str
    :param logger: Output logger (its ``info`` method is called)
    :type logger: logging.Logger

    :return: ``False`` if ``filename`` is ``None``, empty, or does not
        exist; otherwise the (truthy) message describing the conflict,
        which is also printed and logged
    :rtype: bool or str
    """
    path_given = filename is not None and filename != ""
    if not (path_given and os.path.exists(filename)):
        return False

    # The path is taken: report it and hand the message back to the caller.
    message = "".join(
        (
            "The output filename ",
            filename,
            ", corresponds to an ",
            "existing file, interrupting execution to avoid overwrite.",
        )
    )
    print(message)
    logger.info(message)
    return message
def save_file(result: Any, output: str, action: str, logger: Any) -> None:
    """
    Saves the results in a file depending on the action

    :param result: The results that will be dumped into the file: any
        picklable object for 'train', an object exposing ``to_csv``
        (e.g. a pandas.DataFrame) for 'test'
    :type result: Any
    :param output: The output filename; the matching extension
        (``.pkl.gz`` for 'train', ``.csv`` for 'test') is appended when
        missing
    :type output: str
    :param action: Either 'train' or 'test' depending on the action; any
        other value writes nothing
    :type action: str
    :param logger: Output logger (its ``info`` method is called)
    :type logger: logging.Logger
    """
    # Add the correct extension:
    if action == "train":
        output = add_file_extension(output, ".pkl.gz")
    elif action == "test":
        output = add_file_extension(output, ".csv")

    dirname, fname = os.path.split(output)

    # Make sure the parent directory exists. An empty dirname means the
    # current working directory, which needs no creation. makedirs is used
    # so that nested output paths work as well.
    if dirname and not os.path.isdir(dirname):
        try:
            os.makedirs(dirname, exist_ok=True)
            logger.info(f"Created directory {dirname}")
        except OSError:
            # Best effort: the write below will surface the real error.
            logger.info("Directory couldn't be created")

    # Create the file:
    if action == "train":
        with gzip.open(output, "wb") as f:
            pickle.dump(result, f)
        logger.info(f"Model {fname} saved to {output}")
    elif action == "test":
        try:
            result.to_csv(output, index=False)
        except Exception as e:
            # Keep the original best-effort behaviour (log, don't raise),
            # but do not claim success afterwards.
            logger.info(e)
        else:
            logger.info(f"Spare scores {fname} saved to {output}")
def is_unique_identifier(df: pd.DataFrame, column_names: list) -> bool:
    """
    Checks if the given columns uniquely identify every row of the dataframe

    :param df: The passed dataframe
    :type df: pandas.DataFrame
    :param column_names: The columns whose value combinations are checked
    :type column_names: list

    :return: True if no combination of values in ``column_names`` occurs
        in more than one row, False otherwise
    :rtype: bool
    """
    distinct_rows = len(df[column_names].drop_duplicates())
    total_rows = df.shape[0]
    # The columns form a unique key exactly when dropping duplicates
    # removes nothing.
    return distinct_rows == total_rows
def load_model(mdl_path: str) -> Any:
    """
    Loads a model from a gzip-compressed pickle file

    :param mdl_path: the path to the gzip-compressed pickle of the model
    :type mdl_path: str

    :return: The unpickled object stored in the file
    :rtype: Any
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with gzip.open(mdl_path, "rb") as archive:
        model = pickle.load(archive)
    return model
def load_examples(file_name: str = "") -> Any:
    """
    Loads example data and models shipped in the package.

    :param file_name: either the name of an example data file in the
        package's ``data`` directory or the name of an example model in
        its ``mdl`` directory
    :type file_name: str

    :return: the dataframe read from the data file, the object returned
        by ``load_model`` for a model file, or None (after logging the
        available names) when ``file_name`` matches neither
    :rtype: None, pandas.DataFrame or the loaded model
    """
    pkg_path = pkg_resources.resource_filename("spare_scores", "")
    data_files = os.listdir(f"{pkg_path}/data/")
    model_files = os.listdir(f"{pkg_path}/mdl/")

    # Data files take precedence over models when a name is in both.
    if file_name in data_files:
        return pd.read_csv(f"{pkg_path}/data/{file_name}")
    if file_name in model_files:
        return load_model(f"{pkg_path}/mdl/{file_name}")

    # Unknown name: list what is available instead.
    listings = (
        ("Available example data:", data_files),
        ("Available example SPARE models:", model_files),
    )
    for heading, entries in listings:
        logging.info(heading)
        for entry in entries:
            logging.info(f" - {entry}")
    return None
def convert_to_number_if_possible(string: str) -> Union[float, str]:
    """
    Converts the input string to a float if possible

    Only strings made up entirely of numeric characters (as defined by
    ``str.isnumeric``) are converted; strings with a sign, decimal point
    or exponent (e.g. ``"-1"``, ``"1.5"``) are returned unchanged.

    :param string: the input string
    :type string: str

    :return: float if the string is numeric and parseable, the same
        string if it's not
    :rtype: float or str
    """
    if not string.isnumeric():
        return string
    try:
        return float(string)
    except ValueError:
        # str.isnumeric() also accepts characters such as vulgar fractions
        # or superscripts that float() cannot parse; keep those as text
        # instead of raising.
        return string