Source code for mealy.error_analysis_utils

# -*- coding: utf-8 -*-
from sklearn.pipeline import Pipeline
import numpy as np
from mealy.constants import ErrorAnalyzerConstants
from kneed import KneeLocator


def get_epsilon(difference):
    """
    Pick the threshold separating correct from wrong predictions (regression tasks).

    Candidate thresholds are spread evenly between the smallest and the largest
    value of ``difference``. For every candidate, the fraction of samples whose
    difference does not exceed it is recorded, and the ``knee`` reported by
    ``KneeLocator`` on that curve is returned.

    Args:
        difference (1D-array): The absolute differences between the true target
            values and the ones predicted by the primary model.

    Return:
        epsilon (float): The threshold used to decide whether the prediction for
            a regression task is wrong or correct.
    """
    candidates = np.linspace(min(difference), max(difference),
                             num=ErrorAnalyzerConstants.NUMBER_EPSILON_VALUES)
    sample_count = float(difference.shape[0])
    # Empirical CDF of the error: share of samples at or below each candidate.
    correct_fractions = [
        np.count_nonzero(difference <= candidate) / sample_count
        for candidate in candidates
    ]
    locator = KneeLocator(candidates, correct_fractions)
    return locator.knee
def generate_preprocessing_steps(transformer, invert_order=False):
    """
    Yield the preprocessing steps of a transformer, checking each one is supported.

    Args:
        transformer: Either a ``Pipeline``, whose steps are unpacked, or any
            other object, which is treated as a single step.
        invert_order (bool): If True and ``transformer`` is a ``Pipeline``,
            its steps are yielded from last to first.

    Yields:
        Every step that is either ``'passthrough'`` or an instance of
        ``ErrorAnalyzerConstants.SUPPORTED_STEPS``. ``'drop'`` steps are skipped.

    Raises:
        TypeError: If a step is neither ``'drop'``, ``'passthrough'`` nor an
            instance of ``ErrorAnalyzerConstants.SUPPORTED_STEPS``.
    """
    if isinstance(transformer, Pipeline):
        steps = [step for _, step in transformer.steps]
        if invert_order:
            steps = reversed(steps)
    else:
        steps = [transformer]

    for step in steps:
        if step == 'drop':
            # Skip the drop step of ColumnTransformer
            continue
        if step != 'passthrough' and not isinstance(step, ErrorAnalyzerConstants.SUPPORTED_STEPS):
            # Check all the preprocessing steps are supported by mealy
            raise TypeError(
                'Mealy package does not support {}. '
                'It might be because it changes output dimension without '
                'providing a get_feature_names function to keep track of the '
                'generated features, or that it does not provide an '
                'inverse_transform method.'.format(step.__class__))
        yield step
def invert_transform_via_identity(step):
    """
    Tell whether a preprocessing step can be inverted with the identity function.

    Args:
        step: A preprocessing step; may be the string ``'passthrough'`` or None.

    Return:
        bool: True if ``step`` is an instance of
            ``ErrorAnalyzerConstants.STEPS_THAT_CAN_BE_INVERSED_WITH_IDENTICAL_FUNCTION``,
            equals ``'passthrough'`` or is None; False otherwise.
    """
    identity_types = ErrorAnalyzerConstants.STEPS_THAT_CAN_BE_INVERSED_WITH_IDENTICAL_FUNCTION
    # Short-circuit evaluation keeps the type check ahead of the placeholder checks.
    return bool(
        isinstance(step, identity_types)
        or step == 'passthrough'
        or step is None
    )
def check_lists_having_same_elements(list_A, list_B):
    """
    Tell whether two collections hold the same distinct elements.

    Order and duplicates are ignored, since both inputs are converted to sets.

    Args:
        list_A (iterable): First collection of hashable elements.
        list_B (iterable): Second collection of hashable elements.

    Return:
        bool: True if both collections contain exactly the same distinct elements.
    """
    unique_a = set(list_A)
    unique_b = set(list_B)
    # The sets are equal exactly when no element belongs to only one of them.
    return not (unique_a ^ unique_b)
def check_enough_data(df, min_len):
    """
    Ensure a dataframe has at least the minimum number of rows required.

    :param df: Input dataframe (any object exposing ``shape``)
    :param min_len: Minimum number of rows required
    :return: None
    :raises ValueError: If the dataframe has fewer than ``min_len`` rows
    """
    row_count = df.shape[0]
    if not row_count < min_len:
        return
    message = 'The original dataset is too small ({} rows) to have stable result, it needs to have at least {} rows'.format(
        row_count, min_len)
    raise ValueError(message)
def format_float(number, decimals):
    """
    Format a number with at most the required number of decimals.

    Trailing zeros of the fractional part are removed, as well as the decimal
    point when nothing remains after it. Zeros belonging to the integer part
    are never removed (e.g. ``format_float(100, 0)`` gives ``"100"``).

    Args:
        number (float or int): The number to format
        decimals (int): The number of decimals required

    Return:
        formatted (str): The number as a formatted string
    """
    formatted = "{:.{precision}f}".format(number, precision=decimals)
    # Without a decimal point every trailing zero is significant ("100", "0"),
    # so stripping is only safe when a fractional part is present.
    if "." in formatted:
        formatted = formatted.rstrip("0").rstrip(".")
    return formatted