# Source code for skoot.model_validation._validator

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# Validator classes for model monitoring. These classes can be stacked after
# transformers in a pipeline in order to ensure the test data distributions
# resemble those of the training data and can provide early warnings for
# covariate shift.

from ..base import BasePDTransformer
from ..utils.validation import check_dataframe, type_or_iterable_to_col_mapping
from ..utils.dataframe import get_continuous_columns, dataframe_or_array
from ..utils.metaestimators import timed_instance_method
from ..exceptions import ValidationWarning

import six
from sklearn.utils.validation import check_is_fitted

from scipy.stats import ttest_ind_from_stats

import numpy as np
import warnings
import collections

from abc import abstractmethod, ABCMeta

__all__ = [
    "CustomValidator",
    "DistHypothesisValidator"
]


def _passthrough(_):
    # Default for when user does not set a function in the custom validator
    # (design choice: should we FORCE a value there?)
    return True


def _compute_stats(v, continuous):
    # determine if this needs a T-test or freq test
    if continuous:
        # Compute the t-test stats
        mean = np.nanmean(v)
        std = np.nanstd(v)
        return mean, std, v.shape[0]

    # otherwise it's categorical or integer (probably ordinal)
    else:
        unique_levels, counts = np.unique(v, return_counts=True)
        return unique_levels, counts, v.shape[0]


class _BaseValidator(six.with_metaclass(ABCMeta, BasePDTransformer)):
    """Abstract base for validator transformers.

    Subclasses implement ``_is_as_expected``; this class provides the
    shared constructor and the ``transform`` loop that applies the check
    to every fitted column and handles the configured ``action``.
    """
    def __init__(self, cols, as_df, action):
        super(_BaseValidator, self).__init__(cols=cols, as_df=as_df)
        self.action = action

    @abstractmethod
    def _is_as_expected(self, index, feature_name, feature):
        """Validate the test feature.

        Abstract method to compute the validation statistic. Should
        return a boolean indicating whether the feature adheres to the
        expectation.
        """

    def transform(self, X):
        """Validate the features in the test dataframe.

        This method will apply the validation test over each prescribed
        feature, and raise or warn appropriately.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to validate. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, "fit_cols_")
        X, _ = check_dataframe(X, cols=self.cols)  # X is a copy now

        # fit_cols_ was assigned in the subclass "fit" method
        for idx, name in enumerate(self.fit_cols_):
            values = X[name].values  # pull the raw feature vector

            # valid features need no action at all
            if self._is_as_expected(idx, name, values):
                continue

            msg = "Feature %s does not match expectation as set by %s" \
                  % (name, self.__class__.__name__)

            # "raise" and "warn" alert the user; anything else is silent
            if self.action == "raise":
                raise ValueError(msg)
            if self.action == "warn":
                warnings.warn(msg, ValidationWarning)

        # just return X if we get here
        return dataframe_or_array(X, self.as_df)


class CustomValidator(_BaseValidator):
    """Validate test features given custom functions.

    Apply test set validator behavior over custom functions. This can be
    especially useful in cases where a feature should never exhibit values
    within a certain range (i.e., sensor data).

    Parameters
    ----------
    cols : array-like, shape=(n_features,)
        The names of the columns on which to apply the transformation.
        If ``cols`` is None, will apply to the entire dataframe.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    func : callable or iterable, optional (default=None)
        The function used to validate the feature. Can be as complex or
        as simple as needed, but must adhere to the following criteria:

          * The signature must accept a single vector
          * The output must be a boolean

        Note also that providing a lambda expression as a function can
        prove to be problematic when it comes time to serialize your
        class, as lambda expressions cannot be serialized via pickle.
        It's best to provide a ``def``-style function or closure.

    action : str or unicode, optional (default="warn")
        The default action for handling validation mismatches. Options
        include "warn", "raise" or "ignore". If ``action`` is "raise",
        will raise a ValueError if mismatched.

    Attributes
    ----------
    func_dict_ : dict
        A dictionary mapping the column names to their respective
        validation function.

    fit_cols_ : list
        The list of column names on which the transformer was fit. This
        is used to validate the presence of the features in the test set
        during the ``transform`` stage.
    """
    def __init__(self, cols=None, as_df=True, func=None, action="warn"):
        super(CustomValidator, self).__init__(
            cols=cols, as_df=as_df, action=action)

        self.func = func

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit
            on the prescribed ``cols`` (see ``__init__``) or all of them
            if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.
        """
        X, cols = check_dataframe(X, cols=self.cols,
                                  assert_all_finite=False)

        # BUG FIX: ``collections.Callable`` was removed in Python 3.10;
        # it has lived in ``collections.abc`` since 3.3. Fall back to the
        # old location only for Python 2.
        try:
            from collections.abc import Callable
        except ImportError:  # pragma: no cover - Python 2
            from collections import Callable

        # validate the func(tions):
        func = self.func
        # if it's None, make it an identity function of sorts
        if func is None:
            func = _passthrough

        f = type_or_iterable_to_col_mapping(
            cols=cols, param=func, param_name="func",
            permitted_scalar_types=Callable)  # type: dict

        # save the function mapping as the fit param
        self.func_dict_ = f
        self.fit_cols_ = cols

        return self

    def _is_as_expected(self, index, feature_name, feature):
        """Validate the test feature.

        Apply the user-defined custom validation function to the
        test feature.
        """
        return self.func_dict_[feature_name](feature)
class DistHypothesisValidator(_BaseValidator):
    r"""Validate test distributions using various hypothesis tests.

    The distribution validator learns statistics from the training set
    and then validates that the test set features match their expected
    distributions. This can be useful for model validation tasks where
    model monitoring needs to take place.

    For continuous (float) features, a two-tailed T-test will be applied
    to the test data to ensure it matches the distribution of the training
    data. For categorical (int, object) features, we compare the
    frequencies of different categorical levels within a tolerance of
    ``alpha``.

    Parameters
    ----------
    cols : array-like, shape=(n_features,)
        The names of the columns on which to apply the transformation.
        Unlike other BasePDTransformer instances, if ``cols`` is None,
        it will only fit the numerical columns, since statistics such as
        standard deviation cannot be computed on categorical features.
        For column types that are integers or objects, the ratio of
        frequency for each class level will be compared to the expected
        ratio within a tolerance of ``alpha``.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    alpha : float, optional (default=0.05)
        The :math:`\alpha` value for the T-test or level ratio comparison.
        If the resulting p-value is LESS than ``alpha``, it means that
        we would reject the null hypothesis, and that the variable likely
        follows a different distribution from the training set.

    action : str or unicode, optional (default="warn")
        The default action for handling validation mismatches. Options
        include "warn", "raise" or "ignore". If ``action`` is "raise",
        will raise a ValueError if mismatched.

    categorical_strategy : str, unicode or None, optional (default="ratio")
        How to validate categorical features. Default is "ratio", which
        will compare the ratio of each level's frequency to the overall
        count of samples in the feature within an absolute tolerance of
        ``alpha``. If None, will not perform validation on categorical
        features.

    Notes
    -----
    This class is NaN-safe, meaning if it is used early in your pipeline
    when you still have NaN values in your features, it will still
    function. This is a double-edge sword, since computing the
    ``np.nanmean`` on a feature of mostly-NaN values will not be very
    meaningful.

    Attributes
    ----------
    statistics_ : list, shape=(n_features,)
        A list of tuples over the training features. For continuous
        features::

            (mean, standard_dev, n_obs)

        For categorical features::

            (present_levels, present_counts, n_obs)

    fit_cols_ : list
        The list of column names on which the transformer was fit. This
        is used to validate the presence of the features in the test set
        during the ``transform`` stage.
    """
    def __init__(self, cols=None, as_df=True, alpha=0.05, action="warn",
                 categorical_strategy="ratio"):
        super(DistHypothesisValidator, self).__init__(
            cols=cols, as_df=as_df, action=action)

        self.alpha = alpha
        self.categorical_strategy = categorical_strategy

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit
            on the prescribed ``cols`` (see ``__init__``) or all of them
            if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.
        """
        X, cols = check_dataframe(X, cols=self.cols,
                                  assert_all_finite=False)

        # if self.cols is None, the user might have tried to apply this to
        # every column and some may contain categorical features. So we need
        # to control for that...
        float_cols = set(get_continuous_columns(X).columns.tolist())

        # fit the test statistics over each column
        self.statistics_ = [
            _compute_stats(X[col].values, continuous=col in float_cols)
            for col in cols
        ]

        self.fit_cols_ = cols
        self.continuous_ = float_cols

        return self

    def _is_as_expected(self, index, feature_name, feature):
        """Validate the test feature.

        Compute the test statistic and, for continuous covariates, return
        whether the P-value is GREATER THAN OR EQUAL TO the specified
        alpha value (less than indicates that we would reject the null,
        so >= means it's likely from the same distribution). For
        categorical features, compare the frequencies of each level
        within ``alpha`` tolerance.
        """
        # if it's a continuous feature, we use the T-test
        if feature_name in self.continuous_:
            mean1, std1, nobs1 = self.statistics_[index]
            mean2, std2, nobs2 = _compute_stats(feature, continuous=True)
            _, pval = ttest_ind_from_stats(
                mean1=mean1, std1=std1, nobs1=nobs1,
                mean2=mean2, std2=std2, nobs2=nobs2,
                equal_var=True)

            # we expect them to be the same
            return pval >= self.alpha

        # otherwise, we are dealing with categorical features
        # if we want to use the ratio strategy, do so here:
        if self.categorical_strategy == "ratio":
            exp_levels, exp_counts, n_obs = self.statistics_[index]
            prst_levels, prst_counts, n_test = \
                _compute_stats(feature, continuous=False)

            # if there are any levels in the test set that are NOT in the
            # training set, the feature is invalid. Don't fail here since
            # this may be before a user has encoded all levels; the
            # transform method handles the action.
            if not np.isin(prst_levels, exp_levels).all():
                return False

            # BUG FIX: the old implementation subtracted the raw ratio
            # arrays element-wise, which raised a shape-mismatch error
            # whenever the train/test level sets differed, and never
            # detected training levels missing from the test set. Align
            # levels by name instead; a training level absent from the
            # test set contributes a present ratio of 0.
            exp_ratios = dict(zip(exp_levels, exp_counts / float(n_obs)))
            prst_ratios = dict(zip(prst_levels,
                                   prst_counts / float(n_test)))
            return all(
                abs(ratio - prst_ratios.get(level, 0.)) <= self.alpha
                for level, ratio in exp_ratios.items())

        # if we add more strategies, here's where they'll go...
        return True
# in case nose has an issue here (since "test" is used all over)... DistHypothesisValidator.__test__ = False