Source code for skoot.preprocessing.schema

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>

from sklearn.utils.validation import check_is_fitted
import six

from ..base import BasePDTransformer
from ..utils.validation import check_dataframe, validate_test_set_columns
from ..utils.metaestimators import timed_instance_method

# Public API of this module: only the transformer class is exported.
__all__ = ['SchemaNormalizer']


class SchemaNormalizer(BasePDTransformer):
    r"""Enforce a schema on an input dataframe.

    The SchemaNormalizer enforces a schema across incoming train and test
    data. This ensures that all data matches the expected schema. Note that
    unlike most other Skoot transformers, this one requires that the output
    be a DataFrame (note the lack of the ``as_df`` constructor arg).

    Parameters
    ----------
    schema : dict
        The schema. This dictionary maps column names to actions; each
        value is passed to ``astype`` for the column it is keyed by. For
        instance the following schema will cast the iris dataset
        "petal width (cm)" column to integer::

            >>> schema = {'petal width (cm)': int}

    Attributes
    ----------
    fit_cols_ : list
        The list of column names on which the transformer was fit. This is
        used to validate the presence of the features in the test set
        during the ``transform`` stage.
    """

    def __init__(self, schema):
        # ``as_df`` does not really matter here: ``transform`` always
        # returns the frame it worked on.
        super(SchemaNormalizer, self).__init__(as_df=True, cols=None)
        self.schema = schema

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Nothing is learned from the schema itself. The column names
        reported by ``check_dataframe`` are stored in ``fit_cols_`` so
        that ``transform`` can verify they are present in later data.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : SchemaNormalizer
            The fitted transformer.
        """
        _, self.fit_cols_ = check_dataframe(X, cols=self.cols)
        return self

    def transform(self, X):
        """Apply the schema normalization.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The casts are applied to the
            frame handed back by ``check_dataframe``.
            NOTE(review): that frame is expected to be a copy of the
            input -- confirm against ``check_dataframe``.

        Returns
        -------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The frame with every column named in ``schema`` cast via
            ``astype``. Columns not named in ``schema`` are left as they
            are.

        Raises
        ------
        KeyError
            If ``schema`` names one or more columns that are not present
            in ``X``. All missing columns are reported at once, before any
            column is cast.
        """
        check_is_fitted(self, "fit_cols_")
        X, _ = check_dataframe(X, cols=self.cols)

        # validate that fit cols in test set
        validate_test_set_columns(self.fit_cols_, X.columns)

        # Materialize the schema once so it can be checked and then applied.
        casts = list(six.iteritems(self.schema))

        # Report every schema column that is absent in a single error,
        # rather than failing on the first one part-way through the casts.
        missing = [name for name, _ in casts if name not in X.columns]
        if missing:
            raise KeyError(
                "Schema columns not present in X: %r" % (missing,))

        # normalize
        for name, dtype in casts:
            X[name] = X[name].astype(dtype)

        return X  # DataFrame