# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
from sklearn.utils.validation import check_is_fitted
import six
from ..base import BasePDTransformer
from ..utils.validation import check_dataframe, validate_test_set_columns
from ..utils.metaestimators import timed_instance_method
# Public names exported by ``from <module> import *``.
__all__ = [
'SchemaNormalizer'
]
[docs]class SchemaNormalizer(BasePDTransformer):
r"""Enforce a schema on an input dataframe.
The SchemaNormalizer enforces a schema across incoming train and
test data. This ensures that all data matches the expected schema.
Note that unlike most other Skoot transformers, this one requires
that the output be a DataFrame (note the lack of the ``as_df``
constructor arg).
Parameters
----------
schema : dict
The schema. This dictionary maps column names to actions. For
instance the following schema will cast the iris dataset
"petal width (cm)" column to integer::
>>> schema = {'petal width (cm)': int}
Attributes
----------
fit_cols_ : list
The list of column names on which the transformer was fit. This
is used to validate the presence of the features in the test set
during the ``transform`` stage.
"""
def __init__(self, schema):
    """Create the normalizer and store the schema.

    Parameters
    ----------
    schema : dict
        Mapping of column names to actions (for example a type such
        as ``int``), as described in the class docstring.
    """
    # ``as_df`` is pinned to True instead of being exposed as a
    # constructor argument: this transformer always returns a
    # DataFrame, so the flag has no real effect here. No column
    # subset is configured, hence ``cols=None``.
    base_init = super(SchemaNormalizer, self).__init__
    base_init(cols=None, as_df=True)

    self.schema = schema
@timed_instance_method(attribute_name="fit_time_")
def fit(self, X, y=None):
    """Validate the training frame and remember its columns.

    No statistics are learned from the data. The only state stored is
    ``fit_cols_``, the second value returned by ``check_dataframe``,
    which is used to validate the presence of the features in the test
    set during the ``transform`` stage.

    Parameters
    ----------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The Pandas frame to fit.

    y : array-like or None, shape=(n_samples,), optional (default=None)
        Not used by this method; pass-through for
        ``sklearn.pipeline.Pipeline``.

    Returns
    -------
    self : SchemaNormalizer
        This instance.
    """
    # NOTE(review): ``self.cols`` is presumably set by the base-class
    # constructor (which is called with ``cols=None``) -- confirm.
    _validated, column_names = check_dataframe(X, cols=self.cols)
    self.fit_cols_ = column_names
    return self