Source code for skoot.base

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>

from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator, TransformerMixin

import six
from abc import ABCMeta
import pandas as pd

from .exceptions import DeveloperError
from .utils.validation import check_dataframe, validate_test_set_columns
from .utils.iterables import is_iterable
from .utils.compat import xrange
from .utils.dataframe import dataframe_or_array
from .utils.metaestimators import timed_instance_method

# namespace import to avoid explicitly protected imports in global namespace
from .utils import _docstr as dsutils

import warnings
import copy

__all__ = [
    'BasePDTransformer'
]


class BasePDTransformer(six.with_metaclass(ABCMeta,
                                           BaseEstimator,
                                           TransformerMixin)):
    __doc__ = """The base class for all Pandas frame transformers.

    Provides the base class for all skoot transformers that require
    Pandas dataframes as input.

    Parameters
    ----------
    {_cols_doc}

    {_as_df_doc}

    Examples
    --------
    The following is an example of how to subclass a BasePDTransformer:

        >>> from skoot.base import BasePDTransformer
        >>> class A(BasePDTransformer):
        ...     def __init__(self, cols=None, as_df=None):
        ...         super(A, self).__init__(cols, as_df)
        ...
        >>> A()
        A(as_df=None, cols=None)
    """.format(_cols_doc=dsutils._cols_doc,
               _as_df_doc=dsutils._as_df_doc)

    def __init__(self, cols=None, as_df=True):
        self.as_df = as_df
        # NOTE: As of sklearn 0.20+, copying no longer works. Should we warn
        # for mutable structs passed as cols??? TODO
        # self.cols = copy.deepcopy(cols)  # do not let cols be mutable!
        self.cols = cols

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Default behavior is not to fit any parameters and to return self.
        This is useful for transformers which do not require
        parameterization, but need to fit into a pipeline.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.
        """
        return self
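

# The following is an illustrative sketch (an editor's example, not part of
# skoot itself) of a concrete BasePDTransformer subclass. The name
# ``_ExampleColumnSubsetter`` is hypothetical; it simply subsets a frame to
# the prescribed columns, reusing the validation helpers imported above.
class _ExampleColumnSubsetter(BasePDTransformer):
    def __init__(self, cols=None, as_df=True):
        super(_ExampleColumnSubsetter, self).__init__(
            cols=cols, as_df=as_df)

    def transform(self, X):
        # validate the frame and resolve ``cols`` (all columns if None)
        X, cols = check_dataframe(X, self.cols)
        return dataframe_or_array(X[cols], self.as_df)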


class _SelectiveTransformerWrapper(six.with_metaclass(dsutils._WritableDoc,
                                                      BasePDTransformer)):
    # non-estimator parameters only used for the wrapper and not in set_params
    _p_names = ('cols', 'as_df', 'trans_col_name')

    # Builds a selective transformer on the fly.
    #
    # This is a private class used at the head of submodules to wrap
    # sklearn transformers in the selective interface. It is not included
    # in __all__ since it is intended for power-users/package developers.
    def __init__(self, cols=None, as_df=True, trans_col_name=None,
                 **kwargs):

        super(_SelectiveTransformerWrapper, self).__init__(
            cols=cols, as_df=as_df)

        # this is a STATIC attribute of subclasses
        try:
            cls = self._cls
        except AttributeError:
            raise DeveloperError("_SelectiveTransformerWrapper subclasses "
                                 "must contain a static _cls attribute that "
                                 "maps to a sklearn type!")

        # get the (default) parameters for the estimator in question
        # and initialize to default
        self.estimator_ = cls()
        default_est_parms = self.estimator_.get_params(deep=True)

        # set the attributes in the estimator AND in the constructor so this
        # class behaves like sklearn in grid search
        self.estimator_.set_params(**kwargs)

        # set the kwargs here to behave like sklearn
        for k, v in six.iteritems(default_est_parms):
            if kwargs:
                v = kwargs.get(k, v)  # try kwargs; fall back to the default
            setattr(self, k, v)

        self.trans_col_name = trans_col_name

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None, **fit_kwargs):
        """Fit the wrapped transformer.

        This method will fit the wrapped sklearn transformer on the
        selected columns, leaving the other columns alone.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. Furthermore, ``X`` will not be altered
            in the process of the fit.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change the behavior of ``fit``.
        """
        # check on state of X and cols
        X, cols = check_dataframe(X, self.cols)

        # fit the estimator in place
        self.estimator_.fit(X[cols], **fit_kwargs)

        # the columns we fit on
        self.fit_cols_ = cols

        return self

    def transform(self, X):
        """Transform a test dataframe.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        check_is_fitted(self, 'fit_cols_')

        # check on state of X and cols
        X, _, other_nms = check_dataframe(X, cols=self.cols,
                                          column_diff=True)

        # validate that the test set columns exist in the fit columns
        cols = self.fit_cols_
        validate_test_set_columns(cols, X.columns)

        # get the transformer
        est = self.estimator_
        transform = est.transform(X[cols])

        # get the transformed column names
        trans = self.trans_col_name
        n_trans_cols = transform.shape[1]
        if is_iterable(trans):
            if len(trans) != n_trans_cols:
                raise ValueError("dim mismatch in transformed column names "
                                 "and transformed column shape! (%i != %i)"
                                 % (len(trans), n_trans_cols))
        # else it's some scalar
        else:
            if trans is None:  # default to class name
                trans = self.estimator_.__class__.__name__
            # this branch is hit when trans was None as well:
            trans = ["%s%i" % (trans, i + 1) for i in xrange(n_trans_cols)]

        # stack the transformed variables onto the RIGHT side
        right = pd.DataFrame.from_records(data=transform, columns=trans)

        # set the index of right to be equal to that of the input so
        # we can concat seamlessly
        right.index = X.index

        # concat if needed
        x = pd.concat([X[other_nms], right], axis=1) if other_nms else right
        return dataframe_or_array(x, self.as_df)

    @classmethod
    def _get_param_names(cls):
        # so we can get constructor args for grid search
        return list(cls._p_names) + \
            cls._cls._get_param_names()  # must have _cls!


class _AnonymousPDTransformer(BasePDTransformer):
    """General transformer wrapper used to make a commutative function
    into a Pipeline-able transformer.

    Parameters
    ----------
    func : callable
        The commutative function used to transform the train or test set.
    """
    def __init__(self, **kwargs):
        super(_AnonymousPDTransformer, self).__init__(
            cols=None, as_df=True)

        # There should never NOT be a "func" key, since this is handled
        # internally. The only time that could happen is if someone tries
        # to do this on their own. Live with the KeyError if it breaks,
        # since the silly developer screwed it up!
        self.func = kwargs["func"]

        # Assign the kwargs so that we can tune hyper-parameters in
        # the anonymous transformer.
        param_names = []
        for k, v in six.iteritems(kwargs):
            # two things: save the parameter name, and assign the value
            # as an internal attribute
            param_names.append(k)
            setattr(self, k, v)
        self._param_names = param_names

    def transform(self, X):
        """Apply the commutative function to the train or test set.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform.
        """
        # construct the kwargs, but remember that we do not want "func"!
        kwargs = {k: getattr(self, k)
                  for k in self._param_names
                  if k != "func"}
        return self.func(X, **kwargs)

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained sub-objects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        out = dict()
        # unlike the sklearn default, we can use the stored param names
        for key in self._param_names:
            value = getattr(self, key, None)  # should always be present...
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val)
                           for k, val in deep_items)
            out[key] = value
        return out


def make_transformer(func, **kwargs):
    """Make a function into a scikit-learn TransformerMixin.

    Wraps a commutative function as an anonymous BasePDTransformer in
    order to fit into a Pipeline. The methods of the returned transformer
    adhere to the standard BasePDTransformer ``fit`` and ``transform``
    signatures.

    This is useful when a transforming function that does not fit any
    parameters is used to pre-process data at a point that might split
    a pipeline.

    Parameters
    ----------
    func : callable
        The function that will be used to transform a dataset. Note that
        for certain scikit-learn operations or for model persistence,
        this will need to be pickled. Therefore, using a closure or a
        lambda expression could cause downstream issues that are not
        immediately apparent. This function will raise a warning if it
        determines that a lambda expression has been passed as ``func``,
        but not all corner cases can be caught. Be cautious.

    **kwargs : keyword args or dict, optional
        A dictionary of keyword args that will be passed to the
        transformer class' ``transform`` function (``func``) and that
        enable the anonymous transformer to be tuned via grid search
        like any other transformer.

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn.decomposition import PCA
    >>> from sklearn.model_selection import GridSearchCV
    >>> from sklearn.linear_model import LogisticRegression
    >>> X, y = load_iris(return_X_y=True)
    >>>
    >>> def subtract_k(x, k):
    ...     return x - float(k)
    >>>
    >>> pipe = Pipeline([
    ...     ('pca', PCA()),
    ...     ('custom', make_transformer(subtract_k, k=2)),
    ...     ('clf', LogisticRegression(random_state=42))
    ... ])
    >>>
    >>> hyper_params = {"pca__whiten": [True, False],
    ...                 "custom__k": [1, 2]}
    >>> search = GridSearchCV(pipe, param_grid=hyper_params,
    ...                       scoring="accuracy")
    >>> search.fit(X, y)  # doctest: +SKIP
    GridSearchCV(...)
    """
    # first, if it's a lambda function, warn the user
    lam = (lambda: None)
    if isinstance(func, type(lam)) and func.__name__ == lam.__name__:
        warnings.warn("A lambda function was passed to the make_transformer "
                      "function. While not explicitly unsupported, this will "
                      "complicate transformer persistence. To persist "
                      "dynamically-created transformers, use def-style "
                      "functions.", UserWarning)

    # Note that func needs to be passed as a keyword for it to be read in
    # as a "kwarg" argument
    return _AnonymousPDTransformer(func=func, **kwargs)
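

# Illustrative usage (an editor's example, not part of skoot): wrap a
# def-style function so it can slot into a Pipeline. The ``center`` function
# and the toy frame below are invented for demonstration.
if __name__ == "__main__":
    def center(df, offset):
        # subtract a constant from every value in the frame
        return df - offset

    df = pd.DataFrame({"a": [1., 2., 3.], "b": [4., 5., 6.]})
    centerer = make_transformer(center, offset=2.)
    print(centerer.fit_transform(df))  # every value shifted down by 2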