Source code for skoot.feature_selection.base
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
from sklearn.utils.validation import check_is_fitted
from abc import ABCMeta
import six
from ..base import BasePDTransformer
from ..utils.validation import check_dataframe
from ..utils.dataframe import dataframe_or_array
import warnings
# Public API of this module: only the abstract selector base class.
__all__ = ['BaseFeatureSelector']
class BaseFeatureSelector(six.with_metaclass(ABCMeta, BasePDTransformer)):
    """Base class for feature selectors.

    The base class for all skoot feature selectors. A concrete
    ``BaseFeatureSelector`` subclass should adhere to the following
    behavior:

    * The ``fit`` method should only fit the specified columns
      (since it's also a ``SelectiveMixin``), fitting all columns
      only when ``cols`` is None.

    * The ``fit`` method should not change the state of the training frame.

    * The ``fit`` method should set a ``drop_`` attribute holding the names
      of the columns to remove; ``transform`` refuses to run without it.

    * The transform method should return a copy of the test frame,
      dropping the columns identified as "bad" in the ``fit`` method.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.
    """

    def __init__(self, cols=None, as_df=True):
        # simple pass-through for the super constructor call
        super(BaseFeatureSelector, self).__init__(
            cols=cols, as_df=as_df)

    def transform(self, X):
        """Transform a test dataframe.

        Removes every column named in the fitted ``drop_`` attribute that
        is actually present in ``X``. Names in ``drop_`` that are absent
        from ``X`` do not raise; a ``UserWarning`` is issued instead.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X_select : pd.DataFrame or np.ndarray
            ``X`` without the dropped columns, i.e. with shape
            ``(n_samples, n_features - n_dropped)``. A ``DataFrame`` is
            returned when ``as_df`` is True, otherwise an ``ndarray``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the selector has no ``drop_`` attribute, i.e. ``fit`` has
            not been called yet (raised by ``check_is_fitted``).
        """
        check_is_fitted(self, 'drop_')

        # check on state of X and cols. Only the validated frame is needed
        # here, so the returned column list is deliberately discarded.
        X, _ = check_dataframe(X, self.cols)

        drop_columns = self.drop_  # type: list

        # if there's nothing to drop, hand the validated frame straight back
        if not drop_columns:
            return X if self.as_df else X.values

        # What if we don't want to throw a key error for a non-existent
        # column that we hope to drop anyways? Only drop the names that are
        # really present, and at least inform the user about the rest.
        colset = set(X.columns)
        drops = [name for name in drop_columns if name in colset]

        # for length mismatch, we know there's a missing column
        if len(drops) != len(drop_columns):
            # The argument is wrapped in a 1-tuple so that a tuple-valued
            # ``drop_`` is formatted as one object rather than being
            # unpacked as several format arguments (a TypeError).
            warnings.warn('one or more features to drop not contained '
                          'in input data feature names (drop=%r)'
                          % (drop_columns,), UserWarning)

        dropped = X.drop(drops, axis=1)
        return dataframe_or_array(dropped, self.as_df)