Source code for skoot.feature_selection.base

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>

from sklearn.utils.validation import check_is_fitted
from abc import ABCMeta
import six

from ..base import BasePDTransformer
from ..utils.validation import check_dataframe
from ..utils.dataframe import dataframe_or_array

import warnings

# Public API of this module: only the abstract selector base class is exported.
__all__ = ['BaseFeatureSelector']


class BaseFeatureSelector(six.with_metaclass(ABCMeta, BasePDTransformer)):
    """Base class for feature selectors.

    The base class for all skoot feature selectors, the
    ``BaseFeatureSelector`` should adhere to the following behavior:

    * The ``fit`` method should only fit the specified columns
      (since it's also a ``SelectiveMixin``), fitting all columns
      only when ``cols`` is None.

    * The ``fit`` method should not change the state of the training frame.

    * The ``fit`` method should set the ``drop_`` attribute: the collection
      of column names identified as "bad". ``transform`` refuses to run
      until that attribute exists.

    * The transform method should return a copy of the test frame,
      dropping the columns identified as "bad" in the ``fit`` method.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.
    """

    def __init__(self, cols=None, as_df=True):
        # simple pass-through for the super constructor call
        super(BaseFeatureSelector, self).__init__(
            cols=cols, as_df=as_df)

    def transform(self, X):
        """Transform a test dataframe.

        Drops every column named in ``self.drop_`` that is present in ``X``.
        Names in ``self.drop_`` that are missing from ``X`` do not raise;
        a single ``UserWarning`` is emitted instead.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The frame is first validated by
            ``check_dataframe`` and the result of that call is what gets
            transformed (presumably a copy of the input -- confirm against
            ``check_dataframe``).

        Returns
        -------
        X_select : pd.DataFrame or np.ndarray, shape=(n_samples, n_kept)
            ``X`` without the dropped columns. A ``DataFrame`` when
            ``as_df`` is True, otherwise the array form.

        Warns
        -----
        UserWarning
            If one or more names in ``drop_`` are not columns of ``X``.
        """
        check_is_fitted(self, 'drop_')

        # check on state of X and cols; the validated column list is not
        # needed here since only ``drop_`` decides what is removed
        X, _ = check_dataframe(X, self.cols)

        drop_columns = self.drop_  # type: list

        # Use an explicit None/length test rather than truthiness so that
        # array-like ``drop_`` values (ndarray, pd.Index) do not raise an
        # "ambiguous truth value" error.
        n_requested = 0 if drop_columns is None else len(drop_columns)

        # if there's nothing to drop
        if n_requested == 0:
            return X if self.as_df else X.values

        # Otherwise, there's something to drop. We do not want a KeyError
        # for a column that is already absent, so only drop what is present
        # and inform the user about the rest.
        present = set(X.columns)
        drops = [name for name in drop_columns if name in present]

        # for length mismatch, we know there's a missing column
        if len(drops) != n_requested:
            # Wrap the operand in a 1-tuple: a bare tuple ``drop_`` would
            # otherwise be unpacked as multiple format arguments and fail.
            warnings.warn('one or more features to drop not contained '
                          'in input data feature names (drop=%r)'
                          % (drop_columns,), UserWarning)

        dropped = X.drop(drops, axis=1)
        return dataframe_or_array(dropped, self.as_df)