Source code for skutil.feature_selection.select

# -*- coding: utf-8 -*-

from __future__ import print_function, division, absolute_import
from collections import namedtuple
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_is_fitted
from .base import _BaseFeatureSelector
from ..utils import validate_is_pd, is_numeric
from ..utils.fixes import _cols_if_none

# Public names exported by a star-import of this module. The
# underscore-prefixed helpers defined further down are not listed.
__all__ = [
    'FeatureDropper',
    'FeatureRetainer',
    'filter_collinearity',
    'MulticollinearityFilterer',
    'NearZeroVarianceFilterer',
    'SparseFeatureDropper'
]


def _validate_cols(cols):
    """Ensure an explicit column selection names at least two features.

    The ``MulticollinearityFilterer`` calls this before building its
    correlation matrix, since a pairwise comparison needs at least
    two columns.

    Parameters
    ----------

    cols : None or array_like, shape=(n_features,)
        The columns to evaluate. ``None`` is accepted as-is and
        skips the length check entirely.

    Raises
    ------

    ValueError
        If ``cols`` is not None and holds fewer than two entries.
    """
    # no explicit selection: nothing to check
    if cols is None:
        return

    n_features = len(cols)
    if n_features < 2:
        raise ValueError('too few features')


class SparseFeatureDropper(_BaseFeatureSelector):
    """Retains features that are less sparse (NaN) than the provided
    threshold. Useful in situations where matrices are too sparse to
    impute reliably.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Only the specified columns are evaluated
        for sparsity, so only they can end up in ``drop_``.

    threshold : float, optional (default=0.5)
        The threshold of sparsity above which features will be deemed
        "too sparse" and will be dropped. Must satisfy
        ``0 <= threshold < 1``, otherwise ``fit`` raises a ``ValueError``.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------

        >>> import numpy as np
        >>> import pandas as pd
        >>>
        >>> nan = np.nan
        >>> X = np.array([
        ...     [1.0, 2.0, nan],
        ...     [2.0, 3.0, nan],
        ...     [3.0, nan, 1.0],
        ...     [4.0, 5.0, nan]
        ... ])
        >>>
        >>> X = pd.DataFrame.from_records(data=X, columns=['a','b','c'])
        >>> dropper = SparseFeatureDropper(threshold=0.5)
        >>> X_transform = dropper.fit_transform(X)
        >>> assert X_transform.shape[1] == 2 # drop out last column

    Attributes
    ----------

    sparsity_ : array_like, shape=(n_features,)
        The fraction of missing values in each evaluated column, in
        the order the columns were evaluated.

    drop_ : array_like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.
    """

    def __init__(self, cols=None, threshold=0.5, as_df=True):
        super(SparseFeatureDropper, self).__init__(cols=cols, as_df=as_df)
        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self

        Raises
        ------

        ValueError
            If ``threshold`` is not numeric or lies outside ``[0, 1)``.
        """
        X, self.cols = validate_is_pd(X, self.cols)
        thresh = self.threshold

        # validate the threshold
        if not (is_numeric(thresh) and (0.0 <= thresh < 1.0)):
            raise ValueError('thresh must be a float between '
                             '0 (inclusive) and 1. Got %s' % str(thresh))

        # get cols
        cols = _cols_if_none(X, self.cols)

        # assess sparsity: fraction of missing values per evaluated column
        sparsity = X[cols].apply(lambda x: x.isnull().sum() / x.shape[0])
        self.sparsity_ = sparsity.values  # numpy array
        mask = self.sparsity_ > thresh  # numpy boolean array

        # The mask is aligned with the evaluated columns, not with every
        # column of X, so it must index the evaluated names. Indexing
        # X.columns breaks as soon as ``cols`` is a strict subset.
        self.drop_ = sparsity.index[mask].tolist()
        return self
class FeatureDropper(_BaseFeatureSelector):
    """A minimal selector that removes an explicit list of features.
    It can sit at the start, or at any later stage, of a Pipeline so
    that the named features never reach the remaining steps.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The features to drop. Note that ``FeatureDropper`` behaves
        slightly differently from all other ``_BaseFeatureSelector``
        classes in the sense that it will drop all of the features
        prescribed in this parameter. However, if ``cols`` is None,
        it will not drop any (which is counter to other classes,
        which will operate on all columns in the absence of an
        explicit ``cols`` parameter).

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------

        >>> import numpy as np
        >>> import pandas as pd
        >>>
        >>> X = pd.DataFrame.from_records(data=np.random.rand(3,3), columns=['a','b','c'])
        >>> dropper = FeatureDropper(cols=['a','b'])
        >>> X_transform = dropper.fit_transform(X)
        >>> assert X_transform.shape[1] == 1 # drop out first two columns

    Attributes
    ----------

    drop_ : array_like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.
    """

    def __init__(self, cols=None, as_df=True):
        super(FeatureDropper, self).__init__(cols=cols, as_df=as_df)

    def fit(self, X, y=None):
        """Fit the transformer.

        Nothing is learned from the data: the validated ``cols`` are
        simply recorded as the features to drop.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. It is only passed through
            ``validate_is_pd`` together with ``cols``.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # only the validated column names are needed; the frame is discarded
        _frame, checked_cols = validate_is_pd(X, self.cols)
        self.cols = checked_cols

        # every requested column is a column to drop
        self.drop_ = self.cols
        return self
class FeatureRetainer(_BaseFeatureSelector):
    """A very simple class to be used at the beginning of a Pipeline that will
    only propagate the given features throughout the remainder of the pipe

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns to retain. Every column of the frame
        that is not named here is removed by ``transform``. If ``cols``
        is None, all columns are retained.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------

        >>> import numpy as np
        >>> import pandas as pd
        >>>
        >>> X = pd.DataFrame.from_records(data=np.random.rand(3,3), columns=['a','b','c'])
        >>> retainer = FeatureRetainer(cols=['a','b'])
        >>> X_transform = retainer.fit_transform(X)
        >>> assert X_transform.shape[1] == 2 # retain first two columns

    Attributes
    ----------

    drop_ : array_like, shape=(n_features,)
        Assigned after calling ``fit``. These are the columns of the
        fitted frame that are not in ``cols``; it is empty when
        ``cols`` is None.
    """

    def __init__(self, cols=None, as_df=True):
        super(FeatureRetainer, self).__init__(cols=cols, as_df=as_df)

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)

        if self.cols is None:
            # transform() keeps every column in this case, so there is
            # nothing to report as dropped
            self.drop_ = []
        else:
            # the drop list is whatever is left once ``cols`` is removed
            self.drop_ = X.drop(self.cols, axis=1).columns.tolist()
        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform. Only the retained ``cols``
            are selected from it (all columns if ``cols`` is None).

        Returns
        -------

        retained : Pandas ``DataFrame`` or np.ndarray, shape=(n_samples, n_features)
            The test data restricted to the retained columns. A
            ``DataFrame`` if ``as_df`` is True, otherwise the
            underlying Numpy array.
        """
        check_is_fitted(self, 'drop_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)

        cols = X.columns if self.cols is None else self.cols
        retained = X[cols]  # if not cols, returns all

        # ``.values`` works on every pandas version, whereas
        # ``as_matrix()`` was removed in pandas 1.0
        return retained if self.as_df else retained.values
class _MCFTuple(namedtuple('_MCFTuple', 'feature_x feature_y abs_corr mac')):
    """Record describing one feature dropped by the collinearity filter.

    Fields, in order: ``feature_x`` (the dropped feature), ``feature_y``
    (the feature it was correlated with), ``abs_corr`` (their absolute
    correlation) and ``mac`` (the mean absolute correlation recorded
    for the drop).

    A plain namedtuple stores its attributes without a per-instance
    ``__dict__``. Subclassing it only to add ``__repr__`` would bring
    that ``__dict__`` back, so ``__slots__`` is declared empty: the
    subclass needs no storage beyond the tuple itself.
    """

    __slots__ = ()

    def __repr__(self):
        """Simple custom repr to summarize the main info"""
        # the tuple unpacks in field order: feature_x, feature_y, abs_corr, mac
        template = "Dropped: {0}, Corr_feature: {1}, abs_corr: {2:.5f}, MAC: {3:.5f}"
        return template.format(*self)
def filter_collinearity(c, threshold):
    """Performs the collinearity filtration for both the
    ``MulticollinearityFilterer`` as well as the
    ``H2OMulticollinearityFilterer``

    Columns are scanned in order. For the first feature whose largest
    correlation with another feature is at or above ``threshold``, the
    mean correlations of the two features involved are compared and the
    one with the higher mean is dropped. The scan then restarts on the
    reduced matrix until a full pass drops nothing. A matrix holding
    two or fewer features is never filtered.

    Parameters
    ----------

    c : pandas ``DataFrame``
        The pre-computed correlation matrix. This is expected to be
        a square matrix, and will raise a ``ValueError`` if it's not.
        The values are compared as given, so absolute correlations
        should be passed in. The frame itself is not modified.

    threshold : float
        The threshold at or above which to filter features which
        are multicollinear in nature.

    Returns
    -------

    drops : list (string), shape=(n_features,)
        The features that should be dropped

    macor : list (float), shape=(n_features,)
        For each drop, the larger of the two mean correlations
        that were compared.

    corrz : list (_MCFTuple), shape=(n_features,)
        The tuple containing all information on the
        collinearity metrics between each pairwise
        correlation.

    Raises
    ------

    ValueError
        If ``c`` is not square.
    """
    # ensure symmetric
    if c.shape[0] != c.shape[1]:
        raise ValueError('input dataframe should be symmetrical in dimensions')

    drops = []
    macor = []  # mean abs corrs
    corrz = []  # the correlations

    # Every drop restarts the scan from the first remaining column. The
    # ``for``/``else`` exits once a full pass makes no drop, which also
    # covers an empty matrix (the original looped forever there).
    while True:
        for nm in c.columns:
            # correlations of ``nm`` with every other feature, sorted
            # ascending with NaNs first so the maximum is the last entry
            this_col = c[nm].drop(nm).sort_values(na_position='first')

            # Zero or one other feature left: nothing is dropped. Checked
            # before indexing so a single-feature matrix cannot raise.
            if this_col.shape[0] <= 1:
                continue

            this_col_nms = this_col.index.tolist()
            this_col = np.array(this_col)

            # check if last (largest) value is over thresh
            max_cor = this_col[-1]
            if pd.isnull(max_cor) or max_cor < threshold:
                continue

            # otherwise, we know the corr is over the threshold;
            # fetch the partner feature's correlations as well
            other_col_nm = this_col_nms[-1]
            that_col = c[other_col_nm].drop(other_col_nm)

            # get the mean absolute correlations of each
            mn_1, mn_2 = np.nanmean(this_col), np.nanmean(that_col)

            # Both means cannot be NaN here: that would need every
            # correlation to be NaN, which the ``continue`` above
            # already filters out.
            if pd.isnull(mn_1):
                drop_nm = other_col_nm
            elif pd.isnull(mn_2):
                drop_nm = nm
            else:
                drop_nm = nm if mn_1 > mn_2 else other_col_nm

            # drop the bad col and row; the non-inplace drop rebinds the
            # local name and leaves the caller's frame untouched
            c = c.drop(drop_nm, axis=1).drop(drop_nm, axis=0)

            # record the drop
            drops.append(drop_nm)
            macor.append(np.maximum(mn_1, mn_2))
            corrz.append(_MCFTuple(
                feature_x=drop_nm,
                feature_y=other_col_nm if drop_nm == nm else nm,
                abs_corr=max_cor,
                mac=macor[-1]
            ))

            # restart the scan from the first remaining column
            break
        else:
            # a complete pass without any drop: done
            break

    return drops, macor, corrz
class MulticollinearityFilterer(_BaseFeatureSelector):
    """Remove features that are highly correlated with one another.
    Whenever a pair of features correlates at the provided threshold,
    the mean absolute correlation (MAC) of each of the two is
    considered, and the feature with the highest MAC is discarded.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. At least two columns are required.

    threshold : float, optional (default=0.85)
        The correlation threshold used to filter correlated features

    method : str, optional (default='pearson')
        The method used to compute the correlation,
        one of ['pearson','kendall','spearman'].

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------

    The following demonstrates a simple multicollinearity filterer
    applied to the iris dataset.

        >>> import pandas as pd
        >>> from skutil.utils import load_iris_df
        >>>
        >>> X = load_iris_df(include_tgt=False)
        >>> mcf = MulticollinearityFilterer(threshold=0.85)
        >>> mcf.fit_transform(X).head()
           sepal length (cm)  sepal width (cm)  petal width (cm)
        0                5.1               3.5               0.2
        1                4.9               3.0               0.2
        2                4.7               3.2               0.2
        3                4.6               3.1               0.2
        4                5.0               3.6               0.2

    Attributes
    ----------

    drop_ : array_like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.

    mean_abs_correlations_ : list, float
        The corresponding mean absolute correlations of each ``drop_`` name

    correlations_ : list of ``_MCFTuple`` instances
        Contains detailed info on multicollinear columns
    """

    def __init__(self, cols=None, threshold=0.85, method='pearson', as_df=True):
        super(MulticollinearityFilterer, self).__init__(cols=cols, as_df=as_df)
        self.threshold = threshold
        self.method = method

    def fit(self, X, y=None):
        """Fit the multicollinearity filterer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. Correlations are computed over the
            prescribed ``cols`` (see ``__init__``) or over every column
            if ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self

        Raises
        ------

        ValueError
            If fewer than two columns are selected.
        """
        # validate the frame and resolve the columns to evaluate
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)
        _validate_cols(cols)

        # absolute pairwise correlations between the selected columns
        correlations = X[cols].corr(method=self.method)
        abs_correlations = np.abs(correlations)

        # run the shared filtering routine and unpack its three results
        filtered = filter_collinearity(abs_correlations, self.threshold)
        self.drop_, self.mean_abs_correlations_, self.correlations_ = filtered
        return self
def _near_zero_variance_ratio(series, ratio):
    """Compute the frequency ratio used for ratio-based NZV filtering:
    the count of the most common value divided by the count of the
    second-most-common value.

    Parameters
    ----------

    series : pandas ``Series``, shape=(n_samples,)
        The series on which to compute ``value_counts``.

    ratio : float
        The cut-off. The feature is flagged for dropping when the
        computed ratio is greater than or equal to this value.

    Returns
    -------

    ratio_ : float
        The ratio of the most-prevalent value to the
        second-most-prevalent value. ``np.nan`` when the series
        holds fewer than two distinct counted values.

    drop_ : int
        Whether to keep the feature or drop it.
        1 if drop, 0 if keep.
    """
    frequencies = series.value_counts().sort_values(ascending=False)

    if frequencies.shape[0] >= 2:
        most_common = frequencies.iloc[0]
        runner_up = frequencies.iloc[1]
        observed = most_common / runner_up
        return observed, int(observed >= ratio)

    # fewer than two distinct values: no ratio exists, always drop
    return np.nan, 1
class NearZeroVarianceFilterer(_BaseFeatureSelector):
    """Identify and remove any features that have a variance below
    a certain threshold. There are two possible strategies for
    near-zero variance feature selection:

      1) Select features on the basis of the actual variance they
         exhibit. This is only relevant when the features are real
         numbers.

      2) Remove features where the ratio of the frequency of the most
         prevalent value to that of the second-most frequent value is
         large, say 20 or above (Kuhn & Johnson[1]).

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Only the specified columns are evaluated,
        so only they can end up in ``drop_``.

    threshold : float, optional (default=1e-6)
        The threshold below which to declare "zero variance". When
        ``strategy`` is 'ratio' this is the frequency-ratio cut-off
        instead and must be greater than 1, so the default value is
        rejected for that strategy.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    strategy : str, optional (default='variance')
        The strategy by which feature selection should be performed,
        one of ('variance', 'ratio'). If ``strategy`` is 'variance',
        features will be selected based on the amount of variance they
        exhibit; those that are low-variance (below ``threshold``) will
        be removed. If ``strategy`` is 'ratio', features are dropped if the
        most prevalent value is represented at a ratio greater than or equal to
        ``threshold`` to the second-most frequent value. **Note** that if
        ``strategy`` is 'ratio', ``threshold`` must be greater than 1.

    Examples
    --------

        >>> import pandas as pd
        >>> import numpy as np
        >>> from skutil.feature_selection import NearZeroVarianceFilterer
        >>>
        >>> X = pd.DataFrame.from_records(data=np.array([
        ...                                 [1,2,3],
        ...                                 [4,5,3],
        ...                                 [6,7,3],
        ...                                 [8,9,3]]),
        ...                               columns=['a','b','c'])
        >>> filterer = NearZeroVarianceFilterer(threshold=0.05)
        >>> filterer.fit_transform(X)
           a  b
        0  1  2
        1  4  5
        2  6  7
        3  8  9

    Attributes
    ----------

    drop_ : array_like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.

    var_ : list or dict
        If ``strategy`` is 'variance', a list with the variances of the
        dropped columns, in the same order as ``drop_``. If ``strategy``
        is 'ratio', a dict mapping each dropped column to its ratio.

    References
    ----------

    .. [1] Kuhn, M. & Johnson, K. "Applied Predictive
           Modeling" (2013). New York, NY: Springer.
    """

    def __init__(self, cols=None, threshold=1e-6, as_df=True, strategy='variance'):
        super(NearZeroVarianceFilterer, self).__init__(cols=cols, as_df=as_df)
        self.threshold = threshold
        self.strategy = strategy

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self

        Raises
        ------

        ValueError
            If ``strategy`` is not one of ('variance', 'ratio'), or if
            ``strategy`` is 'ratio' and ``threshold`` is not greater
            than 1.0.
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)

        # validate strategy
        valid_strategies = ('variance', 'ratio')
        if self.strategy not in valid_strategies:
            raise ValueError('strategy must be one of {0}, but got {1}'.format(
                str(valid_strategies), self.strategy))

        if self.strategy == 'variance':
            # if cols is None, applies over everything
            variances = X[cols].var()
            mask = (variances < self.threshold).values
            self.var_ = variances[mask].tolist()
            self.drop_ = variances.index[mask].tolist()
        else:
            # validate ratio
            ratio = self.threshold
            if not ratio > 1.0:
                raise ValueError('when strategy=="ratio", threshold must be greater than 1.0')

            # One (ratio, drop flag) row per column. The reshape keeps the
            # array two-dimensional even when no columns are selected.
            matrix = np.array(
                [_near_zero_variance_ratio(X[col], ratio) for col in cols],
                dtype=float
            ).reshape(-1, 2)

            # builtin ``bool`` rather than the ``np.bool`` alias, which
            # was removed in NumPy 1.24
            drop_mask = matrix[:, 1].astype(bool)
            self.drop_ = np.asarray(cols)[drop_mask].tolist()

            # map each dropped column to its ratio
            self.var_ = dict(zip(self.drop_, matrix[drop_mask, 0].tolist()))

        return self