Source code for skutil.h2o.select

from __future__ import print_function, division, absolute_import
import warnings
import numpy as np
from sklearn.utils.validation import check_is_fitted
from ..feature_selection import filter_collinearity
from ..feature_selection.select import _near_zero_variance_ratio
from ..utils import is_numeric
from ..utils.fixes import is_iterable
from .base import (BaseH2OTransformer, check_frame, _retain_features, _frame_from_x_y)
from .frame import as_series

# Names exported when this module is star-imported.
__all__ = [
    'BaseH2OFeatureSelector',
    'H2OFeatureDropper',
    'H2OMulticollinearityFilterer',
    'H2ONearZeroVarianceFilterer',
    'H2OSparseFeatureDropper'
]


def _validate_use(X, use, na_warn):
    """Resolve the ``use`` string that a transformer should actually apply.

    Shared by H2OMulticollinearityFilterer and H2ONearZeroVarianceFilterer.
    The requested ``use`` is checked against the accepted values and, when
    ``na_warn`` is set and the frame holds NA values, it is overridden
    with 'complete.obs' (a warning is issued in that case).

    Parameters
    ----------

    X : ``H2OFrame``, shape=(n_samples, n_features)
        The frame to inspect for NA values. This is an internal helper,
        so the type of ``X`` is not validated. ``X`` is only touched when
        ``use`` is not 'complete.obs' and ``na_warn`` is truthy.

    use : str, one of ('complete.obs', 'all.obs', 'everything')
        The ``use`` argument passed to the transformer

    na_warn : bool
        If False, the provided ``use`` is returned untouched. If True and
        NA values are present, a warning is issued and 'complete.obs' is
        returned instead.

    Returns
    -------

    use : string
        The ``use`` string to apply

    Raises
    ------

    ValueError
        If ``use`` is not one of the accepted values.
    """
    allowed = ('complete.obs', 'all.obs', 'everything')
    if use not in allowed:
        raise ValueError('expected one of (%s) but got %s' % (', '.join(allowed), use))

    # Nothing to override when 'complete.obs' was requested, and NAs are
    # only inspected when the caller asked to be warned about them.
    if use == 'complete.obs' or not na_warn:
        return use

    n_missing = X.isna().sum()
    if n_missing > 0:
        warnings.warn('%i NA value(s) in frame; using "complete.obs"' % n_missing)
        return 'complete.obs'

    return use


class BaseH2OFeatureSelector(BaseH2OTransformer):
    """Base class for all H2O selectors.

    Provides the shared ``transform`` implementation: once a subclass has
    set the ``drop_`` attribute during fitting, ``transform`` returns the
    frame restricted to the columns that ``_retain_features`` keeps.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    min_version : str or float, optional (default='any')
        The minimum version of h2o that is compatible with the transformer

    max_version : str or float, optional (default=None)
        The maximum version of h2o that is compatible with the transformer


    .. versionadded:: 0.1.0
    """

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 min_version='any', max_version=None):
        super(BaseH2OFeatureSelector, self).__init__(feature_names=feature_names,
                                                     target_feature=target_feature,
                                                     exclude_features=exclude_features,
                                                     min_version=min_version,
                                                     max_version=max_version)

    def transform(self, X):
        """Transform the test frame, after fitting
        the transformer.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The test frame to transform


        Returns
        -------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The frame restricted to the retained columns
        """
        # the selector must have been fitted, i.e. ``drop_`` must exist
        check_is_fitted(self, 'drop_')

        # no defensive copy here; the column selection below builds the result
        X = check_frame(X, copy=False)

        retained = _retain_features(X, self.drop_)
        return X[retained]


class H2OFeatureDropper(BaseH2OFeatureSelector):
    """A very simple class to be used at the beginning or any stage of an
    H2OPipeline that will drop the given features from the remainder of the pipe.

    This is useful when you have many features, but only a few to drop.
    Rather than passing the feature_names arg as the delta between many
    features and the several to drop, this allows you to drop them and keep
    feature_names as None in future steps.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names to drop. If None, nothing is dropped.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``


    Attributes
    ----------

    drop_ : list (str)
        These are the features that will be dropped by
        the ``FeatureDropper``


    .. versionadded:: 0.1.0
    """

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None):
        super(H2OFeatureDropper, self).__init__(feature_names=feature_names,
                                                target_feature=target_feature,
                                                exclude_features=exclude_features)

    def fit(self, X):
        """Fit the H2OTransformer.

        The drop list is taken directly from ``feature_names``; ``X`` is
        not inspected by this method.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self

        Raises
        ------

        ValueError
            If ``feature_names`` is neither None nor an iterable.
        """
        names = self.feature_names
        if names is None:
            names = []  # nothing to drop

        # feature_names must be a list or some other iterable
        if not is_iterable(names):
            raise ValueError('expected iterable for feature_names')

        # store an independent list so later changes to feature_names
        # do not alter the fitted drop list
        self.drop_ = list(names)
        return self


class H2OSparseFeatureDropper(BaseH2OFeatureSelector):
    """Retains features that are less sparse (NA) than
    the provided threshold.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    threshold : float, optional (default=0.5)
        The threshold of sparsity above which to drop. Must be numeric
        and satisfy ``0 <= threshold < 1``.

    Attributes
    ----------

    sparsity_ : array_like, (n_cols,)
        The array of sparsity values

    drop_ : array_like
        The array of column names to drop


    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None, threshold=0.5):
        super(H2OSparseFeatureDropper, self).__init__(feature_names=feature_names,
                                                      target_feature=target_feature,
                                                      exclude_features=exclude_features,
                                                      min_version=self._min_version,
                                                      max_version=self._max_version)
        self.threshold = threshold

    def fit(self, X):
        """Fit the H2OTransformer.

        Computes, for each selected column, the number of NA entries
        divided by the number of rows, and marks for dropping every
        column whose value exceeds ``threshold``.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self

        Raises
        ------

        ValueError
            If ``threshold`` is not numeric or lies outside ``[0, 1)``.
        """
        X = check_frame(X, copy=False)  # gets copied below
        thresh = self.threshold

        # validate the threshold before doing any frame work so that a
        # bad parameter fails fast
        if not (is_numeric(thresh) and (0.0 <= thresh < 1.0)):
            raise ValueError('thresh must be a float between '
                             '0 (inclusive) and 1. Got %s' % str(thresh))

        # do copy
        frame = _frame_from_x_y(X, self.feature_names,
                                self.target_feature,
                                self.exclude_features)

        # per-column NA count divided by the row count, pulled into pandas
        na_fraction = (frame.isna().apply(lambda x: x.sum()) / frame.shape[0]).as_data_frame(use_pandas=True)
        na_fraction.columns = frame.columns

        # transpose so the column names become the index, then take the
        # column labelled 0 as a Series
        ser = na_fraction.T[0]

        self.drop_ = [str(x) for x in ser.index[ser > thresh]]
        self.sparsity_ = ser.values  # numpy array of sparsities

        return self


class H2OMulticollinearityFilterer(BaseH2OFeatureSelector):
    """Filter out features with a correlation greater than the provided threshold.
    When a pair of correlated features is identified, the mean absolute correlation (MAC)
    of each feature is considered, and the feature with the highest MAC is discarded.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    threshold : float, optional (default=0.85)
        The threshold above which to filter correlated features

    na_warn : bool, optional (default=True)
        Whether to warn if any NAs are present. When True and NAs are
        found while ``use`` is not 'complete.obs', 'complete.obs' is
        used for the fit instead.

    na_rm : bool, optional (default=False)
        Whether to remove NA values

    use : str, optional (default "complete.obs")
        One of {'complete.obs','all.obs','everything'}.
        A string indicating how to handle missing values.


    Attributes
    ----------

    drop_ : list, string
        The columns to drop

    mean_abs_correlations_ : list, float
        The corresponding mean absolute correlations of each drop_ name

    correlations_ : named tuple
        A list of tuples with each tuple containing the two correlated features,
        the level of correlation, the feature that was selected for dropping, and
        the mean absolute correlation of the dropped feature.


    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 threshold=0.85, na_warn=True, na_rm=False, use='complete.obs'):
        super(H2OMulticollinearityFilterer, self).__init__(feature_names=feature_names,
                                                           target_feature=target_feature,
                                                           exclude_features=exclude_features,
                                                           min_version=self._min_version,
                                                           max_version=self._max_version)
        self.threshold = threshold
        self.na_warn = na_warn
        self.na_rm = na_rm
        self.use = use

    def fit(self, X):
        """Fit the H2OTransformer.

        Delegates to ``fit_transform`` and discards the transformed frame.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self
        """
        self.fit_transform(X)
        return self

    def fit_transform(self, X):
        """Fit the multicollinearity filterer and
        return the transformed H2OFrame, X.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit

        Returns
        -------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The transformed training data

        Raises
        ------

        ValueError
            If ``use`` is not one of the accepted values.
        """
        X = check_frame(X, copy=False)  # copy below
        frame = _frame_from_x_y(X, self.feature_names, self.target_feature, self.exclude_features)

        # validate use, check NAs (may fall back to 'complete.obs')
        use = _validate_use(frame, self.use, self.na_warn)

        # Generate absolute correlation matrix as a pandas frame
        abs_corr = frame.cor(use=use, na_rm=self.na_rm).abs().as_data_frame(use_pandas=True)

        # label both axes with the feature names
        abs_corr.columns = frame.columns
        abs_corr.index = frame.columns

        # get drops list along with the supporting correlation details
        self.drop_, self.mean_abs_correlations_, self.correlations_ = filter_collinearity(abs_corr, self.threshold)
        return self.transform(X)


class H2ONearZeroVarianceFilterer(BaseH2OFeatureSelector):
    """Identify and remove any features that have a variance below
    a certain threshold. There are two possible strategies for near-zero
    variance feature selection:

      1) Select features on the basis of the actual variance they
         exhibit. This is only relevant when the features are real
         numbers.

      2) Remove features where the ratio of the frequency of the most
         prevalent value to that of the second-most frequent value is
         large, say 20 or above (Kuhn & Johnson[1]).

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    threshold : float, optional (default=1e-6)
        The threshold below which to declare "zero variance"

    na_warn : bool, optional (default=True)
        Whether to warn if any NAs are present

    na_rm : bool, optional (default=False)
        Whether to remove NA values

    use : str, optional (default "complete.obs")
        One of {'complete.obs','all.obs','everything'}
        A string indicating how to handle missing values.

    strategy : str, optional (default='variance')
        The strategy by which feature selection should be performed,
        one of ('variance', 'ratio'). If ``strategy`` is 'variance',
        features will be selected based on the amount of variance they
        exhibit; those that are low-variance (below ``threshold``) will
        be removed. If ``strategy`` is 'ratio', features are dropped if the
        most prevalent value is represented at a ratio greater than ``threshold``
        to the second-most frequent value. **Note** that if ``strategy`` is
        'ratio', ``threshold`` must be greater than 1.


    Attributes
    ----------

    drop_ : list, string
        The columns to drop

    var_ : dict
        The dropped columns mapped to their corresponding
        variances or ratios, depending on the ``strategy``


    References
    ----------

    .. [1] Kuhn, M. & Johnson, K. "Applied Predictive
           Modeling" (2013). New York, NY: Springer.


    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 threshold=1e-6, na_warn=True, na_rm=False, use='complete.obs',
                 strategy='variance'):
        super(H2ONearZeroVarianceFilterer, self).__init__(feature_names=feature_names,
                                                          target_feature=target_feature,
                                                          exclude_features=exclude_features,
                                                          min_version=self._min_version,
                                                          max_version=self._max_version)
        self.threshold = threshold
        self.na_warn = na_warn
        self.na_rm = na_rm
        self.use = use
        self.strategy = strategy

    def fit(self, X):
        """Fit the near zero variance filterer.

        Delegates to ``fit_transform`` and discards the transformed frame.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self
        """
        self.fit_transform(X)
        return self

    def fit_transform(self, X):
        """Fit the near zero variance filterer,
        return the transformed X frame.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The transformed training data

        Raises
        ------

        ValueError
            If ``use`` or ``strategy`` is not one of the accepted values,
            or if ``strategy`` is 'ratio' and ``threshold`` is not
            greater than 1.0.
        """
        X = check_frame(X, copy=False)  # copy in next line
        frame = _frame_from_x_y(X, self.feature_names, self.target_feature, self.exclude_features)

        # validate use, check NAs
        use = _validate_use(frame, self.use, self.na_warn)
        cols = np.asarray([str(n) for n in frame.columns])

        # validate strategy
        valid_strategies = ('variance', 'ratio')
        if self.strategy not in valid_strategies:
            raise ValueError('strategy must be one of {0}, but got {1}'.format(
                str(valid_strategies), self.strategy))

        if self.strategy == 'variance':
            # get covariance matrix, extract the diagonal (per-feature variances).
            # ``.values`` is used because ``DataFrame.as_matrix`` no longer
            # exists in current pandas releases.
            cov = frame.var(na_rm=self.na_rm, use=use).as_data_frame(use_pandas=True)
            diag = np.diagonal(cov.values)

            # create mask of the features whose variance is below threshold
            var_mask = diag < self.threshold

            self.drop_ = cols[var_mask].tolist()  # make list
            self.var_ = dict(zip(self.drop_, diag[var_mask]))
        else:
            # validate ratio
            ratio = self.threshold
            if not ratio > 1.0:
                raise ValueError('when strategy=="ratio", threshold must be greater than 1.0')

            # one row per feature; column 0 is read below as the ratio and
            # column 1 as the drop flag
            matrix = np.array([_near_zero_variance_ratio(as_series(frame[col]), ratio) for col in frame.columns])

            # builtin ``bool`` rather than the removed ``np.bool`` alias
            drop_mask = matrix[:, 1].astype(bool)
            self.drop_ = cols[drop_mask].tolist()
            self.var_ = dict(zip(self.drop_, matrix[drop_mask, 0].tolist()))

        return self.transform(X)