from __future__ import absolute_import, division, print_function
import numpy as np
import warnings
from abc import ABCMeta, abstractmethod
from scipy import special, stats
from sklearn.externals import six
from .split import *
from .select import BaseH2OFeatureSelector
from .util import _unq_vals_col
from .fixes import rbind_all
from ..utils import is_integer
from .base import (check_frame, _frame_from_x_y)
from ..base import overrides, since
from sklearn.utils import as_float_array
__all__ = [
'h2o_f_classif',
'h2o_f_oneway',
'H2OFScoreKBestSelector',
'H2OFScorePercentileSelector'
]
# This function is re-written from sklearn.feature_selection
# and is included for compatibility with older versions of
# sklearn that might raise an ImportError.
def _clean_nans(scores):
scores = as_float_array(scores, copy=True)
scores[np.isnan(scores)] = np.finfo(scores.dtype).min
return scores
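# For illustration (hypothetical values): _clean_nans(np.array([0.5, np.nan]))
# replaces the NaN with np.finfo(float).min, so NaN scores sort last and are
# never preferred during feature selection.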
@since('0.1.2')
def h2o_f_classif(X, feature_names, target_feature):
"""Compute the ANOVA F-value for the provided sample.
This method is adapted from ``sklearn.feature_selection.f_classif``
to function on H2OFrames.
Parameters
----------
X : ``H2OFrame``, shape=(n_samples, n_features)
The feature matrix. Each feature will be tested
sequentially.
feature_names : array_like (str)
The list of feature names on which to compute the test.
target_feature : str
The name of the target feature. It is excluded from the test
itself; its unique values define the sample groups.
Returns
-------
f : float
The computed F-value of the test.
prob : float
The associated p-value from the F-distribution.
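Examples
--------
A minimal usage sketch, assuming a running H2O cluster and a
hypothetical local file ``iris.csv`` whose ``'species'`` column
is the (factor) target and whose remaining columns are numeric:
>>> import h2o  # doctest: +SKIP
>>> h2o.init()  # doctest: +SKIP
>>> frame = h2o.import_file('iris.csv')  # doctest: +SKIP
>>> x_names = [c for c in frame.columns if c != 'species']  # doctest: +SKIP
>>> f, prob = h2o_f_classif(frame, x_names, 'species')  # doctest: +SKIP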
"""
frame = check_frame(X, copy=False)
# first, get the unique values of y
y = frame[target_feature]
colname, unq = _unq_vals_col(y)
# if y is an enum (factor), make the unique values strings
unq = unq[colname] if not y.isfactor()[0] else [str(i) for i in unq[colname]]
# slice the frame into one sub-frame per class, then run the one-way ANOVA
args = [frame[y == k, :][feature_names] for k in unq]
f, prob = h2o_f_oneway(*args)
return f, prob
# The following function is a rewriting (of the sklearn rewriting) of
# scipy.stats.f_oneway. Contrary to the scipy.stats.f_oneway implementation,
# it does not copy the data while keeping the inputs unchanged. Furthermore,
# contrary to the sklearn implementation, it does not use np.ndarrays, but
# instead amends 1d H2OFrames in place.
@since('0.1.2')
def h2o_f_oneway(*args):
"""Performs a 1-way ANOVA.
The one-way ANOVA tests the null hypothesis that 2 or more groups have
the same population mean. The test is applied to samples from two or
more groups, possibly with differing sizes.
Parameters
----------
sample1, sample2, ... : H2OFrame
The sample measurements, given as varargs (``*args``): one
slice of the original input frame for each class in the
target feature (n_classes frames in total).
Returns
-------
f : float
The computed F-value of the test.
prob : float
The associated p-value from the F-distribution.
Notes
-----
The ANOVA test has important assumptions that must be satisfied in order
for the associated p-value to be valid.
1. The samples are independent
2. Each sample is from a normally distributed population
3. The population standard deviations of the groups are all equal. This
property is known as homoscedasticity.
If these assumptions are not true for a given set of data, it may still be
possible to use the Kruskal-Wallis H-test (``scipy.stats.kruskal``) although
with some loss of power.
The algorithm is from Heiman[2], pp.394-7.
See ``scipy.stats.f_oneway`` and ``sklearn.feature_selection.f_oneway``.
References
----------
.. [1] Lowry, Richard. "Concepts and Applications of Inferential
Statistics". Chapter 14.
http://faculty.vassar.edu/lowry/ch14pt1.html
.. [2] Heiman, G.W. Research Methods in Statistics. 2002.
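Examples
--------
A minimal sketch, assuming a running H2O cluster, a hypothetical
H2OFrame ``frame`` with a factor target ``'y'`` (levels ``'a'``,
``'b'``, ``'c'``) and a list of numeric feature names ``x_names``:
>>> slices = [frame[frame['y'] == k, :][x_names]
...           for k in ('a', 'b', 'c')]  # doctest: +SKIP
>>> f, prob = h2o_f_oneway(*slices)  # doctest: +SKIP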
"""
n_classes = len(args)
# sklearn converts everything to float here. Rather than do so,
# we test that every column of every arg is numeric, and raise
# if any is not.
if not all(all(X.isnumeric()) for X in args):
raise ValueError("All features must be entirely numeric for F-test")
n_samples_per_class = [X.shape[0] for X in args]
n_samples = np.sum(n_samples_per_class)
# compute the sum of squared values in each column, rbind the
# intermediate one-row results together, and take their column sums
ss_alldata = rbind_all(*[X.apply(lambda x: (x*x).sum()) for X in args]).apply(lambda x: x.sum())
# compute the sum of each column for each X in args, rbind them all
# and sum them up, finally squaring the result. Tantamount to the
# squared sum of each complete column. Note that we need to add a
# tiny epsilon to ensure all values are real numbers for the rbind...
sum_args = [X.apply(lambda x: x.sum() + 1e-12).asnumeric() for X in args] # col sums
square_of_sums_alldata = rbind_all(*sum_args).apply(lambda x: x.sum())
square_of_sums_alldata *= square_of_sums_alldata
square_of_sums_args = [s*s for s in sum_args]
sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
ssbn = None # h2o frame
for k, _ in enumerate(args):
tmp = square_of_sums_args[k] / n_samples_per_class[k]
ssbn = tmp if ssbn is None else (ssbn + tmp)
ssbn -= square_of_sums_alldata / float(n_samples)
sswn = sstot - ssbn
dfbn = n_classes - 1
dfwn = n_samples - n_classes
msb = ssbn / float(dfbn)
msw = sswn / float(dfwn)
constant_feature_idx = (msw == 0)
constant_feature_sum = constant_feature_idx.sum()  # count of constant features
nonzero_size = (msb != 0).sum()
if nonzero_size != msb.shape[1] and constant_feature_sum:
# pull the boolean mask local, since an H2OFrame cannot index a numpy array
const_mask = constant_feature_idx.as_data_frame(use_pandas=True).values[0].astype(bool)
warnings.warn("Features %s are constant." % np.arange(msw.shape[1])[const_mask], UserWarning)
f = (msb / msw)
# convert to numpy ndarray for special
f = f.as_data_frame(use_pandas=True).iloc[0].values
# compute prob
prob = special.fdtrc(dfbn, dfwn, f)
return f, prob
class _H2OBaseUnivariateSelector(six.with_metaclass(ABCMeta, BaseH2OFeatureSelector)):
"""The base class for all univariate feature selectors in H2O.
Parameters
----------
feature_names : array_like (str), optional (default=None)
The list of names on which to fit the transformer.
target_feature : str, optional (default=None)
The name of the target feature (is excluded from the fit)
for the estimator.
exclude_features : iterable or None, optional (default=None)
Any names that should be excluded from ``feature_names``
cv : int or H2OBaseCrossValidator, optional (default=3)
Univariate feature selection can very easily remove
features erroneously or cause overfitting. Using cross
validation, we can more confidently select the features
to drop.
iid : bool, optional (default=True)
Whether to consider each fold as IID. The fold scores
are normalized at the end by the number of observations
in each fold
min_version : str or float (default='any')
The minimum version of h2o that is compatible with the transformer
max_version : str or float (default=None)
The maximum version of h2o that is compatible with the transformer
"""
@abstractmethod
def __init__(self, feature_names=None, target_feature=None,
exclude_features=None, cv=3, iid=True,
min_version='any', max_version=None):
super(_H2OBaseUnivariateSelector, self).__init__(
feature_names=feature_names, target_feature=target_feature,
exclude_features=exclude_features, min_version=min_version,
max_version=max_version)
# validate CV
self.cv = cv
self.iid = iid
def _repack_tuple(two, one):
"""Utility for ``_test_and_score``.
Packs the scores, p-values and train-fold length
into a single, flat tuple.
Parameters
----------
two : tuple, shape=(2,)
The scores & p-values tuple
one : int
The train fold length
Returns
-------
out : tuple, shape=(3,)
The flattened tuple: (F-scores, p-values,
train-fold size)
"""
return two[0], two[1], one
def _test_and_score(frame, fun, cv, feature_names, target_feature, iid, select_fun):
"""Fit all the folds of some provided function, repack the scores
tuple and adjust the fold score if ``iid`` is True.
Parameters
----------
frame : H2OFrame, shape=(n_samples, n_features)
The frame to fit
fun : callable
The function to call
cv : H2OBaseCrossValidator
The cross validation class
feature_names : array_like (str)
The list of names on which to fit the transformer.
target_feature : str
The name of the target feature (is excluded from the fit)
for the estimator.
iid : bool
Whether to consider each fold as IID. The fold scores
are normalized at the end by the number of observations
in each fold
select_fun : callable
The function used for feature selection
Returns
-------
all_scores : np.ndarray
The normalized scores
all_pvalues : np.ndarray
The normalized p-values
drop : list
The column names to drop, as computed by ``select_fun``
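Notes
-----
When ``iid`` is True, each fold's scores and p-values are weighted
by the size of that fold's train set and the totals are divided by
the number of rows in ``frame``; otherwise the simple mean over folds
is returned. As a hypothetical illustration with two folds over a
100-row frame, train sizes 60 and 40, and F-scores 2.0 and 3.0 for
some feature: ``iid=True`` yields (2.0*60 + 3.0*40) / 100 = 2.4,
while ``iid=False`` yields (2.0 + 3.0) / 2 = 2.5.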
"""
fn, tf = feature_names, target_feature
if tf is None:
raise ValueError('target_feature must be a string')
scores = [
_repack_tuple(fun(frame[train, :],
feature_names=fn,
target_feature=tf),
len(train))
for train, _ in cv.split(frame, tf)
]
# compute the mean F-score, p-value, adjust with IID
n_folds = cv.get_n_splits()
all_scores = 0.
all_pvalues = 0.
# adjust the fold scores
for these_scores, p_vals, fold_size in scores:
if iid:
these_scores *= fold_size
p_vals *= fold_size
all_scores += these_scores
all_pvalues += p_vals
if iid:
all_scores /= frame.shape[0]
all_pvalues /= frame.shape[0]
else:
all_scores /= float(n_folds)
all_pvalues /= float(n_folds)
# return tuple
return all_scores, all_pvalues, select_fun(all_scores, all_pvalues, fn)
class _BaseH2OFScoreSelector(six.with_metaclass(ABCMeta,
_H2OBaseUnivariateSelector)):
"""Select features based on the F-score, using the
``h2o_f_classif`` method. Sub-classes will define how the
number of features to retain is selected.
Parameters
----------
feature_names : array_like (str), optional (default=None)
The list of names on which to fit the transformer.
target_feature : str, optional (default=None)
The name of the target feature (is excluded from the fit)
for the estimator.
exclude_features : iterable or None, optional (default=None)
Any names that should be excluded from ``feature_names``
cv : int or H2OBaseCrossValidator, optional (default=3)
Univariate feature selection can very easily remove
features erroneously or cause overfitting. Using cross
validation, we can more confidently select the features
to drop.
iid : bool, optional (default=True)
Whether to consider each fold as IID. The fold scores
are normalized at the end by the number of observations
in each fold
Attributes
----------
scores_ : np.ndarray, float
The score array, adjusted for ``n_folds``
p_values_ : np.ndarray, float
The p-value array, adjusted for ``n_folds``
.. versionadded:: 0.1.2
"""
_min_version = '3.8.2.9'
_max_version = None
def __init__(self, feature_names=None, target_feature=None,
exclude_features=None, cv=3, iid=True):
super(_BaseH2OFScoreSelector, self).__init__(
feature_names=feature_names, target_feature=target_feature,
exclude_features=exclude_features, cv=cv,
iid=iid, min_version=self._min_version,
max_version=self._max_version)
@abstractmethod
def _select_features(self, all_scores, all_pvalues, feature_names):
"""This function should be overridden by subclasses, and
should handle the selection of features given the scores
and pvalues.
Parameters
----------
all_scores : np.ndarray (float)
The scores
all_pvalues : np.ndarray (float)
The p-values
feature_names : array_like (str)
The list of names that are eligible for drop
Returns
-------
list : the features to drop
"""
raise NotImplementedError('must be implemented by subclass')
def _fit(self, X):
"""Fit the F-score feature selector.
Parameters
----------
X : H2OFrame, shape=(n_samples, n_features)
The training frame on which to fit
Returns
-------
self
"""
# we can use this to extract the feature names to pass...
feature_names = _frame_from_x_y(
X=X, x=self.feature_names, y=self.target_feature,
exclude_features=self.exclude_features).columns
cv = check_cv(self.cv)
# use the X frame (full frame) including target
self.scores_, self.p_values_, self.drop_ = _test_and_score(
frame=X, fun=h2o_f_classif, cv=cv,
feature_names=feature_names, # extracted above
target_feature=self.target_feature,
iid=self.iid, select_fun=self._select_features)
return self
class H2OFScorePercentileSelector(_BaseH2OFScoreSelector):
"""Select the top percentile of features based on the F-score,
using the ``h2o_f_classif`` method.
Parameters
----------
feature_names : array_like (str), optional (default=None)
The list of names on which to fit the transformer.
target_feature : str, optional (default=None)
The name of the target feature (is excluded from the fit)
for the estimator.
exclude_features : iterable or None, optional (default=None)
Any names that should be excluded from ``feature_names``
cv : int or H2OBaseCrossValidator, optional (default=3)
Univariate feature selection can very easily remove
features erroneously or cause overfitting. Using cross
validation, we can more confidently select the features
to drop.
percentile : int, optional (default=10)
The percent of features to keep.
iid : bool, optional (default=True)
Whether to consider each fold as IID. The fold scores
are normalized at the end by the number of observations
in each fold
Attributes
----------
scores_ : np.ndarray, float
The score array, adjusted for ``n_folds``
p_values_ : np.ndarray, float
The p-value array, adjusted for ``n_folds``
.. versionadded:: 0.1.2
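Examples
--------
A minimal sketch, assuming a running H2O cluster and a hypothetical
H2OFrame ``frame`` whose ``'species'`` column is the factor target,
and assuming the inherited ``transform`` drops the unselected columns:
>>> selector = H2OFScorePercentileSelector(target_feature='species',
...                                        percentile=50)  # doctest: +SKIP
>>> kept = selector.fit(frame).transform(frame)  # doctest: +SKIP
>>> selector.drop_  # names of the dropped features  # doctest: +SKIP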
"""
_min_version = '3.8.2.9'
_max_version = None
def __init__(self, feature_names=None, target_feature=None,
exclude_features=None, cv=3, percentile=10, iid=True):
super(H2OFScorePercentileSelector, self).__init__(
feature_names=feature_names, target_feature=target_feature,
exclude_features=exclude_features, cv=cv, iid=iid)
self.percentile = percentile
def fit(self, X):
"""Fit the F-score feature selector.
Parameters
----------
X : H2OFrame, shape=(n_samples, n_features)
The training frame on which to fit
Returns
-------
self
"""
if not is_integer(self.percentile):
raise ValueError('percentile must be an integer')
return self._fit(X)
@overrides(_BaseH2OFScoreSelector)
def _select_features(self, all_scores, all_pvalues, feature_names):
"""This function selects the top ``percentile`` of
features from the F-scores.
Parameters
----------
all_scores : np.ndarray (float)
The scores
all_pvalues : np.ndarray (float)
The p-values
feature_names : array_like (str)
The list of names that are eligible for drop
Returns
-------
list : the features to drop
"""
percentile = self.percentile
# compute which features to keep or drop
if percentile == 100:
return []
elif percentile == 0:
return feature_names
else:
# adapted from sklearn.feature_selection.SelectPercentile
all_scores = _clean_nans(all_scores)
thresh = stats.scoreatpercentile(all_scores, 100 - percentile)
mask = all_scores > thresh
ties = np.where(all_scores == thresh)[0]
if len(ties):
max_feats = int(len(all_scores) * percentile / 100)
kept_ties = ties[:max_feats - mask.sum()]
mask[kept_ties] = True
# invert, since we're recording which features to DROP, not keep
mask = np.asarray(~mask)
# now set the drop list to the inverse mask
return (np.asarray(feature_names)[mask]).tolist()
class H2OFScoreKBestSelector(_BaseH2OFScoreSelector):
"""Select the top ``k`` features based on the F-score,
using the ``h2o_f_classif`` method.
Parameters
----------
feature_names : array_like (str), optional (default=None)
The list of names on which to fit the transformer.
target_feature : str, optional (default=None)
The name of the target feature (is excluded from the fit)
for the estimator.
exclude_features : iterable or None, optional (default=None)
Any names that should be excluded from ``feature_names``
cv : int or H2OBaseCrossValidator, optional (default=3)
Univariate feature selection can very easily remove
features erroneously or cause overfitting. Using cross
validation, we can more confidently select the features
to drop.
k : int, optional (default=10)
The number of features to keep.
iid : bool, optional (default=True)
Whether to consider each fold as IID. The fold scores
are normalized at the end by the number of observations
in each fold
Attributes
----------
scores_ : np.ndarray, float
The score array, adjusted for ``n_folds``
p_values_ : np.ndarray, float
The p-value array, adjusted for ``n_folds``
.. versionadded:: 0.1.2
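Examples
--------
A minimal sketch, assuming a running H2O cluster and a hypothetical
H2OFrame ``frame`` whose ``'species'`` column is the factor target:
>>> selector = H2OFScoreKBestSelector(target_feature='species',
...                                   k=2)  # doctest: +SKIP
>>> selector.fit(frame)  # doctest: +SKIP
>>> selector.drop_  # all eligible features except the top 2  # doctest: +SKIP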
"""
_min_version = '3.8.2.9'
_max_version = None
def __init__(self, feature_names=None, target_feature=None,
exclude_features=None, cv=3, k=10, iid=True):
super(H2OFScoreKBestSelector, self).__init__(
feature_names=feature_names, target_feature=target_feature,
exclude_features=exclude_features, cv=cv, iid=iid)
self.k = k
def fit(self, X):
"""Fit the F-score feature selector.
Parameters
----------
X : H2OFrame, shape=(n_samples, n_features)
The training frame on which to fit
Returns
-------
self
"""
if not (self.k == 'all' or (is_integer(self.k) and self.k > 0)):
raise ValueError('k must be a positive integer or "all"')
return self._fit(X)
@overrides(_BaseH2OFScoreSelector)
def _select_features(self, all_scores, all_pvalues, feature_names):
"""This function selects the top ``k`` features
from the F-scores.
Parameters
----------
all_scores : np.ndarray (float)
The scores
all_pvalues : np.ndarray (float)
The p-values
feature_names : array_like (str)
The list of names that are eligible for drop
Returns
-------
list : the features to drop
"""
k = self.k
# compute which features to keep or drop
if k == 'all':
return []
else:
# adapted from sklearn.feature_selection.SelectKBest
all_scores = _clean_nans(all_scores)
mask = np.zeros(all_scores.shape, dtype=bool)
mask[np.argsort(all_scores, kind="mergesort")[-k:]] = True  # we know k > 0
# invert, since we're recording which features to DROP, not keep
mask = np.asarray(~mask)
# now set the drop list to the inverse mask
return (np.asarray(feature_names)[mask]).tolist()