Source code for skutil.h2o.one_way_fs

from __future__ import absolute_import, division, print_function
import numpy as np
import warnings
from abc import ABCMeta, abstractmethod
from scipy import special, stats
from sklearn.externals import six
from .split import *
from .select import BaseH2OFeatureSelector
from .util import _unq_vals_col
from .fixes import rbind_all
from ..utils import is_integer
from .base import (check_frame, _frame_from_x_y)
from ..base import overrides, since
from sklearn.utils import as_float_array

__all__ = [
    'h2o_f_classif',
    'h2o_f_oneway',
    'H2OFScoreKBestSelector',
    'H2OFScorePercentileSelector'
]


# This function is re-written from sklearn.feature_selection
# and is included for compatibility with older versions of
# sklearn that might raise an ImportError.
def _clean_nans(scores):
    scores = as_float_array(scores, copy=True)
    scores[np.isnan(scores)] = np.finfo(scores.dtype).min
    return scores
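
# Illustrative note: because NaN scores are replaced with the smallest
# representable float, they always rank last in any sort or percentile
# comparison. For example (result shown approximately):
#
#   _clean_nans(np.array([0.5, np.nan, 2.0]))
#   # -> array([0.5, -1.7976931348623157e+308, 2.0])
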


@since('0.1.2')
def h2o_f_classif(X, feature_names, target_feature):
    """Compute the ANOVA F-value for the provided sample.
    This method is adapted from ``sklearn.feature_selection.f_classif``
    to function on H2OFrames.

    Parameters
    ----------
    X : ``H2OFrame``, shape=(n_samples, n_features)
        The feature matrix. Each feature will be tested sequentially.

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    Returns
    -------
    f : float
        The computed F-value of the test.

    prob : float
        The associated p-value from the F-distribution.
    """
    frame = check_frame(X, copy=False)

    # first, get unique values of y
    y = X[target_feature]
    _, unq = _unq_vals_col(y)

    # if y is enum, make the unq strings..
    unq = unq[_] if not y.isfactor()[0] else [str(i) for i in unq[_]]

    # get the masks
    args = [frame[y == k, :][feature_names] for k in unq]
    f, prob = h2o_f_oneway(*args)
    return f, prob
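
# Example usage of ``h2o_f_classif`` (a minimal sketch; the h2o connection,
# frame and column names below are hypothetical and not part of this module):
#
#   import h2o
#   h2o.init()
#   frame = h2o.H2OFrame({'x1': [1.0, 2.0, 3.0, 4.0],
#                         'x2': [0.5, 0.1, 0.9, 0.7],
#                         'y':  ['a', 'a', 'b', 'b']})
#   f, prob = h2o_f_classif(frame, feature_names=['x1', 'x2'],
#                           target_feature='y')
#   # f and prob each contain one value per tested feature
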

# The following function is a rewriting (of the sklearn rewriting) of
# scipy.stats.f_oneway. Contrary to the scipy.stats.f_oneway implementation
# it does not copy the data while keeping the inputs unchanged. Furthermore,
# contrary to the sklearn implementation, it does not use np.ndarrays, rather
# amending 1d H2OFrames inplace.
@since('0.1.2')
def h2o_f_oneway(*args):
    """Performs a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Parameters
    ----------
    sample1, sample2, ... : array_like, H2OFrames, shape=(n_classes,)
        The sample measurements should be given as varargs (*args).
        A slice of the original input frame for each class in the
        target feature.

    Returns
    -------
    f : float
        The computed F-value of the test.

    prob : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal.
       This property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still
    be possible to use the Kruskal-Wallis H-test (``scipy.stats.kruskal``)
    although with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.
    See ``scipy.stats.f_oneway`` and ``sklearn.feature_selection.f_oneway``.

    References
    ----------
    .. [1] Lowry, Richard. "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W. Research Methods in Statistics. 2002.
    """
    n_classes = len(args)

    # sklearn converts everything to float here. Rather than do so,
    # we will test for total numericism and fail out if it's not 100%
    # numeric.
    if not all([all([X.isnumeric() for X in args])]):
        raise ValueError("All features must be entirely numeric for F-test")

    n_samples_per_class = [X.shape[0] for X in args]
    n_samples = np.sum(n_samples_per_class)

    # compute the sum of squared values in each column, and then compute the
    # column sums of all of those intermediate rows rbound together
    ss_alldata = rbind_all(*[
        X.apply(lambda x: (x * x).sum()) for X in args
    ]).apply(lambda x: x.sum())

    # compute the sum of each column for each X in args, then rbind them all
    # and sum them up, finally squaring them. Tantamount to the squared sum
    # of each complete column. Note that we need to add a tiny fraction to
    # ensure all are real numbers for the rbind...
    sum_args = [X.apply(lambda x: x.sum() + 1e-12).asnumeric() for X in args]  # col sums
    square_of_sums_alldata = rbind_all(*sum_args).apply(lambda x: x.sum())
    square_of_sums_alldata *= square_of_sums_alldata

    square_of_sums_args = [s * s for s in sum_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)

    ssbn = None  # h2o frame
    for k, _ in enumerate(args):
        tmp = square_of_sums_args[k] / n_samples_per_class[k]
        ssbn = tmp if ssbn is None else (ssbn + tmp)

    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)

    constant_feature_idx = (msw == 0)
    constant_feature_sum = constant_feature_idx.sum()  # sum of ones
    nonzero_size = (msb != 0).sum()
    if nonzero_size != msb.shape[1] and constant_feature_sum:
        warnings.warn("Features %s are constant."
                      % np.arange(msw.shape[1])[constant_feature_idx],
                      UserWarning)

    f = (msb / msw)

    # convert to numpy ndarray for special
    f = f.as_data_frame(use_pandas=True).iloc[0].values

    # compute prob
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
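
# For intuition, the statistic computed above is the classic one-way ANOVA
# F-value; on plain in-memory arrays the same quantity can be obtained with
# scipy (a minimal sketch with hypothetical sample data):
#
#   from scipy import stats
#   group_a = [1.1, 2.0, 1.8]
#   group_b = [2.9, 3.1, 3.3]
#   group_c = [5.0, 4.8, 5.2]
#   f_val, p_val = stats.f_oneway(group_a, group_b, group_c)
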

class _H2OBaseUnivariateSelector(six.with_metaclass(ABCMeta,
                                                    BaseH2OFeatureSelector)):
    """The base class for all univariate feature selectors in H2O.

    Parameters
    ----------
    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    cv : int or H2OBaseCrossValidator, optional (default=3)
        Univariate feature selection can very easily remove features
        erroneously or cause overfitting. Using cross validation, we can
        more confidently select the features to drop.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are normalized
        at the end by the number of observations in each fold

    min_version : str or float (default='any')
        The minimum version of h2o that is compatible with the transformer

    max_version : str or float (default=None)
        The maximum version of h2o that is compatible with the transformer
    """

    @abstractmethod
    def __init__(self, feature_names=None, target_feature=None,
                 exclude_features=None, cv=3, iid=True,
                 min_version='any', max_version=None):

        super(_H2OBaseUnivariateSelector, self).__init__(
            feature_names=feature_names,
            target_feature=target_feature,
            exclude_features=exclude_features,
            min_version=min_version,
            max_version=max_version)

        # validate CV
        self.cv = cv
        self.iid = iid


def _repack_tuple(two, one):
    """Utility for ``_test_and_score``. Packs the scores,
    p-values and train-fold length into a single, flat tuple.

    Parameters
    ----------
    two : tuple, shape=(2,)
        The scores & p-values tuple

    one : int
        The train fold length

    Returns
    -------
    out : tuple, shape=(3,)
        The flattened tuple: (F-scores, p-values, train-fold size)
    """
    return two[0], two[1], one


def _test_and_score(frame, fun, cv, feature_names, target_feature,
                    iid, select_fun):
    """Fit all the folds of some provided function, repack the scores
    tuple and adjust the fold score if ``iid`` is True.

    Parameters
    ----------
    frame : H2OFrame, shape=(n_samples, n_features)
        The frame to fit

    fun : callable
        The function to call

    cv : H2OBaseCrossValidator
        The cross validation class

    feature_names : array_like (str)
        The list of names on which to fit the transformer.

    target_feature : str
        The name of the target feature (is excluded from the fit)
        for the estimator.

    iid : bool
        Whether to consider each fold as IID. The fold scores are normalized
        at the end by the number of observations in each fold

    select_fun : callable
        The function used for feature selection

    Returns
    -------
    all_scores : np.ndarray
        The normalized scores

    all_pvalues : np.ndarray
        The normalized p-values

    list :
        The column names to drop
    """
    fn, tf = feature_names, target_feature
    if tf is None:
        raise ValueError('target_feature must be a string')

    scores = [
        _repack_tuple(fun(frame[train, :], feature_names=fn,
                          target_feature=tf),
                      len(train))
        for train, _ in cv.split(frame, tf)
    ]

    # compute the mean F-score, p-value, adjust with IID
    n_folds = cv.get_n_splits()
    all_scores = 0.
    all_pvalues = 0.

    # adjust the fold scores
    for these_scores, p_vals, fold_size in scores:
        if iid:
            these_scores *= fold_size
            p_vals *= fold_size
        all_scores += these_scores
        all_pvalues += p_vals

    if iid:
        all_scores /= frame.shape[0]
        all_pvalues /= frame.shape[0]
    else:
        all_scores /= float(n_folds)
        all_pvalues /= float(n_folds)

    # return tuple
    return all_scores, all_pvalues, select_fun(all_scores, all_pvalues, fn)


class _BaseH2OFScoreSelector(six.with_metaclass(ABCMeta,
                                                _H2OBaseUnivariateSelector)):
    """Select features based on the F-score, using the ``h2o_f_classif``
    method. Sub-classes will define how the number of features to retain
    is selected.

    Parameters
    ----------
    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    cv : int or H2OBaseCrossValidator, optional (default=3)
        Univariate feature selection can very easily remove features
        erroneously or cause overfitting. Using cross validation, we can
        more confidently select the features to drop.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are normalized
        at the end by the number of observations in each fold

    min_version : str or float, optional (default='any')
        The minimum version of h2o that is compatible with the transformer

    max_version : str or float, optional (default=None)
        The maximum version of h2o that is compatible with the transformer

    Attributes
    ----------
    scores_ : np.ndarray, float
        The score array, adjusted for ``n_folds``

    p_values_ : np.ndarray, float
        The p-value array, adjusted for ``n_folds``

    .. versionadded:: 0.1.2
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None,
                 exclude_features=None, cv=3, iid=True):

        super(_BaseH2OFScoreSelector, self).__init__(
            feature_names=feature_names,
            target_feature=target_feature,
            exclude_features=exclude_features,
            cv=cv, iid=iid,
            min_version=self._min_version,
            max_version=self._max_version)

    @abstractmethod
    def _select_features(self, all_scores, all_pvalues, feature_names):
        """This function should be overridden by subclasses, and should
        handle the selection of features given the scores and pvalues.

        Parameters
        ----------
        all_scores : np.ndarray (float)
            The scores

        all_pvalues : np.ndarray (float)
            The p-values

        feature_names : array_like (str)
            The list of names that are eligible for drop

        Returns
        -------
        list : the features to drop
        """
        raise NotImplementedError('must be implemented by subclass')

    def _fit(self, X):
        """Fit the F-score feature selector.

        Parameters
        ----------
        X : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to fit

        Returns
        -------
        self
        """
        # we can use this to extract the feature names to pass...
        feature_names = _frame_from_x_y(
            X=X, x=self.feature_names,
            y=self.target_feature,
            exclude_features=self.exclude_features).columns

        cv = check_cv(self.cv)

        # use the X frame (full frame) including target
        self.scores_, self.p_values_, self.drop_ = _test_and_score(
            frame=X, fun=h2o_f_classif, cv=cv,
            feature_names=feature_names,  # extracted above
            target_feature=self.target_feature,
            iid=self.iid, select_fun=self._select_features)

        return self
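
# The ``iid`` adjustment in ``_test_and_score`` above amounts to a weighted
# average of the per-fold statistics. A minimal numeric sketch (the fold
# scores, fold sizes and total row count are hypothetical):
#
#   import numpy as np
#   fold_scores = [np.array([4.0, 1.0]), np.array([6.0, 2.0])]
#   fold_sizes = [60, 40]          # training rows per fold
#   n_total = 100                  # frame.shape[0]
#   # iid=True: weight each fold by its training-fold size, then divide by
#   # the total number of rows in the frame
#   iid_avg = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / n_total
#   # iid=False: plain mean across the folds
#   plain_avg = sum(fold_scores) / float(len(fold_scores))
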

class H2OFScorePercentileSelector(_BaseH2OFScoreSelector):
    """Select the top percentile of features based on the F-score,
    using the ``h2o_f_classif`` method.

    Parameters
    ----------
    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    cv : int or H2OBaseCrossValidator, optional (default=3)
        Univariate feature selection can very easily remove features
        erroneously or cause overfitting. Using cross validation, we can
        more confidently select the features to drop.

    percentile : int, optional (default=10)
        The percent of features to keep.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are normalized
        at the end by the number of observations in each fold

    min_version : str or float, optional (default='any')
        The minimum version of h2o that is compatible with the transformer

    max_version : str or float, optional (default=None)
        The maximum version of h2o that is compatible with the transformer

    Attributes
    ----------
    scores_ : np.ndarray, float
        The score array, adjusted for ``n_folds``

    p_values_ : np.ndarray, float
        The p-value array, adjusted for ``n_folds``

    .. versionadded:: 0.1.2
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None,
                 exclude_features=None, cv=3, percentile=10, iid=True):

        super(H2OFScorePercentileSelector, self).__init__(
            feature_names=feature_names,
            target_feature=target_feature,
            exclude_features=exclude_features,
            cv=cv, iid=iid)

        self.percentile = percentile

    def fit(self, X):
        """Fit the F-score feature selector.

        Parameters
        ----------
        X : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to fit

        Returns
        -------
        self
        """
        if not is_integer(self.percentile):
            raise ValueError('percentile must be an integer')
        return self._fit(X)

    @overrides(_BaseH2OFScoreSelector)
    def _select_features(self, all_scores, all_pvalues, feature_names):
        """This function selects the top ``percentile`` of features
        from the F-scores.

        Parameters
        ----------
        all_scores : np.ndarray (float)
            The scores

        all_pvalues : np.ndarray (float)
            The p-values

        feature_names : array_like (str)
            The list of names that are eligible for drop

        Returns
        -------
        list : the features to drop
        """
        percentile = self.percentile

        # compute which features to keep or drop
        if percentile == 100:
            return []
        elif percentile == 0:
            return feature_names
        else:
            # adapted from sklearn.feature_selection.SelectPercentile
            all_scores = _clean_nans(all_scores)
            thresh = stats.scoreatpercentile(all_scores, 100 - percentile)

            mask = all_scores > thresh
            ties = np.where(all_scores == thresh)[0]
            if len(ties):
                max_feats = int(len(all_scores) * percentile / 100)
                kept_ties = ties[:max_feats - mask.sum()]
                mask[kept_ties] = True

            # inverse, since we're recording which features to DROP, not keep
            mask = np.asarray(~mask)

            # now set the drop as the inverse mask
            return (np.asarray(feature_names)[mask]).tolist()
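
# The tie handling in ``_select_features`` above mirrors sklearn's
# SelectPercentile: features strictly above the percentile threshold are kept,
# and ties at the threshold are kept only up to the feature budget. A minimal
# numpy sketch with hypothetical scores and percentile=50:
#
#   import numpy as np
#   from scipy import stats
#   scores = np.array([1.0, 3.0, 3.0, 5.0])
#   thresh = stats.scoreatpercentile(scores, 100 - 50)  # -> 3.0
#   mask = scores > thresh                # [False, False, False, True]
#   ties = np.where(scores == thresh)[0]  # [1, 2]
#   max_feats = int(len(scores) * 50 / 100)              # 2
#   mask[ties[:max_feats - mask.sum()]] = True           # keep one tied score
#   # kept features: indices 1 and 3 (the top 50%)
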

class H2OFScoreKBestSelector(_BaseH2OFScoreSelector):
    """Select the top ``k`` features based on the F-score,
    using the ``h2o_f_classif`` method.

    Parameters
    ----------
    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    cv : int or H2OBaseCrossValidator, optional (default=3)
        Univariate feature selection can very easily remove features
        erroneously or cause overfitting. Using cross validation, we can
        more confidently select the features to drop.

    k : int, optional (default=10)
        The number of features to keep.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are normalized
        at the end by the number of observations in each fold

    min_version : str or float, optional (default='any')
        The minimum version of h2o that is compatible with the transformer

    max_version : str or float, optional (default=None)
        The maximum version of h2o that is compatible with the transformer

    Attributes
    ----------
    scores_ : np.ndarray, float
        The score array, adjusted for ``n_folds``

    p_values_ : np.ndarray, float
        The p-value array, adjusted for ``n_folds``

    .. versionadded:: 0.1.2
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None,
                 exclude_features=None, cv=3, k=10, iid=True):

        super(H2OFScoreKBestSelector, self).__init__(
            feature_names=feature_names,
            target_feature=target_feature,
            exclude_features=exclude_features,
            cv=cv, iid=iid)

        self.k = k

    def fit(self, X):
        """Fit the F-score feature selector.

        Parameters
        ----------
        X : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to fit

        Returns
        -------
        self
        """
        if not (self.k == 'all' or (is_integer(self.k) and self.k > 0)):
            raise ValueError('k must be a positive integer or "all"')
        return self._fit(X)

    @overrides(_BaseH2OFScoreSelector)
    def _select_features(self, all_scores, all_pvalues, feature_names):
        """This function selects the top ``k`` features
        from the F-scores.

        Parameters
        ----------
        all_scores : np.ndarray (float)
            The scores

        all_pvalues : np.ndarray (float)
            The p-values

        feature_names : array_like (str)
            The list of names that are eligible for drop

        Returns
        -------
        list : the features to drop
        """
        k = self.k

        # compute which features to keep or drop
        if k == 'all':
            return []
        else:
            # adapted from sklearn.feature_selection.SelectKBest
            all_scores = _clean_nans(all_scores)
            mask = np.zeros(all_scores.shape, dtype=bool)
            mask[np.argsort(all_scores, kind="mergesort")[-k:]] = 1  # we know k > 0

            # inverse, since we're recording which features to DROP, not keep
            mask = np.asarray(~mask)

            # now set the drop as the inverse mask
            return (np.asarray(feature_names)[mask]).tolist()
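
# Example usage of ``H2OFScoreKBestSelector`` (a minimal sketch; the frame,
# the column names and the running h2o cluster are hypothetical and not part
# of this module):
#
#   selector = H2OFScoreKBestSelector(feature_names=['x1', 'x2', 'x3'],
#                                     target_feature='y', cv=3, k=2)
#   selector.fit(frame)
#   selector.scores_    # fold-adjusted F-scores, one per feature
#   selector.p_values_  # fold-adjusted p-values
#   selector.drop_      # names of the features selected for removal
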