Source code for skutil.h2o.transform


from __future__ import print_function, division, absolute_import
import numpy as np
from .base import BaseH2OTransformer, _frame_from_x_y, check_frame
from ..utils import is_numeric, flatten_all
from ..utils.fixes import is_iterable, dict_values
from ..preprocessing import ImputerMixin
from sklearn.externals import six
import pandas as pd
from sklearn.utils.validation import check_is_fitted

__all__ = [
    'H2OInteractionTermTransformer',
    'H2OSelectiveImputer',
    'H2OSelectiveScaler'
]


def _flatten_one(x):
    """There is a bug in some versions of h2o
    where a scalar is not returned by mean, but
    a list is. This will determine the proper 
    type for each item in the vec.
    """
    return x[0] if is_iterable(x) else x
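

# A tiny illustration of the helper above (not part of the original module,
# no H2O cluster needed): the function is a no-op for scalars and unwraps
# the single element of a list-like result.
def _example_flatten_one():  # illustration only
    assert _flatten_one(3.5) == 3.5
    assert _flatten_one([3.5]) == 3.5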


def _transform_col(col, val):
    """If an imputation value does not match column
    type, we'll get some errors. So this is going to manipulate
    the column type based on the value type. This is
    necessary as opposed to the opposite way because an 
    int column might still have a 'mean' fill. Thus, we'll
    just treat everything as float.
    """
    if dict_values(col.types)[0] in ('int', 'real'):
        return col.asnumeric(), float(val)

    # for enums, character, etc...
    return col, val
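

# A minimal sketch of why the cast above matters (not part of the original
# module): the frame and values are hypothetical, and a local H2O cluster is
# assumed to be reachable via h2o.init(). Imputing an int column with a float
# fill (e.g. a mean) first promotes the column to a real/float type.
def _example_transform_col():  # illustration only
    import h2o
    h2o.init()
    frame = h2o.H2OFrame({'age': [21, 35, 44]})  # parsed as an 'int' column
    col, val = _transform_col(frame['age'], 33.3)
    # 'col' is now numeric (real) and 'val' a plain float, so assigning
    # 'val' into 'col' cannot trip over mixed int/float types.
    return col, val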


class _H2OBaseImputer(BaseH2OTransformer, ImputerMixin):
    """A base class for all H2O imputers"""

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 min_version='any', max_version=None, def_fill=None):
        super(_H2OBaseImputer, self).__init__(feature_names=feature_names,
                                              target_feature=target_feature,
                                              exclude_features=exclude_features,
                                              min_version=min_version,
                                              max_version=max_version)
        self.fill_ = self._def_fill if def_fill is None else def_fill


def _mode(x, def_fill=ImputerMixin._def_fill):
    """Get the most common value in a 1d
    H2OFrame. Ties will be handled in a non-specified
    manner.

    Parameters
    ----------

    x : ``H2OFrame``, shape=(n_samples, 1)
        The 1d frame from which to derive the mode
    """
    idx = x.as_data_frame(use_pandas=True)[x.columns[0]].value_counts().index

    # if the most common value is null, return the next most common;
    # if there is none (i.e., the column is 100% null), return the def_fill
    if idx.shape[0] == 0 or pd.isnull(idx[0]):
        return idx[1] if idx.shape[0] > 1 else def_fill
    return idx[0]
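

# The pandas logic behind _mode, in isolation (not part of the original module,
# no H2O cluster needed): value_counts() orders values by frequency, so the
# first index entry is the most common non-null value.
def _example_mode_logic():  # illustration only
    s = pd.Series(['b', 'a', 'b', None])
    assert s.value_counts().index[0] == 'b'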


class H2OSelectiveImputer(_H2OBaseImputer):
    """The selective imputer provides extreme flexibility and
    simplicity in imputation tasks. Rather than imposing one
    strategy across an entire frame, different strategies can
    be mapped to respective features.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    def_fill : str, int or iterable, optional (default='mean')
        The fill strategy. If an int, the int value will be applied to
        all missing values in the H2OFrame. If a string, must be one of
        ('mean', 'median', 'mode') - note that 'mode' is still under
        development. If an iterable (list, tuple, array, etc.), the length
        must match the column dimensions. However, if a dict, the strategies
        will be applied to the mapped columns.


    Attributes
    ----------

    fill_val_ : int, float or iterable
        The fill value(s) provided or derived in the ``fit`` method.

    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 def_fill='mean'):

        super(H2OSelectiveImputer, self).__init__(feature_names=feature_names,
                                                  target_feature=target_feature,
                                                  exclude_features=exclude_features,
                                                  min_version=self._min_version,
                                                  max_version=self._max_version,
                                                  def_fill=def_fill)

    def fit(self, X):
        """Fit the imputer.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self
        """
        X = check_frame(X, copy=False)
        frame = _frame_from_x_y(X, self.feature_names, self.target_feature, self.exclude_features)

        # at this point, the entirety of frame can be operated on...
        cols = [str(u) for u in frame.columns]  # convert to string...

        # SHOULD we enforce this??...
        if any(frame.types[c] == 'enum' for c in cols):
            raise ValueError('can only impute numeric values')

        # validate the fill, do fit
        fill = self.fill_
        if isinstance(fill, six.string_types):
            fill = str(fill)
            if fill not in ('mode', 'mean', 'median'):
                raise TypeError('self.fill must be either "mode", "mean", "median", None, '
                                'a number, or an iterable. Got %s' % fill)

            if fill == 'mode':
                # for each column to impute, we go through and get the value counts
                # of each, sorting by the max...
                self.fill_val_ = dict(zip(cols, [_mode(X[c]) for c in cols]))

            elif fill == 'median':
                self.fill_val_ = dict(zip(cols, flatten_all([X[c].median(na_rm=True) for c in cols])))

            else:
                self.fill_val_ = dict(zip(cols, flatten_all([X[c].mean(na_rm=True) for c in cols])))

        elif is_iterable(fill):

            # if fill is a dict, assume its keys are the columns to impute
            if isinstance(fill, dict):
                cols, fill = zip(*fill.items())

            # the length of the iterable must match the number of columns
            if not len(fill) == len(cols):
                raise ValueError('len of fill does not match that of cols')

            # every entry must be a number or one of the known strategies
            if not all(is_numeric(i) or (isinstance(i, six.string_types) and i in ('mode', 'mean', 'median'))
                       for i in fill):
                raise TypeError('All values in self.fill must be numeric or in ("mode", "mean", "median"). '
                                'Got: %s' % ', '.join(str(i) for i in fill))

            d = {}
            for ind, c in enumerate(cols):
                f = fill[ind]

                if is_numeric(f):
                    # fill this column with a single value...
                    d[c] = f
                else:
                    the_col = X[c]
                    if f == 'mode':
                        d[c] = _mode(the_col)
                    elif f == 'median':
                        d[c] = _flatten_one(the_col.median(na_rm=True))
                    else:
                        d[c] = _flatten_one(the_col.mean(na_rm=True))

            self.fill_val_ = d

        else:
            if not is_numeric(fill):
                raise TypeError('self.fill must be either "mode", "mean", "median", None, '
                                'a number, or an iterable. Got %s' % str(fill))

            # a single numeric fill that will be applied to every column
            self.fill_val_ = fill

        return self

    def transform(self, X):
        """Transform an H2OFrame given the fit imputer.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The test data to transform.

        Returns
        -------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The transformed (imputed) test data.
        """
        check_is_fitted(self, 'fill_val_')
        X = check_frame(X, copy=True)  # make a copy

        # get the fills
        fill_val = self.fill_val_

        # we get the subset frame just to retrieve the column names; we affect
        # X in place anyway, so there is no use operating on the slice...
        frame = _frame_from_x_y(X, self.feature_names, self.target_feature)
        cols = [str(u) for u in frame.columns]  # the cols we'll ultimately impute
        X_columns = [str(u) for u in X.columns]  # used for index lookup

        # get the frame of NAs
        na_frame = frame.isna()
        na_frame.columns = cols

        # iter over cols
        is_int = isinstance(fill_val, int)  # is it a single int fill?
        for col in cols:
            if not is_int and col not in fill_val:
                # then it's a dict and this col doesn't exist in it...
                continue

            # if it's a single int, easy, otherwise query the dict
            col_imp_value = fill_val if is_int else fill_val[col]

            # reassign the column itself, as we might need to make it
            # a float column for imputation to avoid numpy int64 bug
            # X[col], col_imp_value = _transform_col(X[col], col_imp_value)

            # unfortunately, since we can't boolean index the
            # H2OFrame, we have to convert to pandas
            the_na_col_frame = na_frame[col]
            if not the_na_col_frame.sum():
                # if there are no missing values here, move on...
                continue

            the_na_col = the_na_col_frame.as_data_frame(use_pandas=True)[col]
            na_mask_idcs = the_na_col.index[the_na_col.astype(bool)].tolist()

            # if the mask is empty, move on - should be handled above...
            if not na_mask_idcs:
                continue

            # get the column index
            # col_idx = X_columns.index(col)
            # for na_row in na_mask_idcs:
            #     X[na_row, col_idx] = col_imp_value
            X[na_mask_idcs, col] = col_imp_value

        # return the copy
        return X
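

# A minimal usage sketch for H2OSelectiveImputer (not part of the original
# module): the frame, column names and fill choices below are hypothetical,
# and a local H2O cluster is assumed to be reachable via h2o.init(). A dict
# def_fill maps each column to its own strategy or constant.
def _example_selective_imputer():  # illustration only
    import h2o
    h2o.init()
    X = h2o.H2OFrame({'a': [1.0, None, 3.0], 'b': [2.0, 4.0, None]})
    imputer = H2OSelectiveImputer(def_fill={'a': 'median', 'b': 0})
    # the NA in 'a' becomes the column median (2.0); the NA in 'b' becomes 0
    return imputer.fit(X).transform(X)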


class H2OSelectiveScaler(BaseH2OTransformer):
    """A class that will scale selected features in the H2OFrame.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    with_mean : bool, optional (default=True)
        Whether to center the columns by subtracting the mean.

    with_std : bool, optional (default=True)
        Whether to scale the columns by dividing by the standard deviation.


    Attributes
    ----------

    means : dict (string:float)
        The mapping of column names to column means

    stds : dict (string:float)
        The mapping of column names to column standard deviations

    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 with_mean=True, with_std=True):
        super(H2OSelectiveScaler, self).__init__(feature_names=feature_names,
                                                 target_feature=target_feature,
                                                 exclude_features=exclude_features,
                                                 min_version=self._min_version,
                                                 max_version=self._max_version)

        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X):
        """Fit the transformer.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self
        """
        X = check_frame(X, copy=False)
        frame = _frame_from_x_y(X, self.feature_names, self.target_feature)
        self.cols_ = [str(i) for i in frame.columns]

        # get the mean and std
        if self.with_mean:
            self.means = dict(zip(self.cols_, flatten_all(frame.mean())))

        if self.with_std:
            self.stds = dict(zip(self.cols_, flatten_all(frame.sd())))

        return self

    def transform(self, X):
        """Do the transformation.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The test data to transform.

        Returns
        -------

        frame : ``H2OFrame``, shape=(n_samples, n_features)
            The transformed test data.
        """
        check_is_fitted(self, 'cols_')
        frame = check_frame(X, copy=True)  # get a copy...

        if (not self.with_mean) and (not self.with_std):
            return frame  # nothing to change...

        for nm in self.cols_:
            if self.with_mean:
                frame[nm] -= self.means[nm]
            if self.with_std:
                frame[nm] /= self.stds[nm]

        return frame
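

# A minimal usage sketch for H2OSelectiveScaler (not part of the original
# module): the frame and column names below are hypothetical, and a local H2O
# cluster is assumed to be reachable via h2o.init(). Only the named features
# are centered/scaled; every other column passes through untouched.
def _example_selective_scaler():  # illustration only
    import h2o
    h2o.init()
    X = h2o.H2OFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
    scaler = H2OSelectiveScaler(feature_names=['a'])  # scale 'a' only
    return scaler.fit(X).transform(X)  # 'b' is returned unchanged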


def _mul(a, b):
    """Multiplies two H2OFrame objects
    (no validation since internally used).

    Parameters
    ----------

    a : ``H2OFrame``, shape=(n_samples, 1)
        The first feature

    b : ``H2OFrame``, shape=(n_samples, 1)
        The second feature

    Returns
    -------

    ``a`` * ``b`` : H2OFrame
        The product of ``a`` and ``b``
    """
    return a * b


class H2OInteractionTermTransformer(BaseH2OTransformer):
    """A class that will generate interaction terms between selected columns.
    An interaction captures a relationship between two independent variables
    in the form of:

    :math:`I_{ij} = x_i * x_j`

    Note that the ``H2OInteractionTermTransformer`` will only operate on the
    ``feature_names``, and at the transform point will return ALL features
    plus the newly generated ones unless otherwise specified in the
    ``only_return_interactions`` parameter.

    Parameters
    ----------

    feature_names : array_like (str), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : iterable or None, optional (default=None)
        Any names that should be excluded from ``feature_names``

    interaction_function : callable, optional (default=None)
        A callable for interactions. The default None will result
        in multiplication of two Series objects.

    name_suffix : str, optional (default='I')
        The suffix to add to the new feature name in the form of
        <feature_x>_<feature_y>_<suffix>

    only_return_interactions : bool, optional (default=False)
        If set to True, will only return features in ``feature_names``
        and their respective generated interaction terms.


    Attributes
    ----------

    fun_ : callable
        The interaction term function assigned in the ``fit`` method.

    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
                 interaction_function=None, name_suffix='I', only_return_interactions=False):

        super(H2OInteractionTermTransformer, self).__init__(feature_names=feature_names,
                                                            target_feature=target_feature,
                                                            exclude_features=exclude_features,
                                                            min_version=self._min_version,
                                                            max_version=self._max_version)

        self.interaction_function = interaction_function
        self.name_suffix = name_suffix
        self.only_return_interactions = only_return_interactions

    def fit(self, frame):
        """Fit the transformer.

        Parameters
        ----------

        frame : ``H2OFrame``, shape=(n_samples, n_features)
            The training data on which to fit.

        Returns
        -------

        self
        """
        frame = _frame_from_x_y(frame, self.feature_names, self.target_feature, self.exclude_features)
        self.cols = [str(u) for u in frame.columns]  # the cols we'll ultimately operate on
        self.fun_ = self.interaction_function if self.interaction_function is not None else _mul

        # validate function
        if not hasattr(self.fun_, '__call__'):
            raise TypeError('require callable for interaction_function')

        # validate features
        if len(self.cols) < 2:
            raise ValueError('need at least two features')

        return self

    def transform(self, X):
        """Perform the interaction term expansion.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The test data to transform.

        Returns
        -------

        frame : ``H2OFrame``, shape=(n_samples, n_features)
            The expanded (interacted) test data.
        """
        check_is_fitted(self, 'fun_')
        frame = check_frame(X, copy=True)  # get a copy
        cols, fun, suff = self.cols, self.fun_, self.name_suffix
        n_features = len(cols)

        # these are the names to return if only_return_interactions
        interaction_names = [x for x in cols]

        # iterate over the N-choose-2 unique column pairs
        for i in range(n_features - 1):
            for j in range(i + 1, n_features):
                col_i, col_j = cols[i], cols[j]

                new_col_nm = '%s_%s_%s' % (col_i, col_j, suff)
                new_col = fun(frame[col_i], frame[col_j])
                new_col.columns = [new_col_nm]

                # add the new col name to the list of interaction names
                interaction_names.append(new_col_nm)

                # cbind the new column onto the growing frame
                frame = frame.cbind(new_col)

        # return the full matrix, or only the named features plus interactions
        return frame if not self.only_return_interactions else frame[interaction_names]
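

# A minimal usage sketch for H2OInteractionTermTransformer (not part of the
# original module): the frame and column names below are hypothetical, and a
# local H2O cluster is assumed to be reachable via h2o.init(). With the default
# settings, each pair of named features is multiplied into a new
# '<x>_<y>_I' column that is appended to the frame.
def _example_interaction_terms():  # illustration only
    import h2o
    h2o.init()
    X = h2o.H2OFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
    trans = H2OInteractionTermTransformer(feature_names=['a', 'b'])
    return trans.fit(X).transform(X)  # contains 'a', 'b' and the product column 'a_b_I'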