# Source code for skutil.h2o.balance

from __future__ import absolute_import, division, print_function
import pandas as pd
from abc import ABCMeta
import warnings
from sklearn.externals import six
from skutil.base import overrides
from .transform import _flatten_one
from .util import reorder_h2o_frame, _gen_optimized_chunks, h2o_col_to_numpy
from .base import check_frame, BaseH2OFunctionWrapper
from ..preprocessing.balance import (_validate_ratio, _validate_target, _validate_num_classes,
                                     _OversamplingBalancePartitioner, _UndersamplingBalancePartitioner,
                                     BalancerMixin)

# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    'H2OOversamplingClassBalancer',
    'H2OUndersamplingClassBalancer'
]


def _validate_x_y_ratio(X, y, ratio):
    """Validate the target and ratio for balancing an ``H2OFrame`` and
    summarize the class distribution of the target column.

    Given that X is already a validated ``H2OFrame``, the imported
    ``_validate_*`` helpers are relied upon to check the following:

    1. That y is a string
    2. That the number of classes does not exceed _max_classes
       as defined by the BalancerMixin class
    3. That the number of classes is at least 2
    4. That ratio is a float that falls between 0.0 (exclusive) and
       1.0 (inclusive)

    Parameters
    ----------

    X : ``H2OFrame``, shape=(n_samples, n_features)
        The frame from which to sample

    y : str
        The name of the column that is the response class.
        This is the column on which ``value_counts`` will be
        executed to determine imbalance.

    ratio : float
        The ratio at which the balancing operation will
        be performed. Used to determine whether balancing is
        required.

    Returns
    -------

    out_tup : tuple, shape=(5,)
        a length-5 tuple with the following args:
            [0] - cts (dict), maps each class label to the number of
                  times it occurs in the target column.
            [1] - labels (pd.Index), the class labels sorted ascending
                  by their count (cast to str if the target is a factor).
            [2] - target (np.ndarray), the values of the target column
                  (cast to str if the target is a factor).
            [3] - n_classes, the value returned by
                  ``_validate_num_classes`` for the class counts.
            [4] - needs_balancing (bool), whether the least populated class
                  is represented at a rate lower than the demanded ratio.
    """
    # validate ratio, if the current ratio is >= the ratio, it's "balanced enough"
    ratio = _validate_ratio(ratio)
    y = _validate_target(y)  # cast to string type
    is_factor = _flatten_one(X[y].isfactor())  # is the target a factor?

    # NOTE: a UserWarning used to be issued here when the target is a factor
    # (H2O makes it difficult to assess equality between two factors, so
    # balancing works best when the target is an int; consider `asnumeric`).
    # That warning is intentionally disabled; factor targets are instead
    # handled below by casting the labels and target values to str.

    # generate cts. Have to get kludgier in h2o... then validate is < max classes
    # we have to do it this way, because H2O might treat the vals as enum, and we cannot
    # slice based on equality (dernit, H2O).
    target_col = pd.Series(h2o_col_to_numpy(X[y]))
    cts = target_col.value_counts().sort_values(ascending=True)
    n_classes = _validate_num_classes(cts)

    # cts is sorted ascending, so the first entry is the rarest class
    # and the last entry is the most common class
    needs_balancing = (cts.values[0] / cts.values[-1]) < ratio

    if not is_factor:
        labels = cts.index
        target = target_col.values
    else:
        labels = cts.index.astype('str')
        target = target_col.astype('str').values

    out_tup = (dict(zip(labels, cts.values)),  # cts
               labels,  # labels sorted ascending by commonality
               target,  # the target
               n_classes,
               needs_balancing)
    return out_tup


class _BaseH2OBalancer(six.with_metaclass(
        ABCMeta, BaseH2OFunctionWrapper, BalancerMixin)):
    """Abstract parent of the H2O class balancers.

    Passes ``target_feature``, ``min_version`` and ``max_version`` on to
    the parent constructor and stores the balancing options on the instance.

    Parameters
    ----------

    target_feature : str
        The name of the response column.

    ratio : float, optional (default=BalancerMixin._def_ratio)
        The target ratio of the minority records to the majority records.

    min_version : str, optional (default='any')
        Passed through to the parent constructor.

    max_version : str or None, optional (default=None)
        Passed through to the parent constructor.

    shuffle : bool, optional (default=True)
        Whether or not to shuffle rows on return. A ``DeprecationWarning``
        is issued at construction time whenever this is truthy.
    """

    def __init__(self, target_feature, ratio=BalancerMixin._def_ratio,
                 min_version='any', max_version=None, shuffle=True):
        super(_BaseH2OBalancer, self).__init__(
            target_feature=target_feature,
            min_version=min_version,
            max_version=max_version)

        self.ratio, self.shuffle = ratio, shuffle

        # nothing more to do unless shuffling was requested
        if not shuffle:
            return

        # this is a new warning
        shuffle_msg = ('Setting shuffle=True will eventually be deprecated, as H2O '
                       'does not allow re-ordering of frames by row. The current work-around '
                       '(rbinding the rows) is known to cause issues in the H2O ExprNode '
                       'cache for very large frames.')
        warnings.warn(shuffle_msg, DeprecationWarning)


class H2OOversamplingClassBalancer(_BaseH2OBalancer):
    """Oversample the minority classes until they are represented
    at the target proportion to the majority class.

    Parameters
    ----------

    target_feature : str
        The name of the response column. The response column must be
        more than a single class and less than
        ``skutil.preprocessing.balance.BalancerMixin._max_classes``

    ratio : float, optional (default=0.2)
        The target ratio of the minority records to the majority records. If the
        existing ratio is >= the provided ratio, the return value will merely be
        a copy of the input frame

    shuffle : bool, optional (default=True)
        Whether or not to shuffle rows on return


    Examples
    --------

    Consider the following example: with a ``ratio`` of 0.5, the
    minority classes (1, 2) will be oversampled until they are represented
    at a ratio of at least 0.5 * the prevalence of the majority class (0)

        >>> def example():
        ...     import h2o
        ...     import pandas as pd
        ...     import numpy as np
        ...     from skutil.h2o.frame import value_counts
        ...     from skutil.h2o import from_pandas
        ...
        ...     # initialize h2o
        ...     h2o.init()
        ...
        ...     # read into pandas
        ...     x = pd.DataFrame(np.concatenate([np.zeros(100), np.ones(30), np.ones(25)*2]), columns=['A'])
        ...
        ...     # load into h2o
        ...     X = from_pandas(x)
        ...
        ...     # initialize sampler
        ...     sampler = H2OOversamplingClassBalancer(target_feature="A", ratio=0.5)
        ...
        ...     # do balancing
        ...     X_balanced = sampler.balance(X)
        ...     value_counts(X_balanced)
        >>>
        >>> example() # doctest: +SKIP
        0    100
        1     50
        2     50
        Name A, dtype: int64

    .. versionadded:: 0.1.0
    """

    def __init__(self, target_feature, ratio=BalancerMixin._def_ratio, shuffle=True):
        # as of now, no min/max version; it's simply compatible with all...
        super(H2OOversamplingClassBalancer, self).__init__(
            target_feature=target_feature, ratio=ratio, shuffle=shuffle)

    @overrides(BalancerMixin)
    def balance(self, X):
        """Apply the oversampling balance operation. Oversamples
        the minority class to the provided ratio of minority
        class(es) : majority class.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The imbalanced dataset.

        Returns
        -------

        Xb : ``H2OFrame``, shape=(n_samples, n_features)
            The balanced H2OFrame
        """
        # check on state of X
        frame = check_frame(X, copy=False)

        # get the partitioner
        partitioner = _OversamplingBalancePartitioner(
            X=frame,
            y_name=self.target_feature,
            ratio=self.ratio,
            validation_function=_validate_x_y_ratio)

        sample_idcs = partitioner.get_indices(self.shuffle)

        # since H2O won't allow us to resample (it's considered rearranging)
        # we need to rbind at each point of duplication... this can be pretty
        # inefficient, so we might need to get clever about this...
        Xb = reorder_h2o_frame(frame, _gen_optimized_chunks(sample_idcs), from_chunks=True)
        return Xb
class H2OUndersamplingClassBalancer(_BaseH2OBalancer):
    """Undersample the majority class until it is represented
    at the target proportion to the most-represented minority class.

    Parameters
    ----------

    target_feature : str
        The name of the response column. The response column must be
        more than a single class and less than
        ``skutil.preprocessing.balance.BalancerMixin._max_classes``

    ratio : float, optional (default=0.2)
        The target ratio of the minority records to the majority records. If the
        existing ratio is >= the provided ratio, the return value will merely be
        a copy of the input frame

    shuffle : bool, optional (default=True)
        Whether or not to shuffle rows on return


    Examples
    --------

    Consider the following example: with a ``ratio`` of 0.5, the
    majority class (0) will be undersampled until the second most-populous
    class (1) is represented at a ratio of 0.5.

        >>> def example():
        ...     import h2o
        ...     import pandas as pd
        ...     import numpy as np
        ...     from skutil.h2o.frame import value_counts
        ...     from skutil.h2o import from_pandas
        ...
        ...     # initialize h2o
        ...     h2o.init()
        ...
        ...     # read into pandas
        ...     x = pd.DataFrame(np.concatenate([np.zeros(100), np.ones(30), np.ones(25)*2]), columns=['A'])
        ...
        ...     # load into h2o
        ...     X = from_pandas(x) # doctest:+ELLIPSIS
        ...
        ...     # initialize sampler
        ...     sampler = H2OUndersamplingClassBalancer(target_feature="A", ratio=0.5)
        ...
        ...     X_balanced = sampler.balance(X)
        ...     value_counts(X_balanced)
        ...
        >>> example() # doctest: +SKIP
        0    60
        1    30
        2    10
        Name A, dtype: int64

    .. versionadded:: 0.1.0
    """

    # H2O version bounds handed to the base class constructor
    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, target_feature, ratio=BalancerMixin._def_ratio, shuffle=True):
        super(H2OUndersamplingClassBalancer, self).__init__(
            target_feature=target_feature, ratio=ratio,
            min_version=self._min_version,
            max_version=self._max_version,
            shuffle=shuffle)

    @overrides(BalancerMixin)
    def balance(self, X):
        """Apply the undersampling balance operation. Undersamples
        the majority class to the provided ratio of minority
        class(es) : majority class

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The imbalanced dataset.

        Returns
        -------

        Xb : ``H2OFrame``, shape=(n_samples, n_features)
            The balanced H2OFrame
        """
        # check on state of X
        frame = check_frame(X, copy=False)

        # get the partitioner
        partitioner = _UndersamplingBalancePartitioner(
            X=frame,
            y_name=self.target_feature,
            ratio=self.ratio,
            validation_function=_validate_x_y_ratio)

        idcs = partitioner.get_indices(self.shuffle)

        # since there are no feature_names, we can just slice
        # the h2o frame as is, given the indices:
        if not self.shuffle:
            Xb = frame[idcs, :]
        else:
            # shuffled indices cannot be sliced directly; rebuild the
            # frame from chunks of indices instead
            Xb = reorder_h2o_frame(frame, _gen_optimized_chunks(idcs), from_chunks=True)
        return Xb