from __future__ import absolute_import, division, print_function
import pandas as pd
from abc import ABCMeta
import warnings
from sklearn.externals import six
from skutil.base import overrides
from .transform import _flatten_one
from .util import reorder_h2o_frame, _gen_optimized_chunks, h2o_col_to_numpy
from .base import check_frame, BaseH2OFunctionWrapper
from ..preprocessing.balance import (_validate_ratio, _validate_target, _validate_num_classes,
_OversamplingBalancePartitioner, _UndersamplingBalancePartitioner,
BalancerMixin)
# Public API of this module: only the two concrete H2O balancer classes
# are exported; the validation helper and the base class stay private.
__all__ = [
'H2OOversamplingClassBalancer',
'H2OUndersamplingClassBalancer'
]
def _validate_x_y_ratio(X, y, ratio):
    """Validate the target/ratio for an H2O balancing operation and
    compute the class counts the balance partitioners work from.

    The validation itself is delegated to the imported helpers
    ``_validate_ratio``, ``_validate_target`` and
    ``_validate_num_classes``. As documented for this function, they
    are expected to enforce:

      1. That y is a string
      2. That the number of classes does not exceed _max_classes
         as defined by the BalancerMixin class
      3. That the number of classes is at least 2
      4. That ratio is a float that falls between 0.0 (exclusive) and
         1.0 (inclusive)

    Parameters
    ----------
    X : ``H2OFrame``, shape=(n_samples, n_features)
        The frame from which to sample

    y : str
        The name of the column that is the response class.
        This is the column on which ``value_counts`` will be
        executed to determine imbalance.

    ratio : float
        The ratio at which the balancing operation will
        be performed. Used to determine whether balancing is
        required.

    Returns
    -------
    out_tup : tuple, shape=(5,)
        a length-5 tuple with the following args:

        [0] - cts (dict), maps each class label to its count. Built
              from the ascending-sorted ``value_counts`` of the target.
        [1] - labels (pd.Index), the class labels sorted ascending by
              commonality. Cast to ``str`` when the target is a factor.
        [2] - target (array), the values of the target column as pulled
              out of H2O. Cast to ``str`` when the target is a factor.
        [3] - n_classes, the value returned by ``_validate_num_classes``
              for the counts (presumably the number of unique classes).
        [4] - needs_balancing (bool), whether the least populated class
              is represented at a rate lower than the demanded ratio
              relative to the most populated class.
    """
    # validate ratio, if the current ratio is >= the ratio, it's "balanced enough"
    ratio = _validate_ratio(ratio)
    y = _validate_target(y)  # cast to string type

    # is the target a factor?
    is_factor = _flatten_one(X[y].isfactor())

    # NOTE: a UserWarning about balancing on a factor target (H2O makes it
    # difficult to assess equality between two factors; `asnumeric` is the
    # suggested work-around) used to be raised here and is currently
    # disabled. Factor targets are instead handled by casting the labels
    # and the target values to str below.

    # generate cts. Have to get kludgier in h2o... then validate is < max classes
    # we have to do it this way, because H2O might treat the vals as enum, and
    # we cannot slice based on equality, so the column is pulled into pandas.
    target_col = pd.Series(h2o_col_to_numpy(X[y]))
    cts = target_col.value_counts().sort_values(ascending=True)
    n_classes = _validate_num_classes(cts)

    # cts is sorted ascending: first is the rarest class, last the most common.
    # The module imports true division from __future__, so this is a float ratio.
    needs_balancing = (cts.values[0] / cts.values[-1]) < ratio

    if is_factor:
        index = cts.index.astype('str')
        target = target_col.astype('str').values
    else:
        index = cts.index
        target = target_col.values

    out_tup = (dict(zip(index, cts.values)),  # cts
               index,  # labels sorted ascending by commonality
               target,  # the target
               n_classes,
               needs_balancing)
    return out_tup
class _BaseH2OBalancer(six.with_metaclass(ABCMeta,
                                          BaseH2OFunctionWrapper,
                                          BalancerMixin)):
    """Abstract parent of the H2O balancers. Forwards the target
    feature and the min/max version bounds to the
    ``BaseH2OFunctionWrapper`` constructor and stores the
    balancing options on the instance.

    Parameters
    ----------
    target_feature : str
        The name of the response column; passed on to the
        ``BaseH2OFunctionWrapper`` constructor.

    ratio : float, optional (default=BalancerMixin._def_ratio)
        The balancing ratio; stored as ``self.ratio``.

    min_version : str, optional (default='any')
        Passed on to the ``BaseH2OFunctionWrapper`` constructor.

    max_version : str or None, optional (default=None)
        Passed on to the ``BaseH2OFunctionWrapper`` constructor.

    shuffle : bool, optional (default=True)
        Stored as ``self.shuffle``. A ``DeprecationWarning`` is
        issued whenever this is truthy.
    """

    def __init__(self, target_feature, ratio=BalancerMixin._def_ratio,
                 min_version='any', max_version=None, shuffle=True):

        super(_BaseH2OBalancer, self).__init__(
            target_feature=target_feature,
            min_version=min_version,
            max_version=max_version)

        self.ratio = ratio
        self.shuffle = shuffle

        # nothing to warn about unless shuffling was requested
        if not shuffle:
            return

        # this is a new warning
        shuffle_msg = ('Setting shuffle=True will eventually be deprecated, as H2O '
                       'does not allow re-ordering of frames by row. The current work-around '
                       '(rbinding the rows) is known to cause issues in the H2O ExprNode '
                       'cache for very large frames.')
        warnings.warn(shuffle_msg, DeprecationWarning)
class H2OOversamplingClassBalancer(_BaseH2OBalancer):
    """Oversample the minority classes until they are represented
    at the target proportion to the majority class.

    Parameters
    ----------
    target_feature : str
        The name of the response column. The response column must be
        more than a single class and less than
        ``skutil.preprocessing.balance.BalancerMixin._max_classes``

    ratio : float, optional (default=0.2)
        The target ratio of the minority records to the majority records. If the
        existing ratio is >= the provided ratio, the return value will merely be
        a copy of the input frame

    shuffle : bool, optional (default=True)
        Whether or not to shuffle rows on return


    Examples
    --------
    Consider the following example: with a ``ratio`` of 0.5, the
    minority classes (1, 2) will be oversampled until they are represented
    at a ratio of at least 0.5 * the prevalence of the majority class (0)

        >>> def example():
        ...     import h2o
        ...     import pandas as pd
        ...     import numpy as np
        ...     from skutil.h2o.frame import value_counts
        ...     from skutil.h2o import from_pandas
        ...
        ...     # initialize h2o
        ...     h2o.init()
        ...
        ...     # read into pandas
        ...     x = pd.DataFrame(np.concatenate([np.zeros(100), np.ones(30), np.ones(25)*2]), columns=['A'])
        ...
        ...     # load into h2o
        ...     X = from_pandas(x)
        ...
        ...     # initialize sampler
        ...     sampler = H2OOversamplingClassBalancer(target_feature="A", ratio=0.5)
        ...
        ...     # do balancing
        ...     X_balanced = sampler.balance(X)
        ...     value_counts(X_balanced)
        >>>
        >>> example() # doctest: +SKIP
        0    100
        1     50
        2     50
        Name A, dtype: int64

    .. versionadded:: 0.1.0
    """

    def __init__(self, target_feature, ratio=BalancerMixin._def_ratio, shuffle=True):
        # as of now, no min/max version; it's simply compatible with all...
        super(H2OOversamplingClassBalancer, self).__init__(
            target_feature=target_feature, ratio=ratio, shuffle=shuffle)

    @overrides(BalancerMixin)
    def balance(self, X):
        """Apply the oversampling balance operation. Oversamples
        the minority class to the provided ratio of minority
        class(es) : majority class.

        Parameters
        ----------
        X : ``H2OFrame``, shape=(n_samples, n_features)
            The imbalanced dataset.

        Returns
        -------
        Xb : ``H2OFrame``, shape=(n_samples, n_features)
            The balanced H2OFrame
        """
        # check on state of X
        frame = check_frame(X, copy=False)

        # get the partitioner
        partitioner = _OversamplingBalancePartitioner(
            X=frame, y_name=self.target_feature,
            ratio=self.ratio, validation_function=_validate_x_y_ratio)

        sample_idcs = partitioner.get_indices(self.shuffle)

        # since H2O won't allow us to resample (it's considered rearranging)
        # we need to rbind at each point of duplication... this can be pretty
        # inefficient, so we might need to get clever about this...
        Xb = reorder_h2o_frame(frame, _gen_optimized_chunks(sample_idcs), from_chunks=True)
        return Xb
class H2OUndersamplingClassBalancer(_BaseH2OBalancer):
    """Undersample the majority class until it is represented
    at the target proportion to the most-represented minority class.

    Parameters
    ----------
    target_feature : str
        The name of the response column. The response column must be
        more than a single class and less than
        ``skutil.preprocessing.balance.BalancerMixin._max_classes``

    ratio : float, optional (default=0.2)
        The target ratio of the minority records to the majority records. If the
        existing ratio is >= the provided ratio, the return value will merely be
        a copy of the input frame

    shuffle : bool, optional (default=True)
        Whether or not to shuffle rows on return


    Examples
    --------
    Consider the following example: with a ``ratio`` of 0.5, the
    majority class (0) will be undersampled until the second most-populous
    class (1) is represented at a ratio of 0.5.

        >>> def example():
        ...     import h2o
        ...     import pandas as pd
        ...     import numpy as np
        ...     from skutil.h2o.frame import value_counts
        ...     from skutil.h2o import from_pandas
        ...
        ...     # initialize h2o
        ...     h2o.init()
        ...
        ...     # read into pandas
        ...     x = pd.DataFrame(np.concatenate([np.zeros(100), np.ones(30), np.ones(25)*2]), columns=['A'])
        ...
        ...     # load into h2o
        ...     X = from_pandas(x) # doctest:+ELLIPSIS
        ...
        ...     # initialize sampler
        ...     sampler = H2OUndersamplingClassBalancer(target_feature="A", ratio=0.5)
        ...
        ...     X_balanced = sampler.balance(X)
        ...     value_counts(X_balanced)
        ...
        >>> example() # doctest: +SKIP
        0    60
        1    30
        2    10
        Name A, dtype: int64

    .. versionadded:: 0.1.0
    """

    # version bounds handed to the base-class constructor in __init__
    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, target_feature, ratio=BalancerMixin._def_ratio, shuffle=True):
        super(H2OUndersamplingClassBalancer, self).__init__(
            target_feature=target_feature, ratio=ratio, min_version=self._min_version,
            max_version=self._max_version, shuffle=shuffle)

    @overrides(BalancerMixin)
    def balance(self, X):
        """Apply the undersampling balance operation. Undersamples
        the majority class to the provided ratio of minority
        class(es) : majority class

        Parameters
        ----------
        X : ``H2OFrame``, shape=(n_samples, n_features)
            The imbalanced dataset.

        Returns
        -------
        Xb : ``H2OFrame``, shape=(n_samples, n_features)
            The balanced H2OFrame
        """
        # check on state of X
        frame = check_frame(X, copy=False)

        # get the partitioner
        partitioner = _UndersamplingBalancePartitioner(
            X=frame, y_name=self.target_feature, ratio=self.ratio,
            validation_function=_validate_x_y_ratio)

        idcs = partitioner.get_indices(self.shuffle)

        if self.shuffle:
            # shuffled indices amount to a row re-ordering, so the frame
            # is rebuilt through reorder_h2o_frame from chunks of indices
            Xb = reorder_h2o_frame(frame,
                                   _gen_optimized_chunks(idcs),
                                   from_chunks=True)
        else:
            # since there are no feature_names, we can just slice
            # the h2o frame as is, given the indices:
            Xb = frame[idcs, :]

        return Xb