# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# The over-sampling balancer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_random_state
from sklearn.utils import safe_indexing
from .base import _validate_X_y_ratio_classes, _reorder
from ..utils import safe_vstack
import numpy as np
# Public API of this module.
__all__ = ['over_sample_balance']
def over_sample_balance(X, y, balance_ratio=0.2, random_state=None,
                        shuffle=True):
    """Over sample a minority class to a specified ratio.

    One strategy for balancing data is to over-sample the minority class
    until it is represented at the prescribed ``balance_ratio``. While there
    is significant literature to show that this is not the best technique,
    and can sometimes lead to over-fitting, there are instances where it can
    work well.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training array. Samples from this array will be resampled with
        replacement for the minority class.

    y : array-like, shape (n_samples,)
        Training labels corresponding to the samples in ``X``.

    balance_ratio : float, optional (default=0.2)
        The minimum acceptable ratio of ``$MINORITY_CLASS : $MAJORITY_CLASS``
        representation, where 0 < ``ratio`` <= 1

    random_state : int, None or numpy RandomState, optional (default=None)
        The seed to construct the random state to generate random selections.

    shuffle : bool, optional (default=True)
        Whether to shuffle the output.

    Returns
    -------
    X_bal : array-like, shape (n_samples_new, n_features)
        The input samples followed by the re-drawn minority-class samples.
        Final ordering is delegated to ``_reorder`` (see ``shuffle``).

    y_bal : array-like, shape (n_samples_new,)
        The labels corresponding to ``X_bal``, expressed in the original
        (un-encoded) label values.

    Notes
    -----
    Only classes with fewer than the target count of samples are
    over-sampled. The majority class, and any other class that already
    meets or exceeds the target count, is left untouched.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=1000, random_state=42,
    ...                            n_classes=2, weights=[0.99, 0.01])
    >>> X_bal, y_bal = over_sample_balance(X, y, balance_ratio=0.2,
    ...                                    random_state=42)
    >>> ratio = round((y_bal == 1).sum() / float((y_bal == 0).sum()), 1)
    >>> assert ratio == 0.2, ratio

    Note that the count of samples is now greater than it initially was:

    >>> assert X_bal.shape[0] > 1000
    """
    random_state = check_random_state(random_state)

    # validate before copying arrays around...
    X, y, n_classes, present_classes, \
        counts, majority_label, target_count = \
        _validate_X_y_ratio_classes(X, y, balance_ratio)

    # encode y, in case the labels are not numeric (the appended label
    # vectors are built as numeric arrays and decoded again on return)
    le = LabelEncoder()
    le.fit(present_classes)
    y_transform = le.transform(y)  # make numeric

    # we'll vstack/concatenate to these
    out_X, out_y = X.copy(), y_transform.copy()

    # iterate the present classes
    for label in present_classes:
        if label == majority_label:
            continue

        # get the transformed label
        label_transform = le.transform([label])[0]

        while True:
            # use the out_X, out_y copies. Since we're oversampling,
            # it doesn't matter if we're drawing from the out_X matrix.
            # also, this way we can better keep track of how many we've drawn.
            mask = out_y == label_transform
            n_req = target_count - mask.sum()

            # terminal case. Use "<=" rather than "==": a class that already
            # exceeds the target yields a negative n_req, and slicing with a
            # negative stop below would keep appending samples forever.
            if n_req <= 0:
                break

            # draw a sample, take first n_req:
            idcs = np.arange(out_X.shape[0])[mask]  # get the idcs, mask them
            sample = safe_indexing(out_X,
                                   random_state.permutation(idcs)[:n_req])

            # vstack
            out_X = safe_vstack(out_X, sample)

            # concatenate. Use sample length, since it might be < n_req
            # (a permutation can supply at most as many rows as currently
            # exist for this class, hence the enclosing loop).
            out_y = np.concatenate([
                out_y,
                np.full(sample.shape[0], label_transform, dtype=out_y.dtype)])

    return _reorder(out_X, le.inverse_transform(out_y), random_state, shuffle)