Source code for skoot.balance.over

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# The over-sampling balancer

from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_random_state
from sklearn.utils import safe_indexing

from .base import _validate_X_y_ratio_classes, _reorder
from ..utils import safe_vstack
import numpy as np

# Names exported by ``from skoot.balance.over import *``.
__all__ = ['over_sample_balance']


def over_sample_balance(X, y, balance_ratio=0.2, random_state=None,
                        shuffle=True):
    """Over sample a minority class to a specified ratio.

    One strategy for balancing data is to over-sample the minority class
    until it is represented at the prescribed ``balance_ratio``. While there
    is significant literature to show that this is not the best technique,
    and can sometimes lead to over-fitting, there are instances where it can
    work well.

    Every class other than the majority class is topped up, by re-drawing
    its own existing rows, until it reaches the target count derived from
    ``balance_ratio``. Classes that already meet or exceed that target are
    left untouched.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training array. Samples from this array will be resampled with
        replacement for the minority class.

    y : array-like, shape (n_samples,)
        Training labels corresponding to the samples in ``X``.

    balance_ratio : float, optional (default=0.2)
        The minimum acceptable ratio of ``$MINORITY_CLASS : $MAJORITY_CLASS``
        representation, where 0 < ``ratio`` <= 1

    random_state : int, None or numpy RandomState, optional (default=None)
        The seed to construct the random state to generate random selections.

    shuffle : bool, optional (default=True)
        Whether to shuffle the output.

    Returns
    -------
    X_bal, y_bal
        The over-sampled samples and their labels (in the original label
        space), as produced by ``_reorder``.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=1000, random_state=42,
    ...                            n_classes=2, weights=[0.99, 0.01])
    >>> X_bal, y_bal = over_sample_balance(X, y, balance_ratio=0.2,
    ...                                    random_state=42)
    >>> ratio = round((y_bal == 1).sum() / float((y_bal == 0).sum()), 1)
    >>> assert ratio == 0.2, ratio

    Note that the count of samples is now greater than it initially was:

    >>> assert X_bal.shape[0] > 1000
    """
    random_state = check_random_state(random_state)

    # validate before copying arrays around...
    X, y, n_classes, present_classes, \
        counts, majority_label, target_count = \
        _validate_X_y_ratio_classes(X, y, balance_ratio)

    # encode y as integers, in case the labels are not numeric; the original
    # labels are restored with ``inverse_transform`` before returning
    le = LabelEncoder()
    le.fit(present_classes)
    y_transform = le.transform(y)  # make numeric

    # we'll vstack/concatenate to these
    out_X, out_y = X.copy(), y_transform.copy()

    # iterate the present classes
    for label in present_classes:
        if label == majority_label:
            continue

        # get the transformed label
        label_transform = le.transform([label])[0]

        # A single pass draws from a permutation of the class's current rows
        # (i.e., without replacement), so it can add at most as many rows as
        # the class currently has. Repeat until the target is reached.
        while True:
            # use the out_X, out_y copies. Since we're oversampling,
            # it doesn't matter if we're drawing from the out_X matrix.
            # also, this way we can better keep track of how many we've drawn.
            mask = out_y == label_transform
            n_req = target_count - mask.sum()

            # terminal case. This must be "<=" rather than "==": a class that
            # already exceeds the target yields a negative n_req, and the
            # slice ``[:n_req]`` below would then keep appending rows forever.
            if n_req <= 0:
                break

            # draw a sample, take first n_req:
            idcs = np.arange(out_X.shape[0])[mask]  # get the idcs, mask them
            sample = safe_indexing(out_X,
                                   random_state.permutation(idcs)[:n_req])

            # vstack
            out_X = safe_vstack(out_X, sample)

            # concatenate. Use sample length, since it might be < n_req
            out_y = np.concatenate([
                out_y, np.full(sample.shape[0], label_transform,
                               dtype=out_y.dtype)])

    return _reorder(out_X, le.inverse_transform(out_y), random_state, shuffle)