Source code for skoot.balance.under

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# The under-sampling balancer

from sklearn.utils.validation import check_random_state
import numpy as np

from .base import _validate_X_y_ratio_classes, _reorder
from ..utils.dataframe import safe_drop_samples

__all__ = [
    'under_sample_balance'
]


def under_sample_balance(X, y, balance_ratio=0.2, random_state=None,
                         shuffle=True):
    """Under-sample the majority class to a specified ratio.

    One strategy for balancing data is to under-sample the majority
    class until it is represented at the prescribed ``balance_ratio``.
    This can be effective in cases where the training set is already
    quite large, and diminishing its size may not prove detrimental.

    The under-sampling procedure behaves differently from the
    over-sampling technique: its objective is only to under-sample the
    *majority* class, and it will down-sample that class until the
    *second-most* represented class is present at the prescribed ratio.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training array. Samples from this array that correspond to
        the majority class will be omitted until the minority class is
        represented at the ``balance_ratio``.

    y : array-like, shape (n_samples,)
        Training labels corresponding to the samples in ``X``.

    balance_ratio : float, optional (default=0.2)
        The minimum acceptable ratio of ``$MINORITY_CLASS : $MAJORITY_CLASS``
        representation, where 0 < ``balance_ratio`` <= 1.

    random_state : int, None or numpy RandomState, optional (default=None)
        The seed used to construct the random state for generating
        random selections.

    shuffle : bool, optional (default=True)
        Whether to shuffle the output.

    Returns
    -------
    X : array-like, shape (n_samples, n_features)
        The under-sampled training array.

    y : array-like, shape (n_samples,)
        The under-sampled training labels.

    Notes
    -----
    Only use the under-sampling method when you have lots of data and
    can afford to lose some training samples. Moreover, using it in
    conjunction with a high-variance modeling method can pose a higher
    risk of over-fitting, since such methods typically require more
    data.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=1000, random_state=42,
    ...                            n_classes=2, weights=[0.99, 0.01])
    >>> X_bal, y_bal = under_sample_balance(X, y, balance_ratio=0.2,
    ...                                     random_state=42)
    >>> ratio = round((y_bal == 1).sum() / float((y_bal == 0).sum()), 1)
    >>> assert ratio == 0.2, ratio

    Note that the number of samples in the data is now lower than it
    initially was:

    >>> assert X_bal.shape[0] < 1000
    """
    random_state = check_random_state(random_state)

    # validate before copying arrays around...
    X, y, n_classes, present_classes, \
        counts, majority_label, _ = \
        _validate_X_y_ratio_classes(X, y, balance_ratio)

    # get the second-most populous count, compute the target count
    sorted_counts = np.sort(counts)
    if sorted_counts[-1] == sorted_counts[-2]:  # corner case: tied majority
        return _reorder(X, y, random_state, shuffle)
    target_count = max(int(sorted_counts[-2] / balance_ratio), 1)

    # select which rows gotta go...
    idcs = np.arange(X.shape[0])
    mask = (y == majority_label)  # type: np.ndarray

    # the target count can meet or exceed the majority count when the
    # ratio is already satisfied, so guard against a negative slice
    n_remove = max(mask.sum() - target_count, 0)
    remove = random_state.permutation(idcs[mask])[:n_remove]

    # remove them
    X = safe_drop_samples(X, remove)
    y = np.delete(y, remove)

    # reorder if needed
    return _reorder(X, y, random_state, shuffle)
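

# ---------------------------------------------------------------------
# Illustrative sketch (an editor's example, not part of skoot's API):
# a standalone check of the target-count arithmetic above, assuming
# hypothetical class counts of [990, 30, 10] and ``balance_ratio=0.2``.
# The second-most populous count is 30, so the majority class should be
# down-sampled to max(int(30 / 0.2), 1) == 150 rows, leaving a
# 30:150 == 0.2 ratio. All names below are local to the sketch and rely
# only on numpy.
if __name__ == '__main__':
    rs = np.random.RandomState(42)
    y_demo = np.array([0] * 990 + [1] * 30 + [2] * 10)

    counts_demo = np.bincount(y_demo)       # array([990,  30,  10])
    sorted_demo = np.sort(counts_demo)
    target_demo = max(int(sorted_demo[-2] / 0.2), 1)
    assert target_demo == 150               # majority: 990 -> 150 rows

    # mimic the row-selection logic on the label vector alone
    idcs_demo = np.arange(y_demo.shape[0])
    mask_demo = (y_demo == 0)
    drop = rs.permutation(
        idcs_demo[mask_demo])[:max(mask_demo.sum() - target_demo, 0)]
    y_demo = np.delete(y_demo, drop)

    assert (y_demo == 0).sum() == 150
    assert round((y_demo == 1).sum() / float((y_demo == 0).sum()), 1) == 0.2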