# Source code for skoot.balance.smote

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# The SMOTE balancer

import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_random_state
from sklearn.utils import safe_indexing

from .base import _validate_X_y_ratio_classes
from ..utils import safe_vstack, safe_mask_samples

__all__ = [
    'smote_balance'
]


def _perturb(consider_vector, nearest, random_state):
    return (consider_vector - nearest) * \
        random_state.rand(nearest.shape[0], 1) + consider_vector


def _interpolate(consider_vector, nearest, _):
    # the danger here is that if there are not enough samples, we'll
    # interpolate with the same value. Hrmm. Maybe add some entropy? # todo
    return np.average((consider_vector, nearest), axis=0)


# Dispatch table mapping the user-facing ``strategy`` string to the
# synthetic-sample function. Defined AFTER _perturb/_interpolate so the
# names resolve at module-import time.
STRATEGIES = {
    'perturb': _perturb,
    'interpolate': _interpolate
}


def _nearest_neighbors_for_class(X, label, label_encoder, y_transform,
                                 target_count, random_state, strategy,
                                 n_neighbors, algorithm, leaf_size, p,
                                 metric, metric_params, n_jobs):
    """Oversample one class of ``X`` up to ``target_count`` rows.

    Selects the rows of ``X`` whose (encoded) label equals ``label``,
    fits a ``NearestNeighbors`` model on that subset, and appends
    synthetic rows generated by the chosen ``strategy`` function until
    the class is represented ``target_count`` times.

    Returns the (possibly grown) ``X``, the grown ``y_transform``, and
    the fitted ``NearestNeighbors`` model — or ``None`` in the model slot
    when the class already met the target and nothing was fit.
    """
    # resolve the synthetic-sample function ('perturb' or 'interpolate')
    func = STRATEGIES[strategy]

    # transform the label, get the subset
    transformed_label = label_encoder.transform([label])[0]
    X_sub = safe_mask_samples(X, y_transform == transformed_label)

    # make sure it's a numpy array for all of this...
    if isinstance(X_sub, pd.DataFrame):
        X_sub = X_sub.values

    # if the count >= the ratio, skip this label
    count = X_sub.shape[0]
    if count >= target_count:
        # nothing to synthesize; no model was fit for this class
        return X, y_transform, None

    # define the nearest neighbors model
    model = NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm,
                             leaf_size=leaf_size, p=p, metric=metric,
                             metric_params=metric_params, n_jobs=n_jobs)

    # get the observations that map to the transformed label
    amt_required = target_count - count

    # fit the model once, query the tree once. n_neighbors MUST
    # be ONE PLUS n_neighbors, since the zero'th index will always
    # be the index of the observation itself (i.e., obs 0 is its own
    # nearest neighbor).
    model.fit(X_sub)

    # draw the nearest neighbors ONCE. There is an interesting corner
    # case here... sklearn's nearest neighbors will draw the actual
    # observation as its own nearest neighbor so we need to query for k + 1,
    # and remove the first index (handled in the while loop).
    # k is capped at the subset size in case the class is very small.
    k_neighbors = min(X_sub.shape[0], n_neighbors + 1)
    nearest = model.kneighbors(X_sub, n_neighbors=k_neighbors,
                               return_distance=False)  # type: np.ndarray
    # candidate row positions within X_sub we may draw from
    indices = np.arange(count)

    # append the labels to y_transform - do this once to avoid the
    # overhead of repeated concatenations
    y_transform = np.concatenate([
        y_transform,
        np.ones(amt_required, dtype=np.int16) * transformed_label])

    # sample while under the requisite ratio
    while amt_required > 0:
        # randomly select some observations. since each selection will produce
        # n_neighbors synthetic points, take the first
        # amt_required // n_neighbors
        draw_count = max(1, int(round(amt_required / n_neighbors)))
        random_indices = random_state.permutation(indices)[:draw_count]

        # select the random sample, get nearest neighbors for the
        # sample indices. SHUFFLE, because of the next draw step.
        synthetic = random_state.permutation(np.vstack([
            func(X_sub[consideration, :],

                 # take 0th out (the observation vector under consideration):
                 X_sub[nearest[consideration][1:], :],
                 random_state)

            # using the language of smote... ("vector under consideration")
            for consideration in random_indices
        ]))

        # append to X. Since the round up earlier might cause a slight
        # error in count, make sure to truncate the synthetically-drawn
        # examples to amt_required
        if isinstance(X, pd.DataFrame):
            synthetic = pd.DataFrame.from_records(
                synthetic, columns=X.columns).iloc[:amt_required]
        else:
            synthetic = synthetic[:amt_required]
        X = safe_vstack(X, synthetic)

        # determine whether we need to recurse for this class (if
        # there were too few samples)
        amt_required -= synthetic.shape[0]

    return X, y_transform, model


def smote_balance(X, y, return_estimators=False, balance_ratio=0.2,
                  strategy='perturb', n_neighbors=5, algorithm='kd_tree',
                  leaf_size=30, p=2, metric='minkowski', metric_params=None,
                  n_jobs=1, random_state=None, shuffle=True):
    """Balance a dataset using SMOTE.

    Synthetic Minority Oversampling TEchnique (SMOTE) is a class balancing
    strategy that samples the k-Nearest Neighbors from each minority class,
    perturbs them with a random value between 0 and 1, and adds the
    difference between the original observation and the perturbation to the
    original observation to generate a synthetic observation. This is
    repeated until the minority classes are represented at a prescribed
    ratio to the majority class.

    Alternative methods involve interpolating the distance between the
    original observation and the nearest neighbors. This strategy can be
    set using the ``strategy`` arg (one of 'perturb' or 'interpolate').

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training array. Samples from the minority class(es) in this
        array will be interpolated until they are represented at
        ``balance_ratio``.

    y : array-like, shape (n_samples,)
        Training labels corresponding to the samples in ``X``.

    return_estimators : bool, optional (default=False)
        Whether or not to return the dictionary of fit
        :class:``sklearn.neighbors.NearestNeighbors`` instances. If True,
        the return value will be a tuple, with the first index being the
        balanced ``X`` matrix, the second index being the ``y`` values,
        and the third index being a dictionary of the fit estimators. If
        False, the return value is simply a tuple of the balanced ``X``
        matrix and the corresponding labels.

    balance_ratio : float, optional (default=0.2)
        The minimum acceptable ratio of ``$MINORITY_CLASS :
        $MAJORITY_CLASS`` representation, where 0 < ``ratio`` <= 1

    strategy : str, optional (default='perturb')
        The strategy used to construct synthetic examples from existing
        examples. The original SMOTE paper suggests a strategy by which a
        random value between 0 and 1 scales the difference between the
        nearest neighbors, and the difference is then added to the
        original vector. This is the default strategy, 'perturb'. Valid
        strategies include:

          * 'perturb' - a random value between 0 and 1 scales the
            difference between the nearest neighbors, and the difference
            is then added to the original vector.

          * 'interpolate' - the mean of the nearest neighbors and the
            original vector constitutes the synthetic example.

    n_neighbors : int, optional (default=5)
        Number of neighbors to use by default for ``kneighbors`` queries.
        This parameter is passed to each respective ``NearestNeighbors``
        call.

    algorithm : str or unicode, optional (default='kd_tree')
        Algorithm used to compute the nearest neighbors. One of
        {'auto', 'ball_tree', 'kd_tree', 'brute'}:

        - 'ball_tree' will use ``sklearn.neighbors.BallTree``
        - 'kd_tree' will use ``sklearn.neighbors.KDtree``
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to ``fit`` method.

        Note: fitting on sparse input will override the setting of this
        parameter, using brute force. This parameter is passed to each
        respective ``NearestNeighbors`` call.

    leaf_size : int, optional (default=30)
        Leaf size passed to ``BallTree`` or ``KDTree``. This can affect
        the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    p : integer, optional (default=2)
        Parameter for the Minkowski metric from
        ``sklearn.metrics.pairwise.pairwise_distances``. When p = 1, this
        is equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p,
        minkowski_distance (l_p) is used.

    metric : string or callable, optional (default='minkowski')
        Metric to use for distance computation. Any metric from
        scikit-learn or ``scipy.spatial.distance`` can be used. If metric
        is a callable function, it is called on each pair of instances
        (rows) and the resulting value recorded. The callable should take
        two arrays as input and return one value indicating the distance
        between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. Distance
        matrices are not supported. See the documentation for
        ``scipy.spatial.distance`` for details on valid metrics.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search. If
        ``-1``, then the number of jobs is set to the number of CPU
        cores. Affects only ``kneighbors`` and ``kneighbors_graph``
        methods.

    random_state : int or None, optional (default=None)
        The seed to construct the random state to generate random
        selections.

    shuffle : bool, optional (default=True)
        Whether to shuffle the output.

    Examples
    --------
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=1000, random_state=42,
    ...                            n_classes=2, weights=[0.99, 0.01])
    >>> X_bal, y_bal = smote_balance(X, y, balance_ratio=0.2,
    ...                              random_state=42)
    >>> ratio = round((y_bal == 1).sum() / float((y_bal == 0).sum()), 1)
    >>> assert ratio == 0.2, ratio

    Note that the count of samples is now greater than it initially was:

    >>> assert X_bal.shape[0] > 1000

    References
    ----------
    .. [1] N. Chawla, K. Bowyer, L. Hall, W. Kegelmeyer, "SMOTE:
           Synthetic Minority Over-sampling Technique"
           https://www.jair.org/media/953/live-953-2037-jair.pdf
    """
    # validate the cheap stuff before copying arrays around...
    X, y, n_classes, present_classes, \
        counts, majority_label, target_count = \
        _validate_X_y_ratio_classes(X, y, balance_ratio)

    # validate n_neighbors is at least one
    if n_neighbors < 1:
        raise ValueError('n_neighbors must be at least 1')
    if strategy not in STRATEGIES:
        raise ValueError('strategy must be one of %r' % STRATEGIES)

    # get the random state
    random_state = check_random_state(random_state)

    # encode y, in case they are not numeric (we need them to be for np.ones)
    le = LabelEncoder()
    le.fit(present_classes)
    y_transform = le.transform(y)  # make numeric

    # get the nearest neighbor models
    models = dict()
    for label in present_classes:
        # skip the pass to the method and continue if label is majority
        if label == majority_label:
            models[label] = None
            continue

        # X and y_transform will progressively be updated throughout the runs
        X, y_transform, models[label] = \
            _nearest_neighbors_for_class(
                X=X, label=label, label_encoder=le,
                y_transform=y_transform, target_count=target_count,
                random_state=random_state, strategy=strategy,
                n_neighbors=n_neighbors, algorithm=algorithm,
                leaf_size=leaf_size, p=p, metric=metric,
                metric_params=metric_params, n_jobs=n_jobs)

    # now that X, y_transform have been assembled, inverse_transform
    # the y_t back to its original state:
    y = le.inverse_transform(y_transform)

    # finally, shuffle both and return
    output_order = np.arange(X.shape[0])
    if shuffle:
        output_order = random_state.permutation(output_order)

    # select the output order, then return
    X, y = safe_indexing(X, output_order), y[output_order]
    if return_estimators:
        return X, y, models
    return X, y