Source code for skoot.preprocessing.binning

# -*- coding: utf-8 -*-
#
# Author: Taylor G Smith <taylor.smith@alkaline-ml.com>
#
# Bin your continuous features.

import six
from joblib import Parallel, delayed
from sklearn.utils.validation import check_is_fitted

import numpy as np
import pandas as pd

from ..base import BasePDTransformer
from ..utils.iterables import chunk
from ..utils.dataframe import dataframe_or_array
from ..utils.validation import (check_dataframe, validate_test_set_columns,
                                type_or_iterable_to_col_mapping)
from ..utils.metaestimators import timed_instance_method

__all__ = [
    'BinningTransformer'
]


def _validate_n_bins(x, n):
    """Compute the distinct values of ``x`` and make sure there are enough.

    Parameters
    ----------
    x : array-like
        The values to inspect.

    n : int
        The requested number of bins.

    Returns
    -------
    levels : np.ndarray
        The unique values found in ``x`` (as returned by ``np.unique``).

    counts : np.ndarray
        The number of occurrences of each unique value.

    Raises
    ------
    ValueError
        If ``x`` holds fewer than ``n`` unique values.
    """
    levels, counts = np.unique(x, return_counts=True)

    # cannot carve out more bins than there are distinct values
    n_levels = levels.shape[0]
    if n_levels < n:
        raise ValueError("Fewer unique values than bins!")

    return levels, counts


def _uniform(x, n):
    """Create bins by splitting the unique values of ``x`` into ``n`` chunks.

    Parameters
    ----------
    x : array-like
        The feature values to bin.

    n : int
        The number of bins to create.

    Returns
    -------
    _Bins
        The bins built from the chunked unique values.

    Raises
    ------
    ValueError
        If ``x`` has fewer than ``n`` unique values (raised by
        ``_validate_n_bins``).
    """
    # only the unique values matter here; the counts are discarded
    levels = _validate_n_bins(x, n)[0]

    # The chunks may resemble the following:
    # >>> list(chunk(np.arange(10), 4))
    # [array([0, 1, 2]), array([3, 4, 5]), array([6, 7]), array([8, 9])]
    groups = [grp for grp in chunk(levels, n)]

    # each chunk becomes one bin
    return _Bins(groups)


def _percentile(x, n):
    """Create bins from the quantile edges of ``x``.

    The edge computation (quartiles, deciles, etc.) is delegated to
    ``pd.qcut``; consecutive edges are then paired up into
    ``(low, high)`` intervals that are handed to ``_Bins``.

    Parameters
    ----------
    x : array-like
        The feature values to bin.

    n : int
        The number of quantiles, passed as ``q`` to ``pd.qcut``.

    Returns
    -------
    _Bins
        The bins built from the quantile intervals.
    """
    # with retbins=True, qcut returns (binned values, bin edges); we only
    # need the edges
    _, edges = pd.qcut(x, q=n, retbins=True)

    # pair each edge with its successor to form the intervals
    intervals = [(low, high) for low, high in zip(edges[:-1], edges[1:])]
    return _Bins(intervals)


# Registry mapping each supported ``strategy`` name to the function that
# builds the bins for a single feature vector.
_STRATEGIES = dict(uniform=_uniform, percentile=_percentile)


class _Bins(object):
    """Keeps track of the lower bounds, upper bounds and labels of bins.

    Bin membership is decided by the lower bounds only. The algorithm for
    assigning bins to a test vector is as follows:

        1. Start every element in the highest bin
        2. For each lower bound, drop by one level every element that is
           NOT >= that bound
        3. What remains is the index of the bin the element belongs to

    Note that assignment treats the lower bound as inclusive (``>=``) even
    though the string labels are rendered as ``(low, high]``.

    Parameters
    ----------
    chunks : list
        One entry per bin. Only the first element of each entry is read,
        and it marks where that bin starts. Presumably ordered ascending
        (the assignment logic relies on it) -- verify against callers.

    Attributes
    ----------
    n_bins : int
        The number of bins (``len(chunks)``).

    lower_bounds : list
        The lower bound of each bin. The first is ``-np.inf`` whenever
        there is more than one bin.

    upper_bounds : list
        The upper bound of each bin. The last is always ``np.inf``.

    reprs : list
        The string label of each bin.
    """
    def __init__(self, chunks):
        self.n_bins = len(chunks)

        # The interior cut points are where every bin but the first starts.
        # The start of the first chunk is deliberately never read (when
        # there are several chunks) since the lowest bin is open-ended.
        cuts = [piece[0] for piece in chunks[1:]]

        if cuts:
            lower_bounds = [-np.inf] + cuts
            labels = ["(-Inf, %.2f]" % cuts[0]]
        else:
            # A lone chunk: the only bin starts at its own first value
            lower_bounds = [chunks[-1][0]]
            labels = []

        # every bin ends where the next starts; the top bin is open-ended
        upper_bounds = cuts + [np.inf]

        # middle bins are bounded on both sides by consecutive cut points
        labels.extend("(%.2f, %.2f]" % pair
                      for pair in zip(cuts[:-1], cuts[1:]))

        # and finally the top bin
        labels.append("(%.2f, Inf]" % lower_bounds[-1])

        self.upper_bounds = upper_bounds
        self.lower_bounds = lower_bounds
        self.reprs = labels

    def assign(self, v, as_str):
        """Assign each value in ``v`` to a bin.

        Parameters
        ----------
        v : np.ndarray, shape=(n_samples,)
            The values to bin.

        as_str : bool
            Whether to return the string label of each bin rather than
            the integer bin level.

        Returns
        -------
        np.ndarray, shape=(n_samples,)
            The bin labels if ``as_str``, otherwise the integer levels.
        """
        # everything starts out in the top bin...
        levels = np.full(v.shape[0], self.n_bins - 1, dtype=int)

        # ...and falls one level for every lower bound it does not reach.
        # The decrements are independent of one another, so the order in
        # which the bounds are visited does not matter. Nothing is below
        # a -np.inf bound, so that pass leaves the levels untouched.
        # NOTE: the test is "not >=" rather than "<" so that elements for
        # which the comparison is always False (e.g., NaN) fall on every
        # pass and end at level -1.
        for floor in self.lower_bounds:
            below = ~(v >= floor)  # type: np.ndarray
            levels[below] -= 1

        if not as_str:
            # the user just wants the bin level
            return levels

        # otherwise look up the label of each level
        return np.array([self.reprs[lvl] for lvl in levels])


# Executed in parallel:
def _make_bin(binner, vec, c, n):
    """Build the bins for one column.

    Calls ``binner(vec, n)`` and returns the result paired with the column
    key ``c`` so the caller can collect the pairs into a dict.
    """
    fitted = binner(vec, n)
    return c, fitted


# Executed in parallel:
def _assign_bin(binner, vec, c, return_label):
    """Assign the values of one column to their bins.

    Calls ``binner.assign(vec, return_label)`` and returns the result
    paired with the column key ``c`` so the caller can collect the pairs
    into a dict.
    """
    assigned = binner.assign(vec, return_label)
    return c, assigned


class BinningTransformer(BasePDTransformer):
    r"""Bin continuous variables.

    The BinningTransformer will create buckets for continuous variables,
    effectively transforming continuous features into categorical features.

    Pros of binning:

      * Particularly useful in the case of very skewed data where an
        algorithm may make assumptions on the underlying distribution of the
        variables
      * Quick and easy way to take curvature into account

    There are absolutely some negatives to binning:

      * You can tend to throw away information from continuous variables
      * You might end up fitting "wiggles" rather than a linear
        relationship itself
      * You use up a lot of degrees of freedom

    For a more exhaustive list of detrimental effects of binning, take a look
    at [1].

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Optional. If None, will be applied to all features (which could
        prove to be expensive)

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    n_bins : int or iterable, optional (default=10)
        The number of bins into which to separate each specified feature.
        Default is 10, but can also be an iterable or dict of the same length
        as ``cols``, where positional integers indicate a different bin size
        for that feature. Each value must be an integer greater than 1.

    strategy : str or unicode, optional (default="uniform")
        The strategy for binning. Default is "uniform", which uniformly
        segments a feature. Alternatives include "percentile" which uses
        ``n_bins`` to compute quintiles (for ``n_bins=5``), quartiles
        (for ``n_bins=4``), etc. Note that for percentile binning, the
        outer bin boundaries (low boundary of lowest bin and high
        boundary of the highest bin) will be set to -inf and inf,
        respectively, to behave similar to other binning strategies.

    return_bin_label : bool, optional (default=True)
        Whether to return the string representation of the bin
        (e.g., "(-Inf, 25.20]") rather than the bin level, an integer.

    overwrite : bool, optional (default=True)
        Whether to overwrite the original feature with the binned feature.
        Default is True so that the output names match the input names. If
        False, the output columns will be appended to the right side of
        the frame with "_binned" appended.

    n_jobs : int, 1 by default
       The number of jobs to use for the binning. This works by
       computing (in ``fit``) and assigning (in ``transform``) the bins
       of each feature in parallel.

       If -1 all CPUs are used. If 1 is given, no parallel computing code
       is used at all, which is useful for debugging. For n_jobs below -1,
       (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
       one are used.

    Notes
    -----
    If a feature has fewer than ``n_bins`` unique values, it will raise a
    ValueError in the fit procedure.

    Examples
    --------
    Bin two features in iris:

    >>> from skoot.datasets import load_iris_df
    >>> iris = load_iris_df(include_tgt=False, names=['a', 'b', 'c', 'd'])
    >>> binner = BinningTransformer(cols=["a", "b"], strategy="uniform")
    >>> trans = binner.fit_transform(iris)
    >>> trans.head()
                  a             b    c    d
    0  (5.10, 5.50]  (3.40, 3.60]  1.4  0.2
    1  (4.70, 5.10]  (3.00, 3.20]  1.4  0.2
    2  (4.70, 5.10]  (3.20, 3.40]  1.3  0.2
    3  (-Inf, 4.70]  (3.00, 3.20]  1.5  0.2
    4  (4.70, 5.10]  (3.60, 3.80]  1.4  0.2
    >>> trans.dtypes
    a     object
    b     object
    c    float64
    d    float64
    dtype: object

    Attributes
    ----------
    bins_ : dict
        A dictionary mapping the column names to the corresponding bins,
        which are internal _Bins objects that store data on upper and lower
        bounds.

    fit_cols_ : list
        The list of column names on which the transformer was fit. This
        is used to validate the presence of the features in the test set
        during the ``transform`` stage.

    References
    ----------
    .. [1] "Problems Caused by Categorizing Continuous Variables"
           http://biostat.mc.vanderbilt.edu/wiki/Main/CatContinuous
    """
    def __init__(self, cols=None, as_df=True, n_bins=10, strategy="uniform",
                 return_bin_label=True, overwrite=True, n_jobs=1):

        super(BinningTransformer, self).__init__(
            cols=cols, as_df=as_df)

        # Parameters are stored untouched; they are validated in ``fit``
        self.n_bins = n_bins
        self.strategy = strategy
        self.return_bin_label = return_bin_label
        self.overwrite = overwrite
        self.n_jobs = n_jobs

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : BinningTransformer
            The fitted transformer, with ``bins_`` and ``fit_cols_`` set.

        Raises
        ------
        ValueError
            If any ``n_bins`` value is not an integer greater than 1, or
            if ``strategy`` is not one of the supported strategies.
        """
        # validate the input, and get a copy of it
        X, cols = check_dataframe(X, cols=self.cols,
                                  assert_all_finite=True)

        # validate n_bins...
        n_bins = type_or_iterable_to_col_mapping(cols=cols, param=self.n_bins,
                                                 param_name="n_bins",
                                                 permitted_scalar_types=int)

        # now that we have a dictionary, we can assess the actual integer
        # (only the values matter here, not the column names)
        for v in six.itervalues(n_bins):
            if not (isinstance(v, int) and v > 1):
                raise ValueError("Each n_bin value must be an integer > 1")

        # get and validate the strategy
        strategy = self.strategy
        try:
            binner = _STRATEGIES[strategy]
        except KeyError:
            raise ValueError("strategy must be one of %r, but got %r"
                             % (str(list(_STRATEGIES.keys())), strategy))

        # compute the bins for each feature. Each job returns a
        # (column, bins) pair, so the results collect straight into a dict
        bins = dict(Parallel(n_jobs=self.n_jobs)(
            delayed(_make_bin)(binner, vec=X[c].values, c=c, n=n)
            for c, n in six.iteritems(n_bins)))

        # set the instance attribute
        self.bins_ = bins
        self.fit_cols_ = cols
        return self

    def transform(self, X):
        """Apply the transformation to a dataframe.

        This method will bin the continuous values in the test frame with the
        bins designated in the ``fit`` stage.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'bins_')
        X, _ = check_dataframe(X, cols=self.cols)  # X is a copy now

        # validate that fit cols in test set
        cols = self.fit_cols_
        validate_test_set_columns(cols, X.columns)

        # the bins
        bins = self.bins_

        # now apply the binning. Rather than use iteritems, iterate the cols
        # themselves so we get the order prescribed by the user
        bin_assignments = dict(Parallel(n_jobs=self.n_jobs)(
            delayed(_assign_bin)(
                bins[col], vec=X[col].values, c=col,
                return_label=self.return_bin_label)
            for col in cols))

        # Simple pass of O(N) to assign to dataframes. Lightweight, no
        # actual computations here. That all happened in parallel
        for c in cols:
            binned = bin_assignments[c]
            # if we overwrite, it's easy
            if self.overwrite:
                X[c] = binned
            # otherwise create a new feature
            else:
                X["%s_binned" % c] = binned

        return dataframe_or_array(X, self.as_df)