Source code for skoot.preprocessing.binning

# -*- coding: utf-8 -*-
#
# Author: Taylor G Smith <taylor.smith@alkaline-ml.com>
#
# Bin your continuous features.

import six
from joblib import Parallel, delayed
from sklearn.utils.validation import check_is_fitted

import numpy as np
import pandas as pd

from ..base import BasePDTransformer
from ..utils.iterables import chunk
from ..utils.dataframe import dataframe_or_array
from ..utils.validation import (check_dataframe, validate_test_set_columns,
                                type_or_iterable_to_col_mapping)
from ..utils.metaestimators import timed_instance_method

# Public names exported by this module (everything else here is private).
__all__ = [
    'BinningTransformer'
]


def _validate_n_bins(x, n):
    # get unique values
    unique, cts = np.unique(x, return_counts=True)
    if unique.shape[0] < n:
        raise ValueError("Fewer unique values than bins!")
    return unique, cts


def _uniform(x, n):
    """Build bins by splitting the distinct values of ``x`` into ``n`` chunks.

    Parameters
    ----------
    x : array-like
        The feature values to bin.

    n : int
        The number of bins to create.

    Returns
    -------
    bins : _Bins
        The bins built from the chunked distinct values.

    Raises
    ------
    ValueError
        If ``x`` has fewer distinct values than ``n`` (raised by
        ``_validate_n_bins``).
    """
    # only the distinct values matter here; the counts are discarded
    distinct = _validate_n_bins(x, n)[0]

    # The pieces may resemble:
    # >>> list(chunk(np.arange(10), 4))
    # [array([0, 1, 2]), array([3, 4, 5]), array([6, 7]), array([8, 9])]
    pieces = list(chunk(distinct, n))
    return _Bins(pieces)


def _percentile(x, n):
    """Build bins whose edges are the ``n``-quantiles of ``x``.

    The edge computation (quartiles, deciles, etc.) is delegated to
    ``pd.qcut``; only the returned bin edges are used.

    Parameters
    ----------
    x : array-like
        The feature values to bin.

    n : int
        The number of quantile bins, passed to ``pd.qcut`` as ``q``.

    Returns
    -------
    bins : _Bins
        Bins built from consecutive ``(low_edge, high_edge)`` pairs.
    """
    _, edges = pd.qcut(x, q=n, retbins=True)

    # pair each edge with its successor to form one interval per bin
    intervals = [(low, high) for low, high in zip(edges[:-1], edges[1:])]
    return _Bins(intervals)


# Maps each supported ``strategy`` name to the function that builds its bins.
_STRATEGIES = {
    "uniform": _uniform,
    "percentile": _percentile,
}


class _Bins(object):
    """Binning class that keeps track of upper and lower bounds of bins.
    The algorithm for assigning bins to a test vector is as follows:

        1. Initialize all bins as the highest bin
        2. For each lower bound in bin levels, determine which values in ``x``
           are >= to the bound. Invert the mask and decrement those bins (in
           other words, decrement the indices where the value is < the lower
           bound for the bin in question).
        3. Continue until there is no mask to invert (lowest bin).
    """
    def __init__(self, chunks):
        # chunks is a list of bin arrays
        self.n_bins = len(chunks)

        # create the repr for each bin and create the mins/maxes arrays
        upper_bounds = []
        lower_bounds = []
        reprs = []
        for i, (this_chunk, next_chunk) in \
                enumerate(zip(chunks[:-1], chunks[1:])):

            # If it's the first one, it's just less than
            # the next chunk's min.
            upper_bound = next_chunk[0]
            if i == 0:
                lower_bound = -np.inf
                rep = "(-Inf, %.2f]" % upper_bound

            # Otherwise we know it's a middle one (not the last since we
            # lagged with the zip function and handle that at the end)
            else:
                lower_bound = this_chunk[0]
                rep = "(%.2f, %.2f]" % (lower_bound, upper_bound)

            upper_bounds.append(upper_bound)
            lower_bounds.append(lower_bound)
            reprs.append(rep)

        # since we missed the last chunk due to the lag, get the last one
        lower_bounds.append(chunks[-1][0])
        upper_bounds.append(np.inf)
        reprs.append("(%.2f, Inf]" % lower_bounds[-1])

        # set the attributes
        self.upper_bounds = upper_bounds
        self.lower_bounds = lower_bounds
        self.reprs = reprs

    def assign(self, v, as_str):
        # given some vector of values, assign the appropriate bins. We can
        # do this in one pass, really. Just pass over one of the bounds arrays
        # and keep track of the level at which the elements in V are no longer
        # within the boundaries

        # Initialize by setting all to the highest bin
        bins = (np.ones(v.shape[0]) * (self.n_bins - 1)).astype(int)

        # now progress backwards
        for boundary in self.lower_bounds[::-1]:

            # figure out which are >= to the lower boundary. They should NOT
            # be changed. The ones that are FALSE, however, should be
            # decremented by 1. On the first pass, anything that actually
            # belongs in the top bin will not be adjusted, but everything
            # else will drop by one. Next, everything that is still below the
            # lower boundary will decrement again, etc., until the lowest bin
            # where the lower_bound is -np.inf. Since everything is >= that,
            # there will be no anti mask and nothing will change
            mask = v >= boundary
            anti_mask = ~mask  # type: np.ndarray

            if anti_mask.shape[0] > 0:
                bins[anti_mask] -= 1

        # now we have bin indices, get the reprs to return...
        if as_str:
            return np.array([self.reprs[i] for i in bins])
        # otherwise user just wants the bin level
        return bins


# Executed in parallel:
def _make_bin(binner, vec, c, n):
    # Parallelize the bin operation over columns
    return c, binner(vec, n)


# Executed in parallel:
def _assign_bin(binner, vec, c, return_label):
    # Parallelize bin assignment
    return c, binner.assign(vec, return_label)


class BinningTransformer(BasePDTransformer):
    r"""Bin continuous variables.

    The BinningTransformer will create buckets for continuous variables,
    effectively transforming continuous features into categorical features.

    Pros of binning:

      * Particularly useful in the case of very skewed data where an
        algorithm may make assumptions on the underlying distribution of the
        variables
      * Quick and easy way to take curvature into account

    There are absolutely some negatives to binning:

      * You can tend to throw away information from continuous variables
      * You might end up fitting "wiggles" rather than a linear
        relationship itself
      * You use up a lot of degrees of freedom

    For a more exhaustive list of detrimental effects of binning, take a
    look at [1].

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Optional. If None, will be applied to all features (which could
        prove to be expensive)

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    n_bins : int or iterable, optional (default=10)
        The number of bins into which to separate each specified feature.
        Default is 10, but can also be an iterable or dict of the same length
        as ``cols``, where positional integers indicate a different bin size
        for that feature. Every value must be an integer greater than 1.

    strategy : str or unicode, optional (default="uniform")
        The strategy for binning. Default is "uniform", which uniformly
        segments a feature. Alternatives include "percentile" which uses
        ``n_bins`` to compute quintiles (for ``n_bins=5``), quartiles (for
        ``n_bins=4``), etc. Note that for percentile binning, the outer bin
        boundaries (low boundary of lowest bin and high boundary of the
        highest bin) will be set to -inf and inf, respectively, to behave
        similar to other binning strategies.

    return_bin_label : bool, optional (default=True)
        Whether to return the string representation of the bin
        (i.e., "(4.70, 5.10]") rather than the bin level, an integer.

    overwrite : bool, optional (default=True)
        Whether to overwrite the original feature with the binned feature.
        Default is True so that the output names match the input names. If
        False, the output columns will be appended to the right side of
        the frame with "_binned" appended.

    n_jobs : int, 1 by default
        The number of jobs to use for the binning. Bins are computed (in
        ``fit``) and assigned (in ``transform``) one column per task via
        joblib. If -1 all CPUs are used. If 1 is given, no parallel
        computing code is used at all, which is useful for debugging. For
        n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used.

    Notes
    -----
    With the "uniform" strategy, if a feature has fewer than ``n_bins``
    unique values, a ValueError is raised in the fit procedure.

    Bin labels are rendered as "(low, high]", but values are assigned with a
    ``>=`` test against each bin's lower bound, so a value exactly equal to
    a cut point is placed in the bin that starts at that cut point (see row
    0 of the example below, where 5.1 is labelled "(5.10, 5.50]").

    Examples
    --------
    Bin two features in iris:

    >>> from skoot.datasets import load_iris_df
    >>> iris = load_iris_df(include_tgt=False, names=['a', 'b', 'c', 'd'])
    >>> binner = BinningTransformer(cols=["a", "b"], strategy="uniform")
    >>> trans = binner.fit_transform(iris)
    >>> trans.head()
                  a             b    c    d
    0  (5.10, 5.50]  (3.40, 3.60]  1.4  0.2
    1  (4.70, 5.10]  (3.00, 3.20]  1.4  0.2
    2  (4.70, 5.10]  (3.20, 3.40]  1.3  0.2
    3  (-Inf, 4.70]  (3.00, 3.20]  1.5  0.2
    4  (4.70, 5.10]  (3.60, 3.80]  1.4  0.2
    >>> trans.dtypes
    a     object
    b     object
    c    float64
    d    float64
    dtype: object

    Attributes
    ----------
    bins_ : dict
        A dictionary mapping the column names to the corresponding bins,
        which are internal _Bins objects that store data on upper and lower
        bounds.

    fit_cols_ : list
        The list of column names on which the transformer was fit. This
        is used to validate the presence of the features in the test set
        during the ``transform`` stage.

    References
    ----------
    .. [1] "Problems Caused by Categorizing Continuous Variables"
           http://biostat.mc.vanderbilt.edu/wiki/Main/CatContinuous
    """
    def __init__(self, cols=None, as_df=True, n_bins=10, strategy="uniform",
                 return_bin_label=True, overwrite=True, n_jobs=1):

        super(BinningTransformer, self).__init__(
            cols=cols, as_df=as_df)

        # parameters are stored untouched; validation is deferred to ``fit``
        self.n_bins = n_bins
        self.strategy = strategy
        self.return_bin_label = return_bin_label
        self.overwrite = overwrite
        self.n_jobs = n_jobs

    # NOTE(review): the decorator is configured with "fit_time_" as its
    # attribute name; presumably it records the fit duration there - confirm
    # against ``timed_instance_method``.
    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : BinningTransformer
            The fitted transformer, with ``bins_`` and ``fit_cols_`` set.

        Raises
        ------
        ValueError
            If any ``n_bins`` value is not an integer greater than 1, or if
            ``strategy`` is not one of the supported strategies.
        """
        # validate the input, and get a copy of it
        X, cols = check_dataframe(X, cols=self.cols,
                                  assert_all_finite=True)

        # validate n_bins...
        n_bins = type_or_iterable_to_col_mapping(
            cols=cols, param=self.n_bins, param_name="n_bins",
            permitted_scalar_types=int)

        # now that we have a dictionary, we can assess the actual integers
        for v in six.itervalues(n_bins):
            if not (isinstance(v, int) and v > 1):
                raise ValueError("Each n_bin value must be an integer > 1")

        # get and validate the strategy. An unhashable strategy raises a
        # TypeError on lookup; report it as the same ValueError.
        strategy = self.strategy
        try:
            binner = _STRATEGIES[strategy]
        except (KeyError, TypeError):
            # format the list itself (not its str) so the options are not
            # wrapped in an extra pair of quotes
            raise ValueError("strategy must be one of %r, but got %r"
                             % (sorted(_STRATEGIES), strategy))

        # compute the bins for each feature, one task per column
        bins = dict(Parallel(n_jobs=self.n_jobs)(
            delayed(_make_bin)(binner, vec=X[c].values, c=c, n=n)
            for c, n in six.iteritems(n_bins)))

        # set the instance attributes
        self.bins_ = bins
        self.fit_cols_ = cols

        return self

    def transform(self, X):
        """Apply the transformation to a dataframe.

        This method will bin the continuous values in the test frame with
        the bins designated in the ``fit`` stage.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'bins_')
        X, _ = check_dataframe(X, cols=self.cols)  # X is a copy now

        # validate that fit cols in test set
        cols = self.fit_cols_
        validate_test_set_columns(cols, X.columns)

        # the bins
        bins = self.bins_

        # now apply the binning. Rather than use iteritems, iterate the cols
        # themselves so we get the order prescribed by the user
        bin_assignments = dict(Parallel(n_jobs=self.n_jobs)(
            delayed(_assign_bin)(
                bins[col], vec=X[col].values, c=col,
                return_label=self.return_bin_label)
            for col in cols))

        # Simple pass of O(N) to assign to dataframes. Lightweight, no
        # actual computations here. That all happened in parallel
        for c in cols:
            binned = bin_assignments[c]

            # if we overwrite, it's easy
            if self.overwrite:
                X[c] = binned
            # otherwise create a new feature
            else:
                X["%s_binned" % c] = binned

        return dataframe_or_array(X, self.as_df)