Source code for skutil.preprocessing.impute

# -*- coding: utf-8 -*-

from __future__ import division, print_function, absolute_import
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, is_classifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn.externals import six
from sklearn.utils.validation import check_is_fitted
from abc import ABCMeta
from skutil.base import SelectiveMixin, BaseSkutil
from ..utils import is_entirely_numeric, get_numeric, validate_is_pd, is_numeric
from ..utils.fixes import is_iterable

__all__ = [
    'BaggedImputer',
    'BaggedCategoricalImputer',
    'ImputerMixin',
    'SelectiveImputer'
]


def _validate_all_numeric(X):
    """Validate that all columns in X
    are numeric types. If not, raises a
    ``ValueError``

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The dataframe to validate

    Raises
    ------

    ``ValueError`` if not all columns are numeric
    """
    if not is_entirely_numeric(X):
        raise ValueError('all provided columns must be numeric')


def _col_mode(col):
    """Get the mode from a series.

    Returns
    -------

    com : int, float
        The column's most common value.
    """
    vals = col.value_counts()
    com = vals.index[0] if not np.isnan(vals.index[0]) else vals.index[1]
    return com
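
# A minimal illustration (not part of the module): ``_col_mode`` takes the
# most frequent value, falling back to the second index position if NaN
# happens to sort first in the value counts:
#
#   >>> s = pd.Series([2.0, 2.0, 3.0, np.nan])
#   >>> _col_mode(s)
#   2.0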


def _val_values(vals):
    """Validate that all values in the iterable
    are either numeric, or in ('mode', 'median', 'mean').
    If not, will raise a TypeError

    Raises
    ------

    ``TypeError`` if not all values are numeric or
    in valid values.
    """
    if not all((is_numeric(i) or
                (isinstance(i, six.string_types) and i in ('mode', 'mean', 'median')))
               for i in vals):
        raise TypeError('All values in self.fill must be numeric or in ("mode", "mean", "median"). '
                        'Got: %s' % ', '.join(str(v) for v in vals))
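
# Illustrative only: a mixed fill specification is valid when every element
# is numeric or one of the three strategy strings:
#
#   >>> _val_values([0, 'mean', 2.5, 'mode'])  # passes silently
#   >>> _val_values(['avg'])                   # raises TypeError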


class ImputerMixin:
    """A mixin for all imputer classes. Contains the default fill value.
    This mixin is used for the H2O imputer, as well.

    Attributes
    ----------

    _def_fill : int (default=-999999)
        The default fill value for NaN values
    """
    _def_fill = -999999


class _BaseImputer(six.with_metaclass(ABCMeta, BaseSkutil,
                                      TransformerMixin, ImputerMixin)):
    """A base class for all imputers. Handles assignment of the fill value.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that since
        this transformer can only operate on numeric columns, not
        explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    fill : int, float, string or array_like, optional (default=None)
        The fill values to use for missing values in columns

    Attributes
    ----------

    fill : float, int, None or str
        The fill
    """

    def __init__(self, cols=None, as_df=True, fill=None):
        super(_BaseImputer, self).__init__(cols=cols, as_df=as_df)
        self.fill = fill if fill is not None else self._def_fill


class SelectiveImputer(_BaseImputer):
    """A more customizable form of sklearn's ``Imputer`` class. This class
    can handle more than mean, median or most common... it will also take
    numeric values. Moreover, it will take a vector of strategies or values
    with which to impute corresponding columns.

    Parameters
    ----------

    cols : array_like, optional (default=None)
        The columns on which the transformer will be ``fit``. In the case
        that ``cols`` is None, the transformer will be fit on all columns.
        Note that since this transformer can only operate on numeric columns,
        not explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    fill : int, float, string or array_like, optional (default=None)
        The fill to use for missing values in the training matrix
        when fitting a ``SelectiveImputer``. If None, will default to 'mean'

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from skutil.preprocessing import SelectiveImputer
    >>>
    >>> nan = np.nan
    >>> X = pd.DataFrame.from_records(data=np.array([
    ...     [1.0, nan, 3.1],
    ...     [nan, 2.3, nan],
    ...     [2.1, 2.1, 3.1]]),
    ...     columns=['a','b','c'])
    >>> imputer = SelectiveImputer(fill=['mean', -999, 'mode'])
    >>> imputer.fit_transform(X)
          a      b    c
    0  1.00 -999.0  3.1
    1  1.55    2.3  3.1
    2  2.10    2.1  3.1

    Attributes
    ----------

    fills_ : iterable, int or float
        The imputer fill-values
    """

    def __init__(self, cols=None, as_df=True, fill='mean'):
        super(SelectiveImputer, self).__init__(cols, as_df, fill)
    def fit(self, X, y=None):
        """Fit the imputer and return the transformed matrix or frame.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = self.cols if self.cols is not None else X.columns.values

        # validate the fill, do fit
        fill = self.fill
        if isinstance(fill, six.string_types):
            fill = str(fill)
            if fill not in ('mode', 'mean', 'median'):
                raise TypeError('self.fill must be either "mode", "mean", "median", None, '
                                'a number, or an iterable. Got %s' % fill)

            if fill == 'mode':
                # for each column to impute, we go through and get the value counts
                # of each, sorting by the max...
                self.fills_ = dict(zip(cols, X[cols].apply(lambda x: _col_mode(x))))

            elif fill == 'median':
                self.fills_ = dict(zip(cols, X[cols].apply(lambda x: np.nanmedian(x.values))))

            else:
                self.fills_ = dict(zip(cols, X[cols].apply(lambda x: np.nanmean(x.values))))

        # if the fill is an iterable, we have to get a bit more stringent on our validation
        elif is_iterable(fill):
            # if fill is a dictionary
            if isinstance(fill, dict):
                # if it's a dict, we can assume that these are the cols...
                cols, fill = zip(*fill.items())
                self.cols = cols  # we reset self.cols in this case!!!

            # we need to get the length of the iterable,
            # make sure it matches the len of cols
            if not len(fill) == len(cols):
                raise ValueError('len of fill does not match that of cols')

            # make sure all values are numeric or valid strategy strings
            _val_values(fill)
            d = {}
            for ind, c in enumerate(cols):
                f = fill[ind]

                if is_numeric(f):
                    d[c] = f
                else:
                    the_col = X[c]
                    if f == 'mode':
                        d[c] = _col_mode(the_col)
                    elif f == 'median':
                        d[c] = np.nanmedian(the_col.values)
                    else:
                        d[c] = np.nanmean(the_col.values)

            self.fills_ = d

        else:
            if not is_numeric(fill):
                raise TypeError('self.fill must be either "mode", "mean", "median", None, '
                                'a number, or an iterable. Got %s' % str(fill))

            # the fill is a single numeric value the user provided; it will
            # be applied uniformly to all columns at transform time.
            self.fills_ = fill

        return self
    def transform(self, X):
        """Transform a dataframe given the fit imputer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform.

        Returns
        -------

        X : pd.DataFrame or np.ndarray
            The imputed matrix
        """
        check_is_fitted(self, 'fills_')
        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        cols = self.cols if self.cols is not None else X.columns.values

        # get the fills
        modes = self.fills_

        # if it's a single numeric value (not just an int), easy:
        if is_numeric(modes):
            X[cols] = X[cols].fillna(modes)
        else:  # it's a dict
            for nm in cols:
                X[nm] = X[nm].fillna(modes[nm])

        return X if self.as_df else X.as_matrix()
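
# A short usage sketch (hypothetical data, shown for illustration): when
# ``fill`` is a dict, the per-column strategies are keyed by column name and
# the dict keys become ``cols``:
#
#   >>> train = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 2.0, 4.0]})
#   >>> imp = SelectiveImputer(fill={'a': 'median', 'b': -1}).fit(train)
#   >>> sorted(imp.fills_.items())
#   [('a', 2.0), ('b', -1)]
#   >>> imp.transform(pd.DataFrame({'a': [np.nan], 'b': [np.nan]}))
#        a    b
#   0  2.0 -1.0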

class _BaseBaggedImputer(_BaseImputer):
    """Base class for all bagged imputers. See subclasses
    ``BaggedCategoricalImputer`` and ``BaggedImputer`` for specifics.
    """

    def __init__(self, cols=None, base_estimator=None, n_estimators=10,
                 max_samples=1.0, max_features=1.0, bootstrap=True,
                 bootstrap_features=True, oob_score=False, n_jobs=1,
                 random_state=None, verbose=0, as_df=True, fill=None,
                 is_classification=False):

        super(_BaseBaggedImputer, self).__init__(cols=cols, as_df=as_df, fill=fill)

        # set self attributes
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.is_classification = is_classification

    def fit(self, X, y=None):
        """Fit the bagged imputer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X, y=None):
        """Fit the bagged imputer and return the transformed (imputed) matrix.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        X : pd.DataFrame or np.ndarray
            The imputed matrix.
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = self.cols if self.cols is not None else X.columns.values

        # subset, validate
        # we have to validate that all of the columns we're going to impute
        # are numeric (this could be float, or int...).
        _validate_all_numeric(X[cols])

        # we need to get all of the numerics out of X, because these are
        # the features we'll be modeling on.
        numeric_cols = get_numeric(X)
        numerics = X[numeric_cols]

        # if is_classification and our estimator is NOT a classifier, we need to raise
        if self.base_estimator is not None:
            if self.is_classification and not is_classifier(self.base_estimator):
                raise TypeError('self.is_classification=True, '
                                'but base_estimator is not a classifier')

        # set which estimator type to fit:
        _model = BaggingRegressor if not self.is_classification else BaggingClassifier

        # if there's only one numeric, we know at this point it's the one
        # we're imputing. In that case, there are too few cols on which to model
        if numerics.shape[1] == 1:
            raise ValueError('too few numeric columns on which to model')

        # the core algorithm:
        # - for each col to impute
        #   - subset to all numeric columns except the col to impute
        #   - retain only the complete observations, separate the missing observations
        #   - build a bagging regressor model to predict for observations with missing values
        #   - fill in missing values in a copy of the dataframe
        models = {}
        for col in cols:
            x = numerics.copy()  # get copy of numerics for this model iteration
            y_missing = pd.isnull(x[col])  # boolean vector of which are missing in the current y
            y = x.pop(col)  # pop off the y vector from the matrix

            # if y_missing is all of the rows, we need to bail
            if y_missing.sum() == x.shape[0]:
                raise ValueError('%s has all missing values, cannot train model' % col)

            # at this point we've identified which y values we need to predict, however, we still
            # need to prep our x matrix... There are a few corner cases we need to account for:
            #
            # 1. there are no complete rows in the X matrix
            #    - we can eliminate some columns to model on in this case, but there's no silver bullet
            # 2. the cols selected for model building are missing in the rows needed to impute.
            #    - this is a hard problem that requires even more NA imputation...
            #
            # the most "catch-all" solution is going to be to fill all missing values with some val, say -999999
            x = x.fillna(self.fill)
            X_train = x[~y_missing]  # the rows that don't correspond to missing y values
            X_test = x[y_missing]  # the rows to "predict" on
            y_train = y[~y_missing]  # the training y vector

            # define the model
            model = _model(
                base_estimator=self.base_estimator,
                n_estimators=self.n_estimators,
                max_samples=self.max_samples,
                max_features=self.max_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            # fit the model
            model.fit(X_train, y_train)

            # predict on the missing values, stash the model and the features used to train it
            if X_test.shape[0] != 0:  # only do this step if there are actually any missing
                y_pred = model.predict(X_test)
                X.loc[y_missing, col] = y_pred  # fill the y vector missing slots and reassign back to X

            models[col] = {
                'model': model,
                'feature_names': X_train.columns.values
            }

        # assign the model dict to self -- this is the "fit" portion
        self.models_ = models

        return X if self.as_df else X.as_matrix()

    def transform(self, X):
        """Impute the test data after fit.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform.

        Returns
        -------

        X : Pandas ``DataFrame`` or NumPy ndarray
            The imputed test frame
        """
        check_is_fitted(self, 'models_')
        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)

        # perform the transformations for missing vals
        models = self.models_
        for col, kv in six.iteritems(models):
            features, model = kv['feature_names'], kv['model']
            y = X[col]  # the y we're predicting

            # this will throw a key error if one of the features isn't there
            X_test = X[features]  # we need another copy

            # if col is in the features, there's something wrong internally
            assert col not in features, 'predictive column should not be in fit features (%s)' % col

            # since this is a copy, we can add the missing vals where needed
            X_test = X_test.fillna(self.fill)

            # subset where y was null, and only predict/fill where necessary:
            y_null = pd.isnull(y)
            if y_null.sum() > 0:
                pred_y = model.predict(X_test.loc[y_null])
                y[y_null] = pred_y  # fill where null
                X[col] = y  # set back to X

        return X if self.as_df else X.as_matrix()
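
# The core of the bagged-imputation loop above, distilled into a standalone
# sketch with scikit-learn (hypothetical two-column data; the real method
# additionally fills NaNs in the feature matrix with ``self.fill``):
#
#   from sklearn.ensemble import BaggingRegressor
#
#   df = pd.DataFrame({'a': [1.0, np.nan, 2.0, 4.0],
#                      'b': [0.5, 1.5, 2.5, 3.5]})
#   missing = pd.isnull(df['a'])
#   est = BaggingRegressor(n_estimators=10, random_state=42)
#   est.fit(df.loc[~missing, ['b']], df.loc[~missing, 'a'])
#   df.loc[missing, 'a'] = est.predict(df.loc[missing, ['b']])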

class BaggedCategoricalImputer(_BaseBaggedImputer):
    """Performs imputation on select columns by fitting
    BaggingClassifiers on the provided columns.

    Parameters
    ----------

    cols : array_like, optional (default=None)
        The columns on which the transformer will be ``fit``. In the case
        that ``cols`` is None, the transformer will be fit on all columns.
        Note that since this transformer can only operate on numeric columns,
        not explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    base_estimator : object or None, optional (default=None)
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a decision tree.

    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default=1.0)
        The number of samples to draw from X to train each base estimator.
        If int, then draw max_samples samples. If float, then draw
        max_samples * X.shape[0] samples.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.
        If int, then draw max_features features. If float, then draw
        max_features * X.shape[1] features.

    bootstrap : boolean, optional (default=True)
        Whether samples are drawn with replacement.

    bootstrap_features : boolean, optional (default=True)
        Whether features are drawn with replacement.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate the
        generalization error.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both fit and predict.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by np.random.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    fill : int, optional (default=None)
        The fill to use for missing values in the training matrix
        when fitting a ``BaggingClassifier``. If None, will
        default to -999999.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from skutil.preprocessing import BaggedCategoricalImputer
    >>>
    >>> nan = np.nan
    >>> X = pd.DataFrame.from_records(data=np.array([
    ...     [1.0, nan, 4.0],
    ...     [nan, 1.0, nan],
    ...     [2.0, 2.0, 3.0]]),
    ...     columns=['a','b','c'])
    >>> imputer = BaggedCategoricalImputer(random_state=42)
    >>> imputer.fit_transform(X)
         a    b    c
    0  1.0  2.0  4.0
    1  2.0  1.0  4.0
    2  2.0  2.0  3.0

    Attributes
    ----------

    models_ : dict, (string : ``sklearn.base.BaseEstimator``)
        A dictionary mapping column names to the fit
        bagged estimator.
    """

    def __init__(self, cols=None, base_estimator=None, n_estimators=10,
                 max_samples=1.0, max_features=1.0, bootstrap=True,
                 bootstrap_features=True, oob_score=False, n_jobs=1,
                 random_state=None, verbose=0, as_df=True, fill=None):

        # categorical imputer needs to be classification
        super(BaggedCategoricalImputer, self).__init__(
            cols=cols, as_df=as_df, fill=fill, base_estimator=base_estimator,
            n_estimators=n_estimators, max_samples=max_samples,
            max_features=max_features, bootstrap=bootstrap,
            bootstrap_features=bootstrap_features, oob_score=oob_score,
            n_jobs=n_jobs, random_state=random_state, verbose=verbose,
            is_classification=True)

class BaggedImputer(_BaseBaggedImputer):
    """Performs imputation on select columns by fitting
    BaggingRegressors on the provided columns.

    Parameters
    ----------

    cols : array_like, optional (default=None)
        The columns on which the transformer will be ``fit``. In the case
        that ``cols`` is None, the transformer will be fit on all columns.
        Note that since this transformer can only operate on numeric columns,
        not explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    base_estimator : object or None, optional (default=None)
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a decision tree.

    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default=1.0)
        The number of samples to draw from X to train each base estimator.
        If int, then draw max_samples samples. If float, then draw
        max_samples * X.shape[0] samples.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.
        If int, then draw max_features features. If float, then draw
        max_features * X.shape[1] features.

    bootstrap : boolean, optional (default=True)
        Whether samples are drawn with replacement.

    bootstrap_features : boolean, optional (default=True)
        Whether features are drawn with replacement.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate the
        generalization error.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both fit and predict.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by np.random.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    fill : int, optional (default=None)
        The fill to use for missing values in the training matrix
        when fitting a ``BaggingRegressor``. If None, will
        default to -999999.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from skutil.preprocessing import BaggedImputer
    >>>
    >>> nan = np.nan
    >>> X = pd.DataFrame.from_records(data=np.array([
    ...     [1.0, nan, 3.1],
    ...     [nan, 2.3, nan],
    ...     [2.1, 2.1, 3.1]]),
    ...     columns=['a','b','c'])
    >>> imputer = BaggedImputer(random_state=42)
    >>> imputer.fit_transform(X)
           a     b    c
    0  1.000  2.16  3.1
    1  1.715  2.30  3.1
    2  2.100  2.10  3.1

    Attributes
    ----------

    models_ : dict, (string : ``sklearn.base.BaseEstimator``)
        A dictionary mapping column names to the fit
        bagged estimator.
    """

    def __init__(self, cols=None, base_estimator=None, n_estimators=10,
                 max_samples=1.0, max_features=1.0, bootstrap=True,
                 bootstrap_features=True, oob_score=False, n_jobs=1,
                 random_state=None, verbose=0, as_df=True, fill=None):

        # invoke super constructor
        super(BaggedImputer, self).__init__(
            cols=cols, as_df=as_df, fill=fill, base_estimator=base_estimator,
            n_estimators=n_estimators, max_samples=max_samples,
            max_features=max_features, bootstrap=bootstrap,
            bootstrap_features=bootstrap_features, oob_score=oob_score,
            n_jobs=n_jobs, random_state=random_state, verbose=verbose,
            is_classification=False)