# Source code for skutil.preprocessing.encode

from __future__ import print_function, division, absolute_import
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d
from sklearn.preprocessing.label import _check_numpy_unicode_bug
import numpy as np
import pandas as pd
from skutil.base import BaseSkutil
from skutil.utils import validate_is_pd

# Names exported by a star-import of this module.
__all__ = ['SafeLabelEncoder', 'OneHotCategoricalEncoder']


def _get_unseen():
    """Return the integer code used for labels not seen during fitting.

    The value is produced by a function rather than stored as a
    class attribute so that it cannot be reassigned by accident.
    """
    unseen_code = 99999
    return unseen_code


class SafeLabelEncoder(LabelEncoder):
    """An extension of LabelEncoder that will not throw an
    exception for unseen data, but will instead encode any value
    missing from ``classes_`` with the default value returned by
    ``_get_unseen()`` (99999).

    Attributes
    ----------

    classes_ : array_like
        The classes that are encoded
    """

    def transform(self, y):
        """Perform encoding if already fit.

        Parameters
        ----------

        y : array_like, shape=(n_samples,)
            The array to encode

        Returns
        -------

        e : array_like, shape=(n_samples,)
            The encoded array. A value found in ``classes_`` is
            encoded as its position in ``classes_``; any other
            value is encoded as the unseen sentinel.

        Raises
        ------

        ValueError
            If the number of distinct values in ``y`` is greater
            than or equal to the unseen sentinel.
        """
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)

        # The distinct values are needed for the checks below anyway;
        # the inverse index lets every distinct value be encoded only
        # once and then broadcast back over all of the samples.
        classes, inverse = np.unique(y, return_inverse=True)
        _check_numpy_unicode_bug(classes)

        # Check not too many:
        unseen = _get_unseen()
        if len(classes) >= unseen:
            raise ValueError('Too many factor levels in feature. Max is %i' % unseen)

        # NOTE(review): only the levels of ``y`` are counted here. If
        # ``classes_`` itself ever held more than ``unseen`` levels, a
        # genuine code could equal the sentinel -- confirm whether
        # ``fit`` should guard against that.
        fitted = self.classes_
        codes = np.array([
            np.searchsorted(fitted, x) if x in fitted else unseen
            for x in classes
        ])

        return codes[inverse]


class OneHotCategoricalEncoder(BaseSkutil, TransformerMixin):
    """This class achieves three things: first, it will fill in
    any NaN values in the object columns with a provided surrogate
    (if desired). Second, it will dummy out any categorical features
    using OneHotEncoding with a safety feature that can handle
    previously unseen values, and in the transform method will append
    the dummified features after the non-object columns. Finally, it
    will return either a Pandas ``DataFrame`` or a Numpy ``ndarray``,
    depending on ``as_df``.

    Parameters
    ----------

    fill : str, optional (default = 'Missing')
        The value that will fill the missing values in the object
        columns. If None, no filling is performed.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.


    Examples
    --------

        >>> import pandas as pd
        >>> import numpy as np
        >>> from skutil.preprocessing import OneHotCategoricalEncoder
        >>>
        >>> X = pd.DataFrame.from_records(data=np.array([
        ...                                  ['USA','RED','a'],
        ...                                  ['MEX','GRN','b'],
        ...                                  ['FRA','RED','b']]),
        ...                               columns=['A','B','C'])
        >>>
        >>> o = OneHotCategoricalEncoder(as_df=True)
        >>> o.fit_transform(X)
           A.FRA  A.MEX  A.USA  A.NA  B.GRN  B.RED  B.NA  C.a  C.b  C.NA
        0    0.0    0.0    1.0   0.0    0.0    1.0   0.0  1.0  0.0   0.0
        1    0.0    1.0    0.0   0.0    1.0    0.0   0.0  0.0  1.0   0.0
        2    1.0    0.0    0.0   0.0    0.0    1.0   0.0  0.0  1.0   0.0


    Attributes
    ----------

    obj_cols_ : array_like
        The list of object-type (categorical) features

    lab_encoders_ : array_like
        The fitted label encoders, one per entry of ``obj_cols_``
        and in the same order

    one_hot_ : an instance of a OneHotEncoder
        The fitted one-hot encoder, or None if the frame seen in
        ``fit`` had no object columns

    trans_nms_ : list
        The output feature names: the non-object column names
        followed by the dummified names
    """

    def __init__(self, fill='Missing', as_df=True):
        super(OneHotCategoricalEncoder, self).__init__(cols=None, as_df=as_df)
        self.fill = fill

    def fit(self, X, y=None):
        """Fit the encoder.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the object columns of the dataframe.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X, don't care about cols or the warning
        X, _ = validate_is_pd(X, None)

        # Extract the object columns
        obj_cols_ = X.select_dtypes(include=['object']).columns.values

        # If we need to fill in the NAs, take care of it. The fill is
        # applied to a separate frame (as in ``transform``) so that
        # fitting never writes into ``X``.
        objs = X[obj_cols_]
        if self.fill is not None:
            objs = objs.fillna(self.fill)

        # Set an array of uninitialized label encoders
        # Then use fit_transform for efficiency purposes
        # We can also set the dummy-level feature names in the same pass
        lab_encoders_ = []
        trans_array = []
        tnms = []

        unseen = _get_unseen()
        for nm in obj_cols_:
            encoder = SafeLabelEncoder()
            lab_encoders_.append(encoder)

            # This fits the reference to the encoder, and gets
            # the transformation. We then append a single unseen
            # value to the end as a safety for the transform method.
            # After the transpose, this is tantamount to appending a row
            # of unseen values so each feature can handle the 99999
            # This will expand the matrix by N columns, but if there's
            # no new values, they will be entirely zero and can be dropped later.
            encoded_array = np.append(encoder.fit_transform(objs[nm]), unseen)

            # Add the transformed row
            trans_array.append(encoded_array)

            # Update the names, one per fitted class...
            sequential_nms = ['%s.%s' % (nm, str(cls)) for cls in encoder.classes_]

            # ...and remember to append the NA col
            sequential_nms.append('%s.NA' % nm)
            tnms.append(sequential_nms)

        # Get the transpose
        trans = np.array(trans_array).transpose()

        # flatten the name array, append numeric names prior
        obj_set = set(obj_cols_)
        num_nms = [n for n in X.columns.values if n not in obj_set]
        trans_nms_ = [item for sublist in tnms for item in sublist]
        self.trans_nms_ = num_nms + trans_nms_

        # we might get an empty set of object cols
        shape_tup = trans.shape
        is_empty = len(shape_tup) < 2 or shape_tup[1] == 0  # zero cols

        # Now we can do the actual one hot encoding, set internal state
        self.one_hot_ = None if is_empty else OneHotEncoder().fit(trans)
        self.obj_cols_ = obj_cols_
        self.lab_encoders_ = lab_encoders_

        return self

    def transform(self, X):
        """Transform X, a DataFrame, by stripping
        out the object columns, dummifying them, and
        re-appending them to the end.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform.

        Returns
        -------

        x : Pandas ``DataFrame`` or np.ndarray, shape=(n_samples, n_features)
            The encoded dataframe or array
        """
        check_is_fitted(self, 'obj_cols_')
        # check on state of X, don't care about cols or warning
        X, _ = validate_is_pd(X, None)

        # if there is no encoder to speak of, just bail early
        if self.one_hot_ is None:
            return X if self.as_df else X.values

        # Retain just the numers
        obj_set = set(self.obj_cols_)
        numers = X[[nm for nm in X.columns.values if nm not in obj_set]]
        objs = X[self.obj_cols_]

        # If we need to fill in the NAs, take care of it
        if self.fill is not None:
            objs = objs.fillna(self.fill)

        # Do label encoding using the safe label encoders; the
        # encoders were stored in the same order as ``obj_cols_``
        trans = np.array([
            encoder.transform(objs[nm])
            for nm, encoder in zip(self.obj_cols_, self.lab_encoders_)
        ]).transpose()

        # Finally, get the one-hot encoding as a plain ndarray and
        # place it after the non-object columns
        oh = self.one_hot_.transform(trans).toarray()
        x = np.hstack((numers, oh))

        return x if not self.as_df else pd.DataFrame.from_records(data=x, columns=self.trans_nms_)