from __future__ import print_function, division, absolute_import
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d
from sklearn.preprocessing.label import _check_numpy_unicode_bug
import numpy as np
import pandas as pd
from skutil.base import BaseSkutil
from skutil.utils import validate_is_pd
# Public names of this module: the two encoder classes defined below.
# The private helper ``_get_unseen`` is intentionally not listed.
__all__ = [
'SafeLabelEncoder',
'OneHotCategoricalEncoder'
]
def _get_unseen():
"""Basically just a static method
instead of a class attribute to avoid
someone accidentally changing it."""
return 99999
class SafeLabelEncoder(LabelEncoder):
    """An extension of ``LabelEncoder`` that will not throw an exception
    for unseen data, but will instead return a default value of 99999.

    Fitting is inherited unchanged from ``LabelEncoder``; only
    ``transform`` is overridden so that labels absent from ``classes_``
    are mapped to the sentinel returned by ``_get_unseen()``.

    Attributes
    ----------
    classes_ : array_like
        The classes that are encoded.
    """

    def transform(self, y):
        """Encode ``y``, mapping labels that were not seen in ``fit`` to 99999.

        Parameters
        ----------
        y : array_like, shape=(n_samples,)
            The labels to encode.

        Returns
        -------
        encoded : np.ndarray, shape=(n_samples,)
            For each label, its position in ``classes_`` if it was seen
            during fitting, otherwise the unseen sentinel (99999).

        Raises
        ------
        ValueError
            If there are so many fitted classes that a genuine class code
            would be indistinguishable from the unseen sentinel.
        """
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)

        unseen = _get_unseen()

        # Codes run from 0 to len(classes_) - 1; once that range reaches the
        # sentinel, a real class and "unseen" could no longer be told apart.
        if len(self.classes_) > unseen:
            raise ValueError('Too many factor levels in feature. Max is %i' % unseen)

        # The code of a class is its position in ``classes_``; build the
        # label -> code table once so each lookup is O(1).
        lookup = {label: code for code, label in enumerate(self.classes_)}

        # Explicit integer dtype so an empty ``y`` still yields integer codes
        return np.array([lookup.get(label, unseen) for label in y], dtype=np.int64)
[docs]class OneHotCategoricalEncoder(BaseSkutil, TransformerMixin):
"""This class achieves three things: first, it will fill in
any NaN values with a provided surrogate (if desired). Second,
it will dummy out any categorical features using OneHotEncoding
with a safety feature that can handle previously unseen values,
and in the transform method will re-append the dummified features
to the dataframe. Finally, it will return either a Pandas ``DataFrame``
or a numpy ``ndarray``, depending on the ``as_df`` parameter.
Parameters
----------
fill : str, optional (default = 'Missing')
The value that will fill the missing values in the column
as_df : bool, optional (default=True)
Whether to return a Pandas ``DataFrame`` in the ``transform``
method. If False, will return a Numpy ``ndarray`` instead.
Since most skutil transformers depend on explicitly-named
``DataFrame`` features, the ``as_df`` parameter is True by default.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from skutil.preprocessing import OneHotCategoricalEncoder
>>>
>>> X = pd.DataFrame.from_records(data=np.array([
... ['USA','RED','a'],
... ['MEX','GRN','b'],
... ['FRA','RED','b']]),
... columns=['A','B','C'])
>>>
>>> o = OneHotCategoricalEncoder(as_df=True)
>>> o.fit_transform(X)
A.FRA A.MEX A.USA A.NA B.GRN B.RED B.NA C.a C.b C.NA
0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
1 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
Attributes
----------
obj_cols_ : array_like
The list of object-type (categorical) features
lab_encoders_ : array_like
The label encoders
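one_hot_ : an instance of a OneHotEncoder, or None if the fitted frame had no object columns
trans_nms_ : the output feature names (the non-object column names followed by the dummified names)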
"""
def __init__(self, fill='Missing', as_df=True):
    """Store the encoder's parameters; nothing is fitted here.

    Parameters
    ----------
    fill : str, optional (default='Missing')
        The value that will fill the missing values in the object
        columns. ``fit`` skips the fill step when this is None.

    as_df : bool, optional (default=True)
        Forwarded to the base class constructor.
    """
    # ``cols`` is always None: the columns to encode are not user-supplied
    # but discovered by dtype (object columns) when ``fit`` is called.
    super(OneHotCategoricalEncoder, self).__init__(cols=None, as_df=as_df)
    self.fill = fill
def fit(self, X, y=None):
    """Fit the encoder.

    Sets ``obj_cols_``, ``lab_encoders_``, ``one_hot_`` and
    ``trans_nms_`` on the instance.

    Parameters
    ----------
    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The Pandas frame to fit. The frame will only
        be fit on the object columns of the dataframe.

    y : None
        Passthrough for ``sklearn.pipeline.Pipeline``. Even
        if explicitly set, will not change behavior of ``fit``.

    Returns
    -------
    self
    """
    # check on state of X, don't care about cols or the warning
    X, _ = validate_is_pd(X, None)

    # Extract the object columns
    obj_cols_ = X.select_dtypes(include=['object']).columns.values

    # If we need to fill in the NAs, take care of it. There is nothing
    # to fill (or assign) when the frame has no object columns.
    # NOTE(review): this assigns into the frame returned by
    # ``validate_is_pd``; confirm that helper hands back a copy so the
    # caller's frame is not modified.
    if self.fill is not None and len(obj_cols_) > 0:
        X[obj_cols_] = X[obj_cols_].fillna(self.fill)

    # Build one label encoder per object column, using fit_transform
    # for efficiency purposes. The dummy-level feature names are
    # collected in the same pass.
    lab_encoders_ = []
    trans_array = []
    tnms = []
    unseen = _get_unseen()

    for nm in obj_cols_:
        encoder = SafeLabelEncoder()
        lab_encoders_.append(encoder)

        # This fits the encoder and gets the transformation. We then
        # append a single unseen value to the end as a safety for the
        # transform method. After the transpose, this is tantamount to
        # appending a row of unseen values so each feature can handle
        # the 99999. This will expand the matrix by N columns, but if
        # there's no new values, they will be entirely zero and can be
        # dropped later.
        encoded_array = np.append(encoder.fit_transform(X[nm]), unseen)
        trans_array.append(encoded_array)

        # One name per encoded class, in ``classes_`` order ...
        sequential_nms = ['%s.%s' % (nm, str(level)) for level in encoder.classes_]

        # ... plus the NA col that corresponds to the unseen sentinel.
        # The 1-tuple keeps the formatting valid for tuple column labels.
        sequential_nms.append('%s.NA' % (nm,))
        tnms.append(sequential_nms)

    # Get the transpose: rows are samples (plus the trailing sentinel
    # row), columns are the object features.
    trans = np.array(trans_array).transpose()

    # flatten the name array, append numeric names prior. A set gives
    # constant-time membership tests instead of scanning the array for
    # every column.
    obj_col_set = set(obj_cols_)
    num_nms = [n for n in X.columns.values if n not in obj_col_set]
    trans_nms_ = [item for sublist in tnms for item in sublist]
    self.trans_nms_ = num_nms + trans_nms_

    # we might get an empty set of object cols
    shape_tup = trans.shape
    is_empty = len(shape_tup) < 2 or shape_tup[1] == 0  # zero cols

    # Now we can do the actual one hot encoding, set internal state
    self.one_hot_ = None if is_empty else OneHotEncoder().fit(trans)
    self.obj_cols_ = obj_cols_
    self.lab_encoders_ = lab_encoders_

    return self