from __future__ import print_function, absolute_import, division
import pandas as pd
import numpy as np
from h2o.frame import H2OFrame
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder
from ..preprocessing.encode import _get_unseen
from .frame import _check_is_1d_frame
from .base import (BaseH2OTransformer, check_frame, _frame_from_x_y)
from .util import h2o_col_to_numpy
# Public names exported by ``from <this module> import *``; the private
# ``_H2OVecSafeOneHotEncoder`` helper is intentionally not listed.
__all__ = [
'H2OLabelEncoder',
'H2OSafeOneHotEncoder'
]
def _val_vec(y):
    """Run the 1d-frame check on ``y`` and hand ``y`` back unchanged.

    Parameters
    ----------
    y : ``H2OFrame``
        The frame to check. Presumably expected to hold exactly one
        column -- the actual rule lives in ``_check_is_1d_frame``.

    Returns
    -------
    y : ``H2OFrame``
        The very same object that was passed in.
    """
    # Only the side effect of the check matters here (presumably it raises
    # on a bad frame -- confirm in ``_check_is_1d_frame``); whatever the
    # checker returns is deliberately ignored.
    _check_is_1d_frame(y)
    return y
class H2OLabelEncoder(BaseH2OTransformer):
    """Encode the values of a single-column ``H2OFrame`` as integer labels.

    The work is delegated to scikit-learn's ``LabelEncoder``, which is
    fit on the column after it has been converted via
    ``h2o_col_to_numpy``. Labels therefore run from 0 through
    ``n_classes - 1`` (see the example below).

    Examples
    --------
    >>> def example():
    ...     import pandas as pd
    ...     from skutil.h2o import from_pandas
    ...     from skutil.h2o.transform import H2OLabelEncoder
    ...
    ...     x = pd.DataFrame.from_records(data=[
    ...                 [5, 4],
    ...                 [6, 2],
    ...                 [5, 1],
    ...                 [7, 9],
    ...                 [7, 2]], columns=['C1', 'C2'])
    ...
    ...     X = from_pandas(x)
    ...     encoder = H2OLabelEncoder()
    ...     encoder.fit_transform(X['C1'])
    >>>
    >>> example() # doctest: +SKIP
      C1
    ----
       0
       1
       0
       2
       2
    [5 rows x 1 column]

    Attributes
    ----------
    encoder_ : sklearn.preprocessing.LabelEncoder
        The fitted scikit-learn encoder.

    classes_ : np.ndarray
        The unique class levels

    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self):
        # The encoder works on one column handed directly to ``fit``, so
        # none of the feature-selection options of the base class apply.
        super(H2OLabelEncoder, self).__init__(
            feature_names=None,
            target_feature=None,
            exclude_features=None,
            min_version=self._min_version,
            max_version=self._max_version)

    def fit(self, column):
        """Fit the label encoder on a single column.

        Parameters
        ----------
        column : ``H2OFrame``
            The column to learn the class levels from. It is converted
            with ``h2o_col_to_numpy`` before being given to scikit-learn.

        Returns
        -------
        self
        """
        values = h2o_col_to_numpy(column)

        fitted = LabelEncoder()
        fitted.fit(values)

        self.encoder_ = fitted
        # mirror the learned levels on the transformer itself
        self.classes_ = fitted.classes_
        return self
class _H2OVecSafeOneHotEncoder(BaseH2OTransformer):
    """One-hot encode a single-column ``H2OFrame`` into a frame of dummies.

    ``fit`` records the unique levels of the column; ``transform`` then
    builds one indicator column per recorded level. Levels that first
    show up at transform time therefore get no dummy column of their
    own. A missing level (one for which ``pd.isnull`` is true) is matched
    against the string ``'NA'`` and its dummy column is suffixed ``nan``.

    The constructor takes no arguments: the feature-selection options of
    the base class are all passed as ``None``.

    Attributes
    ----------
    classes_ : list
        The unique levels found in the column given to ``fit``.

    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self):
        super(_H2OVecSafeOneHotEncoder, self).__init__(
            feature_names=None,
            target_feature=None,
            exclude_features=None,
            min_version=self._min_version,
            max_version=self._max_version)

    def fit(self, y):
        """Record the unique levels of a single-column frame.

        Parameters
        ----------
        y : ``H2OFrame``, shape=(n_samples, 1)
            The training frame on which to fit. Should
            be a single column ``H2OFrame``

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of unique levels exceeds the value returned
            by ``_get_unseen()``.
        """
        y = _val_vec(y)

        # pull the unique levels out of H2O: the transposed frame's first
        # row is read off as a plain Python list
        uniques = y.unique()
        as_pandas = uniques.as_data_frame()
        levels = as_pandas.T.iloc[0].tolist()

        # refuse columns with more levels than the configured ceiling
        ceiling = _get_unseen()
        n_levels = len(levels)
        if n_levels > ceiling:
            raise ValueError('max_classes=%i, but got %i'
                             % (ceiling, n_levels))

        self.classes_ = levels
        return self

    def transform(self, y):
        """Build the dummy columns for a new single-column frame.

        Parameters
        ----------
        y : ``H2OFrame``, shape=(n_samples, 1)
            The 1d ``H2OFrame`` to transform

        Returns
        -------
        output : ``H2OFrame`` or None
            One column per level stored in ``classes_``, each named
            ``<column name>.<level>`` (``nan`` stands in for a missing
            level). ``None`` if ``classes_`` is empty.
        """
        check_is_fitted(self, 'classes_')
        y = _val_vec(y)

        prefix = str(y.columns[0])
        combined = None

        for level in self.classes_:
            # a missing level is compared against the string 'NA', while
            # the column suffix for it is spelled 'nan'
            missing = pd.isnull(level)
            target = 'NA' if missing else level
            suffix = 'nan' if missing else level

            indicator = (y == target)
            indicator.columns = ['%s.%s' % (prefix, suffix)]

            # the first indicator seeds the output; later ones are bound on
            if combined is None:
                combined = indicator
            else:
                combined = combined.cbind(indicator)

        return combined
[docs]class H2OSafeOneHotEncoder(BaseH2OTransformer):
"""Given a set of feature_names, one-hot encodes (dummies)
a set of vecs into an expanded set of dummied columns. Will
drop the original columns after transformation, unless otherwise
specified.
Parameters
----------
feature_names : array_like (str) shape=(n_features,), optional (default=None)
The list of names on which to fit the transformer.
target_feature : str, optional (default=None)
The name of the target feature (is excluded from the fit)
for the estimator.
exclude_features : array_like (str) shape=(n_features,), optional (default=None)
Any names that should be excluded from ``feature_names``
drop_after_encoded : bool (default=True)
Whether to drop the original columns after transform
.. versionadded:: 0.1.0
"""
_min_version = '3.8.2.9'
_max_version = None
def __init__(self, feature_names=None, target_feature=None, exclude_features=None, drop_after_encoded=True):
    # Everything except ``drop_after_encoded`` is forwarded to the base
    # transformer, together with the class-level version bounds.
    base_kwargs = {
        'feature_names': feature_names,
        'target_feature': target_feature,
        'exclude_features': exclude_features,
        'min_version': self._min_version,
        'max_version': self._max_version,
    }
    super(H2OSafeOneHotEncoder, self).__init__(**base_kwargs)

    # stored as given; not used anywhere within this constructor
    self.drop_after_encoded = drop_after_encoded
def fit(self, X):
    """Fit one single-column one-hot encoder per selected feature.

    Parameters
    ----------
    X : ``H2OFrame``, shape=(n_samples, n_features)
        The training frame to fit

    Returns
    -------
    self
    """
    frame = check_frame(X, copy=False)

    # narrow the frame down to the columns that should be encoded
    to_encode = _frame_from_x_y(frame, self.feature_names, self.target_feature, self.exclude_features)

    # one fitted encoder per column, keyed by the column name as a str
    fitted = {}
    for col in to_encode.columns:
        name = str(col)
        fitted[name] = _H2OVecSafeOneHotEncoder().fit(to_encode[name])

    # only published once every column has been fit successfully
    self.encoders_ = fitted
    return self