# Source code for skutil.h2o.encode

from __future__ import print_function, absolute_import, division
import pandas as pd
import numpy as np
from h2o.frame import H2OFrame
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder
from ..preprocessing.encode import _get_unseen
from .frame import _check_is_1d_frame
from .base import (BaseH2OTransformer, check_frame, _frame_from_x_y)
from .util import h2o_col_to_numpy

__all__ = [
    'H2OLabelEncoder',
    'H2OSafeOneHotEncoder'
]


def _val_vec(y):
    """Run the 1d-frame check on ``y`` and hand ``y`` back.

    Parameters
    ----------

    y : ``H2OFrame``
        The frame to validate. Validation is delegated entirely to
        ``_check_is_1d_frame``; whatever that helper returns is
        ignored here.

    Returns
    -------

    y : ``H2OFrame``
        The very same object that was passed in.
    """
    # NOTE(review): presumably the helper raises when ``y`` is not a
    # single-column frame -- confirm against ``.frame``.
    _check_is_1d_frame(y)
    return y


class H2OLabelEncoder(BaseH2OTransformer):
    """Encode the values of a single-column ``H2OFrame`` as
    ordinal integer labels.

    The column is converted to a numpy array and a scikit-learn
    ``LabelEncoder`` is fit on it, so every distinct level is mapped
    to a label between 0 and ``n_classes - 1``.


    Examples
    --------

        >>> def example():
        ...     import pandas as pd
        ...     from skutil.h2o import from_pandas
        ...     from skutil.h2o.transform import H2OLabelEncoder
        ...
        ...     x = pd.DataFrame.from_records(data=[
        ...                 [5, 4],
        ...                 [6, 2],
        ...                 [5, 1],
        ...                 [7, 9],
        ...                 [7, 2]], columns=['C1', 'C2'])
        ...
        ...     X = from_pandas(x)
        ...     encoder = H2OLabelEncoder()
        ...     encoder.fit_transform(X['C1'])
        >>>
        >>> example() # doctest: +SKIP
          C1
        ----
           0
           1
           0
           2
           2
        [5 rows x 1 column]


    Attributes
    ----------

    encoder_ : sklearn.preprocessing.LabelEncoder
        The underlying fitted scikit-learn encoder

    classes_ : np.ndarray
        The unique class levels


    .. versionadded:: 0.1.0
    """
    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self):
        # This transformer works on one column handed to it directly,
        # so none of the base class's feature-selection arguments apply.
        super(H2OLabelEncoder, self).__init__(feature_names=None,
                                              target_feature=None,
                                              exclude_features=None,
                                              min_version=self._min_version,
                                              max_version=self._max_version)

    def fit(self, column):
        """Fit the label encoder on a single column.

        Parameters
        ----------

        column : ``H2OFrame``
            The column whose levels should be learned. It is
            converted to numpy via ``h2o_col_to_numpy``.

        Returns
        -------

        self
        """
        values = h2o_col_to_numpy(column)

        label_encoder = LabelEncoder()
        label_encoder.fit(values)

        self.encoder_ = label_encoder
        self.classes_ = label_encoder.classes_
        return self

    def transform(self, column):
        """Encode a single column using the fitted encoder.

        Parameters
        ----------

        column : ``H2OFrame``
            The column to encode. It is converted to numpy via
            ``h2o_col_to_numpy``.

        Returns
        -------

        ``H2OFrame``
            A new frame built from the encoded labels, reshaped to
            one column with as many rows as the converted input.
        """
        check_is_fitted(self, 'encoder_')
        values = h2o_col_to_numpy(column)

        # The labels are computed locally in numpy, so the result has
        # to be uploaded again as a brand new H2OFrame.
        n_rows = values.shape[0]
        labels = self.encoder_.transform(values)
        as_single_column = labels.reshape(n_rows, 1)
        return H2OFrame.from_python(as_single_column)


class _H2OVecSafeOneHotEncoder(BaseH2OTransformer):
    """One-hot encodes a single-column ``H2OFrame`` into an
    ``H2OFrame`` of dummy columns, one per level observed in ``fit``.

    Only levels recorded during ``fit`` receive a dummy column in
    ``transform``; a level that first shows up at transform time does
    not create a new column. A missing level (null) is matched
    against the string 'NA' and its dummy column is suffixed 'nan'.

    The constructor takes no arguments: the feature-selection
    arguments of the base class (``feature_names``,
    ``target_feature``, ``exclude_features``) are all passed as None.

    Attributes
    ----------

    classes_ : list
        The unique levels found in the column given to ``fit``


    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self):
        super(_H2OVecSafeOneHotEncoder, self).__init__(feature_names=None,
                                                       target_feature=None,
                                                       exclude_features=None,
                                                       min_version=self._min_version,
                                                       max_version=self._max_version)

    def fit(self, y):
        """Record the unique levels of a single-column frame.

        Parameters
        ----------

        y : ``H2OFrame``, shape=(n_samples, 1)
            The training frame on which to fit. Should
            be a single column ``H2OFrame``

        Returns
        -------

        self

        Raises
        ------

        ValueError
            If the number of unique levels is greater than the
            value returned by ``_get_unseen()``.
        """
        y = _val_vec(y)

        # pull the unique levels down and flatten them to a plain list
        unique_frame = y.unique().as_data_frame()
        levels = unique_frame.T.iloc[0].tolist()

        # NOTE(review): the cap on the number of levels is whatever
        # ``_get_unseen()`` returns; its value is not visible here.
        max_classes = _get_unseen()
        n_levels = len(levels)
        if n_levels > max_classes:
            raise ValueError('max_classes=%i, but got %i'
                             % (max_classes, n_levels))

        self.classes_ = levels
        return self

    def transform(self, y):
        """Build one dummy column per fitted level for a 1d frame.

        Parameters
        ----------

        y : ``H2OFrame``, shape=(n_samples, 1)
            The 1d ``H2OFrame`` to transform

        Returns
        -------

        output : ``H2OFrame`` or None
            The dummy columns, bound together in the order of
            ``classes_`` and named ``'<column>.<level>'``. None
            when ``classes_`` is empty.
        """
        check_is_fitted(self, 'classes_')
        y = _val_vec(y)

        col_name = str(y.columns[0])
        output = None

        for level in self.classes_:
            # A null level is compared against the string 'NA' but is
            # labelled 'nan' in the resulting column name.
            if pd.isnull(level):
                match_value, suffix = 'NA', 'nan'
            else:
                match_value, suffix = level, level

            # element-wise comparison against the level
            indicator = (y == match_value)
            indicator.columns = ['%s.%s' % (col_name, suffix)]

            # the first indicator seeds the output; the rest are cbound
            if output is None:
                output = indicator
            else:
                output = output.cbind(indicator)

        return output


class H2OSafeOneHotEncoder(BaseH2OTransformer):
    """Given a set of feature_names, one-hot encodes (dummies)
    a set of vecs into an expanded set of dummied columns. Will
    drop the original columns after transformation, unless otherwise
    specified.

    Parameters
    ----------

    feature_names : array_like (str) shape=(n_features,), optional (default=None)
        The list of names on which to fit the transformer.

    target_feature : str, optional (default=None)
        The name of the target feature (is excluded from the fit)
        for the estimator.

    exclude_features : array_like (str) shape=(n_features,), optional (default=None)
        Any names that should be excluded from ``feature_names``

    drop_after_encoded : bool (default=True)
        Whether to drop the original columns after transform

    Attributes
    ----------

    encoders_ : dict
        Maps each encoded column name (as ``str``) to the
        ``_H2OVecSafeOneHotEncoder`` fit on that column


    .. versionadded:: 0.1.0
    """

    _min_version = '3.8.2.9'
    _max_version = None

    def __init__(self, feature_names=None, target_feature=None, exclude_features=None, drop_after_encoded=True):
        super(H2OSafeOneHotEncoder, self).__init__(feature_names=feature_names,
                                                   target_feature=target_feature,
                                                   exclude_features=exclude_features,
                                                   min_version=self._min_version,
                                                   max_version=self._max_version)

        self.drop_after_encoded = drop_after_encoded

    def fit(self, X):
        """Fit the one hot encoder.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The training frame to fit

        Returns
        -------

        self
        """
        X = check_frame(X, copy=False)

        # these are just the features to encode
        cat = _frame_from_x_y(X, self.feature_names, self.target_feature, self.exclude_features)

        # fit one single-column encoder per selected feature
        self.encoders_ = {
            str(k): _H2OVecSafeOneHotEncoder().fit(cat[str(k)])
            for k in cat.columns
        }

        return self

    def transform(self, X):
        """Transform a new frame after fit.

        Parameters
        ----------

        X : ``H2OFrame``, shape=(n_samples, n_features)
            The frame to transform

        Returns
        -------

        X : ``H2OFrame``
            The frame with the dummy columns bound on at the end.
            The original encoded columns are removed first when
            ``drop_after_encoded`` is True. If no dummy columns
            were produced, the frame is returned without any
            columns bound on.
        """
        check_is_fitted(self, 'encoders_')
        X = check_frame(X, copy=True)
        enc = self.encoders_

        # these are just the features to encode. (we will return the
        # entire frame unless told not to...)
        cat = _frame_from_x_y(X, self.feature_names, self.target_feature, self.exclude_features)

        # Read the column names once: they are needed both for the
        # encoding loop and for the membership test below, and there
        # is no reason to re-evaluate the property per column of X.
        cat_columns = list(cat.columns)

        output = None
        for name in cat_columns:
            name = str(name)
            dummied = enc[name].transform(cat[name])

            # duplicative of R's cbind (bind columns together)
            output = dummied if output is None else output.cbind(dummied)

        # if we need to drop the original columns, we do that here:
        if self.drop_after_encoded:
            encoded = set(cat_columns)
            keep_nms = [str(n) for n in X.columns if n not in encoded]
            X = X[keep_nms]

        # Nothing was encoded, so there is nothing to bind; avoid
        # handing None to cbind.
        if output is None:
            return X

        # cbind the dummies at the end
        return X.cbind(output)