
"""Sklearn-esque grid searches for H2O frames"""
# Author : Taylor Smith, originally adapted from sklearn for use with H2O datastructures
# License: BSD

from __future__ import division, print_function, absolute_import
import time
from abc import abstractmethod
import h2o
import numpy as np
import pandas as pd

try:
    from h2o import H2OEstimator
except ImportError:
    from h2o.estimators.estimator_base import H2OEstimator

from .pipeline import H2OPipeline
from .frame import _check_is_1d_frame
from .base import check_frame, BaseH2OFunctionWrapper, validate_x_y, VizMixin
from ..base import overrides, since
from ..utils import report_grid_score_detail
from ..utils.fixes import dict_keys
from ..utils.metaestimators import if_delegate_has_method, if_delegate_isinstance
from ..grid_search import _CVScoreTuple, _check_param_grid
from ..metrics import GainsStatisticalReport
from .split import *
from . import metrics

from sklearn.externals.joblib import logger
from sklearn.base import clone
from sklearn.utils.validation import check_is_fitted
from sklearn.externals import six
from h2o.estimators import (H2ODeepLearningEstimator,
                            H2OGradientBoostingEstimator,
                            H2OGeneralizedLinearEstimator,
                            H2ONaiveBayesEstimator,
                            H2ORandomForestEstimator)

try:
    import cPickle as pickle
except ImportError:
    import pickle

# >= sklearn 0.18
try:
    from sklearn.model_selection import ParameterSampler, ParameterGrid

    SK18 = True
except ImportError:
    from sklearn.grid_search import ParameterSampler, ParameterGrid

    SK18 = False

__all__ = [
    'H2OGridSearchCV',
    'H2ORandomizedSearchCV',
    'H2OGainsRandomizedSearchCV'
]

# map for the metrics
SCORERS = metrics.SCORERS

"""These parameters are ones h2o stores
that we don't necessarily want to clone.
"""
PARM_IGNORE = {
    'model_id',
    'fold_column',
    'fold_assignment',
    'keep_cross_validation_predictions',
    'offset_column',
    'checkpoint',
    'training_frame',
    'validation_frame',
    'response_column',
    'ignored_columns',
    'max_confusion_matrix_size',
    'score_each_iteration',
    'histogram_type',
    'col_sample_rate',
    'stopping_metric',
    'weights_column',
    'stopping_rounds',
    'col_sample_rate_change_per_level',
    'max_hit_ratio_k',
    'nbins_cats',
    'class_sampling_factors',
    'ignore_const_cols',
    'keep_cross_validation_fold_assignment'
}


def _as_numpy(_1d_h2o_frame):
    """Takes a single column h2o frame and
    converts it into a numpy array
    """
    f = _check_is_1d_frame(_1d_h2o_frame)

    nm = str(f.columns[0])
    return f[nm].as_data_frame(use_pandas=True)[nm].values
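
# A hedged usage sketch; ``f`` is a hypothetical single-column H2OFrame holding
# the values [1, 2, 3] (assumes a running h2o session):
#
#     >>> _as_numpy(f)  # doctest: +SKIP
#     array([1, 2, 3])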


def _kv_str(k, v):
    k = str(k)  # h2o likes unicode...
    # likewise, if the v is unicode, let's make it a string.
    v = v if not isinstance(v, six.string_types) else str(v)
    return k, v


def _clone_h2o_obj(estimator, ignore=False, **kwargs):
    # do initial clone
    est = clone(estimator)

    # set kwargs:
    if kwargs:
        for k, v in six.iteritems(kwargs):
            setattr(est, k, v)

    # check on h2o estimator
    if isinstance(estimator, H2OPipeline):
        # the last step from the original estimator
        e = estimator.steps[-1][1]
        if isinstance(e, H2OEstimator):
            last_step = est.steps[-1][1]

            # so it's the last step
            for k, v in six.iteritems(e._parms):
                k, v = _kv_str(k, v)

                # if (not k in PARM_IGNORE) and (not v is None):
                #   e._parms[k] = v
                last_step._parms[k] = v

    # otherwise it's a BaseH2OFunctionWrapper; no H2O params to copy over
    return est
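
# A hedged sketch of how ``_clone_h2o_obj`` is used internally: clone a pipeline
# (or bare estimator) and push extra attributes onto the clone. The pipeline
# ``pipe`` below is a hypothetical, already-constructed H2OPipeline:
#
#     >>> new_pipe = _clone_h2o_obj(pipe,  # doctest: +SKIP
#     ...                           feature_names=['a', 'b'],
#     ...                           target_feature='y')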


def _new_base_estimator(est, clonable_kwargs):
    """When the grid searches are pickled, the estimator
    has to be dropped out. When we load it back in, we have
    to reinstate a new one, since the fit is predicated on
    being able to clone a base estimator, we've got to have
    an estimator to clone and fit.

    Parameters
    ----------

    est : str
        The type of model to build

    Returns
    -------

    estimator : H2OEstimator
        The cloned base estimator
    """
    est_map = {
        'dl':   H2ODeepLearningEstimator,
        'gbm':  H2OGradientBoostingEstimator,
        'glm':  H2OGeneralizedLinearEstimator,
        # 'glrm': H2OGeneralizedLowRankEstimator,
        # 'km'  : H2OKMeansEstimator,
        'nb':   H2ONaiveBayesEstimator,
        'rf':   H2ORandomForestEstimator
    }

    estimator = est_map[est]()  # initialize the new ones
    for k, v in six.iteritems(clonable_kwargs):
        k, v = _kv_str(k, v)
        estimator._parms[k] = v

    return estimator
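
# A hedged example of rebuilding a base estimator from its string key and a dict
# of clonable ``_parms`` (the values shown are illustrative only):
#
#     >>> est = _new_base_estimator('rf', {'ntrees': 100})  # doctest: +SKIP
#     >>> isinstance(est, H2ORandomForestEstimator)  # doctest: +SKIP
#     True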


def _get_estimator_string(estimator):
    """Looks up the estimator string in the reverse
    dictionary. This way we can regenerate the base 
    estimator. This is kind of a hack...

    Parameters
    ----------

    estimator : H2OEstimator
        The estimator whose type to look up

    Returns
    -------

    name : str
        The abbreviated key for the estimator type, one of
        ('dl', 'gbm', 'glm', 'nb', 'rf')
    """
    if isinstance(estimator, H2ODeepLearningEstimator):
        return 'dl'
    elif isinstance(estimator, H2OGradientBoostingEstimator):
        return 'gbm'
    elif isinstance(estimator, H2OGeneralizedLinearEstimator):
        return 'glm'
    # elif isinstance(estimator, H2OGeneralizedLowRankEstimator):
    #    return 'glrm'
    # elif isinstance(estimator, H2OKMeansEstimator):
    #    return 'km'
    elif isinstance(estimator, H2ONaiveBayesEstimator):
        return 'nb'
    elif isinstance(estimator, H2ORandomForestEstimator):
        return 'rf'
    else:
        raise TypeError('unknown type for gridsearch: %s'
                        % type(estimator))


def _score(estimator, frame, target_feature, scorer, is_regression, **kwargs):
    y_truth = frame[target_feature]

    # gen predictions...
    pred = estimator.predict(frame)['predict']

    # it's calling an h2o scorer at this point
    return scorer.score(y_truth, pred, **kwargs)
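
# A hedged sketch of a direct ``_score`` call. ``fit_est`` and ``test_frame`` are
# hypothetical; the scorer is built the same way ``BaseH2OSearchCV._fit`` builds it:
#
#     >>> scorer = metrics.make_h2o_scorer(SCORERS['accuracy_score'],  # doctest: +SKIP
#     ...                                  test_frame['y'])
#     >>> _score(fit_est, test_frame, 'y', scorer, is_regression=False)  # doctest: +SKIP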


def _fit_and_score(estimator, frame, feature_names, target_feature,
                   scorer, parameters, verbose, scoring_params,
                   train, test, is_regression, act_args,
                   cv_fold, iteration):
    """Fits the current fold on the current parameters.

        Parameters
        ----------

        estimator : H2OPipeline or H2OEstimator
            The estimator to fit

        frame : H2OFrame, shape=(n_samples, n_features)
            The training frame

        feature_names : iterable (str)
            The feature names on which to train

        target_feature : str
            The name of the target feature

        scorer : H2OScorer
            The scoring function

        parameters : dict
            The parameters to set in the estimator clone

        verbose : int
            The level of verbosity

        scoring_params : dict
            The parameters to pass as kwargs to the scoring function

        train : iterable, shape=(n_train_samples,)
            The train fold indices

        test : iterable, shape=(n_test_samples,)
            The test fold indices

        is_regression : bool
            Whether we are fitting a continuous target

        act_args : dict
            :class:``skutil.metrics.GainsStatisticalReport`` args if called 
            from a :class:``skutil.h2o.H2OGainsRandomizedSearchCV``. Otherwise,
            these are unused.

        cv_fold : int
            The fold number for reporting

        iteration : int
            The iteration number for reporting

        Returns
        -------

        out : list, shape=(4,)
            test_score : float
                The score produced by the ``_score`` method
                on the test fold of the training set.

            len(test) : int
                The number of samples included in the
                test fold of the training set. Used later
                for IID normalizing of test scores.

            estimator : ``H2OEstimator`` or ``H2OPipeline``
                The fit pipeline or estimator. Used for later
                scoring on the validation set.

            parameters : dict
                The parameters used to fit this estimator.
    """
    if parameters is None:
        parameters = {}

    if verbose > 1:
        if not parameters:
            msg = ''
        else:
            msg = 'Target: %s; %s' % (target_feature, ', '.join('%s=%s' % (k, v) for k, v in parameters.items()))
        print("[CV (iter %i, fold %i)] %s %s" % (iteration, cv_fold, msg, (64 - len(msg)) * '.'))

    # h2o doesn't currently re-order rows... and sometimes will
    # complain for some reason. We need to sort our train/test idcs
    train = sorted(train)
    test = sorted(test)

    # if act_args, then it's a gains search. We just need to slice
    # our existing numpy arrays
    if act_args is not None:
        kwargs = {
            'expo': act_args['expo'][test],
            'loss': act_args['loss'][test],
            'prem': act_args['prem'][test] if act_args['prem'] is not None else None
        }
    else:
        kwargs = scoring_params

    # generate split
    train_frame = frame[train, :]
    test_frame = frame[test, :]

    start_time = time.time()

    # it's probably a pipeline
    is_h2o_est = isinstance(estimator, H2OEstimator)
    if not is_h2o_est:
        estimator.set_params(**parameters)

        # the name setting should be taken care of pre-clone...
        # setattr(estimator, 'feature_names', feature_names)
        # setattr(estimator, 'target_feature',target_feature)

        # do fit
        estimator.fit(train_frame)
    else:  # it's just an H2OEstimator
        # parm_dict = {}
        for k, v in six.iteritems(parameters):
            if '__' in k:
                raise ValueError('only one estimator passed to grid search, '
                                 'but multiple named parameters passed: %s' % k)

            # {parm_name : v}
            estimator._parms[k] = v

        # do train
        estimator.train(training_frame=train_frame, x=feature_names, y=target_feature)

    # score model
    test_score = _score(estimator, test_frame, target_feature, scorer, is_regression, **kwargs)

    # h2o is verbose.. if we are too, print a new line:
    if verbose > 1:
        print()  # new line

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ', score=%f' % test_score
    if verbose > 1:
        end_msg = '%s -%s' % (msg, logger.short_format_time(scoring_time))
        print('[CV (iter %i, fold %i)] %s %s' % (iteration, cv_fold, (64 - len(end_msg)) * '.', end_msg))
        print()  # new line
        print()  # new line

    out = [test_score, len(test), estimator, parameters]
    return out
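
# A hedged sketch of how ``BaseH2OSearchCV._fit`` invokes ``_fit_and_score`` for a
# single (parameter-setting, fold) pair; ``est``, ``X``, ``scorer`` and the index
# lists are hypothetical placeholders:
#
#     >>> test_score, n_test, fit_est, params = _fit_and_score(  # doctest: +SKIP
#     ...     estimator=est, frame=X, feature_names=['a', 'b'], target_feature='y',
#     ...     scorer=scorer, parameters={'ntrees': 50}, verbose=0, scoring_params={},
#     ...     train=train_idcs, test=test_idcs, is_regression=False,
#     ...     act_args=None, cv_fold=0, iteration=0)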


class BaseH2OSearchCV(BaseH2OFunctionWrapper, VizMixin):
    """Base for all H2O grid searches"""

    _min_version = '3.8.2.9'
    _max_version = None

    @abstractmethod
    def __init__(self, estimator, feature_names,
                 target_feature, scoring=None,
                 scoring_params=None,
                 cv=5, verbose=0, iid=True,
                 validation_frame=None,
                 minimize='bias'):

        super(BaseH2OSearchCV, self).__init__(target_feature=target_feature,
                                              min_version=self._min_version,
                                              max_version=self._max_version)

        self.estimator = estimator
        self.feature_names = feature_names
        self.scoring = scoring
        self.scoring_params = scoring_params if scoring_params else {}
        self.cv = cv
        self.verbose = verbose
        self.iid = iid
        self.validation_frame = validation_frame
        self.minimize = minimize

    def _fit(self, X, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        X = check_frame(X, copy=True)  # copy because who knows what people do inside of score...

        self.feature_names, self.target_feature = validate_x_y(X, self.feature_names, self.target_feature)
        self.is_regression_ = (not X[self.target_feature].isfactor()[0])

        # local scope
        minimize = self.minimize
        estimator = self.estimator

        # ensure minimize is in {'bias', 'variance'}
        min_permitted = ['bias', 'variance']
        if minimize not in min_permitted:
            raise ValueError('minimize must be one of %s, but got %s' % (', '.join(min_permitted), str(minimize)))

        # validate the estimator... for grid search, we ONLY ALLOW the last step
        # of the grid search estimator to be an H2OEstimator. That means pipelines
        # that don't end in an estimator are invalid. If it's not a pipeline, it must
        # be an h2oestimator
        if isinstance(estimator, H2OPipeline):
            if not isinstance(estimator._final_estimator, H2OEstimator):
                raise TypeError('if estimator is H2OPipeline, its _final_estimator must '
                                'be of type H2OEstimator. Got %s' % type(estimator._final_estimator))
        elif not isinstance(estimator, H2OEstimator):
            raise TypeError('estimator must be an H2OPipeline or an H2OEstimator. Got %s' % type(estimator))

        # the addition of the gains search necessitates some hackiness.
        # if we have the attr 'extra_args_' then we know it's a gains search
        xtra = self.extra_args_ if hasattr(self, 'extra_args_') else None  # the np arrays themselves
        # the names of the prem, exp, loss features
        xtra_nms = self.extra_names_ if hasattr(self, 'extra_names_') else None

        # we need to require scoring...
        scoring = self.scoring
        # if this is a gains search, we don't need to h2o-ize the scorer
        if hasattr(self, 'scoring_class_') or xtra is not None:
            pass
        else:
            if scoring is None:
                # set defaults
                if self.is_regression_:
                    scoring = 'r2_score'
                else:
                    scoring = 'accuracy_score'

            # make strs into scoring functions
            if isinstance(scoring, six.string_types):
                if scoring not in SCORERS:
                    raise ValueError('Scoring must be one of (%s) or a callable. '
                                     'Got %s' % (', '.join(dict_keys(SCORERS)), scoring))

                scoring = SCORERS[scoring]
                
            # make it a scorer
            if hasattr(scoring, '__call__'):
                self.scoring_class_ = metrics.make_h2o_scorer(scoring, X[self.target_feature])
            else:  # should be impossible to get here
                raise TypeError('expected string or callable for scorer, but got %s' % type(self.scoring))

        # validate CV
        cv = check_cv(self.cv)

        # clone estimator
        nms = {
            'feature_names': self.feature_names,
            'target_feature': self.target_feature
        }

        # do first clone, remember to set the names...
        base_estimator = _clone_h2o_obj(self.estimator, **nms)

        # do fits, scores
        out = [
            _fit_and_score(estimator=_clone_h2o_obj(base_estimator),
                           frame=X, feature_names=self.feature_names,
                           target_feature=self.target_feature,
                           scorer=self.scoring_class_, parameters=params,
                           verbose=self.verbose, scoring_params=self.scoring_params,
                           train=train, test=test, is_regression=self.is_regression_,
                           act_args=xtra, cv_fold=cv_fold, iteration=iteration)
            for iteration, params in enumerate(parameter_iterable)
            for cv_fold, (train, test) in enumerate(cv.split(X, self.target_feature))
            ]

        # Out is a list of quad: score, n_test_samples, estimator, parameters
        n_fits = len(out)
        n_folds = cv.get_n_splits()

        # if a validation frame was passed, user might want to see how it scores
        # on each model, so we'll do that here...
        if self.validation_frame is not None:
            score_validation = True
            self.validation_scores = []

            if xtra_nms is not None:
                self.val_score_report_ = GainsStatisticalReport(
                    n_folds=n_folds,
                    n_iter=n_fits // n_folds,
                    iid=self.iid)

                # set scoring function
                val_scorer = self.val_score_report_

                kwargs = {
                    'expo': _as_numpy(self.validation_frame[xtra_nms['expo']]),
                    'loss': _as_numpy(self.validation_frame[xtra_nms['loss']]),
                    'prem': _as_numpy(self.validation_frame[xtra_nms['prem']]) if (
                        xtra_nms['prem'] is not None) else None
                }
            else:
                kwargs = self.scoring_params
                val_scorer = self.scoring_class_
        else:
            score_validation = False

        # do scoring
        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []

            # iterate over OUT
            for this_score, this_n_test_samples, this_estimator, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)

                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score

                # score validation set if necessary
                if score_validation:
                    val_score = _score(this_estimator, self.validation_frame,
                                       self.target_feature, val_scorer,
                                       self.is_regression_, **kwargs)

                    # if it's gains scorer, handles the iid condition internally...
                    self.validation_scores.append(val_score)

            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)

            scores.append((score, parameters))
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))

        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        is_bias = minimize == 'bias'
        # else == variance
        the_key = (lambda x: x.mean_validation_score) if is_bias else (lambda x: x.cv_validation_scores.std())
        best = sorted(grid_scores, key=the_key, reverse=is_bias)[0]

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = _clone_h2o_obj(base_estimator, **nms)

        # if verbose alert user we're at the end...
        if self.verbose > 1:
            msg = 'Target: %s; %s' % (self.target_feature, ', '.join('%s=%s' % (k, v)
                                                                     for k, v in six.iteritems(best.parameters)))
            print("\nFitting best hyperparameters across all folds")
            print("[BEST] %s %s" % (msg, (64 - len(msg)) * '.'))

        # set params -- remember h2o gets funky with this...
        if isinstance(best_estimator, H2OEstimator):
            for k, v in six.iteritems(best.parameters):
                best_estimator._parms[k] = v
            best_estimator.train(training_frame=X, x=self.feature_names, y=self.target_feature)
        else:
            best_estimator.set_params(**best.parameters)
            best_estimator.fit(X)

        # Set the best estimator
        self.best_estimator_ = best_estimator

        return self

    def score(self, frame):
        """After the grid search is fit, generates and scores 
        the predictions of the ``best_estimator_``.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The test frame on which to predict

        Returns
        -------

        scor : float
            The score of the test predictions
        """
        check_is_fitted(self, 'best_estimator_')
        frame = check_frame(frame, copy=True)  # copy because who knows what people do inside of score...
        scor = _score(self.best_estimator_, frame, self.target_feature,
                      self.scoring_class_, self.is_regression_,
                      **self.scoring_params)
        return scor

    @if_delegate_has_method(delegate='best_estimator_')
    def predict(self, frame):
        """After the grid search is fit, generates predictions 
        on the test frame using the ``best_estimator_``.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The test frame on which to predict

        Returns
        -------

        p : H2OFrame, shape=(n_samples, 3 if is_classif else 1)
            The test predictions
        """
        frame = check_frame(frame, copy=False)  # don't copy because predict doesn't need it
        p = self.best_estimator_.predict(frame)
        return p

    def fit_predict(self, frame):
        """First, fits the grid search and then generates predictions 
        on the training frame using the ``best_estimator_``.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to predict

        Returns
        -------

        p : H2OFrame, shape=(n_samples, 3 if is_classif else 1)
            The training predictions
        """
        p = self.fit(frame).predict(frame)
        return p

    @since('0.1.2')
    @if_delegate_isinstance(delegate='best_estimator_', instance_type=(H2OEstimator, H2OPipeline))
    def download_pojo(self, path="", get_jar=True):
        """This method is injected at runtime if the ``best_estimator_``
        is an instance of an ``H2OEstimator``. This method downloads the POJO
        from a fit estimator.

        Parameters
        ----------

        path : string, optional (default="")
            Path to folder in which to save the POJO.
            
        get_jar : bool, optional (default=True)
            Whether to get the jar from the POJO.

        Returns
        -------

        None or string
            Returns None if ``path`` is "", else the filepath
            where the POJO was saved.
        """
        is_h2o = isinstance(self.best_estimator_, H2OEstimator)
        if is_h2o:
            return h2o.download_pojo(self.best_estimator_, path=path, get_jar=get_jar)
        else:
            return self.best_estimator_.download_pojo(path=path, get_jar=get_jar)

    @overrides(VizMixin)
    def plot(self, timestep, metric):
        """Plot an H2OEstimator's performance over a
        given ``timestep`` (x-axis) against a provided 
        ``metric`` (y-axis).

        Parameters
        ----------

        timestep : str
            A timestep as defined in the H2O API. One of
            ("AUTO", "duration", "number_of_trees").

        metric : str
            The performance metric to evaluate. One of
            ("log_likelihood", "objective", "MSE", "AUTO")
        """
        check_is_fitted(self, 'best_estimator_')

        # then it's a pipeline:
        if hasattr(self.best_estimator_, 'plot'):
            self.best_estimator_.plot(timestep=timestep, metric=metric)
        else:
            # should be an H2OEstimator
            self.best_estimator_._plot(timestep=timestep, metric=metric)

    @staticmethod
    def load(location):
        """Loads a persisted state of an instance of BaseH2OSearchCV
        from disk. This method will handle loading H2OEstimator models separately 
        and outside of the constraints of the pickle package. 

        Note that this is a static method and should be called accordingly:

            >>> def load_search():
            ...     return BaseH2OSearchCV.load('path/to/h2o/search.pkl') # GOOD!
            >>>
            >>> search = load_search() # doctest: +SKIP

        Also note that since BaseH2OSearchCV will contain an H2OEstimator, its
        ``load`` functionality differs from that of its superclass, BaseH2OFunctionWrapper,
        and will not function properly if called at the highest level of abstraction:

            >>> def load_search():
            ...     return BaseH2OFunctionWrapper.load('path/to/h2o/search.pkl') # BAD!
            >>>
            >>> search = load_search() # doctest: +SKIP

        Furthermore, trying to load a different type of BaseH2OFunctionWrapper from
        this method will raise a TypeError:

            >>> def load_search():
            ...     return BaseH2OSearchCV.load('path/to/some/other/transformer.pkl') # BAD!
            >>>
            >>> search = load_search() # doctest: +SKIP

        Parameters
        ----------

        location : str
            The location where the persisted BaseH2OSearchCV model resides.

        Returns
        -------

        model : BaseH2OSearchCV
            The unpickled instance of the BaseH2OSearchCV model
        """
        with open(location, 'rb') as f:
            model = pickle.load(f)

        if not isinstance(model, BaseH2OSearchCV):
            raise TypeError('expected BaseH2OSearchCV, got %s' % type(model))

        # read the model portion, delete the model path
        ex = None
        the_h2o_est = None
        for pth in [model.model_loc_, 'hdfs://%s' % model.model_loc_]:
            try:
                the_h2o_est = h2o.load_model(pth)
            except Exception as e:
                if ex is None:
                    ex = e
                else:
                    # only throws if fails twice
                    raise ex

            # break if successfully loaded
            if the_h2o_est is not None:
                break

        # if best_estimator_ is None, then it's simply the H2OEstimator,
        # otherwise it's going to be the H2OPipeline
        if model.best_estimator_ is None:
            model.best_estimator_ = the_h2o_est
            model.estimator = _new_base_estimator(model.est_type_, model.base_estimator_parms_)
        else:
            model.best_estimator_.steps[-1] = (model.est_name_, the_h2o_est)
            model.estimator.steps[-1] = (
                model.est_name_, _new_base_estimator(model.est_type_, model.base_estimator_parms_))

        return model

    def _save_internal(self, **kwargs):
        check_is_fitted(self, 'best_estimator_')
        best_estimator = self.best_estimator_
        estimator = self.estimator

        # where we'll save things
        loc = kwargs.pop('location')
        model_loc = kwargs.pop('model_location')

        # need to save the h2o est before anything else. Note that since
        # we verify pre-fit that the _final_estimator is of type H2OEstimator,
        # we can assume nothing has changed internally...
        is_pipe = False
        if isinstance(best_estimator, H2OPipeline):
            self.est_name_ = best_estimator.steps[-1][0]  # don't need to duplicate--can use for base

            the_h2o_est = best_estimator._final_estimator
            the_base_est = estimator._final_estimator

            is_pipe = True
        else:
            # otherwise it's the H2OEstimator
            the_h2o_est = best_estimator
            the_base_est = estimator

        # get the key that will map to the new H2OEstimator
        self.est_type_ = _get_estimator_string(the_base_est)

        # first, save the best estimator's H2O piece...
        force = kwargs.pop('force', False)
        self.model_loc_ = h2o.save_model(model=the_h2o_est, path=model_loc, force=force)

        # set to none for pickling, and then restore state for scoring
        if is_pipe:
            last_step_ = best_estimator.steps[-1]
            best_estimator.steps[-1] = None

            base_last_step_ = estimator.steps[-1]
            estimator.steps[-1] = None
            self.base_estimator_parms_ = base_last_step_[1]._parms  # it's a tuple...
        else:
            last_step_ = self.best_estimator_
            base_last_step_ = self.estimator
            self.best_estimator_ = None
            self.estimator = None
            self.base_estimator_parms_ = base_last_step_._parms

        # now save the rest of things...
        with open(loc, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

        # restore state for re-use
        if is_pipe:
            best_estimator.steps[-1] = last_step_
            estimator.steps[-1] = base_last_step_
        else:
            self.best_estimator_ = last_step_
            self.estimator = base_last_step_

    @if_delegate_has_method(delegate='best_estimator_')
    def varimp(self, use_pandas=True):
        """Get the variable importance, if the final
        estimator implements such a function.

        Parameters
        ----------

        use_pandas : bool, optional (default=True)
            Whether to return a pandas dataframe
        """
        return self.best_estimator_.varimp(use_pandas=use_pandas)


class H2OGridSearchCV(BaseH2OSearchCV):
    """An exhaustive grid search that will fit models across
    the entire hyperparameter grid provided.

    Parameters
    ----------

    estimator : H2OPipeline or H2OEstimator
        The estimator to fit. Either an :class:``skutil.h2o.H2OPipeline``
        or a ``H2OEstimator``. If the ``estimator`` is a pipeline, it must
        contain an estimator as the final step.

    param_grid : dict
        The hyperparameter grid over which to search. If ``estimator``
        is an :class:``skutil.h2o.H2OPipeline``, the ``param_grid`` should
        be in the form of ``{'stepname__param': [values]}``; if there are
        no named steps (i.e., if ``estimator`` is an ``H2OEstimator``),
        ``param_grid`` should be in the form of ``{'param': [values]}``.
        Note that a ``param_grid`` with named step parameters in the
        absence of named steps will raise an error.

    feature_names : iterable (str)
        The list of feature names on which to fit

    target_feature : str
        The name of the target

    scoring : str, optional (default=None)
        A valid scoring metric, i.e., "accuracy_score". See
        ``skutil.h2o.grid_search.SCORERS`` for a comprehensive list.

    scoring_params : dict, optional (default=None)
        Any kwargs to be passed to the scoring function for
        scoring at each iteration.

    cv : int or H2OCrossValidator, optional (default=5)
        The number of folds to be fit for cross validation.

    verbose : int, optional (default=0)
        The level of verbosity: 1, 2 or greater. A verbosity of 0 will
        produce no output other than the default H2O fit/predict output.
        A verbosity of 1 will print the selected parameters at each fold
        and iteration, and a verbosity of 2 will produce all of the
        aforementioned output plus the intermediate fold scores.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are
        normalized at the end by the number of observations in each
        fold. If True, the data is assumed to be identically distributed
        across the folds, and the loss minimized is the total loss per
        sample, and not the mean loss across the folds.

    validation_frame : H2OFrame, optional (default=None)
        An optional validation frame on which to score at the end of
        all of the model fits. Note that this will NOT be used in the
        actual model selection process.

    minimize : str, optional (default='bias')
        How the search selects the best model to fit on the entire
        dataset. One of {'bias', 'variance'}. The default behavior,
        'bias', is also the default behavior of sklearn: it selects the
        set of hyperparameters which maximizes the cross validation
        score mean. Alternatively, 'variance' selects the model which
        minimizes the standard deviation of the cross validation scores.


    .. versionadded:: 0.1.0
    """

    def __init__(self, estimator, param_grid,
                 feature_names, target_feature,
                 scoring=None, scoring_params=None,
                 cv=5, verbose=0, iid=True,
                 validation_frame=None,
                 minimize='bias'):

        super(H2OGridSearchCV, self).__init__(
            estimator=estimator,
            feature_names=feature_names,
            target_feature=target_feature,
            scoring=scoring,
            scoring_params=scoring_params,
            cv=cv, verbose=verbose,
            iid=iid, validation_frame=validation_frame,
            minimize=minimize
        )

        self.param_grid = param_grid
        _check_param_grid(param_grid)

    def fit(self, frame):
        """Fit the grid search.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to fit.
        """
        return self._fit(frame, ParameterGrid(self.param_grid))
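

# A hedged usage sketch of the exhaustive search (the frame ``X`` and the
# column names below are hypothetical placeholders):
#
#     >>> search = H2OGridSearchCV(  # doctest: +SKIP
#     ...     estimator=H2ORandomForestEstimator(),
#     ...     param_grid={'ntrees': [50, 100], 'max_depth': [5, 10]},
#     ...     feature_names=['a', 'b', 'c'], target_feature='y',
#     ...     scoring='accuracy_score', cv=3)
#     >>> search.fit(X)  # doctest: +SKIP
#     >>> search.best_params_  # doctest: +SKIP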


class H2ORandomizedSearchCV(BaseH2OSearchCV):
    """A grid search that operates over a random sub-hyperparameter
    space at each iteration.

    Parameters
    ----------

    estimator : H2OPipeline or H2OEstimator
        The estimator to fit. Either an :class:``skutil.h2o.H2OPipeline``
        or a ``H2OEstimator``. If the ``estimator`` is a pipeline, it must
        contain an estimator as the final step.

    param_grid : dict
        The hyperparameter grid over which to search. If ``estimator``
        is an :class:``skutil.h2o.H2OPipeline``, the ``param_grid`` should
        be in the form of ``{'stepname__param': [values]}``; if there are
        no named steps (i.e., if ``estimator`` is an ``H2OEstimator``),
        ``param_grid`` should be in the form of ``{'param': [values]}``.
        Note that a ``param_grid`` with named step parameters in the
        absence of named steps will raise an error.

    feature_names : iterable (str)
        The list of feature names on which to fit

    target_feature : str
        The name of the target

    n_iter : int, optional (default=10)
        The number of iterations to fit. Note that
        ``n_iter * cv.get_n_splits`` models will be fit. If there are
        10 folds and 10 iterations, 100 models (plus one) will be fit.

    random_state : int, optional (default=None)
        The random state for the search

    scoring : str, optional (default=None)
        A valid scoring metric, i.e., "accuracy_score". See
        ``skutil.h2o.grid_search.SCORERS`` for a comprehensive list.

    scoring_params : dict, optional (default=None)
        Any kwargs to be passed to the scoring function for
        scoring at each iteration.

    cv : int or H2OCrossValidator, optional (default=5)
        The number of folds to be fit for cross validation. Note that
        ``n_iter * cv.get_n_splits`` models will be fit. If there are
        10 folds and 10 iterations, 100 models (plus one) will be fit.

    verbose : int, optional (default=0)
        The level of verbosity: 1, 2 or greater. A verbosity of 0 will
        produce no output other than the default H2O fit/predict output.
        A verbosity of 1 will print the selected parameters at each fold
        and iteration, and a verbosity of 2 will produce all of the
        aforementioned output plus the intermediate fold scores.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are
        normalized at the end by the number of observations in each
        fold. If True, the data is assumed to be identically distributed
        across the folds, and the loss minimized is the total loss per
        sample, and not the mean loss across the folds.

    validation_frame : H2OFrame, optional (default=None)
        An optional validation frame on which to score at the end of
        all of the model fits. Note that this will NOT be used in the
        actual model selection process.

    minimize : str, optional (default='bias')
        How the search selects the best model to fit on the entire
        dataset. One of {'bias', 'variance'}. The default behavior,
        'bias', is also the default behavior of sklearn: it selects the
        set of hyperparameters which maximizes the cross validation
        score mean. Alternatively, 'variance' selects the model which
        minimizes the standard deviation of the cross validation scores.


    .. versionadded:: 0.1.0
    """

    def __init__(self, estimator, param_grid,
                 feature_names, target_feature,
                 n_iter=10, random_state=None,
                 scoring=None, scoring_params=None,
                 cv=5, verbose=0, iid=True,
                 validation_frame=None,
                 minimize='bias'):

        super(H2ORandomizedSearchCV, self).__init__(
            estimator=estimator,
            feature_names=feature_names,
            target_feature=target_feature,
            scoring=scoring,
            scoring_params=scoring_params,
            cv=cv, verbose=verbose,
            iid=iid, validation_frame=validation_frame,
            minimize=minimize
        )

        self.param_grid = param_grid
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, frame):
        """Fit the grid search.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to fit.
        """
        sampled_params = ParameterSampler(self.param_grid,
                                          self.n_iter,
                                          random_state=self.random_state)

        return self._fit(frame, sampled_params)
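

# A hedged sketch of the randomized search: only ``n_iter`` randomly sampled
# parameter settings are fit per fold (the frame ``X`` and columns are hypothetical):
#
#     >>> search = H2ORandomizedSearchCV(  # doctest: +SKIP
#     ...     estimator=H2ORandomForestEstimator(),
#     ...     param_grid={'ntrees': [50, 100, 150], 'max_depth': [3, 5, 7, 9]},
#     ...     feature_names=['a', 'b', 'c'], target_feature='y',
#     ...     n_iter=5, random_state=42, cv=3)
#     >>> search.fit(X).best_score_  # doctest: +SKIP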


def _val_exp_loss_prem(x, y, z):
    """Takes three strings (or unicode) and cleans them
    for indexing an H2OFrame.

    Parameters
    ----------

    x : str
        The name of the exposure feature

    y : str
        The name of the loss feature

    z : str or None
        The name of the premium feature

    Returns
    -------

    out : tuple
        exp : str
            The name of the exposure feature (``x``)

        loss : str
            The name of the loss feature (``y``)

        prem : str or None
            The name of the premium feature (``z``)
    """
    if not all([isinstance(i, six.string_types) for i in (x, y)]):
        raise TypeError('exposure and loss must be strings or unicode')

    if z is not None:
        if not isinstance(z, six.string_types):
            raise TypeError('premium must be None or string or unicode')

    out = (str(x), str(y), str(z) if z is not None else z)
    return out


class H2OGainsRandomizedSearchCV(H2ORandomizedSearchCV):
    """A grid search that scores based on actuarial metrics
    (see ``skutil.metrics.GainsStatisticalReport``). This is a more
    customized form of grid search, and must use a gains metric
    provided by the ``GainsStatisticalReport``.

    Parameters
    ----------

    estimator : H2OPipeline or H2OEstimator
        The estimator to fit. Either an :class:``skutil.h2o.H2OPipeline``
        or a ``H2OEstimator``. If the ``estimator`` is a pipeline, it must
        contain an estimator as the final step.

    param_grid : dict
        The hyperparameter grid over which to search. If ``estimator``
        is an :class:``skutil.h2o.H2OPipeline``, the ``param_grid`` should
        be in the form of ``{'stepname__param': [values]}``; if there are
        no named steps (i.e., if ``estimator`` is an ``H2OEstimator``),
        ``param_grid`` should be in the form of ``{'param': [values]}``.
        Note that a ``param_grid`` with named step parameters in the
        absence of named steps will raise an error.

    feature_names : iterable (str)
        The list of feature names on which to fit

    target_feature : str
        The name of the target

    exposure_feature : str
        The name of the exposure feature

    loss_feature : str
        The name of the loss feature

    premium_feature : str, optional (default=None)
        The name of the premium feature

    n_iter : int, optional (default=10)
        The number of iterations to fit. Note that
        ``n_iter * cv.get_n_splits`` models will be fit. If there are
        10 folds and 10 iterations, 100 models (plus one) will be fit.

    random_state : int, optional (default=None)
        The random state for the search

    scoring : str, optional (default='lift')
        One of {'lift', 'gini'} or other valid GainsStatisticalReport
        scoring metrics.

    scoring_params : dict, optional (default=None)
        Any kwargs to be passed to the scoring function for
        scoring at each iteration.

    cv : int or H2OCrossValidator, optional (default=5)
        The number of folds to be fit for cross validation. Note that
        ``n_iter * cv.get_n_splits`` models will be fit. If there are
        10 folds and 10 iterations, 100 models (plus one) will be fit.

    verbose : int, optional (default=0)
        The level of verbosity: 1, 2 or greater. A verbosity of 0 will
        produce no output other than the default H2O fit/predict output.
        A verbosity of 1 will print the selected parameters at each fold
        and iteration, and a verbosity of 2 will produce all of the
        aforementioned output plus the intermediate fold scores.

    iid : bool, optional (default=True)
        Whether to consider each fold as IID. The fold scores are
        normalized at the end by the number of observations in each
        fold. If True, the data is assumed to be identically distributed
        across the folds, and the loss minimized is the total loss per
        sample, and not the mean loss across the folds.

    validation_frame : H2OFrame, optional (default=None)
        An optional validation frame on which to score at the end of
        all of the model fits. Note that this will NOT be used in the
        actual model selection process.

    minimize : str, optional (default='bias')
        How the search selects the best model to fit on the entire
        dataset. One of {'bias', 'variance'}. The default behavior,
        'bias', is also the default behavior of sklearn: it selects the
        set of hyperparameters which maximizes the cross validation
        score mean. Alternatively, 'variance' selects the model which
        minimizes the standard deviation of the cross validation scores.

    error_score : float, optional (default=np.nan)
        The default score to use in the case of a pd.qcut ValueError
        (when there are non-unique bin edges)

    error_behavior : str, optional (default='warn')
        How to handle the pd.qcut ValueError.
        One of {'warn', 'raise', 'ignore'}


    .. versionadded:: 0.1.0
    """

    def __init__(self, estimator, param_grid,
                 feature_names, target_feature,
                 exposure_feature, loss_feature,
                 premium_feature=None, n_iter=10,
                 random_state=None, scoring='lift',
                 scoring_params=None, cv=5,
                 verbose=0, iid=True,  # n_groups=10,
                 validation_frame=None,
                 minimize='bias', error_score=np.nan,
                 error_behavior='warn'):

        super(H2OGainsRandomizedSearchCV, self).__init__(
            estimator=estimator,
            param_grid=param_grid,
            feature_names=feature_names,
            target_feature=target_feature,
            n_iter=n_iter, random_state=random_state,
            scoring=scoring, scoring_params=scoring_params,
            cv=cv, verbose=verbose,
            iid=iid, validation_frame=validation_frame,
            minimize=minimize
        )

        # self.n_groups = 10
        self.exposure_feature = exposure_feature
        self.loss_feature = loss_feature
        self.premium_feature = premium_feature

        # for re-fitting, we need these kwargs saved
        self.grsttngs_ = {
            'score_by': scoring,
            'n_folds': check_cv(cv).get_n_splits(),
            'n_iter': n_iter,
            'iid': iid,
            'error_score': error_score,
            'error_behavior': error_behavior
        }

        # the scoring_class_ (set in ``fit``) will do the scoring
        self.scoring = None

    def fit(self, frame):
        """Fit the grid search.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The training frame on which to fit.
        """
        sampled_params = ParameterSampler(self.param_grid,
                                          self.n_iter,
                                          random_state=self.random_state)

        # set our score class
        self.scoring_class_ = GainsStatisticalReport(**self.grsttngs_)

        # we can do this once to avoid many as_data_frame operations
        exp, loss, prem = _val_exp_loss_prem(self.exposure_feature,
                                             self.loss_feature,
                                             self.premium_feature)
        self.extra_args_ = {
            'expo': _as_numpy(frame[exp]),
            'loss': _as_numpy(frame[loss]),
            'prem': _as_numpy(frame[prem]) if prem is not None else None
        }

        # for validation set
        self.extra_names_ = {
            'expo': exp,
            'loss': loss,
            'prem': prem
        }

        # do fit
        the_fit = self._fit(frame, sampled_params)

        # clear extra_args_, because they might take lots of mem.
        # we can do this because a re-fit will re-assign them anyways.
        # don't delete the extra_names_ though, because they're used in
        # scoring the incoming frame.
        del self.extra_args_

        return the_fit

    def report_scores(self):
        """Create a dataframe report for the fitting and scoring of
        the gains search. Will report lift, gini and any other
        relevant metrics. If a validation set was included, will
        also report validation scores.

        Returns
        -------

        rdf : pd.DataFrame, shape=(n_iter, n_params)
            The grid search report
        """
        check_is_fitted(self, 'best_estimator_')
        report_res = self.scoring_class_.as_data_frame()
        n_obs, _ = report_res.shape

        # Need to cbind the parameters... we don't care about ["score", "std"]
        rdf, drops = report_grid_score_detail(self, charts=False, return_drops=True)
        rdf = rdf.drop(drops, axis=1)  # drop returns a copy, so reassign
        assert n_obs == rdf.shape[0], 'Internal error: %d!=%d' % (n_obs, rdf.shape[0])

        # change the names in the dataframe...
        report_res.columns = ['train_%s' % x for x in report_res.columns.values]

        # cbind...
        rdf = pd.concat([rdf, report_res], axis=1)

        # if we scored on the validation set, also need to get the val score struct
        if hasattr(self, 'val_score_report_'):
            val_res_df = self.val_score_report_.as_data_frame()
            assert n_obs == val_res_df.shape[0], 'Internal error: %d!=%d' % (n_obs, val_res_df.shape[0])
            val_res_df.columns = ['val_%s' % x for x in val_res_df.columns.values]

            # cbind
            rdf = pd.concat([rdf, val_res_df], axis=1)

        rdf.index = ['Iter_%i' % i for i in range(self.n_iter)]
        return rdf

    @overrides(BaseH2OSearchCV)
    def score(self, frame):
        """Predict and score on a new frame. Note that this method
        will not store performance metrics in the report that
        ``report_scores`` generates.

        Parameters
        ----------

        frame : H2OFrame, shape=(n_samples, n_features)
            The test frame on which to predict and score performance.

        Returns
        -------

        scor : float
            The score on the testing frame
        """
        check_is_fitted(self, 'best_estimator_')
        e, l, p = self.extra_names_['expo'], self.extra_names_['loss'], self.extra_names_['prem']

        kwargs = {
            'expo': frame[e],
            'loss': frame[l],
            'prem': frame[p] if p is not None else None
        }

        y_truth = frame[self.target_feature]
        pred = self.best_estimator_.predict(frame)['predict']
        scor = self.scoring_class_.score_no_store(y_truth, pred, **kwargs)

        return scor
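

# A hedged sketch of the gains search with an actuarial scoring metric. The
# frame ``X`` and the exposure/loss/premium column names are hypothetical:
#
#     >>> search = H2OGainsRandomizedSearchCV(  # doctest: +SKIP
#     ...     estimator=H2OGradientBoostingEstimator(),
#     ...     param_grid={'ntrees': [50, 100], 'learn_rate': [0.05, 0.1]},
#     ...     feature_names=['a', 'b', 'c'], target_feature='y',
#     ...     exposure_feature='expo', loss_feature='loss',
#     ...     premium_feature='prem', scoring='lift', n_iter=5, cv=3)
#     >>> search.fit(X)  # doctest: +SKIP
#     >>> search.report_scores()  # doctest: +SKIP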