Source code for skutil.utils.util

# -*- coding: utf-8 -*-

from __future__ import print_function, division, absolute_import
import warnings
import sys
import traceback
import numpy as np
import pandas as pd
import numbers
import scipy.stats as st
from sklearn.datasets import load_iris, load_breast_cancer, load_boston
from sklearn.externals import six
from sklearn.metrics import confusion_matrix as cm
from ..base import suppress_warnings
from .fixes import (_grid_detail, _is_integer, is_iterable, 
                    _cols_if_none, dict_keys, dict_values)

try:
    # this causes a UserWarning to be thrown by matplotlib... should we squelch this?
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        import matplotlib
        matplotlib.use('Agg')  # set backend
        from matplotlib import pyplot as plt

        # log it
        CAN_CHART_MPL = True
except ImportError:
    CAN_CHART_MPL = False

try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        import seaborn as sns

        CAN_CHART_SNS = True
except ImportError:
    CAN_CHART_SNS = False

__max_exp__ = 1e19
__min_log__ = -19
__all__ = [
    'corr_plot',
    'df_memory_estimate',
    'exp',
    'flatten_all',
    'flatten_all_generator',
    'get_numeric',
    'human_bytes',
    'is_entirely_numeric',
    'is_float',
    'is_integer',
    'is_numeric',
    'load_boston_df',
    'load_breast_cancer_df',
    'load_iris_df',
    'log',
    'pd_stats',
    'report_confusion_matrix',
    'report_grid_score_detail',
    'shuffle_dataframe',
    'validate_is_pd'
]


@suppress_warnings
def _log_single(x):
    """Sanitized log function for a single element.
    Since this method internally calls np.log and carries
    the (very likely) possibility to overflow, the method
    suppresses all warnings.

    #XXX: at some point we might want to let ``suppress_warnings``
    # specify exactly which types of warnings it should filter.

    Parameters
    ----------

    x : float, int
        The number to log

    Returns
    -------

    val : float
        the log of x
    """
    x = np.maximum(0, x)
    val = __min_log__ if x == 0 else np.maximum(__min_log__, np.log(x))
    return val


@suppress_warnings
def _exp_single(x):
    """Sanitized exponential function.
    Since this method internally calls np.exp and carries
    the (very likely) possibility to overflow, the method
    suppresses all warnings.

    #XXX: at some point we might want to let ``suppress_warnings``
    # specify exactly which types of warnings it should filter.

    Parameters
    ----------

    x : float, int
        The number to exp


    Returns
    -------

    val : float
        the exp of x
    """
    val = np.minimum(__max_exp__, np.exp(x))
    return val


def _vectorize(fun, x):
    if is_iterable(x):
        return np.array([fun(p) for p in x])
    raise ValueError('Type %s is not iterable' % type(x))


def exp(x):
    """A safe mechanism for computing the exponential function
    while avoiding overflows.

    Parameters
    ----------

    x : float, number
        The number for which to compute the exp

    Returns
    -------

    exp(x)
    """
    # check on single exp
    if is_numeric(x):
        return _exp_single(x)

    # try vectorized
    try:
        return _vectorize(exp, x)
    except ValueError:
        # bail
        raise ValueError("don't know how to compute exp for type %s" % type(x))


def log(x):
    """A safe mechanism for computing a log while
    avoiding NaNs or exceptions.

    Parameters
    ----------

    x : float, number
        The number for which to compute the log

    Returns
    -------

    log(x)
    """
    # check on single log
    if is_numeric(x):
        return _log_single(x)

    # try vectorized
    try:
        return _vectorize(log, x)
    except ValueError:
        # bail
        raise ValueError("don't know how to compute log for type %s" % type(x))


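# A usage sketch of the clipped math helpers above: ``log`` floors at
# ``__min_log__`` rather than returning -inf or NaN, ``exp`` caps at
# ``__max_exp__`` rather than overflowing, and both recurse over iterables:
#
#     >>> log(0)
#     -19
#     >>> exp(1000)  # np.exp would overflow to inf; clipped instead
#     1e+19
#     >>> log([1., np.e])
#     array([ 0.,  1.])

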
def _val_cols(cols):
    # if it's None, return immediately
    if cols is None:
        return cols

    # try to make cols a list
    if not is_iterable(cols):
        if isinstance(cols, six.string_types):
            return [cols]
        else:
            raise ValueError('cols must be an iterable sequence')

    # if it is an index or a np.ndarray, it will have a built-in
    # (potentially more efficient) tolist() method
    if hasattr(cols, 'tolist') and hasattr(cols.tolist, '__call__'):
        return cols.tolist()

    # make it a list implicitly, make no guarantees about elements
    return [c for c in cols]


def _def_headers(X):
    m = X.shape[1] if hasattr(X, 'shape') else len(X)
    return ['V%i' % (i + 1) for i in range(m)]


def corr_plot(X, plot_type='cor', cmap='Blues_d', n_levels=5,
              corr=None, method='pearson', figsize=(11, 9),
              cmap_a=220, cmap_b=10, vmax=0.3, xticklabels=5,
              yticklabels=5, linewidths=0.5, cbar_kws=None):
    """Create a simple correlation plot given a dataframe.
    Note that this requires all datatypes to be numeric and finite!

    Parameters
    ----------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The pandas DataFrame on which to compute correlations,
        or if ``corr`` is 'precomputed', the correlation matrix.
        In the case that ``X`` is a correlation matrix, it must
        be square, i.e., shape=(n_features, n_features).

    plot_type : str, optional (default='cor')
        The type of plot, one of ('cor', 'kde', 'pair')

    cmap : str, optional (default='Blues_d')
        The color to use for the kernel density estimate plot
        if ``plot_type`` == 'kde'. Otherwise unused.

    n_levels : int, optional (default=5)
        The number of levels to use for the kde plot
        if ``plot_type`` == 'kde'. Otherwise unused.

    corr : 'precomputed' or None, optional (default=None)
        If None, the correlation matrix is computed, otherwise
        if 'precomputed', ``X`` is treated as a correlation matrix.

    method : str, optional (default='pearson')
        The method to use for correlation

    figsize : tuple (int), shape=(w,h), optional (default=(11,9))
        The size of the image

    cmap_a : int, optional (default=220)
        The colormap start point

    cmap_b : int, optional (default=10)
        The colormap end point

    vmax : float, optional (default=0.3)
        Arg for seaborn heatmap

    xticklabels : int, optional (default=5)
        The spacing for X ticks

    yticklabels : int, optional (default=5)
        The spacing for Y ticks

    linewidths : float, optional (default=0.5)
        The width of the lines

    cbar_kws : dict, optional (default=None)
        Any KWs to pass to seaborn's heatmap when ``plot_type`` = 'cor'.
        If None, will default to {'shrink': 0.5}
    """
    X, _ = validate_is_pd(X, None, assert_all_finite=True)
    valid_types = ('cor', 'kde', 'pair')
    if plot_type not in valid_types:
        raise ValueError('expected one of (%s), but got %s'
                         % (','.join(valid_types), plot_type))

    if cbar_kws is None:
        cbar_kws = {'shrink': 0.5}

    # seaborn is needed for all of these, so we have to check outside
    if not CAN_CHART_SNS:
        warnings.warn('Cannot plot (unable to import Seaborn)', ImportWarning)
        return None

    if plot_type == 'cor':
        # MPL is only needed for COR
        if not CAN_CHART_MPL:
            warnings.warn('Cannot plot (unable to import Matplotlib)')
            return None

        if not corr == 'precomputed':
            cols = X.columns.values
            X = X.corr(method=method)
            X.columns = cols
            X.index = cols

        # mask for upper triangle
        mask = np.zeros_like(X, dtype=np.bool)
        mask[np.triu_indices_from(mask)] = True

        # set up mpl figure
        f, ax = plt.subplots(figsize=figsize)
        color_map = sns.diverging_palette(cmap_a, cmap_b, as_cmap=True)
        sns.heatmap(X, mask=mask, cmap=color_map, vmax=vmax,
                    square=True, xticklabels=xticklabels,
                    yticklabels=yticklabels, linewidths=linewidths,
                    cbar_kws=cbar_kws, ax=ax)

    elif plot_type == 'pair':
        sns.pairplot(X)
        sns.plt.show()

    else:
        g = sns.PairGrid(X)
        g.map_diag(sns.kdeplot)
        g.map_offdiag(sns.kdeplot, cmap=cmap, n_levels=n_levels)
        sns.plt.show()


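# A usage sketch for ``corr_plot`` (it renders via seaborn/matplotlib when
# those imports succeeded above; otherwise it warns and returns None):
#
#     >>> X = load_iris_df(include_tgt=False)
#     >>> corr_plot(X, plot_type='cor', method='spearman')  # computes corr itself
#     >>> corr_plot(X.corr(), corr='precomputed')           # square matrix passed in

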
def flatten_all(container):
    """Recursively flattens an arbitrarily nested iterable.
    WARNING: this function may produce a list of mixed types.

    Parameters
    ----------

    container : array_like, shape=(n_items,)
        The iterable to flatten. If the ``container`` is
        not iterable, it will be returned in a list as
        ``[container]``

    Examples
    --------

    The example below produces a list of mixed results:

        >>> a = [[[], 3, 4], ['1', 'a'], [[[1]]], 1, 2]
        >>> flatten_all(a)
        [3, 4, '1', 'a', 1, 1, 2]

    Returns
    -------

    l : list, shape=(n_items,)
        The flattened list
    """
    l = [x for x in flatten_all_generator(container)]
    return l


def flatten_all_generator(container):
    """Recursively flattens an arbitrarily nested iterable.
    WARNING: this function may produce a list of mixed types.

    Parameters
    ----------

    container : array_like, shape=(n_items,)
        The iterable to flatten. If the ``container`` is
        not iterable, it will be returned in a list as
        ``[container]``

    Examples
    --------

    The example below produces a list of mixed results:

        >>> a = [[[], 3, 4], ['1', 'a'], [[[1]]], 1, 2]
        >>> flatten_all(a)  # yields a generator for this iterable
        [3, 4, '1', 'a', 1, 1, 2]

    Returns
    -------

    generator object
    """
    if not is_iterable(container):
        yield container
    else:
        for i in container:
            if is_iterable(i):
                for j in flatten_all_generator(i):
                    yield j
            else:
                yield i


def shuffle_dataframe(X):
    """Shuffle the rows in a data frame without replacement. The
    random state used for shuffling is controlled by numpy's
    random state.

    Parameters
    ----------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The dataframe to shuffle
    """
    X, _ = validate_is_pd(X, None, False)
    return X.iloc[np.random.permutation(np.arange(X.shape[0]))]


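# A usage sketch: shuffling draws from numpy's global random state, so it
# can be made reproducible by seeding first:
#
#     >>> np.random.seed(42)
#     >>> shuffled = shuffle_dataframe(load_iris_df())

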
def validate_is_pd(X, cols, assert_all_finite=False):
    """Used within each SelectiveMixin fit method to determine
    whether the passed ``X`` is a dataframe, and whether the cols
    is appropriate. There are four scenarios (in the order in which
    they're checked):

    1) Names is not None, but X is not a dataframe.
        Resolution: the method will attempt to return a DataFrame from
        the args provided (with default names), but catches any exception
        and raises a ValueError. A common case where this would work may
        be a numpy.ndarray as X, and a list as cols (where the list is
        either int indices or default names that the dataframe will take on).

    2) X is a DataFrame, but cols is None.
        Resolution: return a copy of the dataframe, and use all column names.

    3) X is a DataFrame and cols is not None.
        Return a copy of the dataframe, and use only the names provided.
        This is the typical use case.

    4) X is not a DataFrame, and cols is None.
        Resolution: this case will only work if the X can be built into
        a DataFrame. Otherwise, there will be a ValueError thrown.

    Parameters
    ----------

    X : array_like, shape=(n_samples, n_features)
        The dataframe to validate. If ``X`` is not a DataFrame,
        but it can be made into one, no exceptions will be raised.
        However, if ``X`` cannot naturally be made into a DataFrame,
        a TypeError will be raised.

    cols : array_like (str), shape=(n_features,)
        The list of column names. Used particularly in SelectiveMixin
        transformers that validate column names.

    assert_all_finite : bool, optional (default=False)
        If True, will raise a ValueError if any np.nan or np.inf
        values reside in ``X``.

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        A copy of the original input ``X``

    cols : list or None, shape=(n_features,)
        If ``cols`` was not None and did not raise a TypeError, it
        is converted into a list of strings and returned as a copy.
        Else None.
    """

    def _check(X, cols):
        # first check hard-to-detect case:
        if isinstance(X, pd.Series):
            raise ValueError('expected DataFrame but got Series')

        # validate the cols arg
        cols = _val_cols(cols)

        # if someone devious gave us an empty set of cols
        if cols is not None and len(cols) == 0:
            cols = None

        # avoid multiple isinstance checks
        is_df = isinstance(X, pd.DataFrame)

        # we do want to make sure the X at least is "array-like"
        if not is_iterable(X):
            raise TypeError('X (type=%s) cannot be cast to DataFrame' % type(X))

        # case 1, we have names but the X is not a frame
        if not is_df and cols is not None:
            # this is tough, because they only pass cols if it's a subset
            # and this frame is likely too large for the passed columns.
            # so, we hope they either passed what the col names WILL be
            # or that they passed numeric cols... they should handle that
            # validation on their end, though. If this fails, we'll just let
            # it fall through.
            return pd.DataFrame.from_records(data=X, columns=_def_headers(X)), cols

        # case 2, we have a DF but no cols, def behavior: use all
        elif is_df and cols is None:
            return X.copy(), None

        # case 3, we have a DF AND cols
        elif is_df and cols is not None:
            return X.copy(), cols

        # case 4, we have neither a frame nor cols (maybe JUST a np.array?)
        else:
            # we'll do two tests here... either that it's a np ndarray
            # or a list of lists
            if isinstance(X, np.ndarray) or \
                    (is_iterable(X) and all(isinstance(elem, list) for elem in X)):
                return pd.DataFrame.from_records(data=X, columns=_def_headers(X)), None

            # bail out:
            raise ValueError('cannot handle data of type %s' % type(X))

    # do initial check
    X, cols = _check(X, cols)

    # we need to ensure all are finite
    if assert_all_finite:
        # if cols, we only need to ensure the specified columns are finite
        cols_tmp = _cols_if_none(X, cols)

        # subset the subset... only numerics, and apply
        # only to the non-object columns
        X_prime = X[get_numeric(X[cols_tmp])]
        if X_prime.apply(lambda x: (~np.isfinite(x)).sum()).sum() > 0:
            raise ValueError('Expected all entries to be finite')

    return X, cols


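# A sketch of the four ``validate_is_pd`` scenarios described above:
#
#     >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#     >>> X, cols = validate_is_pd(df, None)       # case 2: copy of df, cols=None
#     >>> X, cols = validate_is_pd(df, ['a'])      # case 3: copy of df, cols=['a']
#     >>> X, cols = validate_is_pd(np.array([[1, 2], [3, 4]]), None)  # case 4: cols V1, V2
#     >>> validate_is_pd(5, None)  # not array-like -> raises TypeError

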
def df_memory_estimate(X, unit='MB', index=False):
    """Estimate the memory footprint of a dataframe to determine
    whether it's capable of being held in memory or not.

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The DataFrame in question

    unit : str, optional (default='MB')
        The units to report. One of ('MB', 'KB', 'GB', 'TB')

    index : bool, optional (default=False)
        Whether to also estimate the memory footprint of the index.

    Returns
    -------

    mb : str
        The estimated number of UNIT held in the frame
    """
    X, _ = validate_is_pd(X, None, False)
    return human_bytes(X.memory_usage(index=index).sum(), unit)


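# A usage sketch; 1000 rows of 8 float64 columns is 64,000 bytes:
#
#     >>> X = pd.DataFrame(np.zeros((1000, 8)))
#     >>> df_memory_estimate(X, unit='KB')
#     '62.500 KB'

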
def _is_int(x, tp):
    """Determine whether a column can be
    cast to int without loss of data
    """
    if not any([tp.startswith(i) for i in ('float', 'int')]):
        return False

    # if there's no difference between the two, then it's an int.
    try:
        return (x - x.astype('int')).abs().sum() == 0
    except ValueError:  # happens when there are NaNs
        return False


def pd_stats(X, col_type='all', na_str='--', hi_skew_thresh=1.0, mod_skew_thresh=0.5):
    """Get a descriptive report of the elements in the data frame.
    Builds on the existing pandas ``describe`` method by adding counts of
    factor-level features, a skewness rating and several other helpful
    statistics.

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The DataFrame on which to compute stats.

    col_type : str, optional (default='all')
        The types of columns to analyze. One of ('all',
        'numeric', 'object'). If not all, will only return
        corresponding typed columns.

    na_str : str, optional (default='--')
        The string to display in a cell that is not applicable
        for the column's datatype.

    hi_skew_thresh : float, optional (default=1.0)
        The threshold above which a skewness rating will
        be deemed "high."

    mod_skew_thresh : float, optional (default=0.5)
        The threshold above which a skewness rating will be
        deemed "moderate," so long as it does not exceed
        ``hi_skew_thresh``

    Returns
    -------

    s : Pandas ``DataFrame``
        The resulting stats dataframe
    """
    X, _ = validate_is_pd(X, None, False)
    raw_stats = X.describe()
    stats = raw_stats.to_dict()
    dtypes = X.dtypes

    # validate col_type
    valid_types = ('all', 'numeric', 'object')
    if col_type not in valid_types:
        raise ValueError('expected col_type in (%s), but got %s'
                         % (','.join(valid_types), col_type))

    # if user only wants part back, we can use this...
    type_dict = {}

    # the string to use when we don't want to populate a cell
    _nastr = na_str

    # objects are going to get dropped in the describe() call,
    # so we need to add them back in with dicts of nastr for all...
    object_dtypes = dtypes[dtypes == 'object']
    if object_dtypes.shape[0] > 0:
        obj_nms = object_dtypes.index.values
        for nm in obj_nms:
            obj_dct = {stat: _nastr for stat in raw_stats.index.values}
            stats[nm] = obj_dct

    # we'll add rows to the stats...
    for col, dct in six.iteritems(stats):
        # add the dtype
        _dtype = str(dtypes[col])
        is_numer = any([_dtype.startswith(x) for x in ('int', 'float')])
        dct['dtype'] = _dtype

        # populate type_dict
        type_dict[col] = 'numeric' if is_numer else 'object'

        # if the dtype is not a float, we can
        # get the count of uniques, then do a
        # ratio of majority : minority
        _isint = _is_int(X[col], _dtype)
        if _isint or _dtype == 'object':
            _unique = len(X[col].unique())
            _val_cts = X[col].value_counts().sort_values(ascending=True)
            _min_cls, _max_cls = _val_cts.index[0], _val_cts.index[-1]

            # if there's only one class...
            if _min_cls == _max_cls:
                _min_cls = _nastr
                _min_max_ratio = _nastr
            else:
                _min_max_ratio = _val_cts.values[0] / _val_cts.values[-1]

            # chance we didn't recognize it as an int before...
            if 'float' in dct['dtype']:
                dct['dtype'] = dct['dtype'].replace('float', 'int')
        else:
            _unique = _min_max_ratio = _nastr

        # populate the unique count and more
        dct['unique_ct'] = _unique
        dct['min_max_class_ratio'] = _min_max_ratio

        # get the skewness...
        if is_numer:
            _skew, _kurt = X[col].skew(), X[col].kurtosis()
            abs_skew = abs(_skew)
            hs, ms = hi_skew_thresh, mod_skew_thresh
            _skew_risk = 'high skew' if abs_skew > hs \
                else 'mod. skew' if (ms < abs_skew < hs) \
                else 'symmetric'
        else:
            _skew = _kurt = _skew_risk = _nastr

        dct['skewness'] = _skew
        dct['skewness rating'] = _skew_risk
        dct['kurtosis'] = _kurt

    # go through and pop the keys that might be filtered on
    if col_type != 'all':
        stat_out = {}
        for col, dtype in six.iteritems(type_dict):
            if col_type == dtype:
                stat_out[col] = stats[col]
    else:
        stat_out = stats

    s = pd.DataFrame.from_dict(stat_out)
    return s


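# A usage sketch for ``pd_stats``, which augments ``describe()`` output with
# dtypes, unique counts, class-balance ratios and a skewness rating per column:
#
#     >>> X = load_iris_df()
#     >>> stats = pd_stats(X)  # all columns
#     >>> numerics = pd_stats(X, col_type='numeric', hi_skew_thresh=0.75)

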
def get_numeric(X):
    """Return the list of column labels of numeric dtype variables

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The dataframe

    Returns
    -------

    list
        The list of column labels which are numeric.
    """
    # don't want to assert finite, or we might get endless recursion
    validate_is_pd(X, cols=None, assert_all_finite=False)
    return X.dtypes[X.dtypes.apply(
        lambda x: str(x).startswith(("float", "int")))].index.tolist()


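# A behavior sketch: ``get_numeric`` returns column labels, not positions:
#
#     >>> X = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y'], 'c': [0.5, 1.5]})
#     >>> get_numeric(X)
#     ['a', 'c']

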
def human_bytes(b, unit='MB'):
    """Get bytes in a human readable form

    Parameters
    ----------

    b : int
        The number of bytes

    unit : str, optional (default='MB')
        The units to report. One of ('MB', 'KB', 'GB', 'TB')

    Returns
    -------

    str
        The number of bytes expressed in ``unit``
    """
    kb = float(1024)
    units = {
        'KB': kb,
        'MB': float(kb ** 2),
        'GB': float(kb ** 3),
        'TB': float(kb ** 4)
    }

    if unit not in units:
        raise ValueError('got %s, expected one of (%s)'
                         % (unit, ', '.join(dict_keys(units))))

    return '%.3f %s' % (b / units[unit], unit)


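# A usage sketch for ``human_bytes``:
#
#     >>> human_bytes(1024 ** 2, 'MB')
#     '1.000 MB'
#     >>> human_bytes(5e9, 'GB')
#     '4.657 GB'

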
def is_entirely_numeric(X):
    """Determines whether an entire pandas frame
    is numeric in dtypes.

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The dataframe to test

    Returns
    -------

    bool
        True if the entire pd.DataFrame is numeric else False
    """
    return X.shape[1] == len(get_numeric(X))


def is_integer(x):
    """Determine whether some object ``x`` is an
    integer type (int, long, etc).

    Parameters
    ----------

    x : object
        The item to assess

    Returns
    -------

    bool
        True if ``x`` is an integer type
    """
    try:
        python_major_version = sys.version_info.major
        assert (python_major_version == 2 or python_major_version == 3)

        if python_major_version == 2:
            return (not isinstance(x, (bool, np.bool))) and \
                isinstance(x, (numbers.Integral, int, long, np.int, np.long))
        elif python_major_version == 3:
            return (not isinstance(x, (bool, np.bool))) and \
                isinstance(x, (numbers.Integral, int, np.int, np.long))
    except AssertionError:
        _, _, tb = sys.exc_info()
        traceback.print_tb(tb)  # Fixed format
        tb_info = traceback.extract_tb(tb)
        filename, line, func, text = tb_info[-1]
        print('An error occurred on line {} in statement {}'.format(line, text))
        sys.exit(1)

    return _is_integer(x)


def is_float(x):
    """Determine whether some object ``x`` is a
    float type (float, np.float, etc).

    Parameters
    ----------

    x : object
        The item to assess

    Returns
    -------

    bool
        True if ``x`` is a float type
    """
    return isinstance(x, (float, np.float)) or \
        (not isinstance(x, (bool, np.bool)) and isinstance(x, numbers.Real))


def is_numeric(x):
    """Determine whether some object ``x`` is a
    numeric type (float, int, etc).

    Parameters
    ----------

    x : object
        The item to assess

    Returns
    -------

    bool
        True if ``x`` is a float or integer type
    """
    return is_float(x) or is_integer(x)


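# A behavior sketch for the three type predicates above. Booleans are
# deliberately rejected, and numpy scalar types are covered because they
# register with the ``numbers`` ABCs:
#
#     >>> is_integer(3), is_integer(np.int64(3)), is_integer(True)
#     (True, True, False)
#     >>> is_float(3.14), is_numeric(3), is_numeric('3')
#     (True, True, False)

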
def load_iris_df(include_tgt=True, tgt_name="Species", shuffle=False):
    """Loads the iris dataset into a dataframe with the
    target set as the "Species" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="Species")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows on return

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    iris = load_iris()
    X = pd.DataFrame.from_records(data=iris.data, columns=iris.feature_names)

    if include_tgt:
        X[tgt_name] = iris.target

    return X if not shuffle else shuffle_dataframe(X)


def load_breast_cancer_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the breast cancer dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    bc = load_breast_cancer()
    X = pd.DataFrame.from_records(data=bc.data, columns=bc.feature_names)

    if include_tgt:
        X[tgt_name] = bc.target

    return X if not shuffle else shuffle_dataframe(X)


def load_boston_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the boston housing dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    bo = load_boston()
    X = pd.DataFrame.from_records(data=bo.data, columns=bo.feature_names)

    if include_tgt:
        X[tgt_name] = bo.target

    return X if not shuffle else shuffle_dataframe(X)


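# A usage sketch for the three sklearn-dataset loaders above:
#
#     >>> iris = load_iris_df()                     # appends a 'Species' target
#     >>> bc = load_breast_cancer_df(tgt_name='y')  # custom target name
#     >>> bo = load_boston_df(include_tgt=False, shuffle=True)

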
def report_grid_score_detail(random_search, charts=True, sort_results=True,
                             ascending=True, percentile=0.975,
                             y_axis='mean_test_score',
                             sort_by='mean_test_score',
                             highlight_best=True, highlight_col='red',
                             def_color='blue', return_drops=False):
    """Return plots and dataframe of results, given a fitted grid search.
    Note that if Matplotlib is not installed, a warning will be thrown
    and no plots will be generated.

    Parameters
    ----------

    random_search : ``BaseSearchCV`` or ``BaseH2OSearchCV``
        The fitted grid search

    charts : bool, optional (default=True)
        Whether to plot the charts

    sort_results : bool, optional (default=True)
        Whether to sort the results based on score

    ascending : bool, optional (default=True)
        If ``sort_results`` is True, whether to use asc or desc
        in the sorting process.

    percentile : float, optional (default=0.975)
        The percentile point (0 < percentile < 1.0).
        The corresponding z-score will be multiplied by the
        cross validation score standard deviations.

    y_axis : str, optional (default='mean_test_score')
        The y-axis of the charts. One of
        ('mean_test_score', 'std_test_score')

    sort_by : str, optional (default='mean_test_score')
        The column to sort by. This is not validated, in case
        the user wants to sort by a parameter column. If
        not ``sort_results``, this is unused.

    highlight_best : bool, optional (default=True)
        If set to True, charts is True, and sort_results is
        also True, then highlights the point in the top
        position of the model DF.

    highlight_col : str, optional (default='red')
        What color to use for ``highlight_best`` if both ``charts``
        and ``highlight_best``. If either is False, this is unused.

    def_color : str, optional (default='blue')
        What color to use for the points if ``charts`` is True.
        This should differ from ``highlight_col``, but no validation
        is performed.

    return_drops : bool, optional (default=False)
        If True, will return the list of names that can be dropped
        out (i.e., were generated by sklearn and are not parameters
        of interest).

    Returns
    -------

    result_df : pd.DataFrame
        The grid search results

    drops : list
        List of sklearn-generated names. Only returned if
        ``return_drops`` is True.
    """
    valid_axes = ('mean_test_score', 'std_test_score')

    # these are produced in sklearn 0.18 but not 0.17 --
    # want to skip for now...
    ignore_axes = ('mean_fit_time', 'mean_score_time', 'mean_train_score',
                   'std_fit_time', 'std_score_time', 'std_train_score')

    # validate y-axis
    if y_axis not in valid_axes:
        raise ValueError('y-axis=%s must be one of (%s)'
                         % (y_axis, ', '.join(valid_axes)))

    # validate percentile
    if not (0 < percentile < 1):
        raise ValueError('percentile must be > 0 and < 1, but got %.5f' % percentile)
    z_score = st.norm.ppf(percentile)

    # make into a data frame from search
    result_df, drops = _grid_detail(random_search, z_score=z_score,
                                    sort_results=sort_results,
                                    sort_by=sort_by, ascending=ascending)

    # if the import failed, we won't be able to chart here
    if charts and CAN_CHART_MPL:
        for col in get_numeric(result_df):
            if col in ignore_axes:  # don't plot these ones
                continue
            elif col not in valid_axes:  # skip score / std
                ser = result_df[col]
                color = [def_color for _ in range(ser.shape[0])]

                # highlight if needed
                if sort_results and highlight_best:
                    color[0] = highlight_col

                # build scatter plot
                plt.scatter(ser, result_df[y_axis], color=color)
                plt.title(col)
                plt.ylabel(y_axis)

                # if there's a '__' in the col, split it
                x_lab = col if '__' not in col else col.split('__')[-1]
                plt.xlabel(x_lab)

                # render
                plt.show()

        for col in list(result_df.columns[result_df.dtypes == "object"]):
            cat_plot = result_df[y_axis].groupby(result_df[col]).mean()
            cat_plot = cat_plot.sort_values()  # sort_values returns a copy
            cat_plot.plot(kind="barh", xlim=(.5, None),
                          figsize=(7, cat_plot.shape[0] / 2))
            plt.show()

    elif charts and not CAN_CHART_MPL:
        warnings.warn('no module matplotlib, will not be able to display charts',
                      ImportWarning)

    return result_df if not return_drops else (result_df, drops)


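# A usage sketch (hypothetical: assumes ``search`` is an already-fitted
# sklearn ``RandomizedSearchCV`` or ``GridSearchCV``):
#
#     >>> results = report_grid_score_detail(search, charts=False)
#     >>> results, drops = report_grid_score_detail(
#     ...     search, charts=False, sort_by='std_test_score', return_drops=True)

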
def report_confusion_matrix(actual, pred, return_metrics=True):
    """Return a dataframe with the confusion matrix, and a series
    with the classification performance metrics.

    Parameters
    ----------

    actual : np.ndarray, shape=(n_samples,)
        The array of actual values

    pred : np.ndarray, shape=(n_samples,)
        The array of predicted values

    return_metrics : bool, optional (default=True)
        Whether to return the metrics in a pd.Series. If False,
        index 1 of the returned tuple will be None.

    Returns
    -------

    conf : pd.DataFrame, shape=(2, 2)
        The confusion matrix

    ser : pd.Series or None
        The metrics if ``return_metrics`` else None
    """
    # ensure only two classes in each
    lens = [len(set(actual)), len(set(pred))]
    max_len = np.max(lens)
    if max_len > 2:
        raise ValueError('max classes is 2, but got %i' % max_len)

    cf = cm(actual, pred)
    # format: (col = pred, index = act)
    # array([[TN, FP],
    #        [FN, TP]])

    ser = None
    if return_metrics:
        total_pop = np.sum(cf)
        condition_pos = np.sum(cf[1, :])
        condition_neg = np.sum(cf[0, :])

        # alias the elements in the matrix
        tp = cf[1, 1]
        fp = cf[0, 1]
        tn = cf[0, 0]
        fn = cf[1, 0]

        # sums of the prediction cols
        pred_pos = tp + fp
        pred_neg = tn + fn

        acc = (tp + tn) / total_pop       # accuracy
        tpr = tp / condition_pos          # sensitivity, recall
        fpr = fp / condition_neg          # fall-out
        fnr = fn / condition_pos          # miss rate
        tnr = tn / condition_neg          # specificity
        prev = condition_pos / total_pop  # prevalence
        plr = tpr / fpr                   # positive likelihood ratio, LR+
        nlr = fnr / tnr                   # negative likelihood ratio, LR-
        dor = plr / nlr                   # diagnostic odds ratio
        prc = tp / pred_pos               # precision, positive predictive value
        fdr = fp / pred_pos               # false discovery rate
        fomr = fn / pred_neg              # false omission rate
        npv = tn / pred_neg               # negative predictive value

        # define the series
        d = {
            'Accuracy': acc,
            'Diagnostic odds ratio': dor,
            'Fall-out': fpr,
            'False discovery rate': fdr,
            'False Neg. Rate': fnr,
            'False omission rate': fomr,
            'False Pos. Rate': fpr,
            'Miss rate': fnr,
            'Neg. likelihood ratio': nlr,
            'Neg. predictive value': npv,
            'Pos. likelihood ratio': plr,
            'Pos. predictive value': prc,
            'Precision': prc,
            'Prevalence': prev,
            'Recall': tpr,
            'Sensitivity': tpr,
            'Specificity': tnr,
            'True Pos. Rate': tpr,
            'True Neg. Rate': tnr
        }

        ser = pd.Series(data=d)
        ser.name = 'Metrics'

    # create the DF
    conf = pd.DataFrame.from_records(data=cf, columns=['Neg', 'Pos'])
    conf.index = ['Neg', 'Pos']

    return conf, ser
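

# A usage sketch on a small binary problem (more than two classes raises
# a ValueError):
#
#     >>> actual = np.array([0, 1, 1, 0, 1])
#     >>> pred = np.array([0, 1, 0, 1, 1])
#     >>> conf, metrics = report_confusion_matrix(actual, pred)
#     >>> conf.loc['Pos', 'Pos']  # true positives
#     2
#     >>> metrics['Accuracy']
#     0.6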