Source code for skutil.utils.util

# -*- coding: utf-8 -*-

from __future__ import print_function, division, absolute_import
import warnings
import sys
import traceback
import numpy as np
import pandas as pd
import numbers
import scipy.stats as st
from sklearn.datasets import load_iris, load_breast_cancer, load_boston
from sklearn.externals import six
from sklearn.metrics import confusion_matrix as cm
from ..base import suppress_warnings
from .fixes import (_grid_detail, _is_integer, is_iterable, 
                    _cols_if_none, dict_keys, dict_values)

try:
    # this causes a UserWarning to be thrown by matplotlib... should we squelch this?
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        import matplotlib
        matplotlib.use('Agg')  # set backend
        from matplotlib import pyplot as plt

        # log it
        CAN_CHART_MPL = True
except ImportError:
    CAN_CHART_MPL = False

try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        import seaborn as sns

        CAN_CHART_SNS = True
except ImportError:
    CAN_CHART_SNS = False

__max_exp__ = 1e19
__min_log__ = -19
__all__ = [
    'corr_plot',
    'df_memory_estimate',
    'exp',
    'flatten_all',
    'flatten_all_generator',
    'get_numeric',
    'human_bytes',
    'is_entirely_numeric',
    'is_float',
    'is_integer',
    'is_numeric',
    'load_boston_df',
    'load_breast_cancer_df',
    'load_iris_df',
    'log',
    'pd_stats',
    'report_confusion_matrix',
    'report_grid_score_detail',
    'shuffle_dataframe',
    'validate_is_pd'
]


@suppress_warnings
def _log_single(x):
    """Sanitized log function for a single element.
    Since this method internally calls np.log and carries
    the (very likely) possibility to overflow, the method
    suppresses all warnings.

    #XXX: at some point we might want to let ``suppress_warnings``
    # specify exactly which types of warnings it should filter.

    Parameters
    ----------

    x : float, int
        The number to log

    Returns
    -------

    val : float
        the log of x
    """
    x = np.maximum(0, x)
    val = __min_log__ if x == 0 else np.maximum(__min_log__, np.log(x))
    return val


@suppress_warnings
def _exp_single(x):
    """Sanitized exponential function.
    Since this method internally calls np.exp and carries
    the (very likely) possibility to overflow, the method
    suppresses all warnings.

    #XXX: at some point we might want to let ``suppress_warnings``
    # specify exactly which types of warnings it should filter.

    Parameters
    ----------

    x : float, int
        The number to exp


    Returns
    -------

    val : float
        the exp of x
    """
    val = np.minimum(__max_exp__, np.exp(x))
    return val


def _vectorize(fun, x):
    if is_iterable(x):
        return np.array([fun(p) for p in x])
    raise ValueError('Type %s is not iterable' % type(x))


def exp(x):
    """A safe mechanism for computing the exponential function
    while avoiding overflows.

    Parameters
    ----------

    x : float, number
        The number for which to compute the exp

    Returns
    -------

    exp(x)
    """
    # check on single exp
    if is_numeric(x):
        return _exp_single(x)

    # try vectorized
    try:
        return _vectorize(exp, x)
    except ValueError:
        # bail
        raise ValueError("don't know how to compute exp for type %s" % type(x))


def log(x):
    """A safe mechanism for computing a log while
    avoiding NaNs or exceptions.

    Parameters
    ----------

    x : float, number
        The number for which to compute the log

    Returns
    -------

    log(x)
    """
    # check on single log
    if is_numeric(x):
        return _log_single(x)

    # try vectorized
    try:
        return _vectorize(log, x)
    except ValueError:
        # bail
        raise ValueError("don't know how to compute log for type %s" % type(x))


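# A usage sketch of the clipped math helpers above: ``log`` floors at
# ``__min_log__`` rather than returning -inf or NaN, ``exp`` caps at
# ``__max_exp__`` rather than overflowing, and both recurse over iterables:
#
#     >>> log(0)
#     -19
#     >>> exp(1000)  # np.exp would overflow to inf; clipped instead
#     1e+19
#     >>> log([1., np.e])
#     array([ 0.,  1.])

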
def _val_cols(cols):
    # if it's None, return immediately
    if cols is None:
        return cols

    # try to make cols a list
    if not is_iterable(cols):
        if isinstance(cols, six.string_types):
            return [cols]
        else:
            raise ValueError('cols must be an iterable sequence')

    # if it is an index or a np.ndarray, it will have a built-in
    # (potentially more efficient) tolist() method
    if hasattr(cols, 'tolist') and hasattr(cols.tolist, '__call__'):
        return cols.tolist()

    # make it a list implicitly, make no guarantees about elements
    return [c for c in cols]


def _def_headers(X):
    m = X.shape[1] if hasattr(X, 'shape') else len(X)
    return ['V%i' % (i + 1) for i in range(m)]


def corr_plot(X, plot_type='cor', cmap='Blues_d', n_levels=5,
              corr=None, method='pearson', figsize=(11, 9),
              cmap_a=220, cmap_b=10, vmax=0.3, xticklabels=5,
              yticklabels=5, linewidths=0.5, cbar_kws=None):
    """Create a simple correlation plot given a dataframe.
    Note that this requires all datatypes to be numeric and finite!

    Parameters
    ----------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The pandas DataFrame on which to compute correlations,
        or if ``corr`` is 'precomputed', the correlation matrix.
        In the case that ``X`` is a correlation matrix, it must
        be square, i.e., shape=(n_features, n_features).

    plot_type : str, optional (default='cor')
        The type of plot, one of ('cor', 'kde', 'pair')

    cmap : str, optional (default='Blues_d')
        The color to use for the kernel density estimate plot
        if ``plot_type`` == 'kde'. Otherwise unused.

    n_levels : int, optional (default=5)
        The number of levels to use for the kde plot
        if ``plot_type`` == 'kde'. Otherwise unused.

    corr : 'precomputed' or None, optional (default=None)
        If None, the correlation matrix is computed, otherwise
        if 'precomputed', ``X`` is treated as a correlation matrix.

    method : str, optional (default='pearson')
        The method to use for correlation

    figsize : tuple (int), shape=(w,h), optional (default=(11,9))
        The size of the image

    cmap_a : int, optional (default=220)
        The colormap start point

    cmap_b : int, optional (default=10)
        The colormap end point

    vmax : float, optional (default=0.3)
        Arg for seaborn heatmap

    xticklabels : int, optional (default=5)
        The spacing for X ticks

    yticklabels : int, optional (default=5)
        The spacing for Y ticks

    linewidths : float, optional (default=0.5)
        The width of the lines

    cbar_kws : dict, optional (default=None)
        Any KWs to pass to seaborn's heatmap when ``plot_type`` = 'cor'.
        If None, will default to {'shrink': 0.5}
    """
    X, _ = validate_is_pd(X, None, assert_all_finite=True)
    valid_types = ('cor', 'kde', 'pair')
    if plot_type not in valid_types:
        raise ValueError('expected one of (%s), but got %s'
                         % (','.join(valid_types), plot_type))

    if cbar_kws is None:
        cbar_kws = {'shrink': 0.5}

    # seaborn is needed for all of these, so we have to check outside
    if not CAN_CHART_SNS:
        warnings.warn('Cannot plot (unable to import Seaborn)', ImportWarning)
        return None

    if plot_type == 'cor':
        # MPL is only needed for COR
        if not CAN_CHART_MPL:
            warnings.warn('Cannot plot (unable to import Matplotlib)')
            return None

        if not corr == 'precomputed':
            cols = X.columns.values
            X = X.corr(method=method)
            X.columns = cols
            X.index = cols

        # mask for upper triangle
        mask = np.zeros_like(X, dtype=np.bool)
        mask[np.triu_indices_from(mask)] = True

        # set up mpl figure
        f, ax = plt.subplots(figsize=figsize)
        color_map = sns.diverging_palette(cmap_a, cmap_b, as_cmap=True)
        sns.heatmap(X, mask=mask, cmap=color_map, vmax=vmax,
                    square=True, xticklabels=xticklabels,
                    yticklabels=yticklabels, linewidths=linewidths,
                    cbar_kws=cbar_kws, ax=ax)

    elif plot_type == 'pair':
        sns.pairplot(X)
        sns.plt.show()

    else:
        g = sns.PairGrid(X)
        g.map_diag(sns.kdeplot)
        g.map_offdiag(sns.kdeplot, cmap=cmap, n_levels=n_levels)
        sns.plt.show()


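# A usage sketch for ``corr_plot`` (it renders via seaborn/matplotlib when
# those imports succeeded above; otherwise it warns and returns None):
#
#     >>> X = load_iris_df(include_tgt=False)
#     >>> corr_plot(X, plot_type='cor', method='spearman')  # computes corr itself
#     >>> corr_plot(X.corr(), corr='precomputed')           # square matrix passed in

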
def flatten_all(container):
    """Recursively flattens an arbitrarily nested iterable.
    WARNING: this function may produce a list of mixed types.

    Parameters
    ----------

    container : array_like, shape=(n_items,)
        The iterable to flatten. If the ``container`` is
        not iterable, it will be returned in a list as
        ``[container]``

    Examples
    --------

    The example below produces a list of mixed results:

        >>> a = [[[], 3, 4], ['1', 'a'], [[[1]]], 1, 2]
        >>> flatten_all(a)
        [3, 4, '1', 'a', 1, 1, 2]

    Returns
    -------

    l : list, shape=(n_items,)
        The flattened list
    """
    l = [x for x in flatten_all_generator(container)]
    return l


def flatten_all_generator(container):
    """Recursively flattens an arbitrarily nested iterable.
    WARNING: this function may produce a list of mixed types.

    Parameters
    ----------

    container : array_like, shape=(n_items,)
        The iterable to flatten. If the ``container`` is
        not iterable, it will be returned in a list as
        ``[container]``

    Examples
    --------

    The example below produces a list of mixed results:

        >>> a = [[[], 3, 4], ['1', 'a'], [[[1]]], 1, 2]
        >>> flatten_all(a)  # yields a generator for this iterable
        [3, 4, '1', 'a', 1, 1, 2]

    Returns
    -------

    generator object
    """
    if not is_iterable(container):
        yield container
    else:
        for i in container:
            if is_iterable(i):
                for j in flatten_all_generator(i):
                    yield j
            else:
                yield i


def shuffle_dataframe(X):
    """Shuffle the rows in a data frame without replacement. The
    random state used for shuffling is controlled by numpy's
    random state.

    Parameters
    ----------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The dataframe to shuffle
    """
    X, _ = validate_is_pd(X, None, False)
    return X.iloc[np.random.permutation(np.arange(X.shape[0]))]


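# A usage sketch: shuffling draws from numpy's global random state, so it
# can be made reproducible by seeding first:
#
#     >>> np.random.seed(42)
#     >>> shuffled = shuffle_dataframe(load_iris_df())

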
def validate_is_pd(X, cols, assert_all_finite=False):
    """Used within each SelectiveMixin fit method to determine
    whether the passed ``X`` is a dataframe, and whether the cols
    is appropriate. There are four scenarios (in the order in which
    they're checked):

    1) Names is not None, but X is not a dataframe.
        Resolution: the method will attempt to return a DataFrame from
        the args provided (with default names), but catches any exception
        and raises a ValueError. A common case where this would work may
        be a numpy.ndarray as X, and a list as cols (where the list is
        either int indices or default names that the dataframe will take on).

    2) X is a DataFrame, but cols is None.
        Resolution: return a copy of the dataframe, and use all column names.

    3) X is a DataFrame and cols is not None.
        Return a copy of the dataframe, and use only the names provided.
        This is the typical use case.

    4) X is not a DataFrame, and cols is None.
        Resolution: this case will only work if the X can be built into
        a DataFrame. Otherwise, there will be a ValueError thrown.

    Parameters
    ----------

    X : array_like, shape=(n_samples, n_features)
        The dataframe to validate. If ``X`` is not a DataFrame,
        but it can be made into one, no exceptions will be raised.
        However, if ``X`` cannot naturally be made into a DataFrame,
        a TypeError will be raised.

    cols : array_like (str), shape=(n_features,)
        The list of column names. Used particularly in SelectiveMixin
        transformers that validate column names.

    assert_all_finite : bool, optional (default=False)
        If True, will raise a ValueError if any np.nan or np.inf
        values reside in ``X``.

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        A copy of the original input ``X``

    cols : list or None, shape=(n_features,)
        If ``cols`` was not None and did not raise a TypeError, it
        is converted into a list of strings and returned as a copy.
        Else None.
    """

    def _check(X, cols):
        # first check hard-to-detect case:
        if isinstance(X, pd.Series):
            raise ValueError('expected DataFrame but got Series')

        # validate the cols arg
        cols = _val_cols(cols)

        # if someone devious gave us an empty set of cols
        if cols is not None and len(cols) == 0:
            cols = None

        # avoid multiple isinstance checks
        is_df = isinstance(X, pd.DataFrame)

        # we do want to make sure the X at least is "array-like"
        if not is_iterable(X):
            raise TypeError('X (type=%s) cannot be cast to DataFrame' % type(X))

        # case 1, we have names but the X is not a frame
        if not is_df and cols is not None:
            # this is tough, because they only pass cols if it's a subset
            # and this frame is likely too large for the passed columns.
            # so, we hope they either passed what the col names WILL be
            # or that they passed numeric cols... they should handle that
            # validation on their end, though. If this fails, we'll just let
            # it fall through.
            return pd.DataFrame.from_records(data=X, columns=_def_headers(X)), cols

        # case 2, we have a DF but no cols, def behavior: use all
        elif is_df and cols is None:
            return X.copy(), None

        # case 3, we have a DF AND cols
        elif is_df and cols is not None:
            return X.copy(), cols

        # case 4, we have neither a frame nor cols (maybe JUST a np.array?)
        else:
            # we'll do two tests here... either that it's a np ndarray
            # or a list of lists
            if isinstance(X, np.ndarray) or \
                    (is_iterable(X) and all(isinstance(elem, list) for elem in X)):
                return pd.DataFrame.from_records(data=X, columns=_def_headers(X)), None

            # bail out:
            raise ValueError('cannot handle data of type %s' % type(X))

    # do initial check
    X, cols = _check(X, cols)

    # we need to ensure all are finite
    if assert_all_finite:
        # if cols, we only need to ensure the specified columns are finite
        cols_tmp = _cols_if_none(X, cols)

        # subset the subset... only numerics, and apply
        # only to the non-object columns
        X_prime = X[get_numeric(X[cols_tmp])]
        if X_prime.apply(lambda x: (~np.isfinite(x)).sum()).sum() > 0:
            raise ValueError('Expected all entries to be finite')

    return X, cols


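# A sketch of the four ``validate_is_pd`` scenarios described above:
#
#     >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#     >>> X, cols = validate_is_pd(df, None)       # case 2: copy of df, cols=None
#     >>> X, cols = validate_is_pd(df, ['a'])      # case 3: copy of df, cols=['a']
#     >>> X, cols = validate_is_pd(np.array([[1, 2], [3, 4]]), None)  # case 4: cols V1, V2
#     >>> validate_is_pd(5, None)  # not array-like -> raises TypeError

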
def df_memory_estimate(X, unit='MB', index=False):
    """Estimate the memory footprint of a dataframe to determine
    whether it's capable of being held in memory or not.

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The DataFrame in question

    unit : str, optional (default='MB')
        The units to report. One of ('MB', 'KB', 'GB', 'TB')

    index : bool, optional (default=False)
        Whether to also estimate the memory footprint of the index.

    Returns
    -------

    mb : str
        The estimated number of UNIT held in the frame
    """
    X, _ = validate_is_pd(X, None, False)
    return human_bytes(X.memory_usage(index=index).sum(), unit)


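# A usage sketch; 1000 rows of 8 float64 columns is 64,000 bytes:
#
#     >>> X = pd.DataFrame(np.zeros((1000, 8)))
#     >>> df_memory_estimate(X, unit='KB')
#     '62.500 KB'

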
def _is_int(x, tp):
    """Determine whether a column can be
    cast to int without loss of data
    """
    if not any([tp.startswith(i) for i in ('float', 'int')]):
        return False

    # if there's no difference between the two, then it's an int.
    try:
        return (x - x.astype('int')).abs().sum() == 0
    except ValueError:  # happens when there are NaNs
        return False


def pd_stats(X, col_type='all', na_str='--', hi_skew_thresh=1.0, mod_skew_thresh=0.5):
    """Get a descriptive report of the elements in the data frame.
    Builds on the existing pandas ``describe`` method by adding counts of
    factor-level features, a skewness rating and several other helpful
    statistics.

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The DataFrame on which to compute stats.

    col_type : str, optional (default='all')
        The types of columns to analyze. One of ('all',
        'numeric', 'object'). If not all, will only return
        corresponding typed columns.

    na_str : str, optional (default='--')
        The string to display in a cell that is not applicable
        for the column's datatype.

    hi_skew_thresh : float, optional (default=1.0)
        The threshold above which a skewness rating will
        be deemed "high."

    mod_skew_thresh : float, optional (default=0.5)
        The threshold above which a skewness rating will be
        deemed "moderate," so long as it does not exceed
        ``hi_skew_thresh``

    Returns
    -------

    s : Pandas ``DataFrame``
        The resulting stats dataframe
    """
    X, _ = validate_is_pd(X, None, False)
    raw_stats = X.describe()
    stats = raw_stats.to_dict()
    dtypes = X.dtypes

    # validate col_type
    valid_types = ('all', 'numeric', 'object')
    if col_type not in valid_types:
        raise ValueError('expected col_type in (%s), but got %s'
                         % (','.join(valid_types), col_type))

    # if user only wants part back, we can use this...
    type_dict = {}

    # the string to use when we don't want to populate a cell
    _nastr = na_str

    # objects are going to get dropped in the describe() call,
    # so we need to add them back in with dicts of nastr for all...
    object_dtypes = dtypes[dtypes == 'object']
    if object_dtypes.shape[0] > 0:
        obj_nms = object_dtypes.index.values
        for nm in obj_nms:
            obj_dct = {stat: _nastr for stat in raw_stats.index.values}
            stats[nm] = obj_dct

    # we'll add rows to the stats...
    for col, dct in six.iteritems(stats):
        # add the dtype
        _dtype = str(dtypes[col])
        is_numer = any([_dtype.startswith(x) for x in ('int', 'float')])
        dct['dtype'] = _dtype

        # populate type_dict
        type_dict[col] = 'numeric' if is_numer else 'object'

        # if the dtype is not a float, we can
        # get the count of uniques, then do a
        # ratio of majority : minority
        _isint = _is_int(X[col], _dtype)
        if _isint or _dtype == 'object':
            _unique = len(X[col].unique())
            _val_cts = X[col].value_counts().sort_values(ascending=True)
            _min_cls, _max_cls = _val_cts.index[0], _val_cts.index[-1]

            # if there's only one class...
            if _min_cls == _max_cls:
                _min_cls = _nastr
                _min_max_ratio = _nastr
            else:
                _min_max_ratio = _val_cts.values[0] / _val_cts.values[-1]

            # chance we didn't recognize it as an int before...
            if 'float' in dct['dtype']:
                dct['dtype'] = dct['dtype'].replace('float', 'int')
        else:
            _unique = _min_max_ratio = _nastr

        # populate the unique count and more
        dct['unique_ct'] = _unique
        dct['min_max_class_ratio'] = _min_max_ratio

        # get the skewness...
        if is_numer:
            _skew, _kurt = X[col].skew(), X[col].kurtosis()
            abs_skew = abs(_skew)
            hs, ms = hi_skew_thresh, mod_skew_thresh
            _skew_risk = 'high skew' if abs_skew > hs \
                else 'mod. skew' if (ms < abs_skew < hs) \
                else 'symmetric'
        else:
            _skew = _kurt = _skew_risk = _nastr

        dct['skewness'] = _skew
        dct['skewness rating'] = _skew_risk
        dct['kurtosis'] = _kurt

    # go through and pop the keys that might be filtered on
    if col_type != 'all':
        stat_out = {}
        for col, dtype in six.iteritems(type_dict):
            if col_type == dtype:
                stat_out[col] = stats[col]
    else:
        stat_out = stats

    s = pd.DataFrame.from_dict(stat_out)
    return s


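# A usage sketch for ``pd_stats``, which augments ``describe()`` output with
# dtypes, unique counts, class-balance ratios and a skewness rating per column:
#
#     >>> X = load_iris_df()
#     >>> stats = pd_stats(X)  # all columns
#     >>> numerics = pd_stats(X, col_type='numeric', hi_skew_thresh=0.75)

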
def get_numeric(X):
    """Return the list of column labels of numeric dtype variables

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The dataframe

    Returns
    -------

    list
        The list of column labels which are numeric.
    """
    # don't want to assert finite, or we might get endless recursion
    validate_is_pd(X, cols=None, assert_all_finite=False)
    return X.dtypes[X.dtypes.apply(
        lambda x: str(x).startswith(("float", "int")))].index.tolist()


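# A behavior sketch: ``get_numeric`` returns column labels, not positions:
#
#     >>> X = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y'], 'c': [0.5, 1.5]})
#     >>> get_numeric(X)
#     ['a', 'c']

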
def human_bytes(b, unit='MB'):
    """Get bytes in a human readable form

    Parameters
    ----------

    b : int
        The number of bytes

    unit : str, optional (default='MB')
        The units to report. One of ('MB', 'KB', 'GB', 'TB')

    Returns
    -------

    str
        The number of bytes expressed in ``unit``
    """
    kb = float(1024)
    units = {
        'KB': kb,
        'MB': float(kb ** 2),
        'GB': float(kb ** 3),
        'TB': float(kb ** 4)
    }

    if unit not in units:
        raise ValueError('got %s, expected one of (%s)'
                         % (unit, ', '.join(dict_keys(units))))

    return '%.3f %s' % (b / units[unit], unit)


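# A usage sketch for ``human_bytes``:
#
#     >>> human_bytes(1024 ** 2, 'MB')
#     '1.000 MB'
#     >>> human_bytes(5e9, 'GB')
#     '4.657 GB'

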
def is_entirely_numeric(X):
    """Determines whether an entire pandas frame
    is numeric in dtypes.

    Parameters
    ----------

    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The dataframe to test

    Returns
    -------

    bool
        True if the entire pd.DataFrame is numeric else False
    """
    return X.shape[1] == len(get_numeric(X))


def is_integer(x):
    """Determine whether some object ``x`` is an
    integer type (int, long, etc).

    Parameters
    ----------

    x : object
        The item to assess

    Returns
    -------

    bool
        True if ``x`` is an integer type
    """
    try:
        python_major_version = sys.version_info.major
        assert (python_major_version == 2 or python_major_version == 3)

        if python_major_version == 2:
            return (not isinstance(x, (bool, np.bool))) and \
                isinstance(x, (numbers.Integral, int, long, np.int, np.long))
        elif python_major_version == 3:
            return (not isinstance(x, (bool, np.bool))) and \
                isinstance(x, (numbers.Integral, int, np.int, np.long))
    except AssertionError:
        _, _, tb = sys.exc_info()
        traceback.print_tb(tb)  # Fixed format
        tb_info = traceback.extract_tb(tb)
        filename, line, func, text = tb_info[-1]
        print('An error occurred on line {} in statement {}'.format(line, text))
        sys.exit(1)

    return _is_integer(x)


def is_float(x):
    """Determine whether some object ``x`` is a
    float type (float, np.float, etc).

    Parameters
    ----------

    x : object
        The item to assess

    Returns
    -------

    bool
        True if ``x`` is a float type
    """
    return isinstance(x, (float, np.float)) or \
        (not isinstance(x, (bool, np.bool)) and isinstance(x, numbers.Real))


def is_numeric(x):
    """Determine whether some object ``x`` is a
    numeric type (float, int, etc).

    Parameters
    ----------

    x : object
        The item to assess

    Returns
    -------

    bool
        True if ``x`` is a float or integer type
    """
    return is_float(x) or is_integer(x)


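# A behavior sketch for the three type predicates above. Booleans are
# deliberately rejected, and numpy scalar types are covered because they
# register with the ``numbers`` ABCs:
#
#     >>> is_integer(3), is_integer(np.int64(3)), is_integer(True)
#     (True, True, False)
#     >>> is_float(3.14), is_numeric(3), is_numeric('3')
#     (True, True, False)

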
def load_iris_df(include_tgt=True, tgt_name="Species", shuffle=False):
    """Loads the iris dataset into a dataframe with the
    target set as the "Species" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="Species")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows on return

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    iris = load_iris()
    X = pd.DataFrame.from_records(data=iris.data, columns=iris.feature_names)

    if include_tgt:
        X[tgt_name] = iris.target

    return X if not shuffle else shuffle_dataframe(X)


def load_breast_cancer_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the breast cancer dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    bc = load_breast_cancer()
    X = pd.DataFrame.from_records(data=bc.data, columns=bc.feature_names)

    if include_tgt:
        X[tgt_name] = bc.target

    return X if not shuffle else shuffle_dataframe(X)


def load_boston_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the boston housing dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows

    Returns
    -------

    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded dataset
    """
    bo = load_boston()
    X = pd.DataFrame.from_records(data=bo.data, columns=bo.feature_names)

    if include_tgt:
        X[tgt_name] = bo.target

    return X if not shuffle else shuffle_dataframe(X)


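# A usage sketch for the three sklearn-dataset loaders above:
#
#     >>> iris = load_iris_df()                     # appends a 'Species' target
#     >>> bc = load_breast_cancer_df(tgt_name='y')  # custom target name
#     >>> bo = load_boston_df(include_tgt=False, shuffle=True)

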
def report_grid_score_detail(random_search, charts=True, sort_results=True,
                             ascending=True, percentile=0.975,
                             y_axis='mean_test_score',
                             sort_by='mean_test_score',
                             highlight_best=True, highlight_col='red',
                             def_color='blue', return_drops=False):
    """Return plots and dataframe of results, given a fitted grid search.
    Note that if Matplotlib is not installed, a warning will be thrown
    and no plots will be generated.

    Parameters
    ----------

    random_search : ``BaseSearchCV`` or ``BaseH2OSearchCV``
        The fitted grid search

    charts : bool, optional (default=True)
        Whether to plot the charts

    sort_results : bool, optional (default=True)
        Whether to sort the results based on score

    ascending : bool, optional (default=True)
        If ``sort_results`` is True, whether to use asc or desc
        in the sorting process.

    percentile : float, optional (default=0.975)
        The percentile point (0 < percentile < 1.0).
        The corresponding z-score will be multiplied by the
        cross validation score standard deviations.

    y_axis : str, optional (default='mean_test_score')
        The y-axis of the charts. One of
        ('mean_test_score', 'std_test_score')

    sort_by : str, optional (default='mean_test_score')
        The column to sort by. This is not validated, in case
        the user wants to sort by a parameter column. If
        not ``sort_results``, this is unused.

    highlight_best : bool, optional (default=True)
        If set to True, charts is True, and sort_results is
        also True, then highlights the point in the top
        position of the model DF.

    highlight_col : str, optional (default='red')
        What color to use for ``highlight_best`` if both ``charts``
        and ``highlight_best``. If either is False, this is unused.

    def_color : str, optional (default='blue')
        What color to use for the points if ``charts`` is True.
        This should differ from ``highlight_col``, but no validation
        is performed.

    return_drops : bool, optional (default=False)
        If True, will return the list of names that can be dropped
        out (i.e., were generated by sklearn and are not parameters
        of interest).

    Returns
    -------

    result_df : pd.DataFrame
        The grid search results

    drops : list
        List of sklearn-generated names. Only returned if
        ``return_drops`` is True.
    """
    valid_axes = ('mean_test_score', 'std_test_score')

    # these are produced in sklearn 0.18 but not 0.17 --
    # want to skip for now...
    ignore_axes = ('mean_fit_time', 'mean_score_time', 'mean_train_score',
                   'std_fit_time', 'std_score_time', 'std_train_score')

    # validate y-axis
    if y_axis not in valid_axes:
        raise ValueError('y-axis=%s must be one of (%s)'
                         % (y_axis, ', '.join(valid_axes)))

    # validate percentile
    if not (0 < percentile < 1):
        raise ValueError('percentile must be > 0 and < 1, but got %.5f' % percentile)
    z_score = st.norm.ppf(percentile)

    # make into a data frame from search
    result_df, drops = _grid_detail(random_search, z_score=z_score,
                                    sort_results=sort_results,
                                    sort_by=sort_by, ascending=ascending)

    # if the import failed, we won't be able to chart here
    if charts and CAN_CHART_MPL:
        for col in get_numeric(result_df):
            if col in ignore_axes:  # don't plot these ones
                continue
            elif col not in valid_axes:  # skip score / std
                ser = result_df[col]
                color = [def_color for _ in range(ser.shape[0])]

                # highlight if needed
                if sort_results and highlight_best:
                    color[0] = highlight_col

                # build scatter plot
                plt.scatter(ser, result_df[y_axis], color=color)
                plt.title(col)
                plt.ylabel(y_axis)

                # if there's a '__' in the col, split it
                x_lab = col if '__' not in col else col.split('__')[-1]
                plt.xlabel(x_lab)

                # render
                plt.show()

        for col in list(result_df.columns[result_df.dtypes == "object"]):
            cat_plot = result_df[y_axis].groupby(result_df[col]).mean()
            cat_plot = cat_plot.sort_values()  # sort_values returns a copy
            cat_plot.plot(kind="barh", xlim=(.5, None),
                          figsize=(7, cat_plot.shape[0] / 2))
            plt.show()

    elif charts and not CAN_CHART_MPL:
        warnings.warn('no module matplotlib, will not be able to display charts',
                      ImportWarning)

    return result_df if not return_drops else (result_df, drops)


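# A usage sketch (hypothetical: assumes ``search`` is an already-fitted
# sklearn ``RandomizedSearchCV`` or ``GridSearchCV``):
#
#     >>> results = report_grid_score_detail(search, charts=False)
#     >>> results, drops = report_grid_score_detail(
#     ...     search, charts=False, sort_by='std_test_score', return_drops=True)

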
def report_confusion_matrix(actual, pred, return_metrics=True):
    """Return a dataframe with the confusion matrix, and a series
    with the classification performance metrics.

    Parameters
    ----------

    actual : np.ndarray, shape=(n_samples,)
        The array of actual values

    pred : np.ndarray, shape=(n_samples,)
        The array of predicted values

    return_metrics : bool, optional (default=True)
        Whether to return the metrics in a pd.Series. If False,
        index 1 of the returned tuple will be None.

    Returns
    -------

    conf : pd.DataFrame, shape=(2, 2)
        The confusion matrix

    ser : pd.Series or None
        The metrics if ``return_metrics`` else None
    """
    # ensure only two classes in each
    lens = [len(set(actual)), len(set(pred))]
    max_len = np.max(lens)
    if max_len > 2:
        raise ValueError('max classes is 2, but got %i' % max_len)

    cf = cm(actual, pred)
    # format: (col = pred, index = act)
    # array([[TN, FP],
    #        [FN, TP]])

    ser = None
    if return_metrics:
        total_pop = np.sum(cf)
        condition_pos = np.sum(cf[1, :])
        condition_neg = np.sum(cf[0, :])

        # alias the elements in the matrix
        tp = cf[1, 1]
        fp = cf[0, 1]
        tn = cf[0, 0]
        fn = cf[1, 0]

        # sums of the prediction cols
        pred_pos = tp + fp
        pred_neg = tn + fn

        acc = (tp + tn) / total_pop       # accuracy
        tpr = tp / condition_pos          # sensitivity, recall
        fpr = fp / condition_neg          # fall-out
        fnr = fn / condition_pos          # miss rate
        tnr = tn / condition_neg          # specificity
        prev = condition_pos / total_pop  # prevalence
        plr = tpr / fpr                   # positive likelihood ratio, LR+
        nlr = fnr / tnr                   # negative likelihood ratio, LR-
        dor = plr / nlr                   # diagnostic odds ratio
        prc = tp / pred_pos               # precision, positive predictive value
        fdr = fp / pred_pos               # false discovery rate
        fomr = fn / pred_neg              # false omission rate
        npv = tn / pred_neg               # negative predictive value

        # define the series
        d = {
            'Accuracy': acc,
            'Diagnostic odds ratio': dor,
            'Fall-out': fpr,
            'False discovery rate': fdr,
            'False Neg. Rate': fnr,
            'False omission rate': fomr,
            'False Pos. Rate': fpr,
            'Miss rate': fnr,
            'Neg. likelihood ratio': nlr,
            'Neg. predictive value': npv,
            'Pos. likelihood ratio': plr,
            'Pos. predictive value': prc,
            'Precision': prc,
            'Prevalence': prev,
            'Recall': tpr,
            'Sensitivity': tpr,
            'Specificity': tnr,
            'True Pos. Rate': tpr,
            'True Neg. Rate': tnr
        }

        ser = pd.Series(data=d)
        ser.name = 'Metrics'

    # create the DF
    conf = pd.DataFrame.from_records(data=cf, columns=['Neg', 'Pos'])
    conf.index = ['Neg', 'Pos']

    return conf, ser
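

# A usage sketch on a small binary problem (more than two classes raises
# a ValueError):
#
#     >>> actual = np.array([0, 1, 1, 0, 1])
#     >>> pred = np.array([0, 1, 0, 1, 1])
#     >>> conf, metrics = report_confusion_matrix(actual, pred)
#     >>> conf.loc['Pos', 'Pos']  # true positives
#     2
#     >>> metrics['Accuracy']
#     0.6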