# -*- coding: utf-8 -*-
from ..utils.validation import check_dataframe
from ..utils.dataframe import get_continuous_columns
from .univariate import fisher_pearson_skewness, kurtosis
import pandas as pd
import numpy as np
# Public API of this module: the lone ``summarize`` entry point.
__all__ = ["summarize"]
def summarize(X):
    """Summarize a dataframe.

    Create a more in-depth summary of a dataframe than ``pd.describe`` will
    give you. This includes details on skewness, arity (for categorical
    features) and more. For continuous features (floats), this summary
    computes:

        * Mean
        * Median
        * Max
        * Min
        * Variance
        * Skewness
        * Kurtosis

    For categorical features:

        * Least frequent class
        * Most frequent class
        * Class balance (n_least_freq / n_most_freq; higher is better)
        * Num Levels
        * Arity (n_unique_classes / n_samples; lower is better)

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input data. Can be comprised of categorical or continuous data,
        and will be cast to pandas DataFrame for the computations.

    Returns
    -------
    stats : DataFrame
        The summarized dataframe

    Notes
    -----
    The skewness of a normal distribution is zero, and symmetric data should
    exhibit a skewness near zero. Positive values for skewness indicate the
    data is skewed right, and negative indicate they're skewed left. If the
    data are multi-modal, this may impact the sign of the skewness.

    Examples
    --------
    >>> import skoot
    >>> import pandas as pd
    >>> import numpy as np
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=1000, n_features=20,
    ...                            n_informative=12, random_state=1)
    >>> X = pd.DataFrame.from_records(X[:, :5],
    ...                               columns=['a', 'b', 'c', 'd', 'e'])
    >>> # Make one into a binary column
    >>> X['d'] = (np.random.RandomState(1).rand(X.shape[0]) > 0.9).astype(int)
    >>> skoot.summarize(X)
                           a         b         c         d         e
    Mean           -1.036419 -0.382853 -0.007993       NaN  0.394417
    Median         -0.968732 -0.382114 -0.047757       NaN  0.283779
    Max             4.559433  9.863773  2.991107       NaN  7.344063
    Min            -6.147430 -8.301872 -2.679137       NaN -5.866428
    Variance        3.324646  5.832246  0.985764       NaN  3.938836
    Skewness       -0.059496  0.148757  0.121908       NaN  0.021251
    Kurtosis        0.069795 -0.040619 -0.098477       NaN -0.187570
    Least Freq.          NaN       NaN       NaN      (1,)       NaN
    Most Freq.           NaN       NaN       NaN      (0,)       NaN
    Class Balance        NaN       NaN       NaN  0.113586       NaN
    Num Levels           NaN       NaN       NaN         2       NaN
    Arity                NaN       NaN       NaN     0.002       NaN
    Missing         0.000000  0.000000  0.000000         0  0.000000

    References
    ----------
    .. [1] Measures of Skewness and Kurtosis
           https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
    """
    X, cols = check_dataframe(X, cols=None, assert_all_finite=False)
    n_samples = X.shape[0]

    # Missing counts are computed on ALL columns, regardless of type.
    n_missing = X.isnull().sum().values

    # Partition the columns: floats are treated as continuous, everything
    # else as categorical.
    continuous = get_continuous_columns(X)
    scont_cols = set(continuous.columns.tolist())
    other_cols = [c for c in cols if c not in scont_cols]

    continuous_stats_cols = ["Mean", "Median", "Max", "Min",
                             "Variance", "Skewness", "Kurtosis"]
    categorical_stats_cols = ["Least Freq.", "Most Freq.",
                              "Class Balance", "Num Levels",
                              "Arity"]

    # The final index: continuous statistics first, then categorical ones.
    all_stats_cols = continuous_stats_cols + categorical_stats_cols

    stats = None
    if scont_cols:
        # For each continuous feature, compute the following:
        # * mean
        # * median
        # * max
        # * min
        # * variance
        # * Fisher-Pearson skewness
        # * Kurtosis
        #
        # We can largely vectorize each of these over the axis...
        means = continuous.mean().values
        medians = continuous.median().values
        maxes = continuous.max().values
        mins = continuous.min().values
        var = continuous.var().values

        # https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm
        fp_skew = continuous.apply(fisher_pearson_skewness).values
        kurt = continuous.apply(kurtosis).values

        stats = pd.DataFrame.from_records(
            data=[means, medians, maxes, mins, var, fp_skew, kurt],
            columns=continuous.columns.tolist(),
            index=continuous_stats_cols)

        # The categorical statistics are not computed for continuous
        # features; re-indexing inserts those rows as NaN.
        stats = stats.reindex(all_stats_cols)

    # Now for each categorical feature, compute the following:
    # * Least populated class
    # * Most populated class
    # * Ratio of most-populous class: least-populous
    # * Arity (num unique factor levels/num samples)
    def categ_summary(feature):
        # ``value_counts`` sorts descending, so the first count belongs to
        # the most populous class and the last to the least populous.
        vc = feature.value_counts()
        idcs, values = vc.index.values, vc.values
        n_levels = idcs.shape[0]

        # if there is only one value we have to return it as both
        # the most populous and the least-populous...
        if n_levels == 1:
            least_pop = most_pop = idcs[0]
            ratio = 1.
        else:
            # there might be ties, so use masks to determine all classes
            # that are least/most populated & return those as tuples
            least_pop = tuple(idcs[values == values[-1]])
            most_pop = tuple(idcs[values == values[0]])

            # only care about the ratio of the LEAST populous class to the
            # most populous one
            ratio = values[-1] / float(values[0])

        arity = n_levels / float(n_samples)
        return least_pop, most_pop, ratio, n_levels, arity

    # apply the categorical function
    if other_cols:
        categ_results = pd.DataFrame.from_records(
            data=np.array(
                X[other_cols].apply(categ_summary)
                             .values
                             .tolist()).T,
            columns=other_cols,
            index=categorical_stats_cols)

        # Continuous statistics are not computed for categorical features;
        # re-indexing inserts those rows as NaN (and puts the index into
        # the same order as the continuous block, for the concat below).
        categ_results = categ_results.reindex(all_stats_cols)

        # cbind to the continuous stats if they exist, otherwise the
        # categorical results ARE the stats
        stats = categ_results if stats is None else \
            pd.concat([stats, categ_results], axis=1)

    if stats is None:
        # Degenerate case: the frame had no columns at all. Return an empty
        # (but well-formed) summary rather than raising a confusing
        # TypeError on the column selection below.
        stats = pd.DataFrame(index=all_stats_cols)

    # Restore the original column order before appending the missing-value
    # row, since ``n_missing`` is ordered like ``cols``.
    stats = stats[cols]
    stats.loc["Missing"] = n_missing
    return stats