Source code for skoot.datasets

# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>

from __future__ import print_function, absolute_import

import pandas as pd

from os.path import dirname, join

# Public API: the names exported by ``from skoot.datasets import *``.
# Each entry is one of the DataFrame loader functions defined below.
__all__ = [
    'load_adult_df',
    'load_boston_df',
    'load_breast_cancer_df',
    'load_iris_df'
]


def _load_from_bunch(bunch, include_tgt, tgt_name, names):
    """Build a DataFrame from a bunch-like dataset object.

    Internal helper shared by the sklearn-backed loaders.

    Parameters
    ----------
    bunch : object
        Any object exposing ``data`` and ``feature_names`` attributes and,
        when ``include_tgt`` is True, a ``target`` attribute.

    include_tgt : bool
        Whether to add the target as a column of the frame.

    tgt_name : str
        The column name under which the target is stored.

    names : iterable or None
        The column names for the dataframe. If None or empty, the
        ``feature_names`` attribute of ``bunch`` is used instead.

    Returns
    -------
    X : pd.DataFrame
        The records in ``bunch.data``, plus the target column if requested.
    """
    # Materialize ``names`` exactly once. Testing the truthiness of the raw
    # argument fails for array-likes (numpy arrays, pandas indexes), whose
    # truth value is ambiguous, and a one-shot iterator must not be consumed
    # before it is handed to pandas.
    columns = [] if names is None else list(names)
    if not columns:
        columns = bunch.feature_names

    X = pd.DataFrame.from_records(data=bunch.data, columns=columns)

    if include_tgt:
        X[tgt_name] = bunch.target
    return X


def load_adult_df(include_tgt=True, tgt_name="target", names=None):
    """Load and return the adult dataset (classification).

    The adult dataset is a classic binary classification problem requiring
    pre-processing prior to being model-ready.

    =================   ============================
    Classes                                        2
    Samples per class       <=50k: 24720; >50k: 7841
    Samples total                              32561
    Dimensionality                                15
    Features             real, positive, categorical
    =================   ============================

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    names : iterable or None
        The column names for the feature columns of the dataframe (the
        target column is named separately via ``tgt_name``). Any iterable
        is accepted. If not defined, will default to the canonical
        feature names.

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded adult dataset

    References
    ----------
    .. [1] Ronny Kohavi and Barry Becker, "Data Mining and Visualization"
           Silicon Graphics. https://archive.ics.uci.edu/ml/datasets/Adult
    """
    if names is None:
        # fall back to the canonical feature names
        names = ["age", "workclass", "fnlwgt", "education",
                 "education-num", "marital-status", "occupation",
                 "relationship", "race", "sex", "capital-gain",
                 "capital-loss", "hours-per-week", "native-country"]
    else:
        # Coerce to a list so that tuples, arrays, indexes and generators
        # can all be concatenated with the target name below.
        names = list(names)

    # The CSV ships alongside this module and has no header row; the target
    # is its last column, so its name goes at the end of the column list.
    module_path = dirname(__file__)
    df = pd.read_csv(join(module_path, 'data', 'adult.csv'), header=None,
                     names=names + [tgt_name])

    # if we want to drop the target, do so now
    if not include_tgt:
        df.pop(tgt_name)
    return df


def load_boston_df(include_tgt=True, tgt_name="target", names=None):
    """Get the Boston housing dataset.

    Loads the boston housing dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    names : iterable or None
        The column names for the dataframe. If not
        defined, will default to the ``feature_names``
        attribute in the sklearn bunch instance.

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded boston dataset

    Notes
    -----
    This relies on ``sklearn.datasets.load_boston``, which was deprecated
    in scikit-learn 1.0 and removed in 1.2. With those versions installed
    the import below raises ``ImportError``.
    """
    # Function-scope import: scikit-learn is only imported when this
    # loader is actually called.
    from sklearn.datasets import load_boston

    bunch = load_boston()
    return _load_from_bunch(bunch, include_tgt, tgt_name, names)


def load_breast_cancer_df(include_tgt=True, tgt_name="target", names=None):
    """Get the breast cancer dataset.

    Loads the breast cancer dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    names : iterable or None
        The column names for the dataframe. If not
        defined, will default to the ``feature_names``
        attribute in the sklearn bunch instance.

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded breast cancer dataset
    """
    # Function-scope import: scikit-learn is only imported when this
    # loader is actually called.
    from sklearn.datasets import load_breast_cancer

    bunch = load_breast_cancer()
    return _load_from_bunch(bunch, include_tgt, tgt_name, names)


def load_iris_df(include_tgt=True, tgt_name="species", names=None):
    """Get the iris dataset.

    Loads the iris dataset into a dataframe with the
    target set as the "species" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------
    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str or unicode, optional (default="species")
        The name of the target feature

    names : iterable or None
        The column names for the dataframe. If not
        defined, will default to the ``feature_names``
        attribute in the sklearn bunch instance.

    Returns
    -------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The loaded iris dataset.
    """
    # Function-scope import: scikit-learn is only imported when this
    # loader is actually called.
    from sklearn.datasets import load_iris

    bunch = load_iris()
    return _load_from_bunch(bunch, include_tgt, tgt_name, names)