# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>

import pandas as pd
import numpy as np

import copy

from .iterables import is_iterable

__all__ = [
    'check_dataframe',
    'type_or_iterable_to_col_mapping',
    'validate_multiple_cols',
    'validate_multiple_rows',
    'validate_test_set_columns'
]


def check_dataframe(X, cols=None, assert_all_finite=False, column_diff=False):
    r"""Check an input dataframe.

    Determine whether an input frame is a Pandas dataframe or whether it
    can be coerced as one, and raise a TypeError if not. Also check for
    finite values if specified. If columns are provided, check that all
    the columns are present within the dataframe and raise a ValueError
    if any are missing.

    **Note**: if ``X`` is not a dataframe (i.e., a list of lists or a numpy
    array), the columns will not be specified when creating a pandas
    dataframe and will thus be integer indices. Any columns provided should
    account for this behavior.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input frame. Should be a pandas DataFrame, numpy ``ndarray`` or
        a similar array-like structure. Any non-pandas structure will be
        attempted to be cast to pandas; if it cannot be cast, it will fail
        with a TypeError.

    cols : list, iterable or None
        Any columns to check for. If this is provided, all columns will
        be checked for presence in the ``X.columns`` index. If any are not
        present, a ValueError will be raised.

    assert_all_finite : bool, optional (default=False)
        Whether to assert that all values within the ``X`` frame are
        finite. Note that if ``cols`` is specified, this will only assert
        that all values in the specified columns are finite.

    column_diff : bool, optional (default=False)
        Whether to also get the columns present in ``X`` that are not
        present in ``cols``. This is returned as the third element in the
        output if ``column_diff`` is True.

    Examples
    --------
    When providing a dataframe and columns, the columns should be present:

    >>> from skoot.datasets import load_iris_df
    >>> df = load_iris_df(include_tgt=False, names=['a', 'b', 'c', 'd'])
    >>> df, cols = check_dataframe(df, cols=('a', 'c'))
    >>> assert cols == ['a', 'c']
    >>> df.head()
         a    b    c    d
    0  5.1  3.5  1.4  0.2
    1  4.9  3.0  1.4  0.2
    2  4.7  3.2  1.3  0.2
    3  4.6  3.1  1.5  0.2
    4  5.0  3.6  1.4  0.2

    When passing numpy arrays, account for the fact that the columns cannot
    be specified when creating the pandas dataframe:

    >>> df2, cols = check_dataframe(df.values, cols=[0, 2])
    >>> cols
    [0, 2]
    >>> df2.columns.tolist()
    [0, 1, 2, 3]
    >>> df2.head()
         0    1    2    3
    0  5.1  3.5  1.4  0.2
    1  4.9  3.0  1.4  0.2
    2  4.7  3.2  1.3  0.2
    3  4.6  3.1  1.5  0.2
    4  5.0  3.6  1.4  0.2

    If you want to get the ``column_diff``, or the left-out columns, this
    will be returned as a third element in the tuple when specified:

    >>> df2, cols, diff = check_dataframe(df.values, [0, 2], column_diff=True)
    >>> cols
    [0, 2]
    >>> df2.columns.tolist()
    [0, 1, 2, 3]
    >>> diff
    [1, 3]
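
    If ``assert_all_finite`` is True, any non-finite value in the checked
    columns raises a ValueError. A small sketch (the NaN is injected here
    purely for illustration):

    >>> bad = df.copy()
    >>> bad.loc[0, 'a'] = np.nan
    >>> check_dataframe(bad, cols=['a'], assert_all_finite=True)
    Traceback (most recent call last):
    ...
    ValueError: Expected all entries in specified columns to be finite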

    Returns
    -------
    X_copy : DataFrame
        A copy of the ``X`` dataframe.

    cols : list
        The list of columns on which to apply a function to this dataframe.
        If ``cols`` was specified in the function, this is equal to ``cols``
        as a list. Else, it's the ``X.columns`` index.

    diff : list
        If ``column_diff`` is True, the third position in the tuple: the
        columns that are within ``X`` but NOT present in ``cols``.
    """
    # determine if it's currently a DF or if it needs to be cast as one
    if not isinstance(X, pd.DataFrame):
        if not is_iterable(X):
            raise TypeError("X must be a DataFrame, iterable or np.ndarray, "
                            "but got type=%s" % type(X))

        # Old behavior:
        # if cols is not None:
        #     raise ValueError("When X is not a DataFrame, cols cannot be "
        #                      "defined. Either pre-cast your data to "
        #                      "Pandas, or pass cols=None.")
        #
        # Discussion (feel free to add below):
        # * Skoot is intended to speed things up and make life easier.
        #   Unnecessary constraints like this make life more difficult and
        #   add work for the user. I vote we do away with this constraint.
        #   This will allow users to pipe a sklearn transformer into a skoot
        #   transformer and use numeric columns as indices rather than
        #   having to pipe into a DF transformer FIRST. ALSO the next stage
        #   makes sure the columns they pass are valid, so as long as they
        #   pass integers, this should be totally fine.
        X = pd.DataFrame.from_records(X)

    # if columns are provided, check they are all present in the frame
    present_columns = set(X.columns)
    if cols is not None:
        # ensure iterable, or copy if not
        cols = copy.deepcopy(cols) if is_iterable(cols) else [cols]

        # better to use "any" since it will short-circuit!
        if any(c not in present_columns for c in cols):
            raise ValueError("All columns in `cols` must be present in X. "
                             "X columns=%r" % present_columns)

    # otherwise, if not specified, make sure we define it since we
    # end up returning cols (this is converted to a list in the next step)
    else:
        cols = X.columns

    # cols might have been a np.array or might be an Index -- make it a list
    if hasattr(cols, 'tolist'):
        cols = cols.tolist()
    elif not isinstance(cols, list):
        cols = list(cols)

    # if specified, check that all values are finite
    if assert_all_finite and \
            X[cols].apply(lambda x: (~np.isfinite(x)).sum()).sum() > 0:
        raise ValueError('Expected all entries in specified columns '
                         'to be finite')

    # get the copy of X to return
    X_copy = X.copy()

    # if column_diff is requested, we need to get it...
    if column_diff:
        colset = set(cols)
        # iterate X.columns, not present_columns, to preserve order
        diff = [c for c in X.columns if c not in colset]  # O(1) lookup (set)
        return X_copy, cols, diff

    return X_copy, cols


def type_or_iterable_to_col_mapping(cols, param, param_name,
                                    permitted_scalar_types):
    """Map a parameter to various columns in a dict.

    Many estimators accept either scalar values or iterables as parameters
    to allow for different values across different features. This function
    creates a dictionary mapping column names to parameter values, and
    validates scalars against the tuple of permitted scalar types.

    Note: this is primarily intended to be an internal method.

    Parameters
    ----------
    cols : list
        The list of columns against which to map some function or
        parameters.

    param : int, float, str, iterable or object
        The parameter value.

    param_name : str or unicode
        The name of the parameter.

    permitted_scalar_types : type or iterable
        The permitted types.

    Examples
    --------
    >>> cols = ["a", "c"]
    >>> ticm = type_or_iterable_to_col_mapping  # too many characters...
    >>> assert ticm(cols, 0.5, "n_components", float) == {'a': 0.5, 'c': 0.5}
    >>> assert ticm(cols, "uniform", "strategy", str) == {'a': 'uniform',
    ...                                                   'c': 'uniform'}
    >>> assert ticm(cols, [3, 5], "q", int) == {'a': 3, 'c': 5}
    >>> assert ticm(cols, {"a": 3, "c": 5}, "q", int) == {'a': 3, 'c': 5}

    Returns
    -------
    param : dict
        The param dictionary.
    """
    # we need permitted scalar types to be a tuple of allowed types
    # (an instance of tuple, not the class itself)
    if not isinstance(permitted_scalar_types, tuple):
        permitted_scalar_types = (permitted_scalar_types,)

    # validate the parameter
    if is_iterable(param):
        # first smoke test is easy -- if the length of the iterable does
        # not match the number of columns prescribed, raise
        if len(param) != len(cols):
            raise ValueError("Dim mismatch between cols and %s" % param_name)

        # next, we're concerned with whether the param iterable is a dict,
        # and if it is, we have to validate the keys are all there...
        if isinstance(param, dict):
            # get sets of the columns and keys so we can easily compare
            scols = set(cols)
            skeys = set(param.keys())

            # if there are extra keys (skeys - scols) or missing keys
            # from the prescribed columns (scols - skeys) we have to raise
            if scols - skeys or skeys - scols:
                raise ValueError("When %s is provided as a dictionary "
                                 "its keys must match the provided cols."
                                 % param_name)

        # otherwise it's a non-dict iterable, and what we ultimately
        # want IS a dictionary
        else:
            param = dict(zip(cols, param))

    else:
        if not isinstance(param, permitted_scalar_types):
            raise TypeError("Permitted types for %s if not iterable: %s"
                            % (param_name, str(permitted_scalar_types)))

        # make it into a dictionary mapping each col to the scalar param
        param = {c: param for c in cols}

    return param


def validate_multiple_cols(clsname, cols):
    """Validate that there are at least two columns to evaluate.

    This is used for various feature selection techniques, as well as
    in several feature extraction techniques.

    Parameters
    ----------
    clsname : str or unicode
        The name of the class that is calling the function.
        Used for more clear error messages.

    cols : array-like, shape=(n_features,)
        The columns to evaluate. If ``cols`` contains fewer than two
        columns, a ``ValueError`` will be raised.
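
    Examples
    --------
    A call with a single column raises (``"DemoSelector"`` is just an
    illustrative class name):

    >>> validate_multiple_cols("DemoSelector", ['a'])  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: DemoSelector requires at least two features...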
"""
if len(cols) < 2:
raise ValueError('%s requires at least two features. Your data '
'(or the passed ``cols`` parameter) includes too '
'few features (%i)' % (clsname, len(cols)))


def validate_multiple_rows(clsname, frame):
    """Validate that there are at least two samples to evaluate.

    This is used for various feature transformation techniques, such as
    the Box-Cox and Yeo-Johnson transformations.

    Parameters
    ----------
    clsname : str or unicode
        The name of the class that is calling the function.
        Used for more clear error messages.

    frame : array-like or pd.DataFrame, shape=(n_samples, n_features)
        The samples to evaluate. If the frame contains fewer than two
        samples, a ValueError will be raised.
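
    Examples
    --------
    A single-row frame raises (the class name and frame here are purely
    illustrative):

    >>> import pandas as pd
    >>> one_row = pd.DataFrame({'a': [1.0]})
    >>> validate_multiple_rows("DemoTransformer", one_row)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: DemoTransformer requires at least two samples...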
"""
n_samples = frame.shape[0]
if n_samples < 2:
raise ValueError('%s requires at least two samples. Your data '
'includes too few samples (%i)'
% (clsname, n_samples))


def validate_test_set_columns(fit_columns, test_columns):
    """Validate that the test set columns will work.

    This function checks that all of the ``fit_columns`` are present in
    ``test_columns`` and raises a ValueError if any are missing.

    Parameters
    ----------
    fit_columns : list or iterable
        The column names the estimator was fit on.

    test_columns : list or iterable
        The column names the test set contains.
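
    Examples
    --------
    Columns that were fit on but are missing at test time raise a
    ValueError:

    >>> validate_test_set_columns(['a', 'b'], ['a', 'b', 'c'])  # passes
    >>> validate_test_set_columns(['a', 'd'], ['a', 'b'])  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: Not all fit columns present in test data! ...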
"""
present_cols = set(test_columns) # O(1) lookup
if not all(t in present_cols for t in fit_columns):
raise ValueError("Not all fit columns present in test data! "
"(expected=%r, present=%r)"
% (fit_columns, test_columns))


# nose problem: keep nose from collecting this function as a test
# (its name contains "test")
validate_test_set_columns.__test__ = False