# Source code for skoot.utils.dataframe

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np

# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    'dataframe_or_array',
    'get_categorical_columns',
    'get_continuous_columns',
    'get_datetime_columns',
    'get_numeric_columns',
    'safe_drop_samples',
    'safe_mask_samples',
    'safe_vstack'
]


def dataframe_or_array(X, as_df):
    """Get a dataframe or numpy array.

    If the ``as_df`` param is True, returns a Pandas dataframe. Otherwise
    returns the underlying numpy array values.

    Parameters
    ----------
    X : DataFrame
        The Pandas dataframe

    as_df : bool
        Whether to return a dataframe

    Returns
    -------
    pd.DataFrame or np.ndarray
        ``X`` itself when ``as_df`` is truthy, otherwise ``X.values``.

    Raises
    ------
    AssertionError
        If ``X`` is not a ``pd.DataFrame``.
    """
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # removed when Python runs with -O, which would silently disable this
    # check. The exception type and message are kept for existing callers.
    if not isinstance(X, pd.DataFrame):
        raise AssertionError("Expected X to be a DataFrame")
    return X if as_df else X.values
def get_categorical_columns(X):
    """Select the categorical features of a pandas DataFrame.

    Returns the sub-frame made of the columns whose dtype belongs to the
    ``object`` or ``category`` family.

    Parameters
    ----------
    X : pd.DataFrame
        The input dataframe.

    Returns
    -------
    pd.DataFrame
        The columns of ``X`` with an ``object`` or ``category`` dtype.
    """
    categorical_dtypes = ['object', 'category']
    return X.select_dtypes(include=categorical_dtypes)
def get_continuous_columns(X):
    """Get all continuous features from a pandas DataFrame.

    This function selects all numeric columns from a pandas DataFrame
    that are within the ``float`` family, i.e. whose ``dtype`` is a
    sub-type of ``np.floating`` (``float16``, ``float32``, ``float64``...).

    Parameters
    ----------
    X : pd.DataFrame
        The input dataframe.

    Returns
    -------
    pd.DataFrame
        The floating-point columns of ``X``.
    """
    # ``np.floating`` is the abstract parent of every numpy float type, so
    # the whole family is matched regardless of precision (the builtin
    # ``float`` only maps to specific widths). This mirrors the use of
    # ``np.number`` for selecting all numeric columns.
    return X.select_dtypes(include=[np.floating])
def get_datetime_columns(X):
    """Select the datetime features of a pandas DataFrame.

    Returns the sub-frame made of the columns whose dtype is within the
    ``np.datetime64`` family.

    Parameters
    ----------
    X : pd.DataFrame
        The input dataframe.

    Returns
    -------
    pd.DataFrame
        The columns of ``X`` with a ``np.datetime64`` dtype.
    """
    # NOTE(review): pandas documents a separate 'datetimetz' selector for
    # timezone-aware columns, so those are presumably not returned here --
    # confirm whether that is intended.
    datetime_dtypes = [np.datetime64]
    return X.select_dtypes(include=datetime_dtypes)
def get_numeric_columns(X):
    """Select the numeric columns of a pandas DataFrame.

    A column counts as numeric when its ``dtype`` is a ``np.number``.

    Parameters
    ----------
    X : pd.DataFrame
        The input dataframe.

    Returns
    -------
    pd.DataFrame
        The columns of ``X`` with a ``np.number`` dtype.
    """
    numeric_dtypes = [np.number]
    return X.select_dtypes(include=numeric_dtypes)
def safe_drop_samples(X, drop_samples):
    """Remove samples (rows) from a matrix.

    Works on either a np.ndarray or a pd.DataFrame and returns a copy of
    the data without the requested samples.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The array from which to drop records.

    drop_samples : array-like, shape=(n_samples,)
        The indices to drop.

    Returns
    -------
    pd.DataFrame or np.ndarray
        ``X`` without the dropped rows.

    Notes
    -----
    For a DataFrame, ``drop_samples`` is handed to ``DataFrame.drop`` and
    is therefore matched against the index labels; for anything else it
    is handed to ``np.delete`` and is positional.
    """
    if not isinstance(X, pd.DataFrame):
        return np.delete(X, drop_samples, axis=0)
    return X.drop(drop_samples, axis=0)
def safe_mask_samples(X, mask):
    """Select samples (rows) from a matrix from a mask.

    Select observations from a np.ndarray or pd.DataFrame by using a
    mask. This creates a copy of X. Rows are always selected by position,
    whatever the index of a DataFrame looks like.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The array from which to select records.

    mask : array-like, shape=(n_samples,)
        The boolean mask.

    Returns
    -------
    pd.DataFrame or np.ndarray
        The rows of ``X`` for which ``mask`` is True.
    """
    # Converting to an ndarray strips any pandas index from the mask, so
    # no label alignment can take place below.
    mask = np.asarray(mask)
    if isinstance(X, pd.DataFrame):
        # ``iloc`` is purely positional and accepts a boolean ndarray.
        # Routing the mask through ``X.index`` would feed index *labels*
        # to a positional indexer, which fails (or selects the wrong rows)
        # whenever the index is not the default 0..n-1 range.
        return X.iloc[mask]
    return X[mask, :]
def safe_vstack(a, b):
    """Vertically stack two arrays.

    Handles np.ndarrays as well as pd.DataFrames. The types of both
    inputs must match!

    Parameters
    ----------
    a : array-like, shape=(n_samples, n_features)
        The array that ends up on top.

    b : array-like, shape=(n_samples, n_features)
        The array that ends up below ``a``.

    Returns
    -------
    pd.DataFrame or np.ndarray
        A DataFrame when both inputs are DataFrames, else the result of
        ``np.vstack``.
    """
    both_frames = isinstance(a, pd.DataFrame) and isinstance(b, pd.DataFrame)

    # pd.concat is only an option when BOTH inputs are DataFrames;
    # otherwise at least one of them is (presumably) a numpy array
    if not both_frames:
        return np.vstack([a, b])
    return pd.concat([a, b], axis=0)