Source code for skoot.feature_extraction.dates

# -*- coding: utf-8 -*-
#
# Feature engineering for dates

from .base import BaseCompoundFeatureDeriver
from ..base import BasePDTransformer
from ..utils.validation import check_dataframe, validate_test_set_columns
from ..utils.iterables import ensure_iterable
from ..utils.series import is_datetime_type
from ..utils.dataframe import dataframe_or_array
from ..utils.metaestimators import timed_instance_method

from sklearn.utils.validation import check_is_fitted
from itertools import combinations

import numpy as np
import pandas as pd

# Public names of this module: the two transformer classes defined below.
# The underscore-prefixed helper functions are intentionally not exported.
__all__ = [
    "DateFactorizer",
    "TimeDeltaFeatures"
]


def _factorize(X, cols, feature_names, sep):
    right_side = None
    for col in cols:
        series = X[col]

        # Now the real challenge here is that some of the columns passed
        # may not be datetimes, which is required for this transformer.
        if not is_datetime_type(series):
            raise ValueError("The DateFactorizer requires passed columns "
                             "to be DateTime types. Consider using the "
                             "skoot.preprocessing.DateTransformer first.")

        # First, just extract each individual component from the date
        feat = np.asarray(
            series.apply(
                lambda d: [getattr(d, f)
                           for f in feature_names]).values.tolist())

        pd_features = pd.DataFrame.from_records(
            feat, columns=["%s%s%s" % (col, sep, feature)
                           for feature in feature_names])

        # Our single feature has just become a matrix. We'll make it into
        # a pandas frame that keeps getting concatenated together
        if right_side is None:  # first pass
            right_side = pd_features
        else:
            # No need to reset index here since right_side will be 0, 1, .., N
            right_side = pd.concat([right_side, pd_features], axis=1)

    # concat to the original df. we DO need to reset index of right_side here.
    right_side.index = X.index
    X = pd.concat([X, right_side], axis=1)
    return X


class DateFactorizer(BasePDTransformer):
    """Extract new features from datetime features.

    Operates on datetime columns and derives one new column per requested
    date component (such as "year" or "month"). The derived columns can then
    be one-hot encoded or passed on to other pre-processing steps.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Will apply to all columns if None specified. The columns applied-to
        must be DateTime types or a ValueError is raised.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.

    drop_original : bool, optional (default=True)
        Whether to drop the original date columns from the frame before
        returning from the ``transform`` method.

    sep : str or unicode, optional (default="_")
        The separator between the existing feature name and the extracted
        feature. E.g., for a feature named "Transaction" and
        ``features=("year", "month")``, the original variable is split into
        "Transaction_year" and "Transaction_month".

    features : iterable, optional (default=("year", "month", "day", "hour"))
        The features to extract. These are read as attributes of each date
        value; an invalid name raises an AttributeError.

    Examples
    --------
    >>> import pandas as pd
    >>> from datetime import datetime as dt
    >>> strp = dt.strptime
    >>> data = [
    ...     [1, strp("06-01-2018", "%m-%d-%Y")],
    ...     [2, strp("06-02-2018", "%m-%d-%Y")],
    ...     [3, strp("06-03-2018", "%m-%d-%Y")],
    ...     [4, strp("06-04-2018", "%m-%d-%Y")],
    ...     [5, None]
    ... ]
    >>> df = pd.DataFrame.from_records(data, columns=["a", "b"])
    >>> DateFactorizer(cols=['b']).fit_transform(df)
       a  b_year  b_month  b_day  b_hour
    0  1  2018.0      6.0    1.0     0.0
    1  2  2018.0      6.0    2.0     0.0
    2  3  2018.0      6.0    3.0     0.0
    3  4  2018.0      6.0    4.0     0.0
    4  5     NaN      NaN    NaN     NaN

    Attributes
    ----------
    fit_cols_ : list
        The columns the transformer was fit on.
    """

    def __init__(self, cols=None, as_df=True, drop_original=True, sep="_",
                 features=("year", "month", "day", "hour")):

        super(DateFactorizer, self).__init__(
            cols=cols, as_df=as_df)

        self.sep = sep
        self.features = features
        self.drop_original = drop_original

    def _append_date_parts(self, frame, date_cols):
        """Factorize ``date_cols`` and optionally drop the originals."""
        feature_names = ensure_iterable(self.features)
        expanded = _factorize(
            frame, date_cols, feature_names, self.sep)  # type: pd.DataFrame

        # remove the original columns if necessary
        if self.drop_original:
            expanded = expanded.drop(date_cols, axis=1)
        return expanded

    # Not decorated: this simply delegates to fit_transform, which already
    # carries the timing decorator.
    def fit(self, X, y=None):
        """Fit the date factorizer.

        Fitting is not strictly necessary for this transformer; it serves as
        a validation stage that confirms the configured ``cols`` really are
        datetime columns. It is implemented by running ``fit_transform`` and
        discarding the result.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : DateFactorizer
            The fitted transformer.
        """
        self.fit_transform(X, y)
        return self

    @timed_instance_method(attribute_name="fit_time_")
    def fit_transform(self, X, y=None, **kwargs):
        """Fit the estimator and apply the date factorization to a dataframe.

        The "fit" half only validates that the configured columns are
        datetime types and records them in ``fit_cols_``; the factorized
        frame is computed along the way and returned.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The operation will be applied to a
            copy of the input data, and the result will be returned.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result set
            is returned.
        """
        frame, fit_cols = check_dataframe(
            X, cols=self.cols, assert_all_finite=False)

        # compute the factorized features and unify with the original DF
        frame = self._append_date_parts(frame, fit_cols)

        # only mark the estimator as fitted once the columns validated
        self.fit_cols_ = fit_cols
        return dataframe_or_array(frame, self.as_df)

    def transform(self, X):
        """Apply the date transformation to a dataframe.

        Extracts the components named in ``features`` from the columns the
        transformer was fit on.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will be applied to
            a copy of the input data, and the result will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result set
            is returned.
        """
        check_is_fitted(self, "fit_cols_")
        frame, _ = check_dataframe(X, cols=self.cols)

        # validate that fit cols in test set
        fit_cols = self.fit_cols_
        validate_test_set_columns(fit_cols, frame.columns)

        # compute the factorized features and unify with the original DF
        frame = self._append_date_parts(frame, fit_cols)
        return dataframe_or_array(frame, self.as_df)
def _time_between(X, cols, units, absolute, astype, suffix, sep): right_side = None # Do one pass to make sure they're all date times first (O(N)) for col in cols: series = X[col] # Some of the columns passed may not be datetimes, which is # required for this transformer. We will just raise here for that... if not is_datetime_type(series): raise ValueError("The TimeBetweenEvents transformer requires " "passed columns to be DateTime types. Consider " "using the skoot.preprocessing.DateTransformer " "first.") # Confirm the units to_unit = {'seconds': lambda d, s: s, # already have seconds 'minutes': lambda d, s: s / 60, 'hours': lambda d, s: s / 3600, 'days': lambda d, s: d}[units] # can fail with KeyError # Now do the combinatorial pass for col_a, col_b in combinations(cols, 2): # Subtract right from left, and then get the absolute, if needed delta = (X[col_a] - X[col_b]) days, seconds = delta.dt.days, delta.dt.total_seconds() diff = to_unit(days, seconds).astype(astype) if absolute: diff = diff.abs() # Set the name and make it a dataframe diff.name = "%s%s%s_%s" % (col_a, sep, col_b, suffix) diff = pd.DataFrame(diff) # Our single feature has just become a matrix. We'll make it into # a pandas frame that keeps getting concatenated together if right_side is None: # first pass right_side = diff else: # No need to reset index here since right_side will be 0, 1, .., N right_side = pd.concat([right_side, diff], axis=1) # concat to the original df. we DO need to reset index of right_side here. right_side.index = X.index X = pd.concat([X, right_side], axis=1) return X
class TimeDeltaFeatures(BaseCompoundFeatureDeriver):
    r"""Compute the time lapse between timestamp events.

    A transformer to compute time deltas between different date features.
    This can be useful, for instance, when the target is temporally
    sensitive to the lapse in time between certain events.

    The deltas are calculated combinatorially between the features,
    expanding the dimensionality by :math:`{N \choose 2}`, where :math:`N`
    is the number of features included in ``cols``.

    Note that prescribed column order *does* matter in this transformer, as
    deltas are computed from left to right::

        ['a', 'b', 'c'] -> ['a_b_delta', 'a_c_delta', 'b_c_delta']

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Will apply to all columns if None specified. The columns applied-to
        must be DateTime types or a ValueError is raised.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.

    units : str or unicode, optional (default='days')
        The unit of time to compute between events. One of
        ('seconds', 'minutes', 'hours', 'days').

    sep : str or unicode (optional, default="_")
        The separator used when building the new feature names. Each name
        is made of the left column, the right column and ``name_suffix``.
        For example, for columns 'a' and 'b', ``sep="_"`` and
        ``name_suffix="delta"``, the new column name would be::

            a_b_delta

    astype : type, optional (default=float)
        The type to which to coerce the time deltas.

    absolute_difference : bool, optional (default=False)
        Whether to compute the absolute difference between dates. If False,
        the order of ``cols`` will matter, as that defines the subtractive
        order (right-most columns are subtracted from the left,
        combinatorially).

    name_suffix : str, optional (default='delta')
        The suffix that ends each new feature name. See ``sep`` for more
        details about how new column names are formed.

    Notes
    -----
    * Unlike the :class:`DateFactorizer` class, this transformer does not
      remove the original date features after extracting the new features.

    * Column deltas are computed from left to right. This means that the
      order in which columns are defined in ``cols`` *does* matter.

    Examples
    --------
    >>> import pandas as pd
    >>> from datetime import datetime as dt
    >>> stp = dt.strptime
    >>> data = [
    ...     [1, stp("06-01-2018", "%m-%d-%Y"), stp("06-02-2018", "%m-%d-%Y")],
    ...     [2, stp("06-02-2018", "%m-%d-%Y"), stp("06-03-2018", "%m-%d-%Y")],
    ...     [3, stp("06-03-2018", "%m-%d-%Y"), stp("06-04-2018", "%m-%d-%Y")],
    ...     [4, stp("06-04-2018", "%m-%d-%Y"), stp("06-05-2018", "%m-%d-%Y")],
    ...     [5, None, stp("06-04-2018", "%m-%d-%Y")]
    ... ]
    >>> df = pd.DataFrame.from_records(data, columns=['a', 'b', 'c'])
    >>> tdf = TimeDeltaFeatures(cols=['b', 'c'], units='hours')
    >>> tdf.fit_transform(df)
       a          b          c  b_c_delta
    0  1 2018-06-01 2018-06-02      -24.0
    1  2 2018-06-02 2018-06-03      -24.0
    2  3 2018-06-03 2018-06-04      -24.0
    3  4 2018-06-04 2018-06-05      -24.0
    4  5        NaT 2018-06-04        NaN

    Notice that column order makes a difference. If 'c' is defined before
    'b', the delta is positive:

    >>> TimeDeltaFeatures(cols=['c', 'b'], units='hours').fit_transform(df)
       a          b          c  c_b_delta
    0  1 2018-06-01 2018-06-02       24.0
    1  2 2018-06-02 2018-06-03       24.0
    2  3 2018-06-03 2018-06-04       24.0
    3  4 2018-06-04 2018-06-05       24.0
    4  5        NaT 2018-06-04        NaN
    """

    def __init__(self, cols=None, as_df=True, units='days', sep="_",
                 astype=float, absolute_difference=False,
                 name_suffix="delta"):

        super(TimeDeltaFeatures, self).__init__(
            cols=cols, as_df=as_df,
            sep=sep, name_suffix=name_suffix)

        self.absolute_difference = absolute_difference
        self.astype = astype
        self.units = units

    def _append_deltas(self, frame, date_cols):
        """Run ``_time_between`` on ``frame`` with this instance's settings."""
        return _time_between(
            frame, date_cols, self.units,
            absolute=self.absolute_difference,
            astype=self.astype,
            suffix=self.name_suffix,
            sep=self.sep)  # type: pd.DataFrame

    # Not decorated: this simply delegates to fit_transform, which already
    # carries the timing decorator.
    def fit(self, X, y=None):
        """Fit the time-between transformer.

        Fitting is not strictly necessary for this transformer; it serves as
        a validation stage that confirms the configured ``cols`` really are
        datetime columns. It is implemented by running ``fit_transform`` and
        discarding the result.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : TimeDeltaFeatures
            The fitted transformer.
        """
        self.fit_transform(X, y)
        return self

    @timed_instance_method(attribute_name="fit_time_")
    def fit_transform(self, X, y=None, **kwargs):
        """Fit the estimator and compute the time deltas for a dataframe.

        The "fit" half only validates that the configured columns are
        datetime types and records them in ``fit_cols_``; the frame with the
        delta columns is computed along the way and returned.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The operation will be applied to a
            copy of the input data, and the result will be returned.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result set
            is returned.
        """
        frame, fit_cols = check_dataframe(
            X, cols=self.cols, assert_all_finite=False)

        # For each pair of columns, compute the time between
        frame = self._append_deltas(frame, fit_cols)

        # only mark the estimator as fitted once the columns validated
        self.fit_cols_ = fit_cols
        return dataframe_or_array(frame, self.as_df)

    def transform(self, X):
        """Apply the date transformation to a dataframe.

        Computes the deltas between the datetime features the transformer
        was fit on.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will be applied to
            a copy of the input data, and the result will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result set
            is returned.
        """
        check_is_fitted(self, "fit_cols_")
        frame, _ = check_dataframe(X, cols=self.cols)

        # validate that fit cols in test set
        fit_cols = self.fit_cols_
        validate_test_set_columns(fit_cols, frame.columns)

        # compute the delta features and unify with the original DF
        frame = self._append_deltas(frame, fit_cols)
        return dataframe_or_array(frame, self.as_df)