# -*- coding: utf-8 -*-
#
# Feature engineering for dates
from .base import BaseCompoundFeatureDeriver
from ..base import BasePDTransformer
from ..utils.validation import check_dataframe, validate_test_set_columns
from ..utils.iterables import ensure_iterable
from ..utils.series import is_datetime_type
from ..utils.dataframe import dataframe_or_array
from ..utils.metaestimators import timed_instance_method
from sklearn.utils.validation import check_is_fitted
from itertools import combinations
import numpy as np
import pandas as pd
# Public names exported by a star-import of this module. The
# underscore-prefixed helper functions in this file are not listed here.
__all__ = [
"DateFactorizer",
"TimeDeltaFeatures"
]
def _factorize(X, cols, feature_names, sep):
    """Append the extracted datetime components of ``cols`` to ``X``.

    For every column in ``cols``, each attribute named in ``feature_names``
    is read off of every element of the column, producing one new column
    per (column, feature) pair named ``<col><sep><feature>``. The new
    columns are appended to the right of ``X``; the original columns are
    NOT dropped by this function.

    Parameters
    ----------
    X : pd.DataFrame
        The frame holding the datetime columns.

    cols : iterable
        The names of the columns in ``X`` to expand. May be empty, in
        which case no columns are added.

    feature_names : iterable of str
        Attribute names to extract from each element (e.g. "year").

    sep : str
        The separator placed between the column name and feature name.

    Returns
    -------
    pd.DataFrame
        A new frame: ``X`` with the derived columns concatenated on the
        right, sharing ``X``'s index.

    Raises
    ------
    ValueError
        If ``is_datetime_type`` rejects any of the requested columns.

    AttributeError
        If an element does not expose one of ``feature_names``.
    """
    # Materialize once: the names are re-iterated for every element of
    # every column, so a one-shot iterable would be exhausted immediately.
    feature_names = list(feature_names)

    # Collect every piece and concatenate a single time at the end. Seeding
    # the list with X means an empty ``cols`` simply returns a copy of X
    # rather than failing on a missing right-hand side.
    frames = [X]
    for col in cols:
        series = X[col]

        # Some of the columns passed may not be datetimes, which is
        # required for this transformer.
        if not is_datetime_type(series):
            raise ValueError("The DateFactorizer requires passed columns "
                             "to be DateTime types. Consider using the "
                             "skoot.preprocessing.DateTransformer first.")

        # Extract each individual component from the date; every element
        # becomes one row of extracted values.
        feat = np.asarray(
            series.apply(
                lambda d: [getattr(d, f)
                           for f in feature_names]).values.tolist())

        pd_features = pd.DataFrame.from_records(
            feat, columns=["%s%s%s" % (col, sep, feature)
                           for feature in feature_names])

        # The freshly built frame is indexed 0, 1, .., N; align it with X
        # so the column-wise concat below lines the rows up positionally.
        pd_features.index = X.index
        frames.append(pd_features)

    return pd.concat(frames, axis=1)
class DateFactorizer(BasePDTransformer):
    """Extract new features from datetime features.

    Automatically extract new features from datetime features. This class
    operates on datetime series objects and extracts features such as "year",
    "month", etc. These can then be expanded via one-hot encoding or further
    processed via other pre-processing techniques.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Will apply to all columns if None specified. Note that in this class,
        the columns applied-to must be DateTime types or this will raise a
        ValueError.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.

    drop_original : bool, optional (default=True)
        Whether to drop the original features from the dataframe prior to
        returning from the ``transform`` method.

    sep : str or unicode, optional (default="_")
        The string separator between the existing feature name and the
        extracted feature. E.g., for a feature named "Transaction" and for
        ``features=("year", "month")``, the original variable will be split
        into two new ones: "Transaction_year" and "Transaction_month".

    features : iterable, optional (default=("year", "month", "day", "hour"))
        The features to extract. These are attributes of the DateTime class
        and will raise an AttributeError if an invalid feature is passed.

    Examples
    --------
    >>> import pandas as pd
    >>> from datetime import datetime as dt
    >>> strp = dt.strptime
    >>> data = [
    ...     [1, strp("06-01-2018", "%m-%d-%Y")],
    ...     [2, strp("06-02-2018", "%m-%d-%Y")],
    ...     [3, strp("06-03-2018", "%m-%d-%Y")],
    ...     [4, strp("06-04-2018", "%m-%d-%Y")],
    ...     [5, None]
    ... ]
    >>> df = pd.DataFrame.from_records(data, columns=["a", "b"])
    >>> DateFactorizer(cols=['b']).fit_transform(df)
       a  b_year  b_month  b_day  b_hour
    0  1  2018.0      6.0    1.0     0.0
    1  2  2018.0      6.0    2.0     0.0
    2  3  2018.0      6.0    3.0     0.0
    3  4  2018.0      6.0    4.0     0.0
    4  5     NaN      NaN    NaN     NaN

    Attributes
    ----------
    fit_cols_ : list
        The columns the transformer was fit on.
    """
    def __init__(self, cols=None, as_df=True, drop_original=True, sep="_",
                 features=("year", "month", "day", "hour")):

        # ``cols`` and ``as_df`` are handled by the parent transformer
        super(DateFactorizer, self).__init__(
            cols=cols, as_df=as_df)

        # Remaining options are stored untouched on the instance
        self.drop_original = drop_original
        self.sep = sep
        self.features = features

    # Don't decorate this one, since it calls fit_transform, which is decorated
    # NOTE(review): ``fit_transform`` is not defined in the visible part of
    # this class; it is presumably provided elsewhere -- confirm.
    def fit(self, X, y=None):
        """Fit the date factorizer.

        This is a tricky class because the "fit" isn't super necessary...
        But we use it as a validation stage to ensure the defined cols
        genuinely are datetime columns. That's the only reason this all
        happens in the fit portion.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : DateFactorizer
            This instance, after delegating to ``fit_transform`` (whose
            result is discarded).
        """
        self.fit_transform(X, y)
        return self
def _time_between(X, cols, units, absolute, astype, suffix, sep):
    """Append pairwise time deltas between the columns in ``cols`` to ``X``.

    Every 2-combination ``(col_a, col_b)`` of ``cols`` (in the order given)
    produces one new column holding ``X[col_a] - X[col_b]`` expressed in
    ``units``, named ``<col_a><sep><col_b><sep><suffix>``. The original
    columns are kept.

    Parameters
    ----------
    X : pd.DataFrame
        The frame holding the datetime columns.

    cols : iterable
        The names of the columns to difference. With fewer than two
        columns there are no pairs, and no columns are added.

    units : str
        One of 'seconds', 'minutes', 'hours' or 'days'. 'days' uses the
        day component of the delta (``.dt.days``); the others are derived
        from ``.dt.total_seconds()``.

    absolute : bool
        Whether to take the absolute value of each delta.

    astype : type
        The type to which each delta column is cast.

    suffix : str
        The trailing part of each new column name.

    sep : str
        The separator between the parts of each new column name.

    Returns
    -------
    pd.DataFrame
        A new frame: ``X`` with the delta columns concatenated on the right.

    Raises
    ------
    ValueError
        If ``is_datetime_type`` rejects any of the requested columns.

    KeyError
        If ``units`` is not one of the supported unit names.
    """
    # ``cols`` is walked twice (validation, then pairing), so make sure a
    # one-shot iterable is not exhausted by the first pass.
    cols = list(cols)

    # Do one pass to make sure they're all date times first (O(N))
    for col in cols:
        # Some of the columns passed may not be datetimes, which is
        # required for this transformer. We will just raise here for that...
        if not is_datetime_type(X[col]):
            raise ValueError("The TimeDeltaFeatures transformer requires "
                             "passed columns to be DateTime types. Consider "
                             "using the skoot.preprocessing.DateTransformer "
                             "first.")

    # Confirm the units. Each converter receives (days, total_seconds).
    to_unit = {'seconds': lambda d, s: s,  # already have seconds
               'minutes': lambda d, s: s / 60,
               'hours': lambda d, s: s / 3600,
               'days': lambda d, s: d}[units]  # can fail with KeyError

    # Seed with X so that "no pairs" (fewer than two columns) degrades to
    # returning a copy of X instead of failing on a missing right-hand side.
    frames = [X]

    # Now do the combinatorial pass
    for col_a, col_b in combinations(cols, 2):
        # Subtract right from left, and then get the absolute, if needed
        delta = X[col_a] - X[col_b]
        days, seconds = delta.dt.days, delta.dt.total_seconds()
        diff = to_unit(days, seconds).astype(astype)

        if absolute:
            diff = diff.abs()

        # Name is <left><sep><right><sep><suffix>; %-formatting is used so
        # non-string column labels are still accepted.
        diff.name = "%s%s%s%s%s" % (col_a, sep, col_b, sep, suffix)

        # Each delta is derived from columns of X, so it already carries
        # X's index and lines up in the column-wise concat below.
        frames.append(pd.DataFrame(diff))

    return pd.concat(frames, axis=1)
class TimeDeltaFeatures(BaseCompoundFeatureDeriver):
    r"""Compute the time lapse between timestamp events.

    A transformer to compute time deltas between different date features.
    This can be useful, for instance, when the target is temporally sensitive
    to the lapse in time between certain events.

    This class will combinatorially calculate the deltas between features,
    expanding the dimensionality by :math:`{N \choose 2}`, where :math:`N` is
    the number of features included in ``cols``. Note that prescribed column
    order *does* matter in this transformer, as deltas are computed from left
    to right::

        ['a', 'b', 'c'] -> ['a_b_delta', 'a_c_delta', 'b_c_delta']

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Will apply to all columns if None specified. Note that in this class,
        the columns applied-to must be DateTime types or this will raise a
        ValueError.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.

    units : str or unicode, optional (default='days')
        The unit of time to compute between events. One of
        ('seconds', 'minutes', 'hours', 'days').

    sep : str or unicode (optional, default="_")
        The separator between the new feature names. The names will be in the
        form of::

            <left><sep><right><sep><suffix>

        For example, for columns 'a' and 'b', ``sep="_"`` and
        ``name_suffix="delta"``, the new column name would be::

            a_b_delta

    astype : type, optional (default=float)
        The type to which to coerce the time deltas.

    absolute_difference : bool, optional (default=False)
        Whether to compute the absolute difference between dates. If False,
        the order of ``cols`` will matter, as that defines the subtractive
        order. (right-most columns will be subtracted from the left
        combinatorially)

    name_suffix : str, optional (default='delta')
        The suffix to add to the new feature name. With the default ``sep``
        the name takes the form of::

            <feature_x>_<feature_y>_<suffix>

        See ``sep`` for more details about how new column names are formed.

    Notes
    -----
    * Unlike the :class:`DateFactorizer` class, this transformer does not
      remove the original date features after extracting the new features.

    * Column deltas are computed from left to right. This means that the order
      in which columns are defined in ``cols`` *does* matter.

    Examples
    --------
    >>> import pandas as pd
    >>> from datetime import datetime as dt
    >>> stp = dt.strptime
    >>> data = [
    ...     [1, stp("06-01-2018", "%m-%d-%Y"), stp("06-02-2018", "%m-%d-%Y")],
    ...     [2, stp("06-02-2018", "%m-%d-%Y"), stp("06-03-2018", "%m-%d-%Y")],
    ...     [3, stp("06-03-2018", "%m-%d-%Y"), stp("06-04-2018", "%m-%d-%Y")],
    ...     [4, stp("06-04-2018", "%m-%d-%Y"), stp("06-05-2018", "%m-%d-%Y")],
    ...     [5, None, stp("06-04-2018", "%m-%d-%Y")]
    ... ]
    >>> df = pd.DataFrame.from_records(data, columns=['a', 'b', 'c'])
    >>> tdf = TimeDeltaFeatures(cols=['b', 'c'], units='hours')
    >>> tdf.fit_transform(df)
       a          b          c  b_c_delta
    0  1 2018-06-01 2018-06-02      -24.0
    1  2 2018-06-02 2018-06-03      -24.0
    2  3 2018-06-03 2018-06-04      -24.0
    3  4 2018-06-04 2018-06-05      -24.0
    4  5        NaT 2018-06-04        NaN

    Notice that column order makes a difference. If 'c' is defined before 'b',
    the delta is positive:

    >>> TimeDeltaFeatures(cols=['c', 'b'], units='hours').fit_transform(df)
       a          b          c  c_b_delta
    0  1 2018-06-01 2018-06-02       24.0
    1  2 2018-06-02 2018-06-03       24.0
    2  3 2018-06-03 2018-06-04       24.0
    3  4 2018-06-04 2018-06-05       24.0
    4  5        NaT 2018-06-04        NaN
    """
    def __init__(self, cols=None, as_df=True, units='days', sep="_",
                 astype=float, absolute_difference=False, name_suffix="delta"):

        # Column selection, output type and naming options are handled by
        # the parent class
        super(TimeDeltaFeatures, self).__init__(
            cols=cols, as_df=as_df,
            sep=sep, name_suffix=name_suffix)

        # Delta-specific options are stored untouched on the instance
        self.units = units
        self.astype = astype
        self.absolute_difference = absolute_difference

    # Don't decorate this one, since it calls fit_transform, which is decorated
    # NOTE(review): ``fit_transform`` is not defined in the visible part of
    # this class; it is presumably provided elsewhere -- confirm.
    def fit(self, X, y=None):
        """Fit the time-between transformer.

        This is a tricky class because the "fit" isn't super necessary...
        But we use it as a validation stage to ensure the defined cols
        genuinely are datetime columns. That's the only reason this all
        happens in the fit portion.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : TimeDeltaFeatures
            This instance, after delegating to ``fit_transform`` (whose
            result is discarded).
        """
        self.fit_transform(X, y)
        return self