# Source code for skoot.preprocessing.dates
# -*- coding: utf-8 -*-
#
# Coerce string fields to dates
import six
from sklearn.utils.validation import check_is_fitted
from ..base import BasePDTransformer
from ..utils.validation import type_or_iterable_to_col_mapping
from ..utils.validation import check_dataframe, validate_test_set_columns
from ..utils.metaestimators import timed_instance_method
from ..utils.compat import NoneType
from ..utils.series import is_datetime_type
import pandas as pd
# Public API of this module: only the transformer class is exported.
__all__ = ["DateTransformer"]
def _infer_format_kwargs():
    """Build the keyword arguments used when the date format is inferred.

    ``infer_datetime_format`` is deprecated since pandas 2.0 (strict format
    inference is the default there and passing the flag only triggers a
    warning). On older versions the flag is kept so that the inferred-format
    fast path is still used.
    """
    try:
        major = int(pd.__version__.split(".", 1)[0])
    except ValueError:
        # Unparseable version string (e.g. a local dev build): treat it as a
        # modern pandas and do not pass the deprecated flag.
        return {}
    return {} if major >= 2 else {"infer_datetime_format": True}


# Computed once at import time; the pandas version cannot change afterwards.
_INFER_FORMAT_KWARGS = _infer_format_kwargs()


def _cast_to_datetime(X, cols, formats, allowed_types):
    """Cast the selected columns of ``X`` to datetime.

    Columns that cannot be parsed are not policed here: pandas is left to
    raise its own error for them.

    Parameters
    ----------
    X : pd.DataFrame
        The frame holding the columns to cast. The casted columns are
        assigned back onto this same object (it is modified in place).

    cols : list-like
        The names of the columns to cast.

    formats : dict
        Maps each column name in ``cols`` to a format string, or to None
        if the format should be inferred.

    allowed_types : iterable
        Names of the dtypes that are permitted to be cast.

    Returns
    -------
    X : pd.DataFrame
        The same frame that was passed in, with ``cols`` cast to datetime.

    Raises
    ------
    ValueError
        If the dtype name of a selected column is not in ``allowed_types``.
    """
    # Nothing selected means nothing to cast; do not rely on how
    # ``DataFrame.apply`` treats a frame without columns.
    if len(cols) == 0:
        return X

    def cast(f):
        fmt = formats[f.name]

        # First make sure the type is in allowed types
        dtype = f.dtype.name
        if dtype not in allowed_types:
            raise ValueError("dtype '%s' not in `allowed_types` (%r)"
                             % (dtype, allowed_types))

        # A column that is already a datetime needs no work
        if is_datetime_type(f):
            return f

        # No explicit format: let pandas infer it
        if fmt is None:
            return pd.to_datetime(f, errors='raise', **_INFER_FORMAT_KWARGS)

        # Otherwise the format is defined, so let pandas fail on its own
        # if the column cannot be parsed with it
        return pd.to_datetime(f, format=fmt)

    X[cols] = X[cols].apply(cast)
    return X
class DateTransformer(BasePDTransformer):
    """Cast features to datetime.

    Convert multiple features with potentially differing formats to datetime
    with specified formats or by inferring the formats. Note that unlike most
    other Skoot transformers, this one requires that the output be a DataFrame
    (note the lack of the ``as_df`` constructor arg).

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Will apply to all columns if None specified

    date_format : str, iterable or None, optional (default=None)
        The date format. If None, will infer. If a string, will be used to
        parse the datetime. If an iterable, should contain strings or None
        positionally corresponding to ``cols`` (or a dict mapping columns
        to formats).

    allowed_types : iterable, optional (default=("object", "datetime64[ns]"))
        Permitted Series types. This is used to prevent accidentally casting
        Series of unexpected types to DateTime. For instance, integer types
        can be cast to DateTime even though the behavior may be unexpected.

    Notes
    -----
    The ``fit`` method here is only used for validation that the columns can
    be cast to datetime.

    Examples
    --------
    >>> import pandas as pd
    >>> from datetime import datetime as dt
    >>> data = [
    ...     [1, "06/01/2018", dt.strptime("06-01-2018", "%m-%d-%Y")],
    ...     [2, "06/02/2018", dt.strptime("06-02-2018", "%m-%d-%Y")],
    ...     [3, "06/03/2018", dt.strptime("06-03-2018", "%m-%d-%Y")],
    ...     [4, None, dt.strptime("06-04-2018", "%m-%d-%Y")],
    ...     [5, "06/05/2018", None]
    ... ]
    >>> df = pd.DataFrame.from_records(data, columns=["a", "b", "c"])
    >>> converter = DateTransformer(cols=["b", "c"],
    ...                             date_format=["%m/%d/%Y", None])
    >>> converter.fit_transform(df)
       a          b          c
    0  1 2018-06-01 2018-06-01
    1  2 2018-06-02 2018-06-02
    2  3 2018-06-03 2018-06-03
    3  4        NaT 2018-06-04
    4  5 2018-06-05        NaT

    Attributes
    ----------
    DEFAULT_PERMITTED_DTYPES : tuple
        This is a static attribute that controls what types can be cast.
        These are the default permitted pandas dtypes that are allowed.
        If a column is not one of these types, a ValueError will be raised.
        To cast an int to datetime, for instance, the ``allowed_types`` arg
        will need to include "int64"::

            allowed_types=DateTransformer.DEFAULT_PERMITTED_DTYPES + ('int64',)

    fit_cols_ : list
        The columns the transformer was fit on.

    formats_ : dict
        Maps column name to date format, in case of varying date formats
        passed in the ``date_format`` parameter.
    """
    DEFAULT_PERMITTED_DTYPES = ("object", "datetime64[ns]")

    def __init__(self, cols=None, date_format=None,
                 allowed_types=DEFAULT_PERMITTED_DTYPES):

        # The output is always a frame, hence ``as_df`` is pinned to True
        # rather than exposed as a constructor argument.
        super(DateTransformer, self).__init__(
            cols=cols, as_df=True)

        self.date_format = date_format
        self.allowed_types = allowed_types

    # Don't decorate 'fit' since it calls fit_transform
    def fit(self, X, y=None):
        """Fit the date transformer.

        This is a tricky class because the "fit" isn't super necessary...
        But we use it as a validation stage to ensure the defined cols
        genuinely can be cast to datetime. That's the only reason this all
        happens in the fit portion.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        self : DateTransformer
            The fitted transformer.
        """
        # The casted frame is discarded; the call is only made to validate
        # the columns and to set the fitted attributes.
        self.fit_transform(X, y)
        return self

    @timed_instance_method(attribute_name="fit_time_")
    def fit_transform(self, X, y=None, **kwargs):
        """Fit the estimator and apply the date transformation
        to a dataframe.

        This is a tricky class because the "fit" isn't super necessary...
        But we use it as a validation stage to ensure the defined cols
        genuinely can be cast to datetime. That's the only reason this all
        happens in the fit portion.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        **kwargs : dict
            Accepted but not used by this method.

        Returns
        -------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.

        Raises
        ------
        ValueError
            If one of the selected columns has a dtype that is not in
            ``allowed_types``.
        """
        # Missing values are legitimate in date columns (they become NaT),
        # so the finiteness check is switched off.
        X, cols = check_dataframe(X, cols=self.cols, assert_all_finite=False)

        # Different fields may have different formats, so we have to
        # allow a number of different formats to be passed.
        formats = type_or_iterable_to_col_mapping(
            cols=cols, param=self.date_format,
            param_name="date_format",
            permitted_scalar_types=six.string_types + (NoneType,))

        X = _cast_to_datetime(X, cols, formats, self.allowed_types)

        # Only set the fitted attributes once the cast has succeeded
        self.fit_cols_ = cols
        self.formats_ = formats

        return X

    def transform(self, X):
        """Apply the date transformation to a dataframe.

        This method will cast string features to datetimes as specified by
        the ``date_format`` arg.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.

        Raises
        ------
        ValueError
            If one of the fitted columns has a dtype that is not in
            ``allowed_types``.
        """
        check_is_fitted(self, "fit_cols_")

        # Validate exactly as ``fit_transform`` does: missing values are
        # legitimate in date columns (they become NaT), so data accepted
        # at fit time must not be rejected at transform time.
        X, _ = check_dataframe(X, cols=self.cols, assert_all_finite=False)

        # validate that fit cols in test set
        cols = self.fit_cols_
        validate_test_set_columns(cols, X.columns)

        # transform
        return _cast_to_datetime(X, cols, self.formats_, self.allowed_types)