# -*- coding: utf-8 -*-
#
# skoot.preprocessing.dates
#
# Coerce string fields to dates

import six
from sklearn.utils.validation import check_is_fitted

from ..base import BasePDTransformer
from ..utils.validation import type_or_iterable_to_col_mapping
from ..utils.validation import check_dataframe, validate_test_set_columns
from ..utils.metaestimators import timed_instance_method
from ..utils.compat import NoneType
from ..utils.series import is_datetime_type

import pandas as pd

__all__ = [
    "DateTransformer"
]


def _cast_to_datetime(X, cols, formats, allowed_types):
    # Now the real challenge here is that some of the columns passed
    # may not be date-parseable... we'll duck type it. If it fails, it
    # cannot be parsed, and we will let Pandas raise for that. No sense
    # policing it if they are already doing that.
    def cast(f):
        fmt = formats[f.name]

        # First make sure the type is in allowed types
        dtype = f.dtype.name
        if dtype not in allowed_types:
            raise ValueError("dtype '%s' not in `allowed_types` (%r)"
                             % (dtype, allowed_types))

        # If the series is already a datetime, we can return early.
        # If the format isn't defined we can infer it, otherwise we
        # parse it explicitly
        if is_datetime_type(f):
            return f
        elif fmt is None:
            return pd.to_datetime(f, infer_datetime_format=True,
                                  errors='raise')
        # otherwise the fmt is defined so we'll let it fail out on its own
        # if it cannot cast it
        return pd.to_datetime(f, format=fmt)

    casted = X[cols].apply(cast)
    X[cols] = casted
    return X
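
# Illustrative sketch of how the helper above is expected to behave on a toy
# frame with a single string column; the column name "d" and the format
# string below are hypothetical, for illustration only:
#
#   >>> _df = pd.DataFrame({"d": ["2018-06-01", None]})
#   >>> out = _cast_to_datetime(_df, cols=["d"],
#   ...                         formats={"d": "%Y-%m-%d"},
#   ...                         allowed_types=("object", "datetime64[ns]"))
#   >>> out["d"].dtype
#   dtype('<M8[ns]')
#
# The None is coerced to NaT, while a column whose dtype is not listed in
# `allowed_types` (e.g. int64) raises a ValueError rather than being cast
# silently.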


class DateTransformer(BasePDTransformer):
    """Cast features to datetime.

    Convert multiple features with potentially differing formats to
    datetime with specified formats or by inferring the formats.

    Note that unlike most other Skoot transformers, this one requires
    that the output be a DataFrame (note the lack of the ``as_df``
    constructor arg).

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        Will apply to all columns if None specified.

    date_format : str, iterable or None, optional (default=None)
        The date format. If None, will infer. If a string, will be used
        to parse the datetime. If an iterable, should contain strings or
        None positionally corresponding to ``cols`` (or a dict mapping
        columns to formats).

    allowed_types : iterable, optional (default=("object", "datetime64[ns]"))
        Permitted Series types. This is used to prevent accidentally
        casting Series of unexpected types to DateTime. For instance,
        integer types can be cast to DateTime even though the behavior
        may be unexpected.

    Notes
    -----
    The ``fit`` method here is only used for validation that the columns
    can be cast to datetime.

    Examples
    --------
    >>> import pandas as pd
    >>> from datetime import datetime as dt
    >>> data = [
    ...     [1, "06/01/2018", dt.strptime("06-01-2018", "%m-%d-%Y")],
    ...     [2, "06/02/2018", dt.strptime("06-02-2018", "%m-%d-%Y")],
    ...     [3, "06/03/2018", dt.strptime("06-03-2018", "%m-%d-%Y")],
    ...     [4, None, dt.strptime("06-04-2018", "%m-%d-%Y")],
    ...     [5, "06/05/2018", None]
    ... ]
    >>> df = pd.DataFrame.from_records(data, columns=["a", "b", "c"])
    >>> converter = DateTransformer(cols=["b", "c"],
    ...                             date_format=["%m/%d/%Y", None])
    >>> converter.fit_transform(df)
       a          b          c
    0  1 2018-06-01 2018-06-01
    1  2 2018-06-02 2018-06-02
    2  3 2018-06-03 2018-06-03
    3  4        NaT 2018-06-04
    4  5 2018-06-05        NaT

    Attributes
    ----------
    DEFAULT_PERMITTED_DTYPES : tuple
        This is a static attribute that controls what types can be cast.
        These are the default permitted pandas dtypes that are allowed.
        If a column is not one of these types, a ValueError will be
        raised. To cast an int to datetime, for instance, the
        ``allowed_types`` arg will need to include "int64"::

            allowed_types=DateTransformer.DEFAULT_PERMITTED_DTYPES + ('int64',)

    fit_cols_ : list
        The columns the transformer was fit on.

    formats_ : dict
        Maps column name to date format, in case of varying date formats
        passed in the ``date_format`` parameter.
    """
    DEFAULT_PERMITTED_DTYPES = ("object", "datetime64[ns]")

    def __init__(self, cols=None, date_format=None,
                 allowed_types=DEFAULT_PERMITTED_DTYPES):

        super(DateTransformer, self).__init__(
            cols=cols, as_df=True)

        self.date_format = date_format
        self.allowed_types = allowed_types

    # Don't decorate 'fit' since it calls fit_transform
    def fit(self, X, y=None):
        """Fit the date transformer.

        This is a tricky class because the "fit" isn't super necessary...
        But we use it as a validation stage to ensure the defined cols
        genuinely can be cast to datetime. That's the only reason this
        all happens in the fit portion.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.
        """
        self.fit_transform(X, y)
        return self

    @timed_instance_method(attribute_name="fit_time_")
    def fit_transform(self, X, y=None, **kwargs):
        """Fit the estimator and apply the date transformation to a
        dataframe.

        This is a tricky class because the "fit" isn't super necessary...
        But we use it as a validation stage to ensure the defined cols
        genuinely can be cast to datetime. That's the only reason this
        all happens in the fit portion.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The operation will be applied to a
            copy of the input data, and the result will be returned.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        X, cols = check_dataframe(X, cols=self.cols,
                                  assert_all_finite=False)

        # Different fields may have different formats, so we have to
        # allow a number of different formats to be passed.
        formats = type_or_iterable_to_col_mapping(
            cols=cols, param=self.date_format, param_name="date_format",
            permitted_scalar_types=six.string_types + (NoneType,))

        X = _cast_to_datetime(X, cols, formats, self.allowed_types)

        self.fit_cols_ = cols
        self.formats_ = formats

        return X

    def transform(self, X):
        """Apply the date transformation to a dataframe.

        This method will cast string features to datetimes as specified
        by the ``date_format`` arg.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : pd.DataFrame or np.ndarray, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        check_is_fitted(self, "fit_cols_")
        X, _ = check_dataframe(X, cols=self.cols)

        # validate that fit cols in test set
        cols = self.fit_cols_
        validate_test_set_columns(cols, X.columns)

        # transform
        return _cast_to_datetime(X, cols, self.formats_,
                                 self.allowed_types)
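
# A short, illustrative usage sketch complementing the docstring example
# above: fit on one frame, then reuse the learned per-column formats on a
# second frame. The frames and column name here are hypothetical:
#
#   >>> train = pd.DataFrame({"b": ["06/01/2018", "06/02/2018"]})
#   >>> test = pd.DataFrame({"b": ["06/03/2018", "06/04/2018"]})
#   >>> trans = DateTransformer(cols=["b"], date_format="%m/%d/%Y").fit(train)
#   >>> trans.transform(test)["b"].dt.day.tolist()
#   [3, 4]
#
# ``transform`` validates that the fitted columns are present in the new
# frame (via validate_test_set_columns) and then applies the formats stored
# in ``formats_`` during fit.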