# Source code for skutil.preprocessing.transform
# -*- coding: utf-8 -*-
from __future__ import print_function, absolute_import, division
import numpy as np
import pandas as pd
from scipy import optimize
from scipy.stats import boxcox
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import six
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from skutil.base import *
from ..utils import *
from ..utils.fixes import _cols_if_none
# Public names exported by ``from <this module> import *``; each entry is a
# transformer class defined in this module.
__all__ = [
'BoxCoxTransformer',
'FunctionMapper',
'InteractionTermTransformer',
'SelectiveScaler',
'SpatialSignTransformer',
'YeoJohnsonTransformer'
]
# Tolerance used by ``_eqls`` when comparing floats: two numbers
# whose absolute difference is <= EPS are considered equal.
EPS = 1e-12
# A very small number used to represent zero (lambdas within EPS of this
# value are treated as zero by the transforms below).
ZERO = 1e-16
# Helper functions:
def _eqls(lam, v):
    """Return whether ``lam`` and ``v`` are equal to within ``EPS``.

    Parameters
    ----------
    lam, v : scalar or array_like
        The two values to compare.

    Returns
    -------
    bool or ndarray of bool
        True where the absolute difference is no greater than ``EPS``.
    """
    gap = np.abs(lam - v)
    return gap <= EPS
def _validate_rows(X):
m, n = X.shape
if m < 2:
raise ValueError('n_samples should be at least two, but got %i' % m)
def _pass_through(x):
    """Return ``x`` unchanged (the default ``fun`` of ``FunctionMapper``).

    Defined at module level rather than inside ``FunctionMapper.fit`` so
    that a fitted transformer holding a reference to it can be pickled;
    functions local to another function cannot be.
    """
    return x


class FunctionMapper(BaseSkutil, TransformerMixin):
    """Apply a function to a column or set of columns.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    fun : function, (default=None)
        The function to apply to the feature(s). This function will be
        applied via lambda expression to each column (independent of
        one another). Therefore, the callable should accept an array-like
        argument. If no function is given, the columns are passed
        through unchanged.

    **kwargs : optional
        Extra keyword arguments forwarded to ``fun`` every time it is
        called on a column.

    Attributes
    ----------
    is_fit_ : bool
        The ``FunctionMapper`` callable is set in the constructor,
        but to remain true to the sklearn API, we need to ensure ``fit``
        is called prior to ``transform``. Thus, we set this attribute in
        the ``fit`` method, which performs some validation, to ensure the
        ``fun`` parameter has been validated.

    Examples
    --------
    The following example will apply a cube-root transformation
    to the first two columns in the iris dataset.

        >>> from skutil.preprocessing import FunctionMapper
        >>> from skutil.utils import load_iris_df
        >>> import pandas as pd
        >>> import numpy as np
        >>>
        >>> X = load_iris_df(include_tgt=False)
        >>>
        >>> # define the function
        >>> def cube_root(x):
        ...     return np.power(x, 0.333)
        >>>
        >>> # make our transformer
        >>> trans = FunctionMapper(cols=X.columns[:2], fun=cube_root)
        >>> trans.fit_transform(X).head()
           sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
        0           1.720366          1.517661                1.4               0.2
        1           1.697600          1.441722                1.4               0.2
        2           1.674205          1.473041                1.3               0.2
        3           1.662258          1.457550                1.5               0.2
        4           1.709059          1.531965                1.4               0.2
    """

    def __init__(self, cols=None, fun=None, **kwargs):
        super(FunctionMapper, self).__init__(cols=cols)
        self.fun = fun
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``fun`` was provided but is not callable.
        """
        X, self.cols = validate_is_pd(X, self.cols)

        # Validate the function. Any falsy value (as before, not only None)
        # selects the picklable module-level pass-through.
        if not self.fun:
            self.fun = _pass_through
        elif not hasattr(self.fun, '__call__'):
            raise ValueError('passed fun arg is not a function')

        # Nothing is actually estimated here, so record an explicit flag
        # that ``transform`` can check to know validation has occurred.
        self.is_fit_ = True
        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'is_fit_')
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # Apply the function column by column.
        # TODO: do we want to change the behavior to where the function
        # should accept an entire frame and not a series?
        X[cols] = X[cols].apply(lambda x: self.fun(x, **self.kwargs))
        return X
def _mul(a, b):
"""Multiplies two series objects
(no validation since internally used).
Parameters
----------
a : Pandas ``Series``
One of two Pandas ``Series`` objects that will
be interacted together.
b : Pandas ``Series``
One of two Pandas ``Series`` objects that will
be interacted together.
Returns
-------
product np.ndarray
"""
return (a * b).values
class InteractionTermTransformer(BaseSkutil, TransformerMixin):
    """A class that will generate interaction terms between selected columns.
    An interaction captures some relationship between two independent variables
    in the form of In = (xi * xj).

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that since
        this transformer can only operate on numeric columns, not explicitly
        setting the ``cols`` parameter may result in errors for categorical data.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    interaction_function : callable, optional (default=None)
        A callable taking two ``Series`` and returning their interaction.
        Default None will result in multiplication of the two Series.

    name_suffix : str, optional (default='I')
        The suffix to add to the new feature name in the form of
        <feature_x>_<feature_y>_<suffix>

    only_return_interactions : bool, optional (default=False)
        If set to True, will only return the features in ``cols``
        and their respective generated interaction terms.

    Attributes
    ----------
    fun_ : callable
        The interaction term function

    Examples
    --------
    The following example interacts the first two columns of the iris
    dataset using the default ``_mul`` function (product).

        >>> from skutil.preprocessing import InteractionTermTransformer
        >>> from skutil.utils import load_iris_df
        >>> import pandas as pd
        >>>
        >>> X = load_iris_df(include_tgt=False)
        >>>
        >>> trans = InteractionTermTransformer(cols=X.columns[:2])
        >>> X_transform = trans.fit_transform(X)
        >>>
        >>> assert X_transform.shape[1] == X.shape[1] + 1 # only added one column
        >>> X_transform[X_transform.columns[-1]].head()
        0    17.85
        1    14.70
        2    15.04
        3    14.26
        4    18.00
        Name: sepal length (cm)_sepal width (cm)_I, dtype: float64
    """

    def __init__(self, cols=None, as_df=True, interaction_function=None,
                 name_suffix='I', only_return_interactions=False):
        super(InteractionTermTransformer, self).__init__(cols=cols, as_df=as_df)
        self.interaction_function = interaction_function
        self.name_suffix = name_suffix
        self.only_return_interactions = only_return_interactions

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``interaction_function`` is given but is not callable.
        ValueError
            If fewer than two columns are available to interact.
        """
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)
        self.fun_ = self.interaction_function if self.interaction_function is not None else _mul

        # validate function
        if not hasattr(self.fun_, '__call__'):
            raise TypeError('require callable for interaction_function')

        # validate cols
        if len(cols) < 2:
            raise ValueError('need at least two columns')

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame`` or ``ndarray``
            The input columns followed by one interaction column per
            unordered pair of ``cols`` (or only ``cols`` plus interactions
            when ``only_return_interactions`` is True). An ``ndarray`` is
            returned when ``as_df`` is False.
        """
        check_is_fitted(self, 'fun_')
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)
        n_features = len(cols)
        suff = self.name_suffix
        fun = self.fun_

        new_terms = {}
        new_names = []

        # Visit every unordered pair (i < j) exactly once.
        for i in range(n_features - 1):
            for j in range(i + 1, n_features):
                col_i, col_j = cols[i], cols[j]
                new_nm = '%s_%s_%s' % (col_i, col_j, suff)
                new_terms[new_nm] = fun(X[col_i], X[col_j])
                new_names.append(new_nm)

        # Build the interaction frame on X's own index: the default ``_mul``
        # returns bare arrays, and a fresh RangeIndex would make the concat
        # below misalign rows (introducing NaNs) for any non-default index.
        # ``columns`` pins the order in which the terms were generated.
        df2 = pd.DataFrame(new_terms, index=X.index, columns=new_names)
        X = pd.concat([X, df2], axis=1)

        # if we only want to keep interaction names, filter now
        if self.only_return_interactions:
            X = X[list(cols) + new_names]

        # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
        return X if self.as_df else X.values
class SelectiveScaler(BaseSkutil, TransformerMixin):
    """A class that will apply scaling only to a select group
    of columns. Useful for data that may contain features that should not
    be scaled, such as those that have been dummied, or for any already-in-scale
    features. Perhaps, even, there are some features you'd like to scale in
    a different manner than others. This, then, allows two back-to-back
    ``SelectiveScaler`` instances with different columns & strategies in a
    pipeline object.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that since
        this transformer can only operate on numeric columns, not explicitly
        setting the ``cols`` parameter may result in errors for categorical data.

    scaler : instance of a sklearn Scaler, optional (default=StandardScaler)
        The scaler to fit against ``cols``. Any object exposing ``fit``
        and ``transform`` (e.g. the scalers in ``sklearn.preprocessing``)
        can be used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Attributes
    ----------
    is_fit_ : bool
        The ``SelectiveScaler`` parameter ``scaler`` is set in the constructor,
        but to remain true to the sklearn API, we need to ensure ``fit``
        is called prior to ``transform``. Thus, we set this attribute in
        the ``fit`` method, which performs some validation, to ensure the
        ``scaler`` parameter has been validated.

    Examples
    --------
    The following example will scale only the first two features
    in the iris dataset:

        >>> from skutil.preprocessing import SelectiveScaler
        >>> from skutil.utils import load_iris_df
        >>> import pandas as pd
        >>> import numpy as np
        >>>
        >>> X = load_iris_df(include_tgt=False)
        >>>
        >>> trans = SelectiveScaler(cols=X.columns[:2])
        >>> X_transform = trans.fit_transform(X)
        >>>
        >>> X_transform.head()
           sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
        0          -0.900681          1.032057                1.4               0.2
        1          -1.143017         -0.124958                1.4               0.2
        2          -1.385353          0.337848                1.3               0.2
        3          -1.506521          0.106445                1.5               0.2
        4          -1.021849          1.263460                1.4               0.2
    """

    # NOTE(review): the default ``StandardScaler()`` is created once, when the
    # class is defined, so every instance built without an explicit ``scaler``
    # shares (and refits) the same object. The signature is kept as-is for
    # compatibility; pass a fresh scaler per instance to avoid the sharing.
    def __init__(self, cols=None, scaler=StandardScaler(), as_df=True):
        super(SelectiveScaler, self).__init__(cols=cols, as_df=as_df)
        self.scaler = scaler

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # throws exception if the cols don't exist
        self.scaler.fit(X[cols])

        # this is our fit param
        self.is_fit_ = True
        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame`` or ``ndarray``
            The operation is applied to a copy of ``X``,
            and the result set is returned (as an ``ndarray`` when
            ``as_df`` is False).
        """
        # Honour the documented ``is_fit_`` contract explicitly instead of
        # relying on the wrapped scaler, which may be shared with (and
        # already fit by) another instance.
        check_is_fitted(self, 'is_fit_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # Fails through if cols don't exist
        X[cols] = self.scaler.transform(X[cols])

        # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
        return X if self.as_df else X.values
class BoxCoxTransformer(BaseSkutil, TransformerMixin):
    """Estimate a lambda parameter for each feature, and transform
    it to a distribution more-closely resembling a Gaussian bell
    using the Box-Cox transformation.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that since
        this transformer can only operate on numeric columns, not explicitly
        setting the ``cols`` parameter may result in errors for categorical data.

    n_jobs : int, 1 by default
        The number of jobs to use for the computation. This works by
        estimating each of the feature lambdas in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    shift_amt : float, optional (default=1e-6)
        Since the Box-Cox transformation requires that all values be positive
        (above zero), any features that contain sub-zero elements will be shifted
        up by the absolute value of the minimum element plus this amount in the ``fit``
        method. In the ``transform`` method, any test value in ``cols`` that
        is still below ``shift_amt`` after shifting is truncated at ``shift_amt``.

    Attributes
    ----------
    shift_ : dict
        The shifts for each feature needed to shift the min value in
        the feature up to at least 0.0, as every element must be positive

    lambda_ : dict
        The lambda values corresponding to each feature
    """

    def __init__(self, cols=None, n_jobs=1, as_df=True, shift_amt=1e-6):
        super(BoxCoxTransformer, self).__init__(cols=cols, as_df=as_df)
        self.n_jobs = n_jobs
        self.shift_amt = shift_amt

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has fewer than two rows.
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)  # creates a copy -- we need all to be finite
        cols = _cols_if_none(X, self.cols)

        # ensure enough rows
        _validate_rows(X)

        # First step is to compute all the shifts needed, then add back to X:
        # a feature whose minimum is <= 0 moves up by |min| + shift_amt.
        min_Xs = X[cols].min(axis=0)
        shift = np.array([np.abs(x) + self.shift_amt if x <= 0.0 else 0.0 for x in min_Xs])
        X[cols] += shift

        # now put shift into a dict
        self.shift_ = dict(zip(cols, shift))

        # Now estimate the lambdas in parallel
        self.lambda_ = dict(zip(cols,
                                Parallel(n_jobs=self.n_jobs)(
                                    delayed(_estimate_lambda_single_y)
                                    (X[i].tolist()) for i in cols)))

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame`` or ``ndarray``
            The operation is applied to a copy of ``X``,
            and the result set is returned (as an ``ndarray`` when
            ``as_df`` is False).
        """
        check_is_fitted(self, 'shift_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)
        lambdas_, shifts_ = self.lambda_, self.shift_
        floor = self.shift_amt

        for nm in cols:
            # Apply the fitted shift, then truncate anything still too low.
            # Only the selected columns are touched: the truncation must not
            # leak into pass-through columns.
            shifted = np.maximum(X[nm] + shifts_[nm], floor)
            X[nm] = _transform_y(shifted.tolist(), lambdas_[nm])

        # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
        return X if self.as_df else X.values
def _transform_y(y, lam):
    """Box-Cox transform a single y, given a single lambda value.
    No validation performed.

    Parameters
    ----------
    y : array_like, shape (n_samples,)
        The vector being transformed

    lam : scalar
        The lambda value used for the transformation

    Returns
    -------
    y_prime : ndarray, shape (n_samples,)
        ``log(y)`` when ``lam`` is (numerically) zero, otherwise
        ``(y ** lam - 1) / lam``.
    """
    # ensure np array
    y = np.array(y)

    # The lambda test does not depend on the element, so decide once.
    if _eqls(lam, ZERO):
        return np.array([log(x) for x in y])

    # Element-wise power transform, computed on the whole vector at once.
    # rarely -- very rarely -- we can get a NaN. Why?
    return (np.power(y, lam) - 1) / lam
def _estimate_lambda_single_y(y):
"""Estimate lambda for a single y, given a range of lambdas
through which to search. No validation performed.
Parameters
----------
y : ndarray, shape (n_samples,)
The vector being estimated against
"""
# ensure is array
y = np.array(y)
# Use scipy's log-likelihood estimator
b = boxcox(y, lmbda=None)
# Return lambda corresponding to maximum P
return b[1]
class YeoJohnsonTransformer(BaseSkutil, TransformerMixin):
    """Estimate a lambda parameter for each feature, and transform
    it to a distribution more-closely resembling a Gaussian bell
    using the Yeo-Johnson transformation.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that since
        this transformer can only operate on numeric columns, not explicitly
        setting the ``cols`` parameter may result in errors for categorical data.

    n_jobs : int, 1 by default
        The number of jobs to use for the computation. This works by
        estimating each of the feature lambdas in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Attributes
    ----------
    lambda_ : dict
        The lambda values corresponding to each feature
    """

    def __init__(self, cols=None, n_jobs=1, as_df=True):
        super(YeoJohnsonTransformer, self).__init__(cols=cols, as_df=as_df)
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has fewer than two rows.
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)  # creates a copy -- we need all to be finite
        cols = _cols_if_none(X, self.cols)

        # ensure enough rows
        _validate_rows(X)

        # Now estimate the lambdas in parallel
        self.lambda_ = dict(zip(cols,
                                Parallel(n_jobs=self.n_jobs)(
                                    delayed(_yj_estimate_lambda_single_y)
                                    (X[nm]) for nm in cols)))

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame`` or ``ndarray``
            The operation is applied to a copy of ``X``,
            and the result set is returned (as an ``ndarray`` when
            ``as_df`` is False).
        """
        check_is_fitted(self, 'lambda_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols, assert_all_finite=True)  # creates a copy -- we need all to be finite
        cols = _cols_if_none(X, self.cols)
        lambdas_ = self.lambda_

        # do transformations
        for nm in cols:
            X[nm] = _yj_transform_y(X[nm], lambdas_[nm])

        # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
        return X if self.as_df else X.values
def _yj_trans_single_x(x, lam):
    """Apply the Yeo-Johnson transformation to one scalar.

    Parameters
    ----------
    x : scalar
        The value being transformed

    lam : scalar
        The lambda value used for the transformation

    Returns
    -------
    The transformed value, chosen from four cases depending on the sign
    of ``x`` and on whether ``lam`` is (numerically) zero or exactly two.
    """
    non_negative = x >= 0

    # Case 2: x >= 0 and lambda is zero
    if non_negative and _eqls(lam, ZERO):
        return log(x + 1)

    # Case 1: x >= 0 and lambda is not 0
    if non_negative:
        return (np.power(x + 1, lam) - 1.0) / lam

    # Case 4: x < 0 and lambda is two
    if lam == 2.0:
        return -log(-x + 1)

    # Case 3: x < 0 and lambda is not two
    exponent = 2.0 - lam
    grown = np.power((-x + 1), exponent) - 1.0
    return -grown / exponent
def _yj_transform_y(y, lam):
    """Yeo-Johnson transform a single y, given a single lambda value.
    No validation performed.

    Parameters
    ----------
    y : array_like, shape (n_samples,)
        The vector being transformed

    lam : scalar
        The lambda value used for the transformation

    Returns
    -------
    ndarray, shape (n_samples,)
        Each element of ``y`` passed through ``_yj_trans_single_x``.
    """
    values = np.array(y)
    transformed = [_yj_trans_single_x(v, lam) for v in values]
    return np.array(transformed)
def _yj_estimate_lambda_single_y(y):
    """Estimate the Yeo-Johnson lambda for a single feature vector.
    No validation performed.

    Parameters
    ----------
    y : array_like, shape (n_samples,)
        The vector being estimated against

    Returns
    -------
    The lambda returned by ``_yj_normmax`` for an array copy of ``y``.
    """
    # Use the custom log-likelihood estimator on an array copy of the input
    return _yj_normmax(np.array(y))
def _yj_normmax(x, brack=(-2, 2)):
    """Compute optimal YJ transform parameter for input data.

    Parameters
    ----------
    x : array_like
        Input array.

    brack : 2-tuple
        The starting interval for a downhill bracket search

    Returns
    -------
    The lambda at which ``scipy.optimize.brent`` minimises the negated
    Yeo-Johnson log-likelihood of ``x``.
    """

    def _neg_llf(lmb, data):
        # brent minimises, so negate the log-likelihood to maximise it
        return -_yj_llf(data, lmb)

    return optimize.brent(_neg_llf, args=(x,), brack=brack)
def _yj_llf(data, lmb):
    """Transform a y vector given a single lambda value,
    and compute the log-likelihood function. No validation
    is applied to the input.

    The input is never modified: any shifting needed for the
    computation is applied to local copies only, so repeated calls with
    the same array (as made by the optimiser) all see the same data.

    Parameters
    ----------
    data : array_like
        The vector to transform

    lmb : scalar
        The lambda value

    Returns
    -------
    llf : float
        The log-likelihood, or NaN when the transformed values have zero
        variance (so that this lambda is not selected).
    """
    data = np.asarray(data)
    N = data.shape[0]
    y = _yj_transform_y(data, lmb)

    # We can't take the canonical log of data, as there could be
    # zeros or negatives. Thus, we need to shift both distributions
    # up by some arbitrary factor just for the LLF computation.
    # ``np.asarray`` does not copy an ndarray, so rebind to a new array
    # rather than shifting in place (``+=``) and corrupting the caller's data.
    min_d, min_y = np.min(data), np.min(y)
    if min_d < ZERO:
        data = data + (np.abs(min_d) + 1)

    # Same goes for Y
    if min_y < ZERO:
        y = y + (np.abs(min_y) + 1)

    # Compute mean on potentially shifted data
    y_mean = np.mean(y, axis=0)
    var = np.sum((y - y_mean) ** 2. / N, axis=0)

    # If var is 0.0, we'll get a warning. Means all the
    # values were nearly identical in y, so we will return
    # NaN so we don't optimize for this value of lam
    if 0 == var:
        return np.nan

    # Can't use canonical log due to maybe negatives, so use the truncated log function in utils
    llf = (lmb - 1) * np.sum(log(data), axis=0)
    llf -= N / 2.0 * log(var)

    return llf
class SpatialSignTransformer(BaseSkutil, TransformerMixin):
    """Project the feature space of a matrix into a multi-dimensional sphere
    by dividing each feature by its squared norm.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that since
        this transformer can only operate on numeric columns, not explicitly
        setting the ``cols`` parameter may result in errors for categorical data.

    n_jobs : int, 1 by default
        The number of jobs to use for the computation. This works by
        computing each of the feature squared norms in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Attributes
    ----------
    sq_nms_ : dict
        The squared norms for each feature
    """

    def __init__(self, cols=None, n_jobs=1, as_df=True):
        super(SpatialSignTransformer, self).__init__(cols=cols, as_df=as_df)
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # Now get sqnms in parallel
        self.sq_nms_ = dict(zip(cols,
                                Parallel(n_jobs=self.n_jobs)(
                                    delayed(_sq_norm_single)
                                    (X[nm]) for nm in cols)))

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame`` or ``ndarray``
            The operation is applied to a copy of ``X``,
            and the result set is returned (as an ``ndarray`` when
            ``as_df`` is False).
        """
        check_is_fitted(self, 'sq_nms_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        sq_nms_ = self.sq_nms_

        # scale each fitted feature by its squared norm
        for nm, the_norm in six.iteritems(sq_nms_):
            X[nm] = X[nm] / the_norm

        # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
        return X if self.as_df else X.values
def _sq_norm_single(x, zero_action=np.inf):
x = np.asarray(x)
nrm = np.dot(x, x)
# What if a squared norm is zero? We want to
# avoid a divide-by-zero situation...
return nrm if not nrm == 0 else zero_action