# Source code for skutil.preprocessing.transform

# -*- coding: utf-8 -*-

from __future__ import print_function, absolute_import, division
import numpy as np
import pandas as pd
from scipy import optimize
from scipy.stats import boxcox
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import six
from sklearn.externals.joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from skutil.base import *
from ..utils import *
from ..utils.fixes import _cols_if_none

# Explicit public API for this module; star imports pick up only these names.
__all__ = [
    'BoxCoxTransformer',
    'FunctionMapper',
    'InteractionTermTransformer',
    'SelectiveScaler',
    'SpatialSignTransformer',
    'YeoJohnsonTransformer'
]

# A very small number used to measure differences.
# If the absolute difference between two numbers is
# <= EPS, it is considered equal.
EPS = 1e-12

# A very small number used to represent zero.
ZERO = 1e-16


# Helper funtions:
def _eqls(lam, v):
    return np.abs(lam - v) <= EPS


def _validate_rows(X):
    m, n = X.shape
    if m < 2:
        raise ValueError('n_samples should be at least two, but got %i' % m)


class FunctionMapper(BaseSkutil, TransformerMixin):
    """Apply a function to a column or set of columns.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. The transformation only applies to the
        specified columns; any other non-specified columns will still be
        present, untouched, after transformation.

    fun : function, (default=None)
        The function to apply to the feature(s). It is applied via a
        lambda expression to each column independently, so the callable
        should accept an array-like argument. If None, the transformer
        acts as a passthrough.

    **kwargs : keyword arguments
        Forwarded to ``fun`` on every call.

    Attributes
    ----------
    is_fit_ : bool
        The callable is set in the constructor, but to remain true to
        the sklearn API we require ``fit`` to be called prior to
        ``transform``. ``fit`` validates the ``fun`` parameter and then
        sets this attribute.

    Examples
    --------
    The following example applies a cube-root transformation to the
    first two columns in the iris dataset.

    >>> from skutil.utils import load_iris_df
    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> X = load_iris_df(include_tgt=False)
    >>>
    >>> def cube_root(x):
    ...     return np.power(x, 0.333)
    >>>
    >>> trans = FunctionMapper(cols=X.columns[:2], fun=cube_root)
    >>> trans.fit_transform(X).head()  # doctest: +SKIP
    """

    def __init__(self, cols=None, fun=None, **kwargs):
        super(FunctionMapper, self).__init__(cols=cols)
        self.fun = fun
        self.kwargs = kwargs

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. ``X`` is not altered in the process of
            the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        X, self.cols = validate_is_pd(X, self.cols)

        # if no function was supplied, substitute an identity mapping;
        # otherwise make sure what we received is actually callable
        if not self.fun:
            def pass_through(x):
                return x

            self.fun = pass_through
        elif not hasattr(self.fun, '__call__'):
            raise ValueError('passed fun arg is not a function')

        # since we aren't checking is-fit on a fitted model attribute,
        # set an arbitrary sentinel showing validation has occurred
        self.is_fit_ = True

        # TODO: this might cause issues in de-pickling, as we're
        # going to be pickling a non-instance method... solve this.
        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will be
            applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``, and the
            result set is returned.
        """
        check_is_fitted(self, 'is_fit_')
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # apply the function to each prescribed column independently.
        # TODO: do we want to change the behavior to where the function
        # should accept an entire frame and not a series?
        X[cols] = X[cols].apply(lambda series: self.fun(series, **self.kwargs))
        return X
def _mul(a, b): """Multiplies two series objects (no validation since internally used). Parameters ---------- a : Pandas ``Series`` One of two Pandas ``Series`` objects that will be interacted together. b : Pandas ``Series`` One of two Pandas ``Series`` objects that will be interacted together. Returns ------- product np.ndarray """ return (a * b).values
class InteractionTermTransformer(BaseSkutil, TransformerMixin):
    """A class that will generate interaction terms between selected columns.
    An interaction captures some relationship between two independent
    variables in the form of In = (xi * xj).

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that
        since this transformer can only operate on numeric columns, not
        explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by
        default.

    interaction_function : callable, optional (default=None)
        A callable for interactions. Default None will result in
        multiplication of two Series objects.
        (Note: previously documented under the wrong name ``interaction``.)

    name_suffix : str, optional (default='I')
        The suffix to add to the new feature name in the form of
        <feature_x>_<feature_y>_<suffix>

    only_return_interactions : bool, optional (default=False)
        If set to True, will only return the interacted features and
        their respective generated interaction terms.

    Attributes
    ----------
    fun_ : callable
        The interaction term function

    Examples
    --------
    The following example interacts the first two columns of the iris
    dataset using the default ``_mul`` function (product).

    >>> from skutil.preprocessing import InteractionTermTransformer
    >>> from skutil.utils import load_iris_df
    >>> import pandas as pd
    >>>
    >>> X = load_iris_df(include_tgt=False)
    >>>
    >>> trans = InteractionTermTransformer(cols=X.columns[:2])
    >>> X_transform = trans.fit_transform(X)
    >>>
    >>> assert X_transform.shape[1] == X.shape[1] + 1  # only added one column
    >>> X_transform[X_transform.columns[-1]].head()  # doctest: +SKIP
    """

    def __init__(self, cols=None, as_df=True, interaction_function=None,
                 name_suffix='I', only_return_interactions=False):
        super(InteractionTermTransformer, self).__init__(cols=cols, as_df=as_df)
        self.interaction_function = interaction_function
        self.name_suffix = name_suffix
        self.only_return_interactions = only_return_interactions

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. Furthermore, ``X`` will not be altered
            in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``interaction_function`` is not callable.

        ValueError
            If fewer than two columns are prescribed.
        """
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # default interaction is element-wise multiplication
        self.fun_ = self.interaction_function if self.interaction_function is not None else _mul

        # validate function
        if not hasattr(self.fun_, '__call__'):
            raise TypeError('require callable for interaction_function')

        # an interaction needs at least a pair of columns
        if len(cols) < 2:
            raise ValueError('need at least two columns')

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        check_is_fitted(self, 'fun_')
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        n_features = len(cols)
        suff = self.name_suffix
        fun = self.fun_

        append_dict = {}
        interaction_names = list(cols)

        # iterate all unordered pairs (i < j) -- i.e., N choose 2
        for i in range(n_features - 1):
            for j in range(i + 1, n_features):
                col_i, col_j = cols[i], cols[j]

                new_nm = '%s_%s_%s' % (col_i, col_j, suff)
                append_dict[new_nm] = fun(X[col_i], X[col_j])
                interaction_names.append(new_nm)

        # concatenate the generated terms onto the original frame
        df2 = pd.DataFrame.from_dict(append_dict)
        X = pd.concat([X, df2], axis=1)

        # if we only want to keep the interacted features, filter now
        if self.only_return_interactions:
            X = X[interaction_names]

        # NOTE(review): ``as_matrix`` is deprecated in newer pandas
        # (use ``.values``); kept here for consistency with the rest
        # of the module and its pinned pandas version.
        return X if self.as_df else X.as_matrix()
class SelectiveScaler(BaseSkutil, TransformerMixin):
    """A class that will apply scaling only to a select group of columns.
    Useful for data that may contain features that should not be scaled,
    such as those that have been dummied, or for any already-in-scale
    features. Perhaps, even, there are some features you'd like to scale
    in a different manner than others. This, then, allows two
    back-to-back ``SelectiveScaler`` instances with different columns &
    strategies in a pipeline object.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that
        since this transformer can only operate on numeric columns, not
        explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    scaler : instance of a sklearn Scaler, optional (default=None)
        The scaler to fit against ``cols``. If None, a fresh
        ``StandardScaler`` is created for this instance. (A shared
        default instance in the signature would be a mutable default
        argument: one scaler fit by, and shared across, every
        default-constructed ``SelectiveScaler``.)

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by
        default.

    Attributes
    ----------
    is_fit_ : bool
        The ``scaler`` parameter is set in the constructor, but to
        remain true to the sklearn API we require ``fit`` to be called
        prior to ``transform``; ``fit`` sets this attribute.

    Examples
    --------
    The following example will scale only the first two features
    in the iris dataset:

    >>> from skutil.preprocessing import SelectiveScaler
    >>> from skutil.utils import load_iris_df
    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> X = load_iris_df(include_tgt=False)
    >>>
    >>> trans = SelectiveScaler(cols=X.columns[:2])
    >>> X_transform = trans.fit_transform(X)
    >>>
    >>> X_transform.head()  # doctest: +SKIP
    """

    def __init__(self, cols=None, scaler=None, as_df=True):
        super(SelectiveScaler, self).__init__(cols=cols, as_df=as_df)
        # BUG FIX: the old signature used ``scaler=StandardScaler()``, a
        # mutable default evaluated once at class definition and shared
        # (and re-fit!) across all default-constructed instances. Build
        # a fresh scaler per instance instead.
        self.scaler = scaler if scaler is not None else StandardScaler()

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. Furthermore, ``X`` will not be altered
            in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # throws exception if the cols don't exist
        self.scaler.fit(X[cols])

        # this is our fit param
        self.is_fit_ = True
        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # Fails through if cols don't exist or if the scaler isn't fit yet
        X[cols] = self.scaler.transform(X[cols])

        return X if self.as_df else X.as_matrix()
class BoxCoxTransformer(BaseSkutil, TransformerMixin):
    """Estimate a lambda parameter for each feature, and transform
    it to a distribution more-closely resembling a Gaussian bell
    using the Box-Cox transformation.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that
        since this transformer can only operate on numeric columns, not
        explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    n_jobs : int, 1 by default
        The number of jobs to use for the computation. This works by
        estimating each of the feature lambdas in parallel. If -1 all
        CPUs are used. If 1 is given, no parallel computing code is used
        at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
        but one are used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by
        default.

    shift_amt : float, optional (default=1e-6)
        Since the Box-Cox transformation requires that all values be
        positive (above zero), any features that contain sub-zero
        elements will be shifted up by the absolute value of the minimum
        element plus this amount in the ``fit`` method. In the
        ``transform`` method, if any of the test data is less than zero
        after shifting, it will be truncated at the ``shift_amt`` value.

    Attributes
    ----------
    shift_ : dict
        The shifts for each feature needed to shift the min value in
        the feature up to at least 0.0, as every element must be positive

    lambda_ : dict
        The lambda values corresponding to each feature
    """

    def __init__(self, cols=None, n_jobs=1, as_df=True, shift_amt=1e-6):
        super(BoxCoxTransformer, self).__init__(cols=cols, as_df=as_df)
        self.n_jobs = n_jobs
        self.shift_amt = shift_amt

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. Furthermore, ``X`` will not be altered
            in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        # check on state of X and cols. Creates a copy --
        # we need all values to be finite
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)

        # ensure enough rows
        _validate_rows(X)

        # First step is to compute all the shifts needed, then add back to X...
        min_Xs = X[cols].min(axis=0)
        shift = np.array([np.abs(x) + self.shift_amt if x <= 0.0 else 0.0 for x in min_Xs])
        X[cols] += shift

        # now put shift into a dict
        self.shift_ = dict(zip(cols, shift))

        # Now estimate the lambdas in parallel
        self.lambda_ = dict(zip(cols, Parallel(n_jobs=self.n_jobs)(
            delayed(_estimate_lambda_single_y)
            (X[i].tolist()) for i in cols)))

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        check_is_fitted(self, 'shift_')

        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)

        lambdas_, shifts_ = self.lambda_, self.shift_

        for nm in cols:
            # Add the fit-time shift in; if the shifted values are still
            # too low, truncate at shift_amt. BUG FIX: the old code
            # clamped EVERY column in the frame (via a frame-wide
            # ``X.apply``), silently corrupting non-transformed columns
            # with values below shift_amt. Clamp only the fit columns.
            y = (X[nm] + shifts_[nm]).clip(lower=self.shift_amt)

            # do the transformation with the estimated lambda
            X[nm] = _transform_y(y.tolist(), lambdas_[nm])

        return X if self.as_df else X.as_matrix()
def _transform_y(y, lam): """Transform a single y, given a single lambda value. No validation performed. Parameters ---------- y : array_like, shape (n_samples,) The vector being transformed lam : ndarray, shape (n_lambdas,) The lambda value used for the transformation """ # ensure np array y = np.array(y) y_prime = np.array([(np.power(x, lam) - 1) / lam if not _eqls(lam, ZERO) else log(x) for x in y]) # rarely -- very rarely -- we can get a NaN. Why? return y_prime def _estimate_lambda_single_y(y): """Estimate lambda for a single y, given a range of lambdas through which to search. No validation performed. Parameters ---------- y : ndarray, shape (n_samples,) The vector being estimated against """ # ensure is array y = np.array(y) # Use scipy's log-likelihood estimator b = boxcox(y, lmbda=None) # Return lambda corresponding to maximum P return b[1]
class YeoJohnsonTransformer(BaseSkutil, TransformerMixin):
    """Estimate a lambda parameter for each feature, and transform
    it to a distribution more-closely resembling a Gaussian bell
    using the Yeo-Johnson transformation.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that
        since this transformer can only operate on numeric columns, not
        explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    n_jobs : int, 1 by default
        The number of jobs to use for the computation. This works by
        estimating each of the feature lambdas in parallel. If -1 all
        CPUs are used. If 1 is given, no parallel computing code is used
        at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
        but one are used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by
        default.

    Attributes
    ----------
    lambda_ : dict
        The lambda values corresponding to each feature
    """

    def __init__(self, cols=None, n_jobs=1, as_df=True):
        super(YeoJohnsonTransformer, self).__init__(cols=cols, as_df=as_df)
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. ``X`` is not altered in the process of
            the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        # creates a copy -- we need all values finite
        X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)

        # ensure enough rows
        _validate_rows(X)

        # estimate each feature's lambda in parallel
        lambdas = Parallel(n_jobs=self.n_jobs)(
            delayed(_yj_estimate_lambda_single_y)
            (X[nm]) for nm in cols)
        self.lambda_ = dict(zip(cols, lambdas))

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        check_is_fitted(self, 'lambda_')

        # creates a copy -- we need all values finite
        X, _ = validate_is_pd(X, self.cols, assert_all_finite=True)
        cols = _cols_if_none(X, self.cols)

        # transform each column with its estimated lambda
        for nm in cols:
            X[nm] = _yj_transform_y(X[nm], self.lambda_[nm])

        return X if self.as_df else X.as_matrix()
def _yj_trans_single_x(x, lam):
    """Apply the Yeo-Johnson transformation to a single scalar ``x`` for
    the given lambda, following the four cases of the YJ definition."""
    if x >= 0:
        # Case 1: x >= 0 and lambda is not 0
        if not _eqls(lam, ZERO):
            return (np.power(x + 1, lam) - 1.0) / lam

        # Case 2: x >= 0 and lambda is zero
        return log(x + 1)
    else:
        # Case 3: x < 0 and lambda is not two
        # (the old comment mislabeled this branch as "Case 2")
        if lam != 2.0:
            denom = 2.0 - lam
            numer = np.power((-x + 1), (2.0 - lam)) - 1.0
            return -numer / denom

        # Case 4: x < 0 and lambda is two
        return -log(-x + 1)


def _yj_transform_y(y, lam):
    """Transform a single y, given a single lambda value.
    No validation performed.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
       The vector being transformed

    lam : ndarray, shape (n_lambdas,)
       The lambda value used for the transformation
    """
    y = np.array(y)
    return np.array([_yj_trans_single_x(x, lam) for x in y])


def _yj_estimate_lambda_single_y(y):
    """Estimate lambda for a single y, given a range of lambdas
    through which to search. No validation performed.

    Parameters
    ----------
    y : ndarray, shape (n_samples,)
       The vector being estimated against
    """
    y = np.array(y)

    # Use custom log-likelihood estimator
    return _yj_normmax(y)


def _yj_normmax(x, brack=(-2, 2)):
    """Compute optimal YJ transform parameter for input data.

    Parameters
    ----------
    x : array_like
       Input array.

    brack : 2-tuple
       The starting interval for a downhill bracket search
    """

    # Use MLE to compute the optimal YJ parameter
    def _mle_opt(i, brck):
        def _eval_mle(lmb, data):
            # Function to minimize
            return -_yj_llf(data, lmb)

        return optimize.brent(_eval_mle, brack=brck, args=(i,))

    return _mle_opt(x, brack)  # _mle(x, brack)


def _yj_llf(data, lmb):
    """Transform a y vector given a single lambda value,
    and compute the log-likelihood function. No validation
    is applied to the input.

    Parameters
    ----------
    data : array_like
       The vector to transform

    lmb : scalar
       The lambda value
    """
    # BUG FIX: ``np.asarray`` does not copy, and this function shifts
    # ``data`` in place below. ``optimize.brent`` re-evaluates with the
    # SAME array object at several lambdas, so the old code leaked the
    # shift back into the caller's array. Copy (as float) instead; the
    # shift is lambda-independent, so results are unchanged.
    data = np.array(data, dtype=float, copy=True)
    N = data.shape[0]
    y = _yj_transform_y(data, lmb)

    # We can't take the canonical log of data, as there could be
    # zeros or negatives. Thus, we need to shift both distributions
    # up by some arbitrary factor just for the LLF computation
    min_d, min_y = np.min(data), np.min(y)
    if min_d < ZERO:
        shift = np.abs(min_d) + 1
        data += shift

    # Same goes for Y
    if min_y < ZERO:
        shift = np.abs(min_y) + 1
        y += shift

    # Compute mean on potentially shifted data
    y_mean = np.mean(y, axis=0)
    var = np.sum((y - y_mean) ** 2. / N, axis=0)

    # If var is 0.0, we'll get a warning. Means all the
    # values were nearly identical in y, so we will return
    # NaN so we don't optimize for this value of lam
    if 0 == var:
        return np.nan

    # Can't use canonical log due to maybe negatives,
    # so use the truncated log function in utils
    llf = (lmb - 1) * np.sum(log(data), axis=0)
    llf -= N / 2.0 * log(var)

    return llf
class SpatialSignTransformer(BaseSkutil, TransformerMixin):
    """Project the feature space of a matrix into a multi-dimensional
    sphere by dividing each feature by its squared norm.

    Parameters
    ----------
    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation. Note that
        since this transformer can only operate on numeric columns, not
        explicitly setting the ``cols`` parameter may result in errors
        for categorical data.

    n_jobs : int, 1 by default
        The number of jobs to use for the computation. This works by
        estimating each of the feature norms in parallel. If -1 all CPUs
        are used. If 1 is given, no parallel computing code is used at
        all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs
        but one are used.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by
        default.

    Attributes
    ----------
    sq_nms_ : dict
        The squared norms for each feature
    """

    def __init__(self, cols=None, n_jobs=1, as_df=True):
        super(SpatialSignTransformer, self).__init__(cols=cols, as_df=as_df)
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to fit. The frame will only be fit on the
            prescribed ``cols`` (see ``__init__``) or all of them if
            ``cols`` is None. ``X`` is not altered in the process of
            the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even if
            explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self
        """
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # compute each column's squared norm in parallel
        norms = Parallel(n_jobs=self.n_jobs)(
            delayed(_sq_norm_single)
            (X[nm]) for nm in cols)
        self.sq_nms_ = dict(zip(cols, norms))

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------
        X : Pandas ``DataFrame``
            The Pandas frame to transform. The operation will be applied
            to a copy of the input data, and the result will be returned.

        Returns
        -------
        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``, and the result
            set is returned.
        """
        check_is_fitted(self, 'sq_nms_')

        X, _ = validate_is_pd(X, self.cols)

        # scale each fit column by its squared norm
        for nm, the_norm in six.iteritems(self.sq_nms_):
            X[nm] /= the_norm

        return X if self.as_df else X.as_matrix()
def _sq_norm_single(x, zero_action=np.inf): x = np.asarray(x) nrm = np.dot(x, x) # What if a squared norm is zero? We want to # avoid a divide-by-zero situation... return nrm if not nrm == 0 else zero_action