Source code for skutil.decomposition.decompose

# -*- coding: utf-8 -*-

from __future__ import print_function, division, absolute_import
from abc import ABCMeta, abstractmethod
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.utils.validation import check_is_fitted
from sklearn.externals import six
from skutil.base import *
from skutil.base import overrides
from ..utils import *
from ..utils.fixes import _cols_if_none, _as_numpy

__all__ = [
    'SelectivePCA',
    'SelectiveTruncatedSVD'
]


class _BaseSelectiveDecomposer(six.with_metaclass(ABCMeta, BaseSkutil, TransformerMixin)):
    """Base class for selective decompositional transformers.
    Each of these transformers should adhere to the :class:`skutil.base.SelectiveMixin`
    standard of accepting a ``cols`` parameter in the ``__init__`` method, and
    only applying the transformation to the defined columns, if any.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    n_components : int, float, None or string, optional (default=None)
        ``n_components`` is specific to the type of transformation
        being fit, and determines the number of components to extract
        in the transformation.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.
    """

    def __init__(self, cols=None, n_components=None, as_df=True):
        super(_BaseSelectiveDecomposer, self).__init__(cols=cols, as_df=as_df)
        self.n_components = n_components

    @abstractmethod
    def get_decomposition(self):
        """This method needs to be overridden by subclasses.
        It is intended to act as a property to return the specific
        decomposition. For `SelectivePCA`, this will return the `pca_`
        attribute; for `SelectiveTruncatedSVD`, this will return the
        `svd_` attribute.
        """
        raise NotImplementedError('this should be implemented by a subclass')
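
    # A minimal sketch (illustrative only, not part of the API) of the
    # contract a subclass fulfills; the real implementations below return
    # the ``pca_`` and ``svd_`` attributes respectively:
    #
    #   class MyDecomposer(_BaseSelectiveDecomposer):
    #       def fit(self, X, y=None):
    #           self.decomp_ = SomeSklearnDecomposition().fit(X)  # hypothetical
    #           return self
    #
    #       def get_decomposition(self):
    #           return self.decomp_ if hasattr(self, 'decomp_') else None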

    def inverse_transform(self, X):
        """Given a transformed dataframe, inverse the transformation.

        Parameters
        ----------

        X : pd.DataFrame
            The transformed dataframe

        Returns
        -------

        Xi : array_like, shape=(n_samples, n_features)
            The inverse-transformed data. Note that the underlying
            decomposition's ``inverse_transform`` returns a NumPy
            ``ndarray``, not a ``DataFrame``.
        """
        X, _ = validate_is_pd(X, None)
        Xi = self.get_decomposition().inverse_transform(X)
        return Xi
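
    # A hedged usage sketch for ``inverse_transform`` (assumes skutil's
    # ``load_iris_df`` helper, as used in the Examples sections below):
    # the inverse maps the component space back to the original feature
    # space, though with n_components < n_features the reconstruction
    # is lossy.
    #
    #   >>> from skutil.decomposition import SelectivePCA
    #   >>> from skutil.utils import load_iris_df
    #   >>>
    #   >>> X = load_iris_df(include_tgt=False)
    #   >>> pca = SelectivePCA(n_components=2).fit(X)
    #   >>> Xi = pca.inverse_transform(pca.transform(X))  # ndarray, shape=(n_samples, n_features)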


class SelectivePCA(_BaseSelectiveDecomposer):
    """A class that will apply PCA only to a select group
    of columns. Useful for data that may contain a mix of columns
    that we do and don't want to decompose.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    n_components : int, float, None or string, optional (default=None)
        The number of components to keep, per sklearn:

        * if n_components is not set, all components are kept:
          ``n_components == min(n_samples, n_features)``

        * if ``n_components == 'mle'`` and ``svd_solver == 'full'``,
          Minka's MLE is used to guess the dimension.

        * if ``0 < n_components < 1`` and ``svd_solver == 'full'``, select
          the number of components such that the amount of variance that
          needs to be explained is greater than the percentage specified
          by ``n_components``.

        * ``n_components`` cannot be equal to ``n_features`` for
          ``svd_solver == 'arpack'``.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    whiten : bool, optional (default=False)
        When True (False by default) the ``components_`` vectors are
        multiplied by the square root of n_samples and then divided by the
        singular values to ensure uncorrelated outputs with unit
        component-wise variances. Whitening will remove some information
        from the transformed signal (the relative variance scales of the
        components) but can sometimes improve the predictive accuracy of
        downstream estimators by making their data respect some hard-wired
        assumptions.

    weight : bool, optional (default=False)
        When True (False by default) the ``explained_variance_`` vector is
        used to weight the features post-transformation. This is especially
        useful in clustering contexts, where features are all implicitly
        assigned the same importance, even though PCA by nature orders the
        features by importance (i.e., not all components are created
        equally). When True, weighting will subtract the median variance
        from the weighting vector and add one (so as not to down-weight or
        up-weight everything), then multiply the weights across the
        transformed features.

    Examples
    --------

    >>> from skutil.decomposition import SelectivePCA
    >>> from skutil.utils import load_iris_df
    >>>
    >>> X = load_iris_df(include_tgt=False)
    >>> pca = SelectivePCA(n_components=2)
    >>> X_transform = pca.fit_transform(X)  # PCA suffers sign indeterminacy; results will vary
    >>> assert X_transform.shape[1] == 2

    Attributes
    ----------

    pca_ : ``sklearn.decomposition.PCA``
        The fit PCA object
    """

    def __init__(self, cols=None, n_components=None, whiten=False, weight=False, as_df=True):
        super(SelectivePCA, self).__init__(cols=cols, n_components=n_components, as_df=as_df)
        self.whiten = whiten
        self.weight = weight

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit
            on the prescribed ``cols`` (see ``__init__``) or all of them
            if ``cols`` is None. Furthermore, ``X`` will not be altered
            in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # fails thru if names don't exist:
        self.pca_ = PCA(
            n_components=self.n_components,
            whiten=self.whiten).fit(X[cols].as_matrix())

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------

        X : Pandas ``DataFrame``
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'pca_')
        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        other_nms = [nm for nm in X.columns if nm not in cols]
        transform = self.pca_.transform(X[cols].as_matrix())

        # do weighting if necessary
        if self.weight:
            # get the weight vals (copy them so the in-place ops below
            # don't mutate the fit ``explained_variance_ratio_`` attribute)
            weights = self.pca_.explained_variance_ratio_.copy()
            weights -= np.median(weights)
            weights += 1

            # now add to the transformed features
            transform *= weights

        left = pd.DataFrame.from_records(
            data=transform,
            columns=[('PC%i' % (i + 1)) for i in range(transform.shape[1])])

        # concat if needed
        x = pd.concat([left, X[other_nms]], axis=1) if other_nms else left
        return x if self.as_df else x.as_matrix()
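
    # A worked sketch of the ``weight=True`` arithmetic above (illustrative
    # numbers only): given an explained_variance_ratio_ of [0.7, 0.2, 0.1],
    # the median (0.2) is subtracted and 1 is added, yielding weights of
    # [1.5, 1.0, 0.9]. Components explaining more variance than the median
    # are scaled up, those explaining less are scaled down, and the median
    # component is left unchanged:
    #
    #   >>> import numpy as np
    #   >>> weights = np.array([0.7, 0.2, 0.1])
    #   >>> weights -= np.median(weights)
    #   >>> weights += 1
    #   >>> weights
    #   array([ 1.5,  1. ,  0.9])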

    @overrides(_BaseSelectiveDecomposer)
    def get_decomposition(self):
        """Overridden from the
        :class:`skutil.decomposition.decompose._BaseSelectiveDecomposer`
        class, this method returns the internal decomposition class:
        ``sklearn.decomposition.PCA``

        Returns
        -------

        self.pca_ : ``sklearn.decomposition.PCA``
            The fit internal decomposition class
        """
        return self.pca_ if hasattr(self, 'pca_') else None

    def score(self, X, y=None):
        """Return the average log-likelihood of all samples.
        This calls sklearn.decomposition.PCA's score method
        on the specified columns [1].

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The data to score.

        y : None
            Passthrough for pipeline/gridsearch.

        Returns
        -------

        ll : float
            Average log-likelihood of the samples under the fit
            PCA model (``self.pca_``)

        References
        ----------

        .. [1] Bishop, C. "Pattern Recognition and Machine Learning,"
               12.2.1, p. 574. http://www.miketipping.com/papers/met-mppca.pdf
        """
        check_is_fitted(self, 'pca_')
        X, _ = validate_is_pd(X, self.cols)
        cols = X.columns if not self.cols else self.cols

        ll = self.pca_.score(X[cols].as_matrix(), _as_numpy(y))
        return ll
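
# A hedged usage sketch for ``score`` (iris frame as in the Examples
# sections): the returned value is the average log-likelihood of the
# samples under the fit PCA model, so higher is better when comparing
# fits on the same data.
#
#   >>> from skutil.decomposition import SelectivePCA
#   >>> from skutil.utils import load_iris_df
#   >>>
#   >>> X = load_iris_df(include_tgt=False)
#   >>> ll = SelectivePCA(n_components=2).fit(X).score(X)  # a single float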


class SelectiveTruncatedSVD(_BaseSelectiveDecomposer):
    """A class that will apply truncated SVD (LSA) only to a select group
    of columns. Useful for data that contains categorical features
    that have not yet been dummied, or for dummied features we don't want
    decomposed. TruncatedSVD is the equivalent of Latent Semantic
    Analysis, and returns the "concept space" of the decomposed features.

    Parameters
    ----------

    cols : array_like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    n_components : int, optional (default=2)
        Desired dimensionality of output data.
        Must be strictly less than the number of features.
        The default value is useful for visualisation. For LSA, a value of
        100 is recommended.

    algorithm : string, optional (default="randomized")
        SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy
        (scipy.sparse.linalg.svds), or "randomized" for the randomized
        algorithm due to Halko (2009).

    n_iter : int, optional (default=5)
        Number of iterations for the randomized SVD solver. Not used by
        ARPACK. The default is larger than the default in ``randomized_svd``
        to handle sparse matrices that may have a large, slowly decaying
        spectrum.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a NumPy ``ndarray`` instead.
        Since most skutil transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------

    >>> from skutil.decomposition import SelectiveTruncatedSVD
    >>> from skutil.utils import load_iris_df
    >>>
    >>> X = load_iris_df(include_tgt=False)
    >>> svd = SelectiveTruncatedSVD(n_components=2)
    >>> X_transform = svd.fit_transform(X)  # SVD suffers sign indeterminacy; results will vary
    >>> assert X_transform.shape[1] == 2

    Attributes
    ----------

    svd_ : ``sklearn.decomposition.TruncatedSVD``
        The fit TruncatedSVD object
    """

    def __init__(self, cols=None, n_components=2, algorithm='randomized', n_iter=5, as_df=True):
        super(SelectiveTruncatedSVD, self).__init__(cols=cols, n_components=n_components, as_df=as_df)
        self.algorithm = algorithm
        self.n_iter = n_iter

    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only be fit
            on the prescribed ``cols`` (see ``__init__``) or all of them
            if ``cols`` is None. Furthermore, ``X`` will not be altered
            in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # fails thru if names don't exist:
        self.svd_ = TruncatedSVD(
            n_components=self.n_components,
            algorithm=self.algorithm,
            n_iter=self.n_iter).fit(X[cols].as_matrix())

        return self

    def transform(self, X):
        """Transform a test matrix given the already-fit transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform. The operation will
            be applied to a copy of the input data, and the result
            will be returned.

        Returns
        -------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The operation is applied to a copy of ``X``,
            and the result set is returned.
        """
        check_is_fitted(self, 'svd_')
        # check on state of X and cols
        X, _ = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        other_nms = [nm for nm in X.columns if nm not in cols]
        transform = self.svd_.transform(X[cols].as_matrix())
        left = pd.DataFrame.from_records(
            data=transform,
            columns=[('Concept%i' % (i + 1)) for i in range(transform.shape[1])])

        # concat if needed
        x = pd.concat([left, X[other_nms]], axis=1) if other_nms else left
        return x if self.as_df else x.as_matrix()

    @overrides(_BaseSelectiveDecomposer)
    def get_decomposition(self):
        """Overridden from the
        :class:`skutil.decomposition.decompose._BaseSelectiveDecomposer`
        class, this method returns the internal decomposition class:
        ``sklearn.decomposition.TruncatedSVD``

        Returns
        -------

        self.svd_ : ``sklearn.decomposition.TruncatedSVD``
            The fit internal decomposition class
        """
        return self.svd_ if hasattr(self, 'svd_') else None
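
# A hedged end-to-end sketch of the "selective" behavior (the iris column
# names below are assumptions about skutil's ``load_iris_df`` frame): only
# the named ``cols`` are decomposed into the concept space, while any
# non-specified columns pass through untouched on the right of the result.
#
#   >>> from skutil.decomposition import SelectiveTruncatedSVD
#   >>> from skutil.utils import load_iris_df
#   >>>
#   >>> X = load_iris_df(include_tgt=False)
#   >>> svd = SelectiveTruncatedSVD(n_components=2,
#   ...                             cols=['sepal length (cm)',
#   ...                                   'sepal width (cm)',
#   ...                                   'petal length (cm)'])
#   >>> out = svd.fit_transform(X)
#   >>> list(out.columns)  # 'petal width (cm)' is preserved, not decomposed
#   ['Concept1', 'Concept2', 'petal width (cm)']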