# Source code for regressors.regressors

# -*- coding: utf-8 -*-

"""This module contains core classes for regression models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
from sklearn import decomposition
from sklearn import linear_model as lm
from sklearn import metrics
from sklearn import preprocessing

from . import stats


def pcr_beta_coef(clf_regress, clf_pca):
    """Calculate the beta coefficients in real-space (instead of PCA-space)
    from principal components regression.

    The coefficients are obtained as the dot product of the regression
    coefficients (fitted on the principal components) and the PCA component
    matrix.

    Parameters
    ----------
    clf_regress : sklearn.linear_model
        A fitted scikit-learn linear model exposing ``coef_``.
    clf_pca : sklearn.decomposition.PCA
        A fitted scikit-learn PCA model exposing ``components_``.

    Returns
    -------
    np.ndarray
        An array of the real-space beta coefficients from principal components
        regression.

    Raises
    ------
    AssertionError
        If ``clf_pca`` is not an instance of ``sklearn.decomposition.PCA``.
    """
    # Ensure we only calculate coefficients using classifiers we have tested.
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with ``-O``; the exception type is kept as
    # AssertionError for backward compatibility with existing callers.
    if not isinstance(clf_pca, decomposition.PCA):
        raise AssertionError(
            "Classifiers of type {0} are not supported. "
            "Please use class sklearn.decomposition.PCA.".format(type(clf_pca)))
    return np.dot(clf_regress.coef_, clf_pca.components_)


class PCR(object):
    """Principal components regression model.

    This model solves a regression model after standard scaling the X
    data and performing PCA to reduce the dimensionality of X. This class
    simply creates a pipeline that utilizes:

        1. sklearn.preprocessing.StandardScaler
        2. sklearn.decomposition.PCA
        3. a supported sklearn.linear_model

    Attributes of the class mimic what is provided by scikit-learn's PCA and
    linear model classes. Additional attributes specifically relevant to PCR
    are also provided, such as :py:attr:`.PCR.beta_coef_`.

    Parameters
    ----------
    n_components : int, float, None, str
        Number of components to keep when performing PCA. If n_components
        is not set all components are kept::

            n_components == min(n_samples, n_features)

        If n_components == 'mle', Minka's MLE is used to guess the
        dimension. If ``0 < n_components < 1``, selects the number of
        components such that the amount of variance that needs to be
        explained is greater than the percentage specified by n_components.
        This value is forwarded to ``sklearn.decomposition.PCA``.
    regression_type : str
        The type of regression classifier to use. Must be one of 'ols',
        'lasso', 'ridge', or 'elasticnet'. Any other value raises a
        ``KeyError``.
    n_jobs : int (optional)
        The number of jobs to use for the computation. If ``n_jobs=-1``, all
        CPUs are used. This will only increase speed of computation for
        n_targets > 1 and sufficiently large problems. Only forwarded to the
        regression model when regression_type is 'ols'.
    alpha : float (optional)
        Used when regression_type is 'lasso', 'ridge', or 'elasticnet'.
        Represents the constant that multiplies the penalty terms. Setting
        ``alpha=0`` is equivalent to ordinary least square and it is advised
        in that case to instead use ``regression_type='ols'``. See the
        scikit-learn documentation for the chosen regression model for more
        information in this parameter.
    l1_ratio : float (optional)
        Used when regression_type is 'elasticnet'. The ElasticNet mixing
        parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the
        penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty.
        For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.

    Attributes
    ----------
    scaler : sklearn.preprocessing.StandardScaler, None
        The StandardScaler object used to center the X data and scale to unit
        variance. Must have ``fit_transform()`` and ``transform()`` methods.
        Can be overridden prior to fitting to use a different scaler::

            pcr = PCR()
            # Change StandardScaler options
            pcr.scaler = StandardScaler(with_mean=False, with_std=True)
            pcr.fit(X, y)

        The scaler can also be removed prior to fitting (to not scale X during
        fitting or predictions) with `pcr.scaler = None`.
    prcomp : sklearn.decomposition.PCA
        The PCA object used to perform PCA. This can also be accessed in the
        same way as the scaler.
    regression : sklearn.linear_model
        The linear model object used to perform regression. Must have ``fit()``
        and ``predict()`` methods. This defaults to OLS using scikit-learn's
        LinearRegression classifier, but can be overridden either using the
        `regression_type` parameter when instantiating the class, or
        by replacing the regression model with a different one prior to
        fitting::

            pcr = PCR(regression_type='ols')
            # Examine the current regression model
            print(pcr.regression)
            LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
                normalize=False)
            # Use Lasso regression with cross-validation instead of OLS
            pcr.regression = linear_model.LassoCV(n_alphas=200)
            print(pcr.regression)
            LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                fit_intercept=True, max_iter=1000, n_alphas=200, n_jobs=1,
                normalize=False, positive=False, precompute='auto',
                random_state=None, selection='cyclic', tol=0.0001,
                verbose=False)
            pcr.fit(X, y)
    """

    def __init__(self, n_components=None, regression_type='ols',
                 alpha=1.0, l1_ratio=0.5, n_jobs=1):
        # Store class parameters
        self.n_components = n_components
        self.regression_type = regression_type
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.n_jobs = n_jobs
        # Create scaler and PCA models; n_components must reach the PCA
        # object, otherwise the documented parameter has no effect.
        self.scaler = preprocessing.StandardScaler()
        self.prcomp = decomposition.PCA(n_components=n_components)
        # Create regression classifier. Each factory forwards only the
        # options its model understands. An unsupported regression_type
        # raises KeyError from the lookup below.
        regression_factory = {
            'ols': lambda: lm.LinearRegression(n_jobs=n_jobs),
            'lasso': lambda: lm.Lasso(alpha=alpha),
            'ridge': lambda: lm.Ridge(alpha=alpha),
            'elasticnet': lambda: lm.ElasticNet(alpha=alpha,
                                                l1_ratio=l1_ratio),
        }
        self.regression = regression_factory[regression_type]()

    @property
    def beta_coef_(self):
        r"""
        Returns
        -------
        numpy.ndarray
            Beta coefficients, corresponding to coefficients in the space and
            dimension of the X data passed to PCA (i.e. X after scaling when
            a scaler is set). These are calculated as
            :math:`B = A \cdot P`, where :math:`A` is a vector of the
            coefficients obtained from regression on the principal components
            and :math:`P` is the matrix of loadings from PCA.
        """
        return pcr_beta_coef(self.regression, self.prcomp)

    @property
    def intercept_(self):
        """
        Returns
        -------
        float
            The intercept of the fitted regression model, taken directly
            from ``self.regression.intercept_``.
        """
        return self.regression.intercept_

    def fit(self, X, y):
        """
        Fit the PCR model.

        The scaler (when set) and the PCA model are fitted on ``X``; the
        regression model is then fitted on the resulting principal
        components.

        Parameters
        ----------
        X : numpy.ndarray
            Training data.
        y : numpy.ndarray
            Target values.

        Returns
        -------
        regression.PCR
            An instance of self.
        """
        if self.scaler is not None:
            x_scaled = self.scaler.fit_transform(X)
        else:
            x_scaled = X
        x_reduced = self.prcomp.fit_transform(x_scaled)
        self.regression.fit(x_reduced, y)
        return self

    def predict(self, X):
        """
        Predict using the PCR model.

        Parameters
        ----------
        X : numpy.ndarray
            Samples to predict values from.

        Returns
        -------
        numpy.ndarray
            Predicted values.
        """
        if self.scaler is not None:
            # Only transform here: the scaling learned in fit() must be
            # reused. Re-fitting on the prediction samples would scale them
            # with their own statistics and silently change the model.
            x_scaled = self.scaler.transform(X)
        else:
            x_scaled = X
        x_reduced = self.prcomp.transform(x_scaled)
        return self.regression.predict(x_reduced)

    def score(self, X, y):
        """
        Returns the coefficient of determination of :math:`R^2` of the
        predictions.

        Parameters
        ----------
        X : numpy.ndarray
            Training or tests samples.
        y : numpy.ndarray
            Target values.

        Returns
        -------
        float
            The :math:`R^2` value of the predictions.
        """
        return metrics.r2_score(y, self.predict(X))