Source code for regressors.regressors

# -*- coding: utf-8 -*-

"""This module contains core classes for regression models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
from sklearn import decomposition
from sklearn import linear_model as lm
from sklearn import metrics
from sklearn import preprocessing

from . import stats


def pcr_beta_coef(clf_regress, clf_pca):
    """Map principal components regression coefficients back to real-space.

    The regression is fitted on PCA-transformed data, so its coefficients
    live in PCA-space. Taking the dot product of those coefficients with the
    PCA components expresses them in the original feature space instead.

    Parameters
    ----------
    clf_regress : sklearn.linear_model
        A fitted scikit-learn linear model exposing ``coef_``.
    clf_pca : sklearn.decomposition.PCA
        A fitted scikit-learn PCA model exposing ``components_``.

    Returns
    -------
    np.ndarray
        The real-space beta coefficients from principal components
        regression.

    Raises
    ------
    AssertionError
        If `clf_pca` is not an instance of ``sklearn.decomposition.PCA``.
    """
    # Only PCA decompositions have been tested with this calculation
    pca_is_supported = isinstance(clf_pca, decomposition.PCA)
    assert pca_is_supported, (
        "Classifiers of type {0} are not supported. "
        "Please use class sklearn.decomposition.PCA.").format(type(clf_pca))

    pca_space_coef = clf_regress.coef_
    loadings = clf_pca.components_
    return np.dot(pca_space_coef, loadings)


class PCR(object):
    """Principal components regression model.

    This model solves a regression model after standard scaling the X data and
    performing PCA to reduce the dimensionality of X. This class simply creates
    a pipeline that utilizes:

        1. sklearn.preprocessing.StandardScaler
        2. sklearn.decomposition.PCA
        3. a supported sklearn.linear_model

    Attributes of the class mimic what is provided by scikit-learn's PCA and
    linear model classes. Additional attributes specifically relevant to PCR
    are also provided, such as :py:attr:`.PCR.beta_coef_`.

    Parameters
    ----------
    n_components : int, float, None, str
        Number of components to keep when performing PCA. If n_components is
        not set all components are kept::

            n_components == min(n_samples, n_features)

        If n_components == 'mle', Minka's MLE is used to guess the dimension.
        If ``0 < n_components < 1``, selects the number of components such
        that the amount of variance that needs to be explained is greater
        than the percentage specified by n_components.
    regression_type : str
        The type of regression classifier to use. Must be one of 'ols',
        'lasso', 'ridge', or 'elasticnet'.
    n_jobs : int (optional)
        The number of jobs to use for the computation. If ``n_jobs=-1``, all
        CPUs are used. This will only increase speed of computation for
        n_targets > 1 and sufficiently large problems. Only used when
        regression_type is 'ols'.
    alpha : float (optional)
        Used when regression_type is 'lasso', 'ridge', or 'elasticnet'.
        Represents the constant that multiplies the penalty terms. Setting
        ``alpha=0`` is equivalent to ordinary least square and it is advised
        in that case to instead use ``regression_type='ols'``. See the
        scikit-learn documentation for the chosen regression model for more
        information in this parameter.
    l1_ratio : float (optional)
        Used when regression_type is 'elasticnet'. The ElasticNet mixing
        parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the
        penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty.
        For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.

    Attributes
    ----------
    scaler : sklearn.preprocessing.StandardScaler, None
        The StandardScaler object used to center the X data and scale to unit
        variance. Must have ``fit_transform()`` and ``transform()`` methods.
        Can be overridden prior to fitting to use a different scaler::

            pcr = PCR()
            # Change StandardScaler options
            pcr.scaler = StandardScaler(with_mean=False, with_std=True)
            pcr.fit(X, y)

        The scaler can also be removed prior to fitting (to not scale X
        during fitting or predictions) with `pcr.scaler = None`.
    prcomp : sklearn.decomposition.PCA
        The PCA object used to perform PCA. This can also be accessed in the
        same way as the scaler.
    regression : sklearn.linear_model
        The linear model object used to perform regression. Must have
        ``fit()`` and ``predict()`` methods. This defaults to OLS using
        scikit-learn's LinearRegression classifier, but can be overridden
        either using the `regression_type` parameter when instantiating the
        class, or by replacing the regression model with a different one
        prior to fitting::

            pcr = PCR(regression_type='ols')
            # Examine the current regression model
            print(pcr.regression)
            LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1,
                             normalize=False)
            # Use Lasso regression with cross-validation instead of OLS
            pcr.regression = linear_model.LassoCV(n_alphas=200)
            print(pcr.regression)
            LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001,
                    fit_intercept=True, max_iter=1000, n_alphas=200, n_jobs=1,
                    normalize=False, positive=False, precompute='auto',
                    random_state=None, selection='cyclic', tol=0.0001,
                    verbose=False)
            pcr.fit(X, y)
    """

    def __init__(self, n_components=None, regression_type='ols',
                 alpha=1.0, l1_ratio=0.5, n_jobs=1):
        """Build the scaler, PCA and regression models of the pipeline.

        Raises
        ------
        KeyError
            If `regression_type` is not one of 'ols', 'lasso', 'ridge', or
            'elasticnet'.
        """
        # Store class parameters
        self.n_components = n_components
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.n_jobs = n_jobs

        # Create scaler and PCA models. n_components must be forwarded to
        # PCA, otherwise every component is always kept.
        self.scaler = preprocessing.StandardScaler()
        self.prcomp = decomposition.PCA(n_components=n_components)

        # Create regression classifier. Each factory forwards only the
        # hyper-parameters that its estimator understands; construction is
        # deferred so that only the requested estimator is instantiated.
        regression_factory = {
            'ols': lambda: lm.LinearRegression(n_jobs=n_jobs),
            'lasso': lambda: lm.Lasso(alpha=alpha),
            'ridge': lambda: lm.Ridge(alpha=alpha),
            'elasticnet': lambda: lm.ElasticNet(alpha=alpha,
                                                l1_ratio=l1_ratio),
        }
        self.regression = regression_factory[regression_type]()

    @property
    def beta_coef_(self):
        r"""
        Returns
        -------
        numpy.ndarray
            Beta coefficients, corresponding to coefficients in the original
            space and dimension of X. These are calculated as
            :math:`B = A \cdot P`, where :math:`A` is a vector of the
            coefficients obtained from regression on the principal components
            and :math:`P` is the matrix of loadings from PCA.
        """
        return pcr_beta_coef(self.regression, self.prcomp)

    @property
    def intercept_(self):
        """
        Returns
        -------
        float
            The intercept for the regression model, both in PCA-space and
            in the original X-space.
        """
        return self.regression.intercept_

    def fit(self, X, y):
        """
        Fit the PCR model.

        Parameters
        ----------
        X : numpy.ndarray
            Training data.
        y : numpy.ndarray
            Target values.

        Returns
        -------
        regression.PCR
            An instance of self.
        """
        # Scaling is optional: the scaler may have been set to None
        if self.scaler is not None:
            x_scaled = self.scaler.fit_transform(X)
        else:
            x_scaled = X
        x_reduced = self.prcomp.fit_transform(x_scaled)
        self.regression.fit(x_reduced, y)
        return self

    def predict(self, X):
        """
        Predict using the PCR model.

        Parameters
        ----------
        X : numpy.ndarray
            Samples to predict values from.

        Returns
        -------
        numpy.ndarray
            Predicted values.
        """
        if self.scaler is not None:
            # Reuse the scaling learned in fit(). Re-fitting the scaler here
            # would standardize X with its own statistics and silently
            # overwrite the fitted scaler, corrupting the predictions.
            x_scaled = self.scaler.transform(X)
        else:
            x_scaled = X
        x_reduced = self.prcomp.transform(x_scaled)
        return self.regression.predict(x_reduced)

    def score(self, X, y):
        """
        Return the coefficient of determination :math:`R^2` of the
        predictions.

        Parameters
        ----------
        X : numpy.ndarray
            Training or tests samples.
        y : numpy.ndarray
            Target values.

        Returns
        -------
        float
            The :math:`R^2` value of the predictions.
        """
        return metrics.r2_score(y, self.predict(X))