# Source code for regressors.plots

# -*- coding: utf-8 -*-

"""This module contains functions for making plots relevant to regressors."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn.apionly as sns
import statsmodels.api as sm
from sklearn import decomposition

from . import _utils
from . import stats


def plot_residuals(clf, X, y, r_type='standardized', figsize=(10, 8)):
    """Draw a residuals-versus-predictions scatter plot for a linear model.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    r_type : str
        Which residuals to plot: 'raw', 'standardized' or 'studentized'.
        Defaults to 'standardized'.

        * 'raw': the raw residuals.
        * 'standardized': the standardized (internally studentized)
          residuals, i.e. the residuals divided by the square root of MSE
          (or the STD of the residuals).
        * 'studentized': the externally studentized residuals, i.e. the raw
          residuals divided by sqrt(LOO-MSE * (1 - leverage_score)).
    figsize : tuple
        Size of the plot to be created, with format (x-axis, y-axis).
        Defaults to (10, 8).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.
    """
    # Refuse model types that have not been tested with this plot.
    assert isinstance(clf, _utils.supported_linear_models), (
        "Classifiers of type {0} not currently supported.".format(type(clf)))

    # Residuals go on the y-axis, model predictions on the x-axis.
    residual_values = stats.residuals(clf, X, y, r_type)
    fitted = clf.predict(X)

    # Y-axis caption for each supported residual type.
    axis_captions = {
        'raw': 'Residuals',
        'standardized': 'Standardized Residuals',
        'studentized': 'Studentized Residuals',
    }

    # Temporary seaborn styling; "talk" context enlarges the fonts.
    sns.set_style("whitegrid")
    sns.set_context("talk")

    try:
        fig = plt.figure('residuals', figsize=figsize)
        plt.scatter(fitted, residual_values, s=14, c='gray', alpha=0.7)
        # Dotted zero line spanning the full range of predictions.
        plt.hlines(y=0, xmin=fitted.min(), xmax=fitted.max(),
                   linestyle='dotted')
        plt.title("Residuals Plot")
        plt.xlabel("Predictions")
        plt.ylabel(axis_captions[r_type])
        plt.show()
    finally:
        # Restore the default matplotlib styles whether or not plotting
        # succeeded; any exception still propagates to the caller.
        sns.reset_orig()

    return fig
def plot_scree(clf_pca, xlim=[-1, 10], ylim=[-0.1, 1.0], required_var=0.90,
               figsize=(10, 5)):
    """Create side-by-side scree plots for analyzing variance of principal
    components from PCA.

    The left subplot shows the proportion of variance explained by each
    component; the right subplot shows the cumulative proportion.

    Parameters
    ----------
    clf_pca : sklearn.decomposition.PCA
        A fitted scikit-learn PCA model.
    xlim : list
        X-axis range. If `required_var` is supplied, the maximum x-axis value
        will automatically be set so that the required variance line is
        visible on the plot. Defaults to [-1, 10]. The value is only read,
        never modified.
    ylim : list
        Y-axis range. Defaults to [-0.1, 1.0]. The value is only read, never
        modified.
    required_var : float, int, None
        A value of variance to distinguish on the scree plot. Set to None to
        not include on the plot. Defaults to 0.90. If the cumulative
        explained variance never reaches this value, no marker is drawn and
        a warning is issued.
    figsize : tuple
        A tuple indicating the size of the plot to be created, with format
        (x-axis, y-axis). Defaults to (10, 5).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.
    """
    import warnings

    # Ensure we have a PCA model
    assert isinstance(clf_pca, decomposition.PCA), (
        "Models of type {0} are not supported. Only models of type "
        "sklearn.decomposition.PCA are supported.".format(type(clf_pca)))

    # Extract variances from the model
    variances = clf_pca.explained_variance_ratio_
    cumsum = np.cumsum(variances)  # Cumulative sum of variances explained

    # Set plot style and scale up font size in a single call: a separate
    # sns.set(font_scale=...) would re-apply seaborn's default "darkgrid"
    # style and silently discard an earlier set_style("whitegrid").
    sns.set(style="whitegrid", font_scale=1.2)

    try:
        fig = plt.figure('scree', figsize=figsize)

        # First plot: per-component variance
        plt.subplot(1, 2, 1)
        plt.xlabel("Component Number")
        plt.ylabel("Proportion of Variance Explained")
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.plot(variances, marker='o', linestyle='--')

        # Second plot: cumulative variance
        plt.subplot(1, 2, 2)
        plt.xlabel("Number of Components")
        plt.ylabel("Proportion of Variance Explained")
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.plot(cumsum, marker='o', linestyle='--')

        # Add marker for required variance line
        if required_var is not None:
            reached = cumsum >= required_var
            if reached.any():
                # argmax on a boolean array gives the first True position;
                # +1 converts the 0-based index into a component count.
                required_var_components = int(np.argmax(reached)) + 1

                # Update xlim if it is too small to see the marker
                if xlim[1] <= required_var_components:
                    plt.xlim([xlim[0], required_var_components + 1])

                # Add the marker and legend to the plot
                plt.axvline(x=required_var_components, c='r',
                            linestyle='dashed',
                            label="> {0:.0f}% Var. Explained: {1} "
                                  "components".format(
                                      required_var * 100,
                                      required_var_components))
                legend = plt.legend(loc='lower right', frameon=True)
                legend.get_frame().set_facecolor('#FFFFFF')
            else:
                # argmax of an all-False array is 0, which would wrongly
                # report that a single component suffices; skip the marker.
                warnings.warn(
                    "The cumulative explained variance never reaches "
                    "required_var={0}; no marker drawn.".format(required_var))

        plt.show()
    finally:
        # Always reset back to default matplotlib styles; exceptions still
        # propagate to the caller.
        sns.reset_orig()

    return fig
def plot_qq(clf, X, y, figsize=(7, 7)):
    """Generate a Q-Q plot (a.k.a. normal quantile plot).

    The raw residuals of `clf` on (`X`, `y`) are compared against a fitted
    `scipy.stats.t` distribution, with a 45-degree reference line.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    figsize : tuple
        Size of the plot to be created, with format (x-axis, y-axis).
        Defaults to (7, 7).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.
    """
    # Refuse model types that have not been tested with this plot.
    assert isinstance(clf, _utils.supported_linear_models), (
        "Classifiers of type {0} not currently supported.".format(type(clf)))

    raw_resids = stats.residuals(clf, X, y, r_type='raw')
    quantile_plot = sm.ProbPlot(raw_resids, dist=scipy.stats.t, fit=True)

    # Temporary seaborn styling with enlarged fonts.
    sns.set_style("darkgrid")
    sns.set(font_scale=1.2)

    try:
        # qqplot ignores figure size on its own, so hand it axes from a
        # figure that was already created at the requested size.
        fig, axes = plt.subplots(figsize=figsize)
        quantile_plot.qqplot(line='45', ax=axes)
        plt.title("Normal Quantile Plot")
        plt.xlabel("Theoretical Standardized Residuals")
        plt.ylabel("Actual Standardized Residuals")
        plt.show()
    finally:
        # Restore the default matplotlib styles whether or not plotting
        # succeeded; any exception still propagates to the caller.
        sns.reset_orig()

    return fig
def plot_pca_pairs(clf_pca, x_train, y=None, n_components=3, diag='kde',
                   cmap=None, figsize=(10, 10)):
    """
    Create pairwise plots of principal components from x data. Colors the
    components according to the `y` values.

    Parameters
    ----------
    clf_pca : sklearn.decomposition.PCA
        A fitted scikit-learn PCA model.
    x_train : numpy.ndarray
        Training data used to fit `clf_pca`, either scaled or un-scaled,
        depending on how `clf_pca` was fit.
    y : numpy.ndarray
        Target training values, of shape = [n_samples]. If None, all points
        are drawn in a single fixed color.
    n_components: int
        Desired number of principal components to plot. Defaults to 3.
    diag : str
        Type of plot to display on the diagonals. Default is 'kde'.

        * 'kde': density curves
        * 'hist': histograms
    cmap : str
        A string representation of a Seaborn color map. Defaults to "Greys"
        when None. See available maps:
        https://stanford.edu/~mwaskom/software/seaborn/tutorial/color_palettes.
    figsize : tuple
        A tuple indicating the size of the plot to be created, with format
        (x-axis, y-axis). Defaults to (10, 10).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.

    Raises
    ------
    ValueError
        If `n_components` exceeds the number of components produced by
        `clf_pca.transform(x_train)`.
    """
    # scatter_matrix moved from pandas.tools.plotting to pandas.plotting in
    # pandas 0.20; try the current location first, then the legacy one.
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        from pandas.tools.plotting import scatter_matrix

    if y is not None:
        assert y.shape[0] == x_train.shape[0], (
            "Dimensions of y {0} do not match dimensions of x_train {1}".format(
                y.shape[0], x_train.shape[0]))

    # Obtain the projections of x_train
    x_projection = clf_pca.transform(x_train)

    # Fail early with a clear message instead of an opaque pandas shape
    # mismatch when more components are requested than are available.
    available = x_projection.shape[1]
    if n_components > available:
        raise ValueError(
            "n_components ({0}) exceeds the number of components available "
            "from clf_pca ({1}).".format(n_components, available))

    # Create a data frame to hold the projections of n_components PCs
    col_names = ["PC{0}".format(i + 1) for i in range(n_components)]
    df = pd.DataFrame(x_projection[:, 0:n_components], columns=col_names)

    # Generate the plot
    cmap = "Greys" if cmap is None else cmap
    color = "#55A969" if y is None else y
    sns.set_style("white", {"axes.linewidth": "0.8", "image.cmap": cmap})
    sns.set_context("notebook")

    try:
        # Create figure instance with subplot and populate the subplot with
        # the scatter matrix. You need to do this so you can access the figure
        # properties later to increase distance between subplots. If you
        # don't, Pandas will create its own figure with a tight layout.
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(1, 1, 1)
        axes = scatter_matrix(df, ax=ax, alpha=0.7, figsize=figsize,
                              diagonal=diag, marker='o', c=color,
                              density_kwds={'c': '#6283B9'},
                              hist_kwds={'facecolor': '#5A76A4',
                                         'edgecolor': '#3D3D3D'})

        # Increase space between subplots
        fig.subplots_adjust(hspace=0.1, wspace=0.1)

        # Remove top and right spines from every subplot
        for sub_ax in np.ravel(axes):
            sub_ax.spines['top'].set_visible(False)
            sub_ax.spines['right'].set_visible(False)

        plt.show()
    finally:
        # Reset to default matplotlib styles exactly once, on success or
        # failure; exceptions still propagate to the caller.
        sns.reset_orig()

    return fig