# Source code for regressors.plots

# -*- coding: utf-8 -*-

"""This module contains functions for making plots relevant to regressors."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn.apionly as sns
import statsmodels.api as sm
from sklearn import decomposition

from . import _utils
from . import stats


def plot_residuals(clf, X, y, r_type='standardized', figsize=(10, 8)):
    """Scatter a linear model's residuals against its predictions.

    The figure is displayed with ``plt.show()`` before being returned, and
    the seaborn styling applied for the plot is reset afterwards, whether or
    not plotting succeeded.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
        Must be an instance of one of `_utils.supported_linear_models`.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    r_type : str
        Which residuals to plot: 'raw', 'standardized' or 'studentized'.
        Defaults to 'standardized'.

        * 'raw' plots the raw residuals.
        * 'standardized' plots the standardized (internally studentized)
          residuals: the residuals divided by the square root of MSE (or the
          STD of the residuals).
        * 'studentized' plots the externally studentized residuals: the raw
          residuals divided by sqrt(LOO-MSE * (1 - leverage_score)).
    figsize : tuple
        Size of the figure to create, as (x-axis, y-axis). Defaults to
        (10, 8).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.

    Raises
    ------
    AssertionError
        If `clf` is not one of the supported linear model types.
    """
    # Refuse model types that have not been tested with this plot
    assert isinstance(clf, _utils.supported_linear_models), (
        "Classifiers of type {0} not currently supported.".format(type(clf)))
    # y-axis values (residuals of the requested type) and x-axis values
    residual_values = stats.residuals(clf, X, y, r_type)
    fitted = clf.predict(X)
    # y-axis caption for each supported residual type
    axis_captions = {
        'raw': 'Residuals',
        'standardized': 'Standardized Residuals',
        'studentized': 'Studentized Residuals',
    }
    # White grid background with the larger "talk" font sizes
    sns.set_style("whitegrid")
    sns.set_context("talk")
    try:
        fig = plt.figure('residuals', figsize=figsize)
        plt.scatter(fitted, residual_values, s=14, c='gray', alpha=0.7)
        # Dotted zero line spanning the full range of predictions
        plt.hlines(0, fitted.min(), fitted.max(), linestyle='dotted')
        plt.title("Residuals Plot")
        plt.xlabel("Predictions")
        plt.ylabel(axis_captions[r_type])
        plt.show()
    finally:
        # Hand matplotlib's original styling back, even if plotting failed
        sns.reset_orig()
    return fig


def plot_scree(clf_pca, xlim=[-1, 10], ylim=[-0.1, 1.0], required_var=0.90,
               figsize=(10, 5)):
    """Create side-by-side scree plots for analyzing variance of principal
    components from PCA.

    The left plot shows the proportion of variance explained by each
    component; the right plot shows the cumulative proportion. Components are
    numbered from 1 on the x-axis, so the x position of a point on the right
    plot equals the number of components it accumulates. The figure is
    displayed with ``plt.show()`` before being returned, and seaborn styling
    is reset afterwards.

    Parameters
    ----------
    clf_pca : sklearn.decomposition.PCA
        A fitted scikit-learn PCA model.
    xlim : list
        X-axis range. If `required_var` is supplied and reached, the maximum
        x-axis value of the cumulative plot is automatically enlarged so that
        the required variance line is visible. Defaults to [-1, 10].
    ylim : list
        Y-axis range. Defaults to [-0.1, 1.0].
    required_var : float, int, None
        A value of variance to distinguish on the cumulative scree plot. Set
        to None to not include on the plot. If the fitted components never
        reach this value, no marker is drawn and a warning is issued.
        Defaults to 0.90.
    figsize : tuple
        A tuple indicating the size of the plot to be created, with format
        (x-axis, y-axis). Defaults to (10, 5).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.

    Raises
    ------
    AssertionError
        If `clf_pca` is not a `sklearn.decomposition.PCA` instance.
    """
    import warnings

    # Ensure we have a PCA model
    assert isinstance(clf_pca, decomposition.PCA), (
        "Models of type {0} are not supported. Only models of type "
        "sklearn.decomposition.PCA are supported.".format(type(clf_pca)))
    # Extract variances from the model
    variances = clf_pca.explained_variance_ratio_
    cumulative = np.cumsum(variances)  # Cumulative variance explained
    # 1-based component numbers, so that the x position of cumulative[i] is
    # the number of components (i + 1) that it sums over
    component_numbers = np.arange(1, len(variances) + 1)
    # Set style and font scale in ONE call: a separate sns.set(font_scale=...)
    # re-applies its default style ('darkgrid') and would discard a
    # previously selected "whitegrid"
    sns.set(style="whitegrid", font_scale=1.2)
    try:
        fig = plt.figure('scree', figsize=figsize)
        # First plot: variance explained by each individual component
        plt.subplot(1, 2, 1)
        plt.xlabel("Component Number")
        plt.ylabel("Proportion of Variance Explained")
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.plot(component_numbers, variances, marker='o', linestyle='--')
        # Second plot: cumulative variance explained
        plt.subplot(1, 2, 2)
        plt.xlabel("Number of Components")
        plt.ylabel("Proportion of Variance Explained")
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.plot(component_numbers, cumulative, marker='o', linestyle='--')
        # Add marker for required variance line
        if required_var is not None:
            reached = cumulative >= required_var
            if reached.any():
                # argmax returns the first True, i.e. the first component
                # index at which the threshold is met
                n_required = int(np.argmax(reached)) + 1
                # Update xlim if it is too small to see the marker
                if xlim[1] <= n_required:
                    plt.xlim([xlim[0], n_required + 1])
                # Add the marker and legend to the plot
                plt.axvline(x=n_required, c='r', linestyle='dashed',
                            label="> {0:.0f}% Var. Explained: {1} "
                                  "components".format(required_var * 100,
                                                      n_required))
                legend = plt.legend(loc='lower right', frameon=True)
                legend.get_frame().set_facecolor('#FFFFFF')
            else:
                # argmax of an all-False mask is 0, which would wrongly
                # report "1 components"; skip the marker instead
                warnings.warn(
                    "required_var={0} is not reached by the {1} fitted "
                    "components; no marker drawn.".format(
                        required_var, len(variances)))
        plt.show()
    finally:
        sns.reset_orig()  # Always reset back to default matplotlib styles
    return fig


def plot_qq(clf, X, y, figsize=(7, 7)):
    """Draw a Q-Q plot (a.k.a. normal quantile plot) of a model's residuals.

    The raw residuals are wrapped in a statsmodels ``ProbPlot`` built with
    ``scipy.stats.t`` and ``fit=True``, and plotted against a 45-degree
    reference line. The figure is displayed with ``plt.show()`` before being
    returned, and seaborn styling is reset afterwards.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
        Must be an instance of one of `_utils.supported_linear_models`.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    figsize : tuple
        Size of the figure to create, as (x-axis, y-axis). Defaults to
        (7, 7).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.

    Raises
    ------
    AssertionError
        If `clf` is not one of the supported linear model types.
    """
    # Refuse model types that have not been tested with this plot
    assert isinstance(clf, _utils.supported_linear_models), (
        "Classifiers of type {0} not currently supported.".format(type(clf)))
    raw_resids = stats.residuals(clf, X, y, r_type='raw')
    quantile_plot = sm.ProbPlot(raw_resids, scipy.stats.t, fit=True)
    # Dark grid background with slightly enlarged fonts
    sns.set_style("darkgrid")
    sns.set(font_scale=1.2)
    try:
        # qqplot() ignores figure size on its own, so create the sized
        # figure/axes first and draw into them
        fig, axis = plt.subplots(figsize=figsize)
        quantile_plot.qqplot(line='45', ax=axis)
        plt.title("Normal Quantile Plot")
        plt.xlabel("Theoretical Standardized Residuals")
        plt.ylabel("Actual Standardized Residuals")
        plt.show()
    finally:
        # Hand matplotlib's original styling back, even if plotting failed
        sns.reset_orig()
    return fig


def plot_pca_pairs(clf_pca, x_train, y=None, n_components=3, diag='kde',
                   cmap=None, figsize=(10, 10)):
    """
    Create pairwise plots of principal components from x data.

    Colors the points according to the `y` values when `y` is given;
    otherwise a single fixed color is used. The figure is displayed with
    ``plt.show()`` before being returned, and seaborn styling is reset
    afterwards.

    Parameters
    ----------
    clf_pca : sklearn.decomposition.PCA
        A fitted scikit-learn PCA model.
    x_train : numpy.ndarray
        Training data used to fit `clf_pca`, either scaled or un-scaled,
        depending on how `clf_pca` was fit.
    y : numpy.ndarray
        Target training values, of shape = [n_samples]. Optional.
    n_components: int
        Desired number of principal components to plot. Defaults to 3.
    diag : str
        Type of plot to display on the diagonals. Default is 'kde'.

        * 'kde': density curves
        * 'hist': histograms

    cmap : str
        Name of the color map to use; it is installed as the ``image.cmap``
        rc parameter while plotting. Defaults to "Greys". See available
        maps: https://seaborn.pydata.org/tutorial/color_palettes.html
    figsize : tuple
        A tuple indicating the size of the plot to be created, with format
        (x-axis, y-axis). Defaults to (10, 10).

    Returns
    -------
    matplotlib.figure.Figure
        The Figure instance.

    Raises
    ------
    AssertionError
        If `y` is given and its first dimension differs from that of
        `x_train`.
    """
    if y is not None:
        assert y.shape[0] == x_train.shape[0], (
            "Dimensions of y {0} do not match dimensions of x_train {1}".format(
                y.shape[0], x_train.shape[0]))
    # scatter_matrix moved to pandas.plotting in pandas 0.20 and the old
    # pandas.tools.plotting module was removed later; support both.
    try:
        from pandas.plotting import scatter_matrix
    except ImportError:
        from pandas.tools.plotting import scatter_matrix
    # Obtain the projections of x_train
    x_projection = clf_pca.transform(x_train)
    # Create a data frame to hold the projections of n_components PCs
    col_names = ["PC{0}".format(i + 1) for i in range(n_components)]
    df = pd.DataFrame(x_projection[:, 0:n_components], columns=col_names)
    # Generate the plot
    cmap = "Greys" if cmap is None else cmap
    color = "#55A969" if y is None else y
    sns.set_style("white", {"axes.linewidth": "0.8", "image.cmap": cmap})
    sns.set_context("notebook")
    try:
        # Create figure instance with subplot and populate the subplot with
        # the scatter matrix. You need to do this so you can access the figure
        # properties later to increase distance between subplots. If you don't,
        # Pandas will create its own figure with a tight layout.
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(1, 1, 1)
        axes = scatter_matrix(df, ax=ax, alpha=0.7, figsize=figsize,
                              diagonal=diag, marker='o', c=color,
                              density_kwds={'c': '#6283B9'},
                              hist_kwds={'facecolor': '#5A76A4',
                                         'edgecolor': '#3D3D3D'})
        # Increase space between subplots
        fig.subplots_adjust(hspace=0.1, wspace=0.1)
        # Remove the top and right spines from every subplot in the grid
        for subplot_ax in np.ravel(axes):
            subplot_ax.spines['top'].set_visible(False)
            subplot_ax.spines['right'].set_visible(False)
        plt.show()
    finally:
        # Single reset point: runs on success and on failure alike
        sns.reset_orig()
    return fig