# Source code for yellowbrick.regressor.residuals

```
# yellowbrick.regressor.residuals
# Regressor visualizers that score residuals: prediction vs. actual data.
#
# Author: Rebecca Bilbro
# Author: Benjamin Bengfort
# Created: Fri Jun 03 10:30:36 2016 -0700
#
# Copyright (C) 2016 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: residuals.py [7d3f5e6] benjamin@bengfort.com $
"""
Regressor visualizers that score residuals: prediction vs. actual data.
"""
##########################################################################
## Imports
##########################################################################
import matplotlib.pyplot as plt
try:
# Only available in Matplotlib >= 2.0.2
from mpl_toolkits.axes_grid1 import make_axes_locatable
except ImportError:
make_axes_locatable = None
from yellowbrick.draw import manual_legend
from yellowbrick.utils.decorators import memoized
from yellowbrick.style.palettes import LINE_COLOR
from yellowbrick.exceptions import YellowbrickValueError
from yellowbrick.regressor.base import RegressionScoreVisualizer
from yellowbrick.bestfit import draw_best_fit, draw_identity_line
## Packages for export
__all__ = ["PredictionError", "prediction_error", "ResidualsPlot", "residuals_plot"]
##########################################################################
## Prediction Error Plots
##########################################################################
class PredictionError(RegressionScoreVisualizer):
    """
    Scatter the model's predictions against the actual target values.

    Actual targets are placed on the horizontal axis and predicted values on
    the vertical axis. The plot is used to detect noise or heteroscedasticity
    along a range of the target domain.

    Parameters
    ----------
    model : a Scikit-Learn regressor
        Should be an instance of a regressor, otherwise will raise a
        YellowbrickTypeError exception on instantiation.
        If the estimator is not fitted, it is fit when the visualizer is
        fitted, unless otherwise specified by ``is_fitted``.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    shared_limits : bool, default: True
        If True, ``finalize()`` gives the X and Y axes identical limits and
        an equal aspect ratio, producing a square graphic with a true 45
        degree line. This makes under- or over-prediction easier to
        diagnose, though the figure becomes more sparse. Set to False to
        localize points, keeping in mind that this distorts the figure.

    bestfit : bool, default: True
        Draw a linear best fit line between the measured and predicted
        values. The line uses the color given by the ``line_color`` keyword.

    identity : bool, default: True
        Draw the 45 degree identity line, y=x, to help judge whether the
        model over- or under-estimates. It is drawn with the ``line_color``
        color at an alpha of 0.5.

    alpha : float, default: 0.75
        Transparency of the scatter points, where 1 is completely opaque and
        0 is completely transparent.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers. The optional
        ``point_color`` and ``line_color`` keys are also read here.

    Attributes
    ----------
    score_ : float
        The R^2 score that specifies the goodness of fit of the underlying
        regression model to the test data.

    Examples
    --------
    >>> from yellowbrick.regressor import PredictionError
    >>> from sklearn.linear_model import Lasso
    >>> model = PredictionError(Lasso())
    >>> model.fit(X_train, y_train)
    >>> model.score(X_test, y_test)
    >>> model.show()

    Notes
    -----
    PredictionError is a ScoreVisualizer, meaning that it wraps a model and
    its primary entry point is the `score()` method.
    """

    def __init__(
        self,
        model,
        ax=None,
        shared_limits=True,
        bestfit=True,
        identity=True,
        alpha=0.75,
        is_fitted="auto",
        **kwargs
    ):
        # Record the fitted-check policy before delegating to the base class
        self.is_fitted = is_fitted

        # Base initialization receives every extra keyword argument
        super(PredictionError, self).__init__(model, ax=ax, **kwargs)

        # Color overrides; kwargs was already forwarded above, so these
        # lookups only read the values for local use.
        point_color = kwargs.pop("point_color", None)
        line_color = kwargs.pop("line_color", LINE_COLOR)
        self.colors = {"point": point_color, "line": line_color}

        # Options consumed by draw() and finalize()
        self.shared_limits = shared_limits
        self.bestfit = bestfit
        self.identity = identity
        self.alpha = alpha

    def score(self, X, y, **kwargs):
        """
        Score the estimator on the given data and draw the prediction error.

        Predictions are generated for ``X`` and plotted against ``y`` via
        ``draw()``.

        Parameters
        ----------
        X : array-like
            X (also X_test) holds the features of the split to predict on.

        y : array-like
            y (also y_test) holds the actual target values to score against.

        Returns
        -------
        score : float
            The value of ``score_`` after the parent ``score()`` has run.
        """
        # The parent implementation is responsible for setting score_
        super(PredictionError, self).score(X, y, **kwargs)

        self.draw(y, self.predict(X))
        return self.score_

    def draw(self, y, y_pred):
        """
        Scatter the predicted values against the actual values.

        Optionally adds a linear best fit line and resets the axes limits to
        the data range padded by one unit on each side.

        Parameters
        ----------
        y : ndarray or Series of length n
            An array or series of target or class values

        y_pred : ndarray or Series of length n
            An array or series of predicted target values

        Returns
        -------
        ax : matplotlib Axes
            The axis with the plotted figure
        """
        r2_label = "$R^2 = {:0.3f}$".format(self.score_)
        self.ax.scatter(
            y, y_pred, c=self.colors["point"], alpha=self.alpha, label=r2_label
        )

        # TODO If score happens inside a loop, draw gets called multiple times.
        # Ideally we'd want the best fit line to be drawn only once
        if self.bestfit:
            fit_style = {
                "ls": "--",
                "lw": 2,
                "c": self.colors["line"],
                "label": "best fit",
            }
            draw_best_fit(y, y_pred, self.ax, "linear", **fit_style)

        # Limits follow the data range; shared_limits is applied in finalize()
        # TODO: do better than add one for really small residuals
        self.ax.set_xlim(y.min() - 1, y.max() + 1)
        self.ax.set_ylim(y_pred.min() - 1, y_pred.max() + 1)

        return self.ax

    def finalize(self, **kwargs):
        """
        Add the title, identity line, axis labels and legend to the plot.

        When ``shared_limits`` is set, both axes are given the same limits
        and an equal aspect ratio before the identity line is drawn.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """
        self.set_title("Prediction Error for {}".format(self.name))

        if self.shared_limits:
            # Smallest common window that contains both current axis ranges
            y_low, y_high = self.ax.get_ylim()
            x_low, x_high = self.ax.get_xlim()
            shared = (min(y_low, x_low), max(y_high, x_high))

            self.ax.set_xlim(shared)
            self.ax.set_ylim(shared)

            # Equal aspect makes the identity line a true 45 degree line
            self.ax.set_aspect("equal", adjustable="box")

        if self.identity:
            draw_identity_line(
                ax=self.ax,
                ls="--",
                lw=2,
                c=self.colors["line"],
                alpha=0.5,
                label="identity",
            )

        self.ax.set_ylabel(r"$\hat{y}$")
        self.ax.set_xlabel(r"$y$")

        # The legend is built from the artist labels rather than with
        # manual_legend: a manual legend would override the best fit and
        # identity entries, at the cost of the R2 marker inheriting the
        # scatter alpha.
        self.ax.legend(loc="best", frameon=True)
def prediction_error(
    model,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    ax=None,
    shared_limits=True,
    bestfit=True,
    identity=True,
    alpha=0.75,
    is_fitted="auto",
    show=True,
    **kwargs
):
    """Quickly plot a prediction error visualizer

    Plot the actual targets from the dataset against the predicted values
    generated by the model. This helper is a quick wrapper around the
    PredictionError visualizer for one-off analysis.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a regressor)
        Should be an instance of a regressor, otherwise will raise a
        YellowbrickTypeError exception on instantiation.
        If the estimator is not fitted, it is fit when the visualizer is
        fitted, unless otherwise specified by ``is_fitted``.

    X_train : ndarray or DataFrame of shape n x m
        A feature array of n instances with m features the model is trained
        on. Used to fit the visualizer and also to score the visualizer if
        test splits are not specified.

    y_train : ndarray or Series of length n
        An array or series of target values. Used to fit the visualizer and
        also to score the visualizer if test splits are not specified.

    X_test : ndarray or DataFrame of shape n x m, default: None
        An optional feature array of n instances with m features that the
        model is scored on if specified.

    y_test : ndarray or Series of length n, default: None
        An optional array or series of target values that serve as actual
        labels for X_test for scoring purposes.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on.

    shared_limits : bool, default: True
        If True, the X and Y axis limits will be identical, creating a
        square graphic with a true 45 degree line. Set to False to localize
        points, keeping in mind that this distorts the figure.

    bestfit : bool, default: True
        Draw a linear best fit line between the measured and predicted
        values. The line uses the color given by the ``line_color`` keyword.

    identity : bool, default: True
        Draw the 45 degree identity line, y=x, to help judge whether the
        model over- or under-estimates the given values.

    alpha : float, default: 0.75
        Transparency of the scatter points, where 1 is completely opaque and
        0 is completely transparent.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    show : bool, default: True
        If True, calls ``show()`` on the visualizer; if False, only calls
        ``finalize()``.

    kwargs : dict
        Keyword arguments that are passed to the visualizer and may
        influence the visualization as defined in other Visualizers.

    Returns
    -------
    viz : PredictionError
        The fitted and scored PredictionError visualizer that created the
        figure.

    Raises
    ------
    YellowbrickValueError
        If only one of ``X_test`` and ``y_test`` is specified.
    """
    # Fail fast: reject a half-specified test split before building the
    # visualizer or fitting the model.
    if (X_test is None) != (y_test is None):
        raise YellowbrickValueError(
            "both X_test and y_test are required if one is specified"
        )

    # Instantiate the visualizer
    visualizer = PredictionError(
        model,
        ax=ax,
        shared_limits=shared_limits,
        bestfit=bestfit,
        identity=identity,
        alpha=alpha,
        is_fitted=is_fitted,
        **kwargs
    )

    visualizer.fit(X_train, y_train)

    # Score on the test split when provided, otherwise on the training data
    if X_test is not None:
        visualizer.score(X_test, y_test)
    else:
        visualizer.score(X_train, y_train)

    if show:
        visualizer.show()
    else:
        visualizer.finalize()

    # Return the visualizer
    return visualizer
##########################################################################
## Residuals Plots
##########################################################################
class ResidualsPlot(RegressionScoreVisualizer):
    """
    A residual plot shows the residuals (predicted minus actual value) on
    the vertical axis and the predicted value on the horizontal axis.

    If the points are randomly dispersed around the horizontal axis, a linear
    regression model is appropriate for the data; otherwise, a non-linear
    model is more appropriate.

    Parameters
    ----------
    model : a Scikit-Learn regressor
        Should be an instance of a regressor, otherwise will raise a
        YellowbrickTypeError exception on instantiation.
        If the estimator is not fitted, it is fit when the visualizer is
        fitted, unless otherwise specified by ``is_fitted``.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    hist : {True, False, None, 'density', 'frequency'}, default: True
        Draw a histogram showing the distribution of the residuals on the
        right side of the figure. Requires Matplotlib >= 2.0.2.
        If set to 'density', the probability density function will be plotted.
        If set to True or 'frequency' then the frequency will be plotted.

    train_color : color, default: 'b'
        Residuals for training data are plotted with this color. Can be any
        matplotlib color. Opacity is controlled by ``train_alpha``.

    test_color : color, default: 'g'
        Residuals for test data are plotted with this color. Can be any
        matplotlib color. Opacity is controlled by ``test_alpha``.

    line_color : color, default: ``LINE_COLOR``
        Defines the color of the zero error line, can be any matplotlib color.

    train_alpha : float, default: 0.75
        Specify a transparency for training data, where 1 is completely opaque
        and 0 is completely transparent. This property makes densely clustered
        points more visible.

    test_alpha : float, default: 0.75
        Specify a transparency for test data, where 1 is completely opaque
        and 0 is completely transparent. This property makes densely clustered
        points more visible.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    train_score_ : float
        The score of the underlying estimator on the training data (R^2 for
        typical regressors).

    test_score_ : float
        The score of the underlying estimator on the test data (R^2 for
        typical regressors).

    Examples
    --------
    >>> from yellowbrick.regressor import ResidualsPlot
    >>> from sklearn.linear_model import Ridge
    >>> model = ResidualsPlot(Ridge())
    >>> model.fit(X_train, y_train)
    >>> model.score(X_test, y_test)
    >>> model.show()

    Notes
    -----
    ResidualsPlot is a ScoreVisualizer, meaning that it wraps a model and
    its primary entry point is the ``score()`` method.

    The residuals histogram feature requires matplotlib 2.0.2 or greater.
    """

    def __init__(
        self,
        model,
        ax=None,
        hist=True,
        train_color="b",
        test_color="g",
        line_color=LINE_COLOR,
        train_alpha=0.75,
        test_alpha=0.75,
        is_fitted="auto",
        **kwargs
    ):
        # Whether or not to check if the model is already fitted
        self.is_fitted = is_fitted

        # Initialize the visualizer base
        super(ResidualsPlot, self).__init__(model, ax=ax, **kwargs)

        # TODO: allow more scatter plot arguments for train and test points
        # See #475 (RE: ScatterPlotMixin)
        self.colors = {
            "train_point": train_color,
            "test_point": test_color,
            "line": line_color,
        }

        self.hist = hist

        # Set membership hashes its operand, so an unhashable value (list,
        # dict, ndarray, ...) would raise a bare TypeError; treat it as an
        # invalid option so the caller gets the intended error message.
        try:
            hist_is_valid = hist in {True, "density", "frequency", None, False}
        except TypeError:
            hist_is_valid = False

        if not hist_is_valid:
            raise YellowbrickValueError(
                "'{}' is an invalid argument for hist, use None, True, "
                "False, 'density', or 'frequency'".format(hist)
            )

        if self.hist in {True, "density", "frequency"}:
            self.hax  # Touch the histogram axes to test version availability

        # Store labels and colors for the legend ordered by call
        self._labels, self._colors = [], []

        self.alphas = {"train_point": train_alpha, "test_point": test_alpha}

    @memoized
    def hax(self):
        """
        Returns the histogram axes, creating it only on demand.

        The axes is appended to the right of the main axes and shares its
        y-axis.

        Raises
        ------
        YellowbrickValueError
            If ``make_axes_locatable`` could not be imported.
        """
        if make_axes_locatable is None:
            raise YellowbrickValueError(
                (
                    "residuals histogram requires matplotlib 2.0.2 or greater "
                    "please upgrade matplotlib or set hist=False on the visualizer"
                )
            )

        divider = make_axes_locatable(self.ax)

        hax = divider.append_axes("right", size=1, pad=0.1, sharey=self.ax)
        hax.yaxis.tick_right()
        hax.grid(False, axis="x")

        return hax

    def fit(self, X, y, **kwargs):
        """
        Fit the underlying model and draw the residuals of the training data.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target values

        kwargs: keyword arguments passed to Scikit-Learn API.

        Returns
        -------
        self : ResidualsPlot
            The visualizer instance
        """
        # fit the underlying model to the data
        super(ResidualsPlot, self).fit(X, y, **kwargs)

        # Scoring with train=True records train_score_ and draws the points
        self.score(X, y, train=True)
        return self

    def score(self, X, y=None, train=False, **kwargs):
        """
        Score the estimator on the given split and draw its residuals.

        Predictions are generated with the wrapped estimator and the
        residuals are computed as ``y_pred - y``.

        Parameters
        ----------
        X : array-like
            The features of the split to predict on.

        y : array-like
            The actual target values to score against. Although the
            signature defaults this to None, the residual computation
            subtracts ``y`` from the predictions, so it must be provided.

        train : boolean
            If False, `score` assumes that the residual points being plotted
            are from the test data; if True, `score` assumes the residuals
            are the train data.

        Returns
        -------
        score : float
            The score of the underlying estimator, usually the R-squared score
            for regression estimators.
        """
        # Do not call super in order to differentiate train and test scores.
        score = self.estimator.score(X, y, **kwargs)
        if train:
            self.train_score_ = score
        else:
            self.test_score_ = score

        y_pred = self.predict(X)
        residuals = y_pred - y
        self.draw(y_pred, residuals, train=train)

        return score

    def draw(self, y_pred, residuals, train=False, **kwargs):
        """
        Draw the residuals against the predicted value for the specified split.

        It is best to draw the training split first, then the test split so
        that the test split (usually smaller) is above the training split;
        particularly if the histogram is turned on.

        Parameters
        ----------
        y_pred : ndarray or Series of length n
            An array or series of predicted target values

        residuals : ndarray or Series of length n
            An array or series of the difference between the predicted and the
            target values

        train : boolean, default: False
            If False, `draw` assumes that the residual points being plotted
            are from the test data; if True, `draw` assumes the residuals
            are the train data.

        Returns
        -------
        ax : matplotlib Axes
            The axis with the plotted figure
        """
        if train:
            color = self.colors["train_point"]
            label = "Train $R^2 = {:0.3f}$".format(self.train_score_)
            alpha = self.alphas["train_point"]
        else:
            color = self.colors["test_point"]
            label = "Test $R^2 = {:0.3f}$".format(self.test_score_)
            alpha = self.alphas["test_point"]

        # Update the legend information
        self._labels.append(label)
        self._colors.append(color)

        # Draw the residuals scatter plot
        self.ax.scatter(y_pred, residuals, c=color, alpha=alpha, label=label)

        # Add residuals histogram
        if self.hist in {True, "frequency"}:
            self.hax.hist(residuals, bins=50, orientation="horizontal", color=color)
        elif self.hist == "density":
            self.hax.hist(
                residuals, bins=50, orientation="horizontal", density=True, color=color
            )

        # Ensure the current axes is always the main residuals axes
        plt.sca(self.ax)
        return self.ax

    def finalize(self, **kwargs):
        """
        Prepares the plot for rendering by adding a title, legend, and axis
        labels. Also draws a line at the zero residuals to show the baseline.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """
        # Add the title to the plot
        self.set_title("Residuals for {} Model".format(self.name))

        # Set the legend with full opacity patches using manual legend
        manual_legend(self, self._labels, self._colors, loc="best", frameon=True)

        # Create a full line across the figure at zero error.
        self.ax.axhline(y=0, c=self.colors["line"])

        # Set the axes labels
        self.ax.set_ylabel("Residuals")
        self.ax.set_xlabel("Predicted Value")

        # Finalize the histogram axes (every enabled hist option is truthy)
        if self.hist:
            self.hax.axhline(y=0, c=self.colors["line"])
            self.hax.set_xlabel("Distribution")
##########################################################################
## Quick Method
##########################################################################
def residuals_plot(
    model,
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    ax=None,
    hist=True,
    train_color="b",
    test_color="g",
    line_color=LINE_COLOR,
    train_alpha=0.75,
    test_alpha=0.75,
    is_fitted="auto",
    show=True,
    **kwargs
):
    """ResidualsPlot quick method:

    A residual plot shows the residuals (predicted minus actual value) on
    the vertical axis and the predicted value on the horizontal axis.

    If the points are randomly dispersed around the horizontal axis, a linear
    regression model is appropriate for the data; otherwise, a non-linear
    model is more appropriate.

    Parameters
    ----------
    model : a Scikit-Learn regressor
        Should be an instance of a regressor, otherwise will raise a
        YellowbrickTypeError exception on instantiation.
        If the estimator is not fitted, it is fit when the visualizer is
        fitted, unless otherwise specified by ``is_fitted``.

    X_train : ndarray or DataFrame of shape n x m
        A feature array of n instances with m features the model is trained
        on. Used to fit the visualizer and also to score the visualizer if
        test splits are not specified.

    y_train : ndarray or Series of length n
        An array or series of target values. Used to fit the visualizer and
        also to score the visualizer if test splits are not specified.

    X_test : ndarray or DataFrame of shape n x m, default: None
        An optional feature array of n instances with m features that the
        model is scored on if specified.

    y_test : ndarray or Series of length n, default: None
        An optional array or series of target values that serve as actual
        labels for X_test for scoring purposes.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    hist : {True, False, None, 'density', 'frequency'}, default: True
        Draw a histogram showing the distribution of the residuals on the
        right side of the figure. Requires Matplotlib >= 2.0.2.
        If set to 'density', the probability density function will be plotted.
        If set to True or 'frequency' then the frequency will be plotted.

    train_color : color, default: 'b'
        Residuals for training data are plotted with this color. Can be any
        matplotlib color. Opacity is controlled by ``train_alpha``.

    test_color : color, default: 'g'
        Residuals for test data are plotted with this color. Can be any
        matplotlib color. Opacity is controlled by ``test_alpha``.

    line_color : color, default: ``LINE_COLOR``
        Defines the color of the zero error line, can be any matplotlib color.

    train_alpha : float, default: 0.75
        Specify a transparency for training data, where 1 is completely opaque
        and 0 is completely transparent. This property makes densely clustered
        points more visible.

    test_alpha : float, default: 0.75
        Specify a transparency for test data, where 1 is completely opaque
        and 0 is completely transparent. This property makes densely clustered
        points more visible.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    show : bool, default: True
        If True, calls ``show()`` on the visualizer; if False, only calls
        ``finalize()``.

    kwargs : dict
        Keyword arguments that are passed to the visualizer and may
        influence the visualization as defined in other Visualizers.

    Returns
    -------
    viz : ResidualsPlot
        Returns the fitted ResidualsPlot that created the figure.

    Raises
    ------
    YellowbrickValueError
        If only one of ``X_test`` and ``y_test`` is specified.
    """
    # Fail fast: reject a half-specified test split before any axes are
    # created or the model is fitted.
    if (X_test is None) != (y_test is None):
        raise YellowbrickValueError(
            "both X_test and y_test are required if one is specified"
        )

    # Instantiate the visualizer
    viz = ResidualsPlot(
        model=model,
        ax=ax,
        hist=hist,
        train_color=train_color,
        test_color=test_color,
        line_color=line_color,
        train_alpha=train_alpha,
        test_alpha=test_alpha,
        is_fitted=is_fitted,
        **kwargs
    )

    # Fit the visualizer (this also draws the training residuals)
    viz.fit(X_train, y_train)

    # Score on the test split when provided, otherwise on the training data
    if X_test is not None:
        viz.score(X_test, y_test)
    else:
        viz.score(X_train, y_train)

    # Draw the final visualization
    if show:
        viz.show()
    else:
        viz.finalize()

    # Return the visualizer
    return viz
```