# yellowbrick.features.jointplot
# Implementations of joint plots for univariate and bivariate analysis.
#
# Author: Prema Damodaran Roman
# Created: Mon Apr 10 21:00:54 2017 -0400
#
# Copyright (C) 2017 District Data Labs
# For license information, see LICENSE.txt
#
# ID: jointplot.py [7f47800] pdamodaran@users.noreply.github.com $
##########################################################################
## Imports
##########################################################################
import warnings
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from yellowbrick.features.base import FeatureVisualizer
from yellowbrick.exceptions import YellowbrickValueError
from yellowbrick.bestfit import draw_best_fit
from yellowbrick.utils import is_dataframe
##########################################################################
## Joint Plot Visualizer
##########################################################################
class JointPlotVisualizer(FeatureVisualizer):
    """
    JointPlotVisualizer allows for a simultaneous visualization of the
    relationship between two variables and the distribution of each
    individual variable. The relationship is plotted along the joint axis and
    univariate distributions are plotted on top of the x axis and to the right
    of the y axis.

    Parameters
    ----------
    ax: matplotlib Axes, default: None
        Passed through to the FeatureVisualizer constructor. The joint plot
        itself creates three axes of its own on the current figure (see
        ``draw``).

    feature: string, default: None
        The name of the X variable, used as the x label and in the title.
        If feature is None when ``fit`` is called, the name of the single
        DataFrame column is used when X is a DataFrame, otherwise ``'x'``.

    target: string, default: None
        The name of the Y variable, used as the y label and in the title.
        If target is None when ``fit`` is called, the ``name`` attribute of
        y is used when it is set (e.g. a named pandas Series), otherwise
        ``'y'``.

    joint_plot: one of {'scatter', 'hex'}, default: 'scatter'
        The type of plot to render in the joint axis.
        Use scatter for small datasets and hex for large datasets.

    joint_args: dict, default: None
        Keyword arguments used for customizing the joint plot. The keys
        below are consumed by the visualizer; anything else is forwarded
        to ``Axes.scatter`` or ``Axes.hexbin``. The dict is never modified.

        ============= =====================================================
        Property      Description
        ============= =====================================================
        alpha         transparency (default 0.4)
        facecolor     background color of the joint axis
                      (default '#dddddd')
        aspect        aspect ratio of the joint axis, scatter only
                      (default 'auto')
        fit           scatter only: draw a best fit line using
                      ``yellowbrick.bestfit`` - True or False
                      (default True)
        estimator     scatter only: the type of best fit line to use,
                      passed to ``draw_best_fit`` (default 'linear')
        x_bins        hex only: number of bins for the x value (default 50)
        y_bins        hex only: number of bins for the y value (default 50);
                      the hexbin gridsize is the integer mean of x_bins
                      and y_bins
        cmap          string or matplotlib colormap; hex default 'Blues'
        ============= =====================================================

    xy_plot: one of {'hist'}, default: 'hist'
        The type of plot to render along the x and y axes.
        Currently, the only choice is hist.

    xy_args: dict, default: None
        Keyword arguments used for customizing the x and y plots. The keys
        below are consumed by the visualizer; anything else is forwarded
        to both ``Axes.hist`` calls. The dict is never modified.

        ============== ====================================================
        Property       Description
        ============== ====================================================
        alpha          transparency (default 0.4)
        facecolor_x    background color of the x axis (default '#dddddd')
        facecolor_y    background color of the y axis (default '#dddddd')
        bins           number of bins for the hist plots (default 50)
        histcolor_x    color of the histogram on the x axis
                       (default '#6897bb')
        histcolor_y    color of the histogram on the y axis
                       (default '#6897bb')
        ============== ====================================================

    size: float, default: 600
        Size of each side of the figure in pixels. Stored on the instance
        as the tuple ``(size, size)``.
        NOTE(review): ``draw`` does not currently apply this to the figure.

    ratio: float, default: 5
        Ratio of joint axis size to the x and y axes height. The layout
        grid has ``ratio + 1`` rows and columns.

    space: float, default: 0.2
        Space between the joint axis and the x and y axes.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Examples
    --------

    >>> visualizer = JointPlotVisualizer()
    >>> visualizer.fit(X,y)
    >>> visualizer.poof()

    Notes
    -----
    These parameters can be influenced later on in the visualization
    process, but can and should be set as early as possible.
    """

    def __init__(self, ax=None, feature=None, target=None,
                 joint_plot='scatter', joint_args=None,
                 xy_plot='hist', xy_args=None,
                 size=600, ratio=5, space=.2, **kwargs):

        # Check matplotlib version - needs to be version 2.0.0 or greater.
        # Only a warning is issued; construction continues regardless.
        mpl_vers_maj = int(mpl.__version__.split(".")[0])
        if mpl_vers_maj < 2:
            warnings.warn((
                "{} requires matplotlib major version 2 or greater. "
                "Please upgrade."
            ).format(self.__class__.__name__))

        super(JointPlotVisualizer, self).__init__(ax, **kwargs)

        self.feature = feature
        self.target = target
        self.joint_plot = joint_plot
        self.joint_args = joint_args
        self.xy_plot = xy_plot
        self.xy_args = xy_args
        self.size = (size, size)
        self.ratio = ratio
        self.space = space

    def fit(self, X, y, **kwargs):
        """
        Validates X and y, resolves the axis names and then calls ``draw``.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x 1
            A matrix of n instances with 1 feature

        y : ndarray or Series of length n
            An array or series of the target value

        kwargs: dict
            Keyword arguments forwarded to ``draw``.

        Returns
        -------
        self : JointPlotVisualizer
            The visualizer, to allow call chaining.

        Raises
        ------
        YellowbrickValueError
            If X is a DataFrame with more than one column, or if y is None.
        """
        # Throw an error if X has more than 1 column
        if is_dataframe(X):
            nrows, ncols = X.shape
            if ncols > 1:
                raise YellowbrickValueError((
                    "X needs to be an ndarray or DataFrame with one feature, "
                    "please select one feature from the DataFrame"
                ))

        # Throw an error if y is None
        if y is None:
            raise YellowbrickValueError((
                "Joint plots are useful for classification and regression "
                "problems, which require a target variable"
            ))

        # Handle the feature name if it is None. The name is stored as a
        # plain string so that the axis label and title read "x" rather
        # than the repr of a list or pandas Index.
        if self.feature is None:
            if is_dataframe(X) and len(X.columns) == 1:
                self.feature = str(X.columns[0])
            else:
                self.feature = 'x'

        # Handle the target name if it is None. A named pandas Series
        # carries its label in ``.name``; anything else falls back to 'y'.
        if self.target is None:
            y_name = getattr(y, "name", None)
            self.target = 'y' if y_name is None else str(y_name)

        self.draw(X, y, **kwargs)
        return self

    def draw(self, X, y, **kwargs):
        """
        Sets up the layout for the joint plot, then calls ``draw_joint`` and
        ``draw_xy`` to render the visualizations.

        The three axes are created on the current pyplot figure and stored
        on the instance as ``joint_ax``, ``x_ax`` and ``y_ax`` (the figure
        itself as ``fig``).
        """
        fig = plt.gcf()
        gs = plt.GridSpec(self.ratio + 1, self.ratio + 1)

        # Set up the 3 axes objects: the joint axis takes everything except
        # the first row and last column, which hold the marginal plots.
        joint_ax = fig.add_subplot(gs[1:, :-1])
        x_ax = fig.add_subplot(gs[0, :-1], sharex=joint_ax)
        y_ax = fig.add_subplot(gs[1:, -1], sharey=joint_ax)

        fig.tight_layout()
        fig.subplots_adjust(hspace=self.space, wspace=self.space)

        self.fig = fig
        self.joint_ax = joint_ax
        self.x_ax = x_ax
        self.y_ax = y_ax

        self.draw_joint(X, y, **kwargs)
        self.draw_xy(X, y, **kwargs)

    def draw_joint(self, X, y, **kwargs):
        """
        Draws the visualization for the joint axis.

        Renders a scatter plot (optionally with a best fit line) or a hexbin
        plot on ``self.joint_ax`` depending on ``self.joint_plot``. Any other
        value of ``joint_plot`` only sets the background color.
        """
        # Work on a copy so that neither the caller's dict nor
        # self.joint_args is altered; repeated fit calls then see the same
        # configuration every time.
        joint_args = dict(self.joint_args or {})
        joint_args.setdefault("alpha", 0.4)

        # Extract every visualizer-specific option *before* plotting so that
        # none of them is forwarded to matplotlib as an unknown keyword.
        facecolor = joint_args.pop("facecolor", "#dddddd")
        aspect = joint_args.pop("aspect", "auto")
        fit = joint_args.pop("fit", True)
        estimator = joint_args.pop("estimator", "linear")
        x_bins = joint_args.pop("x_bins", 50)
        y_bins = joint_args.pop("y_bins", 50)

        self.joint_ax.set_facecolor(facecolor)

        if self.joint_plot == "scatter":
            self.joint_ax.set_aspect(aspect)
            self.joint_ax.scatter(X, y, **joint_args)

            if fit:
                draw_best_fit(X, y, self.joint_ax, estimator)

        elif self.joint_plot == "hex":
            colormap = joint_args.pop("cmap", 'Blues')

            # hexbin takes a single grid size, so average the two bin counts.
            gridsize = int(np.mean([x_bins, y_bins]))

            xmin = X.min()
            xmax = X.max()
            ymin = y.min()
            ymax = y.max()

            self.joint_ax.hexbin(
                X, y,
                gridsize=gridsize, cmap=colormap, mincnt=1, **joint_args
            )
            # Clamp the view to the data extent.
            self.joint_ax.axis([xmin, xmax, ymin, ymax])

    def draw_xy(self, X, y, **kwargs):
        """
        Draws the visualization for the x and y axes.

        Renders a histogram of X on ``self.x_ax`` and a horizontal histogram
        of y on ``self.y_ax`` when ``self.xy_plot`` is ``'hist'``.
        """
        # Work on a copy so that neither the caller's dict nor self.xy_args
        # is altered by extracting the visualizer-specific options.
        xy_args = dict(self.xy_args or {})

        facecolor_x = xy_args.pop("facecolor_x", "#dddddd")
        facecolor_y = xy_args.pop("facecolor_y", "#dddddd")
        self.x_ax.set_facecolor(facecolor_x)
        self.y_ax.set_facecolor(facecolor_y)

        if self.xy_plot == "hist":
            hist_bins = xy_args.pop("bins", 50)
            xy_args.setdefault("alpha", 0.4)
            histcolor_x = xy_args.pop("histcolor_x", "#6897bb")
            histcolor_y = xy_args.pop("histcolor_y", "#6897bb")

            self.x_ax.hist(X, bins=hist_bins, color=histcolor_x, **xy_args)
            self.y_ax.hist(y, bins=hist_bins, color=histcolor_y,
                           orientation='horizontal', **xy_args)

    def poof(self, **kwargs):
        """
        Creates the labels for the feature and target variables on the joint
        axis and then calls ``finalize``.

        Must be called after ``fit``, which creates the axes.
        """
        self.joint_ax.set_xlabel(self.feature)
        self.joint_ax.set_ylabel(self.target)
        self.finalize(**kwargs)

    def finalize(self, **kwargs):
        """
        Finalize executes any subclass-specific axes finalization steps.
        The user calls poof and poof calls finalize.

        Hides the tick labels and count-axis tick lines of the two marginal
        axes, turns off their count-axis grids and sets the figure title.

        Parameters
        ----------
        kwargs: generic keyword arguments.
        """
        # The marginal axes share their data axis with the joint axis, so
        # the tick labels there would only duplicate the joint axis labels.
        plt.setp(self.x_ax.get_xticklabels(), visible=False)
        plt.setp(self.y_ax.get_yticklabels(), visible=False)

        # Hide the count axes of the histograms entirely.
        plt.setp(self.x_ax.yaxis.get_majorticklines(), visible=False)
        plt.setp(self.x_ax.yaxis.get_minorticklines(), visible=False)
        plt.setp(self.y_ax.xaxis.get_majorticklines(), visible=False)
        plt.setp(self.y_ax.xaxis.get_minorticklines(), visible=False)
        plt.setp(self.x_ax.get_yticklabels(), visible=False)
        plt.setp(self.y_ax.get_xticklabels(), visible=False)
        self.x_ax.yaxis.grid(False)
        self.y_ax.xaxis.grid(False)

        # y=1.05 places the title just above the top of the figure area.
        self.fig.suptitle("Joint Plot of {} vs {}"
                          .format(self.feature, self.target), y=1.05)