# yellowbrick.classifier.feature_correlation
# Feature correlation to dependent variable visualizer.
#
# Author:  Zijie (ZJ) Poh
# Created: Wed Jul 29 15:30:40 2018 -0700
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: feature_correlation.py [33aec16] 8103276+zjpoh@users.noreply.github.com $
"""
Feature Correlation to Dependent Variable Visualizer.
"""
##########################################################################
# Imports
##########################################################################
import numpy as np
from yellowbrick.utils import is_dataframe
from yellowbrick.target.base import TargetVisualizer
from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr
##########################################################################
# Supported Correlation Computations
##########################################################################
# Axis label displayed for each supported ``method`` value. The keys of this
# mapping are also what FeatureCorrelation validates ``method`` against, and
# their order is the order in which the choices are listed in its error
# message.
CORRELATION_LABELS = {
    "pearson": "Pearson Correlation",
    "mutual_info-regression": "Mutual Information",
    "mutual_info-classification": "Mutual Information",
}

# Scoring callables for the scikit-learn backed methods. "pearson" has no
# entry here because FeatureCorrelation.fit computes it column by column
# with ``scipy.stats.pearsonr``.
CORRELATION_METHODS = {
    "mutual_info-regression": mutual_info_regression,
    "mutual_info-classification": mutual_info_classif,
}
##########################################################################
# Class Feature Correlation
##########################################################################
class FeatureCorrelation(TargetVisualizer):
    """
    Displays the correlation between features and dependent variables.

    This visualizer can be used side-by-side with
    ``yellowbrick.features.JointPlotVisualizer`` that plots a feature
    against the target and shows the distribution of each via a
    histogram on each axis.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    method : str, default: 'pearson'
        The method to calculate correlation between features and target.
        Options include:

            - 'pearson', which uses ``scipy.stats.pearsonr``
            - 'mutual_info-regression', which uses ``mutual_info_regression``
              from ``sklearn.feature_selection``
            - 'mutual_info-classification', which uses ``mutual_info_classif``
              from ``sklearn.feature_selection``

    labels : list, default: None
        A list of feature names to use. If a DataFrame is passed to fit and
        labels is None, feature names are selected as the column names.

    sort : boolean, default: False
        If false, the features are not sorted in the plot; otherwise
        features are sorted in ascending order of correlation.

    feature_index : list, default: None
        A list of feature indices to include in the plot. Every index must
        lie in ``[0, n_features)``.

    feature_names : list of feature names, default: None
        A list of feature names to include in the plot.
        Must have labels or the fitted data is a DataFrame with column names.
        Specifying both ``feature_index`` and ``feature_names`` makes ``fit``
        raise ``YellowbrickWarning``.

    color : string
        Specify color for barchart

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    features_ : np.array
        The feature labels

    scores_ : np.array
        Correlation between features and dependent variable.

    Examples
    --------
    >>> viz = FeatureCorrelation()
    >>> viz.fit(X, y)
    >>> viz.show()
    """

    def __init__(
        self,
        ax=None,
        method="pearson",
        labels=None,
        sort=False,
        feature_index=None,
        feature_names=None,
        color=None,
        **kwargs
    ):
        super(FeatureCorrelation, self).__init__(ax, **kwargs)

        self.correlation_labels = CORRELATION_LABELS
        self.correlation_methods = CORRELATION_METHODS

        # The label table lists every supported method (including pearson),
        # so it is the one validated against.
        if method not in self.correlation_labels:
            raise YellowbrickValueError(
                "Method {} not implement; choose from {}".format(
                    method, ", ".join(self.correlation_labels)
                )
            )

        # Parameters
        self.sort = sort
        self.color = color
        self.method = method
        self.labels = labels
        self.feature_index = feature_index
        self.feature_names = feature_names

    def fit(self, X, y, **kwargs):
        """
        Fits the estimator to calculate feature correlation to
        dependent variable.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments forwarded to the correlation function selected
            by ``method`` (``pearsonr`` or the scikit-learn scorer).

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.

        Raises
        ------
        YellowbrickValueError
            If a feature index is out of range or a feature name is not one
            of the labels.

        YellowbrickWarning
            If both ``feature_index`` and ``feature_names`` are specified.
        """
        self._create_labels_for_features(X)

        # Resolve the selection into a local value instead of writing it back
        # to ``self.feature_index``: overwriting the constructor parameter
        # made a second call to fit() see both feature_index and
        # feature_names set and fail.
        selected = self._select_features_to_plot(X)

        # Calculate Features correlation with target variable
        if self.method == "pearson":
            # pearsonr works on one column at a time; keep only the
            # coefficient (first element of its result).
            self.scores_ = np.array(
                [pearsonr(x, y, **kwargs)[0] for x in np.asarray(X).T]
            )
        else:
            self.scores_ = np.array(
                self.correlation_methods[self.method](X, y, **kwargs)
            )

        # If a selection was requested, plot only the selected features
        if selected is not None:
            self.scores_ = self.scores_[selected]
            self.features_ = self.features_[selected]

        # Sort features by correlation
        if self.sort:
            sort_idx = np.argsort(self.scores_)
            self.scores_ = self.scores_[sort_idx]
            self.features_ = self.features_[sort_idx]

        self.draw()
        return self

    def draw(self):
        """
        Draws the feature correlation to dependent variable, called from fit.

        Returns
        -------
        ax : matplotlib Axes
            The axes the horizontal bar chart was drawn on.
        """
        # One bar per score, centred at 0.5, 1.5, 2.5, ...
        pos = np.arange(self.scores_.shape[0]) + 0.5

        self.ax.barh(pos, self.scores_, color=self.color)

        # Set the labels for the bars
        self.ax.set_yticks(pos)
        self.ax.set_yticklabels(self.features_)

        return self.ax

    def finalize(self):
        """
        Finalize the drawing setting labels and title.
        """
        self.set_title("Features correlation with dependent variable")

        self.ax.set_xlabel(self.correlation_labels[self.method])

        self.ax.grid(False, axis="y")

    def _create_labels_for_features(self, X):
        """
        Create labels for the features and store them in ``features_``.

        NOTE: this code is duplicated from MultiFeatureVisualizer
        """
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)
            # Otherwise use the column index as the labels
            else:
                # np.shape also handles inputs without a ``shape`` attribute,
                # such as nested lists.
                _, ncols = np.shape(X)
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

    @staticmethod
    def _has_selection(value):
        """
        Return True when ``value`` is a non-empty selection.

        An explicit length check is used instead of truthiness so that numpy
        arrays and pandas indexes are accepted; None and empty sequences
        both mean "nothing selected".
        """
        return value is not None and len(value) > 0

    def _select_features_to_plot(self, X):
        """
        Work out which feature positions should be plotted.

        ``feature_index`` is used directly after a range check; otherwise
        ``feature_names`` is translated into positions within ``features_``.
        The constructor parameters are never modified.

        Returns
        -------
        selected : list of int or None
            Positions of the features to plot, or None when neither
            ``feature_index`` nor ``feature_names`` was specified.
        """
        has_index = self._has_selection(self.feature_index)
        has_names = self._has_selection(self.feature_names)

        if has_index:
            if has_names:
                # NOTE(review): the message says feature_names is ignored, but
                # this is raised rather than issued via ``warnings.warn``, so
                # fit aborts. Kept unchanged because callers may catch it.
                raise YellowbrickWarning(
                    "Both feature_index and feature_names "
                    "are specified. feature_names is ignored"
                )

            ncols = np.shape(X)[1]
            if min(self.feature_index) < 0 or max(self.feature_index) >= ncols:
                raise YellowbrickValueError("Feature index is out of range")

            # A list (not a tuple) so numpy treats it as a set of positions
            # rather than as a multi-dimensional index.
            return list(self.feature_index)

        if has_names:
            features_list = self.features_.tolist()
            selected = []
            for feature_name in self.feature_names:
                try:
                    selected.append(features_list.index(feature_name))
                except ValueError:
                    raise YellowbrickValueError("{} not in labels".format(feature_name))
            return selected

        return None
##########################################################################
# Quick Method
##########################################################################
def feature_correlation(
    X,
    y,
    ax=None,
    method="pearson",
    labels=None,
    sort=False,
    feature_index=None,
    feature_names=None,
    color=None,
    show=True,
    **kwargs
):
    """
    Quick method: build a FeatureCorrelation visualizer, fit it on ``X`` and
    ``y`` and either show or finalize the resulting bar chart.

    The chart displays the correlation between each feature and the
    dependent variable. It can be used side-by-side with
    ``yellowbrick.features.JointPlotVisualizer``, which plots a feature
    against the target and shows the distribution of each via a histogram
    on each axis.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    method : str, default: 'pearson'
        The method to calculate correlation between features and target.
        Options include:

            - 'pearson', which uses ``scipy.stats.pearsonr``
            - 'mutual_info-regression', which uses ``mutual_info_regression``
              from ``sklearn.feature_selection``
            - 'mutual_info-classification', which uses ``mutual_info_classif``
              from ``sklearn.feature_selection``

    labels : list, default: None
        A list of feature names to use. If a DataFrame is passed and labels
        is None, feature names are selected as the column names.

    sort : boolean, default: False
        If false, the features are not sorted in the plot; otherwise
        features are sorted in ascending order of correlation.

    feature_index : list, default: None
        A list of feature indices to include in the plot.

    feature_names : list of feature names, default: None
        A list of feature names to include in the plot.
        Must have labels or the fitted data is a DataFrame with column names.

    color : string
        Specify color for barchart

    show : bool, default: True
        If True, calls ``show()`` on the visualizer after fitting. If False,
        only ``finalize()`` is called.

    kwargs : dict
        Additional keyword arguments. They are forwarded both to the
        FeatureCorrelation constructor and to its ``fit`` method.

    Returns
    -------
    visualizer : FeatureCorrelation
        Returns the fitted visualizer.
    """
    # Build the visualizer from the explicit options plus any extras
    viz = FeatureCorrelation(
        ax=ax,
        method=method,
        labels=labels,
        sort=sort,
        feature_index=feature_index,
        feature_names=feature_names,
        color=color,
        **kwargs
    )

    # Fitting computes the scores and draws the bars
    viz.fit(X, y, **kwargs)

    # Either display the figure or just apply the title and axis labels
    render = viz.show if show else viz.finalize
    render()

    return viz