Source code for yellowbrick.target.feature_correlation

# yellowbrick.target.feature_correlation
# Feature correlation to dependent variable visualizer.
#
# Author    Zijie (ZJ) Poh <[email protected]>
# Created:  Wed Jul 29 15:30:40 2018 -0700
#
# ID: feature_correlation.py [] [email protected] $

"""
Feature Correlation to Dependent Variable Visualizer.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from yellowbrick.target.base import TargetVisualizer
from yellowbrick.utils import is_dataframe
from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression

from scipy.stats import pearsonr

##########################################################################
## Supported Correlation Computations
##########################################################################

# Display names for each supported correlation method. The keys are the
# values accepted for ``method`` (FeatureCorrelation.__init__ rejects any
# method that is not a key here) and the values are used as the x-axis
# label in FeatureCorrelation.finalize.
CORRELATION_LABELS = {
    'pearson': 'Pearson Correlation',
    'mutual_info-regression': 'Mutual Information',
    'mutual_info-classification': 'Mutual Information'
}

# Scoring functions looked up by method name in FeatureCorrelation.fit and
# called as ``func(X, y, **kwargs)``. 'pearson' has no entry because fit
# computes it column by column with scipy.stats.pearsonr instead of going
# through this table.
CORRELATION_METHODS = {
    'mutual_info-regression': mutual_info_regression,
    'mutual_info-classification': mutual_info_classif
}

##########################################################################
## Class Feature Correlation
##########################################################################

class FeatureCorrelation(TargetVisualizer):
    """
    Displays the correlation between features and dependent variables.

    This visualizer can be used side-by-side with
    ``yellowbrick.features.JointPlotVisualizer`` that plots a feature
    against the target and shows the distribution of each via a
    histogram on each axis.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current
        axes will be used (or generated if required).

    method : str, default: 'pearson'
        The method to calculate correlation between features and target.
        Options include:

            - 'pearson', which uses ``scipy.stats.pearsonr``
            - 'mutual_info-regression', which uses ``mutual_info_regression``
              from ``sklearn.feature_selection``
            - 'mutual_info-classification', which uses ``mutual_info_classif``
              from ``sklearn.feature_selection``

    labels : list, default: None
        A list of feature names to use. If a DataFrame is passed to fit and
        labels is None, feature names are selected as the column names.

    sort : boolean, default: False
        If false, the features are not sorted in the plot; otherwise
        features are sorted in ascending order of correlation.

    feature_index : list, default: None
        A list of feature index to include in the plot.

    feature_names : list of feature names, default: None
        A list of feature names to include in the plot.
        Must have labels or the fitted data is a DataFrame with column names.
        Specify either feature_index or feature_names, not both: fit raises
        a YellowbrickWarning when both are given.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    features_ : np.array
        The feature labels

    scores_ : np.array
        Correlation between features and dependent variable.

    feature_index_ : list or None
        Column positions selected for plotting during the last call to fit,
        resolved from feature_index or feature_names. None (or empty) when
        every feature is plotted.

    Examples
    --------

    >>> viz = FeatureCorrelation()
    >>> viz.fit(X, y)
    >>> viz.poof()
    """

    def __init__(self, ax=None, method='pearson',
                 labels=None, sort=False, feature_index=None,
                 feature_names=None, **kwargs):
        # Forward the caller's axes; hard-coding ``ax=None`` here would
        # silently discard a user supplied axes object.
        super(FeatureCorrelation, self).__init__(ax=ax, **kwargs)

        self.correlation_labels = CORRELATION_LABELS
        self.correlation_methods = CORRELATION_METHODS

        # The label table doubles as the registry of supported methods.
        if method not in self.correlation_labels:
            raise YellowbrickValueError(
                'Method {} not implement; choose from {}'.format(
                    method, ", ".join(self.correlation_labels)
                )
            )

        # Parameters
        self.set_params(
            method=method,
            labels=labels,
            sort=sort,
            feature_index=feature_index,
            feature_names=feature_names
        )

    def fit(self, X, y, **kwargs):
        """
        Fits the estimator to calculate feature correlation to
        dependent variable.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments passed on to the scoring function
            (``scipy.stats.pearsonr`` for 'pearson', otherwise the function
            registered for the method in ``correlation_methods``).

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.

        Raises
        ------
        YellowbrickWarning
            If both feature_index and feature_names are specified.

        YellowbrickValueError
            If a feature index is out of range or a feature name is not
            among the labels.
        """
        self._create_labels_for_features(X)

        # Resolve the selection into a fitted attribute rather than
        # overwriting the ``feature_index`` parameter, so that the
        # visualizer can be fitted more than once with ``feature_names``.
        self.feature_index_ = self._select_features_to_plot(X)

        # Calculate Features correlation with target variable
        if self.method == "pearson":
            # pearsonr returns (coefficient, p-value); keep the coefficient
            # for every column of X.
            self.scores_ = np.array(
                [pearsonr(x, y, **kwargs)[0] for x in np.asarray(X).T]
            )
        else:
            self.scores_ = np.array(
                self.correlation_methods[self.method](X, y, **kwargs)
            )

        # If feature indices are given, plot only the given features
        if self.feature_index_:
            self.scores_ = self.scores_[self.feature_index_]
            self.features_ = self.features_[self.feature_index_]

        # Sort features by correlation
        if self.sort:
            sort_idx = np.argsort(self.scores_)
            self.scores_ = self.scores_[sort_idx]
            self.features_ = self.features_[sort_idx]

        self.draw()
        return self

    def draw(self):
        """
        Draws the feature correlation to dependent variable, called from fit.

        Returns
        -------
        ax : matplotlib Axes
            The axes the horizontal bars were drawn on.
        """
        # Offset by 0.5 so bars and their tick labels sit at 0.5, 1.5, ...
        pos = np.arange(self.scores_.shape[0]) + 0.5

        self.ax.barh(pos, self.scores_)

        # Set the labels for the bars
        self.ax.set_yticks(pos)
        self.ax.set_yticklabels(self.features_)

        return self.ax

    def finalize(self):
        """
        Finalize the drawing setting labels and title.
        """
        self.set_title('Features correlation with dependent variable')

        self.ax.set_xlabel(self.correlation_labels[self.method])

        self.ax.grid(False, axis='y')

    def _create_labels_for_features(self, X):
        """
        Create labels for the features and store them in ``features_``.

        Explicit ``labels`` take precedence; otherwise DataFrame column
        names are used, falling back to the integer column positions.

        NOTE: this code is duplicated from MultiFeatureVisualizer
        """
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)
            # Otherwise use the column index as the labels
            else:
                _, ncols = X.shape
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

    def _select_features_to_plot(self, X):
        """
        Select features to plot.

        feature_index is always used as the filter and if feature_names
        is supplied, the index is computed from those names by looking
        them up in ``features_``. The constructor parameters are left
        unmodified.

        Returns
        -------
        index : list or None
            Column positions to plot, or None when neither feature_index
            nor feature_names is specified.

        Raises
        ------
        YellowbrickWarning
            If both feature_index and feature_names are specified.

        YellowbrickValueError
            If an index is outside ``[0, X.shape[1])`` or a name is not
            found in ``features_``.
        """
        if self.feature_index:
            if self.feature_names:
                raise YellowbrickWarning(
                    'Both feature_index and feature_names '
                    'are specified. feature_names is ignored'
                )
            if (min(self.feature_index) < 0
                    or max(self.feature_index) >= X.shape[1]):
                raise YellowbrickValueError('Feature index is out of range')
            return self.feature_index

        if self.feature_names:
            features_list = self.features_.tolist()
            selected = []
            for feature_name in self.feature_names:
                try:
                    selected.append(features_list.index(feature_name))
                except ValueError:
                    raise YellowbrickValueError(
                        '{} not in labels'.format(feature_name)
                    )
            return selected

        return None


##########################################################################
## Quick Method
##########################################################################

def feature_correlation(X, y, ax=None, method='pearson',
                        labels=None, sort=False, feature_index=None,
                        feature_names=None, **kwargs):
    """
    Quick method: build a FeatureCorrelation visualizer, fit it on the
    data, finalize the figure and hand back the axes.

    The plot shows the correlation between each feature and the dependent
    variable. It can be used side-by-side with
    ``yellowbrick.features.JointPlotVisualizer`` that plots a feature
    against the target and shows the distribution of each via a
    histogram on each axis.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current
        axes will be used (or generated if required).

    method : str, default: 'pearson'
        The method to calculate correlation between features and target.
        Options include:

            - 'pearson', which uses ``scipy.stats.pearsonr``
            - 'mutual_info-regression', which uses ``mutual_info_regression``
              from ``sklearn.feature_selection``
            - 'mutual_info-classification', which uses ``mutual_info_classif``
              from ``sklearn.feature_selection``

    labels : list, default: None
        A list of feature names to use. If a DataFrame is passed to fit and
        labels is None, feature names are selected as the column names.

    sort : boolean, default: False
        If false, the features are not sorted in the plot; otherwise
        features are sorted in ascending order of correlation.

    feature_index : list, default: None
        A list of feature index to include in the plot.

    feature_names : list of feature names, default: None
        A list of feature names to include in the plot.
        Must have labels or the fitted data is a DataFrame with column names.
        Specify either feature_index or feature_names, not both.

    kwargs : dict
        Keyword arguments forwarded both to the FeatureCorrelation
        constructor and to its ``fit`` method.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the feature correlation was drawn on.
    """
    # NOTE(review): the same kwargs go to the constructor and to fit (which
    # hands them to the scoring function); a visualizer-only option such as
    # a title would presumably be rejected by the scorer -- confirm intent.
    visualizer = FeatureCorrelation(
        ax=ax,
        method=method,
        labels=labels,
        sort=sort,
        feature_index=feature_index,
        feature_names=feature_names,
        **kwargs
    )

    # Fitting computes the scores and draws the bars
    visualizer.fit(X, y, **kwargs)

    # Apply the title and axis label
    visualizer.finalize()

    # Hand back the axes object held by the visualizer
    return visualizer.ax