Source code for yellowbrick.contrib.missing.dispersion

# yellowbrick.contrib.missing.dispersion
# Missing Values Dispersion Visualizer
#
# Author:  Nathan Danielsen
# Created: Fri Mar 29 5:17:36 2018 -0500
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: dispersion.py [1443e16] ndanielsen@users.noreply.github.com $

"""
Dispersion visualizer for locations of missing values by column against index position.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from yellowbrick.style.palettes import color_palette
from .base import MissingDataVisualizer


##########################################################################
## MissingValues Visualizer
##########################################################################


class MissingValuesDispersion(MissingDataVisualizer):
    """
    The Missing Values Dispersion visualizer shows the locations of missing (nan)
    values in the feature dataset by the order of the index.

    When y targets are supplied to fit, the output dispersion plot is color
    coded according to the target y that the element refers to.

    Parameters
    ----------
    alpha : float, default: 0.5
        A value for blending elements with the background.

    marker : matplotlib marker, default: |
        The marker used for each element coordinate in the plot

    classes : list, default: None
        A list of class names for the legend.
        If classes is None and a y value is passed to fit then the classes
        are selected from the target vector.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    features_ : np.array
        The feature labels; their count determines the y tick positions
        set in ``finalize``.

    classes_ : np.array or None
        The class names given as ``classes`` converted to an array, or None
        when no class names were supplied.

    Examples
    --------

    >>> from yellowbrick.contrib.missing import MissingValuesDispersion
    >>> visualizer = MissingValuesDispersion()
    >>> visualizer.fit(X, y=y)
    >>> visualizer.show()
    """

    def __init__(self, alpha=0.5, marker="|", classes=None, **kwargs):
        # Default the target type unless the caller chose one explicitly.
        if "target_type" not in kwargs:
            kwargs["target_type"] = "single"
        super(MissingValuesDispersion, self).__init__(**kwargs)
        self.alpha = alpha
        self.marker = marker

        # Convert to array if necessary to match estimator.classes_
        self.classes_ = None if classes is None else np.array(classes)

        # Request one color per class name when names were given; otherwise
        # let color_palette decide how many colors to return.
        n_colors = None if self.classes_ is None else len(self.classes_)
        self.colors = color_palette(kwargs.pop("colors", None), n_colors)

    def get_nan_locs(self, **kwargs):
        """Gets the locations of nans in feature data and returns
        the coordinates in the matrix

        Reads ``self.X`` and ``self.y`` (presumably stored by the base class
        ``fit`` -- verify against MissingDataVisualizer).

        Returns
        -------
        nan_locs : np.ndarray or list of (target_value, np.ndarray)
            When ``self.y`` is None, the ``np.argwhere`` result holding the
            index coordinates of every missing value. Otherwise a list with
            one ``(target_value, coordinates)`` tuple per unique value in
            ``self.y``.
        """
        dtype = self.X.dtype
        # np.string_ / np.unicode_ were removed in NumPy 2.0; np.bytes_ and
        # np.str_ are the equivalent names available in both 1.x and 2.x.
        if np.issubdtype(dtype, np.bytes_) or np.issubdtype(dtype, np.str_):
            # For string data an empty string is treated as the missing value.
            mask = np.where(self.X == "")
            nan_matrix = np.zeros(self.X.shape)
            nan_matrix[mask] = np.nan
        else:
            nan_matrix = self.X.astype(float)

        if self.y is None:
            return np.argwhere(np.isnan(nan_matrix))

        nan_locs = []
        for target_value in np.unique(self.y):
            indices = np.argwhere(self.y == target_value)
            target_matrix = nan_matrix[indices.flatten()]
            # NOTE(review): the row coordinate here is the position within
            # this target's subset of rows, not the row's position in the
            # full dataset -- confirm this is the intended x-axis meaning.
            nan_target_locs = np.argwhere(np.isnan(target_matrix))
            nan_locs.append((target_value, nan_target_locs))
        return nan_locs

    @staticmethod
    def _split_coordinates(locations):
        """Split an (n, 2) coordinate array into its two columns.

        Column slicing is used instead of ``zip(*locations)`` so that an
        empty array (no missing values) yields two empty arrays rather than
        failing to unpack.
        """
        locations = np.asarray(locations)
        return locations[:, 0], locations[:, 1]

    def draw(self, X, y, **kwargs):
        """Called from the fit method, this method creates a scatter plot that
        draws each instance as a class or target colored point, whose location
        is determined by the feature data set.

        If y is not None, then it draws a scatter plot where each class is in a
        different color.
        """
        nan_locs = self.get_nan_locs()
        if y is None:
            x_, y_ = self._split_coordinates(nan_locs)
            self.ax.scatter(x_, y_, alpha=self.alpha, marker=self.marker, label=None)
        else:
            self.draw_multi_dispersion_chart(nan_locs)

    def draw_multi_dispersion_chart(self, nan_locs):
        """Draws a multi dimensional dispersion chart, each color corresponds
        to a different target variable.

        Parameters
        ----------
        nan_locs : list of (target_value, np.ndarray)
            One entry per target value, as returned by ``get_nan_locs`` when
            a target is present.
        """
        for index, (label, nan_locations) in enumerate(nan_locs):
            # if class names were passed in, use them as the legend label
            if self.classes_ is not None:
                label = self.classes_[index]

            # Wrap around so that more target values than palette entries
            # reuse colors instead of raising IndexError.
            color = self.colors[index % len(self.colors)]

            x_, y_ = self._split_coordinates(nan_locations)
            self.ax.scatter(
                x_, y_, alpha=self.alpha, marker=self.marker, color=color, label=label
            )

    def finalize(self, **kwargs):
        """
        Sets the title and x-axis label and adds a legend. Also ensures that
        the y tick labels are set to the feature names.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """
        # Set the title
        self.set_title("Dispersion of Missing Values by Feature")

        # One y tick per feature, labelled with the feature names
        tick_locations = np.arange(len(self.features_))

        self.ax.set_xlabel("Position by index")
        self.ax.set_yticks(tick_locations)
        self.ax.set_yticklabels(self.get_feature_names())
        # Anchor the legend outside the axes, to the right of the plot area
        self.ax.legend(loc="upper left", prop={"size": 5}, bbox_to_anchor=(1, 1))
##########################################################################
## Quick Method
##########################################################################


def missing_dispersion(
    X, y=None, ax=None, classes=None, alpha=0.5, marker="|", **kwargs
):
    """
    Quick method for the Missing Values Dispersion visualizer, which shows
    the locations of missing (nan) values in the feature dataset by the
    order of the index.

    When y targets are supplied, the output dispersion plot is color coded
    according to the target y that the element refers to.

    Parameters
    ----------
    X : array-like
        The feature data, handed to the visualizer's ``fit``.

    y : array-like, default: None
        Optional target values, handed to the visualizer's ``fit``.

    ax : matplotlib Axes, default: None
        Forwarded to the visualizer constructor as ``ax``.

    classes : list, default: None
        A list of class names for the legend.
        If classes is None and a y value is passed to fit then the classes
        are selected from the target vector.

    alpha : float, default: 0.5
        A value for blending elements with the background.

    marker : matplotlib marker, default: |
        The marker used for each element coordinate in the plot

    kwargs : dict
        Keyword arguments that are passed to the visualizer and may influence
        the visualization as defined in other Visualizers.

    Returns
    -------
    ax : matplotlib Axes
        The ``ax`` attribute of the visualizer after it has been fitted and
        shown.

    Examples
    --------

    >>> from yellowbrick.contrib.missing import missing_dispersion
    >>> ax = missing_dispersion(X, y=y)
    """
    # Build the visualizer from the quick-method arguments
    viz = MissingValuesDispersion(
        alpha=alpha, marker=marker, classes=classes, ax=ax, **kwargs
    )

    # Fitting triggers the drawing; show then renders the figure
    viz.fit(X, y)
    viz.show()

    # Hand back the axes the visualizer drew on
    return viz.ax