Source code for yellowbrick.features.pcoords

# yellowbrick.features.pcoords
# Implementations of parallel coordinates for feature analysis.
#
# Author:  Benjamin Bengfort
# Author:  @thekylesaurus
# Created: Mon Oct 03 21:46:06 2016 -0400
#
# Copyright (C) 2016 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: pcoords.py [0f4b236] benjamin@bengfort.com $

"""
Implementation of parallel coordinates for multi-dimensional feature analysis.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from numpy.random import RandomState
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import Normalizer, StandardScaler

from yellowbrick.draw import manual_legend
from yellowbrick.features.base import DataVisualizer
from yellowbrick.utils import is_dataframe, is_series
from yellowbrick.exceptions import YellowbrickTypeError, YellowbrickValueError


##########################################################################
## Quick Methods
##########################################################################

def parallel_coordinates(
    X,
    y,
    ax=None,
    features=None,
    classes=None,
    normalize=None,
    sample=1.0,
    random_state=None,
    shuffle=False,
    colors=None,
    colormap=None,
    alpha=None,
    fast=False,
    vlines=True,
    vlines_kwds=None,
    show=True,
    **kwargs
):
    """Displays each feature as a vertical axis and each instance as a line.

    Quick method: builds a ParallelCoordinates visualizer from the supplied
    arguments, fits and transforms it on ``X`` and ``y``, then either shows
    or finalizes the figure. Intended for one-off analysis.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature
        names are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        If classes is None and a y value is passed to fit then the classes
        are selected from the target vector.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display.
        If float, specifies a fraction between 0 and 1 to display.

    random_state : int, RandomState instance or None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random; only used if shuffle is True and sample < 1.0

    shuffle : boolean, default: False
        specifies whether sample is drawn randomly

    colors : list or tuple, default: None
        optional list or tuple of colors to colorize lines
        Use either color to colorize the lines on a per class basis or
        colormap to color them on a continuous scale.

    colormap : string or cmap, default: None
        optional string or matplotlib cmap to colorize lines
        Use either color to colorize the lines on a per class basis or
        colormap to color them on a continuous scale.

    alpha : float, default: None
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered lines more visible.
        If None, the alpha is set to 0.5 in "fast" mode and 0.25 otherwise.

    fast : bool, default: False
        Fast mode improves the performance of the drawing time of parallel
        coordinates but produces an image that does not show the overlap of
        instances in the same class. Fast mode should be used when drawing all
        instances is too burdensome and sampling is not an option.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()`` however
        you cannot call ``plt.savefig`` from this signature, nor
        ``clear_figure``. If False, simply calls ``finalize()``

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers. The same keyword
        arguments are also forwarded to ``fit``.

    Returns
    -------
    viz : ParallelCoordinates
        Returns the fitted, finalized visualizer
    """
    # Build the visualizer from the explicit options plus any extras
    oz = ParallelCoordinates(
        ax=ax,
        features=features,
        classes=classes,
        normalize=normalize,
        sample=sample,
        random_state=random_state,
        shuffle=shuffle,
        colors=colors,
        colormap=colormap,
        alpha=alpha,
        fast=fast,
        vlines=vlines,
        vlines_kwds=vlines_kwds,
        **kwargs
    )

    # fit() does the drawing; transform() is then invoked on the same data
    oz.fit(X, y, **kwargs)
    oz.transform(X)

    # Either display the figure or only apply the final rendering steps
    render = oz.show if show else oz.finalize
    render()

    return oz
##########################################################################
## Static Parallel Coordinates Visualizer
##########################################################################
class ParallelCoordinates(DataVisualizer):
    """
    Parallel coordinates displays each feature as a vertical axis spaced
    evenly along the horizontal, and each instance as a line drawn between
    each individual axis. This allows you to detect braids of similar instances
    and separability that suggests a good classification problem.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature
        names are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        The class labels for each class in y, ordered by sorted class index.
        These names act as a label encoder for the legend, identifying integer
        classes or renaming string labels. If omitted, the class labels will be
        taken from the unique values in y.

        Note that the length of this list must match the number of unique
        values in y, otherwise an exception is raised.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display (must be
        at least 1).
        If float, specifies a fraction between 0 and 1 to display.

    random_state : int, RandomState instance or None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random; only used if shuffle is True and sample < 1.0

    shuffle : boolean, default: False
        specifies whether sample is drawn randomly

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color
        each instance according to its class. If not enough colors per class
        are specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. If classes are
        specified the colormap is used to evenly space colors across each
        class.

    alpha : float, default: None
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered lines more visible.
        If None, the alpha is set to 0.5 in "fast" mode and 0.25 otherwise.

    fast : bool, default: False
        Fast mode improves the performance of the drawing time of parallel
        coordinates but produces an image that does not show the overlap of
        instances in the same class. Fast mode should be used when drawing all
        instances is too burdensome and sampling is not an option.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    n_samples_ : int
        number of samples included in the visualization object

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that
        can be used as an index to access or modify data in X. If a user passes
        feature names in, those features are used. Otherwise the columns of a
        DataFrame are used or just simply the indices of the data array.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    Examples
    --------
    >>> visualizer = ParallelCoordinates()
    >>> visualizer.fit(X, y)
    >>> visualizer.transform(X)
    >>> visualizer.show()
    """

    # Maps the accepted ``normalize`` option names to the scikit-learn
    # transformer applied to X in ``fit``.
    NORMALIZERS = {
        "minmax": MinMaxScaler(),
        "maxabs": MaxAbsScaler(),
        "standard": StandardScaler(),
        "l1": Normalizer(norm="l1"),
        "l2": Normalizer(norm="l2"),
    }

    def __init__(
        self,
        ax=None,
        features=None,
        classes=None,
        normalize=None,
        sample=1.0,
        random_state=None,
        shuffle=False,
        colors=None,
        colormap=None,
        alpha=None,
        fast=False,
        vlines=True,
        vlines_kwds=None,
        **kwargs
    ):
        # Default to a discrete target unless the caller chose a target type
        kwargs.setdefault("target_type", "discrete")
        super(ParallelCoordinates, self).__init__(
            ax=ax,
            features=features,
            classes=classes,
            colors=colors,
            colormap=colormap,
            **kwargs
        )

        # Validate 'normalize' argument
        if normalize is None or normalize in self.NORMALIZERS:
            self.normalize = normalize
        else:
            raise YellowbrickValueError(
                "'{}' is an unrecognized normalization method".format(normalize)
            )

        # Validate 'sample' argument
        if isinstance(sample, int):
            if sample < 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `int` must be greater than "
                    "or equal to 1"
                )
        elif isinstance(sample, float):
            if sample <= 0 or sample > 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `float` must be between 0 and 1"
                )
        else:
            raise YellowbrickTypeError("`sample` parameter must be int or float")
        self.sample = sample

        # Set sample parameters
        if isinstance(shuffle, bool):
            self.shuffle = shuffle
        else:
            raise YellowbrickTypeError("`shuffle` parameter must be boolean")

        # A random number generator is only created when shuffling is enabled
        if self.shuffle:
            if (random_state is None) or isinstance(random_state, int):
                self._rng = RandomState(random_state)
            elif isinstance(random_state, RandomState):
                self._rng = random_state
            else:
                raise YellowbrickTypeError(
                    "`random_state` must be None, int, or np.random.RandomState"
                )
        else:
            self._rng = None

        # Visual and drawing parameters
        self.fast = fast
        self.alpha = alpha
        self.show_vlines = vlines
        self.vlines_kwds = vlines_kwds or {"linewidth": 1, "color": "black"}

        # Internal properties
        self._increments = None
        self._colors = None

    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Determine the features, classes, and colors
        super(ParallelCoordinates, self).fit(X, y)

        # Convert from pandas data types
        if is_dataframe(X):
            X = X.values
        if is_series(y):
            y = y.values

        # Ticks for each feature specified
        self._increments = np.arange(len(self.features_))

        # Subsample instances
        X, y = self._subsample(X, y)

        # Normalize instances
        if self.normalize is not None:
            X = self.NORMALIZERS[self.normalize].fit_transform(X)

        self.draw(X, y, **kwargs)
        return self

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the parallel
        coordinates canvas and draws each instance and vertical lines on it.
        Dispatches to ``draw_classes`` in fast mode and to ``draw_instances``
        otherwise.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method
        """
        if self.fast:
            return self.draw_classes(X, y, **kwargs)
        return self.draw_instances(X, y, **kwargs)

    def draw_instances(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single instance. This is the "slow" mode of drawing, since each
        instance has to be drawn individually. However, in so doing, the
        density of instances in braids is more apparent since lines have an
        independent alpha that is compounded in the figure.

        This is the default method of drawing.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        Notes
        -----
        This method can be used to draw additional instances onto the parallel
        coordinates before the figure is finalized.
        """
        # Only fall back to the default when alpha was not given, so that an
        # explicit alpha of 0 (fully transparent) is respected.
        alpha = 0.25 if self.alpha is None else self.alpha

        for Xi, yi in zip(X, y):
            color = self.get_colors([yi])[0]
            self.ax.plot(self._increments, Xi, color=color, alpha=alpha, **kwargs)

        return self.ax

    def draw_classes(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single class. This is the "fast" mode of drawing, since the number of
        lines drawn equals the number of classes, rather than the number of
        instances. However, this drawing method sacrifices inter-class density
        of points using the alpha parameter.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values
        """
        # Only fall back to the default when alpha was not given, so that an
        # explicit alpha of 0 (fully transparent) is respected.
        alpha = 0.5 if self.alpha is None else self.alpha

        # Prepare to flatten data within each class:
        #   introduce separation between individual data points using None in
        #   x-values and arbitrary value (one) in y-values
        X_separated = np.hstack([X, np.ones((X.shape[0], 1))])
        increments_separated = self._increments.tolist()
        increments_separated.append(None)

        # Get the classes that exist in the dataset, y
        y_values = np.unique(y)

        # Plot each class as a single line plot
        for yi in y_values:
            color = self.get_colors([yi])[0]

            X_in_class = X_separated[y == yi, :]
            increments_in_class = increments_separated * len(X_in_class)

            if len(X_in_class) > 0:
                self.ax.plot(
                    increments_in_class,
                    X_in_class.flatten(),
                    linewidth=1,
                    color=color,
                    alpha=alpha,
                    **kwargs
                )

        return self.ax

    def finalize(self, **kwargs):
        """
        Performs the final rendering for the multi-axis visualization,
        including setting and rendering the vertical axes each instance is
        plotted on. Adds a title, a legend, and manages the grid.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """
        # Set the title
        self.set_title(
            "Parallel Coordinates for {} Features".format(len(self.features_))
        )

        # Add the vertical lines
        # TODO: Make an independent function for override!
        if self.show_vlines:
            for idx in self._increments:
                self.ax.axvline(idx, **self.vlines_kwds)

        # Set the limits
        self.ax.set_xticks(self._increments)
        self.ax.set_xticklabels(self.features_)
        self.ax.set_xlim(self._increments[0], self._increments[-1])

        # Add the legend sorting classes by name
        labels = sorted(self._colors.keys())
        colors = [self._colors[lbl] for lbl in labels]
        manual_legend(self, labels, colors, loc="best", frameon=True)

        # Add the grid view
        self.ax.grid()

    def _subsample(self, X, y):
        """
        Select the subset of instances to draw according to ``sample`` and
        ``shuffle``, and record the number kept in ``n_samples_``.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array of target or class values

        Returns
        -------
        X, y : tuple of ndarray
            The selected rows of X and the matching entries of y

        Raises
        ------
        YellowbrickTypeError
            If ``sample`` is neither an int nor a float.
        """
        # Choose a subset of samples
        if isinstance(self.sample, int):
            n_samples = min(self.sample, len(X))
        elif isinstance(self.sample, float):
            n_samples = int(len(X) * self.sample)
        else:
            # Guards against ``sample`` being reassigned to an unsupported
            # type after construction.
            raise YellowbrickTypeError("`sample` parameter must be int or float")

        # Draw randomly without replacement only when actually reducing the
        # data and shuffling is enabled; otherwise take the leading rows.
        if (n_samples < len(X)) and self.shuffle:
            indices = self._rng.choice(len(X), n_samples, replace=False)
        else:
            indices = slice(n_samples)

        X = X[indices, :]
        y = y[indices]

        self.n_samples_ = n_samples
        return X, y