# Source code for yellowbrick.features.pcoords

# yellowbrick.features.pcoords
# Implementations of parallel coordinates for feature analysis.
#
# Author:  Benjamin Bengfort
# Author:  @thekylesaurus
# Created: Mon Oct 03 21:46:06 2016 -0400
#
# Copyright (C) 2016 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: pcoords.py [0f4b236] benjamin@bengfort.com $

"""
Implementation of parallel coordinates for multi-dimensional feature analysis.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from numpy.random import RandomState
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import Normalizer, StandardScaler

from yellowbrick.draw import manual_legend
from yellowbrick.features.base import DataVisualizer
from yellowbrick.utils import is_dataframe, is_series
from yellowbrick.exceptions import YellowbrickTypeError, YellowbrickValueError


##########################################################################
## Quick Methods
##########################################################################

def parallel_coordinates(
    X,
    y,
    ax=None,
    features=None,
    classes=None,
    normalize=None,
    sample=1.0,
    random_state=None,
    shuffle=False,
    colors=None,
    colormap=None,
    alpha=None,
    fast=False,
    vlines=True,
    vlines_kwds=None,
    show=True,
    **kwargs
):
    """Displays each feature as a vertical axis and each instance as a line.

    This helper function is a quick wrapper to utilize the ParallelCoordinates
    Visualizer (Transformer) for one-off analysis: it builds the visualizer,
    fits it on ``X`` and ``y`` (which draws the lines), calls ``transform``,
    and then either shows or finalizes the figure.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n
        An array or series of target or class values

    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature
        names are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        If classes is None and a y value is passed to fit then the classes
        are selected from the target vector.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display.
        If float, specifies a fraction between 0 and 1 to display.

    random_state : int, RandomState instance or None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random; only used if shuffle is True and sample < 1.0

    shuffle : boolean, default: False
        specifies whether sample is drawn randomly

    colors : list or tuple, default: None
        optional list or tuple of colors to colorize lines
        Use either color to colorize the lines on a per class basis or
        colormap to color them on a continuous scale.

    colormap : string or cmap, default: None
        optional string or matplotlib cmap to colorize lines
        Use either color to colorize the lines on a per class basis or
        colormap to color them on a continuous scale.

    alpha : float, default: None
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered lines more visible.
        If None, the alpha is set to 0.5 in "fast" mode and 0.25 otherwise.

    fast : bool, default: False
        Fast mode improves the performance of the drawing time of parallel
        coordinates but produces an image that does not show the overlap of
        instances in the same class. Fast mode should be used when drawing all
        instances is too burdensome and sampling is not an option.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()`` however you cannot
        call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply
        calls ``finalize()``

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers. The same keyword
        arguments are also passed to ``fit``, which forwards them to the
        drawing methods.

    Returns
    -------
    viz : ParallelCoordinates
        Returns the fitted, finalized visualizer
    """
    # Instantiate the visualizer
    visualizer = ParallelCoordinates(
        ax=ax,
        features=features,
        classes=classes,
        normalize=normalize,
        sample=sample,
        random_state=random_state,
        shuffle=shuffle,
        colors=colors,
        colormap=colormap,
        alpha=alpha,
        fast=fast,
        vlines=vlines,
        vlines_kwds=vlines_kwds,
        **kwargs
    )

    # Fit and transform the visualizer (calls draw)
    # NOTE(review): kwargs are given to both the constructor above and to fit
    # here, so every extra keyword must be acceptable to both the visualizer
    # and the plotting call that fit forwards it to -- confirm this is intended.
    visualizer.fit(X, y, **kwargs)
    visualizer.transform(X)

    if show:
        visualizer.show()
    else:
        visualizer.finalize()

    # Return the visualizer object
    return visualizer


##########################################################################
## Static Parallel Coordinates Visualizer
##########################################################################


class ParallelCoordinates(DataVisualizer):
    """
    Parallel coordinates displays each feature as a vertical axis spaced
    evenly along the horizontal, and each instance as a line drawn between
    each individual axis. This allows you to detect braids of similar instances
    and separability that suggests a good classification problem.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    features : list, default: None
        a list of feature names to use
        If a DataFrame is passed to fit and features is None, feature
        names are selected as the columns of the DataFrame.

    classes : list, default: None
        a list of class names for the legend
        The class labels for each class in y, ordered by sorted class index. These
        names act as a label encoder for the legend, identifying integer classes
        or renaming string labels. If omitted, the class labels will be taken from
        the unique values in y.

        Note that the length of this list must match the number of unique values in
        y, otherwise an exception is raised.

    normalize : string or None, default: None
        specifies which normalization method to use, if any
        Current supported options are 'minmax', 'maxabs', 'standard', 'l1',
        and 'l2'.

    sample : float or int, default: 1.0
        specifies how many examples to display from the data
        If int, specifies the maximum number of samples to display.
        If float, specifies a fraction between 0 and 1 to display.

    random_state : int, RandomState instance or None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random; only used if shuffle is True and sample < 1.0

    shuffle : boolean, default: False
        specifies whether sample is drawn randomly

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color each
        instance according to its class. If not enough colors per class are
        specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. If classes are
        specified the colormap is used to evenly space colors across each class.

    alpha : float, default: None
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered lines more visible.
        If None, the alpha is set to 0.5 in "fast" mode and 0.25 otherwise.

    fast : bool, default: False
        Fast mode improves the performance of the drawing time of parallel
        coordinates but produces an image that does not show the overlap of
        instances in the same class. Fast mode should be used when drawing all
        instances is too burdensome and sampling is not an option.

    vlines : boolean, default: True
        flag to determine vertical line display

    vlines_kwds : dict, default: None
        options to style or display the vertical lines, default: None

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    n_samples_ : int
        number of samples included in the visualization object

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that
        can be used as an index to access or modify data in X. If a user passes
        feature names in, those features are used. Otherwise the columns of a
        DataFrame are used or just simply the indices of the data array.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    Examples
    --------

    >>> visualizer = ParallelCoordinates()
    >>> visualizer.fit(X, y)
    >>> visualizer.transform(X)
    >>> visualizer.show()
    """

    # Maps the accepted values of ``normalize`` to the scaler that is refit
    # (via fit_transform) on the sampled data each time fit is called.
    NORMALIZERS = {
        "minmax": MinMaxScaler(),
        "maxabs": MaxAbsScaler(),
        "standard": StandardScaler(),
        "l1": Normalizer(norm="l1"),
        "l2": Normalizer(norm="l2"),
    }

    def __init__(
        self,
        ax=None,
        features=None,
        classes=None,
        normalize=None,
        sample=1.0,
        random_state=None,
        shuffle=False,
        colors=None,
        colormap=None,
        alpha=None,
        fast=False,
        vlines=True,
        vlines_kwds=None,
        **kwargs
    ):
        """
        Validate and store the visualizer configuration.

        Raises
        ------
        YellowbrickValueError
            If ``normalize`` is not None or a key of ``NORMALIZERS``, if an
            int ``sample`` is less than 1, or if a float ``sample`` is not in
            the interval (0, 1].

        YellowbrickTypeError
            If ``sample`` is neither int nor float, if ``shuffle`` is not a
            bool, or if ``shuffle`` is True and ``random_state`` is not None,
            an int, or a ``RandomState``.
        """
        # Default to a discrete target unless the caller chose otherwise
        kwargs.setdefault("target_type", "discrete")
        super(ParallelCoordinates, self).__init__(
            ax=ax,
            features=features,
            classes=classes,
            colors=colors,
            colormap=colormap,
            **kwargs
        )

        # Validate 'normalize' argument
        if normalize is not None and normalize not in self.NORMALIZERS:
            raise YellowbrickValueError(
                "'{}' is an unrecognized normalization method".format(normalize)
            )
        self.normalize = normalize

        # Validate 'sample' argument
        if isinstance(sample, int):
            if sample < 1:
                # A value of exactly 1 is accepted, so say so in the message
                raise YellowbrickValueError(
                    "`sample` parameter of type `int` must be greater than "
                    "or equal to 1"
                )
        elif isinstance(sample, float):
            if sample <= 0 or sample > 1:
                raise YellowbrickValueError(
                    "`sample` parameter of type `float` must be between 0 and 1"
                )
        else:
            raise YellowbrickTypeError("`sample` parameter must be int or float")
        self.sample = sample

        # Set sample parameters
        if not isinstance(shuffle, bool):
            raise YellowbrickTypeError("`shuffle` parameter must be boolean")
        self.shuffle = shuffle

        # The random number generator is only needed (and only validated)
        # when the sample is drawn randomly.
        if self.shuffle:
            if (random_state is None) or isinstance(random_state, int):
                self._rng = RandomState(random_state)
            elif isinstance(random_state, RandomState):
                self._rng = random_state
            else:
                raise YellowbrickTypeError(
                    "`random_state` must be None, int, or np.random.RandomState"
                )
        else:
            self._rng = None

        # Visual and drawing parameters
        self.fast = fast
        self.alpha = alpha
        self.show_vlines = vlines
        self.vlines_kwds = vlines_kwds or {"linewidth": 1, "color": "black"}

        # Internal properties
        self._increments = None
        self._colors = None

    def fit(self, X, y=None, **kwargs):
        """
        The fit method is the primary drawing input for the
        visualization since it has both the X and y data required for the
        viz and the transform method does not.

        The data is subsampled first and only then normalized, so any
        normalizer is fit on the sampled instances alone.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values. Although the
            signature defaults to None, the subsampling step indexes ``y``,
            so a target must be supplied.

        kwargs : dict
            Pass generic arguments to the drawing method

        Returns
        -------
        self : instance
            Returns the instance of the transformer/visualizer
        """
        # Determine the features, classes, and colors
        super(ParallelCoordinates, self).fit(X, y)

        # Convert from pandas data types
        if is_dataframe(X):
            X = X.values
        if is_series(y):
            y = y.values

        # Ticks for each feature specified
        self._increments = np.arange(len(self.features_))

        # Subsample instances
        X, y = self._subsample(X, y)

        # Normalize instances
        if self.normalize is not None:
            X = self.NORMALIZERS[self.normalize].fit_transform(X)

        self.draw(X, y, **kwargs)
        return self

    def draw(self, X, y, **kwargs):
        """
        Called from the fit method, this method creates the parallel
        coordinates canvas and draws each instance and vertical lines on it.

        Dispatches to ``draw_classes`` when ``fast`` is set and to
        ``draw_instances`` otherwise, returning whatever that method returns.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        kwargs : dict
            Pass generic arguments to the drawing method

        """
        if self.fast:
            return self.draw_classes(X, y, **kwargs)
        return self.draw_instances(X, y, **kwargs)

    def draw_instances(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single instance. This is the "slow" mode of drawing, since each
        instance has to be drawn individually. However, in so doing, the
        density of instances in braids is more apparent since lines have an
        independent alpha that is compounded in the figure.

        This is the default method of drawing.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        kwargs : dict
            Additional keyword arguments forwarded to ``ax.plot``

        Returns
        -------
        ax : matplotlib Axes
            The axes the instances were drawn on

        Notes
        -----
        This method can be used to draw additional instances onto the parallel
        coordinates before the figure is finalized.
        """
        # Get alpha from param or default; compare against None so that an
        # explicit alpha of 0 is honoured instead of replaced by the default.
        alpha = 0.25 if self.alpha is None else self.alpha

        for Xi, yi in zip(X, y):
            color = self.get_colors([yi])[0]
            self.ax.plot(self._increments, Xi, color=color, alpha=alpha, **kwargs)

        return self.ax

    def draw_classes(self, X, y, **kwargs):
        """
        Draw the instances colored by the target y such that each line is a
        single class. This is the "fast" mode of drawing, since the number of
        lines drawn equals the number of classes, rather than the number of
        instances. However, this drawing method sacrifices inter-class density
        of points using the alpha parameter.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array or series of target or class values

        kwargs : dict
            Additional keyword arguments forwarded to ``ax.plot``

        Returns
        -------
        ax : matplotlib Axes
            The axes the classes were drawn on
        """
        # Get alpha from param or default; compare against None so that an
        # explicit alpha of 0 is honoured instead of replaced by the default.
        alpha = 0.5 if self.alpha is None else self.alpha

        # Prepare to flatten data within each class:
        #   introduce separation between individual data points using None in
        #   x-values and arbitrary value (one) in y-values
        X_separated = np.hstack([X, np.ones((X.shape[0], 1))])
        increments_separated = self._increments.tolist()
        increments_separated.append(None)

        # Plot each class that exists in the dataset, y, as a single line plot
        for yi in np.unique(y):
            color = self.get_colors([yi])[0]

            X_in_class = X_separated[y == yi, :]
            if len(X_in_class) == 0:
                continue

            # Repeat the x positions (plus the None separator) once per row
            increments_in_class = increments_separated * len(X_in_class)
            self.ax.plot(
                increments_in_class,
                X_in_class.flatten(),
                linewidth=1,
                color=color,
                alpha=alpha,
                **kwargs
            )

        return self.ax

    def finalize(self, **kwargs):
        """
        Performs the final rendering for the multi-axis visualization, including
        setting and rendering the vertical axes each instance is plotted on. Adds
        a title, a legend, and manages the grid.

        Parameters
        ----------
        kwargs: generic keyword arguments.

        Notes
        -----
        Generally this method is called from show and not directly by the user.
        """
        # Set the title
        self.set_title(
            "Parallel Coordinates for {} Features".format(len(self.features_))
        )

        # Add the vertical lines
        # TODO: Make an independent function for override!
        if self.show_vlines:
            for idx in self._increments:
                self.ax.axvline(idx, **self.vlines_kwds)

        # Set the limits
        self.ax.set_xticks(self._increments)
        self.ax.set_xticklabels(self.features_)
        self.ax.set_xlim(self._increments[0], self._increments[-1])

        # Add the legend sorting classes by name
        labels = sorted(self._colors)
        colors = [self._colors[lbl] for lbl in labels]
        manual_legend(self, labels, colors, loc="best", frameon=True)

        # Add the grid view
        self.ax.grid()

    def _subsample(self, X, y):
        """
        Select the instances to display according to ``sample`` and
        ``shuffle``, and record how many were kept in ``n_samples_``.

        An int ``sample`` is capped at the number of rows in X; a float
        ``sample`` is multiplied by the number of rows and truncated to an
        int. When shuffling is enabled and fewer rows than available are
        requested, rows are chosen at random without replacement; otherwise
        the first ``n_samples`` rows are taken.

        Parameters
        ----------
        X : ndarray of shape n x m
            A matrix of n instances with m features

        y : ndarray of length n
            An array of target or class values

        Returns
        -------
        X, y : tuple
            The selected rows of X and the matching entries of y
        """
        # Choose a subset of samples
        if isinstance(self.sample, int):
            n_samples = min([self.sample, len(X)])
        elif isinstance(self.sample, float):
            n_samples = int(len(X) * self.sample)

        if (n_samples < len(X)) and self.shuffle:
            indices = self._rng.choice(len(X), n_samples, replace=False)
        else:
            indices = slice(n_samples)
        X = X[indices, :]
        y = y[indices]

        self.n_samples_ = n_samples
        return X, y