# Source code for yellowbrick.features.pca

# -*- coding: utf-8 -*-
# yellowbrick.features.pca
# Decomposition based feature visualization with PCA.
#
# Author:   Carlo Morales
# Author:   Raúl Peralta Lozada
# Author:   Benjamin Bengfort
# Created:  Tue May 23 18:34:27 2017 -0400
#
# Copyright (C) 2017 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: pca.py [] cmorales@pacificmetrics.com $

"""
Decomposition based feature visualization with PCA.
"""

##########################################################################
## Imports
##########################################################################

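# NOTE: matplotlib needs mplot3d imported to register the 3D projection; this
# module does not import it itself (presumably the projection base module
# does -- TODO confirm).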
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

from yellowbrick.style import palettes
from yellowbrick.features.projection import ProjectionVisualizer
from yellowbrick.exceptions import YellowbrickValueError, NotFitted

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA as PCATransformer
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError


##########################################################################
# 2D and 3D PCA Visualizer
##########################################################################


class PCA(ProjectionVisualizer):
    """
    Produce a two or three dimensional principal component plot of a data array
    projected onto its largest sequential principal components. It is common
    practice to scale the data array ``X`` before applying a PC decomposition.
    Variable scaling can be controlled using the ``scale`` argument.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in, the current axes
        will be used (or generated if required).

    features : list, default: None
        The names of the features specified by the columns of the input dataset.
        The length of this list must match the number of columns in X, otherwise
        an exception will be raised on ``fit()``.

    classes : list, default: None
        The class labels for each class in y, ordered by sorted class index. These
        names act as a label encoder for the legend, identifying integer classes
        or renaming string labels. If omitted, the class labels will be taken from
        the unique values in y.

        Note that the length of this list must match the number of unique values in
        y, otherwise an exception is raised. This parameter is only used in the
        discrete target type case and is ignored otherwise.

    scale : bool, default: True
        Boolean that indicates if user wants to scale data. The value is passed
        as ``with_std`` to the ``StandardScaler`` step of the internal pipeline.

    projection : int or string, default: 2
        The number of axes to project into, either 2d or 3d. To plot 3d plots
        with matplotlib, please ensure a 3d axes is passed to the visualizer,
        otherwise one will be created using the current figure.

    proj_features : bool, default: False
        Boolean that indicates if the user wants to project the features
        in the projected space. If True the plot will be similar to a biplot.

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color each
        instance according to its class in the discrete case or as an ordered
        colormap in the sequential case. If not enough colors per class are
        specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. In the discrete case
        it is used to compute the number of colors needed for each class and
        in the continuous case it is used to create a sequential color map based
        on the range of the target.

    alpha : float, default: 0.75
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered points more visible.

    random_state : int, RandomState instance or None, optional (default None)
        This parameter sets the random state on this solver. If the input X is
        larger than 500x500 and the number of components to extract is lower
        than 80% of the smallest dimension of the data, then the more efficient
        `randomized` solver is enabled.

    colorbar : bool, default: True
        If the target_type is "continuous" draw a colorbar to the right of the
        scatter plot. The colorbar axes is accessible using the cax property.

    heatmap : bool, default: False
        Add a heatmap showing contribution of each feature in the principal components.
        Also draws a colorbar for readability purpose. The heatmap is accessible
        using lax property and colorbar using uax property. Cannot be combined
        with a 3d projection.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    pca_components_ : ndarray, shape (n_components, n_features)
        The ``components_`` of the fitted PCA step: one row per principal
        component, one column per feature. This tells about the magnitude of
        each feature in the principal components and is used to draw the
        biplot vectors and the heatmap.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that
        can be used as an index to access or modify data in X. If a user passes
        feature names in, those features are used. Otherwise the columns of a
        DataFrame are used or just simply the indices of the data array.

    range_ : (min y, max y)
        A tuple that describes the minimum and maximum values in the target.
        Only available if the target type is continuous.

    Raises
    ------
    YellowbrickValueError
        If ``heatmap`` is requested together with a 3d projection.

    Examples
    --------
    >>> from sklearn import datasets
    >>> iris = datasets.load_iris()
    >>> X = iris.data
    >>> y = iris.target
    >>> visualizer = PCA()
    >>> visualizer.fit_transform(X, y)
    >>> visualizer.show()

    """

    def __init__(
        self,
        ax=None,
        features=None,
        classes=None,
        scale=True,
        projection=2,
        proj_features=False,
        colors=None,
        colormap=None,
        alpha=0.75,
        random_state=None,
        colorbar=True,
        heatmap=False,
        **kwargs
    ):
        super(PCA, self).__init__(
            ax=ax,
            features=features,
            classes=classes,
            colors=colors,
            colormap=colormap,
            projection=projection,
            alpha=alpha,
            colorbar=colorbar,
            **kwargs
        )

        # Data Parameters
        self.scale = scale
        self.proj_features = proj_features

        # Visual Parameters
        self.alpha = alpha
        self.heatmap = heatmap

        # Axes for the heatmap colorbar (uax) and the heatmap itself (lax);
        # they stay None until layout() appends them.
        self._uax, self._lax = None, None

        # No heatmap can be drawn with 3d plots as they do not permit axes
        # division. Fail before any transformer is built.
        if self.projection == 3 and self.heatmap:
            raise YellowbrickValueError(
                "heatmap and colorbar are not compatible with 3d projections"
            )

        # Create the PCA transformer. The random state is stored privately
        # and exposed through the ``random_state`` property so that updates
        # are forwarded to the PCA step of the pipeline.
        self._random_state = random_state
        self.pca_transformer = Pipeline(
            [
                ("scale", StandardScaler(with_std=self.scale)),
                ("pca", PCATransformer(self.projection, random_state=random_state)),
            ]
        )

    @property
    def random_state(self):
        """
        The random state given to the PCA step of the internal pipeline.
        Setting this property also updates the ``pca`` step of the pipeline.
        """
        return self._random_state

    @random_state.setter
    def random_state(self, val):
        self._random_state = val
        self.pca_transformer.set_params(pca__random_state=val)

    @property
    def uax(self):
        """
        The axes of the colorbar, bottom of scatter plot. This is the colorbar
        for heatmap and not for the scatter plot.

        Raises
        ------
        AttributeError
            If the colorbar axes has not been created.
        """
        if self._uax is None:
            raise AttributeError("This visualizer does not have an axes for colorbar")

        return self._uax

    @property
    def lax(self):
        """
        The axes of the heatmap below scatter plot.

        Raises
        ------
        AttributeError
            If the heatmap axes has not been created.
        """
        if self._lax is None:
            raise AttributeError("This visualizer does not have an axes for heatmap")

        return self._lax

    def layout(self, divider=None):
        """
        Creates the layout for colorbar and heatmap, adding new axes for the heatmap
        if necessary. An ``AxesDivider`` is created from ``self.ax`` when none is
        passed in and is handed to the super class ``layout``; the heatmap and
        its colorbar axes are only appended when ``self.heatmap`` is truthy.

        Parameters
        ----------
        divider: AxesDivider
            An AxesDivider to be passed among all layout calls.

        Raises
        ------
        YellowbrickValueError
            If ``make_axes_locatable`` is not available.
        """

        # Ensure matplotlib version compatibility
        if make_axes_locatable is None:
            raise YellowbrickValueError(
                (
                    "heatmap requires matplotlib 2.0.2 or greater "
                    "please upgrade matplotlib or set heatmap=False on the visualizer"
                )
            )

        # Create the divider used to carve out the colorbar and heatmap axes
        if divider is None:
            divider = make_axes_locatable(self.ax)

        # Call to super class ensures that a colorbar is drawn when target is
        # continuous.
        super(PCA, self).layout(divider)

        if self.heatmap:

            # Axes for colorbar (for heatmap); only created once.
            if self._uax is None:
                self._uax = divider.append_axes("bottom", size="10%", pad=0.7)

            # Axes for heatmap; only created once.
            if self._lax is None:
                self._lax = divider.append_axes("bottom", size="15%", pad=0.5)

    def fit(self, X, y=None, **kwargs):
        """
        Fits the visualizer and the internal scale/PCA pipeline on ``X`` and
        stores the principal components in ``pca_components_``. Nothing is
        drawn here; drawing happens in ``transform()``.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features.

        y : ndarray or Series of length n
            An array or series of target or class values.

        Returns
        -------
        self : visualizer
            Returns self for use in Pipelines.
        """
        # Call super fit to compute features, classes, colors, etc.
        super(PCA, self).fit(X=X, y=y, **kwargs)
        self.pca_transformer.fit(X)
        self.pca_components_ = self.pca_transformer.named_steps["pca"].components_
        return self

    def transform(self, X, y=None, **kwargs):
        """
        Calls the internal `transform` method of the scikit-learn PCA transformer, which
        performs a dimensionality reduction on the input features ``X``. Next calls the
        ``draw`` method of the Yellowbrick visualizer, finally returning a new array of
        transformed features of shape ``(len(X), projection)``.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features.

        y : ndarray or Series of length n
            An array or series of target or class values.

        Returns
        -------
        Xp : ndarray of shape n x projection
            Returns a new array-like object of transformed features of shape
            ``(len(X), projection)``.

        Raises
        ------
        NotFitted
            If the internal pipeline has not been fitted yet.
        """
        # Only the pipeline call is guarded: the handler exists to translate
        # scikit-learn's not-fitted error, not to mask errors raised by draw.
        try:
            Xp = self.pca_transformer.transform(X)
        except NotFittedError:
            raise NotFitted.from_estimator(self, "transform")

        self.draw(Xp, y)
        return Xp

    def draw(self, Xp, y=None):
        """
        Plots a scatterplot of points that represent the decomposition of the
        original features, `X`, projected into either 2 or 3 dimensions.

        If 2 dimensions are selected, a colorbar and heatmap can also be optionally
        included to show the magnitude of each feature value to the component.

        Parameters
        ----------
        Xp : array-like of shape (n, 2) or (n, 3)
            The matrix produced by the ``transform()`` method.

        y : array-like of shape (n,), optional
            The target, used to specify the colors of the points.

        Returns
        -------
        self.ax : matplotlib Axes object
            Returns the axes that the scatter plot was drawn on.
        """
        # Call to super draw which draws the scatter plot.
        super(PCA, self).draw(Xp, y)

        if self.proj_features:
            # Draws projection features in transformed space.
            self._draw_projection_features(Xp, y)

        if self.projection == 2 and self.heatmap:
            # NOTE: when no colormap was supplied the default sequence is
            # stored on the visualizer, so later calls see it as well.
            if not self.colormap:
                self.colormap = palettes.DEFAULT_SEQUENCE

            components = self.pca_components_
            # TODO: change to pcolormesh instead of imshow per #615 spec
            im = self.lax.imshow(
                components,
                interpolation="none",
                cmap=self.colormap,
                aspect="auto",
            )
            plt.colorbar(
                im,
                cax=self.uax,
                orientation="horizontal",
                ticks=[components.min(), 0, components.max()],
            )
        return self.ax

    def _draw_projection_features(self, Xp, y=None):
        """
        Draw the projection of features in the transformed space.

        Each feature is drawn as a red vector from the origin whose direction
        is given by its loadings in ``pca_components_`` and whose length along
        every axis is scaled by the largest projected coordinate on that axis.

        Parameters
        ----------
        Xp : array-like of shape (n, 2) or (n, 3)
            The matrix produced by the ``transform()`` method.

        y : array-like of shape (n,), optional
            The target; accepted for signature symmetry with ``draw`` and not
            used here.

        Returns
        -------
        self.ax : matplotlib Axes object
            Returns the axes that the scatter plot was drawn on.

        Raises
        ------
        YellowbrickValueError
            If the projection is neither 2 nor 3.
        """
        # Validate before indexing into the components.
        if self.projection not in (2, 3):
            raise YellowbrickValueError("Projection dimensions must be either 2 or 3")

        components = self.pca_components_
        n_features = components.shape[1]

        # Largest coordinate of the projected data along each axis.
        extents = np.asarray(Xp).max(axis=0)
        x_vector, y_vector = components[0], components[1]
        max_x, max_y = extents[0], extents[1]

        if self.projection == 2:
            for i in range(n_features):
                dx = x_vector[i] * max_x
                dy = y_vector[i] * max_y
                self.ax.arrow(
                    x=0,
                    y=0,
                    dx=dx,
                    dy=dy,
                    color="r",
                    head_width=0.05,
                    width=0.005,
                )
                # Place the label slightly beyond the arrow tip.
                self.ax.text(dx * 1.05, dy * 1.05, self.features_[i], color="r")
        else:
            z_vector = components[2]
            # The z extent comes from the third projected column.
            max_z = extents[2]
            for i in range(n_features):
                dx = x_vector[i] * max_x
                dy = y_vector[i] * max_y
                dz = z_vector[i] * max_z
                self.ax.plot([0, dx], [0, dy], [0, dz], color="r")
                # Place the label slightly beyond the end of the line.
                self.ax.text(
                    dx * 1.05,
                    dy * 1.05,
                    dz * 1.05,
                    self.features_[i],
                    color="r",
                )

        return self.ax

    def finalize(self, **kwargs):
        """
        Draws the title, axis labels and, when the heatmap is enabled, the tick
        labels of the heatmap, then tightens the figure layout. The keyword
        arguments are accepted but not used by this method.
        """
        super(PCA, self).finalize()

        self.ax.set_title("Principal Component Plot")
        self.ax.set_xlabel("$PC_1$")
        self.ax.set_ylabel("$PC_2$")
        if self.projection == 3:
            self.ax.set_zlabel("$PC_3$")

        # Truthiness test, consistent with layout() and draw().
        if self.heatmap:
            n_features = len(self.features_)

            # Major ticks sit on the cell edges and carry no labels.
            self.lax.set_xticks(np.arange(-0.5, n_features))
            self.lax.set_xticklabels([])
            # Minor ticks sit on the cell centers, which centers the labels.
            self.lax.set_xticks(np.arange(0, n_features), minor=True)
            self.lax.set_xticklabels(
                self.features_, rotation=90, fontsize=12, minor=True
            )
            self.lax.set_yticks(np.arange(0.5, 2))
            self.lax.set_yticklabels(["$PC_1$", "$PC_2$"], va="bottom", fontsize=10)
        self.fig.tight_layout()


##########################################################################
## Quick Method
##########################################################################


def pca_decomposition(
    X,
    y=None,
    ax=None,
    features=None,
    classes=None,
    scale=True,
    projection=2,
    proj_features=False,
    colors=None,
    colormap=None,
    alpha=0.75,
    random_state=None,
    colorbar=True,
    heatmap=False,
    show=True,
    **kwargs
):
    """
    Quick method: build a ``PCA`` visualizer, fit it on ``X`` and ``y``,
    transform ``X`` (which draws the plot) and then either show or finalize
    the figure.

    The result is a two or three dimensional principal component plot of the
    data array ``X`` projected onto its largest sequential principal
    components. It is common practice to scale ``X`` before applying a PC
    decomposition; this is controlled by the ``scale`` argument.

    Parameters
    ----------
    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in, the current axes
        will be used (or generated if required).

    features : list, default: None
        The names of the features specified by the columns of the input dataset.
        The length of this list must match the number of columns in X, otherwise
        an exception will be raised on ``fit()``.

    classes : list, default: None
        The class labels for each class in y, ordered by sorted class index. These
        names act as a label encoder for the legend, identifying integer classes
        or renaming string labels. If omitted, the class labels will be taken from
        the unique values in y.

        Note that the length of this list must match the number of unique values in
        y, otherwise an exception is raised. This parameter is only used in the
        discrete target type case and is ignored otherwise.

    scale : bool, default: True
        Boolean that indicates if user wants to scale data.

    projection : int or string, default: 2
        The number of axes to project into, either 2d or 3d. To plot 3d plots
        with matplotlib, please ensure a 3d axes is passed to the visualizer,
        otherwise one will be created using the current figure.

    proj_features : bool, default: False
        Boolean that indicates if the user wants to project the features
        in the projected space. If True the plot will be similar to a biplot.

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color each
        instance according to its class in the discrete case or as an ordered
        colormap in the sequential case. If not enough colors per class are
        specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. In the discrete case
        it is used to compute the number of colors needed for each class and
        in the continuous case it is used to create a sequential color map based
        on the range of the target.

    alpha : float, default: 0.75
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered points more visible.

    random_state : int, RandomState instance or None, optional (default None)
        This parameter sets the random state on this solver. If the input X is
        larger than 500x500 and the number of components to extract is lower
        than 80% of the smallest dimension of the data, then the more efficient
        `randomized` solver is enabled.

    colorbar : bool, default: True
        If the target_type is "continuous" draw a colorbar to the right of the
        scatter plot. The colorbar axes is accessible using the cax property.

    heatmap : bool, default: False
        Add a heatmap showing contribution of each feature in the principal components.
        Also draws a colorbar for readability purpose. The heatmap is accessible
        using lax property and colorbar using uax property.

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()`` however you cannot
        call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply
        calls ``finalize()``

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Returns
    -------
    visualizer : PCA
        The fitted visualizer. The attributes listed below are available on
        the returned object.

    Attributes
    ----------
    pca_components_ : ndarray, shape (n_components, n_features)
        This tells about the magnitude of each feature in the principal components.
        This is primarily used to draw the biplots.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that
        can be used as an index to access or modify data in X. If a user passes
        feature names in, those features are used. Otherwise the columns of a
        DataFrame are used or just simply the indices of the data array.

    range_ : (min y, max y)
        A tuple that describes the minimum and maximum values in the target.
        Only available if the target type is continuous.

    Examples
    --------
    >>> from sklearn import datasets
    >>> iris = datasets.load_iris()
    >>> X = iris.data
    >>> y = iris.target
    >>> pca_decomposition(X, y, colors=['r', 'g', 'b'], projection=3)

    """
    # Build the visualizer from the quick-method arguments.
    viz = PCA(
        ax=ax, features=features, classes=classes,
        scale=scale, projection=projection, proj_features=proj_features,
        colors=colors, colormap=colormap, alpha=alpha,
        random_state=random_state, colorbar=colorbar, heatmap=heatmap,
        **kwargs
    )

    # Fitting learns the components; transforming projects X and draws.
    viz.fit(X, y)
    viz.transform(X, y)

    # Either display the figure or only apply the finishing touches.
    render = viz.show if show else viz.finalize
    render()

    # Hand the visualizer back to the caller.
    return viz


# Alias for PCA: ``PCADecomposition`` is bound to the very same class object,
# so either name constructs the same visualizer (presumably kept for
# backwards compatibility -- TODO confirm).
PCADecomposition = PCA