# yellowbrick.cluster.icdm
# Implements Intercluster Distance Map visualizations.
#
# Author:  Benjamin Bengfort
# Created: Tue Aug 21 11:56:53 2018 -0400
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: icdm.py [2f23976] benjamin@bengfort.com $

"""
Implements Intercluster Distance Map visualizations.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np
import matplotlib.pyplot as plt

from matplotlib.patches import Circle
from sklearn.manifold import MDS, TSNE

from yellowbrick.utils.timer import Timer
from yellowbrick.utils.decorators import memoized
from yellowbrick.exceptions import YellowbrickValueError
from yellowbrick.cluster.base import ClusteringScoreVisualizer
from yellowbrick.utils.helpers import prop_to_size, check_fitted

try:
    # Only available in Matplotlib >= 2.0.2
    from mpl_toolkits.axes_grid1 import inset_locator
except ImportError:
    inset_locator = None


## Packages for export
__all__ = [
    "InterclusterDistance",
    "intercluster_distance",
    "VALID_EMBEDDING",
    "VALID_SCORING",
    "ICDM",
]


# Valid strings to use for embedding names
VALID_EMBEDDING = {"mds", "tsne"}

# Valid strings to use for scoring names
VALID_SCORING = {"membership"}


##########################################################################
## InterclusterDistance Visualizer
##########################################################################


class InterclusterDistance(ClusteringScoreVisualizer):
    """
    Intercluster distance maps display an embedding of the cluster centers in
    2 dimensions with the distance to other centers preserved. E.g. the closer
    the centers are in the visualization, the closer they are in the original
    feature space. The clusters are sized according to a scoring metric. By
    default, they are sized by membership, e.g. the number of instances that
    belong to each center. This gives a sense of the relative importance of
    clusters. Note however, that just because two clusters overlap in the 2D
    space does not imply that they overlap in the original feature space.

    Parameters
    ----------
    estimator : a Scikit-Learn clusterer
        Should be an instance of a centroidal clustering algorithm (or a
        hierarchical algorithm with a specified number of clusters). Also
        accepts some other models like LDA for text clustering. If it is not a
        clusterer, an exception is raised. If the estimator is not fitted, it
        is fit when the visualizer is fitted, unless otherwise specified by
        ``is_fitted``.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    min_size : int, default: 400
        The size, in points, of the smallest cluster drawn on the graph.
        Cluster sizes will be scaled between the min and max sizes.

    max_size : int, default: 25000
        The size, in points, of the largest cluster drawn on the graph.
        Cluster sizes will be scaled between the min and max sizes.

    embedding : default: 'mds'
        The algorithm used to embed the cluster centers in 2 dimensional space
        so that the distance between clusters is represented equivalently to
        their relationship in feature space. Embedding algorithm options
        include:

        - **mds**: multidimensional scaling
        - **tsne**: stochastic neighbor embedding

    scoring : default: 'membership'
        The scoring method used to determine the size of the clusters drawn on
        the graph so that the relative importance of clusters can be viewed.
        Scoring method options include:

        - **membership**: number of instances belonging to each cluster

    legend : bool, default: True
        Whether or not to draw the size legend onto the graph, omit the legend
        to more easily see clusters that overlap.

    legend_loc : str, default: "lower left"
        The location of the legend on the graph, used to move the legend out
        of the way of clusters into open space. The same legend location
        options for matplotlib are used here.

        .. seealso:: https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend

    legend_size : float, default: 1.5
        The size, in inches, of the size legend to inset into the graph.

    random_state : int or RandomState, default: None
        Fixes the random state for stochastic embedding algorithms.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    kwargs : dict
        Keyword arguments passed to the base class and may influence the
        feature visualization properties.

    Attributes
    ----------
    cluster_centers_ : array of shape (n_clusters, n_features)
        The computed cluster centers from the underlying model.

    embedded_centers_ : array of shape (n_clusters, 2)
        The positions of all the cluster centers on the graph.

    scores_ : array of shape (n_clusters,)
        The scores of each cluster that determine its size on the graph.

    fit_time_ : Timer
        The time it took to fit the clustering model and perform the embedding.

    Notes
    -----
    Currently the only two embeddings supported are MDS and TSNE. Soon to
    follow will be PCoA and a customized version of PCoA for LDA. The only
    supported scoring metric is membership, but in the future, silhouette
    scores and cluster diameter will be added.

    In terms of algorithm support, right now any clustering algorithm that has
    a learned ``cluster_centers_`` and ``labels_`` attribute will work with
    the visualizer. In the future, we will update this to work with
    hierarchical clusterers that have ``n_components`` and LDA.
    """

    def __init__(
        self,
        estimator,
        ax=None,
        min_size=400,
        max_size=25000,
        embedding="mds",
        scoring="membership",
        legend=True,
        legend_loc="lower left",
        legend_size=1.5,
        random_state=None,
        is_fitted="auto",
        **kwargs
    ):
        # Initialize the visualizer bases, passing is_fitted through so that
        # fit() can consult it via self.is_fitted
        super(InterclusterDistance, self).__init__(
            estimator, ax=ax, is_fitted=is_fitted, **kwargs
        )

        # Ensure that a valid embedding and scoring is passed in
        validate_embedding(embedding)
        validate_scoring(scoring)

        # Set decomposition properties
        self.scoring = scoring
        self.embedding = embedding
        self.random_state = random_state

        # Set visual properties
        self.legend = legend
        self.min_size = min_size
        self.max_size = max_size
        self.legend_loc = legend_loc
        self.legend_size = legend_size

        # Colors are currently hardcoded, need to compute face and edge color
        # from this color based on the alpha of the cluster center. The user
        # can "hack" these properties before drawing, however.
        self.facecolor = "#2e719344"
        self.edgecolor = "#2e719399"

        # If legend is True, access the lax property now so that an exception
        # is raised at instantiation if the matplotlib version is too old.
        if self.legend:
            self.lax

    @memoized
    def lax(self):
        """
        Returns the legend axes, creating it only on demand by creating a 2"
        by 2" inset axes that has no grid, ticks, spines or face frame (e.g.
        is mostly invisible). The legend can then be drawn on this axes.
        """
        if inset_locator is None:
            raise YellowbrickValueError(
                (
                    "intercluster distance map legend requires matplotlib 2.0.2 "
                    "or later; please upgrade matplotlib or set legend=False"
                )
            )

        lax = inset_locator.inset_axes(
            self.ax,
            width=self.legend_size,
            height=self.legend_size,
            loc=self.legend_loc,
        )

        lax.set_frame_on(False)
        lax.set_facecolor("none")
        lax.grid(False)
        lax.set_xlim(-1.4, 1.4)
        lax.set_ylim(-1.4, 1.4)
        lax.set_xticks([])
        lax.set_yticks([])

        for name in lax.spines:
            lax.spines[name].set_visible(False)

        return lax

    @memoized
    def transformer(self):
        """
        Creates the internal transformer that maps the cluster centers from
        their high dimensional space to two dimensional space.
        """
        ttype = self.embedding.lower()  # transformer method type

        if ttype == "mds":
            return MDS(n_components=2, random_state=self.random_state)

        if ttype == "tsne":
            return TSNE(n_components=2, random_state=self.random_state)

        raise YellowbrickValueError("unknown embedding '{}'".format(ttype))

    @property
    def cluster_centers_(self):
        """
        Searches for or creates cluster centers for the specified clustering
        algorithm. This algorithm ensures that the centers are appropriately
        drawn and scaled so that distances between clusters are maintained.
        """
        # TODO: Handle agglomerative clustering and LDA
        for attr in ("cluster_centers_",):
            try:
                return getattr(self.estimator, attr)
            except AttributeError:
                continue

        raise AttributeError(
            "could not find or make cluster_centers_ for {}".format(
                self.estimator.__class__.__name__
            )
        )

    def fit(self, X, y=None):
        """
        Fit the clustering model, compute the cluster centers, then embed the
        centers into 2D space using the specified embedding method.
        """
        with Timer() as self.fit_time_:
            if not check_fitted(self.estimator, is_fitted_by=self.is_fitted):
                # Fit the underlying estimator
                self.estimator.fit(X, y)

            # Get the centers
            # TODO: is this how sklearn stores all centers in the model?
            C = self.cluster_centers_

            # Embed the centers in 2D space and get the cluster scores
            self.embedded_centers_ = self.transformer.fit_transform(C)
            self.scores_ = self._score_clusters(X, y)

        # Draw the clusters and fit returns self
        self.draw()
        return self

    def draw(self):
        """
        Draw the embedded centers with their sizes on the visualization.
        """
        # Compute the sizes of the markers from their score
        sizes = self._get_cluster_sizes()

        # Draw the scatter plots with associated sizes on the graph
        self.ax.scatter(
            self.embedded_centers_[:, 0],
            self.embedded_centers_[:, 1],
            s=sizes,
            c=self.facecolor,
            edgecolor=self.edgecolor,
            linewidth=1,
        )

        # Annotate the clusters with their labels
        for i, pt in enumerate(self.embedded_centers_):
            self.ax.text(
                s=str(i), x=pt[0], y=pt[1], va="center", ha="center", fontweight="bold"
            )

        # Ensure the current axes is always the main axes of the visualizer
        plt.sca(self.ax)
        return self.ax

    def finalize(self):
        """
        Finalize the visualization to create an "origin grid" feel instead of
        the default matplotlib feel. Set the title, remove spines, and label
        the grid with components. This function also adds a legend from the
        sizes if required.
        """
        # Set the default title if a user hasn't supplied one
        self.set_title(
            "{} Intercluster Distance Map (via {})".format(
                self.estimator.__class__.__name__, self.embedding.upper()
            )
        )

        # Create the origin grid and minimalist display
        self.ax.set_xticks([0])
        self.ax.set_yticks([0])
        self.ax.set_xticklabels([])
        self.ax.set_yticklabels([])
        self.ax.set_xlabel("PC2")
        self.ax.set_ylabel("PC1")

        # Make the legend by creating an inset axes that shows relative sizing
        # based on the scoring metric supplied by the user.
        if self.legend:
            self._make_size_legend()

    def _score_clusters(self, X, y=None):
        """
        Determines the "scores" of the clusters, e.g. the metric that
        determines the size of each cluster as drawn on the visualization.
        """
        stype = self.scoring.lower()  # scoring method name

        if stype == "membership":
            return np.bincount(self.estimator.labels_)

        raise YellowbrickValueError("unknown scoring method '{}'".format(stype))

    def _get_cluster_sizes(self):
        """
        Returns the marker size (in points, e.g. the area of the circle) based
        on the scores, using the prop_to_size scaling mechanism.
        """
        # NOTE: log and power are hardcoded, should we allow the user to specify?
        return prop_to_size(
            self.scores_, mi=self.min_size, ma=self.max_size, log=False, power=0.5
        )

    def _make_size_legend(self):
        """
        Draw a legend that shows relative sizes of the clusters at the 25th,
        50th, and 75th percentile based on the current scoring metric.
        """
        # Compute the size of the markers and scale them to our figure size
        # NOTE: the marker size is the area of the plot, we need to compute the
        # radius of the markers.
        areas = self._get_cluster_sizes()
        radii = np.sqrt(areas / np.pi)
        scaled = np.interp(radii, (radii.min(), radii.max()), (0.1, 1))

        # Compute the locations of the 25th, 50th, and 75th percentile scores
        indices = np.array([percentile_index(self.scores_, p) for p in (25, 50, 75)])

        # Draw size circles annotated with the percentile score as the legend.
        for idx in indices:
            # TODO: should the size circle's center be hard coded like this?
            center = (-0.30, 1 - scaled[idx])
            c = Circle(
                center,
                scaled[idx],
                facecolor="none",
                edgecolor="#2e7193",
                linewidth=1.5,
                linestyle="--",
            )
            self.lax.add_patch(c)

            # Add annotation to the size circle with the value of the score
            self.lax.annotate(
                self.scores_[idx],
                (-0.30, 1 - (2 * scaled[idx])),
                xytext=(1, 1 - (2 * scaled[idx])),
                arrowprops=dict(arrowstyle="wedge", color="#2e7193"),
                va="center",
                ha="center",
            )

        # Draw the size legend title
        self.lax.text(s="membership", x=0, y=1.2, va="center", ha="center")

        # Ensure the current axes is always the main axes after modifying the
        # inset axes and while drawing.
        plt.sca(self.ax)
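

# The following usage sketch is an editorial addition, not part of the
# library: it shows the typical fit/show workflow for the visualizer above.
# It assumes scikit-learn's make_blobs and KMeans; the helper name
# example_intercluster_distance is hypothetical and only for illustration.
def example_intercluster_distance():
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    # Generate synthetic data with 12 features and 6 well-separated centers
    X, _ = make_blobs(n_samples=1000, n_features=12, centers=6, random_state=42)

    # Wrap a KMeans model; fit() clusters the data, embeds the 6 centers into
    # 2D (MDS by default), and sizes each circle by cluster membership
    visualizer = InterclusterDistance(KMeans(n_clusters=6, random_state=42))
    visualizer.fit(X)
    visualizer.show()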


# alias
ICDM = InterclusterDistance


##########################################################################
## Helper Methods
##########################################################################


def percentile_index(a, q):
    """
    Returns the index of the value at the Qth percentile in array a.
    """
    # NOTE: NumPy 1.22 renamed the ``interpolation`` keyword to ``method``
    return np.where(a == np.percentile(a, q, interpolation="nearest"))[0][0]


def validate_string_param(s, valid, param_name="param"):
    """
    Raises a well formatted exception if s is not in valid, otherwise does not
    raise an exception. Uses ``param_name`` to identify the parameter.
    """
    if s.lower() not in valid:
        raise YellowbrickValueError(
            "unknown {} '{}', choose from '{}'".format(param_name, s, ", ".join(valid))
        )


def validate_embedding(param):
    """
    Raises an exception if the param is not in VALID_EMBEDDING
    """
    validate_string_param(param, VALID_EMBEDDING, "embedding")


def validate_scoring(param):
    """
    Raises an exception if the param is not in VALID_SCORING
    """
    validate_string_param(param, VALID_SCORING, "scoring")
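

# A brief worked example of percentile_index (editorial sketch, not part of
# the library): with "nearest" interpolation, np.percentile returns an actual
# element of the array, so the helper can recover that element's index. The
# legend code uses these indices to pick representative clusters at the 25th,
# 50th, and 75th percentiles. The demo function name is hypothetical.
def _demo_percentile_index():
    a = np.array([10, 20, 30, 40, 50])
    assert percentile_index(a, 25) == 1  # 20 is the 25th percentile value
    assert percentile_index(a, 50) == 2  # 30 is the median
    assert percentile_index(a, 75) == 3  # 40 is the 75th percentile value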

##########################################################################
## Quick Method
##########################################################################


def intercluster_distance(
    estimator,
    X,
    y=None,
    ax=None,
    min_size=400,
    max_size=25000,
    embedding="mds",
    scoring="membership",
    legend=True,
    legend_loc="lower left",
    legend_size=1.5,
    random_state=None,
    is_fitted="auto",
    show=True,
    **kwargs
):
    """Quick Method:
    Intercluster distance maps display an embedding of the cluster centers in
    2 dimensions with the distance to other centers preserved. E.g. the closer
    the centers are in the visualization, the closer they are in the original
    feature space. The clusters are sized according to a scoring metric. By
    default, they are sized by membership, e.g. the number of instances that
    belong to each center. This gives a sense of the relative importance of
    clusters. Note however, that just because two clusters overlap in the 2D
    space does not imply that they overlap in the original feature space.

    Parameters
    ----------
    estimator : a Scikit-Learn clusterer
        Should be an instance of a centroidal clustering algorithm (or a
        hierarchical algorithm with a specified number of clusters). Also
        accepts some other models like LDA for text clustering. If it is not a
        clusterer, an exception is raised. If the estimator is not fitted, it
        is fit when the visualizer is fitted, unless otherwise specified by
        ``is_fitted``.

    X : array-like of shape (n, m)
        A matrix or data frame with n instances and m features

    y : array-like of shape (n,), optional
        A vector or series representing the target for each instance

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    min_size : int, default: 400
        The size, in points, of the smallest cluster drawn on the graph.
        Cluster sizes will be scaled between the min and max sizes.

    max_size : int, default: 25000
        The size, in points, of the largest cluster drawn on the graph.
        Cluster sizes will be scaled between the min and max sizes.

    embedding : default: 'mds'
        The algorithm used to embed the cluster centers in 2 dimensional space
        so that the distance between clusters is represented equivalently to
        their relationship in feature space. Embedding algorithm options
        include:

        - **mds**: multidimensional scaling
        - **tsne**: stochastic neighbor embedding

    scoring : default: 'membership'
        The scoring method used to determine the size of the clusters drawn on
        the graph so that the relative importance of clusters can be viewed.
        Scoring method options include:

        - **membership**: number of instances belonging to each cluster

    legend : bool, default: True
        Whether or not to draw the size legend onto the graph, omit the legend
        to more easily see clusters that overlap.

    legend_loc : str, default: "lower left"
        The location of the legend on the graph, used to move the legend out
        of the way of clusters into open space. The same legend location
        options for matplotlib are used here.

        .. seealso:: https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend

    legend_size : float, default: 1.5
        The size, in inches, of the size legend to inset into the graph.

    random_state : int or RandomState, default: None
        Fixes the random state for stochastic embedding algorithms.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()`` however
        you cannot call ``plt.savefig`` from this signature, nor
        ``clear_figure``. If False, simply calls ``finalize()``.

    kwargs : dict
        Keyword arguments passed to the base class and may influence the
        feature visualization properties.

    Returns
    -------
    viz : InterclusterDistance
        The intercluster distance visualizer, fitted and finalized.
    """
    oz = InterclusterDistance(
        estimator,
        ax=ax,
        min_size=min_size,
        max_size=max_size,
        embedding=embedding,
        scoring=scoring,
        legend=legend,
        legend_loc=legend_loc,
        legend_size=legend_size,
        random_state=random_state,
        is_fitted=is_fitted,
        **kwargs
    )
    oz.fit(X, y)

    if show:
        oz.show()
    else:
        oz.finalize()

    return oz
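

# Editorial usage sketch for the quick method above, not part of the module.
# Running this file directly renders an ICDM for KMeans on synthetic blobs,
# here with the TSNE embedding instead of the default MDS (assumes
# scikit-learn's make_blobs and KMeans).
if __name__ == "__main__":
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1000, n_features=12, centers=6, random_state=38)

    # One call fits the model, embeds the centers, and shows the figure;
    # pass show=False to finalize without rendering.
    intercluster_distance(
        KMeans(n_clusters=6, random_state=38), X, embedding="tsne", random_state=38
    )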