
# yellowbrick.features.manifold
# Use manifold algorithms for high dimensional visualization.
#
# Author:  Benjamin Bengfort
# Created: Sat May 12 11:25:24 2018 -0400
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: manifold.py [02f8c27] benjamin@bengfort.com $

"""
Use manifold algorithms for high dimensional visualization.
"""

##########################################################################
## Imports
##########################################################################

import warnings

from yellowbrick.utils.timer import Timer
from yellowbrick.utils.types import is_estimator
from yellowbrick.exceptions import ModelError, NotFitted
from yellowbrick.features.projection import ProjectionVisualizer
from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning

from sklearn.base import clone
from sklearn.exceptions import NotFittedError
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import Isomap, MDS, TSNE, SpectralEmbedding


##########################################################################
## Supported manifold algorithms by name lookup
##########################################################################

MANIFOLD_ALGORITHMS = {
    "lle": LocallyLinearEmbedding(method="standard", eigen_solver="auto"),
    "ltsa": LocallyLinearEmbedding(method="ltsa", eigen_solver="auto"),
    "hessian": LocallyLinearEmbedding(method="hessian", eigen_solver="auto"),
    "modified": LocallyLinearEmbedding(method="modified", eigen_solver="auto"),
    "isomap": Isomap(),
    "mds": MDS(),
    "spectral": SpectralEmbedding(),
    "tsne": TSNE(init="pca"),
}

MANIFOLD_NAMES = {
    "lle": "Locally Linear Embedding",
    "ltsa": "LTSA LLE",
    "hessian": "Hessian LLE",
    "modified": "Modified LLE",
    "isomap": "Isomap",
    "mds": "MDS",
    "spectral": "Spectral Embedding",
    "tsne": "t-SNE",
}
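
# A minimal sketch of how these lookup tables are used: the ``Manifold.manifold``
# setter defined below resolves a string key against MANIFOLD_ALGORITHMS, clones
# the prototype estimator, and applies hyperparameters. The variable names here
# are illustrative only and are not part of the module's API.
#
#   from sklearn.base import clone
#   proto = MANIFOLD_ALGORITHMS["isomap"]          # prototype estimator
#   est = clone(proto)                             # fresh, unfitted copy
#   est.set_params(n_components=2, n_neighbors=5)  # 2D embedding, 5 neighbors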


##########################################################################
## Manifold Embeddings
##########################################################################


class Manifold(ProjectionVisualizer):
    """
    The Manifold visualizer provides high dimensional visualization for feature
    analysis by embedding data into 2 dimensions using the sklearn.manifold
    package for manifold learning. In brief, manifold learning algorithms are
    unsupervised approaches to non-linear dimensionality reduction (unlike PCA
    or SVD) that help visualize latent structures in data.

    The manifold algorithm used to do the embedding in scatter plot space can
    either be a transformer or a string representing one of the already
    specified manifolds as follows:

        ============== ============================
        Manifold       Description
        -------------- ----------------------------
        ``"lle"``      `Locally Linear Embedding`_
        ``"ltsa"``     `LTSA LLE`_
        ``"hessian"``  `Hessian LLE`_
        ``"modified"`` `Modified LLE`_
        ``"isomap"``   `Isomap`_
        ``"mds"``      `Multi-Dimensional Scaling`_
        ``"spectral"`` `Spectral Embedding`_
        ``"tsne"``     `t-SNE`_
        ============== ============================

    Each of these algorithms embeds non-linear relationships in different ways,
    allowing for an exploration of various structures in the feature space.
    Note however, that each of these algorithms has different time, memory and
    complexity requirements; take special care when using large datasets!

    The Manifold visualizer also shows the specified target (if given) as the
    color of the scatter plot. If a classification or clustering target is
    given, then discrete colors will be used with a legend. If a regression or
    continuous target is specified, then a colormap and colorbar will be shown.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None, the current axes will be used
        or generated if required.

    manifold : str or Transformer, default: "mds"
        Specify the manifold algorithm to perform the embedding. Either one of
        the strings listed in the table above, or an actual scikit-learn
        transformer. The constructed manifold is accessible with the manifold
        property, so as to modify hyperparameters before fit.

    n_neighbors : int, default: None
        Many manifold algorithms are nearest neighbors based; for those that
        are, this parameter specifies the number of neighbors to use in the
        embedding. If n_neighbors is not specified for those embeddings, it is
        set to 5 and a warning is issued. If the manifold algorithm doesn't use
        nearest neighbors, then this parameter is ignored.

    features : list, default: None
        The names of the features specified by the columns of the input
        dataset. The length of this list must match the number of columns in
        X, otherwise an exception will be raised on ``fit()``.

    classes : list, default: None
        The class labels for each class in y, ordered by sorted class index.
        These names act as a label encoder for the legend, identifying integer
        classes or renaming string labels. If omitted, the class labels will be
        taken from the unique values in y.

        Note that the length of this list must match the number of unique
        values in y, otherwise an exception is raised. This parameter is only
        used in the discrete target type case and is ignored otherwise.

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color
        each instance according to its class in the discrete case or as an
        ordered colormap in the sequential case. If not enough colors per class
        are specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. In the discrete case
        it is used to compute the number of colors needed for each class and in
        the continuous case it is used to create a sequential color map based
        on the range of the target.

    target_type : str, default: "auto"
        Specify the type of target as either "discrete" (classes) or
        "continuous" (real numbers, usually for regression). If "auto", then it
        will attempt to determine the type by counting the number of unique
        values.

        If the target is discrete, the colors are returned as a dict with
        classes being the keys. If continuous, the colors will be a list with a
        color value for each point. In either case, if no target is specified,
        then color will be specified as the first color in the color cycle.

    projection : int or string, default: 2
        The number of axes to project into, either 2d or 3d. To plot 3d plots
        with matplotlib, please ensure a 3d axes is passed to the visualizer,
        otherwise one will be created using the current figure.

    alpha : float, default: 0.75
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered points more visible.

    random_state : int or RandomState, default: None
        Fixes the random state for stochastic manifold algorithms.

    colorbar : bool, default: True
        If the target_type is "continuous" draw a colorbar to the right of the
        scatter plot. The colorbar axes is accessible using the cax property.

    kwargs : dict
        Keyword arguments passed to the base class and may influence the
        feature visualization properties.

    Attributes
    ----------
    fit_time_ : yellowbrick.utils.timer.Timer
        The amount of time in seconds it took to fit the Manifold.

    classes_ : ndarray, shape (n_classes,)
        The class labels that define the discrete values in the target. Only
        available if the target type is discrete. This is guaranteed to be
        strings even if the classes are a different type.

    features_ : ndarray, shape (n_features,)
        The names of the features discovered or used in the visualizer that can
        be used as an index to access or modify data in X. If a user passes
        feature names in, those features are used. Otherwise the columns of a
        DataFrame are used or just simply the indices of the data array.

    range_ : (min y, max y)
        A tuple that describes the minimum and maximum values in the target.
        Only available if the target type is continuous.

    Examples
    --------
    >>> viz = Manifold(manifold='isomap', target_type='discrete')
    >>> viz.fit_transform(X, y)
    >>> viz.show()

    Notes
    -----
    Specifying the target as ``'continuous'`` or ``'discrete'`` will influence
    how the visualizer is finally displayed; don't rely on the automatic
    determination from the Manifold!

    Scaling your data with the standard scaler before applying it to the
    visualizer is a great way of increasing performance. Additionally using the
    ``SelectKBest`` transformer may also improve performance and lead to better
    visualizations.

    .. warning::
        Manifold visualizers have extremely varying time, resource, and
        complexity requirements. Sampling data or features may be necessary in
        order to finish a manifold computation.

    .. seealso::
        The Scikit-Learn discussion on `Manifold Learning
        <http://scikit-learn.org/stable/modules/manifold.html>`_.

    .. _`Locally Linear Embedding`:
        http://scikit-learn.org/stable/modules/manifold.html#locally-linear-embedding
    .. _`LTSA LLE`:
        http://scikit-learn.org/stable/modules/manifold.html#local-tangent-space-alignment
    .. _`Hessian LLE`:
        http://scikit-learn.org/stable/modules/manifold.html#hessian-eigenmapping
    .. _`Modified LLE`:
        http://scikit-learn.org/stable/modules/manifold.html#modified-locally-linear-embedding
    .. _`Isomap`:
        http://scikit-learn.org/stable/modules/manifold.html#isomap
    .. _`Multi-Dimensional Scaling`:
        http://scikit-learn.org/stable/modules/manifold.html#multi-dimensional-scaling-mds
    .. _`Spectral Embedding`:
        http://scikit-learn.org/stable/modules/manifold.html#spectral-embedding
    .. _`t-SNE`:
        http://scikit-learn.org/stable/modules/manifold.html#t-distributed-stochastic-neighbor-embedding-t-sne
    """

    ALGORITHMS = MANIFOLD_ALGORITHMS

    def __init__(
        self,
        ax=None,
        manifold="mds",
        n_neighbors=None,
        features=None,
        classes=None,
        colors=None,
        colormap=None,
        target_type="auto",
        projection=2,
        alpha=0.75,
        random_state=None,
        colorbar=True,
        **kwargs
    ):
        super(Manifold, self).__init__(
            ax,
            features,
            classes,
            colors,
            colormap,
            target_type,
            projection,
            alpha,
            colorbar,
            **kwargs
        )

        self._name = None
        self._manifold = None
        self.n_neighbors = n_neighbors
        self.random_state = random_state
        self.manifold = manifold  # must be set last

    @property
    def manifold(self):
        """
        Property containing the manifold transformer constructed from the
        supplied hyperparameter. Use this property to modify the manifold
        before fit with ``manifold.set_params()``.
        """
        return self._manifold

    @manifold.setter
    def manifold(self, transformer):
        """
        Creates the manifold estimator if a string value is passed in,
        validates other objects passed in.
        """
        if not is_estimator(transformer):
            if transformer not in self.ALGORITHMS:
                raise YellowbrickValueError(
                    "could not create manifold for '{}'".format(str(transformer))
                )

            # 2 components is required for 2D plots
            n_components = self.projection

            requires_default_neighbors = {
                "lle",
                "ltsa",
                "isomap",
                "hessian",
                "spectral",
                "modified",
            }

            # Check if the n_neighbors attribute needs to be set.
            if self.n_neighbors is None and transformer in requires_default_neighbors:
                if transformer == "hessian":
                    self.n_neighbors = int(
                        1 + (n_components * (1 + (n_components + 1) / 2))
                    )
                else:
                    self.n_neighbors = 5

                # Issue a warning that the n_neighbors was set to a default.
                warnmsg = (
                    "using n_neighbors={}; "
                    "please explicitly specify for the '{}' manifold"
                ).format(self.n_neighbors, str(transformer))
                warnings.warn(warnmsg, YellowbrickWarning)

            # Create a new transformer with the specified params
            self._name = MANIFOLD_NAMES[transformer]
            transformer = clone(self.ALGORITHMS[transformer])
            params = {
                "n_components": n_components,
                "n_neighbors": self.n_neighbors,
                "random_state": self.random_state,
            }

            for param in list(params.keys()):
                if param not in transformer.get_params():
                    del params[param]
            transformer.set_params(**params)

        self._manifold = transformer
        if self._name is None:
            self._name = self._manifold.__class__.__name__
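
    # Hedged usage sketch (shown as comments so it is not executed on import):
    # because the setter above builds the estimator eagerly, hyperparameters can
    # be tuned on the ``manifold`` property before calling fit. ``perplexity``
    # is a t-SNE parameter used here purely for illustration; X and y stand in
    # for any feature matrix and target vector.
    #
    #   viz = Manifold(manifold="tsne", target_type="discrete")
    #   viz.manifold.set_params(perplexity=15)
    #   viz.fit_transform(X, y)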
    def fit(self, X, y=None, **kwargs):
        """
        Fits the manifold on X and transforms the data to plot it on the axes.
        See fit_transform() for more details.

        Parameters
        ----------
        X : array-like of shape (n, m)
            A matrix or data frame with n instances and m features

        y : array-like of shape (n,), optional
            A vector or series with target values for each instance in X. This
            vector is used to determine the color of the points in X.

        Returns
        -------
        self : Manifold
            Returns the visualizer object.
        """
        if not hasattr(self.manifold, "transform"):
            name = self.manifold.__class__.__name__
            raise ModelError(
                (
                    "{} requires data to be simultaneously fit and transformed, "
                    "use fit_transform instead"
                ).format(name)
            )

        # Call super to compute features, classes, colors, etc.
        super(Manifold, self).fit(X, y)
        with Timer() as self.fit_time_:
            self.manifold.fit(X)
        return self
    def fit_transform(self, X, y=None, **kwargs):
        """
        Fits the manifold on X and transforms the data to plot it on the axes.
        The optional y specified can be used to declare discrete colors. If the
        target is set to 'auto', this method also determines the target type,
        and therefore what colors will be used.

        Note also that fit records the amount of time it takes to fit the
        manifold and reports that information in the visualization.

        Parameters
        ----------
        X : array-like of shape (n, m)
            A matrix or data frame with n instances and m features

        y : array-like of shape (n,), optional
            A vector or series with target values for each instance in X. This
            vector is used to determine the color of the points in X.

        Returns
        -------
        Xprime : array-like of shape (n, 2)
            Returns the 2-dimensional embedding of the instances.
        """
        # Because some manifolds do not have transform, we cannot call
        # individual fit and transform methods, but must do it manually here.

        # Call super fit to compute features, classes, colors, etc.
        super(Manifold, self).fit(X, y)
        with Timer() as self.fit_time_:
            Xp = self.manifold.fit_transform(X)
        self.draw(Xp, y)
        return Xp
    def transform(self, X, y=None, **kwargs):
        """
        Returns the transformed data points from the manifold embedding.

        Parameters
        ----------
        X : array-like of shape (n, m)
            A matrix or data frame with n instances and m features

        y : array-like of shape (n,), optional
            The target, used to specify the colors of the points.

        Returns
        -------
        Xprime : array-like of shape (n, 2)
            Returns the 2-dimensional embedding of the instances.

        Note
        ----
        This method does not work with MDS, TSNE and SpectralEmbedding because
        sklearn does not yet implement a ``transform`` method for them.
        """
        # Because some manifolds do not have transform we cannot call super
        try:
            Xp = self.manifold.transform(X)
            self.draw(Xp, y)
            return Xp
        except NotFittedError:
            raise NotFitted.from_estimator(self, "transform")
        except AttributeError:
            name = self.manifold.__class__.__name__
            raise ModelError(
                (
                    "{} requires data to be simultaneously fit and transformed, "
                    "use fit_transform instead"
                ).format(name)
            )
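
    # Illustrative sketch (comments only): ``transform`` works for manifolds
    # that implement it, such as Isomap or LLE; MDS, t-SNE, and
    # SpectralEmbedding raise a ModelError directing you to ``fit_transform``.
    # X_train/X_test here are hypothetical train and test matrices.
    #
    #   viz = Manifold(manifold="isomap")
    #   viz.fit(X_train, y_train)            # times the manifold fit
    #   Xp = viz.transform(X_test, y_test)   # embeds and draws new points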
    def draw(self, Xp, y=None):
        # Calls draw method from super class which draws the scatter plot.
        super(Manifold, self).draw(Xp, y)
        return self.ax
    def finalize(self):
        """
        Add title and modify axes to make the image ready for display.
        """
        self.set_title(
            "{} Manifold (fit in {:0.2f} seconds)".format(
                self._name, self.fit_time_.interval
            )
        )
        self.ax.set_xlabel("Using {} features".format(len(self.features_)))

        # Draws legend for discrete target and colorbar for continuous.
        super(Manifold, self).finalize()
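
# A short end-to-end sketch of the visualizer API defined above; the dataset
# loader is hypothetical and stands in for any (X, y) feature matrix and target.
#
#   from yellowbrick.features.manifold import Manifold
#
#   X, y = load_my_dataset()  # hypothetical loader returning arrays
#   viz = Manifold(manifold="isomap", n_neighbors=10, target_type="discrete")
#   viz.fit_transform(X, y)   # fits, embeds to 2D, and draws the scatter plot
#   viz.show()                # finalizes title and legend, then displays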
##########################################################################
## Quick Method
##########################################################################
def manifold_embedding(
    X,
    y=None,
    ax=None,
    manifold="mds",
    n_neighbors=None,
    features=None,
    classes=None,
    colors=None,
    colormap=None,
    target_type="auto",
    projection=2,
    alpha=0.75,
    random_state=None,
    colorbar=True,
    show=True,
    **kwargs
):
    """Quick method for Manifold visualizer.

    The Manifold visualizer provides high dimensional visualization for feature
    analysis by embedding data into 2 dimensions using the sklearn.manifold
    package for manifold learning. In brief, manifold learning algorithms are
    unsupervised approaches to non-linear dimensionality reduction (unlike PCA
    or SVD) that help visualize latent structures in data.

    .. seealso:: See Manifold for more details.

    Parameters
    ----------
    X : array-like of shape (n, m)
        A matrix or data frame with n instances and m features where m > 2.

    y : array-like of shape (n,), optional
        A vector or series with target values for each instance in X. This
        vector is used to determine the color of the points in X.

    ax : matplotlib.Axes, default: None
        The axis to plot the figure on. If None is passed in the current axes
        will be used (or generated if required).

    manifold : str or Transformer, default: "mds"
        Specify the manifold algorithm to perform the embedding. Either one of
        the strings listed in the table above, or an actual scikit-learn
        transformer. The constructed manifold is accessible with the manifold
        property, so as to modify hyperparameters before fit.

    n_neighbors : int, default: None
        Many manifold algorithms are nearest neighbors based; for those that
        are, this parameter specifies the number of neighbors to use in the
        embedding. If n_neighbors is not specified for those embeddings, it is
        set to 5 and a warning is issued. If the manifold algorithm doesn't use
        nearest neighbors, then this parameter is ignored.

    features : list, default: None
        The names of the features specified by the columns of the input
        dataset. The length of this list must match the number of columns in
        X, otherwise an exception will be raised on ``fit()``.

    classes : list, default: None
        The class labels for each class in y, ordered by sorted class index.
        These names act as a label encoder for the legend, identifying integer
        classes or renaming string labels. If omitted, the class labels will be
        taken from the unique values in y.

        Note that the length of this list must match the number of unique
        values in y, otherwise an exception is raised. This parameter is only
        used in the discrete target type case and is ignored otherwise.

    colors : list or tuple, default: None
        A single color to plot all instances as or a list of colors to color
        each instance according to its class in the discrete case or as an
        ordered colormap in the sequential case. If not enough colors per class
        are specified then the colors are treated as a cycle.

    colormap : string or cmap, default: None
        The colormap used to create the individual colors. In the discrete case
        it is used to compute the number of colors needed for each class and in
        the continuous case it is used to create a sequential color map based
        on the range of the target.

    target_type : str, default: "auto"
        Specify the type of target as either "discrete" (classes) or
        "continuous" (real numbers, usually for regression). If "auto", then it
        will attempt to determine the type by counting the number of unique
        values.

        If the target is discrete, the colors are returned as a dict with
        classes being the keys. If continuous, the colors will be a list with a
        color value for each point. In either case, if no target is specified,
        then color will be specified as the first color in the color cycle.

    projection : int or string, default: 2
        The number of axes to project into, either 2d or 3d. To plot 3d plots
        with matplotlib, please ensure a 3d axes is passed to the visualizer,
        otherwise one will be created using the current figure.

    alpha : float, default: 0.75
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered points more visible.

    random_state : int or RandomState, default: None
        Fixes the random state for stochastic manifold algorithms.

    colorbar : bool, default: True
        If the target_type is "continuous" draw a colorbar to the right of the
        scatter plot. The colorbar axes is accessible using the cax property.

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()``; however
        you cannot call ``plt.savefig`` from this signature, nor
        ``clear_figure``. If False, simply calls ``finalize()``.

    kwargs : dict
        Keyword arguments passed to the base class and may influence the
        feature visualization properties.

    Returns
    -------
    viz : Manifold
        Returns the fitted, finalized visualizer
    """
    # Instantiate the visualizer
    viz = Manifold(
        ax=ax,
        manifold=manifold,
        n_neighbors=n_neighbors,
        features=features,
        classes=classes,
        colors=colors,
        colormap=colormap,
        target_type=target_type,
        projection=projection,
        alpha=alpha,
        random_state=random_state,
        colorbar=colorbar,
        **kwargs
    )

    # Fit and transform the visualizer (calls draw)
    viz.fit_transform(X, y)

    if show:
        viz.show()
    else:
        viz.finalize()

    # Return the visualizer object
    return viz
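
# Hedged example of the quick method above; ``load_credit`` is one of
# Yellowbrick's bundled dataset loaders and is used here only for illustration
# of the (X, y) inputs.
#
#   from yellowbrick.datasets import load_credit
#   from yellowbrick.features.manifold import manifold_embedding
#
#   X, y = load_credit()
#   viz = manifold_embedding(X, y, manifold="tsne", target_type="discrete")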