# Source code for yellowbrick.model_selection.dropping_curve

# yellowbrick.model_selection.dropping_curve
# Implements a feature dropping curve visualization for model selection.
#
# Author:   Charles Guan
# Created:  Wed Dec 8 15:03:00 2021 -0800

"""
Implements a random-input-dropout curve visualization for model selection.
Another common name: neuron dropping curve (NDC), in neural decoding research
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from yellowbrick.base import ModelVisualizer
from yellowbrick.style import resolve_colors
from yellowbrick.exceptions import YellowbrickValueError

from sklearn.model_selection import validation_curve as sk_validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest


# Default ticks for the model selection curve, relative number of features
DEFAULT_FEATURE_SIZES = np.linspace(0.1, 1.0, 5)


##########################################################################
# DroppingCurve visualizer
##########################################################################


class DroppingCurve(ModelVisualizer):
    """
    Selects random subsets of features and estimates the training and
    cross-validation performance. Subset sizes are swept to visualize a
    feature-dropping curve.

    The visualization plots the score relative to each subset and shows the
    number of (randomly selected) features needed to achieve a score. The
    curve is often shaped like log(1+x). For example, see:
    https://www.frontiersin.org/articles/10.3389/fnsys.2014.00102/full

    Parameters
    ----------
    estimator : a scikit-learn estimator
        An object that implements ``fit`` and ``predict``, can be a
        classifier, regressor, or clusterer so long as there is also a valid
        associated scoring metric. Note that the object is cloned for each
        validation.

    ax : matplotlib.Axes object, optional
        The axes object to plot the figure on.

    feature_sizes : array-like, shape (n_values,)
        default: ``np.linspace(0.1, 1.0, 5)``
        Relative or absolute numbers of input features that will be used to
        generate the learning curve. If the dtype is float, it is regarded
        as a fraction of the maximum number of features, otherwise it is
        interpreted as absolute numbers of features.

    groups : array-like, with shape (n_samples,)
        Optional group labels for the samples used while splitting the
        dataset into train/test sets.

    logx : boolean, optional
        If True, plots the x-axis with a logarithmic scale.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy. Possible inputs
        for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        See the scikit-learn
        `cross-validation guide <https://bit.ly/2MMQAI7>`_ for more
        information on the possible strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string or scorer callable object / function with signature
        ``scorer(estimator, X, y)``. See scikit-learn model evaluation
        documentation for names of possible metrics.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can be
        an expression like '2*n_jobs'.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`. Used to generate feature
        subsets.

    kwargs : dict
        Keyword arguments that are passed to the base class and may
        influence the visualization as defined in other Visualizers.

    Attributes
    ----------
    feature_sizes_ : array, shape = (n_unique_ticks,), dtype int
        Numbers of features that have been used to generate the dropping
        curve. Note that the number of ticks might be less than n_ticks
        because duplicate entries will be removed.

    train_scores_ : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    train_scores_mean_ : array, shape (n_ticks,)
        Mean training data scores for each training split.

    train_scores_std_ : array, shape (n_ticks,)
        Standard deviation of training data scores for each training split.

    valid_scores_ : array, shape (n_ticks, n_cv_folds)
        Scores on validation set.

    valid_scores_mean_ : array, shape (n_ticks,)
        Mean scores for each validation split.

    valid_scores_std_ : array, shape (n_ticks,)
        Standard deviation of scores for each validation split.

    Examples
    --------
    >>> from yellowbrick.model_selection import DroppingCurve
    >>> from sklearn.naive_bayes import GaussianNB
    >>> model = DroppingCurve(GaussianNB())
    >>> model.fit(X, y)
    >>> model.show()

    Notes
    -----
    This visualizer is based on sklearn.model_selection.validation_curve
    """

    def __init__(
        self,
        estimator,
        ax=None,
        feature_sizes=DEFAULT_FEATURE_SIZES,
        groups=None,
        logx=False,
        cv=None,
        scoring=None,
        n_jobs=None,
        pre_dispatch="all",
        random_state=None,
        **kwargs
    ):
        # Let the base visualizer take ownership of the estimator and axes
        super().__init__(estimator, ax=ax, **kwargs)

        # Coerce the feature sizes and reject anything that is not a 1-D sweep
        sizes = np.asarray(feature_sizes)
        if sizes.ndim != 1:
            raise YellowbrickValueError(
                "must specify 1-D array of feature sizes, '{}' is not valid".format(
                    repr(sizes)
                )
            )

        # Store the curve parameters for use during fit()
        self.feature_sizes = sizes
        self.groups = groups
        self.logx = logx
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch
        self.random_state = random_state
[docs] def fit(self, X, y=None): """ Fits the feature dropping curve with the wrapped model to the specified data. Draws training and cross-validation score curves and saves the scores to the estimator. Parameters ---------- X : array-like, shape (n_samples, n_features) Input vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples) or (n_samples, n_features), optional Target relative to X for classification or regression; None for unsupervised learning. """ # Get feature_sizes in whole numbers n_features = X.shape[-1] if np.issubdtype(self.feature_sizes.dtype, np.integer): if (self.feature_sizes <= 0).all() or (self.feature_sizes >= n_features).all(): raise YellowbrickValueError('Expected feature sizes in [0, n_features]') self.feature_sizes_ = self.feature_sizes else: if (self.feature_sizes <= 0.0).all() or (self.feature_sizes >= 1.0).all(): raise YellowbrickValueError('Expected feature ratio in [0,1]') self.feature_sizes_ = np.ceil(n_features * self.feature_sizes).astype(int) # The easiest way to prepend a random-dropout layer is to use # SelectKBest with a random scoring function. 
feature_dropping_pipeline = make_pipeline( SelectKBest( score_func=lambda X,y: np.random.default_rng(self.random_state).standard_normal(size=X.shape[-1]) ), self.estimator, ) # arguments to pass to sk_validation_curve skvc_kwargs = { key: self.get_params()[key] for key in ( "groups", "cv", "scoring", "n_jobs", "pre_dispatch", ) } self.train_scores_, self.valid_scores_ = sk_validation_curve( feature_dropping_pipeline, X, y, param_name="selectkbest__k", param_range=self.feature_sizes_, **skvc_kwargs ) # compute the mean and standard deviation of the training data self.train_scores_mean_ = np.mean(self.train_scores_, axis=1) self.train_scores_std_ = np.std(self.train_scores_, axis=1) # compute the mean and standard deviation of the validation data self.valid_scores_mean_ = np.mean(self.valid_scores_, axis=1) self.valid_scores_std_ = np.std(self.valid_scores_, axis=1) # draw the curves on the current axes self.draw() return self
[docs] def draw(self, **kwargs): """ Renders the training and validation learning curves. """ # Specify the curves to draw and their labels labels = ("Training Score", "Cross Validation Score") curves = ( (self.train_scores_mean_, self.train_scores_std_), (self.valid_scores_mean_, self.valid_scores_std_), ) # Get the colors for the train and test curves colors = resolve_colors(n_colors=2) # Plot the fill betweens first so they are behind the curves. for idx, (mean, std) in enumerate(curves): # Plot one standard deviation above and below the mean self.ax.fill_between( self.feature_sizes_, mean - std, mean + std, alpha=0.25, color=colors[idx] ) # Plot the mean curves so they are in front of the variance fill for idx, (mean, _) in enumerate(curves): self.ax.plot( self.feature_sizes_, mean, "o-", color=colors[idx], label=labels[idx] ) if self.logx: self.ax.set_xscale("log") return self.ax
[docs] def finalize(self, **kwargs): """ Add the title, legend, and other visual final touches to the plot. """ # Set the title of the figure self.set_title("Random-feature dropping curve for {}".format(self.name)) # Add the legend self.ax.legend(frameon=True, loc="best") # Set the axis labels self.ax.set_xlabel("number of features") self.ax.set_ylabel("score")
##########################################################################
# Quick Method
##########################################################################
def dropping_curve(
    estimator,
    X,
    y,
    feature_sizes=DEFAULT_FEATURE_SIZES,
    groups=None,
    ax=None,
    logx=False,
    cv=None,
    scoring=None,
    n_jobs=None,
    pre_dispatch='all',
    random_state=None,
    show=True,
    **kwargs
) -> DroppingCurve:
    """
    Displays a random-feature dropping curve, comparing feature size to
    training and cross validation scores. The dropping curve aims to show
    how a model improves with more information.

    This helper function wraps the DroppingCurve class for one-off analysis.

    Parameters
    ----------
    estimator : a scikit-learn estimator
        An object that implements ``fit`` and ``predict``, can be a
        classifier, regressor, or clusterer so long as there is also a valid
        associated scoring metric. Note that the object is cloned for each
        validation.

    X : array-like, shape (n_samples, n_features)
        Input vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    feature_sizes : array-like, shape (n_values,)
        default: ``np.linspace(0.1, 1.0, 5)``
        Relative or absolute numbers of input features that will be used to
        generate the learning curve. If the dtype is float, it is regarded
        as a fraction of the maximum number of features, otherwise it is
        interpreted as absolute numbers of features.

    groups : array-like, with shape (n_samples,)
        Optional group labels for the samples used while splitting the
        dataset into train/test sets.

    ax : matplotlib.Axes object, optional
        The axes object to plot the figure on.

    logx : boolean, optional
        If True, plots the x-axis with a logarithmic scale.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy. Possible inputs
        for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        See the scikit-learn
        `cross-validation guide <https://bit.ly/2MMQAI7>`_ for more
        information on the possible strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string or scorer callable object / function with signature
        ``scorer(estimator, X, y)``. See scikit-learn model evaluation
        documentation for names of possible metrics.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can be
        an expression like '2*n_jobs'.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`. Used to generate feature
        subsets.

    show : boolean, default: True
        If True, calls ``show()`` on the visualizer after fitting;
        otherwise only ``finalize()`` is called so the caller can display
        the figure later.

    kwargs : dict
        Keyword arguments that are passed to the base class and may
        influence the visualization as defined in other Visualizers.

    Returns
    -------
    dc : DroppingCurve
        Returns the fitted visualizer.
    """
    # Construct the visualizer, forwarding all curve and display options
    visualizer = DroppingCurve(
        estimator,
        feature_sizes=feature_sizes,
        groups=groups,
        ax=ax,
        logx=logx,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        pre_dispatch=pre_dispatch,
        random_state=random_state,
        **kwargs
    )

    # Fit the curve, then either render it now or leave it finalized for
    # the caller to show later.
    visualizer.fit(X, y)
    if show:
        visualizer.show()
    else:
        visualizer.finalize()

    return visualizer