# Source code for yellowbrick.model_selection.dropping_curve

# yellowbrick.model_selection.dropping_curve
# Implements a feature dropping curve visualization for model selection.
#
# Author:   Charles Guan
# Created:  Wed Dec 8 15:03:00 2021 -0800

"""
Implements a random-input-dropout curve visualization for model selection.
Another common name: neuron dropping curve (NDC), in neural decoding research
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from yellowbrick.base import ModelVisualizer
from yellowbrick.style import resolve_colors
from yellowbrick.exceptions import YellowbrickValueError

from sklearn.model_selection import validation_curve as sk_validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest


# Default ticks for the model selection curve, relative number of features
DEFAULT_FEATURE_SIZES = np.linspace(0.1, 1.0, 5)


##########################################################################
# DroppingCurve visualizer
##########################################################################


class DroppingCurve(ModelVisualizer):
    """
    Selects random subsets of features and estimates the training and
    cross-validation performance. Subset sizes are swept to visualize a
    feature-dropping curve.

    The visualization plots the score relative to each subset and shows the
    number of (randomly selected) features needed to achieve a score. The
    curve is often shaped like log(1+x). For example, see:
    https://www.frontiersin.org/articles/10.3389/fnsys.2014.00102/full

    Parameters
    ----------
    estimator : a scikit-learn estimator
        An object that implements ``fit`` and ``predict``, can be a
        classifier, regressor, or clusterer so long as there is also a valid
        associated scoring metric. Note that the object is cloned for each
        validation.

    ax : matplotlib.Axes object, optional
        The axes object to plot the figure on.

    feature_sizes : array-like, shape (n_values,)
        default: ``np.linspace(0.1, 1.0, 5)``
        Relative or absolute numbers of input features that will be used to
        generate the learning curve. If the dtype is float, it is regarded
        as a fraction of the maximum number of features, otherwise it is
        interpreted as absolute numbers of features.

    groups : array-like, with shape (n_samples,)
        Optional group labels for the samples used while splitting the
        dataset into train/test sets.

    logx : boolean, optional
        If True, plots the x-axis with a logarithmic scale.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy. Possible inputs
        for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        See the scikit-learn
        `cross-validation guide <https://bit.ly/2MMQAI7>`_ for more
        information on the possible strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string or scorer callable object / function with signature
        ``scorer(estimator, X, y)``. See scikit-learn model evaluation
        documentation for names of possible metrics.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can be
        an expression like '2*n_jobs'.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`. Used to generate feature
        subsets.

    kwargs : dict
        Keyword arguments that are passed to the base class and may
        influence the visualization as defined in other Visualizers.

    Attributes
    ----------
    feature_sizes_ : array, shape = (n_unique_ticks,), dtype int
        Numbers of features that have been used to generate the dropping
        curve. Note that the number of ticks might be less than n_ticks
        because duplicate entries will be removed.

    train_scores_ : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    train_scores_mean_ : array, shape (n_ticks,)
        Mean training data scores for each training split.

    train_scores_std_ : array, shape (n_ticks,)
        Standard deviation of training data scores for each training split.

    valid_scores_ : array, shape (n_ticks, n_cv_folds)
        Scores on validation set.

    valid_scores_mean_ : array, shape (n_ticks,)
        Mean scores for each validation split.

    valid_scores_std_ : array, shape (n_ticks,)
        Standard deviation of scores for each validation split.

    Examples
    --------
    >>> from yellowbrick.model_selection import DroppingCurve
    >>> from sklearn.naive_bayes import GaussianNB
    >>> model = DroppingCurve(GaussianNB())
    >>> model.fit(X, y)
    >>> model.show()

    Notes
    -----
    This visualizer is based on sklearn.model_selection.validation_curve
    """

    def __init__(
        self,
        estimator,
        ax=None,
        feature_sizes=DEFAULT_FEATURE_SIZES,
        groups=None,
        logx=False,
        cv=None,
        scoring=None,
        n_jobs=None,
        pre_dispatch="all",
        random_state=None,
        **kwargs
    ):
        # Let the base visualizer take ownership of the estimator and axes
        super().__init__(estimator, ax=ax, **kwargs)

        # Coerce the feature sizes and reject anything that is not a 1-D sweep
        sizes = np.asarray(feature_sizes)
        if sizes.ndim != 1:
            raise YellowbrickValueError(
                "must specify 1-D array of feature sizes, '{}' is not valid".format(
                    repr(sizes)
                )
            )

        # Store the curve parameters for use during fit()
        self.feature_sizes = sizes
        self.groups = groups
        self.logx = logx
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch
        self.random_state = random_state
[docs] def fit(self, X, y=None): """ Fits the feature dropping curve with the wrapped model to the specified data. Draws training and cross-validation score curves and saves the scores to the estimator. Parameters ---------- X : array-like, shape (n_samples, n_features) Input vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples) or (n_samples, n_features), optional Target relative to X for classification or regression; None for unsupervised learning. """ # Get feature_sizes in whole numbers n_features = X.shape[-1] if np.issubdtype(self.feature_sizes.dtype, np.integer): if (self.feature_sizes <= 0).all() or (self.feature_sizes >= n_features).all(): raise YellowbrickValueError('Expected feature sizes in [0, n_features]') self.feature_sizes_ = self.feature_sizes else: if (self.feature_sizes <= 0.0).all() or (self.feature_sizes >= 1.0).all(): raise YellowbrickValueError('Expected feature ratio in [0,1]') self.feature_sizes_ = np.ceil(n_features * self.feature_sizes).astype(int) # The easiest way to prepend a random-dropout layer is to use # SelectKBest with a random scoring function. 
feature_dropping_pipeline = make_pipeline( SelectKBest( score_func=lambda X,y: np.random.default_rng(self.random_state).standard_normal(size=X.shape[-1]) ), self.estimator, ) # arguments to pass to sk_validation_curve skvc_kwargs = { key: self.get_params()[key] for key in ( "groups", "cv", "scoring", "n_jobs", "pre_dispatch", ) } self.train_scores_, self.valid_scores_ = sk_validation_curve( feature_dropping_pipeline, X, y, param_name="selectkbest__k", param_range=self.feature_sizes_, **skvc_kwargs ) # compute the mean and standard deviation of the training data self.train_scores_mean_ = np.mean(self.train_scores_, axis=1) self.train_scores_std_ = np.std(self.train_scores_, axis=1) # compute the mean and standard deviation of the validation data self.valid_scores_mean_ = np.mean(self.valid_scores_, axis=1) self.valid_scores_std_ = np.std(self.valid_scores_, axis=1) # draw the curves on the current axes self.draw() return self
[docs] def draw(self, **kwargs): """ Renders the training and validation learning curves. """ # Specify the curves to draw and their labels labels = ("Training Score", "Cross Validation Score") curves = ( (self.train_scores_mean_, self.train_scores_std_), (self.valid_scores_mean_, self.valid_scores_std_), ) # Get the colors for the train and test curves colors = resolve_colors(n_colors=2) # Plot the fill betweens first so they are behind the curves. for idx, (mean, std) in enumerate(curves): # Plot one standard deviation above and below the mean self.ax.fill_between( self.feature_sizes_, mean - std, mean + std, alpha=0.25, color=colors[idx] ) # Plot the mean curves so they are in front of the variance fill for idx, (mean, _) in enumerate(curves): self.ax.plot( self.feature_sizes_, mean, "o-", color=colors[idx], label=labels[idx] ) if self.logx: self.ax.set_xscale("log") return self.ax
[docs] def finalize(self, **kwargs): """ Add the title, legend, and other visual final touches to the plot. """ # Set the title of the figure self.set_title("Random-feature dropping curve for {}".format(self.name)) # Add the legend self.ax.legend(frameon=True, loc="best") # Set the axis labels self.ax.set_xlabel("number of features") self.ax.set_ylabel("score")
##########################################################################
# Quick Method
##########################################################################
def dropping_curve(
    estimator,
    X,
    y,
    feature_sizes=DEFAULT_FEATURE_SIZES,
    groups=None,
    ax=None,
    logx=False,
    cv=None,
    scoring=None,
    n_jobs=None,
    pre_dispatch='all',
    random_state=None,
    show=True,
    **kwargs
) -> DroppingCurve:
    """
    Displays a random-feature dropping curve, comparing feature size to
    training and cross validation scores. The dropping curve aims to show
    how a model improves with more information.

    This helper function wraps the DroppingCurve class for one-off analysis.

    Parameters
    ----------
    estimator : a scikit-learn estimator
        An object that implements ``fit`` and ``predict``, can be a
        classifier, regressor, or clusterer so long as there is also a valid
        associated scoring metric. Note that the object is cloned for each
        validation.

    X : array-like, shape (n_samples, n_features)
        Input vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    feature_sizes : array-like, shape (n_values,)
        default: ``np.linspace(0.1, 1.0, 5)``
        Relative or absolute numbers of input features that will be used to
        generate the learning curve. If the dtype is float, it is regarded
        as a fraction of the maximum number of features, otherwise it is
        interpreted as absolute numbers of features.

    groups : array-like, with shape (n_samples,)
        Optional group labels for the samples used while splitting the
        dataset into train/test sets.

    ax : matplotlib.Axes object, optional
        The axes object to plot the figure on.

    logx : boolean, optional
        If True, plots the x-axis with a logarithmic scale.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy. Possible inputs
        for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        See the scikit-learn
        `cross-validation guide <https://bit.ly/2MMQAI7>`_ for more
        information on the possible strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string or scorer callable object / function with signature
        ``scorer(estimator, X, y)``. See scikit-learn model evaluation
        documentation for names of possible metrics.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can be
        an expression like '2*n_jobs'.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`. Used to generate feature
        subsets.

    show : boolean, default: True
        If True, calls ``show()`` on the visualizer after fitting;
        otherwise only ``finalize()`` is called so the caller can display
        the figure later.

    kwargs : dict
        Keyword arguments that are passed to the base class and may
        influence the visualization as defined in other Visualizers.

    Returns
    -------
    dc : DroppingCurve
        Returns the fitted visualizer.
    """
    # Construct the visualizer, forwarding all curve and display options
    visualizer = DroppingCurve(
        estimator,
        feature_sizes=feature_sizes,
        groups=groups,
        ax=ax,
        logx=logx,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        pre_dispatch=pre_dispatch,
        random_state=random_state,
        **kwargs
    )

    # Fit the curve, then either render it now or leave it finalized for
    # the caller to show later.
    visualizer.fit(X, y)
    if show:
        visualizer.show()
    else:
        visualizer.finalize()

    return visualizer