Source code for yellowbrick.model_selection.importances

# yellowbrick.model_selection.importances
# Feature importance visualizer
#
# Author:  Benjamin Bengfort
# Author:  Rebecca Bilbro
# Created: Fri Mar 02 15:21:36 2018 -0500
#
# Copyright (C) 2018 The scikit-yb developers
# For license information, see LICENSE.txt
#
# ID: importances.py [] benjamin@bengfort.com $

"""
Implementation of a feature importances visualizer. This visualizer sits in
kind of a weird place since it is technically a model scoring visualizer, but
is generally used for feature engineering.
"""

##########################################################################
## Imports
##########################################################################

import warnings
import numpy as np

from yellowbrick.draw import bar_stack
from yellowbrick.base import ModelVisualizer
from yellowbrick.style.colors import resolve_colors
from yellowbrick.utils import is_dataframe, is_classifier
from yellowbrick.exceptions import YellowbrickTypeError, NotFitted
from yellowbrick.exceptions import YellowbrickWarning, YellowbrickValueError

##########################################################################
## Feature Visualizer
##########################################################################


class FeatureImportances(ModelVisualizer):
    """
    Displays the most informative features in a model by showing a bar chart
    of features ranked by their importances. Although primarily a feature
    engineering mechanism, this visualizer requires a model that has either a
    ``coef_`` or ``feature_importances_`` parameter after fit.

    Note: Some classification models, such as ``LogisticRegression``, return
    ``coef_`` as a multidimensional array of shape ``(n_classes, n_features)``.
    In this case, the ``FeatureImportances`` visualizer computes the mean of
    the ``coef_`` values by class for each feature.

    Parameters
    ----------
    estimator : Estimator
        A Scikit-Learn estimator that learns feature importances. Must support
        either ``coef_`` or ``feature_importances_`` parameters. If the
        estimator is not fitted, it is fit when the visualizer is fitted,
        unless otherwise specified by ``is_fitted``.

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in, the current axes
        will be used (or generated if required).

    labels : list, default: None
        A list of feature names to use. If a DataFrame is passed to fit and
        labels is None, feature names are selected as the column names.

    relative : bool, default: True
        If true, the features are described by their relative importance as a
        percentage of the strongest feature component; otherwise the raw
        numeric description of the feature importance is shown.

    absolute : bool, default: False
        Make all coefficients absolute to more easily compare negative
        coefficients with positive ones.

    xlabel : str, default: None
        The label for the X-axis. If None, the label is automatically
        determined by the underlying model and the options provided.

    stack : bool, default: False
        If true and the classifier returns multi-class feature importance,
        then a stacked bar plot is plotted; otherwise the mean of the feature
        importance across classes is plotted.

    colors : list of strings
        Specify colors for each bar in the chart if ``stack==False``.

    colormap : string or matplotlib cmap
        Specify a colormap to color the classes if ``stack==True``.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    topn : int, default=None
        Display only the top N results with a positive integer, or the bottom
        N results with a negative integer. If None or 0, all results are
        shown.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    features_ : np.array
        The feature labels ranked according to their importance

    feature_importances_ : np.array
        The numeric value of the feature importance computed by the model

    classes_ : np.array
        The class labels; this attribute is not None only for classifiers.

    Examples
    --------
    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> visualizer = FeatureImportances(GradientBoostingClassifier())
    >>> visualizer.fit(X, y)
    >>> visualizer.show()
    """

    def __init__(
        self,
        estimator,
        ax=None,
        labels=None,
        relative=True,
        absolute=False,
        xlabel=None,
        stack=False,
        colors=None,
        colormap=None,
        is_fitted="auto",
        topn=None,
        **kwargs
    ):
        # Initialize the visualizer bases
        super(FeatureImportances, self).__init__(
            estimator, ax=ax, is_fitted=is_fitted, **kwargs
        )

        # Data Parameters
        self.labels = labels
        self.relative = relative
        self.absolute = absolute
        self.xlabel = xlabel
        self.stack = stack
        self.colors = colors
        self.colormap = colormap
        self.topn = topn
    def fit(self, X, y=None, **kwargs):
        """
        Fits the estimator to discover the feature importances described by
        the data, then draws those importances as a bar plot.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        kwargs : dict
            Keyword arguments passed to the fit method of the estimator.

        Returns
        -------
        self : visualizer
            The fit method must always return self to support pipelines.
        """
        # Super call fits the underlying estimator if it's not already fitted
        super(FeatureImportances, self).fit(X, y, **kwargs)

        # Get the feature importances from the model
        self.feature_importances_ = self._find_importances_param()

        # Get the classes from the model
        if is_classifier(self):
            self.classes_ = self._find_classes_param()
        else:
            self.classes_ = None
            self.stack = False

        # If stack=False and the feature importances are a multidimensional
        # array, we expect a shape of (n_classes, n_features); flatten it by
        # taking the mean of each column to get shape (n_features,)
        # (see LogisticRegression)
        if not self.stack and self.feature_importances_.ndim > 1:
            self.feature_importances_ = np.mean(self.feature_importances_, axis=0)
            warnings.warn(
                (
                    "detected multi-dimensional feature importances but stack=False, "
                    "using mean to aggregate them."
                ),
                YellowbrickWarning,
            )

        # Apply absolute value filter before normalization
        if self.absolute:
            self.feature_importances_ = np.abs(self.feature_importances_)

        # Normalize features relative to the maximum
        if self.relative:
            maxv = np.abs(self.feature_importances_).max()
            self.feature_importances_ /= maxv
            self.feature_importances_ *= 100.0

        # Create labels for the feature importances
        # NOTE: this code is duplicated from MultiFeatureVisualizer
        if self.labels is None:
            # Use column names if a dataframe
            if is_dataframe(X):
                self.features_ = np.array(X.columns)
            # Otherwise use the column index as the labels
            else:
                _, ncols = X.shape
                self.features_ = np.arange(0, ncols)
        else:
            self.features_ = np.array(self.labels)

        if self.topn and self.topn > self.features_.shape[0]:
            raise YellowbrickValueError(
                "topn '{}' cannot be greater than the number of "
                "features '{}'".format(self.topn, self.features_.shape[0])
            )

        # Sort the features and their importances
        if self.stack:
            if len(self.classes_) != self.feature_importances_.shape[0]:
                raise YellowbrickValueError(
                    "The model used does not return a coef_ array in the "
                    "shape of (n_classes, n_features); unable to generate "
                    "stacked feature importances. Consider setting the stack "
                    "parameter to False or using a different model."
                )

            if self.topn:
                abs_sort_idx = np.argsort(
                    np.sum(np.absolute(self.feature_importances_), 0)
                )
                sort_idx = self._reduce_topn(abs_sort_idx)
            else:
                sort_idx = np.argsort(np.mean(self.feature_importances_, 0))

            self.features_ = self.features_[sort_idx]
            self.feature_importances_ = self.feature_importances_[:, sort_idx]
        else:
            if self.topn:
                abs_sort_idx = np.argsort(np.absolute(self.feature_importances_))
                abs_sort_idx = self._reduce_topn(abs_sort_idx)

                self.features_ = self.features_[abs_sort_idx]
                self.feature_importances_ = self.feature_importances_[abs_sort_idx]

            # Sort features by value (sorting a second time if topn)
            sort_idx = np.argsort(self.feature_importances_)
            self.features_ = self.features_[sort_idx]
            self.feature_importances_ = self.feature_importances_[sort_idx]

        # Draw the feature importances
        self.draw()
        return self
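
    # Worked example of the aggregation above (an illustrative note, not
    # upstream code): a 3-class model with 2 features might expose
    #
    #     coef_ = [[ 0.5, -1.0],
    #              [ 0.3,  0.2],
    #              [-0.2,  0.8]]
    #
    # With stack=False, the column-wise mean yields [0.2, 0.0]; with
    # relative=True this is then divided by the maximum absolute value (0.2)
    # and multiplied by 100, giving [100.0, 0.0]. With stack=True the (3, 2)
    # array is kept intact and drawn as stacked bars, one segment per class.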
    def draw(self, **kwargs):
        """
        Draws the feature importances as a bar chart; called from fit.
        """
        # Quick validation
        for param in ("feature_importances_", "features_"):
            if not hasattr(self, param):
                raise NotFitted("missing required param '{}'".format(param))

        # Find the positions for each bar
        pos = np.arange(self.features_.shape[0]) + 0.5

        # Plot the bar chart
        if self.stack:
            colors = resolve_colors(len(self.classes_), colormap=self.colormap)
            legend_kws = {"bbox_to_anchor": (1.04, 0.5), "loc": "center left"}
            bar_stack(
                self.feature_importances_,
                ax=self.ax,
                labels=list(self.classes_),
                ticks=self.features_,
                orientation="h",
                colors=colors,
                legend_kws=legend_kws,
            )
        else:
            colors = resolve_colors(
                len(self.features_), colormap=self.colormap, colors=self.colors
            )
            self.ax.barh(pos, self.feature_importances_, color=colors, align="center")

            # Set the labels for the bars
            self.ax.set_yticks(pos)
            self.ax.set_yticklabels(self.features_)

        return self.ax
    def finalize(self, **kwargs):
        """
        Finalize the drawing by setting labels and the title.
        """
        # Set the title
        self.set_title(
            "Feature Importances of {} Features using {}".format(
                self._get_topn_title(), self.name
            )
        )

        # Set the xlabel
        self.ax.set_xlabel(self._get_xlabel())

        # Remove the ygrid
        self.ax.grid(False, axis="y")

        # Ensure we have a tight fit
        self.fig.tight_layout()
    def _find_classes_param(self):
        """
        Searches the wrapped model for the classes_ parameter.
        """
        for attr in ["classes_"]:
            try:
                return getattr(self.estimator, attr)
            except AttributeError:
                continue

        raise YellowbrickTypeError(
            "could not find classes_ param on {}".format(
                self.estimator.__class__.__name__
            )
        )

    def _find_importances_param(self):
        """
        Searches the wrapped model for the feature importances parameter.
        """
        for attr in ("feature_importances_", "coef_"):
            try:
                return getattr(self.estimator, attr)
            except AttributeError:
                continue

        raise YellowbrickTypeError(
            "could not find feature importances param on {}".format(
                self.estimator.__class__.__name__
            )
        )

    def _get_xlabel(self):
        """
        Determines the xlabel based on the underlying data structure.
        """
        # Return user-specified label
        if self.xlabel:
            return self.xlabel

        # Label for coefficients
        if hasattr(self.estimator, "coef_"):
            if self.relative:
                return "relative coefficient magnitude"
            return "coefficient value"

        # Default label for feature_importances_
        if self.relative:
            return "relative importance"
        return "feature importance"

    def _is_fitted(self):
        """
        Returns true if the visualizer has been fit.
        """
        return hasattr(self, "feature_importances_") and hasattr(self, "features_")

    def _reduce_topn(self, arr):
        """
        Return only the top or bottom N items within a sliceable array/list.
        Assumes that arr is in ascending order.
        """
        if self.topn > 0:
            arr = arr[-self.topn:]
        elif self.topn < 0:
            arr = arr[:-self.topn]
        return arr

    def _get_topn_title(self):
        """
        Return an appropriate title for the plot: Top N, Bottom N, or N.
        """
        if self.topn:
            if self.topn > 0:
                return "Top {}".format(len(self.features_))
            return "Bottom {}".format(len(self.features_))
        return str(len(self.features_))
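
# A minimal usage sketch added for illustration (not part of the upstream
# module). It exercises the stacked mode described in the class docstring:
# LogisticRegression on the iris dataset exposes ``coef_`` with shape
# (n_classes, n_features), so stack=True draws one bar segment per class.
# The dataset and estimator here are assumptions chosen for the example.
def _demo_stacked_importances():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    X, y = load_iris(return_X_y=True)  # 3 classes, 4 features

    # coef_ will have shape (3, 4); with stack=True the per-class values
    # are kept intact and rendered as stacked horizontal bars
    viz = FeatureImportances(LogisticRegression(max_iter=1000), stack=True)
    viz.fit(X, y)
    return viz.show()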
##########################################################################
## Quick Method
##########################################################################
def feature_importances(
    estimator,
    X,
    y=None,
    ax=None,
    labels=None,
    relative=True,
    absolute=False,
    xlabel=None,
    stack=False,
    colors=None,
    colormap=None,
    is_fitted="auto",
    topn=None,
    show=True,
    **kwargs
):
    """Quick Method:
    Displays the most informative features in a model by showing a bar chart
    of features ranked by their importances. Although primarily a feature
    engineering mechanism, this visualizer requires a model that has either a
    ``coef_`` or ``feature_importances_`` parameter after fit.

    Parameters
    ----------
    estimator : Estimator
        A Scikit-Learn estimator that learns feature importances. Must support
        either ``coef_`` or ``feature_importances_`` parameters. If the
        estimator is not fitted, it is fit when the visualizer is fitted,
        unless otherwise specified by ``is_fitted``.

    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features

    y : ndarray or Series of length n, optional
        An array or series of target or class values

    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in, the current axes
        will be used (or generated if required).

    labels : list, default: None
        A list of feature names to use. If a DataFrame is passed to fit and
        labels is None, feature names are selected as the column names.

    relative : bool, default: True
        If true, the features are described by their relative importance as a
        percentage of the strongest feature component; otherwise the raw
        numeric description of the feature importance is shown.

    absolute : bool, default: False
        Make all coefficients absolute to more easily compare negative
        coefficients with positive ones.

    xlabel : str, default: None
        The label for the X-axis. If None, the label is automatically
        determined by the underlying model and the options provided.

    stack : bool, default: False
        If true and the classifier returns multi-class feature importance,
        then a stacked bar plot is plotted; otherwise the mean of the feature
        importance across classes is plotted.

    colors : list of strings
        Specify colors for each bar in the chart if ``stack==False``.

    colormap : string or matplotlib cmap
        Specify a colormap to color the classes if ``stack==True``.

    is_fitted : bool or str, default='auto'
        Specify if the wrapped estimator is already fitted. If False, the
        estimator will be fit when the visualizer is fit, otherwise, the
        estimator will not be modified. If 'auto' (default), a helper method
        will check if the estimator is fitted before fitting it again.

    show : bool, default: True
        If True, calls ``show()``, which in turn calls ``plt.show()``;
        however, you cannot call ``plt.savefig`` from this signature, nor
        ``clear_figure``. If False, simply calls ``finalize()``.

    topn : int, default=None
        Display only the top N results with a positive integer, or the bottom
        N results with a negative integer. If None or 0, all results are
        shown.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Returns
    -------
    viz : FeatureImportances
        The feature importances visualizer, fitted and finalized.
    """
    # Instantiate the visualizer
    visualizer = FeatureImportances(
        estimator,
        ax=ax,
        labels=labels,
        relative=relative,
        absolute=absolute,
        xlabel=xlabel,
        stack=stack,
        colors=colors,
        colormap=colormap,
        is_fitted=is_fitted,
        topn=topn,
        **kwargs
    )

    # Fit and transform the visualizer (calls draw)
    visualizer.fit(X, y)

    if show:
        visualizer.show()
    else:
        visualizer.finalize()

    # Return the visualizer
    return visualizer
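
# A hedged end-of-module demo (an addition for illustration, not upstream
# code) showing the quick method with the ``topn`` filter; the Lasso
# estimator and the diabetes dataset are assumptions chosen for the example.
if __name__ == "__main__":
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Lasso

    X, y = load_diabetes(return_X_y=True)

    # Plot only the five coefficients largest in absolute magnitude;
    # topn=-5 would instead select the five smallest
    feature_importances(Lasso(), X, y, relative=False, topn=5)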