Source code for yellowbrick.classifier.confusion_matrix

# yellowbrick.classifier.confusion_matrix
# Visual confusion matrix for classifier scoring.
#
# Author:   Neal Humphrey
# Created:  Tue May 03 11:05:11 2017 -0700
#
# Copyright (C) 2017 District Data Labs
# For license information, see LICENSE.txt
#
# ID: confusion_matrix.py [5388065] neal@nhumphrey.com $

"""
Visual confusion matrix for classifier scoring.
"""

##########################################################################
## Imports
##########################################################################

import numpy as np

from sklearn.metrics import confusion_matrix

from ..utils import div_safe
from ..style import find_text_color
from ..style.palettes import color_sequence
from .base import ClassificationScoreVisualizer


##########################################################################
## ConfusionMatrix
##########################################################################

# Color handed to ``cmap.set_over`` by ConfusionMatrix: heatmap cells whose
# value exceeds the ``vmax`` given to ``pcolormesh`` are painted with it.
CMAP_OVERCOLOR = '#2a7d4f'


class ConfusionMatrix(ClassificationScoreVisualizer):
    """
    Creates a heatmap visualization of the sklearn.metrics.confusion_matrix().
    A confusion matrix shows each combination of the true and predicted
    classes for a test data set.

    The default color map uses a yellow/orange/red color scale. The user can
    choose between displaying values as the percent of true (cell value
    divided by sum of row) or as direct counts. If percent of true mode is
    selected, 100% accurate predictions are highlighted in green.

    Requires a classification model

    Parameters
    ----------
    model : the Scikit-Learn estimator
        Should be an instance of a classifier or __init__ will return an error.

    ax : the matplotlib axis to plot the figure on (if None, a new axis will
        be created)

    classes : list, default: None
        a list of class names to use in the confusion_matrix.
        This is passed to the 'labels' parameter of
        sklearn.metrics.confusion_matrix(), and follows the behaviour
        indicated by that function. It may be used to reorder or select a
        subset of labels. If None, values that appear at least once in
        y_true or y_pred are used in sorted order.

    label_encoder : dict or LabelEncoder, default: None
        When specifying the ``classes`` argument, the input to ``fit()``
        and ``score()`` must match the expected labels. If the ``X`` and ``y``
        datasets have been encoded prior to training and the labels must be
        preserved for the visualization, use this argument to provide a
        mapping from the encoded class to the correct label. Because typically
        a Scikit-Learn ``LabelEncoder`` is used to perform this operation, you
        may provide it directly to the class to utilize its fitted encoding.

    kwargs : dict
        Keyword arguments forwarded to the super class. The ``cmap`` keyword
        (default ``'YlOrRd'``) is additionally read here and handed to
        ``color_sequence`` to build the heatmap color map.

    Examples
    --------
    >>> from yellowbrick.classifier import ConfusionMatrix
    >>> from sklearn.linear_model import LogisticRegression
    >>> viz = ConfusionMatrix(LogisticRegression())
    >>> viz.fit(X_train, y_train)
    >>> viz.score(X_test, y_test)
    >>> viz.poof()
    """

    def __init__(self, model, ax=None, classes=None, label_encoder=None,
                 **kwargs):
        super(ConfusionMatrix, self).__init__(
            model, ax=ax, classes=classes, **kwargs
        )

        # Initialize all the other attributes we'll use (for coder clarity)
        self.confusion_matrix = None

        # Values below vmin are drawn white, values above vmax in the
        # "over" color (see the vmin/vmax passed to pcolormesh in draw()).
        self.cmap = color_sequence(kwargs.pop('cmap', 'YlOrRd'))
        self.cmap.set_under(color='w')
        self.cmap.set_over(color=CMAP_OVERCOLOR)

        # Per-cell edge colors, rebuilt on every draw(); used to outline the
        # cells where predicted class == true class.
        self.edgecolors = []
        self.label_encoder = label_encoder

    def _decode_labels(self, values):
        """
        Map encoded target values back to display labels using
        ``self.label_encoder``.

        An encoder exposing ``inverse_transform`` (e.g. a fitted
        ``LabelEncoder``) is called directly; anything else is treated as a
        mapping and indexed per value. Only the attribute lookup decides which
        path is taken, so an ``AttributeError`` raised inside a real encoder
        is not mistaken for "this is a dict".
        """
        inverse_transform = getattr(
            self.label_encoder, 'inverse_transform', None
        )
        if inverse_transform is not None:
            return inverse_transform(values)
        return [self.label_encoder[value] for value in values]

    def score(self, X, y, sample_weight=None, percent=True):
        """
        Generates the Scikit-Learn confusion_matrix and applies this to the
        appropriate axis

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values

        sample_weight: optional, passed to the confusion_matrix

        percent: optional, Boolean. Determines whether or not the
            confusion_matrix should be displayed as raw numbers or as a
            percent of the true predictions. Note, if using a subset of
            classes in __init__, percent should be set to False or inaccurate
            percents will be displayed.

        Returns
        -------
        ax : the axis returned by ``draw()``
        """
        y_pred = self.predict(X)

        if self.label_encoder:
            y = self._decode_labels(y)
            y_pred = self._decode_labels(y_pred)

        self.confusion_matrix = confusion_matrix(
            y, y_pred, labels=self.classes_, sample_weight=sample_weight
        )
        self._class_counts = self.class_counts(y)

        # Make array of only the classes actually being used.
        # Needed because sklearn confusion_matrix only returns counts for
        # selected classes but percent should be calculated based on all
        # classes.
        selected_class_counts = []
        for label in self.classes_:
            try:
                selected_class_counts.append(self._class_counts[label])
            except KeyError:
                # Selected class never occurs in y
                selected_class_counts.append(0)
        self.selected_class_counts = np.array(selected_class_counts)

        return self.draw(percent)

    def draw(self, percent=True):
        """
        Renders the confusion matrix heatmap.

        Should only be called internally, as it uses values calculated in
        score() and score() calls this method.

        Parameters
        ----------
        percent: Boolean
            Whether the heatmap should represent "% of True" or raw counts

        Returns
        -------
        ax : the matplotlib axis the heatmap was drawn on
        """
        n_classes = len(self.classes_)

        if percent:
            # Convert confusion matrix to percent of each row, i.e. the
            # predicted as a percent of true in each class.
            # Rows of the matrix are the true classes, so the counts are
            # reshaped into a column: NumPy broadcasting then divides every
            # row by its own total (a flat vector would be broadcast across
            # the columns instead).
            # div_safe function returns 0 instead of NAN.
            true_totals = np.asarray(self.selected_class_counts).reshape(-1, 1)
            self._confusion_matrix_display = div_safe(
                self.confusion_matrix, true_totals
            )
            self._confusion_matrix_display = np.round(
                self._confusion_matrix_display * 100, decimals=0
            )
        else:
            self._confusion_matrix_display = self.confusion_matrix

        # Y axis should be sorted top to bottom in pcolormesh
        self._confusion_matrix_plottable = (
            self._confusion_matrix_display[::-1, ::]
        )
        self.max = self._confusion_matrix_plottable.max()

        # Set up the dimensions of the pcolormesh (cell boundaries 0..n)
        X = np.linspace(start=0, stop=n_classes, num=n_classes + 1)
        Y = np.linspace(start=0, stop=n_classes, num=n_classes + 1)
        self.ax.set_ylim(
            bottom=0, top=self._confusion_matrix_plottable.shape[0]
        )
        self.ax.set_xlim(
            left=0, right=self._confusion_matrix_plottable.shape[1]
        )

        # Put in custom axis labels, centered on each cell
        self.xticklabels = self.classes_
        self.yticklabels = self.classes_[::-1]
        self.xticks = np.arange(0, n_classes, 1) + .5
        self.yticks = np.arange(0, n_classes, 1) + .5
        self.ax.set(xticks=self.xticks, yticks=self.yticks)
        self.ax.set_xticklabels(
            self.xticklabels, rotation="vertical", fontsize=8
        )
        self.ax.set_yticklabels(self.yticklabels, fontsize=8)

        ######################
        # Add the data labels to each square
        ######################

        # Start from a clean list so repeated draws do not accumulate edge
        # colors left over from earlier calls.
        self.edgecolors = []
        suffix = "%" if percent else ""

        # Row-major traversal, so edgecolors line up with the cell order of
        # the matrix handed to pcolormesh.
        for row in range(n_classes):
            for col in range(n_classes):
                # extract the value
                grid_val = self._confusion_matrix_plottable[row, col]

                # Determine text color. Force a float ratio (integer counts
                # would floor-divide on Python 2) and avoid dividing by zero
                # when the whole matrix is empty.
                if self.max:
                    scaled_grid_val = float(grid_val) / float(self.max)
                else:
                    scaled_grid_val = 0.0
                base_color = self.cmap(scaled_grid_val)
                text_color = find_text_color(base_color)

                # make zero values more subtle
                if grid_val == 0:
                    text_color = "0.75"

                # Put the data labels in the middle of the heatmap square
                # (columns run along the x axis, rows along the y axis)
                self.ax.text(
                    col + 0.5, row + 0.5,
                    "{:.0f}{}".format(grid_val, suffix),
                    va='center', ha='center', fontsize=8, color=text_color
                )

                # If the prediction is correct, put a bounding box around
                # that square to better highlight it to the user. The tick
                # labels are already ordered to match the plottable matrix:
                # yticklabels follow its rows, xticklabels its columns.
                if self.yticklabels[row] == self.xticklabels[col]:
                    self.edgecolors.append('black')
                else:
                    self.edgecolors.append('w')

        # Draw the heatmap. vmin and vmax operate in tandem with the
        # cmap.set_under and cmap.set_over to alter the color of 0 and 100
        vmax = 99.999 if percent else self.max
        self.ax.pcolormesh(
            X, Y, self._confusion_matrix_plottable,
            vmin=0.00001, vmax=vmax,
            edgecolor=self.edgecolors, cmap=self.cmap, linewidth=0.01
        )

        return self.ax

    def finalize(self, **kwargs):
        """
        Sets the plot title and labels the axes: true classes on the y axis
        and predicted classes on the x axis.

        Parameters
        ----------
        kwargs : dict
            Accepted but not used by this method.
        """
        self.set_title('{} Confusion Matrix'.format(self.name))
        self.ax.set_ylabel('True Class')
        self.ax.set_xlabel('Predicted Class')