Source code for yellowbrick.contrib.prepredict

# yellowbrick.contrib.prepredict
# PrePredict estimator allows Yellowbrick to work with results produced by an estimator.
#
# Author:  Benjamin Bengfort <benjamin@bengfort.com>
# Created: Mon Jul 12 07:07:33 2021 -0400
#
# ID: prepredict.py [] benjamin@bengfort.com $

"""
PrePredict estimator allows Yellowbrick to work with results produced by an estimator
prior to the visual diagnostic workflow, particularly for inferences that require
extensive time or compute resources.
"""

##########################################################################
## Imports
##########################################################################

import pathlib
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, r2_score, silhouette_score
from yellowbrick.contrib.wrapper import CLASSIFIER, CLUSTERER, REGRESSOR


class PrePredict(BaseEstimator):
    """
    The PrePredict estimator allows users to specify pre-predicted results to
    Yellowbrick without the need to input the original estimator. Note that
    Yellowbrick often uses the learned attributes of the estimator to produce rich
    visual diagnostics, so this estimator may not work for all Yellowbrick visualizers.

    The PrePredict estimator can accept data either in memory as a numpy array or it
    can accept a string, which it interprets as a path on disk to load the data from.
    Currently PrePredict does not support predict_proba or decision_function methods,
    which it could if it was passed predicted data as 2D array instead of a 1D array.

    Parameters
    ----------
    data : array-like, func, or file-like object, string, or pathlib.Path
        The predicted values wrapped by the estimator and returned on predict() and
        used by the score function. The default expectation is that data is a 1D
        numpy array of y_hat or y_pred values produced by some other estimator. Data
        can also be a func, which is called and returned, or a file-like object,
        string, or pathlib.Path at which point the data is loaded from disk using
        ``np.load``.

    estimator_type : str, optional
        One of "classifier", "regressor", "clusterer", "DensityEstimator", or
        "outlier_detector" that allows the contrib estimator to pass the scikit-learn
        ``is_classifier``, etc. functions. If not specified, the Yellowbrick
        visualizer you're trying to use may error.
    """

    def __init__(self, data, estimator_type=None):
        self.data = data
        # Keep the value under the constructor argument's own name: the inherited
        # get_params() (and therefore repr() and clone()) looks parameters up by
        # that name. ``_estimator_type`` below is an alias onto this attribute.
        self.estimator_type = estimator_type

    @property
    def _estimator_type(self):
        """
        Alias of ``estimator_type`` under the private name used for estimator
        type checks; reading and assigning it behaves as it did when it was a
        plain attribute.
        """
        return self.estimator_type

    @_estimator_type.setter
    def _estimator_type(self, value):
        self.estimator_type = value

    def fit(self, X, y=None):
        """
        Fit is a no-op, simply returning self per the scikit-learn API.
        """
        return self

    def predict(self, X):
        """
        Predict returns the embedded data but does not perform any checks on the
        validity of X (e.g. that it has the same shape as the internal data).
        """
        return self._load()

    def score(self, X, y=None):
        """
        Score uses an appropriate metric for the estimator type and compares the
        input y values with the pre-predicted values.

        Classifiers are scored with ``accuracy_score`` and regressors with
        ``r2_score``; in both cases y is handed to the metric as given. Clusterers
        are scored with ``silhouette_score`` on X, using y as the labels when it is
        supplied and the pre-predicted values otherwise. Any other estimator type
        yields ``np.nan``.
        """
        if self._estimator_type == CLASSIFIER:
            return accuracy_score(y, self._load())

        if self._estimator_type == REGRESSOR:
            return r2_score(y, self._load())

        if self._estimator_type == CLUSTERER:
            labels = y if y is not None else self._load()
            return silhouette_score(X, labels)

        # If the estimator type is unknown return NaN since the score can't be computed.
        return np.nan

    def _load(self):
        """
        Loads the data by performing type checking to determine if data is a
        callable whose result needs to be returned, or an argument that supports
        from disk loading. If neither of these things, then assumes the data is
        array-like and returns it directly.
        """
        if callable(self.data):
            return self.data()

        # Anything with a ``read`` attribute, a string, or a Path is handed to
        # np.load; nothing is cached, so the source is re-read on every call.
        if hasattr(self.data, "read") or isinstance(self.data, (str, pathlib.Path)):
            return np.load(self.data)

        return self.data