# Authors: The scikit-autoeval developers
# SPDX-License-Identifier: BSD-3-Clause
from typing import Any, Callable, Dict, List, Mapping, Tuple, Union
import numpy as np
from sklearn.metrics import accuracy_score
from skeval.base import BaseEvaluator
from skeval.utils import check_is_fitted
class ConfidenceThresholdEvaluator(BaseEvaluator):
"""Confidence-based evaluator for classification models.
This evaluator filters the predictions of a classification model according to
a confidence threshold. Only predictions whose confidence (top-class
probability, or another chosen score) is greater than or equal to the given
threshold are treated as "trusted"; the remaining predictions are flipped
(in the binary case) to build an expected label vector used for metric
estimation.
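For instance, with binary predictions and a boolean "trusted" mask derived
from the threshold, the expected labels are built as follows (an illustrative
sketch of the rule, not part of the public API):
>>> y_pred = [1, 0, 0]
>>> trusted = [True, False, True]  # confidence >= threshold
>>> [p if t else (p + 1) % 2 for p, t in zip(y_pred, trusted)]
[1, 1, 0]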
Parameters
----------
model : object
Any classifier implementing ``fit``, ``predict`` and either
``predict_proba`` or ``decision_function``.
scorer : callable or dict of str -> callable, default=accuracy_score
Single scoring function or mapping of metric names to callables with
signature ``scorer(y_true, y_pred)``.
verbose : bool, default=False
If ``True``, prints intermediate information during fitting and
estimation.
Attributes
----------
model : object
The primary model evaluated.
scorer : callable or dict
Scoring function(s) applied to the confidence-adjusted estimated labels.
verbose : bool
Verbosity flag.
Examples
--------
Example using medical datasets and a RandomForest pipeline:
>>> import pandas as pd
>>> from sklearn.metrics import accuracy_score, f1_score
>>> from sklearn.impute import KNNImputer
>>> from sklearn.pipeline import make_pipeline
>>> from sklearn.ensemble import RandomForestClassifier
>>> from skeval.evaluators.confidence import ConfidenceThresholdEvaluator
>>> from skeval.utils import get_cv_and_real_scores, print_comparison
>>> # 1. Load datasets
>>> df_geriatrics = pd.read_csv("geriatria.csv")
>>> df_neurology = pd.read_csv("neurologia.csv")
>>> # 2. Separate features and target
>>> X1, y1 = df_geriatrics.drop(columns=["Alzheimer"]), df_geriatrics["Alzheimer"]
>>> X2, y2 = df_neurology.drop(columns=["Alzheimer"]), df_neurology["Alzheimer"]
>>> # 3. Define model pipeline
>>> model = make_pipeline(
... KNNImputer(n_neighbors=4),
... RandomForestClassifier(n_estimators=300, random_state=42),
... )
>>> # 4. Initialize evaluator with scorers
>>> scorers = {
... "accuracy": accuracy_score,
... "f1_macro": lambda y, p: f1_score(y, p, average="macro"),
... }
>>> evaluator = ConfidenceThresholdEvaluator(model=model, scorer=scorers)
>>> # 5. Fit evaluator
>>> evaluator.fit(X1, y1)
>>> # 6. Estimated performance (using confidence threshold)
>>> estimated_scores = evaluator.estimate(X2, threshold=0.65, limit_to_top_class=True)
>>> # 7. Cross-validation and real performance comparison
>>> scores_dict = get_cv_and_real_scores(
... model=model, scorers=scorers, train_data=(X1, y1), test_data=(X2, y2)
... )
>>> cv_scores = scores_dict["cv_scores"]
>>> real_scores = scores_dict["real_scores"]
>>> print_comparison(scorers, cv_scores, estimated_scores, real_scores)
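A smaller, self-contained sketch on synthetic data (the dataset and classifier
below are illustrative stand-ins, not part of the medical example above):
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> X, y = make_classification(n_samples=200, random_state=0)
>>> ev = ConfidenceThresholdEvaluator(model=LogisticRegression(max_iter=1000))
>>> _ = ev.fit(X, y)
>>> scores = ev.estimate(X, threshold=0.7)
>>> sorted(scores)
['score']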
"""
def __init__(
self,
model: Any,
scorer: Union[
Callable[..., Any], Mapping[str, Callable[..., Any]]
] = accuracy_score,
verbose: bool = False,
) -> None:
super().__init__(model=model, scorer=scorer, verbose=verbose)
def fit(self, x: Any, y: Any) -> "ConfidenceThresholdEvaluator":
"""
Fits the model to the training data.
Parameters
----------
x : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like of shape (n_samples,)
The target labels.
Returns
-------
self : object
Returns the instance itself.
"""
self.model.fit(x, y)
if self.verbose:
print("[INFO] Model has been trained.")
return self
def estimate(
self, x_eval: Any, threshold: float = 0.65, limit_to_top_class: bool = True
) -> Dict[str, float]:
"""
Estimates scores based on the confidence threshold.
This method computes the prediction confidences, flips predictions whose
confidence falls below the threshold (binary case) to build an estimated
label vector, and then evaluates the score(s) specified in ``scorer``
against the model's predictions.
Parameters
----------
x_eval : array-like of shape (n_samples, n_features)
Input data for which to estimate scores.
threshold : float, default=0.65
The minimum confidence required for a prediction to be treated as trusted.
When the model only exposes ``decision_function``, confidences are raw
decision margins rather than probabilities, so the threshold is on a
different scale.
limit_to_top_class : bool, default=True
If True, uses only the probability of the top class as the confidence score.
Returns
-------
dict
A dictionary with estimated scores for each scorer.
If no predictions pass the threshold, it returns 0.0 for each scorer.
"""
check_is_fitted(self.model)
conf, correct = self.__get_confidences_and_correct(
x_eval, threshold, limit_to_top_class
)
self._print_verbose_confidence_info(conf, correct)
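# Fall back to zero scores when no prediction clears the threshold.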
if not np.any(correct):
return self._handle_no_confident_predictions()
y_pred = self.model.predict(x_eval)
y_estimated = self._build_estimated_labels(y_pred, correct)
self._print_verbose_label_info(y_pred, y_estimated)
return self._compute_scores(y_estimated, y_pred)
def _print_verbose_confidence_info(
self, conf: np.ndarray, correct: np.ndarray
) -> None:
if self.verbose:
print("[INFO] Confidences:", conf)
print("[INFO] Passed threshold:", correct)
def _handle_no_confident_predictions(self) -> Dict[str, float]:
if self.verbose:
print("[INFO] No predictions passed the threshold.")
return {name: 0.0 for name in self._get_scorer_names()}
def _build_estimated_labels(self, y_pred: Any, correct: np.ndarray) -> List[int]:
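# Keep trusted predictions as-is; flip the rest with (y + 1) % 2,
# which assumes binary labels encoded as 0/1.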
y_estimated = [
y_pred[i] if c == 1 else (y_pred[i] + 1) % 2 for i, c in enumerate(correct)
]
return [int(y) for y in y_estimated]
def _print_verbose_label_info(self, y_pred: Any, y_estimated: List[int]) -> None:
if self.verbose:
print("[INFO] y_pred:", y_pred)
print("[INFO] y_estimated:", y_estimated)
def _compute_scores(self, y_estimated: List[int], y_pred: Any) -> Dict[str, float]:
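# Scorers are called as scorer(y_estimated, y_pred): the confidence-adjusted
# labels stand in for the ground truth.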
if isinstance(self.scorer, dict):
scores: Dict[str, float] = {
name: float(func(y_estimated, y_pred))
for name, func in self.scorer.items()
}
if self.verbose:
print("[INFO] Estimated scores:", scores)
return scores
if callable(self.scorer):
score_val = float(self.scorer(y_estimated, y_pred))
if self.verbose:
print("[INFO] Estimated score:", score_val)
return {"score": score_val}
raise ValueError("'scorer' must be a callable or a dict of callables.")
def __get_confidences_and_correct(
self, x: Any, threshold: float, limit_to_top_class: bool
) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes confidence scores and applies the confidence threshold.
"""
if hasattr(self.model, "predict_proba"):
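# Class probabilities serve as confidences; with limit_to_top_class, only
# the top-class probability is kept for each sample.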
probas = self.model.predict_proba(x)
conf: np.ndarray = (
np.max(probas, axis=1) if limit_to_top_class else np.asarray(probas)
)
elif hasattr(self.model, "decision_function"):
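# decision_function yields raw margins rather than probabilities: take the
# per-sample maximum across classes (multiclass) or the absolute margin (binary).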
decision = self.model.decision_function(x)
conf = (
np.max(decision, axis=1)
if getattr(decision, "ndim", 1) > 1
else np.abs(np.asarray(decision))
)
else:
raise ValueError(
"The model must implement predict_proba or decision_function."
)
correct = conf >= threshold
return conf, correct