# Authors: The scikit-autoeval developers
# SPDX-License-Identifier: BSD-3-Clause
from typing import Any, Callable, Dict, List, Mapping, Tuple, Union
import numpy as np
from sklearn.metrics import accuracy_score
from skeval.base import BaseEvaluator
from skeval.utils import check_is_fitted
class ConfidenceThresholdEvaluator(BaseEvaluator):
"""Confidence-based evaluator for classification models.
This evaluator filters the predictions of a classification model according to
a confidence threshold. Only predictions whose confidence (top-class
probability, or another chosen score) is greater than or equal to the given
threshold are treated as "trusted"; the remaining predictions are flipped
(in the binary case) to build an expected label vector used for metric
estimation.
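For instance, with binary predictions and a boolean "trusted" mask derived
from the threshold, the expected labels are built as follows (an illustrative
sketch of the rule, not part of the public API):
>>> y_pred = [1, 0, 0]
>>> trusted = [True, False, True]  # confidence >= threshold
>>> [p if t else (p + 1) % 2 for p, t in zip(y_pred, trusted)]
[1, 1, 0]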
Parameters
----------
model : object
Any classifier implementing ``fit``, ``predict`` and either
``predict_proba`` or ``decision_function``.
scorer : callable or dict of str -> callable, default=accuracy_score
Single scoring function or mapping of metric names to callables with
signature ``scorer(y_true, y_pred)``.
verbose : bool, default=False
If ``True``, prints intermediate information during fitting and
estimation.
Attributes
----------
model : object
The primary model evaluated.
scorer : callable or dict
Scoring function(s) applied to the confidence-adjusted estimated labels.
verbose : bool
Verbosity flag.
Examples
--------
Example using medical datasets and a RandomForest pipeline:
>>> import pandas as pd
>>> from sklearn.metrics import accuracy_score, f1_score
>>> from sklearn.impute import KNNImputer
>>> from sklearn.pipeline import make_pipeline
>>> from sklearn.ensemble import RandomForestClassifier
>>> from skeval.evaluators.confidence import ConfidenceThresholdEvaluator
>>> from skeval.utils import get_cv_and_real_scores, print_comparison
>>> # 1. Load datasets
>>> df_geriatrics = pd.read_csv("geriatria.csv")
>>> df_neurology = pd.read_csv("neurologia.csv")
>>> # 2. Separate features and target
>>> X1, y1 = df_geriatrics.drop(columns=["Alzheimer"]), df_geriatrics["Alzheimer"]
>>> X2, y2 = df_neurology.drop(columns=["Alzheimer"]), df_neurology["Alzheimer"]
>>> # 3. Define model pipeline
>>> model = make_pipeline(
... KNNImputer(n_neighbors=4),
... RandomForestClassifier(n_estimators=300, random_state=42),
... )
>>> # 4. Initialize evaluator with scorers
>>> scorers = {
... "accuracy": accuracy_score,
... "f1_macro": lambda y, p: f1_score(y, p, average="macro"),
... }
>>> evaluator = ConfidenceThresholdEvaluator(model=model, scorer=scorers)
>>> # 5. Fit evaluator
>>> evaluator.fit(X1, y1)
>>> # 6. Estimated performance (using confidence threshold)
>>> estimated_scores = evaluator.estimate(X2, threshold=0.65, limit_to_top_class=True)
>>> # 7. Cross-validation and real performance comparison
>>> scores_dict = get_cv_and_real_scores(
... model=model, scorers=scorers, train_data=(X1, y1), test_data=(X2, y2)
... )
>>> cv_scores = scores_dict["cv_scores"]
>>> real_scores = scores_dict["real_scores"]
>>> print_comparison(scorers, cv_scores, estimated_scores, real_scores)
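A smaller, self-contained sketch on synthetic data (the dataset and classifier
below are illustrative stand-ins, not part of the medical example above):
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> X, y = make_classification(n_samples=200, random_state=0)
>>> ev = ConfidenceThresholdEvaluator(model=LogisticRegression(max_iter=1000))
>>> _ = ev.fit(X, y)
>>> scores = ev.estimate(X, threshold=0.7)
>>> sorted(scores)
['score']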
"""
def __init__(
self,
model: Any,
scorer: Union[
Callable[..., Any], Mapping[str, Callable[..., Any]]
] = accuracy_score,
verbose: bool = False,
) -> None:
super().__init__(model=model, scorer=scorer, verbose=verbose)
def fit(self, x: Any, y: Any) -> "ConfidenceThresholdEvaluator":
"""
Fits the model to the training data.
Parameters
----------
x : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like of shape (n_samples,)
The target labels.
Returns
-------
self : object
Returns the instance itself.
"""
self.model.fit(x, y)
if self.verbose:
print("[INFO] Model has been trained.")
return self
def estimate(
self, x_eval: Any, threshold: float = 0.65, limit_to_top_class: bool = True
) -> Dict[str, float]:
"""
Estimates scores based on the confidence threshold.
This method computes the prediction confidences, flips predictions whose
confidence falls below the threshold (binary case) to build an estimated
label vector, and then evaluates the score(s) specified in ``scorer``
against the model's predictions.
Parameters
----------
x_eval : array-like of shape (n_samples, n_features)
Input data for which to estimate scores.
threshold : float, default=0.65
The minimum confidence required for a prediction to be treated as trusted.
When the model only exposes ``decision_function``, confidences are raw
decision margins rather than probabilities, so the threshold is on a
different scale.
limit_to_top_class : bool, default=True
If True, uses only the probability of the top class as the confidence score.
Returns
-------
dict
A dictionary with estimated scores for each scorer.
If no predictions pass the threshold, it returns 0.0 for each scorer.
"""
check_is_fitted(self.model)
conf, correct = self.__get_confidences_and_correct(
x_eval, threshold, limit_to_top_class
)
self._print_verbose_confidence_info(conf, correct)
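# Fall back to zero scores when no prediction clears the threshold.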
if not np.any(correct):
return self._handle_no_confident_predictions()
y_pred = self.model.predict(x_eval)
y_estimated = self._build_estimated_labels(y_pred, correct)
self._print_verbose_label_info(y_pred, y_estimated)
return self._compute_scores(y_estimated, y_pred)
def _print_verbose_confidence_info(
self, conf: np.ndarray, correct: np.ndarray
) -> None:
if self.verbose:
print("[INFO] Confidences:", conf)
print("[INFO] Passed threshold:", correct)
def _handle_no_confident_predictions(self) -> Dict[str, float]:
if self.verbose:
print("[INFO] No predictions passed the threshold.")
return {name: 0.0 for name in self._get_scorer_names()}
def _build_estimated_labels(self, y_pred: Any, correct: np.ndarray) -> List[int]:
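# Keep trusted predictions as-is; flip the rest with (y + 1) % 2,
# which assumes binary labels encoded as 0/1.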
y_estimated = [
y_pred[i] if c == 1 else (y_pred[i] + 1) % 2 for i, c in enumerate(correct)
]
return [int(y) for y in y_estimated]
def _print_verbose_label_info(self, y_pred: Any, y_estimated: List[int]) -> None:
if self.verbose:
print("[INFO] y_pred:", y_pred)
print("[INFO] y_estimated:", y_estimated)
def _compute_scores(self, y_estimated: List[int], y_pred: Any) -> Dict[str, float]:
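# Scorers are called as scorer(y_estimated, y_pred): the confidence-adjusted
# labels stand in for the ground truth.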
if isinstance(self.scorer, dict):
scores: Dict[str, float] = {
name: float(func(y_estimated, y_pred))
for name, func in self.scorer.items()
}
if self.verbose:
print("[INFO] Estimated scores:", scores)
return scores
if callable(self.scorer):
score_val = float(self.scorer(y_estimated, y_pred))
if self.verbose:
print("[INFO] Estimated score:", score_val)
return {"score": score_val}
raise ValueError("'scorer' must be a callable or a dict of callables.")
def __get_confidences_and_correct(
self, x: Any, threshold: float, limit_to_top_class: bool
) -> Tuple[np.ndarray, np.ndarray]:
"""
Computes confidence scores and applies the confidence threshold.
"""
if hasattr(self.model, "predict_proba"):
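# Class probabilities serve as confidences; with limit_to_top_class, only
# the top-class probability is kept for each sample.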
probas = self.model.predict_proba(x)
conf: np.ndarray = (
np.max(probas, axis=1) if limit_to_top_class else np.asarray(probas)
)
elif hasattr(self.model, "decision_function"):
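# decision_function yields raw margins rather than probabilities: take the
# per-sample maximum across classes (multiclass) or the absolute margin (binary).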
decision = self.model.decision_function(x)
conf = (
np.max(decision, axis=1)
if getattr(decision, "ndim", 1) > 1
else np.abs(np.asarray(decision))
)
else:
raise ValueError(
"The model must implement predict_proba or decision_function."
)
correct = conf >= threshold
return conf, correct