Source code for skeval.evaluators.agreement

# Authors: The scikit-autoeval developers
# SPDX-License-Identifier: BSD-3-Clause
from typing import Any, Dict, Mapping, Optional, Union

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from skeval.base import BaseEvaluator
from skeval.utils import check_is_fitted



[docs]
class AgreementEvaluator(BaseEvaluator):
    """Agreement-based evaluator for supervised classification models.

    This evaluator compares predictions produced by a *primary* model (``model``)
    and a *secondary* model (``sec_model``) on an evaluation set. For each
    sample, an agreement indicator is defined as ``1`` when both models predict
    the same class and ``0`` otherwise. Using this indicator, an *expected label*
    vector is created by flipping the primary model's prediction when the models
    disagree. Metric(s) are then computed comparing the expected label vector
    to the agreement indicator, providing an estimate of how often the primary
    model's predictions would align with a plausible correction strategy based
    on model disagreement.

    Evaluation workflow:
    1. Fit both the primary and secondary models on the training data.
    2. Generate predictions for both models on the evaluation data.
    3. Build the agreement vector (1 = same prediction, 0 = different).
    4. Produce an expected label vector, flipping predictions where disagreement occurs.
    5. Compute the chosen metric(s) using the scorer(s).

    Parameters
    ----------
    model : estimator
        A classification estimator implementing ``fit`` and ``predict``.
        May be a single estimator or a pipeline created with
        ``sklearn.pipeline.make_pipeline``.
    scorer : callable or dict of str -> callable, default=accuracy_score
        A single scoring function or a dictionary mapping metric names to
        scoring callables. Each scorer must follow the signature
        ``scorer(y_true, y_pred)``.
    verbose : bool, default=False
        If ``True``, prints progress information during fit and estimate.
    sec_model : estimator, optional
        Secondary classification model used solely to generate comparison
        predictions. If ``None``, defaults to ``GaussianNB()``.

    Attributes
    ----------
    model : estimator
        The primary model provided at initialization.
    sec_model : estimator
        The secondary model used to create agreement signals.

    Notes
    -----
    This evaluator assumes both models output class labels directly via
    ``predict``. No probability calibration is performed. The metric(s) are
    computed on synthetic targets produced from model agreement—not against
    real ground-truth labels—so scores should be interpreted as *agreement-based
    estimates*, not actual performance metrics.

    Examples
    --------
    Basic usage with two RandomForest pipelines and multiple scorers:

    >>> import pandas as pd
    >>> from sklearn.metrics import accuracy_score, f1_score
    >>> from sklearn.impute import KNNImputer
    >>> from sklearn.pipeline import make_pipeline
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from skeval.evaluators.agreement import AgreementEvaluator
    >>> from skeval.utils import get_cv_and_real_scores, print_comparison
    >>> df_geriatrics = pd.read_csv("geriatria.csv")
    >>> df_neurology = pd.read_csv("neurologia.csv")
    >>> X1, y1 = df_geriatrics.drop(columns=["Alzheimer"]), df_geriatrics["Alzheimer"]
    >>> X2, y2 = df_neurology.drop(columns=["Alzheimer"]), df_neurology["Alzheimer"]
    >>> model = make_pipeline(
    ...     KNNImputer(n_neighbors=10),
    ...     RandomForestClassifier(n_estimators=50, random_state=42),
    ... )
    >>> sec_model = make_pipeline(
    ...     KNNImputer(n_neighbors=10),
    ...     RandomForestClassifier(n_estimators=100, random_state=42),
    ... )
    >>> scorers = {
    ...     "accuracy": accuracy_score,
    ...     "f1_macro": lambda y, p: f1_score(y, p, average="macro"),
    ... }
    >>> evaluator = AgreementEvaluator(model=model, sec_model=sec_model, scorer=scorers)
    >>> evaluator.fit(X1, y1)
    >>> estimated_scores = evaluator.estimate(X2)
    >>> # Optionally compare with CV and real scores
    >>> scores_dict = get_cv_and_real_scores(
    ...     model=model, scorers=scorers, train_data=(X1, y1), test_data=(X2, y2)
    ... )
    >>> cv_scores = scores_dict["cv_scores"]
    >>> real_scores = scores_dict["real_scores"]
    >>> print_comparison(scorers, cv_scores, estimated_scores, real_scores)
    """

    def __init__(
        self,
        model: Any,
        scorer: Union[Mapping[str, Any], Any] = accuracy_score,
        verbose: bool = False,
        sec_model: Optional[Any] = None,
    ) -> None:
        super().__init__(model=model, scorer=scorer, verbose=verbose)

        self.sec_model = sec_model if sec_model is not None else GaussianNB()


[docs]
    def fit(self, x: Any, y: Any) -> "AgreementEvaluator":
        """Fit the evaluator by training both primary and secondary models.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Feature matrix used to fit both models.
        y : array-like of shape (n_samples,)
            Target labels corresponding to ``x``.

        Returns
        -------
        self : AgreementEvaluator
            The fitted evaluator instance.
        """

        self.model.fit(x, y)
        self.sec_model.fit(x, y)

        if self.verbose:
            print("[INFO] Fit completed.")

        return self



[docs]
    def estimate(self, x_eval: Any) -> Dict[str, float]:
        """Estimate agreement-based metric values on evaluation data.

        Generates predictions from both models, constructs an agreement vector
        and an expected label vector (flipping the primary prediction when
        disagreement occurs), then applies the configured scorer(s).

        Parameters
        ----------
        x_eval : array-like of shape (n_samples, n_features)
            Evaluation feature matrix.

        Returns
        -------
        scores : dict
            If ``scorer`` is a dict, returns a mapping from metric name to
            agreement-based score. Otherwise returns ``{"score": float}``.

        Raises
        ------
        ValueError
            If ``scorer`` is neither a callable nor a dict of callables.
        """

        check_is_fitted(self.model)
        check_is_fitted(self.sec_model)

        pred_main = self.model.predict(x_eval)
        pred_secondary = self.sec_model.predict(x_eval)

        agreement = (pred_main == pred_secondary).astype(int)
        y_agreement = [p if a else 1 - p for p, a in zip(pred_main, agreement)]

        if isinstance(self.scorer, dict):
            score: Dict[str, float] = {
                name: float(metric(y_agreement, agreement))
                for name, metric in self.scorer.items()
            }
            if self.verbose:
                print("[INFO] Estimated score:", score)

            return score
        if callable(self.scorer):
            score_val = float(self.scorer(y_agreement, agreement))
            if self.verbose:
                print("[INFO] Estimated score:", score_val)
            return {"score": score_val}
        raise ValueError("'scorer' must be a callable or a dict of callables.")