# Authors: The scikit-autoeval developers
# SPDX-License-Identifier: BSD-3-Clause
from typing import Any, Callable, Dict, Mapping, Union
from sklearn.metrics import mean_absolute_error


def score_error(
    real_scores: Mapping[str, float],
    est_scores: Mapping[str, float],
    comparator: Union[
        Callable[[Any, Any], float], Mapping[str, Callable[[Any, Any], float]]
    ] = mean_absolute_error,
    verbose: bool = False,
) -> Dict[str, float]:
"""
Compares estimated and real scores using a user-defined comparison function.
This function iterates through the metrics present in both `real_scores` and
`est_scores` dictionaries and computes the error between them using
the provided comparator function(s).
Parameters
----------
real_scores : dict
A dictionary of scores computed with true labels.
Example: `{'accuracy': 0.9, 'f1': 0.85}`
est_scores : dict
A dictionary of scores estimated without true labels.
Example: `{'accuracy': 0.88, 'f1': 0.82}`
comparator : callable or dict, default=mean_absolute_error
The function or dictionary of functions used to compare the real and
estimated scores.
- If callable, it's applied to all common metrics.
- If dict, it maps a metric name to a specific comparator function.
verbose : bool, default=False
If True, prints the real score, estimated score, and the resulting
error for each metric.
Returns
-------
dict
A dictionary containing the comparison results (errors) for each
common metric.
Raises
------
ValueError
If `comparator` is not a callable or a dictionary of callables.
Examples
--------
>>> real = {'accuracy': 0.95, 'precision': 0.90, 'recall': 0.85}
>>> estimated = {'accuracy': 0.91, 'precision': 0.92, 'f1_score': 0.88}
>>> # Example 1: Using the default comparator (mean_absolute_error)
>>> errors = score_error(real, estimated)
>>> for metric, error in sorted(errors.items()):
... print(f"{metric}: {error:.4f}")
accuracy: 0.0400
precision: 0.0200
>>> # Example 2: Using a dictionary of different comparators
>>> from sklearn.metrics import mean_squared_error
>>> comparators = {
... 'accuracy': mean_absolute_error,
... 'precision': mean_squared_error
... }
>>> errors_custom = score_error(real, estimated, comparator=comparators, verbose=True)
[accuracy] Real: 0.95, Estimated: 0.91, Error: 0.040000000000000036
[precision] Real: 0.9, Estimated: 0.92, Error: 0.0004000000000000003
>>> for metric, error in sorted(errors_custom.items()):
... print(f"{metric}: {error:.4f}")
accuracy: 0.0400
precision: 0.0004
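    >>> # Example 3: Using a custom callable comparator (an illustrative
    >>> # signed-difference function; any callable taking two array-likes works)
    >>> def signed_diff(real_vals, est_vals):
    ...     return est_vals[0] - real_vals[0]
    >>> errors_signed = score_error(real, estimated, comparator=signed_diff)
    >>> for metric, error in sorted(errors_signed.items()):
    ...     print(f"{metric}: {error:+.2f}")
    accuracy: -0.04
    precision: +0.02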
"""
    result = {}
    if callable(comparator):
        # A single comparator is applied to every metric present in both
        # score dictionaries. Scalars are wrapped in single-element lists so
        # that array-based metrics such as mean_absolute_error accept them.
        for metric in real_scores:
            if metric in est_scores:
                error = comparator([real_scores[metric]], [est_scores[metric]])
                result[metric] = error
                if verbose:
                    print(
                        f"[{metric}] Real: {real_scores[metric]}, "
                        f"Estimated: {est_scores[metric]}, "
                        f"Error: {error}"
                    )
    elif isinstance(comparator, dict):
        # With a dict of comparators, a metric is compared only if it appears
        # in both score dictionaries and has a comparator of its own.
        for metric in real_scores:
            if metric in est_scores and metric in comparator:
                error = comparator[metric](
                    [real_scores[metric]], [est_scores[metric]]
                )
                result[metric] = error
                if verbose:
                    print(
                        f"[{metric}] Real: {real_scores[metric]}, "
                        f"Estimated: {est_scores[metric]}, "
                        f"Error: {error}"
                    )
    else:
        raise ValueError("Comparator must be a callable or a dict of callables.")
    return result
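

# ---------------------------------------------------------------------------
# A minimal usage sketch, assuming only this module and scikit-learn: it mixes
# the default mean_absolute_error with an illustrative relative-error
# comparator via the dict form of `comparator`. Guarded so that importing
# this module has no side effects.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    real = {"accuracy": 0.95, "f1": 0.80}
    estimated = {"accuracy": 0.91, "f1": 0.84}

    def relative_error(real_vals, est_vals):
        # Both arguments arrive as single-element lists (see score_error above).
        return abs(est_vals[0] - real_vals[0]) / real_vals[0]

    per_metric = {"accuracy": mean_absolute_error, "f1": relative_error}
    errors = score_error(real, estimated, comparator=per_metric, verbose=True)
    print({metric: round(err, 4) for metric, err in errors.items()})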