# -*- coding: utf-8 -*-
"""Metrics for the regression setting."""
from __future__ import annotations
from typing import List
import numpy as np
from numpy.typing import ArrayLike
from pydantic import BaseModel
from sklearn.metrics import (
max_error,
mean_absolute_error,
mean_absolute_percentage_error,
mean_squared_error,
r2_score,
)
__all__ = ["RegressionMetrics", "get_regression_metrics", "top_n_in_top_k"]
def top_n_in_top_k(
predictions: ArrayLike, labels: ArrayLike, k: int, n: int, maximize: bool = True
) -> int:
"""Find how many of the top n predictions are in the top k labels.
Args:
predictions (ArrayLike): predictions for one objective
labels (ArrayLike): true labels for one objective
k (int): number of top labels to consider
n (int): number of top predictions to consider
maximize (bool): Set to `True` if larger is better.
Defaults to True.
Examples:
>>> predictions = [0.1, 0.2, 0.3, 0.4, 0.5]
>>> labels = [0.1, 0.2, 0.3, 0.4, 0.5]
>>> top_n_in_top_k(predictions, labels, k=2, n=2)
2
Returns:
int: number of top n predictions in top k labels
"""
    # Rank both arrays; np.argsort sorts ascending, so reverse when larger is better.
    indices_predictions = np.argsort(predictions)
    indices_labels = np.argsort(labels)
    if maximize:
        indices_predictions = indices_predictions[::-1]
        indices_labels = indices_labels[::-1]
    top_n_predictions = indices_predictions[:n]
    top_k_labels = indices_labels[:k]
    # Count how many of the top-n predicted indices appear among the top-k label indices.
    # Cast to a plain int so the annotated return type holds (np.sum returns np.int64).
    return int(np.sum(np.isin(top_n_predictions, top_k_labels)))


class RegressionMetrics(BaseModel):
"""Model for regression metrics.
We will use this model to validate the benchmark results.
"""
mean_squared_error: float
mean_absolute_error: float
r2_score: float
max_error: float
mean_absolute_percentage_error: float
top_5_in_top_5: int
top_10_in_top_10: int
top_50_in_top_50: int
top_100_in_top_100: int
top_500_in_top_500: int


class RegressionMetricsConcat(BaseModel):
    """Regression metrics concatenated across runs, one list entry per run (e.g., per fold)."""
mean_squared_error: List[float]
mean_absolute_error: List[float]
r2_score: List[float]
max_error: List[float]
mean_absolute_percentage_error: List[float]
top_5_in_top_5: List[int]
top_10_in_top_10: List[int]
top_50_in_top_50: List[int]
top_100_in_top_100: List[int]
top_500_in_top_500: List[int]
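

# A minimal sketch of how RegressionMetricsConcat could be filled: stack a list
# of per-run RegressionMetrics field by field. `concat_regression_metrics` is an
# illustrative helper (note it is not exported via __all__), not part of the
# module's established API.
def concat_regression_metrics(
    metrics: List[RegressionMetrics],
) -> RegressionMetricsConcat:
    """Stack per-run metrics into one RegressionMetricsConcat."""
    field_names = list(RegressionMetrics.__fields__)
    return RegressionMetricsConcat(
        **{name: [getattr(m, name) for m in metrics] for name in field_names}
    )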


def get_regression_metrics(predictions: ArrayLike, labels: ArrayLike) -> RegressionMetrics:
"""Get regression metrics.
Args:
predictions (ArrayLike): predictions for one objective
labels (ArrayLike): true labels for one objective
Returns:
RegressionMetrics: regression metrics
Examples:
>>> predictions = [0.1, 0.2, 0.3, 0.4, 0.5]
>>> labels = [0.1, 0.2, 0.3, 0.4, 0.5]
>>> get_regression_metrics(predictions, labels)
RegressionMetrics(**{'mean_absolute_error': 0.0,
'mape': 0.0,
'mean_squared_error': 0.0,
'r2': 1.0,
'max_error': 0.0,
'top_5_in_top_5': 5,
'top_10_in_top_10': 10,
'top_100_in_top_100': 100,
'top_500_in_top_500': 500})
"""
metrics = RegressionMetrics(
**{
"mean_squared_error": mean_squared_error(labels, predictions),
"mean_absolute_error": mean_absolute_error(labels, predictions),
"r2_score": r2_score(labels, predictions),
"max_error": max_error(labels, predictions),
"mean_absolute_percentage_error": mean_absolute_percentage_error(labels, predictions),
"top_5_in_top_5": top_n_in_top_k(predictions, labels, k=5, n=5),
"top_10_in_top_10": top_n_in_top_k(predictions, labels, k=10, n=10),
"top_50_in_top_50": top_n_in_top_k(predictions, labels, k=50, n=50),
"top_100_in_top_100": top_n_in_top_k(predictions, labels, k=100, n=100),
"top_500_in_top_500": top_n_in_top_k(predictions, labels, k=500, n=500),
}
)
return metrics
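

if __name__ == "__main__":
    # Illustrative smoke test on synthetic data (assumed here, not part of the
    # benchmark): near-perfect predictions of random labels should score close
    # to zero on the error metrics and near the maximum on the top-n counts.
    rng = np.random.default_rng(seed=0)
    labels = rng.normal(loc=10.0, size=1000)
    predictions = labels + rng.normal(scale=0.01, size=1000)
    print(get_regression_metrics(predictions, labels))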