Source code for mofdscribe.bench.logkHCO2

# -*- coding: utf-8 -*-
"""In-dataset predictions for the logarithmic CO2 Henry coefficients."""
from typing import Optional

import numpy as np

from mofdscribe.datasets import CoREDataset
from mofdscribe.splitters.splitters import DensitySplitter, HashSplitter

from .mofbench import MOFBenchRegression

__all__ = ["LogkHCO2IDBench", "LogkHCO2OODBench"]


class LogkHCO2IDBench(MOFBenchRegression):
    """Benchmark models for the logarithmic CO2 Henry coefficient under in-domain conditions.

    The folds are created with a :class:`HashSplitter` that is stratified on
    the target column (``outputs.logKH_CO2``). The intent is that training
    and test sets cover a comparable range of target values.
    """

    def __init__(
        self,
        model,
        name: str,
        version: Optional[str] = "v0.0.1",
        features: Optional[str] = None,
        model_type: Optional[str] = None,
        reference: Optional[str] = None,
        implementation: Optional[str] = None,
        debug: bool = False,
        patch_in_ds: bool = False,
    ):
        """Initialize the log KH CO2 interpolation benchmark.

        Args:
            model (object): The model to be benchmarked.
                Must implement the `fit` and `predict` methods.
            name (str): The name of the modeling approach.
            version (str, optional): Version of the dataset to use.
                Defaults to "v0.0.1".
            features (str, optional): Description of the features used in the model.
                Defaults to None.
            model_type (str, optional): Model type (e.g. Conv-Net, BERT, XGBoost).
                Defaults to None.
            reference (str, optional): Reference with more details about modeling approach.
                Defaults to None.
            implementation (str, optional): Link to implementation. Defaults to None.
            debug (bool): If True, the splitter only samples 1% of the dataset
                (``sample_frac=0.01``) for debugging. Defaults to False.
            patch_in_ds (bool): If True, the dataset will be patched into the model class
                under the `ds` attribute. Defaults to False.
        """
        # Build the dataset once and hand the same instance to both the
        # benchmark and the splitter: this avoids loading the data twice and
        # guarantees that both refer to exactly the same dataset object.
        dataset = CoREDataset(version)
        splitter = HashSplitter(
            dataset,
            stratification_col="outputs.logKH_CO2",
            sample_frac=0.01 if debug else 1.0,
        )
        super().__init__(
            model,
            ds=dataset,
            splitter=splitter,
            target=["outputs.logKH_CO2"],
            task="logKH_CO2_id",
            k=5,
            version=version,
            features=features,
            name=name,
            model_type=model_type,
            reference=reference,
            implementation=implementation,
            debug=debug,
            patch_in_ds=patch_in_ds,
        )


class LogkHCO2OODBench(MOFBenchRegression):
    """Benchmark models for the logarithmic CO2 Henry coefficient under "out-of-domain" conditions.

    "Out-of-domain" conditions mean that each of the 5 training folds only sees 4 out of the 5
    quantile bins (the bins are defined by the quantile edges ``np.linspace(0, 1, 6)``
    that are passed to the :class:`DensitySplitter`).
    This implies that 2 runs are extrapolative and the other 3 need to "fill holes in the distribution".
    """

    def __init__(
        self,
        model,
        version: Optional[str] = "v0.0.1",
        features: Optional[str] = None,
        name: Optional[str] = None,
        model_type: Optional[str] = None,
        reference: Optional[str] = None,
        implementation: Optional[str] = None,
        debug: bool = False,
        patch_in_ds: bool = False,
    ):
        """Initialize the log KH CO2 extrapolation benchmark.

        Args:
            model (object): The model to be benchmarked.
                Must implement the `fit` and `predict` methods.
            version (str, optional): Version of the dataset to use.
                Defaults to "v0.0.1".
            features (str, optional): Description of the features used in the model.
                Defaults to None.
            name (str, optional): The name of the modeling approach.
                Defaults to None.
            model_type (str, optional): Model type (e.g. Conv-Net, BERT, XGBoost).
                Defaults to None.
            reference (str, optional): Reference with more details about modeling approach.
                Defaults to None.
            implementation (str, optional): Link to implementation. Defaults to None.
            debug (bool): If True, the splitter only samples 1% of the dataset
                (``sample_frac=0.01``) for debugging. Defaults to False.
            patch_in_ds (bool): If True, the dataset will be patched into the model class
                under the `ds` attribute. Defaults to False.
        """
        # Build the dataset once and hand the same instance to both the
        # benchmark and the splitter: this avoids loading the data twice and
        # guarantees that both refer to exactly the same dataset object.
        dataset = CoREDataset(version)
        splitter = DensitySplitter(
            dataset,
            sample_frac=0.01 if debug else 1.0,
            # Six equally spaced quantile edges -> five quantile bins.
            density_q=np.linspace(0, 1, 6),
        )
        super().__init__(
            model,
            ds=dataset,
            splitter=splitter,
            target=["outputs.logKH_CO2"],
            task="logKH_CO2_ood",
            k=5,
            version=version,
            features=features,
            name=name,
            model_type=model_type,
            reference=reference,
            implementation=implementation,
            debug=debug,
            patch_in_ds=patch_in_ds,
        )