Source code for mofdscribe.datasets.qmof_dataset

# -*- coding: utf-8 -*-
"""Subset of the QMOF dataset."""
import os
from typing import Collection, Optional, Tuple

import numpy as np
import pandas as pd
from loguru import logger

from mofdscribe.constants import MOFDSCRIBE_PYSTOW_MODULE
from mofdscribe.datasets.checks import check_all_file_exists, length_check
from mofdscribe.datasets.dataset import AbstractStructureDataset
from mofdscribe.datasets.utils import compress_dataset

__all__ = ["QMOFDataset"]


[docs]class QMOFDataset(AbstractStructureDataset): """ Exposes the QMOF dataset by Rosen et al. [Rosen2021]_ [Rosen2022]_ . Currently based on v14 of the QMOF dataset. To reduce the risk of data leakage, we (by default) also only keep one representative structure for a "base refcode" (i.e. the first five letters of a refcode). For instance, the base refcode for IGAHED001 is IGAHED. Structures with same base refcode but different refcodes are often different refinements, or measurements at different temperatures and hence chemically quite similar. For instance, in the QMOF dataset `the basecode BOJKAM appears four times <https://materialsproject.org/mofs?_sort=data.lcd.value&data__csdRefcode__contains=BOJKAM>`_. Additionally, we (by default) only keep one structure per "structure hash" which is an approximate graph-isomoprhism check, assuming the VESTA bond thresholds for the derivation of the structure graph. Note that Rosen et al. already performed some deduplication using the pymatgen `StructureMatcher`. Our de-duplication is a bit more aggressive, and might be too aggressive in some cases. .. warning:: Even though we performed some basic sanity checks and Rosen et al. included checks to ensure high-fidelity structures, there might still be some structures that are not chemically reasonable. Also, even though we only keep one structure per base refcode, there is still potential for data leakge. We urge users to still drop duplicates (or close neighbors) after featurization. This dataset is available in different flavors: * ``"all"``: the full dataset, all original QMOF structures for which we could compute features and hashes * ``"csd"``: the subset which comes from the CSD and for which we could retrieve publication years. * ``"gcmc"``: the subset for which we performed grand canonical Monte Carlo simulations * ``"gcmc-csd"``: the subset for which we performed grand canonical Monte Carlo simulations and for which we could retrieve publication years. Currently, we expose the following labels: * outputs.pbe.energy_total * outputs.pbe.energy_vdw * outputs.pbe.energy_elec * outputs.pbe.net_magmom * outputs.pbe.bandgap * outputs.pbe.cbm * outputs.pbe.vbm * outputs.pbe.directgap * outputs.pbe.bandgap_spins * outputs.pbe.cbm_spins * outputs.pbe.vbm_spins * outputs.pbe.directgap_spins * outputs.hle17.energy_total * outputs.hle17.energy_vdw * outputs.hle17.energy_elec * outputs.hle17.net_magmom * outputs.hle17.bandgap * outputs.hle17.cbm * outputs.hle17.vbm * outputs.hle17.directgap * outputs.hle17.bandgap_spins * outputs.hle17.cbm_spins * outputs.hle17.vbm_spins * outputs.hle17.directgap_spins * outputs.hse06_10hf.energy_total * outputs.hse06_10hf.energy_vdw * outputs.hse06_10hf.energy_elec * outputs.hse06_10hf.net_magmom * outputs.hse06_10hf.bandgap * outputs.hse06_10hf.cbm * outputs.hse06_10hf.vbm * outputs.hse06_10hf.directgap * outputs.hse06_10hf.bandgap_spins * outputs.hse06_10hf.cbm_spins * outputs.hse06_10hf.vbm_spins * outputs.hse06_10hf.directgap_spins * outputs.hse06.energy_total * outputs.hse06.energy_vdw * outputs.hse06.energy_elec * outputs.hse06.net_magmom * outputs.hse06.bandgap * outputs.hse06.cbm * outputs.hse06.vbm * outputs.hse06.directgap * outputs.hse06.bandgap_spins * outputs.hse06.cbm_spins * outputs.hse06.vbm_spins * outputs.hse06.directgap_spins * outputs.CO2_Henry_coefficient * outputs.CO2_adsorption_energy * outputs.N2_Henry_coefficient * outputs.N2_adsorption_energy * outputs.CO2_parasitic_energy_(coal) * outputs.Gravimetric_working_capacity_(coal) * outputs.Volumetric_working_capacity_(coal) * outputs.CO2_parasitic_energy_(nat_gas) * outputs.Gravimetric_working_capacity_(nat_gas) * outputs.Volumetric_working_capacity_(nat_gas) * outputs.Final_CO2_purity_(nat_gas) * outputs.CH4_Henry_coefficient * outputs.CH4_adsorption_energy * outputs.Enthalphy_of_Adsorption__at__58_bar,_298K * outputs.Enthalphy_of_Adsorption__at__65bar--298K * outputs.Working_capacity_vol_(58-65bar--298K) * outputs.Working_capacity_mol_(58-65bar--298K) * outputs.Working_capacity_fract_(58-65bar--298K) * outputs.Working_capacity_wt%_(58-65bar--298K) * outputs.O2_Henry_coefficient * outputs.O2_adsorption_energy * outputs.Enthalphy_of_Adsorption__at__5_bar,_298K * outputs.Enthalphy_of_Adsorption__at__140bar--298K * outputs.Working_capacity_vol_(5-140bar--298K) * outputs.Working_capacity_mol_(5-140bar--298K) * outputs.Working_capacity_fract_(5-140bar--298K) * outputs.Working_capacity_wt%_(5-140bar--298K) * outputs.Xe_Henry_coefficient * outputs.Xe_adsorption_energy * outputs.Kr_Henry_coefficient * outputs.Kr_adsorption_energy * outputs.Xe--Kr_selectivity__at__298K * outputs.Working_capacity_g--L_(5-100bar--298-198K) * outputs.Working_capacity_g--L_(5-100bar--77K) * outputs.Working_capacity_g--L_(1-100bar--77K) * outputs.Working_capacity_wt%_(5-100bar--298-198K) * outputs.Working_capacity_wt%_(5-100bar--77K) * outputs.Working_capacity_wt%_(1-100bar--77K) * outputs.H2S_Henry_coefficient * outputs.H2S_adsorption_energy * outputs.H2O_Henry_coefficient * outputs.H2O_adsorption_energy * outputs.H2S--H2O_selectivity__at__298K * outputs.CH4--N2_selectivity__at__298K Note that many of the gas adsorption data are :py:obj:`numpy.nan` because the pores are not accessible to the guest molecules. Depending on your application you might want to fill them with zeros or drop them. .. warning:: The class will load almost 1GB of data into memory. .. warning:: By default, the values will be sorted by the PBE total energy References: .. [Rosen2021] `Rosen, A. S.; Iyer, S. M.; Ray, D.; Yao, Z.; Aspuru-Guzik, A.; Gagliardi, L.; Notestein, J. M.; Snurr, R. Q. Machine Learning the Quantum-Chemical Properties of Metal–Organic Frameworks for Accelerated Materials Discovery. Matter 2021, 4 (5), 1578–1597. <https://doi.org/10.1016/j.matt.2021.02.015>`_ .. [Rosen2022] `Rosen, A. S.; Fung, V.; Huck, P.; O'Donnell, C. T.; Horton, M. K.; Truhlar, D. G.; Persson, K. A.; Notestein, J. M.; Snurr, R. Q. High-Throughput Predictions of Metal–Organic Framework Electronic Properties: Theoretical Challenges, Graph Neural Networks, and Data Exploration. npj Computational Materials, 8, 112. <https://doi.org/10.1038/s41524-022-00796-6>`_ """ # we expect this len for the full dataset _files = { "v0.0.1": { "df": "https://zenodo.org/record/7031397/files/data.json?download=1", "structures": "https://zenodo.org/record/7031397/files/structures.tar.gz?download=1", "expected_length": 15042, "flavors": { "all": 15042, "csd": 6311, "gcmc": 5321, "csd-gcmc": 2295, }, } } def __init__( self, version: str = "v0.0.1", flavor: str = "all", drop_basename_duplicates: bool = True, drop_graph_duplicates: bool = True, subset: Optional[Collection[int]] = None, drop_nan: bool = False, ): """Construct an instance of the QMOF dataset. Args: version (str): version number to use. Defaults to "v0.0.1". flavor (str): flavor of the dataset to use. Accepted values are "all", "csd", "gcmc", and "csd-gcmc". Defaults to "all". drop_basename_duplicates (bool): If True, keep only one structure per CSD basename. Defaults to True. drop_graph_duplicates (bool): If True, keep only one structure per decorated graph hash. Defaults to True. subset (Optional[Collection[int]]): indices of the structures to include. This is useful for subsampling the dataset. Defaults to None. drop_nan (bool): If True, drop rows with NaN values in features or hashes. Defaults to False. Raises: ValueError: If the provided version number is not available. """ self._drop_basename_duplicates = drop_basename_duplicates self._drop_nan = drop_nan self._drop_graph_duplicates = drop_graph_duplicates self._flavor = flavor if version not in self._files: raise ValueError( f"Version {version} not available. Available versions: {list(self._files.keys())}" ) if flavor not in self._files[version]["flavors"]: raise ValueError( f"Flavor {flavor} not available. Available flavors: {list(self._files[version]['flavors'].keys())}" ) self.version = version # download the data for the largest set ("all") self._structure_dir = MOFDSCRIBE_PYSTOW_MODULE.ensure_untar( "QMOF", self.version, name="structures.tar.gz", url=self._files[version]["structures"], ) self._df = pd.DataFrame( MOFDSCRIBE_PYSTOW_MODULE.ensure_json( "QMOF", self.version, name="data.json", url=self._files[version]["df"] ) ).reset_index(drop=True) compress_dataset(self._df) length_check(self._df, self._files[version]["expected_length"]) # we sort by the PBE energy to make sure we keep always the lowest in energy self._df = self._df.sort_values(by="outputs.pbe.energy_total") if drop_basename_duplicates: old_len = len(self._df) self._df = self._df.drop_duplicates(subset=["info.basename"], keep="first") logger.debug( f"Dropped {old_len - len(self._df)} duplicate basenames. New length {len(self._df)}" ) if drop_graph_duplicates: old_len = len(self._df) self._df = self._df.drop_duplicates(subset=["info.decorated_graph_hash"], keep="first") logger.debug( f"Dropped {old_len - len(self._df)} duplicate graphs. New length {len(self._df)}" ) # select by flavor self._df = self._df[self._df[f"flavor.{self._flavor}"]] self._df = self._df.reset_index(drop=True) if drop_nan: self._df.dropna( subset=[c for c in self._df.columns if c.startswith("features.")] + [c for c in self._df.columns if c.startswith("info.")], inplace=True, ) self._df.reset_index(drop=True, inplace=True) if subset is not None: self._df = self._df.iloc[subset] self._df = self._df.reset_index(drop=True) self._structures = [ os.path.join(self._structure_dir, f + ".cif") for f in self._df["info.qmof_id"] ] check_all_file_exists(self._structures) self._years = self._df["info.year"].values self._decorated_graph_hashes = self._df["info.decorated_graph_hash"].values self._undecorated_graph_hashes = self._df["info.undecorated_graph_hash"].values self._decorated_scaffold_hashes = self._df["info.decorated_scaffold_hash"].values self._undecorated_scaffold_hashes = self._df["info.undecorated_scaffold_hash"].values self._densities = self._df["info.density"].values self._labelnames = (c for c in self._df.columns if c.startswith("outputs.")) self._featurenames = (c for c in self._df.columns if c.startswith("features.")) self._infonames = (c for c in self._df.columns if c.startswith("info."))
[docs] def get_subset(self, indices: Collection[int]) -> "AbstractStructureDataset": """Get a subset of the dataset. Args: indices (Collection[int]): indices of the structures to include. Returns: AbstractStructureDataset: a new dataset containing only the structures specified by the indices. """ return QMOFDataset( version=self.version, drop_basename_duplicates=self._drop_basename_duplicates, drop_graph_duplicates=self._drop_graph_duplicates, subset=indices, flavor=self._flavor, drop_nan=self._drop_nan, )
@property def available_info(self) -> Tuple[str]: return self._infonames @property def available_features(self) -> Tuple[str]: return self._featurenames @property def available_labels(self) -> Tuple[str]: return self._labelnames def get_labels(self, idx: Collection[int], labelnames: Collection[str] = None) -> np.ndarray: labelnames = labelnames if labelnames is not None else self._labelnames return self._df.iloc[idx][list(labelnames)].values @property def citations(self) -> Tuple[str]: return [ "@article{Rosen2021," "doi = {10.1016/j.matt.2021.02.015}," "url = {https://doi.org/10.1016/j.matt.2021.02.015}," "year = {2021}," "month = may," "publisher = {Elsevier {BV}}," "volume = {4}," "number = {5}," "pages = {1578--1597}," "author = {Andrew S. Rosen and Shaelyn M. Iyer and Debmalya Ray " "and Zhenpeng Yao and Al{'{a}}n Aspuru-Guzik and Laura Gagliardi " "and Justin M. Notestein and Randall Q. Snurr}," "title = {Machine learning the quantum-chemical properties of" "metal{\textendash}organic frameworks for accelerated materials discovery}," "journal = {Matter}" "}", "@article{Rosen2022", "title={High-throughput predictions of metal--organic framework electronic properties:" " theoretical challenges, graph neural networks, and data exploration}," "author={Rosen, Andrew S and Fung, Victor and Huck, Patrick and O’Donnell, " "Cody T and Horton, Matthew K and Truhlar, Donald G and Persson, Kristin A " "and Notestein, Justin M and Snurr, Randall Q}," "journal={npj Computational Materials}," "volume={8}," "pages={112}," "year={2022}," "publisher={Nature Publishing Group}" "}", ]