Source code for mofdscribe.featurizers.pore.geometric_properties

# -*- coding: utf-8 -*-
"""Computing pore properties using Zeo++."""
import os
import re
import subprocess
from io import StringIO
from tempfile import TemporaryDirectory
from typing import Callable, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from loguru import logger
from pymatgen.core import IStructure, Structure

from mofdscribe.featurizers.base import MOFBaseFeaturizer

from ..utils import is_tool
from ..utils.tempdir import TEMPDIR

ZEOPP_BASE_COMMAND = ["network"]
HA_COMMAND = ["-ha"]
NO_ZEOPP_WARNING = "Did not find the zeo++ network binary in the path. \
            Can not run pore analysis."


PROBE_RADII = {
    "CO2": 1.525,
    "N2": 1.655,
    "H2": 1.48,
    "CH4": 1.865,
    "O2": 1.51,
    "Xe": 1.985,
    "Kr": 1.83,
    "H2O": 1.58,
    "H2S": 1.74,
    "C6H6": 2.925,
}

__all__ = [
    "AccessibleVolume",
    "PoreDiameters",
    "PoreSizeDistribution",
    "RayTracingHistogram",
    "SurfaceArea",
]


def run_zeopp(structure: Structure, command: str, parser: Callable, ha: bool = True) -> dict:
    """Run zeopp with network -ha to find the pore diameters.

    Args:
        structure (Structure): pymatgen Structure object
        command (str): command for zeopp
        parser (Callable): function to parse the output of zeopp
        ha (bool): whether to use the 'high accuracy' :code:`-ha` flag

    Returns:
        dict: pore analysis results
    """
    if not is_tool("network"):
        logger.error(NO_ZEOPP_WARNING)
    with TemporaryDirectory(dir=TEMPDIR) as tempdir:
        structure_path = os.path.join(tempdir, "structure.cif")
        result_path = os.path.join(tempdir, "result.res")
        structure.to(filename=structure_path, fmt="cif")
        if ha:
            cmd = (
                ZEOPP_BASE_COMMAND + HA_COMMAND + command + [str(result_path), str(structure_path)]
            )
        else:
            cmd = ZEOPP_BASE_COMMAND + command + [str(result_path), str(structure_path)]
        _ = subprocess.run(  # nosec
            cmd,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            cwd=tempdir,
        )

        with open(result_path, "r") as handle:
            results = handle.read()

        zeopp_results = parser(results)

        return zeopp_results


def _parse_res_zeopp(filecontent: str) -> Tuple[List[float], List[str]]:
    """Parse the results line of a network -res call to zeopp

    Args:
        filecontent (str): results file

    Returns:
        dict: largest included sphere, largest free sphere,
            largest included sphere along free sphere path
    """
    first_line = filecontent.split("\n")[0]
    parts = first_line.split()

    results = {
        "lis": float(parts[1]),  # largest included sphere
        "lifs": float(parts[2]),  # largest free sphere
        "lifsp": float(parts[3]),  # largest included sphere along free sphere path
    }

    return results


def _parse_sa_zeopp(filecontent):
    regex_unitcell = re.compile(r"Unitcell_volume: ((\d+\.\d+)|\d+)|$")
    regex_density = re.compile(r"Density: ((\d+\.\d+)|\d+)|$")
    asa_a2 = re.compile(r"ASA_A\^2: ((\d+\.\d+)|\d+)|$")
    asa_m2cm3 = re.compile(r"ASA_m\^2/cm\^3: ((\d+\.\d+)|\d+)|$")
    asa_m2g = re.compile(r"ASA_m\^2/g: ((\d+\.\d+)|\d+)|$")
    nasa_a2 = re.compile(r"NASA_A\^2: ((\d+\.\d+)|\d+)|$")
    nasa_m2cm3 = re.compile(r"NASA_m\^2/cm\^3: ((\d+\.\d+)|\d+)|$")
    nasa_m2g = re.compile(r"NASA_m\^2/g: ((\d+\.\d+)|\d+)|$")

    d = {
        "unitcell_volume": float(re.findall(regex_unitcell, filecontent)[0][0]),
        "density": float(re.findall(regex_density, filecontent)[0][0]),
        "asa_a2": float(re.findall(asa_a2, filecontent)[0][0]),
        "asa_m2cm3": float(re.findall(asa_m2cm3, filecontent)[0][0]),
        "asa_m2g": float(re.findall(asa_m2g, filecontent)[0][0]),
        "nasa_a2": float(re.findall(nasa_a2, filecontent)[0][0]),
        "nasa_m2cm3": float(re.findall(nasa_m2cm3, filecontent)[0][0]),
        "nasa_m2g": float(re.findall(nasa_m2g, filecontent)[0][0]),
    }

    return d


def _parse_volpo_zeopp(filecontent):
    regex_unitcell = re.compile(r"Unitcell_volume: ((\d+\.\d+)|\d+)|$")
    regex_density = re.compile(r"Density: ((\d+\.\d+)|\d+)|$")
    av_a3 = re.compile(r"AV_A\^3: ((\d+\.\d+)|\d+)|$")
    av_volume_fraction = re.compile(r"AV_Volume_fraction: ((\d+\.\d+)|\d+)|$")
    av_cm3g = re.compile(r"AV_cm\^3/g: ((\d+\.\d+)|\d+)|$")
    nav_a3 = re.compile(r"NAV_A\^3: ((\d+\.\d+)|\d+)|$")
    nav_volume_fraction = re.compile(r"NAV_Volume_fraction: ((\d+\.\d+)|\d+)|$")
    nav_cm3g = re.compile(r"NAV_cm\^3/g: ((\d+\.\d+)|\d+)|$")

    d = {
        "unitcell_volume": float(re.findall(regex_unitcell, filecontent)[0][0]),
        "density": float(re.findall(regex_density, filecontent)[0][0]),
        "av_a3": float(re.findall(av_a3, filecontent)[0][0]),
        "av_volume_fraction": float(re.findall(av_volume_fraction, filecontent)[0][0]),
        "av_cm3g": float(re.findall(av_cm3g, filecontent)[0][0]),
        "nav_a3": float(re.findall(nav_a3, filecontent)[0][0]),
        "nav_volume_fraction": float(re.findall(nav_volume_fraction, filecontent)[0][0]),
        "nav_cm3g": float(re.findall(nav_cm3g, filecontent)[0][0]),
    }

    return d


def _parse_ray_hist_zeopp(filecontent):
    return [float(i) for i in filecontent.split("\n")[1:-1]]


def _parse_psd_zeopp(filecontent):
    return pd.read_csv(
        StringIO(filecontent),
        skiprows=11,
        names=["bin", "count", "cumulative", "derivative"],
        sep=r"\s+",
    )


[docs]class PoreDiameters(MOFBaseFeaturizer): """Calculate the pore diameters of a framework.""" def __init__( self, ha: bool = True, primitive: bool = True, ): """Initialize the featurizer. Args: ha (bool): if True, run zeo++ with the "high accuracy" :code:`-ha` flag. It has been reported that this can lead to issues for some structures. Default is True. primitive (bool): If True, the structure is reduced to its primitive form before the descriptor is computed. Defaults to True. """ self.labels = ["lis", "lifs", "lifsp"] self.ha = ha super().__init__(primitive=primitive) def _featurize(self, s): result = run_zeopp(s, ["-res"], _parse_res_zeopp, self.ha) return np.array(list(result.values()))
[docs] def feature_labels(self): return self.labels
[docs] def citations(self): return [ "@article{Willems2012," "doi = {10.1016/j.micromeso.2011.08.020}," "url = {https://doi.org/10.1016/j.micromeso.2011.08.020}," "year = {2012}," "month = feb," "publisher = {Elsevier {BV}}," "volume = {149}," "number = {1}," "pages = {134--141}," "author = {Thomas F. Willems and Chris H. Rycroft and Michaeel Kazi " "and Juan C. Meza and Maciej Haranczyk}," "title = {Algorithms and tools for high-throughput geometry-based " "analysis of crystalline porous materials}," "journal = {Microporous and Mesoporous Materials}" "}" ]
[docs] def implementors(self): return ["Kevin Maik Jablonka", "Maciej Haranczyk and Zeo++ authors"]
[docs]class SurfaceArea(MOFBaseFeaturizer): def __init__( self, probe_radius: Union[str, float] = 0.1, num_samples: int = 100, channel_radius: Union[str, float, None] = None, ha: bool = True, primitive: bool = True, ): """Initialize the SurfaceArea featurizer. Args: probe_radius (Union[str, float]): Radius of the probe. Defaults to 0.1. num_samples (int): Number of samples. Defaults to 100. channel_radius (Union[str, float, None]): Channel radius. Should equal to `probe_radius`. Defaults to None. ha (bool): if True, run zeo++ with the "high accuracy" :code:`-ha` flag. It has been reported that this can lead to issues for some structures. Default is True. primitive (bool): If True, the structure is reduced to its primitive form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( "Probe radius and channel radius are different. This is a highly unusual setting." ) if isinstance(probe_radius, str): try: probe_radius = PROBE_RADII[probe_radius] except KeyError: logger.error(f"Probe radius {probe_radius} not found in PROBE_RADII") if channel_radius is None: channel_radius = probe_radius self.ha = ha self.probe_radius = probe_radius self.num_samples = num_samples self.channel_radius = channel_radius labels = [ "uc_volume", "density", "asa_a2", "asa_m2cm3", "asa_m2g", "nasa_a2", "nasa_m2cm3", "nasa_m2g", ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] super().__init__(primitive=primitive) def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: command = [ "-sa", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}", ] results = run_zeopp(s, command, _parse_sa_zeopp, self.ha) return np.array(list(results.values()))
[docs] def feature_labels(self) -> List[str]: return self.labels
[docs] def citations(self) -> List[str]: return [ "@article{Willems2012," "doi = {10.1016/j.micromeso.2011.08.020}," "url = {https://doi.org/10.1016/j.micromeso.2011.08.020}," "year = {2012}," "month = feb," "publisher = {Elsevier {BV}}," "volume = {149}," "number = {1}," "pages = {134--141}," "author = {Thomas F. Willems and Chris H. Rycroft and Michaeel Kazi " "and Juan C. Meza and Maciej Haranczyk}," "title = {Algorithms and tools for high-throughput geometry-based " "analysis of crystalline porous materials}," "journal = {Microporous and Mesoporous Materials}" "}" ]
[docs] def implementors(self) -> List[str]: return ["Kevin Maik Jablonka", "Maciej Haranczyk and Zeo++ authors"]
[docs]class AccessibleVolume(MOFBaseFeaturizer): def __init__( self, probe_radius: Union[str, float] = 0.1, num_samples: int = 100, channel_radius: Union[str, float, None] = None, ha: bool = True, primitive: bool = True, ): """Initialize the AccessibleVolume featurizer. Args: probe_radius (Union[str, float]): Radius of the probe. Defaults to 0.1. num_samples (int): Number of samples. Defaults to 100. channel_radius (Union[str, float, None]): Channel radius. Should equal to `probe_radius`. Defaults to None. ha (bool): if True, run zeo++ with the "high accuracy" :code:`-ha` flag. It has been reported that this can lead to issues for some structures. Default is True. primitive (bool): If True, the structure is reduced to its primitive form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( "Probe radius and channel radius are different. This is a highly unusual setting." ) if isinstance(probe_radius, str): try: probe_radius = PROBE_RADII[probe_radius] except KeyError: logger.error(f"Probe radius {probe_radius} not found in PROBE_RADII") if channel_radius is None: channel_radius = probe_radius self.ha = ha self.probe_radius = probe_radius self.num_samples = num_samples self.channel_radius = channel_radius labels = [ "uc_volume", "density", "av_a2", "av_volume_fraction", "av_cm3g", "nav_a3", "nav_volume_fraction", "nav_cm3g", ] self.labels = [f"{label}_{self.probe_radius}" for label in labels] super().__init__(primitive=primitive) def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: command = ["-vol", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}"] results = run_zeopp(s, command, _parse_volpo_zeopp, self.ha) return np.array(list(results.values()))
[docs] def feature_labels(self) -> List[str]: return self.labels
[docs] def citations(self) -> List[str]: return [ "@article{Willems2012," "doi = {10.1016/j.micromeso.2011.08.020}," "url = {https://doi.org/10.1016/j.micromeso.2011.08.020}," "year = {2012}," "month = feb," "publisher = {Elsevier {BV}}," "volume = {149}," "number = {1}," "pages = {134--141}," "author = {Thomas F. Willems and Chris H. Rycroft and Michaeel Kazi " "and Juan C. Meza and Maciej Haranczyk}," "title = {Algorithms and tools for high-throughput geometry-based " "analysis of crystalline porous materials}," "journal = {Microporous and Mesoporous Materials}" "}", "@article{Ongari2017," "doi = {10.1021/acs.langmuir.7b01682}," "url = {https://doi.org/10.1021/acs.langmuir.7b01682}," "year = {2017}," "month = jul," "publisher = {American Chemical Society ({ACS})}," "volume = {33}," "number = {51}," "pages = {14529--14538}," "author = {Daniele Ongari and Peter G. Boyd and Senja Barthel " "and Matthew Witman and Maciej Haranczyk and Berend Smit}," "title = {Accurate Characterization of the Pore Volume in " "Microporous Crystalline Materials}," "journal = {Langmuir}" "}", ]
[docs] def implementors(self) -> List[str]: return ["Kevin Maik Jablonka", "Maciej Haranczyk and Zeo++ authors"]
[docs]class RayTracingHistogram(MOFBaseFeaturizer): """Describe pore structures using histograms of ray lengths. The algorithm (implemented in `zeo++ <http://www.zeoplusplus.org/>`_) shoots random rays through the accesible volume of the cell until the ray hits atoms, and it records their lenghts to provide the corresponding histogram. Such ray histograms are supposed to encode the shape, topology, distribution and size of voids. Currently, the histogram is hard-coded to be of length 1000 (in zeo++ itself). """ def __init__( self, probe_radius: Union[str, float] = 0.0, num_samples: int = 50000, channel_radius: Optional[Union[str, float]] = None, ha: bool = True, primitive: bool = True, ) -> None: """Initialize the RayTracingHistogram featurizer. Args: probe_radius (Union[str, float]): Used to estimate the accessible volume. Only the accessible volume is then considered for the histogram. Defaults to 0.0. num_samples (int): Number of rays that are placed through sample. Original publication used 1,000,000 sample points for IZA zeolites and 100,000 sample points for hypothetical zeolites. Larger numbers increase the runtime Defaults to 50000. channel_radius (Union[str, float, None]): Radius of a probe used to determine accessibility of the void space. Should typically equal the radius of the `probe_radius`. If set to `None`, we will use the `probe_radius`. Defaults to None. ha (bool): if True, run zeo++ with the "high accuracy" :code:`-ha` flag. It has been reported that this can lead to issues for some structures. Default is True. primitive (bool): If True, the structure is reduced to its primitive form before the descriptor is computed. Defaults to True. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( "Probe radius and channel radius are different. This is a highly unusual setting." ) if isinstance(probe_radius, str): try: probe_radius = PROBE_RADII[probe_radius] except KeyError: logger.error(f"Probe radius {probe_radius} not found in PROBE_RADII") if channel_radius is None: channel_radius = probe_radius self.probe_radius = probe_radius self.num_samples = num_samples self.channel_radius = channel_radius self.ha = ha super().__init__(primitive=primitive)
[docs] def feature_labels(self) -> List[str]: return [f"ray_hist_{self.probe_radius}_{i}" for i in range(1000)]
def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: command = [ "-ray_atom", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}", ] results = run_zeopp(s, command, _parse_ray_hist_zeopp, self.ha) return np.array(results)
[docs] def citations(self) -> List[str]: return [ "@article{Jones2013," "doi = {10.1016/j.micromeso.2013.07.033}," "url = {https://doi.org/10.1016/j.micromeso.2013.07.033}," "year = {2013}," "month = nov," "publisher = {Elsevier {BV}}," "volume = {181}," "pages = {208--216}," "author = {Andrew J. Jones and Christopher Ostrouchov and Maciej Haranczyk " "and Enrique Iglesia}," "title = {From rays to structures: Representation and selection of " "void structures in zeolites using stochastic methods}," "journal = {Microporous and Mesoporous Materials}" "}", "@article{Willems2012," "doi = {10.1016/j.micromeso.2011.08.020}," "url = {https://doi.org/10.1016/j.micromeso.2011.08.020}," "year = {2012}," "month = feb," "publisher = {Elsevier {BV}}," "volume = {149}," "number = {1}," "pages = {134--141}," "author = {Thomas F. Willems and Chris H. Rycroft and Michaeel Kazi " "and Juan C. Meza and Maciej Haranczyk}," "title = {Algorithms and tools for high-throughput geometry-based " "analysis of crystalline porous materials}," "journal = {Microporous and Mesoporous Materials}" "}", ]
[docs] def implementors(self) -> List[str]: return ["Kevin Maik Jablonka", "Maciej Haranczyk and Zeo++ authors"]
[docs]class PoreSizeDistribution(MOFBaseFeaturizer): """Describe structures using histograms of pore sizes. The pore size distribution describes how much of the void space corresponds to certain pore sizes. Pinheiro et al. (2013) concluded that they are "sensitive to small changes in pore diameter" but do "not reflect subtle changes in features such as the surface texture of a pore". We use the implementation in `zeo++ <http://www.zeoplusplus.org/>`_ to calculate the pore size distribution. The pore size distribution has been used by the group of Gómez-Gualdrón as pore size standard deviation (PSSD) in, for example, `10.1021/acs.jctc.9b00940 <https://pubs.acs.org/doi/10.1021/acs.jctc.9b00940>`_ and `10.1063/5.0048736 <https://aip.scitation.org/doi/10.1063/5.0048736>`_. Currently, the histogram is hard-coded to be of length 1000 between 0 and 100 Angstrom (in zeo++ itself). """ def __init__( self, probe_radius: Union[str, float] = 0.0, num_samples: int = 5000, channel_radius: Optional[Union[str, float]] = None, hist_type: str = "derivative", ha: bool = False, primitive: bool = True, ) -> None: """Initialize the PoreSizeDistribution featurizer. Args: probe_radius (Union[str, float]): Used to estimate the accessible volume. Only the accessible volume is then considered for the histogram. Defaults to 0.0. num_samples (int): Number of rays that are placed through sample. Original publication used 1,000,000 sample points for IZA zeolites and 100,000 sample points for hypothetical zeolites. Larger numbers increase the runtime. Defaults to 50000. channel_radius (Union[str, float, None]): Radius of a probe used to determine accessibility of the void space. Should typically equal the radius of the `probe_radius`. If set to `None`, we will use the `probe_radius`. Defaults to None. hist_type (str): Type of the histogram. Available options `count`, `cumulative`, `derivative`. (The derivative distribution describes the change in the cumulative distribution with respect to pore size). Defaults to "derivative". ha (bool): if True, run zeo++ with the "high accuracy" :code:`-ha` flag. It has been reported that this can lead to issues for some structures. Default is True. primitive (bool): If True, the structure is reduced to its primitive form before the descriptor is computed. Defaults to True. Raises: ValueError: If type not one of 'count', 'cumulative', 'derivative'. """ if channel_radius is not None and probe_radius != channel_radius: logger.warning( "Probe radius and channel radius are different. This is a highly unusual setting." ) if isinstance(probe_radius, str): try: probe_radius = PROBE_RADII[probe_radius] except KeyError: logger.error(f"Probe radius {probe_radius} not found in PROBE_RADII") if channel_radius is None: channel_radius = probe_radius self.type = hist_type.lower() if self.type not in [ "count", "cumulative", "derivative", ]: raise ValueError( "Invalid histogram type, must be one of `count`, `cumulative`, `derivative`" ) self.probe_radius = probe_radius self.num_samples = num_samples self.channel_radius = channel_radius self.ha = ha super().__init__(primitive=primitive)
[docs] def feature_labels(self) -> List[str]: return [f"psd_hist_{self.probe_radius}_{i}" for i in range(1000)]
def _featurize(self, s: Union[Structure, IStructure]) -> np.ndarray: command = [ "-psd", f"{self.channel_radius}", f"{self.probe_radius}", f"{self.num_samples}", ] results = run_zeopp(s, command, _parse_psd_zeopp, self.ha) return results[self.type].values
[docs] def citations(self) -> List[str]: return [ "@article{Pinheiro2013," "doi = {10.1016/j.jmgm.2013.05.007}," "url = {https://doi.org/10.1016/j.jmgm.2013.05.007}," "year = {2013}," "month = jul," "publisher = {Elsevier {BV}}," "volume = {44}," "pages = {208--219}," "author = {Marielle Pinheiro and Richard L. Martin and " "Chris H. Rycroft and Andrew Jones and Enrique Iglesia and Maciej Haranczyk}," "title = {Characterization and comparison of pore landscapes " "in crystalline porous materials}," "journal = {Journal of Molecular Graphics and Modelling}" "}", "@article{Willems2012," "doi = {10.1016/j.micromeso.2011.08.020}," "url = {https://doi.org/10.1016/j.micromeso.2011.08.020}," "year = {2012}," "month = feb," "publisher = {Elsevier {BV}}," "volume = {149}," "number = {1}," "pages = {134--141}," "author = {Thomas F. Willems and Chris H. Rycroft and Michaeel Kazi " "and Juan C. Meza and Maciej Haranczyk}," "title = {Algorithms and tools for high-throughput geometry-based " "analysis of crystalline porous materials}," "journal = {Microporous and Mesoporous Materials}" "}", ]
[docs] def implementors(self): return ["Kevin Maik Jablonka", "Maciej Haranczyk and Zeo++ authors"]