Source code for mofdscribe.featurizers.topology.ph_image

# -*- coding: utf-8 -*-
"""Implements persistent homology images."""
from collections import defaultdict
from typing import List, Optional, Tuple, Union

import numpy as np
from pymatgen.core import IMolecule, IStructure, Molecule, Structure

from mofdscribe.featurizers.base import MOFBaseFeaturizer
from mofdscribe.featurizers.utils.extend import (
    operates_on_imolecule,
    operates_on_istructure,
    operates_on_molecule,
    operates_on_structure,
)

from ._tda_helpers import (
    get_persistence_image_limits_for_structure,
    get_persistent_images_for_structure,
)


[docs]@operates_on_imolecule @operates_on_molecule @operates_on_istructure @operates_on_structure class PHImage(MOFBaseFeaturizer): r"""Vectorize persistent diagrams as image. `Adams et al. (2017) <https://www.jmlr.org/papers/volume18/16-337/16-337.pdf>`_ introduced a stable vector representation of persistent homology. In persistent images, one replaces birth–persistence pairs (b, d – b) by a Gaussians (to spread its influence across the neighborhood, since nearby points represent features of similar size). Additionally, one multiplies with special weighting function such that .. math:: f(x, y)=w(y) \sum_{(b, d) \in D_{g}(P)} N((b, d-b), \sigma) A common weighting function is the linear function :math:`w(y) = y`. One application for porous materials has been reported by `Aditi S. Krishnapriyan et al. (2017) <https://pubs.acs.org/doi/full/10.1021/acs.jpcc.0c01167>`_. Typically, persistent images are computed for all atoms in the structure. However, one can also compute persistent images for a subset of atoms. This can be done by specifying the atom types in the constructor. """ def __init__( self, atom_types: Optional[Tuple[str]] = ( "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V" "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-" "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-" "Th-Np-Lu-Rh-Pu", ), dimensions: Tuple[int] = (0, 1, 2), compute_for_all_elements: bool = True, min_size: int = 20, image_size: Tuple[int] = (20, 20), spread: float = 0.2, weight: str = "identity", max_b: Union[int, List[int]] = 18, max_p: Union[int, List[int]] = 18, max_fit_tolerence: float = 0.1, periodic: bool = False, no_supercell: bool = False, primitive: bool = False, alpha_weight: Optional[str] = None, ) -> None: """Construct a PHImage object. Args: atom_types (Tuple[str], optional): Atoms that are used to create substructures that are analysed using persistent homology. If multiple atom types separated by hash are provided, e.g. "C-H-N-O", then the substructure consists of all atoms of type C, H, N, or O. Defaults to ( "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba- Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb- Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu"). dimensions (Tuple[int]): Dimensions of topological features to consider for persistence images. Defaults to (0, 1, 2). compute_for_all_elements (bool): If true, compute persistence images for full structure (i.e. with all elements). If false, it will only do it for the substructures specified with `atom_types`. Defaults to True. min_size (int): Minimum supercell size (in Angstrom). Defaults to 20. image_size (Tuple[int]): Size of persistent image in pixel. Defaults to (20, 20). spread (float): "Smearing factor" for the Gaussians. Defaults to 0.2. weight (str): Weighting function for calculation of the persistence images. Defaults to "identity". max_b (Union[int, List[int]]): Maximum birth time. Defaults to 18. max_p (Union[int, List[int]]): Maximum persistence. Defaults to 18. max_fit_tolerence (float): If `fit` method is used to find the limits of the persistent images, one can appy a tolerance on the the found limits. The maximum will then be max + max_fit_tolerance * max. Defaults to 0.1. periodic (bool): If true, then periodic Euclidean is used in the analysis (experimental!). Defaults to False. no_supercell (bool): If true, then the supercell is not created. Defaults to False. primitive (bool): If True, the structure is reduced to its primitive form before the descriptor is computed. Defaults to False. alpha_weight (Optional[str]): If specified, the use weighted alpha shapes, i.e., replacing the points with balls of varying radii. For instance `atomic_radius_calculated` or `van_der_waals_radius`. Raises: AssertionError: If the length of the max_b and max_p is not equal to the number of dimensions. """ atom_types = [] if atom_types is None else atom_types self.atom_types = atom_types self.compute_for_all_elements = compute_for_all_elements self.min_size = min_size self.image_size = image_size self.spread = spread self.dimensions = dimensions self.weight = weight if isinstance(max_b, (list, tuple)): if len(max_b) != len(dimensions): raise AssertionError( "max_b must be a list of length equal to the number of dimensions" ) else: max_b = [max_b] * len(dimensions) if isinstance(max_p, (list, tuple)): if len(max_p) != len(dimensions): raise AssertionError( "max_p must be a list of length equal to the number of dimensions" ) else: max_p = [max_p] * len(dimensions) max_p_ = [0, 0, 0] max_b_ = [0, 0, 0] for i, dim in enumerate(dimensions): max_p_[dim] = max_p[i] max_b_[dim] = max_b[i] self.max_b = max_b_ self.max_p = max_p_ self.max_fit_tolerance = max_fit_tolerence self.periodic = periodic self.no_supercell = no_supercell self.alpha_weight = alpha_weight super().__init__(primitive=primitive)
[docs] def get_birth_persistance_death_from_pixel(self, dimension: int, x: int, y: int): """Get birth, persistence, and death from pixel coordinates. Args: dimension (int): Dimension of the topological feature. x (int): x coordinate. y (int): y coordinate. Returns: Tuple[float, float, float]: Birth, persistence, and death. """ birth_values = np.linspace(0, self.max_b[dimension], self.image_size[0]) persistance_values = np.linspace(0, self.max_p[dimension], self.image_size[1]) return birth_values[x], persistance_values[y], birth_values[x] + persistance_values[y]
def _get_feature_labels(self) -> List[str]: labels = [] _elements = list(self.atom_types) if self.compute_for_all_elements: _elements.append("all") for element in _elements: for dim in self.dimensions: for pixel_a in range(self.image_size[0]): for pixel_b in range(self.image_size[1]): labels.append(f"phimage_{element}_{dim}_{pixel_a}_{pixel_b}") return labels def find_relevant_substructure(self, structure, feature_name): parts = feature_name.split("_") # 'phimage_C-H-N-O_1_19_0' dim = int(parts[2]) birth, persistance, death = self.get_birth_persistance_death_from_pixel( dim, int(parts[4]), int(parts[3]) ) return self._find_relevant_substructure(structure, parts[1], dim, birth, persistance) def _find_relevant_substructure( self, structure: Structure, elements: str, dimension: int, birth, persistance ) -> List[Molecule]: """Find the substructure that matches a representative cycle. Done for the point on the persistence diagram that is closest to the given birth and persistence values. Args: structure (Structure): Structure to find the substructure in. elements (str): Element to find the substructure for. dimension (int): Dimension of the homology generator. birth (float): Birth of the homology generator. persistance (float): Persistence of the representative cycle. Returns: Molecule: Representative substructure. """ import dionysus as d from moleculetda.construct_pd import get_alpha_shapes, get_persistence from mofdscribe.featurizers.topology._tda_helpers import ( _coords_for_structure, _get_representative_cycles, ) from mofdscribe.featurizers.utils.substructures import filter_element if elements != "all": structure = filter_element(structure, elements.split("-")) coords, _weights, species = _coords_for_structure( structure, min_size=self.min_size, periodic=self.periodic, no_supercell=self.no_supercell, weighting=self.alpha_weight, ) f = get_alpha_shapes(coords, True, periodic=False) f = d.Filtration(f) m = get_persistence(f) cycles = _get_representative_cycles(f, m, dimension) dgms = d.init_diagrams(m, f) diagram = dgms[dimension] births, deaths, persistances, indices = [], [], [], [] for interval in diagram: births.append(interval.birth) deaths.append(interval.death) indices.append(interval.data) persistances.append(interval.death - interval.birth) births = np.array(births) deaths = np.array(deaths) indices = np.array(indices) persistances = np.array(persistances) distances = np.sqrt((births - birth) ** 2 + (persistances - persistance) ** 2) min_index = np.argmin(distances) point = indices[min_index] cycle = cycles[point] molecule = Molecule( species[cycle], coords[cycle], ) return molecule
[docs] def feature_labels(self) -> List[str]: return self._get_feature_labels()
def _featurize( self, structure: Union[Structure, IStructure, Molecule, IMolecule] ) -> np.ndarray: results = get_persistent_images_for_structure( structure, elements=self.atom_types, compute_for_all_elements=self.compute_for_all_elements, min_size=self.min_size, pixels=self.image_size, spread=self.spread, weighting=self.weight, max_b=self.max_b, max_p=self.max_p, periodic=self.periodic, no_supercell=self.no_supercell, alpha_weighting=self.alpha_weight, ) features = [] elements = list(self.atom_types) if self.compute_for_all_elements: elements.append("all") for element in elements: for dim in self.dimensions: features.append(np.array(results["image"][element][dim]).flatten()) return np.concatenate(features) def _fit(self, structures: List[Union[Structure, IStructure, Molecule, IMolecule]]) -> None: """Use structures to estimate the settings for the featurizer. Find the limits (maximum/minimum birth/death and persistence) for all the structures in the dataset and store them in the object. Args: structures (List[Union[Structure, IStructure, Molecule, IMolecule]]): List of structures to find the limits for. """ limits = defaultdict(list) for structure in structures: lim = get_persistence_image_limits_for_structure( structure, self.atom_types, self.compute_for_all_elements, self.min_size, periodic=self.periodic, no_supercell=self.no_supercell, alpha_weighting=self.alpha_weight, ) for k, v in lim.items(): limits[k].extend(v) # birth min, max persistence min, max maxp = [] maxb = [] for _, v in limits.items(): v = np.array(v) mb = np.max(v[:, 1]) mp = np.max(v[:, 3]) maxb.append(mb + self.max_fit_tolerance * mb) maxp.append(mp + self.max_fit_tolerance * mp) self.max_b = maxb self.max_p = maxp
[docs] def citations(self) -> List[str]: return [ "@article{doi:10.1021/acs.jpcc.0c01167," "author = {Krishnapriyan, Aditi S. and Haranczyk, Maciej and Morozov, Dmitriy}," "title = {Topological Descriptors Help Predict Guest Adsorption in Nanoporous Materials}," "journal = {The Journal of Physical Chemistry C}," "volume = {124}," "number = {17}," "pages = {9360-9368}," "year = {2020}," "doi = {10.1021/acs.jpcc.0c01167}," "}", "@article{krishnapriyan_machine_2021," "title={Machine learning with persistent homology and chemical word embeddings " "improves prediction accuracy and interpretability in metal-organic frameworks}," r"author={Krishnapriyan, Aditi S and Montoya, Joseph and Haranczyk, Maciej and " r"Hummelsh{\o}j, Jens and Morozov, Dmitriy}," "journal = {Scientific Reports}," "volume = {11}," "numer = {1}," "issn = {2045-2322}," "pages = {8888}," "year={2021}," "doi = {10.1038/s41598-021-88027-8}" "}", "@article{adams2017persistence," "title={Persistence images: A stable vector representation of persistent homology}," "author={Adams, Henry and Emerson, Tegan and Kirby, Michael and Neville, Rachel " "and Peterson, Chris and Shipman, Patrick and Chepushtanova, Sofya and Hanson, " "Eric and Motta, Francis and Ziegelmeier, Lori}," "journal={Journal of Machine Learning Research}," "volume={18}," "year={2017}" "}", ]
[docs] def implementors(self) -> List[str]: return ["Kevin Maik Jablonka", "Aditi Krishnapriyan"]