Source code for mofdscribe.featurizers.chemistry.amd

# -*- coding: utf-8 -*-
"""Generalized average minimum distance (AMD) featurizer."""
from typing import List, Tuple, Union

import numpy as np
from pymatgen.core import IStructure, Structure

from mofdscribe.featurizers.base import MOFBaseFeaturizer
from mofdscribe.featurizers.utils.extend import operates_on_istructure, operates_on_structure
from mofdscribe.featurizers.utils.substructures import filter_element

__all__ = ["AMD"]


[docs]@operates_on_istructure
@operates_on_structure
class AMD(MOFBaseFeaturizer):
    """Implements the generalized average minimum distance (AMD) isometry invariant [Widdowson2022]_.

    Note that it currently does not implement averages according to
    multiplicity of sites (as the original code supports).
    The generalization is to other aggregations of the PDD.

    The AMD is the average of the point-wise distance distribution (PDD) of a
    crystal. The PDD lists distances to neighbouring atoms in order, closest
    first. Hence, the kth AMD value corresponds to the average distance to the
    kth nearest neighbour.

    The descriptors can be computed over the full structure or substructures of
    certain atom types.
    """

    def __init__(
        self,
        k: int = 100,
        atom_types: Tuple[str] = (
            "C-H-N-O",
            "F-Cl-Br-I",
            "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V"
            "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-"
            "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-"
            "Th-Np-Lu-Rh-Pu",
        ),
        compute_for_all_elements: bool = True,
        aggregations: Tuple[str] = ("mean",),
        primitive: bool = True,
    ) -> None:
        """Initialize the AMD descriptor.

        Args:
            k (int): controls the number of nearest neighbour atoms considered
                for each atom in the unit cell. Defaults to 100.
            atom_types (tuple): Atoms that are used to create substructures
                for which the AMD descriptor is computed.
                Defaults to ( 'C-H-N-O', 'F-Cl-Br-I',
                'Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga-
                Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-
                Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu', ).
            compute_for_all_elements (bool): If True, compute the AMD descriptor for
                the original structure with all elements. Defaults to True.
            aggregations (tuple): Aggregations of the AMD descriptor.
                The 'mean' is equivalent to the original AMD. Defaults to ('mean',).
            primitive (bool): If True, the structure is reduced to its primitive
                form before the descriptor is computed. Defaults to True.
        """
        self.k = k
        atom_types = [] if atom_types is None else atom_types
        self.elements = atom_types
        self.atom_types = (
            list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types)
        )
        self.compute_for_all_elements = compute_for_all_elements
        self.aggregations = aggregations
        super().__init__(primitive=primitive)

    def _get_feature_labels(self) -> List[str]:
        labels = []
        for atom_type in self.atom_types:
            for agg in self.aggregations:
                for i in range(self.k):
                    labels.append(f"amd_{atom_type}_{agg}_{i}")

        return labels

[docs]    def feature_labels(self) -> List[str]:
        return self._get_feature_labels()

    def _featurize(self, structure: Union[Structure, IStructure]) -> np.ndarray:
        """Compute the AMD descriptor for a given structure.

        Args:
            structure (Union[Structure, IStructure]): Structure to compute the descriptor for.

        Returns:
            A numpy array containing the AMD descriptor.
        """
        from amd._nns import nearest_neighbours
        from amd.calculate import _extract_motif_cell
        from amd.periodicset import PeriodicSet

        def get_pdd(structure, k):
            motif, cell, asymmetric_unit, multiplicities = _extract_motif_cell(
                PeriodicSet(structure.cart_coords, structure.lattice.matrix)
            )
            pdd, _, _ = nearest_neighbours(motif, cell, asymmetric_unit, k)
            return pdd

        # ToDo: we can potentially parallelize this
        all_desc = []
        if len(self.elements) > 0:
            for element in self.elements:
                filtered_structure = filter_element(structure, element)
                if filtered_structure is not None:
                    pdd = get_pdd(filtered_structure, self.k)
                else:
                    pdd = np.empty((1, self.k))
                    pdd[:] = np.nan
                for agg in self.aggregations:
                    all_desc.append(getattr(np, agg)(pdd, axis=0))

        if self.compute_for_all_elements:
            pdd = get_pdd(structure, self.k)
            for agg in self.aggregations:
                all_desc.append(getattr(np, agg)(pdd, axis=0))

        return np.concatenate(all_desc)

[docs]    def citations(self):
        return [
            "@article{amd2022,"
            "title = {Average Minimum Distances of periodic point sets - "
            "foundational invariants for mapping periodic crystals},"
            "author = {Daniel Widdowson and Marco M Mosca and Angeles Pulido "
            "and Vitaliy Kurlin and Andrew I Cooper},"
            "journal = {MATCH Communications in Mathematical and in Computer Chemistry},"
            "doi = {10.46793/match.87-3.529W},"
            "volume = {87},"
            "number = {3},"
            "pages = {529-559},"
            "year = {2022}"
            "}"
        ]

[docs]    def implementors(self):
        return ["Kevin Maik Jablonka", "Daniel Widdowson"]