Source code for mofdscribe.featurizers.bu.distance_hist_featurizer

# -*- coding: utf-8 -*-
"""Describe molecules by computing a histogram of pairwise distances between their atoms."""
from itertools import combinations
from typing import List, Union

import numpy as np
from matminer.featurizers.base import BaseFeaturizer
from pymatgen.core import IMolecule, IStructure, Molecule, Structure

from mofdscribe.featurizers.utils.extend import (
    operates_on_imolecule,
    operates_on_istructure,
    operates_on_molecule,
    operates_on_structure,
)
from mofdscribe.featurizers.utils.histogram import get_rdf


@operates_on_molecule
@operates_on_imolecule
@operates_on_istructure
@operates_on_structure
class PairwiseDistanceHist(BaseFeaturizer):
    """
    Describe the shape of molecules by computing a histogram of pairwise distances.

    For doing so, we will just compute all pairwise distances and then compute
    the histogram of them.
    One might also think of this as a pretty rough approximation of something like
    the AMD fingerprint.

    It also has some similarities to the "Grouped representation of interatomic distances"
    reported in [Zhang2022]_.
    """

    def __init__(
        self,
        lower_bound: float = 0.0,
        upper_bound: float = 15.0,
        bin_size: float = 0.5,
        density: bool = True,
    ) -> None:
        """Create a new PairwiseDistanceHist featurizer.

        Args:
            lower_bound (float): Lower bound of the histogram.
                Defaults to 0.0.
            upper_bound (float): Upper bound of the histogram.
                Defaults to 15.0.
            bin_size (float): Size of the bins.
                Defaults to 0.5.
            density (bool): Whether to return the density or the counts.
                Defaults to True.
        """
        # Only store the settings here; they are consumed lazily in
        # `_get_grid` and `featurize`.
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.bin_size = bin_size
        self.density = density

    def _get_grid(self) -> np.ndarray:
        """Return the grid used to name the features.

        Returns:
            np.ndarray: Values from ``lower_bound`` (inclusive) to
                ``upper_bound`` (exclusive) in steps of ``bin_size``.
        """
        return np.arange(self.lower_bound, self.upper_bound, self.bin_size)

    def feature_labels(self) -> List[str]:
        """Return one label per grid point, suffixed with the grid value.

        Returns:
            List[str]: Labels of the form ``pairwise_distance_hist_<value>``.
        """
        # NOTE(review): presumably the grid has as many points as `get_rdf`
        # returns values for the same bounds and bin size -- confirm against
        # `get_rdf`.
        return [f"pairwise_distance_hist_{a}" for a in self._get_grid()]

    def featurize(self, structure: Union[Molecule, IMolecule, Structure, IStructure]) -> np.ndarray:
        """Compute the histogram of all pairwise site distances.

        Args:
            structure (Union[Molecule, IMolecule, Structure, IStructure]):
                Object whose sites are paired up; distances are obtained
                from its ``get_distance(i, j)`` method.

        Returns:
            np.ndarray: The output of ``get_rdf`` for the collected distances.
        """
        # `combinations` yields every index pair (i, j) with i < j exactly
        # once, in lexicographic order, so no pair is visited and then
        # thrown away.
        # NOTE(review): an input with fewer than two sites gives an empty
        # list here; how `get_rdf` treats that is not visible from this file.
        distances = [
            structure.get_distance(i, j) for i, j in combinations(range(len(structure)), 2)
        ]

        features = get_rdf(
            distances,
            lower_lim=self.lower_bound,
            upper_lim=self.upper_bound,
            bin_size=self.bin_size,
            density=self.density,
            normalized=False,
        )
        return features

    def implementors(self) -> List[str]:
        """Return the names of the people who implemented this featurizer."""
        return ["Kevin Maik Jablonka"]

    def citations(self) -> List[str]:
        """Return the list of citations for this featurizer (currently empty)."""
        return []