Source code for mofdscribe.featurizers.bu.compositionstats_featurizer

# -*- coding: utf-8 -*-
"""Describe the chemical composition of structures."""
from collections import defaultdict
from typing import List, Tuple, Union

import numpy as np
from element_coder import encode
from matminer.featurizers.base import BaseFeaturizer
from pymatgen.core import IMolecule, IStructure, Molecule, Structure

from mofdscribe.featurizers.utils.aggregators import ARRAY_AGGREGATORS
from mofdscribe.featurizers.utils.extend import (
    operates_on_imolecule,
    operates_on_istructure,
    operates_on_molecule,
    operates_on_structure,
)


@operates_on_molecule
@operates_on_imolecule
@operates_on_istructure
@operates_on_structure
class CompositionStats(BaseFeaturizer):
    """
    Describe the composition of molecules by computing statistics of their compositions.

    The featurizer will encode the element on all sites in the structure
    using user-defined encodings. Then it aggregates those encodings using
    user-defined aggregations (e.g. min, max, mean).
    """

    def __init__(
        self,
        encodings: Tuple[str, ...] = ("mod_pettifor", "X"),
        aggregations: Tuple[str, ...] = ("mean", "std", "max", "min"),
    ) -> None:
        """Initialize a CompositionStats featurizer.

        Args:
            encodings (Tuple[str, ...]): Encodings used for the elements.
                Each can be one of :py:obj:`element_coder.data.coding_data._PROPERTY_KEYS`.
                Defaults to ("mod_pettifor", "X").
            aggregations (Tuple[str, ...]): Statistics to compute over the element encodings.
                Each must be a key of
                :py:obj:`mofdscribe.featurizers.utils.aggregators.ARRAY_AGGREGATORS`.
                Defaults to ("mean", "std", "max", "min").
        """
        self.aggregations = aggregations
        self.encodings = encodings

    def feature_labels(self) -> List[str]:
        """Return one label per (encoding, aggregation) pair.

        The labels are ordered encoding-major, aggregation-minor, which is
        the same order in which :py:meth:`featurize` emits its values.

        Returns:
            List[str]: Labels of the form ``composition_stats_{encoding}_{agg}``.
        """
        return [
            f"composition_stats_{encoding}_{agg}"
            for encoding in self.encodings
            for agg in self.aggregations
        ]

    def featurize(
        self, molecule: Union[Molecule, IMolecule, Structure, IStructure]
    ) -> np.ndarray:
        """Compute the aggregated element encodings over all sites.

        Args:
            molecule (Union[Molecule, IMolecule, Structure, IStructure]):
                Object whose ``sites`` are encoded.

        Returns:
            np.ndarray: One value per (encoding, aggregation) pair, in the
                order given by :py:meth:`feature_labels`.
        """
        features = []
        for encoding in self.encodings:
            # Collect the encoded value of *every* site. Assigning the value
            # to a shared slot (instead of accumulating it) would leave only
            # the last site's encoding, making every statistic degenerate.
            site_values = [encode(site.specie, encoding) for site in molecule.sites]
            for agg in self.aggregations:
                features.append(ARRAY_AGGREGATORS[agg](site_values))

        return np.array(features)

    def implementors(self) -> List[str]:
        """Return the names of the people who implemented this featurizer."""
        return ["Kevin Maik Jablonka"]

    def citations(self) -> List[str]:
        """Return the citations for this featurizer (none are listed)."""
        return []