Source code for mofdscribe.featurizers.bu.smarts_matches

"""Featurize a molecule using SMARTS matches."""
from functools import partial
from typing import Collection, List, Optional

from rdkit import Chem

from mofdscribe.featurizers.bu.rdkitadaptor import RDKitAdaptor


def number_smart_matches(mol, smarts: Collection[str]) -> int:
    """Count the number of SMARTS matches in a molecule.

    This can be useful if we have some prior knowledge about which
    substructures might be interesting/relevant.

    Every pattern is wrapped as a recursive SMARTS expression (``$(...)``)
    and all of them are OR-combined into one bracket atom,
    ``[$(p1),$(p2),...]``. The query therefore consists of a single atom,
    so the returned number is the count of atoms that satisfy at least one
    of the patterns, not the sum of the per-pattern match counts.

    Args:
        mol (rdkit.Chem.rdchem.Mol): RDKit molecule.
        smarts (Collection[str]): SMARTS patterns to match.

    Returns:
        int: Number of SMARTS matches. ``0`` if no patterns are given.
    """
    # Materialize once so that any iterable is supported and can be
    # checked for emptiness before it is consumed by ``join``.
    patterns = list(smarts)
    if not patterns:
        # Without patterns the query string would be "[]", which is not a
        # valid SMARTS atom expression; nothing can match, so report zero.
        return 0

    combined = ",".join(f"$({pattern})" for pattern in patterns)
    query = Chem.MolFromSmarts(f"[{combined}]")
    return len(mol.GetSubstructMatches(query))
class SmartsMatchCounter(RDKitAdaptor):
    """Count the number of SMARTS matches in a molecule.

    This can be useful if we have some prior knowledge about which
    substructures might be interesting/relevant.
    For instance, you might want to count the number of carboxylic acid
    groups in a molecule.
    """

    # ToDo: Perhaps normalize by the length of the SMARTS substructure
    def __init__(
        self,
        smarts: Collection[str],
        feature_labels: Optional[Collection[str]] = None,
    ) -> None:
        """Construct a new SmartsMatchCounter.

        Args:
            smarts (Collection[str]): SMARTS patterns to match.
            feature_labels (Collection[str], optional): Feature labels.
                If None, a single label is built by joining the SMARTS
                patterns with underscores and prefixing ``smarts_``.
                Defaults to None.
        """
        # Bind the patterns so the featurizer only needs the molecule.
        featurizer = partial(number_smart_matches, smarts=smarts)
        if feature_labels is None:
            smarts_string = "_".join(smarts)
            feature_labels = [f"smarts_{smarts_string}"]
        super().__init__(featurizer, feature_labels)
class AcidGroupCounter(SmartsMatchCounter):
    """Featurizer that counts acidic groups in a molecule.

    The SMARTS patterns are the ones used by the Mordred package.
    """

    def __init__(self) -> None:
        """Set up the counter with the fixed acid-group SMARTS patterns."""
        acid_patterns = [
            "[O;H1]-[C,S,P]=O",
            "[*;-;!$(*~[*;+])]",
            "[NH](S(=O)=O)C(F)(F)F",
            "n1nnnc1",
        ]
        super().__init__(smarts=acid_patterns, feature_labels=["acid_groups"])

    def citations(self) -> List[str]:
        """Return the inherited citations plus the Mordred BibTeX entry."""
        mordred_bibtex = (
            "@article{Moriwaki_2018,"
            "doi = {10.1186/s13321-018-0258-y},"
            "url = {https://doi.org/10.1186%2Fs13321-018-0258-y},"
            "year = 2018,"
            "month = {feb},"
            "publisher = {Springer Science and Business Media {LLC}},"
            "volume = {10},"
            "number = {1},"
            "author = {Hirotomo Moriwaki and Yu-Shi Tian and Norihito Kawashita and Tatsuya Takagi},"
            "title = {Mordred: a molecular descriptor calculator},"
            "journal = {J Cheminform}"
            "}"
        )
        inherited = super().citations()
        return inherited + [mordred_bibtex]

    def implementors(self) -> List[str]:
        """Return the inherited implementors plus the Mordred authors."""
        mordred_authors = "Moriwaki H, Tian Y-S, Kawashita N, Takagi T"
        inherited = super().implementors()
        return inherited + [mordred_authors]
class BaseGroupCounter(SmartsMatchCounter):
    """Featurizer that counts basic groups in a molecule.

    The SMARTS patterns are the ones used by the Mordred package.
    """

    def __init__(self) -> None:
        """Set up the counter with the fixed base-group SMARTS patterns."""
        base_patterns = [
            "[NH2]-[CX4]",
            "[NH](-[CX4])-[CX4]",
            "N(-[CX4])(-[CX4])-[CX4]",
            "[*;+;!$(*~[*;-])]",
            "N=C-N",
            "N-C=N",
        ]
        super().__init__(smarts=base_patterns, feature_labels=["base_groups"])

    def citations(self) -> List[str]:
        """Return the inherited citations plus the Mordred BibTeX entry."""
        mordred_bibtex = (
            "@article{Moriwaki_2018,"
            "doi = {10.1186/s13321-018-0258-y},"
            "url = {https://doi.org/10.1186%2Fs13321-018-0258-y},"
            "year = 2018,"
            "month = {feb},"
            "publisher = {Springer Science and Business Media {LLC}},"
            "volume = {10},"
            "number = {1},"
            "author = {Hirotomo Moriwaki and Yu-Shi Tian and Norihito Kawashita and Tatsuya Takagi},"
            "title = {Mordred: a molecular descriptor calculator},"
            "journal = {J Cheminform}"
            "}"
        )
        inherited = super().citations()
        return inherited + [mordred_bibtex]

    def implementors(self) -> List[str]:
        """Return the inherited implementors plus the Mordred authors."""
        mordred_authors = "Moriwaki H, Tian Y-S, Kawashita N, Takagi T"
        inherited = super().implementors()
        return inherited + [mordred_authors]