Source code for mofdscribe.featurizers.bu.nconf20_featurizer

# -*- coding: utf-8 -*-
"""Code taken from the SI for 10.1021/acs.jcim.6b00565"""
from collections import OrderedDict
from typing import List

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

from .rdkitadaptor import RDKitAdaptor


# ToDo: Allow to set some of the options, e.g. for UFF
def _generate_conformers(mol, num_confs):
    # Add H atoms to skeleton
    molecule = Chem.AddHs(mol)
    conformer_integers = []
    # Embed and optimise the conformers
    conformers = AllChem.EmbedMultipleConfs(molecule, num_confs, pruneRmsThresh=0.5, numThreads=5)

    optimised_and_energies = AllChem.MMFFOptimizeMoleculeConfs(
        molecule, maxIters=600, numThreads=5, nonBondedThresh=100.0
    )
    energy_dict_with_key_was_id = {}
    final_conformers_to_use = {}
    # Only keep the conformers which were successfully fully optimised

    for conformer in conformers:
        optimised, energy = optimised_and_energies[conformer]
        if optimised == 0:
            energy_dict_with_key_was_id[conformer] = energy
            conformer_integers.append(conformer)
    # Keep the lowest energy conformer
    lowestenergy = min(energy_dict_with_key_was_id.values())
    for key, value in energy_dict_with_key_was_id.items():
        if value == lowestenergy:
            lowest_energy_conformer_id = key
    final_conformers_to_use[lowest_energy_conformer_id] = lowestenergy

    # Remove H atoms to speed up substructure matching
    molecule = AllChem.RemoveHs(molecule)
    # Find all substructure matches of the molecule with itself,
    # to account for symmetry
    matches = molecule.GetSubstructMatches(molecule, uniquify=False)
    maps = [list(enumerate(match)) for match in matches]
    # Loop over conformers other than the lowest energy one
    for conformer_id, _ in energy_dict_with_key_was_id.items():
        okay_to_add = True
        for finalconformer_id in final_conformers_to_use:
            rms = AllChem.GetBestRMS(molecule, molecule, finalconformer_id, conformer_id, maps)
            if rms < 1.0:
                okay_to_add = False
                break

        if okay_to_add:
            final_conformers_to_use[conformer_id] = energy_dict_with_key_was_id[conformer_id]

    sorted_dictionary = OrderedDict(sorted(final_conformers_to_use.items(), key=lambda t: t[1]))
    energies = list(sorted_dictionary.values())

    return energies


def _calc_nconf20(energy_list):
    energy_descriptor = 0

    relative_energies = np.array(energy_list) - energy_list[0]

    for energy in relative_energies[1:]:
        if 0 <= energy < 20:
            energy_descriptor += 1

    return energy_descriptor


def _n_conf20(mol):
    try:
        energy_list = _generate_conformers(mol, 100)
        descriptor = _calc_nconf20(energy_list)
        return np.array([descriptor])
    except Exception:
        return np.array([np.nan])


[docs]class NConf20(RDKitAdaptor):
    """
    Compute the nConf20 descriptor for a molecule.

    This descriptor attempts to capture the flexibility of molecules
    by sampling the conformational space. The descriptor is a count
    of the "accessible" conformers (based on relative conformer energies up to 20 kcal/mol,
    the lowest energy conformer is not counted).
    Conformers are generated using the RDKit conformer generator.

    ... warning::
        Part of the featurization is a geometry optimization using the MMFF
        force field. This will naturally fail for some molecules, and for all metal clusters.
        In these cases, the descriptor will be set to NaN.
    """

    def __init__(self):
        """Construct a new nConf20 featurizer."""
        super().__init__(featurizer=_n_conf20, feature_labels=["n_conf20"])

[docs]    def citations(self) -> List[str]:
        return super().citations() + [
            "@article{Wicker2016,"
            "doi = {10.1021/acs.jcim.6b00565},"
            "url = {https://doi.org/10.1021/acs.jcim.6b00565},"
            "year = {2016},"
            "month = dec,"
            "publisher = {American Chemical Society ({ACS})},"
            "volume = {56},"
            "number = {12},"
            "pages = {2347--2352},"
            "author = {Jerome G. P. Wicker and Richard I. Cooper},"
            "title = {Beyond Rotatable Bond Counts: Capturing 3D Conformational Flexibility in a Single Descriptor},"
            "journal = {Journal of Chemical Information and Modeling}"
            "}",
            "@article{tosco2014bringing,"
            "title={Bringing the MMFF force field to the RDKit: implementation and validation},"
            "author={Tosco, Paolo and Stiefl, Nikolaus and Landrum, Gregory},"
            "journal={Journal of cheminformatics},"
            "volume={6},"
            "number={1},"
            "pages={1--4},"
            "year={2014},"
            "publisher={Springer}"
            "}",
        ]

[docs]    def implementors(self) -> List[str]:
        return super().implementors() + ["Jerome G. P. Wicker and Richard I. Cooper"]