# Source code for fepops.fepops

import itertools
import logging
import multiprocessing as mp
import zlib
from enum import Enum
from multiprocessing import SimpleQueue
from typing import Literal, Tuple, Union

import numpy as np
import torch
from fast_pytorch_kmeans import KMeans as _FastPTKMeans
from rdkit import Chem
from rdkit.Chem import AllChem, Crippen, Lipinski, rdMolTransforms
from rdkit.Chem.MolStandardize import rdMolStandardize
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.special import softmax
from sklearn.cluster import KMeans as _SKLearnKMeans

class GetFepopStatusCode(Enum):
    """Outcome codes reported alongside FEPOPS descriptor results.

    Member values are the consecutive integers 1..4, in declaration order.
    """

    SUCCESS = 1
    FAILED_TO_GENERATE = 2
    FAILED_TO_RETRIEVE = 3
    FAILED_RETRIEVED_NONE = 4


class OpenFEPOPS:
    """OpenFEPOPS (Feature Points) molecular similarity object

    FEPOPS allows the comparison of molecules using feature points, see the
    original publication for more information:
    https://doi.org/10.1021/jm049654z. In short, feature points reduce the
    number of points used to represent a molecule by combining atoms and their
    properties. Typically used to compare libraries of small molecules against
    known actives in the hope of discovering biosimilars based on queries.

    Parameters
    ----------
    kmeans_method : str, optional
        String literal denoting the method which should be used for kmeans
        calculations. May be one of "sklearn", "pytorchgpu", or "pytorchcpu".
        If "sklearn" is passed then Scikit-learn's kmeans implementation is
        used. An implementation from the fast_pytorch_kmeans package can also
        be used, either on the CPU or on a CUDA device. Note: GPU accelerated
        mode should only be used if you are stretching the capabilities in
        terms of feature points for large molecules; small molecules will not
        benefit from GPU acceleration due to overheads. By default "sklearn".
    max_tautomers : Union[int, None], optional
        Maximum number of tautomers which should be generated by RDKit's
        TautomerEnumerator. If None is passed, no limit is set by this class
        and the enumerator's own built-in limit is left untouched.
        By default 5.
    num_fepops_per_mol : int, optional
        Number of feature points to use in the representation of a molecule.
        Literature notes that 7 has been empirically found to be a good number
        for performant representations of small molecules. This might be
        increased if you are dealing with large and very flexible molecules.
        By default 7.
    num_centroids_per_fepop : int, optional
        Each fepop is represented by a number of centres, into which atom
        properties are compressed. Literature notes that this has been
        empirically determined to be 4 for a performant representation of
        small molecules. By default 4.
    descriptor_means : Tuple[float, ...], optional
        Per-feature means used to centre FEPOPS before scoring. The defaults
        (see the signature) were profiled from the DUDE diversity set with
        num_fepops_per_mol=7, num_centroids_per_fepop=4 and
        kmeans_method='sklearn'. New values should be supplied if the object
        is configured differently.
    descriptor_stds : Tuple[float, ...], optional
        Per-feature standard deviations used to scale FEPOPS before scoring.
        Profiled in the same way as ``descriptor_means``; new values should be
        supplied if the object is configured differently. Entries equal to
        zero are replaced by 1.0 (no scaling for that feature).

    Raises
    ------
    ValueError
        Invalid kmeans method
    """

    def __init__(
        self,
        *,
        kmeans_method: Literal["sklearn", "pytorchcpu", "pytorchgpu"] = "sklearn",
        max_tautomers: Union[int, None] = 5,
        num_fepops_per_mol: int = 7,
        num_centroids_per_fepop: int = 4,
        descriptor_means: Tuple[float, ...] = (
            -0.28971602,
            0.5181022,
            0.37487135,
            0.99922747,
            -0.04187301,
            1.03382471,
            0.27407036,
            0.99853436,
            0.09725517,
            1.12824307,
            0.23735556,
            0.99882914,
            0.35977538,
            0.66653514,
            0.41238282,
            0.99902545,
            5.71261449,
            6.37716992,
            6.47293777,
            6.26134733,
            6.20354385,
            6.23201498,
        ),
        descriptor_stds: Tuple[float, ...] = (
            0.35110473,
            1.00839329,
            0.4838859,
            0.02769204,
            0.15418035,
            0.86446056,
            0.44583626,
            0.0381767,
            0.16095862,
            0.92079483,
            0.42526185,
            0.03413741,
            0.35756229,
            1.36093993,
            0.4921059,
            0.0311619,
            1.9668792,
            2.31266486,
            2.50699385,
            2.41269982,
            2.30018205,
            2.31527129,
        ),
    ):
        """Configure kmeans backend, scaling constants and RDKit helpers

        See the class docstring for a description of every parameter.

        Raises
        ------
        ValueError
            Invalid kmeans method
        """
        # Descriptor stds may contain zeros. If they do, then we mimic
        # Scikit-Learn's StandardScaler, whereby if unit variance is not
        # achievable, no scaling is applied (value of 1.0)
        self.descriptor_stds_no_zeros = np.array(descriptor_stds, dtype=float)
        self.descriptor_stds_no_zeros[self.descriptor_stds_no_zeros == 0.0] = 1.0
        self.descriptor_means = np.array(descriptor_means, dtype=float)

        kmeans_prefix = "_perform_kmeans_"
        try:
            self.kmeans_func = getattr(self, f"{kmeans_prefix}{kmeans_method}")
        except AttributeError as err:
            # Only a missing method means "unknown backend"; anything else
            # should propagate unchanged.
            implemented = [
                name[len(kmeans_prefix):]
                for name in OpenFEPOPS.__dict__
                if name.startswith(kmeans_prefix)
            ]
            raise ValueError(
                f"Supplied kmeans_method argument ({kmeans_method}) does not match a callable method of the form (_perform_kmeans_{kmeans_method}). Implemented methods seem to be: {implemented}"
            ) from err

        # NOTE(review): within this class only the length of this mapping is
        # used (as the number of per-centroid features).
        self.sort_by_features_col_index_dict = {
            name: sort_order_index
            for sort_order_index, name in enumerate(["charge", "logP", "hba", "hbd"])
        }
        self.num_fepops_per_mol = num_fepops_per_mol
        self.num_centroids_per_fepop = num_centroids_per_fepop
        self.num_features_per_fepop = len(self.sort_by_features_col_index_dict)
        # Number of unique centroid pairs: n * (n - 1) / 2
        self.num_distances_per_fepop = (
            (self.num_centroids_per_fepop**2) - self.num_centroids_per_fepop
        ) // 2

        self.donor_mol_from_smarts = Chem.MolFromSmarts("[!H0;#7,#8,#9]")
        self.acceptor_mol_from_smarts = Chem.MolFromSmarts(
            "[!$([#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]"
        )
        self.rotatable_bond_from_smarts = Chem.MolFromSmarts(
            "[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]"
        )

        self.tautomer_enumerator = rdMolStandardize.TautomerEnumerator()
        # When max_tautomers is None the enumerator keeps its own default.
        if isinstance(max_tautomers, int):
            self.tautomer_enumerator.SetMaxTautomers(max_tautomers)

    def _get_k_medoids(
        self, input_x: np.ndarray, k: int = 7, random_state: int = 42
    ) -> np.ndarray:
        """Select k FEPOPS from conformers and tautomers

        Clusters the (standard-scaled) descriptors of all conformers and
        tautomers into k groups and returns, for each converged cluster
        centre, the closest real descriptor row, so that the chosen FEPOPS
        are diverse and representative of the molecule.

        Parameters
        ----------
        input_x : np.ndarray
            The pharmacophore features of all conformers, one row each.
        k : int
            The number of medoids for clustering. By default 7.
        random_state : int
            Integer to use as a random state when seeding the random number
            generator. By default 42.

        Returns
        -------
        np.ndarray
            The final Fepops descriptors comprised of (at most) k
            representative conformers/tautomers, in unscaled form. If there
            are k or fewer unique input rows, all unique rows are returned.
        """
        input_x = np.unique(input_x, axis=0)
        if input_x.shape[0] <= k:
            return input_x

        # Apply standard scaling to FEPOP features. Behaviour when std dev is
        # 0 mimics Scikit-Learn's StandardScaler, whereby if unit variance is
        # not achievable, no scaling is applied (value of 1.0)
        input_x_std = np.std(input_x, axis=0)
        input_x_std[input_x_std == 0.0] = 1.0
        X = (input_x - np.mean(input_x, axis=0)) / input_x_std

        # Initialise the two maps differently so the loop runs at least once.
        point_to_centroid_map = np.ones(X.shape[0])
        point_to_centroid_map_prev = np.zeros_like(point_to_centroid_map)
        np_rng = np.random.default_rng(seed=random_state)
        medoids = X[np_rng.choice(np.arange(X.shape[0]), size=k, replace=False), :]

        # Iterate until cluster membership stops changing.
        while (point_to_centroid_map != point_to_centroid_map_prev).any():
            point_to_centroid_map_prev = point_to_centroid_map
            point_to_centroid_map = np.argmin(np.square(cdist(X, medoids)), axis=1)
            for i in range(k):
                if not (point_to_centroid_map == i).any():
                    # Re-seed an empty cluster with a randomly chosen point.
                    chosen_x_point = np_rng.choice(np.arange(X.shape[0]))
                    medoids[i] = X[chosen_x_point, :]
                    point_to_centroid_map[chosen_x_point] = i
                medoids[i] = np.median(X[point_to_centroid_map == i], axis=0)

        # The converged centres live in scaled space and are not real
        # descriptors; report the actual (unscaled) row nearest to each centre.
        representatives = input_x[np.argmin(cdist(X, medoids), axis=0)]

        # Sorting at this stage for reproducibility with existing pregenerated
        # descriptor sets and convention with early FEPOPS versions which
        # relied upon FEPOPS within a molecule being sorted by charge (before
        # moving to the newer CombiAlign scoring algorithm)
        return representatives[np.lexsort(representatives.T[::-1])]

    def _calculate_atomic_logPs(self, mol: Chem.rdchem.Mol) -> dict:
        """Calculate logP contribution for each atom in a molecule

        Parameters
        ----------
        mol : Chem.rdchem.Mol
            The Rdkit mol object of the input molecule.

        Returns
        -------
        dict
            A dictionary mapping atom index to its Crippen logP contribution.
        """
        contributions = Crippen.rdMolDescriptors._CalcCrippenContribs(mol)
        # Each contribution's first element is taken as the logP term.
        return {
            atom.GetIdx(): float(contribution[0])
            for atom, contribution in zip(mol.GetAtoms(), contributions)
        }

    def _calculate_partial_charges(self, mol: Chem.rdchem.Mol) -> dict:
        """Calculate the Gasteiger partial charge of each atom in a molecule

        Parameters
        ----------
        mol : Chem.rdchem.Mol
            The Rdkit mol object of the input molecule. Charges are stored on
            its atoms as the ``_GasteigerCharge`` property.

        Returns
        -------
        dict
            A dictionary mapping atom index to its partial charge.
        """
        Chem.rdPartialCharges.ComputeGasteigerCharges(mol, throwOnParamFailure=True)
        return {
            atom.GetIdx(): float(atom.GetProp("_GasteigerCharge"))
            for atom in mol.GetAtoms()
        }

    def _sum_of_atomic_features_by_centroids(
        self, feature_dict: dict, centroid_atom_idx: np.ndarray
    ) -> float:
        """Sum an atomic feature over the atoms belonging to one centroid

        Parameters
        ----------
        feature_dict : dict
            A dictionary mapping atom index to the atomic feature value.
        centroid_atom_idx : np.ndarray
            The indices of the atoms assigned to the centroid of interest.

        Returns
        -------
        float
            Sum of the feature across the centroid's group of atoms.
        """
        # Build the membership set once rather than scanning the array for
        # every atom in the dictionary.
        members = set(np.asarray(centroid_atom_idx).tolist())
        return sum(value for idx, value in feature_dict.items() if idx in members)

    def _get_dihedrals(self, mol: Chem.rdchem.Mol) -> list:
        """Identify dihedrals in order to obtain rotatable bonds in a molecule

        Identify dihedrals using flanking atoms of rotatable bonds. Rotatable
        bonds for which a flanking atom cannot be found on both sides are
        skipped.

        Parameters
        ----------
        mol : Chem.rdchem.Mol
            The Rdkit mol object of the input molecule.

        Returns
        -------
        list
            A list of 4-tuples, each holding the atom indices (i, j, k, l)
            which define one dihedral.
        """
        dihedrals = []
        for atom_j, atom_k in mol.GetSubstructMatches(self.rotatable_bond_from_smarts):
            atom_i, atom_l = self._get_flanking_atoms(mol.GetBonds(), atom_j, atom_k)
            if atom_i is not None and atom_l is not None:
                dihedrals.append((atom_i, atom_j, atom_k, atom_l))
        return dihedrals

    def _perform_kmeans_sklearn(
        self,
        atom_coords: np.ndarray,
        num_centroids: int = 4,
        seed: int = 42,
    ) -> tuple:
        """Perform kmeans calculation (sklearn method)

        Parameters
        ----------
        atom_coords : ndarray
            A Numpy array containing the 3D coordinates of a molecule.
        num_centroids : int
            The number of centroids used for clustering. By default 4.
        seed : int
            Seed for sklearn kmeans initialisation. By default 42.

        Returns
        -------
        tuple
            A tuple containing the centroid coordinates and the cluster labels
            of molecular atoms.
        """
        kmeans = _SKLearnKMeans(
            n_clusters=num_centroids,
            random_state=seed,
            n_init="auto",
        ).fit(atom_coords)
        return kmeans.cluster_centers_, kmeans.labels_

    def _fast_pytorch_kmeans(
        self,
        atom_coords: np.ndarray,
        num_centroids: int,
        seed: int,
        device: str,
    ) -> tuple:
        """Run fast_pytorch_kmeans on the given torch device

        Shared implementation behind the "pytorchcpu" and "pytorchgpu"
        methods. The first ``num_centroids`` atom coordinates are used as the
        initial centroids. Results are always moved back to host memory
        before conversion to Numpy, because tensors on a CUDA device cannot
        be converted directly.
        """
        torch.manual_seed(seed)
        coords = torch.from_numpy(atom_coords).to(device)
        kmeans = _FastPTKMeans(n_clusters=num_centroids, max_iter=300)
        labels = kmeans.fit_predict(
            coords,
            centroids=torch.tensor(atom_coords[:num_centroids], device=coords.device),
        )
        instance_cluster_labels = labels.cpu().numpy()
        centroid_coords = kmeans.centroids.cpu().numpy()
        return centroid_coords, instance_cluster_labels

    def _perform_kmeans_pytorchcpu(
        self,
        atom_coords: np.ndarray,
        num_centroids: int = 4,
        seed: int = 42,
    ) -> tuple:
        """Perform kmeans calculation using pytorch (CPU only)

        Parameters
        ----------
        atom_coords : ndarray
            A Numpy array containing the 3D coordinates of a molecule.
        num_centroids : int
            The number of centroids used for clustering. By default 4.
        seed : int
            Seed passed to ``torch.manual_seed``. By default 42.

        Returns
        -------
        tuple
            A tuple containing the centroid coordinates and the cluster labels
            of molecular atoms.
        """
        return self._fast_pytorch_kmeans(atom_coords, num_centroids, seed, "cpu")

    def _perform_kmeans_pytorchgpu(
        self,
        atom_coords: np.ndarray,
        num_centroids: int = 4,
        seed: int = 42,
    ) -> tuple:
        """Perform kmeans calculation using pytorch (gpu accelerated)

        Parameters
        ----------
        atom_coords : ndarray
            A Numpy array containing the 3D coordinates of a molecule.
        num_centroids : int
            The number of centroids used for clustering. By default 4.
        seed : int
            Seed passed to ``torch.manual_seed``. By default 42.

        Returns
        -------
        tuple
            A tuple containing the centroid coordinates and the cluster labels
            of molecular atoms, both as Numpy arrays in host memory.
        """
        return self._fast_pytorch_kmeans(atom_coords, num_centroids, seed, "cuda")

    def _get_centroid_distances(
        self, centroid_coords_or_distmat: np.ndarray, is_distance_matrix: bool
    ) -> np.ndarray:
        """Get centroid distances array

        In the fepops paper using 4 centroids, there is a specific order in
        which to return the distances: d1-4, d1-2, d2-3, d3-4, d1-3, d2-4.
        This is the corner element followed by successive upper
        off-diagonals, and as such this function generalises to other
        cardinalities of points.

        Parameters
        ----------
        centroid_coords_or_distmat : np.ndarray
            Either an MxN array of centroid coords, where M is the number of
            centroids and N is the number of coordinates (should be 3), or an
            MxM distance matrix.
        is_distance_matrix : bool
            True if ``centroid_coords_or_distmat`` is already a distance
            matrix, False if it holds coordinates.

        Returns
        -------
        np.ndarray
            Ordered centroid distances
        """
        if is_distance_matrix:
            dmat = centroid_coords_or_distmat.copy()
        else:
            dmat = squareform(pdist(centroid_coords_or_distmat))
        # First-to-last distance, then off-diagonals 1 .. M-2.
        off_diagonals = [
            np.diagonal(dmat, offset=k) for k in range(1, dmat.shape[0] - 1)
        ]
        return np.hstack([dmat[0, -1]] + off_diagonals)

    def _mol_from_smiles(self, smiles_string: str) -> Chem.rdchem.Mol:
        """Parse smiles to mol, catching errors

        This SMILES->RDKit mol converter is used throughout OpenFEPOPS and as
        such, any read in/parsing of a SMILES string should use this method.
        If parsing raises, it is retried once without sanitisation.

        Parameters
        ----------
        smiles_string : str
            Smiles string

        Returns
        -------
        Chem.rdchem.Mol
            RDkit molecule, or None if both parsing attempts raised. Whatever
            ``Chem.MolFromSmiles`` returns is passed through unchanged, so
            callers must also handle a None result from RDKit itself.
        """
        try:
            return Chem.MolFromSmiles(smiles_string)
        except Exception:
            # Fall through to the unsanitised attempt below.
            pass
        try:
            return Chem.MolFromSmiles(smiles_string, sanitize=False)
        except Exception:
            return None

    def _get_flanking_atoms(
        self, bonds: Chem.rdchem._ROBondSeq, atom_1_idx: int, atom_2_idx: int
    ) -> tuple:
        """Search for two atoms connecting to either atom in a rotatable bond

        A private method to identify the two atoms flanking a rotatable bond.
        All bonds are scanned; if an atom has several neighbours, the one
        found in the last matching bond is reported.

        Parameters
        ----------
        bonds : Chem.rdchem._ROBondSeq
            The Rdkit molecule bond object that contains the indexes of both
            begin and end atoms in a bond.
        atom_1_idx : int
            The index of the first atom in a rotatable bond.
        atom_2_idx : int
            The index of the second atom in a rotatable bond.

        Returns
        -------
        tuple
            A tuple containing the indexes of two flanking atoms for the given
            atoms of a rotatable bond. An entry is None if no flanking atom
            was found on that side.
        """
        bound_to_atom_1 = None
        bound_to_atom_2 = None
        for bond in bonds:
            begin_idx, end_idx = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            bond_indexes = (begin_idx, end_idx)
            has_atom_1 = atom_1_idx in bond_indexes
            has_atom_2 = atom_2_idx in bond_indexes
            if has_atom_1 and has_atom_2:
                # This is the rotatable bond itself.
                continue
            # NOTE: deliberately no early exit. Later matching bonds overwrite
            # earlier ones, and the choice of flanking atom determines the
            # dihedrals used for conformer generation, so this must stay
            # stable for previously generated descriptors to be reproducible.
            if has_atom_1:
                bound_to_atom_1 = end_idx if begin_idx == atom_1_idx else begin_idx
            elif has_atom_2:
                bound_to_atom_2 = end_idx if begin_idx == atom_2_idx else begin_idx
        return bound_to_atom_1, bound_to_atom_2

    def _sample_bond_states(self, n_rot: int, seed: int) -> list:
        """Sample a set of conformers with different rotation angles

        A private method used to generate a set of bond angle multipliers
        (0 to 3) using all rotatable bonds within a molecule. If n_rot is
        five or fewer, all 4**n_rot combinations are returned; otherwise 1024
        distinct combinations are sampled at random.

        Parameters
        ----------
        n_rot : int
            The number of rotatable bonds in a molecule.
        seed : int
            Seed for random sampling of rotamer space. Typically the hash of
            molecule coords.

        Returns
        -------
        List
            A list containing the sampled bond states (rotation angle
            multipliers) for all of the rotatable bonds of a molecule.
        """
        if n_rot <= 5:
            return list(itertools.product(range(4), repeat=n_rot))

        np_rng = np.random.default_rng(seed=seed)
        rotation_list = {
            tuple(np_rng.choice(range(4), size=n_rot)) for _ in range(1024)
        }
        # Duplicates collapse in the set; top up until 1024 distinct states.
        while len(rotation_list) < 1024:
            rotation_list.add(tuple(np_rng.choice(range(4), n_rot)))
        return list(rotation_list)

    def generate_conformers(
        self, mol: Chem.rdchem.Mol, random_state: int = 42
    ) -> list:
        """Generate conformers with rotatable bonds

        Generate conformers for a molecule, enumerating rotatable bonds over
        90 degree angles. This 90 degree increment was deemed optimal in
        literature.

        Parameters
        ----------
        mol : Chem.rdchem.Mol
            The Rdkit mol object of the input molecule.
        random_state : int
            Integer to use as a random state when seeding the embedding, both
            for the initial attempt and for the fallback. By default 42.

        Returns
        -------
        List
            A list containing mol objects of different conformers with
            different angles of rotatable bonds. Empty if no 3D embedding
            could be produced.
        """
        try:
            mol = Chem.AddHs(mol)
            params = AllChem.ETKDGv3()
            params.useSmallRingTorsions = True
            params.randomSeed = random_state
            original_conformer = mol.GetConformer(AllChem.EmbedMolecule(mol, params))
        except ValueError:
            # Fall back to ETKDGv2, seeded so that this path is reproducible
            # as well.
            params = AllChem.ETKDGv2()
            params.randomSeed = random_state
            conformer_id = AllChem.EmbedMolecule(mol, params)
            if conformer_id == -1:
                logging.warning(
                    "Coords could not be generated without using random coords. using random coords now"
                )
                params.useRandomCoords = True
                conformer_id = AllChem.EmbedMolecule(mol, params)
            try:
                original_conformer = mol.GetConformer(conformer_id)
            except ValueError:
                logging.warning("Conformer embedding failed")
                return []

        dihedrals = self._get_dihedrals(mol)
        starting_angles = [
            rdMolTransforms.GetDihedralDeg(original_conformer, *dihedral_atoms)
            for dihedral_atoms in dihedrals
        ]
        # Seed rotamer sampling from the embedded coordinates.
        bond_states = self._sample_bond_states(
            len(dihedrals), zlib.crc32(mol.GetConformer(0).GetPositions().tobytes())
        )

        new_conf_mol_list = []
        for bond_state in bond_states:
            # The conformer is modified in place; snapshot the molecule after
            # each set of rotations.
            self._generate_conf(
                original_conformer, dihedrals, starting_angles, bond_state
            )
            new_conf_mol_list.append(Chem.Mol(mol))
        return new_conf_mol_list

    def _generate_conf(
        self,
        conformer: Chem.rdchem.Conformer,
        dihedrals: tuple,
        starting_angles: tuple,
        bond_state: tuple,
    ) -> None:
        """Rotate the assigned rotatable bonds

        Change a conformer in place by setting every dihedral to its original
        angle plus a multiple of 90 degrees.

        Parameters
        ----------
        conformer : Chem.rdchem.Conformer
            The Rdkit conformer object.
        dihedrals : tuple
            A tuple containing all identified dihedrals with the index of
            their four defined atoms.
        starting_angles : tuple
            A tuple containing the original states (dihedral angles) of all
            the rotatable bonds before rotating.
        bond_state : tuple
            A tuple containing a specific bond state (one angle multiplier per
            rotatable bond of the molecule).
        """
        for dihedral_atoms, torsion_angle_multiplier, orig_torsion_angle in zip(
            dihedrals, bond_state, starting_angles
        ):
            rdMolTransforms.SetDihedralDeg(
                conformer,
                *dihedral_atoms,
                orig_torsion_angle + torsion_angle_multiplier * 90.0,
            )

    def get_centroid_pharmacophoric_features(
        self,
        mol: Chem.rdchem.Mol,
    ) -> np.ndarray:
        """Obtain centroids and their corresponding pharmacophoric features

        Clusters the atoms of the molecule's conformer 0 into centroids and
        then calculates, per centroid, the summed partial charge, the summed
        logP, and hydrogen bond donor and acceptor flags. The distances
        between the (sorted) centroids are appended, following the pattern
        used in the FEPOPS paper - in the case of 4 centroids, this is: d1-4,
        d1-2, d2-3, d3-4, d1-3, d2-4.

        Parameters
        ----------
        mol : Chem.rdchem.Mol
            The Rdkit mol object of the input molecule.

        Returns
        -------
        np.ndarray
            A flat Numpy array of the per-centroid features followed by the
            centroid distances (22 values with 4 centroids).
        """
        centroid_coords, instance_cluster_labels = self.kmeans_func(
            mol.GetConformer(0).GetPositions(),
            num_centroids=self.num_centroids_per_fepop,
        )
        atomic_logP_dict = self._calculate_atomic_logPs(mol)
        atomic_charge_dict = self._calculate_partial_charges(mol)
        hb_acceptors = {
            match[0] for match in mol.GetSubstructMatches(self.acceptor_mol_from_smarts)
        }
        hb_donors = {
            match[0] for match in mol.GetSubstructMatches(self.donor_mol_from_smarts)
        }

        pharmacophore_features_arr = np.empty(shape=[self.num_centroids_per_fepop, 4])
        for centroid in range(self.num_centroids_per_fepop):
            centroid_atomic_id = np.where(instance_cluster_labels == centroid)[0]
            sum_of_logP = self._sum_of_atomic_features_by_centroids(
                atomic_logP_dict, centroid_atomic_id
            )
            sum_of_charge = self._sum_of_atomic_features_by_centroids(
                atomic_charge_dict, centroid_atomic_id
            )
            hba = int(any(atom_id in hb_acceptors for atom_id in centroid_atomic_id))
            hbd = int(any(atom_id in hb_donors for atom_id in centroid_atomic_id))
            # Column order is (charge, logP, hbd, hba).
            # NOTE(review): the default descriptor_means/descriptor_stds are
            # presumably profiled in this order - confirm before reordering.
            pharmacophore_features_arr[centroid, :] = (
                sum_of_charge,
                sum_of_logP,
                hbd,
                hba,
            )

        # Order centroids by their features (first column is the primary
        # key) so the descriptor does not depend on kmeans label numbering.
        sorted_index_rank_arr = np.lexsort(pharmacophore_features_arr.T[::-1])
        centroid_coords = centroid_coords[sorted_index_rank_arr]
        pharmacophore_features_arr = pharmacophore_features_arr[sorted_index_rank_arr]
        centroid_dist = self._get_centroid_distances(
            centroid_coords, is_distance_matrix=False
        )
        # np.append flattens the feature matrix before adding the distances.
        return np.append(pharmacophore_features_arr, centroid_dist)

    def get_fepops(
        self,
        mol: Union[str, None, Chem.rdchem.Mol],
        is_canonical: bool = False,
    ) -> Tuple[GetFepopStatusCode, Union[np.ndarray, None]]:
        """Get Fepops descriptors for a molecule

        Parameters
        ----------
        mol : Union[str, None, Chem.rdchem.Mol]
            Molecule as a SMILES string or RDKit molecule. Can also be None,
            in which case a failure status is returned along with None in
            place of the requested Fepops descriptors. A np.ndarray is
            assumed to already hold descriptors and is returned unchanged.
        is_canonical : bool
            Not used by this implementation. By default False.

        Returns
        -------
        Tuple[GetFepopStatusCode, Union[np.ndarray, None]]
            Returns a tuple, with the first value being a GetFepopStatusCode
            (enum) denoting SUCCESS or FAILED_TO_GENERATE. The second tuple
            element is either None (if unsuccessful), or a np.ndarray
            containing the calculated Fepops descriptors of the requested
            input molecule.
        """
        original_smiles = None
        if isinstance(mol, np.ndarray):
            return GetFepopStatusCode.SUCCESS, mol
        if isinstance(mol, str):
            original_smiles = mol
            mol = self._mol_from_smiles(mol)
        if mol is None:
            logging.error(
                f"Failed to make a molecule{' from '+original_smiles if original_smiles is not None else ''}"
            )
            return GetFepopStatusCode.FAILED_TO_GENERATE, None

        heavy_atom_count = Lipinski.HeavyAtomCount(mol)
        if heavy_atom_count < self.num_centroids_per_fepop:
            logging.error(
                f"Number of heavy atoms ({heavy_atom_count}) below requested feature points ({self.num_centroids_per_fepop}) for molecule {original_smiles if original_smiles is not None else ''}"
            )
            return GetFepopStatusCode.FAILED_TO_GENERATE, None

        # Enumerate tautomers on the hydrogen-free molecule, then restore
        # explicit hydrogens on every tautomer.
        mol = Chem.RemoveAllHs(mol)
        tautomers_list = [
            Chem.AddHs(tautomer)
            for tautomer in self.tautomer_enumerator.Enumerate(mol)
        ]

        each_mol_with_all_confs_list = []
        for t_mol in tautomers_list:
            each_mol_with_all_confs_list.extend(self.generate_conformers(t_mol))
        if not each_mol_with_all_confs_list:
            logging.error(
                f"Failed to generate conformers/tautomers {' for '+original_smiles if original_smiles is not None else ''}"
            )
            return GetFepopStatusCode.FAILED_TO_GENERATE, None

        try:
            pharmacophore_feature_all_confs = np.array(
                [
                    self.get_centroid_pharmacophoric_features(each_mol)
                    for each_mol in each_mol_with_all_confs_list
                ]
            )
        except ValueError as e:
            if original_smiles is not None:
                logging.error(f"Failed molecule had SMILES: {original_smiles}")
            logging.error(e)
            return GetFepopStatusCode.FAILED_TO_GENERATE, None

        medoids = self._get_k_medoids(
            pharmacophore_feature_all_confs, self.num_fepops_per_mol
        )
        return GetFepopStatusCode.SUCCESS, medoids

    def pairwise_correlation(self, A: np.ndarray, B: np.ndarray):
        """Fast method to generate correlation values (Pearson)

        If the inputs differ in length, the shorter one is zero-padded at the
        end to the length of the longer one before correlating.

        Parameters
        ----------
        A : np.ndarray
            First features array (1D)
        B : np.ndarray
            Second features array (1D)

        Returns
        -------
        np.ndarray or float
            Pearson correlation of A and B. For 1D inputs this is a scalar.
            The result is NaN if either (padded) input has zero variance.
        """
        len_a, len_b = len(A), len(B)
        if len_a < len_b:
            A = np.pad(A, (0, len_b - len_a), mode="constant", constant_values=0)
        elif len_b < len_a:
            B = np.pad(B, (0, len_a - len_b), mode="constant", constant_values=0)

        am = A - np.mean(A, axis=0, keepdims=True)
        bm = B - np.mean(B, axis=0, keepdims=True)
        covariance = am.T @ bm
        scale = (
            np.sqrt(np.sum(am**2, axis=0, keepdims=True)).T
            * np.sqrt(np.sum(bm**2, axis=0, keepdims=True))
        )
        return covariance / scale

    def calc_similarity(
        self,
        query: Union[np.ndarray, str, None],
        candidate: Union[np.ndarray, str, None, list],
    ) -> Union[float, list]:
        """Calculate FEPOPS similarity

        Method for calculating molecular similarity based on their OpenFEPOPS
        descriptors. Centres and scales FEPOPS descriptors using parameters
        passed upon object initialisation.

        Parameters
        ----------
        query : Union[np.ndarray, str, None]
            A Numpy array containing the FEPOPS descriptors of the query
            molecule or a smiles string from which to generate FEPOPS
            descriptors for the query molecule. Can also be None, in which
            case np.nan is returned as a score.
        candidate : Union[np.ndarray, str, None, list]
            A Numpy array containing the FEPOPS descriptors of the candidate
            molecule or a smiles string from which to generate FEPOPS
            descriptors for the candidate molecule. Can also be None, in
            which case np.nan is returned as a score, or a list of any of
            these. If it is a list, then a list of scores against the single
            query is returned.

        Returns
        -------
        Union[float, list]
            Fepops similarity between two molecules, or a list of
            similarities if ``candidate`` is a list. np.nan is returned
            (as a single value) if descriptors could not be generated for
            the query, and in place of the score of any candidate for which
            descriptors could not be generated.
        """
        if not isinstance(query, np.ndarray):
            query_status, query = self.get_fepops(query)
            if query_status != GetFepopStatusCode.SUCCESS:
                return np.nan

        if isinstance(candidate, list):
            # The query is already an array here, so its descriptors are not
            # regenerated for every candidate.
            return [self.calc_similarity(query, c) for c in candidate]

        if not isinstance(candidate, np.ndarray):
            candidate_status, candidate = self.get_fepops(candidate)
            if candidate_status != GetFepopStatusCode.SUCCESS:
                return np.nan

        # Defensive checks against a descriptor source returning SUCCESS with
        # a non-array payload.
        if not isinstance(query, np.ndarray):
            raise ValueError("query was not, or could not be coerced into a np.ndarray")
        if not isinstance(candidate, np.ndarray):
            raise ValueError(
                "candidate was not, or could not be coerced into a np.ndarray"
            )

        q = (query - self.descriptor_means) / self.descriptor_stds_no_zeros
        c = (candidate - self.descriptor_means) / self.descriptor_stds_no_zeros
        return self.pairwise_correlation(q.flatten(), c.flatten())

    def __call__(
        self,
        query: Union[np.ndarray, str, None],
        candidate: Union[np.ndarray, str, None, list],
    ) -> Union[float, list]:
        """Calling the object has the same effect as calling calc_similarity

        Parameters
        ----------
        query : Union[np.ndarray, str, None]
            FEPOPS descriptors of the query molecule, a smiles string from
            which to generate them, or None.
        candidate : Union[np.ndarray, str, None, list]
            FEPOPS descriptors of the candidate molecule, a smiles string
            from which to generate them, None, or a list of any of these.

        Returns
        -------
        Union[float, list]
            Whatever ``calc_similarity`` returns for the same arguments.
        """
        return self.calc_similarity(query, candidate)