# Source code for fepops.fepops_persistent.fepops_persistent_abc

import logging
import multiprocessing as mp
from abc import ABCMeta, abstractmethod
from pathlib import Path
from typing import Union

import numpy as np
from rdkit import Chem
from scipy.spatial.distance import cdist, pdist, squareform
from tqdm import tqdm

from fepops.fepops import GetFepopStatusCode

from ..fepops import OpenFEPOPS


class FepopsPersistentAbstractBaseClass(metaclass=ABCMeta):
    """Abstract base class for persistent fepops storage

    New storage methods may be implemented as demonstrated in
    fepopsdb_json.py or in fepopsdb_sqlite.py by extending this abstract base
    class which provides some required functionality like:

    - save_descriptors(smiles: Union[str, Path, list[str]]) to save a smiles
      file/list of smiles to the persistent storage
    - get_cansmi_to_mol_dict_not_in_database(smiles: Union[str, Path,
      list[str]]) to retrieve a unique dictionary with canonical smiles as
      keys not already stored in the database and rdkit mol objects as
      values.

    When writing your own persistent storage methods, you must override the
    following methods:

    add_fepop(rdkit_canonical_smiles: str, fepops: np.ndarray)
    ----------------------------------------------------------
    Add the fepop to persistent storage. super().add_fepop may be called by
    the overridden function to perform type checks on arguments.

    fepop_exists(rdkit_canonical_smiles: str)
    -----------------------------------------
    Return True if the canonical smiles is already in the database, and
    False if not. super().fepop_exists may be called by the overridden
    function to perform type checks on arguments.

    get_fepops(rdkit_canonical_smiles: str)
    ---------------------------------------
    Return a fepop from persistent storage. If it does not exist, then
    generate it by calling self.openfepops_object.get_fepops which is
    supplied by this base class. super().get_fepops may be called by the
    overridden function to perform type checks on arguments. With this
    function in place it allows interface compatibility with a standard
    Fepops object.

    Inheriting functions may also define __enter__ and __exit__ methods for
    use with context handlers. If none are defined, then default ones are
    provided which return the object on entry and call write() on exit. This
    can be useful in doing things like writing out large files after
    descriptor generation if incremental writes are not possible, like in
    the case of the FepopsDBJSON child class.

    Parameters
    ----------
    database_file : Union[str, Path]
        File to use for persistent storage.
    kmeans_method : str, optional
        Method which should be used for kmeans calculation by fepops
        objects, can be one of "sklearn", "pytorchgpu", or "pytorchcpu".
    parallel : bool, optional
        Run in parallel (using multiprocessing worker pools), by default
        True
    n_jobs : int, optional
        Number of worker processes to spawn. If -1, then use all available
        cores. By default -1
    """

    @staticmethod
    def _parallel_init_worker_get_cansmi_mol_tuple(smiles_is_rdkit_canonical: bool):
        """Static method for initialisation of smiles and mol tuple workers

        Stores the flag in a module-level global of the worker process so
        that _parallel_get_cansmi_tuple can read it for every work item.

        Parameters
        ----------
        smiles_is_rdkit_canonical : bool
            If True, then the supplied SMILES are guaranteed to be canonical
            and generated by RDKit
        """
        global sirdkc
        sirdkc = smiles_is_rdkit_canonical

    @staticmethod
    def _parallel_get_cansmi_tuple(m):
        """Static method for worker processes to get smiles and mol tuples

        Parameters
        ----------
        m : str
            Molecule as a smiles string

        Returns
        -------
        tuple[str, rdkit.Chem.Mol]
            Tuple containing an RDKit canonical SMILES string and the RDKit
            mol constructed by it
        """
        global sirdkc
        return FepopsPersistentAbstractBaseClass._get_can_smi_mol_tuple(
            m, smiles_guaranteed_rdkit_canonical=sirdkc
        )

    @staticmethod
    def _parallel_init_worker_desc_gen_shared_fepops_ob(fepops_object):
        """Static method for FEPOPS descriptor worker processes initialisation

        Stores the object in a module-level global of the worker process so
        that _parallel_get_gen_fepops_descriptors can reuse it.

        Parameters
        ----------
        fepops_object : OpenFEPOPS
            Initialised OpenFEPOPS object which should be used to generate
            descriptors
        """
        global shared_fepops_ob
        shared_fepops_ob = fepops_object

    @staticmethod
    def _parallel_get_gen_fepops_descriptors(m):
        """Static method for worker processes to generate FEPOPS descriptors

        Parameters
        ----------
        m : tuple
            Two-item work unit. save_descriptors supplies
            (canonical SMILES, RDKit mol) pairs taken from a dict's items().

        Returns
        -------
        tuple
            The first item of ``m`` unchanged, followed by whatever the
            shared object's get_fepops returns for the second item
            (save_descriptors unpacks it as ``(status, fepops)``).
        """
        global shared_fepops_ob
        return m[0], shared_fepops_ob.get_fepops(m[1])

    @abstractmethod
    def __init__(
        self,
        database_file: Union[str, Path],
        kmeans_method: str = "sklearn",
        parallel: bool = True,
        n_jobs: int = -1,
    ):
        """FepopsPersistentABC constructor for storing initialisation parameters

        Parameters
        ----------
        database_file : Union[str, Path]
            Location of the database file as a string or Path object
        kmeans_method : str, optional
            Method to use for calculation of FEPOPS, by default "sklearn"
        parallel : bool, optional
            If calculations should be done making use of multiple CPU cores,
            by default True
        n_jobs : int, optional
            Number of worker processes to spawn. If -1, then the optimum is
            detected automatically, by default -1
        """
        self.database_file = Path(database_file)
        self.openfepops_object = OpenFEPOPS(kmeans_method=kmeans_method)
        self.parallel = parallel
        self.n_jobs = n_jobs

    def _n_worker_processes(self, n_tasks: int) -> int:
        """Return the number of pool workers to spawn for ``n_tasks`` items

        Honours ``self.n_jobs`` (any value below 1, e.g. -1, means "all
        available cores"), never asks for more workers than there are tasks
        and always returns an int of at least 1, as required by mp.Pool.
        """
        # getattr guards against subclasses which never called super().__init__
        n_jobs = getattr(self, "n_jobs", -1)
        if n_jobs is None or n_jobs < 1:
            n_jobs = mp.cpu_count()
        return max(1, min(int(n_jobs), int(n_tasks)))

    def save_descriptors(
        self,
        smiles: Union[str, Path, list[str]],
        add_failures_to_database: bool = True,
        smiles_guaranteed_rdkit_canonical: bool = False,
        fepops_object_constructor_kwargs: Union[dict, None] = None,
    ):
        """Pregenerate FEPOPS descriptors for a set of SMILES strings

        Parameters
        ----------
        smiles : Union[str, Path, list[str]]
            String or Path containing the path to a SMILES file which should
            be read in and have each molecule within added to the database,
            or a list of SMILES strings
        add_failures_to_database: bool
            If True, then a record is kept in the database for SMILES which
            were problematic and FEPOPS descriptor generation failed for, by
            default True
        smiles_guaranteed_rdkit_canonical : bool
            If True, then the supplied SMILES are guaranteed to be canonical
            SMILES generated by RDKit which allows skipping of a
            sanitisation step, by default False
        fepops_object_constructor_kwargs: dict, optional
            Dictionary of kwargs which will be passed to the OpenFEPOPS
            object created for the parallel worker processes. None (the
            default) is treated as an empty dictionary.
        """
        # None replaces the former mutable ``{}`` default argument
        if fepops_object_constructor_kwargs is None:
            fepops_object_constructor_kwargs = {}

        canonical_smiles_to_mol_dict = self.get_cansmi_to_mol_dict_not_in_database(
            smiles, smiles_guaranteed_rdkit_canonical=smiles_guaranteed_rdkit_canonical
        )
        if len(canonical_smiles_to_mol_dict) == 0:
            print("Nothing to add to database")
            return

        if not self.parallel:
            # NOTE(review): the serial path uses self.openfepops_object and so
            # ignores fepops_object_constructor_kwargs - confirm this is wanted
            n_added = 0
            for rdkit_canonical_smiles, mol in tqdm(
                canonical_smiles_to_mol_dict.items(), desc="Generating fepops"
            ):
                status, fepops_array = self.openfepops_object.get_fepops(mol)
                if status == GetFepopStatusCode.SUCCESS or add_failures_to_database:
                    self.add_fepop(rdkit_canonical_smiles, fepops_array)
                    n_added += 1
            # Report what was really stored; failures may have been skipped
            print(
                f"Added {n_added} new molecues to the database ({self.database_file})"
            )
            return

        # Do it in parallel
        n_successes = 0
        n_failures = 0
        # The context manager guarantees the worker processes are cleaned up,
        # results are consumed inside it because leaving terminates the pool
        with mp.Pool(
            processes=self._n_worker_processes(len(canonical_smiles_to_mol_dict)),
            initializer=self._parallel_init_worker_desc_gen_shared_fepops_ob,
            initargs=(OpenFEPOPS(**fepops_object_constructor_kwargs),),
        ) as pool:
            for rdkit_canonical_smiles, (status, new_fepop) in tqdm(
                pool.imap(
                    self._parallel_get_gen_fepops_descriptors,
                    canonical_smiles_to_mol_dict.items(),
                ),
                desc="Generating descriptors (parallel)",
                total=len(canonical_smiles_to_mol_dict),
            ):
                if status == GetFepopStatusCode.SUCCESS:
                    n_successes += 1
                else:
                    n_failures += 1
                if status == GetFepopStatusCode.SUCCESS or add_failures_to_database:
                    self.add_fepop(rdkit_canonical_smiles, new_fepop)
        print(
            f"Successfully added {n_successes} new molecues to the database ({self.database_file}), {n_failures} failed"
        )

    @abstractmethod
    def add_fepop(self, rdkit_canonical_smiles: str, fepops: np.ndarray):
        """Add canonical smiles and fepop to database. Must be overridden

        This abstractmethod must be overridden by the inheriting object, but
        provides some functionality for sanity checking input and may be
        called by the inheriting class.

        Parameters
        ----------
        rdkit_canonical_smiles : str
            Canonical SMILES string used as the key of the record
        fepops : np.ndarray
            FEPOPS descriptors to store

        Raises
        ------
        ValueError
            If rdkit_canonical_smiles is not a str or fepops is not a numpy
            array
        """
        if not isinstance(rdkit_canonical_smiles, str):
            raise ValueError(
                f"Expected an rdkit canonical smiles string, but a {type(rdkit_canonical_smiles)} was passed"
            )
        if not isinstance(fepops, np.ndarray):
            raise ValueError(f"Expected a fepop, but a {type(fepops)} was passed")

    @abstractmethod
    def get_fepops(
        self, smiles: Union[str, Chem.rdchem.Mol, np.ndarray], is_canonical: bool = True
    ) -> None:
        """Get a FEPOP from the database using its SMILES. Must be overridden

        This abstractmethod must be overridden by the inheriting object, but
        provides some functionality for sanity checking input and may be
        called by the inheriting class. The base implementation only
        validates the argument type and returns None; calc_similarity
        unpacks the result of the overriding implementation as a
        ``(status, fepops)`` pair, so overrides must return such a pair.

        Parameters
        ----------
        smiles : Union[str, Chem.rdchem.Mol, np.ndarray]
            Molecule to look up, given as a SMILES string, an RDKit mol, or
            a numpy array of descriptors
        is_canonical : bool, optional
            If True, then the supplied SMILES are guaranteed to be canonical
            SMILES generated by RDKit which allows skipping of a
            sanitisation step, by default True

        Raises
        ------
        ValueError
            If smiles is not a str, an RDKit mol or a numpy array
        """
        if not isinstance(smiles, (str, Chem.rdchem.Mol, np.ndarray)):
            raise ValueError(
                f"Expected an rdkit canonical smiles string, rdkit mol, or a numpy array of descriptors but a {type(smiles)} was passed: {smiles}"
            )

    @abstractmethod
    def fepop_exists(self, rdkit_canonical_smiles: str) -> bool:
        """Return True if canonical smiles already exist in the database

        This abstractmethod must be overridden by the inheriting object, but
        provides some functionality for sanity checking input and may be
        called by the inheriting class. The base implementation only
        validates the argument type.

        Parameters
        ----------
        rdkit_canonical_smiles : str
            Canonical SMILES string to look for

        Raises
        ------
        ValueError
            If rdkit_canonical_smiles is not a str
        """
        if not isinstance(rdkit_canonical_smiles, str):
            raise ValueError(
                f"Expected an rdkit canonical smiles string, but a {type(rdkit_canonical_smiles)} was passed"
            )

    @staticmethod
    def _get_can_smi_mol_tuple(s: str, smiles_guaranteed_rdkit_canonical: bool = False):
        """Parse a SMILES string into a (SMILES, RDKit mol) tuple

        Parameters
        ----------
        s : str
            SMILES string to parse
        smiles_guaranteed_rdkit_canonical : bool, optional
            If True, ``s`` is returned as the key unchanged, otherwise the
            key is regenerated from the parsed mol with Chem.MolToSmiles, by
            default False

        Returns
        -------
        tuple
            (SMILES, mol). When the molecule could not be parsed a warning
            is logged and (s, None) is returned.
        """
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # still propagate while any parsing error falls back gracefully
        try:
            mol = Chem.MolFromSmiles(s)
        except Exception:
            try:
                mol = Chem.MolFromSmiles(s, sanitize=False)
            except Exception:
                mol = None
        if mol is None:
            logging.warning(
                "Could not parse smiles to a valid molecule, smiles was: %s", s
            )
            return (s, mol)
        if smiles_guaranteed_rdkit_canonical:
            return (s, mol)
        return (Chem.MolToSmiles(mol), mol)

    def get_cansmi_to_mol_dict_not_in_database(
        self,
        smiles: Union[str, Path, list[str]],
        smiles_guaranteed_rdkit_canonical: bool = False,
    ):
        """Get smiles to mol dict for smiles not in the database

        Parameters
        ----------
        smiles : Union[str, Path, list[str]]
            If a string is passed, then it is assumed to be a file path of
            SMILES file and this file is loaded for processing. Similarly,
            Path objects are assumed to point at SMILES files for
            processing. If passing smiles strings to this function, then
            wrap the string in a list (making a list containing only one
            element), or provide large multi-SMILES lists which will be
            operated upon directly
        smiles_guaranteed_rdkit_canonical : bool, optional
            If the supplied SMILES are canonical RDKit-generated SMILES,
            then regeneration of these SMILES strings for uniquification and
            database lookup may be skipped, by default False

        Returns
        -------
        dict
            Dictionary with SMILES as keys and RDKit molecules as the values
            for molecules not present in the current database

        Raises
        ------
        ValueError
            If a SMILES file path was given which does not exist, or if
            smiles is neither a str, a Path nor a list
        """
        if isinstance(smiles, str):
            smiles = Path(smiles)
        if isinstance(smiles, Path):
            if not smiles.exists():
                raise ValueError(
                    f"smiles file ({smiles}) not found. If you are passing smiles, place it into a list first"
                )
            # Context manager closes the file handle once the lines are read
            with open(smiles) as smiles_file:
                smiles = [
                    stripped
                    for stripped in (line.strip() for line in smiles_file)
                    if len(stripped) > 0
                ]
        if not isinstance(smiles, list):
            raise ValueError(
                "smiles should be a str or Path denoting the location of a smiles file, or a list of smiles"
            )

        # dict.fromkeys removes duplicates like set() would, but keeps the
        # input order so that runs are reproducible
        smiles = list(dict.fromkeys(smiles))
        print(f"Got {len(smiles)} unique SMILES strings")

        if not self.parallel:
            # Ensure unique (canonical, also storing intermediate mol)
            canonical_smiles_to_mol_dict = dict(
                self._get_can_smi_mol_tuple(
                    s,
                    smiles_guaranteed_rdkit_canonical=smiles_guaranteed_rdkit_canonical,
                )
                for s in tqdm(smiles, desc="Uniquifying input smiles (non-parallel)")
            )
        else:
            # Ensure unique (canonical, also storing intermediate mol).
            # One worker per 50 SMILES; integer division is essential as
            # mp.Pool fails with a TypeError when given a float.
            with mp.Pool(
                processes=self._n_worker_processes(len(smiles) // 50),
                initializer=self._parallel_init_worker_get_cansmi_mol_tuple,
                initargs=(smiles_guaranteed_rdkit_canonical,),
            ) as pool:
                # imap yields results as they complete (in input order) so
                # the progress bar is meaningful; it must be fully consumed
                # before leaving the with block terminates the pool
                canonical_smiles_to_mol_dict = dict(
                    tqdm(
                        pool.imap(self._parallel_get_cansmi_tuple, smiles, chunksize=1),
                        desc="Uniquifying input smiles (parallel)",
                        total=len(smiles),
                    )
                )

        # Make sure none are already in the database
        canonical_smiles_to_mol_dict = {
            cansmi: mol
            for cansmi, mol in tqdm(
                canonical_smiles_to_mol_dict.items(),
                desc="Checking if mols already exist in the database",
            )
            if not self.fepop_exists(cansmi)
        }
        print(
            f"Got {len(canonical_smiles_to_mol_dict)} unique molecules not already in the database"
        )
        return canonical_smiles_to_mol_dict

    def calc_similarity(
        self,
        fepops_features_1: Union[np.ndarray, str, Chem.rdchem.Mol, None],
        fepops_features_2: Union[np.ndarray, str, Chem.rdchem.Mol, None, list],
        is_canonical=True,
    ):
        """Calculate FEPOPS similarity

        Calculates molecular similarity based on FEPOPS descriptors.
        SMILES strings and RDKit mols are converted to descriptors with
        self.get_fepops before the score is calculated by
        self.openfepops_object.calc_similarity.

        Parameters
        ----------
        fepops_features_1 : Union[np.ndarray, str, Chem.rdchem.Mol, None]
            A Numpy array containing the FEPOPS descriptors of the query
            molecule, or a smiles string or RDKit mol from which to obtain
            FEPOPS descriptors for the query molecule. If None, np.nan is
            returned.
        fepops_features_2 : Union[np.ndarray, str, Chem.rdchem.Mol, None, list]
            A Numpy array containing the FEPOPS descriptors of the candidate
            molecule, or a smiles string or RDKit mol from which to obtain
            FEPOPS descriptors for the candidate molecule. Can also be None,
            in which case np.nan is returned as a score, or a list of any of
            these. If it is a list, then the result of scoring the query
            against every list entry is returned.
        is_canonical : bool, optional
            Passed on to self.get_fepops for every lookup, by default True

        Returns
        -------
        float
            Fepops similarity between two molecules, or np.nan if
            descriptors could not be obtained for either of them. If a list
            was supplied as fepops_features_2, the return value of
            self.openfepops_object.calc_similarity for that list.
        """
        if fepops_features_1 is None:
            return np.nan
        if isinstance(fepops_features_1, (str, Chem.rdchem.Mol)):
            status, fepops_features_1 = self.get_fepops(
                fepops_features_1, is_canonical=is_canonical
            )
            if status != GetFepopStatusCode.SUCCESS:
                return np.nan

        if isinstance(fepops_features_2, list):
            candidate_features = []
            for item in fepops_features_2:
                if item is None:
                    # None entries are documented as valid, keep them as
                    # placeholders instead of failing the type check in
                    # get_fepops
                    candidate_features.append(None)
                    continue
                status, fpop = self.get_fepops(item, is_canonical=is_canonical)
                candidate_features.append(
                    fpop if status == GetFepopStatusCode.SUCCESS else None
                )
            return self.openfepops_object.calc_similarity(
                fepops_features_1, candidate_features
            )

        if isinstance(fepops_features_2, (str, Chem.rdchem.Mol)):
            status, fepops_features_2 = self.get_fepops(
                fepops_features_2, is_canonical=is_canonical
            )
            if status != GetFepopStatusCode.SUCCESS:
                return np.nan
        if any(x is None for x in (fepops_features_1, fepops_features_2)):
            return np.nan
        score = self.openfepops_object.calc_similarity(
            fepops_features_1, fepops_features_2
        )
        return score if score is not None else np.nan

    def write(self):
        """Flush pending data to persistent storage

        Does nothing in the base class. Storage backends which cannot write
        incrementally may override it; it is called by __exit__ when the
        object is used as a context manager.
        """
        pass

    def __enter__(self):
        """Enter the context manager, returning the storage object itself"""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the context manager, calling write()

        write() is called whether or not an exception occurred inside the
        with block. Nothing is returned, so exceptions are not suppressed.
        """
        self.write()