Source code for fepops.fepops_persistent.fepopsdb_json

import bz2
import json
from base64 import b64decode, b64encode
from pathlib import Path
from typing import Union

import numpy as np

from .fepops_persistent_abc import FepopsPersistentAbstractBaseClass


class FepopsDBJSON(FepopsPersistentAbstractBaseClass):
    """FepopsDBJSON - allows reading and writing to a simple JSON style cache

    The cache is held in memory as ``self.db``, a dict keyed by RDKit
    canonical SMILES. Each value is the ASCII text of the base64 encoded,
    bz2 compressed float64 bytes of the FEPOPS descriptor array. A
    single-element NaN array marks a SMILES for which no FEPOPS could be
    generated.

    Changes are only persisted when ``write()`` is called, which also
    happens automatically on leaving a ``with`` block.
    """

    def __init__(
        self,
        database_file: Union[str, Path],
        kmeans_method: str = "sklearn",
        parallel: bool = True,
        n_jobs: int = -1,
    ):
        """FepopsDBJSON constructor

        Allows reading and writing to a JSON file in place of a
        database/cache

        Parameters
        ----------
        database_file : Union[str, Path]
            Filename as a Path or string denoting where the file is, or
            should be created
        kmeans_method : str, optional
            KMeans method which should be used by the OpenFEPOPS object, by
            default "sklearn"
        parallel : bool, optional
            Run in parallel (using joblib), by default True
        n_jobs : int, optional
            Number of jobs to be spawned with joblib. If -1, then use all
            available cores. By default -1
        """
        super().__init__(
            database_file=database_file,
            kmeans_method=kmeans_method,
            parallel=parallel,
            n_jobs=n_jobs,
        )
        # NOTE(review): self.database_file is presumably set (as a Path) by
        # the base class constructor - confirm against the ABC.
        if not self.database_file.parent.exists():
            # exist_ok guards against another process creating the directory
            # between the check above and this call.
            self.database_file.parent.mkdir(parents=True, exist_ok=True)
        if self.database_file.exists():
            # Context manager so the handle is closed deterministically.
            with open(self.database_file, "r", encoding="utf-8") as db_handle:
                self.db = json.load(db_handle)
        else:
            self.db = {}
        self._db_changed = False
        self._was_written = False

    def add_fepop(
        self, rdkit_canonical_smiles: str, fepops: Union[np.ndarray, None]
    ) -> None:
        """Add a FEPOP to the database using the supplied SMILES as a key

        An existing entry under the same SMILES is left untouched.

        Parameters
        ----------
        rdkit_canonical_smiles : str
            Canonical SMILES string generated by RDKit which represents the
            molecule used to generate the FEPOPS
        fepops : Union[np.ndarray, None]
            Array containing calculated FEPOPS descriptors. If None, then a
            single NaN is stored in the database, which is useful for
            indicating that the canonical SMILES supplied did not succeed in
            generating a molecule and subsequent FEPOPS. Marking these
            difficult SMILES in the database means they can be checked and
            ignored without further time being spent to regenerate them
            again.
        """
        if fepops is None:
            # np.nan rather than np.NaN: the latter alias was removed in
            # NumPy 2.0.
            fepops = np.array([np.nan])
        super().add_fepop(rdkit_canonical_smiles=rdkit_canonical_smiles, fepops=fepops)
        if not self.fepop_exists(rdkit_canonical_smiles=rdkit_canonical_smiles):
            # get_fepops decodes with np.frombuffer's default dtype
            # (float64), so always serialise as float64 to keep the round
            # trip lossless whatever dtype the caller supplied.
            raw_bytes = np.asarray(fepops, dtype=np.float64).tobytes()
            self.db[rdkit_canonical_smiles] = b64encode(
                bz2.compress(raw_bytes)
            ).decode("ascii")
            self._db_changed = True
            # A new unsaved entry invalidates any earlier write().
            self._was_written = False

    def get_fepops(
        self, smiles: str, is_canonical: bool = False
    ) -> Union[np.ndarray, None]:
        """Get FEPOPS from the database for a given SMILES

        If the SMILES is not yet in the database, FEPOPS are generated with
        the OpenFEPOPS object, added to the database and returned.

        Parameters
        ----------
        smiles : str
            The SMILES string of the molecule
        is_canonical : bool, optional
            If True, then we guarantee that the SMILES string supplied is
            canonical and generated by RDKit and in which case, we may skip
            a cleaning step, by default False

        Returns
        -------
        Union[np.ndarray, None]
            Returns an array representing the retrieved FEPOPS, or None if
            None was stored in the database under the supplied SMILES key.
            On a cache miss, the value returned by the OpenFEPOPS object is
            passed back unchanged.
        """
        super().get_fepops(smiles=smiles)
        if not is_canonical:
            smiles, mol = self._get_can_smi_mol_tuple(smiles)
        if self.fepop_exists(rdkit_canonical_smiles=smiles):
            res = np.frombuffer(bz2.decompress(b64decode(self.db[smiles].encode())))
            # A stored NaN marks a SMILES which failed FEPOPS generation.
            if np.isnan(res).any():
                return None
            return res.reshape(
                -1,
                self.openfepops_object.num_centroids_per_fepop
                * self.openfepops_object.num_features_per_fepop,
            )
        if is_canonical:
            # The canonicalisation step was skipped above, so no molecule
            # has been built yet; build it now but keep the caller's SMILES
            # as the database key. Without this, `mol` would be unbound.
            _, mol = self._get_can_smi_mol_tuple(smiles)
        new_fepops = self.openfepops_object.get_fepops(mol)
        self.add_fepop(rdkit_canonical_smiles=smiles, fepops=new_fepops)
        return new_fepops

    def fepop_exists(self, rdkit_canonical_smiles: str) -> bool:
        """Check if Fepop exists in the database

        Parameters
        ----------
        rdkit_canonical_smiles : str
            Canonical smiles to check

        Returns
        -------
        bool
            True if the canonical smiles exists as a key in the database
        """
        return rdkit_canonical_smiles in self.db

    def write(self) -> None:
        """Write any changes to the database/cache to the original file

        Does nothing if no entries were added since the file was loaded or
        last written.
        """
        if self._db_changed:
            # Context manager guarantees the data is flushed and the handle
            # closed before the write is recorded as done.
            with open(self.database_file, "w", encoding="utf-8") as db_handle:
                json.dump(self.db, db_handle)
            self._db_changed = False
            self._was_written = True

    def __enter__(self):
        """Enter a context, returning this object"""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave a context, writing any pending changes to file"""
        self.write()

    def __del__(self):
        """Warn if new FEPOPS were added but never written to file"""
        # getattr with defaults: if __init__ raised before the flags were
        # set (e.g. malformed JSON), the attributes do not exist.
        if getattr(self, "_db_changed", False) and not getattr(
            self, "_was_written", False
        ):
            print(
                "New fepops were added but changes were not written. Either call .write() or use FepopsDBJSON in a context, like with FepopsDBJSON() as fepops_jsondb..."
            )