import logging import re from abc import ABC from collections import defaultdict from typing import List, Optional, Dict from indigo import Indigo, IndigoException, IndigoObject from indigo.renderer import IndigoRenderer from rdkit import Chem from rdkit import RDLogger from rdkit.Chem import MACCSkeys from rdkit.Chem import rdChemReactions from rdkit.Chem.Draw import rdMolDraw2D from rdkit.Chem.MolStandardize import rdMolStandardize from rdkit.Chem.rdmolops import GetMolFrags from rdkit.Contrib.IFG import ifg logger = logging.getLogger(__name__) RDLogger.DisableLog('rdApp.*') # from rdkit import rdBase # rdBase.LogToPythonLogger() # pylog = logging.getLogger("rdkit") # pylog.propagate = False # pylog.disabled = True # print(pylog.disabled) class ProductSet(object): def __init__(self, product_set: List[str]): self.product_set = product_set def __repr__(self): return self.product_set.__repr__() def __len__(self): return len(self.product_set) def __iter__(self): return iter(self.product_set) def __eq__(self, other): return isinstance(other, ProductSet) and sorted(self.product_set) == sorted(other.product_set) def __hash__(self): return hash('-'.join(sorted(self.product_set))) class PredictionResult(object): def __init__(self, product_sets: List['ProductSet'], probability: float, rule: Optional['Rule'] = None): self.product_sets = product_sets self.probability = probability self.rule = rule def __len__(self): return len(self.product_sets) def __iter__(self): return iter(self.product_sets) def __repr__(self): return f"--{self.probability}/{self.rule}--> {self.product_sets}" class FormatConverter(object): @staticmethod def from_smiles(smiles): return Chem.MolFromSmiles(smiles) @staticmethod def to_smiles(mol, canonical=False): return Chem.MolToSmiles(mol, canonical=canonical) @staticmethod def InChIKey(smiles): return Chem.MolToInchiKey(FormatConverter.from_smiles(smiles)) @staticmethod def canonicalize(smiles: str): return FormatConverter.to_smiles(FormatConverter.from_smiles(smiles), canonical=True) @staticmethod def maccs(smiles): mol = Chem.MolFromSmiles(smiles) bitvec = MACCSkeys.GenMACCSKeys(mol) return bitvec.ToList() @staticmethod def get_functional_groups(smiles: str) -> List[str]: res = list() try: m = Chem.MolFromSmiles(smiles) fgs = ifg.identify_functional_groups(m) for fg in fgs: # TODO atoms or type? res.append(fg.atoms) except AttributeError: logger.debug(f"Could not get functional groups for {smiles}") return res @staticmethod def to_svg(smiles, mol_size=(200, 150), kekulize=True): mol = FormatConverter.from_smiles(smiles) if kekulize: try: mol = Chem.Kekulize(mol) except: mol = Chem.Mol(mol.ToBinary()) if not mol.GetNumConformers(): Chem.rdDepictor.Compute2DCoords(mol) drawer = rdMolDraw2D.MolDraw2DSVG(*mol_size) opts = drawer.drawOptions() opts.clearBackground = False drawer.DrawMolecule(mol) drawer.FinishDrawing() svg = drawer.GetDrawingText().replace('svg:', '') svg = re.sub("<\?xml.*\?>", '', svg) return svg @staticmethod def to_png(smiles, mol_size=(200, 150), kekulize=True): mol = FormatConverter.from_smiles(smiles) if kekulize: try: Chem.Kekulize(mol) except: mc = Chem.Mol(mol.ToBinary()) if not mc.GetNumConformers(): Chem.rdDepictor.Compute2DCoords(mc) pass @staticmethod def normalize(smiles): # TODO call to AMBIT Service return smiles @staticmethod def ep_standardize(smiles): change = True while change: change = False for standardizer in MATCH_STANDARDIZER: tmp_smiles = standardizer.standardize(smiles) if tmp_smiles != smiles: print(f"change {smiles} to {tmp_smiles}") change = True smiles = tmp_smiles if change is False: print(f"nothing changed") return smiles @staticmethod def standardize(smiles): # Taken from https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/ # follows the steps in # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb # as described **excellently** (by Greg) in # https://www.youtube.com/watch?v=eWTApNX8dJQ mol = Chem.MolFromSmiles(smiles) # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule clean_mol = rdMolStandardize.Cleanup(mol) # if many fragments, get the "parent" (the actual mol we are interested in) parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol) # try to neutralize molecule uncharger = rdMolStandardize.Uncharger() # annoying, but necessary as no convenience method exists uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol) # note that no attempt is made at reionization at this step # nor at ionization at some pH (rdkit has no pKa caculator) # the main aim to to represent all molecules from different sources # in a (single) standard way, for use in ML, catalogue, etc. # te = rdMolStandardize.TautomerEnumerator() # idem # taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol) return Chem.MolToSmiles(uncharged_parent_clean_mol, kekuleSmiles=True) @staticmethod def neutralize_smiles(smiles): mol = Chem.MolFromSmiles(smiles) mol = FormatConverter.neutralize_molecule(mol) return Chem.MolToSmiles(mol) @staticmethod def neutralize_molecule(mol): pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]") at_matches = mol.GetSubstructMatches(pattern) at_matches_list = [y[0] for y in at_matches] if len(at_matches_list) > 0: for at_idx in at_matches_list: atom = mol.GetAtomWithIdx(at_idx) chg = atom.GetFormalCharge() hcount = atom.GetTotalNumHs() atom.SetFormalCharge(0) atom.SetNumExplicitHs(hcount - chg) atom.UpdatePropertyCache() return mol @staticmethod def is_valid_smirks(smirks: str) -> bool: try: rdChemReactions.ReactionFromSmarts(smirks) return True except: return False @staticmethod def apply(smiles: str, smirks: str, preprocess_smiles: bool = True, bracketize: bool = True, standardize: bool = True, kekulize: bool = True) -> List['ProductSet']: logger.debug(f'Applying {smirks} on {smiles}') # If explicitly wanted or rule generates multiple products add brackets around products to capture all if bracketize: # or "." in smirks: smirks = smirks.split('>>')[0] + ">>(" + smirks.split('>>')[1] + ")" # List of ProductSet objects pss = set() try: rxn = rdChemReactions.ReactionFromSmarts(smirks) mol = Chem.MolFromSmiles(smiles) # Inplace if preprocess_smiles: Chem.SanitizeMol(mol) mol = Chem.AddHs(mol) # apply! sites = rxn.RunReactants((mol,)) logger.debug(f"{len(sites)} products sets generated") if len(sites): # Sanitize mols for product_set in sites: prods = [] for product in product_set: try: Chem.SanitizeMol(product) product = GetMolFrags(product, asMols=True) for p in product: p = FormatConverter.standardize(Chem.MolToSmiles(p)) prods.append(p) # if kekulize: # # from rdkit.Chem import MolStandardize # # # # # Attempt re-sanitization via standardizer # # cleaner = MolStandardize.rdMolStandardize.Cleanup() # # mol = cleaner.cleanup(product) # # # Fixes # # # [2025-01-30 23:00:50] ERROR chem - Sanitizing and converting failed: # # # non-ring atom 3 marked aromatic # # # But does not improve overall performance # # # for a in product.GetAtoms(): # # # if (not a.IsInRing()) and a.GetIsAromatic(): # # # a.SetIsAromatic(False) # # # # # # for b in product.GetBonds(): # # # if (not b.IsInRing()) and b.GetIsAromatic(): # # # b.SetIsAromatic(False) # # for atom in product.GetAtoms(): # # atom.SetIsAromatic(False) # # for bond in product.GetBonds(): # # bond.SetIsAromatic(False) # Chem.Kekulize(product) except ValueError as e: logger.error(f'Sanitizing and converting failed:\n{e}') continue if len(prods): ps = ProductSet(prods) pss.add(ps) except Exception as e: logger.error(f'Applying {smirks} on {smiles} failed:\n{e}') return pss # @staticmethod # def apply(reaction, smiles): # rxn = AllChem.ReactionFromSmarts(reaction) # return [Chem.MolToSmiles(x, 1) for x in rxn.RunReactants((Chem.MolFromSmiles(smiles),))[0]] @staticmethod def MACCS(smiles): return MACCSkeys.GenMACCSKeys(FormatConverter.from_smiles(smiles)) @staticmethod def neutralize_atoms(mol): pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]") at_matches = mol.GetSubstructMatches(pattern) at_matches_list = [y[0] for y in at_matches] if len(at_matches_list) > 0: for at_idx in at_matches_list: atom = mol.GetAtomWithIdx(at_idx) chg = atom.GetFormalCharge() hcount = atom.GetTotalNumHs() atom.SetFormalCharge(0) atom.SetNumExplicitHs(hcount - chg) atom.UpdatePropertyCache() return mol @staticmethod def sanitize_smiles(smiles_list: List): parsed_smiles = [] errors = 0 for smi in smiles_list: try: # # Remove Stereo and Flatten # if "/" in smi: # smi = smi.replace("/", "") # if "\\" in smi: # smi = smi.replace("\\", "") # if "@" in smi: # smi = smi.replace("@", "") mol = Chem.MolFromSmiles(smi) mol = FormatConverter.neutralize_atoms(mol) mol = Chem.RemoveAllHs(mol) Chem.Kekulize(mol) smi_p = Chem.MolToSmiles(mol, kekuleSmiles=True) smi_p = Chem.CanonSmiles(smi_p) if '~' in smi_p: smi_p1 = smi_p.replace('~', '') parsed_smiles.append(smi_p1) else: parsed_smiles.append(smi_p) except Exception as e: errors += 1 pass return parsed_smiles, errors class Standardizer(ABC): def __init__(self, name): self.name = name def standardize(self, smiles: str) -> str: return FormatConverter.normalize(smiles) class RuleStandardizer(Standardizer): def __init__(self, name, smirks): super().__init__(name) self.smirks = smirks def standardize(self, smiles: str) -> str: standardized_smiles = list(set(FormatConverter.apply(smiles, self.smirks))) if len(standardized_smiles) > 1: logger.warning(f'{self.smirks} generated more than 1 compound {standardized_smiles}') print(f'{self.smirks} generated more than 1 compound {standardized_smiles}') standardized_smiles = standardized_smiles[:1] if standardized_smiles: smiles = standardized_smiles[0] return super().standardize(smiles) class RegExStandardizer(Standardizer): def __init__(self, name, replacements: dict): super().__init__(name) self.replacements = replacements def standardize(self, smiles: str) -> str: smi = smiles mod_smi = smiles for k, v in self.replacements.items(): mod_smi = smi.replace(k, v) while mod_smi != smi: mod_smi = smi for k, v in self.replacements.items(): smi = smi.replace(k, v) return super().standardize(smi) FLATTEN = [ RegExStandardizer("Remove Stereo", {"@": ""}) ] UN_CIS_TRANS = [ RegExStandardizer("Un-Cis-Trans", {"/": "", "\\": ""}) ] BASIC = [ RuleStandardizer("ammoniumstandardization", "[H][N+:1]([H])([H])[#6:2]>>[H][#7:1]([H])-[#6:2]"), RuleStandardizer("cyanate", "[H][#8:1][C:2]#[N:3]>>[#8-:1][C:2]#[N:3]"), RuleStandardizer("deprotonatecarboxyls", "[H][#8:1]-[#6:2]=[O:3]>>[#8-:1]-[#6:2]=[O:3]"), RuleStandardizer("forNOOH", "[H][#8:1]-[#7+:2](-[*:3])=[O:4]>>[#8-:1]-[#7+:2](-[*:3])=[O:4]"), RuleStandardizer("Hydroxylprotonation", "[#6;A:1][#6:2](-[#8-:3])=[#6;A:4]>>[#6:1]-[#6:2](-[#8:3][H])=[#6;A:4]"), RuleStandardizer("phosphatedeprotonation", "[H][#8:1]-[$([#15]);!$(P([O-])):2]>>[#8-:1]-[#15:2]"), RuleStandardizer("PicricAcid", "[H][#8:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]>>[#8-:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]"), RuleStandardizer("Sulfate1", "[H][#8:1][S:2]([#8:3][H])(=[O:4])=[O:5]>>[#8-:1][S:2]([#8-:3])(=[O:4])=[O:5]"), RuleStandardizer("Sulfate2", "[#6:1]-[#8:2][S:3]([#8:4][H])(=[O:5])=[O:6]>>[#6:1]-[#8:2][S:3]([#8-:4])(=[O:5])=[O:6]"), RuleStandardizer("Sulfate3", "[H][#8:3][S:2]([#6:1])(=[O:4])=[O:5]>>[#6:1][S:2]([#8-:3])(=[O:4])=[O:5]"), RuleStandardizer("Transform_c1353forSOOH", "[H][#8:1][S:2]([*:3])=[O:4]>>[#8-:1][S:2]([*:3])=[O:4]"), ] ENHANCED = BASIC + [ RuleStandardizer("fullPhosphatedeprotonation", "[H][#8:1]-[#15:2]>>[#8-:1]-[#15:2]") ] EXOTIC = ENHANCED + [ RuleStandardizer("ThioPhosphate1", "[H][S:1]-[#15:2]=[$([#16]),$([#8]):3]>>[S-:1]-[#15:2]=[$([#16]),$([#8]):3]") ] COA_CUTTER = [ RuleStandardizer("CutCoEnzymeAOff", "CC(C)(COP(O)(=O)OP(O)(=O)OCC1OC(C(O)C1OP(O)(O)=O)n1cnc2c(N)ncnc12)C(O)C(=O)NCCC(=O)NCCS[$(*):1]>>[O-][$(*):1]") ] ENOL_KETO = [ RuleStandardizer("enol2Ketone", "[H][#8:2]-[#6:3]=[#6:1]>>[#6:1]-[#6:3]=[O:2]") ] MATCH_STANDARDIZER = EXOTIC + FLATTEN + UN_CIS_TRANS + COA_CUTTER + ENOL_KETO class IndigoUtils(object): @staticmethod def layout(mol_data): i = Indigo() try: if mol_data.startswith('$RXN') or '>>' in mol_data: rxn = i.loadQueryReaction(mol_data) rxn.layout() return rxn.rxnfile() else: mol = i.loadQueryMolecule(mol_data) mol.layout() return mol.molfile() except IndigoException as e: try: logger.info("layout() failed, trying loadReactionSMARTS as fallback!") rxn = IndigoUtils.load_reaction_SMARTS(mol_data) rxn.layout() return rxn.molfile() except IndigoException as e2: logger.error(f'layout() failed due to {e2}!') @staticmethod def load_reaction_SMARTS(mol): return Indigo().loadReactionSmarts(mol) @staticmethod def aromatize(mol_data, is_query): i = Indigo() try: if mol_data.startswith('$RXN'): if is_query: rxn = i.loadQueryReaction(mol_data) else: rxn = i.loadReaction(mol_data) rxn.aromatize() return rxn.rxnfile() else: if is_query: mol = i.loadQueryMolecule(mol_data) else: mol = i.loadMolecule(mol_data) mol.aromatize() return mol.molfile() except IndigoException as e: try: logger.info("Aromatizing failed, trying loadReactionSMARTS as fallback!") rxn = IndigoUtils.load_reaction_SMARTS(mol_data) rxn.aromatize() return rxn.molfile() except IndigoException as e2: logger.error(f'Aromatizing failed due to {e2}!') @staticmethod def dearomatize(mol_data, is_query): i = Indigo() try: if mol_data.startswith('$RXN'): if is_query: rxn = i.loadQueryReaction(mol_data) else: rxn = i.loadReaction(mol_data) rxn.dearomatize() return rxn.rxnfile() else: if is_query: mol = i.loadQueryMolecule(mol_data) else: mol = i.loadMolecule(mol_data) mol.dearomatize() return mol.molfile() except IndigoException as e: try: logger.info("De-Aromatizing failed, trying loadReactionSMARTS as fallback!") rxn = IndigoUtils.load_reaction_SMARTS(mol_data) rxn.dearomatize() return rxn.molfile() except IndigoException as e2: logger.error(f'De-Aromatizing failed due to {e2}!') @staticmethod def sanitize_functional_group(functional_group: str): counter = 0 while True: counter += 1 copy = functional_group # special environment handling (amines, hydroxy, esters, ethers) # the higher substituted should not contain H env. if functional_group == '[C]=O': functional_group = "[H][C](=O)[CX4,c]" # aldamines if functional_group == "O=[C]N(R)R": functional_group = "O=[C]([H])N(R)R" # ether, ester, ketones, amines, thioether vals = [ "ROR", "O=C(R)OR", "[H]N(R)R", "O=C(R)R", "RN(R)R", "RSR", ] if functional_group in vals: functional_group = functional_group.replace("R", "[CX4,c]") # esters, ketones, amides functional_group = functional_group.replace("O=C(R)", "O=C([CX4,c])") if functional_group == "RS*R" or functional_group == "RO*R": # neighboring atoms can be any aromatic atom, but they should not be highlighted functional_group = functional_group.replace("R", "") # aromatic compounds: aromatic atoms are denoted with * # single aromatic heteroatoms, not substituted # unsubstituted aromatic nitrogen if functional_group == "RN*(R)R": functional_group = "[nH1,nX2](a)a" # pyrrole (with H) or pyridine (no other connections); currently overlaps with neighboring aromatic atoms # substituted aromatic nitrogen functional_group = functional_group.replace("N*(R)R", "n(a)a") # substituent will be before N*; currently overlaps with neighboring aromatic atoms # pyridinium if functional_group == "RN*(R)(R)(R)R": functional_group = "[CX4,c]n(a)a" # currently overlaps with neighboring aromatic atoms # N-oxide if functional_group == "[H]ON*(R)(R)(R)R": functional_group = "[O-][n+](a)a" # currently overlaps with neighboring aromatic atoms # other aromatic hetero atoms functional_group = functional_group.replace("C*", "c") functional_group = functional_group.replace("N*", "n") functional_group = functional_group.replace("S*", "s") functional_group = functional_group.replace("O*", "o") functional_group = functional_group.replace("Se*", "se") functional_group = functional_group.replace("P*", "p") # other replacement, to accomodate for the standardization rules in enviPath # This is not the perfect way to do it; there should be a way to replace substructure SMARTS in SMARTS? # nitro groups are broken, due to charge handling. this SMARTS matches both forms (formal charges and hypervalent); Ertl-CDK still treats both forms separately... functional_group = functional_group.replace("[H]O[N](=O)R", "[CX4,c][NX3](~[OX1])~[OX1]") functional_group = functional_group.replace("O=N(=O)R", "[CX4,c][NX3](~[OX1])~[OX1]") # carboxylic acid: this SMARTS matches both neutral and anionic form; includes COOH in larger functional_groups functional_group = functional_group.replace("[H]OC(=O)", "[OD1]C(=O)") # azo functional_group = functional_group.replace("N#[N]N(R)R", "[NX1]~[NX2]~N[CX4,c]") functional_group = functional_group.replace("RN=[N]=NR", "[NX1]~[NX2]~N[CX4,c]") # TODO: there might be more problematic groups, which we have yet to find. # other environment atoms (can be aromatic or aliphatic Csp3, or H) functional_group = functional_group.replace("R", "[H,CX4,c]") if copy == functional_group: break return functional_group @staticmethod def _colorize(indigo: Indigo, molecule: IndigoObject, functional_groups: Dict[str, int], is_reaction: bool): indigo.setOption("render-atom-color-property", "color") indigo.setOption("aromaticity-model", "generic") counts = defaultdict(lambda: 0) environment = set() matcher = indigo.substructureMatcher(molecule) # Determine environment atoms as they will be colored black for env in ["[CX4]", "c"]: query = indigo.loadSmarts(env) for match in matcher.iterateMatches(query): if match is not None: for atom in query.iterateAtoms(): mappedAtom = match.mapAtom(atom) if mappedAtom is None: continue environment.add(mappedAtom.index()) for k, v in functional_groups.items(): sanitized = IndigoUtils.sanitize_functional_group(k) query = indigo.loadSmarts(sanitized) for match in matcher.iterateMatches(query): if match is not None: for atom in query.iterateAtoms(): mappedAtom = match.mapAtom(atom) if mappedAtom is None or mappedAtom.index() in environment: continue counts[mappedAtom.index()] = max(v, counts[mappedAtom.index()]) for k, v in counts.items(): if is_reaction: color = "128, 0, 128" else: if v <= 5: color = "200, 0, 0" else: color = "0, 112, 0" molecule.addDataSGroup([k], [], "color", color) @staticmethod def mol_to_svg(mol_data: str, width: int = 0, height: int = 0, functional_groups: Dict[str, int] = None): if functional_groups is None: functional_groups = {} i = Indigo() renderer = IndigoRenderer(i) i.setOption("render-output-format", "svg") i.setOption("render-coloring", not bool(len(functional_groups.keys()))) i.setOption("render-image-size", width, height) i.setOption("render-bond-line-width", 2.0) mol = i.loadMolecule(mol_data) if len(functional_groups.keys()) > 0: IndigoUtils._colorize(i, mol, functional_groups, False) return renderer.renderToBuffer(mol).decode('UTF-8') @staticmethod def smirks_to_svg(smirks: str, is_query_smirks, width: int = 0, height: int = 0, educt_functional_groups: Dict[str, int] = None, product_functional_groups: Dict[str, int] = None): if educt_functional_groups is None: educt_functional_groups = {} if product_functional_groups is None: product_functional_groups = {} i = Indigo() renderer = IndigoRenderer(i) i.setOption("render-output-format", "svg") i.setOption("render-coloring", True) i.setOption("render-image-size", width, height) if is_query_smirks: obj = i.loadReactionSmarts(smirks) else: obj = i.loadReaction(smirks) if len(educt_functional_groups.keys()) > 0: for react in obj.iterateReactants(): IndigoUtils._colorize(i, react, educt_functional_groups, True) if len(product_functional_groups.keys()) > 0: for prod in obj.iterateProducts(): IndigoUtils._colorize(i, prod, product_functional_groups, True) return renderer.renderToBuffer(obj).decode('UTF-8') if __name__ == '__main__': data = { "struct": "\n Ketcher 2172510 12D 1 1.00000 0.00000 0\n\n 6 6 0 0 0 999 V2000\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.5000 -0.8660 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.0000 -1.7321 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 -1.7321 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5000 -0.8660 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 2 0 0 0 0\n 2 3 1 0 0 0 0\n 3 4 2 0 0 0 0\n 4 5 1 0 0 0 0\n 5 6 2 0 0 0 0\n 6 1 1 0 0 0 0\nM END\n", "options": { "smart-layout": True, "ignore-stereochemistry-errors": True, "mass-skip-error-on-pseudoatoms": False, "gross-formula-add-rsites": True } } print(IndigoUtils.aromatize(data['struct'], False))