Current Dev State

This commit is contained in:
Tim Lorsbach
2025-06-23 20:13:54 +02:00
parent b4f9bb277d
commit ded50edaa2
22617 changed files with 4345095 additions and 174 deletions

0
utilities/__init__.py Normal file
View File

13
utilities/biodeg.py Normal file
View File

@ -0,0 +1,13 @@
import abc
from enviPy.epdb import Pathway
class PredictionSchema(abc.ABC):
    """Common base class for prediction schemas; it declares no members itself."""
class DFS(PredictionSchema):
    """Prediction schema bound to a pathway.

    NOTE(review): the name suggests a depth-first expansion, but ``predict``
    is still an empty placeholder -- confirm the intended behaviour.
    """

    def __init__(self, pw: Pathway, settings=None):
        """Remember which settings to use.

        Any falsy ``settings`` value (not only ``None``) is replaced by
        ``pw.prediction_settings``.
        """
        self.setting = settings if settings else pw.prediction_settings

    def predict(self):
        """Placeholder: does nothing and returns ``None``."""
        return None

732
utilities/chem.py Normal file
View File

@ -0,0 +1,732 @@
import logging
import re
from abc import ABC
from collections import defaultdict
from typing import List, Optional, Dict
from indigo import Indigo, IndigoException, IndigoObject
from indigo.renderer import IndigoRenderer
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.MolStandardize import rdMolStandardize
logger = logging.getLogger(__name__)
RDLogger.DisableLog('rdApp.*')
# from rdkit import rdBase
# rdBase.LogToPythonLogger()
# pylog = logging.getLogger("rdkit")
# pylog.propagate = False
# pylog.disabled = True
# print(pylog.disabled)
class ProductSet(object):
    """Wrapper around a list of product SMILES that compares order-insensitively.

    Iteration, ``len`` and ``repr`` reflect the wrapped list as given; equality
    and hashing are based on the sorted list.
    """

    def __init__(self, product_set: List[str]):
        self.product_set = product_set

    def _sorted_products(self):
        # Shared by __eq__ and __hash__ so both use the same ordering.
        return sorted(self.product_set)

    def __repr__(self):
        return repr(self.product_set)

    def __len__(self):
        return len(self.product_set)

    def __iter__(self):
        return iter(self.product_set)

    def __eq__(self, other):
        if not isinstance(other, ProductSet):
            return False
        return self._sorted_products() == other._sorted_products()

    def __hash__(self):
        joined = '-'.join(self._sorted_products())
        return hash(joined)
class PredictionResult(object):
    """Bundle of product sets with the probability and (optional) rule that produced them."""

    def __init__(self, product_sets: List['ProductSet'], probability: float, rule: Optional['Rule'] = None):
        self.product_sets = product_sets
        self.probability = probability
        self.rule = rule

    def __len__(self):
        # Number of product sets, not number of individual products.
        return len(self.product_sets)

    def __iter__(self):
        return iter(self.product_sets)

    def __repr__(self):
        prefix = f"--{self.probability}/{self.rule}-->"
        return f"{prefix} {self.product_sets}"
class FormatConverter(object):
    """Static RDKit-based helpers for converting, depicting and cleaning up SMILES."""

    # Charged atoms whose charge can be removed by adjusting the hydrogen count:
    # +1 atoms with a non-zero implicit-H count and no negatively charged
    # neighbour, or -1 atoms without a positively charged neighbour.
    _NEUTRALIZABLE_ATOMS = "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]"

    @staticmethod
    def from_smiles(smiles):
        """Parse ``smiles`` with ``Chem.MolFromSmiles`` and return the result."""
        return Chem.MolFromSmiles(smiles)

    @staticmethod
    def to_smiles(mol):
        """Return the SMILES string RDKit generates for ``mol``."""
        return Chem.MolToSmiles(mol)

    @staticmethod
    def InChIKey(smiles):
        """Return the InChIKey of the molecule described by ``smiles``."""
        return Chem.MolToInchiKey(FormatConverter.from_smiles(smiles))

    @staticmethod
    def maccs(smiles):
        """Return the MACCS keys of ``smiles`` as a plain list (see ``MACCS`` for the bit vector)."""
        mol = Chem.MolFromSmiles(smiles)
        bitvec = MACCSkeys.GenMACCSKeys(mol)
        return bitvec.ToList()

    @staticmethod
    def _drawable_copy(mol, kekulize):
        """Return a copy of ``mol`` with 2D coordinates, kekulized when possible.

        ``Chem.Kekulize`` modifies its argument in place and returns ``None``,
        so it is applied to a copy; if it fails a fresh, untouched copy is used.
        """
        mc = Chem.Mol(mol.ToBinary())
        if kekulize:
            try:
                Chem.Kekulize(mc)
            except Exception:
                mc = Chem.Mol(mol.ToBinary())
        if not mc.GetNumConformers():
            Chem.rdDepictor.Compute2DCoords(mc)
        return mc

    @staticmethod
    def to_svg(smiles, mol_size=(200, 150), kekulize=True):
        """Render ``smiles`` as an SVG string.

        :param smiles: SMILES of the molecule to draw.
        :param mol_size: ``(width, height)`` passed to the SVG drawer.
        :param kekulize: try to kekulize the molecule before drawing.
        :return: SVG markup without the ``svg:`` namespace prefix and without
            the leading XML declaration.
        """
        mol = FormatConverter._drawable_copy(FormatConverter.from_smiles(smiles), kekulize)
        drawer = rdMolDraw2D.MolDraw2DSVG(*mol_size)
        opts = drawer.drawOptions()
        opts.clearBackground = False
        drawer.DrawMolecule(mol)
        drawer.FinishDrawing()
        svg = drawer.GetDrawingText().replace('svg:', '')
        svg = re.sub(r"<\?xml.*\?>", '', svg)
        return svg

    @staticmethod
    def to_png(smiles, mol_size=(200, 150), kekulize=True):
        """Prepare ``smiles`` for PNG rendering.

        TODO: the actual rendering is not implemented yet; the molecule is only
        prepared (kekulized, 2D coordinates) and ``None`` is returned.
        """
        FormatConverter._drawable_copy(FormatConverter.from_smiles(smiles), kekulize)
        return None

    @staticmethod
    def normalize(smiles):
        """Return ``smiles`` unchanged (placeholder)."""
        # TODO call to AMBIT Service
        return smiles

    @staticmethod
    def standardize(smiles):
        """Return a cleaned-up, neutralised, kekulized SMILES of the parent fragment.

        Steps: ``Cleanup`` -> ``FragmentParent`` -> ``Uncharger.uncharge`` ->
        ``MolToSmiles(kekuleSmiles=True)``.
        """
        # Taken from https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
        # follows the steps in
        # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
        # as described **excellently** (by Greg) in
        # https://www.youtube.com/watch?v=eWTApNX8dJQ
        mol = Chem.MolFromSmiles(smiles)

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        # note that no attempt is made at reionization at this step
        # nor at ionization at some pH (rdkit has no pKa calculator)
        # the main aim is to represent all molecules from different sources
        # in a (single) standard way, for use in ML, catalogue, etc.
        # Tautomer canonicalisation (rdMolStandardize.TautomerEnumerator) is
        # deliberately not applied here.
        return Chem.MolToSmiles(uncharged_parent_clean_mol, kekuleSmiles=True)

    @staticmethod
    def neutralize_smiles(smiles):
        """Parse ``smiles``, neutralise charged atoms and return the resulting SMILES."""
        mol = Chem.MolFromSmiles(smiles)
        mol = FormatConverter.neutralize_molecule(mol)
        return Chem.MolToSmiles(mol)

    @staticmethod
    def neutralize_molecule(mol):
        """Neutralise matching charged atoms of ``mol`` in place and return ``mol``.

        Every atom matching ``_NEUTRALIZABLE_ATOMS`` gets formal charge 0 and
        its explicit hydrogen count set to ``total Hs - former charge``.
        """
        pattern = Chem.MolFromSmarts(FormatConverter._NEUTRALIZABLE_ATOMS)
        for match in mol.GetSubstructMatches(pattern):
            atom = mol.GetAtomWithIdx(match[0])
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
        return mol

    @staticmethod
    def apply(smiles: str, smirks: str, preprocess_smiles: bool = True, bracketize: bool = False,
              standardize: bool = True, kekulize: bool = True) -> List['ProductSet']:
        """Apply the reaction ``smirks`` to ``smiles`` and collect the product sets.

        :param smiles: educt SMILES.
        :param smirks: reaction SMIRKS/SMARTS; must contain ``>>`` when
            ``bracketize`` is set.
        :param preprocess_smiles: sanitize the educt and add explicit hydrogens
            before running the reaction.
        :param bracketize: wrap the product side in parentheses.
        :param standardize: currently unused; products are always standardized.
        :param kekulize: currently unused.
        :return: the distinct ``ProductSet`` objects. NOTE: despite the
            annotation this is a ``set``; it is empty when the reaction fails
            (the failure is logged, not raised).
        """
        logger.debug(f'Applying {smirks} on {smiles}')

        # If explicitly wanted add brackets around products to capture all
        if bracketize:
            sides = smirks.split('>>')
            smirks = sides[0] + ">>(" + sides[1] + ")"

        pss = set()
        try:
            rxn = rdChemReactions.ReactionFromSmarts(smirks)
            mol = Chem.MolFromSmiles(smiles)

            if preprocess_smiles:
                Chem.SanitizeMol(mol)  # in place
                mol = Chem.AddHs(mol)

            sites = rxn.RunReactants((mol,))
            logger.debug(f"{len(sites)} products sets generated")

            for product_set in sites:
                prods = []
                for product in product_set:
                    # NOTE: clearing aromatic flags on non-ring atoms/bonds was
                    # tried against "non-ring atom N marked aromatic" failures
                    # but did not improve overall performance.
                    try:
                        Chem.SanitizeMol(product)
                        prods.append(FormatConverter.standardize(Chem.MolToSmiles(product)))
                    except ValueError as e:
                        logger.error(f'Sanitizing and converting failed:\n{e}')

                # Keep a product set only if every one of its products could be
                # sanitized and standardized; partially converted sets are dropped.
                if len(prods) and len(prods) == len(product_set):
                    pss.add(ProductSet(prods))

        except Exception as e:
            logger.error(f'Applying {smirks} on {smiles} failed:\n{e}')

        return pss

    @staticmethod
    def MACCS(smiles):
        """Return the MACCS keys of ``smiles`` as the bit vector RDKit produces."""
        return MACCSkeys.GenMACCSKeys(FormatConverter.from_smiles(smiles))

    @staticmethod
    def neutralize_atoms(mol):
        """Alias of :meth:`neutralize_molecule`, kept for existing callers."""
        return FormatConverter.neutralize_molecule(mol)

    @staticmethod
    def sanitize_smiles(smiles_list: List):
        """Neutralise, de-hydrogenate, kekulize and canonicalise each SMILES.

        Stereo markers are left untouched; any ``~`` in the result is removed.

        :return: ``(parsed_smiles, errors)`` -- the successfully processed
            SMILES in input order and the number of entries that failed.
        """
        parsed_smiles = []
        errors = 0
        for smi in smiles_list:
            try:
                mol = Chem.MolFromSmiles(smi)
                mol = FormatConverter.neutralize_atoms(mol)
                mol = Chem.RemoveAllHs(mol)
                Chem.Kekulize(mol)
                smi_p = Chem.MolToSmiles(mol, kekuleSmiles=True)
                smi_p = Chem.CanonSmiles(smi_p)
                parsed_smiles.append(smi_p.replace('~', ''))
            except Exception as e:
                # Best effort: count the failure and keep going.
                errors += 1
                logger.debug(f'Could not sanitize {smi}: {e}')
        return parsed_smiles, errors
class Standardizer(ABC):
    """Named standardization step; the base implementation only normalizes."""

    def __init__(self, name):
        self.name = name

    def standardize(self, smiles: str) -> str:
        """Return ``FormatConverter.normalize(smiles)``."""
        normalized = FormatConverter.normalize(smiles)
        return normalized
class RuleStandardizer(Standardizer):
    """Standardizer that rewrites a SMILES by applying a SMIRKS rule."""

    def __init__(self, name, smirks):
        super().__init__(name)
        self.smirks = smirks

    def standardize(self, smiles: str) -> str:
        """Apply ``self.smirks`` to ``smiles`` and return the resulting SMILES.

        ``FormatConverter.apply`` yields product sets, so their product SMILES
        are flattened here; returning a product set itself would violate the
        ``str`` contract that follow-up standardizers rely on. If the rule does
        not produce anything the input is passed on unchanged. If it produces
        several distinct products a warning is logged and the first one in
        sorted order is used, so that the choice is reproducible.
        """
        product_sets = FormatConverter.apply(smiles, self.smirks)
        standardized_smiles = sorted({product for product_set in product_sets for product in product_set})

        if len(standardized_smiles) > 1:
            logger.warning(f'{self.smirks} generated more than 1 compound {standardized_smiles}')

        if standardized_smiles:
            smiles = standardized_smiles[0]

        return super().standardize(smiles)
class RegExStandardizer(Standardizer):
    """Standardizer that performs plain substring replacements on the SMILES.

    Despite the name, the replacements are applied with ``str.replace``; the
    keys are literal substrings, not regular expressions.
    """

    def __init__(self, name, replacements: dict):
        super().__init__(name)
        self.replacements = replacements

    def standardize(self, smiles: str) -> str:
        """Apply all replacements repeatedly until the SMILES stops changing.

        Every pass applies each ``key -> value`` replacement in turn to the
        running result, so a change made by *any* entry (not just the last one
        in the mapping) is kept and triggers another pass.
        """
        smi = smiles
        while True:
            replaced = smi
            for k, v in self.replacements.items():
                replaced = replaced.replace(k, v)
            if replaced == smi:
                break
            smi = replaced
        return super().standardize(smi)
# Predefined standardizer pipelines. Each list holds Standardizer instances;
# the lists are combined by plain concatenation, so order is preserved.

# Removes every "@" character from the SMILES.
FLATTEN = [
    RegExStandardizer("Remove Stereo", {"@": ""})
]
# Removes every "/" and "\" character from the SMILES.
UN_CIS_TRANS = [
    RegExStandardizer("Un-Cis-Trans", {"/": "", "\\": ""})
]
# SMIRKS-based rules; most educt patterns use explicit [H] atoms.
BASIC = [
    RuleStandardizer("ammoniumstandardization", "[H][N+:1]([H])([H])[#6:2]>>[H][#7:1]([H])-[#6:2]"),
    RuleStandardizer("cyanate", "[H][#8:1][C:2]#[N:3]>>[#8-:1][C:2]#[N:3]"),
    RuleStandardizer("deprotonatecarboxyls", "[H][#8:1]-[#6:2]=[O:3]>>[#8-:1]-[#6:2]=[O:3]"),
    RuleStandardizer("forNOOH", "[H][#8:1]-[#7+:2](-[*:3])=[O:4]>>[#8-:1]-[#7+:2](-[*:3])=[O:4]"),
    RuleStandardizer("Hydroxylprotonation", "[#6;A:1][#6:2](-[#8-:3])=[#6;A:4]>>[#6:1]-[#6:2](-[#8:3][H])=[#6;A:4]"),
    RuleStandardizer("phosphatedeprotonation", "[H][#8:1]-[$([#15]);!$(P([O-])):2]>>[#8-:1]-[#15:2]"),
    RuleStandardizer("PicricAcid",
                     "[H][#8:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]>>[#8-:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]"),
    RuleStandardizer("Sulfate1", "[H][#8:1][S:2]([#8:3][H])(=[O:4])=[O:5]>>[#8-:1][S:2]([#8-:3])(=[O:4])=[O:5]"),
    RuleStandardizer("Sulfate2",
                     "[#6:1]-[#8:2][S:3]([#8:4][H])(=[O:5])=[O:6]>>[#6:1]-[#8:2][S:3]([#8-:4])(=[O:5])=[O:6]"),
    RuleStandardizer("Sulfate3", "[H][#8:3][S:2]([#6:1])(=[O:4])=[O:5]>>[#6:1][S:2]([#8-:3])(=[O:4])=[O:5]"),
    RuleStandardizer("Transform_c1353forSOOH", "[H][#8:1][S:2]([*:3])=[O:4]>>[#8-:1][S:2]([*:3])=[O:4]"),
]
# BASIC plus a rule that matches any O-H bound to phosphorus.
ENHANCED = BASIC + [
    RuleStandardizer("fullPhosphatedeprotonation", "[H][#8:1]-[#15:2]>>[#8-:1]-[#15:2]")
]
# ENHANCED plus a rule for S-H bound to phosphorus.
EXOTIC = ENHANCED + [
    RuleStandardizer("ThioPhosphate1", "[H][S:1]-[#15:2]=[$([#16]),$([#8]):3]>>[S-:1]-[#15:2]=[$([#16]),$([#8]):3]")
]
# Replaces the structure matched on the educt side by [O-] on the mapped atom.
# NOTE(review): by its name this cuts off a coenzyme A moiety -- confirm.
COA_CUTTER = [
    RuleStandardizer("CutCoEnzymeAOff",
                     "CC(C)(COP(O)(=O)OP(O)(=O)OCC1OC(C(O)C1OP(O)(O)=O)n1cnc2c(N)ncnc12)C(O)C(=O)NCCC(=O)NCCS[$(*):1]>>[O-][$(*):1]")
]
ENOL_KETO = [
    RuleStandardizer("enol2Ketone", "[H][#8:2]-[#6:3]=[#6:1]>>[#6:1]-[#6:3]=[O:2]")
]
# Full pipeline: charge rules first, then stereo removal, CoA cutting and enol->ketone.
MATCH_STANDARDIZER = EXOTIC + FLATTEN + UN_CIS_TRANS + COA_CUTTER + ENOL_KETO
class IndigoUtils(object):
    """Static helpers around Indigo: layout, (de)aromatization and SVG rendering."""

    @staticmethod
    def _process(mol_data, operation, label, is_reaction, is_query):
        """Load ``mol_data``, call ``operation`` on it and serialise the result.

        Shared implementation of ``layout``/``aromatize``/``dearomatize``.

        :param operation: name of the Indigo object method to call.
        :param label: text used in the log messages.
        :param is_reaction: load as reaction (result is an RXN file) instead of
            as molecule (result is a MOL file).
        :param is_query: use the query loaders.
        :return: the serialised structure, or ``None`` if both the regular
            attempt and the reaction-SMARTS fallback raise ``IndigoException``.
        """
        indigo = Indigo()
        try:
            if is_reaction:
                load = indigo.loadQueryReaction if is_query else indigo.loadReaction
                rxn = load(mol_data)
                getattr(rxn, operation)()
                return rxn.rxnfile()
            load = indigo.loadQueryMolecule if is_query else indigo.loadMolecule
            mol = load(mol_data)
            getattr(mol, operation)()
            return mol.molfile()
        except IndigoException:
            try:
                logger.info(f"{label} failed, trying loadReactionSMARTS as fallback!")
                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
                getattr(rxn, operation)()
                # The fallback object is a reaction, so it has to be written as
                # RXN file; molfile() is only available for molecules.
                return rxn.rxnfile()
            except IndigoException as e2:
                logger.error(f'{label} failed due to {e2}!')
        return None

    @staticmethod
    def layout(mol_data):
        """Compute a layout for ``mol_data`` and return it as RXN/MOL file.

        Input starting with ``$RXN`` or containing ``>>`` is treated as a
        reaction. Returns ``None`` if Indigo cannot process the input.
        """
        is_reaction = mol_data.startswith('$RXN') or '>>' in mol_data
        return IndigoUtils._process(mol_data, "layout", "layout()", is_reaction, True)

    @staticmethod
    def load_reaction_SMARTS(mol):
        """Load ``mol`` as reaction SMARTS using a fresh ``Indigo`` instance."""
        return Indigo().loadReactionSmarts(mol)

    @staticmethod
    def aromatize(mol_data, is_query):
        """Aromatize ``mol_data`` and return it as RXN/MOL file (``None`` on failure)."""
        return IndigoUtils._process(mol_data, "aromatize", "Aromatizing",
                                    mol_data.startswith('$RXN'), is_query)

    @staticmethod
    def dearomatize(mol_data, is_query):
        """Dearomatize ``mol_data`` and return it as RXN/MOL file (``None`` on failure)."""
        return IndigoUtils._process(mol_data, "dearomatize", "De-Aromatizing",
                                    mol_data.startswith('$RXN'), is_query)

    @staticmethod
    def sanitize_functional_group(functional_group: str):
        """Translate a functional-group notation into a SMARTS string.

        The rewrite rules below are applied repeatedly until a full pass leaves
        the string unchanged. Their order matters: specific whole-string cases
        come first, generic ``R`` replacement comes last.
        """
        while True:
            copy = functional_group
            # special environment handling (amines, hydroxy, esters, ethers)
            # the higher substituted should not contain H env.
            if functional_group == '[C]=O':
                functional_group = "[H][C](=O)[CX4,c]"

            # aldamines
            if functional_group == "O=[C]N(R)R":
                functional_group = "O=[C]([H])N(R)R"

            # ether, ester, ketones, amines, thioether
            vals = [
                "ROR",
                "O=C(R)OR",
                "[H]N(R)R",
                "O=C(R)R",
                "RN(R)R",
                "RSR",
            ]
            if functional_group in vals:
                functional_group = functional_group.replace("R", "[CX4,c]")

            # esters, ketones, amides
            functional_group = functional_group.replace("O=C(R)", "O=C([CX4,c])")

            if functional_group == "RS*R" or functional_group == "RO*R":
                # neighboring atoms can be any aromatic atom, but they should not be highlighted
                functional_group = functional_group.replace("R", "")

            # aromatic compounds: aromatic atoms are denoted with *
            # single aromatic heteroatoms, not substituted
            # unsubstituted aromatic nitrogen
            if functional_group == "RN*(R)R":
                # pyrrole (with H) or pyridine (no other connections);
                # currently overlaps with neighboring aromatic atoms
                functional_group = "[nH1,nX2](a)a"

            # substituted aromatic nitrogen; substituent will be before N*;
            # currently overlaps with neighboring aromatic atoms
            functional_group = functional_group.replace("N*(R)R", "n(a)a")

            # pyridinium
            if functional_group == "RN*(R)(R)(R)R":
                functional_group = "[CX4,c]n(a)a"  # currently overlaps with neighboring aromatic atoms

            # N-oxide
            if functional_group == "[H]ON*(R)(R)(R)R":
                functional_group = "[O-][n+](a)a"  # currently overlaps with neighboring aromatic atoms

            # other aromatic hetero atoms
            functional_group = functional_group.replace("C*", "c")
            functional_group = functional_group.replace("N*", "n")
            functional_group = functional_group.replace("S*", "s")
            functional_group = functional_group.replace("O*", "o")
            functional_group = functional_group.replace("Se*", "se")
            functional_group = functional_group.replace("P*", "p")

            # other replacement, to accommodate for the standardization rules in enviPath
            # This is not the perfect way to do it; there should be a way to replace substructure SMARTS in SMARTS?
            # nitro groups are broken, due to charge handling. this SMARTS matches both forms
            # (formal charges and hypervalent); Ertl-CDK still treats both forms separately...
            functional_group = functional_group.replace("[H]O[N](=O)R", "[CX4,c][NX3](~[OX1])~[OX1]")
            functional_group = functional_group.replace("O=N(=O)R", "[CX4,c][NX3](~[OX1])~[OX1]")

            # carboxylic acid: this SMARTS matches both neutral and anionic form;
            # includes COOH in larger functional_groups
            functional_group = functional_group.replace("[H]OC(=O)", "[OD1]C(=O)")

            # azo
            functional_group = functional_group.replace("N#[N]N(R)R", "[NX1]~[NX2]~N[CX4,c]")
            functional_group = functional_group.replace("RN=[N]=NR", "[NX1]~[NX2]~N[CX4,c]")

            # TODO: there might be more problematic groups, which we have yet to find.

            # other environment atoms (can be aromatic or aliphatic Csp3, or H)
            functional_group = functional_group.replace("R", "[H,CX4,c]")

            if copy == functional_group:
                break

        return functional_group

    @staticmethod
    def _colorize(indigo: 'Indigo', molecule: 'IndigoObject', functional_groups: Dict[str, int], is_reaction: bool):
        """Attach "color" data S-groups to the atoms matched by ``functional_groups``.

        :param functional_groups: functional-group notation -> integer value;
            an atom matched by several groups keeps the highest value.
        :param is_reaction: use one fixed colour for all matched atoms instead
            of choosing the colour from the value.
        """
        indigo.setOption("render-atom-color-property", "color")
        indigo.setOption("aromaticity-model", "generic")

        counts = defaultdict(int)
        environment = set()
        matcher = indigo.substructureMatcher(molecule)

        # Determine environment atoms as they will be colored black
        for env in ["[CX4]", "c"]:
            query = indigo.loadSmarts(env)
            for match in matcher.iterateMatches(query):
                if match is None:
                    continue
                for atom in query.iterateAtoms():
                    mappedAtom = match.mapAtom(atom)
                    if mappedAtom is None:
                        continue
                    environment.add(mappedAtom.index())

        for k, v in functional_groups.items():
            sanitized = IndigoUtils.sanitize_functional_group(k)
            query = indigo.loadSmarts(sanitized)
            for match in matcher.iterateMatches(query):
                if match is None:
                    continue
                for atom in query.iterateAtoms():
                    mappedAtom = match.mapAtom(atom)
                    if mappedAtom is None or mappedAtom.index() in environment:
                        continue
                    counts[mappedAtom.index()] = max(v, counts[mappedAtom.index()])

        for k, v in counts.items():
            if is_reaction:
                color = "128, 0, 128"
            elif v <= 5:
                color = "200, 0, 0"
            else:
                color = "0, 112, 0"
            molecule.addDataSGroup([k], [], "color", color)

    @staticmethod
    def mol_to_svg(mol_data: str, width: int = 0, height: int = 0,
                   functional_groups: Optional[Dict[str, int]] = None):
        """Render a molecule as SVG, optionally highlighting functional groups.

        Indigo's default element colouring is switched off as soon as
        ``functional_groups`` is non-empty.
        """
        if functional_groups is None:
            functional_groups = {}

        i = Indigo()
        renderer = IndigoRenderer(i)
        i.setOption("render-output-format", "svg")
        i.setOption("render-coloring", not functional_groups)
        i.setOption("render-image-size", width, height)
        i.setOption("render-bond-line-width", 2.0)

        mol = i.loadMolecule(mol_data)
        if functional_groups:
            IndigoUtils._colorize(i, mol, functional_groups, False)

        return renderer.renderToBuffer(mol).decode('UTF-8')

    @staticmethod
    def smirks_to_svg(smirks: str, is_query_smirks, width: int = 0, height: int = 0,
                      educt_functional_groups: Optional[Dict[str, int]] = None,
                      product_functional_groups: Optional[Dict[str, int]] = None):
        """Render a reaction as SVG, optionally highlighting functional groups.

        :param is_query_smirks: load ``smirks`` as reaction SMARTS instead of
            as a plain reaction.
        """
        if educt_functional_groups is None:
            educt_functional_groups = {}
        if product_functional_groups is None:
            product_functional_groups = {}

        i = Indigo()
        renderer = IndigoRenderer(i)
        i.setOption("render-output-format", "svg")
        i.setOption("render-coloring", True)
        i.setOption("render-image-size", width, height)

        if is_query_smirks:
            obj = i.loadReactionSmarts(smirks)
        else:
            obj = i.loadReaction(smirks)

        if educt_functional_groups:
            for react in obj.iterateReactants():
                IndigoUtils._colorize(i, react, educt_functional_groups, True)

        if product_functional_groups:
            for prod in obj.iterateProducts():
                IndigoUtils._colorize(i, prod, product_functional_groups, True)

        return renderer.renderToBuffer(obj).decode('UTF-8')
if __name__ == '__main__':
    # Manual smoke test: aromatize a six-carbon ring with alternating
    # double/single bonds (given as MOL file) and print the result.
    # Only data['struct'] is used below; data['options'] is not read.
    data = {
        "struct": "\n Ketcher 2172510 12D 1 1.00000 0.00000 0\n\n 6 6 0 0 0 999 V2000\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.5000 -0.8660 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.0000 -1.7321 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 -1.7321 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5000 -0.8660 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 2 0 0 0 0\n 2 3 1 0 0 0 0\n 3 4 2 0 0 0 0\n 4 5 1 0 0 0 0\n 5 6 2 0 0 0 0\n 6 1 1 0 0 0 0\nM END\n",
        "options": {
            "smart-layout": True,
            "ignore-stereochemistry-errors": True,
            "mass-skip-error-on-pseudoatoms": False,
            "gross-formula-add-rsites": True
        }
    }
    print(IndigoUtils.aromatize(data['struct'], False))

83
utilities/clients.py Normal file
View File

@ -0,0 +1,83 @@
import json
import requests
class AMBITResult:
    """Result entry for one input SMILES.

    Expects the keyword arguments ``smiles`` and ``products``; each item of
    ``products`` is a mapping with at least ``id``, ``name`` and ``products``.
    Items whose ``products`` list is empty are discarded.
    """

    def __init__(self, *args, **kwargs):
        self.smiles = kwargs['smiles']
        self.tps = [bt for bt in kwargs['products'] if len(bt['products'])]
        self.probs = None

    def __str__(self):
        """Render the result as a tab-indented tree without a trailing newline after the last product."""
        pieces = [self.smiles + "\n"]
        last_tp = len(self.tps) - 1

        for i, tp in enumerate(self.tps):
            prob = f" (p={self.probs[tp['id']]})" if self.probs else ""
            connector = "└──" if i == last_tp else "├──"
            pieces.append(f"\t{connector} {tp['name']}{prob}\n")

            last_product = len(tp['products']) - 1
            for j, p in enumerate(tp['products']):
                if j != last_product:
                    pieces.append(f"\t\t├── {p}\n")
                elif i == last_tp:
                    # Very last line of the tree: no line break.
                    pieces.append(f"\t\t└── {p}")
                else:
                    pieces.append(f"\t\t└── {p}\n")

        return "".join(pieces)

    def set_probs(self, probs):
        """Attach a mapping from entry ``id`` to probability, used by ``__str__``."""
        self.probs = probs
class AMBIT:
    """Small HTTP client that posts SMILES to ``<host>/ambit``."""

    def __init__(self, host, rules=None, timeout=None):
        """
        :param host: base URL of the service; ``/ambit`` is appended.
        :param rules: value sent as ``rules`` with every request.
        :param timeout: passed to ``requests.post``; the default ``None`` keeps
            the previous behaviour of waiting indefinitely.
        """
        self.host = host
        self.rules = rules
        self.timeout = timeout
        self.ambit_params = {
            'singlePos': True,
            'split': False,
        }

    def batch_apply(self, smiles: list):
        """Send all ``smiles`` in one request.

        :return: one entry per item of the response's ``result`` list: an
            ``AMBITResult``, or ``None`` if it has no non-empty product entry.
        """
        payload = {
            'smiles': smiles,
            'rules': self.rules,
        }
        payload.update(**self.ambit_params)

        res = self._execute(payload)

        tps = []
        for r in res['result']:
            ar = AMBITResult(**r)
            tps.append(ar if len(ar.tps) else None)
        return tps

    def apply(self, smiles: str):
        """Convenience wrapper around ``batch_apply`` for a single SMILES."""
        return self.batch_apply([smiles])[0]

    def _execute(self, payload):
        """POST ``payload`` as JSON text and return the decoded JSON response.

        Raises ``requests.HTTPError`` for non-success status codes.
        """
        res = requests.post(self.host + '/ambit', data=json.dumps(payload), timeout=self.timeout)
        res.raise_for_status()
        return res.json()

0
utilities/dataclasses.py Normal file
View File

239
utilities/ml.py Normal file
View File

@ -0,0 +1,239 @@
from __future__ import annotations
import dataclasses
from collections import defaultdict
from datetime import datetime
from typing import List, Optional
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# @dataclasses.dataclass
# class Feature:
# name: str
# value: float
#
#
#
# class Row:
# def __init__(self, compound_uuid: str, compound_smiles: str, descriptors: List[int]):
# self.data = {}
#
#
#
# class DataSet(object):
#
# def __init__(self):
# self.rows: List[Row] = []
#
# def add_row(self, row: Row):
# pass
from dataclasses import dataclass, field
from utilities.chem import FormatConverter
@dataclass
class Compound:
    """Compound identified by its SMILES; ``uuid`` is ignored for equality."""

    smiles: str
    uuid: str = field(default=None, compare=False, hash=False)

    def __hash__(self):
        # The hash is computed from the SMILES on first use and then cached on
        # the instance as ``_hash``.
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self.smiles)
            return self._hash
@dataclass
class Reaction:
    """Reaction whose identity is defined by its educts and products only.

    Educts and products are compared after sorting by ``smiles``, so their
    order is irrelevant; ``rule_uuid`` and ``reaction_uuid`` never take part in
    equality or hashing.
    """

    educts: List[Compound]
    products: List[Compound]
    rule_uuid: str = field(default=None, compare=False, hash=False)
    reaction_uuid: str = field(default=None, compare=False, hash=False)

    @staticmethod
    def _by_smiles(compounds):
        # Canonical ordering shared by __hash__ and __eq__.
        return sorted(compounds, key=lambda compound: compound.smiles)

    def __hash__(self):
        # Computed on first use and cached on the instance as ``_hash``.
        try:
            return self._hash
        except AttributeError:
            educt_key = tuple(Reaction._by_smiles(self.educts))
            product_key = tuple(Reaction._by_smiles(self.products))
            self._hash = hash((educt_key, product_key))
            return self._hash

    def __eq__(self, other):
        if not isinstance(other, Reaction):
            return NotImplemented
        if Reaction._by_smiles(self.educts) != Reaction._by_smiles(other.educts):
            return False
        return Reaction._by_smiles(self.products) == Reaction._by_smiles(other.products)
class Dataset(object):
    """Tabular container: column ``headers`` plus row-wise ``data``.

    The export and accessor methods are placeholders and return ``None``.
    """

    def __init__(self, headers: Optional[List[str]] = None,
                 data: Optional[List[List[str | int | float]]] = None):
        """
        :param headers: column names; a new empty list if omitted.
        :param data: rows of cell values; a new empty list if omitted.
        """
        # The types belong in the annotations; as default *values* they made
        # every argument-less Dataset carry typing objects instead of lists.
        self.headers = [] if headers is None else headers
        self.data = [] if data is None else data

    def features(self):
        """Not implemented yet."""

    def labels(self):
        """Not implemented yet."""

    def to_json(self):
        """Not implemented yet."""

    def to_csv(self):
        """Not implemented yet."""

    def to_arff(self):
        """Not implemented yet."""
class DatasetGenerator(object):
    """Builds the compound x rule table of predicted reactions."""

    @staticmethod
    def generate_dataset(compounds: List[Compound], reactions: List[Reaction], applicable_rules: 'Rule',
                         compounds_to_exclude: Optional[Compound] = None, educts_only: bool = False) -> Dataset:
        """Apply every rule to every compound and collect the resulting reactions.

        :param compounds: compounds to process; replaced by the distinct educts
            of ``reactions`` when ``educts_only`` is set.
        :param reactions: only read when ``educts_only`` is set.
        :param applicable_rules: iterable of objects exposing ``.rule.apply(smiles)``.
        :param compounds_to_exclude: NOTE(review): currently not used.
        :param educts_only: see ``compounds``.
        :return: one row per compound, one cell per rule; a cell is the list of
            distinct ``Reaction`` objects (empty if the rule yields nothing).
            NOTE(review): this is a plain nested list although the annotation
            says ``Dataset`` -- confirm which one is intended.
        """
        if educts_only:
            compounds = list({educt for reaction in reactions for educt in reaction.educts})

        total = len(compounds)
        rows = []
        for position, compound in enumerate(compounds, start=1):
            # Progress output on stdout.
            print(f"{position}/{total} - {compound.smiles}")

            row = []
            for rule in applicable_rules:
                product_sets = rule.rule.apply(compound.smiles)
                if len(product_sets) == 0:
                    row.append([])
                    continue

                # A set, so identical reactions from different product sets collapse.
                # NOTE(review): the rule object itself is passed as ``rule_uuid``.
                reacts = {
                    Reaction([compound], [Compound(FormatConverter.standardize(p)) for p in ps], rule)
                    for ps in product_sets
                }
                row.append(list(reacts))
            rows.append(row)

        return rows
class SparseLabelECC(BaseEstimator, ClassifierMixin):
    """
    Ensemble of Classifier Chains with sparse label removal.

    Label columns that hold a single value across all training samples are not
    learned; their constant value is written back into every prediction.
    """

    def __init__(self, base_clf=RandomForestClassifier(n_estimators=100, max_features='log2', random_state=42),
                 num_chains: int = 10):
        self.base_clf = base_clf
        self.num_chains = num_chains

    def fit(self, X, Y):
        """Fit ``num_chains`` randomly ordered classifier chains on the non-constant labels."""
        y = np.array(Y)
        self.n_labels_ = y.shape[1]
        self.removed_labels_ = {}
        self.keep_columns_ = []

        for col in range(self.n_labels_):
            distinct = np.unique(y[:, col])
            if len(distinct) != 1:
                self.keep_columns_.append(col)
                continue
            # Constant column: remember its value instead of learning it.
            self.removed_labels_[col] = distinct[0]

        y_reduced = y[:, self.keep_columns_]

        self.chains_ = []
        for seed in range(self.num_chains):
            self.chains_.append(ClassifierChain(self.base_clf, order='random', random_state=seed))

        for number, chain in enumerate(self.chains_, start=1):
            print(f"{datetime.now()} fitting {number}/{self.num_chains}")
            chain.fit(X, y_reduced)

        return self

    def _restore_columns(self, reduced, convert):
        """Scatter ``reduced`` back to all label columns and fill in the constant ones."""
        full_y = np.zeros((reduced.shape[0], self.n_labels_))
        for idx, col in enumerate(self.keep_columns_):
            full_y[:, col] = reduced[:, idx]
        for col, value in self.removed_labels_.items():
            full_y[:, col] = convert(value)
        return full_y

    def predict(self, X, threshold=0.5):
        """Return labels: the mean chain prediction must exceed ``threshold``."""
        votes = [chain.predict(X) for chain in self.chains_]
        avg_preds = np.mean(votes, axis=0) > threshold
        return self._restore_columns(avg_preds, bool)

    def predict_proba(self, X):
        """Return the chain-averaged probabilities for every label column."""
        probabilities = [chain.predict_proba(X) for chain in self.chains_]
        avg_proba = np.mean(probabilities, axis=0)
        return self._restore_columns(avg_proba, float)

    def score(self, X, Y, sample_weight=None):
        """
        Default scoring using subset accuracy (exact match).
        """
        expected = np.array(Y)
        predicted = self.predict(X)
        return accuracy_score(expected, predicted, sample_weight=sample_weight)
class ApplicabilityDomain(PCA):
    """PCA-based applicability domain.

    ``build`` records the per-component minimum and maximum of the scaled,
    PCA-transformed training data; ``is_applicable`` checks whether new
    instances fall inside that box on every component.
    """

    def __init__(self, n_components=5):
        super().__init__(n_components=n_components)
        self.scaler = StandardScaler()
        self.min_vals = None
        self.max_vals = None

    def build(self, X):
        """Fit scaler and PCA on ``X`` and store the component-wise value range."""
        projected = self.fit_transform(self.scaler.fit_transform(X))
        self.max_vals = np.max(projected, axis=0)
        self.min_vals = np.min(projected, axis=0)

    def is_applicable(self, instances):
        """Return one boolean per instance: inside the stored range on all components."""
        instances_pca = self.transform(self.scaler.transform(instances))
        within = (instances_pca >= self.min_vals) & (instances_pca <= self.max_vals)
        return within.all(axis=1).tolist()

66
utilities/plugin.py Normal file
View File

@ -0,0 +1,66 @@
import glob
import importlib.metadata
import os
import subprocess
from typing import Dict, Type
from django.conf import settings as s
from envipy_plugins import Descriptor, Classifier, Property
def is_installed(package_name):
    """Return whether installed-distribution metadata exists for ``package_name``."""
    try:
        importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return False
    return True
def install_wheel(wheel_path):
    """Install the wheel at ``wheel_path`` via ``uv pip install``.

    Raises ``subprocess.CalledProcessError`` if the command exits non-zero.
    """
    print(f"Installing wheel: {wheel_path}")
    command = ["uv", "pip", "install", wheel_path]
    subprocess.check_call(command)
def extract_package_name_from_wheel(wheel_filename):
    """Return everything before the first ``-`` of ``wheel_filename``.

    Example: my_plugin-0.1.0-py3-none-any.whl -> my_plugin
    """
    package_name, _, _ = wheel_filename.partition('-')
    return package_name
def ensure_plugins_installed():
    """Install every ``*.whl`` in ``settings.PLUGIN_DIR`` that is not installed yet."""
    pattern = os.path.join(s.PLUGIN_DIR, '*.whl')
    for wheel_path in glob.glob(pattern):
        package_name = extract_package_name_from_wheel(os.path.basename(wheel_path))
        if is_installed(package_name):
            print(f"Plugin already installed: {package_name}")
            continue
        install_wheel(wheel_path)
def discover_plugins(_cls: Type = None) -> Dict[str, Type]:
    """Instantiate the plugins registered under the ``enviPy_plugins`` entry-point group.

    Missing plugin wheels are installed first.

    :param _cls: only load plugins that are subclasses of this class; without
        it, subclasses of ``Classifier``, ``Descriptor`` or ``Property`` are loaded.
    :return: mapping from ``instance.name()`` to the plugin *instance*. A
        plugin that fails to load is reported on stdout and skipped.
    """
    ensure_plugins_installed()

    accepted = _cls if _cls else (Classifier, Descriptor, Property)

    plugins = {}
    for entry_point in importlib.metadata.entry_points(group='enviPy_plugins'):
        try:
            plugin_class = entry_point.load()
            if not issubclass(plugin_class, accepted):
                continue
            instance = plugin_class()
            plugins[instance.name()] = instance
        except Exception as e:
            print(f"Error loading plugin {entry_point.name}: {e}")

    return plugins