Current Dev State

2025-06-23 20:13:54 +02:00
parent b4f9bb277d
commit ded50edaa2
22617 changed files with 4345095 additions and 174 deletions
--- a/utilities/init.py
+++ b/utilities/init.py
--- a/utilities/biodeg.py
+++ b/utilities/biodeg.py
@ -0,0 +1,13 @@
+import abc
+from enviPy.epdb import Pathway
+
+class PredictionSchema(abc.ABC):
+    """Common base type for prediction schemas; declares no members of its own."""
+
+class DFS(PredictionSchema):
+    """Prediction schema bound to a pathway; ``predict`` is still a stub."""
+
+    def __init__(self, pw: Pathway, settings=None):
+        # Any falsy ``settings`` (None, empty container, ...) falls back to
+        # the settings stored on the pathway.
+        if settings:
+            self.setting = settings
+        else:
+            self.setting = pw.prediction_settings
+
+    def predict(self):
+        """Placeholder: does nothing and returns None."""
+        return None
--- a/utilities/chem.py
+++ b/utilities/chem.py
@ -0,0 +1,732 @@
+import logging
+import re
+from abc import ABC
+from collections import defaultdict
+from typing import List, Optional, Dict
+
+from indigo import Indigo, IndigoException, IndigoObject
+from indigo.renderer import IndigoRenderer
+from rdkit import Chem
+from rdkit import RDLogger
+from rdkit.Chem import MACCSkeys
+from rdkit.Chem import rdChemReactions
+from rdkit.Chem.Draw import rdMolDraw2D
+from rdkit.Chem.MolStandardize import rdMolStandardize
+
+logger = logging.getLogger(__name__)
+RDLogger.DisableLog('rdApp.*')
+
+
+# from rdkit import rdBase
+# rdBase.LogToPythonLogger()
+# pylog = logging.getLogger("rdkit")
+# pylog.propagate = False
+# pylog.disabled = True
+# print(pylog.disabled)
+
+
+class ProductSet(object):
+    """Collection of product SMILES strings compared without regard to order.
+
+    The list is stored exactly as given; equality and hashing both work on the
+    sorted members so that they stay consistent with each other.
+    """
+
+    def __init__(self, product_set: List[str]):
+        self.product_set = product_set
+
+    def __repr__(self):
+        return repr(self.product_set)
+
+    def __len__(self):
+        return len(self.product_set)
+
+    def __iter__(self):
+        return iter(self.product_set)
+
+    def __eq__(self, other):
+        if not isinstance(other, ProductSet):
+            return False
+        return sorted(self.product_set) == sorted(other.product_set)
+
+    def __hash__(self):
+        # Hash the sorted members so equal sets (any order) hash alike.
+        canonical = sorted(self.product_set)
+        return hash('-'.join(canonical))
+
+
+class PredictionResult(object):
+    """Product sets of one prediction together with its probability and optional rule."""
+
+    def __init__(self, product_sets: List['ProductSet'], probability: float, rule: Optional['Rule'] = None):
+        self.rule = rule
+        self.probability = probability
+        self.product_sets = product_sets
+
+    def __len__(self):
+        # Number of product sets, not of individual products.
+        return len(self.product_sets)
+
+    def __iter__(self):
+        return iter(self.product_sets)
+
+    def __repr__(self):
+        return "--{}/{}--> {}".format(self.probability, self.rule, self.product_sets)
+
+
+class FormatConverter(object):
+
+    @staticmethod
+    def from_smiles(smiles):
+        """Parse ``smiles`` with RDKit and return whatever ``Chem.MolFromSmiles`` yields."""
+        mol = Chem.MolFromSmiles(smiles)
+        return mol
+
+    @staticmethod
+    def to_smiles(mol):
+        """Serialize the RDKit molecule ``mol`` to a SMILES string."""
+        smiles = Chem.MolToSmiles(mol)
+        return smiles
+
+    @staticmethod
+    def InChIKey(smiles):
+        """Return the InChIKey RDKit computes for the molecule parsed from ``smiles``."""
+        mol = FormatConverter.from_smiles(smiles)
+        return Chem.MolToInchiKey(mol)
+
+    @staticmethod
+    def maccs(smiles):
+        """Return the MACCS keys fingerprint of ``smiles`` converted with ``ToList()``."""
+        fingerprint = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
+        return fingerprint.ToList()
+
+    @staticmethod
+    def to_svg(smiles, mol_size=(200, 150), kekulize=True):
+        """Render ``smiles`` as an SVG string.
+
+        :param smiles: SMILES of the molecule to draw.
+        :param mol_size: ``(width, height)`` passed to the SVG drawer.
+        :param kekulize: try to kekulize the drawn copy; if that fails the
+            molecule is drawn as parsed.
+        :return: SVG markup without the ``svg:`` prefixes and without the
+            XML declaration.
+        """
+        mol = FormatConverter.from_smiles(smiles)
+
+        # Chem.Kekulize works in place and returns None, so its result must
+        # never be assigned back. Operate on a copy and keep ``mol`` pristine.
+        mc = Chem.Mol(mol.ToBinary())
+        if kekulize:
+            try:
+                Chem.Kekulize(mc)
+            except Exception:
+                # Kekulization failed: start over from an untouched copy.
+                mc = Chem.Mol(mol.ToBinary())
+
+        if not mc.GetNumConformers():
+            Chem.rdDepictor.Compute2DCoords(mc)
+
+        drawer = rdMolDraw2D.MolDraw2DSVG(*mol_size)
+        opts = drawer.drawOptions()
+
+        opts.clearBackground = False
+        drawer.DrawMolecule(mc)
+        drawer.FinishDrawing()
+        svg = drawer.GetDrawingText().replace('svg:', '')
+        # Raw string: "\?" is not a valid escape in a normal string literal.
+        svg = re.sub(r"<\?xml.*\?>", '', svg)
+
+        return svg
+
+    @staticmethod
+    def to_png(smiles, mol_size=(200, 150), kekulize=True):
+        """Prepare ``smiles`` for PNG rendering; the rendering itself is not implemented.
+
+        The molecule is parsed, optionally kekulized on a copy and given 2D
+        coordinates. Nothing is drawn yet, so the method returns None.
+
+        :param smiles: SMILES of the molecule.
+        :param mol_size: intended ``(width, height)``; currently unused.
+        :param kekulize: try to kekulize the working copy.
+        :return: None.
+        """
+        mol = FormatConverter.from_smiles(smiles)
+
+        # Always bind the working copy; previously ``mc`` only existed when
+        # kekulization raised, so the normal path failed with a NameError.
+        mc = Chem.Mol(mol.ToBinary())
+        if kekulize:
+            try:
+                Chem.Kekulize(mc)
+            except Exception:
+                # Kekulization failed: start over from an untouched copy.
+                mc = Chem.Mol(mol.ToBinary())
+
+        if not mc.GetNumConformers():
+            Chem.rdDepictor.Compute2DCoords(mc)
+
+        # TODO: draw ``mc`` at ``mol_size`` and return the PNG data.
+        return None
+
+    @staticmethod
+    def normalize(smiles):
+        """Return ``smiles`` unchanged (placeholder, see the TODO below)."""
+        # TODO call to AMBIT Service
+        normalized = smiles
+        return normalized
+
+    @staticmethod
+    def standardize(smiles):
+        """Standardize ``smiles`` with RDKit's MolStandardize and return kekulized SMILES.
+
+        Steps: cleanup, reduction to the parent fragment, then uncharging.
+        No re-ionization and no tautomer canonicalization is performed.
+
+        Recipe taken from
+        https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
+        which follows
+        https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
+        as presented in https://www.youtube.com/watch?v=eWTApNX8dJQ
+        """
+        parsed = Chem.MolFromSmiles(smiles)
+
+        # Cleanup: remove Hs, disconnect metal atoms, normalize, reionize.
+        cleaned = rdMolStandardize.Cleanup(parsed)
+
+        # With several fragments keep only the "parent" one.
+        parent = rdMolStandardize.FragmentParent(cleaned)
+
+        # Neutralize; there is no convenience function, hence the Uncharger object.
+        neutral = rdMolStandardize.Uncharger().uncharge(parent)
+
+        # Tautomer canonicalization is deliberately left out for now:
+        # te = rdMolStandardize.TautomerEnumerator()
+        # taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)
+
+        return Chem.MolToSmiles(neutral, kekuleSmiles=True)
+
+    @staticmethod
+    def neutralize_smiles(smiles):
+        """Parse ``smiles``, neutralize the molecule and return its SMILES."""
+        neutral = FormatConverter.neutralize_molecule(Chem.MolFromSmiles(smiles))
+        return Chem.MolToSmiles(neutral)
+
+    @staticmethod
+    def neutralize_molecule(mol):
+        """Set the formal charge of every atom matched by the charge pattern to zero.
+
+        The hydrogen count of each matched atom is adjusted by its former
+        charge. ``mol`` is modified in place and also returned.
+        """
+        charged = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
+        for match in mol.GetSubstructMatches(charged):
+            atom = mol.GetAtomWithIdx(match[0])
+            charge = atom.GetFormalCharge()
+            hydrogens = atom.GetTotalNumHs()
+            atom.SetFormalCharge(0)
+            atom.SetNumExplicitHs(hydrogens - charge)
+            atom.UpdatePropertyCache()
+        return mol
+
+    # @staticmethod
+    # def apply(smiles, smirks, preprocess_smiles=True, bracketize=False, standardize=True):
+    #     logger.debug(f'Applying {smirks} on {smiles}')
+    #
+    #     if bracketize:
+    #         smirks = smirks.split('>>')[0] + ">>(" + smirks.split('>>')[1] + ")"
+    #
+    #     res = set()
+    #     try:
+    #         rxn = rdChemReactions.ReactionFromSmarts(smirks)
+    #         mol = Chem.MolFromSmiles(smiles)
+    #
+    #         # Inplace
+    #         if preprocess_smiles:
+    #             Chem.SanitizeMol(mol)
+    #             mol = Chem.AddHs(mol)
+    #
+    #         # apply!
+    #         reacts = rxn.RunReactants((mol,))
+    #         if len(reacts):
+    #             # Sanitize mols
+    #             for product_set in reacts:
+    #                 prod_set = list()
+    #                 for product in product_set:
+    #                     # Fixes
+    #                     # [2025-01-30 23:00:50] ERROR chem - Sanitizing and converting failed:
+    #                     # non-ring atom 3 marked aromatic
+    #                     # But does not improve overall performance
+    #                     #
+    #                     # for a in product.GetAtoms():
+    #                     #     if (not a.IsInRing()) and a.GetIsAromatic():
+    #                     #         a.SetIsAromatic(False)
+    #                     # for b in product.GetBonds():
+    #                     #     if (not b.IsInRing()) and b.GetIsAromatic():
+    #                     #         b.SetIsAromatic(False)
+    #
+    #                     try:
+    #                         Chem.SanitizeMol(product)
+    #                         prod_set.append(FormatConverter.standardize(Chem.MolToSmiles(product)))
+    #                     except ValueError as e:
+    #                         logger.error(f'Sanitizing and converting failed:\n{e}')
+    #                         continue
+    #                 res.add(tuple(list(set(prod_set))))
+    #     except Exception as e:
+    #         logger.error(f'Applying {smirks} on {smiles} failed:\n{e}')
+    #
+    #     return list(res)
+
+    @staticmethod
+    def apply(smiles: str, smirks: str, preprocess_smiles: bool = True, bracketize: bool = False,
+              standardize: bool = True, kekulize: bool = True) -> List['ProductSet']:
+        """Apply the reaction ``smirks`` to ``smiles`` and collect the product sets.
+
+        :param smiles: SMILES of the single reactant.
+        :param smirks: reaction SMARTS/SMIRKS handed to ``ReactionFromSmarts``.
+        :param preprocess_smiles: sanitize the reactant and add explicit
+            hydrogens before running the reaction.
+        :param bracketize: wrap the product side of ``smirks`` in parentheses.
+        :param standardize: not read by the current body; every product is
+            always passed through ``FormatConverter.standardize``.
+        :param kekulize: not read by the current body (only referenced in the
+            disabled code below).
+        :return: NOTE(review): a ``set`` of ``ProductSet`` objects, although
+            the annotation says ``List``. Failures are logged, not raised; the
+            result then holds whatever was collected up to that point.
+        """
+        logger.debug(f'Applying {smirks} on {smiles}')
+
+        # If explicitly wanted or rule generates multiple products add brackets around products to capture all
+        if bracketize:  # or "." in smirks:
+            smirks = smirks.split('>>')[0] + ">>(" + smirks.split('>>')[1] + ")"
+
+        # Set of ProductSet objects (duplicates collapse via ProductSet.__eq__/__hash__)
+        pss = set()
+        try:
+            rxn = rdChemReactions.ReactionFromSmarts(smirks)
+            mol = Chem.MolFromSmiles(smiles)
+
+            # Inplace
+            if preprocess_smiles:
+                Chem.SanitizeMol(mol)
+                mol = Chem.AddHs(mol)
+
+            # apply!
+            sites = rxn.RunReactants((mol,))
+            logger.debug(f"{len(sites)} products sets generated")
+            if len(sites):
+                # Sanitize mols
+                for product_set in sites:
+                    prods = []
+                    for product in product_set:
+                        try:
+                            Chem.SanitizeMol(product)
+
+                            # From here on ``product`` is a standardized SMILES string, no longer a molecule
+                            product = FormatConverter.standardize(Chem.MolToSmiles(product))
+
+                            # if kekulize:
+                            #     # from rdkit.Chem import MolStandardize
+                            #     #
+                            #     # # Attempt re-sanitization via standardizer
+                            #     # cleaner = MolStandardize.rdMolStandardize.Cleanup()
+                            #     # mol = cleaner.cleanup(product)
+                            #     # # Fixes
+                            #     # # [2025-01-30 23:00:50] ERROR chem - Sanitizing and converting failed:
+                            #     # # non-ring atom 3 marked aromatic
+                            #     # # But does not improve overall performance
+                            #     # # for a in product.GetAtoms():
+                            #     # #     if (not a.IsInRing()) and a.GetIsAromatic():
+                            #     # #         a.SetIsAromatic(False)
+                            #     # #
+                            #     # # for b in product.GetBonds():
+                            #     # #     if (not b.IsInRing()) and b.GetIsAromatic():
+                            #     # #         b.SetIsAromatic(False)
+                            #     # for atom in product.GetAtoms():
+                            #     #     atom.SetIsAromatic(False)
+                            #     # for bond in product.GetBonds():
+                            #     #     bond.SetIsAromatic(False)
+                            #     Chem.Kekulize(product)
+
+                            prods.append(product)
+                        except ValueError as e:
+                            logger.error(f'Sanitizing and converting failed:\n{e}')
+                            continue
+
+                    # Keep a product set only if it is non-empty and every one of its
+                    # products was converted; partially converted sets are dropped.
+                    if len(prods) and len(prods) == len(product_set):
+                        ps = ProductSet(prods)
+                        pss.add(ps)
+
+        except Exception as e:
+            # Best effort: any other failure (e.g. while parsing or running the
+            # reaction) is logged and the sets collected so far are returned.
+            logger.error(f'Applying {smirks} on {smiles} failed:\n{e}')
+
+        return pss
+
+    # @staticmethod
+    # def apply(reaction, smiles):
+    #     rxn = AllChem.ReactionFromSmarts(reaction)
+    #     return [Chem.MolToSmiles(x, 1) for x in rxn.RunReactants((Chem.MolFromSmiles(smiles),))[0]]
+
+    @staticmethod
+    def MACCS(smiles):
+        """Return the MACCS keys fingerprint object for ``smiles`` (not converted to a list)."""
+        mol = FormatConverter.from_smiles(smiles)
+        return MACCSkeys.GenMACCSKeys(mol)
+
+    @staticmethod
+    def neutralize_atoms(mol):
+        """Neutralize charged atoms of ``mol`` in place and return it.
+
+        Kept as a separate name for existing callers. The body used to be a
+        verbatim copy of ``neutralize_molecule``; it now delegates to it so
+        the charge pattern is maintained in one place only.
+        """
+        return FormatConverter.neutralize_molecule(mol)
+
+    @staticmethod
+    def sanitize_smiles(smiles_list: List):
+        """Convert each SMILES to a neutralized, H-free, kekulized canonical form.
+
+        Stereo markers ('/', '\\', '@') are not stripped here. Entries that
+        cannot be processed are skipped and counted instead of raising.
+
+        :param smiles_list: iterable of SMILES strings.
+        :return: tuple ``(parsed_smiles, errors)``: the converted SMILES in
+            input order (failed entries left out) and the number of failures.
+        """
+        parsed_smiles = []
+        errors = 0
+        for smi in smiles_list:
+            try:
+                mol = Chem.MolFromSmiles(smi)
+                mol = FormatConverter.neutralize_atoms(mol)
+                mol = Chem.RemoveAllHs(mol)
+                Chem.Kekulize(mol)
+                smi_p = Chem.MolToSmiles(mol, kekuleSmiles=True)
+                smi_p = Chem.CanonSmiles(smi_p)
+            except Exception as e:
+                # Best effort by design, but leave a trace instead of
+                # discarding the reason silently.
+                errors += 1
+                logger.debug(f'Could not sanitize {smi!r}: {e}')
+                continue
+
+            # Drop "any bond" markers; a no-op when there are none.
+            parsed_smiles.append(smi_p.replace('~', ''))
+
+        return parsed_smiles, errors
+
+
+
+
+class Standardizer(ABC):
+    """Named SMILES standardization step; the base step only normalizes."""
+
+    def __init__(self, name):
+        self.name = name
+
+    def standardize(self, smiles: str) -> str:
+        """Return ``smiles`` passed through ``FormatConverter.normalize``."""
+        normalized = FormatConverter.normalize(smiles)
+        return normalized
+
+
+class RuleStandardizer(Standardizer):
+    """Standardizer that rewrites a SMILES by applying one reaction SMIRKS."""
+
+    def __init__(self, name, smirks):
+        super().__init__(name)
+        self.smirks = smirks
+
+    def standardize(self, smiles: str) -> str:
+        """Apply the rule to ``smiles`` and return the resulting SMILES string.
+
+        If the rule does not produce anything, ``smiles`` itself is used. If
+        it produces several distinct results, a warning is logged and the
+        lexicographically smallest one is used so the choice is reproducible.
+        """
+        # FormatConverter.apply yields ProductSet objects, not strings. Flatten
+        # each set into a single SMILES (fragments joined by '.') so that this
+        # method really returns a str, as annotated.
+        candidates = sorted({
+            '.'.join(sorted(product_set))
+            for product_set in FormatConverter.apply(smiles, self.smirks)
+            if len(product_set)
+        })
+
+        if len(candidates) > 1:
+            logger.warning(f'{self.smirks} generated more than 1 compound {candidates}')
+
+        if candidates:
+            smiles = candidates[0]
+
+        return super().standardize(smiles)
+
+
+class RegExStandardizer(Standardizer):
+    """Standardizer that applies literal substring replacements until nothing changes.
+
+    Despite the name, keys are replaced with ``str.replace``, i.e. they are
+    plain substrings and not regular expressions.
+    """
+
+    def __init__(self, name, replacements: dict):
+        super().__init__(name)
+        self.replacements = replacements
+
+    def standardize(self, smiles: str) -> str:
+        """Apply every replacement repeatedly until a full pass changes nothing."""
+        smi = smiles
+
+        # The earlier version decided whether to run at all by looking at the
+        # last replacement only, so e.g. {"/": "", "\\": ""} left "C/C=C/C"
+        # untouched. Always run at least one full pass instead.
+        while True:
+            previous = smi
+            for k, v in self.replacements.items():
+                smi = smi.replace(k, v)
+            if smi == previous:
+                break
+
+        return super().standardize(smi)
+
+
+# Removes every "@" character from the SMILES.
+FLATTEN = [
+    RegExStandardizer("Remove Stereo", {"@": ""})
+]
+
+# Removes every "/" and "\" character from the SMILES.
+UN_CIS_TRANS = [
+    RegExStandardizer("Un-Cis-Trans", {"/": "", "\\": ""})
+]
+
+# SMIRKS-based rules; each entry is (name, reaction SMIRKS).
+BASIC = [
+    RuleStandardizer("ammoniumstandardization", "[H][N+:1]([H])([H])[#6:2]>>[H][#7:1]([H])-[#6:2]"),
+    RuleStandardizer("cyanate", "[H][#8:1][C:2]#[N:3]>>[#8-:1][C:2]#[N:3]"),
+    RuleStandardizer("deprotonatecarboxyls", "[H][#8:1]-[#6:2]=[O:3]>>[#8-:1]-[#6:2]=[O:3]"),
+    RuleStandardizer("forNOOH", "[H][#8:1]-[#7+:2](-[*:3])=[O:4]>>[#8-:1]-[#7+:2](-[*:3])=[O:4]"),
+    RuleStandardizer("Hydroxylprotonation", "[#6;A:1][#6:2](-[#8-:3])=[#6;A:4]>>[#6:1]-[#6:2](-[#8:3][H])=[#6;A:4]"),
+    RuleStandardizer("phosphatedeprotonation", "[H][#8:1]-[$([#15]);!$(P([O-])):2]>>[#8-:1]-[#15:2]"),
+    RuleStandardizer("PicricAcid",
+                     "[H][#8:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]>>[#8-:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]"),
+    RuleStandardizer("Sulfate1", "[H][#8:1][S:2]([#8:3][H])(=[O:4])=[O:5]>>[#8-:1][S:2]([#8-:3])(=[O:4])=[O:5]"),
+    RuleStandardizer("Sulfate2",
+                     "[#6:1]-[#8:2][S:3]([#8:4][H])(=[O:5])=[O:6]>>[#6:1]-[#8:2][S:3]([#8-:4])(=[O:5])=[O:6]"),
+    RuleStandardizer("Sulfate3", "[H][#8:3][S:2]([#6:1])(=[O:4])=[O:5]>>[#6:1][S:2]([#8-:3])(=[O:4])=[O:5]"),
+    RuleStandardizer("Transform_c1353forSOOH", "[H][#8:1][S:2]([*:3])=[O:4]>>[#8-:1][S:2]([*:3])=[O:4]"),
+]
+
+# BASIC plus one additional phosphate rule.
+ENHANCED = BASIC + [
+    RuleStandardizer("fullPhosphatedeprotonation", "[H][#8:1]-[#15:2]>>[#8-:1]-[#15:2]")
+]
+
+# ENHANCED plus one thiophosphate rule.
+EXOTIC = ENHANCED + [
+    RuleStandardizer("ThioPhosphate1", "[H][S:1]-[#15:2]=[$([#16]),$([#8]):3]>>[S-:1]-[#15:2]=[$([#16]),$([#8]):3]")
+]
+
+# Single rule named "CutCoEnzymeAOff".
+COA_CUTTER = [
+    RuleStandardizer("CutCoEnzymeAOff",
+                     "CC(C)(COP(O)(=O)OP(O)(=O)OCC1OC(C(O)C1OP(O)(O)=O)n1cnc2c(N)ncnc12)C(O)C(=O)NCCC(=O)NCCS[$(*):1]>>[O-][$(*):1]")
+]
+
+# Single rule named "enol2Ketone".
+ENOL_KETO = [
+    RuleStandardizer("enol2Ketone", "[H][#8:2]-[#6:3]=[#6:1]>>[#6:1]-[#6:3]=[O:2]")
+]
+
+# All of the above concatenated, in this order.
+MATCH_STANDARDIZER = EXOTIC + FLATTEN + UN_CIS_TRANS + COA_CUTTER + ENOL_KETO
+
+
+class IndigoUtils(object):
+
+    @staticmethod
+    def layout(mol_data):
+        """Compute a layout for ``mol_data`` and return it as RXN or MOL file text.
+
+        Input that starts with ``$RXN`` or contains ``>>`` is loaded as a query
+        reaction, anything else as a query molecule. If Indigo rejects the
+        input, it is loaded once more as reaction SMARTS.
+
+        :param mol_data: molecule or reaction as text.
+        :return: the laid-out structure, or None if the fallback fails too
+            (the error is logged).
+        """
+        i = Indigo()
+        try:
+            if mol_data.startswith('$RXN') or '>>' in mol_data:
+                rxn = i.loadQueryReaction(mol_data)
+                rxn.layout()
+                return rxn.rxnfile()
+            else:
+                mol = i.loadQueryMolecule(mol_data)
+                mol.layout()
+                return mol.molfile()
+        except IndigoException:
+            try:
+                logger.info("layout() failed, trying loadReactionSMARTS as fallback!")
+                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
+                rxn.layout()
+                # The fallback object is a reaction, so it has to be written
+                # as an RXN file; molfile() is only valid for molecules.
+                return rxn.rxnfile()
+            except IndigoException as e2:
+                logger.error(f'layout() failed due to {e2}!')
+                return None
+
+    @staticmethod
+    def load_reaction_SMARTS(mol):
+        """Load ``mol`` as reaction SMARTS using a freshly created Indigo instance."""
+        session = Indigo()
+        return session.loadReactionSmarts(mol)
+
+    @staticmethod
+    def aromatize(mol_data, is_query):
+        """Aromatize ``mol_data`` and return it as RXN or MOL file text.
+
+        Input starting with ``$RXN`` is treated as a reaction, anything else
+        as a molecule. If Indigo rejects the input, it is loaded once more as
+        reaction SMARTS.
+
+        :param mol_data: molecule or reaction as text.
+        :param is_query: use the query loaders instead of the plain ones.
+        :return: the aromatized structure, or None if the fallback fails too
+            (the error is logged).
+        """
+        i = Indigo()
+        try:
+            if mol_data.startswith('$RXN'):
+                if is_query:
+                    rxn = i.loadQueryReaction(mol_data)
+                else:
+                    rxn = i.loadReaction(mol_data)
+
+                rxn.aromatize()
+                return rxn.rxnfile()
+            else:
+                if is_query:
+                    mol = i.loadQueryMolecule(mol_data)
+                else:
+                    mol = i.loadMolecule(mol_data)
+
+                mol.aromatize()
+                return mol.molfile()
+        except IndigoException:
+            try:
+                logger.info("Aromatizing failed, trying loadReactionSMARTS as fallback!")
+                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
+                rxn.aromatize()
+                # The fallback object is a reaction, so it has to be written
+                # as an RXN file; molfile() is only valid for molecules.
+                return rxn.rxnfile()
+            except IndigoException as e2:
+                logger.error(f'Aromatizing failed due to {e2}!')
+                return None
+
+    @staticmethod
+    def dearomatize(mol_data, is_query):
+        """Dearomatize ``mol_data`` and return it as RXN or MOL file text.
+
+        Input starting with ``$RXN`` is treated as a reaction, anything else
+        as a molecule. If Indigo rejects the input, it is loaded once more as
+        reaction SMARTS.
+
+        :param mol_data: molecule or reaction as text.
+        :param is_query: use the query loaders instead of the plain ones.
+        :return: the dearomatized structure, or None if the fallback fails
+            too (the error is logged).
+        """
+        i = Indigo()
+        try:
+            if mol_data.startswith('$RXN'):
+                if is_query:
+                    rxn = i.loadQueryReaction(mol_data)
+                else:
+                    rxn = i.loadReaction(mol_data)
+
+                rxn.dearomatize()
+                return rxn.rxnfile()
+            else:
+                if is_query:
+                    mol = i.loadQueryMolecule(mol_data)
+                else:
+                    mol = i.loadMolecule(mol_data)
+
+                mol.dearomatize()
+                return mol.molfile()
+        except IndigoException:
+            try:
+                logger.info("De-Aromatizing failed, trying loadReactionSMARTS as fallback!")
+                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
+                rxn.dearomatize()
+                # The fallback object is a reaction, so it has to be written
+                # as an RXN file; molfile() is only valid for molecules.
+                return rxn.rxnfile()
+            except IndigoException as e2:
+                logger.error(f'De-Aromatizing failed due to {e2}!')
+                return None
+
+    @staticmethod
+    def sanitize_functional_group(functional_group: str):
+        """Rewrite a functional-group label into a SMARTS pattern.
+
+        The replacements below are applied over and over until one full pass
+        leaves the string unchanged. Their order matters: whole-string special
+        cases come first, the generic ``R`` replacement comes last.
+
+        :param functional_group: label using ``R`` for environment atoms and
+            ``*`` after an element symbol for aromatic atoms.
+        :return: the rewritten pattern string.
+        """
+        while True:
+            copy = functional_group
+
+            # special environment handling (amines, hydroxy, esters, ethers)
+            # the higher substituted should not contain H env.
+            if functional_group == '[C]=O':
+                functional_group = "[H][C](=O)[CX4,c]"
+
+            # aldamines
+            if functional_group == "O=[C]N(R)R":
+                functional_group = "O=[C]([H])N(R)R"
+
+            # ether, ester, ketones, amines, thioether
+            vals = [
+                "ROR",
+                "O=C(R)OR",
+                "[H]N(R)R",
+                "O=C(R)R",
+                "RN(R)R",
+                "RSR",
+            ]
+            if functional_group in vals:
+                functional_group = functional_group.replace("R", "[CX4,c]")
+
+            # esters, ketones, amides
+            functional_group = functional_group.replace("O=C(R)", "O=C([CX4,c])")
+
+            if functional_group == "RS*R" or functional_group == "RO*R":
+                # neighboring atoms can be any aromatic atom, but they should not be highlighted
+                functional_group = functional_group.replace("R", "")
+
+            # aromatic compounds:  aromatic atoms are denoted with *
+            # single aromatic heteroatoms, not substituted
+
+            # unsubstituted aromatic nitrogen
+            if functional_group == "RN*(R)R":
+                functional_group = "[nH1,nX2](a)a"  # pyrrole (with H) or pyridine (no other connections); currently overlaps with neighboring aromatic atoms
+
+            # substituted aromatic nitrogen
+            functional_group = functional_group.replace("N*(R)R",
+                                                        "n(a)a")  # substituent will be before N*; currently overlaps with neighboring aromatic atoms
+            # pyridinium
+            if functional_group == "RN*(R)(R)(R)R":
+                functional_group = "[CX4,c]n(a)a"  # currently overlaps with neighboring aromatic atoms
+
+            # N-oxide
+            if functional_group == "[H]ON*(R)(R)(R)R":
+                functional_group = "[O-][n+](a)a"  # currently overlaps with neighboring aromatic atoms
+
+            # other aromatic hetero atoms
+            functional_group = functional_group.replace("C*", "c")
+            functional_group = functional_group.replace("N*", "n")
+            functional_group = functional_group.replace("S*", "s")
+            functional_group = functional_group.replace("O*", "o")
+            functional_group = functional_group.replace("Se*", "se")
+            functional_group = functional_group.replace("P*", "p")
+
+            # other replacement, to accommodate for the standardization rules in enviPath
+            # This is not the perfect way to do it; there should be a way to replace substructure SMARTS in SMARTS?
+            # nitro groups are broken, due to charge handling. this SMARTS matches both forms (formal charges and hypervalent); Ertl-CDK still treats both forms separately...
+            functional_group = functional_group.replace("[H]O[N](=O)R", "[CX4,c][NX3](~[OX1])~[OX1]")
+            functional_group = functional_group.replace("O=N(=O)R", "[CX4,c][NX3](~[OX1])~[OX1]")
+            # carboxylic acid: this SMARTS matches both neutral and anionic form; includes COOH in larger functional_groups
+            functional_group = functional_group.replace("[H]OC(=O)", "[OD1]C(=O)")
+            # azo
+            functional_group = functional_group.replace("N#[N]N(R)R", "[NX1]~[NX2]~N[CX4,c]")
+            functional_group = functional_group.replace("RN=[N]=NR", "[NX1]~[NX2]~N[CX4,c]")
+            # TODO: there might be more problematic groups, which we have yet to find.
+
+            # other environment atoms (can be aromatic or aliphatic Csp3, or H)
+            functional_group = functional_group.replace("R", "[H,CX4,c]")
+
+            # Fixed point reached: a whole pass changed nothing.
+            if copy == functional_group:
+                break
+
+        return functional_group
+
+    @staticmethod
+    def _colorize(indigo: Indigo, molecule: IndigoObject, functional_groups: Dict[str, int], is_reaction: bool):
+        """Attach a "color" data S-group to every atom matched by a functional group.
+
+        :param indigo: Indigo instance; its rendering options are changed here.
+        :param molecule: structure to annotate (modified in place).
+        :param functional_groups: maps functional-group labels to an integer
+            value; per atom the largest value of all matching groups wins.
+        :param is_reaction: use the single reaction color for all atoms
+            instead of choosing the color by value.
+        """
+        indigo.setOption("render-atom-color-property", "color")
+        indigo.setOption("aromaticity-model", "generic")
+
+        matcher = indigo.substructureMatcher(molecule)
+
+        # Determine environment atoms as they will be colored black
+        environment = set()
+        for env_smarts in ("[CX4]", "c"):
+            env_query = indigo.loadSmarts(env_smarts)
+            for match in matcher.iterateMatches(env_query):
+                if match is None:
+                    continue
+                for query_atom in env_query.iterateAtoms():
+                    target = match.mapAtom(query_atom)
+                    if target is not None:
+                        environment.add(target.index())
+
+        # Largest value seen per atom index over all matching groups.
+        counts = defaultdict(int)
+        for group, value in functional_groups.items():
+            group_query = indigo.loadSmarts(IndigoUtils.sanitize_functional_group(group))
+            for match in matcher.iterateMatches(group_query):
+                if match is None:
+                    continue
+                for query_atom in group_query.iterateAtoms():
+                    target = match.mapAtom(query_atom)
+                    if target is None or target.index() in environment:
+                        continue
+                    atom_index = target.index()
+                    counts[atom_index] = max(value, counts[atom_index])
+
+        for atom_index, value in counts.items():
+            if is_reaction:
+                color = "128, 0, 128"
+            elif value <= 5:
+                color = "200, 0, 0"
+            else:
+                color = "0, 112, 0"
+
+            molecule.addDataSGroup([atom_index], [], "color", color)
+
+    @staticmethod
+    def mol_to_svg(mol_data: str, width: int = 0, height: int = 0, functional_groups: Dict[str, int] = None):
+        """Render the molecule in ``mol_data`` as an SVG string.
+
+        When ``functional_groups`` is given and non-empty, Indigo's own
+        coloring is switched off and the matched atoms are colored via
+        ``_colorize`` instead.
+        """
+        groups = {} if functional_groups is None else functional_groups
+        has_groups = len(groups.keys()) > 0
+
+        session = Indigo()
+        renderer = IndigoRenderer(session)
+
+        session.setOption("render-output-format", "svg")
+        session.setOption("render-coloring", not has_groups)
+        session.setOption("render-image-size", width, height)
+        session.setOption("render-bond-line-width", 2.0)
+
+        mol = session.loadMolecule(mol_data)
+
+        if has_groups:
+            IndigoUtils._colorize(session, mol, groups, False)
+
+        svg_bytes = renderer.renderToBuffer(mol)
+        return svg_bytes.decode('UTF-8')
+
+    @staticmethod
+    def smirks_to_svg(smirks: str, is_query_smirks, width: int = 0, height: int = 0,
+                      educt_functional_groups: Dict[str, int] = None, product_functional_groups: Dict[str, int] = None):
+        """Render the reaction in ``smirks`` as an SVG string.
+
+        Functional-group coloring is applied only when the input is loaded as
+        a plain reaction (``is_query_smirks`` false); reaction SMARTS are
+        rendered without it.
+        """
+        educt_groups = {} if educt_functional_groups is None else educt_functional_groups
+        product_groups = {} if product_functional_groups is None else product_functional_groups
+
+        session = Indigo()
+        renderer = IndigoRenderer(session)
+
+        session.setOption("render-output-format", "svg")
+        session.setOption("render-coloring", True)
+        session.setOption("render-image-size", width, height)
+
+        if is_query_smirks:
+            obj = session.loadReactionSmarts(smirks)
+        else:
+            obj = session.loadReaction(smirks)
+
+            if len(educt_groups.keys()) > 0:
+                for react in obj.iterateReactants():
+                    IndigoUtils._colorize(session, react, educt_groups, True)
+
+            if len(product_groups.keys()) > 0:
+                for prod in obj.iterateProducts():
+                    IndigoUtils._colorize(session, prod, product_groups, True)
+
+        svg_bytes = renderer.renderToBuffer(obj)
+        return svg_bytes.decode('UTF-8')
+
+
+# Manual smoke test: aromatize a six-carbon ring with alternating double and
+# single bonds (a Ketcher molfile) and print the resulting molfile.
+if __name__ == '__main__':
+    data = {
+        "struct": "\n  Ketcher  2172510 12D 1   1.00000     0.00000     0\n\n  6  6  0     0  0            999 V2000\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.5000   -0.8660    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.0000   -1.7321    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000   -1.7321    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.5000   -0.8660    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n  1  2  2  0  0  0  0\n  2  3  1  0  0  0  0\n  3  4  2  0  0  0  0\n  4  5  1  0  0  0  0\n  5  6  2  0  0  0  0\n  6  1  1  0  0  0  0\nM  END\n",
+        # NOTE: "options" is not read anywhere in this script.
+        "options": {
+            "smart-layout": True,
+            "ignore-stereochemistry-errors": True,
+            "mass-skip-error-on-pseudoatoms": False,
+            "gross-formula-add-rsites": True
+        }
+    }
+
+    # is_query=False: load with the plain (non-query) molecule loader.
+    print(IndigoUtils.aromatize(data['struct'], False))
--- a/utilities/clients.py
+++ b/utilities/clients.py
@ -0,0 +1,83 @@
+import json
+
+import requests
+
+
+class AMBITResult:
+    """Transformation products reported for one SMILES.
+
+    Only entries of ``products`` that themselves carry a non-empty
+    ``'products'`` list are kept in ``tps``.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self.smiles = kwargs['smiles']
+        self.tps = [bt for bt in kwargs['products'] if len(bt['products'])]
+        self.probs = None
+
+    def __str__(self):
+        """Render the result as a tab-indented tree; the last line has no newline."""
+        parts = [self.smiles + "\n"]
+        last_bt = len(self.tps) - 1
+
+        for i, tp in enumerate(self.tps):
+            is_last_bt = i == last_bt
+            prob = f" (p={self.probs[tp['id']]})" if self.probs else ""
+
+            branch = "└──" if is_last_bt else "├──"
+            parts.append(f"\t{branch} {tp['name']}{prob}\n")
+
+            # Children of the last entry need no vertical connector line.
+            indent = "\t\t" if is_last_bt else "\t│\t"
+            last_product = len(tp['products']) - 1
+            for j, p in enumerate(tp['products']):
+                is_last_product = j == last_product
+                leaf = "└──" if is_last_product else "├──"
+                # Only the very last line of the whole tree ends without "\n".
+                newline = "" if (is_last_bt and is_last_product) else "\n"
+                parts.append(f"{indent}{leaf} {p}{newline}")
+
+        return "".join(parts)
+
+    def set_probs(self, probs):
+        """Store ``probs``; ``__str__`` looks entries up by each transformation's ``'id'``."""
+        self.probs = probs
+
+
+class AMBIT:
+    """Small HTTP client that POSTs SMILES to ``<host>/ambit``."""
+
+    def __init__(self, host, rules=None, timeout=None):
+        """
+        :param host: base URL of the service; ``'/ambit'`` is appended.
+        :param rules: value sent as ``'rules'`` in every request.
+        :param timeout: passed to ``requests.post``. The default ``None``
+            waits indefinitely, as before; set a number of seconds to avoid
+            hanging on an unresponsive service.
+        """
+        self.host = host
+        self.rules = rules
+        self.timeout = timeout
+        self.ambit_params = {
+            'singlePos': True,
+            'split': False,
+        }
+
+    def batch_apply(self, smiles: list):
+        """Send all ``smiles`` in one request.
+
+        :return: one entry per item of the response's ``'result'`` list: an
+            ``AMBITResult``, or None when it holds no non-empty transformation.
+        """
+        payload = {
+            'smiles': smiles,
+            'rules': self.rules,
+        }
+        payload.update(**self.ambit_params)
+
+        res = self._execute(payload)
+
+        tps = []
+        for r in res['result']:
+            ar = AMBITResult(**r)
+            tps.append(ar if len(ar.tps) else None)
+        return tps
+
+    def apply(self, smiles: str):
+        """Single-SMILES convenience wrapper around ``batch_apply``."""
+        return self.batch_apply([smiles])[0]
+
+    def _execute(self, payload):
+        """POST ``payload`` as a JSON string; raise on HTTP errors, return the decoded JSON."""
+        res = requests.post(self.host + '/ambit', data=json.dumps(payload), timeout=self.timeout)
+        res.raise_for_status()
+        return res.json()
--- a/utilities/dataclasses.py
+++ b/utilities/dataclasses.py
--- a/utilities/ml.py
+++ b/utilities/ml.py
@ -0,0 +1,239 @@
+from __future__ import annotations
+
+import dataclasses
+from collections import defaultdict
+from datetime import datetime
+from typing import List, Optional
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.decomposition import PCA
+from sklearn.metrics import accuracy_score
+from sklearn.multioutput import ClassifierChain
+from sklearn.preprocessing import StandardScaler
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+
+# @dataclasses.dataclass
+# class Feature:
+#     name: str
+#     value: float
+#
+#
+#
+# class Row:
+#     def __init__(self, compound_uuid: str, compound_smiles: str, descriptors: List[int]):
+#         self.data = {}
+#
+#
+#
+# class DataSet(object):
+#
+#     def __init__(self):
+#         self.rows: List[Row] = []
+#
+#     def add_row(self, row: Row):
+#         pass
+
+from dataclasses import dataclass, field
+
+from utilities.chem import FormatConverter
+
+
+@dataclass
+class Compound:
+    """A compound identified by its SMILES string.
+
+    ``uuid`` is excluded from comparison, so equality and hashing depend on
+    ``smiles`` only.
+    """
+    smiles: str
+    uuid: str = field(default=None, compare=False, hash=False)
+
+    def __hash__(self):
+        # The hash is memoised on first use; it is not refreshed if ``smiles``
+        # is reassigned afterwards.
+        try:
+            return self._hash
+        except AttributeError:
+            self._hash = hash(self.smiles)
+            return self._hash
+
+
+@dataclass
+class Reaction:
+    """A reaction described by its educts and products.
+
+    Equality and hashing ignore the order of educts and products as well as
+    ``rule_uuid`` and ``reaction_uuid``: both sides are sorted by SMILES
+    before they are compared or hashed.
+    """
+    educts: List[Compound]
+    products: List[Compound]
+    rule_uuid: str = field(default=None, compare=False, hash=False)
+    reaction_uuid: str = field(default=None, compare=False, hash=False)
+
+    @staticmethod
+    def _by_smiles(compounds):
+        """Return ``compounds`` as a new list ordered by their ``smiles``."""
+        def smiles_of(compound):
+            return compound.smiles
+
+        return sorted(compounds, key=smiles_of)
+
+    def __hash__(self):
+        # Memoised on first use; later changes to educts/products do not
+        # refresh the stored value.
+        try:
+            return self._hash
+        except AttributeError:
+            self._hash = hash((
+                tuple(self._by_smiles(self.educts)),
+                tuple(self._by_smiles(self.products)),
+            ))
+            return self._hash
+
+    def __eq__(self, other):
+        if isinstance(other, Reaction):
+            sides = ((self.educts, other.educts), (self.products, other.products))
+            # Products are only inspected when the educts already match.
+            return all(self._by_smiles(mine) == self._by_smiles(theirs) for mine, theirs in sides)
+        return NotImplemented
+
+
+class Dataset(object):
+    """Container for a table: column ``headers`` plus the row ``data``.
+
+    The export/selection methods are placeholders that currently do nothing
+    and return ``None``.
+    """
+
+    def __init__(self, headers: Optional[List[str]] = None,
+                 data: Optional[List[List[str | int | float]]] = None):
+        """Store the headers and rows.
+
+        The type hints were previously written as default values (``=``
+        instead of ``:``), so ``Dataset()`` stored typing objects as its data.
+        Omitted arguments now yield fresh empty lists.
+
+        :param headers: column names; defaults to an empty list.
+        :param data: rows of cell values; defaults to an empty list.
+        """
+        self.headers = [] if headers is None else headers
+        self.data = [] if data is None else data
+
+    def features(self):
+        """Placeholder; not implemented yet (returns ``None``)."""
+        pass
+
+    def labels(self):
+        """Placeholder; not implemented yet (returns ``None``)."""
+        pass
+
+    def to_json(self):
+        """Placeholder; not implemented yet (returns ``None``)."""
+        pass
+
+    def to_csv(self):
+        """Placeholder; not implemented yet (returns ``None``)."""
+        pass
+
+    def to_arff(self):
+        """Placeholder; not implemented yet (returns ``None``)."""
+        pass
+
+
+
+class DatasetGenerator(object):
+    """Builds per-compound tables of the reactions produced by a set of rules."""
+
+    @staticmethod
+    def generate_dataset(compounds: List[Compound], reactions: List[Reaction], applicable_rules: List['Rule'],
+                         compounds_to_exclude: Optional[Compound] = None,
+                         educts_only: bool = False) -> List[List[List[Reaction]]]:
+        """Apply every rule to every compound and collect the resulting reactions.
+
+        :param compounds: compounds to process; ignored when ``educts_only`` is set.
+        :param reactions: only read when ``educts_only`` is set, to collect educts.
+        :param applicable_rules: iterable of objects exposing ``.rule.apply(smiles)``.
+        :param compounds_to_exclude: currently unused.
+        :param educts_only: when true, process the distinct educts of
+            ``reactions`` (in first-seen order) instead of ``compounds``.
+        :return: one row per processed compound; each row holds one list per
+            rule with the distinct reactions that rule produced (an empty list
+            when the rule yielded no product sets). Progress is printed per
+            compound.
+        """
+        if educts_only:
+            # dict.fromkeys de-duplicates while keeping first-seen order, so
+            # the row order is reproducible; a plain set would order the
+            # compounds by their (per-process randomised) string hashes.
+            compounds = list(dict.fromkeys(e for r in reactions for e in r.educts))
+
+        rows = []
+        total = len(compounds)
+        for i, c in enumerate(compounds):
+            print(f"{i + 1}/{total} - {c.smiles}")
+            row = []
+            for r in applicable_rules:
+                product_sets = r.rule.apply(c.smiles)
+
+                if not product_sets:
+                    row.append([])
+                    continue
+
+                # NOTE(review): the rule wrapper ``r`` itself is stored in the
+                # ``rule_uuid`` slot of Reaction; confirm whether ``r.uuid``
+                # was intended.
+                reacts = dict.fromkeys(
+                    Reaction([c], [Compound(FormatConverter.standardize(p)) for p in ps], r)
+                    for ps in product_sets
+                )
+                # Distinct reactions in the order the rule produced them.
+                row.append(list(reacts))
+
+            rows.append(row)
+
+        return rows
+
+
+class SparseLabelECC(BaseEstimator, ClassifierMixin):
+    """
+    Ensemble of Classifier Chains with sparse label removal.
+    Removes labels that are constant across all samples in training and
+    re-inserts their constant value at prediction time.
+    """
+
+    def __init__(self, base_clf=RandomForestClassifier(n_estimators=100, max_features='log2', random_state=42),
+                 num_chains: int = 10):
+        self.base_clf = base_clf
+        self.num_chains = num_chains
+
+    def fit(self, X, Y):
+        """Fit ``num_chains`` randomly ordered classifier chains on the varying labels.
+
+        Label columns with a single distinct value are remembered in
+        ``removed_labels_`` and left out of training. If no column varies, no
+        chain is fitted: a chain cannot be trained on a target without
+        columns, and the predictions are then fully determined by the
+        remembered constants.
+
+        :param X: training features, passed to each chain unchanged.
+        :param Y: 2-D label matrix (samples x labels).
+        :return: ``self``
+        """
+        y = np.array(Y)
+        self.n_labels_ = y.shape[1]
+        self.removed_labels_ = {}
+        self.keep_columns_ = []
+
+        for col in range(self.n_labels_):
+            unique_values = np.unique(y[:, col])
+            if len(unique_values) == 1:
+                self.removed_labels_[col] = unique_values[0]
+            else:
+                self.keep_columns_.append(col)
+
+        self.chains_ = []
+        if not self.keep_columns_:
+            # Every label is constant: nothing to learn.
+            return self
+
+        y_reduced = y[:, self.keep_columns_]
+        self.chains_ = [ClassifierChain(self.base_clf, order='random', random_state=i)
+                        for i in range(self.num_chains)]
+
+        for i, chain in enumerate(self.chains_):
+            print(f"{datetime.now()} fitting {i + 1}/{self.num_chains}")
+            chain.fit(X, y_reduced)
+
+        return self
+
+    def _restore_labels(self, X, kept_values, cast):
+        """Scatter chain outputs and remembered constants into a full label matrix.
+
+        :param X: prediction input; only used for the row count when no chain
+            output exists.
+        :param kept_values: array (samples x kept labels), or ``None`` when no
+            label column was kept during ``fit``.
+        :param cast: conversion applied to each remembered constant.
+        """
+        n_samples = np.shape(X)[0] if kept_values is None else kept_values.shape[0]
+        full_y = np.zeros((n_samples, self.n_labels_))
+
+        if kept_values is not None:
+            full_y[:, self.keep_columns_] = kept_values
+
+        for col, value in self.removed_labels_.items():
+            full_y[:, col] = cast(value)
+
+        return full_y
+
+    def predict(self, X, threshold=0.5):
+        """Predict labels: a kept label is 1 when the mean chain vote exceeds ``threshold``."""
+        avg_preds = None
+        if self.keep_columns_:
+            avg_preds = np.mean([chain.predict(X) for chain in self.chains_], axis=0) > threshold
+        return self._restore_labels(X, avg_preds, bool)
+
+    def predict_proba(self, X):
+        """Return the chain-averaged probabilities; removed labels get their constant value."""
+        avg_proba = None
+        if self.keep_columns_:
+            avg_proba = np.mean([chain.predict_proba(X) for chain in self.chains_], axis=0)
+        return self._restore_labels(X, avg_proba, float)
+
+    def score(self, X, Y, sample_weight=None):
+        """
+        Default scoring using subset accuracy (exact match).
+        """
+        y_true = np.array(Y)
+        y_pred = self.predict(X)
+        return accuracy_score(y_true, y_pred, sample_weight=sample_weight)
+
+
+class ApplicabilityDomain(PCA):
+    """Bounding-box applicability domain in standardised PCA space.
+
+    ``build`` scales the data, fits the PCA and records the per-component
+    minimum and maximum of the projected training data. ``is_applicable``
+    reports, per instance, whether every projected component lies inside
+    that range (bounds included).
+    """
+
+    def __init__(self, n_components=5):
+        super().__init__(n_components=n_components)
+        self.scaler = StandardScaler()
+        self.min_vals = None
+        self.max_vals = None
+
+    def build(self, X):
+        """Fit scaler and PCA on ``X`` and store the projected value ranges."""
+        projected = self.fit_transform(self.scaler.fit_transform(X))
+
+        self.min_vals = np.min(projected, axis=0)
+        self.max_vals = np.max(projected, axis=0)
+
+    def is_applicable(self, instances):
+        """Return a list of booleans, one per row of ``instances``."""
+        projected = self.transform(self.scaler.transform(instances))
+
+        # A row is applicable only if all of its components are within bounds.
+        within_bounds = (self.min_vals <= projected) & (projected <= self.max_vals)
+        return within_bounds.all(axis=1).tolist()
--- a/utilities/plugin.py
+++ b/utilities/plugin.py
@ -0,0 +1,66 @@
+import glob
+import importlib.metadata
+import os
+import subprocess
+from typing import Dict, Type
+
+from django.conf import settings as s
+from envipy_plugins import Descriptor, Classifier, Property
+
+
+def is_installed(package_name):
+    """Return ``True`` when distribution metadata for ``package_name`` is found."""
+    try:
+        importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return False
+    else:
+        return True
+
+
+def install_wheel(wheel_path):
+    """Announce and install the wheel at ``wheel_path`` with ``uv pip install``.
+
+    ``subprocess.check_call`` raises ``CalledProcessError`` when the command
+    exits with a non-zero status.
+    """
+    print(f"Installing wheel: {wheel_path}")
+    command = ["uv", "pip", "install", wheel_path]
+    subprocess.check_call(command)
+
+
+def extract_package_name_from_wheel(wheel_filename):
+    """Return the part of ``wheel_filename`` before its first ``-``.
+
+    Example: ``my_plugin-0.1.0-py3-none-any.whl`` -> ``my_plugin``. A name
+    without any ``-`` is returned unchanged.
+    """
+    name, _, _ = wheel_filename.partition('-')
+    return name
+
+
+def ensure_plugins_installed():
+    """Install every ``*.whl`` in ``settings.PLUGIN_DIR`` that is not installed yet.
+
+    The package name is derived from the wheel's file name; wheels whose
+    package is already installed are only reported.
+    """
+    pattern = os.path.join(s.PLUGIN_DIR, '*.whl')
+
+    for wheel_path in glob.glob(pattern):
+        package_name = extract_package_name_from_wheel(os.path.basename(wheel_path))
+
+        if is_installed(package_name):
+            print(f"Plugin already installed: {package_name}")
+            continue
+
+        install_wheel(wheel_path)
+
+
+def discover_plugins(_cls: Type = None) -> Dict[str, Type]:
+    """Load and instantiate the plugins registered under ``enviPy_plugins``.
+
+    Missing plugin wheels are installed first. Each entry point is loaded and
+    kept only if it is a subclass of ``_cls`` or, when ``_cls`` is not given,
+    of ``Classifier``, ``Descriptor`` or ``Property``. A plugin that fails to
+    load or instantiate is reported on stdout and skipped.
+
+    :return: mapping from each plugin instance's ``name()`` to that instance.
+    """
+    ensure_plugins_installed()
+
+    accepted_bases = _cls if _cls else (Classifier, Descriptor, Property)
+    plugins = {}
+
+    for entry_point in importlib.metadata.entry_points(group='enviPy_plugins'):
+        try:
+            plugin_class = entry_point.load()
+            if not issubclass(plugin_class, accepted_bases):
+                continue
+            instance = plugin_class()
+            plugins[instance.name()] = instance
+        except Exception as e:
+            print(f"Error loading plugin {entry_point.name}: {e}")
+
+    return plugins