# enviPy-bayer/utilities/chem.py

import logging
import re
from abc import ABC
from collections import defaultdict
from typing import List, Optional, Dict, TYPE_CHECKING

from indigo import Indigo, IndigoException, IndigoObject
from indigo.renderer import IndigoRenderer
from rdkit import Chem, rdBase
from rdkit.Chem import MACCSkeys, Descriptors, rdFingerprintGenerator
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdmolops import GetMolFrags
from rdkit.Contrib.IFG import ifg

if TYPE_CHECKING:
    from epdb.models import Rule

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Disable the RDKit log channels matching "rdApp.*" as soon as this module is imported.
rdBase.DisableLog("rdApp.*")

# from rdkit import rdBase
# rdBase.LogToPythonLogger()
# pylog = logging.getLogger("rdkit")
# pylog.propagate = False
# pylog.disabled = True
# print(pylog.disabled)


class ProductSet(object):
    """Order-insensitive collection of product SMILES strings.

    Two instances are equal when their sorted SMILES lists are equal, and
    equal instances produce the same hash, so instances can be stored in
    sets and used as dictionary keys.
    """

    def __init__(self, product_set: List[str]):
        # The list is stored as given (no copy, no sorting).
        self.product_set = product_set

    def __repr__(self):
        return repr(self.product_set)

    def __len__(self):
        return len(self.product_set)

    def __iter__(self):
        return iter(self.product_set)

    def __eq__(self, other):
        # Anything that is not a ProductSet is never equal.
        if not isinstance(other, ProductSet):
            return False
        return sorted(self.product_set) == sorted(other.product_set)

    def __hash__(self):
        # Hash the sorted content so that element order does not matter.
        ordered = sorted(self.product_set)
        return hash("-".join(ordered))


class PredictionResult(object):
    """Product sets together with a probability and an optional rule.

    The object behaves like a sized iterable over ``product_sets``.
    """

    def __init__(
        self, product_sets: List["ProductSet"], probability: float, rule: Optional["Rule"] = None
    ):
        self.rule = rule
        self.probability = probability
        self.product_sets = product_sets

    def __repr__(self):
        # Probability is shown with two decimals, followed by rule and products.
        return f"--{self.probability:.2f}/{self.rule}--> {self.product_sets}"

    def __iter__(self):
        return iter(self.product_sets)

    def __len__(self):
        return len(self.product_sets)


class FormatConverter(object):
    @staticmethod
    def mass(smiles):
        """Return ``Descriptors.MolWt`` of the molecule parsed from ``smiles``."""
        mol = FormatConverter.from_smiles(smiles)
        return Descriptors.MolWt(mol)

    @staticmethod
    def charge(smiles):
        """Return the formal charge of the molecule parsed from ``smiles``."""
        mol = FormatConverter.from_smiles(smiles)
        return Chem.GetFormalCharge(mol)

    @staticmethod
    def formula(smiles):
        """Return the molecular formula computed by RDKit's ``CalcMolFormula``."""
        mol = FormatConverter.from_smiles(smiles)
        return Chem.rdMolDescriptors.CalcMolFormula(mol)

    @staticmethod
    def from_smiles(smiles):
        """Parse ``smiles`` with ``Chem.MolFromSmiles`` and return the result.

        NOTE(review): RDKit presumably returns ``None`` for unparseable input;
        this method passes that result through unchecked.
        """
        mol = Chem.MolFromSmiles(smiles)
        return mol

    @staticmethod
    def to_smiles(mol, canonical=False):
        """Serialize ``mol`` to SMILES, forwarding ``canonical`` to RDKit."""
        smiles = Chem.MolToSmiles(mol, canonical=canonical)
        return smiles

    @staticmethod
    def InChIKey(smiles):
        """Return the InChIKey of the molecule parsed from ``smiles``."""
        mol = FormatConverter.from_smiles(smiles)
        return Chem.MolToInchiKey(mol)

    @staticmethod
    def InChI(smiles):
        """Return the InChI string of the molecule parsed from ``smiles``."""
        mol = FormatConverter.from_smiles(smiles)
        return Chem.MolToInchi(mol)

    @staticmethod
    def canonicalize(smiles: str):
        """Round-trip ``smiles`` through RDKit and return its canonical SMILES."""
        mol = FormatConverter.from_smiles(smiles)
        return FormatConverter.to_smiles(mol, canonical=True)

    @staticmethod
    def maccs(smiles):
        """Return the MACCS keys fingerprint of ``smiles`` as a plain list.

        Unlike ``MACCS``, which returns the RDKit bit vector itself, this
        method converts the bit vector with ``ToList()``.
        """
        fingerprint = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
        return fingerprint.ToList()

    @staticmethod
    def morgan(smiles, radius=3, fpSize=2048):
        """Return the Morgan fingerprint of ``smiles`` as a plain list.

        Args:
            smiles: SMILES string of the molecule.
            radius: Forwarded to ``GetMorganGenerator``.
            fpSize: Forwarded to ``GetMorganGenerator``.
        """
        generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fpSize)
        fingerprint = generator.GetFingerprint(Chem.MolFromSmiles(smiles))
        return fingerprint.ToList()

    @staticmethod
    def get_functional_groups(smiles: str) -> List[str]:
        """Collect the ``atoms`` field of every functional group found in ``smiles``.

        Groups are detected with ``ifg.identify_functional_groups``. An
        ``AttributeError`` raised along the way is logged at debug level and
        whatever was collected up to that point is returned.
        """
        groups = []

        try:
            mol = Chem.MolFromSmiles(smiles)
            # TODO atoms or type?
            groups.extend(fg.atoms for fg in ifg.identify_functional_groups(mol))
        except AttributeError:
            logger.debug(f"Could not get functional groups for {smiles}")

        return groups

    @staticmethod
    def to_svg(smiles, mol_size=(200, 150), kekulize=True):
        """Render the molecule described by ``smiles`` as an SVG string.

        Args:
            smiles: SMILES string of the molecule to draw.
            mol_size: ``(width, height)`` passed to ``MolDraw2DSVG``.
            kekulize: If True, try to kekulize the molecule before drawing;
                when that fails the un-kekulized molecule is drawn instead.

        Returns:
            The SVG text with every ``svg:`` prefix and the XML declaration
            removed.
        """
        mol = FormatConverter.from_smiles(smiles)

        # Work on a copy so a failed kekulization cannot leave a half-modified
        # molecule behind.
        drawable = Chem.Mol(mol.ToBinary())
        if kekulize:
            try:
                # Chem.Kekulize modifies the molecule in place and returns
                # None, so its result must not be assigned.
                Chem.Kekulize(drawable)
            except Exception:
                drawable = Chem.Mol(mol.ToBinary())

        if not drawable.GetNumConformers():
            Chem.rdDepictor.Compute2DCoords(drawable)

        drawer = rdMolDraw2D.MolDraw2DSVG(*mol_size)
        drawer.drawOptions().clearBackground = False
        drawer.DrawMolecule(drawable)
        drawer.FinishDrawing()

        svg = drawer.GetDrawingText().replace("svg:", "")
        # Raw string: "\?" is a regex escape, not a Python string escape.
        return re.sub(r"<\?xml.*\?>", "", svg)

    @staticmethod
    def to_png(smiles, mol_size=(200, 150), kekulize=True):
        """Render the molecule described by ``smiles`` as PNG data.

        Mirrors ``to_svg`` but draws with ``rdMolDraw2D.MolDraw2DCairo``.

        Args:
            smiles: SMILES string of the molecule to draw.
            mol_size: ``(width, height)`` passed to ``MolDraw2DCairo``.
            kekulize: If True, try to kekulize the molecule before drawing;
                when that fails the un-kekulized molecule is drawn instead.

        Returns:
            The drawing text produced by the Cairo drawer.
            NOTE(review): expected to be PNG bytes and requires an RDKit
            build with Cairo support - confirm in the deployment environment.
        """
        mol = FormatConverter.from_smiles(smiles)

        # Work on a copy so a failed kekulization cannot leave a half-modified
        # molecule behind.
        drawable = Chem.Mol(mol.ToBinary())
        if kekulize:
            try:
                # In-place operation; returns None.
                Chem.Kekulize(drawable)
            except Exception:
                drawable = Chem.Mol(mol.ToBinary())

        if not drawable.GetNumConformers():
            Chem.rdDepictor.Compute2DCoords(drawable)

        drawer = rdMolDraw2D.MolDraw2DCairo(*mol_size)
        drawer.drawOptions().clearBackground = False
        drawer.DrawMolecule(drawable)
        drawer.FinishDrawing()

        return drawer.GetDrawingText()

    @staticmethod
    def normalize(smiles):
        # TODO call to AMBIT Service
        return smiles

    @staticmethod
    def ep_standardize(smiles):
        """Apply every standardizer in ``MATCH_STANDARDIZER`` until nothing changes.

        Each round runs all standardizers in list order, feeding the output
        of one into the next. Rounds repeat as long as at least one
        standardizer changed the value.

        Args:
            smiles: SMILES string to standardize.

        Returns:
            The value left after the first round in which no standardizer
            changed it.
        """
        changed = True
        while changed:
            changed = False
            for standardizer in MATCH_STANDARDIZER:
                candidate = standardizer.standardize(smiles)

                if candidate != smiles:
                    # Diagnostics go through the module logger, not stdout.
                    logger.debug(f"change {smiles} to {candidate}")
                    changed = True
                    smiles = candidate

            if not changed:
                logger.debug("nothing changed")

        return smiles

    @staticmethod
    def standardize(smiles, remove_stereo=False, canonicalize_tautomers=False):
        """Standardize ``smiles`` with RDKit's ``rdMolStandardize`` tools.

        The procedure is taken from
        https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
        which follows
        https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
        as presented by Greg Landrum in https://www.youtube.com/watch?v=eWTApNX8dJQ

        The goal is a single standard representation for molecules coming from
        different sources (for ML, catalogues, ...). No re-ionization and no
        pH-dependent ionization is attempted after neutralizing.

        Args:
            smiles: SMILES string to standardize.
            remove_stereo: If True, strip stereochemistry from the result.
            canonicalize_tautomers: If True, replace the result by the
                tautomer chosen by ``TautomerEnumerator.Canonicalize``.

        Returns:
            The SMILES of the standardized molecule, written with
            ``kekuleSmiles=True``.
        """
        parsed = Chem.MolFromSmiles(smiles)

        # Cleanup: remove Hs, disconnect metal atoms, normalize and reionize.
        cleaned = rdMolStandardize.Cleanup(parsed)

        # With several fragments, keep only the "parent" fragment of interest.
        parent = rdMolStandardize.FragmentParent(cleaned)

        # Neutralize; an Uncharger instance is needed as there is no
        # convenience function for this step.
        neutral = rdMolStandardize.Uncharger().uncharge(parent)

        if remove_stereo:
            Chem.RemoveStereochemistry(neutral)

        if canonicalize_tautomers:
            enumerator = rdMolStandardize.TautomerEnumerator()
            neutral = enumerator.Canonicalize(neutral)

        return Chem.MolToSmiles(neutral, kekuleSmiles=True)

    @staticmethod
    def neutralize_smiles(smiles):
        """Parse ``smiles``, neutralize the molecule and return its SMILES."""
        neutral = FormatConverter.neutralize_molecule(Chem.MolFromSmiles(smiles))
        return Chem.MolToSmiles(neutral)

    @staticmethod
    def neutralize_molecule(mol):
        """Set the formal charge of every atom matched by the SMARTS below to 0.

        For each matched atom the explicit hydrogen count is set to its
        previous total hydrogen count minus its previous formal charge.
        The molecule is modified in place and also returned.
        """
        charged_atom = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")

        # The pattern describes a single atom, so each match has one index.
        for match in mol.GetSubstructMatches(charged_atom):
            atom = mol.GetAtomWithIdx(match[0])
            old_charge = atom.GetFormalCharge()
            old_h_count = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(old_h_count - old_charge)
            atom.UpdatePropertyCache()

        return mol

    @staticmethod
    def is_valid_smirks(smirks: str) -> bool:
        try:
            rdChemReactions.ReactionFromSmarts(smirks)
            return True
        except Exception:
            return False

    @staticmethod
    def is_valid_smarts(smarts: str) -> bool:
        """
        Checks whether a given string is a valid SMARTS pattern.

        Parameters
        ----------
        smarts : str
            The SMARTS string to validate.

        Returns
        -------
        bool
            True if the SMARTS string is valid, False otherwise.
        """
        if not isinstance(smarts, str) or not smarts.strip():
            return False

        try:
            mol = Chem.MolFromSmarts(smarts)
            return mol is not None
        except Exception:
            return False

    @staticmethod
    def apply(
        smiles: str,
        smirks: str,
        preprocess_smiles: bool = True,
        bracketize: bool = True,
        standardize: bool = True,
        kekulize: bool = True,
        remove_stereo: bool = True,
    ) -> List["ProductSet"]:
        """Apply the reaction ``smirks`` to ``smiles`` and collect the products.

        Args:
            smiles: SMILES of the single reactant.
            smirks: Reaction SMARTS of the form ``educts>>products``.
            preprocess_smiles: If True, sanitize the reactant and add explicit
                hydrogens before running the reaction.
            bracketize: If True, wrap the product side in parentheses so that
                all products of the rule are captured together.
            standardize: Currently unused.
            kekulize: Currently unused.
            remove_stereo: Forwarded to ``FormatConverter.standardize`` for
                every product fragment.

        Returns:
            A ``set`` of ``ProductSet`` objects (note: a set, despite the
            ``List`` annotation), one per reaction site that yielded at least
            one product. Any failure, including a SMIRKS without ``>>``, is
            logged and yields whatever was collected so far (possibly an
            empty set).
        """
        logger.debug(f"Applying {smirks} on {smiles}")

        pss = set()
        try:
            # Bracketizing happens inside the try block so that a SMIRKS
            # without ">>" is logged like every other failure.
            if bracketize:  # or "." in smirks:
                educts, products = smirks.split(">>")[:2]
                smirks = educts + ">>(" + products + ")"

            rxn = rdChemReactions.ReactionFromSmarts(smirks)
            mol = Chem.MolFromSmiles(smiles)

            if preprocess_smiles:
                Chem.SanitizeMol(mol)  # in place
                mol = Chem.AddHs(mol)

            sites = rxn.RunReactants((mol,))
            logger.debug(f"{len(sites)} products sets generated")

            for product_set in sites:
                prods = []
                for product in product_set:
                    try:
                        Chem.SanitizeMol(product)
                        # A product may consist of several disconnected
                        # fragments; each one is standardized separately.
                        for fragment in GetMolFrags(product, asMols=True):
                            prods.append(
                                FormatConverter.standardize(
                                    Chem.MolToSmiles(fragment), remove_stereo=remove_stereo
                                )
                            )
                    except ValueError as e:
                        # Skip only this product; the others of the same
                        # site are still collected.
                        logger.error(f"Sanitizing and converting failed:\n{e}")
                        continue

                if prods:
                    pss.add(ProductSet(prods))

        except Exception as e:
            logger.error(f"Applying {smirks} on {smiles} failed:\n{e}")

        return pss

    @staticmethod
    def MACCS(smiles):
        """Return the MACCS keys bit vector of ``smiles`` (not converted to a list)."""
        mol = FormatConverter.from_smiles(smiles)
        return MACCSkeys.GenMACCSKeys(mol)

    @staticmethod
    def sanitize_smiles(smiles_list: List):
        parsed_smiles = []
        errors = 0
        for smi in smiles_list:
            try:
                # # Remove Stereo and Flatten
                # if "/" in smi:
                #     smi = smi.replace("/", "")
                # if "\\" in smi:
                #     smi = smi.replace("\\", "")
                # if "@" in smi:
                #     smi = smi.replace("@", "")

                mol = Chem.MolFromSmiles(smi)
                mol = FormatConverter.neutralize_molecule(mol)
                Chem.RemoveStereochemistry(mol)
                mol = Chem.RemoveAllHs(mol)
                Chem.Kekulize(mol)
                smi_p = Chem.MolToSmiles(mol, kekuleSmiles=True)
                smi_p = Chem.CanonSmiles(smi_p)

                if "~" in smi_p:
                    smi_p1 = smi_p.replace("~", "")
                    parsed_smiles.append(smi_p1)
                else:
                    parsed_smiles.append(smi_p)
            except Exception:
                errors += 1
                pass

        return parsed_smiles, errors

    @staticmethod
    def smiles_covered_by(
        l_smiles: List[str],
        r_smiles: List[str],
        standardize: bool = True,
        canonicalize_tautomers: bool = True,
    ) -> bool:
        """
        Check if all SMILES in the left list are covered by (contained in) the right list.

        This function performs a subset check to determine if every chemical structure
        represented in l_smiles has a corresponding representation in r_smiles.

        Args:
            l_smiles (List[str]): List of SMILES strings to check for coverage.
            r_smiles (List[str]): List of SMILES strings that should contain all l_smiles.
            standardize (bool, optional): Whether to standardize SMILES before comparison.
                Defaults to True. When True, applies FormatConverter.standardize() to
                normalize representations for accurate comparison.
            canonicalize_tautomers (bool, optional): Whether to canonicalize tautomers
                Defaults to False. When True, applies rdMolStandardize.TautomerEnumerator().Canonicalize(res_mol)
                to the compounds before comparison.
        Returns:
            bool: True if all SMILES in l_smiles are found in r_smiles (i.e., l_smiles
                  is a subset of r_smiles), False otherwise.

        Note:
            - Comparison treats lists as sets, ignoring duplicates and order
            - Failed standardization attempts are silently ignored (original SMILES used)
            - This is a one-directional check: l_smiles ⊆ r_smiles
            - For bidirectional equality, both directions must be checked separately

        Example:
            >>> FormatConverter.smiles_covered_by(["CCO", "CC"], ["CCO", "CC", "CCC"])
            True
            >>> FormatConverter.smiles_covered_by(["CCO", "CCCC"], ["CCO", "CC", "CCC"])
            False
        """

        standardized_l_smiles = []

        if standardize:
            for smi in l_smiles:
                try:
                    smi = FormatConverter.standardize(
                        smi, canonicalize_tautomers=canonicalize_tautomers
                    )
                except Exception:
                    # :shrug:
                    # logger.debug(f'Standardizing SMILES failed for {smi}')
                    pass
                standardized_l_smiles.append(smi)
        else:
            standardized_l_smiles = l_smiles

        standardized_r_smiles = []
        if standardize:
            for smi in r_smiles:
                try:
                    smi = FormatConverter.standardize(smi)
                except Exception:
                    # :shrug:
                    # logger.debug(f'Standardizing SMILES failed for {smi}')
                    pass
                standardized_r_smiles.append(smi)
        else:
            standardized_r_smiles = r_smiles

        return len(set(standardized_l_smiles).difference(set(standardized_r_smiles))) == 0


class Standardizer(ABC):
    """Named standardization step.

    The base implementation only delegates to ``FormatConverter.normalize``;
    subclasses transform the SMILES first and then call this implementation.
    """

    def __init__(self, name):
        self.name = name

    def standardize(self, smiles: str) -> str:
        """Return ``FormatConverter.normalize(smiles)``."""
        normalized = FormatConverter.normalize(smiles)
        return normalized


class RuleStandardizer(Standardizer):
    """Standardizer that rewrites a SMILES by applying a SMIRKS rule."""

    def __init__(self, name, smirks):
        super().__init__(name)
        self.smirks = smirks

    def standardize(self, smiles: str) -> str:
        """Apply ``self.smirks`` to ``smiles`` and return the resulting SMILES.

        ``FormatConverter.apply`` yields ``ProductSet`` objects, not strings,
        so every product set is flattened into one dot-separated SMILES.
        When the rule produces several distinct results a warning is logged
        and the lexicographically smallest one is used, which keeps the
        choice deterministic. When the rule produces nothing, the input is
        passed on unchanged.
        """
        candidates = sorted(
            {".".join(product_set) for product_set in FormatConverter.apply(smiles, self.smirks)}
        )

        if len(candidates) > 1:
            logger.warning(f"{self.smirks} generated more than 1 compound {candidates}")

        if candidates:
            smiles = candidates[0]

        return super().standardize(smiles)


class RegExStandardizer(Standardizer):
    """Standardizer that performs literal substring replacements.

    Despite the name, ``replacements`` maps plain substrings (not regular
    expressions) to their substitutes; they are applied with ``str.replace``.
    """

    def __init__(self, name, replacements: dict):
        super().__init__(name)
        self.replacements = replacements

    def standardize(self, smiles: str) -> str:
        """Apply all replacements repeatedly until the string stops changing.

        Every pass applies all replacements cumulatively, in dictionary
        order. Checking the combined result of a pass (rather than the
        result of the last replacement only) ensures that a match of any
        key triggers the rewrite.
        """
        current = smiles
        while True:
            rewritten = current
            for old, new in self.replacements.items():
                rewritten = rewritten.replace(old, new)

            if rewritten == current:
                break
            current = rewritten

        return super().standardize(current)


# Standardizer collections. Each constant is a list of Standardizer instances;
# the lists are concatenated into MATCH_STANDARDIZER at the bottom, which is
# the sequence FormatConverter.ep_standardize iterates over.

# Removes every "@" character from the SMILES string.
FLATTEN = [RegExStandardizer("Remove Stereo", {"@": ""})]

# Removes every "/" and "\" character from the SMILES string.
UN_CIS_TRANS = [RegExStandardizer("Un-Cis-Trans", {"/": "", "\\": ""})]

# SMIRKS-based rules; each entry is (rule name, "educt>>product" SMIRKS).
BASIC = [
    RuleStandardizer("ammoniumstandardization", "[H][N+:1]([H])([H])[#6:2]>>[H][#7:1]([H])-[#6:2]"),
    RuleStandardizer("cyanate", "[H][#8:1][C:2]#[N:3]>>[#8-:1][C:2]#[N:3]"),
    RuleStandardizer("deprotonatecarboxyls", "[H][#8:1]-[#6:2]=[O:3]>>[#8-:1]-[#6:2]=[O:3]"),
    RuleStandardizer("forNOOH", "[H][#8:1]-[#7+:2](-[*:3])=[O:4]>>[#8-:1]-[#7+:2](-[*:3])=[O:4]"),
    RuleStandardizer(
        "Hydroxylprotonation",
        "[#6;A:1][#6:2](-[#8-:3])=[#6;A:4]>>[#6:1]-[#6:2](-[#8:3][H])=[#6;A:4]",
    ),
    RuleStandardizer(
        "phosphatedeprotonation", "[H][#8:1]-[$([#15]);!$(P([O-])):2]>>[#8-:1]-[#15:2]"
    ),
    RuleStandardizer(
        "PicricAcid",
        "[H][#8:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]>>[#8-:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]",
    ),
    RuleStandardizer(
        "Sulfate1", "[H][#8:1][S:2]([#8:3][H])(=[O:4])=[O:5]>>[#8-:1][S:2]([#8-:3])(=[O:4])=[O:5]"
    ),
    RuleStandardizer(
        "Sulfate2",
        "[#6:1]-[#8:2][S:3]([#8:4][H])(=[O:5])=[O:6]>>[#6:1]-[#8:2][S:3]([#8-:4])(=[O:5])=[O:6]",
    ),
    RuleStandardizer(
        "Sulfate3", "[H][#8:3][S:2]([#6:1])(=[O:4])=[O:5]>>[#6:1][S:2]([#8-:3])(=[O:4])=[O:5]"
    ),
    RuleStandardizer(
        "Transform_c1353forSOOH", "[H][#8:1][S:2]([*:3])=[O:4]>>[#8-:1][S:2]([*:3])=[O:4]"
    ),
]

# BASIC plus one additional phosphate rule.
ENHANCED = BASIC + [
    RuleStandardizer("fullPhosphatedeprotonation", "[H][#8:1]-[#15:2]>>[#8-:1]-[#15:2]")
]

# ENHANCED plus one thiophosphate rule.
EXOTIC = ENHANCED + [
    RuleStandardizer(
        "ThioPhosphate1",
        "[H][S:1]-[#15:2]=[$([#16]),$([#8]):3]>>[S-:1]-[#15:2]=[$([#16]),$([#8]):3]",
    )
]

# Single rule named "CutCoEnzymeAOff".
COA_CUTTER = [
    RuleStandardizer(
        "CutCoEnzymeAOff",
        "CC(C)(COP(O)(=O)OP(O)(=O)OCC1OC(C(O)C1OP(O)(O)=O)n1cnc2c(N)ncnc12)C(O)C(=O)NCCC(=O)NCCS[$(*):1]>>[O-][$(*):1]",
    )
]

# Single rule named "enol2Ketone".
ENOL_KETO = [RuleStandardizer("enol2Ketone", "[H][#8:2]-[#6:3]=[#6:1]>>[#6:1]-[#6:3]=[O:2]")]

# Full sequence used by FormatConverter.ep_standardize; standardizers run in this order.
MATCH_STANDARDIZER = EXOTIC + FLATTEN + UN_CIS_TRANS + COA_CUTTER + ENOL_KETO


class IndigoUtils(object):
    @staticmethod
    def layout(mol_data):
        """Compute a layout for ``mol_data`` and return it as a structure file.

        Input starting with ``$RXN`` or containing ``>>`` is loaded as a
        query reaction and returned as an RXN file; anything else is loaded
        as a query molecule and returned as a molfile. If Indigo raises, the
        input is loaded once more as reaction SMARTS.

        Returns:
            The RXN file / molfile text, or ``None`` when the fallback fails
            as well (the error is logged).
        """
        i = Indigo()
        try:
            if mol_data.startswith("$RXN") or ">>" in mol_data:
                rxn = i.loadQueryReaction(mol_data)
                rxn.layout()
                return rxn.rxnfile()

            mol = i.loadQueryMolecule(mol_data)
            mol.layout()
            return mol.molfile()
        except IndigoException:
            try:
                logger.info("layout() failed, trying loadReactionSMARTS as fallback!")
                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
                rxn.layout()
                # The fallback object is a reaction, so it is serialized the
                # same way as in the primary reaction branch.
                return rxn.rxnfile()
            except IndigoException as e2:
                logger.error(f"layout() failed due to {e2}!")
                return None

    @staticmethod
    def load_reaction_SMARTS(mol):
        """Load ``mol`` as reaction SMARTS using a fresh ``Indigo`` instance."""
        session = Indigo()
        return session.loadReactionSmarts(mol)

    @staticmethod
    def aromatize(mol_data, is_query):
        """Aromatize ``mol_data`` and return it as a structure file.

        Input starting with ``$RXN`` is treated as a reaction and returned as
        an RXN file; anything else is treated as a molecule and returned as
        a molfile. ``is_query`` selects the query loaders of Indigo. If
        Indigo raises, the input is loaded once more as reaction SMARTS.

        Returns:
            The RXN file / molfile text, or ``None`` when the fallback fails
            as well (the error is logged).
        """
        i = Indigo()
        try:
            if mol_data.startswith("$RXN"):
                rxn = i.loadQueryReaction(mol_data) if is_query else i.loadReaction(mol_data)
                rxn.aromatize()
                return rxn.rxnfile()

            mol = i.loadQueryMolecule(mol_data) if is_query else i.loadMolecule(mol_data)
            mol.aromatize()
            return mol.molfile()
        except IndigoException:
            try:
                logger.info("Aromatizing failed, trying loadReactionSMARTS as fallback!")
                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
                rxn.aromatize()
                # The fallback object is a reaction, so it is serialized the
                # same way as in the primary reaction branch.
                return rxn.rxnfile()
            except IndigoException as e2:
                logger.error(f"Aromatizing failed due to {e2}!")
                return None

    @staticmethod
    def dearomatize(mol_data, is_query):
        """De-aromatize ``mol_data`` and return it as a structure file.

        Input starting with ``$RXN`` is treated as a reaction and returned as
        an RXN file; anything else is treated as a molecule and returned as
        a molfile. ``is_query`` selects the query loaders of Indigo. If
        Indigo raises, the input is loaded once more as reaction SMARTS.

        Returns:
            The RXN file / molfile text, or ``None`` when the fallback fails
            as well (the error is logged).
        """
        i = Indigo()
        try:
            if mol_data.startswith("$RXN"):
                rxn = i.loadQueryReaction(mol_data) if is_query else i.loadReaction(mol_data)
                rxn.dearomatize()
                return rxn.rxnfile()

            mol = i.loadQueryMolecule(mol_data) if is_query else i.loadMolecule(mol_data)
            mol.dearomatize()
            return mol.molfile()
        except IndigoException:
            try:
                logger.info("De-Aromatizing failed, trying loadReactionSMARTS as fallback!")
                rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
                rxn.dearomatize()
                # The fallback object is a reaction, so it is serialized the
                # same way as in the primary reaction branch.
                return rxn.rxnfile()
            except IndigoException as e2:
                logger.error(f"De-Aromatizing failed due to {e2}!")
                return None

    @staticmethod
    def sanitize_functional_group(functional_group: str):
        counter = 0
        while True:
            counter += 1

            copy = functional_group

            # special environment handling (amines, hydroxy, esters, ethers)
            # the higher substituted should not contain H env.
            if functional_group == "[C]=O":
                functional_group = "[H][C](=O)[CX4,c]"

            # aldamines
            if functional_group == "O=[C]N(R)R":
                functional_group = "O=[C]([H])N(R)R"

            # ether, ester, ketones, amines, thioether
            vals = [
                "ROR",
                "O=C(R)OR",
                "[H]N(R)R",
                "O=C(R)R",
                "RN(R)R",
                "RSR",
            ]
            if functional_group in vals:
                functional_group = functional_group.replace("R", "[CX4,c]")

            # esters, ketones, amides
            functional_group = functional_group.replace("O=C(R)", "O=C([CX4,c])")

            if functional_group == "RS*R" or functional_group == "RO*R":
                # neighboring atoms can be any aromatic atom, but they should not be highlighted
                functional_group = functional_group.replace("R", "")

            # aromatic compounds:  aromatic atoms are denoted with *
            # single aromatic heteroatoms, not substituted

            # unsubstituted aromatic nitrogen
            if functional_group == "RN*(R)R":
                functional_group = "[nH1,nX2](a)a"  # pyrrole (with H) or pyridine (no other connections); currently overlaps with neighboring aromatic atoms

            # substituted aromatic nitrogen
            functional_group = functional_group.replace(
                "N*(R)R", "n(a)a"
            )  # substituent will be before N*; currently overlaps with neighboring aromatic atoms
            # pyridinium
            if functional_group == "RN*(R)(R)(R)R":
                functional_group = (
                    "[CX4,c]n(a)a"  # currently overlaps with neighboring aromatic atoms
                )

            # N-oxide
            if functional_group == "[H]ON*(R)(R)(R)R":
                functional_group = (
                    "[O-][n+](a)a"  # currently overlaps with neighboring aromatic atoms
                )

            # other aromatic hetero atoms
            functional_group = functional_group.replace("C*", "c")
            functional_group = functional_group.replace("N*", "n")
            functional_group = functional_group.replace("S*", "s")
            functional_group = functional_group.replace("O*", "o")
            functional_group = functional_group.replace("Se*", "se")
            functional_group = functional_group.replace("P*", "p")

            # other replacement, to accomodate for the standardization rules in enviPath
            # This is not the perfect way to do it; there should be a way to replace substructure SMARTS in SMARTS?
            # nitro groups are broken, due to charge handling. this SMARTS matches both forms (formal charges and hypervalent); Ertl-CDK still treats both forms separately...
            functional_group = functional_group.replace(
                "[H]O[N](=O)R", "[CX4,c][NX3](~[OX1])~[OX1]"
            )
            functional_group = functional_group.replace("O=N(=O)R", "[CX4,c][NX3](~[OX1])~[OX1]")
            # carboxylic acid: this SMARTS matches both neutral and anionic form; includes COOH in larger functional_groups
            functional_group = functional_group.replace("[H]OC(=O)", "[OD1]C(=O)")
            # azo
            functional_group = functional_group.replace("N#[N]N(R)R", "[NX1]~[NX2]~N[CX4,c]")
            functional_group = functional_group.replace("RN=[N]=NR", "[NX1]~[NX2]~N[CX4,c]")
            # TODO: there might be more problematic groups, which we have yet to find.

            # other environment atoms (can be aromatic or aliphatic Csp3, or H)
            functional_group = functional_group.replace("R", "[H,CX4,c]")

            if copy == functional_group:
                break

        return functional_group

    @staticmethod
    def _colorize(
        indigo: Indigo, molecule: IndigoObject, functional_groups: Dict[str, int], is_reaction: bool
    ):
        """Attach ``color`` data S-groups to atoms matched by functional groups.

        Args:
            indigo: Indigo session; its rendering and aromaticity options are set here.
            molecule: Structure whose atoms get colored.
            functional_groups: Maps a functional-group pattern to an integer
                value; per atom the maximum value over all matching groups
                is kept.
            is_reaction: If True every matched atom gets "128, 0, 128".
                Otherwise atoms with a value <= 5 get "200, 0, 0" and all
                others "0, 112, 0".
        """
        indigo.setOption("render-atom-color-property", "color")
        indigo.setOption("aromaticity-model", "generic")

        matcher = indigo.substructureMatcher(molecule)

        def matched_atom_indices(query):
            # Yield the index of every target atom hit by any match of ``query``.
            for match in matcher.iterateMatches(query):
                if match is None:
                    continue
                for query_atom in query.iterateAtoms():
                    target_atom = match.mapAtom(query_atom)
                    if target_atom is not None:
                        yield target_atom.index()

        # Environment atoms stay uncolored (i.e. black).
        environment = set()
        for env_smarts in ["[CX4]", "c"]:
            environment.update(matched_atom_indices(indigo.loadSmarts(env_smarts)))

        best_value = defaultdict(int)
        for pattern, value in functional_groups.items():
            try:
                query = indigo.loadSmarts(IndigoUtils.sanitize_functional_group(pattern))
                for atom_index in matched_atom_indices(query):
                    if atom_index in environment:
                        continue
                    best_value[atom_index] = max(value, best_value[atom_index])
            except IndigoException as e:
                logger.debug(f"Colorizing failed due to {e}")

        for atom_index, value in best_value.items():
            if is_reaction:
                color = "128, 0, 128"
            elif value <= 5:
                color = "200, 0, 0"
            else:
                color = "0, 112, 0"

            molecule.addDataSGroup([atom_index], [], "color", color)

    @staticmethod
    def mol_to_svg(
        mol_data: str, width: int = 0, height: int = 0, functional_groups: Dict[str, int] = None
    ):
        """Render ``mol_data`` as an SVG string with Indigo.

        Input containing ``~`` is loaded as SMARTS, anything else as a
        molecule. When ``functional_groups`` is non-empty, Indigo's default
        coloring is switched off and matched atoms are colored through
        ``_colorize`` instead.
        """
        groups = {} if functional_groups is None else functional_groups
        highlight = len(groups.keys()) > 0

        i = Indigo()
        renderer = IndigoRenderer(i)

        i.setOption("render-output-format", "svg")
        i.setOption("render-coloring", not highlight)
        i.setOption("render-image-size", width, height)
        i.setOption("render-bond-line-width", 2.0)

        load = i.loadSmarts if "~" in mol_data else i.loadMolecule
        mol = load(mol_data)

        if highlight:
            IndigoUtils._colorize(i, mol, groups, False)

        svg_bytes = renderer.renderToBuffer(mol)
        return svg_bytes.decode("UTF-8")

    @staticmethod
    def smirks_to_svg(
        smirks: str,
        is_query_smirks,
        width: int = 0,
        height: int = 0,
        educt_functional_groups: Dict[str, int] = None,
        product_functional_groups: Dict[str, int] = None,
        debug: bool = False,
    ):
        """Render a reaction as an SVG string with Indigo.

        Args:
            smirks: Reaction to render.
            is_query_smirks: If true the input is loaded as reaction SMARTS;
                otherwise as a plain reaction, in which case reactants and
                products are additionally colored by functional groups.
            width: Image width passed to ``render-image-size``.
            height: Image height passed to ``render-image-size``.
            educt_functional_groups: Groups used to color the reactants.
            product_functional_groups: Groups used to color the products.
            debug: If True, atom ids (counted from one) are drawn.
        """
        educt_groups = {} if educt_functional_groups is None else educt_functional_groups
        product_groups = {} if product_functional_groups is None else product_functional_groups

        i = Indigo()
        renderer = IndigoRenderer(i)

        options = []
        if debug:
            options.extend(
                [
                    ("render-atom-ids-visible", True),
                    ("render-bond-ids-visible", False),
                    ("render-atom-bond-ids-from-one", True),
                ]
            )
        options.append(("render-output-format", "svg"))
        options.append(("render-coloring", True))
        for option_name, option_value in options:
            i.setOption(option_name, option_value)
        i.setOption("render-image-size", width, height)

        if is_query_smirks:
            obj = i.loadReactionSmarts(smirks)
        else:
            obj = i.loadReaction(smirks)

            # Functional-group coloring is only done for non-query reactions.
            if len(educt_groups.keys()) > 0:
                for reactant in obj.iterateReactants():
                    IndigoUtils._colorize(i, reactant, educt_groups, True)

            if len(product_groups.keys()) > 0:
                for product in obj.iterateProducts():
                    IndigoUtils._colorize(i, product, product_groups, True)

        svg_bytes = renderer.renderToBuffer(obj)
        return svg_bytes.decode("UTF-8")


# Manual smoke test: aromatize a sample molfile and print the result.
if __name__ == "__main__":
    # "struct" holds a Ketcher-generated V2000 molfile with six carbon atoms
    # joined in a ring by alternating double and single bonds.
    # "options" is not used by the call below.
    data = {
        "struct": "\n  Ketcher  2172510 12D 1   1.00000     0.00000     0\n\n  6  6  0     0  0            999 V2000\n    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.5000   -0.8660    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n   -1.0000   -1.7321    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.0000   -1.7321    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    0.5000   -0.8660    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n  1  2  2  0  0  0  0\n  2  3  1  0  0  0  0\n  3  4  2  0  0  0  0\n  4  5  1  0  0  0  0\n  5  6  2  0  0  0  0\n  6  1  1  0  0  0  0\nM  END\n",
        "options": {
            "smart-layout": True,
            "ignore-stereochemistry-errors": True,
            "mass-skip-error-on-pseudoatoms": False,
            "gross-formula-add-rsites": True,
        },
    }

    # Loaded as a plain (non-query) molecule.
    print(IndigoUtils.aromatize(data["struct"], False))