forked from enviPath/enviPy
883 lines
31 KiB
Python
883 lines
31 KiB
Python
import logging
|
|
import re
|
|
from abc import ABC
|
|
from collections import defaultdict
|
|
from typing import List, Optional, Dict, TYPE_CHECKING
|
|
|
|
from indigo import Indigo, IndigoException, IndigoObject
|
|
from indigo.renderer import IndigoRenderer
|
|
from rdkit import Chem, rdBase
|
|
from rdkit.Chem import MACCSkeys, Descriptors, rdFingerprintGenerator
|
|
from rdkit.Chem import rdChemReactions
|
|
from rdkit.Chem.Draw import rdMolDraw2D
|
|
from rdkit.Chem.MolStandardize import rdMolStandardize
|
|
from rdkit.Chem.rdmolops import GetMolFrags
|
|
from rdkit.Contrib.IFG import ifg
|
|
|
|
if TYPE_CHECKING:
|
|
from epdb.models import Rule
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Disable every RDKit log channel matching the "rdApp.*" spec.
rdBase.DisableLog("rdApp.*")
|
|
|
|
# from rdkit import rdBase
|
|
# rdBase.LogToPythonLogger()
|
|
# pylog = logging.getLogger("rdkit")
|
|
# pylog.propagate = False
|
|
# pylog.disabled = True
|
|
# print(pylog.disabled)
|
|
|
|
|
|
class ProductSet(object):
    """Container for the product SMILES of one rule application.

    Equality and hashing ignore the order of the contained SMILES strings;
    iteration, ``len`` and ``repr`` delegate to the wrapped list as given.
    """

    def __init__(self, product_set: List[str]):
        # The wrapped list is stored as-is (no copy, no sorting).
        self.product_set = product_set

    def _ordered(self) -> List[str]:
        """Return the SMILES in sorted order (basis for ``==`` and ``hash``)."""
        return sorted(self.product_set)

    def __repr__(self):
        return repr(self.product_set)

    def __len__(self):
        return len(self.product_set)

    def __iter__(self):
        return iter(self.product_set)

    def __eq__(self, other):
        # Anything that is not a ProductSet compares unequal.
        if not isinstance(other, ProductSet):
            return False
        return self._ordered() == sorted(other.product_set)

    def __hash__(self):
        # Hash the order-normalised, "-"-joined SMILES so equal sets hash equally.
        joined = "-".join(self._ordered())
        return hash(joined)
|
|
|
|
|
|
class PredictionResult(object):
    """Product sets predicted for one rule, together with their probability.

    ``len`` and iteration operate on ``product_sets``.
    """

    def __init__(
        self,
        product_sets: List["ProductSet"],
        probability: float,
        rule: Optional["Rule"] = None,
    ):
        self.rule = rule
        self.probability = probability
        self.product_sets = product_sets

    def __repr__(self):
        # Rendered as: --<probability, 2 decimals>/<rule>--> <product sets>
        return f"--{self.probability:.2f}/{self.rule}--> {self.product_sets}"

    def __iter__(self):
        return iter(self.product_sets)

    def __len__(self):
        return len(self.product_sets)
|
|
|
|
|
|
class FormatConverter(object):
    """Static RDKit-based helpers for SMILES conversion, descriptors,
    rendering, standardization and SMIRKS rule application.

    Unless stated otherwise, methods that take ``smiles`` parse it with
    ``Chem.MolFromSmiles`` and do not check for a ``None`` result.
    """

    @staticmethod
    def mass(smiles):
        """Return ``Descriptors.MolWt`` of the molecule parsed from ``smiles``."""
        return Descriptors.MolWt(FormatConverter.from_smiles(smiles))

    @staticmethod
    def charge(smiles):
        """Return the formal charge of the molecule parsed from ``smiles``."""
        return Chem.GetFormalCharge(FormatConverter.from_smiles(smiles))

    @staticmethod
    def formula(smiles):
        """Return the molecular formula (``CalcMolFormula``) for ``smiles``."""
        return Chem.rdMolDescriptors.CalcMolFormula(FormatConverter.from_smiles(smiles))

    @staticmethod
    def from_smiles(smiles):
        """Parse ``smiles`` with ``Chem.MolFromSmiles`` and return the result."""
        return Chem.MolFromSmiles(smiles)

    @staticmethod
    def to_smiles(mol, canonical=False):
        """Serialize ``mol`` to SMILES; ``canonical`` is forwarded to RDKit."""
        return Chem.MolToSmiles(mol, canonical=canonical)

    @staticmethod
    def InChIKey(smiles):
        """Return the InChIKey of the molecule parsed from ``smiles``."""
        return Chem.MolToInchiKey(FormatConverter.from_smiles(smiles))

    @staticmethod
    def InChI(smiles):
        """Return the InChI string of the molecule parsed from ``smiles``."""
        return Chem.MolToInchi(FormatConverter.from_smiles(smiles))

    @staticmethod
    def canonicalize(smiles: str):
        """Round-trip ``smiles`` through RDKit and return canonical SMILES."""
        return FormatConverter.to_smiles(FormatConverter.from_smiles(smiles), canonical=True)

    @staticmethod
    def maccs(smiles):
        """Return the MACCS keys fingerprint of ``smiles`` as a list."""
        mol = Chem.MolFromSmiles(smiles)
        bitvec = MACCSkeys.GenMACCSKeys(mol)
        return bitvec.ToList()

    @staticmethod
    def morgan(smiles, radius=3, fpSize=2048):
        """Return the Morgan fingerprint of ``smiles`` as a list.

        ``radius`` and ``fpSize`` are forwarded to
        ``rdFingerprintGenerator.GetMorganGenerator``.
        """
        finger_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=fpSize)
        mol = Chem.MolFromSmiles(smiles)
        fp = finger_gen.GetFingerprint(mol)
        return fp.ToList()

    @staticmethod
    def get_functional_groups(smiles: str) -> List[str]:
        """Return the ``atoms`` field of every functional group found by ``ifg``.

        An ``AttributeError`` during identification is logged at debug level
        and whatever was collected so far (possibly nothing) is returned.
        """
        res = list()

        try:
            m = Chem.MolFromSmiles(smiles)
            fgs = ifg.identify_functional_groups(m)
            for fg in fgs:
                # TODO atoms or type?
                res.append(fg.atoms)
        except AttributeError:
            logger.debug(f"Could not get functional groups for {smiles}")

        return res

    @staticmethod
    def to_svg(smiles, mol_size=(200, 150), kekulize=True):
        """Render ``smiles`` as an SVG string.

        Args:
            smiles: SMILES of the molecule to draw.
            mol_size: ``(width, height)`` passed to ``MolDraw2DSVG``.
            kekulize: Try to kekulize before drawing; if that fails the
                un-kekulized molecule is drawn instead.

        Returns:
            The SVG text with ``svg:`` prefixes and the XML declaration removed.
        """
        mol = FormatConverter.from_smiles(smiles)

        if kekulize:
            # Chem.Kekulize modifies its argument in place and returns None, so
            # work on a copy and only adopt it when kekulization succeeded.
            kekulized = Chem.Mol(mol.ToBinary())
            try:
                Chem.Kekulize(kekulized)
                mol = kekulized
            except Exception:
                mol = Chem.Mol(mol.ToBinary())

        if not mol.GetNumConformers():
            Chem.rdDepictor.Compute2DCoords(mol)

        drawer = rdMolDraw2D.MolDraw2DSVG(*mol_size)
        opts = drawer.drawOptions()

        opts.clearBackground = False
        drawer.DrawMolecule(mol)
        drawer.FinishDrawing()
        svg = drawer.GetDrawingText().replace("svg:", "")
        # Raw string: "\?" is a regex escape, not a Python string escape.
        svg = re.sub(r"<\?xml.*\?>", "", svg)

        return svg

    @staticmethod
    def to_png(smiles, mol_size=(200, 150), kekulize=True):
        """Prepare ``smiles`` for PNG rendering.

        NOTE: rendering itself is not implemented; the molecule is only
        prepared (optional kekulization, 2D coordinates) and ``None`` is
        returned. ``mol_size`` is currently unused.
        """
        mol = FormatConverter.from_smiles(smiles)

        # Always work on a copy so ``mc`` exists regardless of ``kekulize``.
        mc = Chem.Mol(mol.ToBinary())
        if kekulize:
            try:
                Chem.Kekulize(mc)
            except Exception:
                # Discard the possibly half-modified copy.
                mc = Chem.Mol(mol.ToBinary())

        if not mc.GetNumConformers():
            Chem.rdDepictor.Compute2DCoords(mc)

        # TODO: draw ``mc`` and return the PNG data.
        return None

    @staticmethod
    def normalize(smiles):
        """Return ``smiles`` unchanged (placeholder for an external service)."""
        # TODO call to AMBIT Service
        return smiles

    @staticmethod
    def ep_standardize(smiles):
        """Apply every standardizer in ``MATCH_STANDARDIZER`` until a full pass
        over all of them leaves the SMILES unchanged, then return it."""
        change = True
        while change:
            change = False
            for standardizer in MATCH_STANDARDIZER:
                tmp_smiles = standardizer.standardize(smiles)

                if tmp_smiles != smiles:
                    logger.debug(f"change {smiles} to {tmp_smiles}")
                    change = True
                    smiles = tmp_smiles

            if change is False:
                logger.debug("nothing changed")

        return smiles

    @staticmethod
    def standardize(smiles, remove_stereo=False, canonicalize_tautomers=False):
        """Standardize ``smiles`` with RDKit's ``rdMolStandardize`` and return
        kekulized SMILES.

        Steps: cleanup, fragment parent selection, uncharging, then optional
        stereo removal and optional tautomer canonicalization.

        Args:
            smiles: Input SMILES.
            remove_stereo: Strip stereochemistry from the result.
            canonicalize_tautomers: Replace the result by its canonical tautomer.
        """
        # Taken from https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
        # follows the steps in
        # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
        # as described **excellently** (by Greg) in
        # https://www.youtube.com/watch?v=eWTApNX8dJQ
        mol = Chem.MolFromSmiles(smiles)

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize molecule
        uncharger = (
            rdMolStandardize.Uncharger()
        )  # annoying, but necessary as no convenience method exists
        res_mol = uncharger.uncharge(parent_clean_mol)

        # note that no attempt is made at reionization at this step
        # nor at ionization at some pH (rdkit has no pKa calculator)
        # the main aim is to represent all molecules from different sources
        # in a (single) standard way, for use in ML, catalogue, etc.

        if remove_stereo:
            Chem.RemoveStereochemistry(res_mol)

        if canonicalize_tautomers:
            te = rdMolStandardize.TautomerEnumerator()  # idem
            res_mol = te.Canonicalize(res_mol)

        return Chem.MolToSmiles(res_mol, kekuleSmiles=True)

    @staticmethod
    def neutralize_smiles(smiles):
        """Parse ``smiles``, neutralize it and return the resulting SMILES."""
        mol = Chem.MolFromSmiles(smiles)
        mol = FormatConverter.neutralize_molecule(mol)
        return Chem.MolToSmiles(mol)

    @staticmethod
    def neutralize_molecule(mol):
        """Set matched charged atoms of ``mol`` to charge 0, adjusting their
        explicit hydrogen count by the removed charge.

        ``mol`` is modified in place and also returned.
        """
        pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
        at_matches = mol.GetSubstructMatches(pattern)
        at_matches_list = [y[0] for y in at_matches]
        for at_idx in at_matches_list:
            atom = mol.GetAtomWithIdx(at_idx)
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
        return mol

    @staticmethod
    def is_valid_smirks(smirks: str) -> bool:
        """Return True if ``ReactionFromSmarts`` accepts ``smirks`` without raising."""
        try:
            rdChemReactions.ReactionFromSmarts(smirks)
            return True
        except Exception:
            return False

    @staticmethod
    def is_valid_smarts(smarts: str) -> bool:
        """
        Checks whether a given string is a valid SMARTS pattern.

        Parameters
        ----------
        smarts : str
            The SMARTS string to validate.

        Returns
        -------
        bool
            True if the SMARTS string is valid, False otherwise.
        """
        if not isinstance(smarts, str) or not smarts.strip():
            return False

        try:
            mol = Chem.MolFromSmarts(smarts)
            return mol is not None
        except Exception:
            return False

    @staticmethod
    def apply(
        smiles: str,
        smirks: str,
        preprocess_smiles: bool = True,
        bracketize: bool = True,
        standardize: bool = True,
        kekulize: bool = True,
        remove_stereo: bool = True,
    ) -> List["ProductSet"]:
        """Apply the reaction ``smirks`` to ``smiles`` and collect the products.

        Args:
            smiles: Substrate SMILES.
            smirks: Reaction SMARTS/SMIRKS of the form ``reactants>>products``.
            preprocess_smiles: Sanitize the substrate and add explicit
                hydrogens before running the reaction.
            bracketize: Wrap the product side in parentheses so that all
                products of a multi-product rule are captured together.
            standardize: Currently unused; products are always standardized.
            kekulize: Currently unused.
            remove_stereo: Forwarded to ``FormatConverter.standardize`` for
                every product fragment.

        Returns:
            A ``set`` of distinct ``ProductSet`` objects (note: a set, despite
            the ``List`` annotation). Empty when the rule does not match or
            when applying it fails; failures are logged, not raised.
        """
        logger.debug(f"Applying {smirks} on {smiles}")

        # Distinct ProductSet objects generated by the rule
        pss = set()

        # If explicitly wanted add brackets around products to capture all
        if bracketize:
            parts = smirks.split(">>")
            if len(parts) < 2:
                # Without a ">>" separator there is no product side to wrap;
                # treat it like any other failed application.
                logger.error(
                    f"Applying {smirks} on {smiles} failed:\nmissing '>>' separator in SMIRKS"
                )
                return pss
            smirks = parts[0] + ">>(" + parts[1] + ")"

        try:
            rxn = rdChemReactions.ReactionFromSmarts(smirks)
            mol = Chem.MolFromSmiles(smiles)

            if preprocess_smiles:
                # SanitizeMol works in place
                Chem.SanitizeMol(mol)
                mol = Chem.AddHs(mol)

            # apply!
            sites = rxn.RunReactants((mol,))
            logger.debug(f"{len(sites)} products sets generated")

            for product_set in sites:
                prods = []
                for product in product_set:
                    try:
                        Chem.SanitizeMol(product)
                        # A product may consist of several disconnected
                        # fragments; each one becomes its own SMILES.
                        for fragment in GetMolFrags(product, asMols=True):
                            prods.append(
                                FormatConverter.standardize(
                                    Chem.MolToSmiles(fragment), remove_stereo=remove_stereo
                                )
                            )
                    except ValueError as e:
                        logger.error(f"Sanitizing and converting failed:\n{e}")
                        continue

                if prods:
                    pss.add(ProductSet(prods))

        except Exception as e:
            logger.error(f"Applying {smirks} on {smiles} failed:\n{e}")

        return pss

    @staticmethod
    def MACCS(smiles):
        """Return the MACCS keys bit vector (not converted to a list) for ``smiles``."""
        return MACCSkeys.GenMACCSKeys(FormatConverter.from_smiles(smiles))

    @staticmethod
    def sanitize_smiles(smiles_list: List):
        """Neutralize, de-stereo, strip hydrogens from and kekulize each SMILES.

        Returns:
            ``(parsed_smiles, errors)``: the canonical kekulized SMILES of all
            entries that could be processed (with any ``~`` removed), and the
            number of entries that raised and were skipped.
        """
        parsed_smiles = []
        errors = 0
        for smi in smiles_list:
            try:
                mol = Chem.MolFromSmiles(smi)
                mol = FormatConverter.neutralize_molecule(mol)
                Chem.RemoveStereochemistry(mol)
                mol = Chem.RemoveAllHs(mol)
                Chem.Kekulize(mol)
                smi_p = Chem.MolToSmiles(mol, kekuleSmiles=True)
                smi_p = Chem.CanonSmiles(smi_p)

                # str.replace is a no-op when "~" is absent
                parsed_smiles.append(smi_p.replace("~", ""))
            except Exception:
                # Best effort: count the failure and move on to the next entry.
                errors += 1

        return parsed_smiles, errors

    @staticmethod
    def _standardize_or_keep(smiles_list, canonicalize_tautomers):
        """Standardize each SMILES, keeping the original string on failure."""
        result = []
        for smi in smiles_list:
            try:
                smi = FormatConverter.standardize(
                    smi, canonicalize_tautomers=canonicalize_tautomers
                )
            except Exception:
                # Deliberately best effort: compare the raw SMILES instead.
                pass
            result.append(smi)
        return result

    @staticmethod
    def smiles_covered_by(
        l_smiles: List[str],
        r_smiles: List[str],
        standardize: bool = True,
        canonicalize_tautomers: bool = True,
    ) -> bool:
        """
        Check if all SMILES in the left list are covered by (contained in) the right list.

        This function performs a subset check to determine if every chemical structure
        represented in l_smiles has a corresponding representation in r_smiles.

        Args:
            l_smiles (List[str]): List of SMILES strings to check for coverage.
            r_smiles (List[str]): List of SMILES strings that should contain all l_smiles.
            standardize (bool, optional): Whether to standardize SMILES before comparison.
                Defaults to True. When True, applies FormatConverter.standardize() to
                both lists to normalize representations for accurate comparison.
            canonicalize_tautomers (bool, optional): Whether to canonicalize tautomers.
                Defaults to True. Only has an effect when ``standardize`` is True; it is
                then applied to both lists so both sides are compared in the same form.

        Returns:
            bool: True if all SMILES in l_smiles are found in r_smiles (i.e., l_smiles
                  is a subset of r_smiles), False otherwise.

        Note:
            - Comparison treats lists as sets, ignoring duplicates and order
            - Failed standardization attempts are silently ignored (original SMILES used)
            - This is a one-directional check: l_smiles ⊆ r_smiles
            - For bidirectional equality, both directions must be checked separately

        Example:
            >>> FormatConverter.smiles_covered_by(["CCO", "CC"], ["CCO", "CC", "CCC"])
            True
            >>> FormatConverter.smiles_covered_by(["CCO", "CCCC"], ["CCO", "CC", "CCC"])
            False
        """
        if standardize:
            standardized_l_smiles = FormatConverter._standardize_or_keep(
                l_smiles, canonicalize_tautomers
            )
            standardized_r_smiles = FormatConverter._standardize_or_keep(
                r_smiles, canonicalize_tautomers
            )
        else:
            standardized_l_smiles = l_smiles
            standardized_r_smiles = r_smiles

        return set(standardized_l_smiles).issubset(standardized_r_smiles)
|
|
|
|
|
|
class Standardizer(ABC):
    """Base class for named SMILES standardizers.

    The base implementation only runs ``FormatConverter.normalize``;
    subclasses transform the SMILES first and then delegate here.
    """

    def __init__(self, name):
        # Identifier of this standardizer.
        self.name = name

    def standardize(self, smiles: str) -> str:
        """Return ``smiles`` after passing it through ``FormatConverter.normalize``."""
        normalized = FormatConverter.normalize(smiles)
        return normalized
|
|
|
|
|
|
class RuleStandardizer(Standardizer):
    """Standardizer that rewrites a SMILES by applying a SMIRKS rule."""

    def __init__(self, name, smirks):
        super().__init__(name)
        # Reaction SMARTS/SMIRKS handed to FormatConverter.apply.
        self.smirks = smirks

    def standardize(self, smiles: str) -> str:
        """Apply ``self.smirks`` to ``smiles`` and return the resulting SMILES.

        ``FormatConverter.apply`` yields ``ProductSet`` objects, not strings.
        The chosen product set is converted to one SMILES string by joining
        its members with ``"."``. When the rule produces nothing, ``smiles``
        is passed on unchanged.
        """
        # Sort for a deterministic choice: set iteration order is arbitrary.
        standardized_smiles = sorted(
            set(FormatConverter.apply(smiles, self.smirks)), key=lambda ps: sorted(ps)
        )

        if len(standardized_smiles) > 1:
            logger.warning(f"{self.smirks} generated more than 1 compound {standardized_smiles}")
            standardized_smiles = standardized_smiles[:1]

        if standardized_smiles:
            # Convert the ProductSet into a plain SMILES string so callers
            # (and the next standardizer) keep working with ``str``.
            smiles = ".".join(standardized_smiles[0])

        return super().standardize(smiles)
|
|
|
|
|
|
class RegExStandardizer(Standardizer):
    """Standardizer that performs literal substring replacements on a SMILES.

    Despite the name, the keys of ``replacements`` are used with
    ``str.replace`` (plain substrings), not as regular expressions.
    """

    def __init__(self, name, replacements: dict):
        super().__init__(name)
        # Mapping of substring -> replacement text.
        self.replacements = replacements

    def standardize(self, smiles: str) -> str:
        """Apply all replacements repeatedly until the SMILES stops changing.

        Every key is applied on each pass, so a SMILES that contains only
        some of the keys is still rewritten.
        """
        smi = smiles

        while True:
            replaced = smi
            for old, new in self.replacements.items():
                replaced = replaced.replace(old, new)

            # Fixed point reached: another pass would not change anything.
            if replaced == smi:
                break
            smi = replaced

        return super().standardize(smi)
|
|
|
|
|
|
# Standardizer rule tables. Each table is a list of Standardizer instances;
# the larger tables are built by list concatenation from the smaller ones.

# Removes every "@" character from a SMILES string.
FLATTEN = [RegExStandardizer("Remove Stereo", {"@": ""})]

# Removes every "/" and "\" character from a SMILES string.
UN_CIS_TRANS = [RegExStandardizer("Un-Cis-Trans", {"/": "", "\\": ""})]

# SMIRKS-based rules; each entry is (name, "reactant pattern>>product pattern").
BASIC = [
    RuleStandardizer("ammoniumstandardization", "[H][N+:1]([H])([H])[#6:2]>>[H][#7:1]([H])-[#6:2]"),
    RuleStandardizer("cyanate", "[H][#8:1][C:2]#[N:3]>>[#8-:1][C:2]#[N:3]"),
    RuleStandardizer("deprotonatecarboxyls", "[H][#8:1]-[#6:2]=[O:3]>>[#8-:1]-[#6:2]=[O:3]"),
    RuleStandardizer("forNOOH", "[H][#8:1]-[#7+:2](-[*:3])=[O:4]>>[#8-:1]-[#7+:2](-[*:3])=[O:4]"),
    RuleStandardizer(
        "Hydroxylprotonation",
        "[#6;A:1][#6:2](-[#8-:3])=[#6;A:4]>>[#6:1]-[#6:2](-[#8:3][H])=[#6;A:4]",
    ),
    RuleStandardizer(
        "phosphatedeprotonation", "[H][#8:1]-[$([#15]);!$(P([O-])):2]>>[#8-:1]-[#15:2]"
    ),
    RuleStandardizer(
        "PicricAcid",
        "[H][#8:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]>>[#8-:1]-[c:2]1[c:3][c:4][c:5]([c:6][c:7]1-[#7+:8](-[#8-:9])=[O:10])-[#7+:11](-[#8-:12])=[O:13]",
    ),
    RuleStandardizer(
        "Sulfate1", "[H][#8:1][S:2]([#8:3][H])(=[O:4])=[O:5]>>[#8-:1][S:2]([#8-:3])(=[O:4])=[O:5]"
    ),
    RuleStandardizer(
        "Sulfate2",
        "[#6:1]-[#8:2][S:3]([#8:4][H])(=[O:5])=[O:6]>>[#6:1]-[#8:2][S:3]([#8-:4])(=[O:5])=[O:6]",
    ),
    RuleStandardizer(
        "Sulfate3", "[H][#8:3][S:2]([#6:1])(=[O:4])=[O:5]>>[#6:1][S:2]([#8-:3])(=[O:4])=[O:5]"
    ),
    RuleStandardizer(
        "Transform_c1353forSOOH", "[H][#8:1][S:2]([*:3])=[O:4]>>[#8-:1][S:2]([*:3])=[O:4]"
    ),
]

# BASIC plus a phosphate rule without the "no existing P-[O-]" restriction
# that "phosphatedeprotonation" carries.
ENHANCED = BASIC + [
    RuleStandardizer("fullPhosphatedeprotonation", "[H][#8:1]-[#15:2]>>[#8-:1]-[#15:2]")
]

# ENHANCED plus a rule for S-H attached to phosphorus.
EXOTIC = ENHANCED + [
    RuleStandardizer(
        "ThioPhosphate1",
        "[H][S:1]-[#15:2]=[$([#16]),$([#8]):3]>>[S-:1]-[#15:2]=[$([#16]),$([#8]):3]",
    )
]

# Replaces the large S-linked substructure on the left-hand side by [O-]
# on the atom mapped as :1 (presumably coenzyme A, going by the rule name).
COA_CUTTER = [
    RuleStandardizer(
        "CutCoEnzymeAOff",
        "CC(C)(COP(O)(=O)OP(O)(=O)OCC1OC(C(O)C1OP(O)(O)=O)n1cnc2c(N)ncnc12)C(O)C(=O)NCCC(=O)NCCS[$(*):1]>>[O-][$(*):1]",
    )
]

# Rewrites H-O-C=C into C-C=O.
ENOL_KETO = [RuleStandardizer("enol2Ketone", "[H][#8:2]-[#6:3]=[#6:1]>>[#6:1]-[#6:3]=[O:2]")]

# Ordered list of all standardizers iterated by FormatConverter.ep_standardize.
MATCH_STANDARDIZER = EXOTIC + FLATTEN + UN_CIS_TRANS + COA_CUTTER + ENOL_KETO
|
|
|
|
|
|
class IndigoUtils(object):
    """Static helpers around the Indigo toolkit: layout, (de)aromatization,
    functional-group SMARTS clean-up and SVG rendering."""

    @staticmethod
    def _reaction_smarts_fallback(mol_data, operation, label):
        """Retry ``operation`` after loading ``mol_data`` as reaction SMARTS.

        Args:
            mol_data: The input that could not be handled by the regular loaders.
            operation: Name of the no-argument IndigoObject method to call
                (``"layout"``, ``"aromatize"`` or ``"dearomatize"``).
            label: Prefix used in the log messages.

        Returns:
            The Rxnfile text of the processed reaction, or ``None`` if the
            fallback failed as well (the error is logged).
        """
        try:
            logger.info(f"{label} failed, trying loadReactionSMARTS as fallback!")
            rxn = IndigoUtils.load_reaction_SMARTS(mol_data)
            getattr(rxn, operation)()
            # The loaded object is a reaction, so it has to be written as an
            # Rxnfile; molfile() is only valid for molecule objects.
            return rxn.rxnfile()
        except IndigoException as e2:
            logger.error(f"{label} failed due to {e2}!")
            return None

    @staticmethod
    def layout(mol_data):
        """Compute a layout for ``mol_data`` and return it as Rxnfile/Molfile text.

        Input starting with ``$RXN`` or containing ``>>`` is loaded as a query
        reaction, everything else as a query molecule. On an
        ``IndigoException`` the input is retried as reaction SMARTS.
        Returns ``None`` if that fails too.
        """
        i = Indigo()
        try:
            if mol_data.startswith("$RXN") or ">>" in mol_data:
                rxn = i.loadQueryReaction(mol_data)
                rxn.layout()
                return rxn.rxnfile()

            mol = i.loadQueryMolecule(mol_data)
            mol.layout()
            return mol.molfile()
        except IndigoException:
            return IndigoUtils._reaction_smarts_fallback(mol_data, "layout", "layout()")

    @staticmethod
    def load_reaction_SMARTS(mol):
        """Load ``mol`` as reaction SMARTS using a fresh ``Indigo`` instance."""
        return Indigo().loadReactionSmarts(mol)

    @staticmethod
    def aromatize(mol_data, is_query):
        """Aromatize ``mol_data`` and return it as Rxnfile/Molfile text.

        Input starting with ``$RXN`` is loaded as a reaction, everything else
        as a molecule; ``is_query`` selects the query variants of the loaders.
        On an ``IndigoException`` the input is retried as reaction SMARTS.
        Returns ``None`` if that fails too.
        """
        i = Indigo()
        try:
            if mol_data.startswith("$RXN"):
                if is_query:
                    rxn = i.loadQueryReaction(mol_data)
                else:
                    rxn = i.loadReaction(mol_data)

                rxn.aromatize()
                return rxn.rxnfile()

            if is_query:
                mol = i.loadQueryMolecule(mol_data)
            else:
                mol = i.loadMolecule(mol_data)

            mol.aromatize()
            return mol.molfile()
        except IndigoException:
            return IndigoUtils._reaction_smarts_fallback(mol_data, "aromatize", "Aromatizing")

    @staticmethod
    def dearomatize(mol_data, is_query):
        """De-aromatize ``mol_data`` and return it as Rxnfile/Molfile text.

        Loader selection and fallback behaviour mirror ``aromatize``.
        Returns ``None`` if the fallback fails too.
        """
        i = Indigo()
        try:
            if mol_data.startswith("$RXN"):
                if is_query:
                    rxn = i.loadQueryReaction(mol_data)
                else:
                    rxn = i.loadReaction(mol_data)

                rxn.dearomatize()
                return rxn.rxnfile()

            if is_query:
                mol = i.loadQueryMolecule(mol_data)
            else:
                mol = i.loadMolecule(mol_data)

            mol.dearomatize()
            return mol.molfile()
        except IndigoException:
            return IndigoUtils._reaction_smarts_fallback(
                mol_data, "dearomatize", "De-Aromatizing"
            )

    @staticmethod
    def sanitize_functional_group(functional_group: str):
        """Rewrite a functional-group pattern into a SMARTS string.

        In the input notation ``R`` stands for an environment atom and ``*``
        marks an aromatic atom. The textual substitutions below are applied
        in a fixed order and the whole pass is repeated until the string no
        longer changes. The order of the substitutions matters.
        """
        while True:
            copy = functional_group

            # special environment handling (amines, hydroxy, esters, ethers)
            # the higher substituted should not contain H env.
            if functional_group == "[C]=O":
                functional_group = "[H][C](=O)[CX4,c]"

            # aldamines
            if functional_group == "O=[C]N(R)R":
                functional_group = "O=[C]([H])N(R)R"

            # ether, ester, ketones, amines, thioether
            vals = [
                "ROR",
                "O=C(R)OR",
                "[H]N(R)R",
                "O=C(R)R",
                "RN(R)R",
                "RSR",
            ]
            if functional_group in vals:
                functional_group = functional_group.replace("R", "[CX4,c]")

            # esters, ketones, amides
            functional_group = functional_group.replace("O=C(R)", "O=C([CX4,c])")

            if functional_group == "RS*R" or functional_group == "RO*R":
                # neighboring atoms can be any aromatic atom, but they should not be highlighted
                functional_group = functional_group.replace("R", "")

            # aromatic compounds: aromatic atoms are denoted with *
            # single aromatic heteroatoms, not substituted

            # unsubstituted aromatic nitrogen
            if functional_group == "RN*(R)R":
                functional_group = "[nH1,nX2](a)a"  # pyrrole (with H) or pyridine (no other connections); currently overlaps with neighboring aromatic atoms

            # substituted aromatic nitrogen
            functional_group = functional_group.replace(
                "N*(R)R", "n(a)a"
            )  # substituent will be before N*; currently overlaps with neighboring aromatic atoms
            # pyridinium
            if functional_group == "RN*(R)(R)(R)R":
                functional_group = (
                    "[CX4,c]n(a)a"  # currently overlaps with neighboring aromatic atoms
                )

            # N-oxide
            if functional_group == "[H]ON*(R)(R)(R)R":
                functional_group = (
                    "[O-][n+](a)a"  # currently overlaps with neighboring aromatic atoms
                )

            # other aromatic hetero atoms
            functional_group = functional_group.replace("C*", "c")
            functional_group = functional_group.replace("N*", "n")
            functional_group = functional_group.replace("S*", "s")
            functional_group = functional_group.replace("O*", "o")
            functional_group = functional_group.replace("Se*", "se")
            functional_group = functional_group.replace("P*", "p")

            # other replacement, to accommodate for the standardization rules in enviPath
            # This is not the perfect way to do it; there should be a way to replace substructure SMARTS in SMARTS?
            # nitro groups are broken, due to charge handling. this SMARTS matches both forms (formal charges and hypervalent); Ertl-CDK still treats both forms separately...
            functional_group = functional_group.replace(
                "[H]O[N](=O)R", "[CX4,c][NX3](~[OX1])~[OX1]"
            )
            functional_group = functional_group.replace("O=N(=O)R", "[CX4,c][NX3](~[OX1])~[OX1]")
            # carboxylic acid: this SMARTS matches both neutral and anionic form; includes COOH in larger functional_groups
            functional_group = functional_group.replace("[H]OC(=O)", "[OD1]C(=O)")
            # azo
            functional_group = functional_group.replace("N#[N]N(R)R", "[NX1]~[NX2]~N[CX4,c]")
            functional_group = functional_group.replace("RN=[N]=NR", "[NX1]~[NX2]~N[CX4,c]")
            # TODO: there might be more problematic groups, which we have yet to find.

            # other environment atoms (can be aromatic or aliphatic Csp3, or H)
            functional_group = functional_group.replace("R", "[H,CX4,c]")

            # Stop once a full pass made no change.
            if copy == functional_group:
                break

        return functional_group

    @staticmethod
    def _colorize(
        indigo: "Indigo",
        molecule: "IndigoObject",
        functional_groups: Dict[str, int],
        is_reaction: bool,
    ):
        """Attach ``color`` data s-groups to atoms matched by functional groups.

        Args:
            indigo: Indigo session that owns ``molecule``; its rendering
                options are modified.
            molecule: Structure to annotate (modified in place).
            functional_groups: Mapping of functional-group pattern -> value;
                per atom the maximum value over all matching groups is used.
            is_reaction: Use the single reaction color for every matched atom
                instead of the value-dependent colors.
        """
        indigo.setOption("render-atom-color-property", "color")
        indigo.setOption("aromaticity-model", "generic")

        counts = defaultdict(int)
        environment = set()

        matcher = indigo.substructureMatcher(molecule)

        # Determine environment atoms as they will be colored black
        for env in ["[CX4]", "c"]:
            query = indigo.loadSmarts(env)
            for match in matcher.iterateMatches(query):
                if match is None:
                    continue
                for atom in query.iterateAtoms():
                    mapped_atom = match.mapAtom(atom)
                    if mapped_atom is None:
                        continue
                    environment.add(mapped_atom.index())

        for pattern, value in functional_groups.items():
            try:
                sanitized = IndigoUtils.sanitize_functional_group(pattern)
                query = indigo.loadSmarts(sanitized)

                for match in matcher.iterateMatches(query):
                    if match is None:
                        continue
                    for atom in query.iterateAtoms():
                        mapped_atom = match.mapAtom(atom)
                        if mapped_atom is None:
                            continue
                        atom_index = mapped_atom.index()
                        # Environment atoms keep their default color.
                        if atom_index in environment:
                            continue
                        counts[atom_index] = max(value, counts[atom_index])
            except IndigoException as e:
                # A pattern Indigo cannot handle only skips that pattern.
                logger.debug(f"Colorizing failed due to {e}")

        for atom_index, value in counts.items():
            if is_reaction:
                color = "128, 0, 128"
            elif value <= 5:
                color = "200, 0, 0"
            else:
                color = "0, 112, 0"

            molecule.addDataSGroup([atom_index], [], "color", color)

    @staticmethod
    def mol_to_svg(
        mol_data: str,
        width: int = 0,
        height: int = 0,
        functional_groups: Optional[Dict[str, int]] = None,
    ):
        """Render ``mol_data`` as an SVG string.

        Args:
            mol_data: Structure to render; loaded as SMARTS when it contains
                ``~``, otherwise as a molecule.
            width: Value for Indigo's ``render-image-size`` width.
            height: Value for Indigo's ``render-image-size`` height.
            functional_groups: Optional mapping passed to ``_colorize``; when
                non-empty, Indigo's own ``render-coloring`` is switched off.
        """
        if functional_groups is None:
            functional_groups = {}

        i = Indigo()
        renderer = IndigoRenderer(i)

        i.setOption("render-output-format", "svg")
        i.setOption("render-coloring", not functional_groups)
        i.setOption("render-image-size", width, height)
        i.setOption("render-bond-line-width", 2.0)

        if "~" in mol_data:
            mol = i.loadSmarts(mol_data)
        else:
            mol = i.loadMolecule(mol_data)

        if functional_groups:
            IndigoUtils._colorize(i, mol, functional_groups, False)

        return renderer.renderToBuffer(mol).decode("UTF-8")

    @staticmethod
    def smirks_to_svg(
        smirks: str,
        is_query_smirks,
        width: int = 0,
        height: int = 0,
        educt_functional_groups: Optional[Dict[str, int]] = None,
        product_functional_groups: Optional[Dict[str, int]] = None,
        debug: bool = False,
    ):
        """Render a reaction as an SVG string.

        Args:
            smirks: Reaction to render.
            is_query_smirks: Load ``smirks`` as reaction SMARTS when truthy,
                otherwise as a plain reaction.
            width: Value for Indigo's ``render-image-size`` width.
            height: Value for Indigo's ``render-image-size`` height.
            educt_functional_groups: Optional mapping used to colorize every
                reactant.
            product_functional_groups: Optional mapping used to colorize
                every product.
            debug: Show atom ids (starting from one) in the rendering.
        """
        if educt_functional_groups is None:
            educt_functional_groups = {}

        if product_functional_groups is None:
            product_functional_groups = {}

        i = Indigo()
        renderer = IndigoRenderer(i)

        if debug:
            i.setOption("render-atom-ids-visible", True)
            i.setOption("render-bond-ids-visible", False)
            i.setOption("render-atom-bond-ids-from-one", True)

        i.setOption("render-output-format", "svg")
        i.setOption("render-coloring", True)
        i.setOption("render-image-size", width, height)

        if is_query_smirks:
            obj = i.loadReactionSmarts(smirks)
        else:
            obj = i.loadReaction(smirks)

        if educt_functional_groups:
            for react in obj.iterateReactants():
                IndigoUtils._colorize(i, react, educt_functional_groups, True)

        if product_functional_groups:
            for prod in obj.iterateProducts():
                IndigoUtils._colorize(i, prod, product_functional_groups, True)

        return renderer.renderToBuffer(obj).decode("UTF-8")
|
|
|
|
|
|
# Manual smoke test: aromatize a hard-coded six-carbon ring and print the result.
if __name__ == "__main__":
    data = {
        # Molfile with six carbon atoms joined in a ring by alternating
        # double and single bonds.
        # NOTE(review): the fixed-width Molfile columns look collapsed to single
        # spaces here — confirm Indigo still parses this payload.
        "struct": "\n Ketcher 2172510 12D 1 1.00000 0.00000 0\n\n 6 6 0 0 0 999 V2000\n 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.5000 -0.8660 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.0000 -1.7321 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 -1.7321 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5000 -0.8660 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 2 0 0 0 0\n 2 3 1 0 0 0 0\n 3 4 2 0 0 0 0\n 4 5 1 0 0 0 0\n 5 6 2 0 0 0 0\n 6 1 1 0 0 0 0\nM END\n",
        # Not read by the call below; only "struct" is used.
        "options": {
            "smart-layout": True,
            "ignore-stereochemistry-errors": True,
            "mass-skip-error-on-pseudoatoms": False,
            "gross-formula-add-rsites": True,
        },
    }

    # is_query=False: load as a plain (non-query) molecule.
    print(IndigoUtils.aromatize(data["struct"], False))
|