Merge remote-tracking branch 'origin/develop' into enhancement/dataset

# Conflicts:
#	epdb/models.py
#	tests/test_enviformer.py
#	tests/test_model.py
This commit is contained in:
Liam Brydon
2025-11-07 08:28:03 +13:00
25 changed files with 1024 additions and 280 deletions

View File

@ -192,7 +192,7 @@ class FormatConverter(object):
return smiles
@staticmethod
def standardize(smiles, remove_stereo=False):
def standardize(smiles, remove_stereo=False, canonicalize_tautomers=False):
# Taken from https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
# follows the steps in
# https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
@ -210,19 +210,21 @@ class FormatConverter(object):
uncharger = (
rdMolStandardize.Uncharger()
) # annoying, but necessary as no convenience method exists
uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
res_mol = uncharger.uncharge(parent_clean_mol)
# note that no attempt is made at reionization at this step
# nor at ionization at some pH (rdkit has no pKa caculator)
# the main aim to to represent all molecules from different sources
# in a (single) standard way, for use in ML, catalogue, etc.
# te = rdMolStandardize.TautomerEnumerator() # idem
# taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)
if remove_stereo:
Chem.RemoveStereochemistry(uncharged_parent_clean_mol)
Chem.RemoveStereochemistry(res_mol)
return Chem.MolToSmiles(uncharged_parent_clean_mol, kekuleSmiles=True)
if canonicalize_tautomers:
te = rdMolStandardize.TautomerEnumerator() # idem
res_mol = te.Canonicalize(res_mol)
return Chem.MolToSmiles(res_mol, kekuleSmiles=True)
@staticmethod
def neutralize_smiles(smiles):
@ -370,6 +372,76 @@ class FormatConverter(object):
return parsed_smiles, errors
@staticmethod
def smiles_covered_by(
l_smiles: List[str],
r_smiles: List[str],
standardize: bool = True,
canonicalize_tautomers: bool = True,
) -> bool:
"""
Check if all SMILES in the left list are covered by (contained in) the right list.
This function performs a subset check to determine if every chemical structure
represented in l_smiles has a corresponding representation in r_smiles.
Args:
l_smiles (List[str]): List of SMILES strings to check for coverage.
r_smiles (List[str]): List of SMILES strings that should contain all l_smiles.
standardize (bool, optional): Whether to standardize SMILES before comparison.
Defaults to True. When True, applies FormatConverter.standardize() to
normalize representations for accurate comparison.
canonicalize_tautomers (bool, optional): Whether to canonicalize tautomers
Defaults to False. When True, applies rdMolStandardize.TautomerEnumerator().Canonicalize(res_mol)
to the compounds before comparison.
Returns:
bool: True if all SMILES in l_smiles are found in r_smiles (i.e., l_smiles
is a subset of r_smiles), False otherwise.
Note:
- Comparison treats lists as sets, ignoring duplicates and order
- Failed standardization attempts are silently ignored (original SMILES used)
- This is a one-directional check: l_smiles ⊆ r_smiles
- For bidirectional equality, both directions must be checked separately
Example:
>>> FormatConverter.smiles_covered_by(["CCO", "CC"], ["CCO", "CC", "CCC"])
True
>>> FormatConverter.smiles_covered_by(["CCO", "CCCC"], ["CCO", "CC", "CCC"])
False
"""
standardized_l_smiles = []
if standardize:
for smi in l_smiles:
try:
smi = FormatConverter.standardize(
smi, canonicalize_tautomers=canonicalize_tautomers
)
except Exception:
# :shrug:
# logger.debug(f'Standardizing SMILES failed for {smi}')
pass
standardized_l_smiles.append(smi)
else:
standardized_l_smiles = l_smiles
standardized_r_smiles = []
if standardize:
for smi in r_smiles:
try:
smi = FormatConverter.standardize(smi)
except Exception:
# :shrug:
# logger.debug(f'Standardizing SMILES failed for {smi}')
pass
standardized_r_smiles.append(smi)
else:
standardized_r_smiles = r_smiles
return len(set(standardized_l_smiles).difference(set(standardized_r_smiles))) == 0
class Standardizer(ABC):
def __init__(self, name):

View File

@ -9,36 +9,37 @@ from collections import defaultdict
from datetime import datetime
from enum import Enum
from types import NoneType
from typing import Dict, Any, List
from typing import Any, Dict, List
from django.db import transaction
from envipy_additional_information import Interval, EnviPyModel
from envipy_additional_information import NAME_MAPPING
from envipy_additional_information import NAME_MAPPING, EnviPyModel, Interval
from pydantic import BaseModel, HttpUrl
from epdb.models import (
Package,
Compound,
CompoundStructure,
SimpleRule,
Edge,
EnviFormer,
EPModel,
ExternalDatabase,
ExternalIdentifier,
License,
MLRelativeReasoning,
Node,
Package,
ParallelRule,
Pathway,
PluginModel,
Reaction,
Rule,
RuleBasedRelativeReasoning,
Scenario,
SequentialRule,
SimpleAmbitRule,
SimpleRDKitRule,
ParallelRule,
SequentialRule,
Reaction,
Pathway,
Node,
Edge,
Scenario,
EPModel,
MLRelativeReasoning,
RuleBasedRelativeReasoning,
EnviFormer,
PluginModel,
ExternalIdentifier,
ExternalDatabase,
License,
SimpleRule,
)
from utilities.chem import FormatConverter
logger = logging.getLogger(__name__)
@ -48,7 +49,7 @@ class HTMLGenerator:
@staticmethod
def generate_html(additional_information: "EnviPyModel", prefix="") -> str:
from typing import get_origin, get_args, Union
from typing import Union, get_args, get_origin
if isinstance(additional_information, type):
clz_name = additional_information.__name__
@ -1171,3 +1172,89 @@ class PackageImporter:
url=identifier_data.get("url", ""),
is_primary=identifier_data.get("is_primary", False),
)
class PathwayUtils:
def __init__(self, pathway: "Pathway"):
self.pathway = pathway
@staticmethod
def _get_products(smiles: str, rules: List["Rule"]):
educt_rule_products: Dict[str, Dict[str, List[str]]] = defaultdict(
lambda: defaultdict(list)
)
for r in rules:
product_sets = r.apply(smiles)
for product_set in product_sets:
for product in product_set:
educt_rule_products[smiles][r.url].append(product)
return educt_rule_products
def find_missing_rules(self, rules: List["Rule"]):
print(f"Processing {self.pathway.name}")
# compute products for each node / rule combination in the pathway
educt_rule_products = defaultdict(lambda: defaultdict(list))
for node in self.pathway.nodes:
educt_rule_products.update(**self._get_products(node.default_node_label.smiles, rules))
# loop through edges and determine reactions that can't be constructed by
# any of the rules or a combination of two rules in a chained fashion
res: Dict[str, List["Rule"]] = dict()
for edge in self.pathway.edges:
found = False
reaction = edge.edge_label
educts = [cs for cs in reaction.educts.all()]
products = [cs.smiles for cs in reaction.products.all()]
rule_chain = []
for educt in educts:
educt = educt.smiles
triggered_rules = list(educt_rule_products.get(educt, {}).keys())
for triggered_rule in triggered_rules:
if rule_products := educt_rule_products[educt][triggered_rule]:
# check if this rule covers the reaction
if FormatConverter.smiles_covered_by(
products, rule_products, standardize=True, canonicalize_tautomers=True
):
found = True
else:
# Check if another prediction step would cover the reaction
for product in rule_products:
prod_rule_products = self._get_products(product, rules)
prod_triggered_rules = list(
prod_rule_products.get(product, {}).keys()
)
for prod_triggered_rule in prod_triggered_rules:
if second_step_products := prod_rule_products[product][
prod_triggered_rule
]:
if FormatConverter.smiles_covered_by(
products,
second_step_products,
standardize=True,
canonicalize_tautomers=True,
):
rule_chain.append(
(
triggered_rule,
Rule.objects.get(url=triggered_rule).name,
)
)
rule_chain.append(
(
prod_triggered_rule,
Rule.objects.get(url=prod_triggered_rule).name,
)
)
res[edge.url] = rule_chain
if not found:
res[edge.url] = rule_chain
return res