[Feature] Engineer Pathway (#256)

Co-authored-by: Tim Lorsbach <tim@lorsba.ch>
Reviewed-on: enviPath/enviPy#256
This commit is contained in:
2025-12-10 07:35:42 +13:00
parent 46b0f1c124
commit 648ec150a9
17 changed files with 990 additions and 127 deletions

View File

@ -754,6 +754,30 @@ class Compound(EnviPathModel, AliasMixin, ScenarioMixin, ChemicalIdentifierMixin
@property
def normalized_structure(self) -> "CompoundStructure":
    """Return this compound's normalized structure, inferring it when missing.

    When a structure flagged ``normalized_structure=True`` already exists for
    this compound, that structure is returned. Otherwise the SMILES of every
    attached structure is standardized; if they all collapse to exactly one
    standardized SMILES, a normalized structure is created from it through
    ``CompoundStructure.create`` and the result of that call is returned.

    Raises:
        ValueError: if the structures do not standardize to exactly one SMILES
            (this includes a compound without any structures).
    """
    lookup = {"compound": self, "normalized_structure": True}

    # Fast path: a normalized structure is already stored.
    if CompoundStructure.objects.filter(**lookup).exists():
        return CompoundStructure.objects.get(**lookup)

    # The count is only needed for the debug output below.
    structure_total = self.structures.count()
    standardized = {
        FormatConverter.standardize(candidate.smiles) for candidate in self.structures.all()
    }

    if len(standardized) == 1:
        return CompoundStructure.create(
            self,
            standardized.pop(),
            name=f"Normalized structure of {self.name}",
            description=f"{self.description} (in its normalized form)",
            normalized_structure=True,
        )

    # Ambiguous (or empty) set of standardized SMILES: nothing can be inferred.
    logger.debug(f"#Structures: {structure_total} - #Standardized SMILES: {len(standardized)}")
    logger.debug(f"Couldn't infer normalized structure for {self.name} - {self.url}")
    raise ValueError(f"Couldn't find nor infer normalized structure for {self.name} ({self.url})")
def _url(self):
@ -901,57 +925,121 @@ class Compound(EnviPathModel, AliasMixin, ScenarioMixin, ChemicalIdentifierMixin
if self in mapping:
return mapping[self]
new_compound = Compound.objects.create(
package=target,
name=self.name,
description=self.description,
kv=self.kv.copy() if self.kv else {},
)
mapping[self] = new_compound
default_structure_smiles = self.default_structure.smiles
normalized_structure_smiles = self.normalized_structure.smiles
# Copy compound structures
for structure in self.structures.all():
if structure not in mapping:
new_structure = CompoundStructure.objects.create(
compound=new_compound,
smiles=structure.smiles,
canonical_smiles=structure.canonical_smiles,
inchikey=structure.inchikey,
normalized_structure=structure.normalized_structure,
name=structure.name,
description=structure.description,
kv=structure.kv.copy() if structure.kv else {},
)
mapping[structure] = new_structure
existing_compound = None
existing_normalized_compound = None
# Copy external identifiers for structure
for ext_id in structure.external_identifiers.all():
ExternalIdentifier.objects.create(
content_object=new_structure,
database=ext_id.database,
identifier_value=ext_id.identifier_value,
url=ext_id.url,
is_primary=ext_id.is_primary,
# Dedup check - Check if we find a direct match for a given SMILES
if CompoundStructure.objects.filter(
smiles=default_structure_smiles, compound__package=target
).exists():
existing_compound = CompoundStructure.objects.get(
smiles=default_structure_smiles, compound__package=target
).compound
# Check if we can find the standardized one
if CompoundStructure.objects.filter(
smiles=normalized_structure_smiles, compound__package=target
).exists():
existing_normalized_compound = CompoundStructure.objects.get(
smiles=normalized_structure_smiles, compound__package=target
).compound
if any([existing_compound, existing_normalized_compound]):
if existing_normalized_compound and existing_compound:
# We only have to set the mapping
mapping[self] = existing_compound
for structure in self.structures.all():
if structure not in mapping:
mapping[structure] = existing_compound.structures.get(
smiles=structure.smiles
)
return existing_compound
elif existing_normalized_compound:
mapping[self] = existing_normalized_compound
# Merge the structure into the existing compound
for structure in self.structures.all():
if existing_normalized_compound.structures.filter(
smiles=structure.smiles
).exists():
continue
# Create a new Structure
cs = CompoundStructure.create(
existing_normalized_compound,
structure.smiles,
name=structure.name,
description=structure.description,
normalized_structure=structure.normalized_structure,
)
if self.default_structure:
new_compound.default_structure = mapping.get(self.default_structure)
new_compound.save()
mapping[structure] = cs
for a in self.aliases:
new_compound.add_alias(a)
new_compound.save()
return existing_normalized_compound
# Copy external identifiers for compound
for ext_id in self.external_identifiers.all():
ExternalIdentifier.objects.create(
content_object=new_compound,
database=ext_id.database,
identifier_value=ext_id.identifier_value,
url=ext_id.url,
is_primary=ext_id.is_primary,
else:
raise ValueError(
f"Found a CompoundStructure for {default_structure_smiles} but not for {normalized_structure_smiles} in target package {target.name}"
)
else:
# Here we can safely use Compound.objects.create as we won't end up in a duplicate
new_compound = Compound.objects.create(
package=target,
name=self.name,
description=self.description,
kv=self.kv.copy() if self.kv else {},
)
mapping[self] = new_compound
# Copy underlying structures
for structure in self.structures.all():
if structure not in mapping:
new_structure = CompoundStructure.objects.create(
compound=new_compound,
smiles=structure.smiles,
canonical_smiles=structure.canonical_smiles,
inchikey=structure.inchikey,
normalized_structure=structure.normalized_structure,
name=structure.name,
description=structure.description,
kv=structure.kv.copy() if structure.kv else {},
)
mapping[structure] = new_structure
# Copy external identifiers for structure
for ext_id in structure.external_identifiers.all():
ExternalIdentifier.objects.create(
content_object=new_structure,
database=ext_id.database,
identifier_value=ext_id.identifier_value,
url=ext_id.url,
is_primary=ext_id.is_primary,
)
if self.default_structure:
new_compound.default_structure = mapping.get(self.default_structure)
new_compound.save()
for a in self.aliases:
new_compound.add_alias(a)
new_compound.save()
# Copy external identifiers for compound
for ext_id in self.external_identifiers.all():
ExternalIdentifier.objects.create(
content_object=new_compound,
database=ext_id.database,
identifier_value=ext_id.identifier_value,
url=ext_id.url,
is_primary=ext_id.is_primary,
)
return new_compound
class Meta:
@ -1112,34 +1200,44 @@ class Rule(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin):
rule_type = type(self)
if rule_type == SimpleAmbitRule:
new_rule = SimpleAmbitRule.objects.create(
new_rule = SimpleAmbitRule.create(
package=target,
name=self.name,
description=self.description,
smirks=self.smirks,
reactant_filter_smarts=self.reactant_filter_smarts,
product_filter_smarts=self.product_filter_smarts,
kv=self.kv.copy() if self.kv else {},
)
if self.kv:
new_rule.kv.update(**self.kv)
new_rule.save()
elif rule_type == SimpleRDKitRule:
new_rule = SimpleRDKitRule.objects.create(
new_rule = SimpleRDKitRule.create(
package=target,
name=self.name,
description=self.description,
reaction_smarts=self.reaction_smarts,
kv=self.kv.copy() if self.kv else {},
)
if self.kv:
new_rule.kv.update(**self.kv)
new_rule.save()
elif rule_type == ParallelRule:
new_rule = ParallelRule.objects.create(
package=target,
name=self.name,
description=self.description,
kv=self.kv.copy() if self.kv else {},
)
# Copy simple rules relationships
new_srs = []
for simple_rule in self.simple_rules.all():
copied_simple_rule = simple_rule.copy(target, mapping)
new_rule.simple_rules.add(copied_simple_rule)
new_srs.append(copied_simple_rule)
new_rule = ParallelRule.create(
package=target,
simple_rules=new_srs,
name=self.name,
description=self.description,
)
elif rule_type == SequentialRule:
raise ValueError("SequentialRule copy not implemented!")
else:
@ -1343,6 +1441,20 @@ class ParallelRule(Rule):
f"Simple rule {sr.uuid} does not belong to package {package.uuid}!"
)
# Deduplication check
query = ParallelRule.objects.annotate(
srs_count=Count("simple_rules", filter=Q(simple_rules__in=simple_rules), distinct=True)
)
existing_rule_qs = query.filter(
srs_count=len(simple_rules),
)
if existing_rule_qs.exists():
if existing_rule_qs.count() > 1:
logger.error(f"Found more than one reaction for given input! {existing_rule_qs}")
return existing_rule_qs.first()
r = ParallelRule()
r.package = package
@ -1524,31 +1636,44 @@ class Reaction(EnviPathModel, AliasMixin, ScenarioMixin, ReactionIdentifierMixin
if self in mapping:
return mapping[self]
# Create new reaction
new_reaction = Reaction.objects.create(
package=target,
name=self.name,
description=self.description,
multi_step=self.multi_step,
medline_references=self.medline_references,
kv=self.kv.copy() if self.kv else {},
)
mapping[self] = new_reaction
copied_reaction_educts = []
copied_reaction_products = []
copied_reaction_rules = []
# Copy educts (reactant compounds)
for educt in self.educts.all():
copied_educt = educt.copy(target, mapping)
new_reaction.educts.add(copied_educt)
copied_reaction_educts.append(copied_educt)
# Copy products
for product in self.products.all():
copied_product = product.copy(target, mapping)
new_reaction.products.add(copied_product)
copied_reaction_products.append(copied_product)
# Copy rules
for rule in self.rules.all():
copied_rule = rule.copy(target, mapping)
new_reaction.rules.add(copied_rule)
copied_reaction_rules.append(copied_rule)
new_reaction = Reaction.create(
package=target,
name=self.name,
description=self.description,
educts=copied_reaction_educts,
products=copied_reaction_products,
rules=copied_reaction_rules,
multi_step=self.multi_step,
)
if self.medline_references:
new_reaction.medline_references = self.medline_references
new_reaction.save()
if self.kv:
new_reaction.kv = self.kv
new_reaction.save()
mapping[self] = new_reaction
# Copy external identifiers
for ext_id in self.external_identifiers.all():
@ -1666,14 +1791,12 @@ class Pathway(EnviPathModel, AliasMixin, ScenarioMixin):
while len(queue):
current = queue.pop()
processed.add(current)
nodes.append(current.d3_json())
for e in self.edges:
if current in e.start_nodes.all():
for prod in e.end_nodes.all():
if prod not in queue and prod not in processed:
queue.append(prod)
for e in self.edges.filter(start_nodes=current).distinct():
for prod in e.end_nodes.all():
if prod not in queue and prod not in processed:
queue.append(prod)
# We shouldn't lose or make up nodes...
assert len(nodes) == len(self.nodes)
@ -1838,6 +1961,8 @@ class Pathway(EnviPathModel, AliasMixin, ScenarioMixin):
return mapping[self]
# Start copying the pathway
# It's safe to use .objects.create here as Pathways themselves aren't
# deduplicated
new_pathway = Pathway.objects.create(
package=target,
name=self.name,
@ -1975,6 +2100,7 @@ class Node(EnviPathModel, AliasMixin, ScenarioMixin):
else None,
"uncovered_functional_groups": False,
},
"is_engineered_intermediate": self.kv.get("is_engineered_intermediate", False),
}
@staticmethod
@ -3762,23 +3888,29 @@ class JobLog(TimeStampedModel):
done_at = models.DateTimeField(null=True, blank=True, default=None)
task_result = models.TextField(null=True, blank=True, default=None)
# Task states for which is_in_terminal_state() reports True; check_for_update()
# returns early once the stored status is one of them.
TERMINAL_STATES = ["SUCCESS", "FAILURE", "REVOKED", "IGNORED"]
def is_in_terminal_state(self):
    """Tell whether the stored ``status`` is one of ``TERMINAL_STATES``."""
    terminal_states = self.TERMINAL_STATES
    return self.status in terminal_states
def check_for_update(self):
    """Refresh this job's stored status from its asynchronous task result.

    Does nothing when the job is already in a terminal state. Otherwise the
    task result is fetched via ``get_result()``; if it reports a new state
    that is one of ``TERMINAL_STATES``, the status and completion time are
    copied onto this object, the stringified result is stored for successful
    tasks, and the object is saved.

    Returns:
        None
    """
    if self.is_in_terminal_state():
        # Terminal states never change again, so skip the backend lookup.
        return

    async_res = self.get_result()
    new_status = async_res.state

    # Only transitions into a terminal state are recorded.
    if new_status == self.status or new_status not in self.TERMINAL_STATES:
        return

    self.status = new_status
    self.done_at = async_res.date_done

    if new_status == "SUCCESS":
        result = async_res.result
        # Compare against None explicitly: a plain truthiness test would drop
        # valid but falsy results such as [] / {} / 0.
        self.task_result = str(result) if result is not None else None

    # NOTE(review): indentation is not visible in this view; saving is assumed
    # to happen only when the status actually changed - confirm.
    self.save()
@ -3789,3 +3921,13 @@ class JobLog(TimeStampedModel):
from celery.result import AsyncResult
return AsyncResult(str(self.task_id))
def parsed_result(self):
    """Return the stored task result, decoded where the job type allows it.

    Returns:
        ``None`` while the job is not in a terminal state or when no result
        was stored. For ``engineer_pathways`` jobs the stored string is
        evaluated as a Python literal; for every other job the stored string
        is returned unchanged.
    """
    if self.is_in_terminal_state() and self.task_result is not None:
        raw = self.task_result
        if self.job_name == "engineer_pathways":
            import ast

            # The result was stored as text; turn it back into a Python literal.
            return ast.literal_eval(raw)
        return raw

    return None