diff --git a/.env.prod.example b/.env.prod.example index 8d3ab7aa..217b9c91 100644 --- a/.env.prod.example +++ b/.env.prod.example @@ -16,3 +16,5 @@ POSTGRES_PORT= # MAIL EMAIL_HOST_USER= EMAIL_HOST_PASSWORD= +# MATOMO +MATOMO_SITE_ID= diff --git a/envipath/settings.py b/envipath/settings.py index e114b575..2618b01c 100644 --- a/envipath/settings.py +++ b/envipath/settings.py @@ -137,6 +137,7 @@ USE_TZ = True DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" +EMAIL_SUBJECT_PREFIX = "[enviPath] " if DEBUG: EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" else: @@ -146,6 +147,8 @@ else: EMAIL_HOST_USER = os.environ["EMAIL_HOST_USER"] EMAIL_HOST_PASSWORD = os.environ["EMAIL_HOST_PASSWORD"] EMAIL_PORT = 587 + DEFAULT_FROM_EMAIL = os.environ["DEFAULT_FROM_EMAIL"] + SERVER_EMAIL = os.environ["SERVER_EMAIL"] AUTH_USER_MODEL = "epdb.User" ADMIN_APPROVAL_REQUIRED = os.environ.get("ADMIN_APPROVAL_REQUIRED", "False") == "True" @@ -356,3 +359,6 @@ if MS_ENTRA_ENABLED: MS_ENTRA_AUTHORITY = f"https://login.microsoftonline.com/{MS_ENTRA_TENANT_ID}" MS_ENTRA_REDIRECT_URI = os.environ["MS_REDIRECT_URI"] MS_ENTRA_SCOPES = os.environ.get("MS_SCOPES", "").split(",") + +# Site ID 10 -> beta.envipath.org +MATOMO_SITE_ID = os.environ.get("MATOMO_SITE_ID", "10") diff --git a/epdb/admin.py b/epdb/admin.py index fefcdc32..88f851af 100644 --- a/epdb/admin.py +++ b/epdb/admin.py @@ -7,6 +7,7 @@ from .models import ( GroupPackagePermission, Package, MLRelativeReasoning, + EnviFormer, Compound, CompoundStructure, SimpleAmbitRule, @@ -19,11 +20,12 @@ from .models import ( Setting, ExternalDatabase, ExternalIdentifier, + JobLog, ) class UserAdmin(admin.ModelAdmin): - pass + list_display = ["username", "email", "is_active"] class UserPackagePermissionAdmin(admin.ModelAdmin): @@ -38,8 +40,14 @@ class GroupPackagePermissionAdmin(admin.ModelAdmin): pass +class JobLogAdmin(admin.ModelAdmin): + pass + + class EPAdmin(admin.ModelAdmin): search_fields = ["name", "description"] + 
list_display = ["name", "url", "created"] + ordering = ["-created"] class PackageAdmin(EPAdmin): @@ -50,6 +58,10 @@ class MLRelativeReasoningAdmin(EPAdmin): pass +class EnviFormerAdmin(EPAdmin): + pass + + class CompoundAdmin(EPAdmin): pass @@ -102,8 +114,10 @@ admin.site.register(User, UserAdmin) admin.site.register(UserPackagePermission, UserPackagePermissionAdmin) admin.site.register(Group, GroupAdmin) admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin) +admin.site.register(JobLog, JobLogAdmin) admin.site.register(Package, PackageAdmin) admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin) +admin.site.register(EnviFormer, EnviFormerAdmin) admin.site.register(Compound, CompoundAdmin) admin.site.register(CompoundStructure, CompoundStructureAdmin) admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin) diff --git a/epdb/logic.py b/epdb/logic.py index 2f7ab53b..82324100 100644 --- a/epdb/logic.py +++ b/epdb/logic.py @@ -27,6 +27,7 @@ from epdb.models import ( Compound, Reaction, CompoundStructure, + EnzymeLink, ) from utilities.chem import FormatConverter from utilities.misc import PackageImporter, PackageExporter @@ -626,6 +627,8 @@ class PackageManager(object): parent_mapping = {} # Mapping old scen_id to old_obj_id scen_mapping = defaultdict(list) + # Enzymelink Mapping rule_id to enzymelink objects + enzyme_mapping = defaultdict(list) # Store Scenarios for scenario in data["scenarios"]: @@ -657,9 +660,7 @@ class PackageManager(object): # Broken eP Data if name == "initialmasssediment" and addinf_data == "missing data": continue - - # TODO Enzymes arent ready yet - if name == "enzyme": + if name == "columnheight" and addinf_data == "(2)-(2.5);(6)-(8)": continue try: @@ -749,6 +750,9 @@ class PackageManager(object): for scen in rule["scenarios"]: scen_mapping[scen["id"]].append(r) + for enzyme_link in rule.get("enzymeLinks", []): + enzyme_mapping[r.uuid].append(enzyme_link) + print("Par: ", len(par_rules)) print("Seq: ", 
len(seq_rules)) @@ -766,6 +770,9 @@ class PackageManager(object): for scen in par_rule["scenarios"]: scen_mapping[scen["id"]].append(r) + for enzyme_link in par_rule.get("enzymeLinks", []): + enzyme_mapping[r.uuid].append(enzyme_link) + for simple_rule in par_rule["simpleRules"]: if simple_rule["id"] in mapping: r.simple_rules.add(SimpleRule.objects.get(uuid=mapping[simple_rule["id"]])) @@ -786,6 +793,9 @@ class PackageManager(object): for scen in seq_rule["scenarios"]: scen_mapping[scen["id"]].append(r) + for enzyme_link in seq_rule.get("enzymeLinks", []): + enzyme_mapping[r.uuid].append(enzyme_link) + for i, simple_rule in enumerate(seq_rule["simpleRules"]): sro = SequentialRuleOrdering() sro.simple_rule = simple_rule @@ -919,6 +929,39 @@ class PackageManager(object): print("Scenarios linked...") + # Import Enzyme Links + for rule_uuid, enzyme_links in enzyme_mapping.items(): + r = Rule.objects.get(uuid=rule_uuid) + for enzyme in enzyme_links: + e = EnzymeLink() + e.uuid = UUID(enzyme["id"].split("/")[-1]) if keep_ids else uuid4() + e.rule = r + e.name = enzyme["name"] + e.ec_number = enzyme["ecNumber"] + e.classification_level = enzyme["classificationLevel"] + e.linking_method = enzyme["linkingMethod"] + e.save() + + for reaction in enzyme["reactionLinkEvidence"]: + reaction = Reaction.objects.get(uuid=mapping[reaction["id"]]) + e.reaction_evidence.add(reaction) + + for edge in enzyme["edgeLinkEvidence"]: + edge = Edge.objects.get(uuid=mapping[edge["id"]]) + e.edge_evidence.add(edge) + + for evidence in enzyme["linkEvidence"]: + matches = re.findall(r">(R[0-9]+)<", evidence["evidence"]) + if not matches or len(matches) != 1: + logger.warning(f"Could not find reaction id in {evidence['evidence']}") + continue + + e.add_kegg_reaction_id(matches[0]) + + e.save() + + print("Enzyme links imported...") + print("Import statistics:") print("Package {} stored".format(pack.url)) print("Imported {} compounds".format(Compound.objects.filter(package=pack).count())) diff 
--git a/epdb/management/commands/create_ml_models.py b/epdb/management/commands/create_ml_models.py index 8cf3fd55..89fbc0ec 100644 --- a/epdb/management/commands/create_ml_models.py +++ b/epdb/management/commands/create_ml_models.py @@ -7,10 +7,11 @@ from epdb.models import MLRelativeReasoning, EnviFormer, Package class Command(BaseCommand): """This command can be run with - `python manage.py create_ml_models [model_names] -d [data_packages] OPTIONAL: -e [eval_packages]` - For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE - the below command would be used: - `python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge + `python manage.py create_ml_models [model_names] -d [data_packages] FOR MLRR ONLY: -r [rule_packages] + OPTIONAL: -e [eval_packages] -t threshold` + For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE with a + threshold of 0.6, the below command would be used: + `python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge -t 0.6 """ def add_arguments(self, parser): @@ -34,6 +35,13 @@ class Command(BaseCommand): help="Rule Packages mandatory for MLRR", default=[], ) + parser.add_argument( + "-t", + "--threshold", + type=float, + help="Model prediction threshold", + default=0.5, + ) @transaction.atomic def handle(self, *args, **options): @@ -67,7 +75,11 @@ class Command(BaseCommand): return packages # Iteratively create models in options["model_names"] - print(f"Creating models: {options['model_names']}") + print(f"Creating models: {options['model_names']}\n" + f"Data packages: {options['data_packages']}\n" + f"Rule Packages (only for MLRR): {options['rule_packages']}\n" + f"Eval Packages: {options['eval_packages']}\n" + f"Threshold: {options['threshold']:.2f}") data_packages = decode_packages(options["data_packages"]) eval_packages = decode_packages(options["eval_packages"]) rule_packages = 
decode_packages(options["rule_packages"]) @@ -78,9 +90,10 @@ class Command(BaseCommand): pack, data_packages=data_packages, eval_packages=eval_packages, - threshold=0.5, - name="EnviFormer - T0.5", - description="EnviFormer transformer", + threshold=options['threshold'], + name=f"EnviFormer - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}", + description=f"EnviFormer transformer trained on {options['data_packages']} " + f"evaluated on {options['eval_packages']}.", ) elif model_name == "mlrr": model = MLRelativeReasoning.create( @@ -88,9 +101,10 @@ class Command(BaseCommand): rule_packages=rule_packages, data_packages=data_packages, eval_packages=eval_packages, - threshold=0.5, - name="ECC - BBD - T0.5", - description="ML Relative Reasoning", + threshold=options['threshold'], + name=f"ECC - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}", + description=f"ML Relative Reasoning trained on {options['data_packages']} with rules from " + f"{options['rule_packages']} and evaluated on {options['eval_packages']}.", ) else: raise ValueError(f"Cannot create model of type {model_name}, unknown model type") @@ -100,6 +114,6 @@ class Command(BaseCommand): print(f"Training {model_name}") model.build_model() print(f"Evaluating {model_name}") - model.evaluate_model() + model.evaluate_model(False, eval_packages=eval_packages) print(f"Saving {model_name}") model.save() diff --git a/epdb/management/commands/dump_enviformer.py b/epdb/management/commands/dump_enviformer.py new file mode 100644 index 00000000..e333248a --- /dev/null +++ b/epdb/management/commands/dump_enviformer.py @@ -0,0 +1,59 @@ +import json +import os +import tarfile +from tempfile import TemporaryDirectory + +from django.conf import settings as s +from django.core.management.base import BaseCommand +from django.db import transaction + +from epdb.models import EnviFormer + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument( + "model", + 
type=str, + help="Model UUID of the Model to Dump", + ) + parser.add_argument("--output", type=str) + + def package_dict_and_folder(self, dict_data, folder_path, output_path): + with TemporaryDirectory() as tmpdir: + dict_filename = os.path.join(tmpdir, "data.json") + + with open(dict_filename, "w", encoding="utf-8") as f: + json.dump(dict_data, f, indent=2) + + with tarfile.open(output_path, "w:gz") as tar: + tar.add(dict_filename, arcname="data.json") + tar.add(folder_path, arcname=os.path.basename(folder_path)) + + os.remove(dict_filename) + + @transaction.atomic + def handle(self, *args, **options): + output = options["output"] + + if os.path.exists(output): + raise ValueError(f"Output file {output} already exists") + + model = EnviFormer.objects.get(uuid=options["model"]) + + data = { + "uuid": str(model.uuid), + "name": model.name, + "description": model.description, + "kv": model.kv, + "data_packages_uuids": [str(p.uuid) for p in model.data_packages.all()], + "eval_packages_uuids": [str(p.uuid) for p in model.eval_packages.all()], + "threshold": model.threshold, + "eval_results": model.eval_results, + "multigen_eval": model.multigen_eval, + "model_status": model.model_status, + } + + model_folder = os.path.join(s.MODEL_DIR, "enviformer", str(model.uuid)) + + self.package_dict_and_folder(data, model_folder, output) diff --git a/epdb/management/commands/load_enviformer.py b/epdb/management/commands/load_enviformer.py new file mode 100644 index 00000000..b2f9c3e3 --- /dev/null +++ b/epdb/management/commands/load_enviformer.py @@ -0,0 +1,81 @@ +import json +import os +import shutil +import tarfile +from tempfile import TemporaryDirectory + +from django.conf import settings as s +from django.core.management.base import BaseCommand +from django.db import transaction + +from epdb.models import EnviFormer, Package + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument( + "input", + type=str, + help=".tar.gz file containing the 
Model dump.", + ) + parser.add_argument( + "package", + type=str, + help="Package UUID where the Model should be loaded to.", + ) + + def read_dict_and_folder_from_archive(self, archive_path, extract_to="extracted_folder"): + with tarfile.open(archive_path, "r:gz") as tar: + tar.extractall(extract_to) + + dict_path = os.path.join(extract_to, "data.json") + + if not os.path.exists(dict_path): + raise FileNotFoundError("data.json not found in the archive.") + + with open(dict_path, "r", encoding="utf-8") as f: + data_dict = json.load(f) + + extracted_items = os.listdir(extract_to) + folders = [item for item in extracted_items if item != "data.json"] + folder_path = os.path.join(extract_to, folders[0]) if folders else None + + return data_dict, folder_path + + @transaction.atomic + def handle(self, *args, **options): + if not os.path.exists(options["input"]): + raise ValueError(f"Input file {options['input']} does not exist.") + + target_package = Package.objects.get(uuid=options["package"]) + + with TemporaryDirectory() as tmpdir: + data, folder = self.read_dict_and_folder_from_archive(options["input"], tmpdir) + + model = EnviFormer() + model.package = target_package + # model.uuid = data["uuid"] + model.name = data["name"] + model.description = data["description"] + model.kv = data["kv"] + model.threshold = float(data["threshold"]) + model.eval_results = data["eval_results"] + model.multigen_eval = data["multigen_eval"] + model.model_status = data["model_status"] + model.save() + + for p_uuid in data["data_packages_uuids"]: + p = Package.objects.get(uuid=p_uuid) + model.data_packages.add(p) + + for p_uuid in data["eval_packages_uuids"]: + p = Package.objects.get(uuid=p_uuid) + model.eval_packages.add(p) + + target_folder = os.path.join(s.MODEL_DIR, "enviformer", str(model.uuid)) + + shutil.copytree(folder, target_folder) + os.rename( + os.path.join(s.MODEL_DIR, "enviformer", str(model.uuid), f"{data['uuid']}.ckpt"), + os.path.join(s.MODEL_DIR, "enviformer", 
str(model.uuid), f"{model.uuid}.ckpt"), + ) diff --git a/epdb/management/commands/localize_urls.py b/epdb/management/commands/localize_urls.py index b9f95b11..cc0a3726 100644 --- a/epdb/management/commands/localize_urls.py +++ b/epdb/management/commands/localize_urls.py @@ -1,8 +1,10 @@ from django.apps import apps from django.core.management.base import BaseCommand -from django.db.models import F, Value -from django.db.models.functions import Replace +from django.db.models import F, Value, TextField, JSONField +from django.db.models.functions import Replace, Cast + +from epdb.models import EnviPathModel class Command(BaseCommand): @@ -41,6 +43,7 @@ class Command(BaseCommand): "RuleBasedRelativeReasoning", "EnviFormer", "ApplicabilityDomain", + "EnzymeLink", ] for model in MODELS: obj_cls = apps.get_model("epdb", model) @@ -48,3 +51,14 @@ class Command(BaseCommand): obj_cls.objects.update( url=Replace(F("url"), Value(options["old"]), Value(options["new"])) ) + if issubclass(obj_cls, EnviPathModel): + obj_cls.objects.update( + kv=Cast( + Replace( + Cast(F("kv"), output_field=TextField()), + Value(options["old"]), + Value(options["new"]), + ), + output_field=JSONField(), + ) + ) diff --git a/epdb/management/commands/update_job_logs.py b/epdb/management/commands/update_job_logs.py new file mode 100644 index 00000000..a5b17cfa --- /dev/null +++ b/epdb/management/commands/update_job_logs.py @@ -0,0 +1,38 @@ +from datetime import date, timedelta + +from django.core.management.base import BaseCommand +from django.db import transaction + +from epdb.models import JobLog + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument( + "--cleanup", + type=int, + default=None, + help="Remove all logs older than this number of days. 
Default is None, which does not remove any logs.", + ) + + @transaction.atomic + def handle(self, *args, **options): + if options["cleanup"] is not None: + cleanup_dt = date.today() - timedelta(days=options["cleanup"]) + print(JobLog.objects.filter(created__lt=cleanup_dt).delete()) + + logs = JobLog.objects.filter(status="INITIAL") + print(f"Found {logs.count()} logs to update") + updated = 0 + for log in logs: + res = log.check_for_update() + if res: + updated += 1 + + print(f"Updated {updated} logs") + + from django.db.models import Count + + qs = JobLog.objects.values("status").annotate(total=Count("status")) + for r in qs: + print(r["status"], r["total"]) diff --git a/epdb/migrations/0008_enzymelink.py b/epdb/migrations/0008_enzymelink.py new file mode 100644 index 00000000..35d0a950 --- /dev/null +++ b/epdb/migrations/0008_enzymelink.py @@ -0,0 +1,64 @@ +# Generated by Django 5.2.7 on 2025-10-10 06:58 + +import django.db.models.deletion +import django.utils.timezone +import model_utils.fields +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("epdb", "0007_alter_enviformer_options_enviformer_app_domain_and_more"), + ] + + operations = [ + migrations.CreateModel( + name="EnzymeLink", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ( + "created", + model_utils.fields.AutoCreatedField( + default=django.utils.timezone.now, editable=False, verbose_name="created" + ), + ), + ( + "modified", + model_utils.fields.AutoLastModifiedField( + default=django.utils.timezone.now, editable=False, verbose_name="modified" + ), + ), + ( + "uuid", + models.UUIDField( + default=uuid.uuid4, unique=True, verbose_name="UUID of this object" + ), + ), + ("name", models.TextField(default="no name", verbose_name="Name")), + ( + "description", + models.TextField(default="no description", verbose_name="Descriptions"), + ), + ("url", 
models.TextField(null=True, unique=True, verbose_name="URL")), + ("kv", models.JSONField(blank=True, default=dict, null=True)), + ("ec_number", models.TextField(verbose_name="EC Number")), + ("classification_level", models.IntegerField(verbose_name="Classification Level")), + ("linking_method", models.TextField(verbose_name="Linking Method")), + ("edge_evidence", models.ManyToManyField(to="epdb.edge")), + ("reaction_evidence", models.ManyToManyField(to="epdb.reaction")), + ( + "rule", + models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="epdb.rule"), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/epdb/migrations/0009_joblog.py b/epdb/migrations/0009_joblog.py new file mode 100644 index 00000000..5c731eb1 --- /dev/null +++ b/epdb/migrations/0009_joblog.py @@ -0,0 +1,66 @@ +# Generated by Django 5.2.7 on 2025-10-27 09:39 + +import django.db.models.deletion +import django.utils.timezone +import model_utils.fields +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("epdb", "0008_enzymelink"), + ] + + operations = [ + migrations.CreateModel( + name="JobLog", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ( + "created", + model_utils.fields.AutoCreatedField( + default=django.utils.timezone.now, editable=False, verbose_name="created" + ), + ), + ( + "modified", + model_utils.fields.AutoLastModifiedField( + default=django.utils.timezone.now, editable=False, verbose_name="modified" + ), + ), + ("task_id", models.UUIDField(unique=True)), + ("job_name", models.TextField()), + ( + "status", + models.CharField( + choices=[ + ("INITIAL", "Initial"), + ("SUCCESS", "Success"), + ("FAILURE", "Failure"), + ("REVOKED", "Revoked"), + ("IGNORED", "Ignored"), + ], + default="INITIAL", + max_length=20, + ), + ), + ("done_at", models.DateTimeField(blank=True, default=None, 
null=True)), + ("task_result", models.TextField(blank=True, default=None, null=True)), + ( + "user", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL + ), + ), + ], + options={ + "abstract": False, + }, + ), + ] diff --git a/epdb/models.py b/epdb/models.py index fcd579da..ceabd6d6 100644 --- a/epdb/models.py +++ b/epdb/models.py @@ -311,7 +311,7 @@ class ExternalDatabase(TimeStampedModel): }, { "database": ExternalDatabase.objects.get(name="ChEBI"), - "placeholder": "ChEBI ID without prefix e.g. 12345", + "placeholder": "ChEBI ID without prefix e.g. 10576", }, ], "structure": [ @@ -329,7 +329,7 @@ class ExternalDatabase(TimeStampedModel): }, { "database": ExternalDatabase.objects.get(name="ChEBI"), - "placeholder": "ChEBI ID without prefix e.g. 12345", + "placeholder": "ChEBI ID without prefix e.g. 10576", }, ], "reaction": [ @@ -343,7 +343,7 @@ class ExternalDatabase(TimeStampedModel): }, { "database": ExternalDatabase.objects.get(name="UniProt"), - "placeholder": "Query ID for UniPro e.g. rhea:12345", + "placeholder": "Query ID for UniProt e.g. 
rhea:12345", }, ], } @@ -478,7 +478,7 @@ class ChemicalIdentifierMixin(ExternalIdentifierMixin): return self.add_external_identifier("CAS", cas_number) def get_pubchem_identifiers(self): - return self.get_external_identifier("PubChem Compound") or self.get_external_identifier( + return self.get_external_identifier("PubChem Compound") | self.get_external_identifier( "PubChem Substance" ) @@ -495,6 +495,20 @@ class ChemicalIdentifierMixin(ExternalIdentifierMixin): return self.get_external_identifier("CAS") +class KEGGIdentifierMixin(ExternalIdentifierMixin): + @property + def kegg_reaction_links(self): + return self.get_external_identifier("KEGG Reaction") + + def add_kegg_reaction_id(self, kegg_id): + return self.add_external_identifier( + "KEGG Reaction", kegg_id, f"https://www.genome.jp/entry/{kegg_id}" + ) + + class Meta: + abstract = True + + class ReactionIdentifierMixin(ExternalIdentifierMixin): class Meta: abstract = True @@ -1014,6 +1028,26 @@ class CompoundStructure(EnviPathModel, AliasMixin, ScenarioMixin, ChemicalIdenti return self.compound.default_structure == self +class EnzymeLink(EnviPathModel, KEGGIdentifierMixin): + rule = models.ForeignKey("Rule", on_delete=models.CASCADE, db_index=True) + ec_number = models.TextField(blank=False, null=False, verbose_name="EC Number") + classification_level = models.IntegerField( + blank=False, null=False, verbose_name="Classification Level" + ) + linking_method = models.TextField(blank=False, null=False, verbose_name="Linking Method") + + reaction_evidence = models.ManyToManyField("epdb.Reaction") + edge_evidence = models.ManyToManyField("epdb.Edge") + + external_identifiers = GenericRelation("ExternalIdentifier") + + def _url(self): + return "{}/enzymelink/{}".format(self.rule.url, self.uuid) + + def get_group(self) -> str: + return ".".join(self.ec_number.split(".")[:3]) + ".-" + + class Rule(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin): package = models.ForeignKey( "epdb.Package", 
verbose_name="Package", on_delete=models.CASCADE, db_index=True @@ -1095,6 +1129,18 @@ class Rule(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin): return new_rule + def enzymelinks(self): + return self.enzymelink_set.all() + + def get_grouped_enzymelinks(self): + res = defaultdict(list) + + for el in self.enzymelinks(): + key = ".".join(el.ec_number.split(".")[:3]) + ".-" + res[key].append(el) + + return dict(res) + class SimpleRule(Rule): pass @@ -1436,6 +1482,16 @@ class Reaction(EnviPathModel, AliasMixin, ScenarioMixin, ReactionIdentifierMixin id__in=Edge.objects.filter(edge_label=self).values("pathway_id") ).order_by("name") + def get_related_enzymes(self): + res = [] + edges = Edge.objects.filter(edge_label=self) + for e in edges: + for scen in e.scenarios.all(): + for ai in scen.additional_information.keys(): + if ai == "Enzyme": + res.extend(scen.additional_information[ai]) + return res + class Pathway(EnviPathModel, AliasMixin, ScenarioMixin): package = models.ForeignKey( @@ -2172,10 +2228,18 @@ class PackageBasedModel(EPModel): self.model_status = self.BUILT_NOT_EVALUATED self.save() - def evaluate_model(self): + def evaluate_model(self, multigen: bool, eval_packages: List["Package"] = None): if self.model_status != self.BUILT_NOT_EVALUATED: raise ValueError(f"Can't evaluate a model in state {self.model_status}!") + if multigen: + self.multigen_eval = multigen + self.save() + + if eval_packages is not None: + for p in eval_packages: + self.eval_packages.add(p) + self.model_status = self.EVALUATING self.save() @@ -2472,7 +2536,6 @@ class RuleBasedRelativeReasoning(PackageBasedModel): package: "Package", rule_packages: List["Package"], data_packages: List["Package"], - eval_packages: List["Package"], threshold: float = 0.5, min_count: int = 10, max_count: int = 0, @@ -2521,10 +2584,6 @@ class RuleBasedRelativeReasoning(PackageBasedModel): for p in rule_packages: rbrr.data_packages.add(p) - if eval_packages: - for p in eval_packages: - 
rbrr.eval_packages.add(p) - rbrr.save() return rbrr @@ -2579,7 +2638,6 @@ class MLRelativeReasoning(PackageBasedModel): package: "Package", rule_packages: List["Package"], data_packages: List["Package"], - eval_packages: List["Package"], threshold: float = 0.5, name: "str" = None, description: str = None, @@ -2619,10 +2677,6 @@ class MLRelativeReasoning(PackageBasedModel): for p in rule_packages: mlrr.data_packages.add(p) - if eval_packages: - for p in eval_packages: - mlrr.eval_packages.add(p) - if build_app_domain: ad = ApplicabilityDomain.create( mlrr, @@ -2942,7 +2996,6 @@ class EnviFormer(PackageBasedModel): def create( package: "Package", data_packages: List["Package"], - eval_packages: List["Package"], threshold: float = 0.5, name: "str" = None, description: str = None, @@ -2975,10 +3028,6 @@ class EnviFormer(PackageBasedModel): for p in data_packages: mod.data_packages.add(p) - if eval_packages: - for p in eval_packages: - mod.eval_packages.add(p) - # if build_app_domain: # ad = ApplicabilityDomain.create(mod, app_domain_num_neighbours, app_domain_reliability_threshold, # app_domain_local_compatibility_threshold) @@ -2992,7 +3041,8 @@ class EnviFormer(PackageBasedModel): from enviformer import load ckpt = os.path.join(s.MODEL_DIR, "enviformer", str(self.uuid), f"{self.uuid}.ckpt") - return load(device=s.ENVIFORMER_DEVICE, ckpt_path=ckpt) + mod = load(device=s.ENVIFORMER_DEVICE, ckpt_path=ckpt) + return mod def predict(self, smiles) -> List["PredictionResult"]: return self.predict_batch([smiles])[0] @@ -3006,8 +3056,12 @@ class EnviFormer(PackageBasedModel): for smiles in smiles_list ] logger.info(f"Submitting {canon_smiles} to {self.name}") + start = datetime.now() products_list = self.model.predict_batch(canon_smiles) - logger.info(f"Got results {products_list}") + end = datetime.now() + logger.info( + f"Prediction took {(end - start).total_seconds():.2f} seconds. 
Got results {products_list}" + ) results = [] for products in products_list: @@ -3034,6 +3088,7 @@ class EnviFormer(PackageBasedModel): start = datetime.now() # Standardise reactions for the training data, EnviFormer ignores stereochemistry currently + co2 = {"C(=O)=O", "O=C=O"} ds = [] for reaction in self._get_reactions(): educts = ".".join( @@ -3048,7 +3103,8 @@ class EnviFormer(PackageBasedModel): for smile in reaction.products.all() ] ) - ds.append(f"{educts}>>{products}") + if products not in co2: + ds.append(f"{educts}>>{products}") end = datetime.now() logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds") @@ -3084,10 +3140,18 @@ class EnviFormer(PackageBasedModel): args = {"clz": "EnviFormer"} return args - def evaluate_model(self): + def evaluate_model(self, multigen: bool, eval_packages: List["Package"] = None): if self.model_status != self.BUILT_NOT_EVALUATED: raise ValueError(f"Can't evaluate a model in state {self.model_status}!") + if multigen: + self.multigen_eval = multigen + self.save() + + if eval_packages is not None: + for p in eval_packages: + self.eval_packages.add(p) + self.model_status = self.EVALUATING self.save() @@ -3244,7 +3308,7 @@ class EnviFormer(PackageBasedModel): ds = self.load_dataset() n_splits = 20 - shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42) + shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42) # Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models # this helps reduce the memory footprint. @@ -3312,7 +3376,7 @@ class EnviFormer(PackageBasedModel): # Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each # iteration instead of storing all trained models. 
for split_id, (train, test) in enumerate( - ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42).split(pathways) + ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42).split(pathways) ): train_pathways = [pathways[i] for i in train] test_pathways = [pathways[i] for i in test] @@ -3614,3 +3678,53 @@ class Setting(EnviPathModel): self.public = True self.global_default = True self.save() + + +class JobLogStatus(models.TextChoices): + INITIAL = "INITIAL", "Initial" + SUCCESS = "SUCCESS", "Success" + FAILURE = "FAILURE", "Failure" + REVOKED = "REVOKED", "Revoked" + IGNORED = "IGNORED", "Ignored" + + +class JobLog(TimeStampedModel): + user = models.ForeignKey("epdb.User", models.CASCADE) + task_id = models.UUIDField(unique=True) + job_name = models.TextField(null=False, blank=False) + status = models.CharField( + max_length=20, + choices=JobLogStatus.choices, + default=JobLogStatus.INITIAL, + ) + + done_at = models.DateTimeField(null=True, blank=True, default=None) + task_result = models.TextField(null=True, blank=True, default=None) + + def check_for_update(self): + async_res = self.get_result() + new_status = async_res.state + + TERMINAL_STATES = [ + "SUCCESS", + "FAILURE", + "REVOKED", + "IGNORED", + ] + + if new_status != self.status and new_status in TERMINAL_STATES: + self.status = new_status + self.done_at = async_res.date_done + + if new_status == "SUCCESS": + self.task_result = async_res.result + + self.save() + + return True + return False + + def get_result(self): + from celery.result import AsyncResult + + return AsyncResult(str(self.task_id)) diff --git a/epdb/tasks.py b/epdb/tasks.py index aabaf8d1..b872d4a9 100644 --- a/epdb/tasks.py +++ b/epdb/tasks.py @@ -1,12 +1,58 @@ +import csv +import io import logging -from typing import Optional +from datetime import datetime +from typing import Any, Callable, List, Optional +from uuid import uuid4 from celery import shared_task -from epdb.models import Pathway, Node, EPModel, Setting -from 
epdb.logic import SPathway +from celery.utils.functional import LRUCache +from epdb.logic import SPathway +from epdb.models import EPModel, JobLog, Node, Package, Pathway, Rule, Setting, User, Edge logger = logging.getLogger(__name__) +ML_CACHE = LRUCache(3) # Cache the three most recent ML models to reduce load times. + + +def get_ml_model(model_pk: int): + if model_pk not in ML_CACHE: + ML_CACHE[model_pk] = EPModel.objects.get(id=model_pk) + return ML_CACHE[model_pk] + + +def dispatch_eager(user: "User", job: Callable, *args, **kwargs): + try: + x = job(*args, **kwargs) + log = JobLog() + log.user = user + log.task_id = uuid4() + log.job_name = job.__name__ + log.status = "SUCCESS" + log.done_at = datetime.now() + log.task_result = str(x) if x else None + log.save() + + return x + except Exception as e: + logger.exception(e) + raise e + + +def dispatch(user: "User", job: Callable, *args, **kwargs): + try: + x = job.delay(*args, **kwargs) + log = JobLog() + log.user = user + log.task_id = x.task_id + log.job_name = job.__name__ + log.status = "INITIAL" + log.save() + + return x.result + except Exception as e: + logger.exception(e) + raise e @shared_task(queue="background") @@ -16,7 +62,7 @@ def mul(a, b): @shared_task(queue="predict") def predict_simple(model_pk: int, smiles: str): - mod = EPModel.objects.get(id=model_pk) + mod = get_ml_model(model_pk) res = mod.predict(smiles) return res @@ -26,17 +72,55 @@ def send_registration_mail(user_pk: int): pass -@shared_task(queue="model") -def build_model(model_pk: int): +@shared_task(bind=True, queue="model") +def build_model(self, model_pk: int): mod = EPModel.objects.get(id=model_pk) - mod.build_dataset() - mod.build_model() + + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update(status="RUNNING", task_result=mod.url) + + try: + mod.build_dataset() + mod.build_model() + except Exception as e: + if JobLog.objects.filter(task_id=self.request.id).exists(): 
+ JobLog.objects.filter(task_id=self.request.id).update( + status="FAILED", task_result=mod.url + ) + + raise e + + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update(status="SUCCESS", task_result=mod.url) + + return mod.url -@shared_task(queue="model") -def evaluate_model(model_pk: int): +@shared_task(bind=True, queue="model") +def evaluate_model(self, model_pk: int, multigen: bool, package_pks: Optional[list] = None): + packages = None + + if package_pks: + packages = Package.objects.filter(pk__in=package_pks) + mod = EPModel.objects.get(id=model_pk) - mod.evaluate_model() + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update(status="RUNNING", task_result=mod.url) + + try: + mod.evaluate_model(multigen, eval_packages=packages) + except Exception as e: + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update( + status="FAILED", task_result=mod.url + ) + + raise e + + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update(status="SUCCESS", task_result=mod.url) + + return mod.url @shared_task(queue="model") @@ -45,16 +129,26 @@ def retrain(model_pk: int): mod.retrain() -@shared_task(queue="predict") +@shared_task(bind=True, queue="predict") def predict( - pw_pk: int, pred_setting_pk: int, limit: Optional[int] = None, node_pk: Optional[int] = None + self, + pw_pk: int, + pred_setting_pk: int, + limit: Optional[int] = None, + node_pk: Optional[int] = None, ) -> Pathway: pw = Pathway.objects.get(id=pw_pk) setting = Setting.objects.get(id=pred_setting_pk) + # If the setting has a model add/restore it from the cache + if setting.model is not None: + setting.model = get_ml_model(setting.model.pk) pw.kv.update(**{"status": "running"}) pw.save() + if JobLog.objects.filter(task_id=self.request.id).exists(): + 
JobLog.objects.filter(task_id=self.request.id).update(status="RUNNING", task_result=pw.url) + try: # regular prediction if limit is not None: @@ -79,7 +173,111 @@ def predict( except Exception as e: pw.kv.update({"status": "failed"}) pw.save() + + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update( + status="FAILED", task_result=pw.url + ) + raise e pw.kv.update(**{"status": "completed"}) pw.save() + + if JobLog.objects.filter(task_id=self.request.id).exists(): + JobLog.objects.filter(task_id=self.request.id).update(status="SUCCESS", task_result=pw.url) + + return pw.url + + +@shared_task(bind=True, queue="background") +def identify_missing_rules( + self, + pw_pks: List[int], + rule_package_pk: int, +): + from utilities.misc import PathwayUtils + + rules = Package.objects.get(pk=rule_package_pk).get_applicable_rules() + + rows: List[Any] = [] + header = [ + "Package Name", + "Pathway Name", + "Educt Name", + "Educt SMILES", + "Reaction Name", + "Reaction SMIRKS", + "Triggered Rules", + "Reactant SMARTS", + "Product SMARTS", + "Product Names", + "Product SMILES", + ] + + rows.append(header) + + for pw in Pathway.objects.filter(pk__in=pw_pks): + pu = PathwayUtils(pw) + + missing_rules = pu.find_missing_rules(rules) + + package_name = pw.package.name + pathway_name = pw.name + + for edge_url, rule_chain in missing_rules.items(): + row: List[Any] = [package_name, pathway_name] + edge = Edge.objects.get(url=edge_url) + educts = edge.start_nodes.all() + + for educt in educts: + row.append(educt.default_node_label.name) + row.append(educt.default_node_label.smiles) + + row.append(edge.edge_label.name) + row.append(edge.edge_label.smirks()) + + rule_names = [] + reactant_smarts = [] + product_smarts = [] + + for r in rule_chain: + r = Rule.objects.get(url=r[0]) + rule_names.append(r.name) + + rs = r.reactants_smarts + if isinstance(rs, set): + rs = list(rs) + + ps = r.products_smarts + if isinstance(ps, set): 
+ ps = list(ps) + + reactant_smarts.append(rs) + product_smarts.append(ps) + + row.append(rule_names) + row.append(reactant_smarts) + row.append(product_smarts) + + products = edge.end_nodes.all() + product_names = [] + product_smiles = [] + + for product in products: + product_names.append(product.default_node_label.name) + product_smiles.append(product.default_node_label.smiles) + + row.append(product_names) + row.append(product_smiles) + + rows.append(row) + + buffer = io.StringIO() + + writer = csv.writer(buffer) + writer.writerows(rows) + + buffer.seek(0) + + return buffer.getvalue() diff --git a/epdb/templatetags/envipytags.py b/epdb/templatetags/envipytags.py index c8c92fef..6c250e63 100644 --- a/epdb/templatetags/envipytags.py +++ b/epdb/templatetags/envipytags.py @@ -1,8 +1,21 @@ from django import template +from pydantic import AnyHttpUrl, ValidationError +from pydantic.type_adapter import TypeAdapter register = template.Library() +url_adapter = TypeAdapter(AnyHttpUrl) + @register.filter def classname(obj): return obj.__class__.__name__ + + +@register.filter +def is_url(value): + try: + url_adapter.validate_python(value) + return True + except ValidationError: + return False diff --git a/epdb/urls.py b/epdb/urls.py index 16f0f2ba..25e18680 100644 --- a/epdb/urls.py +++ b/epdb/urls.py @@ -1,5 +1,5 @@ -from django.urls import path, re_path from django.contrib.auth import views as auth_views +from django.urls import path, re_path from . 
import views as v @@ -88,20 +88,36 @@ urlpatterns = [ v.package_rule, name="package rule detail", ), - re_path( - rf"^package/(?P{UUID})/simple-rdkit-rule/(?P{UUID})$", - v.package_rule, - name="package rule detail", - ), + # re_path( + # rf"^package/(?P{UUID})/simple-rdkit-rule/(?P{UUID})$", + # v.package_rule, + # name="package rule detail", + # ), re_path( rf"^package/(?P{UUID})/parallel-rule/(?P{UUID})$", v.package_rule, name="package rule detail", ), + # re_path( + # rf"^package/(?P{UUID})/sequential-rule/(?P{UUID})$", + # v.package_rule, + # name="package rule detail", + # ), + # EnzymeLinks re_path( - rf"^package/(?P{UUID})/sequential-rule/(?P{UUID})$", - v.package_rule, - name="package rule detail", + rf"^package/(?P{UUID})/rule/(?P{UUID})/enzymelink/(?P{UUID})$", + v.package_rule_enzymelink, + name="package rule enzymelink detail", + ), + re_path( + rf"^package/(?P{UUID})/simple-ambit-rule/(?P{UUID})/enzymelink/(?P{UUID})$", + v.package_rule_enzymelink, + name="package rule enzymelink detail", + ), + re_path( + rf"^package/(?P{UUID})/parallel-rule/(?P{UUID})/enzymelink/(?P{UUID})$", + v.package_rule_enzymelink, + name="package rule enzymelink detail", ), # Reaction re_path( @@ -174,6 +190,7 @@ urlpatterns = [ re_path(r"^indigo/dearomatize$", v.dearomatize, name="indigo_dearomatize"), re_path(r"^indigo/layout$", v.layout, name="indigo_layout"), re_path(r"^depict$", v.depict, name="depict"), + re_path(r"^jobs", v.jobs, name="jobs"), # OAuth Stuff path("o/userinfo/", v.userinfo, name="oauth_userinfo"), ] diff --git a/epdb/views.py b/epdb/views.py index f9279e1a..10a8027b 100644 --- a/epdb/views.py +++ b/epdb/views.py @@ -47,6 +47,8 @@ from .models import ( Edge, ExternalDatabase, ExternalIdentifier, + EnzymeLink, + JobLog, ) logger = logging.getLogger(__name__) @@ -239,6 +241,7 @@ def get_base_context(request, for_user=None) -> Dict[str, Any]: "enabled_features": s.FLAGS, "debug": s.DEBUG, "external_databases": ExternalDatabase.get_databases(), + "site_id": 
s.MATOMO_SITE_ID, }, } @@ -756,8 +759,8 @@ def package_models(request, package_uuid): context["unreviewed_objects"] = unreviewed_model_qs context["model_types"] = { - "ML Relative Reasoning": "ml-relative-reasoning", - "Rule Based Relative Reasoning": "rule-based-relative-reasoning", + "ML Relative Reasoning": "mlrr", + "Rule Based Relative Reasoning": "rbrr", } if s.FLAGS.get("ENVIFORMER", False): @@ -776,69 +779,67 @@ def package_models(request, package_uuid): model_type = request.POST.get("model-type") + # Generic fields for ML and Rule Based + rule_packages = request.POST.getlist("model-rule-packages") + data_packages = request.POST.getlist("model-data-packages") + + # Generic params + params = { + "package": current_package, + "name": name, + "description": description, + "data_packages": [ + PackageManager.get_package_by_url(current_user, p) for p in data_packages + ], + } + if model_type == "enviformer": - threshold = float(request.POST.get(f"{model_type}-threshold", 0.5)) + threshold = float(request.POST.get("model-threshold", 0.5)) + params["threshold"] = threshold - mod = EnviFormer.create(current_package, name, description, threshold) + mod = EnviFormer.create(**params) + elif model_type == "mlrr": + # ML Specific + threshold = float(request.POST.get("model-threshold", 0.5)) + # TODO handle additional fingerprinter + # fingerprinter = request.POST.get("model-fingerprinter") - elif model_type == "ml-relative-reasoning" or model_type == "rule-based-relative-reasoning": - # Generic fields for ML and Rule Based - rule_packages = request.POST.getlist("package-based-relative-reasoning-rule-packages") - data_packages = request.POST.getlist("package-based-relative-reasoning-data-packages") - eval_packages = request.POST.getlist( - "package-based-relative-reasoning-evaluation-packages", [] - ) + params["rule_packages"] = [ + PackageManager.get_package_by_url(current_user, p) for p in rule_packages + ] - # Generic params - params = { - "package": current_package, 
- "name": name, - "description": description, - "rule_packages": [ - PackageManager.get_package_by_url(current_user, p) for p in rule_packages - ], - "data_packages": [ - PackageManager.get_package_by_url(current_user, p) for p in data_packages - ], - "eval_packages": [ - PackageManager.get_package_by_url(current_user, p) for p in eval_packages - ], - } + # App Domain related parameters + build_ad = request.POST.get("build-app-domain", False) == "on" + num_neighbors = request.POST.get("num-neighbors", 5) + reliability_threshold = request.POST.get("reliability-threshold", 0.5) + local_compatibility_threshold = request.POST.get("local-compatibility-threshold", 0.5) - if model_type == "ml-relative-reasoning": - # ML Specific - threshold = float(request.POST.get(f"{model_type}-threshold", 0.5)) - # TODO handle additional fingerprinter - # fingerprinter = request.POST.get(f"{model_type}-fingerprinter") + params["threshold"] = threshold + # params['fingerprinter'] = fingerprinter + params["build_app_domain"] = build_ad + params["app_domain_num_neighbours"] = num_neighbors + params["app_domain_reliability_threshold"] = reliability_threshold + params["app_domain_local_compatibility_threshold"] = local_compatibility_threshold - # App Domain related parameters - build_ad = request.POST.get("build-app-domain", False) == "on" - num_neighbors = request.POST.get("num-neighbors", 5) - reliability_threshold = request.POST.get("reliability-threshold", 0.5) - local_compatibility_threshold = request.POST.get( - "local-compatibility-threshold", 0.5 - ) + mod = MLRelativeReasoning.create(**params) + elif model_type == "rbrr": + params["rule_packages"] = [ + PackageManager.get_package_by_url(current_user, p) for p in rule_packages + ] - params["threshold"] = threshold - # params['fingerprinter'] = fingerprinter - params["build_app_domain"] = build_ad - params["app_domain_num_neighbours"] = num_neighbors - params["app_domain_reliability_threshold"] = reliability_threshold - 
params["app_domain_local_compatibility_threshold"] = local_compatibility_threshold - - mod = MLRelativeReasoning.create(**params) - else: - mod = RuleBasedRelativeReasoning.create(**params) - - from .tasks import build_model - - build_model.delay(mod.pk) + mod = RuleBasedRelativeReasoning.create(**params) + elif s.FLAGS.get("PLUGINS", False) and model_type in s.CLASSIFIER_PLUGINS.values(): + pass else: return error( request, "Invalid model type.", f'Model type "{model_type}" is not supported."' ) - return redirect(mod.url) + from .tasks import dispatch, build_model + + dispatch(current_user, build_model, mod.pk) + + return redirect(mod.url) else: return HttpResponseNotAllowed(["GET", "POST"]) @@ -866,6 +867,10 @@ def package_model(request, package_uuid, model_uuid): return JsonResponse({"error": f'"{smiles}" is not a valid SMILES'}, status=400) if classify: + from epdb.tasks import dispatch_eager, predict_simple + + res = dispatch_eager(current_user, predict_simple, current_model.pk, stand_smiles) + pred_res = current_model.predict(stand_smiles) res = [] @@ -910,9 +915,25 @@ def package_model(request, package_uuid, model_uuid): current_model.delete() return redirect(current_package.url + "/model") elif hidden == "evaluate": - from .tasks import evaluate_model + from .tasks import dispatch, evaluate_model + + eval_type = request.POST.get("model-evaluation-type") + + if eval_type not in ["sg", "mg"]: + return error( + request, + "Invalid evaluation type", + f'Evaluation type "{eval_type}" is not supported. 
Only "sg" and "mg" are supported.', + ) + + multigen = eval_type == "mg" + + eval_packages = request.POST.getlist("model-evaluation-packages") + eval_package_ids = [ + PackageManager.get_package_by_url(current_user, p).id for p in eval_packages + ] + dispatch(current_user, evaluate_model, current_model.pk, multigen, eval_package_ids) - evaluate_model.delay(current_model.pk) return redirect(current_model.url) else: return HttpResponseBadRequest() @@ -1255,7 +1276,16 @@ def package_compound_structures(request, package_uuid, compound_uuid): structure_smiles = request.POST.get("structure-smiles").strip() structure_description = request.POST.get("structure-description") - cs = current_compound.add_structure(structure_smiles, structure_name, structure_description) + try: + cs = current_compound.add_structure( + structure_smiles, structure_name, structure_description + ) + except ValueError: + return error( + request, + "Adding structure failed!", + "The structure could not be added as normalized structures don't match!", + ) return redirect(cs.url) @@ -1460,12 +1490,20 @@ def package_rule(request, package_uuid, rule_uuid): logger.info( f"Rule {current_rule.uuid} returned multiple product sets on {smiles}, picking the first one." ) - - smirks = f"{stand_smiles}>>{'.'.join(sorted(res[0]))}" + # Some Rules are touching unrelated areas which might result in ~ indicating + # any bond (-, =, #). For drawing we need a concrete bond. 
-> use single bond + product_smiles = [x.replace("~", "-") for x in res[0]] + smirks = f"{stand_smiles}>>{'.'.join(sorted(product_smiles))}" # Usually the functional groups are a mapping of fg -> count # As we are doing it on the fly here fake a high count to ensure that its properly highlighted - educt_functional_groups = {x: 1000 for x in current_rule.reactants_smarts} - product_functional_groups = {x: 1000 for x in current_rule.products_smarts} + + if isinstance(current_rule, SimpleAmbitRule): + educt_functional_groups = {current_rule.reactants_smarts: 1000} + product_functional_groups = {current_rule.products_smarts: 1000} + else: + educt_functional_groups = {x: 1000 for x in current_rule.reactants_smarts} + product_functional_groups = {x: 1000 for x in current_rule.products_smarts} + return HttpResponse( IndigoUtils.smirks_to_svg( smirks, @@ -1536,6 +1574,32 @@ def package_rule(request, package_uuid, rule_uuid): return HttpResponseNotAllowed(["GET", "POST"]) +@package_permission_required() +def package_rule_enzymelink(request, package_uuid, rule_uuid, enzymelink_uuid): + current_user = _anonymous_or_real(request) + current_package = PackageManager.get_package_by_id(current_user, package_uuid) + current_rule = Rule.objects.get(package=current_package, uuid=rule_uuid) + current_enzymelink = EnzymeLink.objects.get(rule=current_rule, uuid=enzymelink_uuid) + + if request.method == "GET": + context = get_base_context(request) + + context["title"] = f"enviPath - {current_package.name} - {current_rule.name}" + + context["meta"]["current_package"] = current_package + context["object_type"] = "enzyme" + context["breadcrumbs"] = breadcrumbs( + current_package, "rule", current_rule, "enzymelink", current_enzymelink + ) + + context["enzymelink"] = current_enzymelink + context["current_object"] = current_enzymelink + + return render(request, "objects/enzymelink.html", context) + + return HttpResponseNotAllowed(["GET"]) + + @package_permission_required() def 
package_reactions(request, package_uuid): current_user = _anonymous_or_real(request) @@ -1774,9 +1838,9 @@ def package_pathways(request, package_uuid): pw.setting = prediction_setting pw.save() - from .tasks import predict + from .tasks import dispatch, predict - predict.delay(pw.pk, prediction_setting.pk, limit=limit) + dispatch(current_user, predict, pw.pk, prediction_setting.pk, limit=limit) return redirect(pw.url) @@ -1812,6 +1876,25 @@ def package_pathway(request, package_uuid, pathway_uuid): return response + if ( + request.GET.get("identify-missing-rules", False) == "true" + and request.GET.get("rule-package") is not None + ): + from .tasks import dispatch_eager, identify_missing_rules + + rule_package = PackageManager.get_package_by_url( + current_user, request.GET.get("rule-package") + ) + res = dispatch_eager( + current_user, identify_missing_rules, [current_pathway.pk], rule_package.pk + ) + + filename = f"{current_pathway.name.replace(' ', '_')}_{current_pathway.uuid}.csv" + response = HttpResponse(res, content_type="text/csv") + response["Content-Disposition"] = f'attachment; filename="{filename}"' + + return response + + # Pathway d3_json() relies on a lot of related objects (Nodes, Structures, Edges, Reaction, Rules, ...) # we will again fetch the current pathway identified by this url, but this time together with nearly all # related objects @@ -1896,10 +1979,16 @@ if node_url: n = current_pathway.get_node(node_url) - from .tasks import predict + from .tasks import dispatch, predict + + dispatch( + current_user, + predict, + current_pathway.pk, + current_pathway.setting.pk, + node_pk=n.pk, + ) - # Dont delay? 
- predict(current_pathway.pk, current_pathway.setting.pk, node_pk=n.pk) return JsonResponse({"success": current_pathway.url}) return HttpResponseBadRequest() @@ -1976,9 +2065,42 @@ def package_pathway_node(request, package_uuid, pathway_uuid, node_uuid): if request.method == "GET": is_image_request = request.GET.get("image") + is_highlight_request = request.GET.get("highlight", False) + is_highlight_reactivity = request.GET.get("highlightReactivity", False) if is_image_request: if is_image_request == "svg": - svg_data = current_node.as_svg + # TODO optimize this chain + if is_highlight_request: + # User functional groups covered by the model training data + fgs = {} + if current_pathway.setting: + if current_pathway.setting.model: + if current_pathway.setting.model.app_domain: + fgs = current_pathway.setting.model.app_domain.functional_groups + + svg_data = IndigoUtils.mol_to_svg( + current_node.default_node_label.smiles, functional_groups=fgs + ) + elif is_highlight_reactivity: + # Use reactant smarts to show all reaction sites + # set a high count to obtain a strong color + ad_data = current_node.get_app_domain_assessment_data() + fgs = {} + for t in ad_data.get("assessment", {}).get("transformations", []): + r = Rule.objects.get(url=t["rule"]["url"]) + + if isinstance(r, SimpleAmbitRule): + fgs[r.reactants_smarts] = 1000 + else: + for sr in r.srs: + fgs[sr.reactants_smarts] = 1000 + + svg_data = IndigoUtils.mol_to_svg( + current_node.default_node_label.smiles, functional_groups=fgs + ) + else: + svg_data = current_node.as_svg + return HttpResponse(svg_data, content_type="image/svg+xml") context = get_base_context(request) @@ -2640,6 +2762,24 @@ def setting(request, setting_uuid): pass +def jobs(request): + current_user = _anonymous_or_real(request) + context = get_base_context(request) + + if request.method == "GET": + context["object_type"] = "joblog" + context["breadcrumbs"] = [ + {"Home": s.SERVER_URL}, + {"Jobs": s.SERVER_URL + "/jobs"}, + ] + if 
current_user.is_superuser: + context["jobs"] = JobLog.objects.all().order_by("-created") + else: + context["jobs"] = JobLog.objects.filter(user=current_user).order_by("-created") + + return render(request, "collections/joblog.html", context) + + ########### # KETCHER # ########### diff --git a/pyproject.toml b/pyproject.toml index 961f3aa7..0dfbe118 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "django-ninja>=1.4.1", "django-oauth-toolkit>=3.0.1", "django-polymorphic>=4.1.0", - "django-stubs>=5.2.4", "enviformer", "envipy-additional-information", "envipy-ambit>=0.1.0", @@ -34,12 +33,14 @@ dependencies = [ [tool.uv.sources] enviformer = { git = "ssh://git@git.envipath.com/enviPath/enviformer.git", rev = "v0.1.2" } envipy-plugins = { git = "ssh://git@git.envipath.com/enviPath/enviPy-plugins.git", rev = "v0.1.0" } -envipy-additional-information = { git = "ssh://git@git.envipath.com/enviPath/enviPy-additional-information.git", rev = "v0.1.4"} +envipy-additional-information = { git = "ssh://git@git.envipath.com/enviPath/enviPy-additional-information.git", rev = "v0.1.7"} envipy-ambit = { git = "ssh://git@git.envipath.com/enviPath/enviPy-ambit.git" } [project.optional-dependencies] ms-login = ["msal>=1.33.0"] dev = [ + "celery-stubs==0.1.3", + "django-stubs>=5.2.4", "poethepoet>=0.37.0", "pre-commit>=4.3.0", "ruff>=0.13.3", diff --git a/static/images/UoA-Logo-Primary-RGB-Large.png b/static/images/UoA-Logo-Primary-RGB-Large.png new file mode 100644 index 00000000..29b2a10d Binary files /dev/null and b/static/images/UoA-Logo-Primary-RGB-Large.png differ diff --git a/static/images/UoA-Logo-Primary-RGB-Reversed-Large.png b/static/images/UoA-Logo-Primary-RGB-Reversed-Large.png new file mode 100644 index 00000000..7714bd63 Binary files /dev/null and b/static/images/UoA-Logo-Primary-RGB-Reversed-Large.png differ diff --git a/static/images/UoA-Logo-Primary-RGB-Reversed-Small.png b/static/images/UoA-Logo-Primary-RGB-Reversed-Small.png new 
file mode 100644 index 00000000..7c48c2de Binary files /dev/null and b/static/images/UoA-Logo-Primary-RGB-Reversed-Small.png differ diff --git a/static/images/UoA-Logo-Primary-RGB-Small.png b/static/images/UoA-Logo-Primary-RGB-Small.png new file mode 100644 index 00000000..8d9ff890 Binary files /dev/null and b/static/images/UoA-Logo-Primary-RGB-Small.png differ diff --git a/static/images/uzh-logo.svg b/static/images/uzh-logo.svg new file mode 100644 index 00000000..4fa27b88 --- /dev/null +++ b/static/images/uzh-logo.svg @@ -0,0 +1,488 @@ + + + + + + diff --git a/static/js/pps.js b/static/js/pps.js index abc92695..c0829ad1 100644 --- a/static/js/pps.js +++ b/static/js/pps.js @@ -646,8 +646,8 @@ function handleAssessmentResponse(depict_url, data) { var reactivityCentersImgSrc = null; if (data['assessment']['node'] !== undefined) { - functionalGroupsImgSrc = ""; - reactivityCentersImgSrc = "" + functionalGroupsImgSrc = ""; + reactivityCentersImgSrc = "" } else { functionalGroupsImgSrc = ""; reactivityCentersImgSrc = "" @@ -784,4 +784,4 @@ function handleAssessmentResponse(depict_url, data) { $("#appDomainAssessmentResultTable").append(res); -} \ No newline at end of file +} diff --git a/static/js/pw.js b/static/js/pw.js index a335f951..907f28e0 100644 --- a/static/js/pw.js +++ b/static/js/pw.js @@ -444,6 +444,13 @@ function serializeSVG(svgElement) { line.setAttribute("fill", style.fill); }); + svgElement.querySelectorAll("line.link_no_arrow").forEach(line => { + const style = getComputedStyle(line); + line.setAttribute("stroke", style.stroke); + line.setAttribute("stroke-width", style.strokeWidth); + line.setAttribute("fill", style.fill); + }); + const serializer = new XMLSerializer(); let svgString = serializer.serializeToString(svgElement); @@ -455,7 +462,26 @@ function serializeSVG(svgElement) { return svgString; } +function shrinkSVG(svgSelector) { + + const svg = d3.select(svgSelector); + const node = svg.node(); + + // Compute bounding box of everything inside 
the SVG + const bbox = node.getBBox(); + + const padding = 10; + svg.attr("viewBox", + `${bbox.x - padding} ${bbox.y - padding} ${bbox.width + 2 * padding} ${bbox.height + 2 * padding}` + ) + .attr("width", bbox.width + 2 * padding) + .attr("height", bbox.height + 2 * padding); + + return bbox; +} + function downloadSVG(svgElement, filename = 'chart.svg') { + shrinkSVG("#" + svgElement.id); const svgString = serializeSVG(svgElement); const blob = new Blob([svgString], {type: 'image/svg+xml;charset=utf-8'}); const url = URL.createObjectURL(blob); diff --git a/templates/actions/objects/pathway.html b/templates/actions/objects/pathway.html index 28f74443..785f6213 100644 --- a/templates/actions/objects/pathway.html +++ b/templates/actions/objects/pathway.html @@ -22,6 +22,10 @@ Download Pathway as Image {% if meta.can_edit %} +
  • + + Identify Missing Rules +
  • diff --git a/templates/collections/joblog.html b/templates/collections/joblog.html new file mode 100644 index 00000000..7075e08e --- /dev/null +++ b/templates/collections/joblog.html @@ -0,0 +1,71 @@ +{% extends "framework.html" %} +{% load static %} +{% load envipytags %} +{% block content %} + +
    +
    +
    + Jobs +
    +
    +

    + Job Logs Desc +

    + +
    + +
    +
    +
    + + + + + + + + + + + {% for job in jobs %} + + + + + + + {% if job.task_result and job.task_result|is_url == True %} + + {% elif job.task_result %} + + {% else %} + + {% endif %} + + {% endfor %} + +
    IDNameStatusQueuedDoneResult
    {{ job.task_id }}{{ job.job_name }}{{ job.status }}{{ job.created }}{{ job.done_at }}Result{{ job.task_result|slice:"40" }}...Empty
    +
    +
    + + + +
    +
    +{% endblock content %} diff --git a/templates/framework.html b/templates/framework.html index e3f75176..80c7a6d5 100644 --- a/templates/framework.html +++ b/templates/framework.html @@ -56,7 +56,7 @@ (function () { var u = "//matomo.envipath.com/"; _paq.push(['setTrackerUrl', u + 'matomo.php']); - _paq.push(['setSiteId', '10']); + _paq.push(['setSiteId', '{{ meta.site_id }}']); var d = document, g = d.createElement('script'), s = d.getElementsByTagName('script')[0]; g.async = true; g.src = u + 'matomo.js'; @@ -227,21 +227,23 @@
    diff --git a/templates/modals/collections/new_model_modal.html b/templates/modals/collections/new_model_modal.html index a996a2df..52fd1b95 100644 --- a/templates/modals/collections/new_model_modal.html +++ b/templates/modals/collections/new_model_modal.html @@ -19,113 +19,117 @@ prediction. You just need to set a name and the packages you want the object to be based on. There are multiple types of models available. For additional information have a look at our - wiki >> + wiki + >> + + + + + - -
    - - - {% for obj in meta.readable_packages %} - {% if obj.reviewed %} - - {% endif %} + {% if obj.reviewed %} + + {% endif %} {% endfor %} {% for obj in meta.readable_packages %} - {% if not obj.reviewed %} - - {% endif %} + {% if not obj.reviewed %} + + {% endif %} {% endfor %} - - - - -
    - - - - {% if meta.enabled_features.PLUGINS and additional_descriptors %} - - - - {% endif %} - - - -
    - {% if meta.enabled_features.APPLICABILITY_DOMAIN %} - -
    - -
    - - {% endif %}
    - -
    - - + + +
    + + +
    + + +
    + + +
    + + +
    + + +
    + +
    + {% if meta.enabled_features.APPLICABILITY_DOMAIN %} + +
    + +
    + + {% endif %}
    @@ -138,53 +142,47 @@ diff --git a/templates/modals/objects/evaluate_model_modal.html b/templates/modals/objects/evaluate_model_modal.html index f5e6aa70..bd263f6f 100644 --- a/templates/modals/objects/evaluate_model_modal.html +++ b/templates/modals/objects/evaluate_model_modal.html @@ -18,10 +18,10 @@ For evaluation, you need to select the packages you want to use. While the model is evaluating, you can use the model for predictions. - - - {% for obj in meta.readable_packages %} {% if obj.reviewed %} @@ -36,7 +36,16 @@ {% endif %} {% endfor %} - + + + + + + @@ -66,7 +67,7 @@ $('#set_scenario_modal_form_submit').on('click', function (e) { e.preventDefault(); if ($('#scenario-select').val().length == 0) { - $('#scenario-select').val(['']) + $('#scenario-select').val("") } $('#set_scenario_modal_form').submit(); }); diff --git a/templates/modals/objects/identify_missing_rules_modal.html b/templates/modals/objects/identify_missing_rules_modal.html new file mode 100644 index 00000000..23f2a953 --- /dev/null +++ b/templates/modals/objects/identify_missing_rules_modal.html @@ -0,0 +1,54 @@ +{% load static %} + + + diff --git a/templates/objects/composite_rule.html b/templates/objects/composite_rule.html index 663ac91b..47d7f93a 100644 --- a/templates/objects/composite_rule.html +++ b/templates/objects/composite_rule.html @@ -88,19 +88,41 @@ {% endif %} - -
    -

    - EC Numbers -

    -
    -
    -
    - + {% if rule.enzymelinks %} + +
    +

    + EC Numbers +

    -
    - +
    +
    + {% for k, v in rule.get_grouped_enzymelinks.items %} + + + {% endfor %} +
    +
    + {% endif %}
    {% endblock content %} diff --git a/templates/objects/compound.html b/templates/objects/compound.html index 3117ece6..ef9ab43b 100644 --- a/templates/objects/compound.html +++ b/templates/objects/compound.html @@ -184,7 +184,7 @@
    - {% if compound.get_pubchem_identifiers %} + {% if compound.get_pubchem_compound_identifiers %}

    @@ -194,12 +194,28 @@

    - {% for eid in compound.get_pubchem_identifiers %} + {% for eid in compound.get_pubchem_compound_identifiers %} CID{{ eid.identifier_value }} {% endfor %}
    {% endif %} + {% if compound.get_pubchem_substance_identifiers %} + +
    + {% for eid in compound.get_pubchem_substance_identifiers %} + SID{{ eid.identifier_value }} + {% endfor %} +
    + {% endif %} {% if compound.get_chebi_identifiers %}
    diff --git a/templates/objects/enzymelink.html b/templates/objects/enzymelink.html new file mode 100644 index 00000000..464af8ae --- /dev/null +++ b/templates/objects/enzymelink.html @@ -0,0 +1,105 @@ +{% extends "framework.html" %} + +{% block content %} + +
    +
    +
    + {{ enzymelink.ec_number }} +
    + + +
    +

    + Enzyme Name +

    +
    +
    +
    + {{ enzymelink.name }} +
    +
    + + +
    +

    + Linking Method +

    +
    +
    +
    + {{ enzymelink.linking_method }}.  Learn more >> +
    +
    + + {% if enzymelink.kegg_reaction_links %} + +
    +
    + {% for kl in enzymelink.kegg_reaction_links %} + {{ kl.identifier_value }} + {% endfor %} +
    +
    + {% endif %} + + {% if enzymelink.reaction_evidence.all %} + +
    +
    + {% for r in enzymelink.reaction_evidence.all %} + {{ r.name }} ({{ r.package.name }}) + {% endfor %} +
    +
    + {% endif %} + + {% if enzymelink.edge_evidence.all %} + +
    +
    + {% for e in enzymelink.edge_evidence.all %} + {{ e.pathway.name }} + ({{ e.pathway.package.name }}) + {% endfor %} +
    +
    + {% endif %} + + + + + +
    +
    +{% endblock content %} diff --git a/templates/objects/model.html b/templates/objects/model.html index 29277311..9f6347a0 100644 --- a/templates/objects/model.html +++ b/templates/objects/model.html @@ -117,7 +117,7 @@ {% endif %} - {% if model.app_domain %} + {% if model.ready_for_prediction and model.app_domain %}

    diff --git a/templates/objects/pathway.html b/templates/objects/pathway.html index 905de2cc..b07ba6e9 100644 --- a/templates/objects/pathway.html +++ b/templates/objects/pathway.html @@ -84,6 +84,7 @@ {% include "modals/objects/add_pathway_edge_modal.html" %} {% include "modals/objects/download_pathway_csv_modal.html" %} {% include "modals/objects/download_pathway_image_modal.html" %} + {% include "modals/objects/identify_missing_rules_modal.html" %} {% include "modals/objects/generic_copy_object_modal.html" %} {% include "modals/objects/edit_pathway_modal.html" %} {% include "modals/objects/generic_set_aliases_modal.html" %} @@ -178,9 +179,6 @@
    - {% if debug %} - - {% endif %} diff --git a/templates/objects/reaction.html b/templates/objects/reaction.html index d384c0ba..ddd69c2d 100644 --- a/templates/objects/reaction.html +++ b/templates/objects/reaction.html @@ -125,6 +125,23 @@
    {% endif %} + {% if reaction.get_related_enzymes %} + +
    +

    + EC Numbers +

    +
    +
    +
    + {% for e in reaction.get_related_enzymes %} + {{ e.name }} + {% endfor %} +
    +
    + {% endif %} + {% if reaction.related_pathways %}
    diff --git a/templates/objects/simple_rule.html b/templates/objects/simple_rule.html index fcaa079d..23da1aa9 100644 --- a/templates/objects/simple_rule.html +++ b/templates/objects/simple_rule.html @@ -202,6 +202,43 @@

    {% endif %} + + {% if rule.enzymelinks %} + +
    +

    + EC Numbers +

    +
    +
    +
    + {% for k, v in rule.get_grouped_enzymelinks.items %} + + + {% endfor %} +
    +
    + {% endif %}
    {% endblock content %} diff --git a/tests/test_enviformer.py b/tests/test_enviformer.py index 1a688cb1..647433fc 100644 --- a/tests/test_enviformer.py +++ b/tests/test_enviformer.py @@ -1,7 +1,27 @@ +from collections import defaultdict +from datetime import datetime from tempfile import TemporaryDirectory from django.test import TestCase, tag from epdb.logic import PackageManager -from epdb.models import User, EnviFormer, Package +from epdb.models import User, EnviFormer, Package, Setting +from epdb.tasks import predict_simple, predict + + +def measure_predict(mod, pathway_pk=None): + # Measure and return the prediction time + start = datetime.now() + if pathway_pk: + s = Setting() + s.model = mod + s.model_threshold = 0.2 + s.max_depth = 4 + s.max_nodes = 20 + s.save() + pred_result = predict.delay(pathway_pk, s.pk, limit=s.max_depth) + else: + pred_result = predict_simple.delay(mod.pk, "C1=CC=C(CSCC2=CC=CC=C2)C=C1") + _ = pred_result.get() + return round((datetime.now() - start).total_seconds(), 2) @tag("slow") @@ -28,8 +48,41 @@ class EnviFormerTest(TestCase): mod.build_dataset() mod.build_model() - mod.multigen_eval = True - mod.save() - mod.evaluate_model() + mod.evaluate_model(True, eval_packages_objs) mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C") + + def test_predict_runtime(self): + with TemporaryDirectory() as tmpdir: + with self.settings(MODEL_DIR=tmpdir): + threshold = float(0.5) + data_package_objs = [self.BBD_SUBSET] + eval_packages_objs = [self.BBD_SUBSET] + mods = [] + for _ in range(4): + mod = EnviFormer.create( + self.package, data_package_objs, eval_packages_objs, threshold=threshold + ) + mod.build_dataset() + mod.build_model() + mods.append(mod) + + # Test prediction time drops after first prediction + times = [measure_predict(mods[0]) for _ in range(5)] + print(f"First prediction took {times[0]} seconds, subsequent ones took {times[1:]}") + + # Test pathway prediction + times = [measure_predict(mods[1], self.BBD_SUBSET.pathways[0].pk) for _ in 
range(5)] + print( + f"First pathway prediction took {times[0]} seconds, subsequent ones took {times[1:]}" + ) + + # Test eviction by performing three prediction with every model, twice. + times = defaultdict(list) + for _ in range( + 2 + ): # Eviction should cause the second iteration here to have to reload the models + for mod in mods: + for _ in range(3): + times[mod.pk].append(measure_predict(mod)) + print(times) diff --git a/tests/test_model.py b/tests/test_model.py index e46046ec..f0355be9 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -30,7 +30,6 @@ class ModelTest(TestCase): self.package, rule_package_objs, data_package_objs, - eval_packages_objs, threshold=threshold, name="ECC - BBD - 0.5", description="Created MLRelativeReasoning in Testcase", @@ -50,9 +49,7 @@ class ModelTest(TestCase): mod.build_dataset() mod.build_model() - mod.multigen_eval = True - mod.save() - mod.evaluate_model() + mod.evaluate_model(True, eval_packages_objs) results = mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C") diff --git a/tests/views/test_model_views.py b/tests/views/test_model_views.py index 558277f5..10cbefe2 100644 --- a/tests/views/test_model_views.py +++ b/tests/views/test_model_views.py @@ -6,7 +6,7 @@ from epdb.logic import UserManager from epdb.models import Package, User -@override_settings(MODEL_DIR=s.FIXTURE_DIRS[0] / "models") +@override_settings(MODEL_DIR=s.FIXTURE_DIRS[0] / "models", CELERY_TASK_ALWAYS_EAGER=True) class PathwayViewTest(TestCase): fixtures = ["test_fixtures_incl_model.jsonl.gz"] diff --git a/tests/views/test_pathway_views.py b/tests/views/test_pathway_views.py index 9e64e22f..b5fe99fd 100644 --- a/tests/views/test_pathway_views.py +++ b/tests/views/test_pathway_views.py @@ -6,7 +6,7 @@ from epdb.logic import UserManager, PackageManager from epdb.models import Pathway, Edge -@override_settings(MODEL_DIR=s.FIXTURE_DIRS[0] / "models") +@override_settings(MODEL_DIR=s.FIXTURE_DIRS[0] / "models", CELERY_TASK_ALWAYS_EAGER=True) class 
PathwayViewTest(TestCase): fixtures = ["test_fixtures_incl_model.jsonl.gz"] diff --git a/utilities/chem.py b/utilities/chem.py index 6de46147..250ccfb6 100644 --- a/utilities/chem.py +++ b/utilities/chem.py @@ -185,7 +185,7 @@ class FormatConverter(object): return smiles @staticmethod - def standardize(smiles, remove_stereo=False): + def standardize(smiles, remove_stereo=False, canonicalize_tautomers=False): # Taken from https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/ # follows the steps in # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb @@ -203,19 +203,21 @@ class FormatConverter(object): uncharger = ( rdMolStandardize.Uncharger() ) # annoying, but necessary as no convenience method exists - uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol) + res_mol = uncharger.uncharge(parent_clean_mol) # note that no attempt is made at reionization at this step # nor at ionization at some pH (rdkit has no pKa caculator) # the main aim to to represent all molecules from different sources # in a (single) standard way, for use in ML, catalogue, etc. 
- # te = rdMolStandardize.TautomerEnumerator() # idem - # taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol) if remove_stereo: - Chem.RemoveStereochemistry(uncharged_parent_clean_mol) + Chem.RemoveStereochemistry(res_mol) - return Chem.MolToSmiles(uncharged_parent_clean_mol, kekuleSmiles=True) + if canonicalize_tautomers: + te = rdMolStandardize.TautomerEnumerator() # idem + res_mol = te.Canonicalize(res_mol) + + return Chem.MolToSmiles(res_mol, kekuleSmiles=True) @staticmethod def neutralize_smiles(smiles): @@ -363,6 +365,76 @@ class FormatConverter(object): return parsed_smiles, errors + @staticmethod + def smiles_covered_by( + l_smiles: List[str], + r_smiles: List[str], + standardize: bool = True, + canonicalize_tautomers: bool = True, + ) -> bool: + """ + Check if all SMILES in the left list are covered by (contained in) the right list. + + This function performs a subset check to determine if every chemical structure + represented in l_smiles has a corresponding representation in r_smiles. + + Args: + l_smiles (List[str]): List of SMILES strings to check for coverage. + r_smiles (List[str]): List of SMILES strings that should contain all l_smiles. + standardize (bool, optional): Whether to standardize SMILES before comparison. + Defaults to True. When True, applies FormatConverter.standardize() to + normalize representations for accurate comparison. + canonicalize_tautomers (bool, optional): Whether to canonicalize tautomers + Defaults to False. When True, applies rdMolStandardize.TautomerEnumerator().Canonicalize(res_mol) + to the compounds before comparison. + Returns: + bool: True if all SMILES in l_smiles are found in r_smiles (i.e., l_smiles + is a subset of r_smiles), False otherwise. 
+ + Note: + - Comparison treats lists as sets, ignoring duplicates and order + - Failed standardization attempts are silently ignored (original SMILES used) + - This is a one-directional check: l_smiles ⊆ r_smiles + - For bidirectional equality, both directions must be checked separately + + Example: + >>> FormatConverter.smiles_covered_by(["CCO", "CC"], ["CCO", "CC", "CCC"]) + True + >>> FormatConverter.smiles_covered_by(["CCO", "CCCC"], ["CCO", "CC", "CCC"]) + False + """ + + standardized_l_smiles = [] + + if standardize: + for smi in l_smiles: + try: + smi = FormatConverter.standardize( + smi, canonicalize_tautomers=canonicalize_tautomers + ) + except Exception: + # :shrug: + # logger.debug(f'Standardizing SMILES failed for {smi}') + pass + standardized_l_smiles.append(smi) + else: + standardized_l_smiles = l_smiles + + standardized_r_smiles = [] + if standardize: + for smi in r_smiles: + try: + smi = FormatConverter.standardize(smi) + except Exception: + # :shrug: + # logger.debug(f'Standardizing SMILES failed for {smi}') + pass + standardized_r_smiles.append(smi) + else: + standardized_r_smiles = r_smiles + + return len(set(standardized_l_smiles).difference(set(standardized_r_smiles))) == 0 + class Standardizer(ABC): def __init__(self, name): @@ -729,6 +801,7 @@ class IndigoUtils(object): height: int = 0, educt_functional_groups: Dict[str, int] = None, product_functional_groups: Dict[str, int] = None, + debug: bool = False, ): if educt_functional_groups is None: educt_functional_groups = {} @@ -739,6 +812,11 @@ class IndigoUtils(object): i = Indigo() renderer = IndigoRenderer(i) + if debug: + i.setOption("render-atom-ids-visible", True) + i.setOption("render-bond-ids-visible", False) + i.setOption("render-atom-bond-ids-from-one", True) + i.setOption("render-output-format", "svg") i.setOption("render-coloring", True) i.setOption("render-image-size", width, height) diff --git a/utilities/misc.py b/utilities/misc.py index 3e4eeb59..0b7222f7 100644 --- 
a/utilities/misc.py +++ b/utilities/misc.py @@ -9,36 +9,37 @@ from collections import defaultdict from datetime import datetime from enum import Enum from types import NoneType -from typing import Dict, Any, List +from typing import Any, Dict, List from django.db import transaction -from envipy_additional_information import Interval, EnviPyModel -from envipy_additional_information import NAME_MAPPING +from envipy_additional_information import NAME_MAPPING, EnviPyModel, Interval from pydantic import BaseModel, HttpUrl from epdb.models import ( - Package, Compound, CompoundStructure, - SimpleRule, + Edge, + EnviFormer, + EPModel, + ExternalDatabase, + ExternalIdentifier, + License, + MLRelativeReasoning, + Node, + Package, + ParallelRule, + Pathway, + PluginModel, + Reaction, + Rule, + RuleBasedRelativeReasoning, + Scenario, + SequentialRule, SimpleAmbitRule, SimpleRDKitRule, - ParallelRule, - SequentialRule, - Reaction, - Pathway, - Node, - Edge, - Scenario, - EPModel, - MLRelativeReasoning, - RuleBasedRelativeReasoning, - EnviFormer, - PluginModel, - ExternalIdentifier, - ExternalDatabase, - License, + SimpleRule, ) +from utilities.chem import FormatConverter logger = logging.getLogger(__name__) @@ -48,7 +49,7 @@ class HTMLGenerator: @staticmethod def generate_html(additional_information: "EnviPyModel", prefix="") -> str: - from typing import get_origin, get_args, Union + from typing import Union, get_args, get_origin if isinstance(additional_information, type): clz_name = additional_information.__name__ @@ -1171,3 +1172,89 @@ class PackageImporter: url=identifier_data.get("url", ""), is_primary=identifier_data.get("is_primary", False), ) + + +class PathwayUtils: + def __init__(self, pathway: "Pathway"): + self.pathway = pathway + + @staticmethod + def _get_products(smiles: str, rules: List["Rule"]): + educt_rule_products: Dict[str, Dict[str, List[str]]] = defaultdict( + lambda: defaultdict(list) + ) + + for r in rules: + product_sets = r.apply(smiles) + for 
product_set in product_sets: + for product in product_set: + educt_rule_products[smiles][r.url].append(product) + + return educt_rule_products + + def find_missing_rules(self, rules: List["Rule"]): + print(f"Processing {self.pathway.name}") + # compute products for each node / rule combination in the pathway + educt_rule_products = defaultdict(lambda: defaultdict(list)) + + for node in self.pathway.nodes: + educt_rule_products.update(**self._get_products(node.default_node_label.smiles, rules)) + + # loop through edges and determine reactions that can't be constructed by + # any of the rules or a combination of two rules in a chained fashion + + res: Dict[str, List["Rule"]] = dict() + + for edge in self.pathway.edges: + found = False + reaction = edge.edge_label + + educts = [cs for cs in reaction.educts.all()] + products = [cs.smiles for cs in reaction.products.all()] + rule_chain = [] + + for educt in educts: + educt = educt.smiles + triggered_rules = list(educt_rule_products.get(educt, {}).keys()) + for triggered_rule in triggered_rules: + if rule_products := educt_rule_products[educt][triggered_rule]: + # check if this rule covers the reaction + if FormatConverter.smiles_covered_by( + products, rule_products, standardize=True, canonicalize_tautomers=True + ): + found = True + else: + # Check if another prediction step would cover the reaction + for product in rule_products: + prod_rule_products = self._get_products(product, rules) + prod_triggered_rules = list( + prod_rule_products.get(product, {}).keys() + ) + for prod_triggered_rule in prod_triggered_rules: + if second_step_products := prod_rule_products[product][ + prod_triggered_rule + ]: + if FormatConverter.smiles_covered_by( + products, + second_step_products, + standardize=True, + canonicalize_tautomers=True, + ): + rule_chain.append( + ( + triggered_rule, + Rule.objects.get(url=triggered_rule).name, + ) + ) + rule_chain.append( + ( + prod_triggered_rule, + Rule.objects.get(url=prod_triggered_rule).name, 
+ ) + ) + res[edge.url] = rule_chain + + if not found: + res[edge.url] = rule_chain + + return res