[Enhancement] Create ML Models (#173)

## Changes

- Ability to change the threshold from a command line argument.
- Names of data packages included in model name
- Names of data, rule and eval packages included in the model description
- EnviFormer models are now viewable on the admin site
- Ignore CO2 for training and evaluating EnviFormer

Co-authored-by: Liam Brydon <62733830+MyCreativityOutlet@users.noreply.github.com>
Reviewed-on: enviPath/enviPy#173
Reviewed-by: jebus <lorsbach@envipath.com>
Co-authored-by: liambrydon <lbry121@aucklanduni.ac.nz>
Co-committed-by: liambrydon <lbry121@aucklanduni.ac.nz>
2025-10-23 06:20:22 +13:00
parent 8fda2577ee
commit 551cfc7768
3 changed files with 36 additions and 14 deletions
--- a/epdb/admin.py
+++ b/epdb/admin.py
@ -7,6 +7,7 @@ from .models import (
    GroupPackagePermission,
    Package,
    MLRelativeReasoning,
    EnviFormer,
    Compound,
    CompoundStructure,
    SimpleAmbitRule,
@ -50,6 +51,10 @@ class MLRelativeReasoningAdmin(EPAdmin):
    pass
 class EnviFormerAdmin(EPAdmin):
    pass
 class CompoundAdmin(EPAdmin):
    pass
@ -104,6 +109,7 @@ admin.site.register(Group, GroupAdmin)
 admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin)
 admin.site.register(Package, PackageAdmin)
 admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin)
 admin.site.register(EnviFormer, EnviFormerAdmin)
 admin.site.register(Compound, CompoundAdmin)
 admin.site.register(CompoundStructure, CompoundStructureAdmin)
 admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin)
--- a/epdb/management/commands/create_ml_models.py
+++ b/epdb/management/commands/create_ml_models.py
@ -7,10 +7,11 @@ from epdb.models import MLRelativeReasoning, EnviFormer, Package
 class Command(BaseCommand):
    """This command can be run with
-    `python manage.py create_ml_models [model_names] -d [data_packages] OPTIONAL: -e [eval_packages]`
+    `python manage.py create_ml_models [model_names] -d [data_packages] FOR MLRR ONLY: -r [rule_packages]
-    For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE
+                                        OPTIONAL: -e [eval_packages] -t threshold`
-    the below command would be used:
+    For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE with a
-    `python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge
+    threshold of 0.6, the below command would be used:
    `python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge -t 0.6`
    """
    def add_arguments(self, parser):
@ -34,6 +35,13 @@ class Command(BaseCommand):
            help="Rule Packages mandatory for MLRR",
            default=[],
        )
        parser.add_argument(
            "-t",
            "--threshold",
            type=float,
            help="Model prediction threshold",
            default=0.5,
        )
    @transaction.atomic
    def handle(self, *args, **options):
@ -67,7 +75,11 @@ class Command(BaseCommand):
            return packages
        # Iteratively create models in options["model_names"]
-        print(f"Creating models: {options['model_names']}")
+        print(f"Creating models: {options['model_names']}\n"
              f"Data packages: {options['data_packages']}\n"
              f"Rule Packages (only for MLRR): {options['rule_packages']}\n"
              f"Eval Packages: {options['eval_packages']}\n"
              f"Threshold: {options['threshold']:.2f}")
        data_packages = decode_packages(options["data_packages"])
        eval_packages = decode_packages(options["eval_packages"])
        rule_packages = decode_packages(options["rule_packages"])
@ -78,9 +90,10 @@ class Command(BaseCommand):
                    pack,
                    data_packages=data_packages,
                    eval_packages=eval_packages,
-                    threshold=0.5,
+                    threshold=options['threshold'],
-                    name="EnviFormer - T0.5",
+                    name=f"EnviFormer - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
-                    description="EnviFormer transformer",
+                    description=f"EnviFormer transformer trained on {options['data_packages']} "
                                f"evaluated on {options['eval_packages']}.",
                )
            elif model_name == "mlrr":
                model = MLRelativeReasoning.create(
@ -88,9 +101,10 @@ class Command(BaseCommand):
                    rule_packages=rule_packages,
                    data_packages=data_packages,
                    eval_packages=eval_packages,
-                    threshold=0.5,
+                    threshold=options['threshold'],
-                    name="ECC - BBD - T0.5",
+                    name=f"ECC - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
-                    description="ML Relative Reasoning",
+                    description=f"ML Relative Reasoning trained on {options['data_packages']} with rules from "
                                f"{options['rule_packages']} and evaluated on {options['eval_packages']}.",
                )
            else:
                raise ValueError(f"Cannot create model of type {model_name}, unknown model type")
--- a/epdb/models.py
+++ b/epdb/models.py
@ -3092,6 +3092,7 @@ class EnviFormer(PackageBasedModel):
        start = datetime.now()
        # Standardise reactions for the training data, EnviFormer ignores stereochemistry currently
        co2 = {"C(=O)=O", "O=C=O"}
        ds = []
        for reaction in self._get_reactions():
            educts = ".".join(
@ -3106,7 +3107,8 @@ class EnviFormer(PackageBasedModel):
                    for smile in reaction.products.all()
                ]
            )
-            ds.append(f"{educts}>>{products}")
+            if products not in co2:
                ds.append(f"{educts}>>{products}")
        end = datetime.now()
        logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds")
@ -3302,7 +3304,7 @@ class EnviFormer(PackageBasedModel):
            ds = self.load_dataset()
            n_splits = 20
-            shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42)
+            shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)
            # Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models
            # this helps reduce the memory footprint.
@ -3370,7 +3372,7 @@ class EnviFormer(PackageBasedModel):
                # Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each
                # iteration instead of storing all trained models.
                for split_id, (train, test) in enumerate(
-                    ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42).split(pathways)
+                    ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42).split(pathways)
                ):
                    train_pathways = [pathways[i] for i in train]
                    test_pathways = [pathways[i] for i in test]