[Enhancement] Create ML Models (#173)

## Changes

- The prediction threshold can now be set from a command-line argument (`-t` / `--threshold`).
- The names of the data packages are now included in the model name.
- The names of the data, rule and eval packages are now included in the model description.
- EnviFormer models are now viewable on the admin site.
- Reactions whose only product is CO2 are now ignored when training and evaluating EnviFormer.

Co-authored-by: Liam Brydon <62733830+MyCreativityOutlet@users.noreply.github.com>
Reviewed-on: enviPath/enviPy#173
Reviewed-by: jebus <lorsbach@envipath.com>
Co-authored-by: liambrydon <lbry121@aucklanduni.ac.nz>
Co-committed-by: liambrydon <lbry121@aucklanduni.ac.nz>
This commit is contained in:
2025-10-23 06:20:22 +13:00
committed by jebus
parent 8fda2577ee
commit 551cfc7768
3 changed files with 36 additions and 14 deletions

View File

@ -7,6 +7,7 @@ from .models import (
GroupPackagePermission, GroupPackagePermission,
Package, Package,
MLRelativeReasoning, MLRelativeReasoning,
EnviFormer,
Compound, Compound,
CompoundStructure, CompoundStructure,
SimpleAmbitRule, SimpleAmbitRule,
@ -50,6 +51,10 @@ class MLRelativeReasoningAdmin(EPAdmin):
pass pass
# Admin class for the EnviFormer model; it is registered with the admin site in this
# file via admin.site.register(EnviFormer, EnviFormerAdmin).
# The body is intentionally empty: nothing is overridden, so everything comes from EPAdmin.
class EnviFormerAdmin(EPAdmin):
pass
class CompoundAdmin(EPAdmin): class CompoundAdmin(EPAdmin):
pass pass
@ -104,6 +109,7 @@ admin.site.register(Group, GroupAdmin)
admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin) admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin)
admin.site.register(Package, PackageAdmin) admin.site.register(Package, PackageAdmin)
admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin) admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin)
admin.site.register(EnviFormer, EnviFormerAdmin)
admin.site.register(Compound, CompoundAdmin) admin.site.register(Compound, CompoundAdmin)
admin.site.register(CompoundStructure, CompoundStructureAdmin) admin.site.register(CompoundStructure, CompoundStructureAdmin)
admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin) admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin)

View File

@ -7,10 +7,11 @@ from epdb.models import MLRelativeReasoning, EnviFormer, Package
class Command(BaseCommand): class Command(BaseCommand):
"""This command can be run with """This command can be run with
`python manage.py create_ml_models [model_names] -d [data_packages] OPTIONAL: -e [eval_packages]` `python manage.py create_ml_models [model_names] -d [data_packages] FOR MLRR ONLY: -r [rule_packages]
For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE OPTIONAL: -e [eval_packages] -t threshold`
the below command would be used: For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE with a
`python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge threshold of 0.6, the below command would be used:
`python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge -t 0.6`
""" """
def add_arguments(self, parser): def add_arguments(self, parser):
@ -34,6 +35,13 @@ class Command(BaseCommand):
help="Rule Packages mandatory for MLRR", help="Rule Packages mandatory for MLRR",
default=[], default=[],
) )
parser.add_argument(
"-t",
"--threshold",
type=float,
help="Model prediction threshold",
default=0.5,
)
@transaction.atomic @transaction.atomic
def handle(self, *args, **options): def handle(self, *args, **options):
@ -67,7 +75,11 @@ class Command(BaseCommand):
return packages return packages
# Iteratively create models in options["model_names"] # Iteratively create models in options["model_names"]
print(f"Creating models: {options['model_names']}") print(f"Creating models: {options['model_names']}\n"
f"Data packages: {options['data_packages']}\n"
f"Rule Packages (only for MLRR): {options['rule_packages']}\n"
f"Eval Packages: {options['eval_packages']}\n"
f"Threshold: {options['threshold']:.2f}")
data_packages = decode_packages(options["data_packages"]) data_packages = decode_packages(options["data_packages"])
eval_packages = decode_packages(options["eval_packages"]) eval_packages = decode_packages(options["eval_packages"])
rule_packages = decode_packages(options["rule_packages"]) rule_packages = decode_packages(options["rule_packages"])
@ -78,9 +90,10 @@ class Command(BaseCommand):
pack, pack,
data_packages=data_packages, data_packages=data_packages,
eval_packages=eval_packages, eval_packages=eval_packages,
threshold=0.5, threshold=options['threshold'],
name="EnviFormer - T0.5", name=f"EnviFormer - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
description="EnviFormer transformer", description=f"EnviFormer transformer trained on {options['data_packages']} "
f"evaluated on {options['eval_packages']}.",
) )
elif model_name == "mlrr": elif model_name == "mlrr":
model = MLRelativeReasoning.create( model = MLRelativeReasoning.create(
@ -88,9 +101,10 @@ class Command(BaseCommand):
rule_packages=rule_packages, rule_packages=rule_packages,
data_packages=data_packages, data_packages=data_packages,
eval_packages=eval_packages, eval_packages=eval_packages,
threshold=0.5, threshold=options['threshold'],
name="ECC - BBD - T0.5", name=f"ECC - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
description="ML Relative Reasoning", description=f"ML Relative Reasoning trained on {options['data_packages']} with rules from "
f"{options['rule_packages']} and evaluated on {options['eval_packages']}.",
) )
else: else:
raise ValueError(f"Cannot create model of type {model_name}, unknown model type") raise ValueError(f"Cannot create model of type {model_name}, unknown model type")

View File

@ -3092,6 +3092,7 @@ class EnviFormer(PackageBasedModel):
start = datetime.now() start = datetime.now()
# Standardise reactions for the training data, EnviFormer ignores stereochemistry currently # Standardise reactions for the training data, EnviFormer ignores stereochemistry currently
co2 = {"C(=O)=O", "O=C=O"}
ds = [] ds = []
for reaction in self._get_reactions(): for reaction in self._get_reactions():
educts = ".".join( educts = ".".join(
@ -3106,7 +3107,8 @@ class EnviFormer(PackageBasedModel):
for smile in reaction.products.all() for smile in reaction.products.all()
] ]
) )
ds.append(f"{educts}>>{products}") if products not in co2:
ds.append(f"{educts}>>{products}")
end = datetime.now() end = datetime.now()
logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds") logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds")
@ -3302,7 +3304,7 @@ class EnviFormer(PackageBasedModel):
ds = self.load_dataset() ds = self.load_dataset()
n_splits = 20 n_splits = 20
shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42) shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)
# Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models # Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models
# this helps reduce the memory footprint. # this helps reduce the memory footprint.
@ -3370,7 +3372,7 @@ class EnviFormer(PackageBasedModel):
# Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each # Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each
# iteration instead of storing all trained models. # iteration instead of storing all trained models.
for split_id, (train, test) in enumerate( for split_id, (train, test) in enumerate(
ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42).split(pathways) ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42).split(pathways)
): ):
train_pathways = [pathways[i] for i in train] train_pathways = [pathways[i] for i in train]
test_pathways = [pathways[i] for i in test] test_pathways = [pathways[i] for i in test]