diff --git a/epdb/admin.py b/epdb/admin.py
index fefcdc32..1f251cc1 100644
--- a/epdb/admin.py
+++ b/epdb/admin.py
@@ -7,6 +7,7 @@ from .models import (
     GroupPackagePermission,
     Package,
     MLRelativeReasoning,
+    EnviFormer,
     Compound,
     CompoundStructure,
     SimpleAmbitRule,
@@ -50,6 +51,10 @@ class MLRelativeReasoningAdmin(EPAdmin):
     pass


+class EnviFormerAdmin(EPAdmin):
+    pass
+
+
 class CompoundAdmin(EPAdmin):
     pass

@@ -104,6 +109,7 @@ admin.site.register(Group, GroupAdmin)
 admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin)
 admin.site.register(Package, PackageAdmin)
 admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin)
+admin.site.register(EnviFormer, EnviFormerAdmin)
 admin.site.register(Compound, CompoundAdmin)
 admin.site.register(CompoundStructure, CompoundStructureAdmin)
 admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin)
diff --git a/epdb/management/commands/create_ml_models.py b/epdb/management/commands/create_ml_models.py
index 8cf3fd55..6c59141d 100644
--- a/epdb/management/commands/create_ml_models.py
+++ b/epdb/management/commands/create_ml_models.py
@@ -7,10 +7,11 @@ from epdb.models import MLRelativeReasoning, EnviFormer, Package

 class Command(BaseCommand):
     """This command can be run with
-    `python manage.py create_ml_models [model_names] -d [data_packages] OPTIONAL: -e [eval_packages]`
-    For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE
-    the below command would be used:
-    `python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge
+    `python manage.py create_ml_models [model_names] -d [data_packages] FOR MLRR ONLY: -r [rule_packages]
+    OPTIONAL: -e [eval_packages] -t [threshold]`
+    For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE with a
+    threshold of 0.6, run:
+    `python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge -t 0.6`
     """

     def add_arguments(self, parser):
@@ -34,6 +35,13 @@ class Command(BaseCommand):
             help="Rule Packages mandatory for MLRR",
             default=[],
         )
+        parser.add_argument(
+            "-t",
+            "--threshold",
+            type=float,
+            help="Model prediction threshold",
+            default=0.5,
+        )

     @transaction.atomic
     def handle(self, *args, **options):
@@ -67,7 +75,11 @@ class Command(BaseCommand):
             return packages

         # Iteratively create models in options["model_names"]
-        print(f"Creating models: {options['model_names']}")
+        print(f"Creating models: {options['model_names']}\n"
+              f"Data packages: {options['data_packages']}\n"
+              f"Rule packages (MLRR only): {options['rule_packages']}\n"
+              f"Eval packages: {options['eval_packages']}\n"
+              f"Threshold: {options['threshold']:.2f}")
         data_packages = decode_packages(options["data_packages"])
         eval_packages = decode_packages(options["eval_packages"])
         rule_packages = decode_packages(options["rule_packages"])
@@ -78,9 +90,10 @@ class Command(BaseCommand):
                 pack,
                 data_packages=data_packages,
                 eval_packages=eval_packages,
-                threshold=0.5,
-                name="EnviFormer - T0.5",
-                description="EnviFormer transformer",
+                threshold=options['threshold'],
+                name=f"EnviFormer - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
+                description=f"EnviFormer transformer trained on {', '.join(options['data_packages'])} "
+                            f"and evaluated on {', '.join(options['eval_packages'])}.",
             )
         elif model_name == "mlrr":
             model = MLRelativeReasoning.create(
@@ -88,9 +101,10 @@ class Command(BaseCommand):
                 pack,
                 rule_packages=rule_packages,
                 data_packages=data_packages,
                 eval_packages=eval_packages,
-                threshold=0.5,
-                name="ECC - BBD - T0.5",
-                description="ML Relative Reasoning",
+                threshold=options['threshold'],
+                name=f"ECC - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
+                description=f"ML Relative Reasoning trained on {', '.join(options['data_packages'])} with rules from "
+                            f"{', '.join(options['rule_packages'])} and evaluated on {', '.join(options['eval_packages'])}.",
             )
         else:
             raise ValueError(f"Cannot create model of type {model_name}, unknown model type")
diff --git a/epdb/models.py b/epdb/models.py
index 998503e2..33a0b89b 100644
--- a/epdb/models.py
+++ b/epdb/models.py
@@ -3092,6 +3092,7 @@ class EnviFormer(PackageBasedModel):
         start = datetime.now()

         # Standardise reactions for the training data, EnviFormer ignores stereochemistry currently
+        co2 = {"C(=O)=O", "O=C=O"}  # the two SMILES spellings of CO2 the standardiser may emit
         ds = []
         for reaction in self._get_reactions():
             educts = ".".join(
@@ -3106,7 +3107,8 @@ class EnviFormer(PackageBasedModel):
                     for smile in reaction.products.all()
                 ]
             )
-            ds.append(f"{educts}>>{products}")
+            if products not in co2:  # skip reactions whose sole product is CO2
+                ds.append(f"{educts}>>{products}")

         end = datetime.now()
         logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds")
@@ -3302,7 +3304,7 @@ class EnviFormer(PackageBasedModel):
         ds = self.load_dataset()

         n_splits = 20
-        shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42)
+        shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)

         # Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models
         # this helps reduce the memory footprint.
@@ -3370,7 +3372,7 @@ class EnviFormer(PackageBasedModel):
         # Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each
         # iteration instead of storing all trained models.
         for split_id, (train, test) in enumerate(
-            ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42).split(pathways)
+            ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42).split(pathways)
         ):
             train_pathways = [pathways[i] for i in train]
             test_pathways = [pathways[i] for i in test]
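A note on the CO2 filter added in `EnviFormer.build_dataset`: `products not in co2` is a plain string comparison, so it only drops a reaction when the joined product string is exactly one of the two listed spellings, both of which denote the same molecule. Below is a minimal sketch of a canonicalisation-based alternative, assuming RDKit is available in the environment; the helper name `is_co2_only` is hypothetical and not part of this diff.

```python
from rdkit import Chem

# Canonical SMILES of CO2 under RDKit's default canonicalisation ("O=C=O").
CO2_CANONICAL = Chem.CanonSmiles("O=C=O")

def is_co2_only(products: str) -> bool:
    """Return True when the '.'-joined product SMILES string is just CO2.

    Canonicalising first makes the check independent of which spelling
    ("C(=O)=O", "O=C=O", ...) the upstream standardiser happens to emit.
    """
    try:
        return Chem.CanonSmiles(products) == CO2_CANONICAL
    except Exception:
        # Unparsable SMILES: keep the reaction rather than silently drop it.
        return False
```

The string-set check in the diff is cheaper and avoids touching RDKit in this loop; the trade-off is that any third spelling of CO2 would slip through.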
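On the evaluation change, moving `test_size` from 0.25 to 0.1 means each of the 20 `ShuffleSplit` iterations now holds out 10% of the data, and because the splits are drawn independently the same item can appear in several test sets, unlike disjoint K-fold partitions. A self-contained sketch with toy data, not the project's dataset:

```python
from sklearn.model_selection import ShuffleSplit

# Toy stand-in for the reaction list; only its length matters to the splitter.
ds = [f"rxn_{i}" for i in range(100)]

shuff = ShuffleSplit(n_splits=20, test_size=0.1, random_state=42)
for split_id, (train, test) in enumerate(shuff.split(ds)):
    # Each split is an independent 90/10 partition; test sets from
    # different splits may overlap.
    assert len(train) == 90 and len(test) == 10
```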