forked from enviPath/enviPy
[Enhancement] Create ML Models (#173)
## Changes

- Ability to change the threshold from a command line argument.
- Names of data packages included in model name
- Names of data, rule and eval packages included in the model description
- EnviFormer models are now viewable on the admin site
- Ignore CO2 for training and evaluating EnviFormer

Co-authored-by: Liam Brydon <62733830+MyCreativityOutlet@users.noreply.github.com>
Reviewed-on: enviPath/enviPy#173
Reviewed-by: jebus <lorsbach@envipath.com>
Co-authored-by: liambrydon <lbry121@aucklanduni.ac.nz>
Co-committed-by: liambrydon <lbry121@aucklanduni.ac.nz>
This commit is contained in:
@ -7,6 +7,7 @@ from .models import (
|
|||||||
GroupPackagePermission,
|
GroupPackagePermission,
|
||||||
Package,
|
Package,
|
||||||
MLRelativeReasoning,
|
MLRelativeReasoning,
|
||||||
|
EnviFormer,
|
||||||
Compound,
|
Compound,
|
||||||
CompoundStructure,
|
CompoundStructure,
|
||||||
SimpleAmbitRule,
|
SimpleAmbitRule,
|
||||||
@ -50,6 +51,10 @@ class MLRelativeReasoningAdmin(EPAdmin):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class EnviFormerAdmin(EPAdmin):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class CompoundAdmin(EPAdmin):
|
class CompoundAdmin(EPAdmin):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -104,6 +109,7 @@ admin.site.register(Group, GroupAdmin)
|
|||||||
admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin)
|
admin.site.register(GroupPackagePermission, GroupPackagePermissionAdmin)
|
||||||
admin.site.register(Package, PackageAdmin)
|
admin.site.register(Package, PackageAdmin)
|
||||||
admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin)
|
admin.site.register(MLRelativeReasoning, MLRelativeReasoningAdmin)
|
||||||
|
admin.site.register(EnviFormer, EnviFormerAdmin)
|
||||||
admin.site.register(Compound, CompoundAdmin)
|
admin.site.register(Compound, CompoundAdmin)
|
||||||
admin.site.register(CompoundStructure, CompoundStructureAdmin)
|
admin.site.register(CompoundStructure, CompoundStructureAdmin)
|
||||||
admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin)
|
admin.site.register(SimpleAmbitRule, SimpleAmbitRuleAdmin)
|
||||||
|
|||||||
@ -7,10 +7,11 @@ from epdb.models import MLRelativeReasoning, EnviFormer, Package
|
|||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
"""This command can be run with
|
"""This command can be run with
|
||||||
`python manage.py create_ml_models [model_names] -d [data_packages] OPTIONAL: -e [eval_packages]`
|
`python manage.py create_ml_models [model_names] -d [data_packages] FOR MLRR ONLY: -r [rule_packages]
|
||||||
For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE
|
OPTIONAL: -e [eval_packages] -t threshold`
|
||||||
the below command would be used:
|
For example, to train both EnviFormer and MLRelativeReasoning on BBD and SOIL and evaluate them on SLUDGE with a
|
||||||
`python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge
|
threshold of 0.6, the below command would be used:
|
||||||
|
`python manage.py create_ml_models enviformer mlrr -d bbd soil -e sludge -t 0.6
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def add_arguments(self, parser):
|
def add_arguments(self, parser):
|
||||||
@ -34,6 +35,13 @@ class Command(BaseCommand):
|
|||||||
help="Rule Packages mandatory for MLRR",
|
help="Rule Packages mandatory for MLRR",
|
||||||
default=[],
|
default=[],
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-t",
|
||||||
|
"--threshold",
|
||||||
|
type=float,
|
||||||
|
help="Model prediction threshold",
|
||||||
|
default=0.5,
|
||||||
|
)
|
||||||
|
|
||||||
@transaction.atomic
|
@transaction.atomic
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
@ -67,7 +75,11 @@ class Command(BaseCommand):
|
|||||||
return packages
|
return packages
|
||||||
|
|
||||||
# Iteratively create models in options["model_names"]
|
# Iteratively create models in options["model_names"]
|
||||||
print(f"Creating models: {options['model_names']}")
|
print(f"Creating models: {options['model_names']}\n"
|
||||||
|
f"Data packages: {options['data_packages']}\n"
|
||||||
|
f"Rule Packages (only for MLRR): {options['rule_packages']}\n"
|
||||||
|
f"Eval Packages: {options['eval_packages']}\n"
|
||||||
|
f"Threshold: {options['threshold']:.2f}")
|
||||||
data_packages = decode_packages(options["data_packages"])
|
data_packages = decode_packages(options["data_packages"])
|
||||||
eval_packages = decode_packages(options["eval_packages"])
|
eval_packages = decode_packages(options["eval_packages"])
|
||||||
rule_packages = decode_packages(options["rule_packages"])
|
rule_packages = decode_packages(options["rule_packages"])
|
||||||
@ -78,9 +90,10 @@ class Command(BaseCommand):
|
|||||||
pack,
|
pack,
|
||||||
data_packages=data_packages,
|
data_packages=data_packages,
|
||||||
eval_packages=eval_packages,
|
eval_packages=eval_packages,
|
||||||
threshold=0.5,
|
threshold=options['threshold'],
|
||||||
name="EnviFormer - T0.5",
|
name=f"EnviFormer - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
|
||||||
description="EnviFormer transformer",
|
description=f"EnviFormer transformer trained on {options['data_packages']} "
|
||||||
|
f"evaluated on {options['eval_packages']}.",
|
||||||
)
|
)
|
||||||
elif model_name == "mlrr":
|
elif model_name == "mlrr":
|
||||||
model = MLRelativeReasoning.create(
|
model = MLRelativeReasoning.create(
|
||||||
@ -88,9 +101,10 @@ class Command(BaseCommand):
|
|||||||
rule_packages=rule_packages,
|
rule_packages=rule_packages,
|
||||||
data_packages=data_packages,
|
data_packages=data_packages,
|
||||||
eval_packages=eval_packages,
|
eval_packages=eval_packages,
|
||||||
threshold=0.5,
|
threshold=options['threshold'],
|
||||||
name="ECC - BBD - T0.5",
|
name=f"ECC - {', '.join(options['data_packages'])} - T{options['threshold']:.2f}",
|
||||||
description="ML Relative Reasoning",
|
description=f"ML Relative Reasoning trained on {options['data_packages']} with rules from "
|
||||||
|
f"{options['rule_packages']} and evaluated on {options['eval_packages']}.",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Cannot create model of type {model_name}, unknown model type")
|
raise ValueError(f"Cannot create model of type {model_name}, unknown model type")
|
||||||
|
|||||||
@ -3092,6 +3092,7 @@ class EnviFormer(PackageBasedModel):
|
|||||||
|
|
||||||
start = datetime.now()
|
start = datetime.now()
|
||||||
# Standardise reactions for the training data, EnviFormer ignores stereochemistry currently
|
# Standardise reactions for the training data, EnviFormer ignores stereochemistry currently
|
||||||
|
co2 = {"C(=O)=O", "O=C=O"}
|
||||||
ds = []
|
ds = []
|
||||||
for reaction in self._get_reactions():
|
for reaction in self._get_reactions():
|
||||||
educts = ".".join(
|
educts = ".".join(
|
||||||
@ -3106,6 +3107,7 @@ class EnviFormer(PackageBasedModel):
|
|||||||
for smile in reaction.products.all()
|
for smile in reaction.products.all()
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
if products not in co2:
|
||||||
ds.append(f"{educts}>>{products}")
|
ds.append(f"{educts}>>{products}")
|
||||||
|
|
||||||
end = datetime.now()
|
end = datetime.now()
|
||||||
@ -3302,7 +3304,7 @@ class EnviFormer(PackageBasedModel):
|
|||||||
|
|
||||||
ds = self.load_dataset()
|
ds = self.load_dataset()
|
||||||
n_splits = 20
|
n_splits = 20
|
||||||
shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42)
|
shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)
|
||||||
|
|
||||||
# Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models
|
# Single gen eval is done in one loop of train then evaluate rather than storing all n_splits trained models
|
||||||
# this helps reduce the memory footprint.
|
# this helps reduce the memory footprint.
|
||||||
@ -3370,7 +3372,7 @@ class EnviFormer(PackageBasedModel):
|
|||||||
# Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each
|
# Compute splits of the collected pathway and evaluate. Like single gen we train and evaluate in each
|
||||||
# iteration instead of storing all trained models.
|
# iteration instead of storing all trained models.
|
||||||
for split_id, (train, test) in enumerate(
|
for split_id, (train, test) in enumerate(
|
||||||
ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42).split(pathways)
|
ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42).split(pathways)
|
||||||
):
|
):
|
||||||
train_pathways = [pathways[i] for i in train]
|
train_pathways = [pathways[i] for i in train]
|
||||||
test_pathways = [pathways[i] for i in test]
|
test_pathways = [pathways[i] for i in test]
|
||||||
|
|||||||
Reference in New Issue
Block a user