enviPy-bayer/tests/test_enviformer.py
liambrydon 376fd65785 [Feature] ML model caching for reducing prediction overhead (#156)
The caching is now finished. The cache is created in `settings.py`, which gives us the most flexibility for how it is used in the future.
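For illustration only, a minimal sketch of what such a settings-level cache could look like, assuming an LRU-style container keyed by model primary key (the names `ML_MODEL_CACHE` and `ML_MODEL_CACHE_SIZE` are hypothetical, not the actual enviPy settings):

```python
# settings.py (hypothetical sketch)
from collections import OrderedDict

# Maximum number of ML models kept in memory at once.
ML_MODEL_CACHE_SIZE = 2

# Process-wide container holding loaded models keyed by primary key;
# an OrderedDict makes least-recently-used eviction straightforward.
ML_MODEL_CACHE = OrderedDict()
```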

The cache is currently updated and accessed through `get_ml_model` in `tasks.py`, which can be called from any task that needs to access ML models in this way (currently `predict` and `predict_simple`).
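Purely as a sketch building on the hypothetical settings above, such a helper could follow a standard LRU pattern; `_load_model` below is a placeholder for the real (expensive) loading step, not an actual enviPy function:

```python
# tasks.py (hypothetical sketch)
from django.conf import settings


def _load_model(model_pk):
    # Placeholder for the real loading logic (e.g. reading weights from
    # MODEL_DIR); this is the expensive step the cache avoids repeating.
    raise NotImplementedError


def get_ml_model(model_pk):
    """Return a loaded ML model, reusing a cached copy when possible."""
    cache = settings.ML_MODEL_CACHE
    if model_pk in cache:
        cache.move_to_end(model_pk)  # mark as most recently used
        return cache[model_pk]

    model = _load_model(model_pk)  # pay the loading cost only on a miss

    # Evict the least recently used model(s) once the cache is full.
    while len(cache) >= settings.ML_MODEL_CACHE_SIZE:
        cache.popitem(last=False)

    cache[model_pk] = model
    return model
```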

This implementation currently caches all ML models, including the relative reasoning ones. If we don't want that and only want to cache enviFormer, I can change it accordingly. However, I don't think there is any harm in caching the other models as well.

Co-authored-by: Liam Brydon <62733830+MyCreativityOutlet@users.noreply.github.com>
Reviewed-on: enviPath/enviPy#156
Co-authored-by: liambrydon <lbry121@aucklanduni.ac.nz>
Co-committed-by: liambrydon <lbry121@aucklanduni.ac.nz>
2025-10-16 08:58:36 +13:00

from collections import defaultdict
from datetime import datetime
from tempfile import TemporaryDirectory

from django.test import TestCase, tag

from epdb.logic import PackageManager
from epdb.models import User, EnviFormer, Package, Setting, Pathway
from epdb.tasks import predict_simple, predict


def measure_predict(mod, pathway_pk=None):
    # Measure and return the prediction time in seconds
    start = datetime.now()
    if pathway_pk:
        s = Setting()
        s.model = mod
        s.model_threshold = 0.2
        s.max_depth = 4
        s.max_nodes = 20
        s.save()
        pred_result = predict.delay(pathway_pk, s.pk, limit=s.max_depth)
    else:
        pred_result = predict_simple.delay(mod.pk, "C1=CC=C(CSCC2=CC=CC=C2)C=C1")
    _ = pred_result.get()
    return round((datetime.now() - start).total_seconds(), 2)


@tag("slow")
class EnviFormerTest(TestCase):
    fixtures = ["test_fixtures.jsonl.gz"]

    @classmethod
    def setUpClass(cls):
        super(EnviFormerTest, cls).setUpClass()
        cls.user = User.objects.get(username="anonymous")
        cls.package = PackageManager.create_package(cls.user, "Anon Test Package", "No Desc")
        cls.BBD_SUBSET = Package.objects.get(name="Fixtures")

    def test_model_flow(self):
        """Test the full flow of EnviFormer: dataset build -> model finetune -> model evaluate -> model inference"""
        with TemporaryDirectory() as tmpdir:
            with self.settings(MODEL_DIR=tmpdir):
                threshold = 0.5
                data_package_objs = [self.BBD_SUBSET]
                eval_packages_objs = [self.BBD_SUBSET]
                mod = EnviFormer.create(
                    self.package, data_package_objs, eval_packages_objs, threshold=threshold
                )
                mod.build_dataset()
                mod.build_model()
                mod.multigen_eval = True
                mod.save()
                mod.evaluate_model()
                mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")

    def test_predict_runtime(self):
        with TemporaryDirectory() as tmpdir:
            with self.settings(MODEL_DIR=tmpdir):
                threshold = 0.5
                data_package_objs = [self.BBD_SUBSET]
                eval_packages_objs = [self.BBD_SUBSET]
                mods = []
                for _ in range(4):
                    mod = EnviFormer.create(
                        self.package, data_package_objs, eval_packages_objs, threshold=threshold
                    )
                    mod.build_dataset()
                    mod.build_model()
                    mods.append(mod)

                # Test that prediction time drops after the first prediction
                times = [measure_predict(mods[0]) for _ in range(5)]
                print(f"First prediction took {times[0]} seconds, subsequent ones took {times[1:]}")

                # Test pathway prediction
                times = [measure_predict(mods[1], self.BBD_SUBSET.pathways[0].pk) for _ in range(5)]
                print(f"First pathway prediction took {times[0]} seconds, subsequent ones took {times[1:]}")

                # Test eviction by performing three predictions with every model, twice.
                times = defaultdict(list)
                for _ in range(2):  # Eviction should force the second iteration to reload the models
                    for mod in mods:
                        for _ in range(3):
                            times[mod.pk].append(measure_predict(mod))
                print(times)