[Feature] ML model caching for reducing prediction overhead (#156)

The caching is now finished. The cache is created in `settings.py` giving us the most flexibility for using it in the future.

The cache is currently updated/accessed by `tasks.py/get_ml_model` which can be called from whatever task needs to access ml models in this way (currently, `predict` and `predict_simple`).

This implementation currently caches all ML models, including the relative reasoning ones. If we don't want this and only want to cache enviFormer, I can change it to that. However, I don't think there is any harm in having the other models be cached as well.

Co-authored-by: Liam Brydon <62733830+MyCreativityOutlet@users.noreply.github.com>
Reviewed-on: enviPath/enviPy#156
Co-authored-by: liambrydon <lbry121@aucklanduni.ac.nz>
Co-committed-by: liambrydon <lbry121@aucklanduni.ac.nz>
This commit is contained in:
2025-10-16 08:58:36 +13:00
committed by jebus
parent d5ebb23622
commit 376fd65785
3 changed files with 69 additions and 6 deletions

View File

@@ -1,7 +1,27 @@
from collections import defaultdict
from datetime import datetime
from tempfile import TemporaryDirectory
from django.test import TestCase, tag
from epdb.logic import PackageManager
from epdb.models import User, EnviFormer, Package
from epdb.models import User, EnviFormer, Package, Setting, Pathway
from epdb.tasks import predict_simple, predict
def measure_predict(mod, pathway_pk=None):
    """Run one prediction with *mod* and return its wall-clock duration.

    If *pathway_pk* is given, a fresh ``Setting`` is created and a full
    pathway prediction task is dispatched; otherwise a simple single-SMILES
    prediction is used.

    :param mod: the ML model instance to predict with (e.g. an EnviFormer).
    :param pathway_pk: optional primary key of a pathway to predict on.
    :return: elapsed time in seconds, rounded to two decimal places.
    """
    start = datetime.now()
    # Explicit None check: a falsy-but-valid pk (e.g. 0) must still take
    # the pathway-prediction branch, which plain truthiness would miss.
    if pathway_pk is not None:
        s = Setting()
        s.model = mod
        s.model_threshold = 0.2
        s.max_depth = 4
        s.max_nodes = 20
        s.save()
        pred_result = predict.delay(pathway_pk, s.pk, limit=s.max_depth)
    else:
        pred_result = predict_simple.delay(mod.pk, "C1=CC=C(CSCC2=CC=CC=C2)C=C1")
    # Block until the task finishes so we time the full round trip.
    _ = pred_result.get()
    return round((datetime.now() - start).total_seconds(), 2)
@tag("slow")
@@ -33,3 +53,34 @@ class EnviFormerTest(TestCase):
mod.evaluate_model()
mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")
def test_predict_runtime(self):
    """Check that model caching reduces repeat-prediction time and that
    cache eviction forces models to be reloaded.

    Builds several models in a throwaway MODEL_DIR, then measures:
    1. repeated simple predictions with one model (first call should be
       the slow, cache-cold one),
    2. repeated pathway predictions,
    3. interleaved predictions across all models to trigger eviction.
    Timings are printed for manual inspection rather than asserted, since
    absolute runtimes are environment-dependent.
    """
    with TemporaryDirectory() as tmpdir:
        with self.settings(MODEL_DIR=tmpdir):
            threshold = 0.5  # literal is already a float; no cast needed
            data_package_objs = [self.BBD_SUBSET]
            eval_packages_objs = [self.BBD_SUBSET]
            mods = []
            # Build several models so the eviction scenario below can
            # overflow the cache.
            for _ in range(4):
                mod = EnviFormer.create(
                    self.package, data_package_objs, eval_packages_objs, threshold=threshold
                )
                mod.build_dataset()
                mod.build_model()
                mods.append(mod)
            # Test prediction time drops after the first (cache-cold) prediction
            times = [measure_predict(mods[0]) for _ in range(5)]
            print(f"First prediction took {times[0]} seconds, subsequent ones took {times[1:]}")
            # Test pathway prediction
            times = [measure_predict(mods[1], self.BBD_SUBSET.pathways[0].pk) for _ in range(5)]
            print(f"First pathway prediction took {times[0]} seconds, subsequent ones took {times[1:]}")
            # Test eviction by performing three predictions with every model, twice.
            times = defaultdict(list)
            for _ in range(2):  # Eviction should force the second iteration to reload the models
                for mod in mods:
                    for _ in range(3):
                        times[mod.pk].append(measure_predict(mod))
            print(times)