New RuleBasedDataset and EnviFormerDataset classes, each working with its respective model #120

This commit is contained in:
Liam Brydon
2025-11-04 10:58:16 +13:00
parent ff51e48f90
commit ac5d370b18
5 changed files with 126 additions and 101 deletions

View File

@ -1,8 +1,9 @@
import os.path
from tempfile import TemporaryDirectory
from django.test import TestCase
from epdb.logic import PackageManager
from epdb.models import Reaction, Compound, User, Rule, Package
from utilities.ml import RuleBasedDataset
from utilities.ml import RuleBasedDataset, EnviFormerDataset
class DatasetTest(TestCase):
@ -45,11 +46,11 @@ class DatasetTest(TestCase):
def test_generate_dataset(self):
"""Test generating dataset does not crash"""
self.generate_dataset()
self.generate_rule_dataset()
def test_indexing(self):
"""Test indexing a few different ways to check for crashes"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds[5])
print(ds[2, 5])
print(ds[3:6, 2:8])
@ -57,45 +58,45 @@ class DatasetTest(TestCase):
def test_add_rows(self):
"""Test adding one row and adding multiple rows"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
ds.add_row(list(ds.df.row(1)))
ds.add_rows([list(ds.df.row(i)) for i in range(5)])
def test_times_triggered(self):
"""Check getting times triggered for a rule id"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds.times_triggered(rules[0].uuid))
def test_block_indices(self):
"""Test the usages of _block_indices"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds.struct_features())
print(ds.triggered())
print(ds.observed())
def test_structure_id(self):
"""Check getting a structure id from row index"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds.structure_id(0))
def test_x(self):
"""Test getting X portion of the dataframe"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds.X().df.head())
def test_trig(self):
"""Test getting the triggered portion of the dataframe"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds.trig().df.head())
def test_y(self):
"""Test getting the Y portion of the dataframe"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
print(ds.y().df.head())
def test_classification_dataset(self):
"""Test making the classification dataset"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
compounds = [c.default_structure for c in Compound.objects.filter(package=self.BBD_SUBSET)]
class_ds, products = ds.classification_dataset(compounds, rules)
print(class_ds.df.head(5))
@ -103,12 +104,16 @@ class DatasetTest(TestCase):
def test_to_arff(self):
"""Test exporting the arff version of the dataset"""
ds, reactions, rules = self.generate_dataset()
ds, reactions, rules = self.generate_rule_dataset()
ds.to_arff("dataset_arff_test.arff")
def test_save_load(self):
"""Test saving and loading dataset"""
ds, reactions, rules = self.generate_dataset()
with TemporaryDirectory() as tmpdir:
ds, reactions, rules = self.generate_rule_dataset()
ds.save(os.path.join(tmpdir, "save_dataset.pkl"))
ds_loaded = RuleBasedDataset.load(os.path.join(tmpdir, "save_dataset.pkl"))
self.assertTrue(ds.df.equals(ds_loaded.df))
def test_dataset_example(self):
"""Test with a concrete example checking dataset size"""
@ -120,9 +125,19 @@ class DatasetTest(TestCase):
self.assertEqual(len(ds.y()), 1)
self.assertEqual(ds.y().df.item(), 1)
def generate_dataset(self):
def test_enviformer_dataset(self):
    """Generate an EnviFormerDataset and print the head of its X and y parts."""
    dataset, _reactions = self.generate_enviformer_dataset()
    # Fetch and print one part at a time so a failure in y() still shows X.
    features = dataset.X()
    print(features.head())
    targets = dataset.y()
    print(targets.head())
def generate_rule_dataset(self):
    """Build a RuleBasedDataset from the package stored in ``self.BBD_SUBSET``.

    Returns:
        A ``(dataset, reactions, rules)`` tuple: the generated dataset plus
        the lists of reactions and rules it was generated from.
    """
    package = self.BBD_SUBSET
    # Materialise both querysets as lists before handing them to the dataset.
    package_reactions = list(Reaction.objects.filter(package=package))
    package_rules = list(Rule.objects.filter(package=package))
    dataset = RuleBasedDataset.generate_dataset(package_reactions, package_rules)
    return dataset, package_reactions, package_rules
def generate_enviformer_dataset(self):
    """Build an EnviFormerDataset from the package stored in ``self.BBD_SUBSET``.

    Returns:
        A ``(dataset, reactions)`` tuple: the generated dataset plus the
        list of reactions it was generated from.
    """
    package_reactions = list(Reaction.objects.filter(package=self.BBD_SUBSET))
    dataset = EnviFormerDataset.generate_dataset(package_reactions)
    return dataset, package_reactions

View File

@ -50,7 +50,7 @@ class EnviFormerTest(TestCase):
mod.build_model()
mod.multigen_eval = True
mod.save()
mod.evaluate_model()
mod.evaluate_model(n_splits=2)
mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")

View File

@ -4,7 +4,7 @@ import numpy as np
from django.test import TestCase
from epdb.logic import PackageManager
from epdb.models import User, MLRelativeReasoning, Package
from epdb.models import User, MLRelativeReasoning, Package, RuleBasedRelativeReasoning
class ModelTest(TestCase):
@ -17,7 +17,7 @@ class ModelTest(TestCase):
cls.package = PackageManager.create_package(cls.user, "Anon Test Package", "No Desc")
cls.BBD_SUBSET = Package.objects.get(name="Fixtures")
def test_smoke(self):
def test_mlrr(self):
with TemporaryDirectory() as tmpdir:
with self.settings(MODEL_DIR=tmpdir):
threshold = float(0.5)
@ -36,23 +36,11 @@ class ModelTest(TestCase):
description="Created MLRelativeReasoning in Testcase",
)
# mod = RuleBasedRelativeReasoning.create(
# self.package,
# rule_package_objs,
# data_package_objs,
# eval_packages_objs,
# threshold=threshold,
# min_count=5,
# max_count=0,
# name='ECC - BBD - 0.5',
# description='Created MLRelativeReasoning in Testcase',
# )
mod.build_dataset()
mod.build_model()
mod.multigen_eval = True
mod.save()
mod.evaluate_model()
mod.evaluate_model(n_splits=2)
results = mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")
@ -73,3 +61,32 @@ class ModelTest(TestCase):
# from pprint import pprint
# pprint(mod.eval_results)
def test_rbrr(self):
    """Smoke-test the RuleBasedRelativeReasoning pipeline end to end.

    Creates a model that uses ``self.BBD_SUBSET`` as its rule, data and
    evaluation package, then builds the dataset, builds the model,
    evaluates it with two splits and runs a single prediction. The test
    makes no assertions; it passes as long as no step raises.
    """
    # MODEL_DIR is redirected to a temporary directory for the whole test.
    with TemporaryDirectory() as tmpdir, self.settings(MODEL_DIR=tmpdir):
        threshold = 0.5

        rule_package_objs = [self.BBD_SUBSET]
        data_package_objs = [self.BBD_SUBSET]
        eval_packages_objs = [self.BBD_SUBSET]

        mod = RuleBasedRelativeReasoning.create(
            self.package,
            rule_package_objs,
            data_package_objs,
            eval_packages_objs,
            threshold=threshold,
            min_count=5,
            max_count=0,
            name="ECC - BBD - 0.5",
            # Previously said "MLRelativeReasoning" (copy-paste from the
            # MLRR test); the description now names the model built here.
            description="Created RuleBasedRelativeReasoning in Testcase",
        )

        mod.build_dataset()
        mod.build_model()
        mod.multigen_eval = True
        mod.save()
        mod.evaluate_model(n_splits=2)

        # The prediction result is not inspected; the call must just succeed.
        mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")