new RuleBasedDataset and EnviFormer dataset working for respective models #120

2025-11-04 10:58:16 +13:00
parent ff51e48f90
commit ac5d370b18
5 changed files with 126 additions and 101 deletions
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@ -1,8 +1,9 @@
+import os.path
+from tempfile import TemporaryDirectory
 from django.test import TestCase
-
 from epdb.logic import PackageManager
 from epdb.models import Reaction, Compound, User, Rule, Package
-from utilities.ml import RuleBasedDataset
+from utilities.ml import RuleBasedDataset, EnviFormerDataset


 class DatasetTest(TestCase):
@ -45,11 +46,11 @@ class DatasetTest(TestCase):

    def test_generate_dataset(self):
        """Test generating dataset does not crash"""
-        self.generate_dataset()
+        self.generate_rule_dataset()

    def test_indexing(self):
        """Test indexing a few different ways to check for crashes"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds[5])
        print(ds[2, 5])
        print(ds[3:6, 2:8])
@ -57,45 +58,45 @@ class DatasetTest(TestCase):

    def test_add_rows(self):
        """Test adding one row and adding multiple rows"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        ds.add_row(list(ds.df.row(1)))
        ds.add_rows([list(ds.df.row(i)) for i in range(5)])

    def test_times_triggered(self):
        """Check getting times triggered for a rule id"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds.times_triggered(rules[0].uuid))

    def test_block_indices(self):
        """Test the usages of _block_indices"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds.struct_features())
        print(ds.triggered())
        print(ds.observed())

    def test_structure_id(self):
        """Check getting a structure id from row index"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds.structure_id(0))

    def test_x(self):
        """Test getting X portion of the dataframe"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds.X().df.head())

    def test_trig(self):
        """Test getting the triggered portion of the dataframe"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds.trig().df.head())

    def test_y(self):
        """Test getting the Y portion of the dataframe"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        print(ds.y().df.head())

    def test_classification_dataset(self):
        """Test making the classification dataset"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        compounds = [c.default_structure for c in Compound.objects.filter(package=self.BBD_SUBSET)]
        class_ds, products = ds.classification_dataset(compounds, rules)
        print(class_ds.df.head(5))
@ -103,12 +104,16 @@ class DatasetTest(TestCase):

    def test_to_arff(self):
        """Test exporting the arff version of the dataset"""
-        ds, reactions, rules = self.generate_dataset()
+        ds, reactions, rules = self.generate_rule_dataset()
        ds.to_arff("dataset_arff_test.arff")

    def test_save_load(self):
        """Test saving and loading dataset"""
-        ds, reactions, rules = self.generate_dataset()
+        with TemporaryDirectory() as tmpdir:
+            pkl_path = os.path.join(tmpdir, "save_dataset.pkl")
+            ds, reactions, rules = self.generate_rule_dataset()
+            ds.save(pkl_path)
+            self.assertTrue(ds.df.equals(RuleBasedDataset.load(pkl_path).df))

    def test_dataset_example(self):
        """Test with a concrete example checking dataset size"""
@ -120,9 +125,19 @@ class DatasetTest(TestCase):
        self.assertEqual(len(ds.y()), 1)
        self.assertEqual(ds.y().df.item(), 1)

-    def generate_dataset(self):
+    def test_enviformer_dataset(self):
+        ds, reactions = self.generate_enviformer_dataset()
+        print(ds.X().head())
+        print(ds.y().head())
+
+    def generate_rule_dataset(self):
        """Generate a RuleBasedDataset from test package data"""
        reactions = [r for r in Reaction.objects.filter(package=self.BBD_SUBSET)]
        applicable_rules = [r for r in Rule.objects.filter(package=self.BBD_SUBSET)]
        ds = RuleBasedDataset.generate_dataset(reactions, applicable_rules)
        return ds, reactions, applicable_rules
+
+    def generate_enviformer_dataset(self):
+        reactions = [r for r in Reaction.objects.filter(package=self.BBD_SUBSET)]
+        ds = EnviFormerDataset.generate_dataset(reactions)
+        return ds, reactions
--- a/tests/test_enviformer.py
+++ b/tests/test_enviformer.py
@ -50,7 +50,7 @@ class EnviFormerTest(TestCase):
                mod.build_model()
                mod.multigen_eval = True
                mod.save()
-                mod.evaluate_model()
+                mod.evaluate_model(n_splits=2)

                mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")

--- a/tests/test_model.py
+++ b/tests/test_model.py
@ -4,7 +4,7 @@ import numpy as np
 from django.test import TestCase

 from epdb.logic import PackageManager
-from epdb.models import User, MLRelativeReasoning, Package
+from epdb.models import User, MLRelativeReasoning, Package, RuleBasedRelativeReasoning


 class ModelTest(TestCase):
@ -17,7 +17,7 @@ class ModelTest(TestCase):
        cls.package = PackageManager.create_package(cls.user, "Anon Test Package", "No Desc")
        cls.BBD_SUBSET = Package.objects.get(name="Fixtures")

-    def test_smoke(self):
+    def test_mlrr(self):
        with TemporaryDirectory() as tmpdir:
            with self.settings(MODEL_DIR=tmpdir):
                threshold = float(0.5)
@ -36,23 +36,11 @@ class ModelTest(TestCase):
                    description="Created MLRelativeReasoning in Testcase",
                )

-                # mod = RuleBasedRelativeReasoning.create(
-                #     self.package,
-                #     rule_package_objs,
-                #     data_package_objs,
-                #     eval_packages_objs,
-                #     threshold=threshold,
-                #     min_count=5,
-                #     max_count=0,
-                #     name='ECC - BBD - 0.5',
-                #     description='Created MLRelativeReasoning in Testcase',
-                # )
-
                mod.build_dataset()
                mod.build_model()
                mod.multigen_eval = True
                mod.save()
-                mod.evaluate_model()
+                mod.evaluate_model(n_splits=2)

                results = mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")

@ -73,3 +61,32 @@ class ModelTest(TestCase):

                # from pprint import pprint
                # pprint(mod.eval_results)
+
+    def test_rbrr(self):
+        """Smoke test: build, evaluate and predict with RuleBasedRelativeReasoning"""
+        with TemporaryDirectory() as tmpdir:
+            with self.settings(MODEL_DIR=tmpdir):
+                threshold = float(0.5)
+                rule_package_objs = [self.BBD_SUBSET]
+                data_package_objs = [self.BBD_SUBSET]
+                eval_packages_objs = [self.BBD_SUBSET]
+
+                mod = RuleBasedRelativeReasoning.create(
+                    self.package,
+                    rule_package_objs,
+                    data_package_objs,
+                    eval_packages_objs,
+                    threshold=threshold,
+                    min_count=5,
+                    max_count=0,
+                    name="ECC - BBD - 0.5",
+                    description="Created RuleBasedRelativeReasoning in Testcase",
+                )
+
+                mod.build_dataset()
+                mod.build_model()
+                mod.multigen_eval = True
+                mod.save()
+                mod.evaluate_model(n_splits=2)
+
+                mod.predict("CCN(CC)C(=O)C1=CC(=CC=C1)C")