work towards #120

Liam Brydon
2025-10-24 14:40:26 +13:00
parent 2980a75daa
commit 8166df6f39
2 changed files with 203 additions and 59 deletions


@@ -28,7 +28,7 @@ from sklearn.metrics import precision_score, recall_score, jaccard_score
from sklearn.model_selection import ShuffleSplit
from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils
-from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning
+from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning, EnviFormerDataset
logger = logging.getLogger(__name__)
@@ -3088,35 +3088,17 @@ class EnviFormer(PackageBasedModel):
self.save()
start = datetime.now()
# Standardise reactions for the training data; EnviFormer currently ignores stereochemistry
-ds = []
-for reaction in self._get_reactions():
-educts = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.educts.all()
-]
-)
-products = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.products.all()
-]
-)
-ds.append(f"{educts}>>{products}")
+ds = EnviFormerDataset.generate_dataset(self._get_reactions())
end = datetime.now()
logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds")
f = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.json")
-with open(f, "w") as d_file:
-json.dump(ds, d_file)
+ds.save(f)
return ds
def load_dataset(self) -> "RuleBasedDataset":
ds_path = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.json")
-with open(ds_path) as d_file:
-ds = json.load(d_file)
-return ds
+return EnviFormerDataset.load(ds_path)
def _fit_model(self, ds):
# Call to enviFormer's fine_tune function and return the model
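Note: EnviFormerDataset itself is not shown in this hunk; it is imported from utilities.ml and presumably added in the commit's other changed file. As a rough sketch of the interface that build_dataset and load_dataset above appear to rely on (method names are taken from the calls in the diff; the bodies simply mirror the removed inline code, so treat all of this as an assumption rather than the actual implementation):

    # Hypothetical sketch, not the commit's code.
    import json
    from utilities.chem import FormatConverter  # same import the model file already uses

    class EnviFormerDataset:
        def __init__(self, smirks):
            self.smirks = smirks  # list of "educts>>products" strings

        @classmethod
        def generate_dataset(cls, reactions) -> "EnviFormerDataset":
            # Standardise educts/products (dropping stereochemistry) and join with ".",
            # as the removed loop in build_dataset did.
            smirks = []
            for reaction in reactions:
                educts = ".".join(FormatConverter.standardize(s.smiles, remove_stereo=True)
                                  for s in reaction.educts.all())
                products = ".".join(FormatConverter.standardize(s.smiles, remove_stereo=True)
                                    for s in reaction.products.all())
                smirks.append(f"{educts}>>{products}")
            return cls(smirks)

        def save(self, path):
            with open(path, "w") as f:
                json.dump(self.smirks, f)

        @classmethod
        def load(cls, path) -> "EnviFormerDataset":
            with open(path) as f:
                return cls(json.load(f))

        # Assumed so the dataset can be passed to predict_batch and evaluate_sg below.
        def __iter__(self):
            return iter(self.smirks)

        def __len__(self):
            return len(self.smirks)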
@@ -3148,13 +3130,12 @@
def evaluate_sg(test_reactions, predictions, model_thresh):
# Group the true products of reactions with the same reactant together
assert len(test_reactions) == len(predictions)
true_dict = {}
for r in test_reactions:
reactant, true_product_set = r.split(">>")
true_product_set = {p for p in true_product_set.split(".")}
true_dict[reactant] = true_dict.setdefault(reactant, []) + [true_product_set]
-assert len(test_reactions) == len(predictions)
assert sum(len(v) for v in true_dict.values()) == len(test_reactions)
# Group the predicted products of reactions with the same reactant together
pred_dict = {}
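Note: the true_dict grouping above collects every true product set observed for a given reactant, so reactions that share a reactant are evaluated together. A toy illustration (made-up SMILES) of what it produces:

    # Illustration only: the same grouping as in evaluate_sg, on toy input.
    test_reactions = ["CCO>>CC=O.O", "CCO>>CC(=O)O", "c1ccccc1>>C1CCCCC1"]
    true_dict = {}
    for r in test_reactions:
        reactant, true_product_set = r.split(">>")
        true_product_set = {p for p in true_product_set.split(".")}
        true_dict[reactant] = true_dict.setdefault(reactant, []) + [true_product_set]
    print(true_dict)
    # {'CCO': [{'CC=O', 'O'}, {'CC(=O)O'}], 'c1ccccc1': [{'C1CCCCC1'}]}  (set order may vary)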
@@ -3274,24 +3255,9 @@
# If there are eval packages perform single generation evaluation on them instead of random splits
if self.eval_packages.count() > 0:
-ds = []
-for reaction in Reaction.objects.filter(
-package__in=self.eval_packages.all()
-).distinct():
-educts = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.educts.all()
-]
-)
-products = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.products.all()
-]
-)
-ds.append(f"{educts}>>{products}")
-test_result = self.model.predict_batch([smirk.split(">>")[0] for smirk in ds])
+ds = EnviFormerDataset.generate_dataset(Reaction.objects.filter(
+package__in=self.eval_packages.all()).distinct())
+test_result = self.model.predict_batch(ds)
single_gen_result = evaluate_sg(ds, test_result, self.threshold)
self.eval_results = self.compute_averages([single_gen_result])
else:
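Note: the per-reactant metric computation inside evaluate_sg falls outside the hunks shown here. For orientation only, this is one plausible way the metrics imported at the top of the file (precision_score, recall_score, jaccard_score) can score a predicted product set against a true one; the data is toy and not taken from the commit.

    # Toy illustration, not the commit's code: binary indicator vectors over the
    # union of products, scored with sklearn's binary-average metrics.
    from sklearn.metrics import precision_score, recall_score, jaccard_score

    true_products = {"CC=O", "O"}    # made-up true product set
    pred_products = {"CC=O", "CO"}   # made-up predicted product set
    labels = sorted(true_products | pred_products)
    y_true = [1 if p in true_products else 0 for p in labels]
    y_pred = [1 if p in pred_products else 0 for p in labels]
    print(precision_score(y_true, y_pred))  # 0.5
    print(recall_score(y_true, y_pred))     # 0.5
    print(jaccard_score(y_true, y_pred))    # 0.333...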