work towards #120

Liam Brydon
2025-10-24 14:40:26 +13:00
parent 2980a75daa
commit 8166df6f39
2 changed files with 203 additions and 59 deletions


@@ -28,7 +28,7 @@ from sklearn.metrics import precision_score, recall_score, jaccard_score
from sklearn.model_selection import ShuffleSplit
from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils
-from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning
+from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning, EnviFormerDataset
logger = logging.getLogger(__name__)
@@ -3088,35 +3088,17 @@ class EnviFormer(PackageBasedModel):
self.save()
start = datetime.now()
# Standardise reactions for the training data; EnviFormer currently ignores stereochemistry
-ds = []
-for reaction in self._get_reactions():
-educts = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.educts.all()
-]
-)
-products = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.products.all()
-]
-)
-ds.append(f"{educts}>>{products}")
+ds = EnviFormerDataset.generate_dataset(self._get_reactions())
end = datetime.now()
logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds")
f = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.json")
-with open(f, "w") as d_file:
-json.dump(ds, d_file)
+ds.save(f)
return ds
def load_dataset(self) -> "RuleBasedDataset":
ds_path = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.json")
-with open(ds_path) as d_file:
-ds = json.load(d_file)
-return ds
+return EnviFormerDataset.load(ds_path)
def _fit_model(self, ds):
# Call to enviFormer's fine_tune function and return the model
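Note: EnviFormerDataset itself is not shown in this hunk; it is imported from utilities.ml and presumably added in the commit's other changed file. As a rough sketch of the interface that build_dataset and load_dataset above appear to rely on (method names are taken from the calls in the diff; the bodies simply mirror the removed inline code, so treat all of this as an assumption rather than the actual implementation):

    # Hypothetical sketch, not the commit's code.
    import json
    from utilities.chem import FormatConverter  # same import the model file already uses

    class EnviFormerDataset:
        def __init__(self, smirks):
            self.smirks = smirks  # list of "educts>>products" strings

        @classmethod
        def generate_dataset(cls, reactions) -> "EnviFormerDataset":
            # Standardise educts/products (dropping stereochemistry) and join with ".",
            # as the removed loop in build_dataset did.
            smirks = []
            for reaction in reactions:
                educts = ".".join(FormatConverter.standardize(s.smiles, remove_stereo=True)
                                  for s in reaction.educts.all())
                products = ".".join(FormatConverter.standardize(s.smiles, remove_stereo=True)
                                    for s in reaction.products.all())
                smirks.append(f"{educts}>>{products}")
            return cls(smirks)

        def save(self, path):
            with open(path, "w") as f:
                json.dump(self.smirks, f)

        @classmethod
        def load(cls, path) -> "EnviFormerDataset":
            with open(path) as f:
                return cls(json.load(f))

        # Assumed so the dataset can be passed to predict_batch and evaluate_sg below.
        def __iter__(self):
            return iter(self.smirks)

        def __len__(self):
            return len(self.smirks)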
@@ -3148,13 +3130,12 @@
def evaluate_sg(test_reactions, predictions, model_thresh):
# Group the true products of reactions with the same reactant together
assert len(test_reactions) == len(predictions)
true_dict = {}
for r in test_reactions:
reactant, true_product_set = r.split(">>")
true_product_set = {p for p in true_product_set.split(".")}
true_dict[reactant] = true_dict.setdefault(reactant, []) + [true_product_set]
-assert len(test_reactions) == len(predictions)
assert sum(len(v) for v in true_dict.values()) == len(test_reactions)
# Group the predicted products of reactions with the same reactant together
pred_dict = {}
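Note: the true_dict grouping above collects every true product set observed for a given reactant, so reactions that share a reactant are evaluated together. A toy illustration (made-up SMILES) of what it produces:

    # Illustration only: the same grouping as in evaluate_sg, on toy input.
    test_reactions = ["CCO>>CC=O.O", "CCO>>CC(=O)O", "c1ccccc1>>C1CCCCC1"]
    true_dict = {}
    for r in test_reactions:
        reactant, true_product_set = r.split(">>")
        true_product_set = {p for p in true_product_set.split(".")}
        true_dict[reactant] = true_dict.setdefault(reactant, []) + [true_product_set]
    print(true_dict)
    # {'CCO': [{'CC=O', 'O'}, {'CC(=O)O'}], 'c1ccccc1': [{'C1CCCCC1'}]}  (set order may vary)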
@@ -3274,24 +3255,9 @@
# If there are eval packages perform single generation evaluation on them instead of random splits
if self.eval_packages.count() > 0:
-ds = []
-for reaction in Reaction.objects.filter(
-package__in=self.eval_packages.all()
-).distinct():
-educts = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.educts.all()
-]
-)
-products = ".".join(
-[
-FormatConverter.standardize(smile.smiles, remove_stereo=True)
-for smile in reaction.products.all()
-]
-)
-ds.append(f"{educts}>>{products}")
-test_result = self.model.predict_batch([smirk.split(">>")[0] for smirk in ds])
+ds = EnviFormerDataset.generate_dataset(Reaction.objects.filter(
+package__in=self.eval_packages.all()).distinct())
+test_result = self.model.predict_batch(ds)
single_gen_result = evaluate_sg(ds, test_result, self.threshold)
self.eval_results = self.compute_averages([single_gen_result])
else:
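Note: the per-reactant metric computation inside evaluate_sg falls outside the hunks shown here. For orientation only, this is one plausible way the metrics imported at the top of the file (precision_score, recall_score, jaccard_score) can score a predicted product set against a true one; the data is toy and not taken from the commit.

    # Toy illustration, not the commit's code: binary indicator vectors over the
    # union of products, scored with sklearn's binary-average metrics.
    from sklearn.metrics import precision_score, recall_score, jaccard_score

    true_products = {"CC=O", "O"}    # made-up true product set
    pred_products = {"CC=O", "CO"}   # made-up predicted product set
    labels = sorted(true_products | pred_products)
    y_true = [1 if p in true_products else 0 for p in labels]
    y_pred = [1 if p in pred_products else 0 for p in labels]
    print(precision_score(y_true, y_pred))  # 0.5
    print(recall_score(y_true, y_pred))     # 0.5
    print(jaccard_score(y_true, y_pred))    # 0.333...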