forked from enviPath/enviPy
work towards #120
This commit is contained in:
@ -28,7 +28,7 @@ from sklearn.metrics import precision_score, recall_score, jaccard_score
|
||||
from sklearn.model_selection import ShuffleSplit
|
||||
|
||||
from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils
|
||||
from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning
|
||||
from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning, EnviFormerDataset
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -3088,35 +3088,17 @@ class EnviFormer(PackageBasedModel):
|
||||
self.save()
|
||||
|
||||
start = datetime.now()
|
||||
# Standardise reactions for the training data, EnviFormer ignores stereochemistry currently
|
||||
ds = []
|
||||
for reaction in self._get_reactions():
|
||||
educts = ".".join(
|
||||
[
|
||||
FormatConverter.standardize(smile.smiles, remove_stereo=True)
|
||||
for smile in reaction.educts.all()
|
||||
]
|
||||
)
|
||||
products = ".".join(
|
||||
[
|
||||
FormatConverter.standardize(smile.smiles, remove_stereo=True)
|
||||
for smile in reaction.products.all()
|
||||
]
|
||||
)
|
||||
ds.append(f"{educts}>>{products}")
|
||||
ds = EnviFormerDataset.generate_dataset(self._get_reactions())
|
||||
|
||||
end = datetime.now()
|
||||
logger.debug(f"build_dataset took {(end - start).total_seconds()} seconds")
|
||||
f = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.json")
|
||||
with open(f, "w") as d_file:
|
||||
json.dump(ds, d_file)
|
||||
ds.save(f)
|
||||
return ds
|
||||
|
||||
def load_dataset(self) -> "RuleBasedDataset":
|
||||
ds_path = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.json")
|
||||
with open(ds_path) as d_file:
|
||||
ds = json.load(d_file)
|
||||
return ds
|
||||
return EnviFormerDataset.load(ds_path)
|
||||
|
||||
def _fit_model(self, ds):
|
||||
# Call to enviFormer's fine_tune function and return the model
|
||||
@ -3148,13 +3130,12 @@ class EnviFormer(PackageBasedModel):
|
||||
|
||||
def evaluate_sg(test_reactions, predictions, model_thresh):
|
||||
# Group the true products of reactions with the same reactant together
|
||||
assert len(test_reactions) == len(predictions)
|
||||
true_dict = {}
|
||||
for r in test_reactions:
|
||||
reactant, true_product_set = r.split(">>")
|
||||
true_product_set = {p for p in true_product_set.split(".")}
|
||||
true_dict[reactant] = true_dict.setdefault(reactant, []) + [true_product_set]
|
||||
assert len(test_reactions) == len(predictions)
|
||||
assert sum(len(v) for v in true_dict.values()) == len(test_reactions)
|
||||
|
||||
# Group the predicted products of reactions with the same reactant together
|
||||
pred_dict = {}
|
||||
@ -3274,24 +3255,9 @@ class EnviFormer(PackageBasedModel):
|
||||
|
||||
# If there are eval packages perform single generation evaluation on them instead of random splits
|
||||
if self.eval_packages.count() > 0:
|
||||
ds = []
|
||||
for reaction in Reaction.objects.filter(
|
||||
package__in=self.eval_packages.all()
|
||||
).distinct():
|
||||
educts = ".".join(
|
||||
[
|
||||
FormatConverter.standardize(smile.smiles, remove_stereo=True)
|
||||
for smile in reaction.educts.all()
|
||||
]
|
||||
)
|
||||
products = ".".join(
|
||||
[
|
||||
FormatConverter.standardize(smile.smiles, remove_stereo=True)
|
||||
for smile in reaction.products.all()
|
||||
]
|
||||
)
|
||||
ds.append(f"{educts}>>{products}")
|
||||
test_result = self.model.predict_batch([smirk.split(">>")[0] for smirk in ds])
|
||||
ds = EnviFormerDataset.generate_dataset(Reaction.objects.filter(
|
||||
package__in=self.eval_packages.all()).distinct())
|
||||
test_result = self.model.predict_batch(ds)
|
||||
single_gen_result = evaluate_sg(ds, test_result, self.threshold)
|
||||
self.eval_results = self.compute_averages([single_gen_result])
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user