forked from enviPath/enviPy
Fixed handling for SMIRKS/SMARTS, adjusted test values as they are now cleaned, refactored logic for object update
This commit is contained in:
@ -29,8 +29,14 @@ from sklearn.metrics import precision_score, recall_score, jaccard_score
|
||||
from sklearn.model_selection import ShuffleSplit
|
||||
|
||||
from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils
|
||||
from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning, \
|
||||
EnviFormerDataset, Dataset
|
||||
from utilities.ml import (
|
||||
RuleBasedDataset,
|
||||
ApplicabilityDomainPCA,
|
||||
EnsembleClassifierChain,
|
||||
RelativeReasoning,
|
||||
EnviFormerDataset,
|
||||
Dataset,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -1190,9 +1196,10 @@ class SimpleAmbitRule(SimpleRule):
|
||||
|
||||
r = SimpleAmbitRule()
|
||||
r.package = package
|
||||
|
||||
if name is not None:
|
||||
# Clean for potential XSS
|
||||
name = nh3.clean(name, tags=s.ALLOWED_HTML_TAGS).strip()
|
||||
|
||||
if name is None or name == "":
|
||||
name = f"Rule {Rule.objects.filter(package=package).count() + 1}"
|
||||
|
||||
@ -1200,13 +1207,19 @@ class SimpleAmbitRule(SimpleRule):
|
||||
if description is not None and description.strip() != "":
|
||||
r.description = nh3.clean(description, tags=s.ALLOWED_HTML_TAGS).strip()
|
||||
|
||||
r.smirks = nh3.clean(smirks).strip()
|
||||
r.smirks = smirks
|
||||
|
||||
if reactant_filter_smarts is not None and reactant_filter_smarts.strip() != "":
|
||||
r.reactant_filter_smarts = nh3.clean(reactant_filter_smarts).strip()
|
||||
if not FormatConverter.is_valid_smarts(reactant_filter_smarts.strip()):
|
||||
raise ValueError(f'Reactant Filter SMARTS "{reactant_filter_smarts}" is invalid!')
|
||||
else:
|
||||
r.reactant_filter_smarts = reactant_filter_smarts.strip()
|
||||
|
||||
if product_filter_smarts is not None and product_filter_smarts.strip() != "":
|
||||
r.product_filter_smarts = nh3.clean(product_filter_smarts).strip()
|
||||
if not FormatConverter.is_valid_smarts(product_filter_smarts.strip()):
|
||||
raise ValueError(f'Product Filter SMARTS "{product_filter_smarts}" is invalid!')
|
||||
else:
|
||||
r.product_filter_smarts = product_filter_smarts.strip()
|
||||
|
||||
r.save()
|
||||
return r
|
||||
@ -2353,7 +2366,9 @@ class PackageBasedModel(EPModel):
|
||||
eval_reactions = list(
|
||||
Reaction.objects.filter(package__in=self.eval_packages.all()).distinct()
|
||||
)
|
||||
ds = RuleBasedDataset.generate_dataset(eval_reactions, self.applicable_rules, educts_only=True)
|
||||
ds = RuleBasedDataset.generate_dataset(
|
||||
eval_reactions, self.applicable_rules, educts_only=True
|
||||
)
|
||||
if isinstance(self, RuleBasedRelativeReasoning):
|
||||
X = ds.X(exclude_id_col=False, na_replacement=None).to_numpy()
|
||||
y = ds.y(na_replacement=np.nan).to_numpy()
|
||||
@ -2818,7 +2833,9 @@ class ApplicabilityDomain(EnviPathModel):
|
||||
else:
|
||||
smiles.append(structures)
|
||||
|
||||
assessment_ds, assessment_prods = ds.classification_dataset(structures, self.model.applicable_rules)
|
||||
assessment_ds, assessment_prods = ds.classification_dataset(
|
||||
structures, self.model.applicable_rules
|
||||
)
|
||||
|
||||
# qualified_neighbours_per_rule is a nested dictionary structured as:
|
||||
# {
|
||||
@ -2834,12 +2851,16 @@ class ApplicabilityDomain(EnviPathModel):
|
||||
qualified_neighbours_per_rule: Dict = {}
|
||||
|
||||
import polars as pl
|
||||
|
||||
# Select only the triggered columns
|
||||
for i, row in enumerate(assessment_ds[:, assessment_ds.triggered()].iter_rows(named=True)):
|
||||
# Find the rules the structure triggers. For each rule, filter the training dataset to rows that also
|
||||
# trigger that rule.
|
||||
train_trig = {trig_uuid.split("_")[-1]: ds.filter(pl.col(trig_uuid).eq(1))
|
||||
for trig_uuid, value in row.items() if value == 1}
|
||||
train_trig = {
|
||||
trig_uuid.split("_")[-1]: ds.filter(pl.col(trig_uuid).eq(1))
|
||||
for trig_uuid, value in row.items()
|
||||
if value == 1
|
||||
}
|
||||
qualified_neighbours_per_rule[i] = train_trig
|
||||
rule_to_i = {str(r.uuid): i for i, r in enumerate(self.model.applicable_rules)}
|
||||
preds = self.model.combine_products_and_probs(
|
||||
@ -2859,18 +2880,28 @@ class ApplicabilityDomain(EnviPathModel):
|
||||
# loop through rule indices together with the collected neighbours indices from train dataset
|
||||
for rule_uuid, train_instances in qualified_neighbours_per_rule[i].items():
|
||||
# compute tanimoto distance for all neighbours and add to dataset
|
||||
dists = self._compute_distances(assessment_ds[i, assessment_ds.struct_features()].to_numpy()[0],
|
||||
train_instances[:, train_instances.struct_features()].to_numpy())
|
||||
dists = self._compute_distances(
|
||||
assessment_ds[i, assessment_ds.struct_features()].to_numpy()[0],
|
||||
train_instances[:, train_instances.struct_features()].to_numpy(),
|
||||
)
|
||||
train_instances = train_instances.with_columns(dist=pl.Series(dists))
|
||||
|
||||
# sort them in a descending way and take at most `self.num_neighbours`
|
||||
# TODO: Should this be descending? If we want the most similar then we want values close to zero (ascending)
|
||||
train_instances = train_instances.sort("dist", descending=True)[:self.num_neighbours]
|
||||
train_instances = train_instances.sort("dist", descending=True)[
|
||||
: self.num_neighbours
|
||||
]
|
||||
# compute average distance
|
||||
rule_reliabilities[rule_uuid] = train_instances.select(pl.mean("dist")).fill_nan(0.0).item()
|
||||
rule_reliabilities[rule_uuid] = (
|
||||
train_instances.select(pl.mean("dist")).fill_nan(0.0).item()
|
||||
)
|
||||
# for local_compatibility we'll need the datasets for the indices having the highest similarity
|
||||
local_compatibilities[rule_uuid] = self._compute_compatibility(rule_uuid, train_instances)
|
||||
neighbours_per_rule[rule_uuid] = list(CompoundStructure.objects.filter(uuid__in=train_instances["structure_id"]))
|
||||
local_compatibilities[rule_uuid] = self._compute_compatibility(
|
||||
rule_uuid, train_instances
|
||||
)
|
||||
neighbours_per_rule[rule_uuid] = list(
|
||||
CompoundStructure.objects.filter(uuid__in=train_instances["structure_id"])
|
||||
)
|
||||
neighbor_probs_per_rule[rule_uuid] = train_instances[f"prob_{rule_uuid}"].to_list()
|
||||
|
||||
ad_res = {
|
||||
@ -2944,8 +2975,11 @@ class ApplicabilityDomain(EnviPathModel):
|
||||
def _compute_compatibility(self, rule_idx: int, neighbours: "RuleBasedDataset"):
|
||||
accuracy = 0.0
|
||||
import polars as pl
|
||||
obs_pred = neighbours.select(obs=pl.col(f"obs_{rule_idx}").cast(pl.Boolean),
|
||||
pred=pl.col(f"prob_{rule_idx}") >= self.model.threshold)
|
||||
|
||||
obs_pred = neighbours.select(
|
||||
obs=pl.col(f"obs_{rule_idx}").cast(pl.Boolean),
|
||||
pred=pl.col(f"prob_{rule_idx}") >= self.model.threshold,
|
||||
)
|
||||
# Compute tp, tn, fp, fn using polars expressions
|
||||
tp = obs_pred.filter((pl.col("obs")) & (pl.col("pred"))).height
|
||||
tn = obs_pred.filter((~pl.col("obs")) & (~pl.col("pred"))).height
|
||||
@ -3115,7 +3149,7 @@ class EnviFormer(PackageBasedModel):
|
||||
pred_dict = {}
|
||||
for k, pred in enumerate(predictions):
|
||||
pred_smiles, pred_proba = zip(*pred.items())
|
||||
reactant, true_product = test_ds[k, "educts"], test_ds[k, "products"]
|
||||
reactant, _ = test_ds[k, "educts"], test_ds[k, "products"]
|
||||
pred_dict.setdefault(reactant, {"predict": [], "scores": []})
|
||||
for smiles, proba in zip(pred_smiles, pred_proba):
|
||||
smiles = set(smiles.split("."))
|
||||
@ -3229,8 +3263,9 @@ class EnviFormer(PackageBasedModel):
|
||||
|
||||
# If there are eval packages perform single generation evaluation on them instead of random splits
|
||||
if self.eval_packages.count() > 0:
|
||||
ds = EnviFormerDataset.generate_dataset(Reaction.objects.filter(
|
||||
package__in=self.eval_packages.all()).distinct())
|
||||
ds = EnviFormerDataset.generate_dataset(
|
||||
Reaction.objects.filter(package__in=self.eval_packages.all()).distinct()
|
||||
)
|
||||
test_result = self.model.predict_batch(ds.X())
|
||||
single_gen_result = evaluate_sg(ds, test_result, self.threshold)
|
||||
self.eval_results = self.compute_averages([single_gen_result])
|
||||
@ -3248,7 +3283,9 @@ class EnviFormer(PackageBasedModel):
|
||||
train = ds[train_index]
|
||||
test = ds[test_index]
|
||||
start = datetime.now()
|
||||
model = fine_tune(train.X(), train.y(), s.MODEL_DIR, str(split_id), device=s.ENVIFORMER_DEVICE)
|
||||
model = fine_tune(
|
||||
train.X(), train.y(), s.MODEL_DIR, str(split_id), device=s.ENVIFORMER_DEVICE
|
||||
)
|
||||
end = datetime.now()
|
||||
logger.debug(
|
||||
f"EnviFormer finetuning took {(end - start).total_seconds():.2f} seconds"
|
||||
@ -3325,7 +3362,12 @@ class EnviFormer(PackageBasedModel):
|
||||
for pathway in train_pathways:
|
||||
for reaction in pathway.edges:
|
||||
reaction = reaction.edge_label
|
||||
if any([educt in test_educts for educt in reaction_to_educts[str(reaction.uuid)]]):
|
||||
if any(
|
||||
[
|
||||
educt in test_educts
|
||||
for educt in reaction_to_educts[str(reaction.uuid)]
|
||||
]
|
||||
):
|
||||
overlap += 1
|
||||
continue
|
||||
train_reactions.append(reaction)
|
||||
|
||||
Reference in New Issue
Block a user