finished app domain conversion #120

Liam Brydon
2025-11-05 12:41:33 +13:00
parent 13af49488e
commit f1f7ce344c
2 changed files with 63 additions and 81 deletions

View File

@@ -2764,7 +2764,7 @@ class ApplicabilityDomain(EnviPathModel):
     def training_set_probs(self):
         ds = self.model.load_dataset()
         col_ids = ds.block_indices("prob")
-        return ds[ds.columns[col_ids[0]: col_ids[1]]]
+        return ds[:, col_ids]

     def build(self):
         ds = self.model.load_dataset()
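For context on this first change: `block_indices` (see the second file below) now collects every column index whose label starts with the given prefix, so `ds[:, col_ids]` selects those columns by position instead of slicing between a start and end index. A minimal polars sketch of the same idea, with made-up column names:

import polars as pl

df = pl.DataFrame({
    "structure_id": ["a", "b"],
    "prob_rule1": [0.9, 0.1],
    "prob_rule2": [0.2, 0.8],
    "obs_rule1": [1, 0],
})

# every index whose column label starts with "prob", as block_indices would return
col_ids = [i for i, c in enumerate(df.columns) if c.startswith("prob")]
print(df[:, col_ids])  # selects prob_rule1 and prob_rule2 by position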
@@ -2819,18 +2819,17 @@ class ApplicabilityDomain(EnviPathModel):
         # it identifies all training structures that have the same trigger reaction activated (i.e., value 1).
         # This is used to find "qualified neighbours" — training examples that share the same triggered feature
         # with a given assessment structure under a particular rule.
-        qualified_neighbours_per_rule: Dict = {}
         import polars as pl
+        qualified_neighbours_per_rule: Dict = {}

         # Select only the triggered columns
         for i, row in enumerate(assessment_ds[:, assessment_ds.triggered()].iter_rows(named=True)):
             # Find the rules the structure triggers. For each rule, filter the training dataset to rows that also
-            # trigger that rule. Select the structure_id of the compounds in those filtered rows
-            train_trig = {col_name: ds.df.filter(pl.col(col_name).eq(1)).select("structure_id") for col_name, value in row.items() if value == 1}
+            # trigger that rule.
+            train_trig = {trig_uuid.split("_")[-1]: ds.filter(pl.col(trig_uuid).eq(1))
+                          for trig_uuid, value in row.items() if value == 1}
             qualified_neighbours_per_rule[i] = train_trig

-        probs = self.training_set_probs
-        # preds = self.model.model.predict_proba(assessment_ds.X())
+        rule_to_i = {str(r.uuid): i for i, r in enumerate(self.model.applicable_rules)}
         preds = self.model.combine_products_and_probs(
             self.model.applicable_rules,
             self.model.model.predict_proba(assessment_ds.X().to_numpy())[0],
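The dict comprehension above is the qualified-neighbours lookup: for every rule the assessment structure triggers, it keeps the training rows that trigger the same rule, keyed by the rule UUID taken from the trailing part of the column name. A standalone sketch of that lookup on a plain polars frame (toy column names, not the real dataset):

import polars as pl

train = pl.DataFrame({
    "structure_id": ["s1", "s2", "s3"],
    "trig_1111": [1, 0, 1],
    "trig_2222": [0, 1, 1],
})
assessment_row = {"trig_1111": 1, "trig_2222": 0}  # one assessment structure's triggered flags

qualified = {
    col.split("_")[-1]: train.filter(pl.col(col).eq(1))  # training rows that trigger the same rule
    for col, value in assessment_row.items() if value == 1
}
print(qualified["1111"]["structure_id"].to_list())  # ['s1', 's3']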
@@ -2840,57 +2839,28 @@ class ApplicabilityDomain(EnviPathModel):
         assessments = list()

         # loop through our assessment dataset
-        for i, instance in enumerate(assessment_ds):
+        for i, instance in enumerate(assessment_ds[:, assessment_ds.struct_features()]):
             rule_reliabilities = dict()
             local_compatibilities = dict()
             neighbours_per_rule = dict()
             neighbor_probs_per_rule = dict()

             # loop through rule indices together with the collected neighbours indices from train dataset
-            for rule_idx, vals in qualified_neighbours_per_rule[i].items():
-                # collect the train dataset instances and store it along with the index (a.k.a. row number) of the
-                # train dataset
-                train_instances = []
-                for v in vals:
-                    train_instances.append((v, ds.at(v)))
-                # sf is a tuple with start/end index of the features
-                sf = ds.struct_features()
-                # compute tanimoto distance for all neighbours
-                # result ist a list of tuples with train index and computed distance
-                dists = self._compute_distances(
-                    instance.X()[0][sf[0] : sf[1]],
-                    [ti[1].X()[0][sf[0] : sf[1]] for ti in train_instances],
-                )
-                dists_with_index = list()
-                for ti, dist in zip(train_instances, dists):
-                    dists_with_index.append((ti[0], dist[1]))
+            for rule_uuid, train_instances in qualified_neighbours_per_rule[i].items():
+                # compute tanimoto distance for all neighbours and add to dataset
+                dists = self._compute_distances(assessment_ds[i, assessment_ds.struct_features()].to_numpy()[0],
+                                                train_instances[:, train_instances.struct_features()].to_numpy())
+                train_instances = train_instances.with_columns(dist=pl.Series(dists))
                 # sort them in a descending way and take at most `self.num_neighbours`
-                dists_with_index = sorted(dists_with_index, key=lambda x: x[1], reverse=True)
-                dists_with_index = dists_with_index[: self.num_neighbours]
+                # TODO: Should this be descending? If we want the most similar then we want values close to zero (ascending)
+                train_instances = train_instances.sort("dist", descending=True)[:self.num_neighbours]
                 # compute average distance
-                rule_reliabilities[rule_idx] = (
-                    sum([d[1] for d in dists_with_index]) / len(dists_with_index)
-                    if len(dists_with_index) > 0
-                    else 0.0
-                )
+                rule_reliabilities[rule_uuid] = train_instances.select(pl.mean("dist")).fill_nan(0.0).item()
                 # for local_compatibility we'll need the datasets for the indices having the highest similarity
-                neighbour_datasets = [(d[0], ds.at(d[0])) for d in dists_with_index]
-                local_compatibilities[rule_idx] = self._compute_compatibility(
-                    rule_idx, probs, neighbour_datasets
-                )
-                neighbours_per_rule[rule_idx] = [
-                    CompoundStructure.objects.get(uuid=ds.structure_id(1))
-                    for ds in neighbour_datasets
-                ]
-                neighbor_probs_per_rule[rule_idx] = [
-                    probs[d[0]][rule_idx] for d in dists_with_index
-                ]
+                local_compatibilities[rule_uuid] = self._compute_compatibility(rule_uuid, train_instances)
+                neighbours_per_rule[rule_uuid] = list(CompoundStructure.objects.filter(uuid__in=train_instances["structure_id"]))
+                neighbor_probs_per_rule[rule_uuid] = train_instances[f"prob_{rule_uuid}"].to_list()

             ad_res = {
                 "ad_params": {
@@ -2902,22 +2872,20 @@ class ApplicabilityDomain(EnviPathModel):
                 },
                 "assessment": {
                     "smiles": smiles,
-                    "inside_app_domain": self.pca.is_applicable(instance)[0],
+                    "inside_app_domain": self.pca.is_applicable(assessment_ds[i])[0],
                 },
             }

             transformations = list()
-            for rule_idx in rule_reliabilities.keys():
-                rule = Rule.objects.get(
-                    uuid=instance.columns[instance.observed()[0] + rule_idx].replace("obs_", "")
-                )
+            for rule_uuid in rule_reliabilities.keys():
+                rule = Rule.objects.get(uuid=rule_uuid)
                 rule_data = rule.simple_json()
                 rule_data["image"] = f"{rule.url}?image=svg"

                 neighbors = []
                 for n, n_prob in zip(
-                    neighbours_per_rule[rule_idx], neighbor_probs_per_rule[rule_idx]
+                    neighbours_per_rule[rule_uuid], neighbor_probs_per_rule[rule_uuid]
                 ):
                     neighbor = n.simple_json()
                     neighbor["image"] = f"{n.url}?image=svg"
@@ -2934,14 +2902,14 @@ class ApplicabilityDomain(EnviPathModel):
                 transformation = {
                     "rule": rule_data,
-                    "reliability": rule_reliabilities[rule_idx],
+                    "reliability": rule_reliabilities[rule_uuid],
                     # We're setting it here to False, as we don't know whether "assess" is called during pathway
                     # prediction or from Model Page. For persisted Nodes this field will be overwritten at runtime
                     "is_predicted": False,
-                    "local_compatibility": local_compatibilities[rule_idx],
-                    "probability": preds[rule_idx].probability,
+                    "local_compatibility": local_compatibilities[rule_uuid],
+                    "probability": preds[rule_to_i[rule_uuid]].probability,
                     "transformation_products": [
-                        x.product_set for x in preds[rule_idx].product_sets
+                        x.product_set for x in preds[rule_to_i[rule_uuid]].product_sets
                     ],
                     "times_triggered": ds.times_triggered(str(rule.uuid)),
                     "neighbors": neighbors,
@@ -2959,32 +2927,23 @@ class ApplicabilityDomain(EnviPathModel):
     def _compute_distances(classify_instance: List[int], train_instances: List[List[int]]):
         from utilities.ml import tanimoto_distance

-        distances = [
-            (i, tanimoto_distance(classify_instance, train))
-            for i, train in enumerate(train_instances)
-        ]
+        distances = [tanimoto_distance(classify_instance, train) for train in train_instances]
         return distances

     @staticmethod
-    def _compute_compatibility(rule_idx: int, preds, neighbours: List[Tuple[int, "RuleBasedDataset"]]):
-        tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0
+    def _compute_compatibility(rule_idx: int, neighbours: "RuleBasedDataset"):
         accuracy = 0.0
+        import polars as pl

-        for n in neighbours:
-            obs = n[1].y()[0][rule_idx]
-            pred = preds[n[0]][rule_idx]
-            if obs and pred:
-                tp += 1
-            elif not obs and pred:
-                fp += 1
-            elif obs and not pred:
-                fn += 1
-            else:
-                tn += 1
-        # Jaccard Index
+        # TODO: Use a threshold to convert prob to boolean, or pass boolean in
+        obs_pred = neighbours.select(obs=pl.col(f"obs_{rule_idx}").cast(pl.Boolean),
+                                     pred=pl.col(f"prob_{rule_idx}").cast(pl.Boolean))
+        # Compute tp, tn, fp, fn using polars expressions
+        tp = obs_pred.filter((pl.col("obs")) & (pl.col("pred"))).height
+        tn = obs_pred.filter((~pl.col("obs")) & (~pl.col("pred"))).height
+        fp = obs_pred.filter((~pl.col("obs")) & (pl.col("pred"))).height
+        fn = obs_pred.filter((pl.col("obs")) & (~pl.col("pred"))).height
         if tp + tn > 0.0:
             accuracy = (tp + tn) / (tp + tn + fp + fn)
         return accuracy
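The reworked `_compute_compatibility` derives a confusion matrix from the `obs_*` and `prob_*` columns of the neighbour dataset and returns plain accuracy, (tp + tn) / (tp + tn + fp + fn). Casting the probability column to Boolean treats any nonzero value as a positive prediction, which is what the TODO about a threshold refers to. A toy run of the same logic on a plain polars frame (illustrative column names only):

import polars as pl

neighbours = pl.DataFrame({"obs_x": [1, 0, 1, 0], "prob_x": [0.9, 0.0, 0.0, 0.7]})
obs_pred = neighbours.select(obs=pl.col("obs_x").cast(pl.Boolean),
                             pred=pl.col("prob_x").cast(pl.Boolean))

tp = obs_pred.filter(pl.col("obs") & pl.col("pred")).height      # 1
tn = obs_pred.filter(~pl.col("obs") & ~pl.col("pred")).height    # 1
fp = obs_pred.filter(~pl.col("obs") & pl.col("pred")).height     # 1
fn = obs_pred.filter(pl.col("obs") & ~pl.col("pred")).height     # 1
accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn) else 0.0  # 0.5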

View File

@@ -52,7 +52,7 @@ class Dataset(ABC):
         self.add_rows([row])

     def block_indices(self, prefix) -> List[int]:
-        """Find the start and end indexes in column labels that has the prefix"""
+        """Find the indexes in column labels that has the prefix"""
        indices: List[int] = []
         for i, feature in enumerate(self.columns):
             if feature.startswith(prefix):
@@ -128,6 +128,29 @@ class Dataset(ABC):
     def iter_rows(self, named=False):
         return self.df.iter_rows(named=named)

+    def filter(self, *predicates, **constraints):
+        return self.__class__(data=self.df.filter(*predicates, **constraints))
+
+    def select(self, *exprs, **named_exprs):
+        return self.__class__(data=self.df.select(*exprs, **named_exprs))
+
+    def with_columns(self, *exprs, **name_exprs):
+        return self.__class__(data=self.df.with_columns(*exprs, **name_exprs))
+
+    def sort(self, by, *more_by, descending=False, nulls_last=False, multithreaded=True, maintain_order=False):
+        return self.__class__(data=self.df.sort(by, *more_by, descending=descending, nulls_last=nulls_last,
+                                                multithreaded=multithreaded, maintain_order=maintain_order))
+
+    def item(self, row=None, column=None):
+        return self.df.item(row, column)
+
+    def fill_nan(self, value):
+        return self.__class__(data=self.df.fill_nan(value))
+
+    @property
+    def height(self):
+        return self.df.height
+

 class RuleBasedDataset(Dataset):
     def __init__(self, num_labels=None, columns=None, data=None):
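The new Dataset helpers all follow the same delegation pattern: forward the call to the underlying polars DataFrame and rewrap the result in `self.__class__`, so chained operations such as `filter(...).sort(...)` stay inside the same Dataset subclass instead of degrading to a bare DataFrame. A minimal, simplified sketch of that pattern (not the real class, which also takes `num_labels` and `columns`):

import polars as pl

class MiniDataset:
    def __init__(self, data: pl.DataFrame):
        self.df = data

    def filter(self, *predicates, **constraints):
        # delegate to polars, rewrap so the subclass type is preserved
        return self.__class__(data=self.df.filter(*predicates, **constraints))

    def sort(self, by, descending=False):
        return self.__class__(data=self.df.sort(by, descending=descending))

ds = MiniDataset(pl.DataFrame({"dist": [0.2, 0.9, 0.5]}))
top = ds.filter(pl.col("dist") > 0.3).sort("dist", descending=True)
print(type(top).__name__, top.df["dist"].to_list())  # MiniDataset [0.9, 0.5]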