starting on app domain with new dataset #120

This commit is contained in:
Liam Brydon
2025-11-04 16:33:56 +13:00
parent ac5d370b18
commit 13af49488e
3 changed files with 98 additions and 58 deletions

View File

@ -28,7 +28,8 @@ from sklearn.metrics import precision_score, recall_score, jaccard_score
from sklearn.model_selection import ShuffleSplit
from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils
from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning, EnviFormerDataset
from utilities.ml import RuleBasedDataset, ApplicabilityDomainPCA, EnsembleClassifierChain, RelativeReasoning, \
EnviFormerDataset, Dataset
logger = logging.getLogger(__name__)
@ -2184,9 +2185,9 @@ class PackageBasedModel(EPModel):
ds.save(f)
return ds
def load_dataset(self) -> "RuleBasedDataset":
def load_dataset(self) -> "Dataset | RuleBasedDataset | EnviFormerDataset":
ds_path = os.path.join(s.MODEL_DIR, f"{self.uuid}_ds.pkl")
return RuleBasedDataset.load(ds_path)
return Dataset.load(ds_path)
def retrain(self):
self.build_dataset()
@ -2196,7 +2197,7 @@ class PackageBasedModel(EPModel):
self.build_model()
@abstractmethod
def _fit_model(self, ds: RuleBasedDataset):
def _fit_model(self, ds: Dataset):
pass
@abstractmethod
@ -2337,22 +2338,22 @@ class PackageBasedModel(EPModel):
)
ds = RuleBasedDataset.generate_dataset(eval_reactions, self.applicable_rules, educts_only=True)
if isinstance(self, RuleBasedRelativeReasoning):
X = np.array(ds.X(exclude_id_col=False, na_replacement=None))
y = np.array(ds.y(na_replacement=np.nan))
X = ds.X(exclude_id_col=False, na_replacement=None).to_numpy()
y = ds.y(na_replacement=np.nan).to_numpy()
else:
X = np.array(ds.X(na_replacement=np.nan))
y = np.array(ds.y(na_replacement=np.nan))
X = ds.X(na_replacement=np.nan).to_numpy()
y = ds.y(na_replacement=np.nan).to_numpy()
single_gen_result = evaluate_sg(self.model, X, y, np.arange(len(X)), self.threshold)
self.eval_results = self.compute_averages([single_gen_result])
else:
ds = self.load_dataset()
if isinstance(self, RuleBasedRelativeReasoning):
X = np.array(ds.X(exclude_id_col=False, na_replacement=None))
y = np.array(ds.y(na_replacement=np.nan))
X = ds.X(exclude_id_col=False, na_replacement=None).to_numpy()
y = ds.y(na_replacement=np.nan).to_numpy()
else:
X = np.array(ds.X(na_replacement=np.nan))
y = np.array(ds.y(na_replacement=np.nan))
X = ds.X(na_replacement=np.nan).to_numpy()
y = ds.y(na_replacement=np.nan).to_numpy()
n_splits = kwargs.get("n_splits", 20)
@ -2586,7 +2587,7 @@ class RuleBasedRelativeReasoning(PackageBasedModel):
X, y = ds.X(exclude_id_col=False, na_replacement=None), ds.y(na_replacement=None)
model = RelativeReasoning(
start_index=ds.triggered()[0],
end_index=ds.triggered()[1],
end_index=ds.triggered()[-1],
)
model.fit(X, y)
return model
@ -2596,7 +2597,7 @@ class RuleBasedRelativeReasoning(PackageBasedModel):
return {
"clz": "RuleBaseRelativeReasoning",
"start_index": ds.triggered()[0],
"end_index": ds.triggered()[1],
"end_index": ds.triggered()[-1],
}
def _save_model(self, model):
@ -2716,7 +2717,7 @@ class MLRelativeReasoning(PackageBasedModel):
start = datetime.now()
ds = self.load_dataset()
classify_ds, classify_prods = ds.classification_dataset([smiles], self.applicable_rules)
pred = self.model.predict_proba(np.array(classify_ds.X()))
pred = self.model.predict_proba(classify_ds.X().to_numpy())
res = MLRelativeReasoning.combine_products_and_probs(
self.applicable_rules, pred[0], classify_prods[0]
@ -2761,7 +2762,9 @@ class ApplicabilityDomain(EnviPathModel):
@cached_property
def training_set_probs(self):
return joblib.load(os.path.join(s.MODEL_DIR, f"{self.model.uuid}_train_probs.pkl"))
ds = self.model.load_dataset()
col_ids = ds.block_indices("prob")
return ds[ds.columns[col_ids[0]: col_ids[1]]]
def build(self):
ds = self.model.load_dataset()
@ -2769,9 +2772,9 @@ class ApplicabilityDomain(EnviPathModel):
start = datetime.now()
# Get the training-set probabilities and store them, as they're required when using the applicability domain
probs = self.model.model.predict_proba(ds.X())
f = os.path.join(s.MODEL_DIR, f"{self.model.uuid}_train_probs.pkl")
joblib.dump(probs, f)
probs = self.model.model.predict_proba(ds.X().to_numpy())
ds.add_probs(probs)
ds.save(os.path.join(s.MODEL_DIR, f"{self.model.uuid}_ds.pkl"))
ad = ApplicabilityDomainPCA(num_neighbours=self.num_neighbours)
ad.build(ds)
@ -2816,25 +2819,21 @@ class ApplicabilityDomain(EnviPathModel):
# it identifies all training structures that have the same trigger reaction activated (i.e., value 1).
# This is used to find "qualified neighbours" — training examples that share the same triggered feature
# with a given assessment structure under a particular rule.
qualified_neighbours_per_rule: Dict[int, Dict[int, List[int]]] = defaultdict(
lambda: defaultdict(list)
)
for rule_idx, feature_index in enumerate(range(*assessment_ds.triggered())):
feature = ds.columns[feature_index]
if feature.startswith("trig_"):
# TODO unroll loop
for i, cx in enumerate(assessment_ds.X(exclude_id_col=False)):
if int(cx[feature_index]) == 1:
for j, tx in enumerate(ds.X(exclude_id_col=False)):
if int(tx[feature_index]) == 1:
qualified_neighbours_per_rule[i][rule_idx].append(j)
import polars as pl
qualified_neighbours_per_rule: Dict = {}
# Select only the triggered columns
for i, row in enumerate(assessment_ds[:, assessment_ds.triggered()].iter_rows(named=True)):
# Find the rules the structure triggers. For each such rule, filter the training dataset to the rows that
# also trigger that rule, then select the structure_id of the compounds in those filtered rows.
train_trig = {col_name: ds.df.filter(pl.col(col_name).eq(1)).select("structure_id") for col_name, value in row.items() if value == 1}
qualified_neighbours_per_rule[i] = train_trig
probs = self.training_set_probs
# preds = self.model.model.predict_proba(assessment_ds.X())
preds = self.model.combine_products_and_probs(
self.model.applicable_rules,
self.model.model.predict_proba(assessment_ds.X())[0],
self.model.model.predict_proba(assessment_ds.X().to_numpy())[0],
assessment_prods[0],
)