forked from enviPath/enviPy
[Feature] PEPPER in enviPath (#332)
Co-authored-by: Tim Lorsbach <tim@lorsba.ch>
Reviewed-on: enviPath/enviPy#332
361
pepper/__init__.py
Normal file
@@ -0,0 +1,361 @@
import logging
import math
import os
import pickle
from datetime import datetime
from typing import Any, List, Optional

import polars as pl

from pydantic import computed_field
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import ShuffleSplit

# Once stable these will be exposed by enviPy-plugins lib
from envipy_additional_information import register  # noqa: I001
from bridge.contracts import Property, PropertyType  # noqa: I001
from bridge.dto import (
    BuildResult,
    EnviPyDTO,
    EvaluationResult,
    PredictedProperty,
    RunResult,
)  # noqa: I001

from .impl.pepper import Pepper  # noqa: I001

logger = logging.getLogger(__name__)


@register("pepperprediction")
class PepperPrediction(PredictedProperty):
    mean: float | None
    std: float | None
    log_mean: float | None
    log_std: float | None
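    # log_mean / log_std are the predicted mean and std of log10(half-life in days)
    # as produced by PEPPER.run(); mean and std are the same values mapped back via 10**x.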

    @computed_field
    @property
    def svg(self, xscale="linear", quantiles=(0.01, 0.99), n_points=2000) -> Optional[str]:
        """
        Plot the lognormal distribution of chemical half-lives where parameters are
        given on a base-10 log scale: log10(half-life) ~ Normal(mu_log10, sigma_log10^2).

        Shades:
        - x < p in green (Non-persistent)
        - p <= x <= vp in yellow (Persistent)
        - x > vp in red (Very persistent)

        Legend shows the shaded color and the probability mass in each region.
        """
        import io

        import matplotlib.patches as mpatches
        import numpy as np
        from matplotlib import pyplot as plt
        from scipy import stats

        sigma_log10 = self.log_std
        mu_log10 = self.log_mean

        if sigma_log10 is None or mu_log10 is None:
            return None
        if sigma_log10 <= 0:
            raise ValueError("sigma_log10 must be > 0")
        # Persistent and Very Persistent thresholds in days from REACH (https://doi.org/10.26434/chemrxiv-2025-xmslf)
        p = 120
        vp = 180

        # Convert base-10 log parameters to natural-log parameters for SciPy's lognorm
        ln10 = np.log(10.0)
        mu_ln = mu_log10 * ln10
        sigma_ln = sigma_log10 * ln10
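        # If log10(X) ~ N(mu_log10, sigma_log10^2), then ln(X) = ln(10) * log10(X),
        # so ln(X) ~ N(mu_log10 * ln(10), (sigma_log10 * ln(10))^2); these are mu_ln and sigma_ln.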

        # SciPy parameterization: lognorm(s=sigma_ln, scale=exp(mu_ln))
        dist = stats.lognorm(s=sigma_ln, scale=np.exp(mu_ln))

        # Exact probabilities
        p_green = dist.cdf(p)  # P(X < p)
        p_yellow = dist.cdf(vp) - p_green  # P(p <= X <= vp)
        p_red = 1.0 - dist.cdf(vp)  # P(X > vp)

        # Plotting range
        q_low, q_high = dist.ppf(quantiles)
        x_min = max(1e-12, min(q_low, p) * 0.9)
        x_max = max(q_high, vp) * 1.1

        # Build x-grid (linear days axis)
        if xscale == "log":
            x = np.logspace(np.log10(x_min), np.log10(x_max), n_points)
        else:
            x = np.linspace(x_min, x_max, n_points)
        y = dist.pdf(x)

        # Masks for shading
        mask_green = x < p
        mask_yellow = (x >= p) & (x <= vp)
        mask_red = x > vp

        # Plot
        fig, ax = plt.subplots(figsize=(9, 5.5))
        ax.plot(x, y, color="#1f4e79", lw=2, label="Lognormal PDF")

        if np.any(mask_green):
            ax.fill_between(x[mask_green], y[mask_green], 0, color="tab:green", alpha=0.3)
        if np.any(mask_yellow):
            ax.fill_between(x[mask_yellow], y[mask_yellow], 0, color="gold", alpha=0.35)
        if np.any(mask_red):
            ax.fill_between(x[mask_red], y[mask_red], 0, color="tab:red", alpha=0.3)

        # Threshold lines
        ax.axvline(p, color="gray", ls="--", lw=1)
        ax.axvline(vp, color="gray", ls="--", lw=1)

        # Labels & title
        ax.set_title(
            f"Half-life Distribution (Lognormal)\nlog10 parameters: μ={mu_log10:g}, σ={sigma_log10:g}"
        )
        ax.set_xlabel("Half-life (days)")
        ax.set_ylabel("Probability density")
        ax.grid(True, alpha=0.25)

        if xscale == "log":
            ax.set_xscale("log")

        # Legend with probabilities
        patches = [
            mpatches.Patch(
                color="tab:green",
                alpha=0.3,
                label=f"Non-persistent (<{p:g} d): {p_green:.2%}",
            ),
            mpatches.Patch(
                color="gold",
                alpha=0.35,
                label=f"Persistent ({p:g}–{vp:g} d): {p_yellow:.2%}",
            ),
            mpatches.Patch(
                color="tab:red",
                alpha=0.3,
                label=f"Very persistent (>{vp:g} d): {p_red:.2%}",
            ),
        ]
        ax.legend(handles=patches, frameon=True)

        plt.tight_layout()

        # --- Export to SVG string ---
        buf = io.StringIO()
        fig.savefig(buf, format="svg", bbox_inches="tight")
        svg = buf.getvalue()
        plt.close(fig)
        buf.close()

        return svg


class PEPPER(Property):
    def identifier(self) -> str:
        return "pepper"

    def display(self) -> str:
        return "PEPPER"

    def name(self) -> str:
        return "Predict Environmental Pollutant PERsistence"

    def requires_rule_packages(self) -> bool:
        return False

    def requires_data_packages(self) -> bool:
        return True

    def get_type(self) -> PropertyType:
        return PropertyType.HEAVY

    def generate_dataset(self, eP: EnviPyDTO) -> pl.DataFrame:
        """
        Generates a dataset in the form of a Polars DataFrame containing compound information, including
        SMILES strings and logarithmic values of degradation half-lives (dt50).

        The dataset is built by iterating over a list of compounds, standardizing SMILES strings, and
        calculating the logarithmic mean of the half-life intervals for different environmental scenarios
        associated with each compound.

        The resulting DataFrame will only include unique rows based on SMILES and logarithmic half-life
        values.

        Parameters:
            eP (EnviPyDTO): An object that provides access to compound data and utility functions for
                standardization and retrieval of half-life information.

        Returns:
            pl.DataFrame: The resulting dataset with unique rows containing compound structure identifiers,
                standardized SMILES strings, and logarithmic half-life values.

        Raises:
            Exception: Exceptions are caught and logged during data processing, specifically when retrieving
                half-life information.

        Note:
            - The logarithmic mean is calculated from the start and end intervals of the dt50 (half-life).
            - Compounds not associated with any half-life data are skipped, and errors encountered during
              processing are logged without halting the execution.
        """
        columns = ["structure_id", "smiles", "dt50_log"]
        rows = []

        for c in eP.get_compounds():
            hls = c.half_lifes()

            if len(hls):
                stand_smiles = eP.standardize(c.smiles, remove_stereo=True)
                for scenario, half_lives in hls.items():
                    for h in half_lives:
                        # In the original Pepper code they take the mean of the start and end interval.
                        half_mean = (h.dt50.start + h.dt50.end) / 2
                        rows.append([str(c.url), stand_smiles, math.log10(half_mean)])

        df = pl.DataFrame(data=rows, schema=columns, orient="row", infer_schema_length=None)

        df = df.unique(subset=["smiles", "dt50_log"], keep="any", maintain_order=False)

        return df

    def save_dataset(self, df: pl.DataFrame, path: str):
        with open(path, "wb") as fh:
            pickle.dump(df, fh)

    def load_dataset(self, path: str) -> pl.DataFrame:
        with open(path, "rb") as fh:
            return pickle.load(fh)

    def build(self, eP: EnviPyDTO, *args, **kwargs) -> BuildResult | None:
        logger.info(f"Start building PEPPER {eP.get_context().uuid}")
        df = self.generate_dataset(eP)

        if df.shape[0] == 0:
            raise ValueError("No data found for building model")

        p = Pepper()

        p, train_ds = p.train_model(df)

        ds_store_path = os.path.join(
            eP.get_context().work_dir, f"pepper_ds_{eP.get_context().uuid}.pkl"
        )
        self.save_dataset(train_ds, ds_store_path)

        model_store_path = os.path.join(
            eP.get_context().work_dir, f"pepper_{eP.get_context().uuid}.pkl"
        )
        p.save_model(model_store_path)
        logger.info(f"Finished building PEPPER {eP.get_context().uuid}")

    def run(self, eP: EnviPyDTO, *args, **kwargs) -> RunResult:
        load_path = os.path.join(eP.get_context().work_dir, f"pepper_{eP.get_context().uuid}.pkl")

        p = Pepper.load_model(load_path)

        X_new = [c.smiles for c in eP.get_compounds()]

        predictions = p.predict_batch(X_new)
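        # predict_batch returns a pair of lists [means, stds] aligned with X_new, with
        # None entries for compounds whose descriptors could not be computed, so
        # zip(*predictions) yields one (log-mean, log-std) pair per compound.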

        results = []

        for pred in zip(*predictions):
            if pred[0] is None or pred[1] is None:
                result = {"log_mean": None, "mean": None, "log_std": None, "std": None}
            else:
                result = {
                    "log_mean": pred[0],
                    "mean": 10 ** pred[0],
                    "log_std": pred[1],
                    "std": 10 ** pred[1],
                }

            results.append(PepperPrediction(**result))

        rr = RunResult(
            producer=eP.get_context().url,
            description=f"Generated at {datetime.now()}",
            result=results,
        )

        return rr

    def evaluate(self, eP: EnviPyDTO, *args, **kwargs) -> EvaluationResult | None:
        logger.info(f"Start evaluating PEPPER {eP.get_context().uuid}")
        load_path = os.path.join(eP.get_context().work_dir, f"pepper_{eP.get_context().uuid}.pkl")

        p = Pepper.load_model(load_path)

        df = self.generate_dataset(eP)
        ds = p.preprocess_data(df)

        y_pred = p.predict_batch(ds["smiles"])

        # We only need the mean
        if isinstance(y_pred, (tuple, list)):
            y_pred = y_pred[0]

        res = self.eval_stats(ds["dt50_bayesian_mean"], y_pred)

        logger.info(f"Finished evaluating PEPPER {eP.get_context().uuid}")
        return EvaluationResult(data=res)

    def build_and_evaluate(self, eP: EnviPyDTO, *args, **kwargs) -> EvaluationResult | None:
        logger.info(f"Start evaluating PEPPER {eP.get_context().uuid}")
        ds_load_path = os.path.join(
            eP.get_context().work_dir, f"pepper_ds_{eP.get_context().uuid}.pkl"
        )
        ds = self.load_dataset(ds_load_path)

        n_splits = kwargs.get("n_splits", 20)
        shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)
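        # ShuffleSplit draws n_splits (default 20) independent random 90/10 train/test
        # partitions; unlike k-fold CV the test sets may overlap between iterations.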

        fold_metrics: List[dict[str, Any]] = []
        for split_id, (train_index, test_index) in enumerate(shuff.split(ds)):
            logger.info(f"Evaluation fold {split_id + 1}/{n_splits} PEPPER {eP.get_context().uuid}")
            train = ds[train_index]
            test = ds[test_index]
            model = Pepper()
            model.train_model(train, preprocess=False)

            features = test[model.descriptors.get_descriptor_names()].rows()
            y_pred = model.predict_batch(features, is_smiles=False)

            # We only need the mean for eval statistics, but mean and std can be returned
            if isinstance(y_pred, (tuple, list)):
                y_pred = y_pred[0]

            # Remove None if they occur
            y_true_filtered, y_pred_filtered = [], []
            for t, pred in zip(test["dt50_bayesian_mean"], y_pred):
                if pred is None:
                    continue
                y_true_filtered.append(t)
                y_pred_filtered.append(pred)

            if len(y_true_filtered) == 0:
                logger.warning("Skipping empty fold")
                continue

            fold_metrics.append(self.eval_stats(y_true_filtered, y_pred_filtered))

        logger.info(f"Finished evaluating PEPPER {eP.get_context().uuid}")
        return EvaluationResult(data=fold_metrics)

    @staticmethod
    def eval_stats(y_true, y_pred) -> dict[str, float]:
        scores_dic = {
            "r2": r2_score(y_true, y_pred),
            "mse": mean_squared_error(y_true, y_pred),
            "rmse": root_mean_squared_error(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
        }
        return scores_dic
0
pepper/impl/__init__.py
Normal file
196
pepper/impl/bayesian.py
Normal file
@@ -0,0 +1,196 @@
import emcee
import numpy as np
from scipy.stats import lognorm, norm


class Bayesian:
    def __init__(self, y, comment_list=None):
        if comment_list is None:
            comment_list = []
        self.y = y
        self.comment_list = comment_list
        # LOQ default settings
        self.LOQ_lower = -1  # (2.4 hours)
        self.LOQ_upper = 3  # 1000 days
        # prior default settings
        self.prior_mu_mean = 1.5
        self.prior_mu_std = 2
        self.prior_sigma_mean = 0.4
        self.prior_sigma_std = 0.4
        self.lower_limit_sigma = 0.2
        # EMCEE defaults
        self.nwalkers = 10
        self.iterations = 2000
        self.burn_in = 100
        ndim = 2  # number of dimensions (mean, std)
        # backend = emcee.backends.HDFBackend("backend.h5")
        # backend.reset(self.nwalkers, ndim)
        self.sampler = emcee.EnsembleSampler(self.nwalkers, ndim, self.logPosterior)
        self.posterior_mu = None
        self.posterior_sigma = None

    def get_censored_values_only(self):
        censored_values = []
        for i, comment in enumerate(self.comment_list):
            if comment in ["<", ">"]:
                censored_values.append(self.y[i])
            elif self.y[i] > self.LOQ_upper or self.y[i] < self.LOQ_lower:
                censored_values.append(self.y[i])
        return censored_values

    # Class functions
    def determine_LOQ(self):
        """
        Determines if the LOQ is upper or lower, and the value (if not default)
        :return: upper_LOQ, lower_LOQ
        """

        censored_values = self.get_censored_values_only()

        # Find upper LOQ
        upper_LOQ = np.nan
        # bigger than global LOQ
        if max(self.y) >= self.LOQ_upper:
            upper_LOQ = self.LOQ_upper
        # case if exactly 365 days
        elif max(self.y) == 2.562:  # 365 days
            upper_LOQ = 2.562
            self.LOQ_upper = upper_LOQ
        # case if "bigger than" indication in comments
        elif ">" in self.comment_list:
            i = 0
            while i < len(self.y):
                if self.y[i] == min(censored_values) and self.comment_list[i] == ">":
                    self.LOQ_upper = self.y[i]
                    break
                i += 1

        # Find lower LOQ
        lower_LOQ = np.nan
        # smaller than global LOQ
        if min(self.y) <= self.LOQ_lower:
            lower_LOQ = self.LOQ_lower
        # case if exactly 1 day
        elif min(self.y) == 0:  # 1 day
            lower_LOQ = 0
            self.LOQ_lower = 0
        # case if "smaller than" indication in comments
        elif "<" in self.comment_list:
            i = 0
            while i < len(self.y):
                if self.y[i] == max(censored_values) and self.comment_list[i] == "<":
                    self.LOQ_lower = self.y[i]
                    break
                i += 1
        return upper_LOQ, lower_LOQ

    def logLikelihood(self, theta, sigma):
        """
        Likelihood function (the probability of the observations given the model parameters).
        Observations that are not censored are treated as numeric values.
        :param theta: mean half-life value to be evaluated
        :param sigma: std half-life value to be evaluated
        :return: log_likelihood
        """
        upper_LOQ, lower_LOQ = self.determine_LOQ()

        n_censored_upper = 0
        n_censored_lower = 0
        y_not_cen = []

        if np.isnan(upper_LOQ) and np.isnan(lower_LOQ):
            y_not_cen = self.y
        else:
            for i in self.y:
                if not np.isnan(upper_LOQ) and i >= upper_LOQ:  # censor above threshold
                    n_censored_upper += 1
                elif not np.isnan(lower_LOQ) and i <= lower_LOQ:  # censor below threshold
                    n_censored_lower += 1
                else:  # do not censor
                    y_not_cen.append(i)

        LL_left_cen = 0
        LL_right_cen = 0
        LL_not_cen = 0

        if n_censored_lower > 0:  # loglikelihood for left censored observations
            LL_left_cen = n_censored_lower * norm.logcdf(
                lower_LOQ, loc=theta, scale=sigma
            )  # cumulative distribution function CDF

        if n_censored_upper > 0:  # loglikelihood for right censored observations
            LL_right_cen = n_censored_upper * norm.logsf(
                upper_LOQ, loc=theta, scale=sigma
            )  # survival function (1-CDF)

        if len(y_not_cen) > 0:  # loglikelihood for uncensored values
            LL_not_cen = sum(
                norm.logpdf(y_not_cen, loc=theta, scale=sigma)
            )  # probability density function PDF

        return LL_left_cen + LL_not_cen + LL_right_cen

    def get_prior_probability_sigma(self, sigma):
        # convert mean and sd to logspace parameters; for the formula see
        # https://en.wikipedia.org/wiki/Log-normal_distribution (Method of moments section)
        temp = 1 + (self.prior_sigma_std / self.prior_sigma_mean) ** 2
        meanlog = self.prior_sigma_mean / np.sqrt(temp)
        sdlog = np.sqrt(np.log(temp))
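        # Moment matching: for a lognormal with mean m and sd s,
        # sigma_ln^2 = ln(1 + (s/m)^2) and exp(mu_ln) = m / sqrt(1 + (s/m)^2),
        # so lognorm(s=sdlog, scale=meanlog) reproduces the prior mean and sd
        # (here additionally shifted by loc=lower_limit_sigma).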
        # log-density of sigma under the lognormal prior
        norm_pdf_sigma = lognorm.logpdf(sigma, s=sdlog, loc=self.lower_limit_sigma, scale=meanlog)
        return norm_pdf_sigma

    def get_prior_probability_theta(self, theta):
        norm_pdf_theta = norm.logpdf(theta, loc=self.prior_mu_mean, scale=self.prior_mu_std)
        return norm_pdf_theta

    def logPrior(self, par):
        """
        Obtain prior loglikelihood of [theta, sigma]
        :param par: par = [theta, sigma]
        :return: loglikelihood
        """
        # calculate the mean and standard deviation in the log-space
        norm_pdf_mean = self.get_prior_probability_theta(par[0])
        norm_pdf_std = self.get_prior_probability_sigma(par[1])
        log_norm_pdf = [norm_pdf_mean, norm_pdf_std]
        return sum(log_norm_pdf)

    def logPosterior(self, par):
        """
        Obtain posterior loglikelihood
        :param par: [theta, sigma]
        :return: posterior loglikelihood
        """
        logpri = self.logPrior(par)
        if not np.isfinite(logpri):
            return -np.inf
        loglikelihood = self.logLikelihood(par[0], par[1])
        return logpri + loglikelihood

    def get_posterior_distribution(self):
        """
        Sample posterior distribution and get median of mean and std samples
        :return: posterior half-life mean and std
        """
        if self.posterior_mu:
            return self.posterior_mu, self.posterior_sigma

        # Sampler parameters
        ndim = 2  # number of dimensions (mean, std)
        p0 = abs(np.random.randn(self.nwalkers, ndim))  # only positive starting numbers (for std)

        # Sample distribution
        self.sampler.run_mcmc(p0, self.iterations)
        # get chain and log_prob as a one-dimensional array (chains merged, burn-in discarded)
        samples = self.sampler.get_chain(flat=True, discard=self.burn_in)
        # get median mean and std
        self.posterior_mu = np.median(samples[:, 0])
        self.posterior_sigma = np.median(samples[:, 1])
        return self.posterior_mu, self.posterior_sigma


# Utility functions
def get_normal_distribution(x, mu, sig):
    return np.exp(-np.power(x - mu, 2.0) / (2 * np.power(sig, 2.0)))
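

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, mirroring the __main__ block in
    # descriptors.py). The log10 half-life values below are made-up example data.
    example_dt50_log = [1.2, 1.5, 0.9, 2.0]  # hypothetical log10(half-life in days)
    b = Bayesian(example_dt50_log)
    mu, sigma = b.get_posterior_distribution()
    print(f"posterior mean={mu:.2f}, std={sigma:.2f}")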
11
pepper/impl/config/regressor_settings_singlevalue_soil_paper_GPR_optimized.yml
Normal file
@@ -0,0 +1,11 @@
GPR:
  name: Gaussian Process Regressor
  regressor: GaussianProcessRegressor
  regressor_params:
    normalize_y: True
    n_restarts_optimizer: 0
    kernel: "ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=2.5, length_scale_bounds=(1e-3, 1e3), nu=0.5)"
  feature_reduction_method: None
  feature_reduction_parameters:
    pca:
      n_components: 34
60
pepper/impl/descriptors.py
Normal file
@@ -0,0 +1,60 @@
from abc import ABC, abstractmethod
from typing import List

from mordred import Calculator, descriptors
from padelpy import from_smiles
from rdkit import Chem


class Descriptor(ABC):
    @abstractmethod
    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        pass

    @abstractmethod
    def get_descriptor_names(self) -> List[str]:
        pass


class Mordred(Descriptor):
    calc = Calculator(descriptors, ignore_3D=True)

    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        mol = Chem.MolFromSmiles(molecule)
        if mol is None:
            # Invalid SMILES: callers (e.g. Pepper.preprocess_data) filter out null descriptors
            return None
        res = list(self.calc(mol))
        return res

    def get_descriptor_names(self) -> List[str]:
        return [f"Mordred_{i}" for i in range(len(self.calc.descriptors))]


class PaDEL(Descriptor):
    calc = Calculator(descriptors)

    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        try:
            padel_descriptors = from_smiles(molecule, threads=1)
        except RuntimeError:
            return []

        formatted = []
        for k, v in padel_descriptors.items():
            try:
                formatted.append(float(v))
            except ValueError:
                formatted.append(0.0)

        return formatted

    def get_descriptor_names(self) -> List[str]:
        return [f"PaDEL_{i}" for i in range(1875)]


if __name__ == "__main__":
    mol = "CC1=CC(O)=CC=C1[N+](=O)[O-]"

    m = Mordred()
    print(list(m.get_molecule_descriptors(mol)))

    p = PaDEL()
    print(list(p.get_molecule_descriptors(mol)))
329
pepper/impl/pepper.py
Normal file
@@ -0,0 +1,329 @@
import importlib.resources
import logging
import math
import os
import pickle
from collections import defaultdict
from typing import List

import numpy as np
import polars as pl
import yaml
from joblib import Parallel, delayed
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

from .bayesian import Bayesian
from .descriptors import Mordred


class Pepper:
    def __init__(self, config_path=None, random_state=42):
        self.random_state = random_state
        if config_path is None:
            config_path = importlib.resources.files("pepper.impl.config").joinpath(
                "regressor_settings_singlevalue_soil_paper_GPR_optimized.yml"
            )
        with open(config_path, "r") as file:
            regressor_settings = yaml.safe_load(file)
        if len(regressor_settings) > 1:
            logging.warning(
                f"More than one regressor config found in {config_path}, using the first one"
            )
        self.regressor_settings = regressor_settings[list(regressor_settings.keys())[0]]
        if "kernel" in self.regressor_settings["regressor_params"]:
            from sklearn.gaussian_process.kernels import ConstantKernel, Matern  # noqa: F401

            # We could hard-code the kernels they have, maybe better than using eval
            self.regressor_settings["regressor_params"]["kernel"] = eval(
                self.regressor_settings["regressor_params"]["kernel"]
            )
        # We assume the YAML has the key regressor containing a regressor name
        self.regressor = self.get_regressor_by_name(self.regressor_settings["regressor"])
        if "regressor_params" in self.regressor_settings:  # Set params if any are given
            self.regressor.set_params(**self.regressor_settings["regressor_params"])

        # TODO we could make this configurable
        self.descriptors = Mordred()
        self.descriptor_subset = None

        self.min_max_scaler = MinMaxScaler().set_output(transform="polars")
        self.feature_preselector = Pipeline(
            [
                # Feature selection based on variance threshold
                (
                    "variance_threshold",
                    VarianceThreshold(threshold=0.02).set_output(transform="polars"),
                ),
                # Feature selection based on correlation clustering
                (
                    "custom_feature_selection",
                    FunctionTransformer(
                        func=self.remove_highly_correlated_features,
                        validate=False,
                        kw_args={"corr_method": "spearman", "cluster_threshold": 0.01},
                    ).set_output(transform="polars"),
                ),
            ]
        )

    def get_regressor_by_name(self, regressor_string):
        """
        Load regressor function from a regressor name
        :param regressor_string: name of regressor as defined in config file
        :return: Regressor object
        """
        # if regressor_string == 'RandomForestRegressor':
        #     return RandomForestRegressor(random_state=self.random_state)
        # elif regressor_string == 'GradientBoostingRegressor':
        #     return GradientBoostingRegressor(random_state=self.random_state)
        # elif regressor_string == 'AdaBoostRegressor':
        #     return AdaBoostRegressor(random_state=self.random_state)
        # elif regressor_string == 'MLPRegressor':
        #     return MLPRegressor(random_state=self.random_state)
        # elif regressor_string == 'SVR':
        #     return SVR()
        # elif regressor_string == 'KNeighborsRegressor':
        #     return KNeighborsRegressor()
        if regressor_string == "GaussianProcessRegressor":
            return GaussianProcessRegressor(random_state=self.random_state)
        # elif regressor_string == 'DecisionTreeRegressor':
        #     return DecisionTreeRegressor(random_state=self.random_state)
        # elif regressor_string == 'Ridge':
        #     return Ridge(random_state=self.random_state)
        # elif regressor_string == 'SGDRegressor':
        #     return SGDRegressor(random_state=self.random_state)
        # elif regressor_string == 'KernelRidge':
        #     return KernelRidge()
        # elif regressor_string == 'LinearRegression':
        #     return LinearRegression()
        # elif regressor_string == 'LSVR':
        #     return SVR(kernel='linear')  # Linear Support Vector Regressor
        else:
            raise NotImplementedError(
                f"No regressor type defined for regressor_string = {regressor_string}"
            )

    def train_model(self, train_data, preprocess=True):
        """
        Fit self.regressor and preprocessors. train_data is a pl.DataFrame
        """
        if preprocess:
            # Compute the mean and std of half-lives per structure
            train_data = self.preprocess_data(train_data)

        # train_data structure:
        # columns = [
        #     "structure_id",
        #     "smiles",
        #     "dt50_log",
        #     "dt50_bayesian_mean",
        #     "dt50_bayesian_std",
        # ] + self.descriptors.get_descriptor_names()

        # only select descriptor features for feature preselector
        df = train_data[self.descriptors.get_descriptor_names()]

        # Remove columns containing at least one None, NaN, inf or "" value
        df = Pepper.keep_clean_columns(df)

        # Scale, then remove highly correlated features as well as features with low variance
        x_train_normal = self.min_max_scaler.fit_transform(df)
        x_train_normal = self.feature_preselector.fit_transform(x_train_normal)

        # Store subset, as this is the input used for prediction
        self.descriptor_subset = x_train_normal.columns

        y_train = train_data["dt50_bayesian_mean"].to_numpy()
        y_train_std = train_data["dt50_bayesian_std"].to_numpy()
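        # GaussianProcessRegressor's alpha is added to the diagonal of the kernel matrix
        # during fitting and may be given per sample; passing the Bayesian posterior std
        # here treats structures with more uncertain half-lives as noisier observations.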
        self.regressor.set_params(alpha=y_train_std)
        self.regressor.fit(x_train_normal, y_train)

        return self, train_data

    @staticmethod
    def keep_clean_columns(df: pl.DataFrame) -> pl.DataFrame:
        """
        Filters out columns from the DataFrame that contain null values, NaN, or infinite values.

        This static method takes a DataFrame as input and evaluates each of its columns to determine
        if the column contains invalid values. Columns that have null values, NaN, or infinite values
        are excluded from the resulting DataFrame. The method is especially useful for cleaning up a
        dataset by keeping only the valid columns.

        Parameters:
            df (polars.DataFrame): The input DataFrame to be cleaned.

        Returns:
            polars.DataFrame: A DataFrame containing only columns without null, NaN, or infinite values.
        """
        valid_cols = []

        for col in df.columns:
            s = df[col]

            # Check nulls
            has_null = s.null_count() > 0

            # Check NaN and inf only for numeric columns
            if s.dtype.is_numeric():
                has_nan = s.is_nan().any()
                has_inf = s.is_infinite().any()
            else:
                has_nan = False
                has_inf = False

            if not (has_null or has_nan or has_inf):
                valid_cols.append(col)

        return df.select(valid_cols)

    def preprocess_data(self, dataset):
        groups = [group for group in dataset.group_by("structure_id")]

        # Unless explicitly set, compute everything serially
        n_threads = int(os.environ.get("N_PEPPER_THREADS", 1))
        if n_threads > 1:
            results = Parallel(n_jobs=n_threads)(
                delayed(compute_bayes_per_group)(group[1]) for group in groups
            )
        else:
            results = []
            for g in groups:
                results.append(compute_bayes_per_group(g[1]))

        bayes_stats = pl.concat(results, how="vertical")
        dataset = dataset.join(bayes_stats, on="structure_id", how="left")

        # Remove duplicates after calculating mean, std
        dataset = dataset.unique(subset="structure_id")

        # Calculate features and store them in a "desc" list column
        dataset = dataset.with_columns(
            pl.col("smiles")
            .map_elements(
                self.descriptors.get_molecule_descriptors, return_dtype=pl.List(pl.Float64)
            )
            .alias("desc")
        )

        # If a SMILES fails to get desc it is removed
        dataset = dataset.filter(pl.col("desc").is_not_null() & (pl.col("desc").list.len() > 0))

        # Flatten the features into the dataset
        dataset = dataset.with_columns(
            pl.col("desc").list.to_struct(fields=self.descriptors.get_descriptor_names())
        ).unnest("desc")

        return dataset

    def predict_batch(self, batch: List[str], is_smiles: bool = True) -> List[List[float | None]]:
        if is_smiles:
            rows = [self.descriptors.get_molecule_descriptors(smiles) for smiles in batch]
        else:
            rows = batch

        # Create DataFrame with all descriptors
        initial_desc_rows_df = pl.DataFrame(
            data=rows, schema=self.descriptors.get_descriptor_names(), orient="row"
        )

        # Before checking for invalid values per row, select only required columns
        initial_desc_rows_df = initial_desc_rows_df.select(
            list(self.min_max_scaler.feature_names_in_)
        )

        to_pad = []
        adjusted_rows = []
        for i, row in enumerate(initial_desc_rows_df.rows()):
            # neither infs nor nans are found -> row seems to be valid input
            if row and not any(math.isinf(x) for x in row) and not any(math.isnan(x) for x in row):
                adjusted_rows.append(row)
            else:
                to_pad.append(i)

        if adjusted_rows:
            desc_rows_df = pl.DataFrame(
                data=adjusted_rows, schema=list(self.min_max_scaler.feature_names_in_), orient="row"
            )
            x_normal = self.min_max_scaler.transform(desc_rows_df)
            x_normal = x_normal[self.descriptor_subset]

            res = self.regressor.predict(x_normal, return_std=True)

            # Convert to lists
            res = [list(res[0]), list(res[1])]

            # If we had rows containing bad input (inf, nan) insert Nones at the correct position
            if to_pad:
                for i in to_pad:
                    res[0].insert(i, None)
                    res[1].insert(i, None)

            return res

        else:
            return [[None] * len(batch), [None] * len(batch)]

    @staticmethod
    def remove_highly_correlated_features(
        X_train,
        corr_method: str = "spearman",
        cluster_threshold: float = 0.01,
        ignore=False,
    ):
        if ignore:
            return X_train
        else:
            # Using spearmanr from scipy to achieve pandas.corr in polars
            corr = spearmanr(X_train, axis=0).statistic

            # Ensure the correlation matrix is symmetric
            corr = (corr + corr.T) / 2
            np.fill_diagonal(corr, 1)
            corr = np.nan_to_num(corr)

            # code from https://scikit-learn.org/stable/auto_examples/inspection/
            # plot_permutation_importance_multicollinear.html
            # We convert the correlation matrix to a distance matrix before performing
            # hierarchical clustering using Ward's linkage.
            distance_matrix = 1 - np.abs(corr)
            dist_linkage = hierarchy.ward(squareform(distance_matrix))

            cluster_ids = hierarchy.fcluster(dist_linkage, cluster_threshold, criterion="distance")
            cluster_id_to_feature_ids = defaultdict(list)

            for idx, cluster_id in enumerate(cluster_ids):
                cluster_id_to_feature_ids[cluster_id].append(idx)

            my_selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
            X_train_sel = X_train[:, my_selected_features]

            return X_train_sel

    def save_model(self, path):
        with open(path, "wb") as save_file:
            pickle.dump(self, save_file, protocol=5)

    @staticmethod
    def load_model(path) -> "Pepper":
        with open(path, "rb") as load_file:
            return pickle.load(load_file)


def compute_bayes_per_group(group):
    """Get mean and std using bayesian"""
    mean, std = Bayesian(group["dt50_log"]).get_posterior_distribution()
    return pl.DataFrame(
        {
            "structure_id": [group["structure_id"][0]],
            "dt50_bayesian_mean": [mean],
            "dt50_bayesian_std": [std],
        }
    )
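

if __name__ == "__main__":
    # Illustrative sketch only: construct Pepper from the bundled default config and
    # inspect the configured regressor. The commented load/predict lines assume a model
    # pickle previously written by Pepper.save_model(); the path is hypothetical.
    p = Pepper()
    print(type(p.regressor).__name__, p.regressor.kernel)

    # model = Pepper.load_model("pepper_model.pkl")
    # means, stds = model.predict_batch(["CCO", "c1ccccc1O"])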