[Feature] PEPPER in enviPath (#332)

Co-authored-by: Tim Lorsbach <tim@lorsba.ch>
Reviewed-on: enviPath/enviPy#332
This commit is contained in:
2026-03-06 22:11:22 +13:00
parent 6e00926371
commit c6ff97694d
43 changed files with 3793 additions and 371 deletions

0
pepper/impl/__init__.py Normal file
View File

196
pepper/impl/bayesian.py Normal file
View File

@ -0,0 +1,196 @@
import emcee
import numpy as np
from scipy.stats import lognorm, norm
class Bayesian:
    """Bayesian estimation of the (mean, std) of log10 half-life observations.

    Supports left/right censored observations (values beyond a limit of
    quantification, LOQ) and samples the posterior with an emcee ensemble
    sampler. All half-life values are assumed to be in log10(days).
    """

    def __init__(self, y, comment_list=None):
        """
        :param y: log10 half-life observations (sequence of floats)
        :param comment_list: optional per-observation censoring markers,
            parallel to y ("<" = below LOQ, ">" = above LOQ)
        """
        if comment_list is None:
            comment_list = []
        self.y = y
        self.comment_list = comment_list
        # LOQ default settings (log10 days)
        self.LOQ_lower = -1  # (2.4 hours)
        self.LOQ_upper = 3  # 1000 days
        # prior default settings
        self.prior_mu_mean = 1.5
        self.prior_mu_std = 2
        self.prior_sigma_mean = 0.4
        self.prior_sigma_std = 0.4
        self.lower_limit_sigma = 0.2
        # EMCEE defaults
        self.nwalkers = 10
        self.iterations = 2000
        self.burn_in = 100
        ndim = 2  # number of dimensions (mean, std)
        # backend = emcee.backends.HDFBackend("backend.h5")
        # backend.reset(self.nwalkers, ndim)
        self.sampler = emcee.EnsembleSampler(self.nwalkers, ndim, self.logPosterior)
        # Cached posterior medians; filled by get_posterior_distribution()
        self.posterior_mu = None
        self.posterior_sigma = None

    def get_censored_values_only(self):
        """Return observations flagged as censored.

        An observation counts as censored when its comment is "<" or ">",
        or when its value lies outside [LOQ_lower, LOQ_upper].

        NOTE(review): only the first len(comment_list) observations are
        scanned; observations without a comment entry are never checked —
        confirm comment_list is always parallel to y.
        """
        censored_values = []
        for i, comment in enumerate(self.comment_list):
            if comment in ["<", ">"]:
                censored_values.append(self.y[i])
            elif self.y[i] > self.LOQ_upper or self.y[i] < self.LOQ_lower:
                censored_values.append(self.y[i])
        return censored_values

    # Class functions
    def determine_LOQ(self):
        """
        Determines if the LOQ is upper or lower, and the value (if not default).

        :return: (upper_LOQ, lower_LOQ) — np.nan on a side with no censoring
        """
        censored_values = self.get_censored_values_only()
        # Find upper LOQ
        upper_LOQ = np.nan
        # bigger than global LOQ
        if max(self.y) >= self.LOQ_upper:
            upper_LOQ = self.LOQ_upper
        # case if exactly 365 days
        elif max(self.y) == 2.562:  # 365 days
            upper_LOQ = 2.562
            self.LOQ_upper = upper_LOQ
        # case if "bigger than" indication in comments
        elif ">" in self.comment_list:
            i = 0
            while i < len(self.y):
                # NOTE(review): min() runs over *all* censored values (both
                # sides), and only self.LOQ_upper is updated — the local
                # upper_LOQ returned by this call stays NaN. Verify intended.
                if self.y[i] == min(censored_values) and self.comment_list[i] == ">":
                    self.LOQ_upper = self.y[i]
                    break
                i += 1
        # Find lower LOQ
        lower_LOQ = np.nan
        # smaller than global LOQ
        if min(self.y) <= self.LOQ_lower:
            lower_LOQ = self.LOQ_lower
        # case if exactly 1 day
        elif min(self.y) == 0:  # 1 day
            lower_LOQ = 0
            self.LOQ_lower = 0
        # case if "smaller than" indication in comments
        elif "<" in self.comment_list:
            i = 0
            while i < len(self.y):
                # NOTE(review): same asymmetry as above — only the attribute
                # is updated, the returned lower_LOQ stays NaN.
                if self.y[i] == max(censored_values) and self.comment_list[i] == "<":
                    self.LOQ_lower = self.y[i]
                    break
                i += 1
        return upper_LOQ, lower_LOQ

    def logLikelihood(self, theta, sigma):
        """
        Log-likelihood of the data given normal parameters (theta, sigma),
        with censored observations contributing through the CDF / survival
        function instead of the density.

        :param theta: mean half-life value (log10) to be evaluated
        :param sigma: std of half-life value to be evaluated
        :return: log_likelihood
        """
        upper_LOQ, lower_LOQ = self.determine_LOQ()
        n_censored_upper = 0
        n_censored_lower = 0
        y_not_cen = []
        if np.isnan(upper_LOQ) and np.isnan(lower_LOQ):
            y_not_cen = self.y
        else:
            # BUGFIX: the original tested `np.isnan(upper_LOQ) and i >= upper_LOQ`,
            # which is always False (NaN comparisons are False, and when the LOQ
            # is finite the isnan check fails), so censored counts were always 0;
            # the missing elif also pushed upper-censored values into y_not_cen.
            for obs in self.y:
                if not np.isnan(upper_LOQ) and obs >= upper_LOQ:  # censor above threshold
                    n_censored_upper += 1
                elif not np.isnan(lower_LOQ) and obs <= lower_LOQ:  # censor below threshold
                    n_censored_lower += 1
                else:  # do not censor
                    y_not_cen.append(obs)
        LL_left_cen = 0
        LL_right_cen = 0
        LL_not_cen = 0
        if n_censored_lower > 0:  # loglikelihood for left censored observations
            LL_left_cen = n_censored_lower * norm.logcdf(
                lower_LOQ, loc=theta, scale=sigma
            )  # cumulative distribution function CDF
        if n_censored_upper > 0:  # loglikelihood for right censored observations
            LL_right_cen = n_censored_upper * norm.logsf(
                upper_LOQ, loc=theta, scale=sigma
            )  # survival function (1-CDF)
        if len(y_not_cen) > 0:  # loglikelihood for uncensored values
            LL_not_cen = sum(
                norm.logpdf(y_not_cen, loc=theta, scale=sigma)
            )  # probability density function PDF
        return LL_left_cen + LL_not_cen + LL_right_cen

    def get_prior_probability_sigma(self, sigma):
        """Log prior density of sigma under a shifted lognormal prior."""
        # convert mean and sd to logspace parameters, to see this formula check
        # https://en.wikipedia.org/wiki/Log-normal_distribution under Method of moments section
        temp = 1 + (self.prior_sigma_std / self.prior_sigma_mean) ** 2
        meanlog = self.prior_sigma_mean / np.sqrt(temp)
        sdlog = np.sqrt(np.log(temp))
        # logpdf of sigma; loc shifts the support to sigma > lower_limit_sigma
        norm_pdf_sigma = lognorm.logpdf(sigma, s=sdlog, loc=self.lower_limit_sigma, scale=meanlog)
        return norm_pdf_sigma

    def get_prior_probability_theta(self, theta):
        """Log prior density of theta under a normal prior."""
        norm_pdf_theta = norm.logpdf(theta, loc=self.prior_mu_mean, scale=self.prior_mu_std)
        return norm_pdf_theta

    def logPrior(self, par):
        """
        Obtain prior loglikelihood of [theta, sigma].

        :param par: par = [theta, sigma]
        :return: loglikelihood
        """
        norm_pdf_mean = self.get_prior_probability_theta(par[0])
        norm_pdf_std = self.get_prior_probability_sigma(par[1])
        return norm_pdf_mean + norm_pdf_std

    def logPosterior(self, par):
        """
        Obtain posterior loglikelihood (prior + likelihood).

        :param par: [theta, sigma]
        :return: posterior loglikelihood (-inf when the prior is not finite)
        """
        logpri = self.logPrior(par)
        if not np.isfinite(logpri):
            return -np.inf
        loglikelihood = self.logLikelihood(par[0], par[1])
        return logpri + loglikelihood

    def get_posterior_distribution(self):
        """
        Sample posterior distribution and get median of mean and std samples.
        Results are cached on the instance after the first call.

        :return: posterior half-life mean and std
        """
        # BUGFIX: `if self.posterior_mu:` treated a legitimate 0.0 median as
        # "not cached"; compare against None instead.
        if self.posterior_mu is not None:
            return self.posterior_mu, self.posterior_sigma
        # Sampler parameters
        ndim = 2  # number of dimensions (mean, std)
        p0 = abs(np.random.randn(self.nwalkers, ndim))  # only positive starting numbers (for std)
        # Sample distribution
        self.sampler.run_mcmc(p0, self.iterations)
        # get chain and log_prob in one-dimensional array (merged chains with burn-in)
        # BUGFIX: use the configured burn-in instead of the hard-coded 100
        samples = self.sampler.get_chain(flat=True, discard=self.burn_in)
        # get median mean and std
        self.posterior_mu = np.median(samples[:, 0])
        self.posterior_sigma = np.median(samples[:, 1])
        return self.posterior_mu, self.posterior_sigma
# Utility functions
def get_normal_distribution(x, mu, sig):
    """Unnormalized Gaussian bell curve exp(-(x - mu)^2 / (2 * sig^2)).

    Note: the 1/(sig*sqrt(2*pi)) normalization factor is omitted, so the
    value at x == mu is exactly 1.
    """
    z = (x - mu) / sig
    return np.exp(-0.5 * z * z)

View File

@ -0,0 +1,11 @@
GPR:
name: Gaussian Process Regressor
regressor: GaussianProcessRegressor
regressor_params:
normalize_y: True
n_restarts_optimizer: 0
kernel: "ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=2.5, length_scale_bounds=(1e-3, 1e3), nu=0.5)"
feature_reduction_method: None  # NOTE: YAML parses bare None as the string "None", not null — use `null` or `~` if a real null is intended
feature_reduction_parameters:
pca:
n_components: 34

View File

@ -0,0 +1,60 @@
from abc import ABC, abstractmethod
from typing import List
from mordred import Calculator, descriptors
from padelpy import from_smiles
from rdkit import Chem
class Descriptor(ABC):
@abstractmethod
def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
pass
@abstractmethod
def get_descriptor_names(self) -> List[str]:
pass
class Mordred(Descriptor):
    """Descriptor implementation backed by the mordred package."""

    # Shared calculator; 3D descriptors are skipped (ignore_3D=True).
    calc = Calculator(descriptors, ignore_3D=True)

    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        """Compute all 2D mordred descriptors for a SMILES string."""
        parsed = Chem.MolFromSmiles(molecule)
        return list(self.calc(parsed))

    def get_descriptor_names(self) -> List[str]:
        """Generic names Mordred_0..Mordred_{n-1}, one per calculator descriptor."""
        return [f"Mordred_{i}" for i in range(len(self.calc.descriptors))]
class PaDEL(Descriptor):
    """Descriptor implementation backed by padelpy's from_smiles."""

    # NOTE(review): this is a *mordred* Calculator and is never used by any
    # method below — looks like a copy-paste from Mordred; confirm and remove.
    calc = Calculator(descriptors)

    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        """Compute PaDEL descriptors for a SMILES string.

        Returns an empty list when PaDEL fails with a RuntimeError; values
        that cannot be parsed as float are replaced by 0.0.
        """
        try:
            padel_descriptors = from_smiles(molecule, threads=1)
        except RuntimeError:
            return []
        formatted = []
        for k, v in padel_descriptors.items():
            try:
                formatted.append(float(v))
            except ValueError:
                # Non-numeric descriptor value — fall back to 0.0
                formatted.append(0.0)
        return formatted

    def get_descriptor_names(self) -> List[str]:
        """Generic names PaDEL_0..PaDEL_1874.

        NOTE(review): 1875 is hard-coded — assumed to match the number of
        descriptors padelpy returns; verify against from_smiles output.
        """
        return [f"PaDEL_{i}" for i in range(1875)]
if __name__ == "__main__":
    # Quick manual smoke test of both descriptor backends.
    smiles = "CC1=CC(O)=CC=C1[N+](=O)[O-]"
    for calculator in (Mordred(), PaDEL()):
        print(list(calculator.get_molecule_descriptors(smiles)))

329
pepper/impl/pepper.py Normal file
View File

@ -0,0 +1,329 @@
import importlib.resources
import logging
import math
import os
import pickle
from collections import defaultdict
from typing import List
import numpy as np
import polars as pl
import yaml
from joblib import Parallel, delayed
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from .bayesian import Bayesian
from .descriptors import Mordred
class Pepper:
def __init__(self, config_path=None, random_state=42):
self.random_state = random_state
if config_path is None:
config_path = importlib.resources.files("pepper.impl.config").joinpath(
"regressor_settings_singlevalue_soil_paper_GPR_optimized.yml"
)
with open(config_path, "r") as file:
regressor_settings = yaml.safe_load(file)
if len(regressor_settings) > 1:
logging.warning(
f"More than one regressor config found in {config_path}, using the first one"
)
self.regressor_settings = regressor_settings[list(regressor_settings.keys())[0]]
if "kernel" in self.regressor_settings["regressor_params"]:
from sklearn.gaussian_process.kernels import ConstantKernel, Matern # noqa: F401
# We could hard-code the kernels they have, maybe better than using eval
self.regressor_settings["regressor_params"]["kernel"] = eval(
self.regressor_settings["regressor_params"]["kernel"]
)
# We assume the YAML has the key regressor containing a regressor name
self.regressor = self.get_regressor_by_name(self.regressor_settings["regressor"])
if "regressor_params" in self.regressor_settings: # Set params if any are given
self.regressor.set_params(**self.regressor_settings["regressor_params"])
# TODO we could make this configurable
self.descriptors = Mordred()
self.descriptor_subset = None
self.min_max_scaler = MinMaxScaler().set_output(transform="polars")
self.feature_preselector = Pipeline(
[
(
"variance_threshold",
VarianceThreshold(threshold=0.02).set_output(transform="polars"),
),
# Feature selection based on variance threshold
(
"custom_feature_selection",
FunctionTransformer(
func=self.remove_highly_correlated_features,
validate=False,
kw_args={"corr_method": "spearman", "cluster_threshold": 0.01},
).set_output(transform="polars"),
),
]
)
def get_regressor_by_name(self, regressor_string):
"""
Load regressor function from a regressor name
:param regressor_string: name of regressor as defined in config file (function name with parentheses)
:return: Regressor object
"""
# if regressor_string == 'RandomForestRegressor':
# return RandomForestRegressor(random_state=self.random_state)
# elif regressor_string == 'GradientBoostingRegressor':
# return GradientBoostingRegressor(random_state=self.random_state)
# elif regressor_string == 'AdaBoostRegressor':
# return AdaBoostRegressor(random_state=self.random_state)
# elif regressor_string == 'MLPRegressor':
# return MLPRegressor(random_state=self.random_state)
# elif regressor_string == 'SVR':
# return SVR()
# elif regressor_string == 'KNeighborsRegressor':
# return KNeighborsRegressor()
if regressor_string == "GaussianProcessRegressor":
return GaussianProcessRegressor(random_state=self.random_state)
# elif regressor_string == 'DecisionTreeRegressor':
# return DecisionTreeRegressor(random_state=self.random_state)
# elif regressor_string == 'Ridge':
# return Ridge(random_state=self.random_state)
# elif regressor_string == 'SGDRegressor':
# return SGDRegressor(random_state=self.random_state)
# elif regressor_string == 'KernelRidge':
# return KernelRidge()
# elif regressor_string == 'LinearRegression':
# return LinearRegression()
# elif regressor_string == 'LSVR':
# return SVR(kernel='linear') # Linear Support Vector Regressor
else:
raise NotImplementedError(
f"No regressor type defined for regressor_string = {regressor_string}"
)
def train_model(self, train_data, preprocess=True):
"""
Fit self.regressor and preprocessors. train_data is a pl.DataFrame
"""
if preprocess:
# Compute the mean and std of half-lives per structure
train_data = self.preprocess_data(train_data)
# train_data structure:
# columns = [
# "structure_id",
# "smiles",
# "dt50_log",
# "dt50_bayesian_mean",
# "dt50_bayesian_std",
# ] + self.descriptors.get_descriptor_names()
# only select descriptor features for feature preselector
df = train_data[self.descriptors.get_descriptor_names()]
# Remove columns having at least None, nan, inf, "" value
df = Pepper.keep_clean_columns(df)
# Scale and Remove highly correlated features as well as features having a low variance
x_train_normal = self.min_max_scaler.fit_transform(df)
x_train_normal = self.feature_preselector.fit_transform(x_train_normal)
# Store subset, as this is the input used for prediction
self.descriptor_subset = x_train_normal.columns
y_train = train_data["dt50_bayesian_mean"].to_numpy()
y_train_std = train_data["dt50_bayesian_std"].to_numpy()
self.regressor.set_params(alpha=y_train_std)
self.regressor.fit(x_train_normal, y_train)
return self, train_data
@staticmethod
def keep_clean_columns(df: pl.DataFrame) -> pl.DataFrame:
"""
Filters out columns from the DataFrame that contain null values, NaN, or infinite values.
This static method takes a DataFrame as input and evaluates each of its columns to determine
if the column contains invalid values. Columns that have null values, NaN, or infinite values
are excluded from the resulting DataFrame. The method is especially useful for cleaning up a
dataset by keeping only the valid columns.
Parameters:
df (polars.DataFrame): The input DataFrame to be cleaned.
Returns:
polars.DataFrame: A DataFrame containing only columns without null, NaN, or infinite values.
"""
valid_cols = []
for col in df.columns:
s = df[col]
# Check nulls
has_null = s.null_count() > 0
# Check NaN and inf only for numeric columns
if s.dtype.is_numeric():
has_nan = s.is_nan().any()
has_inf = s.is_infinite().any()
else:
has_nan = False
has_inf = False
if not (has_null or has_nan or has_inf):
valid_cols.append(col)
return df.select(valid_cols)
def preprocess_data(self, dataset):
groups = [group for group in dataset.group_by("structure_id")]
# Unless explicitly set compute everything serial
if os.environ.get("N_PEPPER_THREADS", 1) > 1:
results = Parallel(n_jobs=os.environ["N_PEPPER_THREADS"])(
delayed(compute_bayes_per_group)(group[1])
for group in dataset.group_by("structure_id")
)
else:
results = []
for g in groups:
results.append(compute_bayes_per_group(g[1]))
bayes_stats = pl.concat(results, how="vertical")
dataset = dataset.join(bayes_stats, on="structure_id", how="left")
# Remove duplicates after calculating mean, std
dataset = dataset.unique(subset="structure_id")
# Calculate and normalise features, make a "desc" column with the features
dataset = dataset.with_columns(
pl.col("smiles")
.map_elements(
self.descriptors.get_molecule_descriptors, return_dtype=pl.List(pl.Float64)
)
.alias("desc")
)
# If a SMILES fails to get desc it is removed
dataset = dataset.filter(pl.col("desc").is_not_null() & (pl.col("desc").list.len() > 0))
# Flatten the features into the dataset
dataset = dataset.with_columns(
pl.col("desc").list.to_struct(fields=self.descriptors.get_descriptor_names())
).unnest("desc")
return dataset
def predict_batch(self, batch: List[str], is_smiles: bool = True) -> List[List[float | None]]:
if is_smiles:
rows = [self.descriptors.get_molecule_descriptors(smiles) for smiles in batch]
else:
rows = batch
# Create Dataframe with all descriptors
initial_desc_rows_df = pl.DataFrame(
data=rows, schema=self.descriptors.get_descriptor_names(), orient="row"
)
# Before checking for invalid values per row, select only required columns
initial_desc_rows_df = initial_desc_rows_df.select(
list(self.min_max_scaler.feature_names_in_)
)
to_pad = []
adjusted_rows = []
for i, row in enumerate(initial_desc_rows_df.rows()):
# neither infs nor nans are found -> rows seems to be valid input
if row and not any(math.isinf(x) for x in row) and not any(math.isnan(x) for x in row):
adjusted_rows.append(row)
else:
to_pad.append(i)
if adjusted_rows:
desc_rows_df = pl.DataFrame(
data=adjusted_rows, schema=list(self.min_max_scaler.feature_names_in_), orient="row"
)
x_normal = self.min_max_scaler.transform(desc_rows_df)
x_normal = x_normal[self.descriptor_subset]
res = self.regressor.predict(x_normal, return_std=True)
# Convert to lists
res = [list(res[0]), list(res[1])]
# If we had rows containing bad input (inf, nan) insert Nones at the correct position
if to_pad:
for i in to_pad:
res[0].insert(i, None)
res[1].insert(i, None)
return res
else:
return [[None] * len(batch), [None] * len(batch)]
@staticmethod
def remove_highly_correlated_features(
X_train,
corr_method: str = "spearman",
cluster_threshold: float = 0.01,
ignore=False,
):
if ignore:
return X_train
# pass
else:
# Using spearmanr from scipy to achieve pandas.corr in polars
corr = spearmanr(X_train, axis=0).statistic
# Ensure the correlation matrix is symmetric
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)
corr = np.nan_to_num(corr)
# code from https://scikit-learn.org/stable/auto_examples/inspection/
# plot_permutation_importance_multicollinear.html
# We convert the correlation matrix to a distance matrix before performing
# hierarchical clustering using Ward's linkage.
distance_matrix = 1 - np.abs(corr)
dist_linkage = hierarchy.ward(squareform(distance_matrix))
cluster_ids = hierarchy.fcluster(dist_linkage, cluster_threshold, criterion="distance")
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
cluster_id_to_feature_ids[cluster_id].append(idx)
my_selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
X_train_sel = X_train[:, my_selected_features]
return X_train_sel
def save_model(self, path):
with open(path, "wb") as save_file:
pickle.dump(self, save_file, protocol=5)
@staticmethod
def load_model(path) -> "Pepper":
with open(path, "rb") as load_file:
return pickle.load(load_file)
def compute_bayes_per_group(group):
    """Posterior mean/std of log half-life for one structure group.

    :param group: polars DataFrame holding all rows of a single structure_id
    :return: one-row polars DataFrame with structure_id and Bayesian stats
    """
    posterior_mean, posterior_std = Bayesian(group["dt50_log"]).get_posterior_distribution()
    row = {
        "structure_id": [group["structure_id"][0]],
        "dt50_bayesian_mean": [posterior_mean],
        "dt50_bayesian_std": [posterior_std],
    }
    return pl.DataFrame(row)