forked from enviPath/enviPy
[Feature] PEPPER in enviPath (#332)
Co-authored-by: Tim Lorsbach <tim@lorsba.ch>
Reviewed-on: enviPath/enviPy#332
361
pepper/__init__.py
Normal file
@@ -0,0 +1,361 @@
import logging
import math
import os
import pickle
from datetime import datetime
from typing import Any, List, Optional

import polars as pl

from pydantic import computed_field
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import ShuffleSplit

# Once stable these will be exposed by enviPy-plugins lib
from envipy_additional_information import register  # noqa: I001
from bridge.contracts import Property, PropertyType  # noqa: I001
from bridge.dto import (
    BuildResult,
    EnviPyDTO,
    EvaluationResult,
    PredictedProperty,
    RunResult,
)  # noqa: I001

from .impl.pepper import Pepper  # noqa: I001

logger = logging.getLogger(__name__)


@register("pepperprediction")
class PepperPrediction(PredictedProperty):
    mean: float | None
    std: float | None
    log_mean: float | None
    log_std: float | None
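    # log_mean / log_std are the predicted mean and std of log10(half-life in days)
    # as produced by PEPPER.run(); mean and std are the same values mapped back via 10**x.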

    @computed_field
    @property
    def svg(self, xscale="linear", quantiles=(0.01, 0.99), n_points=2000) -> Optional[str]:
        """
        Plot the lognormal distribution of chemical half-lives where parameters are
        given on a base-10 log scale: log10(half-life) ~ Normal(mu_log10, sigma_log10^2).

        Shades:
        - x < p in green (Non-persistent)
        - p <= x <= vp in yellow (Persistent)
        - x > vp in red (Very persistent)

        Legend shows the shaded color and the probability mass in each region.
        """
        import io

        import matplotlib.patches as mpatches
        import numpy as np
        from matplotlib import pyplot as plt
        from scipy import stats

        sigma_log10 = self.log_std
        mu_log10 = self.log_mean

        if sigma_log10 is None or mu_log10 is None:
            return None
        if sigma_log10 <= 0:
            raise ValueError("sigma_log10 must be > 0")
        # Persistent and Very Persistent thresholds in days from REACH (https://doi.org/10.26434/chemrxiv-2025-xmslf)
        p = 120
        vp = 180

        # Convert base-10 log parameters to natural-log parameters for SciPy's lognorm
        ln10 = np.log(10.0)
        mu_ln = mu_log10 * ln10
        sigma_ln = sigma_log10 * ln10
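        # If log10(X) ~ N(mu_log10, sigma_log10^2), then ln(X) = ln(10) * log10(X),
        # so ln(X) ~ N(mu_log10 * ln(10), (sigma_log10 * ln(10))^2); these are mu_ln and sigma_ln.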

        # SciPy parameterization: lognorm(s=sigma_ln, scale=exp(mu_ln))
        dist = stats.lognorm(s=sigma_ln, scale=np.exp(mu_ln))

        # Exact probabilities
        p_green = dist.cdf(p)  # P(X < p)
        p_yellow = dist.cdf(vp) - p_green  # P(p <= X <= vp)
        p_red = 1.0 - dist.cdf(vp)  # P(X > vp)

        # Plotting range
        q_low, q_high = dist.ppf(quantiles)
        x_min = max(1e-12, min(q_low, p) * 0.9)
        x_max = max(q_high, vp) * 1.1

        # Build x-grid (linear days axis)
        if xscale == "log":
            x = np.logspace(np.log10(x_min), np.log10(x_max), n_points)
        else:
            x = np.linspace(x_min, x_max, n_points)
        y = dist.pdf(x)

        # Masks for shading
        mask_green = x < p
        mask_yellow = (x >= p) & (x <= vp)
        mask_red = x > vp

        # Plot
        fig, ax = plt.subplots(figsize=(9, 5.5))
        ax.plot(x, y, color="#1f4e79", lw=2, label="Lognormal PDF")

        if np.any(mask_green):
            ax.fill_between(x[mask_green], y[mask_green], 0, color="tab:green", alpha=0.3)
        if np.any(mask_yellow):
            ax.fill_between(x[mask_yellow], y[mask_yellow], 0, color="gold", alpha=0.35)
        if np.any(mask_red):
            ax.fill_between(x[mask_red], y[mask_red], 0, color="tab:red", alpha=0.3)

        # Threshold lines
        ax.axvline(p, color="gray", ls="--", lw=1)
        ax.axvline(vp, color="gray", ls="--", lw=1)

        # Labels & title
        ax.set_title(
            f"Half-life Distribution (Lognormal)\nlog10 parameters: μ={mu_log10:g}, σ={sigma_log10:g}"
        )
        ax.set_xlabel("Half-life (days)")
        ax.set_ylabel("Probability density")
        ax.grid(True, alpha=0.25)

        if xscale == "log":
            ax.set_xscale("log")

        # Legend with probabilities
        patches = [
            mpatches.Patch(
                color="tab:green",
                alpha=0.3,
                label=f"Non-persistent (<{p:g} d): {p_green:.2%}",
            ),
            mpatches.Patch(
                color="gold",
                alpha=0.35,
                label=f"Persistent ({p:g}–{vp:g} d): {p_yellow:.2%}",
            ),
            mpatches.Patch(
                color="tab:red",
                alpha=0.3,
                label=f"Very persistent (>{vp:g} d): {p_red:.2%}",
            ),
        ]
        ax.legend(handles=patches, frameon=True)

        plt.tight_layout()

        # --- Export to SVG string ---
        buf = io.StringIO()
        fig.savefig(buf, format="svg", bbox_inches="tight")
        svg = buf.getvalue()
        plt.close(fig)
        buf.close()

        return svg


class PEPPER(Property):
    def identifier(self) -> str:
        return "pepper"

    def display(self) -> str:
        return "PEPPER"

    def name(self) -> str:
        return "Predict Environmental Pollutant PERsistence"

    def requires_rule_packages(self) -> bool:
        return False

    def requires_data_packages(self) -> bool:
        return True

    def get_type(self) -> PropertyType:
        return PropertyType.HEAVY

    def generate_dataset(self, eP: EnviPyDTO) -> pl.DataFrame:
        """
        Generates a dataset in the form of a Polars DataFrame containing compound information, including
        SMILES strings and logarithmic values of degradation half-lives (dt50).

        The dataset is built by iterating over a list of compounds, standardizing SMILES strings, and
        calculating the logarithmic mean of the half-life intervals for different environmental scenarios
        associated with each compound.

        The resulting DataFrame will only include unique rows based on SMILES and logarithmic half-life
        values.

        Parameters:
            eP (EnviPyDTO): An object that provides access to compound data and utility functions for
                standardization and retrieval of half-life information.

        Returns:
            pl.DataFrame: The resulting dataset with unique rows containing compound structure identifiers,
                standardized SMILES strings, and logarithmic half-life values.

        Raises:
            Exception: Exceptions are caught and logged during data processing, specifically when retrieving
                half-life information.

        Note:
            - The logarithmic mean is calculated from the start and end intervals of the dt50 (half-life).
            - Compounds not associated with any half-life data are skipped, and errors encountered during
              processing are logged without halting the execution.
        """
        columns = ["structure_id", "smiles", "dt50_log"]
        rows = []

        for c in eP.get_compounds():
            hls = c.half_lifes()

            if len(hls):
                stand_smiles = eP.standardize(c.smiles, remove_stereo=True)
                for scenario, half_lives in hls.items():
                    for h in half_lives:
                        # In the original Pepper code they take the mean of the start and end interval.
                        half_mean = (h.dt50.start + h.dt50.end) / 2
                        rows.append([str(c.url), stand_smiles, math.log10(half_mean)])

        df = pl.DataFrame(data=rows, schema=columns, orient="row", infer_schema_length=None)

        df = df.unique(subset=["smiles", "dt50_log"], keep="any", maintain_order=False)

        return df

    def save_dataset(self, df: pl.DataFrame, path: str):
        with open(path, "wb") as fh:
            pickle.dump(df, fh)

    def load_dataset(self, path: str) -> pl.DataFrame:
        with open(path, "rb") as fh:
            return pickle.load(fh)

    def build(self, eP: EnviPyDTO, *args, **kwargs) -> BuildResult | None:
        logger.info(f"Start building PEPPER {eP.get_context().uuid}")
        df = self.generate_dataset(eP)

        if df.shape[0] == 0:
            raise ValueError("No data found for building model")

        p = Pepper()

        p, train_ds = p.train_model(df)

        ds_store_path = os.path.join(
            eP.get_context().work_dir, f"pepper_ds_{eP.get_context().uuid}.pkl"
        )
        self.save_dataset(train_ds, ds_store_path)

        model_store_path = os.path.join(
            eP.get_context().work_dir, f"pepper_{eP.get_context().uuid}.pkl"
        )
        p.save_model(model_store_path)
        logger.info(f"Finished building PEPPER {eP.get_context().uuid}")

    def run(self, eP: EnviPyDTO, *args, **kwargs) -> RunResult:
        load_path = os.path.join(eP.get_context().work_dir, f"pepper_{eP.get_context().uuid}.pkl")

        p = Pepper.load_model(load_path)

        X_new = [c.smiles for c in eP.get_compounds()]

        predictions = p.predict_batch(X_new)
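        # predict_batch returns a pair of lists [means, stds] aligned with X_new, with
        # None entries for compounds whose descriptors could not be computed, so
        # zip(*predictions) yields one (log-mean, log-std) pair per compound.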

        results = []

        for pred in zip(*predictions):
            if pred[0] is None or pred[1] is None:
                result = {"log_mean": None, "mean": None, "log_std": None, "std": None}
            else:
                result = {
                    "log_mean": pred[0],
                    "mean": 10 ** pred[0],
                    "log_std": pred[1],
                    "std": 10 ** pred[1],
                }

            results.append(PepperPrediction(**result))

        rr = RunResult(
            producer=eP.get_context().url,
            description=f"Generated at {datetime.now()}",
            result=results,
        )

        return rr

    def evaluate(self, eP: EnviPyDTO, *args, **kwargs) -> EvaluationResult | None:
        logger.info(f"Start evaluating PEPPER {eP.get_context().uuid}")
        load_path = os.path.join(eP.get_context().work_dir, f"pepper_{eP.get_context().uuid}.pkl")

        p = Pepper.load_model(load_path)

        df = self.generate_dataset(eP)
        ds = p.preprocess_data(df)

        y_pred = p.predict_batch(ds["smiles"])

        # We only need the mean
        if isinstance(y_pred, (tuple, list)):
            y_pred = y_pred[0]

        res = self.eval_stats(ds["dt50_bayesian_mean"], y_pred)

        logger.info(f"Finished evaluating PEPPER {eP.get_context().uuid}")
        return EvaluationResult(data=res)

    def build_and_evaluate(self, eP: EnviPyDTO, *args, **kwargs) -> EvaluationResult | None:
        logger.info(f"Start evaluating PEPPER {eP.get_context().uuid}")
        ds_load_path = os.path.join(
            eP.get_context().work_dir, f"pepper_ds_{eP.get_context().uuid}.pkl"
        )
        ds = self.load_dataset(ds_load_path)

        n_splits = kwargs.get("n_splits", 20)
        shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)
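        # ShuffleSplit draws n_splits (default 20) independent random 90/10 train/test
        # partitions; unlike k-fold CV the test sets may overlap between iterations.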

        fold_metrics: List[dict[str, Any]] = []
        for split_id, (train_index, test_index) in enumerate(shuff.split(ds)):
            logger.info(f"Evaluation fold {split_id + 1}/{n_splits} PEPPER {eP.get_context().uuid}")
            train = ds[train_index]
            test = ds[test_index]
            model = Pepper()
            model.train_model(train, preprocess=False)

            features = test[model.descriptors.get_descriptor_names()].rows()
            y_pred = model.predict_batch(features, is_smiles=False)

            # We only need the mean for eval statistics, but mean and std can be returned
            if isinstance(y_pred, (tuple, list)):
                y_pred = y_pred[0]

            # Remove None if they occur
            y_true_filtered, y_pred_filtered = [], []
            for t, pred in zip(test["dt50_bayesian_mean"], y_pred):
                if pred is None:
                    continue
                y_true_filtered.append(t)
                y_pred_filtered.append(pred)

            if len(y_true_filtered) == 0:
                logger.warning("Skipping empty fold")
                continue

            fold_metrics.append(self.eval_stats(y_true_filtered, y_pred_filtered))

        logger.info(f"Finished evaluating PEPPER {eP.get_context().uuid}")
        return EvaluationResult(data=fold_metrics)

    @staticmethod
    def eval_stats(y_true, y_pred) -> dict[str, float]:
        scores_dic = {
            "r2": r2_score(y_true, y_pred),
            "mse": mean_squared_error(y_true, y_pred),
            "rmse": root_mean_squared_error(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
        }
        return scores_dic
0
pepper/impl/__init__.py
Normal file
196
pepper/impl/bayesian.py
Normal file
@@ -0,0 +1,196 @@
import emcee
import numpy as np
from scipy.stats import lognorm, norm


class Bayesian:
    def __init__(self, y, comment_list=None):
        if comment_list is None:
            comment_list = []
        self.y = y
        self.comment_list = comment_list
        # LOQ default settings
        self.LOQ_lower = -1  # (2.4 hours)
        self.LOQ_upper = 3  # 1000 days
        # prior default settings
        self.prior_mu_mean = 1.5
        self.prior_mu_std = 2
        self.prior_sigma_mean = 0.4
        self.prior_sigma_std = 0.4
        self.lower_limit_sigma = 0.2
        # EMCEE defaults
        self.nwalkers = 10
        self.iterations = 2000
        self.burn_in = 100
        ndim = 2  # number of dimensions (mean, std)
        # backend = emcee.backends.HDFBackend("backend.h5")
        # backend.reset(self.nwalkers, ndim)
        self.sampler = emcee.EnsembleSampler(self.nwalkers, ndim, self.logPosterior)
        self.posterior_mu = None
        self.posterior_sigma = None

    def get_censored_values_only(self):
        censored_values = []
        for i, comment in enumerate(self.comment_list):
            if comment in ["<", ">"]:
                censored_values.append(self.y[i])
            elif self.y[i] > self.LOQ_upper or self.y[i] < self.LOQ_lower:
                censored_values.append(self.y[i])
        return censored_values

    # Class functions
    def determine_LOQ(self):
        """
        Determines if the LOQ is upper or lower, and the value (if not default)
        :return: upper_LOQ, lower_LOQ
        """

        censored_values = self.get_censored_values_only()

        # Find upper LOQ
        upper_LOQ = np.nan
        # bigger than global LOQ
        if max(self.y) >= self.LOQ_upper:
            upper_LOQ = self.LOQ_upper
        # case if exactly 365 days
        elif max(self.y) == 2.562:  # 365 days
            upper_LOQ = 2.562
            self.LOQ_upper = upper_LOQ
        # case if "bigger than" indication in comments
        elif ">" in self.comment_list:
            i = 0
            while i < len(self.y):
                if self.y[i] == min(censored_values) and self.comment_list[i] == ">":
                    self.LOQ_upper = self.y[i]
                    break
                i += 1

        # Find lower LOQ
        lower_LOQ = np.nan
        # smaller than global LOQ
        if min(self.y) <= self.LOQ_lower:
            lower_LOQ = self.LOQ_lower
        # case if exactly 1 day
        elif min(self.y) == 0:  # 1 day
            lower_LOQ = 0
            self.LOQ_lower = 0
        # case if "smaller than" indication in comments
        elif "<" in self.comment_list:
            i = 0
            while i < len(self.y):
                if self.y[i] == max(censored_values) and self.comment_list[i] == "<":
                    self.LOQ_lower = self.y[i]
                    break
                i += 1
        return upper_LOQ, lower_LOQ

    def logLikelihood(self, theta, sigma):
        """
        Likelihood function (the probability of the observations given the model parameters).
        Observations that are not censored are treated as numeric values.
        :param theta: mean half-life value to be evaluated
        :param sigma: std half-life value to be evaluated
        :return: log_likelihood
        """
        upper_LOQ, lower_LOQ = self.determine_LOQ()

        n_censored_upper = 0
        n_censored_lower = 0
        y_not_cen = []

        if np.isnan(upper_LOQ) and np.isnan(lower_LOQ):
            y_not_cen = self.y
        else:
            for i in self.y:
                if not np.isnan(upper_LOQ) and i >= upper_LOQ:  # censor above threshold
                    n_censored_upper += 1
                elif not np.isnan(lower_LOQ) and i <= lower_LOQ:  # censor below threshold
                    n_censored_lower += 1
                else:  # do not censor
                    y_not_cen.append(i)

        LL_left_cen = 0
        LL_right_cen = 0
        LL_not_cen = 0

        if n_censored_lower > 0:  # loglikelihood for left censored observations
            LL_left_cen = n_censored_lower * norm.logcdf(
                lower_LOQ, loc=theta, scale=sigma
            )  # cumulative distribution function CDF

        if n_censored_upper > 0:  # loglikelihood for right censored observations
            LL_right_cen = n_censored_upper * norm.logsf(
                upper_LOQ, loc=theta, scale=sigma
            )  # survival function (1-CDF)

        if len(y_not_cen) > 0:  # loglikelihood for uncensored values
            LL_not_cen = sum(
                norm.logpdf(y_not_cen, loc=theta, scale=sigma)
            )  # probability density function PDF

        return LL_left_cen + LL_not_cen + LL_right_cen

    def get_prior_probability_sigma(self, sigma):
        # convert mean and sd to logspace parameters; for the formula see
        # https://en.wikipedia.org/wiki/Log-normal_distribution (Method of moments section)
        temp = 1 + (self.prior_sigma_std / self.prior_sigma_mean) ** 2
        meanlog = self.prior_sigma_mean / np.sqrt(temp)
        sdlog = np.sqrt(np.log(temp))
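        # Moment matching: for a lognormal with mean m and sd s,
        # sigma_ln^2 = ln(1 + (s/m)^2) and exp(mu_ln) = m / sqrt(1 + (s/m)^2),
        # so lognorm(s=sdlog, scale=meanlog) reproduces the prior mean and sd
        # (here additionally shifted by loc=lower_limit_sigma).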
        # log-density of sigma under the lognormal prior
        norm_pdf_sigma = lognorm.logpdf(sigma, s=sdlog, loc=self.lower_limit_sigma, scale=meanlog)
        return norm_pdf_sigma

    def get_prior_probability_theta(self, theta):
        norm_pdf_theta = norm.logpdf(theta, loc=self.prior_mu_mean, scale=self.prior_mu_std)
        return norm_pdf_theta

    def logPrior(self, par):
        """
        Obtain prior loglikelihood of [theta, sigma]
        :param par: par = [theta, sigma]
        :return: loglikelihood
        """
        # calculate the mean and standard deviation in the log-space
        norm_pdf_mean = self.get_prior_probability_theta(par[0])
        norm_pdf_std = self.get_prior_probability_sigma(par[1])
        log_norm_pdf = [norm_pdf_mean, norm_pdf_std]
        return sum(log_norm_pdf)

    def logPosterior(self, par):
        """
        Obtain posterior loglikelihood
        :param par: [theta, sigma]
        :return: posterior loglikelihood
        """
        logpri = self.logPrior(par)
        if not np.isfinite(logpri):
            return -np.inf
        loglikelihood = self.logLikelihood(par[0], par[1])
        return logpri + loglikelihood

    def get_posterior_distribution(self):
        """
        Sample posterior distribution and get median of mean and std samples
        :return: posterior half-life mean and std
        """
        if self.posterior_mu:
            return self.posterior_mu, self.posterior_sigma

        # Sampler parameters
        ndim = 2  # number of dimensions (mean, std)
        p0 = abs(np.random.randn(self.nwalkers, ndim))  # only positive starting numbers (for std)

        # Sample distribution
        self.sampler.run_mcmc(p0, self.iterations)
        # get chain and log_prob as a one-dimensional array (chains merged, burn-in discarded)
        samples = self.sampler.get_chain(flat=True, discard=self.burn_in)
        # get median mean and std
        self.posterior_mu = np.median(samples[:, 0])
        self.posterior_sigma = np.median(samples[:, 1])
        return self.posterior_mu, self.posterior_sigma


# Utility functions
def get_normal_distribution(x, mu, sig):
    return np.exp(-np.power(x - mu, 2.0) / (2 * np.power(sig, 2.0)))
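

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, mirroring the __main__ block in
    # descriptors.py). The log10 half-life values below are made-up example data.
    example_dt50_log = [1.2, 1.5, 0.9, 2.0]  # hypothetical log10(half-life in days)
    b = Bayesian(example_dt50_log)
    mu, sigma = b.get_posterior_distribution()
    print(f"posterior mean={mu:.2f}, std={sigma:.2f}")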
11
pepper/impl/config/regressor_settings_singlevalue_soil_paper_GPR_optimized.yml
Normal file
@@ -0,0 +1,11 @@
GPR:
  name: Gaussian Process Regressor
  regressor: GaussianProcessRegressor
  regressor_params:
    normalize_y: True
    n_restarts_optimizer: 0
    kernel: "ConstantKernel(1.0, (1e-3, 1e3)) * Matern(length_scale=2.5, length_scale_bounds=(1e-3, 1e3), nu=0.5)"
  feature_reduction_method: None
  feature_reduction_parameters:
    pca:
      n_components: 34
60
pepper/impl/descriptors.py
Normal file
@@ -0,0 +1,60 @@
from abc import ABC, abstractmethod
from typing import List

from mordred import Calculator, descriptors
from padelpy import from_smiles
from rdkit import Chem


class Descriptor(ABC):
    @abstractmethod
    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        pass

    @abstractmethod
    def get_descriptor_names(self) -> List[str]:
        pass


class Mordred(Descriptor):
    calc = Calculator(descriptors, ignore_3D=True)

    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        mol = Chem.MolFromSmiles(molecule)
        if mol is None:
            # Invalid SMILES: callers (e.g. Pepper.preprocess_data) filter out null descriptors
            return None
        res = list(self.calc(mol))
        return res

    def get_descriptor_names(self) -> List[str]:
        return [f"Mordred_{i}" for i in range(len(self.calc.descriptors))]


class PaDEL(Descriptor):
    calc = Calculator(descriptors)

    def get_molecule_descriptors(self, molecule: str) -> List[float | int] | None:
        try:
            padel_descriptors = from_smiles(molecule, threads=1)
        except RuntimeError:
            return []

        formatted = []
        for k, v in padel_descriptors.items():
            try:
                formatted.append(float(v))
            except ValueError:
                formatted.append(0.0)

        return formatted

    def get_descriptor_names(self) -> List[str]:
        return [f"PaDEL_{i}" for i in range(1875)]


if __name__ == "__main__":
    mol = "CC1=CC(O)=CC=C1[N+](=O)[O-]"

    m = Mordred()
    print(list(m.get_molecule_descriptors(mol)))

    p = PaDEL()
    print(list(p.get_molecule_descriptors(mol)))
329
pepper/impl/pepper.py
Normal file
@@ -0,0 +1,329 @@
import importlib.resources
import logging
import math
import os
import pickle
from collections import defaultdict
from typing import List

import numpy as np
import polars as pl
import yaml
from joblib import Parallel, delayed
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from sklearn.feature_selection import VarianceThreshold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

from .bayesian import Bayesian
from .descriptors import Mordred


class Pepper:
    def __init__(self, config_path=None, random_state=42):
        self.random_state = random_state
        if config_path is None:
            config_path = importlib.resources.files("pepper.impl.config").joinpath(
                "regressor_settings_singlevalue_soil_paper_GPR_optimized.yml"
            )
        with open(config_path, "r") as file:
            regressor_settings = yaml.safe_load(file)
        if len(regressor_settings) > 1:
            logging.warning(
                f"More than one regressor config found in {config_path}, using the first one"
            )
        self.regressor_settings = regressor_settings[list(regressor_settings.keys())[0]]
        if "kernel" in self.regressor_settings["regressor_params"]:
            from sklearn.gaussian_process.kernels import ConstantKernel, Matern  # noqa: F401

            # We could hard-code the kernels they have, maybe better than using eval
            self.regressor_settings["regressor_params"]["kernel"] = eval(
                self.regressor_settings["regressor_params"]["kernel"]
            )
        # We assume the YAML has the key regressor containing a regressor name
        self.regressor = self.get_regressor_by_name(self.regressor_settings["regressor"])
        if "regressor_params" in self.regressor_settings:  # Set params if any are given
            self.regressor.set_params(**self.regressor_settings["regressor_params"])

        # TODO we could make this configurable
        self.descriptors = Mordred()
        self.descriptor_subset = None

        self.min_max_scaler = MinMaxScaler().set_output(transform="polars")
        self.feature_preselector = Pipeline(
            [
                # Feature selection based on variance threshold
                (
                    "variance_threshold",
                    VarianceThreshold(threshold=0.02).set_output(transform="polars"),
                ),
                # Feature selection based on correlation clustering
                (
                    "custom_feature_selection",
                    FunctionTransformer(
                        func=self.remove_highly_correlated_features,
                        validate=False,
                        kw_args={"corr_method": "spearman", "cluster_threshold": 0.01},
                    ).set_output(transform="polars"),
                ),
            ]
        )

    def get_regressor_by_name(self, regressor_string):
        """
        Load regressor function from a regressor name
        :param regressor_string: name of regressor as defined in config file
        :return: Regressor object
        """
        # if regressor_string == 'RandomForestRegressor':
        #     return RandomForestRegressor(random_state=self.random_state)
        # elif regressor_string == 'GradientBoostingRegressor':
        #     return GradientBoostingRegressor(random_state=self.random_state)
        # elif regressor_string == 'AdaBoostRegressor':
        #     return AdaBoostRegressor(random_state=self.random_state)
        # elif regressor_string == 'MLPRegressor':
        #     return MLPRegressor(random_state=self.random_state)
        # elif regressor_string == 'SVR':
        #     return SVR()
        # elif regressor_string == 'KNeighborsRegressor':
        #     return KNeighborsRegressor()
        if regressor_string == "GaussianProcessRegressor":
            return GaussianProcessRegressor(random_state=self.random_state)
        # elif regressor_string == 'DecisionTreeRegressor':
        #     return DecisionTreeRegressor(random_state=self.random_state)
        # elif regressor_string == 'Ridge':
        #     return Ridge(random_state=self.random_state)
        # elif regressor_string == 'SGDRegressor':
        #     return SGDRegressor(random_state=self.random_state)
        # elif regressor_string == 'KernelRidge':
        #     return KernelRidge()
        # elif regressor_string == 'LinearRegression':
        #     return LinearRegression()
        # elif regressor_string == 'LSVR':
        #     return SVR(kernel='linear')  # Linear Support Vector Regressor
        else:
            raise NotImplementedError(
                f"No regressor type defined for regressor_string = {regressor_string}"
            )

    def train_model(self, train_data, preprocess=True):
        """
        Fit self.regressor and preprocessors. train_data is a pl.DataFrame
        """
        if preprocess:
            # Compute the mean and std of half-lives per structure
            train_data = self.preprocess_data(train_data)

        # train_data structure:
        # columns = [
        #     "structure_id",
        #     "smiles",
        #     "dt50_log",
        #     "dt50_bayesian_mean",
        #     "dt50_bayesian_std",
        # ] + self.descriptors.get_descriptor_names()

        # only select descriptor features for feature preselector
        df = train_data[self.descriptors.get_descriptor_names()]

        # Remove columns containing at least one None, NaN, inf or "" value
        df = Pepper.keep_clean_columns(df)

        # Scale, then remove highly correlated features as well as features with low variance
        x_train_normal = self.min_max_scaler.fit_transform(df)
        x_train_normal = self.feature_preselector.fit_transform(x_train_normal)

        # Store subset, as this is the input used for prediction
        self.descriptor_subset = x_train_normal.columns

        y_train = train_data["dt50_bayesian_mean"].to_numpy()
        y_train_std = train_data["dt50_bayesian_std"].to_numpy()
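        # GaussianProcessRegressor's alpha is added to the diagonal of the kernel matrix
        # during fitting and may be given per sample; passing the Bayesian posterior std
        # here treats structures with more uncertain half-lives as noisier observations.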
        self.regressor.set_params(alpha=y_train_std)
        self.regressor.fit(x_train_normal, y_train)

        return self, train_data

    @staticmethod
    def keep_clean_columns(df: pl.DataFrame) -> pl.DataFrame:
        """
        Filters out columns from the DataFrame that contain null values, NaN, or infinite values.

        This static method takes a DataFrame as input and evaluates each of its columns to determine
        if the column contains invalid values. Columns that have null values, NaN, or infinite values
        are excluded from the resulting DataFrame. The method is especially useful for cleaning up a
        dataset by keeping only the valid columns.

        Parameters:
            df (polars.DataFrame): The input DataFrame to be cleaned.

        Returns:
            polars.DataFrame: A DataFrame containing only columns without null, NaN, or infinite values.
        """
        valid_cols = []

        for col in df.columns:
            s = df[col]

            # Check nulls
            has_null = s.null_count() > 0

            # Check NaN and inf only for numeric columns
            if s.dtype.is_numeric():
                has_nan = s.is_nan().any()
                has_inf = s.is_infinite().any()
            else:
                has_nan = False
                has_inf = False

            if not (has_null or has_nan or has_inf):
                valid_cols.append(col)

        return df.select(valid_cols)

    def preprocess_data(self, dataset):
        groups = [group for group in dataset.group_by("structure_id")]

        # Unless explicitly set, compute everything serially
        n_threads = int(os.environ.get("N_PEPPER_THREADS", 1))
        if n_threads > 1:
            results = Parallel(n_jobs=n_threads)(
                delayed(compute_bayes_per_group)(group[1]) for group in groups
            )
        else:
            results = []
            for g in groups:
                results.append(compute_bayes_per_group(g[1]))

        bayes_stats = pl.concat(results, how="vertical")
        dataset = dataset.join(bayes_stats, on="structure_id", how="left")

        # Remove duplicates after calculating mean, std
        dataset = dataset.unique(subset="structure_id")

        # Calculate features and store them in a "desc" list column
        dataset = dataset.with_columns(
            pl.col("smiles")
            .map_elements(
                self.descriptors.get_molecule_descriptors, return_dtype=pl.List(pl.Float64)
            )
            .alias("desc")
        )

        # If a SMILES fails to get desc it is removed
        dataset = dataset.filter(pl.col("desc").is_not_null() & (pl.col("desc").list.len() > 0))

        # Flatten the features into the dataset
        dataset = dataset.with_columns(
            pl.col("desc").list.to_struct(fields=self.descriptors.get_descriptor_names())
        ).unnest("desc")

        return dataset

    def predict_batch(self, batch: List[str], is_smiles: bool = True) -> List[List[float | None]]:
        if is_smiles:
            rows = [self.descriptors.get_molecule_descriptors(smiles) for smiles in batch]
        else:
            rows = batch

        # Create DataFrame with all descriptors
        initial_desc_rows_df = pl.DataFrame(
            data=rows, schema=self.descriptors.get_descriptor_names(), orient="row"
        )

        # Before checking for invalid values per row, select only required columns
        initial_desc_rows_df = initial_desc_rows_df.select(
            list(self.min_max_scaler.feature_names_in_)
        )

        to_pad = []
        adjusted_rows = []
        for i, row in enumerate(initial_desc_rows_df.rows()):
            # neither infs nor nans are found -> row seems to be valid input
            if row and not any(math.isinf(x) for x in row) and not any(math.isnan(x) for x in row):
                adjusted_rows.append(row)
            else:
                to_pad.append(i)

        if adjusted_rows:
            desc_rows_df = pl.DataFrame(
                data=adjusted_rows, schema=list(self.min_max_scaler.feature_names_in_), orient="row"
            )
            x_normal = self.min_max_scaler.transform(desc_rows_df)
            x_normal = x_normal[self.descriptor_subset]

            res = self.regressor.predict(x_normal, return_std=True)

            # Convert to lists
            res = [list(res[0]), list(res[1])]

            # If we had rows containing bad input (inf, nan) insert Nones at the correct position
            if to_pad:
                for i in to_pad:
                    res[0].insert(i, None)
                    res[1].insert(i, None)

            return res

        else:
            return [[None] * len(batch), [None] * len(batch)]

    @staticmethod
    def remove_highly_correlated_features(
        X_train,
        corr_method: str = "spearman",
        cluster_threshold: float = 0.01,
        ignore=False,
    ):
        if ignore:
            return X_train
        else:
            # Using spearmanr from scipy to achieve pandas.corr in polars
            corr = spearmanr(X_train, axis=0).statistic

            # Ensure the correlation matrix is symmetric
            corr = (corr + corr.T) / 2
            np.fill_diagonal(corr, 1)
            corr = np.nan_to_num(corr)

            # code from https://scikit-learn.org/stable/auto_examples/inspection/
            # plot_permutation_importance_multicollinear.html
            # We convert the correlation matrix to a distance matrix before performing
            # hierarchical clustering using Ward's linkage.
            distance_matrix = 1 - np.abs(corr)
            dist_linkage = hierarchy.ward(squareform(distance_matrix))

            cluster_ids = hierarchy.fcluster(dist_linkage, cluster_threshold, criterion="distance")
            cluster_id_to_feature_ids = defaultdict(list)

            for idx, cluster_id in enumerate(cluster_ids):
                cluster_id_to_feature_ids[cluster_id].append(idx)

            my_selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
            X_train_sel = X_train[:, my_selected_features]

            return X_train_sel

    def save_model(self, path):
        with open(path, "wb") as save_file:
            pickle.dump(self, save_file, protocol=5)

    @staticmethod
    def load_model(path) -> "Pepper":
        with open(path, "rb") as load_file:
            return pickle.load(load_file)


def compute_bayes_per_group(group):
    """Get mean and std using bayesian"""
    mean, std = Bayesian(group["dt50_log"]).get_posterior_distribution()
    return pl.DataFrame(
        {
            "structure_id": [group["structure_id"][0]],
            "dt50_bayesian_mean": [mean],
            "dt50_bayesian_std": [std],
        }
    )
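

if __name__ == "__main__":
    # Illustrative sketch only: construct Pepper from the bundled default config and
    # inspect the configured regressor. The commented load/predict lines assume a model
    # pickle previously written by Pepper.save_model(); the path is hypothetical.
    p = Pepper()
    print(type(p.regressor).__name__, p.regressor.kernel)

    # model = Pepper.load_model("pepper_model.pkl")
    # means, stds = model.predict_batch(["CCO", "c1ccccc1O"])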