Files
enviPy-bayer/pepper/__init__.py
2026-03-06 22:11:22 +13:00

362 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Standard library
import logging
import math
import os
import pickle
from datetime import datetime
from typing import Any, List, Optional
# Third-party
import polars as pl
from pydantic import computed_field
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
r2_score,
root_mean_squared_error,
)
from sklearn.model_selection import ShuffleSplit
# Project-local plugin contracts.
# Once stable these will be exposed by enviPy-plugins lib
from envipy_additional_information import register # noqa: I001
from bridge.contracts import Property, PropertyType # noqa: I001
from bridge.dto import (
BuildResult,
EnviPyDTO,
EvaluationResult,
PredictedProperty,
RunResult,
) # noqa: I001
from .impl.pepper import Pepper # noqa: I001
# Module-level logger, named after this module (standard logging convention).
logger = logging.getLogger(__name__)
@register("pepperprediction")
class PepperPrediction(PredictedProperty):
    """Half-life prediction for a single compound.

    ``log_mean`` and ``log_std`` are the parameters of the predicted
    half-life distribution on a base-10 log scale:
    ``log10(half-life) ~ Normal(log_mean, log_std**2)``.
    ``mean`` and ``std`` are the linear-scale counterparts as produced by
    ``PEPPER.run`` (``10 ** log_mean`` / ``10 ** log_std``).
    All four fields are ``None`` when the model produced no prediction.
    """

    mean: float | None
    std: float | None
    log_mean: float | None
    log_std: float | None

    @computed_field
    @property
    def svg(self, xscale="linear", quantiles=(0.01, 0.99), n_points=2000) -> Optional[str]:
        """Render the predicted half-life distribution as an SVG string.

        Plots the lognormal distribution of chemical half-lives where the
        parameters are given on a base-10 log scale:
        ``log10(half-life) ~ Normal(mu_log10, sigma_log10**2)``.

        Shades:
          - x < p in green        (Non-persistent)
          - p <= x <= vp in yellow (Persistent)
          - x > vp in red          (Very persistent)

        The legend shows the shaded colour and the probability mass in each
        region.

        NOTE(review): because this is exposed as a ``@property``/computed
        field, the ``xscale``, ``quantiles`` and ``n_points`` parameters can
        never be supplied by callers — only their defaults are used. They are
        kept for interface compatibility.

        Returns:
            The SVG document as a string, or ``None`` when the prediction is
            missing (``log_mean`` or ``log_std`` is ``None``).

        Raises:
            ValueError: If ``log_std`` is not strictly positive.
        """
        # Heavy plotting dependencies are imported lazily so that merely
        # constructing/validating the model does not require matplotlib/scipy.
        import io

        import matplotlib.patches as mpatches
        import numpy as np
        from matplotlib import pyplot as plt
        from scipy import stats

        # No prediction -> no plot. Without this guard the `<= 0` comparison
        # below raises TypeError for the all-None predictions built in
        # PEPPER.run().
        if self.log_mean is None or self.log_std is None:
            return None

        sigma_log10 = self.log_std
        mu_log10 = self.log_mean
        if sigma_log10 <= 0:
            raise ValueError("sigma_log10 must be > 0")

        # Persistent and Very Persistent thresholds in days from REACH
        # (https://doi.org/10.26434/chemrxiv-2025-xmslf)
        p = 120
        vp = 180

        # Convert base-10 log parameters to natural-log parameters for SciPy's
        # lognorm; SciPy parameterization: lognorm(s=sigma_ln, scale=exp(mu_ln)).
        ln10 = np.log(10.0)
        mu_ln = mu_log10 * ln10
        sigma_ln = sigma_log10 * ln10
        dist = stats.lognorm(s=sigma_ln, scale=np.exp(mu_ln))

        # Exact probability mass in each persistence region.
        p_green = dist.cdf(p)  # P(X < p)
        p_yellow = dist.cdf(vp) - p_green  # P(p <= X <= vp)
        p_red = 1.0 - dist.cdf(vp)  # P(X > vp)

        # Plotting range: cover the requested quantiles plus both thresholds.
        q_low, q_high = dist.ppf(quantiles)
        x_min = max(1e-12, min(q_low, p) * 0.9)
        x_max = max(q_high, vp) * 1.1

        # Build x-grid (days axis).
        if xscale == "log":
            x = np.logspace(np.log10(x_min), np.log10(x_max), n_points)
        else:
            x = np.linspace(x_min, x_max, n_points)
        y = dist.pdf(x)

        # Masks for shading the three regions.
        mask_green = x < p
        mask_yellow = (x >= p) & (x <= vp)
        mask_red = x > vp

        # Plot
        fig, ax = plt.subplots(figsize=(9, 5.5))
        ax.plot(x, y, color="#1f4e79", lw=2, label="Lognormal PDF")
        if np.any(mask_green):
            ax.fill_between(x[mask_green], y[mask_green], 0, color="tab:green", alpha=0.3)
        if np.any(mask_yellow):
            ax.fill_between(x[mask_yellow], y[mask_yellow], 0, color="gold", alpha=0.35)
        if np.any(mask_red):
            ax.fill_between(x[mask_red], y[mask_red], 0, color="tab:red", alpha=0.3)

        # Threshold lines
        ax.axvline(p, color="gray", ls="--", lw=1)
        ax.axvline(vp, color="gray", ls="--", lw=1)

        # Labels & title
        ax.set_title(
            f"Half-life Distribution (Lognormal)\nlog10 parameters: μ={mu_log10:g}, σ={sigma_log10:g}"
        )
        ax.set_xlabel("Half-life (days)")
        ax.set_ylabel("Probability density")
        ax.grid(True, alpha=0.25)
        if xscale == "log":
            ax.set_xscale("log")  # unreachable via the property; kept for completeness

        # Legend with probabilities.
        # NOTE(review): the original "Persistent" label read `({p:g}{vp:g} d)`
        # with no separator between the two thresholds; an en dash is assumed
        # to have been dropped in a copy/paste round-trip — confirm.
        patches = [
            mpatches.Patch(
                color="tab:green",
                alpha=0.3,
                label=f"Non-persistent (<{p:g} d): {p_green:.2%}",
            ),
            mpatches.Patch(
                color="gold",
                alpha=0.35,
                label=f"Persistent ({p:g}–{vp:g} d): {p_yellow:.2%}",
            ),
            mpatches.Patch(
                color="tab:red",
                alpha=0.3,
                label=f"Very persistent (>{vp:g} d): {p_red:.2%}",
            ),
        ]
        ax.legend(handles=patches, frameon=True)
        plt.tight_layout()

        # --- Export to SVG string ---
        buf = io.StringIO()
        fig.savefig(buf, format="svg", bbox_inches="tight")
        svg = buf.getvalue()
        plt.close(fig)
        buf.close()
        return svg
class PEPPER(Property):
    """Predict Environmental Pollutant PERsistence (PEPPER).

    EnviPy property plugin wrapping the :class:`Pepper` model: builds a
    training dataset from compound half-lives, trains and persists the model,
    runs batch predictions, and evaluates the fit.
    """

    def identifier(self) -> str:
        """Stable machine identifier of this property."""
        return "pepper"

    def display(self) -> str:
        """Short display name."""
        return "PEPPER"

    def name(self) -> str:
        """Full human-readable name."""
        return "Predict Environmental Pollutant PERsistence"

    def requires_rule_packages(self) -> bool:
        return False

    def requires_data_packages(self) -> bool:
        return True

    def get_type(self) -> PropertyType:
        # Training/prediction is expensive, so the property is marked HEAVY.
        return PropertyType.HEAVY

    def generate_dataset(self, eP: EnviPyDTO) -> pl.DataFrame:
        """
        Generates a dataset in the form of a Polars DataFrame containing compound
        information, including SMILES strings and logarithmic values of degradation
        half-lives (dt50).

        The dataset is built by iterating over a list of compounds, standardizing
        SMILES strings, and calculating the logarithmic mean of the half-life
        intervals for the environmental scenarios associated with each compound.
        The resulting DataFrame only includes unique rows based on SMILES and
        logarithmic half-life values.

        Parameters:
            eP (EnviPyDTO): Provides access to compound data and utility functions
                for standardization and retrieval of half-life information.

        Returns:
            pl.DataFrame: Unique rows of (structure_id, smiles, dt50_log).

        Note:
            - The logarithmic mean is calculated from the start and end of the
              dt50 (half-life) interval, matching the original Pepper code.
            - Compounds without half-life data are skipped; errors encountered
              while processing a compound are logged without halting execution.
        """
        columns = ["structure_id", "smiles", "dt50_log"]
        rows = []
        for c in eP.get_compounds():
            try:
                hls = c.half_lifes()
                if not hls:
                    continue
                stand_smiles = eP.standardize(c.smiles, remove_stereo=True)
                # Scenario keys are not used in the dataset; only the half-life
                # values matter here.
                for half_lives in hls.values():
                    for h in half_lives:
                        # The original Pepper code takes the mean of the start
                        # and end of the dt50 interval.
                        half_mean = (h.dt50.start + h.dt50.end) / 2
                        rows.append([str(c.url), stand_smiles, math.log10(half_mean)])
            except Exception:
                # Documented contract: per-compound failures are logged and the
                # compound is skipped rather than aborting dataset generation.
                logger.exception("Failed to process compound while generating PEPPER dataset")
        df = pl.DataFrame(data=rows, schema=columns, orient="row", infer_schema_length=None)
        return df.unique(subset=["smiles", "dt50_log"], keep="any", maintain_order=False)

    def save_dataset(self, df: pl.DataFrame, path: str):
        """Pickle the dataset DataFrame to ``path``."""
        with open(path, "wb") as fh:
            pickle.dump(df, fh)

    def load_dataset(self, path: str) -> pl.DataFrame:
        """Load a dataset previously written by :meth:`save_dataset`.

        NOTE: pickle is unsafe on untrusted input — only call this on files
        produced by this plugin in its own work directory.
        """
        with open(path, "rb") as fh:
            return pickle.load(fh)

    def build(self, eP: EnviPyDTO, *args, **kwargs) -> BuildResult | None:
        """Train a Pepper model and persist both model and training dataset.

        Artifacts are written into the context work dir as
        ``pepper_<uuid>.pkl`` (model) and ``pepper_ds_<uuid>.pkl`` (dataset).

        Raises:
            ValueError: If no training data could be generated.
        """
        ctx = eP.get_context()
        logger.info(f"Start building PEPPER {ctx.uuid}")
        df = self.generate_dataset(eP)
        if df.shape[0] == 0:
            raise ValueError("No data found for building model")
        model = Pepper()
        model, train_ds = model.train_model(df)
        ds_store_path = os.path.join(ctx.work_dir, f"pepper_ds_{ctx.uuid}.pkl")
        self.save_dataset(train_ds, ds_store_path)
        model_store_path = os.path.join(ctx.work_dir, f"pepper_{ctx.uuid}.pkl")
        model.save_model(model_store_path)
        logger.info(f"Finished building PEPPER {ctx.uuid}")

    def run(self, eP: EnviPyDTO, *args, **kwargs) -> RunResult:
        """Predict half-life distributions for every compound in ``eP``.

        Returns a RunResult whose ``result`` holds one :class:`PepperPrediction`
        per compound; compounds the model could not predict get all-None fields.
        """
        ctx = eP.get_context()
        load_path = os.path.join(ctx.work_dir, f"pepper_{ctx.uuid}.pkl")
        model = Pepper.load_model(load_path)
        X_new = [c.smiles for c in eP.get_compounds()]
        predictions = model.predict_batch(X_new)
        results = []
        # predict_batch returns parallel sequences (means, stds, ...);
        # zip(*...) pairs them up per compound.
        for pred in zip(*predictions):
            log_mean, log_std = pred[0], pred[1]
            if log_mean is None or log_std is None:
                # No prediction for this compound. ``svg`` is a computed field
                # on PepperPrediction and must not be passed to the constructor.
                result = {"log_mean": None, "mean": None, "log_std": None, "std": None}
            else:
                result = {
                    "log_mean": log_mean,
                    "mean": 10 ** log_mean,
                    "log_std": log_std,
                    "std": 10 ** log_std,
                }
            results.append(PepperPrediction(**result))
        return RunResult(
            producer=ctx.url,
            description=f"Generated at {datetime.now()}",
            result=results,
        )

    def evaluate(self, eP: EnviPyDTO, *args, **kwargs) -> EvaluationResult | None:
        """Evaluate the persisted model against a freshly generated dataset."""
        ctx = eP.get_context()
        logger.info(f"Start evaluating PEPPER {ctx.uuid}")
        load_path = os.path.join(ctx.work_dir, f"pepper_{ctx.uuid}.pkl")
        model = Pepper.load_model(load_path)
        df = self.generate_dataset(eP)
        ds = model.preprocess_data(df)
        y_pred = model.predict_batch(ds["smiles"])
        # We only need the mean; predict_batch may return (means, stds).
        if isinstance(y_pred, tuple):
            y_pred = y_pred[0]
        # Drop compounds the model could not predict — consistent with
        # build_and_evaluate; feeding None into sklearn metrics would fail.
        y_true, y_pred = self._drop_missing(ds["dt50_bayesian_mean"], y_pred)
        res = self.eval_stats(y_true, y_pred)
        logger.info(f"Finished evaluating PEPPER {ctx.uuid}")
        return EvaluationResult(data=res)

    def build_and_evaluate(self, eP: EnviPyDTO, *args, **kwargs) -> EvaluationResult | None:
        """Cross-validate Pepper on the stored training dataset via ShuffleSplit.

        Keyword Args:
            n_splits (int): Number of shuffle-split folds (default 20).

        Returns:
            EvaluationResult: One metrics dict per non-empty fold.
        """
        ctx = eP.get_context()
        logger.info(f"Start evaluating PEPPER {ctx.uuid}")
        ds_load_path = os.path.join(ctx.work_dir, f"pepper_ds_{ctx.uuid}.pkl")
        ds = self.load_dataset(ds_load_path)
        n_splits = kwargs.get("n_splits", 20)
        # Fixed random_state keeps folds reproducible across runs.
        shuff = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=42)
        fold_metrics: List[dict[str, Any]] = []
        for split_id, (train_index, test_index) in enumerate(shuff.split(ds)):
            logger.info(f"Evaluation fold {split_id}/{n_splits} PEPPER {ctx.uuid}")
            train = ds[train_index]
            test = ds[test_index]
            model = Pepper()
            # Dataset was already preprocessed when it was built.
            model.train_model(train, preprocess=False)
            features = test[model.descriptors.get_descriptor_names()].rows()
            y_pred = model.predict_batch(features, is_smiles=False)
            # We only need the mean for eval statistics, but (mean, std) can be
            # returned.
            if isinstance(y_pred, (tuple, list)):
                y_pred = y_pred[0]
            # Remove pairs where the prediction is missing.
            y_true_filtered, y_pred_filtered = self._drop_missing(
                test["dt50_bayesian_mean"], y_pred
            )
            if not y_true_filtered:
                logger.warning(f"Skipping empty fold {split_id} PEPPER {ctx.uuid}")
                continue
            fold_metrics.append(self.eval_stats(y_true_filtered, y_pred_filtered))
        logger.info(f"Finished evaluating PEPPER {ctx.uuid}")
        return EvaluationResult(data=fold_metrics)

    @staticmethod
    def _drop_missing(y_true, y_pred) -> tuple[list, list]:
        """Return (y_true, y_pred) with pairs whose prediction is None removed.

        Shared by :meth:`evaluate` and :meth:`build_and_evaluate` so missing
        predictions are handled identically in both paths.
        """
        pairs = [(t, p) for t, p in zip(y_true, y_pred) if p is not None]
        if not pairs:
            return [], []
        trues, preds = zip(*pairs)
        return list(trues), list(preds)

    @staticmethod
    def eval_stats(y_true, y_pred) -> dict[str, float]:
        """Compute standard regression metrics (r2, mse, rmse, mae)."""
        return {
            "r2": r2_score(y_true, y_pred),
            "mse": mean_squared_error(y_true, y_pred),
            "rmse": root_mean_squared_error(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
        }