[Feature] PEPPER in enviPath (#332)

Co-authored-by: Tim Lorsbach <tim@lorsba.ch> Reviewed-on: enviPath/enviPy#332
2026-03-06 22:11:22 +13:00
parent 6e00926371
commit c6ff97694d
43 changed files with 3793 additions and 371 deletions
--- a/pepper/impl/pepper.py
+++ b/pepper/impl/pepper.py
@ -0,0 +1,329 @@
+import importlib.resources
+import logging
+import math
+import os
+import pickle
+from collections import defaultdict
+from typing import List
+
+import numpy as np
+import polars as pl
+import yaml
+from joblib import Parallel, delayed
+from scipy.cluster import hierarchy
+from scipy.spatial.distance import squareform
+from scipy.stats import spearmanr
+from sklearn.feature_selection import VarianceThreshold
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
+
+from .bayesian import Bayesian
+from .descriptors import Mordred
+
+
+class Pepper:
+    def __init__(self, config_path=None, random_state=42):
+        self.random_state = random_state
+        if config_path is None:
+            config_path = importlib.resources.files("pepper.impl.config").joinpath(
+                "regressor_settings_singlevalue_soil_paper_GPR_optimized.yml"
+            )
+        with open(config_path, "r") as file:
+            regressor_settings = yaml.safe_load(file)
+        if len(regressor_settings) > 1:
+            logging.warning(
+                f"More than one regressor config found in {config_path}, using the first one"
+            )
+        self.regressor_settings = regressor_settings[list(regressor_settings.keys())[0]]
+        if "kernel" in self.regressor_settings["regressor_params"]:
+            from sklearn.gaussian_process.kernels import ConstantKernel, Matern  # noqa: F401
+
+            # We could hard-code the kernels they have, maybe better than using eval
+            self.regressor_settings["regressor_params"]["kernel"] = eval(
+                self.regressor_settings["regressor_params"]["kernel"]
+            )
+        # We assume the YAML has the key regressor containing a regressor name
+        self.regressor = self.get_regressor_by_name(self.regressor_settings["regressor"])
+        if "regressor_params" in self.regressor_settings:  # Set params if any are given
+            self.regressor.set_params(**self.regressor_settings["regressor_params"])
+
+        # TODO we could make this configurable
+        self.descriptors = Mordred()
+        self.descriptor_subset = None
+
+        self.min_max_scaler = MinMaxScaler().set_output(transform="polars")
+        self.feature_preselector = Pipeline(
+            [
+                (
+                    "variance_threshold",
+                    VarianceThreshold(threshold=0.02).set_output(transform="polars"),
+                ),
+                # Feature selection based on variance threshold
+                (
+                    "custom_feature_selection",
+                    FunctionTransformer(
+                        func=self.remove_highly_correlated_features,
+                        validate=False,
+                        kw_args={"corr_method": "spearman", "cluster_threshold": 0.01},
+                    ).set_output(transform="polars"),
+                ),
+            ]
+        )
+
+    def get_regressor_by_name(self, regressor_string):
+        """
+        Load regressor function from a regressor name
+        :param regressor_string: name of regressor as defined in config file (function name with parentheses)
+        :return: Regressor object
+        """
+        # if regressor_string == 'RandomForestRegressor':
+        #     return RandomForestRegressor(random_state=self.random_state)
+        # elif regressor_string == 'GradientBoostingRegressor':
+        #     return GradientBoostingRegressor(random_state=self.random_state)
+        # elif regressor_string == 'AdaBoostRegressor':
+        #     return AdaBoostRegressor(random_state=self.random_state)
+        # elif regressor_string == 'MLPRegressor':
+        #     return MLPRegressor(random_state=self.random_state)
+        # elif regressor_string == 'SVR':
+        #     return SVR()
+        # elif regressor_string == 'KNeighborsRegressor':
+        #     return KNeighborsRegressor()
+        if regressor_string == "GaussianProcessRegressor":
+            return GaussianProcessRegressor(random_state=self.random_state)
+        # elif regressor_string == 'DecisionTreeRegressor':
+        #     return DecisionTreeRegressor(random_state=self.random_state)
+        # elif regressor_string == 'Ridge':
+        #     return Ridge(random_state=self.random_state)
+        # elif regressor_string == 'SGDRegressor':
+        #     return SGDRegressor(random_state=self.random_state)
+        # elif regressor_string == 'KernelRidge':
+        #     return KernelRidge()
+        # elif regressor_string == 'LinearRegression':
+        #     return LinearRegression()
+        # elif regressor_string == 'LSVR':
+        #     return SVR(kernel='linear')  # Linear Support Vector Regressor
+        else:
+            raise NotImplementedError(
+                f"No regressor type defined for regressor_string = {regressor_string}"
+            )
+
+    def train_model(self, train_data, preprocess=True):
+        """
+        Fit self.regressor and preprocessors. train_data is a pl.DataFrame
+        """
+        if preprocess:
+            # Compute the mean and std of half-lives per structure
+            train_data = self.preprocess_data(train_data)
+
+        # train_data structure:
+        # columns = [
+        #     "structure_id",
+        #     "smiles",
+        #     "dt50_log",
+        #     "dt50_bayesian_mean",
+        #     "dt50_bayesian_std",
+        # ] + self.descriptors.get_descriptor_names()
+
+        # only select descriptor features for feature preselector
+        df = train_data[self.descriptors.get_descriptor_names()]
+
+        # Remove columns having at least None, nan, inf, "" value
+        df = Pepper.keep_clean_columns(df)
+
+        # Scale and Remove highly correlated features as well as features having a low variance
+        x_train_normal = self.min_max_scaler.fit_transform(df)
+        x_train_normal = self.feature_preselector.fit_transform(x_train_normal)
+
+        # Store subset, as this is the input used for prediction
+        self.descriptor_subset = x_train_normal.columns
+
+        y_train = train_data["dt50_bayesian_mean"].to_numpy()
+        y_train_std = train_data["dt50_bayesian_std"].to_numpy()
+
+        self.regressor.set_params(alpha=y_train_std)
+        self.regressor.fit(x_train_normal, y_train)
+
+        return self, train_data
+
+    @staticmethod
+    def keep_clean_columns(df: pl.DataFrame) -> pl.DataFrame:
+        """
+        Filters out columns from the DataFrame that contain null values, NaN, or infinite values.
+
+        This static method takes a DataFrame as input and evaluates each of its columns to determine
+        if the column contains invalid values. Columns that have null values, NaN, or infinite values
+        are excluded from the resulting DataFrame. The method is especially useful for cleaning up a
+        dataset by keeping only the valid columns.
+
+        Parameters:
+        df (polars.DataFrame): The input DataFrame to be cleaned.
+
+        Returns:
+        polars.DataFrame: A DataFrame containing only columns without null, NaN, or infinite values.
+        """
+        valid_cols = []
+
+        for col in df.columns:
+            s = df[col]
+
+            # Check nulls
+            has_null = s.null_count() > 0
+
+            # Check NaN and inf only for numeric columns
+            if s.dtype.is_numeric():
+                has_nan = s.is_nan().any()
+                has_inf = s.is_infinite().any()
+            else:
+                has_nan = False
+                has_inf = False
+
+            if not (has_null or has_nan or has_inf):
+                valid_cols.append(col)
+
+        return df.select(valid_cols)
+
+    def preprocess_data(self, dataset):
+        groups = [group for group in dataset.group_by("structure_id")]
+
+        # Unless explicitly set compute everything serial
+        if os.environ.get("N_PEPPER_THREADS", 1) > 1:
+            results = Parallel(n_jobs=os.environ["N_PEPPER_THREADS"])(
+                delayed(compute_bayes_per_group)(group[1])
+                for group in dataset.group_by("structure_id")
+            )
+        else:
+            results = []
+            for g in groups:
+                results.append(compute_bayes_per_group(g[1]))
+
+        bayes_stats = pl.concat(results, how="vertical")
+        dataset = dataset.join(bayes_stats, on="structure_id", how="left")
+
+        # Remove duplicates after calculating mean, std
+        dataset = dataset.unique(subset="structure_id")
+
+        # Calculate and normalise features, make a "desc" column with the features
+        dataset = dataset.with_columns(
+            pl.col("smiles")
+            .map_elements(
+                self.descriptors.get_molecule_descriptors, return_dtype=pl.List(pl.Float64)
+            )
+            .alias("desc")
+        )
+
+        # If a SMILES fails to get desc it is removed
+        dataset = dataset.filter(pl.col("desc").is_not_null() & (pl.col("desc").list.len() > 0))
+
+        # Flatten the features into the dataset
+        dataset = dataset.with_columns(
+            pl.col("desc").list.to_struct(fields=self.descriptors.get_descriptor_names())
+        ).unnest("desc")
+
+        return dataset
+
+    def predict_batch(self, batch: List[str], is_smiles: bool = True) -> List[List[float | None]]:
+        if is_smiles:
+            rows = [self.descriptors.get_molecule_descriptors(smiles) for smiles in batch]
+        else:
+            rows = batch
+
+        # Create Dataframe with all descriptors
+        initial_desc_rows_df = pl.DataFrame(
+            data=rows, schema=self.descriptors.get_descriptor_names(), orient="row"
+        )
+
+        # Before checking for invalid values per row, select only required columns
+        initial_desc_rows_df = initial_desc_rows_df.select(
+            list(self.min_max_scaler.feature_names_in_)
+        )
+
+        to_pad = []
+        adjusted_rows = []
+        for i, row in enumerate(initial_desc_rows_df.rows()):
+            # neither infs nor nans are found -> rows seems to be valid input
+            if row and not any(math.isinf(x) for x in row) and not any(math.isnan(x) for x in row):
+                adjusted_rows.append(row)
+            else:
+                to_pad.append(i)
+
+        if adjusted_rows:
+            desc_rows_df = pl.DataFrame(
+                data=adjusted_rows, schema=list(self.min_max_scaler.feature_names_in_), orient="row"
+            )
+            x_normal = self.min_max_scaler.transform(desc_rows_df)
+            x_normal = x_normal[self.descriptor_subset]
+
+            res = self.regressor.predict(x_normal, return_std=True)
+
+            # Convert to lists
+            res = [list(res[0]), list(res[1])]
+
+            # If we had rows containing bad input (inf, nan) insert Nones at the correct position
+            if to_pad:
+                for i in to_pad:
+                    res[0].insert(i, None)
+                    res[1].insert(i, None)
+
+            return res
+
+        else:
+            return [[None] * len(batch), [None] * len(batch)]
+
+    @staticmethod
+    def remove_highly_correlated_features(
+        X_train,
+        corr_method: str = "spearman",
+        cluster_threshold: float = 0.01,
+        ignore=False,
+    ):
+        if ignore:
+            return X_train
+            # pass
+        else:
+            # Using spearmanr from scipy to achieve pandas.corr in polars
+            corr = spearmanr(X_train, axis=0).statistic
+
+            # Ensure the correlation matrix is symmetric
+            corr = (corr + corr.T) / 2
+            np.fill_diagonal(corr, 1)
+            corr = np.nan_to_num(corr)
+
+            # code from https://scikit-learn.org/stable/auto_examples/inspection/
+            # plot_permutation_importance_multicollinear.html
+            # We convert the correlation matrix to a distance matrix before performing
+            # hierarchical clustering using Ward's linkage.
+            distance_matrix = 1 - np.abs(corr)
+            dist_linkage = hierarchy.ward(squareform(distance_matrix))
+
+            cluster_ids = hierarchy.fcluster(dist_linkage, cluster_threshold, criterion="distance")
+            cluster_id_to_feature_ids = defaultdict(list)
+
+            for idx, cluster_id in enumerate(cluster_ids):
+                cluster_id_to_feature_ids[cluster_id].append(idx)
+
+            my_selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
+            X_train_sel = X_train[:, my_selected_features]
+
+            return X_train_sel
+
+    def save_model(self, path):
+        with open(path, "wb") as save_file:
+            pickle.dump(self, save_file, protocol=5)
+
+    @staticmethod
+    def load_model(path) -> "Pepper":
+        with open(path, "rb") as load_file:
+            return pickle.load(load_file)
+
+
+def compute_bayes_per_group(group):
+    """Get mean and std using bayesian"""
+    mean, std = Bayesian(group["dt50_log"]).get_posterior_distribution()
+    return pl.DataFrame(
+        {
+            "structure_id": [group["structure_id"][0]],
+            "dt50_bayesian_mean": [mean],
+            "dt50_bayesian_std": [std],
+        }
+    )