Current Dev State

Tim Lorsbach
2025-06-23 20:13:54 +02:00
parent b4f9bb277d
commit ded50edaa2
22617 changed files with 4345095 additions and 174 deletions

utilities/ml.py (new file, 239 lines)

@@ -0,0 +1,239 @@
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import StandardScaler

from utilities.chem import FormatConverter
# @dataclasses.dataclass
# class Feature:
# name: str
# value: float
#
#
#
# class Row:
# def __init__(self, compound_uuid: str, compound_smiles: str, descriptors: List[int]):
# self.data = {}
#
#
#
# class DataSet(object):
#
# def __init__(self):
# self.rows: List[Row] = []
#
# def add_row(self, row: Row):
# pass
@dataclass
class Compound:
    """A chemical compound, identified by its SMILES string."""
    smiles: str
    # uuid is metadata only: it is excluded from equality and hashing.
    uuid: Optional[str] = field(default=None, compare=False, hash=False)

    def __hash__(self):
        # Cache the hash; identity is based on the SMILES string alone.
        if not hasattr(self, '_hash'):
            self._hash = hash(self.smiles)
        return self._hash
@dataclass
class Reaction:
    """A reaction; equality ignores educt/product order and all uuids."""
    educts: List[Compound]
    products: List[Compound]
    rule_uuid: Optional[str] = field(default=None, compare=False, hash=False)
    reaction_uuid: Optional[str] = field(default=None, compare=False, hash=False)

    def __hash__(self):
        # Cache the hash; sort both sides so ordering does not matter.
        if not hasattr(self, '_hash'):
            self._hash = hash((
                tuple(sorted(self.educts, key=lambda x: x.smiles)),
                tuple(sorted(self.products, key=lambda x: x.smiles)),
            ))
        return self._hash

    def __eq__(self, other):
        if not isinstance(other, Reaction):
            return NotImplemented
        return (
            sorted(self.educts, key=lambda x: x.smiles) == sorted(other.educts, key=lambda x: x.smiles) and
            sorted(self.products, key=lambda x: x.smiles) == sorted(other.products, key=lambda x: x.smiles)
        )
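
# Usage sketch (not part of the original commit): identity is structural, so
# uuids and the ordering of educts/products affect neither equality nor hashing.
def _demo_reaction_identity():
    a = Reaction([Compound("CCO", uuid="u1")], [Compound("C=O"), Compound("O")])
    b = Reaction([Compound("CCO", uuid="u2")], [Compound("O"), Compound("C=O")])
    assert a == b and hash(a) == hash(b)
    assert len({a, b}) == 1  # deduplicates in sets, as generate_dataset relies on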
class Dataset(object):

    def __init__(self, headers: List[str], data: List[List[str | int | float]]):
        self.headers = headers
        self.data = data

    def features(self):
        pass

    def labels(self):
        pass

    def to_json(self):
        pass

    def to_csv(self):
        pass

    def to_arff(self):
        pass
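
# A minimal sketch (not part of the original commit) of what Dataset.to_csv
# could look like, assuming `headers` holds column names and `data` is a list
# of row-major records. Kept as a standalone helper so the stub above stays
# untouched.
def _dataset_to_csv_sketch(dataset: Dataset, path: str) -> None:
    import csv
    with open(path, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(dataset.headers)  # header line
        writer.writerows(dataset.data)    # one line per data row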
class DatasetGenerator(object):

    @staticmethod
    def generate_dataset(compounds: List[Compound], reactions: List[Reaction], applicable_rules: List['Rule'],
                         compounds_to_exclude: Optional[List[Compound]] = None,
                         educts_only: bool = False) -> List[List[List[Reaction]]]:
        # Note: compounds_to_exclude is accepted but not used yet, and the
        # result is returned as raw rows rather than a Dataset instance.
        rows = []
        if educts_only:
            # Restrict to compounds that actually occur as educts.
            compounds = set()
            for r in reactions:
                for e in r.educts:
                    compounds.add(e)
            compounds = list(compounds)
        total = len(compounds)
        for i, c in enumerate(compounds):
            row = []
            print(f"{i + 1}/{total} - {c.smiles}")
            for r in applicable_rules:
                product_sets = r.rule.apply(c.smiles)
                if len(product_sets) == 0:
                    # Rule does not trigger for this compound.
                    row.append([])
                    continue
                # triggered.add(f"{r.uuid} + {c.uuid}")
                reacts = set()
                for ps in product_sets:
                    products = []
                    for p in ps:
                        products.append(Compound(FormatConverter.standardize(p)))
                    # rule_uuid expects a string, so pass r.uuid, not the rule object.
                    reacts.add(Reaction([c], products, r.uuid))
                row.append(list(reacts))
            rows.append(row)
        return rows
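
# Usage sketch (not part of the original commit). `Rule` is only a forward
# reference in this file, so _FakeRule is a hypothetical stand-in exposing the
# two members generate_dataset touches: `uuid` and `rule.apply(smiles)`, where
# apply() is assumed to return a list of product-SMILES tuples.
def _demo_generate_dataset():
    class _FakeInnerRule:
        def apply(self, smiles):
            # Pretend the rule always splits the compound into two products.
            return [("CCO", "O")]

    class _FakeRule:
        uuid = "rule-0001"
        rule = _FakeInnerRule()

    rows = DatasetGenerator.generate_dataset(
        [Compound("CCOCC")], reactions=[], applicable_rules=[_FakeRule()])
    # One row per compound; each cell lists the Reactions one rule produced.
    print(rows)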
class SparseLabelECC(BaseEstimator, ClassifierMixin):
    """
    Ensemble of Classifier Chains with sparse label removal.
    Removes labels that are constant across all samples in training.
    """

    def __init__(self, base_clf=RandomForestClassifier(n_estimators=100, max_features='log2', random_state=42),
                 num_chains: int = 10):
        # Sharing one default estimator instance is safe here because
        # ClassifierChain clones base_clf before fitting.
        self.base_clf = base_clf
        self.num_chains = num_chains
    def fit(self, X, Y):
        y = np.array(Y)
        self.n_labels_ = y.shape[1]
        # Labels that are constant over the training set carry no signal:
        # remember their value and train only on the remaining columns.
        self.removed_labels_ = {}
        self.keep_columns_ = []
        for col in range(self.n_labels_):
            unique_values = np.unique(y[:, col])
            if len(unique_values) == 1:
                self.removed_labels_[col] = unique_values[0]
            else:
                self.keep_columns_.append(col)
        y_reduced = y[:, self.keep_columns_]
        # Each chain sees the labels in a different random order.
        self.chains_ = [ClassifierChain(self.base_clf, order='random', random_state=i)
                        for i in range(self.num_chains)]
        for i, chain in enumerate(self.chains_):
            print(f"{datetime.now()} fitting {i + 1}/{self.num_chains}")
            chain.fit(X, y_reduced)
        return self
    def predict(self, X, threshold=0.5):
        # Ensemble vote: average the chains' hard predictions, then threshold.
        avg_preds = np.mean([chain.predict(X) for chain in self.chains_], axis=0) > threshold
        full_y = np.zeros((avg_preds.shape[0], self.n_labels_))
        for idx, col in enumerate(self.keep_columns_):
            full_y[:, col] = avg_preds[:, idx]
        # Re-insert the constant labels that were removed before training.
        for col, value in self.removed_labels_.items():
            full_y[:, col] = bool(value)
        return full_y

    def predict_proba(self, X):
        # Average the per-label probabilities across all chains.
        avg_proba = np.mean([chain.predict_proba(X) for chain in self.chains_], axis=0)
        full_y = np.zeros((avg_proba.shape[0], self.n_labels_))
        for idx, col in enumerate(self.keep_columns_):
            full_y[:, col] = avg_proba[:, idx]
        for col, value in self.removed_labels_.items():
            full_y[:, col] = float(value)
        return full_y
def score(self, X, Y, sample_weight=None):
"""
Default scoring using subset accuracy (exact match).
"""
y_true = np.array(Y)
y_pred = self.predict(X)
return accuracy_score(y_true, y_pred, sample_weight=sample_weight)
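
# Usage sketch (not part of the original commit): synthetic multi-label data
# with one constant column, which fit() drops and predict() re-inserts.
# num_chains is kept small only to make the demo fast.
def _demo_sparse_label_ecc():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(60, 8))
    Y = np.column_stack([
        (X[:, 0] > 0).astype(int),            # informative label
        (X[:, 1] + X[:, 2] > 0).astype(int),  # informative label
        np.ones(60, dtype=int),               # constant label, removed in fit()
    ])
    clf = SparseLabelECC(num_chains=3).fit(X, Y)
    print(clf.predict(X[:5]))  # shape (5, 3); constant column restored as 1.0
    print(clf.score(X, Y))     # subset accuracy (exact match)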
class ApplicabilityDomain(PCA):

    def __init__(self, n_components=5):
        super().__init__(n_components=n_components)
        self.scaler = StandardScaler()
        self.min_vals = None
        self.max_vals = None

    def build(self, X):
        # Scale the descriptors, fit the PCA, and record the bounding box of
        # the training data in the reduced space.
        X_scaled = self.scaler.fit_transform(X)
        X_pca = self.fit_transform(X_scaled)
        self.max_vals = np.max(X_pca, axis=0)
        self.min_vals = np.min(X_pca, axis=0)

    def is_applicable(self, instances):
        # An instance is in-domain if every PCA coordinate falls inside the
        # [min, max] range observed during build().
        instances_scaled = self.scaler.transform(instances)
        instances_pca = self.transform(instances_scaled)
        flags = []
        for instance in instances_pca:
            flags.append(all(min_v <= new_v <= max_v
                             for min_v, max_v, new_v in zip(self.min_vals, self.max_vals, instance)))
        return flags
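
# Usage sketch (not part of the original commit): build the domain on training
# descriptors, then flag new instances that fall outside the PCA bounding box.
if __name__ == "__main__":
    rng = np.random.default_rng(1)
    X_train = rng.normal(size=(100, 10))
    ad = ApplicabilityDomain(n_components=5)
    ad.build(X_train)
    X_new = np.vstack([rng.normal(size=(3, 10)),  # likely in-domain
                       10 * np.ones((1, 10))])    # far outside the training range
    print(ad.is_applicable(X_new))  # e.g. [True, True, True, False]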