Files
enviPy-bayer/epdb/models.py
jebus 49e02ed97d feature/additional_information (#30)
Fixes #12

Co-authored-by: Tim Lorsbach <tim@lorsba.ch>
Reviewed-on: enviPath/enviPy#30
2025-07-19 08:10:40 +12:00

1573 lines
53 KiB
Python

import abc
import json
import logging
import os
from collections import defaultdict
from datetime import datetime, timedelta, date
from typing import Union, List, Optional
from uuid import uuid4
import joblib
import numpy as np
from django.conf import settings as s
from django.contrib.auth.hashers import make_password, check_password
from django.contrib.auth.models import AbstractUser
from django.contrib.postgres.fields import ArrayField
from django.db import models, transaction
from django.db.models import JSONField
from django.utils import timezone
from django.utils.functional import cached_property
from model_utils.models import TimeStampedModel
from polymorphic.models import PolymorphicModel
from sklearn.metrics import precision_score, recall_score, jaccard_score
from sklearn.model_selection import ShuffleSplit
from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils
from utilities.ml import SparseLabelECC
logger = logging.getLogger(__name__)
##########################
# User/Groups/Permission #
##########################
class User(AbstractUser):
    """Account model that logs in with the e-mail address.

    In addition to the ``AbstractUser`` fields, a user carries a public UUID
    and optional defaults for package, group and prediction setting.
    """

    email = models.EmailField(unique=True)
    uuid = models.UUIDField(
        null=False,
        blank=False,
        verbose_name='UUID of this object',
        unique=True,
        default=uuid4,
    )
    default_package = models.ForeignKey(
        'epdb.Package',
        verbose_name='Default Package',
        null=True,
        on_delete=models.SET_NULL,
    )
    default_group = models.ForeignKey(
        'Group',
        verbose_name='Default Group',
        null=True,
        blank=False,
        on_delete=models.SET_NULL,
        related_name='default_group',
    )
    default_setting = models.ForeignKey(
        'epdb.Setting',
        on_delete=models.SET_NULL,
        verbose_name='The users default settings',
        null=True,
        blank=False,
    )
    # TODO remove
    groups = models.ManyToManyField("Group", verbose_name='groups')

    # Authentication is keyed on the e-mail; the username is still required.
    USERNAME_FIELD = "email"
    REQUIRED_FIELDS = ['username']

    @property
    def url(self):
        """Absolute URL of this user, built from ``SERVER_URL`` and the UUID."""
        return f'{s.SERVER_URL}/user/{self.uuid}'

    def prediction_settings(self):
        """Return the user's default setting.

        When none is assigned yet, the setting flagged ``global_default`` is
        looked up, stored on the user (the user is saved) and returned.
        """
        if self.default_setting is not None:
            return self.default_setting

        self.default_setting = Setting.objects.get(global_default=True)
        self.save()
        return self.default_setting
def _default_token_expiry():
    """Return the default expiry for a new token: 90 days from *now*.

    This has to be a module-level callable: Django evaluates a callable
    default every time a row is created, whereas an expression such as
    ``timezone.now() + timedelta(days=90)`` is evaluated only once at import
    time and would freeze the expiry date for the lifetime of the process.
    """
    return timezone.now() + timedelta(days=90)


class APIToken(models.Model):
    """Hashed API token belonging to a user, with an optional expiry date."""

    # Only the hash is persisted; the raw token is returned once on creation.
    hashed_key = models.CharField(max_length=128, unique=True)
    user = models.ForeignKey(User, on_delete=models.CASCADE)
    created = models.DateTimeField(auto_now_add=True)
    # ``None`` means the token never expires (see ``is_valid``).
    expires_at = models.DateTimeField(null=True, blank=True, default=_default_token_expiry)
    name = models.CharField(max_length=100, blank=True, help_text="Optional name for the token")

    def is_valid(self):
        """Return True if the token has no expiry or the expiry lies in the future."""
        return not self.expires_at or self.expires_at > timezone.now()

    @staticmethod
    def create_token(user, name="", valid_for=90):
        """Create and persist a new token for ``user``.

        Args:
            user: owner of the token.
            name: optional human readable label.
            valid_for: validity period in days, counted from now.

        Returns:
            Tuple ``(token, raw_token)``. ``raw_token`` is the only time the
            plain-text secret is available; only its hash is stored.
        """
        import secrets

        raw_token = secrets.token_urlsafe(32)
        hashed = make_password(raw_token)
        token = APIToken.objects.create(
            user=user,
            hashed_key=hashed,
            name=name,
            expires_at=timezone.now() + timedelta(days=valid_for),
        )
        return token, raw_token

    def check_token(self, raw_token):
        """Return True if ``raw_token`` matches the stored hash."""
        return check_password(raw_token, self.hashed_key)
class Group(TimeStampedModel):
    """Named collection of users and nested groups, owned by a single user."""

    uuid = models.UUIDField(
        null=False,
        blank=False,
        verbose_name='UUID of this object',
        unique=True,
        default=uuid4,
    )
    name = models.TextField(blank=False, null=False, verbose_name='Group name')
    owner = models.ForeignKey("User", verbose_name='Group Owner', on_delete=models.CASCADE)
    description = models.TextField(
        blank=False,
        null=False,
        verbose_name='Descriptions',
        default='no description',
    )
    user_member = models.ManyToManyField(
        "User",
        verbose_name='User members',
        related_name='users_in_group',
    )
    group_member = models.ManyToManyField(
        "Group",
        verbose_name='Group member',
        related_name='groups_in_group',
    )

    def __str__(self):
        """Readable representation: the name followed by the primary key."""
        return "{} (pk={})".format(self.name, self.pk)

    @property
    def url(self):
        """Absolute URL of this group, built from ``SERVER_URL`` and the UUID."""
        return f'{s.SERVER_URL}/group/{self.uuid}'
class Permission(TimeStampedModel):
    """Abstract base for permission records (read / write / all).

    The levels are cumulative: ``write`` implies ``read`` and ``all`` implies
    both, as encoded in the ``has_*`` helpers below.
    """

    READ = ('read', 'Read')
    WRITE = ('write', 'Write')
    ALL = ('all', 'All')
    PERMS = [
        READ,
        WRITE,
        ALL
    ]

    permission = models.CharField(max_length=32, choices=PERMS, null=False)

    def has_read(self):
        """Return True if the stored level is any of the known levels."""
        return self.permission in [p[0] for p in self.PERMS]

    def has_write(self):
        """Return True if the stored level is ``write`` or ``all``."""
        return self.permission in [self.WRITE[0], self.ALL[0]]

    def has_all(self):
        """Return True only if the stored level is ``all``."""
        return self.permission == self.ALL[0]

    class Meta:
        # This used to read ``abstract: True`` which is a bare annotation and
        # therefore a no-op: the model was silently concrete. Assigning the
        # option makes it abstract as intended.
        # NOTE(review): this changes the schema (no shared parent table any
        # more) - a migration has to be generated and reviewed.
        abstract = True
class UserPackagePermission(Permission):
    """Permission level granted to one user on one package."""

    uuid = models.UUIDField(
        null=False,
        blank=False,
        verbose_name='UUID of this object',
        primary_key=True,
        default=uuid4,
    )
    user = models.ForeignKey(
        'User',
        verbose_name='Permission to',
        on_delete=models.CASCADE,
    )
    package = models.ForeignKey(
        'epdb.Package',
        verbose_name='Permission on',
        on_delete=models.CASCADE,
    )

    class Meta:
        # At most one permission record per (package, user) pair.
        unique_together = [('package', 'user')]

    def __str__(self):
        """Describe who holds which permission on which package."""
        return "User: {} has Permission: {} on Package: {}".format(
            self.user, self.permission, self.package
        )
class GroupPackagePermission(Permission):
    """Permission level granted to one group on one package."""

    uuid = models.UUIDField(
        null=False,
        blank=False,
        verbose_name='UUID of this object',
        primary_key=True,
        default=uuid4,
    )
    group = models.ForeignKey(
        'Group',
        verbose_name='Permission to',
        on_delete=models.CASCADE,
    )
    package = models.ForeignKey(
        'epdb.Package',
        verbose_name='Permission on',
        on_delete=models.CASCADE,
    )

    class Meta:
        # At most one permission record per (package, group) pair.
        unique_together = [('package', 'group')]

    def __str__(self):
        """Describe which group holds which permission on which package."""
        return "Group: {} has Permission: {} on Package: {}".format(
            self.group, self.permission, self.package
        )
##############
# EP Objects #
##############
class EnviPathModel(TimeStampedModel):
    """Abstract base of all enviPath objects.

    Supplies a UUID, a name, a description and a free-form key/value JSON
    store (``kv``). Subclasses are expected to provide ``url``.
    """

    uuid = models.UUIDField(
        null=False,
        blank=False,
        verbose_name='UUID of this object',
        unique=True,
        default=uuid4,
    )
    name = models.TextField(
        blank=False,
        null=False,
        verbose_name='Name',
        default='no name',
    )
    description = models.TextField(
        blank=False,
        null=False,
        verbose_name='Descriptions',
        default='no description',
    )
    kv = JSONField(null=True, blank=True, default=dict)

    @property
    @abc.abstractmethod
    def url(self):
        """URL of the object; to be supplied by concrete subclasses."""
        pass

    def get_v(self, k, default=None):
        """Look up ``k`` in ``kv``; return ``default`` if ``kv`` is empty/None or lacks the key."""
        return self.kv.get(k, default) if self.kv else default

    class Meta:
        abstract = True
class AliasMixin(models.Model):
    """Mixin that adds a list of alternative names to a model.

    The host model is expected to provide a ``name`` attribute, which
    ``add_alias`` reads and may replace.
    """

    aliases = ArrayField(
        models.TextField(blank=False, null=False),
        verbose_name='Aliases', default=list
    )

    @transaction.atomic
    def add_alias(self, new_alias, set_as_default=False):
        """Register ``new_alias`` and persist the object.

        Args:
            new_alias: the alias to add.
            set_as_default: if True, ``new_alias`` becomes the object's name
                and the previous name is kept as an alias instead.

        ``aliases`` is an ``ArrayField`` and therefore a plain ``list``; the
        previous implementation called ``.add()`` on it, which raises
        ``AttributeError``. Set-like behaviour (no duplicates) is kept by
        checking membership before appending.
        """
        if set_as_default:
            # Demote the current name to an alias ...
            if self.name not in self.aliases:
                self.aliases.append(self.name)
            # ... and promote the new alias to be the name. The name itself
            # must not show up in the alias list.
            self.name = new_alias
            self.aliases = [a for a in self.aliases if a != new_alias]
        elif new_alias not in self.aliases:
            self.aliases.append(new_alias)
        self.save()

    class Meta:
        abstract = True
class ScenarioMixin(models.Model):
    """Mixin giving a model a many-to-many link to scenarios."""

    scenarios = models.ManyToManyField(
        "epdb.Scenario",
        verbose_name='Attached Scenarios',
    )

    class Meta:
        abstract = True
class License(models.Model):
    """License record consisting of a link and a link to an image."""

    link = models.URLField(
        blank=False,
        null=False,
        verbose_name='link',
    )
    image_link = models.URLField(
        blank=False,
        null=False,
        verbose_name='Image link',
    )
class Package(EnviPathModel):
    """Container that owns compounds, rules, reactions, pathways, scenarios and models."""

    reviewed = models.BooleanField(verbose_name='Reviewstatus', default=False)
    license = models.ForeignKey(
        'epdb.License',
        on_delete=models.SET_NULL,
        blank=True,
        null=True,
        verbose_name='License',
    )

    def __str__(self):
        """Readable representation: the name followed by the primary key."""
        return "{} (pk={})".format(self.name, self.pk)

    @property
    def compounds(self):
        """Queryset of the compounds stored in this package."""
        return Compound.objects.filter(package=self)

    @property
    def rules(self):
        """Queryset of the rules stored in this package."""
        return Rule.objects.filter(package=self)

    @property
    def reactions(self):
        """Queryset of the reactions stored in this package."""
        return Reaction.objects.filter(package=self)

    @property
    def pathways(self) -> 'Pathway':
        """Queryset of the pathways stored in this package.

        NOTE(review): the annotation says ``Pathway`` although a queryset is
        returned; left untouched here.
        """
        return Pathway.objects.filter(package=self)

    @property
    def scenarios(self):
        """Queryset of the scenarios stored in this package."""
        return Scenario.objects.filter(package=self)

    @property
    def models(self):
        """Queryset of the models (``EPModel``) stored in this package."""
        return EPModel.objects.filter(package=self)

    @property
    def url(self):
        """Absolute URL of this package, built from ``SERVER_URL`` and the UUID."""
        return f'{s.SERVER_URL}/package/{self.uuid}'

    def get_applicable_rules(self):
        """Return the package's rules that should be applied, sorted by URL.

        Selection:
            1. Every composite rule (parallel or sequential) is included.
            2. A simple rule is included only if no composite rule of this
               package already contains it.
        """
        composite_types = (ParallelRule, SequentialRule)
        simple_types = (SimpleAmbitRule, SimpleRDKitRule)

        rule_qs = self.rules
        selected = []
        covered_by_composite = set()

        # First pass: composites, remembering the simple rules they wrap.
        for rule in rule_qs:
            if isinstance(rule, composite_types):
                selected.append(rule)
                covered_by_composite.update(rule.simple_rules.all())

        # Second pass: simple rules that are not wrapped by any composite.
        for rule in rule_qs:
            if isinstance(rule, simple_types) and rule not in covered_by_composite:
                selected.append(rule)

        selected.sort(key=lambda x: x.url)
        return selected
class Compound(EnviPathModel, AliasMixin, ScenarioMixin):
    """A compound inside a package, represented by one or more structures.

    One of the structures is flagged as the normalized structure and one is
    referenced as the default structure.
    """

    package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True)
    default_structure = models.ForeignKey('CompoundStructure', verbose_name='Default Structure',
                                          related_name='compound_default_structure',
                                          on_delete=models.CASCADE, null=True)

    @property
    def structures(self):
        """Queryset of all structures attached to this compound."""
        return CompoundStructure.objects.filter(compound=self)

    @property
    def normalized_structure(self):
        """The structure flagged as normalized.

        Raises ``CompoundStructure.DoesNotExist`` / ``MultipleObjectsReturned``
        if there is not exactly one such structure.
        """
        return CompoundStructure.objects.get(compound=self, normalized_structure=True)

    @property
    def url(self):
        """URL of this compound below its package URL."""
        return '{}/compound/{}'.format(self.package.url, self.uuid)

    @transaction.atomic
    def set_default_structure(self, cs: 'CompoundStructure'):
        """Make ``cs`` the default structure and save.

        Raises:
            ValueError: if ``cs`` belongs to a different compound.
        """
        if cs.compound != self:
            raise ValueError("Attempt to set a CompoundStructure stored in a different compound as default")

        self.default_structure = cs
        self.save()

    @property
    def related_pathways(self):
        """Pathways of this package having a node labelled with the default structure, ordered by name."""
        pathways = Node.objects.filter(node_labels__in=[self.default_structure]).values_list('pathway', flat=True)
        return Pathway.objects.filter(package=self.package, id__in=set(pathways)).order_by('name')

    @property
    def related_reactions(self):
        """Reactions of this package using the default structure as educt or product, ordered by name."""
        return (
            Reaction.objects.filter(package=self.package, educts__in=[self.default_structure])
            |
            Reaction.objects.filter(package=self.package, products__in=[self.default_structure])
        ).order_by('name')

    @staticmethod
    @transaction.atomic
    def create(package: Package, smiles: str, name: str = None, description: str = None, *args, **kwargs) -> 'Compound':
        """Return the compound for ``smiles`` in ``package``, creating it if needed.

        If a structure with the given SMILES, or with its standardized form,
        already exists in the package, the owning compound is returned and
        nothing is created.

        Args:
            package: package that owns the compound.
            smiles: SMILES of the structure; surrounding whitespace is ignored.
            name: optional name; ``None``/empty keeps the model default.
            description: optional description; ``None``/empty keeps the model default.

        Raises:
            ValueError: if ``smiles`` is missing or cannot be parsed.
        """
        if smiles is None or smiles == '':
            raise ValueError('SMILES is required')

        smiles = smiles.strip()

        parsed = FormatConverter.from_smiles(smiles)
        if parsed is None:
            raise ValueError('Given SMILES is invalid')

        standardized_smiles = FormatConverter.standardize(smiles)

        # Prefer a direct match, then fall back to the standardized form.
        # ``first()`` needs a single query and does not blow up should
        # duplicates exist (``get`` would raise MultipleObjectsReturned).
        for candidate in (smiles, standardized_smiles):
            # TODO should we add a structure when only the standardized one matches?
            existing = CompoundStructure.objects.filter(
                smiles=candidate, compound__package=package).first()
            if existing is not None:
                return existing.compound

        # Name and description have model defaults, so treat empty like missing
        # to avoid persisting blanks or formatting ``None`` into derived texts.
        name = name or None
        description = description or None

        # Generate Compound
        c = Compound()
        c.package = package
        if name is not None:
            c.name = name
        if description is not None:
            c.description = description
        c.save()

        is_standardized = standardized_smiles == smiles

        if not is_standardized:
            # Use the compound's effective values (defaults included) so the
            # derived texts never contain the literal string "None".
            _ = CompoundStructure.create(c, standardized_smiles,
                                         name='Normalized structure of {}'.format(c.name),
                                         description='{} (in its normalized form)'.format(c.description),
                                         normalized_structure=True)

        cs = CompoundStructure.create(c, smiles, name=name, description=description,
                                      normalized_structure=is_standardized)

        c.default_structure = cs
        c.save()

        return c

    @transaction.atomic
    def add_structure(self, smiles: str, name: str = None, description: str = None, default_structure: bool = False,
                      *args, **kwargs) -> 'CompoundStructure':
        """Attach another structure to this compound.

        The standardized form of ``smiles`` has to equal the SMILES of the
        compound's normalized structure. If a structure with exactly this
        SMILES already exists in the package it is returned unchanged.

        Args:
            smiles: SMILES of the structure; surrounding whitespace is ignored.
            name: optional structure name.
            description: optional structure description.
            default_structure: make a newly created structure the default one.

        Raises:
            ValueError: if ``smiles`` is missing, cannot be parsed, or
                standardizes to something else than this compound.
        """
        if smiles is None or smiles == '':
            raise ValueError('SMILES is required')

        smiles = smiles.strip()

        parsed = FormatConverter.from_smiles(smiles)
        if parsed is None:
            raise ValueError('Given SMILES is invalid')

        standardized_smiles = FormatConverter.standardize(smiles)
        is_standardized = standardized_smiles == smiles

        if self.normalized_structure.smiles != standardized_smiles:
            raise ValueError('The standardized SMILES does not match the compounds standardized one!')

        # The old code filtered with ``smiles__in=smiles``; ``__in`` on a
        # string iterates its characters, so it never matched a real SMILES.
        # It also issued a stray ``get`` whose result was thrown away and
        # which could raise. An exact match is what is needed here; it also
        # covers the case that ``smiles`` is the normalized structure itself.
        existing = CompoundStructure.objects.filter(
            smiles=smiles, compound__package=self.package).first()
        if existing is not None:
            return existing

        cs = CompoundStructure.create(self, smiles, name=name, description=description,
                                      normalized_structure=is_standardized)

        if default_structure:
            self.default_structure = cs
            self.save()

        return cs

    class Meta:
        unique_together = [('uuid', 'package')]
class CompoundStructure(EnviPathModel, AliasMixin, ScenarioMixin):
    """A concrete structure (SMILES) belonging to a compound."""

    compound = models.ForeignKey('epdb.Compound', on_delete=models.CASCADE, db_index=True)
    smiles = models.TextField(blank=False, null=False, verbose_name='SMILES')
    # True for the structure holding the compound's normalized form.
    normalized_structure = models.BooleanField(null=False, blank=False, default=False)

    @property
    def url(self):
        """URL of this structure below its compound URL."""
        return '{}/structure/{}'.format(self.compound.url, self.uuid)

    # @property
    # def related_pathways(self):
    #     pathways = Node.objects.filter(node_labels__in=[self]).values_list('pathway', flat=True)
    #     return Pathway.objects.filter(package=self.compound.package, id__in=set(pathways)).order_by('name')

    # @property
    # def related_reactions(self):
    #     return (
    #         Reaction.objects.filter(package=self.compound.package, educts__in=[self])
    #         |
    #         Reaction.objects.filter(package=self.compound.package, products__in=[self])
    #     ).order_by('name')

    @staticmethod
    @transaction.atomic
    def create(compound: Compound, smiles: str, name: str = None, description: str = None, *args, **kwargs):
        """Return the structure ``smiles`` of ``compound``, creating it if needed.

        Args:
            compound: owning compound; has to be saved already.
            smiles: SMILES of the structure (stored as given).
            name: optional name; ``None`` keeps the model default.
            description: optional description; ``None`` keeps the model default.
            **kwargs: ``normalized_structure`` (bool) is honoured if present.

        Raises:
            ValueError: if ``compound`` has not been persisted yet.
        """
        # Validate before touching the database: the lookup below already
        # uses ``compound`` and must not run with an unsaved instance.
        if compound.pk is None:
            raise ValueError("Unpersisted Compound! Persist compound first!")

        existing = CompoundStructure.objects.filter(compound=compound, smiles=smiles).first()
        if existing is not None:
            return existing

        cs = CompoundStructure()
        if name is not None:
            cs.name = name

        if description is not None:
            cs.description = description

        cs.smiles = smiles
        cs.compound = compound

        if 'normalized_structure' in kwargs:
            cs.normalized_structure = kwargs['normalized_structure']

        cs.save()

        return cs

    # TODO add find method

    @property
    def InChIKey(self):
        """Result of ``FormatConverter.InChIKey`` for the stored SMILES."""
        return FormatConverter.InChIKey(self.smiles)

    @property
    def canonical_smiles(self):
        """Result of ``FormatConverter.canonicalize`` for the stored SMILES."""
        return FormatConverter.canonicalize(self.smiles)

    @property
    def as_svg(self):
        """Result of ``IndigoUtils.mol_to_svg`` for the stored SMILES."""
        return IndigoUtils.mol_to_svg(self.smiles)
class Rule(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin):
    """Polymorphic base class of all rules stored in a package."""

    package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True)

    # I think this only affects Django Admin which we are barely using
    # # https://github.com/django-polymorphic/django-polymorphic/issues/229
    # _non_polymorphic = models.Manager()
    #
    # class Meta:
    #     base_manager_name = '_non_polymorphic'

    @abc.abstractmethod
    def apply(self, *args, **kwargs):
        """Apply the rule; implemented by the concrete rule classes."""
        pass

    @staticmethod
    def cls_for_type(rule_type: str):
        """Map a rule type name to its model class.

        Raises:
            ValueError: if ``rule_type`` is not one of the known names.
        """
        if rule_type == 'SimpleAmbitRule':
            return SimpleAmbitRule
        elif rule_type == 'SimpleRDKitRule':
            return SimpleRDKitRule
        elif rule_type == 'ParallelRule':
            return ParallelRule
        elif rule_type == 'SequentialRule':
            return SequentialRule
        else:
            raise ValueError(f'{rule_type} is unknown!')

    @staticmethod
    @transaction.atomic
    def create(package: Package, rule_type: str, name: str = None, description: str = None, *args, **kwargs):
        """Create and save a rule of the given type.

        Args:
            package: package that owns the rule.
            rule_type: class name as accepted by ``cls_for_type``.
            name: optional name; ``None``/empty keeps the model default.
            description: optional description; ``None``/empty keeps the model default.
            **kwargs: set verbatim as attributes on the new rule, so each key
                has to match an attribute name of the rule class.

        Raises:
            ValueError: if ``rule_type`` is unknown.
        """
        r = Rule.cls_for_type(rule_type)()
        r.package = package

        # Name and description are NOT NULL columns with defaults; assigning
        # None would discard the default (same handling as Compound.create).
        if name is not None and name != '':
            r.name = name
        if description is not None and description != '':
            r.description = description

        # As we are setting params this way the "k" has to match the property name
        for k, v in kwargs.items():
            setattr(r, k, v)

        r.save()
        return r
#
# @property
# def related_pathways(self):
# reaction_ids = self.related_reactions.values_list('id', flat=True)
# pathways = Edge.objects.filter(edge_label__in=reaction_ids).values_list('pathway', flat=True)
# return Pathway.objects.filter(package=self.package, id__in=set(pathways)).order_by('name')
#
# @property
# def related_reactions(self):
# return (
# Reaction.objects.filter(package=self.package, rules__in=[self])
# |
# Reaction.objects.filter(package=self.package, rules__in=[self])
# ).order_by('name')
#
#
class SimpleRule(Rule):
    """Common base of the simple (non-composite) rule types; adds nothing itself."""
#
#
class SimpleAmbitRule(SimpleRule):
    """Simple rule defined by a SMIRKS plus optional reactant/product filter SMARTS."""

    smirks = models.TextField(
        blank=False,
        null=False,
        verbose_name='SMIRKS',
    )
    reactant_filter_smarts = models.TextField(
        null=True,
        verbose_name='Reactant Filter SMARTS',
    )
    product_filter_smarts = models.TextField(
        null=True,
        verbose_name='Product Filter SMARTS',
    )

    @property
    def url(self):
        """URL of this rule below its package URL."""
        return f'{self.package.url}/simple-ambit-rule/{self.uuid}'

    def apply(self, smiles):
        """Delegate to ``FormatConverter.apply`` with the given SMILES and this rule's SMIRKS."""
        return FormatConverter.apply(smiles, self.smirks)

    @property
    def reactants_smarts(self):
        """The part of the SMIRKS before the first ``>>``."""
        sides = self.smirks.split('>>')
        return sides[0]

    @property
    def products_smarts(self):
        """The part of the SMIRKS following the first ``>>``."""
        sides = self.smirks.split('>>')
        return sides[1]

    @property
    def related_reactions(self):
        """Reactions from reviewed packages that reference this rule, ordered by name."""
        reviewed_packages = Package.objects.filter(reviewed=True)
        return self.reaction_rule.filter(package__in=reviewed_packages).order_by('name')

    @property
    def related_pathways(self):
        """Pathways having an edge labelled with one of ``related_reactions``, ordered by name."""
        pathway_ids = Edge.objects.filter(edge_label__in=self.related_reactions).values('pathway_id')
        return Pathway.objects.filter(id__in=pathway_ids).order_by('name')

    @property
    def as_svg(self):
        """Result of ``IndigoUtils.smirks_to_svg`` for the SMIRKS (second argument ``True``)."""
        return IndigoUtils.smirks_to_svg(self.smirks, True)
class SimpleRDKitRule(SimpleRule):
    """Simple rule defined by a reaction SMARTS."""

    reaction_smarts = models.TextField(
        blank=False,
        null=False,
        verbose_name='SMIRKS',
    )

    def apply(self, smiles):
        """Delegate to ``FormatConverter.apply`` with the given SMILES and this rule's reaction SMARTS."""
        return FormatConverter.apply(smiles, self.reaction_smarts)

    @property
    def url(self):
        """URL of this rule below its package URL."""
        return f'{self.package.url}/simple-rdkit-rule/{self.uuid}'
#
#
class ParallelRule(Rule):
    """Composite rule that applies each of its simple rules to the same input."""

    simple_rules = models.ManyToManyField(
        'epdb.SimpleRule',
        verbose_name='Simple rules',
    )

    @property
    def url(self):
        """URL of this rule below its package URL."""
        return f'{self.package.url}/parallel-rule/{self.uuid}'

    @property
    def srs(self):
        """Queryset of the simple rules contained in this rule."""
        return self.simple_rules.all()

    def apply(self, structure):
        """Apply every contained simple rule and return the de-duplicated results as a list."""
        collected = []
        for member in self.srs:
            collected += member.apply(structure)
        # Round trip through a set drops duplicates (result order is arbitrary).
        unique = set(collected)
        return list(unique)
class SequentialRule(Rule):
    """Composite rule whose simple rules are linked through ``SequentialRuleOrdering``."""

    simple_rules = models.ManyToManyField('epdb.SimpleRule', verbose_name='Simple rules',
                                          through='SequentialRuleOrdering')

    @property
    def url(self):
        """URL of this rule below its package URL.

        A rule hangs below a package (see the ``package`` field on ``Rule``);
        the previous implementation referenced a non-existent
        ``self.compound`` and raised ``AttributeError``.
        """
        return '{}/sequential-rule/{}'.format(self.package.url, self.uuid)

    @property
    def srs(self):
        """Queryset of the simple rules contained in this rule."""
        return self.simple_rules.all()

    def apply(self, structure):
        """Apply every contained simple rule to ``structure`` and return the union of the results as a set.

        NOTE(review): the rules are applied independently, not chained; the
        TODO below about proper sequential semantics still stands.
        """
        # TODO determine levels or see java implementation
        res = set()
        for simple_rule in self.srs:
            # ``set.union`` returns a new set and leaves ``res`` untouched,
            # which made this method always return an empty set. ``update``
            # accumulates in place.
            res.update(simple_rule.apply(structure))
        return res
class SequentialRuleOrdering(models.Model):
    """Through model linking a sequential rule to a simple rule with an order index."""

    sequential_rule = models.ForeignKey(
        SequentialRule,
        on_delete=models.CASCADE,
    )
    simple_rule = models.ForeignKey(
        SimpleRule,
        on_delete=models.CASCADE,
    )
    order_index = models.IntegerField(
        null=False,
        blank=False,
    )
class Reaction(EnviPathModel, AliasMixin, ScenarioMixin):
    """A reaction inside a package: educt structures, product structures and optional rules."""

    package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True)
    educts = models.ManyToManyField('epdb.CompoundStructure', verbose_name='Educts', related_name='reaction_educts')
    products = models.ManyToManyField('epdb.CompoundStructure', verbose_name='Products',
                                      related_name='reaction_products')
    rules = models.ManyToManyField('epdb.Rule', verbose_name='Rule', related_name='reaction_rule')
    multi_step = models.BooleanField(verbose_name='Multistep Reaction')
    medline_references = ArrayField(
        models.TextField(blank=False, null=False), null=True,
        verbose_name='Medline References'
    )

    @property
    def url(self):
        """URL of this reaction below its package URL."""
        return '{}/reaction/{}'.format(self.package.url, self.uuid)

    @staticmethod
    @transaction.atomic
    def create(package: Package, name: str = None, description: str = None,
               educts: Union[List[str], List[CompoundStructure]] = None,
               products: Union[List[str], List[CompoundStructure]] = None,
               rule: Rule = None, multi_step: bool = True):
        """Create and save a reaction.

        Args:
            package: package that owns the reaction.
            name: optional name; ``None``/empty keeps the model default.
            description: optional description; ``None``/empty keeps the model default.
            educts: SMILES strings or ``CompoundStructure`` objects; ``None``
                is treated as an empty list.
            products: same as ``educts``; both have to be of the same kind.
            rule: optional rule to link to the reaction.
            multi_step: value of the ``multi_step`` flag.

        For SMILES input the compounds are obtained via ``Compound.create``
        and their default structures are used.

        Raises:
            ValueError: if educts/products mix SMILES and structures or
                contain other types.
        """
        # The defaults are None; concatenating None raised a TypeError before.
        educts = list(educts) if educts else []
        products = list(products) if products else []
        participants = educts + products

        _educts = []
        _products = []

        # Determine if we receive smiles or compoundstructures
        if all(isinstance(x, str) for x in participants):
            for educt in educts:
                c = Compound.create(package, educt)
                _educts.append(c.default_structure)

            for product in products:
                c = Compound.create(package, product)
                _products.append(c.default_structure)

        elif all(isinstance(x, CompoundStructure) for x in participants):
            _educts += educts
            _products += products

        else:
            raise ValueError(
                "Educts and products must be either all SMILES strings or all CompoundStructure objects")

        r = Reaction()
        r.package = package

        # Name and description are NOT NULL columns with defaults; assigning
        # None would discard the default (same handling as Compound.create).
        if name is not None and name != '':
            r.name = name
        if description is not None and description != '':
            r.description = description

        r.multi_step = multi_step

        # The reaction needs a primary key before M2M relations can be set.
        r.save()

        if rule:
            r.rules.add(rule)

        for educt in _educts:
            r.educts.add(educt)

        for product in _products:
            r.products.add(product)

        r.save()
        return r

    def smirks(self):
        """Return ``<educts>>><products>`` with the SMILES on each side joined by ``.``."""
        educt_part = '.'.join(cs.smiles for cs in self.educts.all())
        product_part = '.'.join(cs.smiles for cs in self.products.all())
        return f"{educt_part}>>{product_part}"

    @property
    def as_svg(self):
        """Result of ``IndigoUtils.smirks_to_svg`` for ``smirks()`` at 800x400."""
        return IndigoUtils.smirks_to_svg(self.smirks(), False, width=800, height=400)

    @property
    def related_pathways(self):
        """Pathways having an edge labelled with this reaction, ordered by name."""
        return Pathway.objects.filter(
            id__in=Edge.objects.filter(edge_label=self).values('pathway_id')).order_by('name')
class Pathway(EnviPathModel, AliasMixin, ScenarioMixin):
    """A pathway inside a package, made of nodes connected by edges."""

    package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True)

    @property
    def root_nodes(self):
        """Queryset of this pathway's nodes with depth 0."""
        return Node.objects.filter(pathway=self, depth=0)

    @property
    def nodes(self):
        """Queryset of all nodes of this pathway."""
        return Node.objects.filter(pathway=self)

    @property
    def edges(self):
        """Queryset of all edges of this pathway."""
        return Edge.objects.filter(pathway=self)

    @property
    def url(self):
        """URL of this pathway below its package URL."""
        return '{}/pathway/{}'.format(self.package.url, self.uuid)

    def d3_json(self):
        """Serialize the pathway to the JSON string consumed by the D3 view.

        Nodes are emitted in depth-first order starting at the root nodes (to
        reduce edge crossings). Links reference nodes by their index in the
        ``nodes`` array. An edge with more than one end node is split into
        ``start -> pseudo node -> each end node``.

        Raises:
            AssertionError: if the traversal does not visit exactly the
                pathway's nodes (e.g. nodes unreachable from a root).
        """
        # Load the edges and their node relations once. ``self.edges`` is a
        # property that builds a fresh queryset on every access, so the old
        # per-node loop hit the database for every visited node.
        edges = list(self.edges)
        edge_nodes = [(list(e.start_nodes.all()), list(e.end_nodes.all())) for e in edges]

        # Ideally it would be something like this but
        # to reduce crossing in edges do a DFS
        # nodes = [n.d3_json() for n in self.nodes]
        nodes = []
        processed = set()
        stack = list(self.root_nodes)

        while stack:
            current = stack.pop()
            processed.add(current)
            nodes.append(current.d3_json())

            for starts, ends in edge_nodes:
                if current in starts:
                    for prod in ends:
                        if prod not in stack and prod not in processed:
                            stack.append(prod)

        # We shouldn't lose or make up nodes...
        db_node_count = self.nodes.count()
        assert len(nodes) == db_node_count
        logger.debug("Num Nodes %s vs. DB Nodes %s", len(nodes), db_node_count)

        links = [e.d3_json() for e in edges]

        # D3 links Nodes based on indices in nodes array
        node_url_to_idx = dict()
        for i, n in enumerate(nodes):
            n['id'] = i
            node_url_to_idx[n['url']] = i

        adjusted_links = []
        for link in links:
            # Only the first start node is used as the link source.
            source_idx = node_url_to_idx[link['start_node_urls'][0]]

            # Check if we'll need pseudo nodes
            if len(link['end_node_urls']) > 1:
                start_depth = nodes[source_idx]['depth']
                pseudo_idx = len(nodes)
                pseudo_node = {
                    # placed half a level below the start node
                    "depth": start_depth + 0.5,
                    "pseudo": True,
                    "id": pseudo_idx,
                }
                nodes.append(pseudo_node)

                # add links start -> pseudo
                new_link = {
                    'name': link['name'],
                    'id': link['id'],
                    'reaction': link['reaction'],
                    'source': source_idx,
                    'target': pseudo_idx
                }
                adjusted_links.append(new_link)

                # add n links pseudo -> end
                for target in link['end_node_urls']:
                    new_link = {
                        'name': link['name'],
                        'id': link['id'],
                        'reaction': link['reaction'],
                        'source': pseudo_idx,
                        'target': node_url_to_idx[target]
                    }
                    adjusted_links.append(new_link)

            else:
                link['source'] = source_idx
                link['target'] = node_url_to_idx[link['end_node_urls'][0]]
                adjusted_links.append(link)

        res = {
            "aliases": [],
            "completed": "true",
            "description": self.description,
            "id": self.url,
            "isIncremental": False,
            "isPredicted": False,
            # NOTE(review): hard-coded timestamp, presumably a placeholder
            "lastModified": 1447842835894,
            "pathwayName": self.name,
            "reviewStatus": "reviewed" if self.package.reviewed else 'unreviewed',
            "scenarios": [],
            "upToDate": True,
            "links": adjusted_links,
            "nodes": nodes,
            "modified": self.modified.strftime('%Y-%m-%d %H:%M:%S')
        }

        return json.dumps(res)

    @staticmethod
    @transaction.atomic
    def create(package, name, description, smiles):
        """Create and save a pathway together with its root node.

        Args:
            package: package that owns the pathway.
            name: pathway name; ``None``/empty keeps the model default.
            description: pathway description; ``None``/empty keeps the model default.
            smiles: SMILES of the root node (depth 0).
        """
        pw = Pathway()
        pw.package = package

        # Name and description are NOT NULL columns with defaults; assigning
        # None would discard the default (same handling as Compound.create).
        if name is not None and name != '':
            pw.name = name
        if description is not None and description != '':
            pw.description = description
        pw.save()

        # create root node
        Node.create(pw, smiles, 0)

        return pw
class Node(EnviPathModel, AliasMixin, ScenarioMixin):
    """A node of a pathway, labelled with one or more compound structures."""

    pathway = models.ForeignKey('epdb.Pathway', verbose_name='belongs to', on_delete=models.CASCADE, db_index=True)
    default_node_label = models.ForeignKey('epdb.CompoundStructure', verbose_name='Default Node Label',
                                           on_delete=models.CASCADE, related_name='default_node_structure')
    node_labels = models.ManyToManyField('epdb.CompoundStructure', verbose_name='All Node Labels',
                                         related_name='node_structures')
    out_edges = models.ManyToManyField('epdb.Edge', verbose_name='Outgoing Edges')
    depth = models.IntegerField(verbose_name='Node depth', null=False, blank=False)

    @property
    def url(self):
        """URL of this node below its pathway URL."""
        return '{}/node/{}'.format(self.pathway.url, self.uuid)

    def d3_json(self):
        """Return the dict describing this node for the D3 pathway view."""
        return {
            "depth": self.depth,
            "url": self.url,
            "node_label_id": self.default_node_label.url,
            "image": self.url + '?image=svg',
            "imageSize": 490,  # TODO
            "name": self.default_node_label.name,
            "smiles": self.default_node_label.smiles,
        }

    @staticmethod
    @transaction.atomic
    def create(pathway, smiles, depth):
        """Return the node for ``smiles`` in ``pathway``, creating it if needed.

        The compound is obtained through ``Compound.create`` in the pathway's
        package; its default structure becomes the node label. If the pathway
        already has a node with that default label, that node is returned and
        ``depth`` is ignored.

        The whole operation is atomic so a failure while attaching the labels
        cannot leave a half-initialised node behind.
        """
        c = Compound.create(pathway.package, smiles)

        existing = Node.objects.filter(pathway=pathway, default_node_label=c.default_structure).first()
        if existing is not None:
            logger.debug("Reusing existing node for %s in pathway %s", smiles, pathway.uuid)
            return existing

        n = Node()
        n.pathway = pathway
        n.depth = depth
        n.default_node_label = c.default_structure
        # The node needs a primary key before M2M relations can be set.
        n.save()

        n.node_labels.add(c.default_structure)
        n.save()

        return n

    @property
    def as_svg(self):
        """Result of ``IndigoUtils.mol_to_svg`` for the default label's SMILES."""
        return IndigoUtils.mol_to_svg(self.default_node_label.smiles)
class Edge(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin):
    """An edge of a pathway connecting start nodes to end nodes, labelled with a reaction."""

    pathway = models.ForeignKey('epdb.Pathway', verbose_name='belongs to', on_delete=models.CASCADE, db_index=True)
    edge_label = models.ForeignKey('epdb.Reaction', verbose_name='Edge label', null=True, on_delete=models.SET_NULL)
    start_nodes = models.ManyToManyField('epdb.Node', verbose_name='Start Nodes', related_name='edge_educts')
    end_nodes = models.ManyToManyField('epdb.Node', verbose_name='End Nodes', related_name='edge_products')

    @property
    def url(self):
        """URL of this edge below its pathway URL."""
        return '{}/edge/{}'.format(self.pathway.url, self.uuid)

    def d3_json(self):
        """Return the dict describing this edge for the D3 pathway view.

        ``reaction`` is ``None`` when the edge has no label.
        """
        # {
        #   "ecNumbers": [
        #     {
        #       "ecName": "DDT 2,3-dioxygenase",
        #       "ecNumber": "1.14.12.-"
        #     }
        #   ],
        #   "id": "https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/pathway/3f58e4d4-1c63-4b30-bf31-7ae4b98899fe/edge/ff193e7b-f010-43d4-acb3-45f34d938824",
        #   "idreaction": "https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/reaction/e11419cd-6b46-470b-8a06-a08d62281734",
        #   "multistep": "false",
        #   "name": "Eawag BBD reaction r0450",
        #   "pseudo": False,
        #   "scenarios": [],
        #   "source": 0,
        #   "target": 4
        # }
        return {
            'name': self.name,
            'id': self.url,
            'reaction': self.edge_label.url if self.edge_label else None,
            # TODO
            'start_node_urls': [x.url for x in self.start_nodes.all()],
            'end_node_urls': [x.url for x in self.end_nodes.all()],
        }

    @staticmethod
    @transaction.atomic
    def create(pathway, start_nodes, end_nodes, rule: Optional[Rule] = None, name: Optional[str] = None,
               description: Optional[str] = None):
        """Create an edge plus the reaction that labels it.

        Args:
            pathway: pathway that owns the edge.
            start_nodes: nodes the edge starts at.
            end_nodes: nodes the edge ends at.
            rule: optional rule handed on to the reaction.
            name: reaction name; defaults to ``Reaction <n>`` where ``n`` is
                the package's reaction count plus one.
            description: reaction description; defaults to
                ``DEFAULT_VALUES['description']`` from the settings.

        Name and description are passed to the reaction only; they are not
        set on the edge itself.

        Edge, M2M links and reaction are written in one transaction (like the
        other ``create`` helpers in this module) so a failure while creating
        the reaction does not leave an unlabelled edge behind.
        """
        e = Edge()
        e.pathway = pathway
        # The edge needs a primary key before M2M relations can be set.
        e.save()

        for node in start_nodes:
            e.start_nodes.add(node)

        for node in end_nodes:
            e.end_nodes.add(node)

        if name is None:
            name = f'Reaction {pathway.package.reactions.count() + 1}'

        if description is None:
            description = s.DEFAULT_VALUES['description']

        r = Reaction.create(pathway.package, name=name, description=description,
                            educts=[n.default_node_label for n in e.start_nodes.all()],
                            products=[n.default_node_label for n in e.end_nodes.all()],
                            rule=rule, multi_step=False
                            )

        e.edge_label = r
        e.save()
        return e
class EPModel(PolymorphicModel, EnviPathModel):
    """Polymorphic base class of the models stored in a package."""

    package = models.ForeignKey(
        'epdb.Package',
        verbose_name='Package',
        on_delete=models.CASCADE,
        db_index=True,
    )

    @property
    def url(self):
        """URL of this model below its package URL."""
        return f'{self.package.url}/model/{self.uuid}'
class MLRelativeReasoning(EPModel):
rule_packages = models.ManyToManyField("Package", verbose_name="Rule Packages", related_name="rule_packages")
data_packages = models.ManyToManyField("Package", verbose_name="Data Packages", related_name="data_packages")
eval_packages = models.ManyToManyField("Package", verbose_name="Evaluation Packages", related_name="eval_packages")
threshold = models.FloatField(null=False, blank=False, default=0.5)
INITIAL = "INITIAL"
INITIALIZING = "INITIALIZING"
BUILDING = "BUILDING"
BUILT_NOT_EVALUATED = "BUILT_NOT_EVALUATED"
EVALUATING = "EVALUATING"
FINISHED = "FINISHED"
ERROR = "ERROR"
PROGRESS_STATUS_CHOICES = {
INITIAL: "Initial",
INITIALIZING: "Model is initializing.",
BUILDING: "Model is building.",
BUILT_NOT_EVALUATED: "Model is built and can be used for predictions, Model is not evaluated yet.",
EVALUATING: "Model is evaluating",
FINISHED: "Model has finished building and evaluation.",
ERROR: "Model has failed."
}
model_status = models.CharField(blank=False, null=False, choices=PROGRESS_STATUS_CHOICES, default=INITIAL)
eval_results = JSONField(null=True, blank=True, default=dict)
@staticmethod
@transaction.atomic
def create(package, name, description, rule_packages, data_packages, eval_packages, threshold):
    """Create and save an ``MLRelativeReasoning`` model.

    All ``rule_packages`` are linked. The data packages are ``data_packages``
    if that is non-empty, otherwise the rule packages are used. Evaluation
    packages are linked only when given.
    """
    model = MLRelativeReasoning()
    model.package = package
    model.name = name
    model.description = description
    model.threshold = threshold
    # A primary key is required before the M2M relations can be filled.
    model.save()

    for pkg in rule_packages:
        model.rule_packages.add(pkg)

    # Without explicit data packages the rule packages double as data source.
    data_source = data_packages if data_packages else rule_packages
    for pkg in data_source:
        model.data_packages.add(pkg)

    if eval_packages:
        for pkg in eval_packages:
            model.eval_packages.add(pkg)

    model.save()
    return model
@cached_property
def applicable_rules(self):
    """Return the rules of all rule packages that should be applied, sorted by URL.

    Selection:
        1. Every composite rule (parallel or sequential) is included.
        2. A simple rule is included only if no selected composite rule
           already contains it.

    The result is cached on the instance.
    """
    composite_types = (ParallelRule, SequentialRule)
    simple_types = (SimpleAmbitRule, SimpleRDKitRule)

    # Union of the rules of every rule package, without duplicates.
    rule_qs = Rule.objects.none()
    for package in self.rule_packages.all():
        rule_qs |= package.rules
    rule_qs = rule_qs.distinct()

    selected = []
    covered_by_composite = set()

    # First pass: composites, remembering the simple rules they wrap.
    for rule in rule_qs:
        if isinstance(rule, composite_types):
            selected.append(rule)
            covered_by_composite.update(rule.simple_rules.all())

    # Second pass: simple rules that are not wrapped by any composite.
    for rule in rule_qs:
        if isinstance(rule, simple_types) and rule not in covered_by_composite:
            selected.append(rule)

    selected.sort(key=lambda x: x.url)
    return selected
def _get_excludes(self):
    """Return the SMILES to leave out of the dataset; currently always an empty list."""
    # TODO
    excludes = []
    return excludes
def _get_pathways(self):
    """Return a distinct queryset with the pathways of all data packages."""
    combined = Pathway.objects.none()
    for data_package in self.data_packages.all():
        combined |= data_package.pathways
    return combined.distinct()
def build_dataset(self):
    """Build the training dataset and write it to ``<MODEL_DIR>/<uuid>.json``.

    Steps:
        1. Collect the default structures of all compounds on the nodes, and
           the reactions on the edges, of the data packages' pathways.
        2. Apply every applicable rule to every compound. A (rule, compound)
           pair is *triggered* if the rule yields at least one product set.
        3. A triggered pair is *observed* if a single-educt reaction of that
           compound has exactly the (standardized) products the rule predicts.
        4. Per compound, ``X`` is the MACCS feature vector followed by the
           trigger flags, ``y`` holds the observed flags (both in the order
           of ``applicable_rules``).

    The model status is set to ``INITIALIZING`` at the start.

    Returns:
        Tuple ``(X, y)`` of lists of lists.
    """
    self.model_status = self.INITIALIZING
    self.save()

    def _standardize_or_keep(smi):
        """Best effort: return the standardized SMILES, or the input if that fails."""
        try:
            return FormatConverter.standardize(smi)
        except Exception:
            logger.debug("Could not standardize %s, using it as is", smi)
            return smi

    start = datetime.now()

    applicable_rules = self.applicable_rules
    logger.debug("got rules")

    # if s.DEBUG:
    #     pathways = self._get_pathways().order_by('-name')[:20]
    # else:
    pathways = self._get_pathways()
    logger.debug("got pathways")

    excludes = self._get_excludes()

    # Collect all compounds and reactions
    compounds = set()
    reactions = set()
    for i, pathway in enumerate(pathways):
        logger.debug("%s/%s...", i + 1, len(pathways))

        for node in pathway.nodes:
            # TODO too many lookups
            cs = node.default_node_label.compound.default_structure
            if cs.smiles in excludes:
                continue
            compounds.add(cs)

        for edge in pathway.edges:
            reactions.add(edge.edge_label)

    logger.debug("%s compounds, %s reactions", len(compounds), len(reactions))

    triggered = set()
    observed = set()

    # TODO naming
    # compound -> rule -> set of predicted product SMILES
    pw = defaultdict(lambda: defaultdict(set))

    for i, c in enumerate(compounds):
        logger.debug("%s/%s...", i + 1, len(compounds))
        for r in applicable_rules:
            # TODO check normalization
            product_sets = r.apply(c.smiles)

            if len(product_sets) == 0:
                continue

            triggered.add(f"{r.uuid} + {c.uuid}")

            for ps in product_sets:
                for product in ps:
                    pw[c][r].add(product)

    for r in reactions:
        if r is None:
            # edge without a reaction label
            continue

        educts = list(r.educts.all())
        if len(educts) != 1:
            logger.debug("Skipping %s", r.url)
            continue

        c = educts[0]
        if c not in pw:
            continue

        products = list(r.products.all())
        for rule, predicted in pw[c].items():
            # Cheap pre-check on the counts before standardizing anything.
            if len(predicted) == 0 or len(predicted) != len(products):
                continue

            logger.debug("potential match for %s and %s (%s)", c.smiles, r.uuid, r.name)

            standardized_products = {_standardize_or_keep(cs.smiles) for cs in products}
            standardized_pred_products = {_standardize_or_keep(smi) for smi in predicted}

            if standardized_products == standardized_pred_products:
                observed.add(f"{rule.uuid} + {c.uuid}")
                logger.debug("Adding observed, current count %s", len(observed))

    header = None
    X = []
    y = []

    for i, c in enumerate(compounds):
        logger.debug("%s/%s...", i + 1, len(compounds))
        # Features
        feat = FormatConverter.maccs(c.smiles)
        trig = []
        obs = []
        for rule in applicable_rules:
            key = f"{rule.uuid} + {c.uuid}"
            trig.append(1 if key in triggered else 0)
            # The label must come from ``observed``; it was read from
            # ``triggered`` before, which made y a copy of the trigger
            # features.
            obs.append(1 if key in observed else 0)

        if header is None:
            header = [f'feature_{j}' for j, _ in enumerate(feat)] \
                     + [f'trig_{r.uuid}' for r in applicable_rules] \
                     + [f'corr_{r.uuid}' for r in applicable_rules]

        X.append(feat + trig)
        y.append(obs)

    end = datetime.now()
    logger.debug("Duration %ss", (end - start).total_seconds())

    data = {
        'X': X,
        'y': y,
        'header': header
    }

    f = os.path.join(s.MODEL_DIR, f"{self.uuid}.json")
    # Context manager guarantees the file is flushed and closed.
    with open(f, 'w') as fh:
        json.dump(data, fh)

    return X, y
def load_dataset(self):
    """
    Load the dataset that was persisted for this model.

    The dataset is read from ``<MODEL_DIR>/<uuid>.json``.

    Returns:
        The decoded JSON document. The dataset build step of this class writes
        a dict with the keys ``X``, ``y`` and ``header`` to that path.

    Raises:
        FileNotFoundError: if no dataset has been written for this model yet.
    """
    ds_path = os.path.join(s.MODEL_DIR, f"{self.uuid}.json")
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(ds_path, 'r') as fh:
        return json.load(fh)
def build_model(self, X, y):
    """
    Fit a ``SparseLabelECC`` on the given data and persist it.

    The model status is saved as ``BUILDING`` before fitting and as
    ``BUILT_NOT_EVALUATED`` once the fitted classifier has been dumped to
    ``<MODEL_DIR>/<uuid>.pkl``.

    Parameters:
        X: feature matrix passed to ``fit``.
        y: targets passed to ``fit``.
    """
    self.model_status = self.BUILDING
    self.save()

    classifier = SparseLabelECC(**s.DEFAULT_MODELS_PARAMS)
    classifier.fit(X, y)

    model_path = os.path.join(s.MODEL_DIR, f"{self.uuid}.pkl")
    joblib.dump(classifier, model_path)

    self.model_status = self.BUILT_NOT_EVALUATED
    self.save()
def rebuild(self):
    """Re-train the model from the dataset that is already persisted on disk."""
    dataset = self.load_dataset()
    features, targets = dataset['X'], dataset['y']
    self.build_model(features, targets)
def evaluate_model(self):
    """
    Evaluate the model with repeated random train/test splits.

    The persisted dataset (``<MODEL_DIR>/<uuid>.json``) is split 20 times with
    ``ShuffleSplit`` (25% of the samples held out, fixed ``random_state``).
    For every split a fresh ``SparseLabelECC`` is fitted on the training part
    and scored on the held-out part:

    * sample-averaged Jaccard score at ``self.threshold``
    * sample-averaged precision and recall for the thresholds
      0.00, 0.05, ..., 1.00

    The per-split scores are averaged and stored in ``self.eval_results`` with
    the keys ``average_accuracy`` (the mean Jaccard score),
    ``average_precision_per_threshold`` and ``average_recall_per_threshold``.
    The status moves from ``BUILT_NOT_EVALUATED`` via ``EVALUATING`` to
    ``FINISHED``.

    Returns:
        None. Results are stored on the instance and saved.

    Raises:
        ValueError: if the model is not in state ``BUILT_NOT_EVALUATED``.
    """
    if self.model_status != self.BUILT_NOT_EVALUATED:
        raise ValueError(f"Can't evaluate a model in state {self.model_status}!")

    self.model_status = self.EVALUATING
    self.save()

    dataset_path = os.path.join(s.MODEL_DIR, f"{self.uuid}.json")
    # Close the dataset file as soon as it has been parsed.
    with open(dataset_path) as fh:
        data = json.load(fh)

    X = np.array(data['X'])
    y = np.array(data['y'])

    n_splits = 20
    shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42)

    def train_and_evaluate(X, y, train_index, test_index, threshold):
        """Fit on one split and return (jaccard, precision-by-threshold, recall-by-threshold)."""
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = SparseLabelECC(
            **s.DEFAULT_MODELS_PARAMS
        )
        model.fit(X_train, y_train)

        y_pred = model.predict_proba(X_test)
        y_thresholded = (y_pred >= threshold).astype(int)
        acc = jaccard_score(y_test, y_thresholded, average='samples', zero_division=0)

        prec, rec = dict(), dict()
        for t in np.arange(0, 1.05, 0.05):
            temp_thresholded = (y_pred >= t).astype(int)
            # Thresholds are keyed as two-decimal strings ("0.05", "0.10", ...)
            prec[f"{t:.2f}"] = precision_score(y_test, temp_thresholded, average='samples', zero_division=0)
            rec[f"{t:.2f}"] = recall_score(y_test, temp_thresholded, average='samples', zero_division=0)

        return acc, prec, rec

    ret_vals = joblib.Parallel(n_jobs=10)(
        joblib.delayed(train_and_evaluate)(X, y, train_index, test_index, self.threshold)
        for train_index, test_index in shuff.split(X)
    )

    def compute_averages(data):
        """Average the per-split (score, precision dict, recall dict) tuples."""
        num_items = len(data)
        avg_first_item = sum(item[0] for item in data) / num_items

        sum_dict2 = defaultdict(float)
        sum_dict3 = defaultdict(float)
        for _, dict2, dict3 in data:
            for key, val in dict2.items():
                sum_dict2[key] += val
            for key, val in dict3.items():
                sum_dict3[key] += val

        avg_dict2 = {key: val / num_items for key, val in sum_dict2.items()}
        avg_dict3 = {key: val / num_items for key, val in sum_dict3.items()}

        return {
            "average_accuracy": float(avg_first_item),
            "average_precision_per_threshold": avg_dict2,
            "average_recall_per_threshold": avg_dict3
        }

    self.eval_results = compute_averages(ret_vals)
    self.model_status = self.FINISHED
    self.save()
@cached_property
def model(self):
    """
    Return the fitted classifier stored at ``<MODEL_DIR>/<uuid>.pkl``.

    ``n_jobs`` of the classifier's ``base_clf`` is set to ``-1`` after loading.
    """
    pickle_path = os.path.join(s.MODEL_DIR, f'{self.uuid}.pkl')
    classifier = joblib.load(pickle_path)
    classifier.base_clf.n_jobs = -1
    return classifier
def predict(self, smiles) -> List['PredictionResult']:
    """
    Predict a probability for every applicable rule on ``smiles``.

    The feature vector is the MACCS features of ``smiles`` followed by one
    0/1 flag per applicable rule telling whether that rule produced products.

    Parameters:
        smiles: SMILES string of the compound to predict for.

    Returns:
        One ``PredictionResult`` per applicable rule, holding the rule's
        products (an empty list when the rule did not trigger), the predicted
        probability and the rule itself.
    """
    started = datetime.now()

    features = FormatConverter.maccs(smiles)
    trig, prods = [], []
    for rule in self.applicable_rules:
        products = rule.apply(smiles)
        fired = bool(len(products))
        trig.append(1 if fired else 0)
        prods.append(products if fired else [])

    dataset_ready = datetime.now()
    logger.info(f"Gen predict dataset took {(dataset_ready - started).total_seconds()}s")

    pred = self.model.predict_proba([features + trig])
    res = [
        PredictionResult(smis, p, rule)
        for rule, p, smis in zip(self.applicable_rules, pred[0], prods)
    ]

    finished = datetime.now()
    logger.info(f"Full predict took {(finished - started).total_seconds()}s")
    return res
@property
def pr_curve(self):
    """
    Precision/recall points taken from the stored evaluation results.

    Returns:
        A list with one dict per evaluated threshold, each holding
        ``precision``, ``recall`` and the ``threshold`` as a float.

    Raises:
        ValueError: if the model is not in state ``FINISHED``.
    """
    if self.model_status != self.FINISHED:
        raise ValueError(f"Expected {self.FINISHED} but model is in status {self.model_status}")

    precision_by_threshold = self.eval_results['average_precision_per_threshold']
    return [
        {
            'precision': precision_by_threshold[t],
            'recall': self.eval_results['average_recall_per_threshold'][t],
            'threshold': float(t),
        }
        for t in precision_by_threshold
    ]
class ApplicabilityDomain(EnviPathModel):
    """Applicability-domain parameters attached to an ``MLRelativeReasoning`` model."""

    model = models.ForeignKey(MLRelativeReasoning, on_delete=models.CASCADE)
    num_neighbours = models.FloatField(default=5, null=False, blank=False)
    reliability_threshold = models.FloatField(default=0.5, null=False, blank=False)
    local_compatibilty_threshold = models.FloatField(default=0.5, null=False, blank=False)

    def build_applicability_domain(self):
        """
        Project the model's dataset onto five principal components.

        The feature matrix ``X`` of the model's dataset is standardised, reduced
        with a 5-component PCA, and the per-component maxima and minima are
        computed.
        """
        features = self.model.load_dataset()['X']

        import numpy as np
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import StandardScaler

        standardised = StandardScaler().fit_transform(features)

        # Number of components is fixed to five here.
        projected = PCA(n_components=5).fit_transform(standardised)

        # NOTE(review): the bounds below are computed but neither stored nor
        # returned - this method looks unfinished.
        max_vals = np.max(projected, axis=0)
        min_vals = np.min(projected, axis=0)
class RuleBaseRelativeReasoning(EPModel):
    """``EPModel`` subclass that declares no fields or behaviour of its own."""
class EnviFormer(EPModel):
    """``EPModel`` that delegates predictions to the instance configured in settings."""

    threshold = models.FloatField(null=False, blank=False, default=0.5)

    @staticmethod
    @transaction.atomic
    def create(package, name, description, threshold):
        """
        Create and save a new ``EnviFormer``.

        Parameters:
            package: package the model belongs to.
            name: name of the model.
            description: description of the model.
            threshold: value stored in the ``threshold`` field.

        Returns:
            The saved ``EnviFormer`` instance.
        """
        mod = EnviFormer()
        mod.package = package
        mod.name = name
        mod.description = description
        mod.threshold = threshold
        mod.save()
        return mod

    @cached_property
    def model(self):
        """Return ``settings.ENVIFORMER_INSTANCE``, or ``None`` if it is not configured."""
        mod = getattr(s, 'ENVIFORMER_INSTANCE', None)
        logger.info(f"Model from settings {hash(mod)}")
        return mod

    def predict(self, smiles) -> List['PredictionResult']:
        """
        Predict products for ``smiles``.

        The SMILES is kekulized with RDKit before it is submitted to the
        model. The model is expected to return a mapping of product SMILES to
        probability, e.g. ``{'C#N': 0.4632, 'C#C': 0.0498}``.

        Parameters:
            smiles: SMILES string of the compound to predict for.

        Returns:
            One ``PredictionResult`` per predicted product, each wrapping a
            single-product ``ProductSet``, the probability, and no rule.

        Raises:
            ValueError: if ``smiles`` cannot be parsed by RDKit.
        """
        from rdkit import Chem

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail with
            # a clear message instead of an opaque error inside Kekulize.
            raise ValueError(f"Could not parse SMILES '{smiles}'")

        Chem.Kekulize(mol)
        kek = Chem.MolToSmiles(mol, kekuleSmiles=True)

        logger.info(f"Submitting {kek} to {hash(self.model)}")
        products = self.model.predict(kek)
        logger.info(f"Got results {products}")

        return [
            PredictionResult([ProductSet([smi])], prob, None)
            for smi, prob in products.items()
        ]

    @cached_property
    def applicable_rules(self):
        """This model type does not work with rules, so the list is always empty."""
        return []
class PluginModel(EPModel):
    """``EPModel`` subclass that declares no fields or behaviour of its own."""
class Scenario(EnviPathModel):
    """A scenario inside a package, carrying a date, a type and additional information."""

    package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True)
    scenario_date = models.CharField(max_length=256, null=False, blank=False, default='No date')
    scenario_type = models.CharField(max_length=256, null=False, blank=False, default='Not specified')

    # for Referring Scenarios this property will be filled
    parent = models.ForeignKey('self', on_delete=models.CASCADE, default=None, null=True)

    additional_information = models.JSONField(verbose_name='Additional Information')

    @property
    def url(self):
        """URL of this scenario: ``<package url>/scenario/<uuid>``."""
        return '{}/scenario/{}'.format(self.package.url, self.uuid)

    @staticmethod
    @transaction.atomic
    def create(package, name, description, date, type, additional_information):
        """
        Create and save a new ``Scenario``.

        Parameters:
            package: package the scenario belongs to.
            name: name of the scenario.
            description: description of the scenario.
            date: value for ``scenario_date``; ``None`` keeps the field default.
            type: value for ``scenario_type``; ``None`` keeps the field default.
            additional_information: value for the ``additional_information``
                JSON field.

        Returns:
            The saved ``Scenario`` instance.
        """
        # Named `scenario` so the module-level settings alias `s` is not shadowed.
        scenario = Scenario()
        scenario.package = package
        scenario.name = name
        scenario.description = description
        # The model fields are `scenario_date` / `scenario_type`. Assigning to
        # `date` / `type` only sets plain Python attributes that are never
        # persisted. `None` is skipped so the non-nullable fields keep their
        # defaults, as they did before.
        if date is not None:
            scenario.scenario_date = date
        if type is not None:
            scenario.scenario_type = type
        scenario.additional_information = additional_information
        scenario.save()
        return scenario

    def add_additional_information(self, data):
        """Not implemented yet; currently does nothing."""
        pass

    def remove_additional_information(self, data):
        """Not implemented yet; currently does nothing."""
        pass

    def set_additional_information(self, data):
        """Not implemented yet; currently does nothing."""
        pass

    def get_additional_information(self):
        """
        Yield the stored additional information as objects.

        For every key of ``additional_information`` except ``'enzyme'``, each
        value is JSON-decoded and passed as keyword arguments to the class
        registered for that key in
        ``envipy_additional_information.NAME_MAPPING``.

        Raises:
            KeyError: if a key has no entry in ``NAME_MAPPING``.
        """
        from envipy_additional_information import NAME_MAPPING

        for k, vals in self.additional_information.items():
            # Entries stored under 'enzyme' are skipped here.
            if k == 'enzyme':
                continue

            for v in vals:
                yield NAME_MAPPING[k](**json.loads(v))
class UserSettingPermission(Permission):
    """Permission of one user on one ``Setting``; each (setting, user) pair is unique."""

    uuid = models.UUIDField(
        primary_key=True,
        default=uuid4,
        null=False,
        blank=False,
        verbose_name='UUID of this object',
    )
    user = models.ForeignKey('User', on_delete=models.CASCADE, verbose_name='Permission to')
    setting = models.ForeignKey('epdb.Setting', on_delete=models.CASCADE, verbose_name='Permission on')

    class Meta:
        unique_together = [('setting', 'user')]

    def __str__(self):
        """Human-readable summary of who holds which permission on which setting."""
        return f"User: {self.user} has Permission: {self.permission} on Setting: {self.setting}"
class Setting(EnviPathModel):
# Configuration that controls how a pathway is expanded (see `expand`).
# `public` is switched on by `make_global_default`.
public = models.BooleanField(null=False, blank=False, default=False)
# `make_global_default` resets this flag on all settings before setting it here,
# so only one setting carries it afterwards.
global_default = models.BooleanField(null=False, blank=False, default=False)
# `expand` returns no transformations once the pathway depth reaches this value.
max_depth = models.IntegerField(null=False, blank=False, verbose_name='Setting Max Depth', default=5)
# `expand` returns no transformations once the pathway node count reaches this value.
max_nodes = models.IntegerField(null=False, blank=False, verbose_name='Setting Max Number of Nodes', default=30)
# Packages whose rules are collected by `applicable_rules`.
rule_packages = models.ManyToManyField("Package", verbose_name="Setting Rule Packages",
related_name="setting_rule_packages")
# Optional model; when set, `expand` uses its predictions instead of applying rules directly.
model = models.ForeignKey('EPModel', verbose_name='Setting EPModel', on_delete=models.SET_NULL, null=True,
blank=True)
# Minimum probability a prediction needs to be kept by `expand`.
# NOTE(review): the field is nullable, but `expand` compares against it unconditionally - confirm None cannot occur.
model_threshold = models.FloatField(null=True, blank=True, verbose_name='Setting Model Threshold', default=0.25)
@cached_property
def applicable_rules(self):
    """
    Return the rules of all rule packages as a list ordered by ``url``.

    1. Every composite rule (``ParallelRule`` / ``SequentialRule``) is included.
    2. A simple rule (``SimpleAmbitRule`` / ``SimpleRDKitRule``) is included
       only if no composite rule in the result uses it.
    """
    rule_qs = Rule.objects.none()
    for package in self.rule_packages.all():
        rule_qs |= package.rules
    rule_qs = rule_qs.distinct()

    # First pass: keep composite rules and remember the simple rules they cover.
    rules = []
    reflected_simple_rules = set()
    for candidate in rule_qs:
        if isinstance(candidate, (ParallelRule, SequentialRule)):
            rules.append(candidate)
            reflected_simple_rules.update(candidate.simple_rules.all())

    # Second pass: add simple rules that no composite rule already covers.
    rules.extend(
        candidate
        for candidate in rule_qs
        if isinstance(candidate, (SimpleAmbitRule, SimpleRDKitRule))
        and candidate not in reflected_simple_rules
    )

    rules.sort(key=lambda rule: rule.url)
    return rules
def expand(self, pathway, current_node):
    """
    Decide which transformations to apply to ``current_node``.

    Nothing is expanded once the pathway has reached ``max_nodes`` nodes or
    ``max_depth`` depth. Otherwise, if a model is configured, its predictions
    with a probability of at least ``model_threshold`` are used. Without a
    model, every applicable rule that yields products is used with
    probability ``1.0``.

    Parameters:
        pathway: pathway being built; must provide ``num_nodes()`` and ``depth()``.
        current_node: node to expand; its ``smiles`` is used as input.

    Returns:
        A list of ``PredictionResult`` objects (possibly empty).
    """
    num_nodes = pathway.num_nodes()
    if num_nodes >= self.max_nodes:
        logger.info(f"Pathway has {num_nodes} which exceeds the limit of {self.max_nodes}")
        return []

    depth = pathway.depth()
    if depth >= self.max_depth:
        logger.info(f"Pathway has reached depth {depth} which exceeds the limit of {self.max_depth}")
        return []

    transformations = []
    if self.model is not None:
        # Debug output goes through the module logger instead of stdout.
        logger.debug("Predicting with model %s", self.model)
        pred_results = self.model.predict(current_node.smiles)
        logger.debug("Prediction results: %s", pred_results)

        # NOTE(review): `model_threshold` is a nullable field; a None value
        # would make this comparison raise TypeError - confirm it cannot occur.
        for pred_result in pred_results:
            if pred_result.probability >= self.model_threshold:
                transformations.append(pred_result)
    else:
        for rule in self.applicable_rules:
            tmp_products = rule.apply(current_node.smiles)
            if tmp_products:
                transformations.append(PredictionResult(tmp_products, 1.0, rule))

    return transformations
@transaction.atomic
def make_global_default(self):
    """
    Make this setting the only global default.

    The ``global_default`` flag is cleared on all settings first, then this
    setting is made public, flagged as global default and saved.
    """
    # Clear the flag everywhere so only one setting can carry it afterwards.
    Setting.objects.all().update(global_default=False)

    self.public = True
    self.global_default = True
    self.save()