import abc import json import logging import os from collections import defaultdict from datetime import datetime, timedelta, date from typing import Union, List, Optional from uuid import uuid4 import joblib import numpy as np from django.conf import settings as s from django.contrib.auth.hashers import make_password, check_password from django.contrib.auth.models import AbstractUser from django.contrib.postgres.fields import ArrayField from django.db import models, transaction from django.db.models import JSONField from django.utils import timezone from django.utils.functional import cached_property from model_utils.models import TimeStampedModel from polymorphic.models import PolymorphicModel from sklearn.metrics import precision_score, recall_score, jaccard_score from sklearn.model_selection import ShuffleSplit from utilities.chem import FormatConverter, ProductSet, PredictionResult, IndigoUtils from utilities.ml import SparseLabelECC logger = logging.getLogger(__name__) ########################## # User/Groups/Permission # ########################## class User(AbstractUser): email = models.EmailField(unique=True) uuid = models.UUIDField(null=False, blank=False, verbose_name='UUID of this object', unique=True, default=uuid4) default_package = models.ForeignKey('epdb.Package', verbose_name='Default Package', null=True, on_delete=models.SET_NULL) default_group = models.ForeignKey('Group', verbose_name='Default Group', null=True, blank=False, on_delete=models.SET_NULL, related_name='default_group') default_setting = models.ForeignKey('epdb.Setting', on_delete=models.SET_NULL, verbose_name='The users default settings', null=True, blank=False) # TODO remove groups = models.ManyToManyField("Group", verbose_name='groups') USERNAME_FIELD = "email" REQUIRED_FIELDS = ['username'] @property def url(self): return '{}/user/{}'.format(s.SERVER_URL, self.uuid) def prediction_settings(self): if self.default_setting is None: self.default_setting = Setting.objects.get(global_default=True) self.save() return self.default_setting class APIToken(models.Model): hashed_key = models.CharField(max_length=128, unique=True) user = models.ForeignKey(User, on_delete=models.CASCADE) created = models.DateTimeField(auto_now_add=True) expires_at = models.DateTimeField(null=True, blank=True, default=timezone.now() + timedelta(days=90)) name = models.CharField(max_length=100, blank=True, help_text="Optional name for the token") def is_valid(self): return not self.expires_at or self.expires_at > timezone.now() @staticmethod def create_token(user, name="", valid_for=90): import secrets raw_token = secrets.token_urlsafe(32) hashed = make_password(raw_token) token = APIToken.objects.create(user=user, hashed_key=hashed, name=name, expires_at=timezone.now() + timedelta(days=valid_for)) return token, raw_token def check_token(self, raw_token): return check_password(raw_token, self.hashed_key) class Group(TimeStampedModel): uuid = models.UUIDField(null=False, blank=False, verbose_name='UUID of this object', unique=True, default=uuid4) name = models.TextField(blank=False, null=False, verbose_name='Group name') owner = models.ForeignKey("User", verbose_name='Group Owner', on_delete=models.CASCADE) description = models.TextField(blank=False, null=False, verbose_name='Descriptions', default='no description') user_member = models.ManyToManyField("User", verbose_name='User members', related_name='users_in_group') group_member = models.ManyToManyField("Group", verbose_name='Group member', related_name='groups_in_group') def __str__(self): return f"{self.name} (pk={self.pk})" @property def url(self): return '{}/group/{}'.format(s.SERVER_URL, self.uuid) class Permission(TimeStampedModel): READ = ('read', 'Read') WRITE = ('write', 'Write') ALL = ('all', 'All') PERMS = [ READ, WRITE, ALL ] permission = models.CharField(max_length=32, choices=PERMS, null=False) def has_read(self): return self.permission in [p[0] for p in self.PERMS] def has_write(self): return self.permission in [self.WRITE[0], self.ALL[0]] def has_all(self): return self.permission == self.ALL[0] class Meta: abstract: True class UserPackagePermission(Permission): uuid = models.UUIDField(null=False, blank=False, verbose_name='UUID of this object', primary_key=True, default=uuid4) user = models.ForeignKey('User', verbose_name='Permission to', on_delete=models.CASCADE) package = models.ForeignKey('epdb.Package', verbose_name='Permission on', on_delete=models.CASCADE) class Meta: unique_together = [('package', 'user')] def __str__(self): return f"User: {self.user} has Permission: {self.permission} on Package: {self.package}" class GroupPackagePermission(Permission): uuid = models.UUIDField(null=False, blank=False, verbose_name='UUID of this object', primary_key=True, default=uuid4) group = models.ForeignKey('Group', verbose_name='Permission to', on_delete=models.CASCADE) package = models.ForeignKey('epdb.Package', verbose_name='Permission on', on_delete=models.CASCADE) class Meta: unique_together = [('package', 'group')] def __str__(self): return f"Group: {self.group} has Permission: {self.permission} on Package: {self.package}" ############## # EP Objects # ############## class EnviPathModel(TimeStampedModel): uuid = models.UUIDField(null=False, blank=False, verbose_name='UUID of this object', unique=True, default=uuid4) name = models.TextField(blank=False, null=False, verbose_name='Name', default='no name') description = models.TextField(blank=False, null=False, verbose_name='Descriptions', default='no description') kv = JSONField(null=True, blank=True, default=dict) @property @abc.abstractmethod def url(self): pass def get_v(self, k, default=None): if self.kv: return self.kv.get(k, default) return default class Meta: abstract = True class AliasMixin(models.Model): aliases = ArrayField( models.TextField(blank=False, null=False), verbose_name='Aliases', default=list ) @transaction.atomic def add_alias(self, new_alias, set_as_default=False): if set_as_default: self.aliases.add(self.name) self.name = new_alias if new_alias in self.aliases: self.aliases.remove(new_alias) else: if new_alias not in self.aliases: self.aliases.add(new_alias) self.save() class Meta: abstract = True class ScenarioMixin(models.Model): scenarios = models.ManyToManyField("epdb.Scenario", verbose_name='Attached Scenarios') class Meta: abstract = True class Package(EnviPathModel): reviewed = models.BooleanField(verbose_name='Reviewstatus', default=False) def __str__(self): return f"{self.name} (pk={self.pk})" @property def compounds(self): return Compound.objects.filter(package=self) @property def rules(self): return Rule.objects.filter(package=self) @property def reactions(self): return Reaction.objects.filter(package=self) @property def pathways(self) -> 'Pathway': return Pathway.objects.filter(package=self) @property def scenarios(self): return Scenario.objects.filter(package=self) @property def models(self): return EPModel.objects.filter(package=self) @property def url(self): return '{}/package/{}'.format(s.SERVER_URL, self.uuid) def get_applicable_rules(self): """ Returns a ordered set of rules where the following applies: 1. All Composite will be added to result 2. All SimpleRules will be added if theres no CompositeRule present using the SimpleRule Ordering is based on "url" field. """ rules = [] rule_qs = self.rules reflected_simple_rules = set() for r in rule_qs: if isinstance(r, ParallelRule) or isinstance(r, SequentialRule): rules.append(r) for sr in r.simple_rules.all(): reflected_simple_rules.add(sr) for r in rule_qs: if isinstance(r, SimpleAmbitRule) or isinstance(r, SimpleRDKitRule): if r not in reflected_simple_rules: rules.append(r) rules = sorted(rules, key=lambda x: x.url) return rules class Compound(EnviPathModel, AliasMixin, ScenarioMixin): package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True) default_structure = models.ForeignKey('CompoundStructure', verbose_name='Default Structure', related_name='compound_default_structure', on_delete=models.CASCADE, null=True) @property def structures(self): return CompoundStructure.objects.filter(compound=self) @property def url(self): return '{}/compound/{}'.format(self.package.url, self.uuid) # @property # def related_pathways(self): # pathways = Node.objects.filter(node_labels__in=[self.default_structure]).values_list('pathway', flat=True) # return Pathway.objects.filter(package=self.package, id__in=set(pathways)).order_by('name') # @property # def related_reactions(self): # return ( # Reaction.objects.filter(package=self.package, educts__in=[self.default_structure]) # | # Reaction.objects.filter(package=self.package, products__in=[self.default_structure]) # ).order_by('name') @staticmethod @transaction.atomic def create(package: Package, smiles: str, name: str = None, description: str = None, *args, **kwargs): # Pre check # Validity of SMILES etc if CompoundStructure.objects.filter(smiles=smiles, compound__package=package).exists(): return CompoundStructure.objects.get(smiles=smiles, compound__package=package).compound # Generate Compound c = Compound() c.package = package if name is not None: c.name = name if description is not None: c.description = description c.save() normalized_smiles = smiles # chem.normalize(smiles) if normalized_smiles != smiles: _ = CompoundStructure.create(c, normalized_smiles, name='Normalized structure of {}'.format(name), description='{} (in its normalized form)'.format(description), normalized_structure=True) cs = CompoundStructure.create(c, smiles, name=name, description=description) c.default_structure = cs c.save() return c class Meta: unique_together = [('uuid', 'package')] class CompoundStructure(EnviPathModel, AliasMixin, ScenarioMixin): compound = models.ForeignKey('epdb.Compound', on_delete=models.CASCADE, db_index=True) smiles = models.TextField(blank=False, null=False, verbose_name='SMILES') normalized_structure = models.BooleanField(null=False, blank=False, default=False) @property def url(self): return '{}/structure/{}'.format(self.compound.url, self.uuid) # @property # def related_pathways(self): # pathways = Node.objects.filter(node_labels__in=[self]).values_list('pathway', flat=True) # return Pathway.objects.filter(package=self.compound.package, id__in=set(pathways)).order_by('name') # @property # def related_reactions(self): # return ( # Reaction.objects.filter(package=self.compound.package, educts__in=[self]) # | # Reaction.objects.filter(package=self.compound.package, products__in=[self]) # ).order_by('name') @staticmethod @transaction.atomic def create(compound: Compound, smiles: str, name: str = None, description: str = None, *args, **kwargs): if CompoundStructure.objects.filter(compound=compound, smiles=smiles).exists(): return CompoundStructure.objects.get(compound=compound, smiles=smiles) if compound.pk is None: raise ValueError("Unpersisted Compound! Persist compound first!") cs = CompoundStructure() if name is not None: cs.name = name if description is not None: cs.description = description cs.smiles = smiles cs.compound = compound if 'normalized_structure' in kwargs: cs.normalized_structure = kwargs['normalized_structure'] cs.save() return cs # TODO add find method @property def InChIKey(self): return FormatConverter.InChIKey(self.smiles) @property def as_svg(self): return IndigoUtils.mol_to_svg(self.smiles) class Rule(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin): package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True) # I think this only affects Django Admin which we are barely using # # https://github.com/django-polymorphic/django-polymorphic/issues/229 # _non_polymorphic = models.Manager() # # class Meta: # base_manager_name = '_non_polymorphic' @abc.abstractmethod def apply(self, *args, **kwargs): pass @staticmethod def cls_for_type(rule_type: str): if rule_type == 'SimpleAmbitRule': return SimpleAmbitRule elif rule_type == 'SimpleRDKitRule': return SimpleRDKitRule elif rule_type == 'ParallelRule': return ParallelRule elif rule_type == 'SequentialRule': return SequentialRule else: raise ValueError(f'{rule_type} is unknown!') @staticmethod @transaction.atomic def create(package: Package, rule_type: str, name: str = None, description: str = None, *args, **kwargs): r = Rule.cls_for_type(rule_type)() r.package = package r.name = name r.description = description # As we are setting params this way the "k" has to match the property name for k, v in kwargs.items(): setattr(r, k, v) r.save() return r # # @property # def related_pathways(self): # reaction_ids = self.related_reactions.values_list('id', flat=True) # pathways = Edge.objects.filter(edge_label__in=reaction_ids).values_list('pathway', flat=True) # return Pathway.objects.filter(package=self.package, id__in=set(pathways)).order_by('name') # # @property # def related_reactions(self): # return ( # Reaction.objects.filter(package=self.package, rules__in=[self]) # | # Reaction.objects.filter(package=self.package, rules__in=[self]) # ).order_by('name') # # class SimpleRule(Rule): pass # # class SimpleAmbitRule(SimpleRule): smirks = models.TextField(blank=False, null=False, verbose_name='SMIRKS') reactant_filter_smarts = models.TextField(null=True, verbose_name='Reactant Filter SMARTS') product_filter_smarts = models.TextField(null=True, verbose_name='Product Filter SMARTS') @property def url(self): return '{}/simple-ambit-rule/{}'.format(self.package.url, self.uuid) def apply(self, smiles): return FormatConverter.apply(smiles, self.smirks) @property def reactants_smarts(self): return self.smirks.split('>>')[0] @property def products_smarts(self): return self.smirks.split('>>')[1] @property def related_reactions(self): qs = Package.objects.filter(reviewed=True) return self.reaction_rule.filter(package__in=qs).order_by('name') @property def related_pathways(self): return Pathway.objects.filter( id__in=Edge.objects.filter(edge_label__in=self.related_reactions).values('pathway_id')).order_by('name') @property def as_svg(self): return IndigoUtils.smirks_to_svg(self.smirks, True) class SimpleRDKitRule(SimpleRule): reaction_smarts = models.TextField(blank=False, null=False, verbose_name='SMIRKS') def apply(self, smiles): return FormatConverter.apply(smiles, self.reaction_smarts) @property def url(self): return '{}/simple-rdkit-rule/{}'.format(self.package.url, self.uuid) # # class ParallelRule(Rule): simple_rules = models.ManyToManyField('epdb.SimpleRule', verbose_name='Simple rules') @property def url(self): return '{}/parallel-rule/{}'.format(self.package.url, self.uuid) @property def srs(self): return self.simple_rules.all() def apply(self, structure): res = list() for simple_rule in self.srs: res.extend(simple_rule.apply(structure)) return list(set(res)) class SequentialRule(Rule): simple_rules = models.ManyToManyField('epdb.SimpleRule', verbose_name='Simple rules', through='SequentialRuleOrdering') @property def url(self): return '{}/sequential-rule/{}'.format(self.compound.url, self.uuid) @property def srs(self): return self.simple_rules.all() def apply(self, structure): # TODO determine levels or see java implementation res = set() for simple_rule in self.srs: res.union(set(simple_rule.apply(structure))) return res class SequentialRuleOrdering(models.Model): sequential_rule = models.ForeignKey(SequentialRule, on_delete=models.CASCADE) simple_rule = models.ForeignKey(SimpleRule, on_delete=models.CASCADE) order_index = models.IntegerField(null=False, blank=False) class Reaction(EnviPathModel, AliasMixin, ScenarioMixin): package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True) educts = models.ManyToManyField('epdb.CompoundStructure', verbose_name='Educts', related_name='reaction_educts') products = models.ManyToManyField('epdb.CompoundStructure', verbose_name='Products', related_name='reaction_products') rules = models.ManyToManyField('epdb.Rule', verbose_name='Rule', related_name='reaction_rule') multi_step = models.BooleanField(verbose_name='Multistep Reaction') medline_references = ArrayField( models.TextField(blank=False, null=False), null=True, verbose_name='Medline References' ) @property def url(self): return '{}/reaction/{}'.format(self.package.url, self.uuid) @staticmethod @transaction.atomic def create(package: Package, name: str = None, description: str = None, educts: Union[List[str], List[CompoundStructure]] = None, products: Union[List[str], List[CompoundStructure]] = None, rule: Rule = None, multi_step: bool = True): _educts = [] _products = [] # Determine if we receive smiles or compoundstructures if all(isinstance(x, str) for x in educts + products): for educt in educts: c = Compound.create(package, educt) _educts.append(c.default_structure) for product in products: c = Compound.create(package, product) _products.append(c.default_structure) elif all(isinstance(x, CompoundStructure) for x in educts + products): _educts += educts _products += products else: raise ValueError("") r = Reaction() r.package = package r.name = name r.description = description r.multi_step = multi_step r.save() if rule: r.rules.add(rule) for educt in _educts: r.educts.add(educt) for product in _products: r.products.add(product) r.save() return r def smirks(self): return f"{'.'.join([cs.smiles for cs in self.educts.all()])}>>{'.'.join([cs.smiles for cs in self.products.all()])}" @property def as_svg(self): return IndigoUtils.smirks_to_svg(self.smirks(), False, width=800, height=400) @property def related_pathways(self): return Pathway.objects.filter( id__in=Edge.objects.filter(edge_label=self).values('pathway_id')).order_by('name') class Pathway(EnviPathModel, AliasMixin, ScenarioMixin): package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True) @property def root_nodes(self): return Node.objects.filter(pathway=self, depth=0) @property def nodes(self): return Node.objects.filter(pathway=self) @property def edges(self): return Edge.objects.filter(pathway=self) @property def url(self): return '{}/pathway/{}'.format(self.package.url, self.uuid) def d3_json(self): # Ideally it would be something like this but # to reduce crossing in edges do a DFS # nodes = [n.d3_json() for n in self.nodes] nodes = [] processed = set() queue = list() for n in self.root_nodes: queue.append(n) while len(queue): current = queue.pop() processed.add(current) nodes.append(current.d3_json()) for e in self.edges: if current in e.start_nodes.all(): for prod in e.end_nodes.all(): if prod not in queue and prod not in processed: queue.append(prod) # We shouldn't lose or make up nodes... assert len(nodes) == len(self.nodes) print(f"Num Nodes {len(nodes)} vs. DB Nodes {len(self.nodes)}") links = [e.d3_json() for e in self.edges] # D3 links Nodes based on indices in nodes array node_url_to_idx = dict() for i, n in enumerate(nodes): n['id'] = i node_url_to_idx[n['url']] = i adjusted_links = [] for link in links: # Check if we'll need pseudo nodes if len(link['end_node_urls']) > 1: start_depth = nodes[node_url_to_idx[link['start_node_urls'][0]]]['depth'] pseudo_idx = len(nodes) pseudo_node = { "depth": start_depth + 0.5, "pseudo": True, "id": pseudo_idx, } nodes.append(pseudo_node) # add links start -> pseudo new_link = { 'name': link['name'], 'id': link['id'], 'reaction': link['reaction'], 'source': node_url_to_idx[link['start_node_urls'][0]], 'target': pseudo_idx } adjusted_links.append(new_link) # add n links pseudo -> end for target in link['end_node_urls']: new_link = { 'name': link['name'], 'id': link['id'], 'reaction': link['reaction'], 'source': pseudo_idx, 'target': node_url_to_idx[target] } adjusted_links.append(new_link) else: link['source'] = node_url_to_idx[link['start_node_urls'][0]] link['target'] = node_url_to_idx[link['end_node_urls'][0]] adjusted_links.append(link) res = { "aliases": [], "completed": "true", "description": self.description, "id": self.url, "isIncremental": False, "isPredicted": False, "lastModified": 1447842835894, "pathwayName": self.name, "reviewStatus": "reviewed" if self.package.reviewed else 'unreviewed', "scenarios": [], "upToDate": True, "links": adjusted_links, "nodes": nodes, "modified": self.modified.strftime('%Y-%m-%d %H:%M:%S') } return json.dumps(res) @staticmethod @transaction.atomic def create(package, name, description, smiles): pw = Pathway() pw.package = package pw.name = name pw.description = description pw.save() # create root node Node.create(pw, smiles, 0) return pw class Node(EnviPathModel, AliasMixin, ScenarioMixin): pathway = models.ForeignKey('epdb.Pathway', verbose_name='belongs to', on_delete=models.CASCADE, db_index=True) default_node_label = models.ForeignKey('epdb.CompoundStructure', verbose_name='Default Node Label', on_delete=models.CASCADE, related_name='default_node_structure') node_labels = models.ManyToManyField('epdb.CompoundStructure', verbose_name='All Node Labels', related_name='node_structures') out_edges = models.ManyToManyField('epdb.Edge', verbose_name='Outgoing Edges') depth = models.IntegerField(verbose_name='Node depth', null=False, blank=False) @property def url(self): return '{}/node/{}'.format(self.pathway.url, self.uuid) def d3_json(self): return { "depth": self.depth, "url": self.url, "node_label_id": self.default_node_label.url, "image": self.url + '?image=svg', "imageSize": 490, # TODO "name": self.default_node_label.name, "smiles": self.default_node_label.smiles, } @staticmethod def create(pathway, smiles, depth): c = Compound.create(pathway.package, smiles) if Node.objects.filter(pathway=pathway, default_node_label=c.default_structure).exists(): print("found node") return Node.objects.get(pathway=pathway, default_node_label=c.default_structure) n = Node() n.pathway = pathway n.depth = depth n.default_node_label = c.default_structure n.save() n.node_labels.add(c.default_structure) n.save() return n @property def as_svg(self): return IndigoUtils.mol_to_svg(self.default_node_label.smiles) class Edge(PolymorphicModel, EnviPathModel, AliasMixin, ScenarioMixin): pathway = models.ForeignKey('epdb.Pathway', verbose_name='belongs to', on_delete=models.CASCADE, db_index=True) edge_label = models.ForeignKey('epdb.Reaction', verbose_name='Edge label', null=True, on_delete=models.SET_NULL) start_nodes = models.ManyToManyField('epdb.Node', verbose_name='Start Nodes', related_name='edge_educts') end_nodes = models.ManyToManyField('epdb.Node', verbose_name='End Nodes', related_name='edge_products') @property def url(self): return '{}/edge/{}'.format(self.pathway.url, self.uuid) def d3_json(self): # { # "ecNumbers": [ # { # "ecName": "DDT 2,3-dioxygenase", # "ecNumber": "1.14.12.-" # } # ], # "id": "https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/pathway/3f58e4d4-1c63-4b30-bf31-7ae4b98899fe/edge/ff193e7b-f010-43d4-acb3-45f34d938824", # "idreaction": "https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/reaction/e11419cd-6b46-470b-8a06-a08d62281734", # "multistep": "false", # "name": "Eawag BBD reaction r0450", # "pseudo": False, # "scenarios": [], # "source": 0, # "target": 4 # } return { 'name': self.name, 'id': self.url, 'reaction': self.edge_label.url if self.edge_label else None, # TODO 'start_node_urls': [x.url for x in self.start_nodes.all()], 'end_node_urls': [x.url for x in self.end_nodes.all()], } @staticmethod def create(pathway, start_nodes, end_nodes, rule: Optional[Rule] = None, name: Optional[str] = None, description: Optional[str] = None): e = Edge() e.pathway = pathway e.save() for node in start_nodes: e.start_nodes.add(node) for node in end_nodes: e.end_nodes.add(node) if name is None: name = f'Reaction {pathway.package.reactions.count() + 1}' if description is None: description = s.DEFAULT_VALUES['description'] r = Reaction.create(pathway.package, name=name, description=description, educts=[n.default_node_label for n in e.start_nodes.all()], products=[n.default_node_label for n in e.end_nodes.all()], rule=rule, multi_step=False ) e.edge_label = r e.save() return e class EPModel(PolymorphicModel, EnviPathModel): package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True) @property def url(self): return '{}/model/{}'.format(self.package.url, self.uuid) class MLRelativeReasoning(EPModel): rule_packages = models.ManyToManyField("Package", verbose_name="Rule Packages", related_name="rule_packages") data_packages = models.ManyToManyField("Package", verbose_name="Data Packages", related_name="data_packages") eval_packages = models.ManyToManyField("Package", verbose_name="Evaluation Packages", related_name="eval_packages") threshold = models.FloatField(null=False, blank=False, default=0.5) INITIAL = "INITIAL" INITIALIZING = "INITIALIZING" BUILDING = "BUILDING" BUILT_NOT_EVALUATED = "BUILT_NOT_EVALUATED" EVALUATING = "EVALUATING" FINISHED = "FINISHED" ERROR = "ERROR" PROGRESS_STATUS_CHOICES = { INITIAL: "Initial", INITIALIZING: "Model is initializing.", BUILDING: "Model is building.", BUILT_NOT_EVALUATED: "Model is built and can be used for predictions, Model is not evaluated yet.", EVALUATING: "Model is evaluating", FINISHED: "Model has finished building and evaluation.", ERROR: "Model has failed." } model_status = models.CharField(blank=False, null=False, choices=PROGRESS_STATUS_CHOICES, default=INITIAL) eval_results = JSONField(null=True, blank=True, default=dict) @staticmethod @transaction.atomic def create(package, name, description, rule_packages, data_packages, eval_packages, threshold): mlrr = MLRelativeReasoning() mlrr.package = package mlrr.name = name mlrr.description = description mlrr.threshold = threshold mlrr.save() for p in rule_packages: mlrr.rule_packages.add(p) if data_packages: for p in data_packages: mlrr.data_packages.add(p) else: for p in rule_packages: mlrr.data_packages.add(p) if eval_packages: for p in eval_packages: mlrr.eval_packages.add(p) mlrr.save() return mlrr @cached_property def applicable_rules(self): """ Returns a ordered set of rules where the following applies: 1. All Composite will be added to result 2. All SimpleRules will be added if theres no CompositeRule present using the SimpleRule Ordering is based on "url" field. """ rules = [] rule_qs = Rule.objects.none() for package in self.rule_packages.all(): rule_qs |= package.rules rule_qs = rule_qs.distinct() reflected_simple_rules = set() for r in rule_qs: if isinstance(r, ParallelRule) or isinstance(r, SequentialRule): rules.append(r) for sr in r.simple_rules.all(): reflected_simple_rules.add(sr) for r in rule_qs: if isinstance(r, SimpleAmbitRule) or isinstance(r, SimpleRDKitRule): if r not in reflected_simple_rules: rules.append(r) rules = sorted(rules, key=lambda x: x.url) return rules def _get_excludes(self): # TODO return [] def _get_pathways(self): pathway_qs = Pathway.objects.none() for p in self.data_packages.all(): pathway_qs |= p.pathways pathway_qs = pathway_qs.distinct() return pathway_qs def build_dataset(self): self.model_status = self.INITIALIZING self.save() from datetime import datetime start = datetime.now() applicable_rules = self.applicable_rules print("got rules") # if s.DEBUG: # pathways = self._get_pathways().order_by('-name')[:20] # else: pathways = self._get_pathways() print("got pathways") excludes = self._get_excludes() # Collect all compounds compounds = set() reactions = set() for i, p in enumerate(pathways): print(f"{i + 1}/{len(pathways)}...") for n in p.nodes: cs = n.default_node_label.compound.default_structure # TODO too many lookups if cs.smiles in excludes: continue compounds.add(cs) for e in p.edges: reactions.add(e.edge_label) print(len(compounds)) print(len(reactions)) triggered = set() observed = set() # TODO naming pw = defaultdict(lambda: defaultdict(set)) for i, c in enumerate(compounds): print(f"{i + 1}/{len(compounds)}...") for r in applicable_rules: # TODO check normalization product_sets = r.apply(c.smiles) if len(product_sets) == 0: continue triggered.add(f"{r.uuid} + {c.uuid}") for ps in product_sets: for p in ps: pw[c][r].add(p) for r in reactions: if r is None: print(r) continue if len(r.educts.all()) != 1: print(f"Skipping {r.url}") continue # Loop will run only once for c in r.educts.all(): if c not in pw: continue for rule in pw[c].keys(): # standardize... if 0 != len(pw[c][rule]) and len(pw[c][rule]) == len(r.products.all()): print(f"potential match for {c.smiles} and {r.uuid} ({r.name})") standardized_products = [] for cs in r.products.all(): smi = cs.smiles try: smi = FormatConverter.standardize(smi) except Exception as e: # :shrug: pass standardized_products.append(smi) standardized_pred_products = [] for smi in pw[c][rule]: try: smi = FormatConverter.standardize(smi) except Exception as e: # :shrug: pass standardized_pred_products.append(smi) if sorted(list(set(standardized_products))) == sorted(list(set(standardized_pred_products))): observed.add(f"{rule.uuid} + {c.uuid}") print(f"Adding observed, current count {len(observed)}") header = None X = [] y = [] for i, c in enumerate(compounds): print(f'{i + 1}/{len(compounds)}...') # Features feat = FormatConverter.maccs(c.smiles) trig = [] obs = [] for rule in applicable_rules: key = f"{rule.uuid} + {c.uuid}" # Check triggered if key in triggered: trig.append(1) else: trig.append(0) # Check obs if key in triggered: obs.append(1) else: obs.append(0) if header is None: header = [f'feature_{i}' for i, _ in enumerate(feat)] \ + [f'trig_{r.uuid}' for r in applicable_rules] \ + [f'corr_{r.uuid}' for r in applicable_rules] X.append(feat + trig) y.append(obs) end = datetime.now() print(f"Duration {(end - start).total_seconds()}s") data = { 'X': X, 'y': y, 'header': header } f = os.path.join(s.MODEL_DIR, f"{self.uuid}.json") json.dump(data, open(f, 'w')) return X, y def load_dataset(self): ds_path = os.path.join(s.MODEL_DIR, f"{self.uuid}.json") return json.load(open(ds_path, 'r')) def build_model(self, X, y): self.model_status = self.BUILDING self.save() mod = SparseLabelECC( **s.DEFAULT_MODELS_PARAMS ) mod.fit(X, y) f = os.path.join(s.MODEL_DIR, f"{self.uuid}.pkl") joblib.dump(mod, f) self.model_status = self.BUILT_NOT_EVALUATED self.save() def rebuild(self): data = self.load_dataset() self.build_model(data['X'], data['y']) def evaluate_model(self): """ Performs Leave-One-Out cross-validation on a multi-label dataset. Parameters: X (list of lists): Feature matrix. y (list of lists): Multi-label targets. classifier (sklearn estimator, optional): Base classifier. Defaults to RandomForest. Returns: float: Average accuracy across all LOO splits. """ if self.model_status != self.BUILT_NOT_EVALUATED: raise ValueError(f"Can't evaluate a model in state {self.model_status}!") self.model_status = self.EVALUATING self.save() f = os.path.join(s.MODEL_DIR, f"{self.uuid}.json") data = json.load(open(f)) X = np.array(data['X']) y = np.array(data['y']) n_splits = 20 shuff = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=42) def train_and_evaluate(X, y, train_index, test_index, threshold): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] model = SparseLabelECC( **s.DEFAULT_MODELS_PARAMS ) model.fit(X_train, y_train) y_pred = model.predict_proba(X_test) y_thresholded = (y_pred >= threshold).astype(int) acc = jaccard_score(y_test, y_thresholded, average='samples', zero_division=0) prec, rec = dict(), dict() for t in np.arange(0, 1.05, 0.05): temp_thresholded = (y_pred >= t).astype(int) prec[f"{t:.2f}"] = precision_score(y_test, temp_thresholded, average='samples', zero_division=0) rec[f"{t:.2f}"] = recall_score(y_test, temp_thresholded, average='samples', zero_division=0) return acc, prec, rec from joblib import Parallel, delayed ret_vals = Parallel(n_jobs=10)( delayed(train_and_evaluate)(X, y, train_index, test_index, self.threshold) for train_index, test_index in shuff.split(X) ) def compute_averages(data): num_items = len(data) avg_first_item = sum(item[0] for item in data) / num_items sum_dict2 = defaultdict(float) sum_dict3 = defaultdict(float) for _, dict2, dict3 in data: for key in dict2: sum_dict2[key] += dict2[key] for key in dict3: sum_dict3[key] += dict3[key] avg_dict2 = {key: val / num_items for key, val in sum_dict2.items()} avg_dict3 = {key: val / num_items for key, val in sum_dict3.items()} return { "average_accuracy": float(avg_first_item), "average_precision_per_threshold": avg_dict2, "average_recall_per_threshold": avg_dict3 } self.eval_results = compute_averages(ret_vals) self.model_status = self.FINISHED self.save() @cached_property def model(self): mod = joblib.load(os.path.join(s.MODEL_DIR, f'{self.uuid}.pkl')) mod.base_clf.n_jobs = -1 return mod def predict(self, smiles) -> List['PredictionResult']: start = datetime.now() features = FormatConverter.maccs(smiles) trig = [] prods = [] for rule in self.applicable_rules: products = rule.apply(smiles) if len(products): trig.append(1) prods.append(products) else: trig.append(0) prods.append([]) end_ds_gen = datetime.now() logger.info(f"Gen predict dataset took {(end_ds_gen - start).total_seconds()}s") pred = self.model.predict_proba([features + trig]) res = [] for rule, p, smis in zip(self.applicable_rules, pred[0], prods): res.append(PredictionResult(smis, p, rule)) end = datetime.now() logger.info(f"Full predict took {(end - start).total_seconds()}s") return res @property def pr_curve(self): if self.model_status != self.FINISHED: raise ValueError(f"Expected {self.FINISHED} but model is in status {self.model_status}") res = [] thresholds = self.eval_results['average_precision_per_threshold'].keys() for t in thresholds: res.append({ 'precision': self.eval_results['average_precision_per_threshold'][t], 'recall': self.eval_results['average_recall_per_threshold'][t], 'threshold': float(t) }) return res class ApplicabilityDomain(EnviPathModel): model = models.ForeignKey(MLRelativeReasoning, on_delete=models.CASCADE) num_neighbours = models.FloatField(blank=False, null=False, default=5) reliability_threshold = models.FloatField(blank=False, null=False, default=0.5) local_compatibilty_threshold = models.FloatField(blank=False, null=False, default=0.5) def build_applicability_domain(self): ds = self.model.load_dataset() X = ds['X'] import numpy as np from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X_scaled = scaler.fit_transform(X) pca = PCA(n_components=5) # choose number of components X_pca = pca.fit_transform(X_scaled) max_vals = np.max(X_pca, axis=0) min_vals = np.min(X_pca, axis=0) class RuleBaseRelativeReasoning(EPModel): pass class EnviFormer(EPModel): threshold = models.FloatField(null=False, blank=False, default=0.5) @staticmethod @transaction.atomic def create(package, name, description, threshold): mod = EnviFormer() mod.package = package mod.name = name mod.description = description mod.threshold = threshold mod.save() return mod @cached_property def model(self): mod = getattr(s, 'ENVIFORMER_INSTANCE', None) logger.info(f"Model from settings {hash(mod)}") return mod def predict(self, smiles) -> List['PredictionResult']: # example = { # 'C#N': 0.46326889595136767, # 'C#C': 0.04981685951409509, # } from rdkit import Chem m = Chem.MolFromSmiles(smiles) Chem.Kekulize(m) kek = Chem.MolToSmiles(m, kekuleSmiles=True) logger.info(f"Submitting {kek} to {hash(self.model)}") products = self.model.predict(kek) logger.info(f"Got results {products}") # from pprint import pprint # # print(smiles) # pprint(products) res = [] for smi, prob in products.items(): res.append(PredictionResult([ProductSet([smi])], prob, None)) return res @cached_property def applicable_rules(self): return [] class PluginModel(EPModel): pass # # # # # # # TODO fully implement AdditionalInformation # # # TODO consider Scenario, BaseScenario, RelatedScenario class Scenario(EnviPathModel): package = models.ForeignKey('epdb.Package', verbose_name='Package', on_delete=models.CASCADE, db_index=True) type = models.CharField(max_length=256, null=False, blank=False, default='No date') type = models.CharField(max_length=256, null=False, blank=False, default='Not specified') additional_information = models.JSONField(verbose_name='Additional Information') @property def url(self): return '{}/scenario/{}'.format(self.package.url, self.uuid) @staticmethod @transaction.atomic def create(package, name, description, date, type, additional_information): s = Scenario() s.package = package s.name = name s.description = description s.date = date s.type = type s.additional_information = additional_information s.save() return s def add_additional_information(self, data): pass def remove_additional_information(self, data): pass def set_additional_information(self, data): pass example = { "additionalInformationCollection": { "additionalInformation": [ { "addInfoName": "referringscenario", "creationDate": "2017-12-15 11:46:07.993", "data": "http://localhost:8080/package/5882df9c-dae1-4d80-a40e-db4724271456/scenario/11482bc1-8a0c-44a0-ae8b-5a02ae732559", "id": "http://localhost:8080/package/5882df9c-dae1-4d80-a40e-db4724271456/infocollection/0f30d0ca-b2bd-4c85-a425-ed8b22d4fed6/referringscenario/41532eac-e04a-4474-937a-df1344c3dce7", "identifier": "referringscenario", "lastModified": "2017-12-15 11:46:07.993", "name": "referringscenario" }, { "addInfoName": "halflife", "creationDate": "2017-12-15 11:46:07.934", "data": "First Order;;reported,no further information about the model;3690.0 - 3690.0;McCorquodale, G. & Wardrope, L. (2006)", "id": "http://localhost:8080/package/5882df9c-dae1-4d80-a40e-db4724271456/infocollection/0f30d0ca-b2bd-4c85-a425-ed8b22d4fed6/halflife/8f44fdd9-f453-4ab1-8509-2ee5826faad7", "identifier": "halflife", "lastModified": "2020-05-05 17:26:14.753", "name": "halflife" } ], "creationDate": "2017-12-15 11:46:07.608", "id": "http://localhost:8080/package/5882df9c-dae1-4d80-a40e-db4724271456/infocollection/0f30d0ca-b2bd-4c85-a425-ed8b22d4fed6", "identifier": "infocollection", "lastModified": "2020-05-05 17:26:15.496", "name": "no name" }, "aliases": [], "creationDate": "2017-12-15 11:46:08.221", "date": "no date", "description": "no description", "id": "http://localhost:8080/package/5882df9c-dae1-4d80-a40e-db4724271456/scenario/e7089e49-e07d-4a2d-8045-e144b7eb5a5e", "identifier": "scenario", "lastModified": "2020-05-05 17:26:15.065", "name": "McCorquodale, G. & Wardrope, L. (2006) - (00002) (Related Scenario) - (00000)", "reviewStatus": "reviewed", "scenarios": [], "type": "Not specified" } class UserSettingPermission(Permission): uuid = models.UUIDField(null=False, blank=False, verbose_name='UUID of this object', primary_key=True, default=uuid4) user = models.ForeignKey('User', verbose_name='Permission to', on_delete=models.CASCADE) setting = models.ForeignKey('epdb.Setting', verbose_name='Permission on', on_delete=models.CASCADE) class Meta: unique_together = [('setting', 'user')] def __str__(self): return f"User: {self.user} has Permission: {self.permission} on Setting: {self.setting}" class Setting(EnviPathModel): public = models.BooleanField(null=False, blank=False, default=False) global_default = models.BooleanField(null=False, blank=False, default=False) max_depth = models.IntegerField(null=False, blank=False, verbose_name='Setting Max Depth', default=5) max_nodes = models.IntegerField(null=False, blank=False, verbose_name='Setting Max Number of Nodes', default=30) rule_packages = models.ManyToManyField("Package", verbose_name="Setting Rule Packages", related_name="setting_rule_packages") model = models.ForeignKey('EPModel', verbose_name='Setting EPModel', on_delete=models.SET_NULL, null=True, blank=True) model_threshold = models.FloatField(null=True, blank=True, verbose_name='Setting Model Threshold', default=0.25) @cached_property def applicable_rules(self): """ Returns a ordered set of rules where the following applies: 1. All Composite will be added to result 2. All SimpleRules will be added if theres no CompositeRule present using the SimpleRule Ordering is based on "url" field. """ rules = [] rule_qs = Rule.objects.none() for package in self.rule_packages.all(): rule_qs |= package.rules rule_qs = rule_qs.distinct() reflected_simple_rules = set() for r in rule_qs: if isinstance(r, ParallelRule) or isinstance(r, SequentialRule): rules.append(r) for sr in r.simple_rules.all(): reflected_simple_rules.add(sr) for r in rule_qs: if isinstance(r, SimpleAmbitRule) or isinstance(r, SimpleRDKitRule): if r not in reflected_simple_rules: rules.append(r) rules = sorted(rules, key=lambda x: x.url) return rules def expand(self, pathway, current_node): """Decision Method whether to expand on a certain Node or not""" if pathway.num_nodes() >= self.max_nodes: logger.info(f"Pathway has {pathway.num_nodes()} which exceeds the limit of {self.max_nodes}") return [] if pathway.depth() >= self.max_depth: logger.info(f"Pathway has reached depth {pathway.depth()} which exceeds the limit of {self.max_depth}") return [] transformations = [] if self.model is not None: print(self.model) pred_results = self.model.predict(current_node.smiles) print(pred_results) for pred_result in pred_results: if pred_result.probability >= self.model_threshold: transformations.append(pred_result) else: for rule in self.applicable_rules: tmp_products = rule.apply(current_node.smiles) if tmp_products: transformations.append(PredictionResult(tmp_products, 1.0, rule)) return transformations @transaction.atomic def make_global_default(self): # Flag all others as global_default False to ensure there's only a single global_default Setting.objects.all().update(global_default=False) if not self.public: self.public = True self.global_default = True self.save()