Merge remote-tracking branch 'origin/develop' into enhancement/dataset

# Conflicts: # epdb/models.py # tests/test_enviformer.py # tests/test_model.py
2025-11-07 08:28:03 +13:00
parent 6a5413b492 98d62e1d1f
commit cfd8d7440b
25 changed files with 1024 additions and 280 deletions
--- a/utilities/chem.py
+++ b/utilities/chem.py
@ -192,7 +192,7 @@ class FormatConverter(object):
        return smiles

    @staticmethod
-    def standardize(smiles, remove_stereo=False):
+    def standardize(smiles, remove_stereo=False, canonicalize_tautomers=False):
        # Taken from https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
        # follows the steps in
        # https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
@ -210,19 +210,21 @@ class FormatConverter(object):
        uncharger = (
            rdMolStandardize.Uncharger()
        )  # annoying, but necessary as no convenience method exists
-        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
+        res_mol = uncharger.uncharge(parent_clean_mol)

        # note that no attempt is made at reionization at this step
        # nor at ionization at some pH (rdkit has no pKa calculator)
        # the main aim is to represent all molecules from different sources
        # in a (single) standard way, for use in ML, catalogue, etc.
-        # te = rdMolStandardize.TautomerEnumerator()  # idem
-        # taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        if remove_stereo:
-            Chem.RemoveStereochemistry(uncharged_parent_clean_mol)
+            Chem.RemoveStereochemistry(res_mol)

-        return Chem.MolToSmiles(uncharged_parent_clean_mol, kekuleSmiles=True)
+        if canonicalize_tautomers:
+            te = rdMolStandardize.TautomerEnumerator()  # idem
+            res_mol = te.Canonicalize(res_mol)
+
+        return Chem.MolToSmiles(res_mol, kekuleSmiles=True)

    @staticmethod
    def neutralize_smiles(smiles):
@ -370,6 +372,76 @@ class FormatConverter(object):

        return parsed_smiles, errors

+    @staticmethod
+    def smiles_covered_by(
+        l_smiles: List[str],
+        r_smiles: List[str],
+        standardize: bool = True,
+        canonicalize_tautomers: bool = True,
+    ) -> bool:
+        """
+        Check if all SMILES in the left list are covered by (contained in) the right list.
+
+        This function performs a subset check to determine if every chemical structure
+        represented in l_smiles has a corresponding representation in r_smiles.
+
+        Args:
+            l_smiles (List[str]): List of SMILES strings to check for coverage.
+            r_smiles (List[str]): List of SMILES strings that should contain all l_smiles.
+            standardize (bool, optional): Whether to standardize SMILES before comparison.
+                Defaults to True. When True, applies FormatConverter.standardize() to
+                normalize representations for accurate comparison.
+            canonicalize_tautomers (bool, optional): Whether to canonicalize tautomers
+                Defaults to False. When True, applies rdMolStandardize.TautomerEnumerator().Canonicalize(res_mol)
+                to the compounds before comparison.
+        Returns:
+            bool: True if all SMILES in l_smiles are found in r_smiles (i.e., l_smiles
+                  is a subset of r_smiles), False otherwise.
+
+        Note:
+            - Comparison treats lists as sets, ignoring duplicates and order
+            - Failed standardization attempts are silently ignored (original SMILES used)
+            - This is a one-directional check: l_smiles ⊆ r_smiles
+            - For bidirectional equality, both directions must be checked separately
+
+        Example:
+            >>> FormatConverter.smiles_covered_by(["CCO", "CC"], ["CCO", "CC", "CCC"])
+            True
+            >>> FormatConverter.smiles_covered_by(["CCO", "CCCC"], ["CCO", "CC", "CCC"])
+            False
+        """
+
+        standardized_l_smiles = []
+
+        if standardize:
+            for smi in l_smiles:
+                try:
+                    smi = FormatConverter.standardize(
+                        smi, canonicalize_tautomers=canonicalize_tautomers
+                    )
+                except Exception:
+                    # :shrug:
+                    # logger.debug(f'Standardizing SMILES failed for {smi}')
+                    pass
+                standardized_l_smiles.append(smi)
+        else:
+            standardized_l_smiles = l_smiles
+
+        standardized_r_smiles = []
+        if standardize:
+            for smi in r_smiles:
+                try:
+                    smi = FormatConverter.standardize(smi)
+                except Exception:
+                    # :shrug:
+                    # logger.debug(f'Standardizing SMILES failed for {smi}')
+                    pass
+                standardized_r_smiles.append(smi)
+        else:
+            standardized_r_smiles = r_smiles
+
+        return len(set(standardized_l_smiles).difference(set(standardized_r_smiles))) == 0
+

 class Standardizer(ABC):
    def __init__(self, name):