[Feature] Minimal IUCLID export (#338)

This is an initial implementation that creates a working minimal .i6z document. It passes schema validation and can be imported into IUCLID. Caveat: IUCLID files target individual compounds. Pathways are not actually covered by the format. They can be added to either the soil or the water and soil OECD endpoints. **I currently only implemented the soil endpoint for all data.** This sort of works, and I can report all degradation products in a pathway (not a nice view, but we can report many transformation products and add a diagram attachment in the future). Adding additional information is an absolute pain, as we need to explicitly map each type of information to the relevant OECD field. I use the XSD schema for validation, but unfortunately the IUCLID parser is not fully compliant and requires a specific order, etc. The workflow is: find the AI structure in the XSD schema -> make the schema validation pass -> upload to IUCLID to get obscure error messages -> guess what could be wrong -> repeat 💣. New specifications get released once per year, so we will have to update accordingly. I believe that this should be a more expensive feature, as it requires significant effort to maintain. Currently implemented for root compound only in SOIL: - Soil Texture 2 - Soil Texture 1 - pH value - Half-life per soil sample / scenario (mapped to disappearance; not sure about that). - CEC - Organic Matter (only Carbon) - Moisture content - Humidity <img width="2123" alt="image.png" src="attachments/d29830e1-65ef-4136-8939-1825e0959c62"> <img width="2124" alt="image.png" src="attachments/ac9de2ac-bf68-4ba4-b40b-82f810a9de93"> <img width="2139" alt="image.png" src="attachments/5674c7e6-865e-420e-974a-6b825b331e6c"> Reviewed-on: enviPath/enviPy#338 Co-authored-by: Tobias O <tobias.olenyi@envipath.com> Co-committed-by: Tobias O <tobias.olenyi@envipath.com>
2026-04-07 19:46:12 +12:00
parent f7c45b8015
commit d06bd0d4fd
49 changed files with 66402 additions and 1014 deletions
--- a/epiuclid/tests/test_i6z.py
+++ b/epiuclid/tests/test_i6z.py
@ -0,0 +1,199 @@
+"""Tests for i6z archive assembly."""
+
+import io
+import xml.etree.ElementTree as ET
+import zipfile
+from uuid import uuid4
+
+from django.test import SimpleTestCase, tag
+
+from epiuclid.serializers.i6z import I6ZSerializer
+from epiuclid.serializers.pathway_mapper import (
+    IUCLIDDocumentBundle,
+    IUCLIDEndpointStudyRecordData,
+    IUCLIDReferenceSubstanceData,
+    IUCLIDSubstanceData,
+    IUCLIDTransformationProductEntry,
+)
+
+
+def _make_bundle() -> IUCLIDDocumentBundle:
+    ref_uuid = uuid4()
+    sub_uuid = uuid4()
+    return IUCLIDDocumentBundle(
+        substances=[
+            IUCLIDSubstanceData(
+                uuid=sub_uuid,
+                name="Benzene",
+                reference_substance_uuid=ref_uuid,
+            ),
+        ],
+        reference_substances=[
+            IUCLIDReferenceSubstanceData(
+                uuid=ref_uuid,
+                name="Benzene",
+                smiles="c1ccccc1",
+                cas_number="71-43-2",
+                molecular_formula="C6H6",
+                molecular_weight=78.11,
+            ),
+        ],
+        endpoint_study_records=[
+            IUCLIDEndpointStudyRecordData(
+                uuid=uuid4(),
+                substance_uuid=sub_uuid,
+                name="Endpoint study - Benzene",
+            ),
+        ],
+    )
+
+
+def _make_bundle_with_transformation_links() -> tuple[IUCLIDDocumentBundle, str, str]:
+    parent_ref_uuid = uuid4()
+    product_ref_uuid = uuid4()
+    sub_uuid = uuid4()
+
+    bundle = IUCLIDDocumentBundle(
+        substances=[
+            IUCLIDSubstanceData(
+                uuid=sub_uuid,
+                name="Benzene",
+                reference_substance_uuid=parent_ref_uuid,
+            ),
+        ],
+        reference_substances=[
+            IUCLIDReferenceSubstanceData(uuid=parent_ref_uuid, name="Benzene", smiles="c1ccccc1"),
+            IUCLIDReferenceSubstanceData(
+                uuid=product_ref_uuid, name="Phenol", smiles="c1ccc(O)cc1"
+            ),
+        ],
+        endpoint_study_records=[
+            IUCLIDEndpointStudyRecordData(
+                uuid=uuid4(),
+                substance_uuid=sub_uuid,
+                name="Endpoint study - Benzene",
+                transformation_products=[
+                    IUCLIDTransformationProductEntry(
+                        uuid=uuid4(),
+                        product_reference_uuid=product_ref_uuid,
+                        parent_reference_uuids=[parent_ref_uuid],
+                    )
+                ],
+            ),
+        ],
+    )
+    return bundle, f"{parent_ref_uuid}/0", f"{product_ref_uuid}/0"
+
+
+@tag("iuclid")
+class I6ZSerializerTest(SimpleTestCase):
+    def test_output_is_valid_zip(self):
+        bundle = _make_bundle()
+        data = I6ZSerializer().serialize(bundle)
+        self.assertTrue(zipfile.is_zipfile(io.BytesIO(data)))
+
+    def test_contains_manifest(self):
+        bundle = _make_bundle()
+        data = I6ZSerializer().serialize(bundle)
+
+        with zipfile.ZipFile(io.BytesIO(data)) as zf:
+            self.assertIn("manifest.xml", zf.namelist())
+
+    def test_contains_i6d_files(self):
+        bundle = _make_bundle()
+        data = I6ZSerializer().serialize(bundle)
+
+        with zipfile.ZipFile(io.BytesIO(data)) as zf:
+            names = zf.namelist()
+            # manifest + 1 substance + 1 ref substance + 1 ESR = 4 files
+            self.assertEqual(len(names), 4)
+            i6d_files = [n for n in names if n.endswith(".i6d")]
+            self.assertEqual(len(i6d_files), 3)
+
+    def test_manifest_references_all_documents(self):
+        bundle = _make_bundle()
+        data = I6ZSerializer().serialize(bundle)
+
+        with zipfile.ZipFile(io.BytesIO(data)) as zf:
+            manifest_xml = zf.read("manifest.xml").decode("utf-8")
+            root = ET.fromstring(manifest_xml)
+
+            ns = "http://iuclid6.echa.europa.eu/namespaces/manifest/v1"
+            docs = root.findall(f".//{{{ns}}}document")
+            self.assertEqual(len(docs), 3)
+
+            types = set()
+            for doc in docs:
+                type_elem = doc.find(f"{{{ns}}}type")
+                self.assertIsNotNone(type_elem)
+                assert type_elem is not None
+                types.add(type_elem.text)
+            self.assertEqual(types, {"SUBSTANCE", "REFERENCE_SUBSTANCE", "ENDPOINT_STUDY_RECORD"})
+
+    def test_manifest_contains_expected_document_links(self):
+        bundle = _make_bundle()
+        data = I6ZSerializer().serialize(bundle)
+
+        with zipfile.ZipFile(io.BytesIO(data)) as zf:
+            manifest_xml = zf.read("manifest.xml").decode("utf-8")
+            root = ET.fromstring(manifest_xml)
+
+            ns = "http://iuclid6.echa.europa.eu/namespaces/manifest/v1"
+            docs = root.findall(f".//{{{ns}}}document")
+
+            links_by_type: dict[str, set[tuple[str | None, str | None]]] = {}
+            for doc in docs:
+                doc_type = doc.findtext(f"{{{ns}}}type")
+                links = set()
+                for link in doc.findall(f"{{{ns}}}links/{{{ns}}}link"):
+                    links.add(
+                        (
+                            link.findtext(f"{{{ns}}}ref-type"),
+                            link.findtext(f"{{{ns}}}ref-uuid"),
+                        )
+                    )
+                if doc_type:
+                    links_by_type[doc_type] = links
+
+            self.assertIn("REFERENCE", {ref_type for ref_type, _ in links_by_type["SUBSTANCE"]})
+            self.assertIn("CHILD", {ref_type for ref_type, _ in links_by_type["SUBSTANCE"]})
+            self.assertIn(
+                "PARENT", {ref_type for ref_type, _ in links_by_type["ENDPOINT_STUDY_RECORD"]}
+            )
+
+    def test_i6d_files_are_valid_xml(self):
+        bundle = _make_bundle()
+        data = I6ZSerializer().serialize(bundle)
+
+        with zipfile.ZipFile(io.BytesIO(data)) as zf:
+            for name in zf.namelist():
+                if name.endswith(".i6d"):
+                    content = zf.read(name).decode("utf-8")
+                    # Should not raise
+                    ET.fromstring(content)
+
+    def test_manifest_links_esr_to_transformation_reference_substances(self):
+        bundle, parent_ref_key, product_ref_key = _make_bundle_with_transformation_links()
+        data = I6ZSerializer().serialize(bundle)
+
+        with zipfile.ZipFile(io.BytesIO(data)) as zf:
+            manifest_xml = zf.read("manifest.xml").decode("utf-8")
+            root = ET.fromstring(manifest_xml)
+
+            ns = "http://iuclid6.echa.europa.eu/namespaces/manifest/v1"
+            esr_doc = None
+            for doc in root.findall(f".//{{{ns}}}document"):
+                if doc.findtext(f"{{{ns}}}type") == "ENDPOINT_STUDY_RECORD":
+                    esr_doc = doc
+                    break
+
+            self.assertIsNotNone(esr_doc)
+            assert esr_doc is not None
+
+            reference_links = {
+                link.findtext(f"{{{ns}}}ref-uuid")
+                for link in esr_doc.findall(f"{{{ns}}}links/{{{ns}}}link")
+                if link.findtext(f"{{{ns}}}ref-type") == "REFERENCE"
+            }
+            self.assertIn(parent_ref_key, reference_links)
+            self.assertIn(product_ref_key, reference_links)