[Feature] Minimal IUCLID export (#338)

This is an initial implementation that creates a working minimal .i6z document. It passes schema validation and can be imported into IUCLID. Caveat: IUCLID files target individual compounds; pathways are not actually covered by the format. Pathway data can be added under either the soil or the water-and-soil OECD endpoints. **I currently only implemented the soil endpoint for all data.** This sort of works, and I can report all degradation products in a pathway (not a nice view, but we can report many transformation products and add a diagram attachment in the future). Adding additional information is an absolute pain, as we need to explicitly map each type of information to the relevant OECD field. I use the XSD schema for validation, but unfortunately the IUCLID parser is not fully compliant and requires a specific order, etc. The workflow is: find the AI structure in the XSD schema -> make the schema validation pass -> upload to IUCLID to get obscure error messages -> guess what could be wrong -> repeat 💣 New specifications get released once per year, so we will have to update accordingly. I believe that this should be a more expensive feature, as it requires significant effort to maintain. Currently implemented for root compound only in SOIL: - Soil Texture 2 - Soil Texture 1 - pH value - Half-life per soil sample / scenario (mapped to disappearance; not sure about that). - CEC - Organic Matter (only Carbon) - Moisture content - Humidity <img width="2123" alt="image.png" src="attachments/d29830e1-65ef-4136-8939-1825e0959c62"> <img width="2124" alt="image.png" src="attachments/ac9de2ac-bf68-4ba4-b40b-82f810a9de93"> <img width="2139" alt="image.png" src="attachments/5674c7e6-865e-420e-974a-6b825b331e6c"> Reviewed-on: enviPath/enviPy#338 Co-authored-by: Tobias O <tobias.olenyi@envipath.com> Co-committed-by: Tobias O <tobias.olenyi@envipath.com>
2026-04-07 19:46:12 +12:00
parent f7c45b8015
commit d06bd0d4fd
49 changed files with 66402 additions and 1014 deletions
--- a/epiuclid/serializers/i6z.py
+++ b/epiuclid/serializers/i6z.py
@ -0,0 +1,118 @@
+import io
+import xml.etree.ElementTree as ET
+import zipfile
+
+from epiuclid.builders.base import NS_PLATFORM_CONTAINER, document_key
+from epiuclid.builders.endpoint_study import EndpointStudyRecordBuilder
+from epiuclid.builders.reference_substance import ReferenceSubstanceBuilder
+from epiuclid.builders.substance import SubstanceBuilder
+from epiuclid.serializers.manifest import ManifestBuilder
+from epiuclid.serializers.pathway_mapper import IUCLIDDocumentBundle
+from epiuclid.schemas.loader import get_content_schema
+
+
+def _i6d_filename(uuid) -> str:
+    return f"{uuid}_0.i6d"
+
+
+class I6ZSerializer:
+    """Serialize a IUCLIDDocumentBundle to a ZIP file containing the manifest.xml and the i6d files in memory."""
+
+    def serialize(self, bundle: IUCLIDDocumentBundle, *, validate: bool = False) -> bytes:
+        return self._assemble(bundle, validate=validate)
+
+    def _assemble(self, bundle: IUCLIDDocumentBundle, *, validate: bool = False) -> bytes:
+        sub_builder = SubstanceBuilder()
+        ref_builder = ReferenceSubstanceBuilder()
+        esr_builder = EndpointStudyRecordBuilder()
+
+        # (filename, xml_string, doc_type, uuid, subtype)
+        files: list[tuple[str, str, str, str, str | None]] = []
+
+        for sub in bundle.substances:
+            fname = _i6d_filename(sub.uuid)
+            xml = sub_builder.build(sub)
+            files.append((fname, xml, "SUBSTANCE", str(sub.uuid), None))
+
+        for ref in bundle.reference_substances:
+            fname = _i6d_filename(ref.uuid)
+            xml = ref_builder.build(ref)
+            files.append((fname, xml, "REFERENCE_SUBSTANCE", str(ref.uuid), None))
+
+        for esr in bundle.endpoint_study_records:
+            fname = _i6d_filename(esr.uuid)
+            xml = esr_builder.build(esr)
+            files.append(
+                (fname, xml, "ENDPOINT_STUDY_RECORD", str(esr.uuid), "BiodegradationInSoil")
+            )
+
+        if validate:
+            self._validate_documents(files)
+
+        # Build document relationship links for manifest
+        links = self._build_links(bundle)
+
+        # Build manifest
+        manifest_docs = [(f[0], f[2], f[3], f[4]) for f in files]
+        base_uuid = str(bundle.substances[0].uuid) if bundle.substances else ""
+        manifest_xml = ManifestBuilder().build(manifest_docs, base_uuid, links=links)
+
+        # Assemble ZIP
+        buf = io.BytesIO()
+        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
+            zf.writestr("manifest.xml", manifest_xml)
+            for fname, xml, _, _, _ in files:
+                zf.writestr(fname, xml)
+        return buf.getvalue()
+
+    @staticmethod
+    def _validate_documents(
+        files: list[tuple[str, str, str, str, str | None]],
+    ) -> None:
+        """Validate each i6d document against its XSD schema.
+
+        Raises ``xmlschema.XMLSchemaValidationError`` on the first failure.
+        """
+
+        for fname, xml, doc_type, _uuid, subtype in files:
+            root = ET.fromstring(xml)
+            content = root.find(f"{{{NS_PLATFORM_CONTAINER}}}Content")
+            if content is None or len(content) == 0:
+                continue
+            content_el = list(content)[0]
+            schema = get_content_schema(doc_type, subtype)
+            schema.validate(content_el)
+
+    @staticmethod
+    def _build_links(bundle: IUCLIDDocumentBundle) -> dict[str, list[tuple[str, str]]]:
+        """Build manifest link relationships between documents.
+
+        Returns a dict mapping document UUID (str) to list of (target_doc_key, ref_type).
+        """
+        links: dict[str, list[tuple[str, str]]] = {}
+
+        def _add(uuid_str: str, target_key: str, ref_type: str) -> None:
+            doc_links = links.setdefault(uuid_str, [])
+            link = (target_key, ref_type)
+            if link not in doc_links:
+                doc_links.append(link)
+
+        # Substance -> REFERENCE link to its reference substance
+        for sub in bundle.substances:
+            if sub.reference_substance_uuid:
+                ref_key = document_key(sub.reference_substance_uuid)
+                _add(str(sub.uuid), ref_key, "REFERENCE")
+
+        # ESR -> PARENT link to its substance; substance -> CHILD link to ESR
+        for esr in bundle.endpoint_study_records:
+            sub_key = document_key(esr.substance_uuid)
+            esr_key = document_key(esr.uuid)
+            _add(str(esr.uuid), sub_key, "PARENT")
+            _add(str(esr.substance_uuid), esr_key, "CHILD")
+
+            for tp in esr.transformation_products:
+                _add(str(esr.uuid), document_key(tp.product_reference_uuid), "REFERENCE")
+                for parent_ref_uuid in tp.parent_reference_uuids:
+                    _add(str(esr.uuid), document_key(parent_ref_uuid), "REFERENCE")
+
+        return links