[Feature] Minimal IUCLID export (#338)

This is an initial implementation that creates a working minimal .i6z document.
It passes schema validation and can be imported into IUCLID.

Caveat:
IUCLID files target individual compounds.
Pathway is not actually covered by the format.

Pathway information can be added to either the soil or the water-and-soil OECD endpoints.
**I currently only implemented the soil endpoint for all data.**

This sort of works, and I can report all degradation products in a pathway (not a nice view, but we can report many transformation products and add a diagram attachment in the future).

Adding additional information is an absolute pain, as we need to explicitly map each type of information to the relevant OECD field.
I use the XSD schema for validation, but unfortunately the IUCLID parser is not fully compliant and requires a specific order, etc.

The workflow is: find the AI structure in the XSD schema -> make the schema validation pass -> upload to IUCLID and get obscure error messages -> guess what could be wrong -> repeat 💣

New specifications get released once per year, so we will have to update accordingly.
I believe that this should be a more expensive feature, as it requires significant effort to uphold.

Currently implemented for root compound only in SOIL:

- Soil Texture 2
- Soil Texture 1
- pH value
- Half-life per soil sample / scenario (mapped to disappearance; not sure about that).
- CEC
- Organic Matter (only Carbon)
- Moisture content
- Humidity

<img width="2123" alt="image.png" src="attachments/d29830e1-65ef-4136-8939-1825e0959c62">
<img width="2124" alt="image.png" src="attachments/ac9de2ac-bf68-4ba4-b40b-82f810a9de93">
<img width="2139" alt="image.png" src="attachments/5674c7e6-865e-420e-974a-6b825b331e6c">

Reviewed-on: enviPath/enviPy#338
Co-authored-by: Tobias O <tobias.olenyi@envipath.com>
Co-committed-by: Tobias O <tobias.olenyi@envipath.com>
This commit is contained in:
2026-04-07 19:46:12 +12:00
committed by jebus
parent f7c45b8015
commit d06bd0d4fd
49 changed files with 66402 additions and 1014 deletions

View File

105
epiuclid/builders/base.py Normal file
View File

@ -0,0 +1,105 @@
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
# IUCLID 6 XML namespaces
# Container namespace: used for the Document root and its PlatformMetadata, Content,
# Attachments and ModificationHistory children.
NS_PLATFORM_CONTAINER = "http://iuclid6.echa.europa.eu/namespaces/platform-container/v2"
# Namespace of the individual fields written inside PlatformMetadata.
NS_PLATFORM_METADATA = "http://iuclid6.echa.europa.eu/namespaces/platform-metadata/v1"
# Not used for any element in this module; exported for the document builders.
NS_PLATFORM_FIELDS = "http://iuclid6.echa.europa.eu/namespaces/platform-fields/v1"
NS_XLINK = "http://www.w3.org/1999/xlink"
# Register namespace prefixes for clean output
# (ElementTree would otherwise serialize these URIs with generated ns0/ns1/... prefixes).
ET.register_namespace("i6c", NS_PLATFORM_CONTAINER)
ET.register_namespace("i6m", NS_PLATFORM_METADATA)
ET.register_namespace("i6", NS_PLATFORM_FIELDS)
ET.register_namespace("xlink", NS_XLINK)
# Written verbatim into <iuclidVersion> by build_platform_metadata.
IUCLID_VERSION = "6.0.0"
# Written verbatim into <definitionVersion> by build_platform_metadata.
# NOTE(review): presumably has to match the "/10.0" suffix of the document namespaces — confirm.
DEFINITION_VERSION = "10.0"
# Written verbatim into <creationTool> by build_platform_metadata.
CREATION_TOOL = "enviPath"
def _tag(ns: str, local: str) -> str:
    """Return the qualified tag ``{ns}local`` in the brace notation ElementTree expects."""
    return "{{{}}}{}".format(ns, local)
def _sub(parent: ET.Element, ns: str, local: str, text: str | None = None) -> ET.Element:
    """Append a child named ``{ns}local`` to *parent* and return the new element.

    The child's text becomes ``str(text)``; when *text* is None the child is created
    without any text.
    """
    child = ET.SubElement(parent, _tag(ns, local))
    if text is None:
        return child
    child.text = str(text)
    return child
def _sub_if(parent: ET.Element, ns: str, local: str, text: str | None = None) -> ET.Element | None:
    """Append a ``{ns}local`` child carrying *text*, or do nothing when *text* is None.

    Returns the new element, or None when no element was created. Only None suppresses
    the element; an empty string still produces an (empty) child.
    """
    return None if text is None else _sub(parent, ns, local, text)
def build_platform_metadata(
    document_key: str,
    document_type: str,
    name: str,
    document_sub_type: str | None = None,
    parent_document_key: str | None = None,
    order_in_section_no: int | None = None,
) -> ET.Element:
    """Build the <i6c:PlatformMetadata> element for an i6d document.

    Creation and last-modification dates are both set to the current UTC time.
    ``documentSubType`` and ``parentDocumentKey`` are written only when truthy,
    ``orderInSectionNo`` only when it is not None. Children are appended in a fixed
    sequence, which is kept exactly as is.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    metadata = ET.Element(_tag(NS_PLATFORM_CONTAINER, "PlatformMetadata"))

    def add(local: str, value: str) -> None:
        _sub(metadata, NS_PLATFORM_METADATA, local, value)

    # Fields that are always present, in output order.
    mandatory_fields = (
        ("iuclidVersion", IUCLID_VERSION),
        ("documentKey", document_key),
        ("documentType", document_type),
        ("definitionVersion", DEFINITION_VERSION),
        ("creationDate", timestamp),
        ("lastModificationDate", timestamp),
        ("name", name),
    )
    for local, value in mandatory_fields:
        add(local, value)

    # Optional fields sit between the name and the trailing origin/tool fields.
    if document_sub_type:
        add("documentSubType", document_sub_type)
    if parent_document_key:
        add("parentDocumentKey", parent_document_key)
    if order_in_section_no is not None:
        add("orderInSectionNo", str(order_in_section_no))

    add("i5Origin", "false")
    add("creationTool", CREATION_TOOL)
    return metadata
def build_document(
    document_key: str,
    document_type: str,
    name: str,
    content_element: ET.Element,
    document_sub_type: str | None = None,
    parent_document_key: str | None = None,
    order_in_section_no: int | None = None,
) -> str:
    """Build a complete i6d document XML string.

    The document consists of the platform metadata, a ``Content`` wrapper holding
    *content_element*, and empty ``Attachments`` and ``ModificationHistory`` elements.

    Args:
        document_key: Key written to the metadata (see ``document_key()``).
        document_type: Value of the ``documentType`` metadata field.
        name: Value of the ``name`` metadata field.
        content_element: Root element of the document-specific content.
        document_sub_type: Optional ``documentSubType`` metadata value.
        parent_document_key: Optional ``parentDocumentKey`` metadata value.
        order_in_section_no: Optional ``orderInSectionNo`` metadata value.

    Returns:
        The serialized document as ``str``, including the XML declaration.
    """
    root = ET.Element(_tag(NS_PLATFORM_CONTAINER, "Document"))
    root.append(
        build_platform_metadata(
            document_key=document_key,
            document_type=document_type,
            name=name,
            document_sub_type=document_sub_type,
            parent_document_key=parent_document_key,
            order_in_section_no=order_in_section_no,
        )
    )
    content_wrapper = _sub(root, NS_PLATFORM_CONTAINER, "Content")
    content_wrapper.append(content_element)
    _sub(root, NS_PLATFORM_CONTAINER, "Attachments")
    _sub(root, NS_PLATFORM_CONTAINER, "ModificationHistory")

    # The builder modules each register the empty prefix for their own content namespace
    # at import time. ElementTree keeps a single URI per prefix, so only the module
    # imported last keeps that mapping and every other document type would be written
    # with generated "ns0:" prefixes. Selecting the default namespace per document makes
    # the output independent of import order.
    content_ns = None
    content_tag = content_element.tag
    if isinstance(content_tag, str) and content_tag.startswith("{"):
        content_ns = content_tag[1:].partition("}")[0] or None

    if content_ns is not None:
        try:
            return ET.tostring(
                root,
                encoding="unicode",
                xml_declaration=True,
                default_namespace=content_ns,
            )
        except ValueError:
            # ElementTree refuses a default namespace when the tree contains unqualified
            # names; fall back to the globally registered prefixes in that case.
            pass
    return ET.tostring(root, encoding="unicode", xml_declaration=True)
def document_key(uuid) -> str:
    """Return the IUCLID document key for *uuid*: its string form followed by ``/0`` (raw data)."""
    return "{}/0".format(uuid)

View File

@ -0,0 +1,259 @@
import xml.etree.ElementTree as ET
from uuid import uuid4
from epiuclid.serializers.pathway_mapper import IUCLIDEndpointStudyRecordData, SoilPropertiesData
from .base import (
NS_PLATFORM_FIELDS,
_sub,
_tag,
build_document,
document_key,
)
# Namespace of the ENDPOINT_STUDY_RECORD.BiodegradationInSoil content element and its children.
NS_ESR_BIODEG = (
"http://iuclid6.echa.europa.eu/namespaces/ENDPOINT_STUDY_RECORD-BiodegradationInSoil/10.0"
)
# Serialize the content namespace as the default (prefix-less) namespace.
# NOTE(review): ElementTree keeps one URI per prefix, so any later
# register_namespace("", ...) call elsewhere replaces this mapping.
ET.register_namespace("", NS_ESR_BIODEG)
# Document sub type; also the suffix of the content root element name.
DOC_SUBTYPE = "BiodegradationInSoil"
# Code emitted when a raw picklist value has no entry in the code map; the raw text is
# then written to the sibling <other> element.
PICKLIST_OTHER_CODE = "1342"
# Soil type codes keyed by the normalized raw value (upper case, "-" and " " replaced by "_").
# NOTE(review): presumably IUCLID picklist phrase codes — verify against the format definition.
SOIL_TYPE_CODE_BY_KEY = {
"CLAY": "257",
"CLAY_LOAM": "258",
"LOAM": "1026",
"LOAMY_SAND": "1027",
"SAND": "1522",
"SANDY_CLAY_LOAM": "1523",
"SANDY_LOAM": "1524",
"SANDY_CLAY": "1525",
"SILT": "1549",
"SILT_LOAM": "1550",
"SILTY_CLAY": "1551",
"SILTY_CLAY_LOAM": "1552",
}
# Soil classification system codes, keyed the same way as the soil types above.
SOIL_CLASSIFICATION_CODE_BY_KEY = {
"USDA": "1649",
"DE": "314",
"INTERNATIONAL": "1658",
}
class EndpointStudyRecordBuilder:
    """Serializes an ENDPOINT_STUDY_RECORD.BiodegradationInSoil document to i6d XML."""

    def build(self, data: IUCLIDEndpointStudyRecordData) -> str:
        """Return the complete i6d document string for *data*.

        ``MaterialsAndMethods`` is emitted only when soil properties or model/software
        details exist; ``ResultsAndDiscussion`` only when half-lives, a temperature or
        transformation products exist. The record is attached to the substance document
        via its parent document key.
        """
        esr = ET.Element(f"{{{NS_ESR_BIODEG}}}ENDPOINT_STUDY_RECORD.{DOC_SUBTYPE}")

        # Prefer the list of soil entries and fall back to the single soil_properties value.
        soils = list(data.soil_properties_entries)
        if not soils and data.soil_properties is not None:
            soils = [data.soil_properties]

        has_model_info = bool(
            data.model_name_and_version or data.software_name_and_version or data.model_remarks
        )
        if soils or has_model_info:
            materials = _sub(esr, NS_ESR_BIODEG, "MaterialsAndMethods")
            if soils:
                self._build_soil_structured_full(materials, soils)
            if has_model_info:
                model_info = _sub(materials, NS_ESR_BIODEG, "ModelAndSoftware")
                model_fields = (
                    ("ModelNameAndVersion", data.model_name_and_version),
                    ("SoftwareNameAndVersion", data.software_name_and_version),
                    ("Remarks", data.model_remarks),
                )
                for local, values in model_fields:
                    for item in values:
                        _sub(model_info, NS_ESR_BIODEG, local, item)

        has_dt_data = bool(data.half_lives) or data.temperature is not None
        if has_dt_data or data.transformation_products:
            results = _sub(esr, NS_ESR_BIODEG, "ResultsAndDiscussion")

            if has_dt_data:
                dt_parent = _sub(results, NS_ESR_BIODEG, "DTParentCompound")
                if not data.half_lives:
                    # Temperature without half-lives: single entry with only Temp
                    assert data.temperature is not None
                    lone_entry = self._new_entry(dt_parent, uuid4())
                    self._append_temperature(lone_entry, data.temperature)
                else:
                    for hl in data.half_lives:
                        entry = self._new_entry(dt_parent, uuid4())
                        self._append_soil_no(entry, hl.soil_no_code)

                        dt50 = _sub(entry, NS_ESR_BIODEG, "Value")
                        _sub(dt50, NS_ESR_BIODEG, "unitCode", "2329")  # days
                        _sub(dt50, NS_ESR_BIODEG, "lowerValue", str(hl.dt50_start))
                        _sub(dt50, NS_ESR_BIODEG, "upperValue", str(hl.dt50_end))

                        # A per-half-life temperature wins over the record-level one.
                        temperature = hl.temperature
                        if temperature is None:
                            temperature = data.temperature
                        if temperature is not None:
                            self._append_temperature(entry, temperature)

                        _sub(entry, NS_ESR_BIODEG, "KineticParameters", hl.model)

            if data.transformation_products:
                tp_details = _sub(results, NS_ESR_BIODEG, "TransformationProductsDetails")
                for tp in data.transformation_products:
                    entry = self._new_entry(tp_details, tp.uuid)
                    _sub(
                        entry,
                        NS_ESR_BIODEG,
                        "IdentityOfCompound",
                        document_key(tp.product_reference_uuid),
                    )
                    if tp.parent_reference_uuids:
                        parents = _sub(entry, NS_ESR_BIODEG, "ParentCompoundS")
                        for parent_uuid in tp.parent_reference_uuids:
                            _sub(parents, NS_PLATFORM_FIELDS, "key", document_key(parent_uuid))
                    fraction = tp.kinetic_formation_fraction
                    if fraction is not None:
                        _sub(entry, NS_ESR_BIODEG, "KineticFormationFraction", str(fraction))

        return build_document(
            document_key=document_key(data.uuid),
            document_type="ENDPOINT_STUDY_RECORD",
            document_sub_type=DOC_SUBTYPE,
            name=data.name,
            content_element=esr,
            parent_document_key=document_key(data.substance_uuid),
            order_in_section_no=1,
        )

    @staticmethod
    def _new_entry(parent: ET.Element, entry_uuid) -> ET.Element:
        """Append a repeatable-block ``entry`` to *parent*, tagged with the given uuid."""
        entry = ET.SubElement(parent, _tag(NS_ESR_BIODEG, "entry"))
        entry.set(_tag(NS_PLATFORM_FIELDS, "uuid"), str(entry_uuid))
        return entry

    @staticmethod
    def _append_soil_no(entry: ET.Element, soil_no_code) -> None:
        """Write ``SoilNo/value`` when a soil number code is present."""
        if soil_no_code:
            soil_no = _sub(entry, NS_ESR_BIODEG, "SoilNo")
            _sub(soil_no, NS_ESR_BIODEG, "value", soil_no_code)

    @staticmethod
    def _append_temperature(entry: ET.Element, temperature) -> None:
        """Write a ``Temp`` range from the first two items of *temperature*."""
        temp_range = _sub(entry, NS_ESR_BIODEG, "Temp")
        _sub(temp_range, NS_ESR_BIODEG, "unitCode", "2493")  # degree Celsius
        _sub(temp_range, NS_ESR_BIODEG, "lowerValue", str(temperature[0]))
        _sub(temp_range, NS_ESR_BIODEG, "upperValue", str(temperature[1]))

    @staticmethod
    def _append_picklist(
        parent: ET.Element,
        local: str,
        raw_value: str,
        code_map: dict[str, str],
    ) -> None:
        """Write a picklist element holding ``value`` and/or ``other`` children."""
        picklist_el = _sub(parent, NS_ESR_BIODEG, local)
        code, other_text = EndpointStudyRecordBuilder._picklist_value_and_other(
            raw_value, code_map
        )
        if code:
            _sub(picklist_el, NS_ESR_BIODEG, "value", code)
        if other_text:
            _sub(picklist_el, NS_ESR_BIODEG, "other", other_text)

    @staticmethod
    def _append_lower_value(entry: ET.Element, local: str, number) -> None:
        """Write ``local/lowerValue`` for a single-valued field; skip when *number* is None."""
        if number is not None:
            field_el = _sub(entry, NS_ESR_BIODEG, local)
            _sub(field_el, NS_ESR_BIODEG, "lowerValue", str(number))

    @staticmethod
    def _build_soil_structured_full(
        materials: ET.Element,
        props_entries: list[SoilPropertiesData],
    ) -> None:
        """Append ``StudyDesign`` with the soil classification and one entry per soil.

        The classification is taken from the first entry that yields one; every entry
        then contributes one ``SoilProperties/entry`` block.
        """
        builder = EndpointStudyRecordBuilder
        study_design = _sub(materials, NS_ESR_BIODEG, "StudyDesign")

        candidates = (builder._soil_classification(props) for props in props_entries)
        soil_classification = next((found for found in candidates if found), None)
        if soil_classification:
            builder._append_picklist(
                study_design,
                "SoilClassification",
                soil_classification,
                SOIL_CLASSIFICATION_CODE_BY_KEY,
            )

        soil_props = _sub(study_design, NS_ESR_BIODEG, "SoilProperties")
        for props in props_entries:
            entry = builder._new_entry(soil_props, uuid4())
            builder._append_soil_no(entry, props.soil_no_code)

            soil_type = props.soil_type.strip() if props.soil_type else None
            if soil_type:
                builder._append_picklist(entry, "SoilType", soil_type, SOIL_TYPE_CODE_BY_KEY)

            texture_fields = (
                ("Clay", props.clay),
                ("Silt", props.silt),
                ("Sand", props.sand),
                ("OrgC", props.org_carbon),
            )
            for local, number in texture_fields:
                builder._append_lower_value(entry, local, number)

            # pH is a range: either bound may be missing, the element needs at least one.
            if not (props.ph_lower is None and props.ph_upper is None):
                ph_el = _sub(entry, NS_ESR_BIODEG, "Ph")
                if props.ph_lower is not None:
                    _sub(ph_el, NS_ESR_BIODEG, "lowerValue", str(props.ph_lower))
                if props.ph_upper is not None:
                    _sub(ph_el, NS_ESR_BIODEG, "upperValue", str(props.ph_upper))

            ph_method = props.ph_method.strip() if props.ph_method else None
            if ph_method:
                _sub(entry, NS_ESR_BIODEG, "PHMeasuredIn", ph_method)

            builder._append_lower_value(entry, "CEC", props.cec)
            builder._append_lower_value(entry, "MoistureContent", props.moisture_content)

    @staticmethod
    def _soil_classification(props: SoilPropertiesData) -> str | None:
        """Return the stripped classification, ``"USDA"`` if only a soil type is set, else None."""
        explicit = props.soil_classification.strip() if props.soil_classification else ""
        if explicit:
            return explicit
        return "USDA" if props.soil_type else None

    @staticmethod
    def _picklist_value_and_other(
        raw_value: str,
        code_map: dict[str, str],
    ) -> tuple[str | None, str | None]:
        """Map a raw picklist text to ``(code, other_text)``.

        Blank input gives ``(None, None)``. A value found in *code_map* (after upper-casing
        and replacing ``-``/space with ``_``) gives ``(code, None)``. Anything else gives
        the "other" code plus the stripped text with underscores turned into spaces.
        """
        cleaned = raw_value.strip()
        if not cleaned:
            return None, None
        lookup_key = cleaned.upper().replace("-", "_").replace(" ", "_")
        code = code_map.get(lookup_key)
        if code is None:
            return PICKLIST_OTHER_CODE, cleaned.replace("_", " ")
        return code, None

View File

@ -0,0 +1,54 @@
import xml.etree.ElementTree as ET
from epiuclid.serializers.pathway_mapper import IUCLIDReferenceSubstanceData
from .base import (
_sub,
_sub_if,
build_document,
document_key,
)
# Namespace of the REFERENCE_SUBSTANCE content element and its children.
NS_REFERENCE_SUBSTANCE = "http://iuclid6.echa.europa.eu/namespaces/REFERENCE_SUBSTANCE/10.0"
# Serialize the content namespace as the default (prefix-less) namespace.
# NOTE(review): ElementTree keeps one URI per prefix, so any later
# register_namespace("", ...) call elsewhere replaces this mapping.
ET.register_namespace("", NS_REFERENCE_SUBSTANCE)
class ReferenceSubstanceBuilder:
    """Serializes a REFERENCE_SUBSTANCE document to i6d XML."""

    def build(self, data: IUCLIDReferenceSubstanceData) -> str:
        """Return the complete i6d document string for the reference substance *data*.

        The name is always written. The IUPAC name, CAS number and the structural block
        (formula, weight, SMILES, InChI, InChIKey) are written only when available.
        """
        ns = NS_REFERENCE_SUBSTANCE
        root = ET.Element(f"{{{NS_REFERENCE_SUBSTANCE}}}REFERENCE_SUBSTANCE")

        _sub(root, ns, "ReferenceSubstanceName", data.name)
        _sub_if(root, ns, "IupacName", data.iupac_name)

        if data.cas_number:
            inventory = _sub(root, ns, "Inventory")
            _sub(inventory, ns, "CASNumber", data.cas_number)

        has_weight = data.molecular_weight is not None
        structural_values = (
            data.molecular_formula,
            has_weight,
            data.smiles,
            data.inchi,
            data.inchi_key,
        )
        if any(structural_values):
            structural = _sub(root, ns, "MolecularStructuralInfo")
            _sub_if(structural, ns, "MolecularFormula", data.molecular_formula)

            if has_weight:
                # A single weight is expressed as a range with identical bounds.
                weight_range = _sub(structural, ns, "MolecularWeightRange")
                for bound in ("lowerValue", "upperValue"):
                    _sub(weight_range, ns, bound, f"{data.molecular_weight:.2f}")

            # NOTE(review): "InChl" ends in a lower-case L; left untouched — presumably
            # the spelling the schema expects, confirm against the XSD.
            identifiers = (
                ("SmilesNotation", data.smiles),
                ("InChl", data.inchi),
                ("InChIKey", data.inchi_key),
            )
            for local, value in identifiers:
                _sub_if(structural, ns, local, value)

        return build_document(
            document_key=document_key(data.uuid),
            document_type="REFERENCE_SUBSTANCE",
            name=data.name,
            content_element=root,
        )

View File

@ -0,0 +1,37 @@
import xml.etree.ElementTree as ET
from epiuclid.serializers.pathway_mapper import IUCLIDSubstanceData
from .base import (
_sub,
build_document,
document_key,
)
# Namespace of the SUBSTANCE content element and its children.
NS_SUBSTANCE = "http://iuclid6.echa.europa.eu/namespaces/SUBSTANCE/10.0"
# Serialize the content namespace as the default (prefix-less) namespace.
# NOTE(review): ElementTree keeps one URI per prefix, so any later
# register_namespace("", ...) call elsewhere replaces this mapping.
ET.register_namespace("", NS_SUBSTANCE)
class SubstanceBuilder:
    """Serializes a SUBSTANCE document to i6d XML."""

    def build(self, data: IUCLIDSubstanceData) -> str:
        """Return the complete i6d document string for the substance *data*.

        Writes an empty ``Templates`` element and the chemical name, plus a link to the
        reference substance document when a reference substance uuid is set.
        """
        root = ET.Element(f"{{{NS_SUBSTANCE}}}SUBSTANCE")
        _sub(root, NS_SUBSTANCE, "Templates")
        _sub(root, NS_SUBSTANCE, "ChemicalName", data.name)

        reference_uuid = data.reference_substance_uuid
        if reference_uuid:
            # The link is a ReferenceSubstance element nested in a wrapper of the same name.
            wrapper = _sub(root, NS_SUBSTANCE, "ReferenceSubstance")
            _sub(wrapper, NS_SUBSTANCE, "ReferenceSubstance", document_key(reference_uuid))

        return build_document(
            document_key=document_key(data.uuid),
            document_type="SUBSTANCE",
            name=data.name,
            content_element=root,
        )