enviPy-bayer/epiuclid/schemas/loader.py

"""Load and cache IUCLID XSD schemas with cross-reference resolution.

The bundled XSD schemas use bare ``schemaLocation`` filenames (e.g.
``platform-fields.xsd``, ``commonTypesDomainV10.xsd``) that don't match the
actual directory layout.  This module builds an explicit namespace → file-path
mapping so that ``xmlschema`` can resolve every import.
"""

from __future__ import annotations

from functools import lru_cache
from pathlib import Path

import xmlschema

# Directory containing the bundled XSD files: the ``v10`` folder that sits
# next to this module.  The path is resolved to an absolute one, which
# ``Path.as_uri()`` requires when the locations are converted to file URIs.
_SCHEMA_ROOT = Path(__file__).resolve().parent / "v10"

# Namespace URI → schema file path, relative to _SCHEMA_ROOT.
#
# The XSDs import each other by bare filename, which only resolves when the
# imported file sits in the same directory as the importing one.  Listing the
# real location of every shared schema here lets the loader hand ``xmlschema``
# an explicit namespace-to-file mapping instead.
_NS_LOCATIONS: dict[str, str] = {
    "http://iuclid6.echa.europa.eu/namespaces/platform-fields/v1": "platform-fields.xsd",
    "http://iuclid6.echa.europa.eu/namespaces/platform-metadata/v1": "platform-metadata.xsd",
    "http://iuclid6.echa.europa.eu/namespaces/platform-container/v2": "platform-container-v2.xsd",
    "http://iuclid6.echa.europa.eu/namespaces/platform-attachment/v1": "platform-attachment.xsd",
    "http://iuclid6.echa.europa.eu/namespaces/platform-modification-history/v1": (
        "platform-modification-history.xsd"
    ),
    "http://www.w3.org/1999/xlink": "xlink.xsd",
    "http://www.w3.org/XML/1998/namespace": "xml.xsd",
    "http://iuclid6.echa.europa.eu/namespaces/domain/v10": ("domain/v10/commonTypesDomainV10.xsd"),
    "http://iuclid6.echa.europa.eu/namespaces/oecd/v10": ("oecd/v10/commonTypesOecdV10.xsd"),
}

# doc_type → (subdirectory under _SCHEMA_ROOT, schema filename).
#
# ENDPOINT_STUDY_RECORD is deliberately absent: its filename depends on the
# requested subtype and is built by _esr_path() instead.
_DOC_TYPE_PATHS: dict[str, tuple[str, str]] = {
    "SUBSTANCE": ("domain/v10", "SUBSTANCE-10.0.xsd"),
    "REFERENCE_SUBSTANCE": ("domain/v10", "REFERENCE_SUBSTANCE-10.0.xsd"),
}


def _absolute_locations() -> list[tuple[str, str]]:
    """Build the namespace location hints handed to ``xmlschema``.

    Each entry of ``_NS_LOCATIONS`` becomes a ``(namespace, file-URI)`` pair,
    with the relative path anchored at ``_SCHEMA_ROOT``.  A new list is built
    on every call, in the mapping's iteration order.
    """
    pairs: list[tuple[str, str]] = []
    for namespace, relative_path in _NS_LOCATIONS.items():
        schema_file = _SCHEMA_ROOT.joinpath(relative_path)
        pairs.append((namespace, schema_file.as_uri()))
    return pairs


def _esr_path(subtype: str) -> Path:
    """Locate the Endpoint Study Record XSD for *subtype*.

    The file is expected under ``oecd/v10`` inside ``_SCHEMA_ROOT`` and is
    named ``ENDPOINT_STUDY_RECORD-<subtype>-10.0.xsd``.  The path is only
    constructed here; whether the file exists is not checked.
    """
    xsd_name = f"ENDPOINT_STUDY_RECORD-{subtype}-10.0.xsd"
    return _SCHEMA_ROOT.joinpath("oecd", "v10", xsd_name)


def _doc_type_path(doc_type: str, subtype: str | None = None) -> Path:
    """Resolve the XSD file path for a document type.

    Parameters
    ----------
    doc_type:
        ``ENDPOINT_STUDY_RECORD`` or one of the keys of ``_DOC_TYPE_PATHS``.
    subtype:
        Only consulted for ``ENDPOINT_STUDY_RECORD``, where it selects the
        concrete schema file; ignored for every other document type.

    Raises
    ------
    ValueError
        If *doc_type* is not recognised, or if it is
        ``ENDPOINT_STUDY_RECORD`` and *subtype* is missing or empty.
    """
    if doc_type != "ENDPOINT_STUDY_RECORD":
        # Fixed-name schemas come straight from the lookup table.
        entry = _DOC_TYPE_PATHS.get(doc_type)
        if entry is None:
            raise ValueError(f"Unknown document type: {doc_type}")
        folder, xsd_name = entry
        return _SCHEMA_ROOT.joinpath(folder, xsd_name)

    # Endpoint study records have one schema per subtype.
    if subtype:
        return _esr_path(subtype)
    raise ValueError("subtype is required for ENDPOINT_STUDY_RECORD schemas")


@lru_cache(maxsize=32)
def get_content_schema(doc_type: str, subtype: str | None = None) -> xmlschema.XMLSchema:
    """Compile (and memoise) the XSD used to validate a document's content.

    Results are cached per ``(doc_type, subtype)`` argument combination, so
    repeated requests for the same schema reuse the compiled object.

    Parameters
    ----------
    doc_type:
        IUCLID document type (``SUBSTANCE``, ``REFERENCE_SUBSTANCE``,
        ``ENDPOINT_STUDY_RECORD``).
    subtype:
        Required for ``ENDPOINT_STUDY_RECORD`` (e.g. ``BiodegradationInSoil``).

    Raises
    ------
    ValueError
        If *doc_type* is unknown, or *subtype* is missing for
        ``ENDPOINT_STUDY_RECORD``.
    """
    xsd_file = str(_doc_type_path(doc_type, subtype))
    # The explicit locations let imports by bare filename resolve.
    location_hints = _absolute_locations()
    return xmlschema.XMLSchema(xsd_file, locations=location_hints)


@lru_cache(maxsize=1)
def get_document_schema() -> xmlschema.XMLSchema:
    """Compile (once) the XSD for the ``platform-container-v2`` wrapper.

    The resulting schema covers the whole ``<Document>`` element, i.e.
    PlatformMetadata, Content, Attachments and ModificationHistory.  The
    Content part is declared through ``xs:any`` with
    ``processContents="strict"``, so it is only validated when the schema for
    the content namespace has been loaded.  To validate content fully, call
    :func:`get_content_schema` as a separate step.
    """
    container_xsd = _SCHEMA_ROOT.joinpath("platform-container-v2.xsd")
    location_hints = _absolute_locations()
    return xmlschema.XMLSchema(str(container_xsd), locations=location_hints)