From 053f33d042635292a80bd83a109c871149d9e877 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:03:58 +0100 Subject: [PATCH 01/16] validate collection_id, normalize osc_themes, fix variable description fallback, add visualisation link, osc_project_title, and STAC item/catalog log messages --- deep_code/utils/dataset_stac_generator.py | 29 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 40a086f..288eaeb 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -50,20 +50,29 @@ def __init__( osc_missions: list[str] | None = None, cf_params: list[dict[str]] | None = None, osc_project: str = "deep-earth-system-data-lab", + osc_project_title: str | None = None, + visualisation_link: str | None = None, ): + if " " in collection_id: + raise ValueError( + f"collection_id must not contain spaces: {collection_id!r}. " + "Use hyphens as word separators (e.g. 'My-Dataset-2024')." 
+ ) self.dataset_id = dataset_id self.collection_id = collection_id self.workflow_id = workflow_id self.workflow_title = workflow_title self.license_type = license_type self.osc_project = osc_project + self.osc_project_title = osc_project_title or self.format_string(osc_project) self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}" self.documentation_link = documentation_link self.osc_status = osc_status self.osc_region = osc_region - self.osc_themes = osc_themes or [] + self.osc_themes = [t.lower() for t in (osc_themes or [])] self.osc_missions = osc_missions or [] self.cf_params = cf_params or {} + self.visualisation_link = visualisation_link self.logger = logging.getLogger(__name__) self.dataset = open_dataset(dataset_id=dataset_id, logger=self.logger) self.variables_metadata = self.get_variables_metadata() @@ -215,7 +224,7 @@ def build_variable_catalog(self, var_metadata) -> Catalog: # Create a PySTAC Catalog object var_catalog = Catalog( id=var_id, - description=var_metadata.get("description"), + description=var_metadata.get("description") or self.format_string(var_id), title=self.format_string(var_id), stac_extensions=[ "https://stac-extensions.github.io/themes/v1.0.0/schema.json" @@ -474,6 +483,7 @@ def build_zarr_stac_item(self, stac_catalog_s3_root: str) -> Item: Returns: A :class:`pystac.Item` ready to be serialised to S3. """ + self.logger.info(f"Building STAC Item for collection '{self.collection_id}'.") spatial_extent = self._get_spatial_extent() temporal_extent = self._get_temporal_extent() general_metadata = self._get_general_metadata() @@ -542,6 +552,7 @@ def build_zarr_stac_item(self, stac_catalog_s3_root: str) -> Item: title="Consolidated Zarr Metadata", roles=["metadata"], )) + self.logger.info(f"STAC Item built: {item_href}") return item def build_zarr_stac_catalog_file_dict( @@ -563,6 +574,10 @@ def build_zarr_stac_catalog_file_dict( Returns: ``{s3_path: content_dict}`` for every file to be written to S3. 
""" + self.logger.info( + f"Building STAC Catalog file dict for collection '{self.collection_id}' " + f"at root '{stac_catalog_s3_root}'." + ) root = stac_catalog_s3_root.rstrip("/") catalog_href = f"{root}/catalog.json" @@ -581,9 +596,11 @@ def build_zarr_stac_catalog_file_dict( title=self.collection_id, )) + item_href = f"{root}/{self.collection_id}/item.json" + self.logger.info(f"STAC Catalog file dict ready: {catalog_href}, {item_href}") return { catalog_href: catalog.to_dict(transform_hrefs=False), - f"{root}/{self.collection_id}/item.json": item.to_dict(transform_hrefs=False), + item_href: item.to_dict(transform_hrefs=False), } def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | None = None) -> Collection: @@ -642,6 +659,10 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N collection.add_link( Link(rel="via", target=self.documentation_link, title="Documentation") ) + if self.visualisation_link: + collection.add_link( + Link(rel="visualisation", target=self.visualisation_link, title="Dataset visualisation") + ) collection.add_link( Link( rel="parent", @@ -689,7 +710,7 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N rel="related", target=f"../../projects/{self.osc_project}/collection.json", media_type="application/json", - title=f"Project: {self.format_string(self.osc_project)}", + title=f"Project: {self.osc_project_title}", ) ) From fafcf0941459dbdc1ef6c7b08d81e4a036ce1094 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:04:15 +0100 Subject: [PATCH 02/16] omit S3 key/secret from storage_options when env vars are unset --- deep_code/utils/helper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index 0e33585..9452b81 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -63,8 +63,10 @@ def open_dataset( "root": os.environ.get("S3_USER_STORAGE_BUCKET", 
root), "storage_options": { "anon": False, - "key": os.environ.get("S3_USER_STORAGE_KEY"), - "secret": os.environ.get("S3_USER_STORAGE_SECRET"), + **({ + "key": os.environ["S3_USER_STORAGE_KEY"], + "secret": os.environ["S3_USER_STORAGE_SECRET"], + } if os.environ.get("S3_USER_STORAGE_KEY") and os.environ.get("S3_USER_STORAGE_SECRET") else {}), }, }, }, From fd8387c9f72553d41588d69760d65c0dad08b0b4 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:04:35 +0100 Subject: [PATCH 03/16] make license_type, stac_catalog_s3_root mandatory; wire visualisation_link and osc_project_title; add PR attribution footer --- deep_code/tools/publish.py | 53 ++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index dda7c2d..7718ae9 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -267,10 +267,26 @@ def publish_dataset( osc_themes = self.dataset_config.get("osc_themes") cf_params = self.dataset_config.get("cf_parameter") license_type = self.dataset_config.get("license_type") + visualisation_link = self.dataset_config.get("visualisation_link") + osc_project_title = self.dataset_config.get("osc_project_title") if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") + if not license_type: + raise ValueError( + "license_type is required in the dataset config. " + "Provide an SPDX identifier (e.g. 'CC-BY-4.0', 'MIT', 'proprietary')." + ) + + stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") + if not stac_catalog_s3_root: + raise ValueError( + "stac_catalog_s3_root is required in the dataset config. " + "Provide the S3 root where the STAC catalog should be published " + "(e.g. 's3://my-bucket/stac/my-collection/')." 
+ ) + logger.info("Generating STAC collection...") generator = OscDatasetStacGenerator( @@ -285,6 +301,8 @@ def publish_dataset( osc_region=osc_region, osc_themes=osc_themes, cf_params=cf_params, + visualisation_link=visualisation_link, + osc_project_title=osc_project_title, ) # Store so publish() can reuse it for zarr STAC catalog generation self._last_generator = generator @@ -421,6 +439,13 @@ def generate_workflow_experiment_records( raise ValueError("workflow_id is missing in workflow config.") properties_list = self.workflow_config.get("properties", {}) + + if not properties_list.get("license"): + raise ValueError( + "license is required under 'properties' in the workflow config. " + "Provide an SPDX identifier (e.g. 'CC-BY-4.0', 'MIT', 'proprietary')." + ) + osc_themes = properties_list.get("themes") contacts = self.workflow_config.get("contact", []) links = self.workflow_config.get("links", []) @@ -587,20 +612,16 @@ def publish( ds_files = self.publish_dataset(write_to_file=False, mode=mode) files.update(ds_files) - # Publish STAC catalog + item to S3 when stac_catalog_s3_root is configured. - # This is independent of the GitHub PR and happens immediately. + # Publish STAC catalog + item to S3 (stac_catalog_s3_root is mandatory). 
stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") - if stac_catalog_s3_root: - logger.info( - f"Publishing STAC catalog to S3: {stac_catalog_s3_root}" - ) - zarr_stac_files = self._last_generator.build_zarr_stac_catalog_file_dict( - stac_catalog_s3_root - ) - self._write_stac_catalog_to_s3( - zarr_stac_files, self._get_stac_s3_storage_options() - ) - logger.info("STAC catalog written to S3.") + logger.info(f"Publishing STAC catalog to S3: {stac_catalog_s3_root}") + zarr_stac_files = self._last_generator.build_zarr_stac_catalog_file_dict( + stac_catalog_s3_root + ) + self._write_stac_catalog_to_s3( + zarr_stac_files, self._get_stac_s3_storage_options() + ) + logger.info("STAC catalog written to S3.") if mode in ("workflow", "all"): wf_files = self.generate_workflow_experiment_records( @@ -633,7 +654,11 @@ def publish( ) commit_message = f"Publish {mode_label}" pr_title = f"Publish {mode_label}" - pr_body = f"This PR publishes {mode_label} to the repository." + pr_body = ( + f"This PR publishes {mode_label} to the repository.\n\n" + "---\n" + "_Generated with [deep-code](https://github.com/deepesdl/deep-code)_" + ) pr_url = self.gh_publisher.publish_files( branch_name=branch_name, From 061aa8b0a735270cb748657f61f2cd93940494d5 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:04:58 +0100 Subject: [PATCH 04/16] split dataset and workflow templates into required/optional sections with inline labels --- deep_code/tools/new.py | 85 +++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 8e3e67a..0932a9f 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -14,28 +14,29 @@ class TemplateGenerator: def generate_workflow_template(output_path: Optional[str] = None) -> str: """Generate a complete template with all possible keys and placeholder values""" - template = { - "workflow_id": "[A unique identifier for your workflow]", 
+ required = { + "workflow_id": "[REQUIRED: unique identifier for your workflow]", "properties": { - "title": "[Human-readable title of the workflow]", - "description": "[A concise summary of what the workflow does]", - "keywords": ["[KEYWORD1]", "[KEYWORD2]"], - "themes": [ - "[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]", - "[THEME1]", - "[THEME2]", - ], - "license": "[License type (e.g. MIT, Apache-2.0, CC-BY-4.0, proprietary)]", + "title": "[REQUIRED: human-readable title of the workflow]", + "license": "[REQUIRED: SPDX license identifier, e.g. CC-BY-4.0, MIT, proprietary]", + }, + } + + optional = { + "properties": { + "description": "[OPTIONAL: concise summary of what the workflow does]", + "keywords": ["[OPTIONAL: KEYWORD1]", "[KEYWORD2]"], + "themes": ["[OPTIONAL: thematic area, e.g. land, ocean, atmosphere]"], "jupyter_kernel_info": { - "name": "[Name of the execution environment or notebook kernel]", + "name": "[OPTIONAL: name of the execution environment or notebook kernel]", "python_version": "[PYTHON_VERSION]", "env_file": "[Link to the environment file (YAML) used to create the notebook environment]", }, }, - "jupyter_notebook_url": "[Link to the source notebook (e.g. on GitHub)]", + "jupyter_notebook_url": "[OPTIONAL: link to the source notebook (e.g. 
on GitHub)]", "contact": [ { - "name": "[Contact person's full name]", + "name": "[OPTIONAL: contact person's full name]", "organization": "[Affiliated institution or company]", "links": [ { @@ -48,53 +49,53 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str: ], } - yaml_str = yaml.dump( - template, sort_keys=False, width=1000, default_flow_style=False - ) - if output_path: with open(output_path, "w") as f: - f.write("# Complete Workflow Configuration Template\n") + f.write("# Workflow Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") - f.write(yaml_str) + f.write("# --- REQUIRED fields ---\n") + f.write(yaml.dump(required, sort_keys=False, width=1000, default_flow_style=False)) + f.write("\n# --- OPTIONAL fields ---\n") + f.write(yaml.dump(optional, sort_keys=False, width=1000, default_flow_style=False)) @staticmethod def generate_dataset_template(output_path: Optional[str] = None) -> str: """Generate a complete dataset template with all possible keys and placeholder values""" - template = { - "dataset_id": "[The name of the dataset object within your S3 bucket].zarr", - "collection_id": "[A unique identifier for the dataset collection]", - "osc_themes": [ - "[Oceans]", - "[Open Science theme (choose from " - "https://opensciencedata.esa.int/themes/catalog)", - ], - "osc_region": "[Geographical coverage, e.g. 'global']", - "dataset_status": "[Status of the dataset: 'ongoing', 'completed', or 'planned']", - "documentation_link": "[Link to relevant documentation, publication, or handbook]", + required = { + "dataset_id": "[REQUIRED: name of the Zarr store in your S3 bucket, e.g. my-dataset.zarr]", + "collection_id": "[REQUIRED: unique identifier, no spaces — use hyphens (e.g. My-Dataset-2024)]", + "license_type": "[REQUIRED: SPDX license identifier, e.g. CC-BY-4.0, MIT, proprietary]", + "stac_catalog_s3_root": "[REQUIRED: S3 root for the STAC Catalog + Item, e.g. 
s3://my-bucket/stac/my-collection/]", } - yaml_str = yaml.dump( - template, sort_keys=False, width=1000, default_flow_style=False - ) + optional = { + "osc_themes": ["[OPTIONAL: OSC theme slug, e.g. land, ocean, atmosphere — auto-lowercased]"], + "osc_region": "[OPTIONAL: geographical coverage, e.g. Global]", + "dataset_status": "[OPTIONAL: ongoing | completed | planned (default: ongoing)]", + "documentation_link": "[OPTIONAL: link to documentation, publication, or handbook]", + "visualisation_link": "[OPTIONAL: URL to a visualisation of the dataset (e.g. xcube Viewer, WMS)]", + "osc_project_title": "[OPTIONAL: display title of the OSC project as it appears in the catalog (e.g. DeepESDL). Defaults to a formatted version of osc_project if omitted]", + "access_link": "[OPTIONAL: public S3 URL of the Zarr store — defaults to s3://deep-esdl-public/{dataset_id}]", + "cf_parameter": [{"name": "[OPTIONAL: CF standard name]", "units": "[unit string]"}], + } stac_catalog_comment = ( - "\n# Optional: publish a STAC Catalog + Item to S3 alongside the Zarr.\n" - "# When set, deep-code writes:\n" - "# {stac_catalog_s3_root}/catalog.json (STAC Catalog root)\n" + "\n# stac_catalog_s3_root: deep-code writes the following files to this S3 root:\n" + "# {stac_catalog_s3_root}/catalog.json (STAC Catalog root)\n" "# {stac_catalog_s3_root}/{collection_id}/item.json (STAC Item for the whole Zarr)\n" - "# and adds a 'child' link from the OSC Collection to this S3 catalog.\n" - "# S3 write credentials are resolved in order from:\n" + "# S3 write credentials are resolved in order:\n" "# 1. STAC_S3_KEY / STAC_S3_SECRET env vars (STAC-specific, any bucket)\n" "# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY env vars\n" "# 3. 
boto3 default chain (IAM role, ~/.aws/credentials)\n" - "# stac_catalog_s3_root: s3://[YOUR-BUCKET]/stac/[collection-id]/\n" ) if output_path: with open(output_path, "w") as f: - f.write("# Complete Dataset Configuration Template\n") + f.write("# Dataset Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") - f.write(yaml_str) + f.write("# --- REQUIRED fields ---\n") + f.write(yaml.dump(required, sort_keys=False, width=1000, default_flow_style=False)) + f.write("\n# --- OPTIONAL fields ---\n") + f.write(yaml.dump(optional, sort_keys=False, width=1000, default_flow_style=False)) f.write(stac_catalog_comment) From 86fb0ea212754321419278767f5cf0b0d48d7b96 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:05:16 +0100 Subject: [PATCH 05/16] add 0.1.9 changelog entries --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 3f69147..daab3af 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -71,3 +71,10 @@ - `jupyter_kernel_info` is now optional in `RecordProperties`; workflow configs without a notebook URL no longer require this field. - Removed redundant `hasattr` guard for `_last_generator` in `Publisher.publish()`. - Added automated MkDocs GitHub Pages deployment on release via a dedicated `docs.yml` workflow. +- `osc_themes` values are now automatically lowercased, so `'Land'` and `'LAND'` are treated the same as `'land'`, preventing theme validation failures. +- `collection_id` is now validated to contain no spaces; a clear error is raised with a hint to use hyphens instead. +- `license_type` (dataset) and `properties.license` (workflow) are now mandatory fields; publishing fails immediately with a descriptive error if either is missing. +- Variable catalog `description` now falls back to the title-cased variable ID when neither `description` nor `long_name` attrs are present on the zarr variable, preventing `null` description validation failures. 
+- Pull requests opened by deep-code now include a "Generated with deep-code" note in the PR description. +- `stac_catalog_s3_root` is now a mandatory field in the dataset config; publishing fails immediately with a descriptive error if it is absent. +- Added optional `visualisation_link` field to the dataset config; when provided, a `related` link with title `"Dataset Visualisation"` is added to the generated OSC collection. From 9b0e2bda4aed657e36aac608cfbaba87a81abdb7 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:05:41 +0100 Subject: [PATCH 06/16] 0.1.9.dev1 --- deep_code/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/version.py b/deep_code/version.py index c0819c3..ed094a1 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -version = "0.1.9.dev0" +version = "0.1.9.dev1" From afed7eb9eff1b6393b6f5a718545438f2386a9fa Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:05:58 +0100 Subject: [PATCH 07/16] update docs --- docs/cli.md | 2 ++ docs/configuration.md | 28 ++++++++++------------------ 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 14947ed..f8e3a5f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -28,3 +28,5 @@ Options: 1. Reads your configs and builds dataset STAC collections plus variable catalogs. 2. Builds workflow and experiment OGC API Records. 3. Forks/clones the target metadata repo (production, staging, or testing), commits generated JSON, and opens a pull request on your behalf. + +The pull request description includes a "Generated with deep-code" attribution note. diff --git a/docs/configuration.md b/docs/configuration.md index bf2b955..b37c28c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -17,11 +17,12 @@ The sections below document every field in those templates. 
```yaml # Required dataset_id: your-dataset.zarr -collection_id: your-collection +collection_id: your-collection # no spaces — use hyphens license_type: CC-BY-4.0 +stac_catalog_s3_root: s3://bucket/stac/your-collection/ # Optional -osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/themes/catalog +osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/themes/catalog — auto-lowercased osc_region: global dataset_status: completed # ongoing | completed | planned (default: ongoing) documentation_link: https://example.com/docs @@ -31,15 +32,6 @@ access_link: s3://bucket/your-dataset.zarr # defaults to s3://deep-esdl-public cf_parameter: - name: sea_surface_temperature units: kelvin - -# Optional: publish a STAC Catalog + Item next to the data on S3. -# When set, a lightweight STAC hierarchy (catalog.json → item.json) is written -# directly to S3 and a "via" link is added to the OSC collection pointing to it. -# S3 write credentials are resolved in order: -# 1. STAC_S3_KEY / STAC_S3_SECRET (STAC-specific, any bucket) -# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY -# 3. boto3 default chain (IAM role, ~/.aws/credentials) -stac_catalog_s3_root: s3://bucket/stac/your-collection/ ``` ### Field reference @@ -47,20 +39,20 @@ stac_catalog_s3_root: s3://bucket/stac/your-collection/ | Field | Required | Description | |---|---|---| | `dataset_id` | Yes | Zarr store identifier (used to open the dataset). | -| `collection_id` | Yes | Unique ID for the STAC collection in the OSC catalog. | -| `license_type` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`). | -| `osc_themes` | No | List of OSC theme slugs (e.g. `[cryosphere, oceans]`). | +| `collection_id` | Yes | Unique ID for the STAC collection in the OSC catalog. **Must not contain spaces** — use hyphens as word separators (e.g. `My-Dataset-2024`). | +| `license_type` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`). Publishing fails if this field is absent. 
| +| `osc_themes` | No | List of OSC theme slugs (e.g. `[cryosphere, oceans]`). Values are automatically lowercased so `Land` and `land` are equivalent. | | `osc_region` | No | Geographical region label (default: `Global`). | | `dataset_status` | No | One of `ongoing`, `completed`, or `planned` (default: `ongoing`). | | `access_link` | No | Public S3 URL of the Zarr store. Defaults to `s3://deep-esdl-public/{dataset_id}`. | | `documentation_link` | No | URL to dataset documentation. | +| `visualisation_link` | No | URL to a visualisation of the dataset (e.g. xcube Viewer, WMS). Added as a `visualisation` link with title `"Dataset visualisation"`. | | `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). | -| `stac_catalog_s3_root` | No | S3 root for the dataset-level STAC Catalog/Item. See [STAC Catalog on S3](#stac-catalog-on-s3). | +| `stac_catalog_s3_root` | Yes | S3 root where the STAC Catalog and Item are published. Publishing fails if this field is absent. See [STAC Catalog on S3](#stac-catalog-on-s3). | ### STAC Catalog on S3 -Setting `stac_catalog_s3_root` generates a two-file STAC hierarchy on S3 alongside -the data: +`stac_catalog_s3_root` is required. deep-code writes a two-file STAC hierarchy to S3 alongside the data: ``` s3://bucket/stac/your-collection/ @@ -124,7 +116,7 @@ links: | `properties.description` | No | Short summary of what the workflow does. | | `properties.keywords` | No | List of keyword strings. | | `properties.themes` | No | List of OSC theme slugs. | -| `properties.license` | No | License identifier (e.g. `proprietary`, `CC-BY-4.0`). | +| `properties.license` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`, `proprietary`). Publishing fails if this field is absent. | | `jupyter_notebook_url` | No | Link to the source notebook on GitHub. When omitted, kernel and application links are skipped. 
| | `properties.jupyter_kernel_info` | No | Kernel name, Python version, and environment file URL. Only published when `jupyter_notebook_url` is set. | | `contact` | No | List of contact objects with `name`, `organization`, and `links`. | From b9ad319f91123d8e22708e2a29b8180ce84bae4e Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:34:16 +0100 Subject: [PATCH 08/16] normalize workflow-id --- deep_code/tools/publish.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 7718ae9..2daf1cf 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -172,7 +172,9 @@ def __init__( self.workflow_title = self.workflow_config.get("properties", {}).get( "title" ) - self.workflow_id = self.workflow_config.get("workflow_id") + self.workflow_id = self._normalize_name( + self.workflow_config.get("workflow_id") + ) def _read_config_files(self) -> None: if self.dataset_config_path: @@ -269,6 +271,7 @@ def publish_dataset( license_type = self.dataset_config.get("license_type") visualisation_link = self.dataset_config.get("visualisation_link") osc_project_title = self.dataset_config.get("osc_project_title") + description = self.dataset_config.get("description") if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") @@ -303,6 +306,7 @@ def publish_dataset( cf_params=cf_params, visualisation_link=visualisation_link, osc_project_title=osc_project_title, + description=description, ) # Store so publish() can reuse it for zarr STAC catalog generation self._last_generator = generator From b156b93931ab7b047c1cbe341ac6acda5e61100a Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:34:49 +0100 Subject: [PATCH 09/16] added description as optional field in dataset config --- deep_code/tools/new.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 0932a9f..a9f591f
100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -73,6 +73,7 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: "osc_themes": ["[OPTIONAL: OSC theme slug, e.g. land, ocean, atmosphere — auto-lowercased]"], "osc_region": "[OPTIONAL: geographical coverage, e.g. Global]", "dataset_status": "[OPTIONAL: ongoing | completed | planned (default: ongoing)]", + "description": "[OPTIONAL: human-readable description of the dataset. Overrides the description attribute in the Zarr store if set]", "documentation_link": "[OPTIONAL: link to documentation, publication, or handbook]", "visualisation_link": "[OPTIONAL: URL to a visualisation of the dataset (e.g. xcube Viewer, WMS)]", "osc_project_title": "[OPTIONAL: display title of the OSC project as it appears in the catalog (e.g. DeepESDL). Defaults to a formatted version of osc_project if omitted]", From 46171f055e61b63635125307d193f86658f56547 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:41:53 +0100 Subject: [PATCH 10/16] fix STAC catalog links to use HTTPS and OSC browser convention --- deep_code/utils/dataset_stac_generator.py | 44 ++++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 288eaeb..89b6e67 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -52,6 +52,7 @@ def __init__( osc_project: str = "deep-earth-system-data-lab", osc_project_title: str | None = None, visualisation_link: str | None = None, + description: str | None = None, ): if " " in collection_id: raise ValueError( @@ -73,6 +74,7 @@ def __init__( self.osc_missions = osc_missions or [] self.cf_params = cf_params or {} self.visualisation_link = visualisation_link + self.description = description self.logger = logging.getLogger(__name__) self.dataset = open_dataset(dataset_id=dataset_id, logger=self.logger) 
self.variables_metadata = self.get_variables_metadata() @@ -137,8 +139,10 @@ def _normalize_name(name: str | None) -> str | None: def _get_general_metadata(self) -> dict: return { - "description": self.dataset.attrs.get( - "description", "No description available." + "description": ( + self.description + or self.dataset.attrs.get("description") + or "No description available." ) } @@ -456,6 +460,17 @@ def update_existing_variable_catalog(self, var_file_path, var_id) -> dict: ) return data + @staticmethod + def _s3_to_https(s3_url: str) -> str: + """Convert an s3:// URL to its HTTPS equivalent using AWS virtual-hosted style. + + Example: + s3://my-bucket/path/to/file → https://my-bucket.s3.amazonaws.com/path/to/file + """ + without_scheme = s3_url[len("s3://"):] + bucket, _, key = without_scheme.partition("/") + return f"https://{bucket}.s3.amazonaws.com/{key}" + @staticmethod def format_string(s: str) -> str: # Strip leading/trailing spaces/underscores and replace underscores with spaces @@ -726,17 +741,28 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N collection.license = self.license_type - # Link to the S3-hosted STAC catalog when provided. - # Uses rel="via" (not "child") because the OSC validator requires every - # "child" link to resolve to a file inside the metadata repository; - # the S3 catalog lives outside the repo and would fail that check. + # Add links to the S3-hosted STAC catalog following the OSC convention: + # via → STAC browser URL (human-browsable, HTTPS) + # child → direct HTTPS URL to catalog.json (machine-readable) + # The s3:// URL is never used directly in the collection as it fails the + # products/children.json uri-reference format check. 
if stac_catalog_s3_root: - catalog_href = stac_catalog_s3_root.rstrip("/") + "/catalog.json" + catalog_s3 = stac_catalog_s3_root.rstrip("/") + "/catalog.json" + catalog_https = self._s3_to_https(catalog_s3) + stac_browser_href = ( + "https://opensciencedata.esa.int/stac-browser/#/external/" + + catalog_https[len("https://"):] + ) collection.add_link(Link( rel="via", - target=catalog_href, + target=stac_browser_href, + title="Access", + )) + collection.add_link(Link( + rel="child", + target=catalog_https, media_type="application/json", - title="STAC Catalog", + title="Items", )) # Validate OSC extension fields From 29a44cb0c0f1e47ba73112db06d234ce8b77ef03 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:42:30 +0100 Subject: [PATCH 11/16] updated docs with description filed --- docs/configuration.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/configuration.md b/docs/configuration.md index b37c28c..83b8abf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -45,6 +45,7 @@ cf_parameter: | `osc_region` | No | Geographical region label (default: `Global`). | | `dataset_status` | No | One of `ongoing`, `completed`, or `planned` (default: `ongoing`). | | `access_link` | No | Public S3 URL of the Zarr store. Defaults to `s3://deep-esdl-public/{dataset_id}`. | +| `description` | No | Human-readable description of the dataset. Overrides the `description` attribute in the Zarr store; falls back to `"No description available."` if neither is set. | | `documentation_link` | No | URL to dataset documentation. | | `visualisation_link` | No | URL to a visualisation of the dataset (e.g. xcube Viewer, WMS). Added as a `visualisation` link with title `"Dataset visualisation"`. | | `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). 
| From 942255a0f802ed3f9679533be50d11121995be89 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:42:51 +0100 Subject: [PATCH 12/16] updated change log --- CHANGES.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index daab3af..396216e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -77,4 +77,8 @@ - Variable catalog `description` now falls back to the title-cased variable ID when neither `description` nor `long_name` attrs are present on the zarr variable, preventing `null` description validation failures. - Pull requests opened by deep-code now include a "Generated with deep-code" note in the PR description. - `stac_catalog_s3_root` is now a mandatory field in the dataset config; publishing fails immediately with a descriptive error if it is absent. -- Added optional `visualisation_link` field to the dataset config; when provided, a `related` link with title `"Dataset Visualisation"` is added to the generated OSC collection. +- STAC catalog links in the OSC collection now follow the OSC convention: a `via` link to the STAC browser URL and a `child` link to the direct HTTPS catalog URL. The `s3://` URL is converted to HTTPS (AWS virtual-hosted style) to satisfy the `uri-reference` format check in the OSC products schema. +- Added optional `visualisation_link` field to the dataset config; when provided, a `visualisation` link with title `"Dataset visualisation"` is added to the generated OSC collection. +- Added optional `description` field to the dataset config; overrides the `description` attribute from the Zarr store when set. +- Added optional `osc_project_title` field to the dataset config to correctly set the project link title (e.g. `"DeepESDL"`) instead of deriving it from the project ID. +- Fixed `workflow_id` not being normalised (slugified) when stored on `Publisher`, causing spaces in experiment link hrefs and failing `uri-reference` format validation. 
From 0f8fea7727e4808e1b836f30a433552356d07f88 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 17:02:38 +0100 Subject: [PATCH 13/16] fix tests --- deep_code/tests/tools/test_publish.py | 47 ++++++++++++++----- .../utils/test_dataset_stac_generator.py | 34 +++++++++----- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index de841f4..4c388ac 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -126,11 +126,22 @@ def test_environment_repo_selection(self, mock_gp): == "open-science-catalog-metadata-testing" ) + @patch.object(Publisher, "_write_stac_catalog_to_s3") @patch.object(Publisher, "publish_dataset", return_value={"a": {}}) @patch.object( Publisher, "generate_workflow_experiment_records", return_value={"b": {}} ) - def test_publish_mode_routing(self, mock_wf, mock_ds): + def test_publish_mode_routing(self, mock_wf, mock_ds, mock_s3): + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = {} + self.publisher._last_generator = mock_generator + self.publisher.dataset_config = { + "stac_catalog_s3_root": "s3://bucket/stac/", + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } + self.publisher.gh_publisher.publish_files.return_value = "PR_URL" + # dataset only self.publisher.publish(write_to_file=True, mode="dataset") mock_ds.assert_called() @@ -142,21 +153,28 @@ def test_publish_mode_routing(self, mock_wf, mock_ds): mock_ds.assert_not_called() mock_wf.assert_called() + @patch.object(Publisher, "_write_stac_catalog_to_s3") @patch.object(Publisher, "generate_workflow_experiment_records", return_value={}) @patch.object(Publisher, "publish_dataset", return_value={}) def test_publish_nothing_to_publish_raises( - self, mock_publish_dataset, mock_generate_workflow_experiment_records + self, mock_publish_dataset, mock_generate_workflow_experiment_records, mock_s3 
): + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = {} + self.publisher._last_generator = mock_generator + self.publisher.dataset_config = {"stac_catalog_s3_root": "s3://bucket/stac/"} + with pytest.raises(ValueError): self.publisher.publish(write_to_file=False, mode="dataset") mock_publish_dataset.assert_called_once() mock_generate_workflow_experiment_records.assert_not_called() + @patch.object(Publisher, "_write_stac_catalog_to_s3") @patch.object(Publisher, "publish_dataset", return_value={"x": {}}) @patch.object( Publisher, "generate_workflow_experiment_records", return_value={"y": {}} ) - def test_publish_builds_pr_params(self, mock_wf, mock_ds): + def test_publish_builds_pr_params(self, mock_wf, mock_ds, mock_s3): # Make PR creation return a fixed URL self.publisher.gh_publisher.publish_files.return_value = "PR_URL" @@ -164,6 +182,12 @@ def test_publish_builds_pr_params(self, mock_wf, mock_ds): self.publisher.collection_id = "col" self.publisher.workflow_id = "wf" + # _last_generator is set by publish_dataset; since that's mocked, stub it + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = {} + self.publisher._last_generator = mock_generator + self.publisher.dataset_config = {"stac_catalog_s3_root": "s3://bucket/stac/"} + url = self.publisher.publish(write_to_file=False, mode="all") assert url == "PR_URL" @@ -309,6 +333,7 @@ def test_publish_dataset_creates_project_collection_when_missing( "dataset_id": "test-dataset", "collection_id": "test-collection", "license_type": "CC-BY-4.0", + "stac_catalog_s3_root": "s3://bucket/stac/test-collection/", } self.publisher.collection_id = "test-collection" @@ -343,6 +368,7 @@ def test_publish_dataset_updates_project_collection_when_exists( "dataset_id": "test-dataset", "collection_id": "test-collection", "license_type": "CC-BY-4.0", + "stac_catalog_s3_root": "s3://bucket/stac/test-collection/", } self.publisher.collection_id = 
"test-collection" @@ -359,20 +385,15 @@ def test_publish_dataset_updates_project_collection_when_exists( update_methods = [call.args[2] for call in mock_update.call_args_list] self.assertIn(mock_gen.update_deepesdl_collection, update_methods) - @patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}}) - def test_publish_skips_zarr_stac_when_not_configured(self, mock_publish_ds): - # No stac_catalog_s3_root in config + def test_publish_dataset_raises_when_stac_root_missing(self): + # stac_catalog_s3_root is mandatory; publish_dataset must raise ValueError self.publisher.dataset_config = { "collection_id": "test-collection", "dataset_id": "test-dataset", + "license_type": "CC-BY-4.0", } - self.publisher.gh_publisher.publish_files.return_value = "PR_URL" - - with patch.object( - self.publisher, "_write_stac_catalog_to_s3" - ) as mock_write: - self.publisher.publish(mode="dataset") - mock_write.assert_not_called() + with pytest.raises(ValueError, match="stac_catalog_s3_root"): + self.publisher.publish_dataset(write_to_file=False) class TestParseGithubNotebookUrl: diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 039bac2..75e5817 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -509,28 +509,36 @@ def test_build_zarr_stac_catalog_file_dict_content(self): self.assertIn("zarr-consolidated-metadata", item_dict["assets"]) def test_build_dataset_stac_collection_adds_s3_catalog_via_link(self): - """A 'via' link to the S3 catalog is added when stac_catalog_s3_root is provided. + """A 'via' link (STAC browser) and a 'child' link (HTTPS catalog) are added + when stac_catalog_s3_root is provided. - rel='via' is used (not 'child') because the OSC validator requires every - 'child' link to resolve to a file inside the metadata repository. 
+ The OSC convention uses: + - rel='via' → STAC browser URL + - rel='child' → direct HTTPS catalog URL (s3:// converted to HTTPS) """ s3_root = "s3://test-bucket/stac/my-collection/" collection = self.generator.build_dataset_stac_collection( mode="dataset", stac_catalog_s3_root=s3_root ) - s3_via = next( - ( - lnk - for lnk in collection.links - if lnk.rel == "via" and "catalog.json" in str(lnk.target) - ), + https_catalog = "https://test-bucket.s3.amazonaws.com/stac/my-collection/catalog.json" + stac_browser_href = ( + "https://opensciencedata.esa.int/stac-browser/#/external/" + + https_catalog.replace("https://", "") + ) + + via_link = next( + (lnk for lnk in collection.links if lnk.rel == "via" and "stac-browser" in str(lnk.target)), None, ) - self.assertIsNotNone(s3_via, "Expected a 'via' link pointing to S3 catalog") - self.assertEqual( - s3_via.target, - "s3://test-bucket/stac/my-collection/catalog.json", + self.assertIsNotNone(via_link, "Expected a 'via' STAC browser link") + self.assertEqual(via_link.target, stac_browser_href) + + child_link = next( + (lnk for lnk in collection.links if lnk.rel == "child" and "catalog.json" in str(lnk.target)), + None, ) + self.assertIsNotNone(child_link, "Expected a 'child' HTTPS catalog link") + self.assertEqual(child_link.target, https_catalog) def test_build_dataset_stac_collection_no_s3_via_link_by_default(self): """No S3 catalog 'via' link is added when stac_catalog_s3_root is absent.""" From 2d28ea945b03cb1ba72b7830944925a75a2b5bf6 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 17:22:01 +0100 Subject: [PATCH 14/16] increase test coverage --- deep_code/tests/tools/test_publish.py | 161 +++++++++++++++++- .../utils/test_dataset_stac_generator.py | 156 +++++++++++++++++ deep_code/tests/utils/test_helper.py | 22 ++- 3 files changed, 337 insertions(+), 2 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 4c388ac..6ccc8eb 100644 --- 
a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -386,7 +386,6 @@ def test_publish_dataset_updates_project_collection_when_exists( self.assertIn(mock_gen.update_deepesdl_collection, update_methods) def test_publish_dataset_raises_when_stac_root_missing(self): - # stac_catalog_s3_root is mandatory; publish_dataset must raise ValueError self.publisher.dataset_config = { "collection_id": "test-collection", "dataset_id": "test-dataset", @@ -395,6 +394,166 @@ def test_publish_dataset_raises_when_stac_root_missing(self): with pytest.raises(ValueError, match="stac_catalog_s3_root"): self.publisher.publish_dataset(write_to_file=False) + def test_publish_dataset_raises_when_no_dataset_config(self): + self.publisher.dataset_config = None + with pytest.raises(ValueError, match="No dataset config"): + self.publisher.publish_dataset(write_to_file=False) + + def test_publish_dataset_raises_when_ids_missing(self): + self.publisher.dataset_config = {"collection_id": "", "dataset_id": ""} + with pytest.raises(ValueError, match="Dataset ID or Collection ID missing"): + self.publisher.publish_dataset(write_to_file=False) + + def test_publish_dataset_raises_when_license_missing(self): + self.publisher.dataset_config = { + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } + with pytest.raises(ValueError, match="license_type is required"): + self.publisher.publish_dataset(write_to_file=False) + + def test_write_to_file_serializes_dict(self): + import json + import os + import tempfile + + with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as f: + path = f.name + try: + Publisher._write_to_file(path, {"a": 1}) + with open(path) as f: + result = json.load(f) + self.assertEqual(result, {"a": 1}) + finally: + os.unlink(path) + + def test_update_and_add_to_file_dict(self): + file_dict = {} + self.publisher.gh_publisher.github_automation.local_clone_dir = "/tmp" + update_method = MagicMock(return_value={"key": "value"}) + 
self.publisher._update_and_add_to_file_dict(file_dict, "some/catalog.json", update_method) + update_method.assert_called_once() + assert any("some/catalog.json" in str(k) for k in file_dict) + + def test_update_variable_catalogs_creates_new_when_missing(self): + mock_gen = MagicMock() + mock_gen.variables_metadata = {"var1": {"variable_id": "var1"}} + mock_gen.build_variable_catalog.return_value.to_dict.return_value = {"id": "var1"} + self.publisher.gh_publisher.github_automation.file_exists.return_value = False + + file_dict = {} + self.publisher._update_variable_catalogs(mock_gen, file_dict, ["var1"]) + + mock_gen.build_variable_catalog.assert_called_once() + assert "variables/var1/catalog.json" in file_dict + + def test_update_variable_catalogs_updates_existing(self): + mock_gen = MagicMock() + self.publisher.gh_publisher.github_automation.file_exists.return_value = True + self.publisher.gh_publisher.github_automation.local_clone_dir = "/tmp" + mock_gen.update_existing_variable_catalog.return_value = {"id": "var1"} + + file_dict = {} + self.publisher._update_variable_catalogs(mock_gen, file_dict, ["var1"]) + + mock_gen.update_existing_variable_catalog.assert_called_once() + assert "variables/var1/catalog.json" in file_dict + + # ------------------------------------------------------------------ + # generate_workflow_experiment_records + # ------------------------------------------------------------------ + + def _setup_workflow_mocks(self): + """Patch all internals of generate_workflow_experiment_records.""" + mock_rg = MagicMock() + mock_props = MagicMock() + mock_props.jupyter_kernel_info.to_dict.return_value = {} + mock_rg.build_record_properties.return_value = mock_props + + mock_wf_record = MagicMock() + mock_wf_record.to_dict.return_value = {"id": "wf", "properties": {}} + + mock_exp_record = MagicMock() + mock_exp_record.to_dict.return_value = { + "id": "wf", + "properties": {}, + "jupyter_notebook_url": "url", + "collection_id": "col", + } + return 
mock_rg, mock_props, mock_wf_record, mock_exp_record + + @patch("deep_code.tools.publish.WorkflowAsOgcRecord") + @patch("deep_code.tools.publish.LinksBuilder") + @patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_mode_workflow(self, MockRG, MockLinks, MockWF): + mock_rg, mock_props, mock_wf_record, _ = self._setup_workflow_mocks() + MockRG.return_value = mock_rg + MockWF.return_value = mock_wf_record + + self.publisher.workflow_config = { + "workflow_id": "my-workflow", + "properties": {"title": "My WF", "license": "CC-BY-4.0"}, + } + with patch.object(self.publisher, "_update_base_catalog", return_value={}): + result = self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="workflow" + ) + + self.assertIn("workflows/my-workflow/record.json", result) + self.assertIn("workflows/catalog.json", result) + self.assertNotIn("experiments/catalog.json", result) + + @patch("deep_code.tools.publish.ExperimentAsOgcRecord") + @patch("deep_code.tools.publish.WorkflowAsOgcRecord") + @patch("deep_code.tools.publish.LinksBuilder") + @patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_mode_all(self, MockRG, MockLinks, MockWF, MockExp): + mock_rg, mock_props, mock_wf_record, mock_exp_record = self._setup_workflow_mocks() + MockRG.return_value = mock_rg + MockWF.return_value = mock_wf_record + MockExp.return_value = mock_exp_record + + self.publisher.workflow_config = { + "workflow_id": "my-workflow", + "properties": {"title": "My WF", "license": "CC-BY-4.0"}, + } + self.publisher.collection_id = "my-collection" + with patch.object(self.publisher, "_update_base_catalog", return_value={}): + result = self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="all" + ) + + self.assertIn("workflows/my-workflow/record.json", result) + self.assertIn("workflows/catalog.json", result) + self.assertIn("experiments/catalog.json", result) + + 
@patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_raises_when_workflow_id_missing(self, MockRG): + self.publisher.workflow_config = { + "properties": {"title": "My WF", "license": "CC-BY-4.0"}, + } + with pytest.raises(ValueError, match="workflow_id is missing"): + self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="workflow" + ) + + @patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_raises_when_license_missing(self, MockRG): + self.publisher.workflow_config = { + "workflow_id": "my-wf", + "properties": {"title": "My WF"}, + } + with pytest.raises(ValueError, match="license is required"): + self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="workflow" + ) + + def test_generate_workflow_records_returns_empty_for_dataset_mode(self): + result = self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="dataset" + ) + self.assertEqual(result, {}) + class TestParseGithubNotebookUrl: @pytest.mark.parametrize( diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 75e5817..0b806cd 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -607,3 +607,159 @@ def test_edge_cases(self): OscDatasetStacGenerator.format_string("too many spaces"), "Too Many Spaces", ) + + +class TestOscDatasetStacGeneratorExtra(unittest.TestCase): + """Additional tests to cover branches not exercised by TestOSCProductSTACGenerator.""" + + def _make_generator(self, mock_ds, collection_id="my-collection", **kwargs): + with patch("deep_code.utils.dataset_stac_generator.open_dataset", return_value=mock_ds): + return OscDatasetStacGenerator( + dataset_id="test.zarr", + collection_id=collection_id, + workflow_id="wf", + workflow_title="WF", + license_type="CC-BY-4.0", + 
**kwargs, + ) + + def _make_dataset(self, coord_type="lon_lat"): + import numpy as np + from datetime import datetime + if coord_type == "lon_lat": + coords = { + "lon": ("lon", np.linspace(-10, 10, 3)), + "lat": ("lat", np.linspace(-5, 5, 2)), + "time": ("time", [np.datetime64(datetime(2020, 1, 1), "ns")]), + } + elif coord_type == "longitude_latitude": + coords = { + "longitude": ("longitude", np.linspace(-10, 10, 3)), + "latitude": ("latitude", np.linspace(-5, 5, 2)), + "time": ("time", [np.datetime64(datetime(2020, 1, 1), "ns")]), + } + elif coord_type == "x_y": + coords = { + "x": ("x", np.linspace(0, 100, 3)), + "y": ("y", np.linspace(0, 50, 2)), + "time": ("time", [np.datetime64(datetime(2020, 1, 1), "ns")]), + } + else: + coords = {} + from xarray import Dataset + return Dataset(coords=coords) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_collection_id_with_space_raises(self, mock_open_ds): + mock_open_ds.return_value = self._make_dataset() + with self.assertRaisesRegex(ValueError, "must not contain spaces"): + OscDatasetStacGenerator( + dataset_id="test.zarr", + collection_id="bad id", + workflow_id="wf", + workflow_title="WF", + license_type="CC-BY-4.0", + ) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_spatial_extent_longitude_latitude(self, mock_open_ds): + ds = self._make_dataset("longitude_latitude") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + extent = gen._get_spatial_extent() + self.assertAlmostEqual(extent.bboxes[0][0], -10.0) + self.assertAlmostEqual(extent.bboxes[0][1], -5.0) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_spatial_extent_x_y(self, mock_open_ds): + ds = self._make_dataset("x_y") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + extent = gen._get_spatial_extent() + self.assertAlmostEqual(extent.bboxes[0][0], 0.0) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def 
test_spatial_extent_unknown_coords_raises(self, mock_open_ds): + ds = self._make_dataset("none") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + with self.assertRaisesRegex(ValueError, "recognized spatial coordinates"): + gen._get_spatial_extent() + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_temporal_extent_no_time_raises(self, mock_open_ds): + ds = self._make_dataset("none") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + with self.assertRaisesRegex(ValueError, "time"): + gen._get_temporal_extent() + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_normalize_name_none_returns_none(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + self.assertIsNone(OscDatasetStacGenerator._normalize_name(None)) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_build_collection_with_cf_params(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds, cf_params=[{"name": "temperature", "units": "K"}]) + collection = gen.build_dataset_stac_collection(mode="dataset") + self.assertEqual(collection.extra_fields.get("cf:parameter"), [{"name": "temperature", "units": "K"}]) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_build_collection_with_visualisation_link(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds, visualisation_link="https://viewer.example.com/") + collection = gen.build_dataset_stac_collection(mode="dataset") + vis_links = [lnk for lnk in collection.links if lnk.rel == "visualisation"] + self.assertEqual(len(vis_links), 1) + self.assertEqual(vis_links[0].target, "https://viewer.example.com/") + self.assertEqual(vis_links[0].title, "Dataset visualisation") + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def 
test_build_collection_mode_all_adds_experiment_link(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + collection = gen.build_dataset_stac_collection(mode="all") + exp_links = [lnk for lnk in collection.links if "experiments" in str(lnk.target)] + self.assertEqual(len(exp_links), 1) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_s3_to_https(self, mock_open_ds): + self.assertEqual( + OscDatasetStacGenerator._s3_to_https("s3://my-bucket/path/to/file.json"), + "https://my-bucket.s3.amazonaws.com/path/to/file.json", + ) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_update_existing_variable_catalog(self, mock_open_ds): + import json + import os + import tempfile + + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds, osc_themes=["land"]) + + base = { + "type": "Catalog", + "id": "var1", + "stac_version": "1.0.0", + "description": "Variable catalog", + "links": [], + } + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(base, f) + tmp_path = f.name + try: + result = gen.update_existing_variable_catalog(tmp_path, "var1") + finally: + os.unlink(tmp_path) + + rels = [lnk["rel"] for lnk in result["links"]] + self.assertIn("child", rels) + self.assertIn("related", rels) # theme link diff --git a/deep_code/tests/utils/test_helper.py b/deep_code/tests/utils/test_helper.py index a7c8c3f..5044343 100644 --- a/deep_code/tests/utils/test_helper.py +++ b/deep_code/tests/utils/test_helper.py @@ -10,7 +10,7 @@ import xarray import xarray as xr -from deep_code.utils.helper import open_dataset +from deep_code.utils.helper import open_dataset, serialize def make_dummy_dataset(): @@ -156,3 +156,23 @@ def test_uses_provided_logger(self, mock_new_store, mock_get_logger): custom_logger.info.assert_any_call( "Successfully opened dataset 'test-id' with configuration: Public store" ) + + 
+class TestSerialize(unittest.TestCase): + def test_set_converted_to_list(self): + result = serialize({1, 2, 3}) + self.assertIsInstance(result, list) + self.assertCountEqual(result, [1, 2, 3]) + + def test_object_with_dict_returns_dict(self): + class Obj: + def __init__(self): + self.x = 1 + self.y = 2 + + result = serialize(Obj()) + self.assertEqual(result, {"x": 1, "y": 2}) + + def test_unserializable_raises_type_error(self): + with self.assertRaises(TypeError): + serialize(42) From 875fbd4137893a50563e50f3d30e2b7c17eabb18 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 20 Mar 2026 11:45:05 +0100 Subject: [PATCH 15/16] support non-default osc_project from dataset config --- deep_code/tools/new.py | 1 + deep_code/tools/publish.py | 2 ++ docs/configuration.md | 3 +++ 3 files changed, 6 insertions(+) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index a9f591f..d7fb209 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -76,6 +76,7 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: "description": "[OPTIONAL: human-readable description of the dataset. Overrides the description attribute in the Zarr store if set]", "documentation_link": "[OPTIONAL: link to documentation, publication, or handbook]", "visualisation_link": "[OPTIONAL: URL to a visualisation of the dataset (e.g. xcube Viewer, WMS)]", + "osc_project": "[OPTIONAL: OSC project ID (e.g. deep-earth-system-data-lab). Defaults to deep-earth-system-data-lab]", "osc_project_title": "[OPTIONAL: display title of the OSC project as it appears in the catalog (e.g. DeepESDL). 
Defaults to a formatted version of osc_project if omitted]", "access_link": "[OPTIONAL: public S3 URL of the Zarr store — defaults to s3://deep-esdl-public/{dataset_id}]", "cf_parameter": [{"name": "[OPTIONAL: CF standard name]", "units": "[unit string]"}], diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 2daf1cf..de76d72 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -270,6 +270,7 @@ def publish_dataset( cf_params = self.dataset_config.get("cf_parameter") license_type = self.dataset_config.get("license_type") visualisation_link = self.dataset_config.get("visualisation_link") + osc_project = self.dataset_config.get("osc_project") osc_project_title = self.dataset_config.get("osc_project_title") description = self.dataset_config.get("description") @@ -305,6 +306,7 @@ def publish_dataset( osc_themes=osc_themes, cf_params=cf_params, visualisation_link=visualisation_link, + osc_project=osc_project, osc_project_title=osc_project_title, description=description, ) diff --git a/docs/configuration.md b/docs/configuration.md index 83b8abf..d8d90d6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -26,6 +26,8 @@ osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/th osc_region: global dataset_status: completed # ongoing | completed | planned (default: ongoing) documentation_link: https://example.com/docs +visualisation_link: https://example.com/viewer # URL to a visualisation of the dataset +osc_project: deep-earth-system-data-lab # defaults to deep-earth-system-data-lab access_link: s3://bucket/your-dataset.zarr # defaults to s3://deep-esdl-public/{dataset_id} # CF parameter overrides (list of {name, units, ...} dicts) @@ -48,6 +50,7 @@ cf_parameter: | `description` | No | Human-readable description of the dataset. Overrides the `description` attribute in the Zarr store; falls back to `"No description available."` if neither is set. 
| | `documentation_link` | No | URL to dataset documentation. | | `visualisation_link` | No | URL to a visualisation of the dataset (e.g. xcube Viewer, WMS). Added as a `visualisation` link with title `"Dataset visualisation"`. | +| `osc_project` | No | OSC project ID this dataset belongs to (e.g. `deep-earth-system-data-lab`). Defaults to `deep-earth-system-data-lab`. | | `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). | | `stac_catalog_s3_root` | Yes | S3 root where the STAC Catalog and Item are published. Publishing fails if this field is absent. See [STAC Catalog on S3](#stac-catalog-on-s3). | From 15ebc76cc94b79278e2070e65de142750c7261f5 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 20 Mar 2026 15:30:21 +0100 Subject: [PATCH 16/16] fixed workflow template --- deep_code/tools/new.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index d7fb209..f2b2654 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -14,16 +14,11 @@ class TemplateGenerator: def generate_workflow_template(output_path: Optional[str] = None) -> str: """Generate a complete template with all possible keys and placeholder values""" - required = { + workflow_template = { "workflow_id": "[REQUIRED: unique identifier for your workflow]", "properties": { "title": "[REQUIRED: human-readable title of the workflow]", "license": "[REQUIRED: SPDX license identifier, e.g. CC-BY-4.0, MIT, proprietary]", - }, - } - - optional = { - "properties": { "description": "[OPTIONAL: concise summary of what the workflow does]", "keywords": ["[OPTIONAL: KEYWORD1]", "[KEYWORD2]"], "themes": ["[OPTIONAL: thematic area, e.g. 
land, ocean, atmosphere]"], @@ -53,10 +48,8 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str: with open(output_path, "w") as f: f.write("# Workflow Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") - f.write("# --- REQUIRED fields ---\n") - f.write(yaml.dump(required, sort_keys=False, width=1000, default_flow_style=False)) - f.write("\n# --- OPTIONAL fields ---\n") - f.write(yaml.dump(optional, sort_keys=False, width=1000, default_flow_style=False)) + f.write(yaml.dump(workflow_template, sort_keys=False, width=1000, + default_flow_style=False)) @staticmethod def generate_dataset_template(output_path: Optional[str] = None) -> str: