From 053f33d042635292a80bd83a109c871149d9e877 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:03:58 +0100 Subject: [PATCH 01/16] validate collection_id, normalize osc_themes, fix variable description fallback, add visualisation link, osc_project_title, and STAC item/catalog log messages --- deep_code/utils/dataset_stac_generator.py | 29 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 40a086f..288eaeb 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -50,20 +50,29 @@ def __init__( osc_missions: list[str] | None = None, cf_params: list[dict[str]] | None = None, osc_project: str = "deep-earth-system-data-lab", + osc_project_title: str | None = None, + visualisation_link: str | None = None, ): + if " " in collection_id: + raise ValueError( + f"collection_id must not contain spaces: {collection_id!r}. " + "Use hyphens as word separators (e.g. 'My-Dataset-2024')." 
+ ) self.dataset_id = dataset_id self.collection_id = collection_id self.workflow_id = workflow_id self.workflow_title = workflow_title self.license_type = license_type self.osc_project = osc_project + self.osc_project_title = osc_project_title or self.format_string(osc_project) self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}" self.documentation_link = documentation_link self.osc_status = osc_status self.osc_region = osc_region - self.osc_themes = osc_themes or [] + self.osc_themes = [t.lower() for t in (osc_themes or [])] self.osc_missions = osc_missions or [] self.cf_params = cf_params or {} + self.visualisation_link = visualisation_link self.logger = logging.getLogger(__name__) self.dataset = open_dataset(dataset_id=dataset_id, logger=self.logger) self.variables_metadata = self.get_variables_metadata() @@ -215,7 +224,7 @@ def build_variable_catalog(self, var_metadata) -> Catalog: # Create a PySTAC Catalog object var_catalog = Catalog( id=var_id, - description=var_metadata.get("description"), + description=var_metadata.get("description") or self.format_string(var_id), title=self.format_string(var_id), stac_extensions=[ "https://stac-extensions.github.io/themes/v1.0.0/schema.json" @@ -474,6 +483,7 @@ def build_zarr_stac_item(self, stac_catalog_s3_root: str) -> Item: Returns: A :class:`pystac.Item` ready to be serialised to S3. """ + self.logger.info(f"Building STAC Item for collection '{self.collection_id}'.") spatial_extent = self._get_spatial_extent() temporal_extent = self._get_temporal_extent() general_metadata = self._get_general_metadata() @@ -542,6 +552,7 @@ def build_zarr_stac_item(self, stac_catalog_s3_root: str) -> Item: title="Consolidated Zarr Metadata", roles=["metadata"], )) + self.logger.info(f"STAC Item built: {item_href}") return item def build_zarr_stac_catalog_file_dict( @@ -563,6 +574,10 @@ def build_zarr_stac_catalog_file_dict( Returns: ``{s3_path: content_dict}`` for every file to be written to S3. 
""" + self.logger.info( + f"Building STAC Catalog file dict for collection '{self.collection_id}' " + f"at root '{stac_catalog_s3_root}'." + ) root = stac_catalog_s3_root.rstrip("/") catalog_href = f"{root}/catalog.json" @@ -581,9 +596,11 @@ def build_zarr_stac_catalog_file_dict( title=self.collection_id, )) + item_href = f"{root}/{self.collection_id}/item.json" + self.logger.info(f"STAC Catalog file dict ready: {catalog_href}, {item_href}") return { catalog_href: catalog.to_dict(transform_hrefs=False), - f"{root}/{self.collection_id}/item.json": item.to_dict(transform_hrefs=False), + item_href: item.to_dict(transform_hrefs=False), } def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | None = None) -> Collection: @@ -642,6 +659,10 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N collection.add_link( Link(rel="via", target=self.documentation_link, title="Documentation") ) + if self.visualisation_link: + collection.add_link( + Link(rel="visualisation", target=self.visualisation_link, title="Dataset visualisation") + ) collection.add_link( Link( rel="parent", @@ -689,7 +710,7 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N rel="related", target=f"../../projects/{self.osc_project}/collection.json", media_type="application/json", - title=f"Project: {self.format_string(self.osc_project)}", + title=f"Project: {self.osc_project_title}", ) ) From fafcf0941459dbdc1ef6c7b08d81e4a036ce1094 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:04:15 +0100 Subject: [PATCH 02/16] omit S3 key/secret from storage_options when env vars are unset --- deep_code/utils/helper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index 0e33585..9452b81 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -63,8 +63,10 @@ def open_dataset( "root": os.environ.get("S3_USER_STORAGE_BUCKET", 
root), "storage_options": { "anon": False, - "key": os.environ.get("S3_USER_STORAGE_KEY"), - "secret": os.environ.get("S3_USER_STORAGE_SECRET"), + **({ + "key": os.environ["S3_USER_STORAGE_KEY"], + "secret": os.environ["S3_USER_STORAGE_SECRET"], + } if os.environ.get("S3_USER_STORAGE_KEY") and os.environ.get("S3_USER_STORAGE_SECRET") else {}), }, }, }, From fd8387c9f72553d41588d69760d65c0dad08b0b4 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:04:35 +0100 Subject: [PATCH 03/16] make license_type, stac_catalog_s3_root mandatory; wire visualisation_link and osc_project_title; add PR attribution footer --- deep_code/tools/publish.py | 53 ++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index dda7c2d..7718ae9 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -267,10 +267,26 @@ def publish_dataset( osc_themes = self.dataset_config.get("osc_themes") cf_params = self.dataset_config.get("cf_parameter") license_type = self.dataset_config.get("license_type") + visualisation_link = self.dataset_config.get("visualisation_link") + osc_project_title = self.dataset_config.get("osc_project_title") if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") + if not license_type: + raise ValueError( + "license_type is required in the dataset config. " + "Provide an SPDX identifier (e.g. 'CC-BY-4.0', 'MIT', 'proprietary')." + ) + + stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") + if not stac_catalog_s3_root: + raise ValueError( + "stac_catalog_s3_root is required in the dataset config. " + "Provide the S3 root where the STAC catalog should be published " + "(e.g. 's3://my-bucket/stac/my-collection/')." 
+ ) + logger.info("Generating STAC collection...") generator = OscDatasetStacGenerator( @@ -285,6 +301,8 @@ def publish_dataset( osc_region=osc_region, osc_themes=osc_themes, cf_params=cf_params, + visualisation_link=visualisation_link, + osc_project_title=osc_project_title, ) # Store so publish() can reuse it for zarr STAC catalog generation self._last_generator = generator @@ -421,6 +439,13 @@ def generate_workflow_experiment_records( raise ValueError("workflow_id is missing in workflow config.") properties_list = self.workflow_config.get("properties", {}) + + if not properties_list.get("license"): + raise ValueError( + "license is required under 'properties' in the workflow config. " + "Provide an SPDX identifier (e.g. 'CC-BY-4.0', 'MIT', 'proprietary')." + ) + osc_themes = properties_list.get("themes") contacts = self.workflow_config.get("contact", []) links = self.workflow_config.get("links", []) @@ -587,20 +612,16 @@ def publish( ds_files = self.publish_dataset(write_to_file=False, mode=mode) files.update(ds_files) - # Publish STAC catalog + item to S3 when stac_catalog_s3_root is configured. - # This is independent of the GitHub PR and happens immediately. + # Publish STAC catalog + item to S3 (stac_catalog_s3_root is mandatory). 
stac_catalog_s3_root = self.dataset_config.get("stac_catalog_s3_root") - if stac_catalog_s3_root: - logger.info( - f"Publishing STAC catalog to S3: {stac_catalog_s3_root}" - ) - zarr_stac_files = self._last_generator.build_zarr_stac_catalog_file_dict( - stac_catalog_s3_root - ) - self._write_stac_catalog_to_s3( - zarr_stac_files, self._get_stac_s3_storage_options() - ) - logger.info("STAC catalog written to S3.") + logger.info(f"Publishing STAC catalog to S3: {stac_catalog_s3_root}") + zarr_stac_files = self._last_generator.build_zarr_stac_catalog_file_dict( + stac_catalog_s3_root + ) + self._write_stac_catalog_to_s3( + zarr_stac_files, self._get_stac_s3_storage_options() + ) + logger.info("STAC catalog written to S3.") if mode in ("workflow", "all"): wf_files = self.generate_workflow_experiment_records( @@ -633,7 +654,11 @@ def publish( ) commit_message = f"Publish {mode_label}" pr_title = f"Publish {mode_label}" - pr_body = f"This PR publishes {mode_label} to the repository." + pr_body = ( + f"This PR publishes {mode_label} to the repository.\n\n" + "---\n" + "_Generated with [deep-code](https://github.com/deepesdl/deep-code)_" + ) pr_url = self.gh_publisher.publish_files( branch_name=branch_name, From 061aa8b0a735270cb748657f61f2cd93940494d5 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:04:58 +0100 Subject: [PATCH 04/16] split dataset and workflow templates into required/optional sections with inline labels --- deep_code/tools/new.py | 85 +++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 8e3e67a..0932a9f 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -14,28 +14,29 @@ class TemplateGenerator: def generate_workflow_template(output_path: Optional[str] = None) -> str: """Generate a complete template with all possible keys and placeholder values""" - template = { - "workflow_id": "[A unique identifier for your workflow]", 
+ required = { + "workflow_id": "[REQUIRED: unique identifier for your workflow]", "properties": { - "title": "[Human-readable title of the workflow]", - "description": "[A concise summary of what the workflow does]", - "keywords": ["[KEYWORD1]", "[KEYWORD2]"], - "themes": [ - "[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]", - "[THEME1]", - "[THEME2]", - ], - "license": "[License type (e.g. MIT, Apache-2.0, CC-BY-4.0, proprietary)]", + "title": "[REQUIRED: human-readable title of the workflow]", + "license": "[REQUIRED: SPDX license identifier, e.g. CC-BY-4.0, MIT, proprietary]", + }, + } + + optional = { + "properties": { + "description": "[OPTIONAL: concise summary of what the workflow does]", + "keywords": ["[OPTIONAL: KEYWORD1]", "[KEYWORD2]"], + "themes": ["[OPTIONAL: thematic area, e.g. land, ocean, atmosphere]"], "jupyter_kernel_info": { - "name": "[Name of the execution environment or notebook kernel]", + "name": "[OPTIONAL: name of the execution environment or notebook kernel]", "python_version": "[PYTHON_VERSION]", "env_file": "[Link to the environment file (YAML) used to create the notebook environment]", }, }, - "jupyter_notebook_url": "[Link to the source notebook (e.g. on GitHub)]", + "jupyter_notebook_url": "[OPTIONAL: link to the source notebook (e.g. 
on GitHub)]", "contact": [ { - "name": "[Contact person's full name]", + "name": "[OPTIONAL: contact person's full name]", "organization": "[Affiliated institution or company]", "links": [ { @@ -48,53 +49,53 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str: ], } - yaml_str = yaml.dump( - template, sort_keys=False, width=1000, default_flow_style=False - ) - if output_path: with open(output_path, "w") as f: - f.write("# Complete Workflow Configuration Template\n") + f.write("# Workflow Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") - f.write(yaml_str) + f.write("# --- REQUIRED fields ---\n") + f.write(yaml.dump(required, sort_keys=False, width=1000, default_flow_style=False)) + f.write("\n# --- OPTIONAL fields ---\n") + f.write(yaml.dump(optional, sort_keys=False, width=1000, default_flow_style=False)) @staticmethod def generate_dataset_template(output_path: Optional[str] = None) -> str: """Generate a complete dataset template with all possible keys and placeholder values""" - template = { - "dataset_id": "[The name of the dataset object within your S3 bucket].zarr", - "collection_id": "[A unique identifier for the dataset collection]", - "osc_themes": [ - "[Oceans]", - "[Open Science theme (choose from " - "https://opensciencedata.esa.int/themes/catalog)", - ], - "osc_region": "[Geographical coverage, e.g. 'global']", - "dataset_status": "[Status of the dataset: 'ongoing', 'completed', or 'planned']", - "documentation_link": "[Link to relevant documentation, publication, or handbook]", + required = { + "dataset_id": "[REQUIRED: name of the Zarr store in your S3 bucket, e.g. my-dataset.zarr]", + "collection_id": "[REQUIRED: unique identifier, no spaces — use hyphens (e.g. My-Dataset-2024)]", + "license_type": "[REQUIRED: SPDX license identifier, e.g. CC-BY-4.0, MIT, proprietary]", + "stac_catalog_s3_root": "[REQUIRED: S3 root for the STAC Catalog + Item, e.g. 
s3://my-bucket/stac/my-collection/]", } - yaml_str = yaml.dump( - template, sort_keys=False, width=1000, default_flow_style=False - ) + optional = { + "osc_themes": ["[OPTIONAL: OSC theme slug, e.g. land, ocean, atmosphere — auto-lowercased]"], + "osc_region": "[OPTIONAL: geographical coverage, e.g. Global]", + "dataset_status": "[OPTIONAL: ongoing | completed | planned (default: ongoing)]", + "documentation_link": "[OPTIONAL: link to documentation, publication, or handbook]", + "visualisation_link": "[OPTIONAL: URL to a visualisation of the dataset (e.g. xcube Viewer, WMS)]", + "osc_project_title": "[OPTIONAL: display title of the OSC project as it appears in the catalog (e.g. DeepESDL). Defaults to a formatted version of osc_project if omitted]", + "access_link": "[OPTIONAL: public S3 URL of the Zarr store — defaults to s3://deep-esdl-public/{dataset_id}]", + "cf_parameter": [{"name": "[OPTIONAL: CF standard name]", "units": "[unit string]"}], + } stac_catalog_comment = ( - "\n# Optional: publish a STAC Catalog + Item to S3 alongside the Zarr.\n" - "# When set, deep-code writes:\n" - "# {stac_catalog_s3_root}/catalog.json (STAC Catalog root)\n" + "\n# stac_catalog_s3_root: deep-code writes the following files to this S3 root:\n" + "# {stac_catalog_s3_root}/catalog.json (STAC Catalog root)\n" "# {stac_catalog_s3_root}/{collection_id}/item.json (STAC Item for the whole Zarr)\n" - "# and adds a 'child' link from the OSC Collection to this S3 catalog.\n" - "# S3 write credentials are resolved in order from:\n" + "# S3 write credentials are resolved in order:\n" "# 1. STAC_S3_KEY / STAC_S3_SECRET env vars (STAC-specific, any bucket)\n" "# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY env vars\n" "# 3. 
boto3 default chain (IAM role, ~/.aws/credentials)\n" - "# stac_catalog_s3_root: s3://[YOUR-BUCKET]/stac/[collection-id]/\n" ) if output_path: with open(output_path, "w") as f: - f.write("# Complete Dataset Configuration Template\n") + f.write("# Dataset Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") - f.write(yaml_str) + f.write("# --- REQUIRED fields ---\n") + f.write(yaml.dump(required, sort_keys=False, width=1000, default_flow_style=False)) + f.write("\n# --- OPTIONAL fields ---\n") + f.write(yaml.dump(optional, sort_keys=False, width=1000, default_flow_style=False)) f.write(stac_catalog_comment) From 86fb0ea212754321419278767f5cf0b0d48d7b96 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:05:16 +0100 Subject: [PATCH 05/16] add 0.1.9 changelog entries --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 3f69147..daab3af 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -71,3 +71,10 @@ - `jupyter_kernel_info` is now optional in `RecordProperties`; workflow configs without a notebook URL no longer require this field. - Removed redundant `hasattr` guard for `_last_generator` in `Publisher.publish()`. - Added automated MkDocs GitHub Pages deployment on release via a dedicated `docs.yml` workflow. +- `osc_themes` values are now automatically lowercased, so `'Land'` and `'LAND'` are treated the same as `'land'`, preventing theme validation failures. +- `collection_id` is now validated to contain no spaces; a clear error is raised with a hint to use hyphens instead. +- `license_type` (dataset) and `properties.license` (workflow) are now mandatory fields; publishing fails immediately with a descriptive error if either is missing. +- Variable catalog `description` now falls back to the title-cased variable ID when neither `description` nor `long_name` attrs are present on the zarr variable, preventing `null` description validation failures. 
+- Pull requests opened by deep-code now include a "Generated with deep-code" note in the PR description. +- `stac_catalog_s3_root` is now a mandatory field in the dataset config; publishing fails immediately with a descriptive error if it is absent. +- Added optional `visualisation_link` field to the dataset config; when provided, a `related` link with title `"Dataset Visualisation"` is added to the generated OSC collection. From 9b0e2bda4aed657e36aac608cfbaba87a81abdb7 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:05:41 +0100 Subject: [PATCH 06/16] 0.1.9.dev1 --- deep_code/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/version.py b/deep_code/version.py index c0819c3..ed094a1 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -version = "0.1.9.dev0" +version = "0.1.9.dev1" From afed7eb9eff1b6393b6f5a718545438f2386a9fa Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:05:58 +0100 Subject: [PATCH 07/16] update docs --- docs/cli.md | 2 ++ docs/configuration.md | 28 ++++++++++------------------ 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 14947ed..f8e3a5f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -28,3 +28,5 @@ Options: 1. Reads your configs and builds dataset STAC collections plus variable catalogs. 2. Builds workflow and experiment OGC API Records. 3. Forks/clones the target metadata repo (production, staging, or testing), commits generated JSON, and opens a pull request on your behalf. + +The pull request description includes a "Generated with deep-code" attribution note. diff --git a/docs/configuration.md b/docs/configuration.md index bf2b955..b37c28c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -17,11 +17,12 @@ The sections below document every field in those templates. 
```yaml # Required dataset_id: your-dataset.zarr -collection_id: your-collection +collection_id: your-collection # no spaces — use hyphens license_type: CC-BY-4.0 +stac_catalog_s3_root: s3://bucket/stac/your-collection/ # Optional -osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/themes/catalog +osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/themes/catalog — auto-lowercased osc_region: global dataset_status: completed # ongoing | completed | planned (default: ongoing) documentation_link: https://example.com/docs @@ -31,15 +32,6 @@ access_link: s3://bucket/your-dataset.zarr # defaults to s3://deep-esdl-public cf_parameter: - name: sea_surface_temperature units: kelvin - -# Optional: publish a STAC Catalog + Item next to the data on S3. -# When set, a lightweight STAC hierarchy (catalog.json → item.json) is written -# directly to S3 and a "via" link is added to the OSC collection pointing to it. -# S3 write credentials are resolved in order: -# 1. STAC_S3_KEY / STAC_S3_SECRET (STAC-specific, any bucket) -# 2. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY -# 3. boto3 default chain (IAM role, ~/.aws/credentials) -stac_catalog_s3_root: s3://bucket/stac/your-collection/ ``` ### Field reference @@ -47,20 +39,20 @@ stac_catalog_s3_root: s3://bucket/stac/your-collection/ | Field | Required | Description | |---|---|---| | `dataset_id` | Yes | Zarr store identifier (used to open the dataset). | -| `collection_id` | Yes | Unique ID for the STAC collection in the OSC catalog. | -| `license_type` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`). | -| `osc_themes` | No | List of OSC theme slugs (e.g. `[cryosphere, oceans]`). | +| `collection_id` | Yes | Unique ID for the STAC collection in the OSC catalog. **Must not contain spaces** — use hyphens as word separators (e.g. `My-Dataset-2024`). | +| `license_type` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`). Publishing fails if this field is absent. 
| +| `osc_themes` | No | List of OSC theme slugs (e.g. `[cryosphere, oceans]`). Values are automatically lowercased so `Land` and `land` are equivalent. | | `osc_region` | No | Geographical region label (default: `Global`). | | `dataset_status` | No | One of `ongoing`, `completed`, or `planned` (default: `ongoing`). | | `access_link` | No | Public S3 URL of the Zarr store. Defaults to `s3://deep-esdl-public/{dataset_id}`. | | `documentation_link` | No | URL to dataset documentation. | +| `visualisation_link` | No | URL to a visualisation of the dataset (e.g. xcube Viewer, WMS). Added as a `visualisation` link with title `"Dataset visualisation"`. | | `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). | -| `stac_catalog_s3_root` | No | S3 root for the dataset-level STAC Catalog/Item. See [STAC Catalog on S3](#stac-catalog-on-s3). | +| `stac_catalog_s3_root` | Yes | S3 root where the STAC Catalog and Item are published. Publishing fails if this field is absent. See [STAC Catalog on S3](#stac-catalog-on-s3). | ### STAC Catalog on S3 -Setting `stac_catalog_s3_root` generates a two-file STAC hierarchy on S3 alongside -the data: +`stac_catalog_s3_root` is required. deep-code writes a two-file STAC hierarchy to S3 alongside the data: ``` s3://bucket/stac/your-collection/ @@ -124,7 +116,7 @@ links: | `properties.description` | No | Short summary of what the workflow does. | | `properties.keywords` | No | List of keyword strings. | | `properties.themes` | No | List of OSC theme slugs. | -| `properties.license` | No | License identifier (e.g. `proprietary`, `CC-BY-4.0`). | +| `properties.license` | Yes | SPDX license identifier (e.g. `CC-BY-4.0`, `proprietary`). Publishing fails if this field is absent. | | `jupyter_notebook_url` | No | Link to the source notebook on GitHub. When omitted, kernel and application links are skipped. 
| | `properties.jupyter_kernel_info` | No | Kernel name, Python version, and environment file URL. Only published when `jupyter_notebook_url` is set. | | `contact` | No | List of contact objects with `name`, `organization`, and `links`. | From b9ad319f91123d8e22708e2a29b8180ce84bae4e Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:34:16 +0100 Subject: [PATCH 08/16] normalize workflow-id --- deep_code/tools/publish.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 7718ae9..2daf1cf 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -172,7 +172,9 @@ def __init__( self.workflow_title = self.workflow_config.get("properties", {}).get( "title" ) - self.workflow_id = self.workflow_config.get("workflow_id") + self.workflow_id = self._normalize_name( + self.workflow_config.get("workflow_id") + ) def _read_config_files(self) -> None: if self.dataset_config_path: @@ -269,6 +271,7 @@ def publish_dataset( license_type = self.dataset_config.get("license_type") visualisation_link = self.dataset_config.get("visualisation_link") osc_project_title = self.dataset_config.get("osc_project_title") + description = self.dataset_config.get("description") if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") @@ -303,6 +306,7 @@ def publish_dataset( cf_params=cf_params, visualisation_link=visualisation_link, osc_project_title=osc_project_title, + description=description, ) # Store so publish() can reuse it for zarr STAC catalog generation self._last_generator = generator From b156b93931ab7b047c1cbe341ac6acda5e61100a Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:34:49 +0100 Subject: [PATCH 09/16] added description as optional field in dataset config --- deep_code/tools/new.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 0932a9f..a9f591f
100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -73,6 +73,7 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: "osc_themes": ["[OPTIONAL: OSC theme slug, e.g. land, ocean, atmosphere — auto-lowercased]"], "osc_region": "[OPTIONAL: geographical coverage, e.g. Global]", "dataset_status": "[OPTIONAL: ongoing | completed | planned (default: ongoing)]", + "description": "[OPTIONAL: human-readable description of the dataset. Overrides the description attribute in the Zarr store if set]", "documentation_link": "[OPTIONAL: link to documentation, publication, or handbook]", "visualisation_link": "[OPTIONAL: URL to a visualisation of the dataset (e.g. xcube Viewer, WMS)]", "osc_project_title": "[OPTIONAL: display title of the OSC project as it appears in the catalog (e.g. DeepESDL). Defaults to a formatted version of osc_project if omitted]", From 46171f055e61b63635125307d193f86658f56547 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:41:53 +0100 Subject: [PATCH 10/16] fix STAC catalog links to use HTTPS and OSC browser convention --- deep_code/utils/dataset_stac_generator.py | 44 ++++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 288eaeb..89b6e67 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -52,6 +52,7 @@ def __init__( osc_project: str = "deep-earth-system-data-lab", osc_project_title: str | None = None, visualisation_link: str | None = None, + description: str | None = None, ): if " " in collection_id: raise ValueError( @@ -73,6 +74,7 @@ def __init__( self.osc_missions = osc_missions or [] self.cf_params = cf_params or {} self.visualisation_link = visualisation_link + self.description = description self.logger = logging.getLogger(__name__) self.dataset = open_dataset(dataset_id=dataset_id, logger=self.logger) 
self.variables_metadata = self.get_variables_metadata() @@ -137,8 +139,10 @@ def _normalize_name(name: str | None) -> str | None: def _get_general_metadata(self) -> dict: return { - "description": self.dataset.attrs.get( - "description", "No description available." + "description": ( + self.description + or self.dataset.attrs.get("description") + or "No description available." ) } @@ -456,6 +460,17 @@ def update_existing_variable_catalog(self, var_file_path, var_id) -> dict: ) return data + @staticmethod + def _s3_to_https(s3_url: str) -> str: + """Convert an s3:// URL to its HTTPS equivalent using AWS virtual-hosted style. + + Example: + s3://my-bucket/path/to/file → https://my-bucket.s3.amazonaws.com/path/to/file + """ + without_scheme = s3_url[len("s3://"):] + bucket, _, key = without_scheme.partition("/") + return f"https://{bucket}.s3.amazonaws.com/{key}" + @staticmethod def format_string(s: str) -> str: # Strip leading/trailing spaces/underscores and replace underscores with spaces @@ -726,17 +741,28 @@ def build_dataset_stac_collection(self, mode: str, stac_catalog_s3_root: str | N collection.license = self.license_type - # Link to the S3-hosted STAC catalog when provided. - # Uses rel="via" (not "child") because the OSC validator requires every - # "child" link to resolve to a file inside the metadata repository; - # the S3 catalog lives outside the repo and would fail that check. + # Add links to the S3-hosted STAC catalog following the OSC convention: + # via → STAC browser URL (human-browsable, HTTPS) + # child → direct HTTPS URL to catalog.json (machine-readable) + # The s3:// URL is never used directly in the collection as it fails the + # products/children.json uri-reference format check. 
if stac_catalog_s3_root: - catalog_href = stac_catalog_s3_root.rstrip("/") + "/catalog.json" + catalog_s3 = stac_catalog_s3_root.rstrip("/") + "/catalog.json" + catalog_https = self._s3_to_https(catalog_s3) + stac_browser_href = ( + "https://opensciencedata.esa.int/stac-browser/#/external/" + + catalog_https[len("https://"):] + ) collection.add_link(Link( rel="via", - target=catalog_href, + target=stac_browser_href, + title="Access", + )) + collection.add_link(Link( + rel="child", + target=catalog_https, media_type="application/json", - title="STAC Catalog", + title="Items", )) # Validate OSC extension fields From 29a44cb0c0f1e47ba73112db06d234ce8b77ef03 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:42:30 +0100 Subject: [PATCH 11/16] updated docs with description filed --- docs/configuration.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/configuration.md b/docs/configuration.md index b37c28c..83b8abf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -45,6 +45,7 @@ cf_parameter: | `osc_region` | No | Geographical region label (default: `Global`). | | `dataset_status` | No | One of `ongoing`, `completed`, or `planned` (default: `ongoing`). | | `access_link` | No | Public S3 URL of the Zarr store. Defaults to `s3://deep-esdl-public/{dataset_id}`. | +| `description` | No | Human-readable description of the dataset. Overrides the `description` attribute in the Zarr store; falls back to `"No description available."` if neither is set. | | `documentation_link` | No | URL to dataset documentation. | | `visualisation_link` | No | URL to a visualisation of the dataset (e.g. xcube Viewer, WMS). Added as a `visualisation` link with title `"Dataset visualisation"`. | | `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). 
| From 942255a0f802ed3f9679533be50d11121995be89 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 16:42:51 +0100 Subject: [PATCH 12/16] updated change log --- CHANGES.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index daab3af..396216e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -77,4 +77,8 @@ - Variable catalog `description` now falls back to the title-cased variable ID when neither `description` nor `long_name` attrs are present on the zarr variable, preventing `null` description validation failures. - Pull requests opened by deep-code now include a "Generated with deep-code" note in the PR description. - `stac_catalog_s3_root` is now a mandatory field in the dataset config; publishing fails immediately with a descriptive error if it is absent. -- Added optional `visualisation_link` field to the dataset config; when provided, a `related` link with title `"Dataset Visualisation"` is added to the generated OSC collection. +- STAC catalog links in the OSC collection now follow the OSC convention: a `via` link to the STAC browser URL and a `child` link to the direct HTTPS catalog URL. The `s3://` URL is converted to HTTPS (AWS virtual-hosted style) to satisfy the `uri-reference` format check in the OSC products schema. +- Added optional `visualisation_link` field to the dataset config; when provided, a `visualisation` link with title `"Dataset visualisation"` is added to the generated OSC collection. +- Added optional `description` field to the dataset config; overrides the `description` attribute from the Zarr store when set. +- Added optional `osc_project_title` field to the dataset config to correctly set the project link title (e.g. `"DeepESDL"`) instead of deriving it from the project ID. +- Fixed `workflow_id` not being normalised (slugified) when stored on `Publisher`, causing spaces in experiment link hrefs and failing `uri-reference` format validation. 
From 0f8fea7727e4808e1b836f30a433552356d07f88 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 17:02:38 +0100 Subject: [PATCH 13/16] fix tests --- deep_code/tests/tools/test_publish.py | 47 ++++++++++++++----- .../utils/test_dataset_stac_generator.py | 34 +++++++++----- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index de841f4..4c388ac 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -126,11 +126,22 @@ def test_environment_repo_selection(self, mock_gp): == "open-science-catalog-metadata-testing" ) + @patch.object(Publisher, "_write_stac_catalog_to_s3") @patch.object(Publisher, "publish_dataset", return_value={"a": {}}) @patch.object( Publisher, "generate_workflow_experiment_records", return_value={"b": {}} ) - def test_publish_mode_routing(self, mock_wf, mock_ds): + def test_publish_mode_routing(self, mock_wf, mock_ds, mock_s3): + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = {} + self.publisher._last_generator = mock_generator + self.publisher.dataset_config = { + "stac_catalog_s3_root": "s3://bucket/stac/", + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } + self.publisher.gh_publisher.publish_files.return_value = "PR_URL" + # dataset only self.publisher.publish(write_to_file=True, mode="dataset") mock_ds.assert_called() @@ -142,21 +153,28 @@ def test_publish_mode_routing(self, mock_wf, mock_ds): mock_ds.assert_not_called() mock_wf.assert_called() + @patch.object(Publisher, "_write_stac_catalog_to_s3") @patch.object(Publisher, "generate_workflow_experiment_records", return_value={}) @patch.object(Publisher, "publish_dataset", return_value={}) def test_publish_nothing_to_publish_raises( - self, mock_publish_dataset, mock_generate_workflow_experiment_records + self, mock_publish_dataset, mock_generate_workflow_experiment_records, mock_s3 
): + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = {} + self.publisher._last_generator = mock_generator + self.publisher.dataset_config = {"stac_catalog_s3_root": "s3://bucket/stac/"} + with pytest.raises(ValueError): self.publisher.publish(write_to_file=False, mode="dataset") mock_publish_dataset.assert_called_once() mock_generate_workflow_experiment_records.assert_not_called() + @patch.object(Publisher, "_write_stac_catalog_to_s3") @patch.object(Publisher, "publish_dataset", return_value={"x": {}}) @patch.object( Publisher, "generate_workflow_experiment_records", return_value={"y": {}} ) - def test_publish_builds_pr_params(self, mock_wf, mock_ds): + def test_publish_builds_pr_params(self, mock_wf, mock_ds, mock_s3): # Make PR creation return a fixed URL self.publisher.gh_publisher.publish_files.return_value = "PR_URL" @@ -164,6 +182,12 @@ def test_publish_builds_pr_params(self, mock_wf, mock_ds): self.publisher.collection_id = "col" self.publisher.workflow_id = "wf" + # _last_generator is set by publish_dataset; since that's mocked, stub it + mock_generator = MagicMock() + mock_generator.build_zarr_stac_catalog_file_dict.return_value = {} + self.publisher._last_generator = mock_generator + self.publisher.dataset_config = {"stac_catalog_s3_root": "s3://bucket/stac/"} + url = self.publisher.publish(write_to_file=False, mode="all") assert url == "PR_URL" @@ -309,6 +333,7 @@ def test_publish_dataset_creates_project_collection_when_missing( "dataset_id": "test-dataset", "collection_id": "test-collection", "license_type": "CC-BY-4.0", + "stac_catalog_s3_root": "s3://bucket/stac/test-collection/", } self.publisher.collection_id = "test-collection" @@ -343,6 +368,7 @@ def test_publish_dataset_updates_project_collection_when_exists( "dataset_id": "test-dataset", "collection_id": "test-collection", "license_type": "CC-BY-4.0", + "stac_catalog_s3_root": "s3://bucket/stac/test-collection/", } self.publisher.collection_id = 
"test-collection" @@ -359,20 +385,15 @@ def test_publish_dataset_updates_project_collection_when_exists( update_methods = [call.args[2] for call in mock_update.call_args_list] self.assertIn(mock_gen.update_deepesdl_collection, update_methods) - @patch.object(Publisher, "publish_dataset", return_value={"github_file.json": {}}) - def test_publish_skips_zarr_stac_when_not_configured(self, mock_publish_ds): - # No stac_catalog_s3_root in config + def test_publish_dataset_raises_when_stac_root_missing(self): + # stac_catalog_s3_root is mandatory; publish_dataset must raise ValueError self.publisher.dataset_config = { "collection_id": "test-collection", "dataset_id": "test-dataset", + "license_type": "CC-BY-4.0", } - self.publisher.gh_publisher.publish_files.return_value = "PR_URL" - - with patch.object( - self.publisher, "_write_stac_catalog_to_s3" - ) as mock_write: - self.publisher.publish(mode="dataset") - mock_write.assert_not_called() + with pytest.raises(ValueError, match="stac_catalog_s3_root"): + self.publisher.publish_dataset(write_to_file=False) class TestParseGithubNotebookUrl: diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 039bac2..75e5817 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -509,28 +509,36 @@ def test_build_zarr_stac_catalog_file_dict_content(self): self.assertIn("zarr-consolidated-metadata", item_dict["assets"]) def test_build_dataset_stac_collection_adds_s3_catalog_via_link(self): - """A 'via' link to the S3 catalog is added when stac_catalog_s3_root is provided. + """A 'via' link (STAC browser) and a 'child' link (HTTPS catalog) are added + when stac_catalog_s3_root is provided. - rel='via' is used (not 'child') because the OSC validator requires every - 'child' link to resolve to a file inside the metadata repository. 
+ The OSC convention uses: + - rel='via' → STAC browser URL + - rel='child' → direct HTTPS catalog URL (s3:// converted to HTTPS) """ s3_root = "s3://test-bucket/stac/my-collection/" collection = self.generator.build_dataset_stac_collection( mode="dataset", stac_catalog_s3_root=s3_root ) - s3_via = next( - ( - lnk - for lnk in collection.links - if lnk.rel == "via" and "catalog.json" in str(lnk.target) - ), + https_catalog = "https://test-bucket.s3.amazonaws.com/stac/my-collection/catalog.json" + stac_browser_href = ( + "https://opensciencedata.esa.int/stac-browser/#/external/" + + https_catalog.replace("https://", "") + ) + + via_link = next( + (lnk for lnk in collection.links if lnk.rel == "via" and "stac-browser" in str(lnk.target)), None, ) - self.assertIsNotNone(s3_via, "Expected a 'via' link pointing to S3 catalog") - self.assertEqual( - s3_via.target, - "s3://test-bucket/stac/my-collection/catalog.json", + self.assertIsNotNone(via_link, "Expected a 'via' STAC browser link") + self.assertEqual(via_link.target, stac_browser_href) + + child_link = next( + (lnk for lnk in collection.links if lnk.rel == "child" and "catalog.json" in str(lnk.target)), + None, ) + self.assertIsNotNone(child_link, "Expected a 'child' HTTPS catalog link") + self.assertEqual(child_link.target, https_catalog) def test_build_dataset_stac_collection_no_s3_via_link_by_default(self): """No S3 catalog 'via' link is added when stac_catalog_s3_root is absent.""" From 2d28ea945b03cb1ba72b7830944925a75a2b5bf6 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 19 Mar 2026 17:22:01 +0100 Subject: [PATCH 14/16] increase test coverage --- deep_code/tests/tools/test_publish.py | 161 +++++++++++++++++- .../utils/test_dataset_stac_generator.py | 156 +++++++++++++++++ deep_code/tests/utils/test_helper.py | 22 ++- 3 files changed, 337 insertions(+), 2 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 4c388ac..6ccc8eb 100644 --- 
a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -386,7 +386,6 @@ def test_publish_dataset_updates_project_collection_when_exists( self.assertIn(mock_gen.update_deepesdl_collection, update_methods) def test_publish_dataset_raises_when_stac_root_missing(self): - # stac_catalog_s3_root is mandatory; publish_dataset must raise ValueError self.publisher.dataset_config = { "collection_id": "test-collection", "dataset_id": "test-dataset", @@ -395,6 +394,166 @@ def test_publish_dataset_raises_when_stac_root_missing(self): with pytest.raises(ValueError, match="stac_catalog_s3_root"): self.publisher.publish_dataset(write_to_file=False) + def test_publish_dataset_raises_when_no_dataset_config(self): + self.publisher.dataset_config = None + with pytest.raises(ValueError, match="No dataset config"): + self.publisher.publish_dataset(write_to_file=False) + + def test_publish_dataset_raises_when_ids_missing(self): + self.publisher.dataset_config = {"collection_id": "", "dataset_id": ""} + with pytest.raises(ValueError, match="Dataset ID or Collection ID missing"): + self.publisher.publish_dataset(write_to_file=False) + + def test_publish_dataset_raises_when_license_missing(self): + self.publisher.dataset_config = { + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } + with pytest.raises(ValueError, match="license_type is required"): + self.publisher.publish_dataset(write_to_file=False) + + def test_write_to_file_serializes_dict(self): + import json + import os + import tempfile + + with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as f: + path = f.name + try: + Publisher._write_to_file(path, {"a": 1}) + with open(path) as f: + result = json.load(f) + self.assertEqual(result, {"a": 1}) + finally: + os.unlink(path) + + def test_update_and_add_to_file_dict(self): + file_dict = {} + self.publisher.gh_publisher.github_automation.local_clone_dir = "/tmp" + update_method = MagicMock(return_value={"key": "value"}) + 
self.publisher._update_and_add_to_file_dict(file_dict, "some/catalog.json", update_method) + update_method.assert_called_once() + assert any("some/catalog.json" in str(k) for k in file_dict) + + def test_update_variable_catalogs_creates_new_when_missing(self): + mock_gen = MagicMock() + mock_gen.variables_metadata = {"var1": {"variable_id": "var1"}} + mock_gen.build_variable_catalog.return_value.to_dict.return_value = {"id": "var1"} + self.publisher.gh_publisher.github_automation.file_exists.return_value = False + + file_dict = {} + self.publisher._update_variable_catalogs(mock_gen, file_dict, ["var1"]) + + mock_gen.build_variable_catalog.assert_called_once() + assert "variables/var1/catalog.json" in file_dict + + def test_update_variable_catalogs_updates_existing(self): + mock_gen = MagicMock() + self.publisher.gh_publisher.github_automation.file_exists.return_value = True + self.publisher.gh_publisher.github_automation.local_clone_dir = "/tmp" + mock_gen.update_existing_variable_catalog.return_value = {"id": "var1"} + + file_dict = {} + self.publisher._update_variable_catalogs(mock_gen, file_dict, ["var1"]) + + mock_gen.update_existing_variable_catalog.assert_called_once() + assert "variables/var1/catalog.json" in file_dict + + # ------------------------------------------------------------------ + # generate_workflow_experiment_records + # ------------------------------------------------------------------ + + def _setup_workflow_mocks(self): + """Patch all internals of generate_workflow_experiment_records.""" + mock_rg = MagicMock() + mock_props = MagicMock() + mock_props.jupyter_kernel_info.to_dict.return_value = {} + mock_rg.build_record_properties.return_value = mock_props + + mock_wf_record = MagicMock() + mock_wf_record.to_dict.return_value = {"id": "wf", "properties": {}} + + mock_exp_record = MagicMock() + mock_exp_record.to_dict.return_value = { + "id": "wf", + "properties": {}, + "jupyter_notebook_url": "url", + "collection_id": "col", + } + return 
mock_rg, mock_props, mock_wf_record, mock_exp_record + + @patch("deep_code.tools.publish.WorkflowAsOgcRecord") + @patch("deep_code.tools.publish.LinksBuilder") + @patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_mode_workflow(self, MockRG, MockLinks, MockWF): + mock_rg, mock_props, mock_wf_record, _ = self._setup_workflow_mocks() + MockRG.return_value = mock_rg + MockWF.return_value = mock_wf_record + + self.publisher.workflow_config = { + "workflow_id": "my-workflow", + "properties": {"title": "My WF", "license": "CC-BY-4.0"}, + } + with patch.object(self.publisher, "_update_base_catalog", return_value={}): + result = self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="workflow" + ) + + self.assertIn("workflows/my-workflow/record.json", result) + self.assertIn("workflows/catalog.json", result) + self.assertNotIn("experiments/catalog.json", result) + + @patch("deep_code.tools.publish.ExperimentAsOgcRecord") + @patch("deep_code.tools.publish.WorkflowAsOgcRecord") + @patch("deep_code.tools.publish.LinksBuilder") + @patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_mode_all(self, MockRG, MockLinks, MockWF, MockExp): + mock_rg, mock_props, mock_wf_record, mock_exp_record = self._setup_workflow_mocks() + MockRG.return_value = mock_rg + MockWF.return_value = mock_wf_record + MockExp.return_value = mock_exp_record + + self.publisher.workflow_config = { + "workflow_id": "my-workflow", + "properties": {"title": "My WF", "license": "CC-BY-4.0"}, + } + self.publisher.collection_id = "my-collection" + with patch.object(self.publisher, "_update_base_catalog", return_value={}): + result = self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="all" + ) + + self.assertIn("workflows/my-workflow/record.json", result) + self.assertIn("workflows/catalog.json", result) + self.assertIn("experiments/catalog.json", result) + + 
@patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_raises_when_workflow_id_missing(self, MockRG): + self.publisher.workflow_config = { + "properties": {"title": "My WF", "license": "CC-BY-4.0"}, + } + with pytest.raises(ValueError, match="workflow_id is missing"): + self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="workflow" + ) + + @patch("deep_code.tools.publish.OSCWorkflowOGCApiRecordGenerator") + def test_generate_workflow_records_raises_when_license_missing(self, MockRG): + self.publisher.workflow_config = { + "workflow_id": "my-wf", + "properties": {"title": "My WF"}, + } + with pytest.raises(ValueError, match="license is required"): + self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="workflow" + ) + + def test_generate_workflow_records_returns_empty_for_dataset_mode(self): + result = self.publisher.generate_workflow_experiment_records( + write_to_file=False, mode="dataset" + ) + self.assertEqual(result, {}) + class TestParseGithubNotebookUrl: @pytest.mark.parametrize( diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 75e5817..0b806cd 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -607,3 +607,159 @@ def test_edge_cases(self): OscDatasetStacGenerator.format_string("too many spaces"), "Too Many Spaces", ) + + +class TestOscDatasetStacGeneratorExtra(unittest.TestCase): + """Additional tests to cover branches not exercised by TestOSCProductSTACGenerator.""" + + def _make_generator(self, mock_ds, collection_id="my-collection", **kwargs): + with patch("deep_code.utils.dataset_stac_generator.open_dataset", return_value=mock_ds): + return OscDatasetStacGenerator( + dataset_id="test.zarr", + collection_id=collection_id, + workflow_id="wf", + workflow_title="WF", + license_type="CC-BY-4.0", + 
**kwargs, + ) + + def _make_dataset(self, coord_type="lon_lat"): + import numpy as np + from datetime import datetime + if coord_type == "lon_lat": + coords = { + "lon": ("lon", np.linspace(-10, 10, 3)), + "lat": ("lat", np.linspace(-5, 5, 2)), + "time": ("time", [np.datetime64(datetime(2020, 1, 1), "ns")]), + } + elif coord_type == "longitude_latitude": + coords = { + "longitude": ("longitude", np.linspace(-10, 10, 3)), + "latitude": ("latitude", np.linspace(-5, 5, 2)), + "time": ("time", [np.datetime64(datetime(2020, 1, 1), "ns")]), + } + elif coord_type == "x_y": + coords = { + "x": ("x", np.linspace(0, 100, 3)), + "y": ("y", np.linspace(0, 50, 2)), + "time": ("time", [np.datetime64(datetime(2020, 1, 1), "ns")]), + } + else: + coords = {} + from xarray import Dataset + return Dataset(coords=coords) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_collection_id_with_space_raises(self, mock_open_ds): + mock_open_ds.return_value = self._make_dataset() + with self.assertRaisesRegex(ValueError, "must not contain spaces"): + OscDatasetStacGenerator( + dataset_id="test.zarr", + collection_id="bad id", + workflow_id="wf", + workflow_title="WF", + license_type="CC-BY-4.0", + ) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_spatial_extent_longitude_latitude(self, mock_open_ds): + ds = self._make_dataset("longitude_latitude") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + extent = gen._get_spatial_extent() + self.assertAlmostEqual(extent.bboxes[0][0], -10.0) + self.assertAlmostEqual(extent.bboxes[0][1], -5.0) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_spatial_extent_x_y(self, mock_open_ds): + ds = self._make_dataset("x_y") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + extent = gen._get_spatial_extent() + self.assertAlmostEqual(extent.bboxes[0][0], 0.0) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def 
test_spatial_extent_unknown_coords_raises(self, mock_open_ds): + ds = self._make_dataset("none") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + with self.assertRaisesRegex(ValueError, "recognized spatial coordinates"): + gen._get_spatial_extent() + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_temporal_extent_no_time_raises(self, mock_open_ds): + ds = self._make_dataset("none") + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + with self.assertRaisesRegex(ValueError, "time"): + gen._get_temporal_extent() + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_normalize_name_none_returns_none(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + self.assertIsNone(OscDatasetStacGenerator._normalize_name(None)) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_build_collection_with_cf_params(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds, cf_params=[{"name": "temperature", "units": "K"}]) + collection = gen.build_dataset_stac_collection(mode="dataset") + self.assertEqual(collection.extra_fields.get("cf:parameter"), [{"name": "temperature", "units": "K"}]) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_build_collection_with_visualisation_link(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds, visualisation_link="https://viewer.example.com/") + collection = gen.build_dataset_stac_collection(mode="dataset") + vis_links = [lnk for lnk in collection.links if lnk.rel == "visualisation"] + self.assertEqual(len(vis_links), 1) + self.assertEqual(vis_links[0].target, "https://viewer.example.com/") + self.assertEqual(vis_links[0].title, "Dataset visualisation") + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def 
test_build_collection_mode_all_adds_experiment_link(self, mock_open_ds): + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds) + collection = gen.build_dataset_stac_collection(mode="all") + exp_links = [lnk for lnk in collection.links if "experiments" in str(lnk.target)] + self.assertEqual(len(exp_links), 1) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_s3_to_https(self, mock_open_ds): + self.assertEqual( + OscDatasetStacGenerator._s3_to_https("s3://my-bucket/path/to/file.json"), + "https://my-bucket.s3.amazonaws.com/path/to/file.json", + ) + + @patch("deep_code.utils.dataset_stac_generator.open_dataset") + def test_update_existing_variable_catalog(self, mock_open_ds): + import json + import os + import tempfile + + ds = self._make_dataset() + mock_open_ds.return_value = ds + gen = self._make_generator(ds, osc_themes=["land"]) + + base = { + "type": "Catalog", + "id": "var1", + "stac_version": "1.0.0", + "description": "Variable catalog", + "links": [], + } + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(base, f) + tmp_path = f.name + try: + result = gen.update_existing_variable_catalog(tmp_path, "var1") + finally: + os.unlink(tmp_path) + + rels = [lnk["rel"] for lnk in result["links"]] + self.assertIn("child", rels) + self.assertIn("related", rels) # theme link diff --git a/deep_code/tests/utils/test_helper.py b/deep_code/tests/utils/test_helper.py index a7c8c3f..5044343 100644 --- a/deep_code/tests/utils/test_helper.py +++ b/deep_code/tests/utils/test_helper.py @@ -10,7 +10,7 @@ import xarray import xarray as xr -from deep_code.utils.helper import open_dataset +from deep_code.utils.helper import open_dataset, serialize def make_dummy_dataset(): @@ -156,3 +156,23 @@ def test_uses_provided_logger(self, mock_new_store, mock_get_logger): custom_logger.info.assert_any_call( "Successfully opened dataset 'test-id' with configuration: Public store" ) + + 
+class TestSerialize(unittest.TestCase): + def test_set_converted_to_list(self): + result = serialize({1, 2, 3}) + self.assertIsInstance(result, list) + self.assertCountEqual(result, [1, 2, 3]) + + def test_object_with_dict_returns_dict(self): + class Obj: + def __init__(self): + self.x = 1 + self.y = 2 + + result = serialize(Obj()) + self.assertEqual(result, {"x": 1, "y": 2}) + + def test_unserializable_raises_type_error(self): + with self.assertRaises(TypeError): + serialize(42) From 875fbd4137893a50563e50f3d30e2b7c17eabb18 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 20 Mar 2026 11:45:05 +0100 Subject: [PATCH 15/16] support non-default osc_project from dataset config --- deep_code/tools/new.py | 1 + deep_code/tools/publish.py | 2 ++ docs/configuration.md | 3 +++ 3 files changed, 6 insertions(+) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index a9f591f..d7fb209 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -76,6 +76,7 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: "description": "[OPTIONAL: human-readable description of the dataset. Overrides the description attribute in the Zarr store if set]", "documentation_link": "[OPTIONAL: link to documentation, publication, or handbook]", "visualisation_link": "[OPTIONAL: URL to a visualisation of the dataset (e.g. xcube Viewer, WMS)]", + "osc_project": "[OPTIONAL: OSC project ID (e.g. deep-earth-system-data-lab). Defaults to deep-earth-system-data-lab]", "osc_project_title": "[OPTIONAL: display title of the OSC project as it appears in the catalog (e.g. DeepESDL). 
Defaults to a formatted version of osc_project if omitted]", "access_link": "[OPTIONAL: public S3 URL of the Zarr store — defaults to s3://deep-esdl-public/{dataset_id}]", "cf_parameter": [{"name": "[OPTIONAL: CF standard name]", "units": "[unit string]"}], diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 2daf1cf..de76d72 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -270,6 +270,7 @@ def publish_dataset( cf_params = self.dataset_config.get("cf_parameter") license_type = self.dataset_config.get("license_type") visualisation_link = self.dataset_config.get("visualisation_link") + osc_project = self.dataset_config.get("osc_project") osc_project_title = self.dataset_config.get("osc_project_title") description = self.dataset_config.get("description") @@ -305,6 +306,7 @@ def publish_dataset( osc_themes=osc_themes, cf_params=cf_params, visualisation_link=visualisation_link, + osc_project=osc_project, osc_project_title=osc_project_title, description=description, ) diff --git a/docs/configuration.md b/docs/configuration.md index 83b8abf..d8d90d6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -26,6 +26,8 @@ osc_themes: [cryosphere] # must match slugs at opensciencedata.esa.int/th osc_region: global dataset_status: completed # ongoing | completed | planned (default: ongoing) documentation_link: https://example.com/docs +visualisation_link: https://example.com/viewer # URL to a visualisation of the dataset +osc_project: deep-earth-system-data-lab # defaults to deep-earth-system-data-lab access_link: s3://bucket/your-dataset.zarr # defaults to s3://deep-esdl-public/{dataset_id} # CF parameter overrides (list of {name, units, ...} dicts) @@ -48,6 +50,7 @@ cf_parameter: | `description` | No | Human-readable description of the dataset. Overrides the `description` attribute in the Zarr store; falls back to `"No description available."` if neither is set. 
| | `documentation_link` | No | URL to dataset documentation. | | `visualisation_link` | No | URL to a visualisation of the dataset (e.g. xcube Viewer, WMS). Added as a `visualisation` link with title `"Dataset visualisation"`. | +| `osc_project` | No | OSC project ID this dataset belongs to (e.g. `deep-earth-system-data-lab`). Defaults to `deep-earth-system-data-lab`. | | `cf_parameter` | No | List of CF metadata dicts to override variable attributes (e.g. `name`, `units`). | | `stac_catalog_s3_root` | Yes | S3 root where the STAC Catalog and Item are published. Publishing fails if this field is absent. See [STAC Catalog on S3](#stac-catalog-on-s3). | From 15ebc76cc94b79278e2070e65de142750c7261f5 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 20 Mar 2026 15:30:21 +0100 Subject: [PATCH 16/16] fixed workflow template --- deep_code/tools/new.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index d7fb209..f2b2654 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -14,16 +14,11 @@ class TemplateGenerator: def generate_workflow_template(output_path: Optional[str] = None) -> str: """Generate a complete template with all possible keys and placeholder values""" - required = { + workflow_template = { "workflow_id": "[REQUIRED: unique identifier for your workflow]", "properties": { "title": "[REQUIRED: human-readable title of the workflow]", "license": "[REQUIRED: SPDX license identifier, e.g. CC-BY-4.0, MIT, proprietary]", - }, - } - - optional = { - "properties": { "description": "[OPTIONAL: concise summary of what the workflow does]", "keywords": ["[OPTIONAL: KEYWORD1]", "[KEYWORD2]"], "themes": ["[OPTIONAL: thematic area, e.g. 
land, ocean, atmosphere]"], @@ -53,10 +48,8 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str: with open(output_path, "w") as f: f.write("# Workflow Configuration Template\n") f.write("# Replace all [PLACEHOLDER] values with your actual data\n\n") - f.write("# --- REQUIRED fields ---\n") - f.write(yaml.dump(required, sort_keys=False, width=1000, default_flow_style=False)) - f.write("\n# --- OPTIONAL fields ---\n") - f.write(yaml.dump(optional, sort_keys=False, width=1000, default_flow_style=False)) + f.write(yaml.dump(workflow_template, sort_keys=False, width=1000, + default_flow_style=False)) @staticmethod def generate_dataset_template(output_path: Optional[str] = None) -> str: