From 89c9b38315c531c74c22e62d44c912f10e922f04 Mon Sep 17 00:00:00 2001 From: Alex Richey Date: Thu, 2 Apr 2026 13:27:52 -0400 Subject: [PATCH 1/3] add destination version --- dcpy/models/product/dataset/metadata.py | 2 + dcpy/models/product/metadata.py | 22 +++++++ dcpy/test/models/product/test_metadata.py | 64 +++++++++++++++++++ .../metadata_repo/metadata.yml | 1 + .../metadata_repo/products/lion/metadata.yml | 1 + .../products/lion/pseudo_lots/metadata.yml | 2 + 6 files changed, 92 insertions(+) diff --git a/dcpy/models/product/dataset/metadata.py b/dcpy/models/product/dataset/metadata.py index 39fb422af8..803c68f3b2 100644 --- a/dcpy/models/product/dataset/metadata.py +++ b/dcpy/models/product/dataset/metadata.py @@ -147,6 +147,7 @@ class DatasetOrgProductAttributesOverride(CustomizableBase): contains_address: bool | None = ( None # `contains_address` refers specifically to addresses containing house, numbers + street names. (ie. not just streets, polys, etc.) ) + current_version: str | None = None data_collection_method: str | None = None data_change_frequency: str | None = None date_made_public: str | None = None @@ -219,6 +220,7 @@ class Destination(CustomizableBase): id: str type: str tags: list[str] = [] + current_version: str = "" class DestinationWithFiles(Destination): diff --git a/dcpy/models/product/metadata.py b/dcpy/models/product/metadata.py index 6a2b9a1921..f0cbf111fc 100644 --- a/dcpy/models/product/metadata.py +++ b/dcpy/models/product/metadata.py @@ -288,3 +288,25 @@ def query_product_dataset_destinations( def get_product_dataset_destinations(self, destination_path: str): prod, ds, dest_id = destination_path.split(".") return self.product(prod).dataset(ds).get_destination(dest_id) + + def get_all_destination_current_versions(self) -> list[str]: + """Get all destination paths with their current_version. 
+ Returns a sorted list of strings in format: product.dataset.destination|current_version + """ + result = [] + for p_name in self.metadata.products: + product = self.product(p_name) + for ds in product.get_datasets_by_id().values(): + # Get the resolved current_version from dataset attributes (has org/product defaults applied) + dataset_version = ds.attributes.current_version or "" + + for dest in ds.destinations: + # Destination can override the dataset version + version = ( + dest.current_version + if dest.current_version + else dataset_version + ) + result.append(f"{p_name}.{ds.id}.{dest.id}|{version}") + + return sorted(result) diff --git a/dcpy/test/models/product/test_metadata.py b/dcpy/test/models/product/test_metadata.py index 7c3c7e054b..c0b22ec263 100644 --- a/dcpy/test/models/product/test_metadata.py +++ b/dcpy/test/models/product/test_metadata.py @@ -219,3 +219,67 @@ def test_column_defaults_applied(org_md: md.OrgMetadata): description="sample bbl description", example="1016370141", ) == colp_ds.get_column("bbl"), "The bbl column should have had defaults applied" + + +def test_current_version_overrides(org_md: md.OrgMetadata): + """Test that current_version can be overridden at org, product, dataset, and destination levels.""" + lion_md = org_md.product("lion") + + # Test dataset with dataset-level override + pseudo_lots = lion_md.dataset("pseudo_lots") + assert pseudo_lots.attributes.current_version == "dataset-override-3.0", ( + "Dataset-level current_version should override product default" + ) + + # Test destination with destination-level override + socrata_dest = pseudo_lots.get_destination("socrata") + assert socrata_dest.current_version == "destination-override-4.0", ( + "Destination should have its own current_version override" + ) + + # Test destination without override (should use dataset version via method) + garlic_dest = pseudo_lots.get_destination("garlic_sftp") + assert garlic_dest.current_version == "", ( + "Destination without override should have empty string" + ) + + # Test dataset without dataset-level override (should use product default) + school_districts = lion_md.dataset("school_districts") + assert school_districts.attributes.current_version == "product-default-2.0", ( + "Dataset without override should use product-level default" + ) + + +def test_get_all_destination_current_versions(org_md: md.OrgMetadata): + """Test that get_all_destination_current_versions returns correct sorted list.""" + # Temporarily filter out the error product since it's designed to fail + original_products = org_md.metadata.products + org_md.metadata.products = [ + p for p in original_products if p != "mock_product_with_errors" + ] + + try: + versions = org_md.get_all_destination_current_versions() + + # Should be sorted + assert versions == sorted(versions), "Result should be sorted" + + # Check that specific entries exist with correct version resolution + assert "lion.pseudo_lots.socrata|destination-override-4.0" in versions, ( + "Destination-level override should be in output" + ) + assert "lion.pseudo_lots.garlic_sftp|dataset-override-3.0" in versions, ( + "Destination without override should use dataset version" + ) + assert "lion.school_districts.socrata|product-default-2.0" in versions, ( + "Destination on dataset without override should use product default" + ) + assert "lion.school_districts.socrata_2|product-default-2.0" in versions, ( + "Another destination should also use product default" + ) + assert "lion.school_districts.other|product-default-2.0" in versions, 
( + "Third destination should also use product default" + ) + finally: + # Restore original products list + org_md.metadata.products = original_products diff --git a/dcpy/test/resources/package_and_distribute/metadata_repo/metadata.yml b/dcpy/test/resources/package_and_distribute/metadata_repo/metadata.yml index 672bd11afc..9d809aa8e9 100644 --- a/dcpy/test/resources/package_and_distribute/metadata_repo/metadata.yml +++ b/dcpy/test/resources/package_and_distribute/metadata_repo/metadata.yml @@ -4,6 +4,7 @@ attributes: attribution: DCP attribution_link: https://www.nyc.gov/site/planning/data-maps/open-data.page contact_email: opendata@planning.nyc.gov + current_version: "org-default-1.0" products: - colp diff --git a/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/metadata.yml b/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/metadata.yml index 24bf14229b..62ad8b8fd1 100644 --- a/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/metadata.yml +++ b/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/metadata.yml @@ -7,6 +7,7 @@ attributes: dataset_defaults: publishing_frequency: "{{ lion_prod_level_pub_freq }}" publishing_purpose: "legal compliance." + current_version: "product-default-2.0" datasets: - pseudo_lots diff --git a/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/pseudo_lots/metadata.yml b/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/pseudo_lots/metadata.yml index 2127cde3de..6ed43755df 100644 --- a/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/pseudo_lots/metadata.yml +++ b/dcpy/test/resources/package_and_distribute/metadata_repo/products/lion/pseudo_lots/metadata.yml @@ -10,6 +10,7 @@ attributes: display_name: Pseudo Lots each_row_is_a: Pseudo Lot publishing_frequency: "{{ pseudo_lots_pub_freq }}" + current_version: "dataset-override-3.0" tags: - pseudo lots - bin @@ -21,5 +22,6 @@ destinations: - id: socrata type: socrata tags: [prod_tag, pseudo_lots_tag] + current_version: "destination-override-4.0" - id: garlic_sftp type: sftp From d238649fc88c62d4b38225f54fff600f168fc95a Mon Sep 17 00:00:00 2001 From: Alex Richey Date: Tue, 7 Apr 2026 18:53:08 -0400 Subject: [PATCH 2/3] First pass at migrating pmd to its own module --- dcpy/connectors/edm/open_data_nyc.py | 2 +- dcpy/connectors/edm/product_metadata.py | 2 +- .../connectors/esri/arcgis_feature_service.py | 2 +- dcpy/connectors/socrata/metadata.py | 2 +- dcpy/connectors/socrata/publish.py | 2 +- dcpy/lifecycle/package/abstract_doc.py | 11 +- dcpy/lifecycle/package/assemble.py | 2 +- dcpy/lifecycle/package/esri.py | 2 +- dcpy/lifecycle/package/pdf_writer.py | 4 +- dcpy/lifecycle/package/shapefiles.py | 4 +- dcpy/lifecycle/package/validate.py | 2 +- dcpy/lifecycle/package/xlsx_writer.py | 4 +- dcpy/lifecycle/package/yaml_writer.py | 2 +- dcpy/lifecycle/product_metadata.py | 2 +- dcpy/models/lifecycle/distribute.py | 2 +- dcpy/product_metadata/__init__.py | 0 dcpy/product_metadata/models/__init__.py | 0 .../models/design/__init__.py | 0 .../models/design/elements.py | 48 ++ .../models/metadata/__init__.py | 0 .../models/metadata/artifacts.py | 46 ++ .../models/metadata/data_dictionary.py | 18 + dcpy/product_metadata/models/metadata/org.py | 312 +++++++++++++ .../models/metadata/product.py | 420 ++++++++++++++++++ dcpy/product_metadata/package/__init__.py | 0 dcpy/test/connectors/socrata/test_publish.py | 2 +- .../package/assemble/test_abstract_docs.py | 4 +- 
.../package/assemble/test_unpacking.py | 2 +- .../package/test_parse_shapefile_to_md.py | 2 +- .../test/lifecycle/package/test_shapefiles.py | 2 +- .../validate/test_column_validation.py | 2 +- .../product/dataset/test_product_metadata.py | 2 +- .../product/dataset/test_text_cleaning.py | 2 +- dcpy/test/models/product/test_metadata.py | 4 +- 34 files changed, 879 insertions(+), 32 deletions(-) create mode 100644 dcpy/product_metadata/__init__.py create mode 100644 dcpy/product_metadata/models/__init__.py create mode 100644 dcpy/product_metadata/models/design/__init__.py create mode 100644 dcpy/product_metadata/models/design/elements.py create mode 100644 dcpy/product_metadata/models/metadata/__init__.py create mode 100644 dcpy/product_metadata/models/metadata/artifacts.py create mode 100644 dcpy/product_metadata/models/metadata/data_dictionary.py create mode 100644 dcpy/product_metadata/models/metadata/org.py create mode 100644 dcpy/product_metadata/models/metadata/product.py create mode 100644 dcpy/product_metadata/package/__init__.py diff --git a/dcpy/connectors/edm/open_data_nyc.py b/dcpy/connectors/edm/open_data_nyc.py index 57043e82a7..a8abd50a49 100644 --- a/dcpy/connectors/edm/open_data_nyc.py +++ b/dcpy/connectors/edm/open_data_nyc.py @@ -3,7 +3,7 @@ import time from pathlib import Path -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md from dcpy.connectors.registry import VersionedConnector from dcpy.connectors.socrata import publish as soc_pub from dcpy.lifecycle import product_metadata diff --git a/dcpy/connectors/edm/product_metadata.py b/dcpy/connectors/edm/product_metadata.py index b34d86b49e..50130378b9 100644 --- a/dcpy/connectors/edm/product_metadata.py +++ b/dcpy/connectors/edm/product_metadata.py @@ -3,7 +3,7 @@ import requests import yaml -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md from dcpy.utils.logging import logger METADATA_REPO_RAW_URL = ( diff --git a/dcpy/connectors/esri/arcgis_feature_service.py b/dcpy/connectors/esri/arcgis_feature_service.py index d666c90796..ab3e39a344 100644 --- a/dcpy/connectors/esri/arcgis_feature_service.py +++ b/dcpy/connectors/esri/arcgis_feature_service.py @@ -16,7 +16,7 @@ from dcpy.connectors.registry import Connector from dcpy.models.connectors.esri import FeatureServer, FeatureServerLayer, Server -from dcpy.models.product.dataset import metadata +from dcpy.product_metadata.models.metadata import product as metadata from dcpy.utils.logging import logger diff --git a/dcpy/connectors/socrata/metadata.py b/dcpy/connectors/socrata/metadata.py index dbdbf9617a..acdfc80305 100644 --- a/dcpy/connectors/socrata/metadata.py +++ b/dcpy/connectors/socrata/metadata.py @@ -3,7 +3,7 @@ import typer -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md from dcpy.connectors.socrata import metadata from dcpy.connectors.socrata import publish as pub from dcpy.utils.logging import logger diff --git a/dcpy/connectors/socrata/publish.py b/dcpy/connectors/socrata/publish.py index b344691caf..51bd83a125 100644 --- a/dcpy/connectors/socrata/publish.py +++ b/dcpy/connectors/socrata/publish.py @@ -29,7 +29,7 @@ from socrata.sources import Source import dcpy.models.dataset as dataset -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md from dcpy.utils.logging import logger # There are required publishing frequency fields 
in two different sections of diff --git a/dcpy/lifecycle/package/abstract_doc.py b/dcpy/lifecycle/package/abstract_doc.py index 1e9311f12f..777e96ba61 100644 --- a/dcpy/lifecycle/package/abstract_doc.py +++ b/dcpy/lifecycle/package/abstract_doc.py @@ -1,10 +1,13 @@ from pathlib import Path from typing import Any -from dcpy.models.design import elements as de -from dcpy.models.product.artifacts import Artifact, ExcelTableComponentDefinition -from dcpy.models.product.dataset.metadata import Dataset -from dcpy.models.product.metadata import OrgMetadata +from dcpy.product_metadata.models.design import elements as de +from dcpy.product_metadata.models.metadata.artifacts import ( + Artifact, + ExcelTableComponentDefinition, +) +from dcpy.product_metadata.models.metadata.org import OrgMetadata +from dcpy.product_metadata.models.metadata.product import Dataset from dcpy.utils.logging import logger # TODO; Extract these into a generic style that we can pass to the XLSX renderer diff --git a/dcpy/lifecycle/package/assemble.py b/dcpy/lifecycle/package/assemble.py index b5d412ce2f..9a906ebf79 100644 --- a/dcpy/lifecycle/package/assemble.py +++ b/dcpy/lifecycle/package/assemble.py @@ -3,7 +3,7 @@ import tempfile from pathlib import Path -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md from dcpy.lifecycle import config, data_loader from dcpy.lifecycle import product_metadata as org_metadata_loader from dcpy.lifecycle.package import validate, xlsx_writer diff --git a/dcpy/lifecycle/package/esri.py b/dcpy/lifecycle/package/esri.py index f096c664bc..78b45e5e2a 100644 --- a/dcpy/lifecycle/package/esri.py +++ b/dcpy/lifecycle/package/esri.py @@ -4,7 +4,7 @@ import typer import yaml -import dcpy.models.product.dataset.metadata as models +import dcpy.product_metadata.models.metadata.product as models from dcpy.utils.logging import logger diff --git a/dcpy/lifecycle/package/pdf_writer.py b/dcpy/lifecycle/package/pdf_writer.py index ae96ca2590..fa935aa220 100644 --- a/dcpy/lifecycle/package/pdf_writer.py +++ b/dcpy/lifecycle/package/pdf_writer.py @@ -5,8 +5,8 @@ from bs4 import BeautifulSoup from jinja2 import Environment, FileSystemLoader -from dcpy.models.product.dataset.metadata import Metadata -from dcpy.models.product.metadata import OrgMetadata +from dcpy.product_metadata.models.metadata.org import OrgMetadata +from dcpy.product_metadata.models.metadata.product import Metadata from dcpy.utils.logging import logger from . 
import RESOURCES_PATH diff --git a/dcpy/lifecycle/package/shapefiles.py b/dcpy/lifecycle/package/shapefiles.py index 44555833cb..36a0e0e795 100644 --- a/dcpy/lifecycle/package/shapefiles.py +++ b/dcpy/lifecycle/package/shapefiles.py @@ -8,14 +8,14 @@ from dcpy.lifecycle import product_metadata from dcpy.models.data.shapefile_metadata import Attr, Edom -from dcpy.models.product.dataset.metadata import ( +from dcpy.product_metadata.models.metadata.org import OrgMetadata +from dcpy.product_metadata.models.metadata.product import ( COLUMN_TYPES, ColumnValue, DatasetAttributes, DatasetColumn, Metadata, ) -from dcpy.models.product.metadata import OrgMetadata from dcpy.utils.geospatial import shapefile as shp_utils from dcpy.utils.geospatial.shapefile import Shapefile from dcpy.utils.logging import logger diff --git a/dcpy/lifecycle/package/validate.py b/dcpy/lifecycle/package/validate.py index a6dbe11b67..75657c9cb5 100644 --- a/dcpy/lifecycle/package/validate.py +++ b/dcpy/lifecycle/package/validate.py @@ -9,7 +9,7 @@ from shapely import wkb, wkt from tabulate import tabulate # type: ignore -import dcpy.models.product.dataset.metadata as dataset_md +import dcpy.product_metadata.models.metadata.product as dataset_md from dcpy.utils.logging import logger diff --git a/dcpy/lifecycle/package/xlsx_writer.py b/dcpy/lifecycle/package/xlsx_writer.py index cdb15ebcbc..95aacf77ab 100644 --- a/dcpy/lifecycle/package/xlsx_writer.py +++ b/dcpy/lifecycle/package/xlsx_writer.py @@ -14,8 +14,8 @@ ) from dcpy.lifecycle import product_metadata -from dcpy.models.design import elements as de -from dcpy.models.product.metadata import OrgMetadata +from dcpy.product_metadata.models.design import elements as de +from dcpy.product_metadata.models.metadata.org import OrgMetadata from dcpy.utils.logging import logger from . 
import RESOURCES_PATH, abstract_doc diff --git a/dcpy/lifecycle/package/yaml_writer.py b/dcpy/lifecycle/package/yaml_writer.py index 5b6849a031..3aa7ea1939 100644 --- a/dcpy/lifecycle/package/yaml_writer.py +++ b/dcpy/lifecycle/package/yaml_writer.py @@ -1,6 +1,6 @@ from pathlib import Path -from dcpy.models.product.metadata import OrgMetadata +from dcpy.product_metadata.models.metadata.org import OrgMetadata def write_yaml( diff --git a/dcpy/lifecycle/product_metadata.py b/dcpy/lifecycle/product_metadata.py index b5334b3d5c..0f1af2b91a 100644 --- a/dcpy/lifecycle/product_metadata.py +++ b/dcpy/lifecycle/product_metadata.py @@ -1,5 +1,5 @@ from dcpy.lifecycle import config -from dcpy.models.product.metadata import OrgMetadata +from dcpy.product_metadata.models.metadata.org import OrgMetadata def load(**kwargs) -> OrgMetadata: diff --git a/dcpy/models/lifecycle/distribute.py b/dcpy/models/lifecycle/distribute.py index 65a6d3c032..70c1962b33 100644 --- a/dcpy/models/lifecycle/distribute.py +++ b/dcpy/models/lifecycle/distribute.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import NotRequired, Required, TypedDict -import dcpy.models.product.dataset.metadata as ds_md +import dcpy.product_metadata.models.metadata.product as ds_md class DatasetDestinationPushArgs(TypedDict): diff --git a/dcpy/product_metadata/__init__.py b/dcpy/product_metadata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dcpy/product_metadata/models/__init__.py b/dcpy/product_metadata/models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dcpy/product_metadata/models/design/__init__.py b/dcpy/product_metadata/models/design/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dcpy/product_metadata/models/design/elements.py b/dcpy/product_metadata/models/design/elements.py new file mode 100644 index 0000000000..361a9984a4 --- /dev/null +++ b/dcpy/product_metadata/models/design/elements.py @@ -0,0 +1,48 @@ +import typing +from pathlib import Path + +from pydantic import BaseModel + + +class Font(BaseModel): + name: str | None = None + size: float | None = None + rgb: str | None = None + italic: bool = False + bold: bool = False + monospaced: bool = False + + +class CellStyle(BaseModel): + font: Font = Font() + borders: list[str] | None = None + text_alignment_vertical: str | None = None + text_alignment_horizontal: str | None = None + + +class Image(BaseModel): + path: Path + + +class Cell(BaseModel): + value: typing.Any | Image | list["Cell"] # can be a value or inline cells + style: CellStyle = CellStyle() + + +class Row(BaseModel): + cells: list[Cell] + merge_cells: bool = False + is_top_row: bool = False + height: float | None = None + skip_default_styling: bool = False + + +class Table(BaseModel): + title: str + subtitle: str | None = None + description: str | None = None + rows: list[Row] + column_widths: list[float] | None = [] + + def total_cols(self): + return max(len(r.cells) for r in self.rows) diff --git a/dcpy/product_metadata/models/metadata/__init__.py b/dcpy/product_metadata/models/metadata/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dcpy/product_metadata/models/metadata/artifacts.py b/dcpy/product_metadata/models/metadata/artifacts.py new file mode 100644 index 0000000000..a5e94a1aea --- /dev/null +++ b/dcpy/product_metadata/models/metadata/artifacts.py @@ -0,0 +1,46 @@ +from pathlib import Path + +from dcpy.models.base import TemplatedYamlReader + +from .product import CustomizableBase + + 
+class ExcelTableComponentDefinition(CustomizableBase): + """Declaration for a table in an XLSX. + + A table should declare a data_source from which to pull data (and field metadata), + and specify rows OR columns. (rows AND columns makes sense in theory, but isn't implemented) + """ + + id: str + name: str + type: str # atm, either a `list_table` or `object_table`... but we should determine this from the object itself, so this attribute should go away. + index: int + data_source: str | None = None + + title: str + subtitle: str + description: str | None = None # table description + include_column_description_row: bool = ( + True # header row underneath columns, with a description of the columns + ) + + extra_field_description_field: str | None = ( + None # field from which to pull extra description paragraphs + ) + image_path: Path | None = None + rows: list[str] | None = None + columns: list[str] | None = None + column_widths: list[float] | None = ( + None # TODO: generalize away from concrete numbers. + ) + + +class Artifact(CustomizableBase, TemplatedYamlReader): + name: str + type: str + components: list[ExcelTableComponentDefinition] + + +class Artifacts(CustomizableBase, TemplatedYamlReader): + artifacts: list[Artifact] diff --git a/dcpy/product_metadata/models/metadata/data_dictionary.py b/dcpy/product_metadata/models/metadata/data_dictionary.py new file mode 100644 index 0000000000..cee67b8c5e --- /dev/null +++ b/dcpy/product_metadata/models/metadata/data_dictionary.py @@ -0,0 +1,18 @@ +from dcpy.models.base import TemplatedYamlReader + +from .product import CustomizableBase + + +class FieldDefinition(CustomizableBase): + summary: str + extra_description: str | None = None + + +class FieldSet(CustomizableBase): + fields: dict[str, FieldDefinition] = {} + + +class DataDictionary(CustomizableBase, TemplatedYamlReader): + org: dict[str, dict[str, FieldDefinition]] = {} + product: dict[str, dict[str, FieldDefinition]] = {} + dataset: dict[str, dict[str, FieldDefinition]] = {} diff --git a/dcpy/product_metadata/models/metadata/org.py b/dcpy/product_metadata/models/metadata/org.py new file mode 100644 index 0000000000..4603135baa --- /dev/null +++ b/dcpy/product_metadata/models/metadata/org.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +from pathlib import Path +from typing import ClassVar + +import pandas as pd +import yaml +from pydantic import BaseModel, Field, TypeAdapter + +from dcpy.models.base import SortedSerializedBase, TemplatedYamlReader, YamlWriter +from dcpy.product_metadata.models.metadata.artifacts import Artifact, Artifacts +from dcpy.product_metadata.models.metadata.data_dictionary import DataDictionary +from dcpy.product_metadata.models.metadata.product import ( + COLUMN_TYPES, + DatasetColumn, + DatasetOrgProductAttributesOverride, +) +from dcpy.product_metadata.models.metadata.product import ( + Metadata as DatasetMetadata, +) +from dcpy.utils.collections import deep_merge_dict as merge + +ERROR_PRODUCT_DATASET_METADATA_INSTANTIATION = ( + "Error instantiating dataset metadata for" +) +ERROR_PRODUCT_METADATA_INSTANTIATION = "Error instantiating product metadata" + + +class ProductAttributes(SortedSerializedBase, extra="forbid"): + display_name: str | None = None + description: str | None = None + + +class ProductMetadataFile( + SortedSerializedBase, YamlWriter, TemplatedYamlReader, extra="forbid" +): + id: str + attributes: ProductAttributes = Field(default_factory=ProductAttributes) + dataset_defaults: DatasetOrgProductAttributesOverride = Field( + 
default_factory=DatasetOrgProductAttributesOverride
+    )
+    datasets: list[str] = []
+
+
+class ProductMetadata(SortedSerializedBase, extra="forbid"):
+    DATASET_NOT_LISTED_ERROR: ClassVar[str] = "Dataset not listed in metadata"
+
+    root_path: Path
+    metadata: ProductMetadataFile
+    template_vars: dict = {}
+    column_defaults: dict[tuple[str, COLUMN_TYPES], DatasetColumn] = {}
+    org_attributes: DatasetOrgProductAttributesOverride
+
+    @classmethod
+    def from_path(
+        cls,
+        root_path: Path,
+        template_vars: dict = {},
+        column_defaults: dict[tuple[str, COLUMN_TYPES], DatasetColumn] = {},
+        org_attributes: DatasetOrgProductAttributesOverride = DatasetOrgProductAttributesOverride(),
+    ) -> ProductMetadata:
+        return ProductMetadata(
+            root_path=root_path,
+            metadata=ProductMetadataFile.from_path(
+                root_path / "metadata.yml", template_vars=template_vars
+            ),
+            template_vars=template_vars,
+            column_defaults=column_defaults,
+            org_attributes=org_attributes,
+        )
+
+    def _dataset_folders(self):
+        return [p.parent.name for p in self.root_path.glob("*/*.yml")]
+
+    def dataset(self, dataset_id: str) -> DatasetMetadata:
+        if dataset_id not in self.metadata.datasets:
+            raise Exception(f"{self.DATASET_NOT_LISTED_ERROR}: {dataset_id}")
+
+        ds_md = DatasetMetadata.from_path(
+            self.root_path / dataset_id / "metadata.yml",
+            template_vars=self.template_vars,
+        )
+        if ds_md.id != dataset_id:
+            raise Exception(
+                (
+                    "There is a mismatch between the dataset id listed at the"
+                    f" dataset level ({ds_md.id}) vs the product-level ({dataset_id})"
+                )
+            )
+
+        ds_md.attributes = ds_md.attributes.apply_defaults(
+            self.metadata.dataset_defaults
+        ).apply_defaults(self.org_attributes)
+
+        ds_md.columns = ds_md.apply_column_defaults(self.column_defaults)
+
+        return ds_md
+
+    def get_datasets_by_id(self) -> dict[str, DatasetMetadata]:
+        dataset_mds = [self.dataset(ds_id) for ds_id in self.metadata.datasets]
+        return {m.id: m for m in dataset_mds}
+
+    def all_destinations(self) -> list[dict]:
+        """Get all destinations for a product"""
+        found_dests = []
+        for ds in self.get_datasets_by_id().values():
+            for dest in ds.destinations:
+                found_dests.append(
+                    {
+                        "product": self.metadata.id,
+                        "dataset_id": ds.id,
+                        "destination_id": dest.id,
+                        "destination_type": dest.type,
+                        # "remote_id": (dest.custom or {}).get("four_four"),
+                        "tags": set(dest.tags or []),
+                        "custom": dest.custom,
+                        "destination_path": f"{self.metadata.id}.{ds.id}.{dest.id}",
+                    }
+                )
+        return found_dests
+
+    def all_destinations_df(self, grouped: bool = False) -> pd.DataFrame:
+        """Helper to display all destinations for a product.
+        Using the `grouped` flag will group the output to make things visually a little easier.
+        """
+        df = pd.DataFrame(self.all_destinations())
+        return (
+            df.set_index(["product", "dataset_id", "destination_id"]).sort_index()
+            if grouped
+            else df
+        )
+
+    def query_dataset_destinations(
+        self,
+        *,
+        dataset_ids: set[str] | None = None,
+        destination_filter: dict | None = None,
+    ) -> list[str]:
+        """Retrieve a list of destination paths for given filters.
+        e.g. [lion.atomic_polygons.socrata, ...]
+ """ + dataset_ids = ( + dataset_ids & set(self.metadata.datasets) + if dataset_ids + else set(self.metadata.datasets) + ) + dest_paths = [] + for d in dataset_ids: + dest_paths += [ + f"{self.metadata.id}.{d}.{dest_id}" + for dest_id in self.dataset(d).query_destinations( + **(destination_filter or {}) + ) + ] + return sorted(dest_paths) + + def validate_dataset_metadata(self) -> dict[str, list[str]]: + product_errors = {} + + for ds_id in self.metadata.datasets: + errors = [] + try: + errors = self.dataset(ds_id).validate_consistency() + except Exception as e: + errors = [ + f"Error instantiating dataset metadata for {self.metadata.id}: {ds_id}: {e}" + ] + if errors: + product_errors[ds_id] = errors + return product_errors + + +class ProductDatasetDestinationKey(BaseModel): + product: str + dataset: str + destination: str + + +class OrgMetadataFile(TemplatedYamlReader, SortedSerializedBase, extra="forbid"): + products: list[str] + attributes: DatasetOrgProductAttributesOverride + + +class OrgMetadata(SortedSerializedBase, extra="forbid"): + PRODUCT_NOT_LISTED_ERROR: ClassVar[str] = "Product not listed in metadata" + + root_path: Path + template_vars: dict = Field(default_factory=dict) + metadata: OrgMetadataFile + column_defaults: dict[tuple[str, COLUMN_TYPES], DatasetColumn] + data_dictionary: DataDictionary = DataDictionary() + + @classmethod + def get_string_snippets(cls, path: Path) -> dict: + s_path = path / "snippets" / "strings.yml" + if not s_path.exists(): + return {} + + with open(s_path, "r", encoding="utf-8") as raw: + yml = yaml.safe_load(raw) or {} + if not isinstance(yml, dict): + raise ValueError("snippets must be valid yml dict, not array") + return yml + + @classmethod + def get_column_defaults( + cls, path: Path + ) -> dict[tuple[str, COLUMN_TYPES], DatasetColumn]: + c_path = path / "snippets" / "column_defaults.yml" + if not c_path.exists(): + return {} + with open(c_path, "r", encoding="utf-8") as raw: + yml = yaml.safe_load(raw) or [] + columns = TypeAdapter(list[DatasetColumn]).validate_python(yml) + return {(c.id, c.data_type): c for c in columns if c.data_type} + + @classmethod + def from_path(cls, path: Path, template_vars: dict | None = None): + template_vars = merge(cls.get_string_snippets(path), template_vars or {}) or {} + dd_default_path = path / "data_dictionary.yml" + return OrgMetadata( + root_path=path, + metadata=OrgMetadataFile.from_path( + path / "metadata.yml", template_vars=template_vars + ), + template_vars=template_vars, + column_defaults=cls.get_column_defaults(path), + data_dictionary=DataDictionary.from_path(dd_default_path) + if dd_default_path.exists() + else DataDictionary(), + ) + + def product(self, name: str) -> ProductMetadata: + if name not in self.metadata.products: + raise Exception(f"{self.PRODUCT_NOT_LISTED_ERROR}: {name}") + + return ProductMetadata.from_path( + root_path=self.root_path / "products" / name, + template_vars=self.template_vars, + column_defaults=self.column_defaults, + org_attributes=self.metadata.attributes, + ) + + def validate_metadata(self) -> dict[str, dict[str, list[str]]]: + product_errors = {} + for p in self.metadata.products: + try: + errors = self.product(p).validate_dataset_metadata() + if errors: + product_errors[p] = errors + except Exception as e: + product_errors[p] = { + "product-level-metadata": [ + f"{ERROR_PRODUCT_METADATA_INSTANTIATION}: {e}" + ] + } + return product_errors + + def get_packaging_artifacts(self) -> list[Artifact]: + return Artifacts.from_path( + self.root_path / "packaging" / 
"artifacts.yml" + ).artifacts + + def get_full_resource_path(self, file: str | Path): + return self.root_path / "packaging" / "resources" / file + + def query_product_dataset_destinations( + self, + *, + product_ids: set[str] | None = None, + dataset_ids: set[str] | None = None, + destination_filter: dict | None = None, + ) -> list[str]: + """Query for all destinations matching filters. + Returns a list of destination paths in the format product.dataset.destination_id + """ + all_dests = [] + for p_name in ( + (product_ids & set(self.metadata.products)) + if product_ids + else set(self.metadata.products) + ): + all_dests += self.product(p_name).query_dataset_destinations( + dataset_ids=dataset_ids, destination_filter=destination_filter or {} + ) + return sorted(all_dests) + + def get_product_dataset_destinations(self, destination_path: str): + prod, ds, dest_id = destination_path.split(".") + return self.product(prod).dataset(ds).get_destination(dest_id) + + def get_all_destination_current_versions(self) -> list[str]: + """Get all destination paths with their current_version. + Returns a sorted list of strings in format: product.dataset.destination|current_version + """ + result = [] + for p_name in self.metadata.products: + product = self.product(p_name) + for ds in product.get_datasets_by_id().values(): + # Get the resolved current_version from dataset attributes (has org/product defaults applied) + dataset_version = ds.attributes.current_version or "" + + for dest in ds.destinations: + # Destination can override the dataset version + version = ( + dest.current_version + if dest.current_version + else dataset_version + ) + result.append(f"{p_name}.{ds.id}.{dest.id}|{version}") + + return sorted(result) diff --git a/dcpy/product_metadata/models/metadata/product.py b/dcpy/product_metadata/models/metadata/product.py new file mode 100644 index 0000000000..803c68f3b2 --- /dev/null +++ b/dcpy/product_metadata/models/metadata/product.py @@ -0,0 +1,420 @@ +from __future__ import annotations + +import unicodedata +from typing import Any, List + +from pydantic import BaseModel +from tabulate import tabulate # type: ignore + +from dcpy.models.base import SortedSerializedBase, TemplatedYamlReader, YamlWriter +from dcpy.models.dataset import COLUMN_TYPES, Column +from dcpy.utils.collections import deep_merge_dict as merge + +ERROR_MISSING_COLUMN = "MISSING COLUMN" + + +# MISC UTILS +def normalize_text(s): + """ + Normalize the text we may receive from the various metadata sources. + Primarily useful for cleaning long-text like descriptions. + """ + char_map = { + "–": "-", # en dash to hyphen + "—": "-", # em dash to hyphen + "’": "'", # curly apostrophe + "“": '"', # lcurly quote + "”": '"', # rcurly quote + } + translator = str.maketrans(char_map) + # Normalize Unicode characters + normalized = unicodedata.normalize("NFKD", s) + # Apply the translation + cleaned = normalized.translate(translator) + return cleaned.strip() + + +class CustomizableBase(SortedSerializedBase, extra="forbid"): + """A Base Pydantic class to allow extensibility of our models via a `custom` + dictionary. + + Any additional attributes that aren't defined on our models should + be added to `custom`. This is intended for domain-specific, non-generalized uses + like data-dictionary generation (e.g. the `readme_data_type` field on the columns) + + It's also important that custom be preserved in the + course of overriding, specifically with a deep merge of the dictionary elements. 
+    """
+
+    custom: dict[str, Any] = {}
+
+
+# COLUMNS
+class ColumnValue(CustomizableBase):
+    _head_sort_order = ["value", "description"]
+
+    value: str
+    description: str | None = None
+
+
+def make_value_table(values: list[ColumnValue]) -> str:
+    return (
+        tabulate(
+            [
+                [str(v.value) + " ", str(v.description or " ") + " "]  # bool issue
+                for v in values
+            ],
+            headers=["Value", "Description"],
+            tablefmt="presto",
+            maxcolwidths=[10, 40],
+        )
+        if values
+        else ""
+    )
+
+
+class DatasetColumn(CustomizableBase, Column):
+    _head_sort_order = ["id", "name", "data_type", "description"]
+    _tail_sort_order = ["example", "values", "custom"]
+    _repr_functions = {"values": make_value_table}
+
+    # Note: id isn't intended to be overrideable, but is always required as a
+    # pointer back to the original column.
+    name: str | None = None
+    data_source: str | None = None
+    description: str | None = None
+    limitations: str | None = None
+    notes: str | None = None
+    example: str | None = None
+    deprecated: bool | None = None
+    values: list[ColumnValue] | None = None
+
+    def override(self, overrides: DatasetColumn) -> DatasetColumn:
+        return DatasetColumn(**merge(self.model_dump(), overrides.model_dump()))
+
+
+# FILE
+class FileOverrides(CustomizableBase):
+    filename: str | None = None
+    type: str | None = None
+
+
+class File(CustomizableBase):
+    """Describes an actual dataset file, e.g. dataset files or attachments."""
+
+    id: str
+    filename: str
+    type: str | None = None
+    is_metadata: bool | None = (
+        None  # e.g. readmes, data_dictionaries, version_files, etc.
+    )
+
+    def override(self, overrides: FileOverrides) -> File:
+        return File(
+            **(self.model_dump() | overrides.model_dump()),
+        )
+
+
+# PACKAGE / ASSEMBLY
+class PackageFile(CustomizableBase):
+    """File found in a Package, e.g. a Zip. `filename` here refers to its name
+    in the package.
+    """
+
+    id: str
+    filename: str | None = None
+
+
+class Package(CustomizableBase):
+    """Container for lists of files. Used as assembly instructions."""
+
+    id: str
+    type: str = "zip"
+    filename: str
+    contents: List[PackageFile]
+
+
+class DatasetOrgProductAttributesOverride(CustomizableBase):
+    """Fields that might be set as a default at the Product/Org level."""
+
+    agency: str | None = None
+    agency_website_data_updated_automatically: bool | None = None
+    attribution: str | None = None
+    attribution_link: str | None = None
+    can_be_automated: bool | None = None
+    category: str | None = None
+    contact_email: str | None = None
+    contains_address: bool | None = (
+        None  # `contains_address` refers specifically to addresses containing house numbers + street names (i.e. not just streets, polys, etc.)
+ ) + current_version: str | None = None + data_collection_method: str | None = None + data_change_frequency: str | None = None + date_made_public: str | None = None + disclaimer: str | None = None + geocoded: bool | None = None + on_agency_website: bool | None = None + potential_uses: str | None = None + projection: str | None = None + publishing_frequency: str | None = None # TODO: picklist values + publishing_frequency_details: str | None = None + publishing_purpose: str | None = None + rows_removed: bool | None = None + tags: List[str] | None = [] + + +class DatasetAttributesOverride(DatasetOrgProductAttributesOverride): + description: str | None = None + display_name: str | None = None + each_row_is_a: str | None = None + + +class DatasetAttributes(DatasetOrgProductAttributesOverride): + display_name: str + description: str = "" + each_row_is_a: str + + def override(self, overrides: DatasetAttributesOverride) -> DatasetAttributes: + return DatasetAttributes(**merge(self.model_dump(), overrides.model_dump())) + + def apply_defaults(self, defaults: BaseModel) -> DatasetAttributes: + return DatasetAttributes(**merge(defaults.model_dump(), self.model_dump())) + + +class DatasetOverrides(CustomizableBase): + overridden_columns: list[DatasetColumn] = [] + omitted_columns: list[str] = [] + attributes: DatasetAttributesOverride = DatasetAttributesOverride() + + +class Revision(CustomizableBase): + date: str + summary: str + notes: str + + +class Dataset(CustomizableBase): + columns: list[DatasetColumn] + attributes: DatasetAttributes + revisions: list[Revision] = [] + + def override(self, overrides: DatasetOverrides) -> Dataset: + """Apply column updates and prune any columns specified as omitted""" + overriden_cols_by_id = {c.id: c for c in overrides.overridden_columns} + + columns = [ + c.override(overriden_cols_by_id.get(c.id, DatasetColumn(id=c.id))) + for c in self.columns + if c.id not in overrides.omitted_columns + ] + + return Dataset( + columns=columns, + attributes=self.attributes.override(overrides.attributes), + revisions=self.revisions, + ) + + +# DESTINATION +class Destination(CustomizableBase): + id: str + type: str + tags: list[str] = [] + current_version: str = "" + + +class DestinationWithFiles(Destination): + files: List[DestinationFile] = [] + + +class FileAndOverrides(SortedSerializedBase): + _head_sort_order = ["file"] + + dataset_overrides: DatasetOverrides = DatasetOverrides() + file: File + + +class DestinationFile(CustomizableBase): + """Pointer to an actual `File`, with specifiable overrides.""" + + id: str + dataset_overrides: DatasetOverrides = DatasetOverrides() + file_overrides: FileOverrides = FileOverrides() + + +class DestinationMetadata(SortedSerializedBase): + dataset: Dataset + destination: Destination + file: File + + +class Metadata(CustomizableBase, YamlWriter, TemplatedYamlReader): + id: str + attributes: DatasetAttributes + assembly: List[Package] = [] + columns: List[DatasetColumn] = [] + files: List[FileAndOverrides] = [] + destinations: List[DestinationWithFiles] = [] + revisions: list[Revision] = [] + + _head_sort_order = [ + "id", + "attributes", + ] + _tail_sort_order = ["columns"] + _exclude_falsey_values = False # We never want to prune top-level attrs + + @property + def dataset(self): + return Dataset( + attributes=self.attributes, columns=self.columns, revisions=self.revisions + ) + + def get_package(self, id: str) -> Package: + packages = [p for p in self.assembly if p.id == id] + if len(packages) != 1: + raise Exception(f"There should 
exist one package with id: {id}") + return packages[0] + + def query_destinations(self, *, ids=None, types=None, tags=None): + return [ + d.id + for d in self.destinations + if (not ids or d.id in ids) + and (not types or d.type in types) + # if there's any overlap in the tags + and (not tags or set(d.tags) & tags) + ] + + def get_destination(self, id: str) -> DestinationWithFiles: + dests = [d for d in self.destinations if d.id == id] + if len(dests) != 1: + raise Exception(f"There should exist one destination with id: {id}") + return dests[0] + + def get_column(self, column_id: str): + cols = [c for c in self.columns if c.id == column_id] + if len(cols) != 1: + raise Exception( + f"There should exist one column with id: {column_id}. Found {len(cols)}" + ) + return cols[0] + + def get_file_ids(self): + return {f.file.id for f in self.files} + + def get_file_and_overrides(self, file_id: str) -> FileAndOverrides: + files = [f for f in self.files if f.file.id == file_id] + if len(files) != 1: + raise Exception(f"There should exist one file with id: {file_id}") + return files[0] + + def calculate_metadata( + self, *, file_id: str, destination_id: str | None = None + ) -> Dataset: + if destination_id: + return self.calculate_destination_metadata( + file_id=file_id, destination_id=destination_id + ).dataset + else: + return self.calculate_file_dataset_metadata(file_id=file_id) + + def calculate_file_dataset_metadata(self, *, file_id: str) -> Dataset: + return self.dataset.override( + self.get_file_and_overrides(file_id).dataset_overrides + ) + + def calculate_destination_metadata( + self, *, file_id: str, destination_id: str + ) -> DestinationMetadata: + dataset_file = self.get_file_and_overrides(file_id) + + destination = self.get_destination(destination_id) + dest_files = [f for f in destination.files if f.id == file_id] + if len(dest_files) != 1: + raise Exception( + f"Can't calculate overrides, because destination: {destination_id} doesn't reference file: {file_id}" + ) + dest_file = dest_files[0] + + dataset_metadata = self.calculate_file_dataset_metadata( + file_id=file_id + ).override(dest_file.dataset_overrides) + + destination_file = ( + dataset_file.file + if not dest_file + else dataset_file.file.override(dest_file.file_overrides) + ) + + return DestinationMetadata( + file=destination_file, + dataset=dataset_metadata, + destination=destination, + ) + + def validate_consistency(self): + # validate file references + errors = [] + + column_ids = {c.id for c in self.columns} + dataset_files_and_zip_ids = {f.id for f in self.assembly} | { + f.file.id for f in self.files + } + + # files + for fo in self.files: + for c in fo.dataset_overrides.overridden_columns: + if c.id not in column_ids: + errors.append( + f"{ERROR_MISSING_COLUMN}: file {fo.file.id} references undefined column {c.id}" + ) + for c in fo.dataset_overrides.omitted_columns: + if c not in column_ids: + errors.append( + f"{ERROR_MISSING_COLUMN}: file override for {fo.file.id} references undefined column {c}" + ) + + # destinations + for d in self.destinations: + for df in d.files: + if df.id not in dataset_files_and_zip_ids: + errors.append( + f"MISSING FILE: destination {d.id} references undefined file {df.id}" + ) + for c in df.dataset_overrides.omitted_columns: + if c not in column_ids: + errors.append( + f"{ERROR_MISSING_COLUMN}: file override for dest {d.id} references undefined omitted column {c}" + ) + for c in df.dataset_overrides.overridden_columns: + if c.id not in column_ids: + errors.append( + 
f"{ERROR_MISSING_COLUMN}: destination {d.id} references undefined column {c.id}"
+                    )
+
+        # assemblies
+        for a in self.assembly:
+            for df in a.contents:
+                if df.id not in dataset_files_and_zip_ids:
+                    errors.append(
+                        f"MISSING FILE: zip {a.id} references undefined file {df.id}"
+                    )
+
+        for c in self.columns:
+            if c.name is None:
+                errors.append(f"MISSING COLUMN NAME: column id {c.id}")
+            if c.data_type is None:
+                errors.append(f"MISSING COLUMN DATA TYPE: column id {c.id}")
+
+        return errors
+
+    def apply_column_defaults(
+        self, column_defaults: dict[tuple[str, COLUMN_TYPES], DatasetColumn]
+    ) -> list[DatasetColumn]:
+        return [
+            c.override(column_defaults[c.id, c.data_type])
+            if c.data_type and (c.id, c.data_type) in column_defaults
+            else c
+            for c in self.columns
+        ]
diff --git a/dcpy/product_metadata/package/__init__.py b/dcpy/product_metadata/package/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/dcpy/test/connectors/socrata/test_publish.py b/dcpy/test/connectors/socrata/test_publish.py
index 47c3464966..aa15043639 100644
--- a/dcpy/test/connectors/socrata/test_publish.py
+++ b/dcpy/test/connectors/socrata/test_publish.py
@@ -6,7 +6,7 @@
 import pytest
 from socrata.output_schema import OutputSchema
 
-import dcpy.models.product.dataset.metadata as md
+import dcpy.product_metadata.models.metadata.product as md
 from dcpy.connectors.edm.open_data_nyc import OpenDataConnector
 from dcpy.connectors.socrata import publish
 
diff --git a/dcpy/test/lifecycle/package/assemble/test_abstract_docs.py b/dcpy/test/lifecycle/package/assemble/test_abstract_docs.py
index 2904c09f83..b0c5c2fdd2 100644
--- a/dcpy/test/lifecycle/package/assemble/test_abstract_docs.py
+++ b/dcpy/test/lifecycle/package/assemble/test_abstract_docs.py
@@ -1,8 +1,8 @@
 import pytest
 
 from dcpy.lifecycle.package import abstract_doc
-from dcpy.models.design import elements as de
-from dcpy.models.product.metadata import OrgMetadata
+from dcpy.product_metadata.models.design import elements as de
+from dcpy.product_metadata.models.metadata.org import OrgMetadata
 
 
 @pytest.fixture
diff --git a/dcpy/test/lifecycle/package/assemble/test_unpacking.py b/dcpy/test/lifecycle/package/assemble/test_unpacking.py
index 55f6fb1ca0..878e192e78 100644
--- a/dcpy/test/lifecycle/package/assemble/test_unpacking.py
+++ b/dcpy/test/lifecycle/package/assemble/test_unpacking.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import dcpy.models.product.dataset.metadata as md
+import dcpy.product_metadata.models.metadata.product as md
 from dcpy.lifecycle.package import assemble
 
diff --git a/dcpy/test/lifecycle/package/test_parse_shapefile_to_md.py b/dcpy/test/lifecycle/package/test_parse_shapefile_to_md.py
index 4f435db08e..914688f628 100644
--- a/dcpy/test/lifecycle/package/test_parse_shapefile_to_md.py
+++ b/dcpy/test/lifecycle/package/test_parse_shapefile_to_md.py
@@ -3,7 +3,7 @@
 import yaml
 
 from dcpy.lifecycle.package import shapefiles
-from dcpy.models.product.dataset import metadata as dsmd
+from dcpy.product_metadata.models.metadata import product as dsmd
 from dcpy.test.lifecycle.package.conftest import PACKAGE_RESOURCES_PATH
 
diff --git a/dcpy/test/lifecycle/package/test_shapefiles.py b/dcpy/test/lifecycle/package/test_shapefiles.py
index 4118831b31..3e3c95329a 100644
--- a/dcpy/test/lifecycle/package/test_shapefiles.py
+++ b/dcpy/test/lifecycle/package/test_shapefiles.py
@@ -8,7 +8,7 @@
 
 from dcpy.lifecycle.package import shapefiles
 from dcpy.models.data.shapefile_metadata import Metadata
-from dcpy.models.product.metadata import OrgMetadata
+from dcpy.product_metadata.models.metadata.org import OrgMetadata from dcpy.utils.geospatial import shapefile as shp_utils SHP_ZIP_NO_MD = "shapefile_single_pluto_feature_no_metadata.shp.zip" diff --git a/dcpy/test/lifecycle/package/validate/test_column_validation.py b/dcpy/test/lifecycle/package/validate/test_column_validation.py index 12b6d22b95..5bbbd45cbf 100644 --- a/dcpy/test/lifecycle/package/validate/test_column_validation.py +++ b/dcpy/test/lifecycle/package/validate/test_column_validation.py @@ -7,7 +7,7 @@ from shapely import wkb, wkt import dcpy.models.dataset as dataset -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md from dcpy.lifecycle.package import validate from dcpy.test.lifecycle.package.conftest import TEST_METADATA_YAML_PATH diff --git a/dcpy/test/models/product/dataset/test_product_metadata.py b/dcpy/test/models/product/dataset/test_product_metadata.py index f027acd29d..12c79b7145 100644 --- a/dcpy/test/models/product/dataset/test_product_metadata.py +++ b/dcpy/test/models/product/dataset/test_product_metadata.py @@ -4,7 +4,7 @@ import pytest from pytest import fixture -from dcpy.models.product.dataset import metadata as m +from dcpy.product_metadata.models.metadata import product as m OVERRIDDEN_SHP_NAME_AT_DEST = "overridden_shp_name_at_dest.zip" DESTINATION_OVERRIDDEN_DISPLAY_NAME = "overridden dest display name" diff --git a/dcpy/test/models/product/dataset/test_text_cleaning.py b/dcpy/test/models/product/dataset/test_text_cleaning.py index cbb85b9551..251ff40509 100644 --- a/dcpy/test/models/product/dataset/test_text_cleaning.py +++ b/dcpy/test/models/product/dataset/test_text_cleaning.py @@ -1,4 +1,4 @@ -import dcpy.models.product.dataset.metadata as md +import dcpy.product_metadata.models.metadata.product as md def test_description_cleaning(): diff --git a/dcpy/test/models/product/test_metadata.py b/dcpy/test/models/product/test_metadata.py index c0b22ec263..e0ea11b323 100644 --- a/dcpy/test/models/product/test_metadata.py +++ b/dcpy/test/models/product/test_metadata.py @@ -3,8 +3,8 @@ import pytest from dcpy.models import dataset -from dcpy.models.product import metadata as md -from dcpy.models.product.dataset import metadata as ds_md +from dcpy.product_metadata.models.metadata import org as md +from dcpy.product_metadata.models.metadata import product as ds_md @pytest.fixture From 751b1a3b25449e2c3a8e18cfe93de77432a6555d Mon Sep 17 00:00:00 2001 From: Alex Richey Date: Tue, 7 Apr 2026 20:14:30 -0400 Subject: [PATCH 3/3] Add optional dependencies for pmd to enable installing JUST pmd --- pyproject.toml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index dccbc08022..84923c61b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,6 +167,35 @@ dependencies = [ "xlrd", ] +[project.optional-dependencies] +# Core utilities - database, s3, data processing (no GDAL) +utils = [ + "boto3", + "cloudpathlib[all]", + "duckdb", + "pandas", + "psycopg2-binary", + "pyarrow", + "python-dateutil", + "pytz", + "sqlalchemy", + "tabulate", + "urllib3", + "paramiko", +] + +# Product metadata - standalone metadata reading/writing +product-metadata = [ + "dcpy[utils]", + "pydantic", + "PyYAML", + "openpyxl", + "beautifulsoup4", + "css-inline", + "lxml", + "Jinja2", +] + [tool.setuptools.packages.find] include = ["dcpy*"] exclude = ["dcpy.test*"] @@ -179,5 +208,6 @@ dcpy = "dcpy.__main__:cli" dcpy = [ "library/templates/*.yml", 
"lifecycle/package/resources/**", + "product_metadata/package/resources/**", "connectors/edm/bytes/site_map.json", ]