diff --git a/scripts/convert_v2_to_v3.py b/migrations/migrate_000_convert_v2_to_v3.py similarity index 94% rename from scripts/convert_v2_to_v3.py rename to migrations/migrate_000_convert_v2_to_v3.py index 97d239360..676058849 100755 --- a/scripts/convert_v2_to_v3.py +++ b/migrations/migrate_000_convert_v2_to_v3.py @@ -6,11 +6,11 @@ import argparse from os.path import join, isfile -from lib.changelog import format_dataset_attributes_md_table -from lib.container import dict_set, dict_cleanup, dict_get, dict_remove_many -from lib.date import now_iso -from lib.fs import json_write, copy, json_read, file_write -from migrate_002_default_gene import rename_default_gene +from scripts.lib.changelog import format_dataset_attributes_md_table +from scripts.lib.container import dict_set, dict_cleanup, dict_get, dict_remove_many +from scripts.lib.date import now_iso +from scripts.lib.fs import json_write, copy, json_read, file_write +from migrations.migrate_002_default_gene import rename_default_gene def check_file(dataset_dir, filename): diff --git a/scripts/migrate_001_attributes.py b/migrations/migrate_001_attributes.py similarity index 93% rename from scripts/migrate_001_attributes.py rename to migrations/migrate_001_attributes.py index bff9cfaa7..47f686b57 100644 --- a/scripts/migrate_001_attributes.py +++ b/migrations/migrate_001_attributes.py @@ -1,5 +1,5 @@ -from lib.fs import find_files, json_read, json_write -from lib.container import dict_get, dict_remove, dict_cleanup, true_or_none +from scripts.lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_get, dict_remove, dict_cleanup, true_or_none def main(): diff --git a/scripts/migrate_002_default_gene.py b/migrations/migrate_002_default_gene.py similarity index 78% rename from scripts/migrate_002_default_gene.py rename to migrations/migrate_002_default_gene.py index 440030327..63709d6d1 100644 --- a/scripts/migrate_002_default_gene.py +++ b/migrations/migrate_002_default_gene.py @@ 
-1,5 +1,5 @@ -from lib.container import dict_rename_many -from lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_rename_many +from scripts.lib.fs import find_files, json_read, json_write def rename_default_gene(pathogen): diff --git a/scripts/migrate_003_gene_to_cds.py b/migrations/migrate_003_gene_to_cds.py similarity index 92% rename from scripts/migrate_003_gene_to_cds.py rename to migrations/migrate_003_gene_to_cds.py index 0b8637d8f..88dea779f 100644 --- a/scripts/migrate_003_gene_to_cds.py +++ b/migrations/migrate_003_gene_to_cds.py @@ -1,5 +1,5 @@ -from lib.container import dict_rename_many, dict_get -from lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_rename_many, dict_get +from scripts.lib.fs import find_files, json_read, json_write def rename_gene_to_cds(pathogen): diff --git a/scripts/migrate_004_remove_versions.py b/migrations/migrate_004_remove_versions.py similarity index 71% rename from scripts/migrate_004_remove_versions.py rename to migrations/migrate_004_remove_versions.py index 0f37cb6a7..1314e88fa 100644 --- a/scripts/migrate_004_remove_versions.py +++ b/migrations/migrate_004_remove_versions.py @@ -1,5 +1,5 @@ -from lib.container import dict_remove -from lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_remove +from scripts.lib.fs import find_files, json_read, json_write def main(): diff --git a/scripts/migrate_005_create_zips.py b/migrations/migrate_005_create_zips.py similarity index 89% rename from scripts/migrate_005_create_zips.py rename to migrations/migrate_005_create_zips.py index 470a44497..cf3392a60 100644 --- a/scripts/migrate_005_create_zips.py +++ b/migrations/migrate_005_create_zips.py @@ -1,7 +1,7 @@ from os.path import join, dirname from shutil import move -from lib.fs import find_files, make_zip, rmrf +from scripts.lib.fs import find_files, make_zip, rmrf tmp_dir = "data_temp/" diff --git 
a/scripts/migrate_006_add_rel_muts.py b/migrations/migrate_006_add_rel_muts.py similarity index 98% rename from scripts/migrate_006_add_rel_muts.py rename to migrations/migrate_006_add_rel_muts.py index 8e1410758..f22624b9b 100755 --- a/scripts/migrate_006_add_rel_muts.py +++ b/migrations/migrate_006_add_rel_muts.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 from os.path import dirname, join -from lib.changelog import changelog_get_unreleased_section -from lib.container import dict_set -from lib.fs import find_files, json_read, json_write, file_read, file_write +from scripts.lib.changelog import changelog_get_unreleased_section +from scripts.lib.container import dict_set +from scripts.lib.fs import find_files, json_read, json_write, file_read, file_write """ Replace .meta.extensions.nextclade.ref_nodes" in all tree.json files matching a subdirectory recursively diff --git a/scripts/add_schema_to_pathogen_json.py b/migrations/migrate_007_add_schema.py similarity index 100% rename from scripts/add_schema_to_pathogen_json.py rename to migrations/migrate_007_add_schema.py diff --git a/migrations/migrate_008_move_mut_labels.py b/migrations/migrate_008_move_mut_labels.py new file mode 100644 index 000000000..88be26fd4 --- /dev/null +++ b/migrations/migrate_008_move_mut_labels.py @@ -0,0 +1,40 @@ +""" +Move misplaced mutation label maps from root level into 'mutLabels'. 
+ +- 'nucMutLabelMap' at root -> 'mutLabels.nucMutLabelMap' +- 'mutLabelMap' at root -> contents moved into 'mutLabels' +""" + +from scripts.lib.fs import find_files, json_read, json_write + + +def move_mut_labels(pathogen: dict) -> dict: + mut_labels = pathogen.setdefault("mutLabels", {}) + + if "nucMutLabelMap" in pathogen: + value = pathogen.pop("nucMutLabelMap") + if value and value != {}: + mut_labels.setdefault("nucMutLabelMap", value) + + if "mutLabelMap" in pathogen: + value = pathogen.pop("mutLabelMap") + if isinstance(value, dict): + for sub_key, sub_value in value.items(): + if sub_value and sub_value != {}: + mut_labels.setdefault(sub_key, sub_value) + + if mut_labels == {}: + pathogen.pop("mutLabels", None) + + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = move_mut_labels(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_009_remove_nuc_mut_label_map_reverse.py b/migrations/migrate_009_remove_nuc_mut_label_map_reverse.py new file mode 100644 index 000000000..056e95913 --- /dev/null +++ b/migrations/migrate_009_remove_nuc_mut_label_map_reverse.py @@ -0,0 +1,29 @@ +""" +Remove legacy v2 'nucMutLabelMapReverse' field. + +Not used by Nextclade v3 - the reverse map is computed at runtime. +Removes from both root level and 'mutLabels' section. 
+""" + +from scripts.lib.fs import find_files, json_read, json_write + + +def remove_nuc_mut_label_map_reverse(pathogen: dict) -> dict: + pathogen.pop("nucMutLabelMapReverse", None) + + mut_labels = pathogen.get("mutLabels") + if isinstance(mut_labels, dict): + mut_labels.pop("nucMutLabelMapReverse", None) + + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = remove_nuc_mut_label_map_reverse(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_010_rename_ignore_frame_shifts.py b/migrations/migrate_010_rename_ignore_frame_shifts.py new file mode 100644 index 000000000..83acd962f --- /dev/null +++ b/migrations/migrate_010_rename_ignore_frame_shifts.py @@ -0,0 +1,28 @@ +""" +Rename 'qc.frameShifts.ignoreFrameShifts' to 'ignoredFrameShifts'. + +The old name is silently ignored, so listed frame shifts are not +being excluded from QC scoring. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def rename_ignore_frame_shifts(pathogen: dict) -> dict: + frame_shifts = dict_get(pathogen, ["qc", "frameShifts"]) + if isinstance(frame_shifts, dict) and "ignoreFrameShifts" in frame_shifts: + value = frame_shifts.pop("ignoreFrameShifts") + frame_shifts["ignoredFrameShifts"] = value + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = rename_ignore_frame_shifts(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_011_remove_qc_score_weight.py b/migrations/migrate_011_remove_qc_score_weight.py new file mode 100644 index 000000000..721b51aab --- /dev/null +++ b/migrations/migrate_011_remove_qc_score_weight.py @@ -0,0 +1,29 @@ +""" +Remove nonexistent 'scoreWeight' from 'qc.missingData' and 'qc.mixedSites'. + +These QC rules have no 'scoreWeight' parameter in Nextclade. +Sensitivity is controlled via 'missingDataThreshold'/'scoreBias' and +'mixedSitesThreshold' respectively. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def remove_qc_score_weight(pathogen: dict) -> dict: + for rule_name in ("missingData", "mixedSites"): + rule = dict_get(pathogen, ["qc", rule_name]) + if isinstance(rule, dict): + rule.pop("scoreWeight", None) + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = remove_qc_score_weight(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_012_remove_qc_divergence.py b/migrations/migrate_012_remove_qc_divergence.py new file mode 100644 index 000000000..fec747f78 --- /dev/null +++ b/migrations/migrate_012_remove_qc_divergence.py @@ -0,0 +1,26 @@ +""" +Remove nonexistent 'qc.divergence' rule. + +Divergence is computed at runtime, not a configurable QC rule. +""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def remove_qc_divergence(pathogen: dict) -> dict: + qc = dict_get(pathogen, ["qc"]) + if isinstance(qc, dict): + qc.pop("divergence", None) + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = remove_qc_divergence(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_013_rename_gene_order_preference.py b/migrations/migrate_013_rename_gene_order_preference.py new file mode 100644 index 000000000..90f1a15c0 --- /dev/null +++ b/migrations/migrate_013_rename_gene_order_preference.py @@ -0,0 +1,27 @@ +""" +Rename 'geneOrderPreference' to 'cdsOrderPreference'. + +The old name is silently ignored, so CDS display order falls back +to default. 
+""" + +from scripts.lib.fs import find_files, json_read, json_write + + +def rename_gene_order_preference(pathogen: dict) -> dict: + if "geneOrderPreference" in pathogen: + value = pathogen.pop("geneOrderPreference") + if value and "cdsOrderPreference" not in pathogen: + pathogen["cdsOrderPreference"] = value + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = rename_gene_order_preference(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_014_move_general_params.py b/migrations/migrate_014_move_general_params.py new file mode 100644 index 000000000..430de8028 --- /dev/null +++ b/migrations/migrate_014_move_general_params.py @@ -0,0 +1,39 @@ +""" +Move 'includeReference' and 'includeNearestNodeInfo' from +'alignmentParams' to 'generalParams'. + +These control output behavior, not the alignment algorithm. +Under 'alignmentParams' they are silently ignored. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def move_general_params(pathogen: dict) -> dict: + alignment_params = dict_get(pathogen, ["alignmentParams"]) + if not isinstance(alignment_params, dict): + return pathogen + + general_params = pathogen.setdefault("generalParams", {}) + + for param in ("includeReference", "includeNearestNodeInfo"): + if param in alignment_params: + value = alignment_params.pop(param) + general_params.setdefault(param, value) + + if alignment_params == {}: + pathogen.pop("alignmentParams") + + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = move_general_params(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_015_fix_excess_bandwidth_typo.py b/migrations/migrate_015_fix_excess_bandwidth_typo.py new file mode 100644 index 000000000..ed6a51bc2 --- /dev/null +++ b/migrations/migrate_015_fix_excess_bandwidth_typo.py @@ -0,0 +1,28 @@ +""" +Rename 'alignmentParams.excessBandwith' to 'alignmentParams.excessBandwidth'. + +The misspelled key is silently ignored, so the default bandwidth (9) +is used instead of the intended value. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def fix_excess_bandwidth_typo(pathogen: dict) -> dict: + alignment_params = dict_get(pathogen, ["alignmentParams"]) + if isinstance(alignment_params, dict) and "excessBandwith" in alignment_params: + value = alignment_params.pop("excessBandwith") + alignment_params.setdefault("excessBandwidth", value) + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = fix_excess_bandwidth_typo(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_016_fix_misplaced_placement_mask_ranges.py b/migrations/migrate_016_fix_misplaced_placement_mask_ranges.py new file mode 100644 index 000000000..6bb7e3684 --- /dev/null +++ b/migrations/migrate_016_fix_misplaced_placement_mask_ranges.py @@ -0,0 +1,69 @@ +""" +Move 'placementMaskRanges' from pathogen.json to tree.json. + +This field belongs in tree.json at '.meta.extensions.nextclade.placement_mask_ranges'. +In pathogen.json it is silently ignored, so placement masking is not applied. + +One-time fix for existing datasets. Dataset authors must also update their +upstream build pipelines to place this field in tree.json directly, otherwise +the next dataset update will re-introduce the defect. 
+""" + +import json +from os.path import dirname, isfile, join + +from scripts.lib.fs import file_read, file_write, find_files, json_read, json_write + + +def move_placement_mask_ranges(pathogen: dict, tree: dict) -> tuple[dict, dict]: + ranges = pathogen.pop("placementMaskRanges", None) + if ranges is None: + return pathogen, tree + + meta = tree.setdefault("meta", {}) + extensions = meta.setdefault("extensions", {}) + nextclade = extensions.setdefault("nextclade", {}) + nextclade.setdefault("placement_mask_ranges", ranges) + + return pathogen, tree + + +def _detect_indent(text: str) -> int | None: + """Return indent width if pretty-printed, None if compact.""" + for line in text.split("\n")[1:6]: + stripped = line.lstrip(" ") + if stripped and len(line) != len(stripped): + return len(line) - len(stripped) + return None + + +def _json_write_preserving_format(obj: dict, filepath: str, original_text: str) -> None: + indent = _detect_indent(original_text) + content = json.dumps(obj, indent=indent, ensure_ascii=False, sort_keys=False) + if indent is not None: + content += "\n" + file_write(content, filepath) + + +def main(): + for pathogen_file in find_files("pathogen.json", here="data/"): + pathogen = json_read(pathogen_file) + if "placementMaskRanges" not in pathogen: + continue + + tree_file = join(dirname(pathogen_file), "tree.json") + if not isfile(tree_file): + continue + + tree_text = file_read(tree_file) + tree = json.loads(tree_text) + if tree.get("meta", {}).get("extensions", {}).get("nextclade", {}).get("placement_mask_ranges"): + continue + + pathogen, tree = move_placement_mask_ranges(pathogen, tree) + json_write(pathogen, pathogen_file, no_sort_keys=True) + _json_write_preserving_format(tree, tree_file, tree_text) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt index 5ef9e88eb..cc93523d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ biopython +jsonschema numpy +rapidfuzz>=3.0 repro-zipfile 
diff --git a/scripts/lib/container.py b/scripts/lib/container.py index db4292163..398428753 100644 --- a/scripts/lib/container.py +++ b/scripts/lib/container.py @@ -1,10 +1,51 @@ from collections import namedtuple, Counter -from functools import reduce -from typing import List, Iterable, TypeVar, Callable, Union, Dict, Any, Hashable, Optional, Sequence +from dataclasses import dataclass +from typing import List, Iterable, TypeVar, Callable, Union, Dict, Any, Hashable, Optional, Sequence, Iterator T = TypeVar('T') +@dataclass(frozen=True) +class JsonPath: + """Path-tracking wrapper for JSON traversal with automatic error context.""" + data: Any + path: str = "" + + def get(self, key: str) -> "JsonPath": + new_path = f"{self.path}.{key}" if self.path else key + if self.data is None: + return JsonPath(None, new_path) + if not isinstance(self.data, dict): + raise TypeError(f"expected dict at '{self.path or '(root)'}' while accessing '{key}', got {type(self.data).__name__}") + return JsonPath(self.data.get(key), new_path) + + def items(self) -> Iterator[tuple[str, "JsonPath"]]: + if self.data is None: + return + if not isinstance(self.data, dict): + raise TypeError(f"expected dict at '{self.path or '(root)'}', got {type(self.data).__name__}") + for k, v in self.data.items(): + yield k, JsonPath(v, f"{self.path}.{k}" if self.path else k) + + def __iter__(self) -> Iterator["JsonPath"]: + if self.data is None: + return + if not isinstance(self.data, list): + raise TypeError(f"expected list at '{self.path or '(root)'}', got {type(self.data).__name__}") + for i, v in enumerate(self.data): + yield JsonPath(v, f"{self.path}[{i}]") + + def __bool__(self) -> bool: + return self.data is not None and bool(self.data) + + @property + def val(self) -> Any: + return self.data + + def or_default(self, default: T) -> T: + return self.data if self.data is not None else default + + def is_iterable(obj): return issubclass(type(obj), Iterable) @@ -14,13 +55,34 @@ def 
dict_to_namedtuple(name: str, dic: dict): def dict_set(obj: dict, key_path: List[str], value): - for key in key_path[:-1]: + for i, key in enumerate(key_path[:-1]): + _assert_dict(obj, key_path, i) obj = obj.setdefault(key, {}) + _assert_dict(obj, key_path, len(key_path) - 1) obj[key_path[-1]] = value def dict_get(obj: dict, keys: List[str]): - return reduce(lambda d, key: d.get(key) if d else None, keys, obj) + current = obj + for i, key in enumerate(keys): + if current is None: + return None + _assert_dict(current, keys, i) + current = current.get(key) + return current + + +def _assert_dict(obj: Any, keys: List[str], index: int): + if isinstance(obj, dict): + return + try: + traversed = '.'.join(str(k) for k in keys[:index]) if index > 0 else None + remaining = '.'.join(str(k) for k in keys[index:]) or '?' + location = f"after '{traversed}'" if traversed else "at root" + type_name = type(obj).__name__ + except Exception: + location, remaining, type_name = "at unknown", "?", "?" + raise TypeError(f"expected dict {location} while accessing '{remaining}', got {type_name}") def dict_get_required(obj: dict, keys: List[str]): diff --git a/scripts/lib/logger.py b/scripts/lib/logger.py index 0592bba11..b9c108611 100644 --- a/scripts/lib/logger.py +++ b/scripts/lib/logger.py @@ -1,4 +1,63 @@ import logging +import os +import re +import sys +from pathlib import Path -logging.basicConfig(level=logging.INFO) -l = logging.getLogger(" ") +PROJECT_ROOT_STR = str(Path(__file__).resolve().parent.parent.parent) + + +def _colors_enabled() -> bool: + if os.environ.get("NO_COLOR"): + return False + if os.environ.get("GITHUB_ACTIONS"): + return True + return sys.stderr.isatty() + + +class _ColorFormatter(logging.Formatter): + GREY = "\033[90m" + RESET = "\033[0m" + LEVEL_COLORS = { + logging.DEBUG: "\033[36m", # cyan + logging.INFO: "\033[32m", # green + logging.WARNING: "\033[33m", # yellow + logging.ERROR: "\033[31m", # red + logging.CRITICAL: "\033[35m", # magenta + } + + def 
__init__(self, use_colors: bool): + super().__init__() + self._use_colors = use_colors + # Match absolute paths, optionally followed by :line or :line:col + self._path_pattern = re.compile( + re.escape(PROJECT_ROOT_STR) + r"/([^\s:]+(?::\d+(?::\d+)?)?)" + ) + + def _transform_paths(self, message: str) -> str: + if self._use_colors: + return self._path_pattern.sub( + lambda m: f"{self.GREY}{m.group(1)}{self.RESET}", message + ) + return self._path_pattern.sub(r"\1", message) + + def format(self, record: logging.LogRecord) -> str: + message = self._transform_paths(record.getMessage()) + if self._use_colors: + color = self.LEVEL_COLORS.get(record.levelno, "") + return f"{color}{record.levelname}{self.RESET}: {message}" + return f"{record.levelname}: {message}" + + +def _setup_logger() -> logging.Logger: + logger = logging.getLogger("rebuild") + logger.setLevel(logging.INFO) + + handler = logging.StreamHandler(sys.stderr) + handler.setFormatter(_ColorFormatter(_colors_enabled())) + logger.addHandler(handler) + + return logger + + +l = _setup_logger() diff --git a/scripts/lib/schema.py b/scripts/lib/schema.py new file mode 100644 index 000000000..1c04c9dc3 --- /dev/null +++ b/scripts/lib/schema.py @@ -0,0 +1,596 @@ +import json +import os +import re +import urllib.request +from collections import defaultdict +from dataclasses import dataclass, field +from enum import Enum, auto +from functools import lru_cache +from pathlib import Path +from typing import Any +from urllib.error import HTTPError + +from jsonschema import Draft7Validator, ValidationError +from rapidfuzz import fuzz, process + +from .logger import l +from .process import run + + +NEXTCLADE_REPO = "nextstrain/nextclade" +SCHEMA_FILENAME = "input-pathogen-json.schema.json" +SCHEMA_PATH = f"packages/nextclade-schemas/{SCHEMA_FILENAME}" +SCHEMA_URL_TEMPLATE = "https://raw.githubusercontent.com/{repo}/refs/heads/{branch}/{path}" +SCHEMA_DOCS_URL = 
"https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html" + +FUZZY_MATCH_THRESHOLD = 80.0 + + +def validate_pathogen_json( + data: Any, + filepath: str, + ctx: "ValidationContext", + schemas_dir: Path | None = None, + dataset_path: str | None = None, +) -> None: + if filepath in ctx.reports: + return + + schema = fetch_schema(schemas_dir) + validator = Draft7Validator(schema) + errors = [] + for error in validator.iter_errors(data): + path = '.'.join(str(p) for p in error.absolute_path) or '(root)' + errors.append(f" {path}: {error.message}") + if errors: + raise ValidationError(f"Schema validation failed for '{filepath}':\n" + '\n'.join(errors)) + + report = DefectReport( + filepath=filepath, + dataset_path=dataset_path or _extract_dataset_path(filepath), + ) + report.defects = _check_known_defects(data, report.infer_upstream_repo()) + ctx.reports[filepath] = report + + known_defect_paths = {d.json_path for d in report.defects} + + schema_index = _build_schema_index(schema) + for message, json_path in _find_unknown_properties(data, schema, schema_index): + if json_path not in known_defect_paths: + _emit_ci_warning(filepath, message, json_path) + + for defect in report.defects: + _emit_defect(filepath, defect, defect.json_path) + + +def fetch_schema(schemas_dir: Path | None = None) -> dict: + if schemas_dir is not None: + return _load_local_schema(schemas_dir) + return _fetch_remote_schema() + + +def print_defect_summary(ctx: "ValidationContext") -> None: + reports = [r for r in ctx.reports.values() if r.defects] + if not reports: + return + + datasets: dict[str, list[DefectReport]] = {} + for r in reports: + datasets.setdefault(r.dataset_path, []).append(r) + + error_count = sum(1 for r in reports if r.has_errors) + warning_count = sum(1 for r in reports if r.has_warnings) + + print("\n" + "=" * 60) + print("DATASET DEFECTS SUMMARY") + print("=" * 60) + print(f"\n{len(datasets)} dataset(s), {error_count} error(s), {warning_count} warning(s)\n") 
+ + for dataset_path in sorted(datasets.keys()): + dataset_reports = datasets[dataset_path] + upstream = dataset_reports[0].infer_upstream_repo() + upstream_hint = f" [upstream: {upstream}]" if upstream else "" + + has_errors = any(r.has_errors for r in dataset_reports) + has_warnings = any(r.has_warnings for r in dataset_reports) + marker = "ERROR" if has_errors else "WARNING" if has_warnings else "INFO" + + print(f"[{marker}] {dataset_path}{upstream_hint}") + + for r in sorted(dataset_reports, key=lambda x: x.filepath): + filename = Path(r.filepath).name + print(f" {filename}:") + for d in sorted(r.defects, key=lambda x: (x.severity.value, x.json_path)): + severity_tag = d.severity.name + print(f" [{severity_tag}] {d.problem}") + + print() + + print("-" * 60) + print("Run migration scripts to fix locally.") + print("Notify upstream maintainers to prevent defect recurrence.") + print(f"Docs: {SCHEMA_DOCS_URL}") + print("=" * 60 + "\n") + + +class Severity(Enum): + ERROR = auto() + WARNING = auto() + INFO = auto() + + +@dataclass(frozen=True) +class Defect: + severity: Severity + problem: str + impact: str + migration: str + json_path: str + upstream_fix: str | None = None + + def format_message(self) -> str: + lines = [f"{self.problem}. 
{self.impact}.", f"Run {self.migration} to fix locally."] + if self.upstream_fix: + lines.append(self.upstream_fix + ".") + return " ".join(lines) + + +@dataclass +class DefectReport: + filepath: str + dataset_path: str + defects: list[Defect] = field(default_factory=list) + + @property + def has_errors(self) -> bool: + return any(d.severity == Severity.ERROR for d in self.defects) + + @property + def has_warnings(self) -> bool: + return any(d.severity == Severity.WARNING for d in self.defects) + + def infer_upstream_repo(self) -> str | None: + parts = self.dataset_path.split("/") + if len(parts) >= 2: + collection, org = parts[0], parts[1] + if collection == "nextstrain": + return f"nextstrain/{org}" + if collection == "community": + return f"{org}/{parts[2]}" if len(parts) >= 3 else org + return None + + +@dataclass +class ValidationContext: + reports: dict[str, DefectReport] = field(default_factory=dict) + + def get_reports(self) -> list[DefectReport]: + return list(self.reports.values()) + + +@dataclass(frozen=True) +class SchemaIndex: + path_to_props: dict[str, frozenset[str]] + prop_to_paths: dict[str, tuple[str, ...]] + + +def get_current_branch() -> str: + branch = run("git rev-parse --abbrev-ref HEAD", error_if_empty=True) + assert branch is not None + return branch + + +def remote_branch_exists(repo: str, branch: str) -> bool: + url = f"https://api.github.com/repos/{repo}/branches/{branch}" + req = urllib.request.Request(url, method="HEAD") + try: + with urllib.request.urlopen(req, timeout=10) as response: + return response.status == 200 + except HTTPError: + return False + + +@lru_cache(maxsize=1) +def get_schema_branch() -> str: + current = get_current_branch() + if remote_branch_exists(NEXTCLADE_REPO, current): + l.info(f"Using schema from '{NEXTCLADE_REPO}' branch '{current}'") + return current + l.info(f"Branch '{current}' not found in '{NEXTCLADE_REPO}', using 'master'") + return "master" + + +def _load_local_schema(schemas_dir: Path) -> dict: + 
schema_path = schemas_dir / SCHEMA_FILENAME + with schema_path.open() as f: + return json.load(f) + + +@lru_cache(maxsize=1) +def _fetch_remote_schema() -> dict: + branch = get_schema_branch() + url = SCHEMA_URL_TEMPLATE.format(repo=NEXTCLADE_REPO, branch=branch, path=SCHEMA_PATH) + l.info(f"Fetching schema from {url}") + with urllib.request.urlopen(url, timeout=30) as response: + return json.loads(response.read().decode('utf-8')) + + +def _extract_dataset_path(filepath: str) -> str: + parts = Path(filepath).parts + try: + data_idx = parts.index("data") + return "/".join(parts[data_idx + 1 : -1]) + except ValueError: + return filepath + + +def _build_schema_index(schema: dict) -> SchemaIndex: + path_to_props: dict[str, set[str]] = defaultdict(set) + prop_to_paths: dict[str, list[str]] = defaultdict(list) + + def resolve_ref(ref: str) -> dict | None: + if not ref.startswith("#/"): + return None + parts = ref[2:].split("/") + node = schema + for part in parts: + if isinstance(node, dict) and part in node: + node = node[part] + else: + return None + return node if isinstance(node, dict) else None + + def traverse(node: Any, current_path: str, visited: frozenset[int] | None = None) -> None: + if not isinstance(node, dict): + return + + node_id = id(node) + visited = visited or frozenset() + if node_id in visited: + return + visited = visited | {node_id} + + if "$ref" in node: + ref_schema = resolve_ref(node["$ref"]) + if ref_schema: + traverse(ref_schema, current_path, visited) + return + + if "properties" in node: + props = node["properties"] + path_to_props[current_path].update(props.keys()) + for prop_name, prop_schema in props.items(): + full_path = f"{current_path}.{prop_name}" if current_path else prop_name + prop_to_paths[prop_name].append(full_path) + traverse(prop_schema, full_path, visited) + + for keyword in ("allOf", "anyOf", "oneOf"): + if keyword in node: + for item in node[keyword]: + traverse(item, current_path, visited) + + if 
isinstance(node.get("additionalProperties"), dict): + traverse(node["additionalProperties"], current_path, visited) + + if isinstance(node.get("items"), dict): + traverse(node["items"], current_path, visited) + + traverse(schema, "") + + return SchemaIndex( + path_to_props={k: frozenset(v) for k, v in path_to_props.items()}, + prop_to_paths={k: tuple(v) for k, v in prop_to_paths.items()}, + ) + + +def _suggest_correction( + unknown_prop: str, + parent_path: str, + schema_index: SchemaIndex, +) -> str | None: + valid_at_level = schema_index.path_to_props.get(parent_path, frozenset()) + + if valid_at_level: + match = process.extractOne( + unknown_prop, + valid_at_level, + scorer=fuzz.ratio, + score_cutoff=FUZZY_MATCH_THRESHOLD, + ) + if match: + suggested_prop = match[0] + full_suggestion = f"{parent_path}.{suggested_prop}" if parent_path else suggested_prop + return f"did you mean '{full_suggestion}'?" + + if unknown_prop in schema_index.prop_to_paths: + valid_locations = schema_index.prop_to_paths[unknown_prop] + current_full = f"{parent_path}.{unknown_prop}" if parent_path else unknown_prop + other_locations = [loc for loc in valid_locations if loc != current_full] + if other_locations: + if len(other_locations) == 1: + return f"belongs at '{other_locations[0]}'" + return f"belongs at one of: {', '.join(repr(loc) for loc in other_locations)}" + + all_props = set(schema_index.prop_to_paths.keys()) + if all_props: + match = process.extractOne( + unknown_prop, + all_props, + scorer=fuzz.ratio, + score_cutoff=FUZZY_MATCH_THRESHOLD, + ) + if match: + suggested_prop = match[0] + valid_locations = schema_index.prop_to_paths[suggested_prop] + if len(valid_locations) == 1: + return f"did you mean '{valid_locations[0]}'?" + return f"did you mean '{suggested_prop}'? 
Valid locations: {', '.join(repr(loc) for loc in valid_locations)}" + + return None + + +def _find_unknown_properties( + data: Any, + schema: dict, + schema_index: SchemaIndex, +) -> list[tuple[str, str]]: + strict_schema = _make_strict_schema(schema) + validator = Draft7Validator(strict_schema) + return _collect_unknown_property_warnings(validator.iter_errors(data), schema_index) + + +def _collect_unknown_property_warnings( + errors: Any, + schema_index: SchemaIndex, +) -> list[tuple[str, str]]: + warnings: list[tuple[str, str]] = [] + for error in errors: + if error.validator == "additionalProperties": + path_parts = [str(p) for p in error.absolute_path] + parent_path = '.'.join(path_parts) + extras = [e for e in error.message.split("'")[1::2] if e != "$schema"] + for extra in extras: + full_path = f"{parent_path}.{extra}" if parent_path else extra + suggestion = _suggest_correction(extra, parent_path, schema_index) + message = f"Unknown property '{full_path}' - {suggestion}" if suggestion else f"Unknown property '{full_path}'" + warnings.append((message, full_path)) + if error.context: + warnings.extend(_collect_unknown_property_warnings(error.context, schema_index)) + return warnings + + +def _make_strict_schema(schema: dict) -> dict: + import copy + schema = copy.deepcopy(schema) + _add_strict_recursive(schema) + return schema + + +def _add_strict_recursive(node: Any) -> None: + if isinstance(node, dict): + if node.get("type") == "object" and "properties" in node: + node.setdefault("additionalProperties", False) + for v in node.values(): + _add_strict_recursive(v) + elif isinstance(node, list): + for item in node: + _add_strict_recursive(item) + + +def _find_line_number(filepath: str, json_path: str) -> int | None: + if not json_path: + return None + key = json_path.split(".")[-1] + pattern = re.compile(rf'^\s*"{re.escape(key)}"\s*:') + try: + with open(filepath, encoding="utf-8") as f: + for lineno, line in enumerate(f, start=1): + if pattern.match(line): + 
return lineno + except OSError: + pass + return None + + +def _format_location(filepath: str, lineno: int | None) -> str: + return f"{filepath}:{lineno}" if lineno else filepath + + +def _emit_ci_warning(filepath: str, message: str, json_path: str | None = None) -> None: + lineno = _find_line_number(filepath, json_path) if json_path else None + location = _format_location(filepath, lineno) + if os.environ.get("GITHUB_ACTIONS"): + line_part = f",line={lineno}" if lineno else "" + print(f"::warning file={filepath}{line_part}::{message}") + else: + l.warning(f"{location}: {message}") + + +def _emit_defect(filepath: str, defect: Defect, json_path: str | None = None) -> None: + lineno = _find_line_number(filepath, json_path) if json_path else None + location = _format_location(filepath, lineno) + message = defect.format_message() + if os.environ.get("GITHUB_ACTIONS"): + level = "error" if defect.severity == Severity.ERROR else "warning" + line_part = f",line={lineno}" if lineno else "" + print(f"::{level} file={filepath}{line_part}::{message}") + else: + log_fn = l.error if defect.severity == Severity.ERROR else l.warning + log_fn(f"{location}: {message}") + + +def _upstream_hint(repo: str | None) -> str: + if repo: + return f"Fix in upstream pipeline ({repo}) to prevent recurrence" + return "Fix in upstream build pipeline to prevent recurrence" + + +def _check_known_defects(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + defects.extend(_check_misplaced_mut_labels(data, upstream_repo)) + defects.extend(_check_qc_defects(data, upstream_repo)) + defects.extend(_check_misplaced_properties(data, upstream_repo)) + defects.extend(_check_alignment_param_typos(data, upstream_repo)) + return defects + + +def _check_misplaced_mut_labels(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + + if "nucMutLabelMap" in data: + defects.append(Defect( + severity=Severity.ERROR, + problem="Misplaced 'nucMutLabelMap' 
at root level", + impact="Nucleotide mutation labels not applied to results", + migration="migrations/migrate_008_move_mut_labels.py", + json_path="nucMutLabelMap", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "nucMutLabelMapReverse" in data: + defects.append(Defect( + severity=Severity.INFO, + problem="Legacy v2 field 'nucMutLabelMapReverse' at root level", + impact="No effect (reverse map computed at runtime in v3)", + migration="migrations/migrate_009_remove_nuc_mut_label_map_reverse.py", + json_path="nucMutLabelMapReverse", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "mutLabelMap" in data: + defects.append(Defect( + severity=Severity.ERROR, + problem="Misplaced 'mutLabelMap' at root level", + impact="Mutation labels not applied to results", + migration="migrations/migrate_008_move_mut_labels.py", + json_path="mutLabelMap", + upstream_fix=_upstream_hint(upstream_repo), + )) + + mut_labels = data.get("mutLabels") + if isinstance(mut_labels, dict) and "nucMutLabelMapReverse" in mut_labels: + defects.append(Defect( + severity=Severity.INFO, + problem="Legacy v2 field 'mutLabels.nucMutLabelMapReverse'", + impact="No effect (reverse map computed at runtime in v3)", + migration="migrations/migrate_009_remove_nuc_mut_label_map_reverse.py", + json_path="mutLabels.nucMutLabelMapReverse", + upstream_fix=_upstream_hint(upstream_repo), + )) + + return defects + + +def _check_qc_defects(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + qc = data.get("qc") + if not isinstance(qc, dict): + return defects + + if "divergence" in qc: + defects.append(Defect( + severity=Severity.INFO, + problem="Unknown QC rule 'qc.divergence'", + impact="No effect (divergence computed at runtime, not a QC rule)", + migration="migrations/migrate_012_remove_qc_divergence.py", + json_path="qc.divergence", + upstream_fix=_upstream_hint(upstream_repo), + )) + + missing_data = qc.get("missingData") + if isinstance(missing_data, dict) and 
"scoreWeight" in missing_data: + defects.append(Defect( + severity=Severity.INFO, + problem="Unknown field 'qc.missingData.scoreWeight'", + impact="No effect (use 'missingDataThreshold' and 'scoreBias' instead)", + migration="migrations/migrate_011_remove_qc_score_weight.py", + json_path="qc.missingData.scoreWeight", + upstream_fix=_upstream_hint(upstream_repo), + )) + + mixed_sites = qc.get("mixedSites") + if isinstance(mixed_sites, dict) and "scoreWeight" in mixed_sites: + defects.append(Defect( + severity=Severity.INFO, + problem="Unknown field 'qc.mixedSites.scoreWeight'", + impact="No effect (use 'mixedSitesThreshold' instead)", + migration="migrations/migrate_011_remove_qc_score_weight.py", + json_path="qc.mixedSites.scoreWeight", + upstream_fix=_upstream_hint(upstream_repo), + )) + + frame_shifts = qc.get("frameShifts") + if isinstance(frame_shifts, dict) and "ignoreFrameShifts" in frame_shifts: + defects.append(Defect( + severity=Severity.ERROR, + problem="Field 'qc.frameShifts.ignoreFrameShifts' was renamed to 'ignoredFrameShifts'", + impact="Frame shift exclusions not applied", + migration="migrations/migrate_010_rename_ignore_frame_shifts.py", + json_path="qc.frameShifts.ignoreFrameShifts", + upstream_fix=_upstream_hint(upstream_repo), + )) + + return defects + + +def _check_misplaced_properties(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + + if "geneOrderPreference" in data: + defects.append(Defect( + severity=Severity.WARNING, + problem="Field 'geneOrderPreference' was renamed to 'cdsOrderPreference'", + impact="CDS display order uses default", + migration="migrations/migrate_013_rename_gene_order_preference.py", + json_path="geneOrderPreference", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "placementMaskRanges" in data: + defects.append(Defect( + severity=Severity.ERROR, + problem="Misplaced 'placementMaskRanges' in pathogen.json", + impact="Placement masking not applied (belongs in tree.json at 
.meta.extensions.nextclade.placement_mask_ranges)", + migration="migrations/migrate_016_fix_misplaced_placement_mask_ranges.py", + json_path="placementMaskRanges", + upstream_fix=_upstream_hint(upstream_repo), + )) + + alignment_params = data.get("alignmentParams") + if isinstance(alignment_params, dict): + if "includeReference" in alignment_params: + defects.append(Defect( + severity=Severity.WARNING, + problem="Misplaced 'alignmentParams.includeReference'", + impact="Setting ignored (move to 'generalParams.includeReference')", + migration="migrations/migrate_014_move_general_params.py", + json_path="alignmentParams.includeReference", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "includeNearestNodeInfo" in alignment_params: + defects.append(Defect( + severity=Severity.WARNING, + problem="Misplaced 'alignmentParams.includeNearestNodeInfo'", + impact="Setting ignored (move to 'generalParams.includeNearestNodeInfo')", + migration="migrations/migrate_014_move_general_params.py", + json_path="alignmentParams.includeNearestNodeInfo", + upstream_fix=_upstream_hint(upstream_repo), + )) + + return defects + + +def _check_alignment_param_typos(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + alignment_params = data.get("alignmentParams") + if isinstance(alignment_params, dict) and "excessBandwith" in alignment_params: + defects.append(Defect( + severity=Severity.WARNING, + problem="Typo 'alignmentParams.excessBandwith' (missing 'd')", + impact="Default bandwidth (9) used instead (rename to 'excessBandwidth')", + migration="migrations/migrate_015_fix_excess_bandwidth_typo.py", + json_path="alignmentParams.excessBandwith", + upstream_fix=_upstream_hint(upstream_repo), + )) + return defects diff --git a/scripts/rebuild b/scripts/rebuild index b35953a62..d8e27da5f 100755 --- a/scripts/rebuild +++ b/scripts/rebuild @@ -10,10 +10,12 @@ from collections import defaultdict from copy import deepcopy from os import getcwd from 
os.path import dirname, realpath, join, relpath, isfile +from pathlib import Path from lib.changelog import changelog_prepare, changelog_get_unreleased_section from lib.container import dict_get, dict_get_required, find_index_by, first, format_list, \ - dict_remove_many, find_duplicates, dict_cleanup, find, unique + dict_remove_many, find_duplicates, dict_cleanup, find, unique, JsonPath +from lib.schema import validate_pathogen_json, print_defect_summary, ValidationContext from lib.date import now_iso, iso_to_iso_safe from lib.fasta import fasta_read_exactly_one_seq from lib.fs import json_read, find_files, json_write, copy, make_zip, file_write, rmrf @@ -60,8 +62,8 @@ def get_dataset_capabilities(pathogen_json: dict, dataset_dir: str): other.append("aaMotifs") qc = [] - for k, q in (dict_get(pathogen_json, ["qc"]) or {}).items(): - if dict_get(q, ["enabled"]): + for k, q in JsonPath(pathogen_json, "pathogen_json").get("qc").items(): + if q.get("enabled").val: qc.append(k) custom_clades = dict_cleanup({attr: len(values) for attr, values in custom_clades.items() if len(values) > 0}) @@ -123,8 +125,9 @@ def dataset_get_versions(dataset): return versions, last_version -def index_one_dataset(args, pathogen_json_path: str, dataset: object, tag: str, updated_at: str): +def index_one_dataset(args, pathogen_json_path: str, dataset: object, tag: str, updated_at: str, validation_ctx): pathogen_json = json_read(pathogen_json_path) + validate_pathogen_json(pathogen_json, pathogen_json_path, validation_ctx, args.nextclade_schemas_dir) dataset_dir = dirname(pathogen_json_path) path = relpath(dataset_dir, args.input_dir) @@ -305,6 +308,10 @@ def parse_args(): parser.add_argument('--repo', required=False, help="GitHub repo to push and to release to. You need to have write permission for that." ) + parser.add_argument('--nextclade-schemas-dir', required=False, + help="Local directory containing nextclade schema files (e.g., ../nextclade/packages/nextclade-schemas). 
" + "When set, schemas are loaded from this directory instead of fetching from GitHub." + ) args = parser.parse_args() @@ -319,6 +326,13 @@ def parse_args(): if args.push: args.commit = True + if args.nextclade_schemas_dir: + resolved = Path(args.nextclade_schemas_dir).resolve() + if not resolved.is_dir(): + parser.error(f"--nextclade-schemas-dir: directory does not exist: {resolved}") + args.nextclade_schemas_dir = resolved + l.info(f"Using local schema directory: {resolved}") + return args @@ -353,6 +367,7 @@ def validate_shortcuts(collections): def main(): args = parse_args() + validation_ctx = ValidationContext() if not args.allow_dirty and not git_dir_is_clean(): dirty_files = "\n ".join(git_get_dirty_files()) @@ -381,7 +396,7 @@ def main(): all_refs = {} for collection_json_path in collection_json_paths: collection, release_infos_for_dataset, refs = process_one_collection( - collection_json_path, index_json, args, tag, updated_at + collection_json_path, index_json, args, tag, updated_at, validation_ctx ) collections.append(collection) release_infos.extend(release_infos_for_dataset) @@ -431,6 +446,8 @@ def main(): time.sleep(5) publish_to_github_releases(args, tag, commit_hash, release_notes) + print_defect_summary(validation_ctx) + def dataset_has_changes(dataset_from_index, dataset_dir): _, last_version = dataset_get_versions(dataset_from_index) @@ -440,7 +457,7 @@ def dataset_has_changes(dataset_from_index, dataset_dir): return len(modified_files) > 0 -def process_one_collection(collection_json_path, index_json, args, tag, updated_at): +def process_one_collection(collection_json_path, index_json, args, tag, updated_at, validation_ctx): collection_json = json_read(collection_json_path) collection_dir = dirname(collection_json_path) collection_id = dict_get_required(collection_json, ["meta", "id"]) @@ -469,7 +486,7 @@ def process_one_collection(collection_json_path, index_json, args, tag, updated_ dataset_from_index = find(lambda dataset: dataset["path"] == 
path, datasets_from_index_json) or {} try: - dataset, ref = index_one_dataset(args, pathogen_json_path, dataset_from_index, tag, updated_at) + dataset, ref = index_one_dataset(args, pathogen_json_path, dataset_from_index, tag, updated_at, validation_ctx) datasets.append(dataset) # exclude deprecated datasets from list of refs that is later used to build minimizer index if not dataset['attributes'].get('deprecated', False): @@ -487,7 +504,7 @@ def process_one_collection(collection_json_path, index_json, args, tag, updated_ json_write(collection_json, collection_json_path, no_sort_keys=True) release_infos = prepare_dataset_release_infos(args, datasets, datasets_from_index_json, collection_dir, tag, - updated_at) + updated_at, validation_ctx) release_infos = sort_release_infos(release_infos, dataset_order) collection_info = deepcopy(collection_json) @@ -512,10 +529,11 @@ def get_datasets_from_index_json(index_json, collection_id): return dict_get(collection, ["datasets"]) or [] -def prepare_dataset_release_infos(args, datasets, datasets_from_index_json, collection_dir, tag, updated_at): +def prepare_dataset_release_infos(args, datasets, datasets_from_index_json, collection_dir, tag, updated_at, validation_ctx): release_infos = [] for pathogen_json_path in find_files("pathogen.json", collection_dir): pathogen_json = json_read(pathogen_json_path) + validate_pathogen_json(pathogen_json, pathogen_json_path, validation_ctx, args.nextclade_schemas_dir) dataset_dir = dirname(pathogen_json_path) dataset_dir_rel = relpath(dataset_dir, args.input_dir)