diff --git a/scripts/convert_v2_to_v3.py b/migrations/migrate_000_convert_v2_to_v3.py similarity index 94% rename from scripts/convert_v2_to_v3.py rename to migrations/migrate_000_convert_v2_to_v3.py index 97d239360..676058849 100755 --- a/scripts/convert_v2_to_v3.py +++ b/migrations/migrate_000_convert_v2_to_v3.py @@ -6,11 +6,11 @@ import argparse from os.path import join, isfile -from lib.changelog import format_dataset_attributes_md_table -from lib.container import dict_set, dict_cleanup, dict_get, dict_remove_many -from lib.date import now_iso -from lib.fs import json_write, copy, json_read, file_write -from migrate_002_default_gene import rename_default_gene +from scripts.lib.changelog import format_dataset_attributes_md_table +from scripts.lib.container import dict_set, dict_cleanup, dict_get, dict_remove_many +from scripts.lib.date import now_iso +from scripts.lib.fs import json_write, copy, json_read, file_write +from migrations.migrate_002_default_gene import rename_default_gene def check_file(dataset_dir, filename): diff --git a/scripts/migrate_001_attributes.py b/migrations/migrate_001_attributes.py similarity index 93% rename from scripts/migrate_001_attributes.py rename to migrations/migrate_001_attributes.py index bff9cfaa7..47f686b57 100644 --- a/scripts/migrate_001_attributes.py +++ b/migrations/migrate_001_attributes.py @@ -1,5 +1,5 @@ -from lib.fs import find_files, json_read, json_write -from lib.container import dict_get, dict_remove, dict_cleanup, true_or_none +from scripts.lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_get, dict_remove, dict_cleanup, true_or_none def main(): diff --git a/scripts/migrate_002_default_gene.py b/migrations/migrate_002_default_gene.py similarity index 78% rename from scripts/migrate_002_default_gene.py rename to migrations/migrate_002_default_gene.py index 440030327..63709d6d1 100644 --- a/scripts/migrate_002_default_gene.py +++ b/migrations/migrate_002_default_gene.py @@ 
-1,5 +1,5 @@ -from lib.container import dict_rename_many -from lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_rename_many +from scripts.lib.fs import find_files, json_read, json_write def rename_default_gene(pathogen): diff --git a/scripts/migrate_003_gene_to_cds.py b/migrations/migrate_003_gene_to_cds.py similarity index 92% rename from scripts/migrate_003_gene_to_cds.py rename to migrations/migrate_003_gene_to_cds.py index 0b8637d8f..88dea779f 100644 --- a/scripts/migrate_003_gene_to_cds.py +++ b/migrations/migrate_003_gene_to_cds.py @@ -1,5 +1,5 @@ -from lib.container import dict_rename_many, dict_get -from lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_rename_many, dict_get +from scripts.lib.fs import find_files, json_read, json_write def rename_gene_to_cds(pathogen): diff --git a/scripts/migrate_004_remove_versions.py b/migrations/migrate_004_remove_versions.py similarity index 71% rename from scripts/migrate_004_remove_versions.py rename to migrations/migrate_004_remove_versions.py index 0f37cb6a7..1314e88fa 100644 --- a/scripts/migrate_004_remove_versions.py +++ b/migrations/migrate_004_remove_versions.py @@ -1,5 +1,5 @@ -from lib.container import dict_remove -from lib.fs import find_files, json_read, json_write +from scripts.lib.container import dict_remove +from scripts.lib.fs import find_files, json_read, json_write def main(): diff --git a/scripts/migrate_005_create_zips.py b/migrations/migrate_005_create_zips.py similarity index 89% rename from scripts/migrate_005_create_zips.py rename to migrations/migrate_005_create_zips.py index 470a44497..cf3392a60 100644 --- a/scripts/migrate_005_create_zips.py +++ b/migrations/migrate_005_create_zips.py @@ -1,7 +1,7 @@ from os.path import join, dirname from shutil import move -from lib.fs import find_files, make_zip, rmrf +from scripts.lib.fs import find_files, make_zip, rmrf tmp_dir = "data_temp/" diff --git 
a/scripts/migrate_006_add_rel_muts.py b/migrations/migrate_006_add_rel_muts.py similarity index 98% rename from scripts/migrate_006_add_rel_muts.py rename to migrations/migrate_006_add_rel_muts.py index 8e1410758..f22624b9b 100755 --- a/scripts/migrate_006_add_rel_muts.py +++ b/migrations/migrate_006_add_rel_muts.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 from os.path import dirname, join -from lib.changelog import changelog_get_unreleased_section -from lib.container import dict_set -from lib.fs import find_files, json_read, json_write, file_read, file_write +from scripts.lib.changelog import changelog_get_unreleased_section +from scripts.lib.container import dict_set +from scripts.lib.fs import find_files, json_read, json_write, file_read, file_write """ Replace .meta.extensions.nextclade.ref_nodes" in all tree.json files matching a subdirectory recursively diff --git a/scripts/add_schema_to_pathogen_json.py b/migrations/migrate_007_add_schema.py similarity index 100% rename from scripts/add_schema_to_pathogen_json.py rename to migrations/migrate_007_add_schema.py diff --git a/migrations/migrate_008_move_mut_labels.py b/migrations/migrate_008_move_mut_labels.py new file mode 100644 index 000000000..88be26fd4 --- /dev/null +++ b/migrations/migrate_008_move_mut_labels.py @@ -0,0 +1,40 @@ +""" +Move misplaced mutation label maps from root level into 'mutLabels'. 
+ +- 'nucMutLabelMap' at root -> 'mutLabels.nucMutLabelMap' +- 'mutLabelMap' at root -> contents moved into 'mutLabels' +""" + +from scripts.lib.fs import find_files, json_read, json_write + + +def move_mut_labels(pathogen: dict) -> dict: + mut_labels = pathogen.setdefault("mutLabels", {}) + + if "nucMutLabelMap" in pathogen: + value = pathogen.pop("nucMutLabelMap") + if value and value != {}: + mut_labels.setdefault("nucMutLabelMap", value) + + if "mutLabelMap" in pathogen: + value = pathogen.pop("mutLabelMap") + if isinstance(value, dict): + for sub_key, sub_value in value.items(): + if sub_value and sub_value != {}: + mut_labels.setdefault(sub_key, sub_value) + + if mut_labels == {}: + pathogen.pop("mutLabels", None) + + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = move_mut_labels(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_009_remove_nuc_mut_label_map_reverse.py b/migrations/migrate_009_remove_nuc_mut_label_map_reverse.py new file mode 100644 index 000000000..056e95913 --- /dev/null +++ b/migrations/migrate_009_remove_nuc_mut_label_map_reverse.py @@ -0,0 +1,29 @@ +""" +Remove legacy v2 'nucMutLabelMapReverse' field. + +Not used by Nextclade v3 - the reverse map is computed at runtime. +Removes from both root level and 'mutLabels' section. 
+""" + +from scripts.lib.fs import find_files, json_read, json_write + + +def remove_nuc_mut_label_map_reverse(pathogen: dict) -> dict: + pathogen.pop("nucMutLabelMapReverse", None) + + mut_labels = pathogen.get("mutLabels") + if isinstance(mut_labels, dict): + mut_labels.pop("nucMutLabelMapReverse", None) + + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = remove_nuc_mut_label_map_reverse(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_010_rename_ignore_frame_shifts.py b/migrations/migrate_010_rename_ignore_frame_shifts.py new file mode 100644 index 000000000..83acd962f --- /dev/null +++ b/migrations/migrate_010_rename_ignore_frame_shifts.py @@ -0,0 +1,28 @@ +""" +Rename 'qc.frameShifts.ignoreFrameShifts' to 'ignoredFrameShifts'. + +The old name is silently ignored, so listed frame shifts are not +being excluded from QC scoring. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def rename_ignore_frame_shifts(pathogen: dict) -> dict: + frame_shifts = dict_get(pathogen, ["qc", "frameShifts"]) + if isinstance(frame_shifts, dict) and "ignoreFrameShifts" in frame_shifts: + value = frame_shifts.pop("ignoreFrameShifts") + frame_shifts["ignoredFrameShifts"] = value + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = rename_ignore_frame_shifts(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_011_remove_qc_score_weight.py b/migrations/migrate_011_remove_qc_score_weight.py new file mode 100644 index 000000000..721b51aab --- /dev/null +++ b/migrations/migrate_011_remove_qc_score_weight.py @@ -0,0 +1,29 @@ +""" +Remove nonexistent 'scoreWeight' from 'qc.missingData' and 'qc.mixedSites'. + +These QC rules have no 'scoreWeight' parameter in Nextclade. +Sensitivity is controlled via 'missingDataThreshold'/'scoreBias' and +'mixedSitesThreshold' respectively. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def remove_qc_score_weight(pathogen: dict) -> dict: + for rule_name in ("missingData", "mixedSites"): + rule = dict_get(pathogen, ["qc", rule_name]) + if isinstance(rule, dict): + rule.pop("scoreWeight", None) + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = remove_qc_score_weight(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_012_remove_qc_divergence.py b/migrations/migrate_012_remove_qc_divergence.py new file mode 100644 index 000000000..fec747f78 --- /dev/null +++ b/migrations/migrate_012_remove_qc_divergence.py @@ -0,0 +1,26 @@ +""" +Remove nonexistent 'qc.divergence' rule. + +Divergence is computed at runtime, not a configurable QC rule. +""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def remove_qc_divergence(pathogen: dict) -> dict: + qc = dict_get(pathogen, ["qc"]) + if isinstance(qc, dict): + qc.pop("divergence", None) + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = remove_qc_divergence(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_013_rename_gene_order_preference.py b/migrations/migrate_013_rename_gene_order_preference.py new file mode 100644 index 000000000..90f1a15c0 --- /dev/null +++ b/migrations/migrate_013_rename_gene_order_preference.py @@ -0,0 +1,27 @@ +""" +Rename 'geneOrderPreference' to 'cdsOrderPreference'. + +The old name is silently ignored, so CDS display order falls back +to default. 
+""" + +from scripts.lib.fs import find_files, json_read, json_write + + +def rename_gene_order_preference(pathogen: dict) -> dict: + if "geneOrderPreference" in pathogen: + value = pathogen.pop("geneOrderPreference") + if value and "cdsOrderPreference" not in pathogen: + pathogen["cdsOrderPreference"] = value + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = rename_gene_order_preference(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_014_move_general_params.py b/migrations/migrate_014_move_general_params.py new file mode 100644 index 000000000..430de8028 --- /dev/null +++ b/migrations/migrate_014_move_general_params.py @@ -0,0 +1,39 @@ +""" +Move 'includeReference' and 'includeNearestNodeInfo' from +'alignmentParams' to 'generalParams'. + +These control output behavior, not the alignment algorithm. +Under 'alignmentParams' they are silently ignored. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def move_general_params(pathogen: dict) -> dict: + alignment_params = dict_get(pathogen, ["alignmentParams"]) + if not isinstance(alignment_params, dict): + return pathogen + + general_params = pathogen.setdefault("generalParams", {}) + + for param in ("includeReference", "includeNearestNodeInfo"): + if param in alignment_params: + value = alignment_params.pop(param) + general_params.setdefault(param, value) + + if alignment_params == {}: + pathogen.pop("alignmentParams") + + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = move_general_params(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_015_fix_excess_bandwidth_typo.py b/migrations/migrate_015_fix_excess_bandwidth_typo.py new file mode 100644 index 000000000..ed6a51bc2 --- /dev/null +++ b/migrations/migrate_015_fix_excess_bandwidth_typo.py @@ -0,0 +1,28 @@ +""" +Rename 'alignmentParams.excessBandwith' to 'alignmentParams.excessBandwidth'. + +The misspelled key is silently ignored, so the default bandwidth (9) +is used instead of the intended value. 
+""" + +from scripts.lib.container import dict_get +from scripts.lib.fs import find_files, json_read, json_write + + +def fix_excess_bandwidth_typo(pathogen: dict) -> dict: + alignment_params = dict_get(pathogen, ["alignmentParams"]) + if isinstance(alignment_params, dict) and "excessBandwith" in alignment_params: + value = alignment_params.pop("excessBandwith") + alignment_params.setdefault("excessBandwidth", value) + return pathogen + + +def main(): + for file in find_files("pathogen.json", here="data/"): + pathogen = json_read(file) + pathogen = fix_excess_bandwidth_typo(pathogen) + json_write(pathogen, file, no_sort_keys=True) + + +if __name__ == '__main__': + main() diff --git a/migrations/migrate_016_fix_misplaced_placement_mask_ranges.py b/migrations/migrate_016_fix_misplaced_placement_mask_ranges.py new file mode 100644 index 000000000..6bb7e3684 --- /dev/null +++ b/migrations/migrate_016_fix_misplaced_placement_mask_ranges.py @@ -0,0 +1,69 @@ +""" +Move 'placementMaskRanges' from pathogen.json to tree.json. + +This field belongs in tree.json at '.meta.extensions.nextclade.placement_mask_ranges'. +In pathogen.json it is silently ignored, so placement masking is not applied. + +One-time fix for existing datasets. Dataset authors must also update their +upstream build pipelines to place this field in tree.json directly, otherwise +the next dataset update will re-introduce the defect. 
+""" + +import json +from os.path import dirname, isfile, join + +from scripts.lib.fs import file_read, file_write, find_files, json_read, json_write + + +def move_placement_mask_ranges(pathogen: dict, tree: dict) -> tuple[dict, dict]: + ranges = pathogen.pop("placementMaskRanges", None) + if ranges is None: + return pathogen, tree + + meta = tree.setdefault("meta", {}) + extensions = meta.setdefault("extensions", {}) + nextclade = extensions.setdefault("nextclade", {}) + nextclade.setdefault("placement_mask_ranges", ranges) + + return pathogen, tree + + +def _detect_indent(text: str) -> int | None: + """Return indent width if pretty-printed, None if compact.""" + for line in text.split("\n")[1:6]: + stripped = line.lstrip(" ") + if stripped and len(line) != len(stripped): + return len(line) - len(stripped) + return None + + +def _json_write_preserving_format(obj: dict, filepath: str, original_text: str) -> None: + indent = _detect_indent(original_text) + content = json.dumps(obj, indent=indent, ensure_ascii=False, sort_keys=False) + if indent is not None: + content += "\n" + file_write(content, filepath) + + +def main(): + for pathogen_file in find_files("pathogen.json", here="data/"): + pathogen = json_read(pathogen_file) + if "placementMaskRanges" not in pathogen: + continue + + tree_file = join(dirname(pathogen_file), "tree.json") + if not isfile(tree_file): + continue + + tree_text = file_read(tree_file) + tree = json.loads(tree_text) + if tree.get("meta", {}).get("extensions", {}).get("nextclade", {}).get("placement_mask_ranges"): + continue + + pathogen, tree = move_placement_mask_ranges(pathogen, tree) + json_write(pathogen, pathogen_file, no_sort_keys=True) + _json_write_preserving_format(tree, tree_file, tree_text) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt index 5ef9e88eb..cc93523d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ biopython +jsonschema numpy +rapidfuzz>=3.0 repro-zipfile 
diff --git a/scripts/lib/container.py b/scripts/lib/container.py index db4292163..398428753 100644 --- a/scripts/lib/container.py +++ b/scripts/lib/container.py @@ -1,10 +1,51 @@ from collections import namedtuple, Counter -from functools import reduce -from typing import List, Iterable, TypeVar, Callable, Union, Dict, Any, Hashable, Optional, Sequence +from dataclasses import dataclass +from typing import List, Iterable, TypeVar, Callable, Union, Dict, Any, Hashable, Optional, Sequence, Iterator T = TypeVar('T') +@dataclass(frozen=True) +class JsonPath: + """Path-tracking wrapper for JSON traversal with automatic error context.""" + data: Any + path: str = "" + + def get(self, key: str) -> "JsonPath": + new_path = f"{self.path}.{key}" if self.path else key + if self.data is None: + return JsonPath(None, new_path) + if not isinstance(self.data, dict): + raise TypeError(f"expected dict at '{self.path or '(root)'}' while accessing '{key}', got {type(self.data).__name__}") + return JsonPath(self.data.get(key), new_path) + + def items(self) -> Iterator[tuple[str, "JsonPath"]]: + if self.data is None: + return + if not isinstance(self.data, dict): + raise TypeError(f"expected dict at '{self.path or '(root)'}', got {type(self.data).__name__}") + for k, v in self.data.items(): + yield k, JsonPath(v, f"{self.path}.{k}" if self.path else k) + + def __iter__(self) -> Iterator["JsonPath"]: + if self.data is None: + return + if not isinstance(self.data, list): + raise TypeError(f"expected list at '{self.path or '(root)'}', got {type(self.data).__name__}") + for i, v in enumerate(self.data): + yield JsonPath(v, f"{self.path}[{i}]") + + def __bool__(self) -> bool: + return self.data is not None and bool(self.data) + + @property + def val(self) -> Any: + return self.data + + def or_default(self, default: T) -> T: + return self.data if self.data is not None else default + + def is_iterable(obj): return issubclass(type(obj), Iterable) @@ -14,13 +55,34 @@ def 
dict_to_namedtuple(name: str, dic: dict): def dict_set(obj: dict, key_path: List[str], value): - for key in key_path[:-1]: + for i, key in enumerate(key_path[:-1]): + _assert_dict(obj, key_path, i) obj = obj.setdefault(key, {}) + _assert_dict(obj, key_path, len(key_path) - 1) obj[key_path[-1]] = value def dict_get(obj: dict, keys: List[str]): - return reduce(lambda d, key: d.get(key) if d else None, keys, obj) + current = obj + for i, key in enumerate(keys): + if current is None: + return None + _assert_dict(current, keys, i) + current = current.get(key) + return current + + +def _assert_dict(obj: Any, keys: List[str], index: int): + if isinstance(obj, dict): + return + try: + traversed = '.'.join(str(k) for k in keys[:index]) if index > 0 else None + remaining = '.'.join(str(k) for k in keys[index:]) or '?' + location = f"after '{traversed}'" if traversed else "at root" + type_name = type(obj).__name__ + except Exception: + location, remaining, type_name = "at unknown", "?", "?" + raise TypeError(f"expected dict {location} while accessing '{remaining}', got {type_name}") def dict_get_required(obj: dict, keys: List[str]): diff --git a/scripts/lib/logger.py b/scripts/lib/logger.py index 0592bba11..b9c108611 100644 --- a/scripts/lib/logger.py +++ b/scripts/lib/logger.py @@ -1,4 +1,63 @@ import logging +import os +import re +import sys +from pathlib import Path -logging.basicConfig(level=logging.INFO) -l = logging.getLogger(" ") +PROJECT_ROOT_STR = str(Path(__file__).resolve().parent.parent.parent) + + +def _colors_enabled() -> bool: + if os.environ.get("NO_COLOR"): + return False + if os.environ.get("GITHUB_ACTIONS"): + return True + return sys.stderr.isatty() + + +class _ColorFormatter(logging.Formatter): + GREY = "\033[90m" + RESET = "\033[0m" + LEVEL_COLORS = { + logging.DEBUG: "\033[36m", # cyan + logging.INFO: "\033[32m", # green + logging.WARNING: "\033[33m", # yellow + logging.ERROR: "\033[31m", # red + logging.CRITICAL: "\033[35m", # magenta + } + + def 
__init__(self, use_colors: bool): + super().__init__() + self._use_colors = use_colors + # Match absolute paths, optionally followed by :line or :line:col + self._path_pattern = re.compile( + re.escape(PROJECT_ROOT_STR) + r"/([^\s:]+(?::\d+(?::\d+)?)?)" + ) + + def _transform_paths(self, message: str) -> str: + if self._use_colors: + return self._path_pattern.sub( + lambda m: f"{self.GREY}{m.group(1)}{self.RESET}", message + ) + return self._path_pattern.sub(r"\1", message) + + def format(self, record: logging.LogRecord) -> str: + message = self._transform_paths(record.getMessage()) + if self._use_colors: + color = self.LEVEL_COLORS.get(record.levelno, "") + return f"{color}{record.levelname}{self.RESET}: {message}" + return f"{record.levelname}: {message}" + + +def _setup_logger() -> logging.Logger: + logger = logging.getLogger("rebuild") + logger.setLevel(logging.INFO) + + handler = logging.StreamHandler(sys.stderr) + handler.setFormatter(_ColorFormatter(_colors_enabled())) + logger.addHandler(handler) + + return logger + + +l = _setup_logger() diff --git a/scripts/lib/schema.py b/scripts/lib/schema.py new file mode 100644 index 000000000..1c04c9dc3 --- /dev/null +++ b/scripts/lib/schema.py @@ -0,0 +1,596 @@ +import json +import os +import re +import urllib.request +from collections import defaultdict +from dataclasses import dataclass, field +from enum import Enum, auto +from functools import lru_cache +from pathlib import Path +from typing import Any +from urllib.error import HTTPError + +from jsonschema import Draft7Validator, ValidationError +from rapidfuzz import fuzz, process + +from .logger import l +from .process import run + + +NEXTCLADE_REPO = "nextstrain/nextclade" +SCHEMA_FILENAME = "input-pathogen-json.schema.json" +SCHEMA_PATH = f"packages/nextclade-schemas/{SCHEMA_FILENAME}" +SCHEMA_URL_TEMPLATE = "https://raw.githubusercontent.com/{repo}/refs/heads/{branch}/{path}" +SCHEMA_DOCS_URL = 
"https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html" + +FUZZY_MATCH_THRESHOLD = 80.0 + + +def validate_pathogen_json( + data: Any, + filepath: str, + ctx: "ValidationContext", + schemas_dir: Path | None = None, + dataset_path: str | None = None, +) -> None: + if filepath in ctx.reports: + return + + schema = fetch_schema(schemas_dir) + validator = Draft7Validator(schema) + errors = [] + for error in validator.iter_errors(data): + path = '.'.join(str(p) for p in error.absolute_path) or '(root)' + errors.append(f" {path}: {error.message}") + if errors: + raise ValidationError(f"Schema validation failed for '{filepath}':\n" + '\n'.join(errors)) + + report = DefectReport( + filepath=filepath, + dataset_path=dataset_path or _extract_dataset_path(filepath), + ) + report.defects = _check_known_defects(data, report.infer_upstream_repo()) + ctx.reports[filepath] = report + + known_defect_paths = {d.json_path for d in report.defects} + + schema_index = _build_schema_index(schema) + for message, json_path in _find_unknown_properties(data, schema, schema_index): + if json_path not in known_defect_paths: + _emit_ci_warning(filepath, message, json_path) + + for defect in report.defects: + _emit_defect(filepath, defect, defect.json_path) + + +def fetch_schema(schemas_dir: Path | None = None) -> dict: + if schemas_dir is not None: + return _load_local_schema(schemas_dir) + return _fetch_remote_schema() + + +def print_defect_summary(ctx: "ValidationContext") -> None: + reports = [r for r in ctx.reports.values() if r.defects] + if not reports: + return + + datasets: dict[str, list[DefectReport]] = {} + for r in reports: + datasets.setdefault(r.dataset_path, []).append(r) + + error_count = sum(1 for r in reports if r.has_errors) + warning_count = sum(1 for r in reports if r.has_warnings) + + print("\n" + "=" * 60) + print("DATASET DEFECTS SUMMARY") + print("=" * 60) + print(f"\n{len(datasets)} dataset(s), {error_count} error(s), {warning_count} warning(s)\n") 
+ + for dataset_path in sorted(datasets.keys()): + dataset_reports = datasets[dataset_path] + upstream = dataset_reports[0].infer_upstream_repo() + upstream_hint = f" [upstream: {upstream}]" if upstream else "" + + has_errors = any(r.has_errors for r in dataset_reports) + has_warnings = any(r.has_warnings for r in dataset_reports) + marker = "ERROR" if has_errors else "WARNING" if has_warnings else "INFO" + + print(f"[{marker}] {dataset_path}{upstream_hint}") + + for r in sorted(dataset_reports, key=lambda x: x.filepath): + filename = Path(r.filepath).name + print(f" {filename}:") + for d in sorted(r.defects, key=lambda x: (x.severity.value, x.json_path)): + severity_tag = d.severity.name + print(f" [{severity_tag}] {d.problem}") + + print() + + print("-" * 60) + print("Run migration scripts to fix locally.") + print("Notify upstream maintainers to prevent defect recurrence.") + print(f"Docs: {SCHEMA_DOCS_URL}") + print("=" * 60 + "\n") + + +class Severity(Enum): + ERROR = auto() + WARNING = auto() + INFO = auto() + + +@dataclass(frozen=True) +class Defect: + severity: Severity + problem: str + impact: str + migration: str + json_path: str + upstream_fix: str | None = None + + def format_message(self) -> str: + lines = [f"{self.problem}. 
{self.impact}.", f"Run {self.migration} to fix locally."] + if self.upstream_fix: + lines.append(self.upstream_fix + ".") + return " ".join(lines) + + +@dataclass +class DefectReport: + filepath: str + dataset_path: str + defects: list[Defect] = field(default_factory=list) + + @property + def has_errors(self) -> bool: + return any(d.severity == Severity.ERROR for d in self.defects) + + @property + def has_warnings(self) -> bool: + return any(d.severity == Severity.WARNING for d in self.defects) + + def infer_upstream_repo(self) -> str | None: + parts = self.dataset_path.split("/") + if len(parts) >= 2: + collection, org = parts[0], parts[1] + if collection == "nextstrain": + return f"nextstrain/{org}" + if collection == "community": + return f"{org}/{parts[2]}" if len(parts) >= 3 else org + return None + + +@dataclass +class ValidationContext: + reports: dict[str, DefectReport] = field(default_factory=dict) + + def get_reports(self) -> list[DefectReport]: + return list(self.reports.values()) + + +@dataclass(frozen=True) +class SchemaIndex: + path_to_props: dict[str, frozenset[str]] + prop_to_paths: dict[str, tuple[str, ...]] + + +def get_current_branch() -> str: + branch = run("git rev-parse --abbrev-ref HEAD", error_if_empty=True) + assert branch is not None + return branch + + +def remote_branch_exists(repo: str, branch: str) -> bool: + url = f"https://api.github.com/repos/{repo}/branches/{branch}" + req = urllib.request.Request(url, method="HEAD") + try: + with urllib.request.urlopen(req, timeout=10) as response: + return response.status == 200 + except HTTPError: + return False + + +@lru_cache(maxsize=1) +def get_schema_branch() -> str: + current = get_current_branch() + if remote_branch_exists(NEXTCLADE_REPO, current): + l.info(f"Using schema from '{NEXTCLADE_REPO}' branch '{current}'") + return current + l.info(f"Branch '{current}' not found in '{NEXTCLADE_REPO}', using 'master'") + return "master" + + +def _load_local_schema(schemas_dir: Path) -> dict: + 
schema_path = schemas_dir / SCHEMA_FILENAME + with schema_path.open() as f: + return json.load(f) + + +@lru_cache(maxsize=1) +def _fetch_remote_schema() -> dict: + branch = get_schema_branch() + url = SCHEMA_URL_TEMPLATE.format(repo=NEXTCLADE_REPO, branch=branch, path=SCHEMA_PATH) + l.info(f"Fetching schema from {url}") + with urllib.request.urlopen(url, timeout=30) as response: + return json.loads(response.read().decode('utf-8')) + + +def _extract_dataset_path(filepath: str) -> str: + parts = Path(filepath).parts + try: + data_idx = parts.index("data") + return "/".join(parts[data_idx + 1 : -1]) + except ValueError: + return filepath + + +def _build_schema_index(schema: dict) -> SchemaIndex: + path_to_props: dict[str, set[str]] = defaultdict(set) + prop_to_paths: dict[str, list[str]] = defaultdict(list) + + def resolve_ref(ref: str) -> dict | None: + if not ref.startswith("#/"): + return None + parts = ref[2:].split("/") + node = schema + for part in parts: + if isinstance(node, dict) and part in node: + node = node[part] + else: + return None + return node if isinstance(node, dict) else None + + def traverse(node: Any, current_path: str, visited: frozenset[int] | None = None) -> None: + if not isinstance(node, dict): + return + + node_id = id(node) + visited = visited or frozenset() + if node_id in visited: + return + visited = visited | {node_id} + + if "$ref" in node: + ref_schema = resolve_ref(node["$ref"]) + if ref_schema: + traverse(ref_schema, current_path, visited) + return + + if "properties" in node: + props = node["properties"] + path_to_props[current_path].update(props.keys()) + for prop_name, prop_schema in props.items(): + full_path = f"{current_path}.{prop_name}" if current_path else prop_name + prop_to_paths[prop_name].append(full_path) + traverse(prop_schema, full_path, visited) + + for keyword in ("allOf", "anyOf", "oneOf"): + if keyword in node: + for item in node[keyword]: + traverse(item, current_path, visited) + + if 
isinstance(node.get("additionalProperties"), dict): + traverse(node["additionalProperties"], current_path, visited) + + if isinstance(node.get("items"), dict): + traverse(node["items"], current_path, visited) + + traverse(schema, "") + + return SchemaIndex( + path_to_props={k: frozenset(v) for k, v in path_to_props.items()}, + prop_to_paths={k: tuple(v) for k, v in prop_to_paths.items()}, + ) + + +def _suggest_correction( + unknown_prop: str, + parent_path: str, + schema_index: SchemaIndex, +) -> str | None: + valid_at_level = schema_index.path_to_props.get(parent_path, frozenset()) + + if valid_at_level: + match = process.extractOne( + unknown_prop, + valid_at_level, + scorer=fuzz.ratio, + score_cutoff=FUZZY_MATCH_THRESHOLD, + ) + if match: + suggested_prop = match[0] + full_suggestion = f"{parent_path}.{suggested_prop}" if parent_path else suggested_prop + return f"did you mean '{full_suggestion}'?" + + if unknown_prop in schema_index.prop_to_paths: + valid_locations = schema_index.prop_to_paths[unknown_prop] + current_full = f"{parent_path}.{unknown_prop}" if parent_path else unknown_prop + other_locations = [loc for loc in valid_locations if loc != current_full] + if other_locations: + if len(other_locations) == 1: + return f"belongs at '{other_locations[0]}'" + return f"belongs at one of: {', '.join(repr(loc) for loc in other_locations)}" + + all_props = set(schema_index.prop_to_paths.keys()) + if all_props: + match = process.extractOne( + unknown_prop, + all_props, + scorer=fuzz.ratio, + score_cutoff=FUZZY_MATCH_THRESHOLD, + ) + if match: + suggested_prop = match[0] + valid_locations = schema_index.prop_to_paths[suggested_prop] + if len(valid_locations) == 1: + return f"did you mean '{valid_locations[0]}'?" + return f"did you mean '{suggested_prop}'? 
Valid locations: {', '.join(repr(loc) for loc in valid_locations)}" + + return None + + +def _find_unknown_properties( + data: Any, + schema: dict, + schema_index: SchemaIndex, +) -> list[tuple[str, str]]: + strict_schema = _make_strict_schema(schema) + validator = Draft7Validator(strict_schema) + return _collect_unknown_property_warnings(validator.iter_errors(data), schema_index) + + +def _collect_unknown_property_warnings( + errors: Any, + schema_index: SchemaIndex, +) -> list[tuple[str, str]]: + warnings: list[tuple[str, str]] = [] + for error in errors: + if error.validator == "additionalProperties": + path_parts = [str(p) for p in error.absolute_path] + parent_path = '.'.join(path_parts) + extras = [e for e in error.message.split("'")[1::2] if e != "$schema"] + for extra in extras: + full_path = f"{parent_path}.{extra}" if parent_path else extra + suggestion = _suggest_correction(extra, parent_path, schema_index) + message = f"Unknown property '{full_path}' - {suggestion}" if suggestion else f"Unknown property '{full_path}'" + warnings.append((message, full_path)) + if error.context: + warnings.extend(_collect_unknown_property_warnings(error.context, schema_index)) + return warnings + + +def _make_strict_schema(schema: dict) -> dict: + import copy + schema = copy.deepcopy(schema) + _add_strict_recursive(schema) + return schema + + +def _add_strict_recursive(node: Any) -> None: + if isinstance(node, dict): + if node.get("type") == "object" and "properties" in node: + node.setdefault("additionalProperties", False) + for v in node.values(): + _add_strict_recursive(v) + elif isinstance(node, list): + for item in node: + _add_strict_recursive(item) + + +def _find_line_number(filepath: str, json_path: str) -> int | None: + if not json_path: + return None + key = json_path.split(".")[-1] + pattern = re.compile(rf'^\s*"{re.escape(key)}"\s*:') + try: + with open(filepath, encoding="utf-8") as f: + for lineno, line in enumerate(f, start=1): + if pattern.match(line): + 
return lineno + except OSError: + pass + return None + + +def _format_location(filepath: str, lineno: int | None) -> str: + return f"{filepath}:{lineno}" if lineno else filepath + + +def _emit_ci_warning(filepath: str, message: str, json_path: str | None = None) -> None: + lineno = _find_line_number(filepath, json_path) if json_path else None + location = _format_location(filepath, lineno) + if os.environ.get("GITHUB_ACTIONS"): + line_part = f",line={lineno}" if lineno else "" + print(f"::warning file={filepath}{line_part}::{message}") + else: + l.warning(f"{location}: {message}") + + +def _emit_defect(filepath: str, defect: Defect, json_path: str | None = None) -> None: + lineno = _find_line_number(filepath, json_path) if json_path else None + location = _format_location(filepath, lineno) + message = defect.format_message() + if os.environ.get("GITHUB_ACTIONS"): + level = "error" if defect.severity == Severity.ERROR else "warning" + line_part = f",line={lineno}" if lineno else "" + print(f"::{level} file={filepath}{line_part}::{message}") + else: + log_fn = l.error if defect.severity == Severity.ERROR else l.warning + log_fn(f"{location}: {message}") + + +def _upstream_hint(repo: str | None) -> str: + if repo: + return f"Fix in upstream pipeline ({repo}) to prevent recurrence" + return "Fix in upstream build pipeline to prevent recurrence" + + +def _check_known_defects(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + defects.extend(_check_misplaced_mut_labels(data, upstream_repo)) + defects.extend(_check_qc_defects(data, upstream_repo)) + defects.extend(_check_misplaced_properties(data, upstream_repo)) + defects.extend(_check_alignment_param_typos(data, upstream_repo)) + return defects + + +def _check_misplaced_mut_labels(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + + if "nucMutLabelMap" in data: + defects.append(Defect( + severity=Severity.ERROR, + problem="Misplaced 'nucMutLabelMap' 
at root level", + impact="Nucleotide mutation labels not applied to results", + migration="migrations/migrate_008_move_mut_labels.py", + json_path="nucMutLabelMap", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "nucMutLabelMapReverse" in data: + defects.append(Defect( + severity=Severity.INFO, + problem="Legacy v2 field 'nucMutLabelMapReverse' at root level", + impact="No effect (reverse map computed at runtime in v3)", + migration="migrations/migrate_009_remove_nuc_mut_label_map_reverse.py", + json_path="nucMutLabelMapReverse", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "mutLabelMap" in data: + defects.append(Defect( + severity=Severity.ERROR, + problem="Misplaced 'mutLabelMap' at root level", + impact="Mutation labels not applied to results", + migration="migrations/migrate_008_move_mut_labels.py", + json_path="mutLabelMap", + upstream_fix=_upstream_hint(upstream_repo), + )) + + mut_labels = data.get("mutLabels") + if isinstance(mut_labels, dict) and "nucMutLabelMapReverse" in mut_labels: + defects.append(Defect( + severity=Severity.INFO, + problem="Legacy v2 field 'mutLabels.nucMutLabelMapReverse'", + impact="No effect (reverse map computed at runtime in v3)", + migration="migrations/migrate_009_remove_nuc_mut_label_map_reverse.py", + json_path="mutLabels.nucMutLabelMapReverse", + upstream_fix=_upstream_hint(upstream_repo), + )) + + return defects + + +def _check_qc_defects(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + qc = data.get("qc") + if not isinstance(qc, dict): + return defects + + if "divergence" in qc: + defects.append(Defect( + severity=Severity.INFO, + problem="Unknown QC rule 'qc.divergence'", + impact="No effect (divergence computed at runtime, not a QC rule)", + migration="migrations/migrate_012_remove_qc_divergence.py", + json_path="qc.divergence", + upstream_fix=_upstream_hint(upstream_repo), + )) + + missing_data = qc.get("missingData") + if isinstance(missing_data, dict) and 
"scoreWeight" in missing_data: + defects.append(Defect( + severity=Severity.INFO, + problem="Unknown field 'qc.missingData.scoreWeight'", + impact="No effect (use 'missingDataThreshold' and 'scoreBias' instead)", + migration="migrations/migrate_011_remove_qc_score_weight.py", + json_path="qc.missingData.scoreWeight", + upstream_fix=_upstream_hint(upstream_repo), + )) + + mixed_sites = qc.get("mixedSites") + if isinstance(mixed_sites, dict) and "scoreWeight" in mixed_sites: + defects.append(Defect( + severity=Severity.INFO, + problem="Unknown field 'qc.mixedSites.scoreWeight'", + impact="No effect (use 'mixedSitesThreshold' instead)", + migration="migrations/migrate_011_remove_qc_score_weight.py", + json_path="qc.mixedSites.scoreWeight", + upstream_fix=_upstream_hint(upstream_repo), + )) + + frame_shifts = qc.get("frameShifts") + if isinstance(frame_shifts, dict) and "ignoreFrameShifts" in frame_shifts: + defects.append(Defect( + severity=Severity.ERROR, + problem="Field 'qc.frameShifts.ignoreFrameShifts' was renamed to 'ignoredFrameShifts'", + impact="Frame shift exclusions not applied", + migration="migrations/migrate_010_rename_ignore_frame_shifts.py", + json_path="qc.frameShifts.ignoreFrameShifts", + upstream_fix=_upstream_hint(upstream_repo), + )) + + return defects + + +def _check_misplaced_properties(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + + if "geneOrderPreference" in data: + defects.append(Defect( + severity=Severity.WARNING, + problem="Field 'geneOrderPreference' was renamed to 'cdsOrderPreference'", + impact="CDS display order uses default", + migration="migrations/migrate_013_rename_gene_order_preference.py", + json_path="geneOrderPreference", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "placementMaskRanges" in data: + defects.append(Defect( + severity=Severity.ERROR, + problem="Misplaced 'placementMaskRanges' in pathogen.json", + impact="Placement masking not applied (belongs in tree.json at 
.meta.extensions.nextclade.placement_mask_ranges)", + migration="migrations/migrate_016_fix_misplaced_placement_mask_ranges.py", + json_path="placementMaskRanges", + upstream_fix=_upstream_hint(upstream_repo), + )) + + alignment_params = data.get("alignmentParams") + if isinstance(alignment_params, dict): + if "includeReference" in alignment_params: + defects.append(Defect( + severity=Severity.WARNING, + problem="Misplaced 'alignmentParams.includeReference'", + impact="Setting ignored (move to 'generalParams.includeReference')", + migration="migrations/migrate_014_move_general_params.py", + json_path="alignmentParams.includeReference", + upstream_fix=_upstream_hint(upstream_repo), + )) + + if "includeNearestNodeInfo" in alignment_params: + defects.append(Defect( + severity=Severity.WARNING, + problem="Misplaced 'alignmentParams.includeNearestNodeInfo'", + impact="Setting ignored (move to 'generalParams.includeNearestNodeInfo')", + migration="migrations/migrate_014_move_general_params.py", + json_path="alignmentParams.includeNearestNodeInfo", + upstream_fix=_upstream_hint(upstream_repo), + )) + + return defects + + +def _check_alignment_param_typos(data: dict, upstream_repo: str | None) -> list[Defect]: + defects: list[Defect] = [] + alignment_params = data.get("alignmentParams") + if isinstance(alignment_params, dict) and "excessBandwith" in alignment_params: + defects.append(Defect( + severity=Severity.WARNING, + problem="Typo 'alignmentParams.excessBandwith' (missing 'd')", + impact="Default bandwidth (9) used instead (rename to 'excessBandwidth')", + migration="migrations/migrate_015_fix_excess_bandwidth_typo.py", + json_path="alignmentParams.excessBandwith", + upstream_fix=_upstream_hint(upstream_repo), + )) + return defects diff --git a/scripts/rebuild b/scripts/rebuild index b35953a62..d8e27da5f 100755 --- a/scripts/rebuild +++ b/scripts/rebuild @@ -10,10 +10,12 @@ from collections import defaultdict from copy import deepcopy from os import getcwd from 
os.path import dirname, realpath, join, relpath, isfile +from pathlib import Path from lib.changelog import changelog_prepare, changelog_get_unreleased_section from lib.container import dict_get, dict_get_required, find_index_by, first, format_list, \ - dict_remove_many, find_duplicates, dict_cleanup, find, unique + dict_remove_many, find_duplicates, dict_cleanup, find, unique, JsonPath +from lib.schema import validate_pathogen_json, print_defect_summary, ValidationContext from lib.date import now_iso, iso_to_iso_safe from lib.fasta import fasta_read_exactly_one_seq from lib.fs import json_read, find_files, json_write, copy, make_zip, file_write, rmrf @@ -60,8 +62,8 @@ def get_dataset_capabilities(pathogen_json: dict, dataset_dir: str): other.append("aaMotifs") qc = [] - for k, q in (dict_get(pathogen_json, ["qc"]) or {}).items(): - if dict_get(q, ["enabled"]): + for k, q in JsonPath(pathogen_json, "pathogen_json").get("qc").items(): + if q.get("enabled").val: qc.append(k) custom_clades = dict_cleanup({attr: len(values) for attr, values in custom_clades.items() if len(values) > 0}) @@ -123,8 +125,9 @@ def dataset_get_versions(dataset): return versions, last_version -def index_one_dataset(args, pathogen_json_path: str, dataset: object, tag: str, updated_at: str): +def index_one_dataset(args, pathogen_json_path: str, dataset: object, tag: str, updated_at: str, validation_ctx): pathogen_json = json_read(pathogen_json_path) + validate_pathogen_json(pathogen_json, pathogen_json_path, validation_ctx, args.nextclade_schemas_dir) dataset_dir = dirname(pathogen_json_path) path = relpath(dataset_dir, args.input_dir) @@ -305,6 +308,10 @@ def parse_args(): parser.add_argument('--repo', required=False, help="GitHub repo to push and to release to. You need to have write permission for that." ) + parser.add_argument('--nextclade-schemas-dir', required=False, + help="Local directory containing nextclade schema files (e.g., ../nextclade/packages/nextclade-schemas). 
" + "When set, schemas are loaded from this directory instead of fetching from GitHub." + ) args = parser.parse_args() @@ -319,6 +326,13 @@ def parse_args(): if args.push: args.commit = True + if args.nextclade_schemas_dir: + resolved = Path(args.nextclade_schemas_dir).resolve() + if not resolved.is_dir(): + parser.error(f"--nextclade-schemas-dir: directory does not exist: {resolved}") + args.nextclade_schemas_dir = resolved + l.info(f"Using local schema directory: {resolved}") + return args @@ -353,6 +367,7 @@ def validate_shortcuts(collections): def main(): args = parse_args() + validation_ctx = ValidationContext() if not args.allow_dirty and not git_dir_is_clean(): dirty_files = "\n ".join(git_get_dirty_files()) @@ -381,7 +396,7 @@ def main(): all_refs = {} for collection_json_path in collection_json_paths: collection, release_infos_for_dataset, refs = process_one_collection( - collection_json_path, index_json, args, tag, updated_at + collection_json_path, index_json, args, tag, updated_at, validation_ctx ) collections.append(collection) release_infos.extend(release_infos_for_dataset) @@ -431,6 +446,8 @@ def main(): time.sleep(5) publish_to_github_releases(args, tag, commit_hash, release_notes) + print_defect_summary(validation_ctx) + def dataset_has_changes(dataset_from_index, dataset_dir): _, last_version = dataset_get_versions(dataset_from_index) @@ -440,7 +457,7 @@ def dataset_has_changes(dataset_from_index, dataset_dir): return len(modified_files) > 0 -def process_one_collection(collection_json_path, index_json, args, tag, updated_at): +def process_one_collection(collection_json_path, index_json, args, tag, updated_at, validation_ctx): collection_json = json_read(collection_json_path) collection_dir = dirname(collection_json_path) collection_id = dict_get_required(collection_json, ["meta", "id"]) @@ -469,7 +486,7 @@ def process_one_collection(collection_json_path, index_json, args, tag, updated_ dataset_from_index = find(lambda dataset: dataset["path"] == 
path, datasets_from_index_json) or {} try: - dataset, ref = index_one_dataset(args, pathogen_json_path, dataset_from_index, tag, updated_at) + dataset, ref = index_one_dataset(args, pathogen_json_path, dataset_from_index, tag, updated_at, validation_ctx) datasets.append(dataset) # exclude deprecated datasets from list of refs that is later used to build minimizer index if not dataset['attributes'].get('deprecated', False): @@ -487,7 +504,7 @@ def process_one_collection(collection_json_path, index_json, args, tag, updated_ json_write(collection_json, collection_json_path, no_sort_keys=True) release_infos = prepare_dataset_release_infos(args, datasets, datasets_from_index_json, collection_dir, tag, - updated_at) + updated_at, validation_ctx) release_infos = sort_release_infos(release_infos, dataset_order) collection_info = deepcopy(collection_json) @@ -512,10 +529,11 @@ def get_datasets_from_index_json(index_json, collection_id): return dict_get(collection, ["datasets"]) or [] -def prepare_dataset_release_infos(args, datasets, datasets_from_index_json, collection_dir, tag, updated_at): +def prepare_dataset_release_infos(args, datasets, datasets_from_index_json, collection_dir, tag, updated_at, validation_ctx): release_infos = [] for pathogen_json_path in find_files("pathogen.json", collection_dir): pathogen_json = json_read(pathogen_json_path) + validate_pathogen_json(pathogen_json, pathogen_json_path, validation_ctx, args.nextclade_schemas_dir) dataset_dir = dirname(pathogen_json_path) dataset_dir_rel = relpath(dataset_dir, args.input_dir)