From 3fe90f97a82e63daea0b84bf1e3efd7c80986732 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 27 Feb 2026 02:21:10 +0000 Subject: [PATCH] Fix DuplicateIDError in chroma_uscode importer by deduplicating within each batch The JOIN to usc_section/usc_chapter in fetch_sections_batch can produce multiple rows with the same usc_ident (e.g. /us/usc/t10/s20251). ChromaDB's upsert rejects batches that contain duplicate IDs. Added a seen_in_batch set to skip any repeated ident within a single batch before calling upsert. https://claude.ai/code/session_01PaWRuuLei9GtmCUSBUGMfP --- backend/congress_parser/importers/chroma_uscode.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/congress_parser/importers/chroma_uscode.py b/backend/congress_parser/importers/chroma_uscode.py index 2a0b697..cc9d491 100644 --- a/backend/congress_parser/importers/chroma_uscode.py +++ b/backend/congress_parser/importers/chroma_uscode.py @@ -307,9 +307,14 @@ async def run_import( ids: list[str] = [] documents: list[str] = [] metadatas: list[dict] = [] + seen_in_batch: set[str] = set() for row in rows: usc_ident: str = row["usc_ident"] + if usc_ident in seen_in_batch: + skipped += 1 + continue + seen_in_batch.add(usc_ident) doc_text = build_document(row) if not doc_text.strip(): skipped += 1