From 3fe90f97a82e63daea0b84bf1e3efd7c80986732 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 27 Feb 2026 02:21:10 +0000
Subject: [PATCH] Fix DuplicateIDError in chroma_uscode importer by
 deduplicating within each batch

The JOIN to usc_section/usc_chapter in fetch_sections_batch can produce
multiple rows with the same usc_ident (e.g. /us/usc/t10/s20251). ChromaDB's
upsert rejects batches that contain duplicate IDs. Add a seen_in_batch set
so that only the first row for each ident within a single batch is
processed; later repeats are counted in `skipped` and dropped before
calling upsert.

https://claude.ai/code/session_01PaWRuuLei9GtmCUSBUGMfP
---
 backend/congress_parser/importers/chroma_uscode.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backend/congress_parser/importers/chroma_uscode.py b/backend/congress_parser/importers/chroma_uscode.py
index 2a0b697..cc9d491 100644
--- a/backend/congress_parser/importers/chroma_uscode.py
+++ b/backend/congress_parser/importers/chroma_uscode.py
@@ -307,9 +307,14 @@ async def run_import(
         ids: list[str] = []
         documents: list[str] = []
         metadatas: list[dict] = []
+        seen_in_batch: set[str] = set()
 
         for row in rows:
             usc_ident: str = row["usc_ident"]
+            if usc_ident in seen_in_batch:
+                skipped += 1
+                continue
+            seen_in_batch.add(usc_ident)
             doc_text = build_document(row)
             if not doc_text.strip():
                 skipped += 1