From 873d7d4cd0c78854cd71df44d23e0c4859d6750e Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 26 Feb 2026 04:14:14 +0000 Subject: [PATCH 1/3] Add ChromaDB as a persistent Docker service - Add congress_chromadb service (chromadb/chroma:0.6.3) to dev compose with a named volume (chromadb-volume) for data persistence - Add healthcheck against /api/v1/heartbeat - Add restart: unless-stopped in prod compose overlay - Wire CHROMA_HOST=congress_chromadb into congress_parser_fastapi so it finds ChromaDB via the Docker network instead of a bare IP - Add CHROMA_HOST env var support in uscode.py handler, falling back to LLM_HOST then 10.0.0.120 to preserve existing prod behaviour https://claude.ai/code/session_011LABnV4F5UKzgwKhWj5ND6 --- .docker/docker-compose.prod.yml | 3 +++ .docker/docker-compose.yml | 21 +++++++++++++++++++++ backend/congress_fastapi/handlers/uscode.py | 7 ++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/.docker/docker-compose.prod.yml b/.docker/docker-compose.prod.yml index 5ae9238..fd058bb 100644 --- a/.docker/docker-compose.prod.yml +++ b/.docker/docker-compose.prod.yml @@ -15,6 +15,7 @@ services: - CACHE_HEADER_TIME=0 - CACHE_TIME=0 - db_table=us_code_2025 + - CHROMA_HOST=congress_chromadb restart: unless-stopped congress_viewer_app: volumes: @@ -27,6 +28,8 @@ services: restart: unless-stopped congress_postgres: image: tianon/true + congress_chromadb: + restart: unless-stopped networks: parser: external: diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml index e597af0..9b1223d 100644 --- a/.docker/docker-compose.yml +++ b/.docker/docker-compose.yml @@ -39,6 +39,7 @@ services: - db_user=parser - db_pass=parser - db_table=us_code_2025 + - CHROMA_HOST=congress_chromadb build: context: ../backend dockerfile: .docker/Dockerfile @@ -48,6 +49,7 @@ services: - "9091:8080" depends_on: - congress_postgres + - congress_chromadb networks: parser: entrypoint: "uvicorn" @@ -107,9 +109,28 @@ services: - 
postgres-volume:/var/lib/postgresql/data networks: parser: + congress_chromadb: + image: chromadb/chroma:0.6.3 + container_name: congress_chromadb + environment: + - IS_PERSISTENT=TRUE + - PERSIST_DIRECTORY=/chroma/chroma + - ANONYMIZED_TELEMETRY=False + volumes: + - chromadb-volume:/chroma/chroma + ports: + - "8000:8000" + networks: + parser: + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/api/v1/heartbeat"] + interval: 30s + timeout: 10s + retries: 3 networks: parser: external: name: docker_parser volumes: postgres-volume: + chromadb-volume: diff --git a/backend/congress_fastapi/handlers/uscode.py b/backend/congress_fastapi/handlers/uscode.py index bb2fe96..c44f882 100644 --- a/backend/congress_fastapi/handlers/uscode.py +++ b/backend/congress_fastapi/handlers/uscode.py @@ -14,7 +14,12 @@ ) chroma_host = ( - os.environ.get("LLM_HOST", "10.0.0.120").split("http://")[-1].split(":")[0] + os.environ.get( + "CHROMA_HOST", + os.environ.get("LLM_HOST", "10.0.0.120"), + ) + .split("http://")[-1] + .split(":")[0] ) From fe6292b80ef1d338ecbdbe6513d2ae36442b5fdf Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 26 Feb 2026 04:34:49 +0000 Subject: [PATCH 2/3] Implement interest-based US Code linking feature Adds a system for users to describe their policy interests in natural language and automatically maps them to relevant USC sections via ChromaDB semantic search. Bills that amend those sections are then surfaced throughout the UI. 
Backend (Python/FastAPI): - Add UserInterest and UserInterestUscContent SQLAlchemy models (sensitive schema) with Alembic migration - New interest.py handler: save interest text, run ChromaDB search (search_chroma, n=50), upsert auto-matched sections, toggle/add sections manually, query legislation via USCContentDiff join chain - Add interest routes to user.py router (GET/POST /user/interest, PATCH/POST /user/interest/section, GET /user/interest/legislation) - search_chroma now includes usc_ident in each result dict Frontend (hillstack Next.js / tRPC): - Add user_interest and user_interest_usc_content Prisma models - Five new tRPC procedures on userRouter: interestGet, interestSave (calls FastAPI /uscode/search for ChromaDB then stores via Prisma), interestToggleSection, interestAddSection, interestLegislation ($queryRawUnsafe multi-join), interestBillMatch - Dashboard widget: InterestFeed replaces USC Tracking placeholder, shows up to 8 bills touching matched sections; prompts login/setup - Bill layout: InterestBadge client component shows green chip when the bill touches any of the user's active interest sections - New page /user/interests: textarea + save button, grouped section list with checkbox-toggle and manual-add support - Add FASTAPI_URL env var to congress_hillstack Docker service https://claude.ai/code/session_011LABnV4F5UKzgwKhWj5ND6 --- .docker/docker-compose.yml | 3 + .../a1b2c3d4e5f6_add_user_interest.py | 107 ++++++ backend/congress_db/models.py | 43 +++ backend/congress_fastapi/handlers/interest.py | 235 ++++++++++++ backend/congress_fastapi/handlers/uscode.py | 1 + backend/congress_fastapi/routes/user.py | 83 +++++ hillstack/prisma/schema.prisma | 29 ++ hillstack/src/app/_home/dashboard.tsx | 6 +- .../src/app/_home/widgets/interestFeed.tsx | 117 ++++++ .../congress/bills/[billId]/interestBadge.tsx | 31 ++ .../app/congress/bills/[billId]/layout.tsx | 4 +- hillstack/src/app/user/interests/page.tsx | 337 ++++++++++++++++++ 
hillstack/src/server/api/routers/user.ts | 302 ++++++++++++++++ 13 files changed, 1294 insertions(+), 4 deletions(-) create mode 100644 backend/.alembic/versions/a1b2c3d4e5f6_add_user_interest.py create mode 100644 backend/congress_fastapi/handlers/interest.py create mode 100644 hillstack/src/app/_home/widgets/interestFeed.tsx create mode 100644 hillstack/src/app/congress/bills/[billId]/interestBadge.tsx create mode 100644 hillstack/src/app/user/interests/page.tsx diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml index 9b1223d..df7ad2c 100644 --- a/.docker/docker-compose.yml +++ b/.docker/docker-compose.yml @@ -87,6 +87,8 @@ services: container_name: congress_hillstack tty: true stdin_open: true + environment: + - FASTAPI_URL=http://congress_parser_fastapi:8080 build: context: ../hillstack dockerfile: .docker/Dockerfile @@ -94,6 +96,7 @@ services: - "3001:3001" depends_on: - congress_postgres + - congress_parser_fastapi networks: parser: congress_postgres: diff --git a/backend/.alembic/versions/a1b2c3d4e5f6_add_user_interest.py b/backend/.alembic/versions/a1b2c3d4e5f6_add_user_interest.py new file mode 100644 index 0000000..695b449 --- /dev/null +++ b/backend/.alembic/versions/a1b2c3d4e5f6_add_user_interest.py @@ -0,0 +1,107 @@ +"""Add user_interest and user_interest_usc_content tables + +Revision ID: a1b2c3d4e5f6 +Revises: c9d7e37be069 +Create Date: 2026-02-26 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'a1b2c3d4e5f6' +down_revision: Union[str, None] = 'c9d7e37be069' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'user_interest', + sa.Column('user_interest_id', sa.Integer(), nullable=False), + sa.Column('user_id', sa.String(), nullable=True), + sa.Column('interest_text', sa.String(), nullable=True), + sa.Column( + 'created_at', + sa.DateTime(), + server_default=sa.text('now()'), + nullable=True, + ), + sa.Column( + 'updated_at', + sa.DateTime(), + server_default=sa.text('now()'), + nullable=True, + ), + sa.ForeignKeyConstraint( + ['user_id'], + ['sensitive.user_ident.user_id'], + ondelete='CASCADE', + ), + sa.PrimaryKeyConstraint('user_interest_id'), + schema='sensitive', + ) + op.create_index( + 'ix_sensitive_user_interest_user_id', + 'user_interest', + ['user_id'], + schema='sensitive', + ) + + op.create_table( + 'user_interest_usc_content', + sa.Column('user_interest_usc_content_id', sa.Integer(), nullable=False), + sa.Column('user_interest_id', sa.Integer(), nullable=True), + sa.Column('usc_ident', sa.String(), nullable=True), + sa.Column('match_source', sa.String(), nullable=True), + sa.Column('is_active', sa.Boolean(), nullable=True, server_default=sa.text('true')), + sa.Column('match_rank', sa.Integer(), nullable=True), + sa.Column( + 'created_at', + sa.DateTime(), + server_default=sa.text('now()'), + nullable=True, + ), + sa.ForeignKeyConstraint( + ['user_interest_id'], + ['sensitive.user_interest.user_interest_id'], + ondelete='CASCADE', + ), + sa.PrimaryKeyConstraint('user_interest_usc_content_id'), + schema='sensitive', + ) + op.create_index( + 'ix_sensitive_user_interest_usc_content_user_interest_id', + 'user_interest_usc_content', + ['user_interest_id'], + schema='sensitive', + ) + op.create_index( + 'ix_sensitive_user_interest_usc_content_usc_ident', + 'user_interest_usc_content', + ['usc_ident'], + schema='sensitive', + ) + + 
+def downgrade() -> None: + op.drop_index( + 'ix_sensitive_user_interest_usc_content_usc_ident', + table_name='user_interest_usc_content', + schema='sensitive', + ) + op.drop_index( + 'ix_sensitive_user_interest_usc_content_user_interest_id', + table_name='user_interest_usc_content', + schema='sensitive', + ) + op.drop_table('user_interest_usc_content', schema='sensitive') + op.drop_index( + 'ix_sensitive_user_interest_user_id', + table_name='user_interest', + schema='sensitive', + ) + op.drop_table('user_interest', schema='sensitive') diff --git a/backend/congress_db/models.py b/backend/congress_db/models.py index dfb8924..fb2ca6d 100644 --- a/backend/congress_db/models.py +++ b/backend/congress_db/models.py @@ -321,6 +321,49 @@ class UserUSCContent(SensitiveBase): usc_ident = Column(String) +class UserInterest(SensitiveBase): + """ + Natural language interest statement for a user, used to auto-match USC sections + """ + + __tablename__ = "user_interest" + + user_interest_id = Column(Integer, primary_key=True) + + user_id = Column( + String, + ForeignKey("sensitive.user_ident.user_id", ondelete="CASCADE"), + index=True, + ) + interest_text = Column(String) + created_at = Column(DateTime, server_default=func.now()) + updated_at = Column(DateTime, server_default=func.now(), onupdate=func.now()) + + +class UserInterestUscContent(SensitiveBase): + """ + USC content sections matched to a user's interest, either automatically or manually + """ + + __tablename__ = "user_interest_usc_content" + + user_interest_usc_content_id = Column(Integer, primary_key=True) + + user_interest_id = Column( + Integer, + ForeignKey( + "sensitive.user_interest.user_interest_id", + ondelete="CASCADE", + ), + index=True, + ) + usc_ident = Column(String, index=True) + match_source = Column(String, default="auto") # 'auto' or 'manual' + is_active = Column(Boolean, default=True) + match_rank = Column(Integer, nullable=True) + created_at = Column(DateTime, server_default=func.now()) + + class 
UserLLMQuery(SensitiveBase): """ Acts as a log of all user queries into legislation, for tracking purposes diff --git a/backend/congress_fastapi/handlers/interest.py b/backend/congress_fastapi/handlers/interest.py new file mode 100644 index 0000000..87c056c --- /dev/null +++ b/backend/congress_fastapi/handlers/interest.py @@ -0,0 +1,235 @@ +from typing import List + +from sqlalchemy import select, insert, update, or_, func + +from congress_fastapi.db.postgres import get_database +from congress_fastapi.handlers.uscode import search_chroma +from congress_db.models import ( + UserInterest, + UserInterestUscContent, + USCContent, + USCContentDiff, + LegislationVersion, + Legislation, + Congress, +) + + +async def handle_get_interest(user_id: str) -> dict: + database = await get_database() + + interest = await database.fetch_one( + select(UserInterest).where(UserInterest.user_id == user_id) + ) + if not interest: + return {"interest": None, "matches": []} + + matches = await database.fetch_all( + select(UserInterestUscContent) + .where( + UserInterestUscContent.user_interest_id + == interest["user_interest_id"] + ) + .order_by(UserInterestUscContent.match_rank) + ) + return { + "interest": dict(interest), + "matches": [dict(m) for m in matches], + } + + +async def handle_save_interest(user_id: str, interest_text: str) -> dict: + database = await get_database() + + existing = await database.fetch_one( + select(UserInterest).where(UserInterest.user_id == user_id) + ) + if existing: + await database.execute( + update(UserInterest) + .where(UserInterest.user_id == user_id) + .values(interest_text=interest_text, updated_at=func.now()) + ) + interest_id = existing["user_interest_id"] + else: + interest_id = await database.execute( + insert(UserInterest).values( + user_id=user_id, interest_text=interest_text + ) + ) + + # Run ChromaDB semantic search + try: + chroma_matches = await search_chroma(interest_text, 50) + except Exception: + chroma_matches = [] + + # Deactivate all 
previous auto-matched sections + await database.execute( + update(UserInterestUscContent) + .where( + UserInterestUscContent.user_interest_id == interest_id + ) + .where(UserInterestUscContent.match_source == "auto") + .values(is_active=False) + ) + + # Upsert new auto-matched sections + for rank, match in enumerate(chroma_matches): + usc_ident = match.get("usc_ident") + if not usc_ident: + continue + + existing_match = await database.fetch_one( + select(UserInterestUscContent) + .where( + UserInterestUscContent.user_interest_id == interest_id + ) + .where(UserInterestUscContent.usc_ident == usc_ident) + ) + if existing_match: + await database.execute( + update(UserInterestUscContent) + .where( + UserInterestUscContent.user_interest_usc_content_id + == existing_match["user_interest_usc_content_id"] + ) + .values(is_active=True, match_rank=rank, match_source="auto") + ) + else: + await database.execute( + insert(UserInterestUscContent).values( + user_interest_id=interest_id, + usc_ident=usc_ident, + match_source="auto", + is_active=True, + match_rank=rank, + ) + ) + + return await handle_get_interest(user_id) + + +async def handle_toggle_interest_section( + user_id: str, usc_ident: str, is_active: bool +) -> None: + database = await get_database() + + interest = await database.fetch_one( + select(UserInterest).where(UserInterest.user_id == user_id) + ) + if not interest: + return + + await database.execute( + update(UserInterestUscContent) + .where( + UserInterestUscContent.user_interest_id + == interest["user_interest_id"] + ) + .where(UserInterestUscContent.usc_ident == usc_ident) + .values(is_active=is_active) + ) + + +async def handle_add_interest_section(user_id: str, usc_ident: str) -> None: + database = await get_database() + + interest = await database.fetch_one( + select(UserInterest).where(UserInterest.user_id == user_id) + ) + if not interest: + return + + existing = await database.fetch_one( + select(UserInterestUscContent) + .where( + 
UserInterestUscContent.user_interest_id + == interest["user_interest_id"] + ) + .where(UserInterestUscContent.usc_ident == usc_ident) + ) + if existing: + await database.execute( + update(UserInterestUscContent) + .where( + UserInterestUscContent.user_interest_usc_content_id + == existing["user_interest_usc_content_id"] + ) + .values(is_active=True, match_source="manual") + ) + else: + await database.execute( + insert(UserInterestUscContent).values( + user_interest_id=interest["user_interest_id"], + usc_ident=usc_ident, + match_source="manual", + is_active=True, + match_rank=None, + ) + ) + + +async def handle_get_interest_legislation(user_id: str) -> dict: + database = await get_database() + + interest = await database.fetch_one( + select(UserInterest).where(UserInterest.user_id == user_id) + ) + if not interest: + return {"legislation": []} + + match_rows = await database.fetch_all( + select(UserInterestUscContent.usc_ident) + .where( + UserInterestUscContent.user_interest_id + == interest["user_interest_id"] + ) + .where(UserInterestUscContent.is_active == True) # noqa: E712 + .distinct() + ) + idents: List[str] = [r["usc_ident"] for r in match_rows if r["usc_ident"]] + if not idents: + return {"legislation": []} + + query = ( + select( + Legislation.legislation_id, + Legislation.title, + Legislation.number, + Congress.session_number, + Legislation.legislation_type, + Legislation.chamber, + func.min(LegislationVersion.effective_date).label("effective_date"), + ) + .select_from(USCContent) + .join( + USCContentDiff, + USCContentDiff.usc_content_id == USCContent.usc_content_id, + ) + .join( + LegislationVersion, + USCContentDiff.version_id == LegislationVersion.version_id, + ) + .join( + Legislation, + LegislationVersion.legislation_id == Legislation.legislation_id, + ) + .join(Congress, Congress.congress_id == Legislation.congress_id) + .where( + or_(*[USCContent.usc_ident.ilike(f"{ident}%") for ident in idents]) + ) + .group_by( + Legislation.legislation_id, + 
Legislation.title, + Legislation.number, + Congress.session_number, + Legislation.legislation_type, + Legislation.chamber, + ) + .order_by(func.min(LegislationVersion.effective_date).desc()) + .limit(100) + ) + + results = await database.fetch_all(query) + return {"legislation": [dict(r) for r in results]} diff --git a/backend/congress_fastapi/handlers/uscode.py b/backend/congress_fastapi/handlers/uscode.py index c44f882..7a3e7c3 100644 --- a/backend/congress_fastapi/handlers/uscode.py +++ b/backend/congress_fastapi/handlers/uscode.py @@ -113,5 +113,6 @@ async def search_chroma(query: str, num: int) -> List[dict]: result["title"] = short_title.capitalize() result["section_display"] = content.heading.strip() result["usc_link"] = f"{short_title[1:]}/{content.number}" + result["usc_ident"] = ident results_by_id[ident] = result return [results_by_id[ident] for ident in response["ids"][0]] diff --git a/backend/congress_fastapi/routes/user.py b/backend/congress_fastapi/routes/user.py index ccfeaee..c58911d 100644 --- a/backend/congress_fastapi/routes/user.py +++ b/backend/congress_fastapi/routes/user.py @@ -1,6 +1,7 @@ import traceback from typing import List, Optional from congress_db.models import UserIdent +from pydantic import BaseModel from fastapi import ( APIRouter, HTTPException, @@ -28,6 +29,13 @@ InvalidTokenException, handle_get_usc_tracking_folders ) +from congress_fastapi.handlers.interest import ( + handle_get_interest, + handle_save_interest, + handle_toggle_interest_section, + handle_add_interest_section, + handle_get_interest_legislation, +) from congress_fastapi.models.errors import Error from congress_fastapi.models.user import ( UserLoginRequest, @@ -277,3 +285,78 @@ async def user_usc_tracking_folder_results( ) return await handle_get_usc_tracking_results(user.user_id, folder_id) + + +class InterestSaveRequest(BaseModel): + interest_text: str + + +class InterestToggleRequest(BaseModel): + usc_ident: str + is_active: bool + + +class 
InterestAddSectionRequest(BaseModel): + usc_ident: str + + +@router.get("/user/interest") +async def user_interest_get( + user: UserIdent = Depends(user_from_cookie), +) -> dict: + if user is None: + raise HTTPException( + status_code=403, detail="Invalid or expired authentication token" + ) + return await handle_get_interest(user.user_id) + + +@router.post("/user/interest") +async def user_interest_save( + body: InterestSaveRequest, + user: UserIdent = Depends(user_from_cookie), +) -> dict: + if user is None: + raise HTTPException( + status_code=403, detail="Invalid or expired authentication token" + ) + return await handle_save_interest(user.user_id, body.interest_text) + + +@router.patch("/user/interest/section") +async def user_interest_toggle_section( + body: InterestToggleRequest, + user: UserIdent = Depends(user_from_cookie), +) -> dict: + if user is None: + raise HTTPException( + status_code=403, detail="Invalid or expired authentication token" + ) + await handle_toggle_interest_section( + user.user_id, body.usc_ident, body.is_active + ) + return {"ok": True} + + +@router.post("/user/interest/section") +async def user_interest_add_section( + body: InterestAddSectionRequest, + user: UserIdent = Depends(user_from_cookie), +) -> dict: + if user is None: + raise HTTPException( + status_code=403, detail="Invalid or expired authentication token" + ) + await handle_add_interest_section(user.user_id, body.usc_ident) + return {"ok": True} + + +@router.get("/user/interest/legislation") +async def user_interest_legislation( + user: UserIdent = Depends(user_from_cookie), +) -> dict: + if user is None: + raise HTTPException( + status_code=403, detail="Invalid or expired authentication token" + ) + return await handle_get_interest_legislation(user.user_id) diff --git a/hillstack/prisma/schema.prisma b/hillstack/prisma/schema.prisma index 87d6d30..de61f11 100644 --- a/hillstack/prisma/schema.prisma +++ b/hillstack/prisma/schema.prisma @@ -554,6 +554,7 @@ model user_ident 
{ user_auth_google String? @db.VarChar user_auth_expiration DateTime? @db.Timestamp(6) user_auth_cookie String? @db.VarChar + user_interest user_interest[] user_legislation user_legislation[] user_legislator user_legislator[] user_llm_query user_llm_query[] @@ -604,6 +605,34 @@ model user_llm_query { @@schema("sensitive") } +model user_interest { + user_interest_id Int @id @default(autoincrement()) + user_id String? @db.VarChar + interest_text String? @db.Text + created_at DateTime? @default(now()) @db.Timestamp(6) + updated_at DateTime? @default(now()) @db.Timestamp(6) + user_ident user_ident? @relation(fields: [user_id], references: [user_id], onDelete: Cascade, onUpdate: NoAction) + user_interest_usc_content user_interest_usc_content[] + + @@index([user_id], map: "ix_sensitive_user_interest_user_id") + @@schema("sensitive") +} + +model user_interest_usc_content { + user_interest_usc_content_id Int @id @default(autoincrement()) + user_interest_id Int? + usc_ident String? @db.VarChar + match_source String? @db.VarChar + is_active Boolean? @default(true) + match_rank Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + user_interest user_interest? @relation(fields: [user_interest_id], references: [user_interest_id], onDelete: Cascade, onUpdate: NoAction) + + @@index([user_interest_id], map: "ix_sensitive_user_interest_usc_content_user_interest_id") + @@index([usc_ident], map: "ix_sensitive_user_interest_usc_content_usc_ident") + @@schema("sensitive") +} + model user_usc_content { user_usc_content_id Int @id @default(autoincrement()) user_id String? 
@db.VarChar diff --git a/hillstack/src/app/_home/dashboard.tsx b/hillstack/src/app/_home/dashboard.tsx index 2542112..96cd220 100644 --- a/hillstack/src/app/_home/dashboard.tsx +++ b/hillstack/src/app/_home/dashboard.tsx @@ -9,11 +9,11 @@ import SellIcon from '@mui/icons-material/Sell'; import { Card, Grid, Toolbar } from '@mui/material'; import Box from '@mui/material/Box'; import type React from 'react'; -import { DashboardWidgetNoContent } from '~/app/_home/widgets'; import { LegislationCalendar } from '~/app/_home/widgets/legislationCalendar'; import { LegislationFollowed } from '~/app/_home/widgets/legislationFollowed'; import { LegislationTags } from '~/app/_home/widgets/legislationTags'; import { LegislatorFollowed } from '~/app/_home/widgets/legislatorFollowed'; +import { InterestFeed } from '~/app/_home/widgets/interestFeed'; function DashboardWidget({ title, @@ -82,9 +82,9 @@ export function Dashboard() { - + diff --git a/hillstack/src/app/_home/widgets/interestFeed.tsx b/hillstack/src/app/_home/widgets/interestFeed.tsx new file mode 100644 index 0000000..863c50f --- /dev/null +++ b/hillstack/src/app/_home/widgets/interestFeed.tsx @@ -0,0 +1,117 @@ +'use client'; + +import { Box, Button, List, ListItem, ListItemButton, Typography } from '@mui/material'; +import Link from 'next/link'; +import { useSession } from 'next-auth/react'; +import type { RouterOutputs } from '~/trpc/react'; +import { api } from '~/trpc/react'; +import { DashboardWidgetContent } from './'; + +type InterestBill = RouterOutputs['user']['interestLegislation'][number]; + +export function InterestFeed() { + const { data: session } = useSession(); + const { data, isLoading, isError } = api.user.interestLegislation.useQuery( + undefined, + { enabled: Boolean(session) }, + ); + + if (!session) { + return ( + + + Log in to track legislation by interest + + + ); + } + + if (!isLoading && !isError && data?.length === 0) { + return ( + + + No bills found yet. 
Set up your interests to track + relevant legislation. + + + + ); + } + + return ( + + {data && ( + + {data.slice(0, 8).map((bill: InterestBill) => ( + + + + + + {bill.title} + + + {`${bill.session_number}th · `} + {bill.chamber === 'house' + ? 'H.R.' + : 'S.'} + {` #${bill.number}`} + + + + + + ))} + + )} + + ); +} diff --git a/hillstack/src/app/congress/bills/[billId]/interestBadge.tsx b/hillstack/src/app/congress/bills/[billId]/interestBadge.tsx new file mode 100644 index 0000000..0821438 --- /dev/null +++ b/hillstack/src/app/congress/bills/[billId]/interestBadge.tsx @@ -0,0 +1,31 @@ +'use client'; + +import TrackChangesIcon from '@mui/icons-material/TrackChanges'; +import { Chip, Tooltip } from '@mui/material'; +import { useSession } from 'next-auth/react'; +import { api } from '~/trpc/react'; + +export function InterestBadge({ legislationId }: { legislationId: number }) { + const { data: session } = useSession(); + + const { data } = api.user.interestBillMatch.useQuery( + { legislation_id: legislationId }, + { enabled: Boolean(session) }, + ); + + if (!session || !data?.matches) return null; + + return ( + + } + label='Matches your interests' + size='small' + sx={{ px: 1, mr: 1, mt: { xs: 0.5, md: 0 } }} + /> + + ); +} diff --git a/hillstack/src/app/congress/bills/[billId]/layout.tsx b/hillstack/src/app/congress/bills/[billId]/layout.tsx index 8beccb1..b657dcf 100644 --- a/hillstack/src/app/congress/bills/[billId]/layout.tsx +++ b/hillstack/src/app/congress/bills/[billId]/layout.tsx @@ -3,6 +3,7 @@ import MergeTypeIcon from '@mui/icons-material/MergeType'; import { Box, Chip, Container, Paper, Typography } from '@mui/material'; import type { Params } from 'next/dist/server/request/params'; +import { InterestBadge } from '~/app/congress/bills/[billId]/interestBadge'; import { BillTabs } from '~/app/congress/bills/[billId]/tabs'; import { BillVersionEnum } from '~/enums'; import { api, HydrateClient } from '~/trpc/server'; @@ -118,7 +119,8 @@ export default 
async function BillLayout({ )} - + + { + const groups: Record = {}; + for (const m of matches) { + const parts = m.usc_ident?.split('/') ?? []; + // usc_ident looks like /us/usc/t42/s1395/... + const titlePart = parts[3] ?? 'unknown'; + const titleNum = titlePart.replace('t', 'Title '); + if (!groups[titleNum]) groups[titleNum] = []; + groups[titleNum].push(m); + } + return groups; +} + +export default function InterestsPage() { + const { data: session, status } = useSession(); + const utils = api.useUtils(); + + const { data: interest, isLoading } = api.user.interestGet.useQuery( + undefined, + { enabled: Boolean(session) }, + ); + + const [text, setText] = useState(''); + const [addIdent, setAddIdent] = useState(''); + const [showAddField, setShowAddField] = useState(false); + + useEffect(() => { + if (interest?.interest_text) { + setText(interest.interest_text); + } + }, [interest?.interest_text]); + + const saveMutation = api.user.interestSave.useMutation({ + onSuccess: () => utils.user.interestGet.invalidate(), + }); + + const toggleMutation = api.user.interestToggleSection.useMutation({ + onSuccess: () => utils.user.interestGet.invalidate(), + }); + + const addMutation = api.user.interestAddSection.useMutation({ + onSuccess: () => { + utils.user.interestGet.invalidate(); + setAddIdent(''); + setShowAddField(false); + }, + }); + + const rawMatches: MatchItem[] = ( + interest?.user_interest_usc_content ?? [] + ).map((m) => ({ + user_interest_usc_content_id: m.user_interest_usc_content_id, + usc_ident: m.usc_ident ?? null, + match_source: m.match_source ?? null, + is_active: m.is_active ?? null, + match_rank: m.match_rank ?? 
null, + })); + + const grouped = useMemo( + () => groupByTitle(rawMatches), + // eslint-disable-next-line react-hooks/exhaustive-deps + [interest?.user_interest_usc_content], + ); + + const activeCount = rawMatches.filter((m) => m.is_active).length; + + if (status === 'loading') { + return ( + + + + ); + } + + if (!session) { + return ( + + + Please log in to manage your policy interests. + + + ); + } + + return ( + + + Your Policy Interests + + + Describe what policy areas you care about in plain language. + The system will find matching US Code sections and alert you + when new legislation touches them. + + + + ) => + setText(e.target.value.slice(0, 500)) + } + placeholder='e.g. "Medicare billing and reimbursement policy for rural critical access hospitals"' + value={text} + /> + + + {saveMutation.isError && ( + + Failed to save. Please try again. + + )} + + + + {isLoading && } + + {interest && rawMatches.length > 0 && ( + + + + Matched US Code Sections + + + + + {(Object.entries(grouped) as [string, MatchItem[]][]).map( + ([titleLabel, sections]) => ( + + + {titleLabel} + + {sections.map((match) => { + const sectionSlug = + match.usc_ident + ?.split('/') + .slice(3) + .join('/') ?? ''; + return ( + + + + toggleMutation.mutate({ + user_interest_usc_content_id: + match.user_interest_usc_content_id, + is_active: + !match.is_active, + }) + } + size='small' + sx={{ mr: 1 }} + > + {match.is_active ? ( + + ) : ( + + )} + + + + {sectionSlug} + + {match.match_source === + 'manual' && ( + + )} + + ); + })} + + + ), + )} + + + {showAddField ? ( + + , + ) => setAddIdent(e.target.value)} + placeholder='/us/usc/t42/s1395' + size='small' + value={addIdent} + /> + + + + ) : ( + + )} + + + )} + + {interest && + rawMatches.length === 0 && + !isLoading && ( + + No matching sections found. Try a more specific + interest description, or make sure ChromaDB has been + indexed. 
+ + )} + + ); +} diff --git a/hillstack/src/server/api/routers/user.ts b/hillstack/src/server/api/routers/user.ts index 8786189..1eb467a 100644 --- a/hillstack/src/server/api/routers/user.ts +++ b/hillstack/src/server/api/routers/user.ts @@ -1,6 +1,9 @@ +import { Prisma } from '@prisma/client'; import { z } from 'zod'; import { createTRPCRouter, privateProcedure } from '~/server/api/trpc'; +const FASTAPI_URL = process.env.FASTAPI_URL ?? 'http://localhost:8080'; + export const userRouter = createTRPCRouter({ legislationFollowing: privateProcedure .input( @@ -208,4 +211,303 @@ export const userRouter = createTRPCRouter({ return userLegislation.map((leg) => leg); }), + + // ── Interest procedures ────────────────────────────────────────────────── + + interestGet: privateProcedure.query(async ({ ctx }) => { + return ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + include: { + user_interest_usc_content: { + orderBy: { match_rank: 'asc' }, + }, + }, + }); + }), + + interestSave: privateProcedure + .input(z.object({ interest_text: z.string().min(1).max(500) })) + .mutation(async ({ input, ctx }) => { + // 1. Upsert the interest record + const existing = await ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + }); + + let interestId: number; + if (existing) { + await ctx.db.user_interest.update({ + where: { user_interest_id: existing.user_interest_id }, + data: { + interest_text: input.interest_text, + updated_at: new Date(), + }, + }); + interestId = existing.user_interest_id; + } else { + const created = await ctx.db.user_interest.create({ + data: { + user_id: ctx.user.email, + interest_text: input.interest_text, + }, + }); + interestId = created.user_interest_id; + } + + // 2. 
Call the FastAPI ChromaDB search endpoint (no auth needed) + let chromaMatches: Array<{ + usc_ident?: string; + title?: string; + section_display?: string; + usc_link?: string; + }> = []; + try { + const res = await fetch(`${FASTAPI_URL}/uscode/search`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + query: input.interest_text, + results: 50, + }), + }); + if (res.ok) { + chromaMatches = await res.json(); + } + } catch { + // ChromaDB unavailable — interest saved, matches will be empty + } + + // 3. Deactivate old auto-matched sections + await ctx.db.user_interest_usc_content.updateMany({ + where: { + user_interest_id: interestId, + match_source: 'auto', + }, + data: { is_active: false }, + }); + + // 4. Upsert new auto-matched sections + for (let rank = 0; rank < chromaMatches.length; rank++) { + const match = chromaMatches[rank]; + const usc_ident = match?.usc_ident; + if (!usc_ident) continue; + + const existingMatch = + await ctx.db.user_interest_usc_content.findFirst({ + where: { user_interest_id: interestId, usc_ident }, + }); + + if (existingMatch) { + await ctx.db.user_interest_usc_content.update({ + where: { + user_interest_usc_content_id: + existingMatch.user_interest_usc_content_id, + }, + data: { + is_active: true, + match_rank: rank, + match_source: 'auto', + }, + }); + } else { + await ctx.db.user_interest_usc_content.create({ + data: { + user_interest_id: interestId, + usc_ident, + match_source: 'auto', + is_active: true, + match_rank: rank, + }, + }); + } + } + + return ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + include: { + user_interest_usc_content: { + orderBy: { match_rank: 'asc' }, + }, + }, + }); + }), + + interestToggleSection: privateProcedure + .input( + z.object({ + user_interest_usc_content_id: z.number(), + is_active: z.boolean(), + }), + ) + .mutation(async ({ input, ctx }) => { + // Verify ownership before updating + const interest = await 
ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + }); + if (!interest) return; + + await ctx.db.user_interest_usc_content.updateMany({ + where: { + user_interest_usc_content_id: + input.user_interest_usc_content_id, + user_interest_id: interest.user_interest_id, + }, + data: { is_active: input.is_active }, + }); + }), + + interestAddSection: privateProcedure + .input(z.object({ usc_ident: z.string() })) + .mutation(async ({ input, ctx }) => { + const interest = await ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + }); + if (!interest) return; + + const existing = + await ctx.db.user_interest_usc_content.findFirst({ + where: { + user_interest_id: interest.user_interest_id, + usc_ident: input.usc_ident, + }, + }); + + if (existing) { + await ctx.db.user_interest_usc_content.update({ + where: { + user_interest_usc_content_id: + existing.user_interest_usc_content_id, + }, + data: { is_active: true, match_source: 'manual' }, + }); + } else { + await ctx.db.user_interest_usc_content.create({ + data: { + user_interest_id: interest.user_interest_id, + usc_ident: input.usc_ident, + match_source: 'manual', + is_active: true, + }, + }); + } + }), + + interestLegislation: privateProcedure.query(async ({ ctx }) => { + const interest = await ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + include: { + user_interest_usc_content: { + where: { is_active: true }, + select: { usc_ident: true }, + }, + }, + }); + + if ( + !interest || + interest.user_interest_usc_content.length === 0 + ) { + return []; + } + + const idents = interest.user_interest_usc_content + .map((m) => m.usc_ident) + .filter((id): id is string => Boolean(id)); + + if (idents.length === 0) return []; + + // Build a raw SQL query since Prisma doesn't support ILIKE prefix arrays + const identsWithWildcard = idents.map((id) => `${id}%`); + const orClauses = identsWithWildcard + .map((_, i) => `uc.usc_ident ILIKE $${i + 1}`) + .join(' OR '); + + 
type LegislationRow = { + legislation_id: bigint; + title: string; + number: number; + session_number: number; + legislation_type: string; + chamber: string; + effective_date: Date | null; + }; + + const rows = await ctx.db.$queryRawUnsafe( + `SELECT + l.legislation_id, + l.title, + l.number, + c.session_number, + l.legislation_type, + l.chamber, + MIN(lv.effective_date) AS effective_date + FROM usc_content uc + JOIN usc_content_diff ucd ON ucd.usc_content_id = uc.usc_content_id + JOIN legislation_version lv ON lv.version_id = ucd.version_id + JOIN legislation l ON l.legislation_id = lv.legislation_id + JOIN congress c ON c.congress_id = l.congress_id + WHERE ${orClauses} + GROUP BY l.legislation_id, l.title, l.number, l.legislation_type, l.chamber, c.session_number + ORDER BY MIN(lv.effective_date) DESC + LIMIT 50`, + ...identsWithWildcard, + ); + + // BigInt → number for JSON serialisation + return rows.map((r) => ({ + ...r, + legislation_id: Number(r.legislation_id), + })); + }), + + interestBillMatch: privateProcedure + .input(z.object({ legislation_id: z.number() })) + .query(async ({ input, ctx }) => { + const interest = await ctx.db.user_interest.findFirst({ + where: { user_id: ctx.user.email }, + include: { + user_interest_usc_content: { + where: { is_active: true }, + select: { usc_ident: true }, + }, + }, + }); + + if ( + !interest || + interest.user_interest_usc_content.length === 0 + ) { + return { matches: false, matchedIdents: [] as string[] }; + } + + const idents = interest.user_interest_usc_content + .map((m) => m.usc_ident) + .filter((id): id is string => Boolean(id)); + + if (idents.length === 0) { + return { matches: false, matchedIdents: [] as string[] }; + } + + const identsWithWildcard = idents.map((id) => `${id}%`); + const orClauses = identsWithWildcard + .map((_, i) => `uc.usc_ident ILIKE $${i + 2}`) + .join(' OR '); + + type IdentRow = { usc_ident: string }; + const rows = await ctx.db.$queryRawUnsafe( + `SELECT DISTINCT uc.usc_ident + 
FROM usc_content uc + JOIN usc_content_diff ucd ON ucd.usc_content_id = uc.usc_content_id + JOIN legislation_version lv ON lv.version_id = ucd.version_id + WHERE lv.legislation_id = $1 + AND (${orClauses}) + LIMIT 10`, + input.legislation_id, + ...identsWithWildcard, + ); + + return { + matches: rows.length > 0, + matchedIdents: rows.map((r) => r.usc_ident), + }; + }), }); From 4c0cc43b3f0752897a8d346a79a4e3a179db58b5 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 27 Feb 2026 01:38:43 +0000 Subject: [PATCH 3/3] Add ChromaDB importer for USC sections Creates backend/congress_parser/importers/chroma_uscode.py, a standalone async script that reads top-level US Code sections from PostgreSQL and upserts them into the ChromaDB 'uscode' collection, enabling the interest-based semantic search feature to find relevant sections. Features: - Auto-detects the latest USC release version_id from usc_release table (or accepts --version-id for an explicit override) - Filters to top-level section identifiers (/us/usc/t{n}/s{identifier}) matching the resolution path used in search_chroma() - Builds rich document text: title name + section heading + content_str (truncated to 8 000 chars) for high-quality embeddings - Stores metadata: title number, section number, display label, heading - Idempotent: uses collection.upsert() so safe to re-run - --reset flag to wipe and rebuild the collection from scratch - --dry-run flag to count eligible sections without writing - --batch-size to tune throughput (default 200) - Creates the 'congress-dev' tenant and 'usc-chat' database via the ChromaDB REST API if they don't already exist - Graceful error messages when ChromaDB is unreachable or DB has no data Usage: python3 -m congress_parser.importers.chroma_uscode python3 -m congress_parser.importers.chroma_uscode --reset python3 -m congress_parser.importers.chroma_uscode --dry-run python3 -m congress_parser.importers.chroma_uscode --version-id 74573 
"""
ChromaDB importer for US Code sections.

Reads top-level USC sections from PostgreSQL and indexes them into the ChromaDB
"uscode" collection so that the interest-based code linking feature can perform
semantic search against them.

Usage
-----
# Index the latest USC release (auto-detected):
    python3 -m congress_parser.importers.chroma_uscode

# Wipe the collection and re-index from scratch:
    python3 -m congress_parser.importers.chroma_uscode --reset

# Specify an explicit version_id (matches the hardcoded value in uscode.py):
    python3 -m congress_parser.importers.chroma_uscode --version-id 74573

# Check what would be indexed without writing anything:
    python3 -m congress_parser.importers.chroma_uscode --dry-run

# Tune throughput (default batch size is 200):
    python3 -m congress_parser.importers.chroma_uscode --batch-size 500

Prerequisites
-------------
- US Code data must already be in PostgreSQL (run congress_parser.importers.releases first)
- ChromaDB service must be running (docker-compose up congress_chromadb)
- CHROMA_HOST env var should point to the ChromaDB host; otherwise the host is
  derived from LLM_HOST, and finally falls back to "localhost"
"""

import argparse
import asyncio
import json
import os
import sys
from typing import Optional
from urllib import request as urllib_request
from urllib.error import HTTPError, URLError

import chromadb
from chromadb.config import Settings

from congress_fastapi.db.postgres import get_database

# ── Configuration ────────────────────────────────────────────────────────────

# Mirrors the resolution order used in congress_fastapi/handlers/uscode.py:
# CHROMA_HOST wins, then LLM_HOST, then a local default. Either value may be a
# full "http://host:port" URL, so strip the scheme and port down to a bare host.
CHROMA_HOST = (
    os.environ.get(
        "CHROMA_HOST",
        os.environ.get("LLM_HOST", "localhost"),
    )
    .split("http://")[-1]
    .split(":")[0]
)
CHROMA_PORT = int(os.environ.get("CHROMA_PORT", "8000"))
CHROMA_TENANT = "congress-dev"
CHROMA_DATABASE = "usc-chat"
COLLECTION_NAME = "uscode"
DEFAULT_BATCH_SIZE = 200


# ── Database helpers ────────────────────────────────────────────────────────


async def get_latest_version_id(database) -> int:
    """Return the version_id associated with the most recent USC release point.

    Raises:
        RuntimeError: if the usc_release table contains no dated release
            points (i.e. the USC importer has not been run yet).
    """
    row = await database.fetch_one(
        """
        SELECT v.version_id
        FROM version v
        JOIN usc_release ur ON ur.version_id = v.version_id
        WHERE ur.effective_date IS NOT NULL
        ORDER BY ur.effective_date DESC
        LIMIT 1
        """
    )
    if row is None:
        raise RuntimeError(
            "No USC release points found in the database.\n"
            "Run 'python3 -m congress_parser.importers.releases' first."
        )
    return row[0]


async def count_sections(database, version_id: int) -> int:
    """Count indexable top-level USC sections for this version.

    Must apply exactly the same filters as fetch_sections_batch() so the
    progress denominator matches what the batch loop actually sees.
    """
    row = await database.fetch_one(
        """
        SELECT COUNT(*)
        FROM usc_content
        WHERE version_id = :vid
          AND usc_ident ~ '^/us/usc/t[0-9]+/s[^/]+$'
          AND heading IS NOT NULL
          AND heading != ''
        """,
        values={"vid": version_id},
    )
    return row[0]


async def fetch_sections_batch(
    database, version_id: int, offset: int, batch_size: int
) -> list:
    """
    Fetch one page of top-level USC sections, joined to their chapter title.

    Filters to identifiers of the form /us/usc/t{n}/s{identifier} — these are
    the IDs stored in ChromaDB and resolved back to Postgres in search_chroma().

    NOTE: the chapter joins are LEFT JOINs on purpose. count_sections() applies
    no join, so an INNER JOIN here would silently drop sections that lack a
    chapter row and skew the progress denominator; build_document() already
    tolerates a NULL chapter title. ORDER BY uc.usc_ident keeps LIMIT/OFFSET
    pagination deterministic.
    """
    return await database.fetch_all(
        """
        SELECT
            uc.usc_ident,
            uc.heading,
            uc.content_str,
            uc.number,
            uc.section_display,
            ch.long_title AS chapter_title,
            ch.short_title AS chapter_short_title
        FROM usc_content uc
        LEFT JOIN usc_section us_sec ON us_sec.usc_section_id = uc.usc_section_id
        LEFT JOIN usc_chapter ch ON ch.usc_chapter_id = us_sec.usc_chapter_id
        WHERE uc.version_id = :vid
          AND uc.usc_ident ~ '^/us/usc/t[0-9]+/s[^/]+$'
          AND uc.heading IS NOT NULL
          AND uc.heading != ''
        ORDER BY uc.usc_ident
        LIMIT :lim
        OFFSET :off
        """,
        values={"vid": version_id, "lim": batch_size, "off": offset},
    )


def build_document(row) -> str:
    """
    Build the text that ChromaDB will embed for a USC section.

    Format: "<chapter title> — <section label>. <heading>\\n<content>"
    The chapter title helps the embedding model place sections in context.
    Returns an empty string when the row carries no usable text; the caller
    skips such rows.
    """
    parts = []
    # Chapter titles are stored in ALL CAPS; capitalize() normalises them to
    # sentence case so the embedded text reads naturally.
    chapter_title = (row["chapter_title"] or "").strip().capitalize()
    if chapter_title:
        parts.append(chapter_title)
    if row["section_display"]:
        section_label = row["section_display"].strip()
        heading = (row["heading"] or "").strip()
        if heading:
            parts.append(f"{section_label}. {heading}")
        else:
            parts.append(section_label)
    elif row["heading"]:
        parts.append(row["heading"].strip())
    header = " — ".join(parts)

    content = (row["content_str"] or "").strip()
    if content:
        # Truncate very long sections to stay within typical embedding limits
        content = content[:8000]
        return f"{header}\n{content}" if header else content
    return header


# ── ChromaDB setup ───────────────────────────────────────────────────────────


def _chroma_rest(method: str, path: str, body: Optional[dict] = None):
    """Make a raw HTTP call to the ChromaDB admin REST API.

    Returns the HTTP status code (including error statuses — callers decide
    which codes are acceptable). Network-level failures (URLError) propagate
    so run_import() can print a helpful "is the container running?" message.
    """
    url = f"http://{CHROMA_HOST}:{CHROMA_PORT}{path}"
    data = json.dumps(body).encode() if body else None
    req = urllib_request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json"} if data else {},
        method=method,
    )
    try:
        with urllib_request.urlopen(req, timeout=10) as resp:
            return resp.status
    except HTTPError as exc:
        return exc.code  # Return status code; callers decide what's acceptable


def ensure_chroma_tenant_and_db():
    """
    Create the ChromaDB tenant and database if they do not already exist.

    HTTP 422 means the resource already exists (ChromaDB's behaviour); we
    treat that the same as HTTP 200 (409 is accepted too for safety).
    """
    # Tenant
    status = _chroma_rest("POST", "/api/v1/tenants", {"name": CHROMA_TENANT})
    if status not in (200, 201, 422, 409):
        print(
            f"Warning: unexpected HTTP {status} when creating tenant '{CHROMA_TENANT}'. "
            "Proceeding anyway.",
            file=sys.stderr,
        )

    # Database (scoped to tenant via query param)
    status = _chroma_rest(
        "POST",
        f"/api/v1/databases?tenant={CHROMA_TENANT}",
        {"name": CHROMA_DATABASE},
    )
    if status not in (200, 201, 422, 409):
        print(
            f"Warning: unexpected HTTP {status} when creating database '{CHROMA_DATABASE}'. "
            "Proceeding anyway.",
            file=sys.stderr,
        )


async def open_chroma_collection(reset: bool):
    """Connect to ChromaDB and return the 'uscode' collection.

    When *reset* is True the existing collection is deleted first (errors from
    a missing collection are deliberately ignored — best-effort cleanup).
    """
    client = await chromadb.AsyncHttpClient(
        host=CHROMA_HOST,
        port=CHROMA_PORT,
        ssl=False,
        headers=None,
        settings=Settings(),
        tenant=CHROMA_TENANT,
        database=CHROMA_DATABASE,
    )

    if reset:
        try:
            await client.delete_collection(COLLECTION_NAME)
            print(f"Deleted existing collection '{COLLECTION_NAME}'.")
        except Exception:
            pass  # Didn't exist yet

    collection = await client.get_or_create_collection(
        COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
    )
    count = await collection.count()
    if count > 0 and not reset:
        print(
            f"Collection '{COLLECTION_NAME}' already contains {count:,} documents. "
            "Upserting will add new and update changed sections.\n"
            "Use --reset to wipe and rebuild from scratch."
        )
    return collection


# ── Main import logic ────────────────────────────────────────────────────────


async def run_import(
    version_id: Optional[int],
    batch_size: int,
    reset: bool,
    dry_run: bool,
):
    """Drive the full import: resolve version, count, connect, batch-upsert.

    Exits the process with status 1 when there is no USC data in Postgres or
    ChromaDB is unreachable.
    """
    database = await get_database()

    # Determine version
    if version_id is None:
        version_id = await get_latest_version_id(database)
        print(f"Auto-detected latest USC version_id: {version_id}")
    else:
        print(f"Using specified version_id: {version_id}")

    total = await count_sections(database, version_id)
    print(f"Sections eligible for indexing: {total:,}")

    if total == 0:
        print(
            "\nNo sections found. Make sure the USC data has been loaded:\n"
            "    python3 -m congress_parser.importers.releases",
            file=sys.stderr,
        )
        sys.exit(1)

    if dry_run:
        print("Dry-run mode — exiting without writing to ChromaDB.")
        return

    # ── ChromaDB setup ──────────────────────────────────────────
    print(f"\nConnecting to ChromaDB at {CHROMA_HOST}:{CHROMA_PORT} …")
    try:
        ensure_chroma_tenant_and_db()
    except URLError as exc:
        print(
            f"Could not reach ChromaDB at {CHROMA_HOST}:{CHROMA_PORT}: {exc}\n"
            "Is the container running? docker-compose up congress_chromadb",
            file=sys.stderr,
        )
        sys.exit(1)

    collection = await open_chroma_collection(reset=reset)
    print(f"Opened collection '{COLLECTION_NAME}'. Starting import…\n")

    # ── Batch loop ──────────────────────────────────────────────
    offset = 0
    indexed = 0
    skipped = 0

    while offset < total:
        rows = await fetch_sections_batch(database, version_id, offset, batch_size)
        if not rows:
            break

        ids: list[str] = []
        documents: list[str] = []
        metadatas: list[dict] = []
        seen_idents: set[str] = set()  # Chroma rejects duplicate IDs in one upsert call

        for row in rows:
            usc_ident: str = row["usc_ident"]
            if usc_ident in seen_idents:
                continue  # defensive: a join fan-out would otherwise abort the batch
            doc_text = build_document(row)
            if not doc_text.strip():
                skipped += 1
                continue

            # Extract title number from ident: /us/usc/t42/s1395 → "42"
            parts = usc_ident.split("/")
            title_num = parts[3].lstrip("t") if len(parts) > 3 else ""

            seen_idents.add(usc_ident)
            ids.append(usc_ident)
            documents.append(doc_text)
            metadatas.append(
                {
                    "title": title_num,
                    "number": row["number"] or "",
                    "section_display": row["section_display"] or "",
                    "heading": (row["heading"] or "")[:200],
                }
            )

        if ids:
            await collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
            indexed += len(ids)

        offset += batch_size
        done = min(offset, total)
        pct = done / total * 100
        print(
            f"  {done:>{len(str(total))}}/{total} ({pct:5.1f}%)  "
            f"indexed: {indexed:,}",
            end="\r",
            flush=True,
        )

    print(f"\n\nDone. Indexed {indexed:,} sections, skipped {skipped:,} empty.")
    final_count = await collection.count()
    print(f"Collection '{COLLECTION_NAME}' now contains {final_count:,} documents.")


# ── CLI entry point ──────────────────────────────────────────────────────────


def main():
    """Parse CLI arguments and run the async import."""
    parser = argparse.ArgumentParser(
        description=(
            "Index US Code sections into ChromaDB for semantic interest-based search."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Delete and recreate the ChromaDB collection before indexing.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        metavar="N",
        help=f"Documents per upsert batch (default: {DEFAULT_BATCH_SIZE}).",
    )
    parser.add_argument(
        "--version-id",
        type=int,
        default=None,
        metavar="ID",
        help=(
            "Specific usc_content.version_id to index "
            "(default: latest usc_release)."
        ),
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Count eligible sections and exit without writing to ChromaDB.",
    )
    args = parser.parse_args()

    # A zero/negative batch size would make the batch loop spin forever or
    # produce an invalid SQL LIMIT — reject it up front.
    if args.batch_size <= 0:
        parser.error("--batch-size must be a positive integer")

    asyncio.run(
        run_import(
            version_id=args.version_id,
            batch_size=args.batch_size,
            reset=args.reset,
            dry_run=args.dry_run,
        )
    )


if __name__ == "__main__":
    main()