diff --git a/.github/workflows/docs-ingestion-ci.yml b/.github/workflows/docs-ingestion-ci.yml index e697d1fa..d984eca2 100644 --- a/.github/workflows/docs-ingestion-ci.yml +++ b/.github/workflows/docs-ingestion-ci.yml @@ -247,6 +247,11 @@ jobs: working-directory: docs-embeddings env: DRY_RUN: 'true' + AWS_REGION: ${{ vars.AWS_REGION }} + BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }} + PINECONE_INDEX: ${{ vars.PINECONE_INDEX }} + BATCH_SIZE: ${{ vars.BATCH_SIZE }} + EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }} INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json run: node dist/index.js --dry-run @@ -278,7 +283,7 @@ jobs: uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c # v4 with: role-to-assume: ${{ secrets.AWS_ROLE_ARN }} - aws-region: ap-south-1 + aws-region: ${{ vars.AWS_REGION }} # ── Build ingestion pipeline ── - name: Install ingestion dependencies @@ -305,7 +310,8 @@ jobs: - name: Upload content to S3 working-directory: docs-ingestion env: - CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }} + AWS_REGION: ${{ vars.AWS_REGION }} + CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }} run: node dist/upload-content.js # ── Build and run embedding sync ── @@ -320,8 +326,12 @@ jobs: - name: Run embedding sync working-directory: docs-embeddings env: + AWS_REGION: ${{ vars.AWS_REGION }} + BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }} PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - PINECONE_INDEX: ${{ secrets.PINECONE_INDEX }} - CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }} + PINECONE_INDEX: ${{ vars.PINECONE_INDEX }} + CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }} + BATCH_SIZE: ${{ vars.BATCH_SIZE }} + EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }} INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json run: node dist/index.js diff --git a/docs-embeddings/src/embed-all.ts b/docs-embeddings/src/embed-all.ts index 1661be1e..6fac1b49 100644 --- a/docs-embeddings/src/embed-all.ts +++ b/docs-embeddings/src/embed-all.ts @@ -52,6 +52,12 @@ function fail(msg: string): never { process.exit(1); } +function requireEnv(name: string): string { + const value = process.env[name]; + if (!value) fail(`${name} environment variable is required`); + return value; +} + function runStep(label: string, cmd: string, cwd: string): void { console.log(`\n── ${label} ──`); try { @@ -144,7 +150,10 @@ async function main(): Promise { console.log("\n── Pinecone namespace check ──"); const pineconeApiKey = process.env.PINECONE_API_KEY; - const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings"; + const pineconeIndex = process.env.PINECONE_INDEX; + if (!pineconeIndex) { + fail("PINECONE_INDEX environment variable is required"); + } if (!pineconeApiKey) { fail("PINECONE_API_KEY environment variable is required"); @@ -241,18 +250,14 @@ async function main(): Promise { const config: EmbeddingConfig = { ingestionOutputPath: chunksPath, stateFilePath: path.join(process.cwd(), "state", "indexed-hashes.json"), - awsRegion: process.env.AWS_REGION || "ap-south-1", - bedrockModelId: - process.env.BEDROCK_MODEL_ID || "amazon.titan-embed-text-v2:0", + awsRegion: requireEnv("AWS_REGION"), + bedrockModelId: requireEnv("BEDROCK_MODEL_ID"), pineconeApiKey: pineconeApiKey!, - pineconeIndex, - batchSize: parseInt(process.env.BATCH_SIZE || "25", 10), + pineconeIndex: pineconeIndex!, + batchSize: parseInt(requireEnv("BATCH_SIZE"), 10), s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined, dryRun: false, - embeddingConcurrency: parseInt( - process.env.EMBEDDING_CONCURRENCY || "5", - 10, - ), + embeddingConcurrency: parseInt(requireEnv("EMBEDDING_CONCURRENCY"), 10), }; try { diff --git a/docs-embeddings/src/embedder.ts b/docs-embeddings/src/embedder.ts index dbfa98fc..4e7449e6 100644 --- a/docs-embeddings/src/embedder.ts +++ b/docs-embeddings/src/embedder.ts @@ -5,15 +5,15 @@ import { BedrockRuntimeClient, InvokeModelCommand, - InvokeModelCommandInput -} from '@aws-sdk/client-bedrock-runtime'; + InvokeModelCommandInput, +} from "@aws-sdk/client-bedrock-runtime"; export class BedrockEmbedder { private client: BedrockRuntimeClient; private modelId: string; private callCount: number = 0; - constructor(region: string = 'ap-south-1', modelId: string = 'amazon.titan-embed-text-v2:0') { + constructor(region: string, modelId: string) { this.client = new BedrockRuntimeClient({ region }); this.modelId = modelId; } @@ -33,19 +33,19 @@ export class BedrockEmbedder { try { const input: InvokeModelCommandInput = { modelId: this.modelId, - contentType: 'application/json', - accept: 'application/json', + contentType: "application/json", + accept: "application/json", body: JSON.stringify({ inputText: content, - normalize: true - }) + normalize: true, + }), }; const command = new InvokeModelCommand(input); const response = await this.client.send(command); const responseBody = JSON.parse( - new TextDecoder().decode(response.body) + new TextDecoder().decode(response.body), ); this.callCount++; @@ -53,22 +53,27 @@ export class BedrockEmbedder { } catch (error: any) { lastError = error; const isRetryable = - error?.name === 'ThrottlingException' || - error?.name === 'ServiceUnavailableException' || - error?.name === 'ModelTimeoutException' || + error?.name === "ThrottlingException" || + error?.name === "ServiceUnavailableException" || + error?.name === "ModelTimeoutException" || error?.$metadata?.httpStatusCode === 429 || error?.$metadata?.httpStatusCode >= 500; if (!isRetryable || attempt === BedrockEmbedder.MAX_RETRIES) { - console.error(`Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`, error); + console.error( + `Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`, + error, + ); throw error; } // Full jitter: randomize within [50%, 100%] of exponential delay // to prevent thundering herd when multiple concurrent calls retry const maxDelay = Math.min(1000 * Math.pow(2, attempt), 16000); - const delay = Math.floor(maxDelay / 2 + Math.random() * maxDelay / 2); - console.warn(` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`); + const delay = Math.floor(maxDelay / 2 + (Math.random() * maxDelay) / 2); + console.warn( + ` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`, + ); await this.sleep(delay); } } @@ -113,6 +118,6 @@ export class BedrockEmbedder { } private sleep(ms: number): Promise { - return new Promise(resolve => setTimeout(resolve, ms)); + return new Promise((resolve) => setTimeout(resolve, ms)); } } diff --git a/docs-embeddings/src/index.ts b/docs-embeddings/src/index.ts index 288a25bc..310f5ba9 100644 --- a/docs-embeddings/src/index.ts +++ b/docs-embeddings/src/index.ts @@ -11,6 +11,12 @@ import type { EmbeddingConfig } from './types.js'; dotenv.config({ path: '.env.local' }); dotenv.config({ path: '.env' }); +function requireEnv(name: string): string { + const value = process.env[name]; + if (!value) throw new Error(`${name} environment variable is required`); + return value; +} + async function main() { try { const dryRun = process.argv.includes('--dry-run') || process.env.DRY_RUN === 'true'; @@ -21,14 +27,14 @@ async function main() { path.join(process.cwd(), '..', 'docs-ingestion', 'output', 'chunks.json'), stateFilePath: process.env.STATE_FILE_PATH || path.join(process.cwd(), 'state', 'indexed-hashes.json'), - awsRegion: process.env.AWS_REGION || 'ap-south-1', - bedrockModelId: process.env.BEDROCK_MODEL_ID || 'amazon.titan-embed-text-v2:0', + awsRegion: requireEnv('AWS_REGION'), + bedrockModelId: requireEnv('BEDROCK_MODEL_ID'), pineconeApiKey: process.env.PINECONE_API_KEY || '', - pineconeIndex: process.env.PINECONE_INDEX || 'docs-embeddings', - batchSize: parseInt(process.env.BATCH_SIZE || '25', 10), + pineconeIndex: requireEnv('PINECONE_INDEX'), + batchSize: parseInt(requireEnv('BATCH_SIZE'), 10), s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined, dryRun, - embeddingConcurrency: parseInt(process.env.EMBEDDING_CONCURRENCY || '3', 10), + embeddingConcurrency: parseInt(requireEnv('EMBEDDING_CONCURRENCY'), 10), }; // Validate (Pinecone key not required in dry-run) diff --git a/docs-embeddings/src/verify-embed.ts b/docs-embeddings/src/verify-embed.ts index 005d7900..864c5871 100644 --- a/docs-embeddings/src/verify-embed.ts +++ b/docs-embeddings/src/verify-embed.ts @@ -73,7 +73,10 @@ async function main(): Promise { // ── Connect to Pinecone ────────────────────────────────────── const pineconeApiKey = process.env.PINECONE_API_KEY; - const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings"; + const pineconeIndex = process.env.PINECONE_INDEX; + if (!pineconeIndex) { + fail("PINECONE_INDEX environment variable is required"); + } if (!pineconeApiKey) { fail("PINECONE_API_KEY environment variable is required"); @@ -105,7 +108,7 @@ async function main(): Promise { console.log(" CONTENT_BUCKET_NAME not set — skipping S3 check\n"); } else { const s3 = new S3Client({ - region: process.env.AWS_REGION || "ap-south-1", + region: (() => { const r = process.env.AWS_REGION; if (!r) fail("AWS_REGION environment variable is required"); return r; })(), }); // Deterministic sample: pick every Nth chunk instead of random diff --git a/docs-ingestion/src/upload-content.ts b/docs-ingestion/src/upload-content.ts index 3df9bbd2..b6833e23 100644 --- a/docs-ingestion/src/upload-content.ts +++ b/docs-ingestion/src/upload-content.ts @@ -19,7 +19,10 @@ async function main() { try { // Load config from environment const bucketName = process.env.CONTENT_BUCKET_NAME; - const awsRegion = process.env.AWS_REGION || "ap-south-1"; + const awsRegion = process.env.AWS_REGION; + if (!awsRegion) { + throw new Error("AWS_REGION environment variable is required"); + } const outputPath = process.env.OUTPUT_PATH || path.join(process.cwd(), "output", "chunks.json");