From d3b95b00d638c4f205c1a50ddf6b97f55d7df00e Mon Sep 17 00:00:00 2001 From: pulkit28 Date: Wed, 25 Mar 2026 18:36:28 +0530 Subject: [PATCH 1/3] refactor: remove hardcoded defaults, require env vars for all config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All configuration (AWS_REGION, BEDROCK_MODEL_ID, PINECONE_INDEX, BATCH_SIZE, EMBEDDING_CONCURRENCY) must now be set via environment variables or .env.local — no more hardcoded ap-south-1 or model ID defaults. Fail-fast with clear error messages if any are missing. This fixes the S3 PermanentRedirect error in the Deploy Knowledge Base CI job, which was caused by the bucket being in us-east-1 while code defaulted to ap-south-1. Bucket has been recreated as setu-docs-content-prod in ap-south-1. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/docs-ingestion-ci.yml | 14 ++++++++-- docs-embeddings/src/embed-all.ts | 25 +++++++++++------- docs-embeddings/src/embedder.ts | 35 ++++++++++++++----------- docs-embeddings/src/index.ts | 16 +++++++---- docs-embeddings/src/verify-embed.ts | 7 +++-- docs-ingestion/src/upload-content.ts | 5 +++- 6 files changed, 67 insertions(+), 35 deletions(-) diff --git a/.github/workflows/docs-ingestion-ci.yml b/.github/workflows/docs-ingestion-ci.yml index e697d1fa..fc8212cc 100644 --- a/.github/workflows/docs-ingestion-ci.yml +++ b/.github/workflows/docs-ingestion-ci.yml @@ -247,6 +247,11 @@ jobs: working-directory: docs-embeddings env: DRY_RUN: 'true' + AWS_REGION: ${{ vars.AWS_REGION }} + BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }} + PINECONE_INDEX: ${{ vars.PINECONE_INDEX }} + BATCH_SIZE: ${{ vars.BATCH_SIZE }} + EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }} INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json run: node dist/index.js --dry-run @@ -278,7 +283,7 @@ jobs: uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c # v4 with: role-to-assume: ${{ secrets.AWS_ROLE_ARN }} - aws-region: ap-south-1 + aws-region: ${{ vars.AWS_REGION }} # ── Build ingestion pipeline ── - name: Install ingestion dependencies @@ -305,6 +310,7 @@ jobs: - name: Upload content to S3 working-directory: docs-ingestion env: + AWS_REGION: ${{ vars.AWS_REGION }} CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }} run: node dist/upload-content.js @@ -320,8 +326,12 @@ jobs: - name: Run embedding sync working-directory: docs-embeddings env: + AWS_REGION: ${{ vars.AWS_REGION }} + BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }} PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - PINECONE_INDEX: ${{ secrets.PINECONE_INDEX }} + PINECONE_INDEX: ${{ vars.PINECONE_INDEX }} CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }} + BATCH_SIZE: ${{ vars.BATCH_SIZE }} + EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }} INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json run: node dist/index.js diff --git a/docs-embeddings/src/embed-all.ts b/docs-embeddings/src/embed-all.ts index 1661be1e..6fac1b49 100644 --- a/docs-embeddings/src/embed-all.ts +++ b/docs-embeddings/src/embed-all.ts @@ -52,6 +52,12 @@ function fail(msg: string): never { process.exit(1); } +function requireEnv(name: string): string { + const value = process.env[name]; + if (!value) fail(`${name} environment variable is required`); + return value; +} + function runStep(label: string, cmd: string, cwd: string): void { console.log(`\n── ${label} ──`); try { @@ -144,7 +150,10 @@ async function main(): Promise { console.log("\n── Pinecone namespace check ──"); const pineconeApiKey = process.env.PINECONE_API_KEY; - const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings"; + const pineconeIndex = process.env.PINECONE_INDEX; + if (!pineconeIndex) { + fail("PINECONE_INDEX environment variable is required"); + } if (!pineconeApiKey) { fail("PINECONE_API_KEY environment variable is required"); @@ -241,18 +250,14 @@ async function main(): Promise { const config: EmbeddingConfig = { ingestionOutputPath: chunksPath, stateFilePath: path.join(process.cwd(), "state", "indexed-hashes.json"), - awsRegion: process.env.AWS_REGION || "ap-south-1", - bedrockModelId: - process.env.BEDROCK_MODEL_ID || "amazon.titan-embed-text-v2:0", + awsRegion: requireEnv("AWS_REGION"), + bedrockModelId: requireEnv("BEDROCK_MODEL_ID"), pineconeApiKey: pineconeApiKey!, - pineconeIndex, - batchSize: parseInt(process.env.BATCH_SIZE || "25", 10), + pineconeIndex: pineconeIndex!, + batchSize: parseInt(requireEnv("BATCH_SIZE"), 10), s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined, dryRun: false, - embeddingConcurrency: parseInt( - process.env.EMBEDDING_CONCURRENCY || "5", - 10, - ), + embeddingConcurrency: parseInt(requireEnv("EMBEDDING_CONCURRENCY"), 10), }; try { diff --git a/docs-embeddings/src/embedder.ts b/docs-embeddings/src/embedder.ts index dbfa98fc..4e7449e6 100644 --- a/docs-embeddings/src/embedder.ts +++ b/docs-embeddings/src/embedder.ts @@ -5,15 +5,15 @@ import { BedrockRuntimeClient, InvokeModelCommand, - InvokeModelCommandInput -} from '@aws-sdk/client-bedrock-runtime'; + InvokeModelCommandInput, +} from "@aws-sdk/client-bedrock-runtime"; export class BedrockEmbedder { private client: BedrockRuntimeClient; private modelId: string; private callCount: number = 0; - constructor(region: string = 'ap-south-1', modelId: string = 'amazon.titan-embed-text-v2:0') { + constructor(region: string, modelId: string) { this.client = new BedrockRuntimeClient({ region }); this.modelId = modelId; } @@ -33,19 +33,19 @@ export class BedrockEmbedder { try { const input: InvokeModelCommandInput = { modelId: this.modelId, - contentType: 'application/json', - accept: 'application/json', + contentType: "application/json", + accept: "application/json", body: JSON.stringify({ inputText: content, - normalize: true - }) + normalize: true, + }), }; const command = new InvokeModelCommand(input); const response = await this.client.send(command); const responseBody = JSON.parse( - new TextDecoder().decode(response.body) + new TextDecoder().decode(response.body), ); this.callCount++; @@ -53,22 +53,27 @@ export class BedrockEmbedder { } catch (error: any) { lastError = error; const isRetryable = - error?.name === 'ThrottlingException' || - error?.name === 'ServiceUnavailableException' || - error?.name === 'ModelTimeoutException' || + error?.name === "ThrottlingException" || + error?.name === "ServiceUnavailableException" || + error?.name === "ModelTimeoutException" || error?.$metadata?.httpStatusCode === 429 || error?.$metadata?.httpStatusCode >= 500; if (!isRetryable || attempt === BedrockEmbedder.MAX_RETRIES) { - console.error(`Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`, error); + console.error( + `Bedrock embedding failed (attempt ${attempt + 1}/${BedrockEmbedder.MAX_RETRIES + 1}):`, + error, + ); throw error; } // Full jitter: randomize within [50%, 100%] of exponential delay // to prevent thundering herd when multiple concurrent calls retry const maxDelay = Math.min(1000 * Math.pow(2, attempt), 16000); - const delay = Math.floor(maxDelay / 2 + Math.random() * maxDelay / 2); - console.warn(` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`); + const delay = Math.floor(maxDelay / 2 + (Math.random() * maxDelay) / 2); + console.warn( + ` Bedrock throttled (attempt ${attempt + 1}), retrying in ${delay}ms...`, + ); await this.sleep(delay); } } @@ -113,6 +118,6 @@ export class BedrockEmbedder { } private sleep(ms: number): Promise { - return new Promise(resolve => setTimeout(resolve, ms)); + return new Promise((resolve) => setTimeout(resolve, ms)); } } diff --git a/docs-embeddings/src/index.ts b/docs-embeddings/src/index.ts index 288a25bc..310f5ba9 100644 --- a/docs-embeddings/src/index.ts +++ b/docs-embeddings/src/index.ts @@ -11,6 +11,12 @@ import type { EmbeddingConfig } from './types.js'; dotenv.config({ path: '.env.local' }); dotenv.config({ path: '.env' }); +function requireEnv(name: string): string { + const value = process.env[name]; + if (!value) throw new Error(`${name} environment variable is required`); + return value; +} + async function main() { try { const dryRun = process.argv.includes('--dry-run') || process.env.DRY_RUN === 'true'; @@ -21,14 +27,14 @@ async function main() { path.join(process.cwd(), '..', 'docs-ingestion', 'output', 'chunks.json'), stateFilePath: process.env.STATE_FILE_PATH || path.join(process.cwd(), 'state', 'indexed-hashes.json'), - awsRegion: process.env.AWS_REGION || 'ap-south-1', - bedrockModelId: process.env.BEDROCK_MODEL_ID || 'amazon.titan-embed-text-v2:0', + awsRegion: requireEnv('AWS_REGION'), + bedrockModelId: requireEnv('BEDROCK_MODEL_ID'), pineconeApiKey: process.env.PINECONE_API_KEY || '', - pineconeIndex: process.env.PINECONE_INDEX || 'docs-embeddings', - batchSize: parseInt(process.env.BATCH_SIZE || '25', 10), + pineconeIndex: requireEnv('PINECONE_INDEX'), + batchSize: parseInt(requireEnv('BATCH_SIZE'), 10), s3ContentBucket: process.env.CONTENT_BUCKET_NAME || undefined, dryRun, - embeddingConcurrency: parseInt(process.env.EMBEDDING_CONCURRENCY || '3', 10), + embeddingConcurrency: parseInt(requireEnv('EMBEDDING_CONCURRENCY'), 10), }; // Validate (Pinecone key not required in dry-run) diff --git a/docs-embeddings/src/verify-embed.ts b/docs-embeddings/src/verify-embed.ts index 005d7900..864c5871 100644 --- a/docs-embeddings/src/verify-embed.ts +++ b/docs-embeddings/src/verify-embed.ts @@ -73,7 +73,10 @@ async function main(): Promise { // ── Connect to Pinecone ────────────────────────────────────── const pineconeApiKey = process.env.PINECONE_API_KEY; - const pineconeIndex = process.env.PINECONE_INDEX || "docs-embeddings"; + const pineconeIndex = process.env.PINECONE_INDEX; + if (!pineconeIndex) { + fail("PINECONE_INDEX environment variable is required"); + } if (!pineconeApiKey) { fail("PINECONE_API_KEY environment variable is required"); @@ -105,7 +108,7 @@ async function main(): Promise { console.log(" CONTENT_BUCKET_NAME not set — skipping S3 check\n"); } else { const s3 = new S3Client({ - region: process.env.AWS_REGION || "ap-south-1", + region: (() => { const r = process.env.AWS_REGION; if (!r) fail("AWS_REGION environment variable is required"); return r; })(), }); // Deterministic sample: pick every Nth chunk instead of random diff --git a/docs-ingestion/src/upload-content.ts b/docs-ingestion/src/upload-content.ts index 3df9bbd2..b6833e23 100644 --- a/docs-ingestion/src/upload-content.ts +++ b/docs-ingestion/src/upload-content.ts @@ -19,7 +19,10 @@ async function main() { try { // Load config from environment const bucketName = process.env.CONTENT_BUCKET_NAME; - const awsRegion = process.env.AWS_REGION || "ap-south-1"; + const awsRegion = process.env.AWS_REGION; + if (!awsRegion) { + throw new Error("AWS_REGION environment variable is required"); + } const outputPath = process.env.OUTPUT_PATH || path.join(process.cwd(), "output", "chunks.json"); From a59a05e9f116a0bd5a0d8190666a0722965c9189 Mon Sep 17 00:00:00 2001 From: pulkit28 Date: Wed, 25 Mar 2026 19:14:37 +0530 Subject: [PATCH 2/3] ci: retrigger with GitHub Actions variables set From d3a7c1d8881d14cbb2bad9fb101d4c620cdf41a5 Mon Sep 17 00:00:00 2001 From: pulkit28 Date: Wed, 25 Mar 2026 19:17:29 +0530 Subject: [PATCH 3/3] ci: move CONTENT_BUCKET_NAME from secrets to variables --- .github/workflows/docs-ingestion-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-ingestion-ci.yml b/.github/workflows/docs-ingestion-ci.yml index fc8212cc..d984eca2 100644 --- a/.github/workflows/docs-ingestion-ci.yml +++ b/.github/workflows/docs-ingestion-ci.yml @@ -311,7 +311,7 @@ jobs: working-directory: docs-ingestion env: AWS_REGION: ${{ vars.AWS_REGION }} - CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }} + CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }} run: node dist/upload-content.js # ── Build and run embedding sync ── @@ -330,7 +330,7 @@ jobs: BEDROCK_MODEL_ID: ${{ vars.BEDROCK_MODEL_ID }} PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} PINECONE_INDEX: ${{ vars.PINECONE_INDEX }} - CONTENT_BUCKET_NAME: ${{ secrets.CONTENT_BUCKET_NAME }} + CONTENT_BUCKET_NAME: ${{ vars.CONTENT_BUCKET_NAME }} BATCH_SIZE: ${{ vars.BATCH_SIZE }} EMBEDDING_CONCURRENCY: ${{ vars.EMBEDDING_CONCURRENCY }} INGESTION_OUTPUT_PATH: ${{ github.workspace }}/docs-ingestion/output/chunks.json