From d3f38ed1854bff9704be126da71651d44b638f9d Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:19:09 -0700 Subject: [PATCH] feat(docs): enhance search with TF-IDF relevance ranking (#40) Search improvements: - TF-IDF relevance ranking toggle (localStorage persistent) - Pagefind basePath fix for dev mode - Conditional keyword highlights (on when relevance off) - Playwright e2e tests with self-contained webServer - build:pagefind convenience script for dev - CI: docs search index build step for test suite Team review: Flight approve, FIDO approve Relates to diberry/squad#40 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/squad-ci.yml | 3 + docs/.gitignore | 2 + docs/package.json | 4 +- docs/playwright.config.mjs | 8 +- docs/scripts/build-search-index.mjs | 153 +++++++++++++++++++ docs/src/components/Search.astro | 180 +++++++++++++++++++++-- docs/src/styles/global.css | 2 +- docs/tests/search.spec.mjs | 2 +- test/docs-search.test.ts | 218 ++++++++++++++++++++++++++++ 9 files changed, 559 insertions(+), 13 deletions(-) create mode 100644 docs/scripts/build-search-index.mjs create mode 100644 test/docs-search.test.ts diff --git a/.github/workflows/squad-ci.yml b/.github/workflows/squad-ci.yml index 773754e39..01b0f491e 100644 --- a/.github/workflows/squad-ci.yml +++ b/.github/workflows/squad-ci.yml @@ -179,6 +179,9 @@ jobs: - name: Build run: npm run build + - name: Build docs search index + run: cd docs && npm ci && npm run build:search && mkdir -p dist && cp public/search-index.json dist/search-index.json + - name: Run tests run: npm test diff --git a/docs/.gitignore b/docs/.gitignore index ddce69b68..6ec296f6a 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1,5 @@ node_modules/ dist/ .astro/ +public/search-index.json +public/pagefind/ diff --git a/docs/package.json b/docs/package.json index 488a4890d..7e0853063 100644 --- a/docs/package.json +++ b/docs/package.json @@ -5,7 +5,9 @@ "private": true, "scripts": { "dev": "astro dev", - "build": "astro build && npx pagefind --site dist", + "build:search": "node scripts/build-search-index.mjs", + "build:pagefind": "npm run build && node -e \"require('fs').cpSync('dist/pagefind','public/pagefind',{recursive:true,force:true});console.log('Pagefind index copied to public/pagefind — dev search ready')\"", + "build": "node scripts/build-search-index.mjs && astro build && npx pagefind --site dist", "preview": "astro preview", "astro": "astro", "test": "node --test tests/build-output.test.mjs && npx playwright test", diff --git a/docs/playwright.config.mjs b/docs/playwright.config.mjs index 2c69a1ac9..fd04fe182 100644 --- a/docs/playwright.config.mjs +++ b/docs/playwright.config.mjs @@ -6,8 +6,14 @@ export default defineConfig({ timeout: 30_000, retries: 0, use: { - baseURL: 'http://localhost:4321/squad/', + baseURL: 'http://localhost:4322/squad/', browserName: 'chromium', headless: true, }, + webServer: { + command: 'npm run build && npx astro preview --port 4322', + port: 4322, + timeout: 120_000, + reuseExistingServer: !process.env.CI, + }, }); diff --git a/docs/scripts/build-search-index.mjs b/docs/scripts/build-search-index.mjs new file mode 100644 index 000000000..00e082849 --- /dev/null +++ b/docs/scripts/build-search-index.mjs @@ -0,0 +1,153 @@ +#!/usr/bin/env node +/** + * build-search-index.mjs + * Reads all .md files from docs/src/content/docs/, chunks by ## headings, + * and outputs a static search-index.json for client-side TF-IDF search. + */ + +import { readdir, readFile, writeFile, mkdir } from 'node:fs/promises'; +import { join, relative, dirname, sep } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const DOCS_ROOT = join(__dirname, '..', 'src', 'content', 'docs'); +const OUTPUT_DIR = join(__dirname, '..', 'public'); +const OUTPUT_FILE = join(OUTPUT_DIR, 'search-index.json'); + +// Section display names derived from directory +const SECTION_NAMES = { + 'get-started': 'Get Started', + guide: 'Guide', + features: 'Features', + reference: 'Reference', + scenarios: 'Scenarios', + concepts: 'Concepts', + cookbook: 'Cookbook', +}; + +async function collectMdFiles(dir) { + const entries = await readdir(dir, { withFileTypes: true }); + const files = []; + for (const entry of entries) { + const full = join(dir, entry.name); + if (entry.isDirectory()) { + files.push(...(await collectMdFiles(full))); + } else if (entry.name.endsWith('.md')) { + files.push(full); + } + } + return files; +} + +function stripFrontmatter(content) { + const match = content.match(/^---\r?\n[\s\S]*?\r?\n---\r?\n/); + return match ? content.slice(match[0].length) : content; +} + +function extractTitle(content) { + const match = content.match(/^#\s+(.+)$/m); + return match ? match[1].trim() : 'Untitled'; +} + +function deriveSlug(filePath) { + let rel = relative(DOCS_ROOT, filePath) + .replace(/\\/g, '/') + .replace(/\.md$/, ''); + if (rel.endsWith('/index')) rel = rel.replace(/\/index$/, ''); + return rel; +} + +function deriveSection(slug) { + const first = slug.split('/')[0]; + return SECTION_NAMES[first] || first.charAt(0).toUpperCase() + first.slice(1); +} + +function stripMarkdown(text) { + return text + .replace(/!\[.*?\]\(.*?\)/g, '') // images + .replace(/\[([^\]]*)\]\(.*?\)/g, '$1') // links → text + .replace(/(`{1,3})[\s\S]*?\1/g, '') // inline/fenced code + .replace(/^>\s?/gm, '') // blockquotes + .replace(/[*_~]{1,3}/g, '') // bold/italic/strikethrough + .replace(/^[-*+]\s/gm, '') // unordered list markers + .replace(/^\d+\.\s/gm, '') // ordered list markers + .replace(/\|/g, ' ') // table pipes + .replace(/^-{3,}$/gm, '') // horizontal rules + .replace(/<[^>]+>/g, '') // HTML tags + .replace(/\n{2,}/g, '\n') // collapse blank lines + .trim(); +} + +function chunkByHeadings(content, pageTitle, slug) { + const body = stripFrontmatter(content); + const section = deriveSection(slug); + const lines = body.split('\n'); + const chunks = []; + let currentHeading = pageTitle; + let buffer = []; + + function flush() { + const raw = buffer.join('\n').trim(); + if (!raw) return; + const text = stripMarkdown(raw); + if (text.length < 20) return; // skip tiny chunks + chunks.push({ + title: pageTitle, + slug, + section, + heading: currentHeading, + text, + }); + } + + for (const line of lines) { + const headingMatch = line.match(/^#{2,3}\s+(.+)/); + if (headingMatch) { + flush(); + currentHeading = headingMatch[1].trim(); + buffer = []; + } else { + buffer.push(line); + } + } + flush(); + + // If no chunks were produced, add the whole page as one chunk + if (chunks.length === 0) { + const text = stripMarkdown(body); + if (text.length >= 20) { + chunks.push({ title: pageTitle, slug, section, heading: pageTitle, text }); + } + } + + return chunks; +} + +async function main() { + console.log('Building search index...'); + const files = await collectMdFiles(DOCS_ROOT); + console.log(`Found ${files.length} markdown files`); + + const allChunks = []; + + for (const file of files) { + const content = await readFile(file, 'utf-8'); + const title = extractTitle(content); + const slug = deriveSlug(file); + const chunks = chunkByHeadings(content, title, slug); + allChunks.push(...chunks); + } + + await mkdir(OUTPUT_DIR, { recursive: true }); + const json = JSON.stringify(allChunks); + await writeFile(OUTPUT_FILE, json, 'utf-8'); + + const sizeKB = (Buffer.byteLength(json) / 1024).toFixed(1); + console.log(`✓ ${allChunks.length} chunks from ${files.length} files`); + console.log(`✓ Output: search-index.json (${sizeKB} KB)`); +} + +main().catch((err) => { + console.error('Build search index failed:', err); + process.exit(1); +}); diff --git a/docs/src/components/Search.astro b/docs/src/components/Search.astro index cac07914b..5bcbe4276 100644 --- a/docs/src/components/Search.astro +++ b/docs/src/components/Search.astro @@ -36,6 +36,17 @@ const base = import.meta.env.BASE_URL; /> Esc + +