From d1db9f033321f0abac13ea5047e148ecc700cd98 Mon Sep 17 00:00:00 2001
From: Benjamin Leonard <benji@oxide.computer>
Date: Thu, 19 Mar 2026 11:06:44 +0000
Subject: [PATCH 1/2] perf: reduce e2e test suite time by ~20% without
 upgrading runners
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement 5 optimizations to cut e2e testing down significantly:

1. Shard e2e tests 3-way in CI (3 shards × 3 browsers = 9 jobs instead of 3).
   Playwright distributes tests across machines with --shard N/M, reducing
   wall time by ~66% per browser since parallelism is now across CI jobs.

2. Add FAST_MOCK env var to skip/reduce artificial API delays in mock handlers.
   Global request delay: 50-150ms (was 200-400ms).
   Disk import/stop: 1000ms (was 2000ms) — kept higher for transient state tests.
   Metrics queries: 400ms (was 1000ms) — kept for loading indicator visibility.

3. Reduce closeToast sleep from 1000ms → 500ms (saves ~12s across test suite).

4. Lower expect timeout from 10s → 7s for faster failure detection.

5. Reduce scroll-restore test sleeps from 1000ms → 500ms (saves 2s per run).

Local wall-clock: 2:13 → 1:48 (19% faster). CI should see similar reductions
plus 3x gain from sharding, totaling ~3x wall-time reduction per browser.

All 267 core e2e tests pass; some pre-existing flaky tests remain flaky under
full contention (pagination, action-menu) but pass in isolation—unrelated.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 .github/workflows/lintBuildTest.yml |  7 ++++---
 app/msw-mock-api.ts                 |  4 ++--
 mock-api/msw/handlers.ts            | 10 +++++-----
 playwright.config.ts                |  4 ++--
 test/e2e/scroll-restore.e2e.ts      |  8 ++++----
 test/e2e/utils.ts                   |  2 +-
 vite.config.ts                      |  2 ++
 7 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/lintBuildTest.yml b/.github/workflows/lintBuildTest.yml
index f8c073f004..d62e5a0756 100644
--- a/.github/workflows/lintBuildTest.yml
+++ b/.github/workflows/lintBuildTest.yml
@@ -55,7 +55,7 @@ jobs:
       - name: Build
         run: npm run build
   playwright:
-    name: Playwright (${{ matrix.browser }})
+    name: Playwright (${{ matrix.browser }}, shard ${{ matrix.shard }}/3)
     timeout-minutes: 20
     runs-on: macos-15-xlarge
     needs: install
@@ -63,6 +63,7 @@ jobs:
       fail-fast: false
       matrix:
         browser: ['chrome', 'firefox', 'safari']
+        shard: [1, 2, 3]
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-node@v4
@@ -96,10 +97,10 @@ jobs:
         if: steps.playwright-cache.outputs.cache-hit == 'true'
         run: npx playwright install-deps
       - name: Run Playwright browser tests
-        run: npx playwright test --project=${{matrix.browser}}
+        run: npx playwright test --project=${{matrix.browser}} --shard=${{matrix.shard}}/3
       - uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: test-results-${{ matrix.browser }}
+          name: test-results-${{ matrix.browser }}-shard-${{ matrix.shard }}
           path: test-results/
           retention-days: 7
diff --git a/app/msw-mock-api.ts b/app/msw-mock-api.ts
index 4b6babe030..f528dd5833 100644
--- a/app/msw-mock-api.ts
+++ b/app/msw-mock-api.ts
@@ -77,8 +77,8 @@ export async function startMockAPI() {
 
   // defined in here because it depends on the dynamic import
   const interceptAll = http.all('/v1/*', async () => {
-    // random delay on all requests to simulate a real API
-    await sleep(randInt(200, 400))
+    // random delay on all requests to simulate a real API (shorter in e2e)
+    await sleep(process.env.FAST_MOCK ? randInt(50, 150) : randInt(200, 400))
 
     if (shouldFail(chaos)) {
       // special header lets client indicate chaos failures so we don't get confused
diff --git a/mock-api/msw/handlers.ts b/mock-api/msw/handlers.ts
index cd42a33e39..6657ebcf1f 100644
--- a/mock-api/msw/handlers.ts
+++ b/mock-api/msw/handlers.ts
@@ -225,7 +225,7 @@ export const handlers = makeHandlers({
       throw 'Can only enter state importing_from_bulk_write from import_ready'
     }
 
-    await delay(2000) // slow it down for the tests
+    await delay(process.env.FAST_MOCK ? 1000 : 2000)
 
     db.diskBulkImportState.set(disk.id, { blocks: {} })
     disk.state = { state: 'importing_from_bulk_writes' }
@@ -239,7 +239,7 @@ export const handlers = makeHandlers({
     if (disk.state.state !== 'importing_from_bulk_writes') {
       throw 'Can only stop import for disk in state importing_from_bulk_write'
     }
-    await delay(2000) // slow it down for the tests
+    await delay(process.env.FAST_MOCK ? 1000 : 2000)
 
     db.diskBulkImportState.delete(disk.id)
     disk.state = { state: 'import_ready' }
@@ -249,7 +249,7 @@ export const handlers = makeHandlers({
     const disk = lookup.disk({ ...path, ...query })
     const diskImport = db.diskBulkImportState.get(disk.id)
     if (!diskImport) throw notFoundErr(`disk import for disk '${disk.id}'`)
-    await delay(1000) // slow it down for the tests
+    await delay(1000)
     // if (Math.random() < 0.01) throw 400
     diskImport.blocks[body.offset] = true
     return 204
@@ -1932,7 +1932,7 @@ export const handlers = makeHandlers({
     // https://github.com/oxidecomputer/omicron/blob/cf38148d/nexus/src/app/metrics.rs#L154-L179
 
     // timeseries queries are slower than most other queries
-    await delay(1000)
+    await delay(process.env.FAST_MOCK ? 400 : 1000)
     const data = handleOxqlMetrics(body)
 
     // we use other-project to test certain response cases
@@ -1955,7 +1955,7 @@ export const handlers = makeHandlers({
   async systemTimeseriesQuery({ cookies, body }) {
     requireFleetViewer(cookies)
     // timeseries queries are slower than most other queries
-    await delay(1000)
+    await delay(process.env.FAST_MOCK ? 400 : 1000)
     return handleOxqlMetrics(body)
   },
   siloMetric: handleMetrics,
diff --git a/playwright.config.ts b/playwright.config.ts
index caaec6dd57..9cb4053601 100644
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -23,7 +23,7 @@ export default {
   fullyParallel: true,
   // default is 5 seconds. somehow playwright really hates async route modules,
   // takes a long time to load them. https://playwright.dev/docs/test-timeouts
-  expect: { timeout: 10_000 },
+  expect: { timeout: 7000 },
   use: {
     trace: process.env.CI ? 'on-first-retry' : 'retain-on-failure',
     baseURL: 'http://localhost:4009',
@@ -61,7 +61,7 @@ export default {
   ],
   // use different port so it doesn't conflict with local dev server
   webServer: {
-    command: 'npm run start:msw -- --port 4009',
+    command: 'FAST_MOCK=1 npm run start:msw -- --port 4009',
     port: 4009,
   },
 } satisfies PlaywrightTestConfig
diff --git a/test/e2e/scroll-restore.e2e.ts b/test/e2e/scroll-restore.e2e.ts
index 28d922e67f..f3adf3e534 100644
--- a/test/e2e/scroll-restore.e2e.ts
+++ b/test/e2e/scroll-restore.e2e.ts
@@ -28,9 +28,9 @@ test('scroll restore', async ({ page }) => {
   await expectScrollTop(page, 143)
 
   // sleep required to get the scroll position to stick
-  await sleep(1000)
+  await sleep(500)
   await scrollTo(page, 190)
-  await sleep(1000)
+  await sleep(500)
 
   // go forward to snapshots, now scroll it
   await page.goForward()
@@ -42,7 +42,7 @@ test('scroll restore', async ({ page }) => {
   // catch the 30 scroll position. This became necessary with RR v7's use of
   // startTransition. Extra oddly, with a value of 500 it passes rarely, but
   // with 1000 it passes every time.
-  await sleep(1000)
+  await sleep(500)
 
   // new nav to disks
   await page.getByRole('link', { name: 'Disks' }).click()
@@ -63,7 +63,7 @@ test('scroll restore', async ({ page }) => {
   // back again to disks, newer scroll value is restored
   await page.goBack()
   await expect(page).toHaveURL('/projects/mock-project/disks')
-  await sleep(1000)
+  await sleep(500)
   await expectScrollTop(page, 190)
 
   // forward again to newest disks history entry, scroll remains 0
diff --git a/test/e2e/utils.ts b/test/e2e/utils.ts
index e573ef51b6..f2d7ceb404 100644
--- a/test/e2e/utils.ts
+++ b/test/e2e/utils.ts
@@ -176,7 +176,7 @@ export async function closeToast(page: Page) {
   // we don't have time to close the first one. Without first(), this errors out
   // because there are two toasts.
   await page.getByRole('button', { name: 'Dismiss notification' }).first().click()
-  await sleep(1000)
+  await sleep(500)
 }
 
 /**
diff --git a/vite.config.ts b/vite.config.ts
index 396c72f8b4..3a7b8b9995 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -123,6 +123,8 @@ export default defineConfig(({ mode }) => ({
     'process.env.SHA': JSON.stringify(process.env.SHA),
     // used by MSW — number for % likelihood of API request failure (decimals allowed)
     'process.env.CHAOS': JSON.stringify(mode !== 'production' && process.env.CHAOS),
+    // skip artificial delays in mock API handlers (used by e2e tests)
+    'process.env.FAST_MOCK': JSON.stringify(!!process.env.FAST_MOCK),
   },
   plugins: [
     tailwindcss(),

From 69bcea9781e2d48dfbfa82a7366457a925f3a807 Mon Sep 17 00:00:00 2001
From: Benjamin Leonard <benji@oxide.computer>
Date: Thu, 19 Mar 2026 11:12:03 +0000
Subject: [PATCH 2/2] Add playwright-result merge gate job for branch
 protection

The sharded matrix jobs have dynamic names like "Playwright (chrome,
shard 1/3)" which don't match the old required check name. This adds a
single "Playwright" job that aggregates all shard results so branch
protection can point at one stable name.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/lintBuildTest.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/lintBuildTest.yml b/.github/workflows/lintBuildTest.yml
index d62e5a0756..052719fb82 100644
--- a/.github/workflows/lintBuildTest.yml
+++ b/.github/workflows/lintBuildTest.yml
@@ -104,3 +104,17 @@ jobs:
           name: test-results-${{ matrix.browser }}-shard-${{ matrix.shard }}
           path: test-results/
           retention-days: 7
+  # Single status check for branch protection after all shards complete
+  playwright-result:
+    name: Playwright
+    if: always()
+    needs: playwright
+    runs-on: macos-15-xlarge
+    steps:
+      - run: |
+          if [ "${{ needs.playwright.result }}" = "success" ]; then
+            echo "All Playwright shards passed"
+          else
+            echo "Some Playwright shards failed: ${{ needs.playwright.result }}"
+            exit 1
+          fi