diff --git a/config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml b/config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml
new file mode 100644
index 00000000..6f101702
--- /dev/null
+++ b/config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml
@@ -0,0 +1,70 @@
+
+# @package _global_
+# Run: python scripts/run_tts_eval.py --config-path=../config --config-name=experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy
+# Beam Search for MATH-500 using OpenRouter API with entropy scorer
+# Quick test config for debugger visualization
+#
+# Environment variable required: OPENROUTER_API_KEY
+
+defaults:
+  - /config
+  - /model/openrouter
+  - /generation/default
+  - /system/default
+  - /strategy/beam_search
+  - /scorer/entropy
+  - /evaluation/default
+  - _self_
+
+# Run naming
+run_name: "beam_search_openrouter_gpt4o_mini_math500_entropy_seed${system.seed}_${now:%H-%M-%S}"
+
+# Main configuration
+verbose: true
+report_to: none
+
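+# Model configuration (OpenRouter API). NOTE: model_path below is Qwen 2.5 7B although the file and run name say gpt4o_mini; confirm which is intended.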
+model:
+  type: "openai_api"
+  provider: openrouter
+  model_path: "qwen/qwen-2.5-7b-instruct"
+  api_key: null  # Set via OPENROUTER_API_KEY env var
+  supports_logprobs: true
+  prefill_mode: false
+  max_context_budget: 128000
+
+# System configuration
+system:
+  device: cpu
+  seed: 42
+
+# Generation configuration
+generation:
+  max_new_tokens: 2048
+  temperature: 0.7
+  top_p: 0.95
+  top_k: 50
+  batch_size: 1
+
+# Strategy - Beam Search (bigger tree for visualization)
+strategy:
+  beam_size: 3
+  candidates_per_beam: 3
+  max_steps: 10
+  min_step_tokens: 5
+  max_step_tokens: 100
+
+# Entropy scorer configuration
+scorer:
+  type: entropy
+  batch_size: 1
+
+# Dataset configuration - MATH-500
+dataset:
+  data_name: "math"
+  dataset_path: "test-time-compute/test_MATH"
+  dataset_split: "test"
+  subset: 3
+  offset: 0
+  answer_format: "numeric"
+  prompt_file: "${hydra:runtime.cwd}/config/prompts/default.txt"
diff --git a/docs/service/debugger.md b/docs/service/debugger.md
new file mode 100644
index 00000000..f60ff493
--- /dev/null
+++ b/docs/service/debugger.md
@@ -0,0 +1,78 @@
+# Experiment Results Visualizer
+
+View experiment results from `run_tts_eval.py` in the Visual Debugger — without re-running strategies live.
+
+## Quick Start
+
+```bash
+# 1. Run an experiment (tree data is now saved automatically)
+python scripts/run_tts_eval.py --config-path=../config --config-name=experiments/beam_search/...
+
+# 2. Convert results to debugger format
+python scripts/convert_results_to_debugger.py outputs/<date>/<run_name>/ --install
+
+# 3. Serve and open in browser
+python -m http.server 8080 -d service_app
+# Open http://localhost:8080/static/debugger/index.html
+```
+
+## Converter Options
+
+```bash
+# Basic: creates debugger_payload.json in the output dir
+python scripts/convert_results_to_debugger.py outputs/<path>/
+
+# Install as cached_examples.json (auto-loads in debugger; replaces the existing file, use --merge instead to append)
+python scripts/convert_results_to_debugger.py outputs/<path>/ --install
+
+# Only incorrect samples (for debugging failures)
+python scripts/convert_results_to_debugger.py outputs/<path>/ --incorrect-only
+
+# Limit number of samples
+python scripts/convert_results_to_debugger.py outputs/<path>/ --max-samples 50
+
+# Custom output path
+python scripts/convert_results_to_debugger.py outputs/<path>/ --out my_results.json
+```
+
+## Using the Debugger
+
+1. Samples appear in the **Scenario** dropdown
+2. Select a strategy/scorer and click **Run** to see the tree
+3. **Timeline** (left) — click through reasoning steps
+4. **Tree** (bottom) — orange path = selected, grey = pruned
+5. **Candidates** panel — scores and text for each candidate at a step
+6. **Prev/Next** buttons — navigate between samples
+7. **Incorrect only** checkbox — filter to failed samples
+8. **Load File** button — load a `debugger_payload.json` without `--install`
+
+## What the Tree Shows
+
+- Each node is a candidate generated at a reasoning step
+- **Orange path**: the beam the strategy selected as its final answer
+- **Grey nodes**: candidates that were generated and scored but pruned
+- Click any node to see its full text and scores
+
+## Changed Files
+
+| File | Change |
+|------|--------|
+| `scripts/run_tts_eval.py` | Save tree data (`step_candidates`, `all_trajectories`, etc.) to `results.json` — previously discarded |
+| `scripts/convert_results_to_debugger.py` | **New.** Converts experiment output to debugger JSON format |
+| `service_app/static/debugger/index.html` | Added file upload input and sample navigation (Prev/Next, Incorrect only filter) |
+| `service_app/static/debugger/app.js` | File upload handler, sample navigation logic, auto-enable cached mode for offline use |
+| `config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml` | **New.** Quick-test config for OpenRouter beam search on MATH-500 |
+
+## Supported Strategies
+
+All strategies that produce tree data work:
+- **Beam Search** — full tree with per-step candidates and beam lineage
+- **Online Best-of-N** — stepwise candidate pools
+- **Offline Best-of-N** — trajectory-level reranking with per-step breakdown
+- **Self-Consistency** — parallel reasoning paths with voting
+
+Baseline (single-pass) also works but shows a linear chain (no branching).
+
+## Note on Old Experiments
+
+Experiments run **before** the `run_tts_eval.py` change won't have tree data in `results.json`. The converter still works — it falls back to a stepwise view (one candidate per step) — but you won't see the full branching tree. Re-run the experiment to get tree data.
diff --git a/presentation/experiment_visualization_plan.md b/presentation/experiment_visualization_plan.md
new file mode 100644
index 00000000..c3add5d6
--- /dev/null
+++ b/presentation/experiment_visualization_plan.md
@@ -0,0 +1,232 @@
+# Plan: Visualizing Experiment Results in the Visual Debugger
+
+## Goal
+
+Enable viewing experiment results (from `scripts/run_tts_eval.py`) in the existing Visual Debugger UI — without running strategies live.
+
+---
+
+## Current Architecture
+
+### How the Debugger Works Now
+
+1. **Backend** runs a strategy via `strategy_manager.py` → gets raw result dict
+2. **`debugger_events.py`** converts the result dict into a list of **events** (the universal visualization format)
+3. **`app.js`** receives events and builds an interactive tree via `buildTreeFromEvents(events)`
+
+The key conversion layer is `debugger_events.py:convert_strategy_result_to_debugger_run()` — it transforms raw strategy output into the event format the frontend understands.
+
+### Event Format (what the frontend needs)
+
+```json
+{
+  "step": 1,
+  "title": "Step 1: Candidate generation",
+  "stage": "tree_expand",
+  "signals": [{"name": "confidence", "value": 0.85, "direction": "higher_better"}],
+  "candidates": [
+    {
+      "id": "step_1_candidate_0",
+      "label": "Candidate 1",
+      "text": "Let me think about this...",
+      "status": "selected",
+      "selected": true,
+      "signals": {"confidence": 0.85, "prm": 0.92},
+      "beam_uid": 1,
+      "parent_beam_uid": null
+    }
+  ]
+}
+```
+
+### What `run_tts_eval.py` Already Saves
+
+| File | Content |
+|------|---------|
+| `results.json` | Per-sample results: steps, scores, trajectory, extracted_answer |
+| `candidates.json` | Multi-trajectory data (offline BoN) |
+| `sample_metrics.jsonl` | Per-sample compute metrics |
+| `metrics.json` | Aggregated accuracy, tokens, etc. |
+
+**IMPORTANT: `results.json` does NOT save the tree-building data.** The strategies return `step_candidates` (beam search, online BoN) and `all_trajectories` (offline BoN, self-consistency) in their result dicts, but `run_tts_eval.py` discards them (lines 1631-1674 cherry-pick only a subset of fields). Currently saved:
+- `steps` — list of step dicts with `text`, `token_ids`, `generation_scores`, `other_data`
+- `validity_scores` — flat list of per-step scores
+- `generated_trajectory` — concatenated text
+- `extracted_answer`, `answer_step`, `token_stats`, completion info
+
+**Not saved (but available in strategy return value):**
+- **Beam Search / Online BoN**: `step_candidates` — per-step decision points with all candidates, their scores, beam UIDs, parent linkage. This is the full tree structure.
+- **Offline BoN**: `all_trajectories`, `all_scores`, `all_step_scores`, `best_idx` — all N candidate trajectories with scores.
+- **Self-Consistency**: `all_trajectories` — all sampled paths.
+
+---
+
+## Proposed Approach
+
+### Two things are needed:
+
+**1. Save tree data in `run_tts_eval.py`** — modify `_generate_trajectories_batch()` (line 1631) to also save `step_candidates` and `all_trajectories` to results.json (or a separate `tree_data.json` to keep results.json lightweight).
+
+**2. A standalone converter script** that reads the saved data and converts it to the debugger format:
+- Reads `results.json` (with newly-saved tree fields) from an experiment output dir
+- Calls the existing `convert_strategy_result_to_debugger_run()` for each sample
+- Outputs a `cached_examples.json`-compatible file for the debugger to load
+
+### Architecture
+
+```
+                          Step 0 (one-time)
+                    ┌──────────────────────────┐
+                    │ Modify run_tts_eval.py   │
+                    │ to save step_candidates  │
+                    │ and all_trajectories     │
+                    └──────────────────────────┘
+
+Experiment output dir          Converter              Visual Debugger
+┌──────────────────┐     ┌──────────────────┐     ┌──────────────────┐
+│ results.json     │────▶│ convert_results  │────▶│ Load as "cached  │
+│  (with tree data)│     │ _to_debugger.py  │     │  example" or via │
+│ metrics.json     │     │                  │     │  file:// protocol│
+│ config.yaml      │     │ Uses existing    │     │                  │
+│                  │     │ debugger_events  │     │                  │
+└──────────────────┘     │ .py converter    │     └──────────────────┘
+                         └──────────────────┘
+```
+
+---
+
+## Implementation Steps
+
+### Step 0: Save Tree Data in `run_tts_eval.py` (PREREQUISITE)
+
+Currently `_generate_trajectories_batch()` at line 1631 builds `result_dict` without tree-building fields. Add:
+
+```python
+# After line 1660 in _generate_trajectories_batch()
+# Save tree visualization data (if strategy provides it)
+for key in ("step_candidates", "all_trajectories", "all_scores", "all_step_scores", "best_idx"):
+    if key in result:
+        result_dict[key] = result[key]
+```
+
+**Option A** — save directly in `results.json` (simpler, but increases file size significantly for beam search with many candidates).
+
+**Option B** — save to a separate `tree_data.jsonl` file (one line per sample, keyed by index). Keeps `results.json` lightweight. The converter script would then read both files.
+
+**Note on serialization:** `step_candidates` is built from `StepCandidate` objects, but the entries in the strategy result are expected to already be plain dicts with `text`, `token_ids`, `generation_scores`, `other_data` fields (same as `steps`), so JSON serialization should work. Verify with a test run.
+
+### Step 1: Converter Script (`scripts/convert_results_to_debugger.py`)
+
+**Input:** path to experiment output directory (containing `results.json` with tree data)
+**Output:** JSON file in the debugger payload format
+
+```python
+# Pseudocode
+from service_app.core.debugger_events import convert_strategy_result_to_debugger_run
+
+def convert_experiment(output_dir, strategy_type, scorer_type=None):
+    results = load_json(output_dir / "results.json")
+    config = load_yaml(output_dir / ".hydra/config.yaml")  # experiment config
+
+    examples = []
+    for sample in results:
+        # The result dict now contains step_candidates / all_trajectories
+        # (saved by the modified run_tts_eval.py)
+        run_payload = convert_strategy_result_to_debugger_run(
+            strategy={"id": strategy_type, "name": ..., "family": ...},
+            scorer={"id": scorer_type, ...} if scorer_type else None,
+            strategy_result=sample,  # pass the full saved result
+            budget=config.strategy.max_steps,
+            latency_ms=0,
+            ...
+        )
+
+        examples.append({
+            "id": f"sample_{sample['index']}",
+            "title": sample["question"][:80],
+            "description": f"Gold: {sample['gold_answer']}, Predicted: {sample['extracted_answer']}",
+            "payloads": {"default": make_payload(run_payload, sample)}
+        })
+
+    save_json(examples, output_dir / "debugger_payload.json")
+```
+
+### Step 2: Adapt `debugger_events.py` for Serialized Data
+
+The existing converter expects live `StepCandidate` objects (with `.text` attribute). Serialized results have dicts (with `"text"` key). Need to handle both:
+
+- `_build_events_from_step_candidates()` — already works with dict-like candidates (check this)
+- `_build_events_from_trajectory_pool()` — needs to accept step dicts instead of `StepCandidate` objects (access `.text` vs `["text"]`)
+- `_build_stepwise_events()` — same: accept step dicts
+
+**This is the main coding task** — make the converter accept both live objects and serialized JSON. A simple helper can bridge the gap:
+
+```python
+def _step_text(step):
+    """Get text from either a StepCandidate object or a serialized dict."""
+    return step.text if hasattr(step, "text") else step.get("text", str(step))
+```
+
+### Step 3: Add "Load from File" to the Debugger UI
+
+Two options (pick one):
+
+**Option A (simpler):** Generate a `cached_examples.json` and open the debugger HTML as `file://` — already supported, no backend needed.
+
+**Option B (richer):** Add a "Load experiment" button to the debugger that accepts a JSON file upload or a directory path. This would:
+- Add a file input element in `app.js`
+- Parse the uploaded JSON into the same format as cached examples
+- Populate the example selector dropdown
+
+### Step 4: Multi-Sample Navigation
+
+Current debugger shows one problem at a time. For experiments with hundreds of samples, add:
+- Sample index selector (dropdown or prev/next buttons)
+- Filter by correctness (show only incorrect samples for debugging)
+- Summary stats bar (accuracy, avg tokens)
+
+---
+
+## Where to Start
+
+### For the colleague — recommended order:
+
+1. **Start with `service_app/core/debugger_events.py`** — understand `convert_strategy_result_to_debugger_run()` (line 49). This is the core function. Read its input/output contract.
+
+2. **Read one cached example** — look at `service_app/static/debugger/cached_examples.json` to see the exact output format the frontend expects. Focus on `strategies[].run.events[]`.
+
+3. **Modify `run_tts_eval.py`** (Step 0) — add `step_candidates` and `all_trajectories` to the saved result dict. This is ~5 lines of code. Re-run one experiment to generate data with tree fields.
+
+4. **Write the converter script** — `scripts/convert_results_to_debugger.py`:
+   - Load `results.json` (with tree data) from experiment dir
+   - For each sample, call the existing converter (or a thin wrapper)
+   - Output a debugger-compatible JSON
+
+5. **Handle serialization gap** — the converter expects `StepCandidate` objects but results.json has dicts. Create a lightweight adapter or modify the converter to accept both.
+
+6. **Test with file:// protocol** — open `index.html` directly in a browser with the generated JSON as `cached_examples.json` in the same directory.
+
+---
+
+## Key Files to Read
+
+| File | Why |
+|------|-----|
+| `service_app/core/debugger_events.py` | **Core converter** — strategy result → events |
+| `service_app/static/debugger/cached_examples.json` | **Target format** — what the frontend expects |
+| `service_app/static/debugger/app.js:2052-2256` | `buildTreeFromEvents()` — how frontend builds the tree |
+| `service_app/core/visual_debugger_demo.py` | How demo payloads are assembled |
+| `scripts/run_tts_eval.py:1630-1674` | What fields are saved per sample in `results.json` |
+
+---
+
+## Summary
+
+**Answer to the colleague's question:** You don't need to build trees inside `run_tts_eval.py`. The tree construction logic already exists in `debugger_events.py`. But there's a prerequisite: `run_tts_eval.py` currently **discards** the tree-building data (`step_candidates`, `all_trajectories`) when saving results. So the plan is:
+
+1. **Modify `run_tts_eval.py`** (~5 lines) to also save `step_candidates` / `all_trajectories` to disk — this is the raw tree structure that strategies already compute but we throw away
+2. A **post-hoc converter script** (`scripts/convert_results_to_debugger.py`) that reads experiment outputs and calls the existing `debugger_events.py` converter to produce the frontend-ready format
+3. Minor **adaptation of `debugger_events.py`** to accept serialized dicts (from JSON) in addition to live `StepCandidate` objects
+4. Optionally, a **"Load experiment" UI** in the debugger for convenience
+
+The tree data is already computed by strategies at runtime — we just need to stop discarding it and then pipe it through the existing conversion layer.
diff --git a/scripts/convert_results_to_debugger.py b/scripts/convert_results_to_debugger.py
new file mode 100755
index 00000000..ed6ddce6
--- /dev/null
+++ b/scripts/convert_results_to_debugger.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env python3
+"""Convert experiment results (from run_tts_eval.py) to the Visual Debugger format.
+
+Usage:
+    python scripts/convert_results_to_debugger.py <output_dir> [--out PATH] [--incorrect-only] [--max-samples N] [--install] [--merge]
+
+Example:
+    python scripts/convert_results_to_debugger.py outputs/2026-02-04/beam_search_math500_09-09-47
+    # Writes <output_dir>/debugger_payload.json; add --install or --merge to update service_app/static/debugger/cached_examples.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import re
+import sys
+from copy import deepcopy
+from pathlib import Path
+
+# Add the project root to sys.path so we can import service_app
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(PROJECT_ROOT))
+
+import yaml  # noqa: E402
+
+from service_app.core.debugger_events import (  # noqa: E402
+    convert_strategy_result_to_debugger_run,
+)
+
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Strategy type (config strategy.type) -> display metadata: id, name, family, summary (mirrors visual_debugger_demo.py)
+# ---------------------------------------------------------------------------
+
+STRATEGY_META = {
+    "baseline": {
+        "id": "baseline",
+        "name": "Baseline (Raw CoT)",
+        "family": "single_pass",
+        "summary": "Single-pass raw chain-of-thought without search or reranking.",
+    },
+    "beam_search": {
+        "id": "beam_search",
+        "name": "Beam Search (ToT)",
+        "family": "tree_search",
+        "summary": "Tree-of-thought expansion with beam pruning.",
+    },
+    "adaptive": {
+        "id": "adaptive",
+        "name": "Adaptive Best-of-N",
+        "family": "reranking",
+        "summary": "Online best-of-n with adaptive scaling across steps.",
+    },
+    "online_best_of_n": {
+        "id": "online_best_of_n",
+        "name": "Online Best-of-N",
+        "family": "reranking",
+        "summary": "Iterative candidate generation with stepwise reranking.",
+    },
+    "offline_best_of_n": {
+        "id": "offline_best_of_n",
+        "name": "Offline Best-of-N",
+        "family": "reranking",
+        "summary": "Generate full trajectories first, then rerank at the end.",
+    },
+    "self_consistency": {
+        "id": "self_consistency",
+        "name": "Self-Consistency",
+        "family": "sample_and_vote",
+        "summary": "Sample diverse trajectories and select by answer consensus.",
+    },
+}
+
+SCORER_META = {  # scorer type (config scorer.type) -> display metadata; "direction" is higher_better or lower_better
+    "prm": {
+        "id": "prm",
+        "name": "PRM",
+        "direction": "higher_better",
+        "summary": "Process Reward Model trajectory quality score.",
+    },
+    "self_verification": {
+        "id": "self_verification",
+        "name": "Self-Verification (LLM Critic)",
+        "direction": "higher_better",
+        "summary": "LLM-as-a-judge verification scoring.",
+    },
+    "sequence_prob": {
+        "id": "sequence_prob",
+        "name": "Sequence Prob",
+        "direction": "higher_better",
+        "summary": "Cumulative sequence probability from token logprobs.",
+    },
+    "perplexity": {
+        "id": "perplexity",
+        "name": "Perplexity",
+        "direction": "lower_better",
+        "summary": "Per-token perplexity estimated from generation logprobs.",
+    },
+    "entropy": {
+        "id": "entropy",
+        "name": "Entropy",
+        "direction": "lower_better",
+        "summary": "Mean token entropy of decoded reasoning steps.",
+    },
+}
+
+
+def load_config(output_dir: Path) -> dict:
+    """Load ``<output_dir>/.hydra/config.yaml``; raises FileNotFoundError if missing and returns {} for an empty file."""
+    config_path = output_dir / ".hydra" / "config.yaml"
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config not found: {config_path}")
+    with open(config_path, encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
+
+
+def load_results(output_dir: Path) -> list:
+    """Parse and return ``<output_dir>/results.json``; raises FileNotFoundError when the file is absent."""
+    json_file = output_dir / "results.json"
+    if json_file.exists():
+        with open(json_file) as handle:
+            return json.load(handle)
+    raise FileNotFoundError(f"Results not found: {json_file}")
+
+
+def get_strategy_info(config: dict) -> dict:
+    """Return a copy of the STRATEGY_META entry for config["strategy"]["type"] (default "baseline"), or a generic stub with the same keys."""
+    strategy_type = (config.get("strategy") or {}).get("type", "baseline")
+    meta = STRATEGY_META.get(strategy_type)
+    if meta:
+        return deepcopy(meta)
+    return {"id": strategy_type, "name": strategy_type, "family": "unknown", "summary": ""}
+
+
+def get_scorer_info(config: dict) -> dict | None:
+    """Resolve display metadata for the configured scorer; None when no external scorer applies."""
+    scorer_section = config.get("scorer")
+    if not scorer_section:
+        return None
+    scorer_name = scorer_section.get("type", "")
+    # Self-consistency picks answers by consensus, so it never reports an external scorer.
+    if config.get("strategy", {}).get("type", "") == "self_consistency":
+        return None
+    known = SCORER_META.get(scorer_name)
+    if not known:
+        # Scorer missing from SCORER_META: build a minimal entry that defaults to higher_better.
+        return {
+            "id": scorer_name,
+            "name": scorer_name,
+            "direction": "higher_better",
+            "summary": "",
+        }
+    return deepcopy(known)
+
+
+def extract_run_timestamp(output_dir: Path) -> str:
+    """Extract a human-readable timestamp from the output directory path.
+
+    Takes the date from the first path component that is exactly YYYY-MM-DD and the time from an
+    HH-MM-SS suffix of the run dir name.  Returns e.g. '2026-03-20 19:18'; falls back to the dir name.
+    """
+    parts = output_dir.parts
+    date_part = ""
+    for p in parts:
+        if re.match(r"^\d{4}-\d{2}-\d{2}$", p):
+            date_part = p
+            break
+    # Time from dir name like seed42_19-18-04; (?<!\d) stops the tail of a date-named dir (2026-02-04) from matching
+    time_match = re.search(r"(?<!\d)(\d{2})-(\d{2})-(\d{2})$", output_dir.name)
+    time_part = f"{time_match.group(1)}:{time_match.group(2)}" if time_match else ""
+    if date_part and time_part:
+        return f"{date_part} {time_part}"
+    return date_part or time_part or output_dir.name
+
+
+def convert_experiment(
+    output_dir: Path,
+    filter_incorrect: bool = False,
+    max_samples: int | None = None,
+) -> dict:
+    """Convert an experiment output directory to the debugger cached_examples format.
+
+    Filters (``filter_incorrect``, then ``max_samples``) apply after accuracy is computed; failed samples are logged and skipped. Returns ``{"examples": [...]}``.
+    """
+    config = load_config(output_dir)
+    results = load_results(output_dir)
+    run_timestamp = extract_run_timestamp(output_dir)
+
+    strategy_info = get_strategy_info(config)
+    scorer_info = get_scorer_info(config)
+
+    strategy_cfg = config.get("strategy") or {}
+    scorer_cfg = config.get("scorer") or {}
+    generation_cfg = config.get("generation") or {}
+    model_cfg = config.get("model") or {}
+    dataset_cfg = config.get("dataset") or {}
+
+    budget = strategy_cfg.get("max_steps", 10)
+
+    model_config = {
+        "provider": model_cfg.get("provider", "unknown"),
+        "model_id": model_cfg.get("model_name", model_cfg.get("model_path", "")),
+        "api_key_masked": "sk-...eval",
+    }
+
+    data_name = dataset_cfg.get("data_name", "unknown")
+    dataset_offset = dataset_cfg.get("offset") or 0
+    strategy_type = strategy_cfg.get("type", "unknown")
+
+    # Accuracy is computed over ALL samples, before the filtering/truncation below
+    total = len(results)
+    correct = sum(1 for r in results if r.get("is_correct"))
+    accuracy = correct / total if total > 0 else 0
+
+    if filter_incorrect:
+        results = [r for r in results if not r.get("is_correct")]
+
+    if max_samples is not None:
+        results = results[:max_samples]
+
+    examples = []
+    for sample in results:
+        sample_idx = sample.get("index", 0)
+        question = sample.get("question") or ""
+        gold_answer = str(sample["gold_answer"]) if sample.get("gold_answer") is not None else ""
+        extracted_answer = str(sample["extracted_answer"]) if sample.get("extracted_answer") is not None else ""
+        is_correct = sample.get("is_correct", False)
+
+        # Build the strategy result dict that the converter expects (missing answers stay "" rather than "None")
+        strategy_result = {
+            "trajectory": sample.get("generated_trajectory", ""),
+            "extracted_answer": extracted_answer,
+            "steps": sample.get("steps", []),
+            "validity_scores": sample.get("validity_scores", []),
+            "token_stats": sample.get("token_stats"),
+            "completed": sample.get("completed", True),
+        }
+
+        # Propagate tree visualization fields
+        for key in (
+            "step_candidates",
+            "all_trajectories",
+            "all_trajectory_steps",
+            "all_scores",
+            "all_step_scores",
+            "all_traces",
+            "best_idx",
+        ):
+            if key in sample:
+                strategy_result[key] = sample[key]
+
+        try:
+            run_payload = convert_strategy_result_to_debugger_run(
+                strategy=strategy_info,
+                scorer=scorer_info,
+                strategy_result=strategy_result,
+                budget=budget,
+                latency_ms=0,
+                model_config=model_config,
+                generation_config=generation_cfg,
+                strategy_config=dict(strategy_cfg),
+                scorer_config=dict(scorer_cfg),
+                has_gold_answer=bool(gold_answer),
+                gold_answer=gold_answer,
+            )
+        except Exception:
+            log.exception(f"Failed to convert sample {sample_idx}, skipping")
+            continue
+
+        correctness_mark = "correct" if is_correct else "INCORRECT"
+        scorer_label = scorer_info["name"] if scorer_info else "none"
+        dataset_idx = dataset_offset + sample_idx
+        title = f"[{run_timestamp}] [{strategy_info['name']}|{scorer_label}] #{dataset_idx} [{correctness_mark}] {question[:50]}"
+        description = (
+            f"Strategy: {strategy_info['name']} | Scorer: {scorer_label} | "
+            f"Gold: {gold_answer} | Predicted: {extracted_answer} | "
+            f"{'Correct' if is_correct else 'Incorrect'}"
+        )
+
+        scorer_catalog = [scorer_info] if scorer_info else []
+
+        payload = {
+            "scenario": {
+                "id": f"{output_dir.name}_{strategy_type}_{sample_idx}",
+                "title": title,
+                "description": description,
+                "prompt": question,
+                "ground_truth": gold_answer,
+                "input_source": "experiment_results",
+                "model_config": model_config,
+                "strategy_count": 1,
+                "scorer_count": len(scorer_catalog),
+                "run_count": 1,
+            },
+            "available_budgets": [budget],
+            "selected_budget": budget,
+            "strategy_catalog": [],
+            "scorer_catalog": scorer_catalog,
+            "strategies": [
+                {
+                    "id": strategy_info["id"],
+                    "strategy_id": strategy_info["id"],
+                    "scorer_id": scorer_info["id"] if scorer_info else None,
+                    "name": strategy_info["name"],
+                    "family": strategy_info["family"],
+                    "summary": strategy_info.get("summary", ""),
+                    "requires_scorer": scorer_info is not None,
+                    "builtin_scorer": (
+                        "Consensus (majority vote)" if not scorer_info else None
+                    ),
+                    "run": run_payload,
+                    "comparison_rank": 1,
+                }
+            ],
+        }
+
+        examples.append(
+            {
+                "id": f"{output_dir.name}_{strategy_type}_{sample_idx}",  # run dir keeps ids distinct across runs for --merge
+                "title": title,
+                "description": description,
+                "available_budgets": [budget],
+                "default_budget": budget,
+                "payloads": {str(budget): payload},
+            }
+        )
+
+    log.info(
+        f"Converted {len(examples)} samples from {output_dir.name} "
+        f"({data_name}, {strategy_type}, accuracy={accuracy:.1%})"
+    )
+    return {"examples": examples}
+
+
+def main():
+    """CLI entry point: convert one run dir, write the payload, and optionally install/merge it into cached_examples.json."""
+    parser = argparse.ArgumentParser(
+        description="Convert experiment results to Visual Debugger format."
+    )
+    parser.add_argument(
+        "output_dir",
+        type=Path,
+        help="Path to experiment output directory (containing results.json and .hydra/config.yaml)",
+    )
+    parser.add_argument(
+        "--out",
+        type=Path,
+        default=None,
+        help="Output path for the debugger JSON. Default: <output_dir>/debugger_payload.json",
+    )
+    parser.add_argument(
+        "--incorrect-only",
+        action="store_true",
+        help="Only include incorrect samples (useful for debugging failures)",
+    )
+    parser.add_argument(
+        "--max-samples",
+        type=int,
+        default=None,
+        help="Maximum number of samples to convert",
+    )
+    parser.add_argument(
+        "--install",
+        action="store_true",
+        help="Also copy the output to cached_examples.json for direct use with the debugger (replaces existing)",
+    )
+    parser.add_argument(
+        "--merge",
+        action="store_true",
+        help="Merge into existing cached_examples.json (adds new examples, keeps old ones)",
+    )
+
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    output_dir = args.output_dir.resolve()
+    if not output_dir.is_dir():
+        log.error(f"Not a directory: {output_dir}")
+        sys.exit(1)
+
+    result = convert_experiment(
+        output_dir=output_dir,
+        filter_incorrect=args.incorrect_only,
+        max_samples=args.max_samples,
+    )
+
+    out_path = args.out or (output_dir / "debugger_payload.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2, default=str)
+    log.info(f"Wrote {out_path} ({len(result['examples'])} examples)")
+
+    debugger_dir = PROJECT_ROOT / "service_app" / "static" / "debugger"
+    cached_path = debugger_dir / "cached_examples.json"
+
+    if args.install:
+        with open(cached_path, "w", encoding="utf-8") as f:
+            json.dump(result, f, indent=2, default=str)
+        log.info(f"Installed to {cached_path} (replaced)")
+
+    if args.merge:
+        existing = {"examples": []}
+        if cached_path.exists():
+            with open(cached_path, encoding="utf-8") as f:
+                existing = json.load(f)
+        merged = existing.setdefault("examples", [])  # tolerate a cache file that lacks the key
+        existing_ids = {ex["id"] for ex in merged}
+        new_examples = [ex for ex in result["examples"] if ex["id"] not in existing_ids]
+        merged.extend(new_examples)
+        with open(cached_path, "w", encoding="utf-8") as f:
+            json.dump(existing, f, indent=2, default=str)
+        log.info(
+            f"Merged {len(new_examples)} new examples into {cached_path} "
+            f"({len(existing['examples'])} total)"
+        )
+    if args.install or args.merge:  # the hint applies whenever cached_examples.json was (re)written
+        log.info(f"Open in browser: {(debugger_dir / 'index.html').as_uri()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_tts_eval.py b/scripts/run_tts_eval.py
index 8cf219e9..485137bd 100644
--- a/scripts/run_tts_eval.py
+++ b/scripts/run_tts_eval.py
@@ -106,11 +106,7 @@ def _make_output_name(run_name: str, strategy_type: str, data_name: str) -> str:
 except ImportError:
     POLYGRAPH_UNCERTAINTY_AVAILABLE = False
     VLLMWithUncertainty = None
-from utils.results import (
-    load_results_json,
-    parse_resume_arguments,
-    save_results_json,
-)
+from utils.results import load_results_json, parse_resume_arguments, save_results_json
 
 from llm_tts.evaluation import (
     EvaluatorAlignScore,
@@ -1659,6 +1655,19 @@ def _save_callback(strategy_results, phase="post_generation"):
                 if key in result:
                     result_dict[key] = result[key]
 
+            # Save tree visualization data (if strategy provides it)
+            for key in (
+                "step_candidates",
+                "all_trajectories",
+                "all_trajectory_steps",
+                "all_scores",
+                "all_step_scores",
+                "all_traces",
+                "best_idx",
+            ):
+                if key in result:
+                    result_dict[key] = result[key]
+
             # Store answer_step for thinking mode strategies
             if "answer_step" in result:
                 result_dict["answer_step"] = result["answer_step"]
diff --git a/service_app/static/debugger/app.js b/service_app/static/debugger/app.js
index d883dec6..99ceb099 100644
--- a/service_app/static/debugger/app.js
+++ b/service_app/static/debugger/app.js
@@ -38,6 +38,10 @@ const state = {
   advancedConfigExpanded: false,
   advancedConfigTemplateKey: null,
   advancedConfigDirty: false,
+  experimentSamples: [],
+  experimentFilteredIndices: [],
+  experimentCurrentIdx: 0,
+  experimentFilterIncorrect: false,
   isRunInProgress: false,
   runAbortController: null,
   activeRequestId: null,
@@ -80,6 +84,13 @@ const elements = {
   resetDemoButton: document.getElementById("resetDemoButton"),
   customStatus: document.getElementById("customStatus"),
   modelSuggestions: document.getElementById("modelSuggestions"),
+  experimentFileInput: document.getElementById("experimentFileInput"),
+  experimentFileButton: document.getElementById("experimentFileButton"),
+  sampleNavigation: document.getElementById("sampleNavigation"),
+  prevSampleBtn: document.getElementById("prevSampleBtn"),
+  nextSampleBtn: document.getElementById("nextSampleBtn"),
+  sampleCounter: document.getElementById("sampleCounter"),
+  filterIncorrectToggle: document.getElementById("filterIncorrectToggle"),
 };
 
 function updateModelSuggestions() {
@@ -2559,6 +2570,136 @@ function renderStepInspector() {
   renderTree();
 }
 
+// ---------------------------------------------------------------------------
+// Experiment file loading & sample navigation
+// ---------------------------------------------------------------------------
+
+// Read an experiment JSON export chosen by the user, install its scenarios and
+// payloads into `state`, and show the first sample in cached mode. Read, parse
+// and navigation failures are all reported through setStatus.
+function handleExperimentFileUpload(file) {
+  const reader = new FileReader();
+  // Without this handler a failed read would leave the UI silently unchanged.
+  reader.onerror = () => {
+    setStatus(`Failed to read file: ${reader.error?.message || "unknown error"}`, true);
+  };
+  reader.onload = (event) => {
+    try {
+      const rawBundle = JSON.parse(event.target.result);
+      const normalized = normalizePrototypeBundle(rawBundle);
+      if (!normalized.scenarios.length) {
+        setStatus("Loaded file contains no valid examples.", true);
+        return;
+      }
+      // Store full examples for sample navigation; a new file resets the filter
+      state.experimentSamples = normalized.scenarios;
+      state.experimentFilterIncorrect = false;
+      if (elements.filterIncorrectToggle) {
+        elements.filterIncorrectToggle.checked = false;
+      }
+      // Load into custom payloads so loadPayloadForScenario works
+      state.customPayloads = normalized.payloads;
+      state.dataMode = "custom";
+      // Load into catalog and UI
+      state.catalog = normalized.scenarios;
+      state.prototypePayloads = normalized.payloads;
+      state.prototypeLoaded = true;
+      state.prototypeCatalog = normalized.scenarios;
+      recomputeFilteredIndices();
+      state.experimentCurrentIdx = 0;
+      populateScenarioSelect();
+      // navigateToCurrentSample is async: a rejection would escape this
+      // try/catch, so it is reported here instead of going unhandled.
+      navigateToCurrentSample().catch((error) => {
+        setStatus(`Failed to load sample: ${error.message}`, true);
+      });
+      updateSampleNavigationUi();
+      // Auto-enable cached mode
+      state.useCachedExample = true;
+      if (elements.useCachedToggle) {
+        elements.useCachedToggle.checked = true;
+      }
+      applyCachedModeUi();
+      setStatus(`Loaded ${normalized.scenarios.length} samples from file.`, false);
+    } catch (error) {
+      setStatus(`Failed to parse file: ${error.message}`, true);
+    }
+  };
+  reader.readAsText(file);
+}
+
+// Rebuild state.experimentFilteredIndices, the list of positions in
+// state.experimentSamples that the navigation controls step through.
+// With the incorrect-only filter off every position is kept; with it on,
+// only samples whose title contains "INCORRECT" are kept.
+function recomputeFilteredIndices() {
+  const incorrectOnly = state.experimentFilterIncorrect;
+  const isIncorrect = (sample) => (sample.title || "").includes("INCORRECT");
+  const pairs = state.experimentSamples.entries();
+
+  const kept = [];
+  state.experimentFilteredIndices = kept;
+  // The title is only inspected while the filter is active.
+  for (const [position, sample] of pairs) {
+    if (!incorrectOnly || isIncorrect(sample)) {
+      kept.push(position);
+    }
+  }
+}
+
+// Show the sample under the navigation cursor: clamp state.experimentCurrentIdx
+// to the filtered list, select that scenario, await
+// loadCachedOptionsForCurrentScenario, then refresh the navigation controls.
+async function navigateToCurrentSample() {
+  const filtered = state.experimentFilteredIndices;
+  if (!filtered.length) {
+    setStatus("No samples match the current filter.", false);
+    return;
+  }
+
+  // Keep the cursor inside the filtered list before dereferencing it.
+  const lastPosition = filtered.length - 1;
+  const cursor = Math.min(Math.max(state.experimentCurrentIdx, 0), lastPosition);
+  state.experimentCurrentIdx = cursor;
+  const sample = state.experimentSamples[filtered[cursor]];
+  const { id, default_budget: defaultBudget } = sample;
+  state.scenarioId = id;
+  elements.scenarioSelect.value = id;
+  configureCaseSelect(defaultBudget);
+  await loadCachedOptionsForCurrentScenario();
+  updateSampleNavigationUi();
+}
+
+// Sync the sample navigation bar with the experiment state. The bar stays
+// visible whenever samples are loaded, even if the filter matches none: the
+// "Incorrect only" checkbox lives inside it, so hiding the bar on an empty
+// result would leave the user unable to switch the filter off again.
+function updateSampleNavigationUi() {
+  const totalAll = state.experimentSamples.length;
+  const showing = state.experimentFilteredIndices.length;
+  const current = state.experimentCurrentIdx + 1;
+  if (elements.sampleNavigation) {
+    elements.sampleNavigation.classList.toggle("hidden", totalAll === 0);
+  }
+  if (elements.prevSampleBtn) {
+    elements.prevSampleBtn.disabled = showing === 0 || current <= 1;
+  }
+  if (elements.nextSampleBtn) {
+    elements.nextSampleBtn.disabled = showing === 0 || current >= showing;
+  }
+  if (elements.sampleCounter) {
+    if (showing === 0) {
+      // Nothing to step through: either no file is loaded or the filter is empty.
+      elements.sampleCounter.textContent =
+        totalAll === 0 ? "" : `No matching samples (${totalAll} total)`;
+    } else if (showing === totalAll) {
+      elements.sampleCounter.textContent = `Sample ${current} of ${showing}`;
+    } else {
+      elements.sampleCounter.textContent = `Sample ${current} of ${showing} (${totalAll} total)`;
+    }
+  }
+}
+
 function render() {
   if (!state.payload) {
     return;
@@ -2581,6 +2722,8 @@ function bindHandlers() {
     clearRenderedResults();
     if (state.useCachedExample) {
       await loadCachedOptionsForCurrentScenario();
+      // Auto-run in cached mode so the user sees results immediately
+      elements.runButton?.click();
     }
   });
 
@@ -2753,6 +2896,55 @@ function bindHandlers() {
   elements.resetDemoButton.addEventListener("click", async () => {
     await restoreDemoData();
   });
+
+  // Experiment file upload
+  if (elements.experimentFileButton) {
+    elements.experimentFileButton.addEventListener("click", () => {
+      elements.experimentFileInput?.click();
+    });
+  }
+  if (elements.experimentFileInput) {
+    elements.experimentFileInput.addEventListener("change", (event) => {
+      const file = event.target.files?.[0];
+      if (file) {
+        handleExperimentFileUpload(file);
+      }
+    });
+  }
+
+  // Sample navigation
+  if (elements.prevSampleBtn) {
+    elements.prevSampleBtn.addEventListener("click", async () => {
+      if (state.experimentCurrentIdx > 0) {
+        state.experimentCurrentIdx--;
+        await navigateToCurrentSample();
+      }
+    });
+  }
+  if (elements.nextSampleBtn) {
+    elements.nextSampleBtn.addEventListener("click", async () => {
+      if (
+        state.experimentCurrentIdx <
+        state.experimentFilteredIndices.length - 1
+      ) {
+        state.experimentCurrentIdx++;
+        await navigateToCurrentSample();
+      }
+    });
+  }
+  if (elements.filterIncorrectToggle) {
+    elements.filterIncorrectToggle.addEventListener("change", async (event) => {
+      state.experimentFilterIncorrect = event.target.checked;
+      recomputeFilteredIndices();
+      state.experimentCurrentIdx = 0;
+      if (state.experimentFilteredIndices.length) {
+        await navigateToCurrentSample();
+      } else {
+        updateSampleNavigationUi();
+        setStatus("No incorrect samples found.", false);
+      }
+    });
+  }
 }
 
 async function checkApiHealth() {
@@ -2778,11 +2970,35 @@ async function init() {
   const apiAlive = await checkApiHealth();
   if (!apiAlive) {
     setStatus(
-      "Service API is not reachable. Make sure the server is running (python service_app/main.py).",
+      "Service API is not reachable. Load an experiment file or start the server.",
       true,
     );
-    elements.strategyGrid.innerHTML =
-      '<p class="tree-empty">Cannot connect to the service API.</p>';
+    // Try to load cached examples for file:// or prototype mode
+    try {
+      state.catalog = await loadCatalog();
+    } catch {
+      state.catalog = [];
+    }
+    if (!state.catalog.length) {
+      elements.strategyGrid.innerHTML =
+        '<p class="tree-empty">Load an experiment file (debugger_payload.json) to explore results.</p>';
+      // Show cached controls so file upload is accessible
+      elements.cachedExplorerControls?.classList.remove("hidden");
+      return;
+    }
+    // Auto-enable cached mode when API is down but cached data exists
+    state.useCachedExample = true;
+    if (elements.useCachedToggle) {
+      elements.useCachedToggle.checked = true;
+    }
+    state.scenarioId = state.catalog[0].id;
+    populateScenarioSelect();
+    configureCaseSelect(state.catalog[0].default_budget);
+    applyCachedModeUi();
+    await loadCachedOptionsForCurrentScenario();
+    // Auto-run first scenario so user sees results immediately
+    elements.runButton?.click();
+    setStatus("Loaded cached examples (offline mode).", false);
     return;
   }
 
@@ -2790,7 +3006,8 @@ async function init() {
 
   if (!state.catalog.length) {
     elements.strategyGrid.innerHTML =
-      '<p class="tree-empty">No debugger scenarios are available.</p>';
+      '<p class="tree-empty">No debugger scenarios are available. Load an experiment file to explore results.</p>';
+    elements.cachedExplorerControls?.classList.remove("hidden");
     return;
   }
 
diff --git a/service_app/static/debugger/index.html b/service_app/static/debugger/index.html
index d86406f2..695e47a9 100644
--- a/service_app/static/debugger/index.html
+++ b/service_app/static/debugger/index.html
@@ -91,6 +91,20 @@ <h2>Cached Example Explorer</h2>
             <span>Case</span>
             <select id="caseSelect"></select>
           </label>
+          <label class="control" style="text-align:center">
+            <span>Load File</span>
+            <button id="experimentFileButton" type="button" style="font-size:0.85em;cursor:pointer;padding:4px 12px">Choose JSON file…</button>
+            <input type="file" id="experimentFileInput" accept=".json" style="display:none" />
+          </label>
+        </div>
+        <div id="sampleNavigation" class="controls cached-controls hidden">
+          <button id="prevSampleBtn" type="button" disabled>&larr; Prev</button>
+          <span id="sampleCounter" style="font-size:0.85em;color:var(--muted)"></span>
+          <button id="nextSampleBtn" type="button" disabled>Next &rarr;</button>
+          <label style="display:flex;align-items:center;gap:0.3em;font-size:0.85em;color:var(--muted);margin-left:1em">
+            <input type="checkbox" id="filterIncorrectToggle" />
+            Incorrect only
+          </label>
         </div>
         <div id="cachedExplorerPrompt" class="prompt-strip embedded-prompt-strip hidden">
           <div>