diff --git a/config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml b/config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml new file mode 100644 index 00000000..6f101702 --- /dev/null +++ b/config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml @@ -0,0 +1,70 @@ + +# @package _global_ +# Run: python scripts/run_tts_eval.py --config-path=../config --config-name=experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy +# Beam Search for MATH-500 using OpenRouter API with entropy scorer +# Quick test config for debugger visualization +# +# Environment variable required: OPENROUTER_API_KEY + +defaults: + - /config + - /model/openrouter + - /generation/default + - /system/default + - /strategy/beam_search + - /scorer/entropy + - /evaluation/default + - _self_ + +# Run naming +run_name: "beam_search_openrouter_gpt4o_mini_math500_entropy_seed${system.seed}_${now:%H-%M-%S}" + +# Main configuration +verbose: true +report_to: none + +# Model configuration (OpenRouter API) +model: + type: "openai_api" + provider: openrouter + model_path: "qwen/qwen-2.5-7b-instruct" + api_key: null # Set via OPENROUTER_API_KEY env var + supports_logprobs: true + prefill_mode: false + max_context_budget: 128000 + +# System configuration +system: + device: cpu + seed: 42 + +# Generation configuration +generation: + max_new_tokens: 2048 + temperature: 0.7 + top_p: 0.95 + top_k: 50 + batch_size: 1 + +# Strategy - Beam Search (bigger tree for visualization) +strategy: + beam_size: 3 + candidates_per_beam: 3 + max_steps: 10 + min_step_tokens: 5 + max_step_tokens: 100 + +# Entropy scorer configuration +scorer: + type: entropy + batch_size: 1 + +# Dataset configuration - MATH-500 +dataset: + data_name: "math" + dataset_path: "test-time-compute/test_MATH" + dataset_split: "test" + subset: 3 + offset: 0 + answer_format: "numeric" + prompt_file: 
"${hydra:runtime.cwd}/config/prompts/default.txt" diff --git a/docs/service/debugger.md b/docs/service/debugger.md new file mode 100644 index 00000000..f60ff493 --- /dev/null +++ b/docs/service/debugger.md @@ -0,0 +1,78 @@ +# Experiment Results Visualizer + +View experiment results from `run_tts_eval.py` in the Visual Debugger — without re-running strategies live. + +## Quick Start + +```bash +# 1. Run an experiment (tree data is now saved automatically) +python scripts/run_tts_eval.py --config-path=../config --config-name=experiments/beam_search/... + +# 2. Convert results to debugger format +python scripts/convert_results_to_debugger.py outputs/<date>/<run_name>/ --install + +# 3. Serve and open in browser +python -m http.server 8080 -d service_app +# Open http://localhost:8080/static/debugger/index.html +``` + +## Converter Options + +```bash +# Basic: creates debugger_payload.json in the output dir +python scripts/convert_results_to_debugger.py outputs/<date>/<run_name> + +# Install as cached_examples.json (auto-loads in debugger) +python scripts/convert_results_to_debugger.py outputs/<date>/<run_name> --install + +# Only incorrect samples (for debugging failures) +python scripts/convert_results_to_debugger.py outputs/<date>/<run_name> --incorrect-only + +# Limit number of samples +python scripts/convert_results_to_debugger.py outputs/<date>/<run_name> --max-samples 50 + +# Custom output path +python scripts/convert_results_to_debugger.py outputs/<date>/<run_name> --out my_results.json +``` + +## Using the Debugger + +1. Samples appear in the **Scenario** dropdown +2. Select a strategy/scorer and click **Run** to see the tree +3. **Timeline** (left) — click through reasoning steps +4. **Tree** (bottom) — orange path = selected, grey = pruned +5. **Candidates** panel — scores and text for each candidate at a step +6. **Prev/Next** buttons — navigate between samples +7. **Incorrect only** checkbox — filter to failed samples +8. 
**Load File** button — load a `debugger_payload.json` without `--install` + +## What the Tree Shows + +- Each node is a candidate generated at a reasoning step +- **Orange path**: the beam the strategy selected as its final answer +- **Grey nodes**: candidates that were generated and scored but pruned +- Click any node to see its full text and scores + +## Changed Files + +| File | Change | +|------|--------| +| `scripts/run_tts_eval.py` | Save tree data (`step_candidates`, `all_trajectories`, etc.) to `results.json` — previously discarded | +| `scripts/convert_results_to_debugger.py` | **New.** Converts experiment output to debugger JSON format | +| `service_app/static/debugger/index.html` | Added file upload input and sample navigation (Prev/Next, Incorrect only filter) | +| `service_app/static/debugger/app.js` | File upload handler, sample navigation logic, auto-enable cached mode for offline use | +| `config/experiments/beam_search/math500/beam_search_openrouter_gpt4o_mini_math500_entropy.yaml` | **New.** Quick-test config for OpenRouter beam search on MATH-500 | + +## Supported Strategies + +All strategies that produce tree data work: +- **Beam Search** — full tree with per-step candidates and beam lineage +- **Online Best-of-N** — stepwise candidate pools +- **Offline Best-of-N** — trajectory-level reranking with per-step breakdown +- **Self-Consistency** — parallel reasoning paths with voting + +Baseline (single-pass) also works but shows a linear chain (no branching). + +## Note on Old Experiments + +Experiments run **before** the `run_tts_eval.py` change won't have tree data in `results.json`. The converter still works — it falls back to a stepwise view (one candidate per step) — but you won't see the full branching tree. Re-run the experiment to get tree data. 
diff --git a/presentation/experiment_visualization_plan.md b/presentation/experiment_visualization_plan.md new file mode 100644 index 00000000..c3add5d6 --- /dev/null +++ b/presentation/experiment_visualization_plan.md @@ -0,0 +1,232 @@ +# Plan: Visualizing Experiment Results in the Visual Debugger + +## Goal + +Enable viewing experiment results (from `scripts/run_tts_eval.py`) in the existing Visual Debugger UI — without running strategies live. + +--- + +## Current Architecture + +### How the Debugger Works Now + +1. **Backend** runs a strategy via `strategy_manager.py` → gets raw result dict +2. **`debugger_events.py`** converts the result dict into a list of **events** (the universal visualization format) +3. **`app.js`** receives events and builds an interactive tree via `buildTreeFromEvents(events)` + +The key conversion layer is `debugger_events.py:convert_strategy_result_to_debugger_run()` — it transforms raw strategy output into the event format the frontend understands. + +### Event Format (what the frontend needs) + +```json +{ + "step": 1, + "title": "Step 1: Candidate generation", + "stage": "tree_expand", + "signals": [{"name": "confidence", "value": 0.85, "direction": "higher_better"}], + "candidates": [ + { + "id": "step_1_candidate_0", + "label": "Candidate 1", + "text": "Let me think about this...", + "status": "selected", + "selected": true, + "signals": {"confidence": 0.85, "prm": 0.92}, + "beam_uid": 1, + "parent_beam_uid": null + } + ] +} +``` + +### What `run_tts_eval.py` Already Saves + +| File | Content | +|------|---------| +| `results.json` | Per-sample results: steps, scores, trajectory, extracted_answer | +| `candidates.json` | Multi-trajectory data (offline BoN) | +| `sample_metrics.jsonl` | Per-sample compute metrics | +| `metrics.json` | Aggregated accuracy, tokens, etc. 
| + +**IMPORTANT: `results.json` does NOT save the tree-building data.** The strategies return `step_candidates` (beam search, online BoN) and `all_trajectories` (offline BoN, self-consistency) in their result dicts, but `run_tts_eval.py` discards them (lines 1631-1674 cherry-pick only a subset of fields). Currently saved: +- `steps` — list of step dicts with `text`, `token_ids`, `generation_scores`, `other_data` +- `validity_scores` — flat list of per-step scores +- `generated_trajectory` — concatenated text +- `extracted_answer`, `answer_step`, `token_stats`, completion info + +**Not saved (but available in strategy return value):** +- **Beam Search / Online BoN**: `step_candidates` — per-step decision points with all candidates, their scores, beam UIDs, parent linkage. This is the full tree structure. +- **Offline BoN**: `all_trajectories`, `all_scores`, `all_step_scores`, `best_idx` — all N candidate trajectories with scores. +- **Self-Consistency**: `all_trajectories` — all sampled paths. + +--- + +## Proposed Approach + +### Two things are needed: + +**1. Save tree data in `run_tts_eval.py`** — modify `_generate_trajectories_batch()` (line 1631) to also save `step_candidates` and `all_trajectories` to results.json (or a separate `tree_data.json` to keep results.json lightweight). + +**2. 
A standalone converter script** that reads the saved data and converts it to the debugger format: +- Reads `results.json` (with newly-saved tree fields) from an experiment output dir +- Calls the existing `convert_strategy_result_to_debugger_run()` for each sample +- Outputs a `cached_examples.json`-compatible file for the debugger to load + +### Architecture + +``` + Step 0 (one-time) + ┌──────────────────────────┐ + │ Modify run_tts_eval.py │ + │ to save step_candidates │ + │ and all_trajectories │ + └──────────────────────────┘ + +Experiment output dir Converter Visual Debugger +┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ +│ results.json │────▶│ convert_results │────▶│ Load as "cached │ +│ (with tree data)│ │ _to_debugger.py │ │ example" or via │ +│ metrics.json │ │ │ │ file:// protocol│ +│ config.yaml │ │ Uses existing │ │ │ +│ │ │ debugger_events │ │ │ +└──────────────────┘ │ .py converter │ └──────────────────┘ + └──────────────────┘ +``` + +--- + +## Implementation Steps + +### Step 0: Save Tree Data in `run_tts_eval.py` (PREREQUISITE) + +Currently `_generate_trajectories_batch()` at line 1631 builds `result_dict` without tree-building fields. Add: + +```python +# After line 1660 in _generate_trajectories_batch() +# Save tree visualization data (if strategy provides it) +for key in ("step_candidates", "all_trajectories", "all_scores", "all_step_scores", "best_idx"): + if key in result: + result_dict[key] = result[key] +``` + +**Option A** — save directly in `results.json` (simpler, but increases file size significantly for beam search with many candidates). + +**Option B** — save to a separate `tree_data.jsonl` file (one line per sample, keyed by index). Keeps `results.json` lightweight. The converter script would then read both files. + +**Note on serialization:** `step_candidates` contains `StepCandidate` objects. 
These are already serialized as dicts with `text`, `token_ids`, `generation_scores`, `other_data` fields (same as `steps`), so JSON serialization should work. Verify with a test run. + +### Step 1: Converter Script (`scripts/convert_results_to_debugger.py`) + +**Input:** path to experiment output directory (containing `results.json` with tree data) +**Output:** JSON file in the debugger payload format + +```python +# Pseudocode +from service_app.core.debugger_events import convert_strategy_result_to_debugger_run + +def convert_experiment(output_dir, strategy_type, scorer_type=None): + results = load_json(output_dir / "results.json") + config = load_yaml(output_dir / ".hydra/config.yaml") # experiment config + + examples = [] + for sample in results: + # The result dict now contains step_candidates / all_trajectories + # (saved by the modified run_tts_eval.py) + run_payload = convert_strategy_result_to_debugger_run( + strategy={"id": strategy_type, "name": ..., "family": ...}, + scorer={"id": scorer_type, ...} if scorer_type else None, + strategy_result=sample, # pass the full saved result + budget=config.strategy.max_steps, + latency_ms=0, + ... + ) + + examples.append({ + "id": f"sample_{sample['index']}", + "title": sample["question"][:80], + "description": f"Gold: {sample['gold_answer']}, Predicted: {sample['extracted_answer']}", + "payloads": {"default": make_payload(run_payload, sample)} + }) + + save_json(examples, output_dir / "debugger_payload.json") +``` + +### Step 2: Adapt `debugger_events.py` for Serialized Data + +The existing converter expects live `StepCandidate` objects (with `.text` attribute). Serialized results have dicts (with `"text"` key). 
Need to handle both: + +- `_build_events_from_step_candidates()` — already works with dict-like candidates (check this) +- `_build_events_from_trajectory_pool()` — needs to accept step dicts instead of `StepCandidate` objects (access `.text` vs `["text"]`) +- `_build_stepwise_events()` — same: accept step dicts + +**This is the main coding task** — make the converter accept both live objects and serialized JSON. A simple helper can bridge the gap: + +```python +def _step_text(step): + """Get text from either a StepCandidate object or a serialized dict.""" + return step.text if hasattr(step, "text") else step.get("text", str(step)) +``` + +### Step 3: Add "Load from File" to the Debugger UI + +Two options (pick one): + +**Option A (simpler):** Generate a `cached_examples.json` and open the debugger HTML as `file://` — already supported, no backend needed. + +**Option B (richer):** Add a "Load experiment" button to the debugger that accepts a JSON file upload or a directory path. This would: +- Add a file input element in `app.js` +- Parse the uploaded JSON into the same format as cached examples +- Populate the example selector dropdown + +### Step 4: Multi-Sample Navigation + +Current debugger shows one problem at a time. For experiments with hundreds of samples, add: +- Sample index selector (dropdown or prev/next buttons) +- Filter by correctness (show only incorrect samples for debugging) +- Summary stats bar (accuracy, avg tokens) + +--- + +## Where to Start + +### For the colleague — recommended order: + +1. **Start with `service_app/core/debugger_events.py`** — understand `convert_strategy_result_to_debugger_run()` (line 49). This is the core function. Read its input/output contract. + +2. **Read one cached example** — look at `service_app/static/debugger/cached_examples.json` to see the exact output format the frontend expects. Focus on `strategies[].run.events[]`. + +3. 
**Modify `run_tts_eval.py`** (Step 0) — add `step_candidates` and `all_trajectories` to the saved result dict. This is ~5 lines of code. Re-run one experiment to generate data with tree fields. + +4. **Write the converter script** — `scripts/convert_results_to_debugger.py`: + - Load `results.json` (with tree data) from experiment dir + - For each sample, call the existing converter (or a thin wrapper) + - Output a debugger-compatible JSON + +5. **Handle serialization gap** — the converter expects `StepCandidate` objects but results.json has dicts. Create a lightweight adapter or modify the converter to accept both. + +6. **Test with file:// protocol** — open `index.html` directly in a browser with the generated JSON as `cached_examples.json` in the same directory. + +--- + +## Key Files to Read + +| File | Why | +|------|-----| +| `service_app/core/debugger_events.py` | **Core converter** — strategy result → events | +| `service_app/static/debugger/cached_examples.json` | **Target format** — what the frontend expects | +| `service_app/static/debugger/app.js:2052-2256` | `buildTreeFromEvents()` — how frontend builds the tree | +| `service_app/core/visual_debugger_demo.py` | How demo payloads are assembled | +| `scripts/run_tts_eval.py:1630-1674` | What fields are saved per sample in `results.json` | + +--- + +## Summary + +**Answer to the colleague's question:** You don't need to build trees inside `run_tts_eval.py`. The tree construction logic already exists in `debugger_events.py`. But there's a prerequisite: `run_tts_eval.py` currently **discards** the tree-building data (`step_candidates`, `all_trajectories`) when saving results. So the plan is: + +1. **Modify `run_tts_eval.py`** (~5 lines) to also save `step_candidates` / `all_trajectories` to disk — this is the raw tree structure that strategies already compute but we throw away +2. 
A **post-hoc converter script** (`scripts/convert_results_to_debugger.py`) that reads experiment outputs and calls the existing `debugger_events.py` converter to produce the frontend-ready format +3. Minor **adaptation of `debugger_events.py`** to accept serialized dicts (from JSON) in addition to live `StepCandidate` objects +4. Optionally, a **"Load experiment" UI** in the debugger for convenience + +The tree data is already computed by strategies at runtime — we just need to stop discarding it and then pipe it through the existing conversion layer. diff --git a/scripts/convert_results_to_debugger.py b/scripts/convert_results_to_debugger.py new file mode 100755 index 00000000..ed6ddce6 --- /dev/null +++ b/scripts/convert_results_to_debugger.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +"""Convert experiment results (from run_tts_eval.py) to the Visual Debugger format. + +Usage: + python scripts/convert_results_to_debugger.py <output_dir> [--out debugger_payload.json] + +Example: + python scripts/convert_results_to_debugger.py outputs/2026-02-04/beam_search_math500_09-09-47 + # Opens: service_app/static/debugger/index.html with the generated cached_examples.json +""" + +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +from copy import deepcopy +from pathlib import Path + +# Add the project root to sys.path so we can import service_app +PROJECT_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(PROJECT_ROOT)) + +import yaml # noqa: E402 + +from service_app.core.debugger_events import ( # noqa: E402 + convert_strategy_result_to_debugger_run, +) + +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Strategy type -> metadata mapping (mirrors visual_debugger_demo.py) +# --------------------------------------------------------------------------- + +STRATEGY_META = { + "baseline": { + "id": "baseline", + "name": "Baseline (Raw CoT)", + "family": 
"single_pass", + "summary": "Single-pass raw chain-of-thought without search or reranking.", + }, + "beam_search": { + "id": "beam_search", + "name": "Beam Search (ToT)", + "family": "tree_search", + "summary": "Tree-of-thought expansion with beam pruning.", + }, + "adaptive": { + "id": "adaptive", + "name": "Adaptive Best-of-N", + "family": "reranking", + "summary": "Online best-of-n with adaptive scaling across steps.", + }, + "online_best_of_n": { + "id": "online_best_of_n", + "name": "Online Best-of-N", + "family": "reranking", + "summary": "Iterative candidate generation with stepwise reranking.", + }, + "offline_best_of_n": { + "id": "offline_best_of_n", + "name": "Offline Best-of-N", + "family": "reranking", + "summary": "Generate full trajectories first, then rerank at the end.", + }, + "self_consistency": { + "id": "self_consistency", + "name": "Self-Consistency", + "family": "sample_and_vote", + "summary": "Sample diverse trajectories and select by answer consensus.", + }, +} + +SCORER_META = { + "prm": { + "id": "prm", + "name": "PRM", + "direction": "higher_better", + "summary": "Process Reward Model trajectory quality score.", + }, + "self_verification": { + "id": "self_verification", + "name": "Self-Verification (LLM Critic)", + "direction": "higher_better", + "summary": "LLM-as-a-judge verification scoring.", + }, + "sequence_prob": { + "id": "sequence_prob", + "name": "Sequence Prob", + "direction": "higher_better", + "summary": "Cumulative sequence probability from token logprobs.", + }, + "perplexity": { + "id": "perplexity", + "name": "Perplexity", + "direction": "lower_better", + "summary": "Per-token perplexity estimated from generation logprobs.", + }, + "entropy": { + "id": "entropy", + "name": "Entropy", + "direction": "lower_better", + "summary": "Mean token entropy of decoded reasoning steps.", + }, +} + + +def load_config(output_dir: Path) -> dict: + """Load the Hydra config from the experiment output directory.""" + config_path = 
output_dir / ".hydra" / "config.yaml" + if not config_path.exists(): + raise FileNotFoundError(f"Config not found: {config_path}") + with open(config_path) as f: + return yaml.safe_load(f) + + +def load_results(output_dir: Path) -> list: + """Load results.json from the experiment output directory.""" + results_path = output_dir / "results.json" + if not results_path.exists(): + raise FileNotFoundError(f"Results not found: {results_path}") + with open(results_path) as f: + return json.load(f) + + +def get_strategy_info(config: dict) -> dict: + """Get strategy metadata from the config.""" + strategy_type = config.get("strategy", {}).get("type", "baseline") + meta = STRATEGY_META.get(strategy_type) + if meta: + return deepcopy(meta) + return {"id": strategy_type, "name": strategy_type, "family": "unknown"} + + +def get_scorer_info(config: dict) -> dict | None: + """Get scorer metadata from the config.""" + scorer_cfg = config.get("scorer") + if not scorer_cfg: + return None + scorer_type = scorer_cfg.get("type", "") + # self_consistency uses consensus scoring, no external scorer + strategy_type = config.get("strategy", {}).get("type", "") + if strategy_type == "self_consistency": + return None + meta = SCORER_META.get(scorer_type) + if meta: + return deepcopy(meta) + return { + "id": scorer_type, + "name": scorer_type, + "direction": "higher_better", + "summary": "", + } + + +def extract_run_timestamp(output_dir: Path) -> str: + """Extract a human-readable timestamp from the output directory path. + + Looks for date pattern (YYYY-MM-DD) in parent dirs and time (HH-MM-SS) + in the run dir name. Returns e.g. '2026-03-20 19:18'. 
+ """ + parts = output_dir.parts + date_part = "" + for p in parts: + if re.match(r"^\d{4}-\d{2}-\d{2}$", p): + date_part = p + break + # Time from dir name like seed42_19-18-04 + time_match = re.search(r"(\d{2})-(\d{2})-(\d{2})$", output_dir.name) + time_part = f"{time_match.group(1)}:{time_match.group(2)}" if time_match else "" + if date_part and time_part: + return f"{date_part} {time_part}" + return date_part or time_part or output_dir.name + + +def convert_experiment( + output_dir: Path, + filter_incorrect: bool = False, + max_samples: int | None = None, +) -> dict: + """Convert an experiment output directory to the debugger cached_examples format. + + Returns a dict matching the cached_examples.json schema. + """ + config = load_config(output_dir) + results = load_results(output_dir) + run_timestamp = extract_run_timestamp(output_dir) + + strategy_info = get_strategy_info(config) + scorer_info = get_scorer_info(config) + + strategy_cfg = config.get("strategy", {}) + scorer_cfg = config.get("scorer", {}) or {} + generation_cfg = config.get("generation", {}) + model_cfg = config.get("model", {}) + dataset_cfg = config.get("dataset", {}) + + budget = strategy_cfg.get("max_steps", 10) + + model_config = { + "provider": model_cfg.get("provider", "unknown"), + "model_id": model_cfg.get("model_name", model_cfg.get("model_path", "")), + "api_key_masked": "sk-...eval", + } + + data_name = dataset_cfg.get("data_name", "unknown") + dataset_offset = dataset_cfg.get("offset", 0) + strategy_type = strategy_cfg.get("type", "unknown") + + # Count correct/incorrect + total = len(results) + correct = sum(1 for r in results if r.get("is_correct")) + accuracy = correct / total if total > 0 else 0 + + if filter_incorrect: + results = [r for r in results if not r.get("is_correct")] + + if max_samples is not None: + results = results[:max_samples] + + examples = [] + for sample in results: + sample_idx = sample.get("index", 0) + question = sample.get("question", "") + gold_answer = 
str(sample.get("gold_answer", "")) + extracted_answer = str(sample.get("extracted_answer", "")) + is_correct = sample.get("is_correct", False) + + # Build the strategy result dict that the converter expects + strategy_result = { + "trajectory": sample.get("generated_trajectory", ""), + "extracted_answer": extracted_answer, + "steps": sample.get("steps", []), + "validity_scores": sample.get("validity_scores", []), + "token_stats": sample.get("token_stats"), + "completed": sample.get("completed", True), + } + + # Propagate tree visualization fields + for key in ( + "step_candidates", + "all_trajectories", + "all_trajectory_steps", + "all_scores", + "all_step_scores", + "all_traces", + "best_idx", + ): + if key in sample: + strategy_result[key] = sample[key] + + try: + run_payload = convert_strategy_result_to_debugger_run( + strategy=strategy_info, + scorer=scorer_info, + strategy_result=strategy_result, + budget=budget, + latency_ms=0, + model_config=model_config, + generation_config=generation_cfg, + strategy_config=dict(strategy_cfg), + scorer_config=dict(scorer_cfg), + has_gold_answer=bool(gold_answer), + gold_answer=gold_answer, + ) + except Exception: + log.exception(f"Failed to convert sample {sample_idx}, skipping") + continue + + correctness_mark = "correct" if is_correct else "INCORRECT" + scorer_label = scorer_info["name"] if scorer_info else "none" + dataset_idx = dataset_offset + sample_idx + title = f"[{run_timestamp}] [{strategy_info['name']}|{scorer_label}] #{dataset_idx} [{correctness_mark}] {question[:50]}" + description = ( + f"Strategy: {strategy_info['name']} | Scorer: {scorer_label} | " + f"Gold: {gold_answer} | Predicted: {extracted_answer} | " + f"{'Correct' if is_correct else 'Incorrect'}" + ) + + scorer_catalog = [scorer_info] if scorer_info else [] + + payload = { + "scenario": { + "id": f"{strategy_type}_{sample_idx}", + "title": title, + "description": description, + "prompt": question, + "ground_truth": gold_answer, + "input_source": 
"experiment_results", + "model_config": model_config, + "strategy_count": 1, + "scorer_count": len(scorer_catalog), + "run_count": 1, + }, + "available_budgets": [budget], + "selected_budget": budget, + "strategy_catalog": [], + "scorer_catalog": scorer_catalog, + "strategies": [ + { + "id": strategy_info["id"], + "strategy_id": strategy_info["id"], + "scorer_id": scorer_info["id"] if scorer_info else None, + "name": strategy_info["name"], + "family": strategy_info["family"], + "summary": strategy_info.get("summary", ""), + "requires_scorer": scorer_info is not None, + "builtin_scorer": ( + "Consensus (majority vote)" if not scorer_info else None + ), + "run": run_payload, + "comparison_rank": 1, + } + ], + } + + examples.append( + { + "id": f"{strategy_type}_{sample_idx}", + "title": title, + "description": description, + "available_budgets": [budget], + "default_budget": budget, + "payloads": {str(budget): payload}, + } + ) + + log.info( + f"Converted {len(examples)} samples from {output_dir.name} " + f"({data_name}, {strategy_type}, accuracy={accuracy:.1%})" + ) + return {"examples": examples} + + +def main(): + parser = argparse.ArgumentParser( + description="Convert experiment results to Visual Debugger format." + ) + parser.add_argument( + "output_dir", + type=Path, + help="Path to experiment output directory (containing results.json and .hydra/config.yaml)", + ) + parser.add_argument( + "--out", + type=Path, + default=None, + help="Output path for the debugger JSON. 
Default: /debugger_payload.json", + ) + parser.add_argument( + "--incorrect-only", + action="store_true", + help="Only include incorrect samples (useful for debugging failures)", + ) + parser.add_argument( + "--max-samples", + type=int, + default=None, + help="Maximum number of samples to convert", + ) + parser.add_argument( + "--install", + action="store_true", + help="Also copy the output to cached_examples.json for direct use with the debugger (replaces existing)", + ) + parser.add_argument( + "--merge", + action="store_true", + help="Merge into existing cached_examples.json (adds new examples, keeps old ones)", + ) + + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format="%(message)s") + + output_dir = args.output_dir.resolve() + if not output_dir.is_dir(): + log.error(f"Not a directory: {output_dir}") + sys.exit(1) + + result = convert_experiment( + output_dir=output_dir, + filter_incorrect=args.incorrect_only, + max_samples=args.max_samples, + ) + + out_path = args.out or (output_dir / "debugger_payload.json") + with open(out_path, "w") as f: + json.dump(result, f, indent=2, default=str) + log.info(f"Wrote {out_path} ({len(result['examples'])} examples)") + + cached_path = ( + PROJECT_ROOT / "service_app" / "static" / "debugger" / "cached_examples.json" + ) + + if args.install: + with open(cached_path, "w") as f: + json.dump(result, f, indent=2, default=str) + log.info(f"Installed to {cached_path} (replaced)") + + if args.merge: + existing = {"examples": []} + if cached_path.exists(): + with open(cached_path) as f: + existing = json.load(f) + existing_ids = {ex["id"] for ex in existing.get("examples", [])} + new_examples = [ex for ex in result["examples"] if ex["id"] not in existing_ids] + existing["examples"].extend(new_examples) + with open(cached_path, "w") as f: + json.dump(existing, f, indent=2, default=str) + log.info( + f"Merged {len(new_examples)} new examples into {cached_path} " + f"({len(existing['examples'])} total)" + ) + 
log.info( + f"Open in browser: file://{PROJECT_ROOT}/service_app/static/debugger/index.html" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_tts_eval.py b/scripts/run_tts_eval.py index 8cf219e9..485137bd 100644 --- a/scripts/run_tts_eval.py +++ b/scripts/run_tts_eval.py @@ -106,11 +106,7 @@ def _make_output_name(run_name: str, strategy_type: str, data_name: str) -> str: except ImportError: POLYGRAPH_UNCERTAINTY_AVAILABLE = False VLLMWithUncertainty = None -from utils.results import ( - load_results_json, - parse_resume_arguments, - save_results_json, -) +from utils.results import load_results_json, parse_resume_arguments, save_results_json from llm_tts.evaluation import ( EvaluatorAlignScore, @@ -1659,6 +1655,19 @@ def _save_callback(strategy_results, phase="post_generation"): if key in result: result_dict[key] = result[key] + # Save tree visualization data (if strategy provides it) + for key in ( + "step_candidates", + "all_trajectories", + "all_trajectory_steps", + "all_scores", + "all_step_scores", + "all_traces", + "best_idx", + ): + if key in result: + result_dict[key] = result[key] + # Store answer_step for thinking mode strategies if "answer_step" in result: result_dict["answer_step"] = result["answer_step"] diff --git a/service_app/static/debugger/app.js b/service_app/static/debugger/app.js index d883dec6..99ceb099 100644 --- a/service_app/static/debugger/app.js +++ b/service_app/static/debugger/app.js @@ -38,6 +38,10 @@ const state = { advancedConfigExpanded: false, advancedConfigTemplateKey: null, advancedConfigDirty: false, + experimentSamples: [], + experimentFilteredIndices: [], + experimentCurrentIdx: 0, + experimentFilterIncorrect: false, isRunInProgress: false, runAbortController: null, activeRequestId: null, @@ -80,6 +84,13 @@ const elements = { resetDemoButton: document.getElementById("resetDemoButton"), customStatus: document.getElementById("customStatus"), modelSuggestions: document.getElementById("modelSuggestions"), + 
experimentFileInput: document.getElementById("experimentFileInput"), + experimentFileButton: document.getElementById("experimentFileButton"), + sampleNavigation: document.getElementById("sampleNavigation"), + prevSampleBtn: document.getElementById("prevSampleBtn"), + nextSampleBtn: document.getElementById("nextSampleBtn"), + sampleCounter: document.getElementById("sampleCounter"), + filterIncorrectToggle: document.getElementById("filterIncorrectToggle"), }; function updateModelSuggestions() { @@ -2559,6 +2570,136 @@ function renderStepInspector() { renderTree(); } +// --------------------------------------------------------------------------- +// Experiment file loading & sample navigation +// --------------------------------------------------------------------------- + +function handleExperimentFileUpload(file) { + const reader = new FileReader(); + reader.onload = (event) => { + try { + const rawBundle = JSON.parse(event.target.result); + const normalized = normalizePrototypeBundle(rawBundle); + + if (!normalized.scenarios.length) { + setStatus("Loaded file contains no valid examples.", true); + return; + } + + // Store full examples for sample navigation + state.experimentSamples = normalized.scenarios; + state.experimentFilterIncorrect = false; + if (elements.filterIncorrectToggle) { + elements.filterIncorrectToggle.checked = false; + } + + // Load into custom payloads so loadPayloadForScenario works + state.customPayloads = normalized.payloads; + state.dataMode = "custom"; + + // Load into catalog and UI + state.catalog = normalized.scenarios; + state.prototypePayloads = normalized.payloads; + state.prototypeLoaded = true; + state.prototypeCatalog = normalized.scenarios; + + recomputeFilteredIndices(); + state.experimentCurrentIdx = 0; + + populateScenarioSelect(); + navigateToCurrentSample(); + updateSampleNavigationUi(); + + // Auto-enable cached mode + state.useCachedExample = true; + if (elements.useCachedToggle) { + elements.useCachedToggle.checked = 
true; + } + applyCachedModeUi(); + + setStatus( + `Loaded ${normalized.scenarios.length} samples from file.`, + false, + ); + } catch (error) { + setStatus(`Failed to parse file: ${error.message}`, true); + } + }; + reader.readAsText(file); +} + +function recomputeFilteredIndices() { + if (!state.experimentSamples.length) { + state.experimentFilteredIndices = []; + return; + } + + state.experimentFilteredIndices = []; + for (let i = 0; i < state.experimentSamples.length; i++) { + if (state.experimentFilterIncorrect) { + // Check if sample title contains "[INCORRECT]" + const title = state.experimentSamples[i].title || ""; + if (!title.includes("INCORRECT")) { + continue; + } + } + state.experimentFilteredIndices.push(i); + } +} + +async function navigateToCurrentSample() { + const indices = state.experimentFilteredIndices; + if (!indices.length) { + setStatus("No samples match the current filter.", false); + return; + } + + const clampedIdx = Math.max( + 0, + Math.min(state.experimentCurrentIdx, indices.length - 1), + ); + state.experimentCurrentIdx = clampedIdx; + const sampleIdx = indices[clampedIdx]; + const scenario = state.experimentSamples[sampleIdx]; + + state.scenarioId = scenario.id; + elements.scenarioSelect.value = scenario.id; + configureCaseSelect(scenario.default_budget); + + await loadCachedOptionsForCurrentScenario(); + updateSampleNavigationUi(); +} + +function updateSampleNavigationUi() { + const indices = state.experimentFilteredIndices; + const hasExperiment = indices.length > 0; + + if (elements.sampleNavigation) { + elements.sampleNavigation.classList.toggle("hidden", !hasExperiment); + } + if (elements.prevSampleBtn) { + elements.prevSampleBtn.disabled = + !hasExperiment || state.experimentCurrentIdx <= 0; + } + if (elements.nextSampleBtn) { + elements.nextSampleBtn.disabled = + !hasExperiment || state.experimentCurrentIdx >= indices.length - 1; + } + if (elements.sampleCounter) { + if (hasExperiment) { + const totalAll = 
state.experimentSamples.length; + const showing = indices.length; + const current = state.experimentCurrentIdx + 1; + elements.sampleCounter.textContent = + showing === totalAll + ? `Sample ${current} of ${showing}` + : `Sample ${current} of ${showing} (${totalAll} total)`; + } else { + elements.sampleCounter.textContent = ""; + } + } +} + function render() { if (!state.payload) { return; @@ -2581,6 +2722,8 @@ function bindHandlers() { clearRenderedResults(); if (state.useCachedExample) { await loadCachedOptionsForCurrentScenario(); + // Auto-run in cached mode so the user sees results immediately + elements.runButton?.click(); } }); @@ -2753,6 +2896,55 @@ function bindHandlers() { elements.resetDemoButton.addEventListener("click", async () => { await restoreDemoData(); }); + + // Experiment file upload + if (elements.experimentFileButton) { + elements.experimentFileButton.addEventListener("click", () => { + elements.experimentFileInput?.click(); + }); + } + if (elements.experimentFileInput) { + elements.experimentFileInput.addEventListener("change", (event) => { + const file = event.target.files?.[0]; + if (file) { + handleExperimentFileUpload(file); + } + }); + } + + // Sample navigation + if (elements.prevSampleBtn) { + elements.prevSampleBtn.addEventListener("click", async () => { + if (state.experimentCurrentIdx > 0) { + state.experimentCurrentIdx--; + await navigateToCurrentSample(); + } + }); + } + if (elements.nextSampleBtn) { + elements.nextSampleBtn.addEventListener("click", async () => { + if ( + state.experimentCurrentIdx < + state.experimentFilteredIndices.length - 1 + ) { + state.experimentCurrentIdx++; + await navigateToCurrentSample(); + } + }); + } + if (elements.filterIncorrectToggle) { + elements.filterIncorrectToggle.addEventListener("change", async (event) => { + state.experimentFilterIncorrect = event.target.checked; + recomputeFilteredIndices(); + state.experimentCurrentIdx = 0; + if (state.experimentFilteredIndices.length) { + await 
navigateToCurrentSample(); + } else { + updateSampleNavigationUi(); + setStatus("No incorrect samples found.", false); + } + }); + } } async function checkApiHealth() { @@ -2778,11 +2970,35 @@ async function init() { const apiAlive = await checkApiHealth(); if (!apiAlive) { setStatus( - "Service API is not reachable. Make sure the server is running (python service_app/main.py).", + "Service API is not reachable. Load an experiment file or start the server.", true, ); - elements.strategyGrid.innerHTML = - '

Cannot connect to the service API.

'; + // Try to load cached examples for file:// or prototype mode + try { + state.catalog = await loadCatalog(); + } catch { + state.catalog = []; + } + if (!state.catalog.length) { + elements.strategyGrid.innerHTML = + '

Load an experiment file (debugger_payload.json) to explore results.

'; + // Show cached controls so file upload is accessible + elements.cachedExplorerControls?.classList.remove("hidden"); + return; + } + // Auto-enable cached mode when API is down but cached data exists + state.useCachedExample = true; + if (elements.useCachedToggle) { + elements.useCachedToggle.checked = true; + } + state.scenarioId = state.catalog[0].id; + populateScenarioSelect(); + configureCaseSelect(state.catalog[0].default_budget); + applyCachedModeUi(); + await loadCachedOptionsForCurrentScenario(); + // Auto-run first scenario so user sees results immediately + elements.runButton?.click(); + setStatus("Loaded cached examples (offline mode).", false); return; } @@ -2790,7 +3006,8 @@ async function init() { if (!state.catalog.length) { elements.strategyGrid.innerHTML = - '

No debugger scenarios are available.

'; + '

No debugger scenarios are available. Load an experiment file to explore results.

'; + elements.cachedExplorerControls?.classList.remove("hidden"); return; } diff --git a/service_app/static/debugger/index.html b/service_app/static/debugger/index.html index d86406f2..695e47a9 100644 --- a/service_app/static/debugger/index.html +++ b/service_app/static/debugger/index.html @@ -91,6 +91,20 @@

Cached Example Explorer

Case + + +