From b85377de77c5d82fb7b54a158eb5625d8fda5c7a Mon Sep 17 00:00:00 2001 From: cemde Date: Tue, 3 Mar 2026 14:22:44 +0000 Subject: [PATCH 1/2] Fix ResultLogger._filter_report() dropping status and error fields The _filter_report() method was only copying task_id and repeat_idx into filtered output, silently dropping the status and error fields. This made it impossible to distinguish successful runs from failures in persisted results. These are core metadata fields, not optional bulk data controlled by include_* flags. --- CHANGELOG.md | 1 + maseval/core/benchmark.py | 7 +--- maseval/core/callbacks/result_logger.py | 2 + .../test_benchmark_lifecycle.py | 2 +- .../test_callbacks/test_result_logger.py | 40 +++++++++++++++++++ 5 files changed, 46 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 545e01d2..4539fe33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #PR_NUMBER_PLACEHOLDER) - GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30) - MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30) - Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30) diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py index 056f19e3..85dfba77 100644 --- a/maseval/core/benchmark.py +++ b/maseval/core/benchmark.py @@ -1226,20 +1226,17 @@ def _execute_task_repetition( # Task execution failed, so skip evaluation eval_results = None - # 5. Build report + # 5. Build report — all keys always present for consistent schema report: Dict[str, Any] = { "task_id": str(task.id), "repeat_idx": repeat_idx, "status": execution_status.value, + "error": error_info, "traces": execution_traces, "config": execution_configs, "eval": eval_results, } - # Add error info if present - if error_info is not None: - report["error"] = error_info - # Clear registry after task repetition completes self.clear_registry() diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py index 92427618..1606cfc4 100644 --- a/maseval/core/callbacks/result_logger.py +++ b/maseval/core/callbacks/result_logger.py @@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict: filtered = { "task_id": report.get("task_id"), "repeat_idx": report.get("repeat_idx"), + "status": report.get("status"), + "error": report.get("error"), } if self.include_traces and "traces" in report: diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py index 2d600fcc..f8bf2c07 100644 --- a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py +++ b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py @@ -562,7 +562,7 @@ def test_successful_task_has_success_status(self): assert len(reports) == 1 assert reports[0]["status"] == TaskExecutionStatus.SUCCESS.value - assert "error" not in reports[0] + assert reports[0]["error"] is None assert reports[0]["eval"] is not None def test_default_failure_flags(self): diff --git a/tests/test_core/test_callbacks/test_result_logger.py b/tests/test_core/test_callbacks/test_result_logger.py index 4f7654d6..95013641 100644 --- a/tests/test_core/test_callbacks/test_result_logger.py +++ b/tests/test_core/test_callbacks/test_result_logger.py @@ -134,6 +134,46 @@ def test_filter_report_all_included(self): assert "config" in filtered assert "eval" in filtered + def test_filter_report_preserves_status_and_error(self): + """Test that status and error fields are always included in filtered reports. + + These are core metadata fields (like task_id and repeat_idx) that must + always be present so persisted results can distinguish successes from failures. + """ + logger = MockResultLogger(include_traces=False, include_config=False, include_eval=False) + + report = { + "task_id": "task_0", + "repeat_idx": 0, + "status": "agent_error", + "error": {"type": "AgentError", "message": "Tool call failed"}, + "traces": {"agent": "trace_data"}, + "config": {"model": "gpt-4"}, + "eval": {"score": 0.0}, + } + + filtered = logger._filter_report(report) + + assert filtered["status"] == "agent_error" + assert filtered["error"] == {"type": "AgentError", "message": "Tool call failed"} + assert "traces" not in filtered + assert "config" not in filtered + assert "eval" not in filtered + + def test_filter_report_status_and_error_absent(self): + """Test that missing status/error fields result in None values.""" + logger = MockResultLogger() + + report = { + "task_id": "task_0", + "repeat_idx": 0, + } + + filtered = logger._filter_report(report) + + assert filtered["status"] is None + assert filtered["error"] is None + def test_filter_report_partial_included(self): """Test report filtering with only some fields included.""" logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False) From b5181754f7171b48e1fbc420d06b3577736009d8 Mon Sep 17 00:00:00 2001 From: Cornelius <42615086+cemde@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:15:30 +0000 Subject: [PATCH 2/2] Update CHANGELOG with fixed issues and PR references --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4539fe33..1005db9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,7 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed -- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #PR_NUMBER_PLACEHOLDER) +- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #38) - GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30) - MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30) - Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30)