diff --git a/CHANGELOG.md b/CHANGELOG.md index 545e01d2..1005db9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #38) - GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30) - MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30) - Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30) diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py index 056f19e3..85dfba77 100644 --- a/maseval/core/benchmark.py +++ b/maseval/core/benchmark.py @@ -1226,20 +1226,17 @@ def _execute_task_repetition( # Task execution failed, so skip evaluation eval_results = None - # 5. Build report + # 5. Build report — all keys always present for consistent schema report: Dict[str, Any] = { "task_id": str(task.id), "repeat_idx": repeat_idx, "status": execution_status.value, + "error": error_info, "traces": execution_traces, "config": execution_configs, "eval": eval_results, } - # Add error info if present - if error_info is not None: - report["error"] = error_info - # Clear registry after task repetition completes self.clear_registry() diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py index 92427618..1606cfc4 100644 --- a/maseval/core/callbacks/result_logger.py +++ b/maseval/core/callbacks/result_logger.py @@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict: filtered = { "task_id": report.get("task_id"), "repeat_idx": report.get("repeat_idx"), + "status": report.get("status"), + "error": report.get("error"), } if self.include_traces and "traces" in report: diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py index 2d600fcc..f8bf2c07 100644 --- a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py +++ b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py @@ -562,7 +562,7 @@ def test_successful_task_has_success_status(self): assert len(reports) == 1 assert reports[0]["status"] == TaskExecutionStatus.SUCCESS.value - assert "error" not in reports[0] + assert reports[0]["error"] is None assert reports[0]["eval"] is not None def test_default_failure_flags(self): diff --git a/tests/test_core/test_callbacks/test_result_logger.py b/tests/test_core/test_callbacks/test_result_logger.py index 4f7654d6..95013641 100644 --- a/tests/test_core/test_callbacks/test_result_logger.py +++ b/tests/test_core/test_callbacks/test_result_logger.py @@ -134,6 +134,46 @@ def test_filter_report_all_included(self): assert "config" in filtered assert "eval" in filtered + def test_filter_report_preserves_status_and_error(self): + """Test that status and error fields are always included in filtered reports. + + These are core metadata fields (like task_id and repeat_idx) that must + always be present so persisted results can distinguish successes from failures. + """ + logger = MockResultLogger(include_traces=False, include_config=False, include_eval=False) + + report = { + "task_id": "task_0", + "repeat_idx": 0, + "status": "agent_error", + "error": {"type": "AgentError", "message": "Tool call failed"}, + "traces": {"agent": "trace_data"}, + "config": {"model": "gpt-4"}, + "eval": {"score": 0.0}, + } + + filtered = logger._filter_report(report) + + assert filtered["status"] == "agent_error" + assert filtered["error"] == {"type": "AgentError", "message": "Tool call failed"} + assert "traces" not in filtered + assert "config" not in filtered + assert "eval" not in filtered + + def test_filter_report_status_and_error_absent(self): + """Test that missing status/error fields result in None values.""" + logger = MockResultLogger() + + report = { + "task_id": "task_0", + "repeat_idx": 0, + } + + filtered = logger._filter_report(report) + + assert filtered["status"] is None + assert filtered["error"] is None + def test_filter_report_partial_included(self): """Test report filtering with only some fields included.""" logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False)