Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #38)
- GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30)
- MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30)
- Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30)
Expand Down
7 changes: 2 additions & 5 deletions maseval/core/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,20 +1226,17 @@ def _execute_task_repetition(
# Task execution failed, so skip evaluation
eval_results = None

# 5. Build report
# 5. Build report — all keys always present for consistent schema
report: Dict[str, Any] = {
"task_id": str(task.id),
"repeat_idx": repeat_idx,
"status": execution_status.value,
"error": error_info,
"traces": execution_traces,
"config": execution_configs,
"eval": eval_results,
}

# Add error info if present
if error_info is not None:
report["error"] = error_info

# Clear registry after task repetition completes
self.clear_registry()

Expand Down
2 changes: 2 additions & 0 deletions maseval/core/callbacks/result_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict:
filtered = {
"task_id": report.get("task_id"),
"repeat_idx": report.get("repeat_idx"),
"status": report.get("status"),
"error": report.get("error"),
}

if self.include_traces and "traces" in report:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_core/test_benchmark/test_benchmark_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def test_successful_task_has_success_status(self):

assert len(reports) == 1
assert reports[0]["status"] == TaskExecutionStatus.SUCCESS.value
assert "error" not in reports[0]
assert reports[0]["error"] is None
assert reports[0]["eval"] is not None

def test_default_failure_flags(self):
Expand Down
40 changes: 40 additions & 0 deletions tests/test_core/test_callbacks/test_result_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,46 @@ def test_filter_report_all_included(self):
assert "config" in filtered
assert "eval" in filtered

def test_filter_report_preserves_status_and_error(self):
    """Verify status/error survive filtering even when all optional fields are off.

    status and error are core metadata (alongside task_id and repeat_idx), so a
    logger configured to drop traces/config/eval must still persist them —
    otherwise saved results cannot tell a success from an infrastructure failure.
    """
    result_logger = MockResultLogger(
        include_traces=False, include_config=False, include_eval=False
    )

    error_payload = {"type": "AgentError", "message": "Tool call failed"}
    full_report = {
        "task_id": "task_0",
        "repeat_idx": 0,
        "status": "agent_error",
        "error": error_payload,
        "traces": {"agent": "trace_data"},
        "config": {"model": "gpt-4"},
        "eval": {"score": 0.0},
    }

    result = result_logger._filter_report(full_report)

    # Core metadata is always carried through.
    assert result["status"] == "agent_error"
    assert result["error"] == error_payload

    # Optional sections were disabled and must be stripped.
    for excluded_key in ("traces", "config", "eval"):
        assert excluded_key not in result

def test_filter_report_status_and_error_absent(self):
    """A report lacking status/error filters to explicit None values for both."""
    result_logger = MockResultLogger()

    minimal_report = {"task_id": "task_0", "repeat_idx": 0}
    result = result_logger._filter_report(minimal_report)

    # Keys are present with None rather than being omitted, keeping the
    # persisted schema consistent across success and failure paths.
    assert result["status"] is None
    assert result["error"] is None

def test_filter_report_partial_included(self):
"""Test report filtering with only some fields included."""
logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False)
Expand Down
Loading