From 2ba30a7fe21ed2b13e3ddb20d1c8f69dc45ac9b6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 8 Mar 2026 21:04:19 +0000 Subject: [PATCH] Remove --external-runner from MCP config and use Reviews mechanism for self-review Remove the --external-runner claude arg from the MCP config so Jobs always uses self-review mode. Refactor the self-review code path in finished_step to use the Deepwork Reviews mechanism (ReviewTask + write_instruction_files + format_for_claude) instead of the previous custom instruction-file generation. Add adapter function adapt_reviews_to_review_tasks in quality_gate.py that bridges Jobs review data into ReviewTask objects without modifying the Reviews code. https://claude.ai/code/session_013pcsdysk56DbD3mnXj8qC8 --- plugins/claude/.mcp.json | 2 +- src/deepwork/jobs/mcp/quality_gate.py | 159 ++++++++++++++++++++++++++ src/deepwork/jobs/mcp/tools.py | 66 +++++------ tests/unit/jobs/mcp/test_tools.py | 35 +++--- 4 files changed, 212 insertions(+), 50 deletions(-) diff --git a/plugins/claude/.mcp.json b/plugins/claude/.mcp.json index 278c6a9c..edad8208 100644 --- a/plugins/claude/.mcp.json +++ b/plugins/claude/.mcp.json @@ -2,7 +2,7 @@ "mcpServers": { "deepwork": { "command": "uvx", - "args": ["deepwork", "serve", "--path", ".", "--external-runner", "claude", "--platform", "claude"] + "args": ["deepwork", "serve", "--path", ".", "--platform", "claude"] } } } diff --git a/src/deepwork/jobs/mcp/quality_gate.py b/src/deepwork/jobs/mcp/quality_gate.py index 376db9b9..90743470 100644 --- a/src/deepwork/jobs/mcp/quality_gate.py +++ b/src/deepwork/jobs/mcp/quality_gate.py @@ -54,6 +54,165 @@ class QualityGateError(Exception): pass +# ========================================================================= +# Adapter: Jobs review data → Deepwork Reviews ReviewTask objects +# +# This adapter bridges the Jobs quality-gate review format into the +# ReviewTask dataclass used by Deepwork Reviews (deepwork.review.config). +# The Deepwork Reviews code is the canonical implementation and is NOT +# modified here; instead we wrap it by producing ReviewTask objects that +# its instruction/formatting pipeline can consume. +# ========================================================================= + + +def adapt_reviews_to_review_tasks( + reviews: list[dict[str, Any]], + outputs: dict[str, str | list[str]], + output_specs: dict[str, str], + step_id: str = "quality-review", + notes: str | None = None, +) -> list["ReviewTask"]: + """Convert Jobs review dicts into ReviewTask objects for the Reviews pipeline. + + This is adapter logic that wraps the Deepwork Reviews mechanism so that + Jobs self-review can reuse the same instruction-file generation and + formatting code. + + Args: + reviews: List of review dicts with ``run_each``, ``quality_criteria``, + and optional ``additional_review_guidance``. + outputs: Map of output names to file path(s) as submitted by the agent. + output_specs: Map of output names to their type (``"file"`` or ``"files"``). + step_id: Identifier for the step being reviewed (used in rule_name). + notes: Optional notes from the agent about work done. + + Returns: + List of ReviewTask objects suitable for ``write_instruction_files`` + and ``format_for_claude``. + """ + from deepwork.review.config import ReviewTask + + tasks: list[ReviewTask] = [] + + for review in reviews: + run_each: str = review["run_each"] + quality_criteria: dict[str, str] = review["quality_criteria"] + guidance: str | None = review.get("additional_review_guidance") + + # Build instruction text from quality criteria (adapter: translating + # Jobs quality_criteria dict into the free-text instructions field + # that ReviewTask expects). + instructions = _build_review_instructions_text(quality_criteria, guidance, notes) + + if run_each == "step": + # Review all outputs together — collect every file path + all_paths = _flatten_output_paths(outputs) + tasks.append( + ReviewTask( + rule_name=f"{step_id}-quality-review", + files_to_review=all_paths, + instructions=instructions, + agent_name=None, + ) + ) + elif run_each in outputs: + output_type = output_specs.get(run_each, "file") + output_value = outputs[run_each] + + if output_type == "files" and isinstance(output_value, list): + # Per-file review: one ReviewTask per file + for file_path in output_value: + tasks.append( + ReviewTask( + rule_name=f"{step_id}-quality-review-{run_each}", + files_to_review=[file_path], + instructions=instructions, + agent_name=None, + ) + ) + else: + # Single file review + path = output_value if isinstance(output_value, str) else str(output_value) + tasks.append( + ReviewTask( + rule_name=f"{step_id}-quality-review-{run_each}", + files_to_review=[path], + instructions=instructions, + agent_name=None, + ) + ) + + return tasks + + +def _build_review_instructions_text( + quality_criteria: dict[str, str], + guidance: str | None = None, + notes: str | None = None, +) -> str: + """Build instruction text from Jobs quality criteria for a ReviewTask. + + Adapter logic: translates the Jobs structured quality_criteria dict + into the free-text instructions string that ReviewTask.instructions expects. + + Args: + quality_criteria: Map of criterion name to question. + guidance: Optional additional review guidance. + notes: Optional author notes. + + Returns: + Markdown instruction text. + """ + parts: list[str] = [] + + parts.append( + "You are an editor responsible for reviewing the files listed below. " + "Evaluate whether the outputs meet the specified quality criteria." + ) + parts.append("") + + parts.append("## Criteria to Evaluate") + parts.append("") + for name, question in quality_criteria.items(): + parts.append(f"- **{name}**: {question}") + parts.append("") + + if guidance: + parts.append("## Additional Context") + parts.append("") + parts.append(guidance) + parts.append("") + + if notes: + parts.append("## Author Notes") + parts.append("") + parts.append(notes) + parts.append("") + + parts.append("## Guidelines") + parts.append("") + parts.append("- Be strict but fair") + parts.append( + "- Apply criteria pragmatically. If a criterion is not applicable " + "to this step's purpose, pass it." + ) + parts.append("- Only mark a criterion as passed if it is clearly met or not applicable.") + parts.append("- Provide specific, actionable feedback for failed criteria.") + + return "\n".join(parts) + + +def _flatten_output_paths(outputs: dict[str, str | list[str]]) -> list[str]: + """Flatten outputs dict into a list of file paths (adapter helper).""" + paths: list[str] = [] + for value in outputs.values(): + if isinstance(value, list): + paths.extend(value) + else: + paths.append(value) + return paths + + class QualityGate: """Evaluates step outputs against quality criteria. diff --git a/src/deepwork/jobs/mcp/tools.py b/src/deepwork/jobs/mcp/tools.py index 9cde1b5d..9ef7ad8a 100644 --- a/src/deepwork/jobs/mcp/tools.py +++ b/src/deepwork/jobs/mcp/tools.py @@ -14,8 +14,6 @@ from pathlib import Path from typing import TYPE_CHECKING -import aiofiles - from deepwork.jobs.discovery import JobLoadError, find_job_dir, load_all_jobs from deepwork.jobs.mcp.schemas import ( AbortWorkflowInput, @@ -466,46 +464,44 @@ async def finished_step(self, input_data: FinishedStepInput) -> FinishedStepResp output_specs = {out.name: out.type for out in current_step.outputs} if self.external_runner is None: - # Self-review mode: build instructions file and return guidance - # to the agent to verify its own work via a subagent. - review_content = await self.quality_gate.build_review_instructions_file( + # Self-review mode: use the Deepwork Reviews mechanism to + # generate instruction files and formatted output. + # This adapter bridges Jobs review data into ReviewTask objects + # that the Reviews pipeline can consume. + from deepwork.jobs.mcp.quality_gate import adapt_reviews_to_review_tasks + from deepwork.review.formatter import format_for_claude + from deepwork.review.instructions import write_instruction_files + + review_tasks = adapt_reviews_to_review_tasks( reviews=review_dicts, outputs=input_data.outputs, output_specs=output_specs, - project_root=self.project_root, + step_id=current_step_id, notes=input_data.notes, ) - # Write instructions to .deepwork/tmp/ - tmp_dir = self.project_root / ".deepwork" / "tmp" - tmp_dir.mkdir(parents=True, exist_ok=True) - review_filename = f"quality_review_{sid}_{current_step_id}.md" - review_file_path = tmp_dir / review_filename - async with aiofiles.open(review_file_path, "w", encoding="utf-8") as f: - await f.write(review_content) - - relative_path = f".deepwork/tmp/{review_filename}" - feedback = ( - f"Quality review required. Review instructions have been written to: " - f"{relative_path}\n\n" - f"Verify the quality of your work:\n" - f'1. Spawn a subagent with the prompt: "Read the file at ' - f"{relative_path} and follow the instructions in it. " - f"Review the referenced output files and evaluate them against " - f'the criteria specified. Report your detailed findings."\n' - f"2. Review the subagent's findings\n" - f"3. Fix any issues identified by the subagent\n" - f"4. Repeat steps 1-3 until all criteria pass\n" - f"5. Once all criteria pass, call finished_step again with " - f"quality_review_override_reason set to describe the " - f"review outcome (e.g. 'Self-review passed: all criteria met')" - ) + task_files = write_instruction_files(review_tasks, self.project_root) - return FinishedStepResponse( - status=StepStatus.NEEDS_WORK, - feedback=feedback, - stack=self.state_manager.get_stack(), - ) + if task_files: + formatted = format_for_claude(task_files, self.project_root) + feedback = ( + f"Quality review required.\n\n" + f"{formatted}\n" + f"After reviewing, fix any issues found, then call finished_step " + f"again with quality_review_override_reason set to describe the " + f"review outcome (e.g. 'Self-review passed: all criteria met')" + ) + else: + # All reviews already passed (cached .passed markers) + feedback = None + + if feedback: + return FinishedStepResponse( + status=StepStatus.NEEDS_WORK, + feedback=feedback, + stack=self.state_manager.get_stack(), + ) + # If no feedback (all cached), fall through to step completion else: # External runner mode: use quality gate subprocess evaluation attempts = await self.state_manager.record_quality_attempt( diff --git a/tests/unit/jobs/mcp/test_tools.py b/tests/unit/jobs/mcp/test_tools.py index 57b440d3..f19cb3fd 100644 --- a/tests/unit/jobs/mcp/test_tools.py +++ b/tests/unit/jobs/mcp/test_tools.py @@ -1552,7 +1552,7 @@ async def test_self_review_returns_needs_work( async def test_self_review_feedback_contains_instructions( self, tools_self_review: WorkflowTools, project_root: Path ) -> None: - """Test that feedback contains subagent and override instructions.""" + """Test that feedback contains task invocation and override instructions.""" await tools_self_review.start_workflow( StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") ) @@ -1564,16 +1564,17 @@ async def test_self_review_feedback_contains_instructions( assert response.feedback is not None assert "Quality review required" in response.feedback - assert "subagent" in response.feedback.lower() + # Uses Reviews mechanism: format_for_claude output with subagent_type + assert "subagent_type" in response.feedback assert "quality_review_override_reason" in response.feedback - assert ".deepwork/tmp/quality_review_" in response.feedback + assert ".deepwork/tmp/review_instructions/" in response.feedback # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10). # YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES async def test_self_review_writes_instructions_file( self, tools_self_review: WorkflowTools, project_root: Path ) -> None: - """Test that an instructions file is written to .deepwork/tmp/.""" + """Test that an instructions file is written to .deepwork/tmp/review_instructions/.""" await tools_self_review.start_workflow( StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") ) @@ -1583,7 +1584,8 @@ async def test_self_review_writes_instructions_file( FinishedStepInput(outputs={"output1.md": "output1.md"}) ) - review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + review_dir = project_root / ".deepwork" / "tmp" / "review_instructions" + review_files = list(review_dir.glob("*.md")) assert len(review_files) == 1 # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10). @@ -1601,7 +1603,8 @@ async def test_self_review_file_contains_criteria( FinishedStepInput(outputs={"output1.md": "output1.md"}) ) - review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + review_dir = project_root / ".deepwork" / "tmp" / "review_instructions" + review_files = list(review_dir.glob("*.md")) content = review_files[0].read_text() # step1 has review criteria "Output Valid": "Is the output valid?" @@ -1621,7 +1624,8 @@ async def test_self_review_file_references_outputs_not_inline( FinishedStepInput(outputs={"output1.md": "output1.md"}) ) - review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + review_dir = project_root / ".deepwork" / "tmp" / "review_instructions" + review_files = list(review_dir.glob("*.md")) content = review_files[0].read_text() assert "output1.md" in content @@ -1629,22 +1633,24 @@ async def test_self_review_file_references_outputs_not_inline( # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10). # YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES - async def test_self_review_file_named_with_session_and_step( + async def test_self_review_file_named_with_step_id( self, tools_self_review: WorkflowTools, project_root: Path ) -> None: - """Test that review file name includes session and step IDs.""" - resp = await tools_self_review.start_workflow( + """Test that review file name includes the step ID via Reviews naming.""" + await tools_self_review.start_workflow( StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main") ) - session_id = resp.begin_step.session_id (project_root / "output1.md").write_text("output") await tools_self_review.finished_step( FinishedStepInput(outputs={"output1.md": "output1.md"}) ) - expected_file = project_root / ".deepwork" / "tmp" / f"quality_review_{session_id}_step1.md" - assert expected_file.exists() + # Uses Reviews mechanism: file named with deterministic review ID + # containing step_id (e.g. "step1-quality-review--output1.md--.md") + review_dir = project_root / ".deepwork" / "tmp" / "review_instructions" + review_files = list(review_dir.glob("step1-quality-review--*.md")) + assert len(review_files) == 1 # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.9, JOBS-REQ-001.4.10). # YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES @@ -1716,7 +1722,8 @@ async def test_self_review_includes_notes_in_file( ) ) - review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md")) + review_dir = project_root / ".deepwork" / "tmp" / "review_instructions" + review_files = list(review_dir.glob("*.md")) content = review_files[0].read_text() assert "I used the XYZ library for this step." in content