From 2ba30a7fe21ed2b13e3ddb20d1c8f69dc45ac9b6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 8 Mar 2026 21:04:19 +0000
Subject: [PATCH] Remove --external-runner from MCP config and use Reviews
 mechanism for self-review

Remove the --external-runner claude arg from the MCP config so Jobs
always uses self-review mode. Refactor the self-review code path in
finished_step to use the Deepwork Reviews mechanism (ReviewTask +
write_instruction_files + format_for_claude) instead of the previous
custom instruction-file generation. Add adapter function
adapt_reviews_to_review_tasks in quality_gate.py that bridges Jobs
review data into ReviewTask objects without modifying the Reviews code.

https://claude.ai/code/session_013pcsdysk56DbD3mnXj8qC8
---
 plugins/claude/.mcp.json              |   2 +-
 src/deepwork/jobs/mcp/quality_gate.py | 159 ++++++++++++++++++++++++++
 src/deepwork/jobs/mcp/tools.py        |  66 +++++------
 tests/unit/jobs/mcp/test_tools.py     |  35 +++---
 4 files changed, 212 insertions(+), 50 deletions(-)

diff --git a/plugins/claude/.mcp.json b/plugins/claude/.mcp.json
index 278c6a9c..edad8208 100644
--- a/plugins/claude/.mcp.json
+++ b/plugins/claude/.mcp.json
@@ -2,7 +2,7 @@
   "mcpServers": {
     "deepwork": {
       "command": "uvx",
-      "args": ["deepwork", "serve", "--path", ".", "--external-runner", "claude", "--platform", "claude"]
+      "args": ["deepwork", "serve", "--path", ".", "--platform", "claude"]
     }
   }
 }
diff --git a/src/deepwork/jobs/mcp/quality_gate.py b/src/deepwork/jobs/mcp/quality_gate.py
index 376db9b9..90743470 100644
--- a/src/deepwork/jobs/mcp/quality_gate.py
+++ b/src/deepwork/jobs/mcp/quality_gate.py
@@ -54,6 +54,165 @@ class QualityGateError(Exception):
     pass
 
 
+# =========================================================================
+# Adapter: Jobs review data → Deepwork Reviews ReviewTask objects
+#
+# This adapter bridges the Jobs quality-gate review format into the
+# ReviewTask dataclass used by Deepwork Reviews (deepwork.review.config).
+# The Deepwork Reviews code is the canonical implementation and is NOT
+# modified here; instead we wrap it by producing ReviewTask objects that
+# its instruction/formatting pipeline can consume.
+# =========================================================================
+
+
+def adapt_reviews_to_review_tasks(
+    reviews: list[dict[str, Any]],
+    outputs: dict[str, str | list[str]],
+    output_specs: dict[str, str],
+    step_id: str = "quality-review",
+    notes: str | None = None,
+) -> list["ReviewTask"]:
+    """Convert Jobs review dicts into ReviewTask objects for the Reviews pipeline.
+
+    This is adapter logic that wraps the Deepwork Reviews mechanism so that
+    Jobs self-review can reuse the same instruction-file generation and
+    formatting code.
+
+    Args:
+        reviews: List of review dicts with ``run_each``, ``quality_criteria``,
+            and optional ``additional_review_guidance``.
+        outputs: Map of output names to file path(s) as submitted by the agent.
+        output_specs: Map of output names to their type (``"file"`` or ``"files"``).
+        step_id: Identifier for the step being reviewed (used in rule_name).
+        notes: Optional notes from the agent about work done.
+
+    Returns:
+        List of ReviewTask objects suitable for ``write_instruction_files``
+        and ``format_for_claude``.
+    """
+    from deepwork.review.config import ReviewTask
+
+    tasks: list[ReviewTask] = []
+
+    for review in reviews:
+        run_each: str = review["run_each"]
+        quality_criteria: dict[str, str] = review["quality_criteria"]
+        guidance: str | None = review.get("additional_review_guidance")
+
+        # Build instruction text from quality criteria (adapter: translating
+        # Jobs quality_criteria dict into the free-text instructions field
+        # that ReviewTask expects).
+        instructions = _build_review_instructions_text(quality_criteria, guidance, notes)
+
+        if run_each == "step":
+            # Review all outputs together — collect every file path
+            all_paths = _flatten_output_paths(outputs)
+            tasks.append(
+                ReviewTask(
+                    rule_name=f"{step_id}-quality-review",
+                    files_to_review=all_paths,
+                    instructions=instructions,
+                    agent_name=None,
+                )
+            )
+        elif run_each in outputs:
+            output_type = output_specs.get(run_each, "file")
+            output_value = outputs[run_each]
+
+            if output_type == "files" and isinstance(output_value, list):
+                # Per-file review: one ReviewTask per file
+                for file_path in output_value:
+                    tasks.append(
+                        ReviewTask(
+                            rule_name=f"{step_id}-quality-review-{run_each}",
+                            files_to_review=[file_path],
+                            instructions=instructions,
+                            agent_name=None,
+                        )
+                    )
+            else:
+                # Single file review
+                path = output_value if isinstance(output_value, str) else str(output_value)
+                tasks.append(
+                    ReviewTask(
+                        rule_name=f"{step_id}-quality-review-{run_each}",
+                        files_to_review=[path],
+                        instructions=instructions,
+                        agent_name=None,
+                    )
+                )
+
+    return tasks
+
+
+def _build_review_instructions_text(
+    quality_criteria: dict[str, str],
+    guidance: str | None = None,
+    notes: str | None = None,
+) -> str:
+    """Build instruction text from Jobs quality criteria for a ReviewTask.
+
+    Adapter logic: translates the Jobs structured quality_criteria dict
+    into the free-text instructions string that ReviewTask.instructions expects.
+
+    Args:
+        quality_criteria: Map of criterion name to question.
+        guidance: Optional additional review guidance.
+        notes: Optional author notes.
+
+    Returns:
+        Markdown instruction text.
+    """
+    parts: list[str] = []
+
+    parts.append(
+        "You are an editor responsible for reviewing the files listed below. "
+        "Evaluate whether the outputs meet the specified quality criteria."
+    )
+    parts.append("")
+
+    parts.append("## Criteria to Evaluate")
+    parts.append("")
+    for name, question in quality_criteria.items():
+        parts.append(f"- **{name}**: {question}")
+    parts.append("")
+
+    if guidance:
+        parts.append("## Additional Context")
+        parts.append("")
+        parts.append(guidance)
+        parts.append("")
+
+    if notes:
+        parts.append("## Author Notes")
+        parts.append("")
+        parts.append(notes)
+        parts.append("")
+
+    parts.append("## Guidelines")
+    parts.append("")
+    parts.append("- Be strict but fair")
+    parts.append(
+        "- Apply criteria pragmatically. If a criterion is not applicable "
+        "to this step's purpose, pass it."
+    )
+    parts.append("- Only mark a criterion as passed if it is clearly met or not applicable.")
+    parts.append("- Provide specific, actionable feedback for failed criteria.")
+
+    return "\n".join(parts)
+
+
+def _flatten_output_paths(outputs: dict[str, str | list[str]]) -> list[str]:
+    """Flatten outputs dict into a list of file paths (adapter helper)."""
+    paths: list[str] = []
+    for value in outputs.values():
+        if isinstance(value, list):
+            paths.extend(value)
+        else:
+            paths.append(value)
+    return paths
+
+
 class QualityGate:
     """Evaluates step outputs against quality criteria.
 
diff --git a/src/deepwork/jobs/mcp/tools.py b/src/deepwork/jobs/mcp/tools.py
index 9cde1b5d..9ef7ad8a 100644
--- a/src/deepwork/jobs/mcp/tools.py
+++ b/src/deepwork/jobs/mcp/tools.py
@@ -14,8 +14,6 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-import aiofiles
-
 from deepwork.jobs.discovery import JobLoadError, find_job_dir, load_all_jobs
 from deepwork.jobs.mcp.schemas import (
     AbortWorkflowInput,
@@ -466,46 +464,44 @@ async def finished_step(self, input_data: FinishedStepInput) -> FinishedStepResp
             output_specs = {out.name: out.type for out in current_step.outputs}
 
             if self.external_runner is None:
-                # Self-review mode: build instructions file and return guidance
-                # to the agent to verify its own work via a subagent.
-                review_content = await self.quality_gate.build_review_instructions_file(
+                # Self-review mode: use the Deepwork Reviews mechanism to
+                # generate instruction files and formatted output.
+                # This adapter bridges Jobs review data into ReviewTask objects
+                # that the Reviews pipeline can consume.
+                from deepwork.jobs.mcp.quality_gate import adapt_reviews_to_review_tasks
+                from deepwork.review.formatter import format_for_claude
+                from deepwork.review.instructions import write_instruction_files
+
+                review_tasks = adapt_reviews_to_review_tasks(
                     reviews=review_dicts,
                     outputs=input_data.outputs,
                     output_specs=output_specs,
-                    project_root=self.project_root,
+                    step_id=current_step_id,
                     notes=input_data.notes,
                 )
 
-                # Write instructions to .deepwork/tmp/
-                tmp_dir = self.project_root / ".deepwork" / "tmp"
-                tmp_dir.mkdir(parents=True, exist_ok=True)
-                review_filename = f"quality_review_{sid}_{current_step_id}.md"
-                review_file_path = tmp_dir / review_filename
-                async with aiofiles.open(review_file_path, "w", encoding="utf-8") as f:
-                    await f.write(review_content)
-
-                relative_path = f".deepwork/tmp/{review_filename}"
-                feedback = (
-                    f"Quality review required. Review instructions have been written to: "
-                    f"{relative_path}\n\n"
-                    f"Verify the quality of your work:\n"
-                    f'1. Spawn a subagent with the prompt: "Read the file at '
-                    f"{relative_path} and follow the instructions in it. "
-                    f"Review the referenced output files and evaluate them against "
-                    f'the criteria specified. Report your detailed findings."\n'
-                    f"2. Review the subagent's findings\n"
-                    f"3. Fix any issues identified by the subagent\n"
-                    f"4. Repeat steps 1-3 until all criteria pass\n"
-                    f"5. Once all criteria pass, call finished_step again with "
-                    f"quality_review_override_reason set to describe the "
-                    f"review outcome (e.g. 'Self-review passed: all criteria met')"
-                )
+                task_files = write_instruction_files(review_tasks, self.project_root)
 
-                return FinishedStepResponse(
-                    status=StepStatus.NEEDS_WORK,
-                    feedback=feedback,
-                    stack=self.state_manager.get_stack(),
-                )
+                if task_files:
+                    formatted = format_for_claude(task_files, self.project_root)
+                    feedback = (
+                        f"Quality review required.\n\n"
+                        f"{formatted}\n"
+                        f"After reviewing, fix any issues found, then call finished_step "
+                        f"again with quality_review_override_reason set to describe the "
+                        f"review outcome (e.g. 'Self-review passed: all criteria met')"
+                    )
+                else:
+                    # All reviews already passed (cached .passed markers)
+                    feedback = None
+
+                if feedback:
+                    return FinishedStepResponse(
+                        status=StepStatus.NEEDS_WORK,
+                        feedback=feedback,
+                        stack=self.state_manager.get_stack(),
+                    )
+                # If no feedback (all cached), fall through to step completion
             else:
                 # External runner mode: use quality gate subprocess evaluation
                 attempts = await self.state_manager.record_quality_attempt(
diff --git a/tests/unit/jobs/mcp/test_tools.py b/tests/unit/jobs/mcp/test_tools.py
index 57b440d3..f19cb3fd 100644
--- a/tests/unit/jobs/mcp/test_tools.py
+++ b/tests/unit/jobs/mcp/test_tools.py
@@ -1552,7 +1552,7 @@ async def test_self_review_returns_needs_work(
     async def test_self_review_feedback_contains_instructions(
         self, tools_self_review: WorkflowTools, project_root: Path
     ) -> None:
-        """Test that feedback contains subagent and override instructions."""
+        """Test that feedback contains task invocation and override instructions."""
         await tools_self_review.start_workflow(
             StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main")
         )
@@ -1564,16 +1564,17 @@ async def test_self_review_feedback_contains_instructions(
 
         assert response.feedback is not None
         assert "Quality review required" in response.feedback
-        assert "subagent" in response.feedback.lower()
+        # Uses Reviews mechanism: format_for_claude output with subagent_type
+        assert "subagent_type" in response.feedback
         assert "quality_review_override_reason" in response.feedback
-        assert ".deepwork/tmp/quality_review_" in response.feedback
+        assert ".deepwork/tmp/review_instructions/" in response.feedback
 
     # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10).
     # YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
     async def test_self_review_writes_instructions_file(
         self, tools_self_review: WorkflowTools, project_root: Path
     ) -> None:
-        """Test that an instructions file is written to .deepwork/tmp/."""
+        """Test that an instructions file is written to .deepwork/tmp/review_instructions/."""
         await tools_self_review.start_workflow(
             StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main")
         )
@@ -1583,7 +1584,8 @@ async def test_self_review_writes_instructions_file(
             FinishedStepInput(outputs={"output1.md": "output1.md"})
         )
 
-        review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
+        review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
+        review_files = list(review_dir.glob("*.md"))
         assert len(review_files) == 1
 
     # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10).
@@ -1601,7 +1603,8 @@ async def test_self_review_file_contains_criteria(
             FinishedStepInput(outputs={"output1.md": "output1.md"})
         )
 
-        review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
+        review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
+        review_files = list(review_dir.glob("*.md"))
         content = review_files[0].read_text()
 
         # step1 has review criteria "Output Valid": "Is the output valid?"
@@ -1621,7 +1624,8 @@ async def test_self_review_file_references_outputs_not_inline(
             FinishedStepInput(outputs={"output1.md": "output1.md"})
         )
 
-        review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
+        review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
+        review_files = list(review_dir.glob("*.md"))
         content = review_files[0].read_text()
 
         assert "output1.md" in content
@@ -1629,22 +1633,24 @@ async def test_self_review_file_references_outputs_not_inline(
 
     # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10).
     # YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
-    async def test_self_review_file_named_with_session_and_step(
+    async def test_self_review_file_named_with_step_id(
         self, tools_self_review: WorkflowTools, project_root: Path
     ) -> None:
-        """Test that review file name includes session and step IDs."""
-        resp = await tools_self_review.start_workflow(
+        """Test that review file name includes the step ID via Reviews naming."""
+        await tools_self_review.start_workflow(
             StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main")
         )
-        session_id = resp.begin_step.session_id
         (project_root / "output1.md").write_text("output")
 
         await tools_self_review.finished_step(
             FinishedStepInput(outputs={"output1.md": "output1.md"})
         )
 
-        expected_file = project_root / ".deepwork" / "tmp" / f"quality_review_{session_id}_step1.md"
-        assert expected_file.exists()
+        # Uses Reviews mechanism: file named with deterministic review ID
+        # containing step_id (e.g. "step1-quality-review--output1.md--<hash>.md")
+        review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
+        review_files = list(review_dir.glob("step1-quality-review--*.md"))
+        assert len(review_files) == 1
 
     # THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.9, JOBS-REQ-001.4.10).
     # YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
@@ -1716,7 +1722,8 @@ async def test_self_review_includes_notes_in_file(
             )
         )
 
-        review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
+        review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
+        review_files = list(review_dir.glob("*.md"))
         content = review_files[0].read_text()
         assert "I used the XYZ library for this step." in content