Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion plugins/claude/.mcp.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"mcpServers": {
"deepwork": {
"command": "uvx",
"args": ["deepwork", "serve", "--path", ".", "--external-runner", "claude", "--platform", "claude"]
"args": ["deepwork", "serve", "--path", ".", "--platform", "claude"]
}
}
}
159 changes: 159 additions & 0 deletions src/deepwork/jobs/mcp/quality_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,165 @@ class QualityGateError(Exception):
pass


# =========================================================================
# Adapter: Jobs review data → Deepwork Reviews ReviewTask objects
#
# This adapter bridges the Jobs quality-gate review format into the
# ReviewTask dataclass used by Deepwork Reviews (deepwork.review.config).
# The Deepwork Reviews code is the canonical implementation and is NOT
# modified here; instead we wrap it by producing ReviewTask objects that
# its instruction/formatting pipeline can consume.
# =========================================================================


def adapt_reviews_to_review_tasks(
    reviews: list[dict[str, Any]],
    outputs: dict[str, str | list[str]],
    output_specs: dict[str, str],
    step_id: str = "quality-review",
    notes: str | None = None,
    project_root: Any = None,
) -> list["ReviewTask"]:
    """Convert Jobs review dicts into ReviewTask objects for the Reviews pipeline.

    This is adapter logic that wraps the Deepwork Reviews mechanism so that
    Jobs self-review can reuse the same instruction-file generation and
    formatting code.

    Args:
        reviews: List of review dicts with ``run_each``, ``quality_criteria``,
            and optional ``additional_review_guidance``.
        outputs: Map of output names to file path(s) as submitted by the agent.
        output_specs: Map of output names to their type (``"file"`` or ``"files"``).
        step_id: Identifier for the step being reviewed (used in rule_name).
        notes: Optional notes from the agent about work done.
        project_root: Accepted for compatibility with the caller in
            ``tools.py``, which passes ``project_root=self.project_root``.
            Currently unused here; path strings are forwarded as-is.
            NOTE(review): confirm whether paths should be resolved against it.

    Returns:
        List of ReviewTask objects suitable for ``write_instruction_files``
        and ``format_for_claude``.
    """
    from deepwork.review.config import ReviewTask

    tasks: list[ReviewTask] = []

    def _make_task(rule_name: str, paths: list[str], instructions: str) -> "ReviewTask":
        # All adapter-produced tasks use agent_name=None (no dedicated
        # reviewer agent is configured for Jobs self-review).
        return ReviewTask(
            rule_name=rule_name,
            files_to_review=paths,
            instructions=instructions,
            agent_name=None,
        )

    for review in reviews:
        run_each: str = review["run_each"]
        quality_criteria: dict[str, str] = review["quality_criteria"]
        guidance: str | None = review.get("additional_review_guidance")

        # Build instruction text from quality criteria (adapter: translating
        # Jobs quality_criteria dict into the free-text instructions field
        # that ReviewTask expects).
        instructions = _build_review_instructions_text(quality_criteria, guidance, notes)

        if run_each == "step":
            # Review all outputs together — collect every file path.
            tasks.append(
                _make_task(
                    f"{step_id}-quality-review",
                    _flatten_output_paths(outputs),
                    instructions,
                )
            )
        elif run_each in outputs:
            output_type = output_specs.get(run_each, "file")
            output_value = outputs[run_each]
            rule_name = f"{step_id}-quality-review-{run_each}"

            if output_type == "files" and isinstance(output_value, list):
                # Per-file review: one ReviewTask per file.
                for file_path in output_value:
                    tasks.append(_make_task(rule_name, [file_path], instructions))
            else:
                # Single-file review. Coerce non-string values defensively so
                # files_to_review always holds plain path strings.
                path = output_value if isinstance(output_value, str) else str(output_value)
                tasks.append(_make_task(rule_name, [path], instructions))
        # NOTE(review): a run_each naming an output absent from ``outputs``
        # is silently skipped — presumably validated upstream; confirm.

    return tasks


def _build_review_instructions_text(
quality_criteria: dict[str, str],
guidance: str | None = None,
notes: str | None = None,
) -> str:
"""Build instruction text from Jobs quality criteria for a ReviewTask.

Adapter logic: translates the Jobs structured quality_criteria dict
into the free-text instructions string that ReviewTask.instructions expects.

Args:
quality_criteria: Map of criterion name to question.
guidance: Optional additional review guidance.
notes: Optional author notes.

Returns:
Markdown instruction text.
"""
parts: list[str] = []

parts.append(
"You are an editor responsible for reviewing the files listed below. "
"Evaluate whether the outputs meet the specified quality criteria."
)
parts.append("")

parts.append("## Criteria to Evaluate")
parts.append("")
for name, question in quality_criteria.items():
parts.append(f"- **{name}**: {question}")
parts.append("")

if guidance:
parts.append("## Additional Context")
parts.append("")
parts.append(guidance)
parts.append("")

if notes:
parts.append("## Author Notes")
parts.append("")
parts.append(notes)
parts.append("")

parts.append("## Guidelines")
parts.append("")
parts.append("- Be strict but fair")
parts.append(
"- Apply criteria pragmatically. If a criterion is not applicable "
"to this step's purpose, pass it."
)
parts.append("- Only mark a criterion as passed if it is clearly met or not applicable.")
parts.append("- Provide specific, actionable feedback for failed criteria.")

return "\n".join(parts)


def _flatten_output_paths(outputs: dict[str, str | list[str]]) -> list[str]:
"""Flatten outputs dict into a list of file paths (adapter helper)."""
paths: list[str] = []
for value in outputs.values():
if isinstance(value, list):
paths.extend(value)
else:
paths.append(value)
return paths


class QualityGate:
"""Evaluates step outputs against quality criteria.

Expand Down
66 changes: 31 additions & 35 deletions src/deepwork/jobs/mcp/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
from pathlib import Path
from typing import TYPE_CHECKING

import aiofiles

from deepwork.jobs.discovery import JobLoadError, find_job_dir, load_all_jobs
from deepwork.jobs.mcp.schemas import (
AbortWorkflowInput,
Expand Down Expand Up @@ -466,46 +464,44 @@ async def finished_step(self, input_data: FinishedStepInput) -> FinishedStepResp
output_specs = {out.name: out.type for out in current_step.outputs}

if self.external_runner is None:
# Self-review mode: build instructions file and return guidance
# to the agent to verify its own work via a subagent.
review_content = await self.quality_gate.build_review_instructions_file(
# Self-review mode: use the Deepwork Reviews mechanism to
# generate instruction files and formatted output.
# This adapter bridges Jobs review data into ReviewTask objects
# that the Reviews pipeline can consume.
from deepwork.jobs.mcp.quality_gate import adapt_reviews_to_review_tasks
from deepwork.review.formatter import format_for_claude
from deepwork.review.instructions import write_instruction_files

review_tasks = adapt_reviews_to_review_tasks(
reviews=review_dicts,
outputs=input_data.outputs,
output_specs=output_specs,
project_root=self.project_root,
step_id=current_step_id,
notes=input_data.notes,
)

# Write instructions to .deepwork/tmp/
tmp_dir = self.project_root / ".deepwork" / "tmp"
tmp_dir.mkdir(parents=True, exist_ok=True)
review_filename = f"quality_review_{sid}_{current_step_id}.md"
review_file_path = tmp_dir / review_filename
async with aiofiles.open(review_file_path, "w", encoding="utf-8") as f:
await f.write(review_content)

relative_path = f".deepwork/tmp/{review_filename}"
feedback = (
f"Quality review required. Review instructions have been written to: "
f"{relative_path}\n\n"
f"Verify the quality of your work:\n"
f'1. Spawn a subagent with the prompt: "Read the file at '
f"{relative_path} and follow the instructions in it. "
f"Review the referenced output files and evaluate them against "
f'the criteria specified. Report your detailed findings."\n'
f"2. Review the subagent's findings\n"
f"3. Fix any issues identified by the subagent\n"
f"4. Repeat steps 1-3 until all criteria pass\n"
f"5. Once all criteria pass, call finished_step again with "
f"quality_review_override_reason set to describe the "
f"review outcome (e.g. 'Self-review passed: all criteria met')"
)
task_files = write_instruction_files(review_tasks, self.project_root)

return FinishedStepResponse(
status=StepStatus.NEEDS_WORK,
feedback=feedback,
stack=self.state_manager.get_stack(),
)
if task_files:
formatted = format_for_claude(task_files, self.project_root)
feedback = (
f"Quality review required.\n\n"
f"{formatted}\n"
f"After reviewing, fix any issues found, then call finished_step "
f"again with quality_review_override_reason set to describe the "
f"review outcome (e.g. 'Self-review passed: all criteria met')"
)
else:
# All reviews already passed (cached .passed markers)
feedback = None

if feedback:
return FinishedStepResponse(
status=StepStatus.NEEDS_WORK,
feedback=feedback,
stack=self.state_manager.get_stack(),
)
# If no feedback (all cached), fall through to step completion
else:
# External runner mode: use quality gate subprocess evaluation
attempts = await self.state_manager.record_quality_attempt(
Expand Down
35 changes: 21 additions & 14 deletions tests/unit/jobs/mcp/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1552,7 +1552,7 @@ async def test_self_review_returns_needs_work(
async def test_self_review_feedback_contains_instructions(
self, tools_self_review: WorkflowTools, project_root: Path
) -> None:
"""Test that feedback contains subagent and override instructions."""
"""Test that feedback contains task invocation and override instructions."""
await tools_self_review.start_workflow(
StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main")
)
Expand All @@ -1564,16 +1564,17 @@ async def test_self_review_feedback_contains_instructions(

assert response.feedback is not None
assert "Quality review required" in response.feedback
assert "subagent" in response.feedback.lower()
# Uses Reviews mechanism: format_for_claude output with subagent_type
assert "subagent_type" in response.feedback
assert "quality_review_override_reason" in response.feedback
assert ".deepwork/tmp/quality_review_" in response.feedback
assert ".deepwork/tmp/review_instructions/" in response.feedback

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
async def test_self_review_writes_instructions_file(
self, tools_self_review: WorkflowTools, project_root: Path
) -> None:
"""Test that an instructions file is written to .deepwork/tmp/."""
"""Test that an instructions file is written to .deepwork/tmp/review_instructions/."""
await tools_self_review.start_workflow(
StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main")
)
Expand All @@ -1583,7 +1584,8 @@ async def test_self_review_writes_instructions_file(
FinishedStepInput(outputs={"output1.md": "output1.md"})
)

review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
review_files = list(review_dir.glob("*.md"))
assert len(review_files) == 1

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10).
Expand All @@ -1601,7 +1603,8 @@ async def test_self_review_file_contains_criteria(
FinishedStepInput(outputs={"output1.md": "output1.md"})
)

review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
review_files = list(review_dir.glob("*.md"))
content = review_files[0].read_text()

# step1 has review criteria "Output Valid": "Is the output valid?"
Expand All @@ -1621,30 +1624,33 @@ async def test_self_review_file_references_outputs_not_inline(
FinishedStepInput(outputs={"output1.md": "output1.md"})
)

review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
review_files = list(review_dir.glob("*.md"))
content = review_files[0].read_text()

assert "output1.md" in content
assert "UNIQUE_CONTENT_MARKER_12345" not in content

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.10).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
async def test_self_review_file_named_with_session_and_step(
async def test_self_review_file_named_with_step_id(
self, tools_self_review: WorkflowTools, project_root: Path
) -> None:
"""Test that review file name includes session and step IDs."""
resp = await tools_self_review.start_workflow(
"""Test that review file name includes the step ID via Reviews naming."""
await tools_self_review.start_workflow(
StartWorkflowInput(goal="Test", job_name="test_job", workflow_name="main")
)
session_id = resp.begin_step.session_id
(project_root / "output1.md").write_text("output")

await tools_self_review.finished_step(
FinishedStepInput(outputs={"output1.md": "output1.md"})
)

expected_file = project_root / ".deepwork" / "tmp" / f"quality_review_{session_id}_step1.md"
assert expected_file.exists()
# Uses Reviews mechanism: file named with deterministic review ID
# containing step_id (e.g. "step1-quality-review--output1.md--<hash>.md")
review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
review_files = list(review_dir.glob("step1-quality-review--*.md"))
assert len(review_files) == 1

# THIS TEST VALIDATES A HARD REQUIREMENT (JOBS-REQ-001.4.9, JOBS-REQ-001.4.10).
# YOU MUST NOT MODIFY THIS TEST UNLESS THE REQUIREMENT CHANGES
Expand Down Expand Up @@ -1716,7 +1722,8 @@ async def test_self_review_includes_notes_in_file(
)
)

review_files = list((project_root / ".deepwork" / "tmp").glob("quality_review_*.md"))
review_dir = project_root / ".deepwork" / "tmp" / "review_instructions"
review_files = list(review_dir.glob("*.md"))
content = review_files[0].read_text()
assert "I used the XYZ library for this step." in content

Expand Down
Loading