diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 3343635..92cd66a 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -10,6 +10,10 @@ jobs:
     runs-on: ubuntu-latest
     # Only run on PRs from the same repository, not forks.
     if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+    permissions:
+      contents: read
+      pull-requests: write
+      checks: write
 
     steps:
       - name: Checkout repository
@@ -42,11 +46,12 @@
           curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash
           echo "$HOME/.platformsh/bin" >> $GITHUB_PATH
 
-      - name: Install using-upsun skill
+      - name: Install upsun plugin
        run: |
-          mkdir -p "$HOME/.claude/skills/"
-          cp -r plugins/upsun/skills/. "$HOME/.claude/skills/"
-
+          claude plugin marketplace add ./
+          claude plugin install upsun@upsun
+      - name: Verify upsun plugin is installed
+        run: claude plugin list
       - name: Install dependencies
         working-directory: ./evals
         run: |
@@ -75,6 +80,14 @@
             --html=report.html \
             --self-contained-html
 
+      - name: Publish eval results to PR
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always() && github.event_name == 'pull_request'
+        with:
+          files: evals/results.xml
+          comment_title: "Eval Results"
+          check_name: "Eval Results"
+
       - name: Upload test results
         uses: actions/upload-artifact@v4
         if: always()
diff --git a/evals/test_login.py b/evals/test_login.py
index 0fda57e..f468afc 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -1,11 +1,15 @@
+import json
+import os
 import subprocess
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.metrics import GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams
+from deepeval.metrics import GEval, ToolCorrectnessMetric
 from deepeval import assert_test
 
 def run_claude_code(prompt):
-    """Execute Claude Code CLI and capture output"""
+    """Execute Claude Code CLI and capture output with tool call traces"""
     allowed_tools = [
+        'Skill',
+        'ToolSearch',
         'mcp__upsun__*',
         'Bash(upsun auth:*)',
         'Bash(upsun environment:*)',
@@ -18,19 +22,52 @@
         'Bash(upsun user:*)',
         'Bash(upsun organization:*)',
     ]
+    env = {k: v for k, v in os.environ.items() if k != 'CLAUDECODE'}
     result = subprocess.run(
-        ['claude', '-p', prompt, '--allowedTools', ','.join(allowed_tools), '--dangerously-skip-permissions'],
+        ['claude', '-p', prompt,
+         '--allowedTools', ','.join(allowed_tools),
+         '--output-format', 'stream-json',
+         '--verbose',
+         '--dangerously-skip-permissions'],
         capture_output=True,
         text=True,
-        timeout=3000
+        timeout=3000,
+        env=env
     )
-    return result.stdout
+
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"Claude CLI failed with exit code {result.returncode}.\n"
+            f"STDOUT:\n{result.stdout}\n"
+            f"STDERR:\n{result.stderr}"
+        )
+
+    tool_calls = []
+    final_output = ""
+
+    for line in result.stdout.splitlines():
+        if not line.strip():
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+
+        if event.get("type") == "assistant":
+            for block in event.get("message", {}).get("content", []):
+                if block.get("type") == "tool_use":
+                    tool_calls.append(ToolCall(
+                        name=block["name"],
+                        input_parameters=block.get("input", {})
+                    ))
+        elif event.get("type") == "result":
+            final_output = event.get("result", "")
+
+    return final_output, tool_calls
 
 def test_upsun_login():
-    # Run Claude Code
-    output = run_claude_code("Am i logged in to Upsun ?")
-
-    # Create correctness metric
+    output, tool_calls = run_claude_code("Am i logged in to Upsun ?")
+
     correctness_metric = GEval(
         name="Correctness",
         evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
@@ -41,11 +78,21 @@
             "Compare the actual output with the expected output to ensure they convey the same information"
         ]
     )
-
-    # Evaluate with DeepEval
+
+    skill_name = "upsun:check-upsun-auth" if os.environ.get("CI") else "check-upsun-auth"
+
+    tool_correctness_metric = ToolCorrectnessMetric(
+        threshold=0.5,
+        evaluation_params=[ToolCallParams.INPUT_PARAMETERS],
+        include_reason=True
+    )
+
     test_case = LLMTestCase(
         input="Am i logged in to Upsun ?",
         expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
-        actual_output=output
+        actual_output=output,
+        tools_called=tool_calls,
+        expected_tools=[ToolCall(name="Skill", input_parameters={"skill": skill_name})]
     )
-    assert_test(test_case, [correctness_metric])
\ No newline at end of file
+
+    assert_test(test_case, [correctness_metric, tool_correctness_metric])
diff --git a/plugins/upsun/skills/check-upsun-auth/SKILL.md b/plugins/upsun/skills/check-upsun-auth/SKILL.md
new file mode 100644
index 0000000..d09def5
--- /dev/null
+++ b/plugins/upsun/skills/check-upsun-auth/SKILL.md
@@ -0,0 +1,26 @@
+---
+name: check-upsun-auth
+description: Checks Upsun authentication and login status. Use when the user asks "am I logged in to Upsun", "check Upsun authentication", "Upsun login status", "am I authenticated", "check my Upsun credentials", or wants to log in, log out, or switch Upsun accounts.
+---
+
+# Check Upsun Auth
+
+## Check current authentication status
+
+```bash
+upsun auth:info
+```
+
+Returns the currently authenticated user's email and account details. If not authenticated, it will error.
+
+## Authenticate
+
+```bash
+upsun login
+```
+
+## Log out
+
+```bash
+upsun logout
+```