Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions .github/workflows/run-evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ jobs:
runs-on: ubuntu-latest
# Only run on PRs from the same repository, not forks.
if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
permissions:
contents: read
pull-requests: write
checks: write

steps:
- name: Checkout repository
Expand Down Expand Up @@ -42,11 +46,12 @@ jobs:
curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash
echo "$HOME/.platformsh/bin" >> $GITHUB_PATH

- name: Install using-upsun skill
- name: Install upsun plugin
run: |
mkdir -p "$HOME/.claude/skills/"
cp -r plugins/upsun/skills/. "$HOME/.claude/skills/"

claude plugin marketplace add ./
claude plugin install upsun@upsun
- name: Verify upsun plugin is installed
run: claude plugin list
- name: Install dependencies
working-directory: ./evals
run: |
Expand Down Expand Up @@ -75,6 +80,14 @@ jobs:
--html=report.html \
--self-contained-html

- name: Publish eval results to PR
uses: EnricoMi/publish-unit-test-result-action@v2
if: always() && github.event_name == 'pull_request'
with:
files: evals/results.xml
comment_title: "Eval Results"
check_name: "Eval Results"

- name: Upload test results
uses: actions/upload-artifact@v4
if: always()
Expand Down
75 changes: 61 additions & 14 deletions evals/test_login.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import json
import os
import subprocess
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams
from deepeval.metrics import GEval, ToolCorrectnessMetric
from deepeval import assert_test

def run_claude_code(prompt):
"""Execute Claude Code CLI and capture output"""
"""Execute Claude Code CLI and capture output with tool call traces"""
allowed_tools = [
'Skill',
'ToolSearch',
'mcp__upsun__*',
'Bash(upsun auth:*)',
'Bash(upsun environment:*)',
Expand All @@ -18,19 +22,52 @@ def run_claude_code(prompt):
'Bash(upsun user:*)',
'Bash(upsun organization:*)',
]
env = {k: v for k, v in os.environ.items() if k != 'CLAUDECODE'}
result = subprocess.run(
['claude', '-p', prompt, '--allowedTools', ','.join(allowed_tools), '--dangerously-skip-permissions'],
['claude', '-p', prompt,
'--allowedTools', ','.join(allowed_tools),
'--output-format', 'stream-json',
'--verbose',
'--dangerously-skip-permissions'],
capture_output=True,
text=True,
timeout=3000
timeout=3000,
env=env
)
return result.stdout

if result.returncode != 0:
raise RuntimeError(
f"Claude CLI failed with exit code {result.returncode}.\n"
f"STDOUT:\n{result.stdout}\n"
f"STDERR:\n{result.stderr}"
)

tool_calls = []
final_output = ""

for line in result.stdout.splitlines():
if not line.strip():
continue
try:
event = json.loads(line)
except json.JSONDecodeError:
continue

if event.get("type") == "assistant":
for block in event.get("message", {}).get("content", []):
if block.get("type") == "tool_use":
tool_calls.append(ToolCall(
name=block["name"],
input_parameters=block.get("input", {})
))
elif event.get("type") == "result":
final_output = event.get("result", "")

return final_output, tool_calls

def test_upsun_login():
# Run Claude Code
output = run_claude_code("Am i logged in to Upsun ?")

# Create correctness metric
output, tool_calls = run_claude_code("Am i logged in to Upsun ?")

correctness_metric = GEval(
name="Correctness",
evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
Expand All @@ -41,11 +78,21 @@ def test_upsun_login():
"Compare the actual output with the expected output to ensure they convey the same information"
]
)

# Evaluate with DeepEval

skill_name = "upsun:check-upsun-auth" if os.environ.get("CI") else "check-upsun-auth"

tool_correctness_metric = ToolCorrectnessMetric(
threshold=0.5,
evaluation_params=[ToolCallParams.INPUT_PARAMETERS],
include_reason=True
)

test_case = LLMTestCase(
input="Am i logged in to Upsun ?",
expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
actual_output=output
actual_output=output,
tools_called=tool_calls,
expected_tools=[ToolCall(name="Skill", input_parameters={"skill": skill_name})]
)
assert_test(test_case, [correctness_metric])

assert_test(test_case, [correctness_metric, tool_correctness_metric])
26 changes: 26 additions & 0 deletions plugins/upsun/skills/check-upsun-auth/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
name: check-upsun-auth
description: Checks Upsun authentication and login status. Use when the user asks "am I logged in to Upsun", "check Upsun authentication", "Upsun login status", "am I authenticated", "check my Upsun credentials", or wants to log in, log out, or switch Upsun accounts.
---

# Check Upsun Auth

## Check current authentication status

```bash
upsun auth:info
```

Prints the currently authenticated user's email and account details. If you are not authenticated, the command exits with an error.

## Authenticate

```bash
upsun login
```

## Log out

```bash
upsun logout
```
Loading