From 0ab829c37ccbb3432e9dea8d6f8e4f2326d0bc4b Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 11:55:43 +0100 Subject: [PATCH 01/14] add tool correctness metric --- evals/test_login.py | 56 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/evals/test_login.py b/evals/test_login.py index 0fda57e..0d5751d 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -1,11 +1,13 @@ +import json import subprocess -from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall +from deepeval.metrics import GEval, ToolCorrectnessMetric from deepeval import assert_test def run_claude_code(prompt): - """Execute Claude Code CLI and capture output""" + """Execute Claude Code CLI and capture output with tool call traces""" allowed_tools = [ + 'Skill', 'mcp__upsun__*', 'Bash(upsun auth:*)', 'Bash(upsun environment:*)', @@ -19,18 +21,42 @@ def run_claude_code(prompt): 'Bash(upsun organization:*)', ] result = subprocess.run( - ['claude', '-p', prompt, '--allowedTools', ','.join(allowed_tools), '--dangerously-skip-permissions'], + ['claude', '-p', prompt, + '--allowedTools', ','.join(allowed_tools), + '--output-format', 'stream-json', + '--verbose', + '--dangerously-skip-permissions'], capture_output=True, text=True, timeout=3000 ) - return result.stdout + + tool_calls = [] + final_output = "" + + for line in result.stdout.splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + if event.get("type") == "assistant": + for block in event.get("message", {}).get("content", []): + if block.get("type") == "tool_use": + tool_calls.append(ToolCall( + name=block["name"], + input_parameters=block.get("input", {}) + )) + elif event.get("type") == "result": + final_output = event.get("result", "") + + return final_output, tool_calls 
def test_upsun_login(): - # Run Claude Code - output = run_claude_code("Am i logged in to Upsun ?") - - # Create correctness metric + output, tool_calls = run_claude_code("Am i logged in to Upsun ?") + correctness_metric = GEval( name="Correctness", evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT], @@ -41,11 +67,15 @@ def test_upsun_login(): "Compare the actual output with the expected output to ensure they convey the same information" ] ) - - # Evaluate with DeepEval + + tool_correctness_metric = ToolCorrectnessMetric() + test_case = LLMTestCase( input="Am i logged in to Upsun ?", expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login", - actual_output=output + actual_output=output, + tools_called=tool_calls, + expected_tools=[ToolCall(name="Skill")] ) - assert_test(test_case, [correctness_metric]) \ No newline at end of file + + assert_test(test_case, [correctness_metric, tool_correctness_metric]) From ce13adcffd885e82d1912195f8a8989da7cdee6d Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 12:16:43 +0100 Subject: [PATCH 02/14] install upsun plugin in CC --- .github/workflows/run-evals.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml index 3343635..27acfd0 100644 --- a/.github/workflows/run-evals.yml +++ b/.github/workflows/run-evals.yml @@ -42,10 +42,10 @@ jobs: curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash echo "$HOME/.platformsh/bin" >> $GITHUB_PATH - - name: Install using-upsun skill + - name: Install upsun plugin run: | - mkdir -p "$HOME/.claude/skills/" - cp -r plugins/upsun/skills/. 
"$HOME/.claude/skills/" + claude plugin marketplace add upsun/ai + claude plugin install upsun@upsun - name: Install dependencies working-directory: ./evals From 6bd2155559bd578af7ada8f97fc23a91229f3e7f Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 12:34:18 +0100 Subject: [PATCH 03/14] add eval for PR plugins --- evals/test_login.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/evals/test_login.py b/evals/test_login.py index 0d5751d..2abc650 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -1,6 +1,6 @@ import json import subprocess -from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall +from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams from deepeval.metrics import GEval, ToolCorrectnessMetric from deepeval import assert_test @@ -68,14 +68,16 @@ def test_upsun_login(): ] ) - tool_correctness_metric = ToolCorrectnessMetric() + tool_correctness_metric = ToolCorrectnessMetric( + evaluation_params=[ToolCallParams.INPUT_PARAMETERS] + ) test_case = LLMTestCase( input="Am i logged in to Upsun ?", expected_output="No, you're not currently logged in to Upsun. Your session has expired. 
To log in, you'll need to run: upsun login", actual_output=output, tools_called=tool_calls, - expected_tools=[ToolCall(name="Skill")] + expected_tools=[ToolCall(name="Skill", input_parameters={"name": "check-upsun-auth"})] ) assert_test(test_case, [correctness_metric, tool_correctness_metric]) From 404100dcd0522766c232cdd2e9b4b5cf5d7619a1 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 12:36:31 +0100 Subject: [PATCH 04/14] install PR plugins to Claude --- .github/workflows/run-evals.yml | 2 +- .../upsun/skills/check-upsun-auth/SKILL.md | 32 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 plugins/upsun/skills/check-upsun-auth/SKILL.md diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml index 27acfd0..9f707af 100644 --- a/.github/workflows/run-evals.yml +++ b/.github/workflows/run-evals.yml @@ -44,7 +44,7 @@ jobs: - name: Install upsun plugin run: | - claude plugin marketplace add upsun/ai + claude plugin marketplace add . claude plugin install upsun@upsun - name: Install dependencies diff --git a/plugins/upsun/skills/check-upsun-auth/SKILL.md b/plugins/upsun/skills/check-upsun-auth/SKILL.md new file mode 100644 index 0000000..ecac5e1 --- /dev/null +++ b/plugins/upsun/skills/check-upsun-auth/SKILL.md @@ -0,0 +1,32 @@ +--- +name: check-upsun-auth +description: Checks Upsun authentication and login status. Use when the user asks "am I logged in to Upsun", "check Upsun authentication", "Upsun login status", "am I authenticated", "check my Upsun credentials", or wants to log in, log out, or switch Upsun accounts. +--- + +# Check Upsun Auth + +## Check current authentication status + +```bash +upsun auth:info +``` + +Returns the currently authenticated user's email and account details. If not authenticated, it will error. 
+ +## Authenticate + +**Via browser (recommended):** +```bash +upsun auth:browser-login +``` + +**Via API token:** +```bash +upsun auth:api-token-login +``` + +## Log out + +```bash +upsun auth:logout +``` From 4c1b767b8ae062d796b39f5dc746940c458e068b Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 12:38:19 +0100 Subject: [PATCH 05/14] fix path --- .github/workflows/run-evals.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml index 9f707af..5b897c7 100644 --- a/.github/workflows/run-evals.yml +++ b/.github/workflows/run-evals.yml @@ -44,7 +44,7 @@ jobs: - name: Install upsun plugin run: | - claude plugin marketplace add . + claude plugin marketplace add ./ claude plugin install upsun@upsun - name: Install dependencies From 3b4d776496a9cf3cf57937d257cef6cc728787e3 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 12:43:57 +0100 Subject: [PATCH 06/14] fix skill to match the correct command --- plugins/upsun/skills/check-upsun-auth/SKILL.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/plugins/upsun/skills/check-upsun-auth/SKILL.md b/plugins/upsun/skills/check-upsun-auth/SKILL.md index ecac5e1..d09def5 100644 --- a/plugins/upsun/skills/check-upsun-auth/SKILL.md +++ b/plugins/upsun/skills/check-upsun-auth/SKILL.md @@ -15,18 +15,12 @@ Returns the currently authenticated user's email and account details. 
If not aut ## Authenticate -**Via browser (recommended):** ```bash -upsun auth:browser-login -``` - -**Via API token:** -```bash -upsun auth:api-token-login +upsun login ``` ## Log out ```bash -upsun auth:logout +upsun logout ``` From 39acf84ed5c634697738c6afd14691b707d6c8b5 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 12:51:38 +0100 Subject: [PATCH 07/14] add fix to check skill call correctly --- evals/test_login.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/test_login.py b/evals/test_login.py index 2abc650..99fa229 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -77,7 +77,7 @@ def test_upsun_login(): expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login", actual_output=output, tools_called=tool_calls, - expected_tools=[ToolCall(name="Skill", input_parameters={"name": "check-upsun-auth"})] + expected_tools=[ToolCall(name="Skill", input_parameters={"skill": "check-upsun-auth"})] ) assert_test(test_case, [correctness_metric, tool_correctness_metric]) From cf657b0291c1a7a055a5381e2f31a69ce75db263 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 14:35:38 +0100 Subject: [PATCH 08/14] add check if the plugin is installed or not --- .github/workflows/run-evals.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml index 5b897c7..08a6c3e 100644 --- a/.github/workflows/run-evals.yml +++ b/.github/workflows/run-evals.yml @@ -46,7 +46,8 @@ jobs: run: | claude plugin marketplace add ./ claude plugin install upsun@upsun - + - name: Verify upsun plugin is installed + run: claude plugin list - name: Install dependencies working-directory: ./evals run: | From 9a1cd005c02868ba858fa06a51a9fe741521ed3f Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 14:43:52 +0100 Subject: [PATCH 09/14] add correct eval param 
--- evals/test_login.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evals/test_login.py b/evals/test_login.py index 99fa229..0bb4a25 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -8,6 +8,7 @@ def run_claude_code(prompt): """Execute Claude Code CLI and capture output with tool call traces""" allowed_tools = [ 'Skill', + 'ToolSearch', 'mcp__upsun__*', 'Bash(upsun auth:*)', 'Bash(upsun environment:*)', @@ -69,7 +70,7 @@ def test_upsun_login(): ) tool_correctness_metric = ToolCorrectnessMetric( - evaluation_params=[ToolCallParams.INPUT_PARAMETERS] + evaluation_params=[ToolCallParams.TOOL_CALLED] ) test_case = LLMTestCase( From 8bc0064d37282c8f560cb06f1e9c706c0adc5cbe Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 14:57:02 +0100 Subject: [PATCH 10/14] fix tool call check --- evals/test_login.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evals/test_login.py b/evals/test_login.py index 0bb4a25..8d67e03 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -70,7 +70,9 @@ def test_upsun_login(): ) tool_correctness_metric = ToolCorrectnessMetric( - evaluation_params=[ToolCallParams.TOOL_CALLED] + threshold=0.5, + evaluation_params=[ToolCallParams.INPUT_PARAMETERS], + include_reason=True ) test_case = LLMTestCase( From 2b41e557b74d60a5ba67c1a0bdf40c2a2c0505fa Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 15:05:34 +0100 Subject: [PATCH 11/14] add correct tool debug --- evals/test_login.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evals/test_login.py b/evals/test_login.py index 8d67e03..db451b3 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -58,6 +58,10 @@ def run_claude_code(prompt): def test_upsun_login(): output, tool_calls = run_claude_code("Am i logged in to Upsun ?") + print("\n[DEBUG] Tool calls made:") + for tc in tool_calls: + print(f" {tc.name}: {tc.input_parameters}") + correctness_metric = GEval( name="Correctness", 
evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT], From c5e60eea4de89765ded2277b2a3c118611669ad9 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 15:16:26 +0100 Subject: [PATCH 12/14] check env to run evals --- evals/test_login.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/evals/test_login.py b/evals/test_login.py index db451b3..f96733a 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -1,4 +1,5 @@ import json +import os import subprocess from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams from deepeval.metrics import GEval, ToolCorrectnessMetric @@ -21,6 +22,7 @@ def run_claude_code(prompt): 'Bash(upsun user:*)', 'Bash(upsun organization:*)', ] + env = {k: v for k, v in os.environ.items() if k != 'CLAUDECODE'} result = subprocess.run( ['claude', '-p', prompt, '--allowedTools', ','.join(allowed_tools), @@ -29,7 +31,8 @@ def run_claude_code(prompt): '--dangerously-skip-permissions'], capture_output=True, text=True, - timeout=3000 + timeout=3000, + env=env ) tool_calls = [] @@ -58,10 +61,6 @@ def run_claude_code(prompt): def test_upsun_login(): output, tool_calls = run_claude_code("Am i logged in to Upsun ?") - print("\n[DEBUG] Tool calls made:") - for tc in tool_calls: - print(f" {tc.name}: {tc.input_parameters}") - correctness_metric = GEval( name="Correctness", evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT], @@ -73,6 +72,8 @@ def test_upsun_login(): ] ) + skill_name = "upsun:check-upsun-auth" if os.environ.get("CI") else "check-upsun-auth" + tool_correctness_metric = ToolCorrectnessMetric( threshold=0.5, evaluation_params=[ToolCallParams.INPUT_PARAMETERS], @@ -84,7 +85,7 @@ def test_upsun_login(): expected_output="No, you're not currently logged in to Upsun. Your session has expired. 
To log in, you'll need to run: upsun login", actual_output=output, tools_called=tool_calls, - expected_tools=[ToolCall(name="Skill", input_parameters={"skill": "check-upsun-auth"})] + expected_tools=[ToolCall(name="Skill", input_parameters={"skill": skill_name})] ) assert_test(test_case, [correctness_metric, tool_correctness_metric]) From d672193007e925e7ab31ad215e327fdead814da7 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 15:53:20 +0100 Subject: [PATCH 13/14] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- evals/test_login.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/evals/test_login.py b/evals/test_login.py index f96733a..f468afc 100644 --- a/evals/test_login.py +++ b/evals/test_login.py @@ -35,6 +35,13 @@ def run_claude_code(prompt): env=env ) + if result.returncode != 0: + raise RuntimeError( + f"Claude CLI failed with exit code {result.returncode}.\n" + f"STDOUT:\n{result.stdout}\n" + f"STDERR:\n{result.stderr}" + ) + tool_calls = [] final_output = "" From b136079811ea45083ad130081d0703fa4a44b6d9 Mon Sep 17 00:00:00 2001 From: Ganeshdip Dumbare Date: Mon, 16 Mar 2026 15:57:14 +0100 Subject: [PATCH 14/14] put the eval result on PR --- .github/workflows/run-evals.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml index 08a6c3e..92cd66a 100644 --- a/.github/workflows/run-evals.yml +++ b/.github/workflows/run-evals.yml @@ -10,6 +10,10 @@ jobs: runs-on: ubuntu-latest # Only run on PRs from the same repository, not forks. 
if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository + permissions: + contents: read + pull-requests: write + checks: write steps: - name: Checkout repository @@ -76,6 +80,14 @@ jobs: --html=report.html \ --self-contained-html + - name: Publish eval results to PR + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() && github.event_name == 'pull_request' + with: + files: evals/results.xml + comment_title: "Eval Results" + check_name: "Eval Results" + - name: Upload test results uses: actions/upload-artifact@v4 if: always()