From 0ab829c37ccbb3432e9dea8d6f8e4f2326d0bc4b Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 11:55:43 +0100
Subject: [PATCH 01/14] add tool correctness metric

---
 evals/test_login.py | 56 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/evals/test_login.py b/evals/test_login.py
index 0fda57e..0d5751d 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -1,11 +1,13 @@
+import json
 import subprocess
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.metrics import GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
+from deepeval.metrics import GEval, ToolCorrectnessMetric
 from deepeval import assert_test
 
 def run_claude_code(prompt):
-  """Execute Claude Code CLI and capture output"""
+  """Execute Claude Code CLI and capture output with tool call traces"""
   allowed_tools = [
+    'Skill',
     'mcp__upsun__*',
     'Bash(upsun auth:*)',
     'Bash(upsun environment:*)',
@@ -19,18 +21,42 @@ def run_claude_code(prompt):
     'Bash(upsun organization:*)',
   ]
   result = subprocess.run(
-    ['claude', '-p', prompt, '--allowedTools', ','.join(allowed_tools), '--dangerously-skip-permissions'],
+    ['claude', '-p', prompt,
+     '--allowedTools', ','.join(allowed_tools),
+     '--output-format', 'stream-json',
+     '--verbose',
+     '--dangerously-skip-permissions'],
     capture_output=True,
     text=True,
     timeout=3000
   )
-  return result.stdout
+
+  tool_calls = []
+  final_output = ""
+
+  for line in result.stdout.splitlines():
+    if not line.strip():
+      continue
+    try:
+      event = json.loads(line)
+    except json.JSONDecodeError:
+      continue
+
+    if event.get("type") == "assistant":
+      for block in event.get("message", {}).get("content", []):
+        if block.get("type") == "tool_use":
+          tool_calls.append(ToolCall(
+            name=block["name"],
+            input_parameters=block.get("input", {})
+          ))
+    elif event.get("type") == "result":
+      final_output = event.get("result", "")
+
+  return final_output, tool_calls
 
 def test_upsun_login():
-  # Run Claude Code
-  output = run_claude_code("Am i logged in to Upsun ?")
-  
-  # Create correctness metric
+  output, tool_calls = run_claude_code("Am i logged in to Upsun ?")
+
   correctness_metric = GEval(
     name="Correctness",
     evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
@@ -41,11 +67,15 @@ def test_upsun_login():
       "Compare the actual output with the expected output to ensure they convey the same information"
     ]
   )
-  
-  # Evaluate with DeepEval
+
+  tool_correctness_metric = ToolCorrectnessMetric()
+
   test_case = LLMTestCase(
     input="Am i logged in to Upsun ?",
     expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
-    actual_output=output
+    actual_output=output,
+    tools_called=tool_calls,
+    expected_tools=[ToolCall(name="Skill")]
   )
-  assert_test(test_case, [correctness_metric])
\ No newline at end of file
+
+  assert_test(test_case, [correctness_metric, tool_correctness_metric])

From ce13adcffd885e82d1912195f8a8989da7cdee6d Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 12:16:43 +0100
Subject: [PATCH 02/14] install upsun plugin in CC

---
 .github/workflows/run-evals.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 3343635..27acfd0 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -42,10 +42,10 @@ jobs:
         curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash
         echo "$HOME/.platformsh/bin" >> $GITHUB_PATH
     
-    - name: Install using-upsun skill
+    - name: Install upsun plugin
       run: |
-        mkdir -p "$HOME/.claude/skills/"
-        cp -r plugins/upsun/skills/. "$HOME/.claude/skills/"
+        claude plugin marketplace add upsun/ai
+        claude plugin install upsun@upsun
     
     - name: Install dependencies
       working-directory: ./evals

From 6bd2155559bd578af7ada8f97fc23a91229f3e7f Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 12:34:18 +0100
Subject: [PATCH 03/14] add eval for PR plugins

---
 evals/test_login.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/evals/test_login.py b/evals/test_login.py
index 0d5751d..2abc650 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -1,6 +1,6 @@
 import json
 import subprocess
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams
 from deepeval.metrics import GEval, ToolCorrectnessMetric
 from deepeval import assert_test
 
@@ -68,14 +68,16 @@ def test_upsun_login():
     ]
   )
 
-  tool_correctness_metric = ToolCorrectnessMetric()
+  tool_correctness_metric = ToolCorrectnessMetric(
+    evaluation_params=[ToolCallParams.INPUT_PARAMETERS]
+  )
 
   test_case = LLMTestCase(
     input="Am i logged in to Upsun ?",
     expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
     actual_output=output,
     tools_called=tool_calls,
-    expected_tools=[ToolCall(name="Skill")]
+    expected_tools=[ToolCall(name="Skill", input_parameters={"name": "check-upsun-auth"})]
   )
 
   assert_test(test_case, [correctness_metric, tool_correctness_metric])

From 404100dcd0522766c232cdd2e9b4b5cf5d7619a1 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 12:36:31 +0100
Subject: [PATCH 04/14] install PR plugins to Claude

---
 .github/workflows/run-evals.yml               |  2 +-
 .../upsun/skills/check-upsun-auth/SKILL.md    | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 plugins/upsun/skills/check-upsun-auth/SKILL.md

diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 27acfd0..9f707af 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -44,7 +44,7 @@ jobs:
     
     - name: Install upsun plugin
       run: |
-        claude plugin marketplace add upsun/ai
+        claude plugin marketplace add .
         claude plugin install upsun@upsun
     
     - name: Install dependencies
diff --git a/plugins/upsun/skills/check-upsun-auth/SKILL.md b/plugins/upsun/skills/check-upsun-auth/SKILL.md
new file mode 100644
index 0000000..ecac5e1
--- /dev/null
+++ b/plugins/upsun/skills/check-upsun-auth/SKILL.md
@@ -0,0 +1,32 @@
+---
+name: check-upsun-auth
+description: Checks Upsun authentication and login status. Use when the user asks "am I logged in to Upsun", "check Upsun authentication", "Upsun login status", "am I authenticated", "check my Upsun credentials", or wants to log in, log out, or switch Upsun accounts.
+---
+
+# Check Upsun Auth
+
+## Check current authentication status
+
+```bash
+upsun auth:info
+```
+
+Returns the currently authenticated user's email and account details. If not authenticated, it will error.
+
+## Authenticate
+
+**Via browser (recommended):**
+```bash
+upsun auth:browser-login
+```
+
+**Via API token:**
+```bash
+upsun auth:api-token-login
+```
+
+## Log out
+
+```bash
+upsun auth:logout
+```

From 4c1b767b8ae062d796b39f5dc746940c458e068b Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 12:38:19 +0100
Subject: [PATCH 05/14] fix path

---
 .github/workflows/run-evals.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 9f707af..5b897c7 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -44,7 +44,7 @@ jobs:
     
     - name: Install upsun plugin
       run: |
-        claude plugin marketplace add .
+        claude plugin marketplace add ./
         claude plugin install upsun@upsun
     
     - name: Install dependencies

From 3b4d776496a9cf3cf57937d257cef6cc728787e3 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 12:43:57 +0100
Subject: [PATCH 06/14] fix skill to match the correct command

---
 plugins/upsun/skills/check-upsun-auth/SKILL.md | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/plugins/upsun/skills/check-upsun-auth/SKILL.md b/plugins/upsun/skills/check-upsun-auth/SKILL.md
index ecac5e1..d09def5 100644
--- a/plugins/upsun/skills/check-upsun-auth/SKILL.md
+++ b/plugins/upsun/skills/check-upsun-auth/SKILL.md
@@ -15,18 +15,12 @@ Returns the currently authenticated user's email and account details. If not aut
 
 ## Authenticate
 
-**Via browser (recommended):**
 ```bash
-upsun auth:browser-login
-```
-
-**Via API token:**
-```bash
-upsun auth:api-token-login
+upsun login
 ```
 
 ## Log out
 
 ```bash
-upsun auth:logout
+upsun logout
 ```

From 39acf84ed5c634697738c6afd14691b707d6c8b5 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 12:51:38 +0100
Subject: [PATCH 07/14] add fix to check skill call correctly

---
 evals/test_login.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/test_login.py b/evals/test_login.py
index 2abc650..99fa229 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -77,7 +77,7 @@ def test_upsun_login():
     expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
     actual_output=output,
     tools_called=tool_calls,
-    expected_tools=[ToolCall(name="Skill", input_parameters={"name": "check-upsun-auth"})]
+    expected_tools=[ToolCall(name="Skill", input_parameters={"skill": "check-upsun-auth"})]
   )
 
   assert_test(test_case, [correctness_metric, tool_correctness_metric])

From cf657b0291c1a7a055a5381e2f31a69ce75db263 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 14:35:38 +0100
Subject: [PATCH 08/14] add check if the plugin is installed or not

---
 .github/workflows/run-evals.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 5b897c7..08a6c3e 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -46,7 +46,8 @@ jobs:
       run: |
         claude plugin marketplace add ./
         claude plugin install upsun@upsun
-    
+    - name: Verify upsun plugin is installed
+      run: claude plugin list
     - name: Install dependencies
       working-directory: ./evals
       run: |

From 9a1cd005c02868ba858fa06a51a9fe741521ed3f Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 14:43:52 +0100
Subject: [PATCH 09/14] add correct eval param

---
 evals/test_login.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/evals/test_login.py b/evals/test_login.py
index 99fa229..0bb4a25 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -8,6 +8,7 @@ def run_claude_code(prompt):
   """Execute Claude Code CLI and capture output with tool call traces"""
   allowed_tools = [
     'Skill',
+    'ToolSearch',
     'mcp__upsun__*',
     'Bash(upsun auth:*)',
     'Bash(upsun environment:*)',
@@ -69,7 +70,7 @@ def test_upsun_login():
   )
 
   tool_correctness_metric = ToolCorrectnessMetric(
-    evaluation_params=[ToolCallParams.INPUT_PARAMETERS]
+    evaluation_params=[ToolCallParams.TOOL_CALLED]
   )
 
   test_case = LLMTestCase(

From 8bc0064d37282c8f560cb06f1e9c706c0adc5cbe Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 14:57:02 +0100
Subject: [PATCH 10/14] fix tool call check

---
 evals/test_login.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/evals/test_login.py b/evals/test_login.py
index 0bb4a25..8d67e03 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -70,7 +70,9 @@ def test_upsun_login():
   )
 
   tool_correctness_metric = ToolCorrectnessMetric(
-    evaluation_params=[ToolCallParams.TOOL_CALLED]
+    threshold=0.5,
+    evaluation_params=[ToolCallParams.INPUT_PARAMETERS],
+    include_reason=True
   )
 
   test_case = LLMTestCase(

From 2b41e557b74d60a5ba67c1a0bdf40c2a2c0505fa Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 15:05:34 +0100
Subject: [PATCH 11/14] add correct tool debug

---
 evals/test_login.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/evals/test_login.py b/evals/test_login.py
index 8d67e03..db451b3 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -58,6 +58,10 @@ def run_claude_code(prompt):
 def test_upsun_login():
   output, tool_calls = run_claude_code("Am i logged in to Upsun ?")
 
+  print("\n[DEBUG] Tool calls made:")
+  for tc in tool_calls:
+    print(f"  {tc.name}: {tc.input_parameters}")
+
   correctness_metric = GEval(
     name="Correctness",
     evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],

From c5e60eea4de89765ded2277b2a3c118611669ad9 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 15:16:26 +0100
Subject: [PATCH 12/14] check env to run evals

---
 evals/test_login.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/evals/test_login.py b/evals/test_login.py
index db451b3..f96733a 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -1,4 +1,5 @@
 import json
+import os
 import subprocess
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams
 from deepeval.metrics import GEval, ToolCorrectnessMetric
@@ -21,6 +22,7 @@ def run_claude_code(prompt):
     'Bash(upsun user:*)',
     'Bash(upsun organization:*)',
   ]
+  env = {k: v for k, v in os.environ.items() if k != 'CLAUDECODE'}
   result = subprocess.run(
     ['claude', '-p', prompt,
      '--allowedTools', ','.join(allowed_tools),
@@ -29,7 +31,8 @@ def run_claude_code(prompt):
      '--dangerously-skip-permissions'],
     capture_output=True,
     text=True,
-    timeout=3000
+    timeout=3000,
+    env=env
   )
 
   tool_calls = []
@@ -58,10 +61,6 @@ def run_claude_code(prompt):
 def test_upsun_login():
   output, tool_calls = run_claude_code("Am i logged in to Upsun ?")
 
-  print("\n[DEBUG] Tool calls made:")
-  for tc in tool_calls:
-    print(f"  {tc.name}: {tc.input_parameters}")
-
   correctness_metric = GEval(
     name="Correctness",
     evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
@@ -73,6 +72,8 @@ def test_upsun_login():
     ]
   )
 
+  skill_name = "upsun:check-upsun-auth" if os.environ.get("CI") else "check-upsun-auth"
+
   tool_correctness_metric = ToolCorrectnessMetric(
     threshold=0.5,
     evaluation_params=[ToolCallParams.INPUT_PARAMETERS],
@@ -84,7 +85,7 @@ def test_upsun_login():
     expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
     actual_output=output,
     tools_called=tool_calls,
-    expected_tools=[ToolCall(name="Skill", input_parameters={"skill": "check-upsun-auth"})]
+    expected_tools=[ToolCall(name="Skill", input_parameters={"skill": skill_name})]
   )
 
   assert_test(test_case, [correctness_metric, tool_correctness_metric])

From d672193007e925e7ab31ad215e327fdead814da7 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 15:53:20 +0100
Subject: [PATCH 13/14] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 evals/test_login.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/evals/test_login.py b/evals/test_login.py
index f96733a..f468afc 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -35,6 +35,13 @@ def run_claude_code(prompt):
     env=env
   )
 
+  if result.returncode != 0:
+    raise RuntimeError(
+      f"Claude CLI failed with exit code {result.returncode}.\n"
+      f"STDOUT:\n{result.stdout}\n"
+      f"STDERR:\n{result.stderr}"
+    )
+
   tool_calls = []
   final_output = ""
 

From b136079811ea45083ad130081d0703fa4a44b6d9 Mon Sep 17 00:00:00 2001
From: Ganeshdip Dumbare <ganeshdip.dumbare@gmail.com>
Date: Mon, 16 Mar 2026 15:57:14 +0100
Subject: [PATCH 14/14] put the eval result on PR

---
 .github/workflows/run-evals.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 08a6c3e..92cd66a 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -10,6 +10,10 @@ jobs:
     runs-on: ubuntu-latest
     # Only run on PRs from the same repository, not forks.
     if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+    permissions:
+      contents: read
+      pull-requests: write
+      checks: write
     
     steps:
     - name: Checkout repository
@@ -76,6 +80,14 @@ jobs:
           --html=report.html \
           --self-contained-html
     
+    - name: Publish eval results to PR
+      uses: EnricoMi/publish-unit-test-result-action@v2
+      if: always() && github.event_name == 'pull_request'
+      with:
+        files: evals/results.xml
+        comment_title: "Eval Results"
+        check_name: "Eval Results"
+
     - name: Upload test results
       uses: actions/upload-artifact@v4
       if: always()