diff --git a/.github/workflows/run-evals.yml b/.github/workflows/run-evals.yml
index 3343635..92cd66a 100644
--- a/.github/workflows/run-evals.yml
+++ b/.github/workflows/run-evals.yml
@@ -10,6 +10,10 @@ jobs:
     runs-on: ubuntu-latest
     # Only run on PRs from the same repository, not forks.
     if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+    permissions:
+      contents: read
+      pull-requests: write
+      checks: write
 
     steps:
       - name: Checkout repository
@@ -42,11 +46,12 @@
           curl -fsSL https://raw.githubusercontent.com/platformsh/cli/main/installer.sh | VENDOR=upsun bash
           echo "$HOME/.platformsh/bin" >> $GITHUB_PATH
 
-      - name: Install using-upsun skill
+      - name: Install upsun plugin
        run: |
-          mkdir -p "$HOME/.claude/skills/"
-          cp -r plugins/upsun/skills/. "$HOME/.claude/skills/"
-
+          claude plugin marketplace add ./
+          claude plugin install upsun@upsun
+      - name: Verify upsun plugin is installed
+        run: claude plugin list
       - name: Install dependencies
         working-directory: ./evals
         run: |
@@ -75,6 +80,14 @@
             --html=report.html \
             --self-contained-html
 
+      - name: Publish eval results to PR
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: always() && github.event_name == 'pull_request'
+        with:
+          files: evals/results.xml
+          comment_title: "Eval Results"
+          check_name: "Eval Results"
+
       - name: Upload test results
         uses: actions/upload-artifact@v4
         if: always()
diff --git a/evals/test_login.py b/evals/test_login.py
index 0fda57e..f468afc 100644
--- a/evals/test_login.py
+++ b/evals/test_login.py
@@ -1,11 +1,15 @@
+import json
+import os
 import subprocess
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.metrics import GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall, ToolCallParams
+from deepeval.metrics import GEval, ToolCorrectnessMetric
 from deepeval import assert_test
 
 def run_claude_code(prompt):
-    """Execute Claude Code CLI and capture output"""
+    """Execute Claude Code CLI and capture output with tool call traces"""
     allowed_tools = [
+        'Skill',
+        'ToolSearch',
         'mcp__upsun__*',
         'Bash(upsun auth:*)',
         'Bash(upsun environment:*)',
@@ -18,19 +22,52 @@
         'Bash(upsun user:*)',
         'Bash(upsun organization:*)',
     ]
+    env = {k: v for k, v in os.environ.items() if k != 'CLAUDECODE'}
     result = subprocess.run(
-        ['claude', '-p', prompt, '--allowedTools', ','.join(allowed_tools), '--dangerously-skip-permissions'],
+        ['claude', '-p', prompt,
+         '--allowedTools', ','.join(allowed_tools),
+         '--output-format', 'stream-json',
+         '--verbose',
+         '--dangerously-skip-permissions'],
         capture_output=True,
         text=True,
-        timeout=3000
+        timeout=3000,
+        env=env
     )
-    return result.stdout
+
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"Claude CLI failed with exit code {result.returncode}.\n"
+            f"STDOUT:\n{result.stdout}\n"
+            f"STDERR:\n{result.stderr}"
+        )
+
+    tool_calls = []
+    final_output = ""
+
+    for line in result.stdout.splitlines():
+        if not line.strip():
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+
+        if event.get("type") == "assistant":
+            for block in event.get("message", {}).get("content", []):
+                if block.get("type") == "tool_use":
+                    tool_calls.append(ToolCall(
+                        name=block["name"],
+                        input_parameters=block.get("input", {})
+                    ))
+        elif event.get("type") == "result":
+            final_output = event.get("result", "")
+
+    return final_output, tool_calls
 
 def test_upsun_login():
-    # Run Claude Code
-    output = run_claude_code("Am i logged in to Upsun ?")
-
-    # Create correctness metric
+    output, tool_calls = run_claude_code("Am i logged in to Upsun ?")
+
     correctness_metric = GEval(
         name="Correctness",
         evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
@@ -41,11 +78,21 @@
             "Compare the actual output with the expected output to ensure they convey the same information"
         ]
     )
-
-    # Evaluate with DeepEval
+
+    skill_name = "upsun:check-upsun-auth" if os.environ.get("CI") else "check-upsun-auth"
+
+    tool_correctness_metric = ToolCorrectnessMetric(
+        threshold=0.5,
+        evaluation_params=[ToolCallParams.INPUT_PARAMETERS],
+        include_reason=True
+    )
+
     test_case = LLMTestCase(
         input="Am i logged in to Upsun ?",
         expected_output="No, you're not currently logged in to Upsun. Your session has expired. To log in, you'll need to run: upsun login",
-        actual_output=output
+        actual_output=output,
+        tools_called=tool_calls,
+        expected_tools=[ToolCall(name="Skill", input_parameters={"skill": skill_name})]
     )
-    assert_test(test_case, [correctness_metric])
\ No newline at end of file
+
+    assert_test(test_case, [correctness_metric, tool_correctness_metric])
diff --git a/plugins/upsun/skills/check-upsun-auth/SKILL.md b/plugins/upsun/skills/check-upsun-auth/SKILL.md
new file mode 100644
index 0000000..d09def5
--- /dev/null
+++ b/plugins/upsun/skills/check-upsun-auth/SKILL.md
@@ -0,0 +1,26 @@
+---
+name: check-upsun-auth
+description: Checks Upsun authentication and login status. Use when the user asks "am I logged in to Upsun", "check Upsun authentication", "Upsun login status", "am I authenticated", "check my Upsun credentials", or wants to log in, log out, or switch Upsun accounts.
+---
+
+# Check Upsun Auth
+
+## Check current authentication status
+
+```bash
+upsun auth:info
+```
+
+Returns the currently authenticated user's email and account details. If not authenticated, it will error.
+
+## Authenticate
+
+```bash
+upsun login
+```
+
+## Log out
+
+```bash
+upsun logout
+```