From 605869db4d914d3f2bfadc4166b767d74929bdfd Mon Sep 17 00:00:00 2001
From: "Strix (Claude Opus 4.6)" <noreply@anthropic.com>
Date: Wed, 25 Mar 2026 01:07:33 +0000
Subject: [PATCH] Add dissonance tracking builtin skill

Cross-references journal entries (user_wanted/agent_did) against event
logs to detect intent-vs-outcome gaps: action mismatches (claimed silence
but sent message), invisible failures (claimed success with errors),
scope drift (work volume vs description), and phantom work.

Includes analysis script (dissonance_review.py), skill docs, and 22 tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../dissonance-tracking/SKILL.md              | 118 +++++++
 .../scripts/dissonance_review.py              | 334 ++++++++++++++++++
 tests/test_dissonance_review.py               | 222 ++++++++++++
 3 files changed, 674 insertions(+)
 create mode 100644 open_strix/builtin_skills/dissonance-tracking/SKILL.md
 create mode 100644 open_strix/builtin_skills/scripts/dissonance_review.py
 create mode 100644 tests/test_dissonance_review.py

diff --git a/open_strix/builtin_skills/dissonance-tracking/SKILL.md b/open_strix/builtin_skills/dissonance-tracking/SKILL.md
new file mode 100644
index 0000000..0c4c6b9
--- /dev/null
+++ b/open_strix/builtin_skills/dissonance-tracking/SKILL.md
@@ -0,0 +1,118 @@
+---
+name: dissonance-tracking
+description: Detect and analyze gaps between what you intended to do and what you actually did, using journal entries cross-referenced with event logs. Use during periodic self-review (ticks, maintenance windows) or when you suspect behavioral drift. Do not use for one-off messaging or real-time decision-making.
+allowed-tools: bash powershell read_file
+---
+
+# Dissonance Tracking
+
+Systematic detection of intent-vs-outcome gaps. You already capture intent (`user_wanted`, `agent_did`) in every journal entry. This skill teaches you to cross-reference those claims against ground truth (events.jsonl, Discord history) to find where your self-report diverges from reality.
+
+## Why This Matters
+
+Self-reports are narratives, not measurements. Common failure modes:
+
+- **Action mismatch:** Journal says "Silence" but events.jsonl shows a `send_message` in the same session
+- **Scope drift:** `user_wanted` asks for one thing, `agent_did` describes three things
+- **Invisible failures:** `agent_did` claims success but events show `tool_call_error` in that session
+- **Phantom work:** `agent_did` describes actions with no corresponding events
+- **Understated action:** Events show significant work not mentioned in `agent_did`
+
+These aren't moral failures. They're calibration data. A pilot who doesn't cross-check instruments against reality eventually drifts off course.
+
+## Running a Dissonance Review
+
+### Quick check (recent sessions)
+
+Use the analysis script to review the most recent sessions:
+
+```bash
+uv run python .open_strix_builtin_skills/scripts/dissonance_review.py --last 5
+```
+
+This compares the last 5 journal entries against their corresponding event logs and reports any gaps.
+
+### Full review (time window)
+
+```bash
+uv run python .open_strix_builtin_skills/scripts/dissonance_review.py --hours 72
+```
+
+Reviews all journal entries from the last 72 hours.
+
+### Output
+
+The script appends structured records to `state/dissonance_reviews.jsonl` (nothing is written with `--dry-run`):
+
+```json
+{
+  "timestamp": "2026-03-25T12:00:00+00:00",
+  "journal_timestamp": "2026-03-25T11:55:00+00:00",
+  "session_id": "abc123",
+  "dissonance_type": "action_mismatch",
+  "journal_claim": "Silence — no response needed",
+  "event_evidence": "1 send_message call(s) in session",
+  "severity": "high",
+  "notes": ""
+}
+```
+
+### Severity levels
+
+- **high:** Direct contradiction between journal claim and events (said silence, sent message; said success, got error)
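+- **medium:** Scope mismatch where the description outweighs the evidence (did less than described; the script rates `phantom_work` medium)
+- **low:** Understated action, minor omissions, or imprecise language (described 3 of 4 actions taken; the script rates `understated_action` low)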
+
+## Dissonance Types
+
+### action_mismatch
+Journal claims one action, events show a different one. The sharpest signal.
+
+**Detection:** Compare `agent_did` keywords against session events. If journal says "silence" or a phrase like "no response needed"/"no message sent" but session has `send_message` events, that's a mismatch. If journal claims a send ("sent", "posted", "replied", ...) but the session has no `send_message` or `react` event, also a mismatch.
+
+### scope_drift
+Agent did significantly more or less than what was requested or described.
+
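+**Detection:** Count tool calls in session vs complexity described in `agent_did`. Large discrepancy (many tools, brief description OR few tools, elaborate description) suggests drift. The script reports these two extremes as `understated_action` and `phantom_work` respectively; it does not emit a separate `scope_drift` record.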
+
+### invisible_failure
+Journal claims success but events show errors in the same session.
+
+**Detection:** Check for error events (any event whose `type` contains "error", e.g. `tool_call_error`) in sessions where `agent_did` claims success and doesn't mention any failure.
+
+### phantom_work
+Journal describes actions with no corresponding events.
+
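+**Detection:** Journal references specific tools or file operations but events.jsonl has no matching tool calls in that session. The script approximates this by volume: at most one tool call against an `agent_did` longer than 500 characters.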
+
+### understated_action
+Significant event activity not reflected in journal.
+
+**Detection:** Session has many tool calls, file operations, or messages but journal is minimal (the script's threshold: 10 or more tool calls with an `agent_did` under 50 characters). This is the least concerning type — better to understate than overstate — but persistent understatement means your self-model is incomplete.
+
+## Integration with Other Skills
+
+**Prediction Review:** Dissonance tracking asks "did I do what I said I did?" Prediction review asks "did reality match what I predicted?" They're complementary — predictions test your world model, dissonance tests your self-model.
+
+**Introspection:** When dissonance review finds a pattern (e.g., repeatedly understating action in certain channels), use introspection's event queries to dig deeper into the specific sessions.
+
+**Memory:** If a dissonance pattern is persistent (same type appearing across multiple reviews), update a memory block with the behavioral correction. The pattern itself is the learning.
+
+## Review Cadence
+
+Run dissonance review:
+- During maintenance ticks (every 12-24 hours)
+- After sessions where you feel uncertain about what you did
+- When someone corrects your self-report (that's a confirmed dissonance — log it)
+
+Do NOT run dissonance review:
+- Every single session (too noisy, diminishing returns)
+- As a real-time decision tool (it's retrospective by design)
+
+## Interpreting Results
+
+**Zero dissonance is suspicious.** Either the review window is too short, the detection thresholds are too lenient to flag anything, or you're not doing enough to have gaps. Some dissonance is healthy — it means you're operating in uncertain territory.
+
+**Persistent patterns matter more than individual events.** One action mismatch is a data point. Five action mismatches in the same channel or context is a behavioral pattern that needs correction.
+
+**High severity + low frequency = probably fine.** Everyone has off moments. High severity + high frequency = something structural needs to change.
diff --git a/open_strix/builtin_skills/scripts/dissonance_review.py b/open_strix/builtin_skills/scripts/dissonance_review.py
new file mode 100644
index 0000000..db1b000
--- /dev/null
+++ b/open_strix/builtin_skills/scripts/dissonance_review.py
@@ -0,0 +1,334 @@
+"""Cross-reference journal entries against event logs to detect intent-vs-outcome gaps.
+
+Reads logs/journal.jsonl and logs/events.jsonl, compares claims in each journal
+entry against the actual events recorded for that session, and outputs structured
+dissonance records.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any
+
+UTC = timezone.utc
+
+# "Stayed quiet" claims in agent_did: "silence", or "no message/response/text/reply" followed by "sent/needed/warranted"
+SILENCE_PATTERNS = re.compile(
+    r"\b(silence|no\s+(?:message|response|text|reply)\s+(?:sent|needed|warranted))\b",
+    re.IGNORECASE,
+)
+
+# Past-tense verbs in agent_did that claim a message went out (whole words, any case)
+SEND_PATTERNS = re.compile(
+    r"\b(sent|posted|replied|responded|messaged|relayed|shared)\b",
+    re.IGNORECASE,
+)
+
+# Whole words in agent_did that claim the work succeeded (any case)
+SUCCESS_PATTERNS = re.compile(
+    r"\b(completed|done|succeeded|delivered|shipped|fixed|resolved)\b",
+    re.IGNORECASE,
+)
+
+
+def load_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Load JSON objects from a JSONL file; [] if the file is missing."""
+    if not path.exists():
+        return []
+    entries: list[dict[str, Any]] = []
+    for line in path.read_text(encoding="utf-8").splitlines():
+        try:
+            # Blank lines parse as "no record" instead of raising.
+            record = json.loads(line) if line.strip() else None
+        except json.JSONDecodeError:
+            continue  # malformed line: skip it, keep the rest
+        if isinstance(record, dict):  # callers use .get(); drop scalars/arrays
+            entries.append(record)
+    return entries
+
+
+def parse_timestamp(raw: str) -> datetime:
+    """Parse an ISO-8601 string ("Z" suffix allowed) into an aware UTC datetime."""
+    parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
+    # Naive values are taken to already be in UTC; aware ones are converted.
+    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=UTC)
+    return aware.astimezone(UTC)
+
+
+def events_for_session(
+    events: list[dict[str, Any]], session_id: str
+) -> list[dict[str, Any]]:
+    """Return the events whose "session_id" equals session_id, in log order."""
+    return list(filter(lambda ev: ev.get("session_id") == session_id, events))
+
+
+def detect_action_mismatch(
+    journal_entry: dict[str, Any],
+    session_events: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Flag silence claims made despite sends, and send claims with no send/react."""
+    claim = str(journal_entry.get("agent_did", ""))
+    tools_used = [event.get("tool") for event in session_events]
+    n_sends = tools_used.count("send_message")
+    n_reacts = tools_used.count("react")
+
+    def _mismatch(evidence: str) -> dict[str, Any]:
+        # Both contradictions differ only in their evidence text.
+        return {
+            "dissonance_type": "action_mismatch",
+            "journal_claim": _truncate(claim, 200),
+            "event_evidence": evidence,
+            "severity": "high",
+        }
+
+    results: list[dict[str, Any]] = []
+
+    # The journal reports staying quiet, yet the session sent messages.
+    if n_sends and SILENCE_PATTERNS.search(claim):
+        results.append(_mismatch(f"{n_sends} send_message call(s) in session"))
+
+    # The journal reports sending, yet the session neither sent nor reacted.
+    # A journal text that mentions reacting is not read as a message claim.
+    claims_send = SEND_PATTERNS.search(claim) is not None
+    mentions_react = re.search(r"\breact", claim, re.IGNORECASE) is not None
+    nothing_went_out = n_sends == 0 and n_reacts == 0
+    if claims_send and nothing_went_out and not mentions_react:
+        results.append(
+            _mismatch("no send_message or react events in session")
+        )
+
+    return results
+
+
+def detect_invisible_failure(
+    journal_entry: dict[str, Any],
+    session_events: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Flag a success claim whose session logged errors the journal never mentions.
+
+    An event is an error when its "type" contains "error". Any form of
+    error/fail/issue in agent_did counts as a mention unless "no"/"without" precedes it.
+    """
+    agent_did = str(journal_entry.get("agent_did", ""))
+    error_types = [
+        str(e.get("type", "unknown"))
+        for e in session_events
+        if "error" in str(e.get("type", "")).lower()
+    ]
+    if not error_types or not SUCCESS_PATTERNS.search(agent_did):
+        return []
+    # Stem match: a trailing \b would miss "failed", "errors" and "issues".
+    if re.search(r"(?<!\bno )(?<!\bwithout )\b(?:error|fail|issue)", agent_did, re.I):
+        return []
+    return [{
+        "dissonance_type": "invisible_failure",
+        "journal_claim": _truncate(agent_did, 200),
+        "event_evidence": (
+            f"{len(error_types)} error event(s): {', '.join(error_types[:3])}"
+        ),
+        "severity": "high",
+    }]
+
+
+def detect_scope_drift(
+    journal_entry: dict[str, Any],
+    session_events: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Compare the count of tool_call events with the length of agent_did."""
+    summary = str(journal_entry.get("agent_did", ""))
+    n_chars = len(summary)
+    n_calls = sum(1 for ev in session_events if ev.get("type") == "tool_call")
+
+    # One row per heuristic: (applies, dissonance type, evidence, severity).
+    # Row 1: much activity, terse journal. Row 2: long journal, hardly any activity.
+    rules = (
+        (
+            n_calls >= 10 and n_chars < 50,
+            "understated_action",
+            f"{n_calls} tool calls but only {n_chars} chars in agent_did",
+            "low",
+        ),
+        (
+            n_calls <= 1 and n_chars > 500,
+            "phantom_work",
+            f"only {n_calls} tool call(s) but {n_chars} chars describing work done",
+            "medium",
+        ),
+    )
+
+    return [
+        {
+            "dissonance_type": kind,
+            "journal_claim": _truncate(summary, 200),
+            "event_evidence": evidence,
+            "severity": level,
+        }
+        for applies, kind, evidence, level in rules
+        if applies
+    ]
+
+
+def _truncate(text: str, max_len: int) -> str:
+    """Return text as is when it fits in max_len, else cut it and end with "..."."""
+    clipped = text[: max_len - 3] + "..."
+    return clipped if len(text) > max_len else text
+
+
+def review_entry(
+    journal_entry: dict[str, Any],
+    all_events: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Run every detector on one journal entry and stamp the findings.
+
+    Returns [] when the entry has no session_id or no events share it.
+    """
+    session_id = journal_entry.get("session_id", "")
+    session_events = (
+        events_for_session(all_events, session_id) if session_id else []
+    )
+    if not session_events:
+        return []
+
+    detectors = (detect_action_mismatch, detect_invisible_failure, detect_scope_drift)
+    findings = [
+        found for detect in detectors for found in detect(journal_entry, session_events)
+    ]
+    reviewed_at = datetime.now(tz=UTC).isoformat()  # one stamp for the whole entry
+    for found in findings:
+        found["timestamp"] = reviewed_at
+        found["journal_timestamp"] = journal_entry.get("timestamp", "")
+        found["session_id"] = session_id
+        found.setdefault("notes", "")
+    return findings
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser: --last/--hours (exclusive), file paths, --dry-run."""
+    cli = argparse.ArgumentParser(
+        description="Cross-reference journal entries against event logs to detect dissonance.",
+    )
+
+    # The review window is either a number of entries or a span of hours.
+    window = cli.add_mutually_exclusive_group()
+    window.add_argument(
+        "--last",
+        type=int,
+        default=None,
+        help="Review the N most recent journal entries.",
+    )
+    window.add_argument(
+        "--hours",
+        type=float,
+        default=None,
+        help="Review journal entries from the last N hours.",
+    )
+
+    # Input and output file locations.
+    path_options = (
+        ("--journal", "logs/journal.jsonl", "Path to journal JSONL file."),
+        ("--events", "logs/events.jsonl", "Path to events JSONL file."),
+        ("--output", "state/dissonance_reviews.jsonl", "Path to write dissonance records."),
+    )
+    for flag, default_path, help_text in path_options:
+        cli.add_argument(flag, default=default_path, help=help_text)
+
+    # Report-only mode: findings are printed but never written.
+    cli.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print findings to stdout without writing to output file.",
+    )
+
+    return cli
+
+
+def main() -> None:
+    """Review the selected journal entries, print a report, and append findings.
+
+    Reviews --last N or --hours N entries, defaulting to the 10 most recent.
+    """
+    args = build_parser().parse_args()
+
+    # Path.cwd() / p yields p itself when p is already absolute.
+    journal_path = Path.cwd() / Path(args.journal).expanduser()
+    events_path = Path.cwd() / Path(args.events).expanduser()
+    output_path = Path.cwd() / Path(args.output).expanduser()
+
+    journal_entries = load_jsonl(journal_path)
+    all_events = load_jsonl(events_path)
+
+    if not journal_entries:
+        print("No journal entries found.", file=sys.stderr)
+        return
+
+    # Filter entries based on args
+    if args.last is not None:
+        # [-N:] returns every entry for N == 0, so N <= 0 must select nothing.
+        journal_entries = journal_entries[-args.last :] if args.last > 0 else []
+    elif args.hours is not None:
+        cutoff = datetime.now(tz=UTC) - timedelta(hours=args.hours)
+        recent: list[dict[str, Any]] = []
+        for entry in journal_entries:
+            try:
+                if parse_timestamp(entry["timestamp"]) >= cutoff:
+                    recent.append(entry)
+            except (KeyError, TypeError, ValueError, AttributeError):
+                continue  # no usable timestamp, so the entry cannot be windowed
+        journal_entries = recent
+    else:
+        # Default: last 10 entries
+        journal_entries = journal_entries[-10:]
+
+    all_findings: list[dict[str, Any]] = []
+    for entry in journal_entries:
+        all_findings.extend(review_entry(entry, all_events))
+
+    # Report
+    if not all_findings:
+        print(f"No dissonance detected across {len(journal_entries)} journal entries.")
+        return
+
+    print(
+        f"Found {len(all_findings)} dissonance(s) across "
+        f"{len(journal_entries)} journal entries:"
+    )
+    for f in all_findings:
+        severity = f["severity"].upper()
+        dtype = f["dissonance_type"]
+        print(f"  [{severity}] {dtype}: {f['event_evidence']}")
+
+    # Summary by type
+    type_counts: dict[str, int] = {}
+    for f in all_findings:
+        t = f["dissonance_type"]
+        type_counts[t] = type_counts.get(t, 0) + 1
+    print("\nBy type:")
+    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
+        print(f"  {t}: {count}")
+
+    severity_counts: dict[str, int] = {}
+    for f in all_findings:
+        s = f["severity"]
+        severity_counts[s] = severity_counts.get(s, 0) + 1
+    print("\nBy severity:")
+    for s in ("high", "medium", "low"):
+        if s in severity_counts:
+            print(f"  {s}: {severity_counts[s]}")
+
+    if args.dry_run:
+        print("\n(dry run — not writing to output file)")
+        return
+
+    # Write findings
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("a", encoding="utf-8") as fh:
+        fh.writelines(json.dumps(f, ensure_ascii=True) + "\n" for f in all_findings)
+    print(f"\nAppended {len(all_findings)} record(s) to {output_path}")
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script
+    main()
diff --git a/tests/test_dissonance_review.py b/tests/test_dissonance_review.py
new file mode 100644
index 0000000..df2d331
--- /dev/null
+++ b/tests/test_dissonance_review.py
@@ -0,0 +1,222 @@
+"""Tests for the dissonance review script."""
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+import pytest
+
+from open_strix.builtin_skills.scripts.dissonance_review import (
+    detect_action_mismatch,
+    detect_invisible_failure,
+    detect_scope_drift,
+    load_jsonl,
+    review_entry,
+)
+
+UTC = timezone.utc  # local alias, defined the same way as in the script under test
+NOW = datetime.now(tz=UTC).isoformat()  # computed once at import; shared by every fixture
+
+
+def _journal(agent_did: str, session_id: str = "sess-1") -> dict:
+    """Build a journal entry for tests; only agent_did and session_id vary."""
+    entry = dict(timestamp=NOW, session_id=session_id)
+    # Fixed filler values shared by every test.
+    entry.update(channel_id="ch-1", user_wanted="test")
+    # The claim under test, plus an empty predictions field.
+    entry.update(agent_did=agent_did, predictions="")
+
+    return entry
+
+
+def _event(
+    event_type: str = "tool_call",
+    session_id: str = "sess-1",
+    tool: str | None = None,
+    **kwargs: object,
+) -> dict:
+    """Build an event record; "tool" is set only when given, kwargs are applied last."""
+    base = {"timestamp": NOW, "type": event_type, "session_id": session_id}
+    tool_field = {} if tool is None else {"tool": tool}
+    # Unpacking in this order matches building the dict step by step.
+    return {**base, **tool_field, **kwargs}
+
+
+class TestActionMismatch:
+    """Silence claims vs. actual sends, and send claims vs. missing send/react events."""
+
+    def test_silence_claimed_but_message_sent(self) -> None:
+        entry = _journal("Silence — no response needed")
+        events = [_event(tool="send_message")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 1
+        assert findings[0]["dissonance_type"] == "action_mismatch"
+        assert findings[0]["severity"] == "high"
+
+    def test_no_message_sent_variation(self) -> None:
+        entry = _journal("No message sent — Tim heads-down")
+        events = [_event(tool="send_message")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 1
+
+    def test_actual_silence_no_finding(self) -> None:
+        entry = _journal("Silence — no response needed")
+        events = [_event(tool="read_file")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 0
+
+    def test_claimed_send_but_no_events(self) -> None:
+        entry = _journal("Sent substantive analysis to research channel")
+        events = [_event(tool="read_file")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 1
+        assert findings[0]["dissonance_type"] == "action_mismatch"
+
+    def test_claimed_send_with_send_event(self) -> None:
+        entry = _journal("Sent substantive analysis to research channel")
+        events = [_event(tool="send_message")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 0
+
+    def test_claimed_react_with_react_event(self) -> None:
+        entry = _journal("Reacted with owl emoji")
+        events = [_event(tool="react")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 0
+
+    def test_no_silence_keywords_no_finding(self) -> None:
+        entry = _journal("Updated state files and committed")
+        events = [_event(tool="send_message")]
+        findings = detect_action_mismatch(entry, events)
+        assert len(findings) == 0
+
+
+class TestInvisibleFailure:
+    """Success claims are flagged only when error events exist and go unmentioned."""
+
+    def test_success_claimed_with_errors(self) -> None:
+        entry = _journal("Completed the update successfully")
+        events = [
+            _event(tool="edit_file"),
+            _event(event_type="tool_call_error", error_type="permission_denied"),
+        ]
+        findings = detect_invisible_failure(entry, events)
+        assert len(findings) == 1
+        assert findings[0]["dissonance_type"] == "invisible_failure"
+        assert findings[0]["severity"] == "high"
+
+    def test_success_with_acknowledged_error(self) -> None:
+        entry = _journal("Fixed the error in the config file after initial failure")
+        events = [
+            _event(event_type="tool_call_error", error_type="parse_error"),
+            _event(tool="edit_file"),
+        ]
+        findings = detect_invisible_failure(entry, events)
+        # Should not flag: agent_did says "error", so the failure is acknowledged
+        assert len(findings) == 0
+
+    def test_no_success_keywords_no_finding(self) -> None:
+        entry = _journal("Tried to update but ran into issues")
+        events = [
+            _event(event_type="tool_call_error", error_type="timeout"),
+        ]
+        findings = detect_invisible_failure(entry, events)
+        assert len(findings) == 0
+
+    def test_success_no_errors(self) -> None:
+        entry = _journal("Completed the full migration")
+        events = [_event(tool="edit_file"), _event(tool="write_file")]
+        findings = detect_invisible_failure(entry, events)
+        assert len(findings) == 0
+
+
+class TestScopeDrift:
+    """Tool-call volume vs. agent_did length: understated_action and phantom_work."""
+
+    def test_many_tools_brief_description(self) -> None:
+        entry = _journal("Done.")
+        events = [_event(tool=f"tool_{i}") for i in range(15)]  # 15 tool_call events
+        findings = detect_scope_drift(entry, events)
+        assert len(findings) == 1
+        assert findings[0]["dissonance_type"] == "understated_action"
+        assert findings[0]["severity"] == "low"
+
+    def test_few_tools_elaborate_description(self) -> None:
+        entry = _journal("A" * 600)  # 600-character agent_did
+        events = [_event(tool="read_file")]
+        findings = detect_scope_drift(entry, events)
+        assert len(findings) == 1
+        assert findings[0]["dissonance_type"] == "phantom_work"
+        assert findings[0]["severity"] == "medium"
+
+    def test_proportional_no_finding(self) -> None:
+        entry = _journal("Updated the config and committed changes to git")
+        events = [_event(tool="edit_file"), _event(tool="bash")]
+        findings = detect_scope_drift(entry, events)
+        assert len(findings) == 0
+
+
+class TestReviewEntry:
+    """Integration tests for review_entry: detector aggregation, skip rules, metadata."""
+
+    def test_multiple_findings_single_entry(self) -> None:
+        # Silence claimed but message sent AND errors present
+        entry = _journal("Silence — completed without issues")
+        events = [
+            _event(tool="send_message"),
+            _event(event_type="tool_call_error", error_type="timeout"),
+        ]
+        findings = review_entry(entry, events)
+        # Should find action_mismatch (silence + send) and invisible_failure (completed + error)
+        types = {f["dissonance_type"] for f in findings}
+        assert {"action_mismatch", "invisible_failure"} <= types
+
+    def test_no_session_id_skipped(self) -> None:
+        entry = {"agent_did": "test", "timestamp": NOW}
+        events = [_event()]
+        findings = review_entry(entry, events)
+        assert len(findings) == 0
+
+    def test_no_matching_events_skipped(self) -> None:
+        entry = _journal("Silence — no response", session_id="sess-1")
+        events = [_event(session_id="sess-other", tool="send_message")]
+        findings = review_entry(entry, events)
+        assert len(findings) == 0
+
+    def test_findings_have_metadata(self) -> None:
+        entry = _journal("Silence — no response needed")
+        events = [_event(tool="send_message")]
+        findings = review_entry(entry, events)
+        assert len(findings) == 1
+        f = findings[0]
+        assert "timestamp" in f
+        assert f["session_id"] == "sess-1"
+        assert f["journal_timestamp"] == NOW
+
+
+class TestLoadJsonl:
+    """load_jsonl skips blank and malformed lines and returns [] for a missing file."""
+
+    def test_loads_valid_file(self, tmp_path: Path) -> None:
+        p = tmp_path / "test.jsonl"
+        p.write_text('{"a": 1}\n{"b": 2}\n', encoding="utf-8")
+        result = load_jsonl(p)
+        assert len(result) == 2
+
+    def test_skips_blank_lines(self, tmp_path: Path) -> None:
+        p = tmp_path / "test.jsonl"
+        p.write_text('{"a": 1}\n\n{"b": 2}\n\n', encoding="utf-8")
+        result = load_jsonl(p)
+        assert len(result) == 2
+
+    def test_skips_malformed_lines(self, tmp_path: Path) -> None:
+        p = tmp_path / "test.jsonl"
+        p.write_text('{"a": 1}\nnot json\n{"b": 2}\n', encoding="utf-8")
+        result = load_jsonl(p)
+        assert len(result) == 2  # the "not json" line is dropped, both objects kept
+
+    def test_missing_file_returns_empty(self, tmp_path: Path) -> None:
+        p = tmp_path / "nonexistent.jsonl"
+        result = load_jsonl(p)
+        assert result == []