From b47120e2a8aafaadf3d2e1eb8f9b04ed26921ac9 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 22:59:05 +0000 Subject: [PATCH 1/3] Add git history discovery miner (#132) --- src/specleft/discovery/miners/__init__.py | 2 + src/specleft/discovery/miners/defaults.py | 2 + .../discovery/miners/shared/__init__.py | 6 +- .../discovery/miners/shared/git_history.py | 230 ++++++++++++++++++ tests/discovery/miners/test_git_history.py | 169 +++++++++++++ 5 files changed, 408 insertions(+), 1 deletion(-) create mode 100644 src/specleft/discovery/miners/shared/git_history.py create mode 100644 tests/discovery/miners/test_git_history.py diff --git a/src/specleft/discovery/miners/__init__.py b/src/specleft/discovery/miners/__init__.py index 1f10f79..542876f 100644 --- a/src/specleft/discovery/miners/__init__.py +++ b/src/specleft/discovery/miners/__init__.py @@ -7,12 +7,14 @@ from specleft.discovery.miners.python.routes import PythonRouteMiner from specleft.discovery.miners.python.tests import PythonTestMiner from specleft.discovery.miners.shared.docstrings import DocstringMiner +from specleft.discovery.miners.shared.git_history import GitHistoryMiner from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner from specleft.discovery.miners.typescript.routes import TypeScriptRouteMiner from specleft.discovery.miners.typescript.tests import TypeScriptTestMiner __all__ = [ "DocstringMiner", + "GitHistoryMiner", "PythonRouteMiner", "PythonTestMiner", "ReadmeOverviewMiner", diff --git a/src/specleft/discovery/miners/defaults.py b/src/specleft/discovery/miners/defaults.py index f8e83a0..482e449 100644 --- a/src/specleft/discovery/miners/defaults.py +++ b/src/specleft/discovery/miners/defaults.py @@ -10,6 +10,7 @@ from specleft.discovery.miners.python.routes import PythonRouteMiner from specleft.discovery.miners.python.tests import PythonTestMiner from specleft.discovery.miners.shared.docstrings import DocstringMiner +from 
specleft.discovery.miners.shared.git_history import GitHistoryMiner from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner from specleft.discovery.miners.typescript.routes import TypeScriptRouteMiner from specleft.discovery.miners.typescript.tests import TypeScriptTestMiner @@ -27,4 +28,5 @@ def default_miners() -> list[BaseMiner]: TypeScriptTestMiner(), TypeScriptRouteMiner(), DocstringMiner(), + GitHistoryMiner(), ] diff --git a/src/specleft/discovery/miners/shared/__init__.py b/src/specleft/discovery/miners/shared/__init__.py index 3910fa5..78d6d4d 100644 --- a/src/specleft/discovery/miners/shared/__init__.py +++ b/src/specleft/discovery/miners/shared/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__ = ["DocstringMiner", "ReadmeOverviewMiner"] +__all__ = ["DocstringMiner", "GitHistoryMiner", "ReadmeOverviewMiner"] def __getattr__(name: str) -> object: @@ -13,6 +13,10 @@ def __getattr__(name: str) -> object: from specleft.discovery.miners.shared.docstrings import DocstringMiner return DocstringMiner + if name == "GitHistoryMiner": + from specleft.discovery.miners.shared.git_history import GitHistoryMiner + + return GitHistoryMiner if name == "ReadmeOverviewMiner": from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner diff --git a/src/specleft/discovery/miners/shared/git_history.py b/src/specleft/discovery/miners/shared/git_history.py new file mode 100644 index 0000000..12f5d46 --- /dev/null +++ b/src/specleft/discovery/miners/shared/git_history.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Language-agnostic git history miner.""" + +from __future__ import annotations + +import re +import subprocess +import time +import uuid +from dataclasses import dataclass +from pathlib import Path + +from specleft.discovery.context import MinerContext +from specleft.discovery.miners.shared.common import elapsed_ms +from specleft.discovery.models import ( + 
DiscoveredItem, + GitCommitMeta, + ItemKind, + MinerErrorKind, + MinerResult, + SupportedLanguage, +) + +_SEPARATOR = "---END---" +_NOISE_CONVENTIONAL_TYPES = frozenset({"chore", "ci", "build", "docs", "style", "test"}) +_SOURCE_SUFFIXES = frozenset({".py", ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx"}) +_CONVENTIONAL_PREFIX = re.compile(r"^(?P<kind>[a-z]+)(?:\([^)]+\))?(?:!)?:\s*") + + +@dataclass(frozen=True) +class _CommitRecord: + commit_hash: str + subject: str + body: str | None + changed_files: list[str] + + +class GitHistoryMiner: + """Extract discovery signals from recent git commit history.""" + + miner_id = uuid.UUID("f1c93075-4e3c-44b8-bef6-9c0bc25b6c42") + name = "git_history" + languages: frozenset[SupportedLanguage] = frozenset() + + def mine(self, ctx: MinerContext) -> MinerResult: + started = time.perf_counter() + process = _run_git_log(ctx.root, ctx.config.max_git_commits) + if process is None: + return _git_error_result( + miner_id=self.miner_id, + miner_name=self.name, + error="git executable not found", + duration_ms=elapsed_ms(started), + ) + + if process.returncode != 0: + error = process.stderr.strip() or "not a git repository" + return _git_error_result( + miner_id=self.miner_id, + miner_name=self.name, + error=error, + duration_ms=elapsed_ms(started), + ) + + items = _items_from_log(process.stdout) + return MinerResult( + miner_id=self.miner_id, + miner_name=self.name, + items=items, + duration_ms=elapsed_ms(started), + ) + + +def _run_git_log( + root: Path, max_commits: int +) -> subprocess.CompletedProcess[str] | None: + command = [ + "git", + "-C", + str(root), + "log", + "--no-merges", + "--format=%H%n%s%n%b%n---END---", + "--name-only", + "-n", + str(max_commits), + ] + try: + return subprocess.run( + command, + check=False, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + except FileNotFoundError: + return None + + +def _items_from_log(log_output: str) -> list[DiscoveredItem]: + items: list[DiscoveredItem]
= [] + for record in _parse_records(log_output): + conventional_type = _conventional_type(record.subject) + if conventional_type in _NOISE_CONVENTIONAL_TYPES: + continue + + source_files = [path for path in record.changed_files if _is_source_path(path)] + if not source_files: + continue + + metadata = GitCommitMeta( + commit_hash=record.commit_hash[:7], + subject=record.subject, + body=record.body, + changed_files=source_files, + conventional_type=conventional_type, + file_prefixes=_collect_file_prefixes(source_files), + ) + items.append( + DiscoveredItem( + kind=ItemKind.GIT_COMMIT, + name=record.subject, + file_path=None, + line_number=None, + language=None, + raw_text=record.body, + metadata=metadata.model_dump(), + confidence=0.5, + ) + ) + return items + + +def _parse_records(log_output: str) -> list[_CommitRecord]: + lines = log_output.splitlines() + records: list[_CommitRecord] = [] + cursor = 0 + total_lines = len(lines) + + while cursor < total_lines: + while cursor < total_lines and not _is_full_hash(lines[cursor]): + cursor += 1 + if cursor >= total_lines: + break + + commit_hash = lines[cursor].strip() + cursor += 1 + if cursor >= total_lines: + break + + subject = lines[cursor].strip() + cursor += 1 + + body_lines: list[str] = [] + while cursor < total_lines and lines[cursor].strip() != _SEPARATOR: + body_lines.append(lines[cursor].rstrip()) + cursor += 1 + if cursor < total_lines and lines[cursor].strip() == _SEPARATOR: + cursor += 1 + + changed_files: list[str] = [] + while cursor < total_lines and not _is_full_hash(lines[cursor]): + file_path = lines[cursor].strip() + if file_path: + changed_files.append(file_path) + cursor += 1 + + body = "\n".join(body_lines).strip() or None + records.append( + _CommitRecord( + commit_hash=commit_hash, + subject=subject, + body=body, + changed_files=changed_files, + ) + ) + + return records + + +def _is_full_hash(value: str) -> bool: + stripped = value.strip() + if len(stripped) != 40: + return False + return 
all(character in "0123456789abcdef" for character in stripped.lower()) + + +def _conventional_type(subject: str) -> str | None: + match = _CONVENTIONAL_PREFIX.match(subject.strip().lower()) + if not match: + return None + return match.group("kind") + + +def _is_source_path(path: str) -> bool: + suffix = Path(path).suffix.lower() + return suffix in _SOURCE_SUFFIXES + + +def _collect_file_prefixes(paths: list[str]) -> list[str]: + prefixes: list[str] = [] + seen: set[str] = set() + for path in paths: + parent = Path(path).parent.as_posix() + prefix = parent if parent != "." else path + if prefix in seen: + continue + seen.add(prefix) + prefixes.append(prefix) + return prefixes + + +def _git_error_result( + *, + miner_id: uuid.UUID, + miner_name: str, + error: str, + duration_ms: int, +) -> MinerResult: + return MinerResult( + miner_id=miner_id, + miner_name=miner_name, + items=[], + error=error, + error_kind=MinerErrorKind.NOT_INSTALLED, + duration_ms=duration_ms, + ) diff --git a/tests/discovery/miners/test_git_history.py b/tests/discovery/miners/test_git_history.py new file mode 100644 index 0000000..1d73d29 --- /dev/null +++ b/tests/discovery/miners/test_git_history.py @@ -0,0 +1,169 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for git history discovery miner.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +from specleft.discovery.config import DiscoveryConfig +from specleft.discovery.context import MinerContext +from specleft.discovery.file_index import FileIndex +from specleft.discovery.language_registry import LanguageRegistry +from specleft.discovery.miners.shared.git_history import GitHistoryMiner +from specleft.discovery.models import GitCommitMeta, MinerErrorKind + + +def _context(root: Path, *, max_git_commits: int = 200) -> MinerContext: + return MinerContext( + root=root, + registry=LanguageRegistry(), + file_index=FileIndex(root), + frameworks={}, + 
config=DiscoveryConfig(max_git_commits=max_git_commits), + ) + + +def _git(repo: Path, *args: str) -> str: + result = subprocess.run( + ["git", "-C", str(repo), *args], + check=True, + capture_output=True, + text=True, + encoding="utf-8", + ) + return result.stdout.strip() + + +def _init_repo(repo: Path) -> None: + repo.mkdir(parents=True, exist_ok=True) + _git(repo, "init", "-b", "main") + _git(repo, "config", "user.email", "specleft-tests@example.com") + _git(repo, "config", "user.name", "SpecLeft Tests") + + +def _commit( + repo: Path, + *, + subject: str, + files: dict[str, str], + body: str | None = None, +) -> str: + for rel_path, content in files.items(): + absolute = repo / rel_path + absolute.parent.mkdir(parents=True, exist_ok=True) + absolute.write_text(content, encoding="utf-8") + + _git(repo, "add", *sorted(files)) + command = ["commit", "-m", subject] + if body is not None: + command.extend(["-m", body]) + _git(repo, *command) + return _git(repo, "rev-parse", "--short=7", "HEAD") + + +def test_git_history_miner_parses_metadata_and_filters_noise(tmp_path: Path) -> None: + repo = tmp_path / "repo" + _init_repo(repo) + + _commit( + repo, + subject="chore: update lockfile", + files={"package-lock.json": '{"lockfileVersion": 3}'}, + ) + expected_hash = _commit( + repo, + subject="feat: add login endpoint", + body="Implements JWT-based authentication.", + files={ + "src/auth/login.py": "def login() -> None:\n pass\n", + "tests/test_login.py": "def test_login() -> None:\n assert True\n", + }, + ) + _commit( + repo, + subject="docs: update onboarding guide", + files={"docs/onboarding.md": "# onboarding\n"}, + ) + + result = GitHistoryMiner().mine(_context(repo)) + + assert result.error is None + assert result.error_kind is None + assert len(result.items) == 1 + + item = result.items[0] + metadata = item.typed_meta() + assert isinstance(metadata, GitCommitMeta) + assert metadata.commit_hash == expected_hash + assert metadata.subject == "feat: add login 
endpoint" + assert metadata.body == "Implements JWT-based authentication." + assert metadata.changed_files == ["src/auth/login.py", "tests/test_login.py"] + assert metadata.conventional_type == "feat" + assert metadata.file_prefixes == ["src/auth", "tests"] + assert item.language is None + assert item.file_path is None + assert item.confidence == 0.5 + + +def test_git_history_miner_uses_configured_commit_limit(tmp_path: Path) -> None: + repo = tmp_path / "repo" + _init_repo(repo) + _commit( + repo, + subject="feat: initial endpoint", + files={"src/api.py": "def endpoint() -> None:\n pass\n"}, + ) + _commit( + repo, + subject="fix: tighten validation", + files={"src/api.py": "def endpoint() -> int:\n return 1\n"}, + ) + + result = GitHistoryMiner().mine(_context(repo, max_git_commits=1)) + + assert [item.name for item in result.items] == ["fix: tighten validation"] + + +def test_git_history_miner_excludes_merge_commits(tmp_path: Path) -> None: + repo = tmp_path / "repo" + _init_repo(repo) + _commit( + repo, + subject="feat: base endpoint", + files={"src/service.py": "def base() -> None:\n pass\n"}, + ) + + _git(repo, "checkout", "-b", "feature/login") + _commit( + repo, + subject="feat: add login flow", + files={"src/login.py": "def login() -> None:\n pass\n"}, + ) + + _git(repo, "checkout", "main") + _commit( + repo, + subject="fix: harden auth checks", + files={"src/auth.py": "def auth() -> None:\n pass\n"}, + ) + _git(repo, "merge", "--no-ff", "feature/login", "-m", "Merge feature/login") + + result = GitHistoryMiner().mine(_context(repo)) + + subjects = [item.name for item in result.items] + assert "Merge feature/login" not in subjects + assert "feat: add login flow" in subjects + assert "fix: harden auth checks" in subjects + + +def test_git_history_miner_returns_not_installed_error_for_non_repo( + tmp_path: Path, +) -> None: + result = GitHistoryMiner().mine(_context(tmp_path)) + + assert result.items == [] + assert result.error is not None + assert 
result.error_kind == MinerErrorKind.NOT_INSTALLED From 9d2e7b76d8e18a33bd0524dd132bb9c2dc45f3d4 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 22:59:13 +0000 Subject: [PATCH 2/3] Update discovery spec for git miner (#132) --- features/feature-spec-discovery.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md index 8493c11..4e8ad7d 100644 --- a/features/feature-spec-discovery.md +++ b/features/feature-spec-discovery.md @@ -145,6 +145,23 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser **When** `TypeScriptRouteMiner` executes **Then** it reports `MinerErrorKind.PARSE_ERROR` for parse failures and still returns items from valid files. +### Story 12: Git history mining +**Scenario:** As a discovery pipeline, I need language-agnostic intent signals from recent commits. +**Given** a git repository and `ctx.config.max_git_commits` +**When** `GitHistoryMiner` runs +**Then** it executes `git log --no-merges` and emits `DiscoveredItem(kind=GIT_COMMIT)` entries for non-noise commits with changed source files. + +**Scenario:** As a miner maintainer, I need typed and filtered commit metadata. +**Given** commits with conventional prefixes and changed files +**When** items are emitted +**Then** metadata validates against `GitCommitMeta` with short hash, subject, body, changed files, `conventional_type`, and deduplicated `file_prefixes`. +**And** `chore:`, `ci:`, `build:`, `docs:`, `style:`, and `test:` commits are skipped. + +**Scenario:** As a pipeline operator, I need resilient behavior outside git repositories. +**Given** a non-git directory or missing `git` binary +**When** `GitHistoryMiner` executes +**Then** it returns `MinerResult(error_kind=NOT_INSTALLED, items=[])` without raising. 
+ ## Acceptance Criteria - Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. - `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input. @@ -193,3 +210,10 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser - Route metadata validates against `ApiRouteMeta` for both Express and Next.js outputs. - `app/api/users/[id]/route.ts` exports like `DELETE` map to `/api/users/{id}` with `is_file_based_route=True`. - Next.js route files with multiple HTTP exports emit one API route item per export. +- `GitHistoryMiner` is language-agnostic (`languages = frozenset()`) and always runs regardless of detected languages. +- `GitHistoryMiner` executes `git log` with `--no-merges` and `-n {ctx.config.max_git_commits}`. +- Commits with prefixes `chore:`, `ci:`, `build:`, `docs:`, `style:`, and `test:` are excluded from emitted items. +- Only commits with at least one changed source file (`.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs`, `.cjs`) are emitted. +- Git commit metadata validates against `GitCommitMeta`, including short hash and parsed `conventional_type`. +- All git commit items have `kind=GIT_COMMIT`, `language=None`, `file_path=None`, and `confidence=0.5`. +- Running discovery on a non-git directory produces a miner error with `error_kind=NOT_INSTALLED` and no exception. 
From c530b23ff4468981d382ef5f503075770ee78b11 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 23:23:39 +0000 Subject: [PATCH 3/3] Harden git history parser safety guards (#132) --- .../discovery/miners/shared/git_history.py | 50 +++++++++++++++++-- tests/discovery/miners/test_git_history.py | 29 +++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/src/specleft/discovery/miners/shared/git_history.py b/src/specleft/discovery/miners/shared/git_history.py index 12f5d46..de860e9 100644 --- a/src/specleft/discovery/miners/shared/git_history.py +++ b/src/specleft/discovery/miners/shared/git_history.py @@ -37,6 +37,10 @@ class _CommitRecord: changed_files: list[str] +class _GitLogParseError(ValueError): + """Raised when `git log` output cannot be parsed safely.""" + + class GitHistoryMiner: """Extract discovery signals from recent git commit history.""" @@ -64,7 +68,18 @@ def mine(self, ctx: MinerContext) -> MinerResult: duration_ms=elapsed_ms(started), ) - items = _items_from_log(process.stdout) + try: + items = _items_from_log(process.stdout) + except _GitLogParseError as exc: + return MinerResult( + miner_id=self.miner_id, + miner_name=self.name, + items=[], + error=str(exc), + error_kind=MinerErrorKind.PARSE_ERROR, + duration_ms=elapsed_ms(started), + ) + return MinerResult( miner_id=self.miner_id, miner_name=self.name, @@ -139,8 +154,17 @@ def _parse_records(log_output: str) -> list[_CommitRecord]: records: list[_CommitRecord] = [] cursor = 0 total_lines = len(lines) + safety_budget = total_lines + 1 while cursor < total_lines: + if safety_budget <= 0: + raise _GitLogParseError( + "Git history parser stopped for safety: unable to make progress while " + "reading commit records. Please rerun and report if this persists." 
+ ) + safety_budget -= 1 + iteration_start = cursor + while cursor < total_lines and not _is_full_hash(lines[cursor]): cursor += 1 if cursor >= total_lines: @@ -149,17 +173,29 @@ def _parse_records(log_output: str) -> list[_CommitRecord]: commit_hash = lines[cursor].strip() cursor += 1 if cursor >= total_lines: - break + raise _GitLogParseError( + "Git history parser found an incomplete record after commit " + f"{commit_hash[:7]}: missing subject line." + ) subject = lines[cursor].strip() + if not subject: + raise _GitLogParseError( + "Git history parser found an empty commit subject for " + f"{commit_hash[:7]}." + ) cursor += 1 body_lines: list[str] = [] while cursor < total_lines and lines[cursor].strip() != _SEPARATOR: body_lines.append(lines[cursor].rstrip()) cursor += 1 - if cursor < total_lines and lines[cursor].strip() == _SEPARATOR: - cursor += 1 + if cursor >= total_lines: + raise _GitLogParseError( + "Git history parser found malformed `git log` output: missing " + f"'{_SEPARATOR}' marker for commit {commit_hash[:7]}." + ) + cursor += 1 changed_files: list[str] = [] while cursor < total_lines and not _is_full_hash(lines[cursor]): @@ -178,6 +214,12 @@ def _parse_records(log_output: str) -> list[_CommitRecord]: ) ) + if cursor <= iteration_start: + raise _GitLogParseError( + "Git history parser stopped for safety: parser made no progress " + f"near line {iteration_start + 1}. Please rerun and report if this persists." 
+ ) + return records diff --git a/tests/discovery/miners/test_git_history.py b/tests/discovery/miners/test_git_history.py index 1d73d29..7ea1ed8 100644 --- a/tests/discovery/miners/test_git_history.py +++ b/tests/discovery/miners/test_git_history.py @@ -8,6 +8,7 @@ import subprocess from pathlib import Path +import specleft.discovery.miners.shared.git_history as git_history_module from specleft.discovery.config import DiscoveryConfig from specleft.discovery.context import MinerContext from specleft.discovery.file_index import FileIndex @@ -167,3 +168,31 @@ def test_git_history_miner_returns_not_installed_error_for_non_repo( assert result.items == [] assert result.error is not None assert result.error_kind == MinerErrorKind.NOT_INSTALLED + + +def test_git_history_miner_returns_parse_error_for_malformed_log_output( + tmp_path: Path, monkeypatch +) -> None: + malformed_stdout = "\n".join( + [ + "a" * 40, + "feat: malformed stream", + "body line without separator", + ] + ) + process = subprocess.CompletedProcess( + args=["git"], + returncode=0, + stdout=malformed_stdout, + stderr="", + ) + + monkeypatch.setattr(git_history_module, "_run_git_log", lambda *_: process) + + result = GitHistoryMiner().mine(_context(tmp_path)) + + assert result.items == [] + assert result.error_kind == MinerErrorKind.PARSE_ERROR + assert result.error is not None + assert "missing '---END---' marker" in result.error + assert "commit aaaaaaa" in result.error