diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md index 4e8ad7d..c0c4258 100644 --- a/features/feature-spec-discovery.md +++ b/features/feature-spec-discovery.md @@ -162,6 +162,23 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser **When** `GitHistoryMiner` executes **Then** it returns `MinerResult(error_kind=NOT_INSTALLED, items=[])` without raising. +### Story 13: Phase 2 feature grouping +**Scenario:** As a discovery pipeline, I need deterministic grouping for auth tests. +**Given** 10 discovered test-function items from `tests/auth/` +**When** `group_items(items)` is called +**Then** all 10 items land in a single `DraftFeature` cluster. + +**Scenario:** As a discovery pipeline, I need route-prefix grouping. +**Given** route items for `GET /payments/*`, `POST /payments`, and `GET /users/*` +**When** `group_items(items)` is called +**Then** payment routes group separately from user routes. + +**Scenario:** As a maintainer, I need complete and type-safe grouping. +**Given** mixed discovered items (tests, routes, docstrings, git commits) +**When** `group_items(items)` is called +**Then** every `DiscoveredItem` appears in exactly one `DraftFeature`. +**And** grouping uses `item.typed_meta()` for API and git metadata access. + ## Acceptance Criteria - Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. - `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input. @@ -217,3 +234,9 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser - Git commit metadata validates against `GitCommitMeta`, including short hash and parsed `conventional_type`. - All git commit items have `kind=GIT_COMMIT`, `language=None`, `file_path=None`, and `confidence=0.5`. - Running discovery on a non-git directory produces a miner error with `error_kind=NOT_INSTALLED` and no exception. +- `group_items()` clusters 10 items from `tests/auth/` into one feature. +- API routes under `/payments` group separately from `/users`. +- Every input `DiscoveredItem` is assigned exactly once to a feature (`source_items`). +- Git commit items are merged into nearest matching feature via `GitCommitMeta.file_prefixes`. +- Grouping uses `item.typed_meta()` for API and git metadata access. +- Single-item groups are valid outputs. diff --git a/src/specleft/discovery/__init__.py b/src/specleft/discovery/__init__.py index 096509a..23bfbc0 100644 --- a/src/specleft/discovery/__init__.py +++ b/src/specleft/discovery/__init__.py @@ -8,6 +8,7 @@ from specleft.discovery.framework_detector import FrameworkDetector from specleft.discovery.language_detect import detect_project_languages from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS, LanguageRegistry +from specleft.discovery.grouping import group_items from specleft.discovery.pipeline import ( BaseMiner, DiscoveryPipeline, diff --git a/src/specleft/discovery/grouping.py b/src/specleft/discovery/grouping.py new file mode 100644 index 0000000..37b5b0d --- /dev/null +++ b/src/specleft/discovery/grouping.py @@ -0,0 +1,375 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Phase-2 grouping logic for discovery items.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path + +from slugify import slugify + +from specleft.discovery.models import ( + ApiRouteMeta, + DiscoveredItem, + DraftFeature, + DraftScenario, + GitCommitMeta, + ItemKind, +) +from specleft.schema import SpecStep, StepType + +_GENERIC_PATH_SEGMENTS = frozenset( + { + "app", + "core", + "features", + "lib", + "specleft", + "specs", + "src", + "test", + "tests", + } +) +_ABBREVIATION_EXPANSIONS: dict[str, str] = { + "auth": "authentication", + "cfg": "configuration", + "config": "configuration", + "mgmt": "management", + "msg": "messaging", + "notif": "notifications", +} +_NAME_PREFIXES = ( + "test_", + "test ", + "it_", + "it ", + "should_", + "should ", +) + + +@dataclass +class _Group: + key: str + items: list[DiscoveredItem] = field(default_factory=list) + path_signals: set[str] = field(default_factory=set) + + +def group_items(items: list[DiscoveredItem]) -> list[DraftFeature]: + """Group discovered items into draft features using deterministic heuristics.""" + segment_counts = _segment_counts(items) + groups: dict[str, _Group] = {} + git_items: list[DiscoveredItem] = [] + + for item in items: + if item.kind is ItemKind.GIT_COMMIT: + git_items.append(item) + continue + + key = _group_key_for_item(item, segment_counts) + _append_to_group(groups, key, item) + + _assign_git_items(groups, git_items) + + features: list[DraftFeature] = [] + for group in groups.values(): + if not group.items: + continue + features.append(_group_to_feature(group)) + return features + + +def _segment_counts(items: list[DiscoveredItem]) -> dict[str, int]: + counts: dict[str, int] = {} + for item in items: + if item.file_path is None: + continue + for part in item.file_path.parent.parts: + token = _normalize_token(part) + if not token: + continue + counts[token] = counts.get(token, 0) + 1 + return counts + + +def _group_key_for_item(item: DiscoveredItem, segment_counts: dict[str, int]) -> str: + path_key = _path_group_key(item.file_path, segment_counts) + + if item.kind is ItemKind.API_ROUTE: + api_key = _api_group_key(item) + if api_key: + return api_key + + if path_key: + return path_key + + name_key = _name_group_key(item.name) + if name_key: + return name_key + + if item.file_path is not None: + fallback = _normalize_token(item.file_path.stem) + if fallback: + return fallback + + return "misc" + + +def _path_group_key( + file_path: Path | None, segment_counts: dict[str, int] +) -> str | None: + if file_path is None: + return None + + parent_parts = [part for part in file_path.parent.parts if part not in {"", "."}] + if not parent_parts: + return None + + preferred = _deepest_shared_specific_segment(parent_parts, segment_counts) + if preferred: + return preferred + + for part in reversed(parent_parts): + token = _normalize_token(part) + if token and token not in _GENERIC_PATH_SEGMENTS: + return token + return None + + +def _deepest_shared_specific_segment( + parent_parts: list[str], segment_counts: dict[str, int] +) -> str | None: + for part in reversed(parent_parts): + token = _normalize_token(part) + if not token or token in _GENERIC_PATH_SEGMENTS: + continue + if segment_counts.get(token, 0) >= 2: + return token + return None + + +def _api_group_key(item: DiscoveredItem) -> str | None: + meta = item.typed_meta() + if not isinstance(meta, ApiRouteMeta): + return None + + segments = [segment for segment in meta.path.split("/") if segment] + for segment in segments: + if segment.startswith("{") and segment.endswith("}"): + continue + token = _normalize_token(segment) + if token: + return token + return "root" + + +def _name_group_key(name: str) -> str | None: + normalized = name.strip().lower() + for prefix in _NAME_PREFIXES: + if normalized.startswith(prefix): + normalized = normalized[len(prefix) :] + break + + tokens = [token for token in slugify(normalized, separator=" ").split() if token] + while tokens and tokens[0] in {"and", "but", "given", "then", "when"}: + tokens = tokens[1:] + + if not tokens: + return None + return tokens[0] + + +def _assign_git_items( + groups: dict[str, _Group], git_items: list[DiscoveredItem] +) -> None: + pending: dict[str, list[DiscoveredItem]] = {} + + for git_item in git_items: + best_key = _best_group_key_for_git(groups, git_item) + if best_key is not None: + _append_to_group(groups, best_key, git_item) + continue + + bucket_key = _git_bucket_key(git_item) + pending.setdefault(bucket_key, []).append(git_item) + + for bucket_key, bucket_items in pending.items(): + if len(bucket_items) >= 3: + for git_item in bucket_items: + _append_to_group(groups, bucket_key, git_item) + continue + + for git_item in bucket_items: + _append_to_group( + groups, _fallback_group_for_git(groups, git_item), git_item + ) + + +def _best_group_key_for_git( + groups: dict[str, _Group], git_item: DiscoveredItem +) -> str | None: + if not groups: + return None + + meta = git_item.typed_meta() + if not isinstance(meta, GitCommitMeta): + return None + + best_key: str | None = None + best_score = 0 + best_size = -1 + + for key, group in groups.items(): + score = _overlap_score(meta.file_prefixes, group.path_signals) + size = len(group.items) + if score == 0: + continue + if score > best_score or (score == best_score and size > best_size): + best_key = key + best_score = score + best_size = size + + return best_key + + +def _overlap_score(prefixes: list[str], signals: set[str]) -> int: + score = 0 + for prefix in prefixes: + prefix_signal = prefix.strip("/") + if not prefix_signal: + continue + for signal in signals: + if signal.startswith(prefix_signal) or prefix_signal.startswith(signal): + score += 1 + return score + + +def _git_bucket_key(git_item: DiscoveredItem) -> str: + meta = git_item.typed_meta() + if isinstance(meta, GitCommitMeta) and meta.file_prefixes: + first = meta.file_prefixes[0].split("/", maxsplit=1)[0] + token = _normalize_token(first) + if token: + return token + + fallback = _name_group_key(git_item.name) + return fallback or "misc" + + +def _fallback_group_for_git(groups: dict[str, _Group], git_item: DiscoveredItem) -> str: + named_key = _name_group_key(git_item.name) + if named_key and named_key in groups: + return named_key + + if groups: + return max(groups.items(), key=lambda pair: len(pair[1].items))[0] + return "misc" + + +def _append_to_group(groups: dict[str, _Group], key: str, item: DiscoveredItem) -> None: + group = groups.get(key) + if group is None: + group = _Group(key=key) + groups[key] = group + + group.items.append(item) + group.path_signals.update(_path_signals_for_item(item)) + + +def _path_signals_for_item(item: DiscoveredItem) -> set[str]: + if item.file_path is None: + return set() + + as_posix = item.file_path.as_posix().strip("/") + if not as_posix: + return set() + + parts = as_posix.split("/") + signals: set[str] = set() + for index in range(1, len(parts) + 1): + signals.add("/".join(parts[:index])) + parent = item.file_path.parent.as_posix() + if parent != ".": + signals.add(parent) + return signals + + +def _group_to_feature(group: _Group) -> DraftFeature: + expanded_label = _expand_abbreviations(group.key) + feature_id = slugify(expanded_label, lowercase=True) + if not feature_id: + feature_id = "misc" + + feature_name = expanded_label.title() + scenarios = [ + _item_to_scenario(item, index) for index, item in enumerate(group.items) + ] + + return DraftFeature( + feature_id=feature_id, + name=feature_name, + scenarios=scenarios, + source_items=list(group.items), + confidence=_group_confidence(group.items), + ) + + +def _item_to_scenario(item: DiscoveredItem, index: int) -> DraftScenario: + label = _scenario_label(item) + title = slugify(label, lowercase=True) or f"scenario-{index + 1}" + + return DraftScenario( + title=title, + steps=[ + SpecStep(type=StepType.GIVEN, description=f"context for {label}"), + SpecStep(type=StepType.WHEN, description=f"action for {label}"), + SpecStep(type=StepType.THEN, description=f"outcome for {label}"), + ], + source_items=[item], + ) + + +def _scenario_label(item: DiscoveredItem) -> str: + if item.kind is ItemKind.API_ROUTE: + meta = item.typed_meta() + if isinstance(meta, ApiRouteMeta): + method = ( + ", ".join(meta.http_method) + if isinstance(meta.http_method, list) + else meta.http_method + ) + return f"{method} {meta.path}".strip() + + candidate = item.name.replace("_", " ").replace("-", " ").strip() + for prefix in ("test ", "it "): + if candidate.lower().startswith(prefix): + candidate = candidate[len(prefix) :] + break + return candidate or "scenario" + + +def _group_confidence(items: list[DiscoveredItem]) -> float: + score = 0.5 + if len({item.kind for item in items}) >= 2: + score += 0.2 + if any(item.kind is ItemKind.DOCSTRING for item in items): + score += 0.1 + if any(item.kind is ItemKind.GIT_COMMIT for item in items): + score += 0.1 + return min(score, 1.0) + + +def _expand_abbreviations(key: str) -> str: + tokens = [token for token in slugify(key, separator=" ").split() if token] + if not tokens: + return "misc" + + expanded = [_ABBREVIATION_EXPANSIONS.get(token, token) for token in tokens] + return " ".join(expanded) + + +def _normalize_token(value: str) -> str: + return slugify(value, lowercase=True) diff --git a/tests/discovery/test_grouping.py b/tests/discovery/test_grouping.py new file mode 100644 index 0000000..e7c2f6a --- /dev/null +++ b/tests/discovery/test_grouping.py @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for phase-2 feature grouping.""" + +from __future__ import annotations + +from collections import Counter +from pathlib import Path +from typing import Any + +import pytest + +from specleft.discovery.grouping import group_items +from specleft.discovery.models import ( + ApiRouteMeta, + DiscoveredItem, + DocstringMeta, + GitCommitMeta, + ItemKind, + SupportedLanguage, + TestFunctionMeta, +) + + +def _test_item(name: str, file_path: Path) -> DiscoveredItem: + return DiscoveredItem( + kind=ItemKind.TEST_FUNCTION, + name=name, + file_path=file_path, + line_number=1, + language=SupportedLanguage.PYTHON, + raw_text=None, + metadata=TestFunctionMeta(framework="pytest").model_dump(), + confidence=0.9, + ) + + +def _route_item(name: str, path: str, file_path: Path) -> DiscoveredItem: + return DiscoveredItem( + kind=ItemKind.API_ROUTE, + name=name, + file_path=file_path, + line_number=1, + language=SupportedLanguage.PYTHON, + raw_text=None, + metadata=ApiRouteMeta( + http_method="GET", + path=path, + framework="fastapi", + ).model_dump(), + confidence=0.9, + ) + + +def _doc_item(name: str, file_path: Path) -> DiscoveredItem: + return DiscoveredItem( + kind=ItemKind.DOCSTRING, + name=name, + file_path=file_path, + line_number=1, + language=SupportedLanguage.PYTHON, + raw_text=name, + metadata=DocstringMeta( + target_kind="module", + target_name=file_path.stem, + text=name, + ).model_dump(), + confidence=0.8, + ) + + +def _git_item(subject: str, file_prefixes: list[str]) -> DiscoveredItem: + return DiscoveredItem( + kind=ItemKind.GIT_COMMIT, + name=subject, + file_path=None, + line_number=None, + language=None, + raw_text=None, + metadata=GitCommitMeta( + commit_hash="abc1234", + subject=subject, + changed_files=[f"{prefix}/file.py" for prefix in file_prefixes], + file_prefixes=file_prefixes, + ).model_dump(), + confidence=0.5, + ) + + +def _feature_for_item(features: list[Any], item: DiscoveredItem) -> Any: + return next( + feature + for feature in features + if any(source is item for source in feature.source_items) + ) + + +def test_items_from_auth_directory_land_in_single_group() -> None: + items = [ + _test_item(f"test_auth_case_{index}", Path(f"tests/auth/test_case_{index}.py")) + for index in range(10) + ] + + features = group_items(items) + + assert len(features) == 1 + assert features[0].feature_id == "authentication" + assert len(features[0].source_items) == 10 + + +def test_api_routes_group_by_first_path_segment() -> None: + payment_get = _route_item( + "GET /payments/{id}", + "/payments/{id}", + Path("src/api/payments.py"), + ) + payment_post = _route_item( + "POST /payments", + "/payments", + Path("src/api/payments_write.py"), + ) + users_get = _route_item("GET /users/{id}", "/users/{id}", Path("src/api/users.py")) + + features = group_items([payment_get, payment_post, users_get]) + + payment_feature = _feature_for_item(features, payment_get) + assert payment_feature is _feature_for_item(features, payment_post) + assert payment_feature.feature_id == "payments" + assert _feature_for_item(features, users_get).feature_id == "users" + + +def test_every_item_is_assigned_exactly_once() -> None: + auth_test = _test_item("test_auth_valid", Path("tests/auth/test_login.py")) + auth_doc = _doc_item("Authentication module", Path("src/auth/service.py")) + billing_route = _route_item("GET /billing", "/billing", Path("src/api/billing.py")) + git_auth = _git_item("feat: auth hardening", ["tests/auth", "src/auth"]) + git_billing = _git_item("feat: billing endpoint", ["src/api/billing"]) + items = [auth_test, auth_doc, billing_route, git_auth, git_billing] + + features = group_items(items) + assigned = [item for feature in features for item in feature.source_items] + + counts = Counter(id(item) for item in assigned) + assert len(assigned) == len(items) + assert len(counts) == len(items) + assert all(count == 1 for count in counts.values()) + + +def test_git_items_merge_into_nearest_existing_group() -> None: + auth_test = _test_item("test_auth_login", Path("tests/auth/test_login.py")) + billing_test = _test_item( + "test_invoice_paid", Path("tests/billing/test_invoice.py") + ) + git_auth = _git_item("feat: improve auth", ["tests/auth", "src/auth"]) + git_billing = _git_item("feat: tighten invoices", ["tests/billing"]) + + features = group_items([auth_test, billing_test, git_auth, git_billing]) + + assert len(features) == 2 + assert git_auth in _feature_for_item(features, auth_test).source_items + assert git_billing in _feature_for_item(features, billing_test).source_items + + +def test_grouping_uses_typed_meta_for_api_and_git( + monkeypatch: pytest.MonkeyPatch, +) -> None: + api_item = _route_item("GET /users", "/users", Path("src/api/users.py")) + git_item = _git_item("feat: users", ["src/api/users"]) + + calls = {"api": 0, "git": 0} + original = DiscoveredItem.typed_meta + + def _spy(self: DiscoveredItem) -> Any: + meta = original(self) + if self.kind is ItemKind.API_ROUTE: + calls["api"] += 1 + if self.kind is ItemKind.GIT_COMMIT: + calls["git"] += 1 + return meta + + monkeypatch.setattr(DiscoveredItem, "typed_meta", _spy) + + group_items([api_item, git_item]) + + assert calls["api"] >= 1 + assert calls["git"] >= 1 + + +def test_single_item_group_is_valid() -> None: + solo = _doc_item("single docstring", Path("src/single/service.py")) + + features = group_items([solo]) + + assert len(features) == 1 + assert len(features[0].source_items) == 1 + + +def test_unmatched_git_items_form_group_when_three_share_prefix() -> None: + git_a = _git_item("feat: payments workflow", ["payments/core"]) + git_b = _git_item("fix: payments retries", ["payments/jobs"]) + git_c = _git_item("refactor: payments ledger", ["payments/ledger"]) + + features = group_items([git_a, git_b, git_c]) + + assert len(features) == 1 + assert features[0].feature_id == "payments" + assert len(features[0].source_items) == 3 + + +def test_confidence_scores_include_kind_docstring_and_git_bonuses() -> None: + auth_test = _test_item("test_auth_login", Path("tests/auth/test_login.py")) + auth_doc = _doc_item("Authentication service", Path("src/auth/service.py")) + auth_git = _git_item("feat: auth service", ["tests/auth", "src/auth"]) + + feature = _feature_for_item(group_items([auth_test, auth_doc, auth_git]), auth_test) + + assert feature.confidence == pytest.approx(0.9)