diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md new file mode 100644 index 0000000..623634a --- /dev/null +++ b/features/feature-spec-discovery.md @@ -0,0 +1,45 @@ +# Feature Spec: Discovery language registry and file indexing + +## Purpose +Add shared discovery infrastructure for Issue #125: centralized parser abstraction and one-pass filesystem indexing. + +## User Stories + +### Story 1: Shared language detection and parsing +**Scenario:** As a discovery miner, I need a single abstraction to detect and parse language files. +**Given** a project file path +**When** I call `LanguageRegistry().detect_language(path)` +**Then** it should map supported extensions to a `SupportedLanguage` enum value and return `None` for unsupported files. + +**Scenario:** As a discovery miner, I need resilient parsing. +**Given** a supported file path and `LanguageRegistry` +**When** parsing succeeds +**Then** `parse(path)` returns `(root_node, detected_language)`. +**And** when parse fails or content is corrupt +**Then** `parse(path)` returns `None` without raising. + +### Story 2: Shared file indexing +**Scenario:** As a discovery pipeline, I need to avoid repeated filesystem walks. +**Given** a `FileIndex` built for the repository +**Then** miners can query `files_by_language`, `files_by_extension`, `files_matching`, and `files_under`. + +**Scenario:** As a pipeline maintainer, I need noisy directories excluded consistently. +**Given** directories in `DEFAULT_EXCLUDE_DIRS` +**Then** those paths are never returned by index lookups. + +### Story 3: Project language signal +**Scenario:** As downstream planning logic, I need a low-cost language signal. +**Given** a populated `FileIndex` +**When** calling `detect_project_languages(index)` +**Then** it returns detected languages above the ratio threshold, computed against total indexed files. 
+ +## Acceptance Criteria +- Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. +- `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input. +- `LanguageRegistry().parse(path_to_ts_file)` returns `(node, SupportedLanguage.TYPESCRIPT)` for valid TypeScript input. +- Corrupt file content returns `None` without raising. +- Grammar/parser handling is cached and does not recreate parser objects per call. +- `FileIndex` builds once per root and exposes query helpers used by miners. +- `detect_project_languages()` thresholds are applied against total indexed files, not only supported-language files. +- Tests cover registry parsing, caching behavior, index filtering, and language detection thresholding. +- Feature spec is updated to document the new discovery layer behavior for issue #125. diff --git a/pyproject.toml b/pyproject.toml index 536b319..9563853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,9 @@ dependencies = [ "python-frontmatter>=1.0.0", "python-slugify>=8.0.0", "pyyaml>=6.0.0", + "tree-sitter>=0.23", + "tree-sitter-python>=0.23", + "tree-sitter-typescript>=0.23", ] keywords=[ "ai", diff --git a/src/specleft/discovery/__init__.py b/src/specleft/discovery/__init__.py index 5ef8125..75b2a66 100644 --- a/src/specleft/discovery/__init__.py +++ b/src/specleft/discovery/__init__.py @@ -1,3 +1,7 @@ """Discovery models and infrastructure package.""" from specleft.discovery.models import * # noqa: F401,F403 + +from specleft.discovery.file_index import DEFAULT_EXCLUDE_DIRS, FileIndex +from specleft.discovery.language_detect import detect_project_languages +from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS, LanguageRegistry diff --git a/src/specleft/discovery/file_index.py b/src/specleft/discovery/file_index.py new file mode 100644 index 0000000..43a1911 --- /dev/null +++ 
b/src/specleft/discovery/file_index.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Filesystem abstraction for discovery miners.""" + +from __future__ import annotations + +import fnmatch +import os +from pathlib import Path + +from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS +from specleft.discovery.models import SupportedLanguage + +DEFAULT_EXCLUDE_DIRS: frozenset[str] = frozenset( + { + ".git", + "node_modules", + "__pycache__", + ".venv", + "venv", + "dist", + "build", + ".next", + ".mypy_cache", + ".pytest_cache", + ".tox", + ".eggs", + "*.egg-info", + } +) + + +class FileIndex: + """Walk the repository once and provide filtered views.""" + + def __init__( + self, + root: Path, + exclude_dirs: frozenset[str] = DEFAULT_EXCLUDE_DIRS, + ) -> None: + self._root = root + self._files: list[Path] = [] + self._by_language: dict[SupportedLanguage, list[Path]] = {} + self._by_extension: dict[str, list[Path]] = {} + self._exclude_dirs = exclude_dirs + self._build() + + @property + def root(self) -> Path: + """Project root.""" + return self._root + + @property + def total_files(self) -> int: + """Number of discovered files.""" + return len(self._files) + + def files_by_language(self, lang: SupportedLanguage) -> list[Path]: + """Return all files for a language.""" + return sorted( + self._by_language.get(lang, []), + key=lambda value: value.as_posix(), + ) + + def files_by_extension(self, *exts: str) -> list[Path]: + """Return files matching any extension.""" + output: list[Path] = [] + for ext in exts: + output.extend(self._by_extension.get(ext.lower(), [])) + return sorted(output, key=lambda value: value.as_posix()) + + def files_matching(self, *patterns: str) -> list[Path]: + """Return files whose names match any glob pattern.""" + matched: list[Path] = [] + for file_path in self._files: + for pattern in patterns: + if fnmatch.fnmatch(file_path.name, pattern): + matched.append(file_path) 
+ break + return sorted(matched, key=lambda value: value.as_posix()) + + def files_under(self, *dirs: str) -> list[Path]: + """Return files under the specified directory prefixes.""" + return sorted( + [ + file_path + for file_path in self._files + if any( + file_path.parts[: len(Path(prefix).parts)] + == tuple(Path(prefix).parts) + for prefix in dirs + ) + ], + key=lambda value: value.as_posix(), + ) + + def _build(self) -> None: + root = self._root.resolve() + for current_root, dirnames, filenames in os.walk(root): + dirnames.sort() + filenames.sort() + path_dirnames = list(dirnames) + filtered: list[str] = [] + for dirname in path_dirnames: + if self._is_excluded_dir(dirname): + continue + filtered.append(dirname) + dirnames[:] = filtered + + for filename in filenames: + file_path = Path(current_root, filename) + if file_path.is_dir(): + continue + rel_path = file_path.relative_to(root) + self._files.append(rel_path) + + extension = rel_path.suffix.lower() + self._by_extension.setdefault(extension, []).append(rel_path) + + language = SUPPORTED_EXTENSIONS.get(extension) + if language is not None: + self._by_language.setdefault(language, []).append(rel_path) + + def _is_excluded_dir(self, dirname: str) -> bool: + if dirname in self._exclude_dirs: + return True + return any(fnmatch.fnmatch(dirname, pattern) for pattern in self._exclude_dirs) diff --git a/src/specleft/discovery/language_detect.py b/src/specleft/discovery/language_detect.py new file mode 100644 index 0000000..794ae1b --- /dev/null +++ b/src/specleft/discovery/language_detect.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Language detection helpers over a prebuilt file index.""" + +from __future__ import annotations + +from specleft.discovery.file_index import FileIndex +from specleft.discovery.models import SupportedLanguage + + +def detect_project_languages( + file_index: FileIndex, + threshold: float = 0.01, +) -> 
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Language detection helpers over a prebuilt file index."""

from __future__ import annotations

from specleft.discovery.file_index import FileIndex
from specleft.discovery.models import SupportedLanguage


def detect_project_languages(
    file_index: FileIndex,
    threshold: float = 0.01,
) -> list[SupportedLanguage]:
    """Return languages whose file ratio meets or exceeds ``threshold``.

    The ratio for a language is the number of indexed files detected as
    that language divided by the *total* number of indexed files (not
    only the supported-language files), so prose- or config-heavy
    repositories do not inflate the signal.

    Args:
        file_index: A populated :class:`FileIndex` for the project.
        threshold: Minimum file ratio (inclusive — the comparison is
            ``>=``) for a language to be reported.  Defaults to 1%.

    Returns:
        Detected languages in ``SupportedLanguage`` declaration order;
        empty when the index contains no files at all.
    """
    total = file_index.total_files
    if total == 0:
        return []
    return [
        language
        for language in SupportedLanguage
        if len(file_index.files_by_language(language)) / total >= threshold
    ]
+ """ + language = self.detect_language(file_path) + if language is None: + return None + + try: + source = file_path.read_bytes() + except OSError: + return None + + try: + root_node = self.parse_source(source, language) + except Exception: + return None + + if root_node is None: + return None + return root_node, language + + def parse_source(self, source: bytes, language: SupportedLanguage) -> Any | None: + """Parse raw source bytes directly and return tree root node.""" + parser = self._parser_for(language) + if parser is None: + return None + + try: + tree = parser.parse(source) + except Exception: + return None + + root_node = tree.root_node + if getattr(root_node, "has_error", False): + return None + + return root_node + + def _parser_for(self, language: SupportedLanguage) -> Any | None: + parser = self._parser_cache.get(language) + if parser is not None: + return parser + + language_obj = self._language_for(language) + if language_obj is None: + return None + + try: + from tree_sitter import Parser # type: ignore[import-untyped] + + parser = Parser() + parser.set_language(language_obj) + self._parser_cache[language] = parser + return parser + except Exception: + return None + + def _language_for(self, language: SupportedLanguage) -> Any | None: + if language in self._language_cache: + return self._language_cache[language] + + if language == SupportedLanguage.PYTHON: + try: + import tree_sitter_python # type: ignore[import-not-found] + + language_obj = tree_sitter_python.language() + except Exception: + return None + elif language in (SupportedLanguage.TYPESCRIPT, SupportedLanguage.JAVASCRIPT): + try: + import tree_sitter_typescript # type: ignore[import-not-found] + + if language == SupportedLanguage.TYPESCRIPT: + language_obj = tree_sitter_typescript.language_typescript() + else: + language_loader = getattr( + tree_sitter_typescript, + "language_javascript", + None, + ) + if language_loader is None: + return None + language_obj = language_loader() + except 
Exception: + return None + else: + return None + + self._language_cache[language] = language_obj + return language_obj diff --git a/tests/discovery/test_file_index.py b/tests/discovery/test_file_index.py new file mode 100644 index 0000000..d4092d3 --- /dev/null +++ b/tests/discovery/test_file_index.py @@ -0,0 +1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for discovery file index abstraction.""" + +from __future__ import annotations + +from pathlib import Path + +from specleft.discovery.file_index import DEFAULT_EXCLUDE_DIRS, FileIndex +from specleft.discovery.models import SupportedLanguage + + +def _seed_tree(root: Path) -> None: + (root / "src").mkdir(parents=True) + (root / "src" / "main.py").write_text("print('ok')") + (root / "src" / "helpers.ts").write_text("const a = 1;") + + tests_dir = root / "tests" + tests_dir.mkdir() + (tests_dir / "test_auth.py").write_text("def test_ok():\n assert True\n") + (tests_dir / "test_cache_service.py").write_text( + "def test_cache():\n assert True\n" + ) + + (root / "notes.md").write_text("just notes") + + (root / ".venv").mkdir() + (root / ".venv" / "tmp.py").write_text("print('skip')") + + (root / "node_modules").mkdir() + (root / "node_modules" / "a.js").write_text("console.log(1)") + + +def test_file_index_walk_counts_and_filters(tmp_path: Path) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + assert index.root == tmp_path + assert index.total_files == 5 + assert {path.as_posix() for path in index._files} == { + "src/main.py", + "src/helpers.ts", + "tests/test_auth.py", + "tests/test_cache_service.py", + "notes.md", + } + + +def test_file_index_by_language_only_includes_supported_languages( + tmp_path: Path, +) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + assert index.files_by_language(SupportedLanguage.PYTHON) == [ + Path("src/main.py"), + Path("tests/test_auth.py"), + Path("tests/test_cache_service.py"), + ] + assert 
index.files_by_language(SupportedLanguage.TYPESCRIPT) == [ + Path("src/helpers.ts") + ] + assert index.files_by_language(SupportedLanguage.JAVASCRIPT) == [] + + +def test_files_matching_respects_patterns(tmp_path: Path) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + matches = index.files_matching("test_*.py") + assert [path.name for path in matches] == [ + "test_auth.py", + "test_cache_service.py", + ] + + +def test_files_under_returns_subset_of_root(tmp_path: Path) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + assert index.files_under("src") == [ + Path("src/helpers.ts"), + Path("src/main.py"), + ] + assert index.files_under("tests") == [ + Path("tests/test_auth.py"), + Path("tests/test_cache_service.py"), + ] + + +def test_exclude_dirs_are_skipped_by_default() -> None: + assert ".venv" in DEFAULT_EXCLUDE_DIRS + assert "node_modules" in DEFAULT_EXCLUDE_DIRS + assert "*.egg-info" in DEFAULT_EXCLUDE_DIRS diff --git a/tests/discovery/test_language_detect.py b/tests/discovery/test_language_detect.py new file mode 100644 index 0000000..bc60c20 --- /dev/null +++ b/tests/discovery/test_language_detect.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for project language detection.""" + +from __future__ import annotations + +from pathlib import Path + +from specleft.discovery.file_index import FileIndex +from specleft.discovery.language_detect import detect_project_languages +from specleft.discovery.models import SupportedLanguage + + +def test_detect_project_languages_uses_ratio_threshold(tmp_path: Path) -> None: + (tmp_path / "a.py").write_text("x=1") + (tmp_path / "b.py").write_text("x=2") + (tmp_path / "c.py").write_text("x=3") + (tmp_path / "d.ts").write_text("const x = 1;") + (tmp_path / "notes.md").write_text("notes") + + index = FileIndex(tmp_path) + + assert set(detect_project_languages(index)) == { + SupportedLanguage.PYTHON, + SupportedLanguage.TYPESCRIPT, + 
} + assert set(detect_project_languages(index, threshold=0.7)) == set() + + +def test_detect_project_languages_empty_index_returns_empty(tmp_path: Path) -> None: + (tmp_path / "notes.md").write_text("no supported files") + index = FileIndex(tmp_path) + + assert detect_project_languages(index) == [] diff --git a/tests/discovery/test_language_registry.py b/tests/discovery/test_language_registry.py new file mode 100644 index 0000000..e6e8c83 --- /dev/null +++ b/tests/discovery/test_language_registry.py @@ -0,0 +1,168 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for language registry and parser abstraction.""" + +from __future__ import annotations + +import sys +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from specleft.discovery.language_registry import LanguageRegistry, SUPPORTED_EXTENSIONS +from specleft.discovery.models import SupportedLanguage + + +def test_supported_extensions_map() -> None: + assert SUPPORTED_EXTENSIONS[".py"] == SupportedLanguage.PYTHON + assert SUPPORTED_EXTENSIONS[".ts"] == SupportedLanguage.TYPESCRIPT + assert SUPPORTED_EXTENSIONS[".tsx"] == SupportedLanguage.TYPESCRIPT + assert SUPPORTED_EXTENSIONS[".js"] == SupportedLanguage.JAVASCRIPT + + +def test_detect_language_matches_supported_extensions() -> None: + registry = LanguageRegistry() + assert registry.detect_language(Path("module.py")) == SupportedLanguage.PYTHON + assert registry.detect_language(Path("main.ts")) == SupportedLanguage.TYPESCRIPT + assert registry.detect_language(Path("client.mjs")) == SupportedLanguage.JAVASCRIPT + + +def test_detect_language_skips_unsupported_extension() -> None: + assert LanguageRegistry().detect_language(Path("notes.txt")) is None + assert LanguageRegistry().detect_language(Path("script.rb")) is None + + +@pytest.mark.parametrize( + ("filename", "expected_language"), + [ + ("sample.py", SupportedLanguage.PYTHON), + ("sample.ts", 
SupportedLanguage.TYPESCRIPT), + ], +) +def test_parse_uses_parse_source_and_returns_detected_language( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + filename: str, + expected_language: SupportedLanguage, +) -> None: + path = tmp_path / filename + path.write_text("x = 1") + + registry = LanguageRegistry() + monkeypatch.setattr( + registry, + "parse_source", + lambda _source, _language: "fake-root", + ) + + result = registry.parse(path) + assert result is not None + root_node, language = result + assert root_node == "fake-root" + assert language == expected_language + + +def test_parse_returns_none_for_unsupported_extension(tmp_path: Path) -> None: + path = tmp_path / "notes.txt" + path.write_text("x") + + assert LanguageRegistry().parse(path) is None + + +def test_parse_returns_none_on_parse_error( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + path = tmp_path / "sample.ts" + path.write_text("const x = 1;") + + registry = LanguageRegistry() + + def broken(source: bytes, _language: SupportedLanguage) -> Any: + raise RuntimeError("boom") + + monkeypatch.setattr(registry, "parse_source", broken) + assert registry.parse(path) is None + + +def test_parse_returns_none_on_corrupt_content( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + path = tmp_path / "corrupt.py" + path.write_bytes(b"\x80\x81\x82") + + registry = LanguageRegistry() + + class FakeParser: + def parse(self, _source: bytes) -> SimpleNamespace: + return SimpleNamespace(root_node=SimpleNamespace(has_error=True)) + + monkeypatch.setattr(registry, "_parser_for", lambda _language: FakeParser()) + assert registry.parse(path) is None + + +def test_parser_and_language_are_cached(monkeypatch: pytest.MonkeyPatch) -> None: + registry = LanguageRegistry() + calls: dict[str, int] = {"language": 0} + + def fake_language_for(self: LanguageRegistry, language: SupportedLanguage) -> str: + calls["language"] += 1 + return f"fake-{language.value}" + + class FakeParser: + def 
__init__(self) -> None: + self.language: str | None = None + + def set_language(self, language: str) -> None: + self.language = language + + def parse(self, source: bytes) -> SimpleNamespace: + return SimpleNamespace(root_node=f"root({source!s})") + + parser = FakeParser() + + def fake_parser_class() -> FakeParser: + return parser + + monkeypatch.setattr(LanguageRegistry, "_language_for", fake_language_for) + + fake_tree_sitter = SimpleNamespace(Parser=fake_parser_class) + monkeypatch.setitem(sys.modules, "tree_sitter", fake_tree_sitter) + + first = registry._parser_for(SupportedLanguage.PYTHON) + second = registry._parser_for(SupportedLanguage.PYTHON) + + assert first is second + assert first is parser + assert calls["language"] == 1 + + +def test_javascript_uses_javascript_grammar_loader( + monkeypatch: pytest.MonkeyPatch, +) -> None: + registry = LanguageRegistry() + calls: dict[str, int] = {"ts": 0, "js": 0} + + def fake_ts_loader() -> str: + calls["ts"] += 1 + return "ts-language" + + def fake_js_loader() -> str: + calls["js"] += 1 + return "js-language" + + fake_typescript_module = SimpleNamespace( + language_typescript=fake_ts_loader, + language_javascript=fake_js_loader, + ) + monkeypatch.setitem(sys.modules, "tree_sitter_typescript", fake_typescript_module) + + js_language = registry._language_for(SupportedLanguage.JAVASCRIPT) + ts_language = registry._language_for(SupportedLanguage.TYPESCRIPT) + + assert js_language == "js-language" + assert ts_language == "ts-language" + assert calls["js"] == 1 + assert calls["ts"] == 1 diff --git a/tests/fixtures/discovery/sample.py b/tests/fixtures/discovery/sample.py new file mode 100644 index 0000000..72e9097 --- /dev/null +++ b/tests/fixtures/discovery/sample.py @@ -0,0 +1,12 @@ +"""Sample Python module for discovery tests.""" + + +def add_numbers(a: int, b: int) -> int: + """Add two integers.""" + + return a + b + + +class Calculator: + def multiply(self, a: int, b: int) -> int: + return a * b diff --git 
a/tests/fixtures/discovery/sample.ts b/tests/fixtures/discovery/sample.ts new file mode 100644 index 0000000..7d5e5aa --- /dev/null +++ b/tests/fixtures/discovery/sample.ts @@ -0,0 +1,11 @@ +// Sample TypeScript module for discovery tests + +export function addNumbers(a: number, b: number): number { + return a + b; +} + +export class Calculator { + multiply(a: number, b: number): number { + return a * b; + } +} diff --git a/tests/fixtures/discovery/sample_api.py b/tests/fixtures/discovery/sample_api.py new file mode 100644 index 0000000..1933559 --- /dev/null +++ b/tests/fixtures/discovery/sample_api.py @@ -0,0 +1,11 @@ +"""Sample API module for discovery tests.""" + + +def get_user(user_id: int) -> dict[str, int]: + """Return a fake user payload.""" + + return {"id": user_id} + + +def create_user(payload: dict[str, str]) -> dict[str, str]: + return payload diff --git a/tests/fixtures/discovery/sample_api.ts b/tests/fixtures/discovery/sample_api.ts new file mode 100644 index 0000000..583ed2b --- /dev/null +++ b/tests/fixtures/discovery/sample_api.ts @@ -0,0 +1,9 @@ +// Sample API module for discovery tests + +export function getUser(userId: number): {id: number} { + return { id: userId }; +} + +export function createUser(payload: {name: string}): {name: string} { + return payload; +} diff --git a/tests/fixtures/discovery/sample_tests.py b/tests/fixtures/discovery/sample_tests.py new file mode 100644 index 0000000..4a33762 --- /dev/null +++ b/tests/fixtures/discovery/sample_tests.py @@ -0,0 +1,14 @@ +"""Sample tests for discovery validation fixtures.""" + + +def test_add(): + assert (1 + 1) == 2 + + +def test_parametrized(): + assert 2 > 1 + + +class TestMath: + def test_subtract(self): + assert 3 - 1 == 2 diff --git a/tests/fixtures/discovery/sample_tests.ts b/tests/fixtures/discovery/sample_tests.ts new file mode 100644 index 0000000..c399745 --- /dev/null +++ b/tests/fixtures/discovery/sample_tests.ts @@ -0,0 +1,9 @@ +// Sample TypeScript tests for discovery 
fixtures + +describe('math', () => { + it('adds', () => { + expect(1 + 1).toBe(2); + }); + + it.todo('will be implemented later'); +});