From b3e09a96ca743dc3ce6be2db63067b238e5082e0 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 15:10:30 +0000 Subject: [PATCH 1/3] Add tree-sitter discovery foundation (#125) --- features/feature-spec-discovery.md | 41 +++++++ pyproject.toml | 3 + src/specleft/discovery/__init__.py | 4 + src/specleft/discovery/file_index.py | 128 ++++++++++++++++++++ src/specleft/discovery/language_detect.py | 29 +++++ src/specleft/discovery/language_registry.py | 111 +++++++++++++++++ tests/discovery/test_file_index.py | 95 +++++++++++++++ tests/discovery/test_language_detect.py | 37 ++++++ tests/discovery/test_language_registry.py | 113 +++++++++++++++++ tests/fixtures/discovery/sample.py | 12 ++ tests/fixtures/discovery/sample.ts | 11 ++ tests/fixtures/discovery/sample_api.py | 11 ++ tests/fixtures/discovery/sample_api.ts | 9 ++ tests/fixtures/discovery/sample_tests.py | 14 +++ tests/fixtures/discovery/sample_tests.ts | 9 ++ 15 files changed, 627 insertions(+) create mode 100644 features/feature-spec-discovery.md create mode 100644 src/specleft/discovery/file_index.py create mode 100644 src/specleft/discovery/language_detect.py create mode 100644 src/specleft/discovery/language_registry.py create mode 100644 tests/discovery/test_file_index.py create mode 100644 tests/discovery/test_language_detect.py create mode 100644 tests/discovery/test_language_registry.py create mode 100644 tests/fixtures/discovery/sample.py create mode 100644 tests/fixtures/discovery/sample.ts create mode 100644 tests/fixtures/discovery/sample_api.py create mode 100644 tests/fixtures/discovery/sample_api.ts create mode 100644 tests/fixtures/discovery/sample_tests.py create mode 100644 tests/fixtures/discovery/sample_tests.ts diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md new file mode 100644 index 0000000..3d00369 --- /dev/null +++ b/features/feature-spec-discovery.md @@ -0,0 +1,41 @@ +# Feature Spec: Discovery language registry and file indexing + 
+## Purpose +Add shared discovery infrastructure for Issue #125: centralized parser abstraction and one-pass filesystem indexing. + +## User Stories + +### Story 1: Shared language detection and parsing +**Scenario:** As a discovery miner, I need a single abstraction to detect and parse language files. +**Given** a project file path +**When** I call `LanguageRegistry().detect_language(path)` +**Then** it should map supported extensions to a `SupportedLanguage` enum value and return `None` for unsupported files. + +**Scenario:** As a discovery miner, I need resilient parsing. +**Given** a supported file path and `LanguageRegistry` +**When** parsing succeeds +**Then** `parse(path)` returns `(root_node, detected_language)`. +**And when** parse fails or content is corrupt +**Then** `parse(path)` returns `None` without raising. + +### Story 2: Shared file indexing +**Scenario:** As a discovery pipeline, I need to avoid repeated filesystem walks. +**Given** a `FileIndex` built for the repository +**Then** miners can query `files_by_language`, `files_by_extension`, `files_matching`, and `files_under`. + +**Scenario:** As a pipeline maintainer, I need noisy directories excluded consistently. +**Given** directories in `DEFAULT_EXCLUDE_DIRS` +**Then** those paths are never returned by index lookups. + +### Story 3: Project language signal +**Scenario:** As downstream planning logic, I need a low-cost language signal. +**Given** a populated `FileIndex` +**When** calling `detect_project_languages(index)` +**Then** it returns detected languages above the ratio threshold. + +## Acceptance Criteria +- Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. +- Grammar/parser handling is cached and does not recreate parser objects per call. +- `FileIndex` builds once per root and exposes query helpers used by miners. 
+- Tests cover registry parsing, caching behavior, index filtering, and language detection thresholding. +- Feature spec is updated to document the new discovery layer behavior for issue #125. diff --git a/pyproject.toml b/pyproject.toml index 536b319..9563853 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,9 @@ dependencies = [ "python-frontmatter>=1.0.0", "python-slugify>=8.0.0", "pyyaml>=6.0.0", + "tree-sitter>=0.23", + "tree-sitter-python>=0.23", + "tree-sitter-typescript>=0.23", ] keywords=[ "ai", diff --git a/src/specleft/discovery/__init__.py b/src/specleft/discovery/__init__.py index 5ef8125..75b2a66 100644 --- a/src/specleft/discovery/__init__.py +++ b/src/specleft/discovery/__init__.py @@ -1,3 +1,7 @@ """Discovery models and infrastructure package.""" from specleft.discovery.models import * # noqa: F401,F403 + +from specleft.discovery.file_index import DEFAULT_EXCLUDE_DIRS, FileIndex +from specleft.discovery.language_detect import detect_project_languages +from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS, LanguageRegistry diff --git a/src/specleft/discovery/file_index.py b/src/specleft/discovery/file_index.py new file mode 100644 index 0000000..43a1911 --- /dev/null +++ b/src/specleft/discovery/file_index.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Filesystem abstraction for discovery miners.""" + +from __future__ import annotations + +import fnmatch +import os +from pathlib import Path + +from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS +from specleft.discovery.models import SupportedLanguage + +DEFAULT_EXCLUDE_DIRS: frozenset[str] = frozenset( + { + ".git", + "node_modules", + "__pycache__", + ".venv", + "venv", + "dist", + "build", + ".next", + ".mypy_cache", + ".pytest_cache", + ".tox", + ".eggs", + "*.egg-info", + } +) + + +class FileIndex: + """Walk the repository once and provide filtered views.""" + + def __init__( + self, + 
root: Path, + exclude_dirs: frozenset[str] = DEFAULT_EXCLUDE_DIRS, + ) -> None: + self._root = root + self._files: list[Path] = [] + self._by_language: dict[SupportedLanguage, list[Path]] = {} + self._by_extension: dict[str, list[Path]] = {} + self._exclude_dirs = exclude_dirs + self._build() + + @property + def root(self) -> Path: + """Project root.""" + return self._root + + @property + def total_files(self) -> int: + """Number of discovered files.""" + return len(self._files) + + def files_by_language(self, lang: SupportedLanguage) -> list[Path]: + """Return all files for a language.""" + return sorted( + self._by_language.get(lang, []), + key=lambda value: value.as_posix(), + ) + + def files_by_extension(self, *exts: str) -> list[Path]: + """Return files matching any extension.""" + output: list[Path] = [] + for ext in exts: + output.extend(self._by_extension.get(ext.lower(), [])) + return sorted(output, key=lambda value: value.as_posix()) + + def files_matching(self, *patterns: str) -> list[Path]: + """Return files whose names match any glob pattern.""" + matched: list[Path] = [] + for file_path in self._files: + for pattern in patterns: + if fnmatch.fnmatch(file_path.name, pattern): + matched.append(file_path) + break + return sorted(matched, key=lambda value: value.as_posix()) + + def files_under(self, *dirs: str) -> list[Path]: + """Return files under the specified directory prefixes.""" + return sorted( + [ + file_path + for file_path in self._files + if any( + file_path.parts[: len(Path(prefix).parts)] + == tuple(Path(prefix).parts) + for prefix in dirs + ) + ], + key=lambda value: value.as_posix(), + ) + + def _build(self) -> None: + root = self._root.resolve() + for current_root, dirnames, filenames in os.walk(root): + dirnames.sort() + filenames.sort() + path_dirnames = list(dirnames) + filtered: list[str] = [] + for dirname in path_dirnames: + if self._is_excluded_dir(dirname): + continue + filtered.append(dirname) + dirnames[:] = filtered + + for 
filename in filenames: + file_path = Path(current_root, filename) + if file_path.is_dir(): + continue + rel_path = file_path.relative_to(root) + self._files.append(rel_path) + + extension = rel_path.suffix.lower() + self._by_extension.setdefault(extension, []).append(rel_path) + + language = SUPPORTED_EXTENSIONS.get(extension) + if language is not None: + self._by_language.setdefault(language, []).append(rel_path) + + def _is_excluded_dir(self, dirname: str) -> bool: + if dirname in self._exclude_dirs: + return True + return any(fnmatch.fnmatch(dirname, pattern) for pattern in self._exclude_dirs) diff --git a/src/specleft/discovery/language_detect.py b/src/specleft/discovery/language_detect.py new file mode 100644 index 0000000..2c1ecae --- /dev/null +++ b/src/specleft/discovery/language_detect.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Language detection helpers over a prebuilt file index.""" + +from __future__ import annotations + +from specleft.discovery.file_index import FileIndex +from specleft.discovery.models import SupportedLanguage + + +def detect_project_languages( + file_index: FileIndex, + threshold: float = 0.01, +) -> list[SupportedLanguage]: + """Return languages whose supported-file ratio exceeds the given threshold.""" + supported_total = sum( + len(file_index.files_by_language(language)) for language in SupportedLanguage + ) + if supported_total == 0: + return [] + + detected: list[SupportedLanguage] = [] + for language in SupportedLanguage: + language_files = file_index.files_by_language(language) + ratio = len(language_files) / supported_total + if ratio >= threshold: + detected.append(language) + return detected diff --git a/src/specleft/discovery/language_registry.py b/src/specleft/discovery/language_registry.py new file mode 100644 index 0000000..2d837f3 --- /dev/null +++ b/src/specleft/discovery/language_registry.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
Copyright (c) 2026 SpecLeft Contributors + +"""Language detection and parser abstractions for discovery miners.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from specleft.discovery.models import SupportedLanguage + +SUPPORTED_EXTENSIONS: dict[str, SupportedLanguage] = { + ".py": SupportedLanguage.PYTHON, + ".ts": SupportedLanguage.TYPESCRIPT, + ".tsx": SupportedLanguage.TYPESCRIPT, + ".js": SupportedLanguage.JAVASCRIPT, + ".jsx": SupportedLanguage.JAVASCRIPT, + ".mjs": SupportedLanguage.JAVASCRIPT, +} + + +class LanguageRegistry: + """Cache language grammars and expose shared parse operations.""" + + def __init__(self) -> None: + self._language_cache: dict[SupportedLanguage, Any] = {} + self._parser_cache: dict[SupportedLanguage, Any] = {} + + def detect_language(self, file_path: Path) -> SupportedLanguage | None: + """Return the detected language by file extension, or `None`.""" + return SUPPORTED_EXTENSIONS.get(file_path.suffix.lower()) + + def parse(self, file_path: Path) -> tuple[Any, SupportedLanguage] | None: + """Parse bytes from disk and return ``(root_node, language)``. + + Unsupported extensions, parse failures, or malformed files return ``None``. 
+ """ + language = self.detect_language(file_path) + if language is None: + return None + + try: + source = file_path.read_bytes() + except OSError: + return None + + try: + root_node = self.parse_source(source, language) + except Exception: + return None + + if root_node is None: + return None + return root_node, language + + def parse_source(self, source: bytes, language: SupportedLanguage) -> Any | None: + """Parse raw source bytes directly and return tree root node.""" + parser = self._parser_for(language) + if parser is None: + return None + + try: + tree = parser.parse(source) + except Exception: + return None + + return tree.root_node + + def _parser_for(self, language: SupportedLanguage) -> Any | None: + parser = self._parser_cache.get(language) + if parser is not None: + return parser + + language_obj = self._language_for(language) + if language_obj is None: + return None + + try: + from tree_sitter import Parser # type: ignore[import-untyped] + + parser = Parser() + parser.set_language(language_obj) + self._parser_cache[language] = parser + return parser + except Exception: + return None + + def _language_for(self, language: SupportedLanguage) -> Any | None: + if language in self._language_cache: + return self._language_cache[language] + + if language == SupportedLanguage.PYTHON: + try: + import tree_sitter_python # type: ignore[import-not-found] + + language_obj = tree_sitter_python.language() + except Exception: + return None + elif language in (SupportedLanguage.TYPESCRIPT, SupportedLanguage.JAVASCRIPT): + try: + import tree_sitter_typescript # type: ignore[import-not-found] + + language_obj = tree_sitter_typescript.language_typescript() + except Exception: + return None + else: + return None + + self._language_cache[language] = language_obj + return language_obj diff --git a/tests/discovery/test_file_index.py b/tests/discovery/test_file_index.py new file mode 100644 index 0000000..d4092d3 --- /dev/null +++ b/tests/discovery/test_file_index.py @@ -0,0 
+1,95 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for discovery file index abstraction.""" + +from __future__ import annotations + +from pathlib import Path + +from specleft.discovery.file_index import DEFAULT_EXCLUDE_DIRS, FileIndex +from specleft.discovery.models import SupportedLanguage + + +def _seed_tree(root: Path) -> None: + (root / "src").mkdir(parents=True) + (root / "src" / "main.py").write_text("print('ok')") + (root / "src" / "helpers.ts").write_text("const a = 1;") + + tests_dir = root / "tests" + tests_dir.mkdir() + (tests_dir / "test_auth.py").write_text("def test_ok():\n assert True\n") + (tests_dir / "test_cache_service.py").write_text( + "def test_cache():\n assert True\n" + ) + + (root / "notes.md").write_text("just notes") + + (root / ".venv").mkdir() + (root / ".venv" / "tmp.py").write_text("print('skip')") + + (root / "node_modules").mkdir() + (root / "node_modules" / "a.js").write_text("console.log(1)") + + +def test_file_index_walk_counts_and_filters(tmp_path: Path) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + assert index.root == tmp_path + assert index.total_files == 5 + assert {path.as_posix() for path in index._files} == { + "src/main.py", + "src/helpers.ts", + "tests/test_auth.py", + "tests/test_cache_service.py", + "notes.md", + } + + +def test_file_index_by_language_only_includes_supported_languages( + tmp_path: Path, +) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + assert index.files_by_language(SupportedLanguage.PYTHON) == [ + Path("src/main.py"), + Path("tests/test_auth.py"), + Path("tests/test_cache_service.py"), + ] + assert index.files_by_language(SupportedLanguage.TYPESCRIPT) == [ + Path("src/helpers.ts") + ] + assert index.files_by_language(SupportedLanguage.JAVASCRIPT) == [] + + +def test_files_matching_respects_patterns(tmp_path: Path) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + matches = 
index.files_matching("test_*.py") + assert [path.name for path in matches] == [ + "test_auth.py", + "test_cache_service.py", + ] + + +def test_files_under_returns_subset_of_root(tmp_path: Path) -> None: + _seed_tree(tmp_path) + index = FileIndex(tmp_path) + + assert index.files_under("src") == [ + Path("src/helpers.ts"), + Path("src/main.py"), + ] + assert index.files_under("tests") == [ + Path("tests/test_auth.py"), + Path("tests/test_cache_service.py"), + ] + + +def test_exclude_dirs_are_skipped_by_default() -> None: + assert ".venv" in DEFAULT_EXCLUDE_DIRS + assert "node_modules" in DEFAULT_EXCLUDE_DIRS + assert "*.egg-info" in DEFAULT_EXCLUDE_DIRS diff --git a/tests/discovery/test_language_detect.py b/tests/discovery/test_language_detect.py new file mode 100644 index 0000000..37fae28 --- /dev/null +++ b/tests/discovery/test_language_detect.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for project language detection.""" + +from __future__ import annotations + +from pathlib import Path + +from specleft.discovery.file_index import FileIndex +from specleft.discovery.language_detect import detect_project_languages +from specleft.discovery.models import SupportedLanguage + + +def test_detect_project_languages_uses_ratio_threshold(tmp_path: Path) -> None: + (tmp_path / "a.py").write_text("x=1") + (tmp_path / "b.py").write_text("x=2") + (tmp_path / "c.py").write_text("x=3") + (tmp_path / "d.ts").write_text("const x = 1;") + (tmp_path / "notes.md").write_text("notes") + + index = FileIndex(tmp_path) + + assert set(detect_project_languages(index)) == { + SupportedLanguage.PYTHON, + SupportedLanguage.TYPESCRIPT, + } + assert set(detect_project_languages(index, threshold=0.7)) == { + SupportedLanguage.PYTHON + } + + +def test_detect_project_languages_empty_index_returns_empty(tmp_path: Path) -> None: + (tmp_path / "notes.md").write_text("no supported files") + index = FileIndex(tmp_path) + + assert 
detect_project_languages(index) == [] diff --git a/tests/discovery/test_language_registry.py b/tests/discovery/test_language_registry.py new file mode 100644 index 0000000..78a781a --- /dev/null +++ b/tests/discovery/test_language_registry.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for language registry and parser abstraction.""" + +from __future__ import annotations + +import sys +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from specleft.discovery.language_registry import LanguageRegistry, SUPPORTED_EXTENSIONS +from specleft.discovery.models import SupportedLanguage + + +def test_supported_extensions_map() -> None: + assert SUPPORTED_EXTENSIONS[".py"] == SupportedLanguage.PYTHON + assert SUPPORTED_EXTENSIONS[".ts"] == SupportedLanguage.TYPESCRIPT + assert SUPPORTED_EXTENSIONS[".tsx"] == SupportedLanguage.TYPESCRIPT + assert SUPPORTED_EXTENSIONS[".js"] == SupportedLanguage.JAVASCRIPT + + +def test_detect_language_matches_supported_extensions() -> None: + registry = LanguageRegistry() + assert registry.detect_language(Path("module.py")) == SupportedLanguage.PYTHON + assert registry.detect_language(Path("main.ts")) == SupportedLanguage.TYPESCRIPT + assert registry.detect_language(Path("client.mjs")) == SupportedLanguage.JAVASCRIPT + + +def test_detect_language_skips_unsupported_extension() -> None: + assert LanguageRegistry().detect_language(Path("notes.txt")) is None + assert LanguageRegistry().detect_language(Path("script.rb")) is None + + +def test_parse_uses_parse_source_and_returns_detected_language( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + path = tmp_path / "sample.py" + path.write_text("x = 1") + + registry = LanguageRegistry() + monkeypatch.setattr( + registry, + "parse_source", + lambda _source, _language: "fake-root", + ) + + result = registry.parse(path) + assert result is not None + root_node, language 
= result + assert root_node == "fake-root" + assert language == SupportedLanguage.PYTHON + + +def test_parse_returns_none_for_unsupported_extension(tmp_path: Path) -> None: + path = tmp_path / "notes.txt" + path.write_text("x") + + assert LanguageRegistry().parse(path) is None + + +def test_parse_returns_none_on_parse_error( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + path = tmp_path / "sample.ts" + path.write_text("const x = 1;") + + registry = LanguageRegistry() + + def broken(source: bytes, _language: SupportedLanguage) -> Any: + raise RuntimeError("boom") + + monkeypatch.setattr(registry, "parse_source", broken) + assert registry.parse(path) is None + + +def test_parser_and_language_are_cached(monkeypatch: pytest.MonkeyPatch) -> None: + registry = LanguageRegistry() + calls: dict[str, int] = {"language": 0} + + def fake_language_for(self: LanguageRegistry, language: SupportedLanguage) -> str: + calls["language"] += 1 + return f"fake-{language.value}" + + class FakeParser: + def __init__(self) -> None: + self.language: str | None = None + + def set_language(self, language: str) -> None: + self.language = language + + def parse(self, source: bytes) -> SimpleNamespace: + return SimpleNamespace(root_node=f"root({source!s})") + + parser = FakeParser() + + def fake_parser_class() -> FakeParser: + return parser + + monkeypatch.setattr(LanguageRegistry, "_language_for", fake_language_for) + + fake_tree_sitter = SimpleNamespace(Parser=fake_parser_class) + monkeypatch.setitem(sys.modules, "tree_sitter", fake_tree_sitter) + + first = registry._parser_for(SupportedLanguage.PYTHON) + second = registry._parser_for(SupportedLanguage.PYTHON) + + assert first is second + assert first is parser + assert calls["language"] == 1 diff --git a/tests/fixtures/discovery/sample.py b/tests/fixtures/discovery/sample.py new file mode 100644 index 0000000..72e9097 --- /dev/null +++ b/tests/fixtures/discovery/sample.py @@ -0,0 +1,12 @@ +"""Sample Python module for 
discovery tests.""" + + +def add_numbers(a: int, b: int) -> int: + """Add two integers.""" + + return a + b + + +class Calculator: + def multiply(self, a: int, b: int) -> int: + return a * b diff --git a/tests/fixtures/discovery/sample.ts b/tests/fixtures/discovery/sample.ts new file mode 100644 index 0000000..7d5e5aa --- /dev/null +++ b/tests/fixtures/discovery/sample.ts @@ -0,0 +1,11 @@ +// Sample TypeScript module for discovery tests + +export function addNumbers(a: number, b: number): number { + return a + b; +} + +export class Calculator { + multiply(a: number, b: number): number { + return a * b; + } +} diff --git a/tests/fixtures/discovery/sample_api.py b/tests/fixtures/discovery/sample_api.py new file mode 100644 index 0000000..1933559 --- /dev/null +++ b/tests/fixtures/discovery/sample_api.py @@ -0,0 +1,11 @@ +"""Sample API module for discovery tests.""" + + +def get_user(user_id: int) -> dict[str, int]: + """Return a fake user payload.""" + + return {"id": user_id} + + +def create_user(payload: dict[str, str]) -> dict[str, str]: + return payload diff --git a/tests/fixtures/discovery/sample_api.ts b/tests/fixtures/discovery/sample_api.ts new file mode 100644 index 0000000..583ed2b --- /dev/null +++ b/tests/fixtures/discovery/sample_api.ts @@ -0,0 +1,9 @@ +// Sample API module for discovery tests + +export function getUser(userId: number): {id: number} { + return { id: userId }; +} + +export function createUser(payload: {name: string}): {name: string} { + return payload; +} diff --git a/tests/fixtures/discovery/sample_tests.py b/tests/fixtures/discovery/sample_tests.py new file mode 100644 index 0000000..4a33762 --- /dev/null +++ b/tests/fixtures/discovery/sample_tests.py @@ -0,0 +1,14 @@ +"""Sample tests for discovery validation fixtures.""" + + +def test_add(): + assert (1 + 1) == 2 + + +def test_parametrized(): + assert 2 > 1 + + +class TestMath: + def test_subtract(self): + assert 3 - 1 == 2 diff --git a/tests/fixtures/discovery/sample_tests.ts 
b/tests/fixtures/discovery/sample_tests.ts new file mode 100644 index 0000000..c399745 --- /dev/null +++ b/tests/fixtures/discovery/sample_tests.ts @@ -0,0 +1,9 @@ +// Sample TypeScript tests for discovery fixtures + +describe('math', () => { + it('adds', () => { + expect(1 + 1).toBe(2); + }); + + it.todo('will be implemented later'); +}); From 1d54ae2c094297a7c9733b0b8e80d502541ed741 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 15:39:53 +0000 Subject: [PATCH 2/3] Align discovery acceptance behavior (#125) --- features/feature-spec-discovery.md | 6 +++- src/specleft/discovery/language_detect.py | 10 +++---- src/specleft/discovery/language_registry.py | 6 +++- tests/discovery/test_language_detect.py | 4 +-- tests/discovery/test_language_registry.py | 32 +++++++++++++++++++-- 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md index 3d00369..623634a 100644 --- a/features/feature-spec-discovery.md +++ b/features/feature-spec-discovery.md @@ -31,11 +31,15 @@ Add shared discovery infrastructure for Issue #125: centralized parser abstracti **Scenario:** As downstream planning logic, I need a low-cost language signal. **Given** a populated `FileIndex` **When** calling `detect_project_languages(index)` -**Then** it returns detected languages above the ratio threshold. +**Then** it returns detected languages above the ratio threshold, computed against total indexed files. ## Acceptance Criteria - Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. +- `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input. +- `LanguageRegistry().parse(path_to_ts_file)` returns `(node, SupportedLanguage.TYPESCRIPT)` for valid TypeScript input. +- Corrupt file content returns `None` without raising. 
- Grammar/parser handling is cached and does not recreate parser objects per call. - `FileIndex` builds once per root and exposes query helpers used by miners. +- `detect_project_languages()` thresholds are applied against total indexed files, not only supported-language files. - Tests cover registry parsing, caching behavior, index filtering, and language detection thresholding. - Feature spec is updated to document the new discovery layer behavior for issue #125. diff --git a/src/specleft/discovery/language_detect.py b/src/specleft/discovery/language_detect.py index 2c1ecae..794ae1b 100644 --- a/src/specleft/discovery/language_detect.py +++ b/src/specleft/discovery/language_detect.py @@ -13,17 +13,15 @@ def detect_project_languages( file_index: FileIndex, threshold: float = 0.01, ) -> list[SupportedLanguage]: - """Return languages whose supported-file ratio exceeds the given threshold.""" - supported_total = sum( - len(file_index.files_by_language(language)) for language in SupportedLanguage - ) - if supported_total == 0: + """Return languages whose file ratio exceeds the given threshold.""" + total_files = file_index.total_files + if total_files == 0: return [] detected: list[SupportedLanguage] = [] for language in SupportedLanguage: language_files = file_index.files_by_language(language) - ratio = len(language_files) / supported_total + ratio = len(language_files) / total_files if ratio >= threshold: detected.append(language) return detected diff --git a/src/specleft/discovery/language_registry.py b/src/specleft/discovery/language_registry.py index 2d837f3..003f457 100644 --- a/src/specleft/discovery/language_registry.py +++ b/src/specleft/discovery/language_registry.py @@ -65,7 +65,11 @@ def parse_source(self, source: bytes, language: SupportedLanguage) -> Any | None except Exception: return None - return tree.root_node + root_node = tree.root_node + if getattr(root_node, "has_error", False): + return None + + return root_node def _parser_for(self, language: 
SupportedLanguage) -> Any | None: parser = self._parser_cache.get(language) diff --git a/tests/discovery/test_language_detect.py b/tests/discovery/test_language_detect.py index 37fae28..bc60c20 100644 --- a/tests/discovery/test_language_detect.py +++ b/tests/discovery/test_language_detect.py @@ -25,9 +25,7 @@ def test_detect_project_languages_uses_ratio_threshold(tmp_path: Path) -> None: SupportedLanguage.PYTHON, SupportedLanguage.TYPESCRIPT, } - assert set(detect_project_languages(index, threshold=0.7)) == { - SupportedLanguage.PYTHON - } + assert set(detect_project_languages(index, threshold=0.7)) == set() def test_detect_project_languages_empty_index_returns_empty(tmp_path: Path) -> None: diff --git a/tests/discovery/test_language_registry.py b/tests/discovery/test_language_registry.py index 78a781a..2eadefe 100644 --- a/tests/discovery/test_language_registry.py +++ b/tests/discovery/test_language_registry.py @@ -35,10 +35,20 @@ def test_detect_language_skips_unsupported_extension() -> None: assert LanguageRegistry().detect_language(Path("script.rb")) is None +@pytest.mark.parametrize( + ("filename", "expected_language"), + [ + ("sample.py", SupportedLanguage.PYTHON), + ("sample.ts", SupportedLanguage.TYPESCRIPT), + ], +) def test_parse_uses_parse_source_and_returns_detected_language( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + filename: str, + expected_language: SupportedLanguage, ) -> None: - path = tmp_path / "sample.py" + path = tmp_path / filename path.write_text("x = 1") registry = LanguageRegistry() @@ -52,7 +62,7 @@ def test_parse_uses_parse_source_and_returns_detected_language( assert result is not None root_node, language = result assert root_node == "fake-root" - assert language == SupportedLanguage.PYTHON + assert language == expected_language def test_parse_returns_none_for_unsupported_extension(tmp_path: Path) -> None: @@ -77,6 +87,22 @@ def broken(source: bytes, _language: 
SupportedLanguage) -> Any: assert registry.parse(path) is None +def test_parse_returns_none_on_corrupt_content( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + path = tmp_path / "corrupt.py" + path.write_bytes(b"\x80\x81\x82") + + registry = LanguageRegistry() + + class FakeParser: + def parse(self, _source: bytes) -> SimpleNamespace: + return SimpleNamespace(root_node=SimpleNamespace(has_error=True)) + + monkeypatch.setattr(registry, "_parser_for", lambda _language: FakeParser()) + assert registry.parse(path) is None + + def test_parser_and_language_are_cached(monkeypatch: pytest.MonkeyPatch) -> None: registry = LanguageRegistry() calls: dict[str, int] = {"language": 0} From 7c686c12069ccb502519645653e826419bf29d23 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 15:52:22 +0000 Subject: [PATCH 3/3] Fix JavaScript grammar selection (#125) --- src/specleft/discovery/language_registry.py | 12 ++++++++- tests/discovery/test_language_registry.py | 29 +++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/specleft/discovery/language_registry.py b/src/specleft/discovery/language_registry.py index 003f457..c3e60b4 100644 --- a/src/specleft/discovery/language_registry.py +++ b/src/specleft/discovery/language_registry.py @@ -105,7 +105,17 @@ def _language_for(self, language: SupportedLanguage) -> Any | None: try: import tree_sitter_typescript # type: ignore[import-not-found] - language_obj = tree_sitter_typescript.language_typescript() + if language == SupportedLanguage.TYPESCRIPT: + language_obj = tree_sitter_typescript.language_typescript() + else: + language_loader = getattr( + tree_sitter_typescript, + "language_javascript", + None, + ) + if language_loader is None: + return None + language_obj = language_loader() except Exception: return None else: diff --git a/tests/discovery/test_language_registry.py b/tests/discovery/test_language_registry.py index 2eadefe..e6e8c83 100644 --- 
a/tests/discovery/test_language_registry.py +++ b/tests/discovery/test_language_registry.py @@ -137,3 +137,32 @@ def fake_parser_class() -> FakeParser: assert first is second assert first is parser assert calls["language"] == 1 + + +def test_javascript_uses_javascript_grammar_loader( + monkeypatch: pytest.MonkeyPatch, +) -> None: + registry = LanguageRegistry() + calls: dict[str, int] = {"ts": 0, "js": 0} + + def fake_ts_loader() -> str: + calls["ts"] += 1 + return "ts-language" + + def fake_js_loader() -> str: + calls["js"] += 1 + return "js-language" + + fake_typescript_module = SimpleNamespace( + language_typescript=fake_ts_loader, + language_javascript=fake_js_loader, + ) + monkeypatch.setitem(sys.modules, "tree_sitter_typescript", fake_typescript_module) + + js_language = registry._language_for(SupportedLanguage.JAVASCRIPT) + ts_language = registry._language_for(SupportedLanguage.TYPESCRIPT) + + assert js_language == "js-language" + assert ts_language == "ts-language" + assert calls["js"] == 1 + assert calls["ts"] == 1