Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions features/feature-spec-discovery.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Feature Spec: Discovery language registry and file indexing

## Purpose
Add shared discovery infrastructure for Issue #125: centralized parser abstraction and one-pass filesystem indexing.

## User Stories

### Story 1: Shared language detection and parsing
**Scenario:** As a discovery miner, I need a single abstraction to detect and parse language files.
**Given** a project file path
**When** I call `LanguageRegistry().detect_language(path)`
**Then** it should map supported extensions to a `SupportedLanguage` enum value and return `None` for unsupported files.

**Scenario:** As a discovery miner, I need resilient parsing.
**Given** a supported file path and `LanguageRegistry`
**When** parsing succeeds
**Then** `parse(path)` returns `(root_node, detected_language)`.
**And** when parse fails or content is corrupt
**Then** `parse(path)` returns `None` without raising.

### Story 2: Shared file indexing
**Scenario:** As a discovery pipeline, I need to avoid repeated filesystem walks.
**Given** a `FileIndex` built for the repository
**Then** miners can query `files_by_language`, `files_by_extension`, `files_matching`, and `files_under`.

**Scenario:** As a pipeline maintainer, I need noisy directories excluded consistently.
**Given** directories in `DEFAULT_EXCLUDE_DIRS`
**Then** those paths are never returned by index lookups.

### Story 3: Project language signal
**Scenario:** As downstream planning logic, I need a low-cost language signal.
**Given** a populated `FileIndex`
**When** calling `detect_project_languages(index)`
**Then** it returns detected languages above the ratio threshold, computed against total indexed files.

## Acceptance Criteria
- Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise.
- `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input.
- `LanguageRegistry().parse(path_to_ts_file)` returns `(node, SupportedLanguage.TYPESCRIPT)` for valid TypeScript input.
- Corrupt file content returns `None` without raising.
- Grammar/parser handling is cached and does not recreate parser objects per call.
- `FileIndex` builds once per root and exposes query helpers used by miners.
- `detect_project_languages()` thresholds are applied against total indexed files, not only supported-language files.
- Tests cover registry parsing, caching behavior, index filtering, and language detection thresholding.
- Feature spec is updated to document the new discovery layer behavior for issue #125.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ dependencies = [
"python-frontmatter>=1.0.0",
"python-slugify>=8.0.0",
"pyyaml>=6.0.0",
"tree-sitter>=0.23",
"tree-sitter-python>=0.23",
"tree-sitter-typescript>=0.23",
]
keywords=[
"ai",
Expand Down
4 changes: 4 additions & 0 deletions src/specleft/discovery/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
"""Discovery models and infrastructure package."""

from specleft.discovery.models import * # noqa: F401,F403

from specleft.discovery.file_index import DEFAULT_EXCLUDE_DIRS, FileIndex
from specleft.discovery.language_detect import detect_project_languages
from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS, LanguageRegistry
128 changes: 128 additions & 0 deletions src/specleft/discovery/file_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Filesystem abstraction for discovery miners."""

from __future__ import annotations

import fnmatch
import os
from pathlib import Path

from specleft.discovery.language_registry import SUPPORTED_EXTENSIONS
from specleft.discovery.models import SupportedLanguage

# Directory names pruned from every index walk. Entries are matched both
# literally and as fnmatch glob patterns (e.g. "*.egg-info"), so wildcard
# entries are supported alongside exact names.
DEFAULT_EXCLUDE_DIRS: frozenset[str] = frozenset(
    {
        ".git",
        "node_modules",
        "__pycache__",
        ".venv",
        "venv",
        "dist",
        "build",
        ".next",
        ".mypy_cache",
        ".pytest_cache",
        ".tox",
        ".eggs",
        "*.egg-info",  # glob pattern, handled via fnmatch
    }
)


class FileIndex:
    """Walk the repository once and provide filtered views.

    All returned paths are relative to the index root and sorted by their
    POSIX string form, so lookups are deterministic across platforms.
    """

    def __init__(
        self,
        root: Path,
        exclude_dirs: frozenset[str] = DEFAULT_EXCLUDE_DIRS,
    ) -> None:
        """Build the index eagerly by walking *root* once.

        Args:
            root: Project root directory to index.
            exclude_dirs: Directory names or fnmatch glob patterns that are
                pruned from the walk and never appear in query results.
        """
        self._root = root
        self._files: list[Path] = []
        self._by_language: dict[SupportedLanguage, list[Path]] = {}
        self._by_extension: dict[str, list[Path]] = {}
        self._exclude_dirs = exclude_dirs
        self._build()

    @property
    def root(self) -> Path:
        """Project root."""
        return self._root

    @property
    def total_files(self) -> int:
        """Number of discovered files (all extensions, supported or not)."""
        return len(self._files)

    def files_by_language(self, lang: SupportedLanguage) -> list[Path]:
        """Return all indexed files detected as *lang*, sorted."""
        return sorted(
            self._by_language.get(lang, []),
            key=lambda value: value.as_posix(),
        )

    def files_by_extension(self, *exts: str) -> list[Path]:
        """Return files matching any extension.

        Extensions must include the leading dot (e.g. ``".py"``); matching
        is case-insensitive because keys are stored lowercased.
        """
        output: list[Path] = []
        for ext in exts:
            output.extend(self._by_extension.get(ext.lower(), []))
        return sorted(output, key=lambda value: value.as_posix())

    def files_matching(self, *patterns: str) -> list[Path]:
        """Return files whose basenames match any fnmatch glob pattern."""
        matched: list[Path] = []
        for file_path in self._files:
            for pattern in patterns:
                if fnmatch.fnmatch(file_path.name, pattern):
                    matched.append(file_path)
                    break  # one match is enough; avoid duplicates
        return sorted(matched, key=lambda value: value.as_posix())

    def files_under(self, *dirs: str) -> list[Path]:
        """Return files under the specified directory prefixes (relative)."""
        # Parse each prefix once up front instead of once per indexed file.
        prefixes = [tuple(Path(prefix).parts) for prefix in dirs]
        return sorted(
            (
                file_path
                for file_path in self._files
                if any(
                    file_path.parts[: len(prefix)] == prefix
                    for prefix in prefixes
                )
            ),
            key=lambda value: value.as_posix(),
        )

    def _build(self) -> None:
        """Walk the tree once, pruning excluded dirs and bucketing files."""
        root = self._root.resolve()
        for current_root, dirnames, filenames in os.walk(root):
            # Sort for a deterministic traversal order across platforms.
            dirnames.sort()
            filenames.sort()
            # Assigning to the slice prunes excluded subtrees from os.walk.
            dirnames[:] = [
                dirname
                for dirname in dirnames
                if not self._is_excluded_dir(dirname)
            ]

            for filename in filenames:
                file_path = Path(current_root, filename)
                # Guard against directory-like entries (e.g. symlinks).
                if file_path.is_dir():
                    continue
                rel_path = file_path.relative_to(root)
                self._files.append(rel_path)

                extension = rel_path.suffix.lower()
                self._by_extension.setdefault(extension, []).append(rel_path)

                language = SUPPORTED_EXTENSIONS.get(extension)
                if language is not None:
                    self._by_language.setdefault(language, []).append(rel_path)

    def _is_excluded_dir(self, dirname: str) -> bool:
        """True when *dirname* matches an excluded name or glob pattern."""
        # Exact membership is an O(1) fast path; the fnmatch pass handles
        # glob entries such as "*.egg-info".
        if dirname in self._exclude_dirs:
            return True
        return any(fnmatch.fnmatch(dirname, pattern) for pattern in self._exclude_dirs)
27 changes: 27 additions & 0 deletions src/specleft/discovery/language_detect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Language detection helpers over a prebuilt file index."""

from __future__ import annotations

from specleft.discovery.file_index import FileIndex
from specleft.discovery.models import SupportedLanguage


def detect_project_languages(
    file_index: FileIndex,
    threshold: float = 0.01,
) -> list[SupportedLanguage]:
    """Return languages whose file ratio meets or exceeds *threshold*.

    The ratio is computed against the total number of indexed files (all
    extensions), not only files of supported languages, so a repository
    dominated by unsupported files yields a proportionally weaker signal.

    Args:
        file_index: A populated :class:`FileIndex` for the project root.
        threshold: Minimum fraction of indexed files (inclusive) a language
            must account for to be reported.

    Returns:
        Detected languages in ``SupportedLanguage`` declaration order.
    """
    total_files = file_index.total_files
    if total_files == 0:
        # An empty index carries no signal; report nothing.
        return []

    detected: list[SupportedLanguage] = []
    for language in SupportedLanguage:
        language_files = file_index.files_by_language(language)
        ratio = len(language_files) / total_files
        # Inclusive comparison: a ratio exactly at the threshold counts.
        if ratio >= threshold:
            detected.append(language)
    return detected
125 changes: 125 additions & 0 deletions src/specleft/discovery/language_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Language detection and parser abstractions for discovery miners."""

from __future__ import annotations

from pathlib import Path
from typing import Any

from specleft.discovery.models import SupportedLanguage

# Lowercased file extensions (with leading dot) mapped to their language.
# TSX/JSX variants fold into the base TypeScript/JavaScript languages.
SUPPORTED_EXTENSIONS: dict[str, SupportedLanguage] = {
    ".py": SupportedLanguage.PYTHON,
    ".ts": SupportedLanguage.TYPESCRIPT,
    ".tsx": SupportedLanguage.TYPESCRIPT,
    ".js": SupportedLanguage.JAVASCRIPT,
    ".jsx": SupportedLanguage.JAVASCRIPT,
    ".mjs": SupportedLanguage.JAVASCRIPT,
}


class LanguageRegistry:
    """Cache language grammars and expose shared parse operations.

    Grammar objects and parsers are created lazily and cached per language
    so repeated ``parse`` calls never rebuild parser objects.
    """

    def __init__(self) -> None:
        # Caches keyed by SupportedLanguage; values are tree-sitter objects,
        # typed Any so tree-sitter stays an optional runtime dependency.
        self._language_cache: dict[SupportedLanguage, Any] = {}
        self._parser_cache: dict[SupportedLanguage, Any] = {}

    def detect_language(self, file_path: Path) -> SupportedLanguage | None:
        """Return the detected language by file extension, or ``None``."""
        return SUPPORTED_EXTENSIONS.get(file_path.suffix.lower())

    def parse(self, file_path: Path) -> tuple[Any, SupportedLanguage] | None:
        """Parse bytes from disk and return ``(root_node, language)``.

        Unsupported extensions, unreadable files, parse failures, or
        malformed content all return ``None`` without raising.
        """
        language = self.detect_language(file_path)
        if language is None:
            return None

        try:
            source = file_path.read_bytes()
        except OSError:
            # Missing/unreadable file: degrade to "not parseable".
            return None

        try:
            root_node = self.parse_source(source, language)
        except Exception:
            return None

        if root_node is None:
            return None
        return root_node, language

    def parse_source(self, source: bytes, language: SupportedLanguage) -> Any | None:
        """Parse raw source bytes and return the tree's root node.

        Returns ``None`` when no parser is available, parsing raises, or
        the resulting tree contains syntax errors.
        """
        parser = self._parser_for(language)
        if parser is None:
            return None

        try:
            tree = parser.parse(source)
        except Exception:
            return None

        root_node = tree.root_node
        # Reject trees with syntax errors so miners see only clean ASTs.
        if getattr(root_node, "has_error", False):
            return None

        return root_node

    def _parser_for(self, language: SupportedLanguage) -> Any | None:
        """Return a cached parser for *language*, creating it on first use."""
        parser = self._parser_cache.get(language)
        if parser is not None:
            return parser

        language_obj = self._language_for(language)
        if language_obj is None:
            return None

        try:
            from tree_sitter import Language, Parser  # type: ignore[import-untyped]

            # Modern grammar packages (tree-sitter-* >= 0.21) return a raw
            # PyCapsule that must be wrapped in a Language object first.
            if not isinstance(language_obj, Language):
                language_obj = Language(language_obj)

            parser = Parser()
            if hasattr(parser, "set_language"):
                # py-tree-sitter < 0.22 API.
                parser.set_language(language_obj)
            else:
                # py-tree-sitter >= 0.22 removed set_language in favor of
                # the writable ``language`` property.
                parser.language = language_obj
            self._parser_cache[language] = parser
            return parser
        except Exception:
            # Missing or incompatible grammar wheels degrade to "no parser".
            return None

    def _language_for(self, language: SupportedLanguage) -> Any | None:
        """Load and cache the raw grammar object for *language*."""
        if language in self._language_cache:
            return self._language_cache[language]

        if language == SupportedLanguage.PYTHON:
            try:
                import tree_sitter_python  # type: ignore[import-not-found]

                language_obj = tree_sitter_python.language()
            except Exception:
                return None
        elif language in (SupportedLanguage.TYPESCRIPT, SupportedLanguage.JAVASCRIPT):
            try:
                import tree_sitter_typescript  # type: ignore[import-not-found]

                if language == SupportedLanguage.TYPESCRIPT:
                    language_obj = tree_sitter_typescript.language_typescript()
                else:
                    # Older tree-sitter-typescript wheels may lack the
                    # JavaScript loader; treat that as unsupported.
                    language_loader = getattr(
                        tree_sitter_typescript,
                        "language_javascript",
                        None,
                    )
                    if language_loader is None:
                        return None
                    language_obj = language_loader()
            except Exception:
                return None
        else:
            return None

        self._language_cache[language] = language_obj
        return language_obj
Loading