From a43c3b7fad50133a172f9add71f9fc48cc39a5c0 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 20:20:48 +0000 Subject: [PATCH 1/5] Add discovery docstring miner and wire pipeline (#131) --- src/specleft/discovery/miners/__init__.py | 8 + .../discovery/miners/shared/__init__.py | 8 + .../discovery/miners/shared/docstrings.py | 446 ++++++++++++++++++ src/specleft/discovery/pipeline.py | 3 +- 4 files changed, 464 insertions(+), 1 deletion(-) create mode 100644 src/specleft/discovery/miners/__init__.py create mode 100644 src/specleft/discovery/miners/shared/__init__.py create mode 100644 src/specleft/discovery/miners/shared/docstrings.py diff --git a/src/specleft/discovery/miners/__init__.py b/src/specleft/discovery/miners/__init__.py new file mode 100644 index 0000000..7c6d4ea --- /dev/null +++ b/src/specleft/discovery/miners/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Discovery miner implementations.""" + +from specleft.discovery.miners.shared import DocstringMiner + +__all__ = ["DocstringMiner"] diff --git a/src/specleft/discovery/miners/shared/__init__.py b/src/specleft/discovery/miners/shared/__init__.py new file mode 100644 index 0000000..85ce2aa --- /dev/null +++ b/src/specleft/discovery/miners/shared/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Shared miners used by multiple discovery workflows.""" + +from specleft.discovery.miners.shared.docstrings import DocstringMiner + +__all__ = ["DocstringMiner"] diff --git a/src/specleft/discovery/miners/shared/docstrings.py b/src/specleft/discovery/miners/shared/docstrings.py new file mode 100644 index 0000000..fa59953 --- /dev/null +++ b/src/specleft/discovery/miners/shared/docstrings.py @@ -0,0 +1,446 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Docstring/JSDoc miner for discovery pipeline.""" + +from __future__ import annotations + +import ast +import fnmatch +import time +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from specleft.discovery.context import MinerContext +from specleft.discovery.models import ( + DiscoveredItem, + DocstringMeta, + ItemKind, + MinerResult, + SupportedLanguage, +) + +_CONFIDENCE = 0.8 +_MEANINGFUL_INIT_DOCSTRING_LEN = 10 +_JSDOC_TARGET_TYPES = frozenset( + { + "function_declaration", + "class_declaration", + "method_definition", + "export_statement", + } +) +_TEST_FILE_PATTERNS = ( + "test_*.py", + "*_test.py", + "*_tests.py", + "test_*.ts", + "*.test.ts", + "*.spec.ts", + "test_*.tsx", + "*.test.tsx", + "*.spec.tsx", + "test_*.js", + "*.test.js", + "*.spec.js", + "test_*.jsx", + "*.test.jsx", + "*.spec.jsx", + "test_*.mjs", + "*.test.mjs", + "*.spec.mjs", +) + + +@dataclass(frozen=True) +class _DocstringMatch: + text: str + line_number: int + + +class DocstringMiner: + """Extract Python docstrings and TypeScript/JavaScript JSDoc comments.""" + + miner_id = uuid.UUID("dcc2e631-67e7-4af7-b8ba-ca3397ccae0b") + name = "docstrings" + languages = frozenset( + { + SupportedLanguage.PYTHON, + SupportedLanguage.TYPESCRIPT, + SupportedLanguage.JAVASCRIPT, + } + ) + + def mine(self, ctx: MinerContext) -> MinerResult: + started = time.perf_counter() + items: list[DiscoveredItem] = [] + + for rel_path in _candidate_source_files(ctx): + if _is_test_file(rel_path): + continue + + abs_path = ctx.root / rel_path + parsed = ctx.registry.parse(abs_path) + if parsed is None: + continue + + try: + source_bytes = abs_path.read_bytes() + except OSError: + continue + + root_node, language = parsed + if language == SupportedLanguage.PYTHON: + items.extend(_extract_python_items(root_node, source_bytes, rel_path)) + elif language in ( + SupportedLanguage.TYPESCRIPT, + SupportedLanguage.JAVASCRIPT, + ): + items.extend( + _extract_jsdoc_items(root_node, source_bytes, rel_path, language) + ) + + return MinerResult( + miner_id=self.miner_id, + miner_name=self.name, + items=items, + duration_ms=_elapsed_ms(started), + ) + + +def _candidate_source_files(ctx: MinerContext) -> list[Path]: + source_dirs = ctx.config.source_dirs + if not source_dirs: + return [] + + return sorted( + ctx.file_index.files_under(*source_dirs), + key=lambda value: value.as_posix(), + ) + + +def _extract_python_items( + root_node: Any, + source_bytes: bytes, + file_path: Path, +) -> list[DiscoveredItem]: + items: list[DiscoveredItem] = [] + module_doc = _extract_python_leading_docstring(root_node, source_bytes) + if module_doc is not None: + items.append( + _make_item( + file_path=file_path, + line_number=module_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="module", + target_name=file_path.stem, + text=module_doc.text, + ) + ) + + for node in _walk_tree(root_node): + if node.type == "class_definition": + name = _field_text(node, "name", source_bytes) + class_doc = _extract_python_body_docstring(node, source_bytes) + if class_doc is None or not name: + continue + items.append( + _make_item( + file_path=file_path, + line_number=class_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="class", + target_name=name, + text=class_doc.text, + ) + ) + continue + + if node.type not in {"function_definition", "async_function_definition"}: + continue + + name = _field_text(node, "name", source_bytes) + function_doc = _extract_python_body_docstring(node, source_bytes) + if function_doc is None or not name: + continue + if ( + name == "__init__" + and len(function_doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN + ): + continue + + items.append( + _make_item( + file_path=file_path, + line_number=function_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="function", + target_name=name, + text=function_doc.text, + ) + ) + + return items + + +def _extract_jsdoc_items( + root_node: Any, + source_bytes: bytes, + file_path: Path, + language: SupportedLanguage, +) -> list[DiscoveredItem]: + items: list[DiscoveredItem] = [] + + def walk(parent: Any) -> None: + children = list(getattr(parent, "children", ())) + for index, child in enumerate(children): + if child.type == "comment": + jsdoc_text = _normalise_jsdoc(_node_text(child, source_bytes)) + if jsdoc_text: + target = _resolve_jsdoc_target(children, index, source_bytes) + if target is not None: + kind, name = _target_kind_and_name( + target, source_bytes, file_path + ) + items.append( + _make_item( + file_path=file_path, + line_number=_line_number(child), + language=language, + target_kind=kind, + target_name=name, + text=jsdoc_text, + ) + ) + + walk(child) + + walk(root_node) + return items + + +def _resolve_jsdoc_target( + siblings: list[Any], + comment_index: int, + source_bytes: bytes, +) -> Any | None: + comment_node = siblings[comment_index] + for candidate in siblings[comment_index + 1 :]: + if candidate.type == "comment": + continue + + target = _unwrap_export_target(candidate) + if target is None: + break + + if not _is_immediately_before(comment_node, target, source_bytes): + break + return target + return None + + +def _unwrap_export_target(node: Any) -> Any | None: + if node.type in _JSDOC_TARGET_TYPES and node.type != "export_statement": + return node + if node.type != "export_statement": + return None + + for child in getattr(node, "named_children", ()): + if child.type in _JSDOC_TARGET_TYPES and child.type != "export_statement": + return child + return node + + +def _is_immediately_before( + comment_node: Any, target_node: Any, source_bytes: bytes +) -> bool: + source_text = source_bytes.decode("utf-8", errors="ignore") + lines = source_text.splitlines() + comment_line = int(comment_node.end_point[0]) + target_line = int(target_node.start_point[0]) + if target_line < comment_line: + return False + if target_line == comment_line: + return True + for index in range(comment_line + 1, target_line): + if index >= len(lines): + break + if lines[index].strip(): + return False + return True + + +def _target_kind_and_name( + node: Any, + source_bytes: bytes, + file_path: Path, +) -> tuple[str, str | None]: + if node.type == "class_declaration": + return "class", _field_text(node, "name", source_bytes) + if node.type == "method_definition": + return "method", _field_text(node, "name", source_bytes) + if node.type == "function_declaration": + return "function", _field_text(node, "name", source_bytes) + if node.type == "export_statement": + return "module", file_path.stem + return "module", file_path.stem + + +def _extract_python_leading_docstring( + container_node: Any, + source_bytes: bytes, +) -> _DocstringMatch | None: + expression = _first_expression_string(container_node) + if expression is None: + return None + text = _clean_python_string(_node_text(expression, source_bytes)) + if not text: + return None + return _DocstringMatch(text=text, line_number=_line_number(expression)) + + +def _extract_python_body_docstring( + definition_node: Any, + source_bytes: bytes, +) -> _DocstringMatch | None: + body = definition_node.child_by_field_name("body") + if body is None: + return None + return _extract_python_leading_docstring(body, source_bytes) + + +def _first_expression_string(container_node: Any) -> Any | None: + named_children = list(getattr(container_node, "named_children", ())) + if not named_children: + return None + first = named_children[0] + if first.type != "expression_statement": + return None + + for child in getattr(first, "named_children", ()): + if child.type in {"string", "concatenated_string"}: + return child + return None + + +def _clean_python_string(value: str) -> str | None: + stripped = value.strip() + if not stripped: + return None + + try: + parsed = ast.literal_eval(stripped) + except (SyntaxError, ValueError): + parsed = _strip_wrapping_quotes(stripped) + if not isinstance(parsed, str): + return None + + cleaned = parsed.strip() + return cleaned or None + + +def _strip_wrapping_quotes(value: str) -> str: + for quote in ('"""', "'''", '"', "'"): + if value.startswith(quote) and value.endswith(quote) and len(value) >= 2: + return value[len(quote) : len(value) - len(quote)].strip() + return value + + +def _normalise_jsdoc(raw_comment: str) -> str | None: + stripped = raw_comment.strip() + if not stripped.startswith("/**"): + return None + + content = stripped + if content.startswith("/**"): + content = content[3:] + if content.endswith("*/"): + content = content[:-2] + + lines = [] + for line in content.splitlines(): + cleaned = line.lstrip() + if cleaned.startswith("*"): + cleaned = cleaned[1:] + if cleaned.startswith(" "): + cleaned = cleaned[1:] + lines.append(cleaned.rstrip()) + + text = "\n".join(lines).strip() + return text or None + + +def _make_item( + *, + file_path: Path, + line_number: int, + language: SupportedLanguage, + target_kind: str, + target_name: str | None, + text: str, +) -> DiscoveredItem: + item_name = ( + f"{target_kind}:{target_name}" if target_name else f"module:{file_path.stem}" + ) + metadata = DocstringMeta( + target_kind=target_kind, + target_name=target_name, + text=text, + ) + return DiscoveredItem( + kind=ItemKind.DOCSTRING, + name=item_name, + file_path=file_path, + line_number=line_number, + language=language, + raw_text=text, + metadata=metadata.model_dump(), + confidence=_CONFIDENCE, + ) + + +def _walk_tree(node: Any) -> list[Any]: + nodes: list[Any] = [] + for child in getattr(node, "children", ()): + nodes.append(child) + nodes.extend(_walk_tree(child)) + return nodes + + +def _line_number(node: Any) -> int: + return int(node.start_point[0]) + 1 + + +def _field_text(node: Any, field: str, source_bytes: bytes) -> str | None: + field_node = node.child_by_field_name(field) + if field_node is None: + return None + text = _node_text(field_node, source_bytes).strip() + return text or None + + +def _node_text(node: Any, source_bytes: bytes) -> str: + raw = getattr(node, "text", None) + if isinstance(raw, bytes): + return raw.decode("utf-8", errors="ignore") + if isinstance(raw, str): + return raw + + start_byte = getattr(node, "start_byte", None) + end_byte = getattr(node, "end_byte", None) + if isinstance(start_byte, int) and isinstance(end_byte, int): + return source_bytes[start_byte:end_byte].decode("utf-8", errors="ignore") + return "" + + +def _is_test_file(path: Path) -> bool: + file_name = path.name + if any(part in {"tests", "__tests__"} for part in path.parts): + return True + return any(fnmatch.fnmatch(file_name, pattern) for pattern in _TEST_FILE_PATTERNS) + + +def _elapsed_ms(started: float) -> int: + return max(0, int((time.perf_counter() - started) * 1000)) diff --git a/src/specleft/discovery/pipeline.py b/src/specleft/discovery/pipeline.py index ed4d9fe..a123a11 100644 --- a/src/specleft/discovery/pipeline.py +++ b/src/specleft/discovery/pipeline.py @@ -16,6 +16,7 @@ from specleft.discovery.framework_detector import FrameworkDetector from specleft.discovery.language_detect import detect_project_languages from specleft.discovery.language_registry import LanguageRegistry +from specleft.discovery.miners import DocstringMiner from specleft.discovery.models import ( DiscoveryReport, DiscoveredItem, @@ -242,4 +243,4 @@ def mine(self, ctx: MinerContext) -> MinerResult: def _default_miners() -> list[BaseMiner]: - return [_ReadmeMiner()] + return [_ReadmeMiner(), DocstringMiner()] From c42dc7b771fad9d6f754fb45fa42a6b896bafff3 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 20:20:53 +0000 Subject: [PATCH 2/5] Add docstring miner test coverage (#131) --- tests/discovery/miners/test_docstrings.py | 265 ++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 tests/discovery/miners/test_docstrings.py diff --git a/tests/discovery/miners/test_docstrings.py b/tests/discovery/miners/test_docstrings.py new file mode 100644 index 0000000..89697ec --- /dev/null +++ b/tests/discovery/miners/test_docstrings.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for discovery docstring/JSDoc miner.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from specleft.discovery.config import DiscoveryConfig +from specleft.discovery.context import MinerContext +from specleft.discovery.file_index import FileIndex +from specleft.discovery.miners.shared.docstrings import DocstringMiner +from specleft.discovery.models import DocstringMeta, SupportedLanguage + + +@dataclass +class _FakeNode: + type: str + text_value: str = "" + children: list[_FakeNode] = field(default_factory=list) + named_children: list[_FakeNode] = field(default_factory=list) + fields: dict[str, _FakeNode] = field(default_factory=dict) + start_point: tuple[int, int] = (0, 0) + end_point: tuple[int, int] = (0, 0) + + @property + def text(self) -> bytes: + return self.text_value.encode("utf-8") + + def child_by_field_name(self, name: str) -> _FakeNode | None: + return self.fields.get(name) + + +class _RegistryStub: + def __init__( + self, mapping: dict[Path, tuple[Any, SupportedLanguage] | None] + ) -> None: + self._mapping = mapping + self.calls: list[Path] = [] + + def parse(self, file_path: Path) -> tuple[Any, SupportedLanguage] | None: + self.calls.append(file_path) + return self._mapping.get(file_path) + + +def _string_expr(value: str, row: int) -> _FakeNode: + string_node = _FakeNode( + type="string", + text_value=value, + start_point=(row, 0), + end_point=(row, len(value)), + ) + return _FakeNode( + type="expression_statement", + children=[string_node], + named_children=[string_node], + start_point=(row, 0), + end_point=(row, len(value)), + ) + + +def _identifier(value: str, row: int) -> _FakeNode: + return _FakeNode( + type="identifier", + text_value=value, + start_point=(row, 0), + end_point=(row, len(value)), + ) + + +def _python_function(name: str, docstring: str, start_row: int) -> _FakeNode: + name_node = _identifier(name, start_row) + body_doc = _string_expr(docstring, start_row + 1) + body = _FakeNode( + type="block", + children=[body_doc], + named_children=[body_doc], + start_point=(start_row + 1, 0), + end_point=(start_row + 1, 0), + ) + return _FakeNode( + type="function_definition", + children=[name_node, body], + named_children=[name_node, body], + fields={"name": name_node, "body": body}, + start_point=(start_row, 0), + end_point=(start_row + 2, 0), + ) + + +def _python_class(name: str, docstring: str, start_row: int) -> _FakeNode: + name_node = _identifier(name, start_row) + body_doc = _string_expr(docstring, start_row + 1) + body = _FakeNode( + type="block", + children=[body_doc], + named_children=[body_doc], + start_point=(start_row + 1, 0), + end_point=(start_row + 1, 0), + ) + return _FakeNode( + type="class_definition", + children=[name_node, body], + named_children=[name_node, body], + fields={"name": name_node, "body": body}, + start_point=(start_row, 0), + end_point=(start_row + 2, 0), + ) + + +def _python_module_tree() -> _FakeNode: + module_doc = _string_expr('"""Auth module docs."""', 0) + class_node = _python_class("AuthService", '"""Class docs."""', 2) + init_fn = _python_function("__init__", '"""Init."""', 5) + create_user_fn = _python_function( + "create_user", + '"""Create a new user with the given credentials."""', + 8, + ) + class_body = class_node.child_by_field_name("body") + assert class_body is not None + class_body.children.extend([init_fn, create_user_fn]) + class_body.named_children.extend([init_fn, create_user_fn]) + + top_function = _python_function("validate_credentials", '"""Validate login."""', 12) + return _FakeNode( + type="module", + children=[module_doc, class_node, top_function], + named_children=[module_doc, class_node, top_function], + start_point=(0, 0), + end_point=(15, 0), + ) + + +def _jsdoc_comment(text: str, row: int) -> _FakeNode: + return _FakeNode( + type="comment", + text_value=text, + start_point=(row, 0), + end_point=(row, len(text)), + ) + + +def _typescript_function(name: str, row: int) -> _FakeNode: + name_node = _identifier(name, row) + return _FakeNode( + type="function_declaration", + children=[name_node], + named_children=[name_node], + fields={"name": name_node}, + start_point=(row, 0), + end_point=(row + 1, 0), + ) + + +def _javascript_tree(function_name: str) -> _FakeNode: + comment = _jsdoc_comment("/** Creates a new user */", 0) + function = _typescript_function(function_name, 1) + return _FakeNode( + type="program", + children=[comment, function], + named_children=[comment, function], + start_point=(0, 0), + end_point=(2, 0), + ) + + +def _context( + root: Path, + registry: _RegistryStub, + source_dirs: tuple[str, ...] = ("src",), +) -> MinerContext: + return MinerContext( + root=root, + registry=registry, # type: ignore[arg-type] + file_index=FileIndex(root), + frameworks={}, + config=DiscoveryConfig(source_dirs=source_dirs), + ) + + +def test_python_docstrings_include_module_class_and_functions(tmp_path: Path) -> None: + source_file = tmp_path / "src" / "auth.py" + source_file.parent.mkdir(parents=True) + source_file.write_text( + '"""Auth module docs."""\nclass AuthService:\n """Class docs."""\n', + encoding="utf-8", + ) + excluded_test_file = tmp_path / "src" / "test_auth.py" + excluded_test_file.write_text('"""should be skipped"""', encoding="utf-8") + + registry = _RegistryStub( + { + source_file: (_python_module_tree(), SupportedLanguage.PYTHON), + excluded_test_file: (_python_module_tree(), SupportedLanguage.PYTHON), + } + ) + miner = DocstringMiner() + + result = miner.mine(_context(tmp_path, registry)) + + assert result.error is None + assert result.error_kind is None + assert all(item.language == SupportedLanguage.PYTHON for item in result.items) + assert "module:auth" in {item.name for item in result.items} + assert "class:AuthService" in {item.name for item in result.items} + assert "function:create_user" in {item.name for item in result.items} + assert "function:validate_credentials" in {item.name for item in result.items} + assert "function:__init__" not in {item.name for item in result.items} + assert all(isinstance(item.typed_meta(), DocstringMeta) for item in result.items) + + module_item = next(item for item in result.items if item.name == "module:auth") + assert module_item.metadata["target_kind"] == "module" + assert module_item.language == SupportedLanguage.PYTHON + assert registry.calls == [source_file] + + +def test_jsdoc_uses_language_per_extension(tmp_path: Path) -> None: + ts_file = tmp_path / "src" / "auth.ts" + js_file = tmp_path / "src" / "auth.js" + ts_file.parent.mkdir(parents=True) + ts_file.write_text("/** Creates a new user */\nexport function createUser() {}\n") + js_file.write_text("/** Creates a helper */\nfunction createHelper() {}\n") + + registry = _RegistryStub( + { + ts_file: (_javascript_tree("createUser"), SupportedLanguage.TYPESCRIPT), + js_file: (_javascript_tree("createHelper"), SupportedLanguage.JAVASCRIPT), + } + ) + + result = DocstringMiner().mine(_context(tmp_path, registry)) + + by_name = {item.name: item for item in result.items} + assert by_name["function:createUser"].language == SupportedLanguage.TYPESCRIPT + assert by_name["function:createHelper"].language == SupportedLanguage.JAVASCRIPT + assert by_name["function:createUser"].metadata["text"] == "Creates a new user" + + +def test_source_dirs_scope_and_test_patterns_are_respected(tmp_path: Path) -> None: + inside_source = tmp_path / "app" / "module.py" + outside_source = tmp_path / "src" / "outside.py" + ts_test = tmp_path / "app" / "auth.test.ts" + inside_source.parent.mkdir(parents=True) + outside_source.parent.mkdir(parents=True, exist_ok=True) + + inside_source.write_text('"""inside"""') + outside_source.write_text('"""outside"""') + ts_test.write_text("/** should skip */\nfunction x() {}\n") + + registry = _RegistryStub( + { + inside_source: (_python_module_tree(), SupportedLanguage.PYTHON), + outside_source: (_python_module_tree(), SupportedLanguage.PYTHON), + ts_test: (_javascript_tree("x"), SupportedLanguage.TYPESCRIPT), + } + ) + + result = DocstringMiner().mine(_context(tmp_path, registry, source_dirs=("app",))) + + assert all(item.file_path == Path("app/module.py") for item in result.items) + assert registry.calls == [inside_source] From 9aec251ae975283ab278a2c4f950724d878c0723 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 20:20:58 +0000 Subject: [PATCH 3/5] Document docstring miner discovery behavior (#131) --- features/feature-spec-discovery.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md index 6e64157..5f858a6 100644 --- a/features/feature-spec-discovery.md +++ b/features/feature-spec-discovery.md @@ -62,6 +62,22 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser **When** I call `build_default_pipeline(root).run()` **Then** a `DiscoveryReport` is returned with run duration, detected languages, miner results, and total item counts. +### Story 7: Shared docstring and JSDoc mining +**Scenario:** As a discovery pipeline, I need intent-rich text signals from source code comments. +**Given** configured source directories and a shared miner context +**When** `DocstringMiner` runs +**Then** it extracts Python module/class/function docstrings and TypeScript/JavaScript JSDoc comments into `DiscoveredItem(kind=DOCSTRING)` entries with typed `DocstringMeta`. + +**Scenario:** As a pipeline maintainer, I need predictable mining scope and exclusions. +**Given** `source_dirs` in `DiscoveryConfig` +**When** `DocstringMiner` scans files +**Then** it reads only `ctx.file_index.files_under(*ctx.config.source_dirs)` and excludes test files (`test_*.py`, `*.test.ts`, etc.). + +**Scenario:** As a spec generation pipeline, I need clean signal quality. +**Given** Python `__init__` docstrings +**When** the content is trivial (10 chars or fewer) +**Then** it is skipped and not emitted as a discovery item. + ## Acceptance Criteria - Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. - `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input. @@ -84,3 +100,7 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser - Integration on the SpecLeft repository produces `report.total_items > 0`. - Tests cover config parsing, framework detection, pipeline registration/filtering/error isolation, and default pipeline integration. - Feature spec is updated to document the discovery layer behavior introduced in issues #125 and #126. +- `DocstringMiner` emits module/class/function Python docstrings with `DocstringMeta` and `confidence=0.8`. +- TypeScript/JavaScript JSDoc comments immediately preceding declarations are emitted with the correct `SupportedLanguage`. +- Test files are excluded from docstring mining and configured `source_dirs` scope is respected. +- Trivial `__init__` docstrings (<=10 chars) are skipped. From 55d12bb77707c5a2a8799266326deaa20b16033a Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 20:38:17 +0000 Subject: [PATCH 4/5] Refactor docstring miner by language modules (#131) --- .../discovery/miners/python/__init__.py | 8 + .../discovery/miners/python/docstrings.py | 153 +++++++ .../discovery/miners/shared/common.py | 134 ++++++ .../discovery/miners/shared/docstrings.py | 403 +----------------- .../discovery/miners/typescript/__init__.py | 8 + .../discovery/miners/typescript/jsdoc.py | 154 +++++++ 6 files changed, 470 insertions(+), 390 deletions(-) create mode 100644 src/specleft/discovery/miners/python/__init__.py create mode 100644 src/specleft/discovery/miners/python/docstrings.py create mode 100644 src/specleft/discovery/miners/shared/common.py create mode 100644 src/specleft/discovery/miners/typescript/__init__.py create mode 100644 src/specleft/discovery/miners/typescript/jsdoc.py diff --git a/src/specleft/discovery/miners/python/__init__.py b/src/specleft/discovery/miners/python/__init__.py new file mode 100644 index 0000000..b933028 --- /dev/null +++ b/src/specleft/discovery/miners/python/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Python-specific discovery miners.""" + +from specleft.discovery.miners.python.docstrings import extract_python_items + +__all__ = ["extract_python_items"] diff --git a/src/specleft/discovery/miners/python/docstrings.py b/src/specleft/discovery/miners/python/docstrings.py new file mode 100644 index 0000000..e60814c --- /dev/null +++ b/src/specleft/discovery/miners/python/docstrings.py @@ -0,0 +1,153 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Python docstring extraction for discovery miners.""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from specleft.discovery.miners.shared.common import ( + field_text, + line_number, + make_docstring_item, + node_text, + walk_tree, +) +from specleft.discovery.models import DiscoveredItem, SupportedLanguage + +_MEANINGFUL_INIT_DOCSTRING_LEN = 10 + + +@dataclass(frozen=True) +class _DocstringMatch: + text: str + line_number: int + + +def extract_python_items( + root_node: Any, + source_bytes: bytes, + file_path: Path, +) -> list[DiscoveredItem]: + """Extract module/class/function docstrings from a Python source tree.""" + items: list[DiscoveredItem] = [] + module_doc = _extract_python_leading_docstring(root_node, source_bytes) + if module_doc is not None: + items.append( + make_docstring_item( + file_path=file_path, + line_number=module_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="module", + target_name=file_path.stem, + text=module_doc.text, + ) + ) + + for node in walk_tree(root_node): + if node.type == "class_definition": + name = field_text(node, "name", source_bytes) + class_doc = _extract_python_body_docstring(node, source_bytes) + if class_doc is None or not name: + continue + items.append( + make_docstring_item( + file_path=file_path, + line_number=class_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="class", + target_name=name, + text=class_doc.text, + ) + ) + continue + + if node.type not in {"function_definition", "async_function_definition"}: + continue + + name = field_text(node, "name", source_bytes) + function_doc = _extract_python_body_docstring(node, source_bytes) + if function_doc is None or not name: + continue + if ( + name == "__init__" + and len(function_doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN + ): + continue + + items.append( + make_docstring_item( + file_path=file_path, + line_number=function_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="function", + target_name=name, + text=function_doc.text, + ) + ) + + return items + + +def _extract_python_leading_docstring( + container_node: Any, + source_bytes: bytes, +) -> _DocstringMatch | None: + expression = _first_expression_string(container_node) + if expression is None: + return None + text = _clean_python_string(node_text(expression, source_bytes)) + if not text: + return None + return _DocstringMatch(text=text, line_number=line_number(expression)) + + +def _extract_python_body_docstring( + definition_node: Any, + source_bytes: bytes, +) -> _DocstringMatch | None: + body = definition_node.child_by_field_name("body") + if body is None: + return None + return _extract_python_leading_docstring(body, source_bytes) + + +def _first_expression_string(container_node: Any) -> Any | None: + named_children = list(getattr(container_node, "named_children", ())) + if not named_children: + return None + first = named_children[0] + if first.type != "expression_statement": + return None + + for child in getattr(first, "named_children", ()): + if child.type in {"string", "concatenated_string"}: + return child + return None + + +def _clean_python_string(value: str) -> str | None: + stripped = value.strip() + if not stripped: + return None + + try: + parsed = ast.literal_eval(stripped) + except (SyntaxError, ValueError): + parsed = _strip_wrapping_quotes(stripped) + if not isinstance(parsed, str): + return None + + cleaned = parsed.strip() + return cleaned or None + + +def _strip_wrapping_quotes(value: str) -> str: + for quote in ('"""', "'''", '"', "'"): + if value.startswith(quote) and value.endswith(quote) and len(value) >= 2: + return value[len(quote) : len(value) - len(quote)].strip() + return value diff --git a/src/specleft/discovery/miners/shared/common.py b/src/specleft/discovery/miners/shared/common.py new file mode 100644 index 0000000..dd1baec --- /dev/null +++ b/src/specleft/discovery/miners/shared/common.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Shared helpers for discovery miners.""" + +from __future__ import annotations + +import fnmatch +import time +from pathlib import Path +from typing import Any + +from specleft.discovery.context import MinerContext +from specleft.discovery.models import ( + DiscoveredItem, + DocstringMeta, + ItemKind, + SupportedLanguage, +) + +CONFIDENCE = 0.8 +TEST_FILE_PATTERNS = ( + "test_*.py", + "*_test.py", + "*_tests.py", + "test_*.ts", + "*.test.ts", + "*.spec.ts", + "test_*.tsx", + "*.test.tsx", + "*.spec.tsx", + "test_*.js", + "*.test.js", + "*.spec.js", + "test_*.jsx", + "*.test.jsx", + "*.spec.jsx", + "test_*.mjs", + "*.test.mjs", + "*.spec.mjs", +) + + +def candidate_source_files(ctx: MinerContext) -> list[Path]: + """Return configured source files in deterministic order.""" + source_dirs = ctx.config.source_dirs + if not source_dirs: + return [] + + return sorted( + ctx.file_index.files_under(*source_dirs), + key=lambda value: value.as_posix(), + ) + + +def is_test_file(path: Path) -> bool: + """Return whether a path should be excluded as a test file.""" + file_name = path.name + if any(part in {"tests", "__tests__"} for part in path.parts): + return True + return any(fnmatch.fnmatch(file_name, pattern) for pattern in TEST_FILE_PATTERNS) + + +def make_docstring_item( + *, + file_path: Path, + line_number: int, + language: SupportedLanguage, + target_kind: str, + target_name: str | None, + text: str, +) -> DiscoveredItem: + """Build a typed discovery item for docstring/JSDoc output.""" + item_name = ( + f"{target_kind}:{target_name}" if target_name else f"module:{file_path.stem}" + ) + metadata = DocstringMeta( + target_kind=target_kind, + target_name=target_name, + text=text, + ) + return DiscoveredItem( + kind=ItemKind.DOCSTRING, + name=item_name, + file_path=file_path, + line_number=line_number, + language=language, + raw_text=text, + metadata=metadata.model_dump(), + confidence=CONFIDENCE, + ) + + +def elapsed_ms(started: float) -> int: + """Return elapsed milliseconds from a `time.perf_counter()` start.""" + return max(0, int((time.perf_counter() - started) * 1000)) + + +def walk_tree(node: Any) -> list[Any]: + """Return all descendant nodes in depth-first order.""" + nodes: list[Any] = [] + for child in getattr(node, "children", ()): + nodes.append(child) + nodes.extend(walk_tree(child)) + return nodes + + +def line_number(node: Any) -> int: + """Return 1-based line number for a tree-sitter node.""" + return int(node.start_point[0]) + 1 + + +def field_text(node: Any, field: str, source_bytes: bytes) -> str | None: + """Return source text for a named field on a node.""" + field_node = node.child_by_field_name(field) + if field_node is None: + return None + text = node_text(field_node, source_bytes).strip() + return text or None + + +def node_text(node: Any, source_bytes: bytes) -> str: + """Return best-effort source text for a tree-sitter node.""" + raw = getattr(node, "text", None) + if isinstance(raw, bytes): + return raw.decode("utf-8", errors="ignore") + if isinstance(raw, str): + return raw + + start_byte = getattr(node, "start_byte", None) + end_byte = getattr(node, "end_byte", None) + if isinstance(start_byte, int) and isinstance(end_byte, int): + return source_bytes[start_byte:end_byte].decode("utf-8", errors="ignore") + return "" diff --git a/src/specleft/discovery/miners/shared/docstrings.py b/src/specleft/discovery/miners/shared/docstrings.py index fa59953..554def7 100644 --- a/src/specleft/discovery/miners/shared/docstrings.py +++ b/src/specleft/discovery/miners/shared/docstrings.py @@ -1,63 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2026 SpecLeft Contributors -"""Docstring/JSDoc miner for discovery pipeline.""" +"""Docstring/JSDoc miner orchestration.""" from __future__ import annotations -import ast -import fnmatch import time import uuid -from dataclasses import dataclass -from pathlib import Path -from typing import Any from specleft.discovery.context import MinerContext -from specleft.discovery.models import ( - DiscoveredItem, - DocstringMeta, - ItemKind, - MinerResult, - SupportedLanguage, +from specleft.discovery.miners.python.docstrings import extract_python_items +from specleft.discovery.miners.shared.common import ( + candidate_source_files, + elapsed_ms, + is_test_file, ) - -_CONFIDENCE = 0.8 -_MEANINGFUL_INIT_DOCSTRING_LEN = 10 -_JSDOC_TARGET_TYPES = frozenset( - { - "function_declaration", - "class_declaration", - "method_definition", - "export_statement", - } -) -_TEST_FILE_PATTERNS = ( - "test_*.py", - "*_test.py", - "*_tests.py", - "test_*.ts", - "*.test.ts", - "*.spec.ts", - "test_*.tsx", - "*.test.tsx", - "*.spec.tsx", - "test_*.js", - "*.test.js", - "*.spec.js", - "test_*.jsx", - "*.test.jsx", - "*.spec.jsx", - "test_*.mjs", - "*.test.mjs", - "*.spec.mjs", -) - - -@dataclass(frozen=True) -class _DocstringMatch: - text: str - line_number: int +from specleft.discovery.miners.typescript.jsdoc import extract_jsdoc_items +from specleft.discovery.models import DiscoveredItem, MinerResult, SupportedLanguage class DocstringMiner: @@ -77,8 +36,8 @@ def mine(self, ctx: MinerContext) -> MinerResult: started = time.perf_counter() items: list[DiscoveredItem] = [] - for rel_path in _candidate_source_files(ctx): - if _is_test_file(rel_path): + for rel_path in candidate_source_files(ctx): + if is_test_file(rel_path): continue abs_path = ctx.root / rel_path @@ -93,354 +52,18 @@ def mine(self, ctx: MinerContext) -> MinerResult: root_node, language = parsed if language == SupportedLanguage.PYTHON: - items.extend(_extract_python_items(root_node, source_bytes, rel_path)) + items.extend(extract_python_items(root_node, source_bytes, rel_path)) elif language in ( SupportedLanguage.TYPESCRIPT, SupportedLanguage.JAVASCRIPT, ): items.extend( - _extract_jsdoc_items(root_node, source_bytes, rel_path, language) + extract_jsdoc_items(root_node, source_bytes, rel_path, language) ) return MinerResult( miner_id=self.miner_id, miner_name=self.name, items=items, - duration_ms=_elapsed_ms(started), - ) - - -def _candidate_source_files(ctx: MinerContext) -> list[Path]: - source_dirs = ctx.config.source_dirs - if not source_dirs: - return [] - - return sorted( - ctx.file_index.files_under(*source_dirs), - key=lambda value: value.as_posix(), - ) - - -def _extract_python_items( - root_node: Any, - source_bytes: bytes, - file_path: Path, -) -> list[DiscoveredItem]: - items: list[DiscoveredItem] = [] - module_doc = _extract_python_leading_docstring(root_node, source_bytes) - if module_doc is not None: - items.append( - _make_item( - file_path=file_path, - line_number=module_doc.line_number, - language=SupportedLanguage.PYTHON, - target_kind="module", - target_name=file_path.stem, - text=module_doc.text, - ) - ) - - for node in _walk_tree(root_node): - if node.type == "class_definition": - name = _field_text(node, "name", source_bytes) - class_doc = _extract_python_body_docstring(node, source_bytes) - if class_doc is None or not name: - continue - items.append( - _make_item( - file_path=file_path, - line_number=class_doc.line_number, - language=SupportedLanguage.PYTHON, - target_kind="class", - target_name=name, - text=class_doc.text, - ) - ) - continue - - if node.type not in {"function_definition", "async_function_definition"}: - continue - - name = _field_text(node, "name", source_bytes) - function_doc = _extract_python_body_docstring(node, source_bytes) - if function_doc is None or not name: - continue - if ( - name == "__init__" - and len(function_doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN - ): - continue - - items.append( - _make_item( - file_path=file_path, - line_number=function_doc.line_number, - language=SupportedLanguage.PYTHON, - target_kind="function", - target_name=name, - text=function_doc.text, - ) + duration_ms=elapsed_ms(started), ) - - return items - - -def _extract_jsdoc_items( - root_node: Any, - source_bytes: bytes, - file_path: Path, - language: SupportedLanguage, -) -> list[DiscoveredItem]: - items: list[DiscoveredItem] = [] - - def walk(parent: Any) -> None: - children = list(getattr(parent, "children", ())) - for index, child in enumerate(children): - if child.type == "comment": - jsdoc_text = _normalise_jsdoc(_node_text(child, source_bytes)) - if jsdoc_text: - target = _resolve_jsdoc_target(children, index, source_bytes) - if target is not None: - kind, name = _target_kind_and_name( - target, source_bytes, file_path - ) - items.append( - _make_item( - file_path=file_path, - line_number=_line_number(child), - language=language, - target_kind=kind, - target_name=name, - text=jsdoc_text, - ) - ) - - walk(child) - - walk(root_node) - return items - - -def _resolve_jsdoc_target( - siblings: list[Any], - comment_index: int, - source_bytes: bytes, -) -> Any | None: - comment_node = siblings[comment_index] - for candidate in siblings[comment_index + 1 :]: - if candidate.type == "comment": - continue - - target = _unwrap_export_target(candidate) - if target is None: - break - - if not _is_immediately_before(comment_node, target, source_bytes): - break - return target - return None - - -def _unwrap_export_target(node: Any) -> Any | None: - if node.type in _JSDOC_TARGET_TYPES and node.type != "export_statement": - return node - if node.type != "export_statement": - return None - - for child in getattr(node, "named_children", ()): - if child.type in _JSDOC_TARGET_TYPES and child.type != "export_statement": - return child - return node - - -def _is_immediately_before( - comment_node: Any, target_node: Any, source_bytes: bytes -) -> bool: - source_text = source_bytes.decode("utf-8", errors="ignore") - lines = source_text.splitlines() - comment_line = int(comment_node.end_point[0]) - target_line = int(target_node.start_point[0]) - if target_line < comment_line: - return False - if target_line == comment_line: - return True - for index in range(comment_line + 1, target_line): - if index >= len(lines): - break - if lines[index].strip(): - return False - return True - - -def _target_kind_and_name( - node: Any, - source_bytes: bytes, - file_path: Path, -) -> tuple[str, str | None]: - if node.type == "class_declaration": - return "class", _field_text(node, "name", source_bytes) - if node.type == "method_definition": - return "method", _field_text(node, "name", source_bytes) - if node.type == "function_declaration": - return "function", _field_text(node, "name", source_bytes) - if node.type == "export_statement": - return "module", file_path.stem - return "module", file_path.stem - - -def _extract_python_leading_docstring( - container_node: Any, - source_bytes: bytes, -) -> _DocstringMatch | None: - expression = _first_expression_string(container_node) - if expression is None: - return None - text = _clean_python_string(_node_text(expression, source_bytes)) - if not text: - return None - return _DocstringMatch(text=text, line_number=_line_number(expression)) - - -def _extract_python_body_docstring( - definition_node: Any, - source_bytes: bytes, -) -> _DocstringMatch | None: - body = definition_node.child_by_field_name("body") - if body is None: - return None - return _extract_python_leading_docstring(body, source_bytes) - - -def _first_expression_string(container_node: Any) -> Any | None: - named_children = list(getattr(container_node, "named_children", ())) - if not named_children: - return None - first = named_children[0] - if first.type != "expression_statement": - return None - - for child in getattr(first, "named_children", ()): - if child.type in {"string", "concatenated_string"}: - return child - return None - - -def _clean_python_string(value: str) -> str | None: - stripped = value.strip() - if not stripped: - return None - - try: - parsed = ast.literal_eval(stripped) - except (SyntaxError, ValueError): - parsed = _strip_wrapping_quotes(stripped) - if not isinstance(parsed, str): - return None - - cleaned = parsed.strip() - return cleaned or None - - -def _strip_wrapping_quotes(value: str) -> str: - for quote in ('"""', "'''", '"', "'"): - if value.startswith(quote) and value.endswith(quote) and len(value) >= 2: - return value[len(quote) : len(value) - len(quote)].strip() - return value - - -def _normalise_jsdoc(raw_comment: str) -> str | None: - stripped = raw_comment.strip() - if not stripped.startswith("/**"): - return None - - content = stripped - if content.startswith("/**"): - content = content[3:] - if content.endswith("*/"): - content = content[:-2] - - lines = [] - for line in content.splitlines(): - cleaned = line.lstrip() - if cleaned.startswith("*"): - cleaned = cleaned[1:] - if cleaned.startswith(" "): - cleaned = cleaned[1:] - lines.append(cleaned.rstrip()) - - text = "\n".join(lines).strip() - return text or None - - -def _make_item( - *, - file_path: Path, - line_number: int, - language: SupportedLanguage, - target_kind: str, - target_name: str | None, - text: str, -) -> DiscoveredItem: - item_name = ( - f"{target_kind}:{target_name}" if target_name else f"module:{file_path.stem}" - ) - metadata = DocstringMeta( - target_kind=target_kind, - target_name=target_name, - text=text, - ) - return DiscoveredItem( - kind=ItemKind.DOCSTRING, - name=item_name, - file_path=file_path, - line_number=line_number, - language=language, - raw_text=text, - metadata=metadata.model_dump(), - confidence=_CONFIDENCE, - ) - - -def _walk_tree(node: Any) -> list[Any]: - nodes: list[Any] = [] - for child in getattr(node, "children", ()): - nodes.append(child) - nodes.extend(_walk_tree(child)) - return nodes - - -def _line_number(node: Any) -> int: - return int(node.start_point[0]) + 1 - - -def _field_text(node: Any, field: str, source_bytes: bytes) -> str | None: - field_node = node.child_by_field_name(field) - if field_node is None: - return None - text = _node_text(field_node, source_bytes).strip() - return text or None - - -def _node_text(node: Any, source_bytes: bytes) -> str: - raw = getattr(node, "text", None) - if isinstance(raw, bytes): - return raw.decode("utf-8", errors="ignore") - if isinstance(raw, str): - return raw - - start_byte = getattr(node, "start_byte", None) - end_byte = getattr(node, "end_byte", None) - if isinstance(start_byte, int) and isinstance(end_byte, int): - return source_bytes[start_byte:end_byte].decode("utf-8", errors="ignore") - return "" - - -def _is_test_file(path: Path) -> bool: - file_name = path.name - if any(part in {"tests", "__tests__"} for part in path.parts): - return True - return any(fnmatch.fnmatch(file_name, pattern) for pattern in _TEST_FILE_PATTERNS) - - -def _elapsed_ms(started: float) -> int: - return max(0, int((time.perf_counter() - started) * 1000)) diff --git a/src/specleft/discovery/miners/typescript/__init__.py b/src/specleft/discovery/miners/typescript/__init__.py new file mode 100644 index 0000000..96fabc5 --- /dev/null +++ b/src/specleft/discovery/miners/typescript/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""TypeScript/JavaScript-specific discovery miners.""" + +from specleft.discovery.miners.typescript.jsdoc import extract_jsdoc_items + +__all__ = ["extract_jsdoc_items"] diff --git a/src/specleft/discovery/miners/typescript/jsdoc.py b/src/specleft/discovery/miners/typescript/jsdoc.py new file mode 100644 index 0000000..8459703 --- /dev/null +++ b/src/specleft/discovery/miners/typescript/jsdoc.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""TypeScript/JavaScript JSDoc extraction for discovery miners.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from specleft.discovery.miners.shared.common import ( + field_text, + line_number, + make_docstring_item, + node_text, +) +from specleft.discovery.models import DiscoveredItem, SupportedLanguage + +_JSDOC_TARGET_TYPES = frozenset( + { + "function_declaration", + "class_declaration", + "method_definition", + "export_statement", + } +) + + +def extract_jsdoc_items( + root_node: Any, + source_bytes: bytes, + file_path: Path, + language: SupportedLanguage, +) -> list[DiscoveredItem]: + """Extract attached JSDoc comments for TS/JS declarations.""" + items: list[DiscoveredItem] = [] + + def walk(parent: Any) -> None: + children = list(getattr(parent, "children", ())) + for index, child in enumerate(children): + if child.type == "comment": + jsdoc_text = _normalise_jsdoc(node_text(child, source_bytes)) + if jsdoc_text: + target = _resolve_jsdoc_target(children, index, source_bytes) + if target is not None: + kind, name = _target_kind_and_name( + target, source_bytes, file_path + ) + items.append( + make_docstring_item( + file_path=file_path, + line_number=line_number(child), + language=language, + target_kind=kind, + target_name=name, + text=jsdoc_text, + ) + ) + + walk(child) + + walk(root_node) + return items + + +def _resolve_jsdoc_target( + siblings: list[Any], + comment_index: int, + source_bytes: bytes, +) -> Any | None: + comment_node = siblings[comment_index] + for candidate in siblings[comment_index + 1 :]: + if candidate.type == "comment": + continue + + target = _unwrap_export_target(candidate) + if target is None: + break + + if not _is_immediately_before(comment_node, target, source_bytes): + break + return target + return None + + +def _unwrap_export_target(node: Any) -> Any | None: + if node.type in _JSDOC_TARGET_TYPES and node.type != "export_statement": + return node + if node.type != "export_statement": + return None + + for child in getattr(node, "named_children", ()): + if child.type in _JSDOC_TARGET_TYPES and child.type != "export_statement": + return child + return node + + +def _is_immediately_before( + comment_node: Any, target_node: Any, source_bytes: bytes +) -> bool: + source_text = source_bytes.decode("utf-8", errors="ignore") + lines = source_text.splitlines() + comment_line = int(comment_node.end_point[0]) + target_line = int(target_node.start_point[0]) + if target_line < comment_line: + return False + if target_line == comment_line: + return True + for index in range(comment_line + 1, target_line): + if index >= len(lines): + break + if lines[index].strip(): + return False + return True + + +def _target_kind_and_name( + node: Any, + source_bytes: bytes, + file_path: Path, +) -> tuple[str, str | None]: + if node.type == "class_declaration": + return "class", field_text(node, "name", source_bytes) + if node.type == "method_definition": + return "method", field_text(node, "name", source_bytes) + if node.type == "function_declaration": + return "function", field_text(node, "name", source_bytes) + if node.type == "export_statement": + return "module", file_path.stem + return "module", file_path.stem + + +def _normalise_jsdoc(raw_comment: str) -> str | None: + stripped = raw_comment.strip() + if not stripped.startswith("/**"): + return None + + content = stripped + if content.startswith("/**"): + content = content[3:] + if content.endswith("*/"): + content = content[:-2] + + lines = [] + for line in content.splitlines(): + cleaned = line.lstrip() + if cleaned.startswith("*"): + cleaned = cleaned[1:] + if cleaned.startswith(" "): + cleaned = cleaned[1:] + lines.append(cleaned.rstrip()) + + text = "\n".join(lines).strip() + return text or None From fb536773935b54fc046d9217265ace73abc7d605 Mon Sep 17 00:00:00 2001 From: Richard-Otterli Date: Fri, 6 Mar 2026 20:42:49 +0000 Subject: [PATCH 5/5] Reorganize remaining default miner modules (#131) --- src/specleft/discovery/miners/__init__.py | 5 +- src/specleft/discovery/miners/defaults.py | 18 +++++ .../discovery/miners/shared/__init__.py | 3 +- .../discovery/miners/shared/readme.py | 72 +++++++++++++++++++ src/specleft/discovery/pipeline.py | 63 +--------------- 5 files changed, 97 insertions(+), 64 deletions(-) create mode 100644 src/specleft/discovery/miners/defaults.py create mode 100644 src/specleft/discovery/miners/shared/readme.py diff --git a/src/specleft/discovery/miners/__init__.py b/src/specleft/discovery/miners/__init__.py index 7c6d4ea..0f23a79 100644 --- a/src/specleft/discovery/miners/__init__.py +++ b/src/specleft/discovery/miners/__init__.py @@ -3,6 +3,7 @@ """Discovery miner implementations.""" -from specleft.discovery.miners.shared import DocstringMiner +from specleft.discovery.miners.defaults import default_miners +from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner -__all__ = ["DocstringMiner"] +__all__ = ["DocstringMiner", "ReadmeOverviewMiner", "default_miners"] diff --git a/src/specleft/discovery/miners/defaults.py b/src/specleft/discovery/miners/defaults.py new file mode 100644 index 0000000..f6af3bf --- /dev/null +++ b/src/specleft/discovery/miners/defaults.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Default miner registry for discovery pipeline wiring.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner + +if TYPE_CHECKING: + from specleft.discovery.pipeline import BaseMiner + + +def default_miners() -> list[BaseMiner]: + """Return default miners in deterministic execution order.""" + return [ReadmeOverviewMiner(), DocstringMiner()] diff --git a/src/specleft/discovery/miners/shared/__init__.py b/src/specleft/discovery/miners/shared/__init__.py index 85ce2aa..8e11f15 100644 --- a/src/specleft/discovery/miners/shared/__init__.py +++ b/src/specleft/discovery/miners/shared/__init__.py @@ -4,5 +4,6 @@ """Shared miners used by multiple discovery workflows.""" from specleft.discovery.miners.shared.docstrings import DocstringMiner +from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner -__all__ = ["DocstringMiner"] +__all__ = ["DocstringMiner", "ReadmeOverviewMiner"] diff --git a/src/specleft/discovery/miners/shared/readme.py b/src/specleft/discovery/miners/shared/readme.py new file mode 100644 index 0000000..368026f --- /dev/null +++ b/src/specleft/discovery/miners/shared/readme.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Language-agnostic README overview miner.""" + +from __future__ import annotations + +import uuid +from pathlib import Path + +from specleft.discovery.context import MinerContext +from specleft.discovery.models import ( + DiscoveredItem, + DocstringMeta, + ItemKind, + MinerResult, + SupportedLanguage, +) + + +class ReadmeOverviewMiner: + """Extract a single high-level project overview from README content.""" + + miner_id = uuid.UUID("2f87e7a5-a362-4adc-a005-84457b6abc04") + name = "readme_overview" + languages: frozenset[SupportedLanguage] = frozenset() + + def mine(self, ctx: MinerContext) -> MinerResult: + readme_paths = ( + Path("README.md"), + Path("README.rst"), + Path("README.txt"), + ) + + items: list[DiscoveredItem] = [] + for rel_path in readme_paths: + abs_path = ctx.root / rel_path + if not abs_path.is_file(): + continue + + try: + raw_text = abs_path.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + continue + + first_line = next( + (line.strip() for line in raw_text.splitlines() if line.strip()), + "Project overview", + ) + item = DiscoveredItem( + kind=ItemKind.DOCSTRING, + name="project_overview", + file_path=rel_path, + line_number=1, + language=None, + raw_text=first_line, + metadata=DocstringMeta( + target_kind="module", + target_name="README", + text=first_line, + ).model_dump(), + confidence=0.3, + ) + items.append(item) + break + + return MinerResult( + miner_id=self.miner_id, + miner_name=self.name, + items=items, + duration_ms=0, + ) diff --git a/src/specleft/discovery/pipeline.py b/src/specleft/discovery/pipeline.py index a123a11..71c8226 100644 --- a/src/specleft/discovery/pipeline.py +++ b/src/specleft/discovery/pipeline.py @@ -16,12 +16,9 @@ from specleft.discovery.framework_detector import FrameworkDetector from specleft.discovery.language_detect import detect_project_languages from specleft.discovery.language_registry import LanguageRegistry -from specleft.discovery.miners import DocstringMiner +from specleft.discovery.miners import default_miners from specleft.discovery.models import ( DiscoveryReport, - DiscoveredItem, - DocstringMeta, - ItemKind, MinerErrorKind, MinerResult, SupportedLanguage, @@ -186,61 +183,5 @@ def _normalize_languages( return normalized -class _ReadmeMiner: - """Minimal built-in miner used as default pipeline baseline.""" - - miner_id = uuid.UUID("2f87e7a5-a362-4adc-a005-84457b6abc04") - name = "readme_overview" - languages: frozenset[SupportedLanguage] = frozenset() - - def mine(self, ctx: MinerContext) -> MinerResult: - readme_paths = ( - Path("README.md"), - Path("README.rst"), - Path("README.txt"), - ) - - items: list[DiscoveredItem] = [] - for rel_path in readme_paths: - abs_path = ctx.root / rel_path - if not abs_path.is_file(): - continue - - try: - raw_text = abs_path.read_text(encoding="utf-8") - except OSError: - continue - except UnicodeDecodeError: - continue - - first_line = next( - (line.strip() for line in raw_text.splitlines() if line.strip()), - "Project overview", - ) - item = DiscoveredItem( - kind=ItemKind.DOCSTRING, - name="project_overview", - file_path=rel_path, - line_number=1, - language=None, - raw_text=first_line, - metadata=DocstringMeta( - target_kind="module", - target_name="README", - text=first_line, - ).model_dump(), - confidence=0.3, - ) - items.append(item) - break - - return MinerResult( - miner_id=self.miner_id, - miner_name=self.name, - items=items, - duration_ms=0, - ) - - def _default_miners() -> list[BaseMiner]: - return [_ReadmeMiner(), DocstringMiner()] + return default_miners()