diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md index 6e64157..5f858a6 100644 --- a/features/feature-spec-discovery.md +++ b/features/feature-spec-discovery.md @@ -62,6 +62,22 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser **When** I call `build_default_pipeline(root).run()` **Then** a `DiscoveryReport` is returned with run duration, detected languages, miner results, and total item counts. +### Story 7: Shared docstring and JSDoc mining +**Scenario:** As a discovery pipeline, I need intent-rich text signals from source code comments. +**Given** configured source directories and a shared miner context +**When** `DocstringMiner` runs +**Then** it extracts Python module/class/function docstrings and TypeScript/JavaScript JSDoc comments into `DiscoveredItem(kind=DOCSTRING)` entries with typed `DocstringMeta`. + +**Scenario:** As a pipeline maintainer, I need predictable mining scope and exclusions. +**Given** `source_dirs` in `DiscoveryConfig` +**When** `DocstringMiner` scans files +**Then** it reads only `ctx.file_index.files_under(*ctx.config.source_dirs)` and excludes test files (`test_*.py`, `*.test.ts`, etc.). + +**Scenario:** As a spec generation pipeline, I need clean signal quality. +**Given** Python `__init__` docstrings +**When** the content is trivial (10 chars or fewer) +**Then** it is skipped and not emitted as a discovery item. + ## Acceptance Criteria - Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise. - `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input. @@ -84,3 +100,7 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser - Integration on the SpecLeft repository produces `report.total_items > 0`. - Tests cover config parsing, framework detection, pipeline registration/filtering/error isolation, and default pipeline integration. - Feature spec is updated to document the discovery layer behavior introduced in issues #125 and #126. +- `DocstringMiner` emits module/class/function Python docstrings with `DocstringMeta` and `confidence=0.8`. +- TypeScript/JavaScript JSDoc comments immediately preceding declarations are emitted with the correct `SupportedLanguage`. +- Test files are excluded from docstring mining and configured `source_dirs` scope is respected. +- Trivial `__init__` docstrings (<=10 chars) are skipped. diff --git a/src/specleft/discovery/miners/__init__.py b/src/specleft/discovery/miners/__init__.py new file mode 100644 index 0000000..0f23a79 --- /dev/null +++ b/src/specleft/discovery/miners/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Discovery miner implementations.""" + +from specleft.discovery.miners.defaults import default_miners +from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner + +__all__ = ["DocstringMiner", "ReadmeOverviewMiner", "default_miners"] diff --git a/src/specleft/discovery/miners/defaults.py b/src/specleft/discovery/miners/defaults.py new file mode 100644 index 0000000..f6af3bf --- /dev/null +++ b/src/specleft/discovery/miners/defaults.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Default miner registry for discovery pipeline wiring.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner + +if TYPE_CHECKING: + from specleft.discovery.pipeline import BaseMiner + + +def default_miners() -> list[BaseMiner]: + """Return default miners in deterministic execution order.""" + return [ReadmeOverviewMiner(), DocstringMiner()] diff --git a/src/specleft/discovery/miners/python/__init__.py b/src/specleft/discovery/miners/python/__init__.py new file mode 100644 index 0000000..b933028 --- /dev/null +++ b/src/specleft/discovery/miners/python/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Python-specific discovery miners.""" + +from specleft.discovery.miners.python.docstrings import extract_python_items + +__all__ = ["extract_python_items"] diff --git a/src/specleft/discovery/miners/python/docstrings.py b/src/specleft/discovery/miners/python/docstrings.py new file mode 100644 index 0000000..e60814c --- /dev/null +++ b/src/specleft/discovery/miners/python/docstrings.py @@ -0,0 +1,153 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Python docstring extraction for discovery miners.""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from specleft.discovery.miners.shared.common import ( + field_text, + line_number, + make_docstring_item, + node_text, + walk_tree, +) +from specleft.discovery.models import DiscoveredItem, SupportedLanguage + +_MEANINGFUL_INIT_DOCSTRING_LEN = 10 + + +@dataclass(frozen=True) +class _DocstringMatch: + text: str + line_number: int + + +def extract_python_items( + root_node: Any, + source_bytes: bytes, + file_path: Path, +) -> list[DiscoveredItem]: + """Extract module/class/function docstrings from a Python source tree.""" + items: list[DiscoveredItem] = [] + module_doc = _extract_python_leading_docstring(root_node, source_bytes) + if module_doc is not None: + items.append( + make_docstring_item( + file_path=file_path, + line_number=module_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="module", + target_name=file_path.stem, + text=module_doc.text, + ) + ) + + for node in walk_tree(root_node): + if node.type == "class_definition": + name = field_text(node, "name", source_bytes) + class_doc = _extract_python_body_docstring(node, source_bytes) + if class_doc is None or not name: + continue + items.append( + make_docstring_item( + file_path=file_path, + line_number=class_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="class", + target_name=name, + text=class_doc.text, + ) + ) + continue + + if node.type not in {"function_definition", "async_function_definition"}: + continue + + name = field_text(node, "name", source_bytes) + function_doc = _extract_python_body_docstring(node, source_bytes) + if function_doc is None or not name: + continue + if ( + name == "__init__" + and len(function_doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN + ): + continue + + items.append( + make_docstring_item( + file_path=file_path, + line_number=function_doc.line_number, + language=SupportedLanguage.PYTHON, + target_kind="function", + target_name=name, + text=function_doc.text, + ) + ) + + return items + + +def _extract_python_leading_docstring( + container_node: Any, + source_bytes: bytes, +) -> _DocstringMatch | None: + expression = _first_expression_string(container_node) + if expression is None: + return None + text = _clean_python_string(node_text(expression, source_bytes)) + if not text: + return None + return _DocstringMatch(text=text, line_number=line_number(expression)) + + +def _extract_python_body_docstring( + definition_node: Any, + source_bytes: bytes, +) -> _DocstringMatch | None: + body = definition_node.child_by_field_name("body") + if body is None: + return None + return _extract_python_leading_docstring(body, source_bytes) + + +def _first_expression_string(container_node: Any) -> Any | None: + named_children = list(getattr(container_node, "named_children", ())) + if not named_children: + return None + first = named_children[0] + if first.type != "expression_statement": + return None + + for child in getattr(first, "named_children", ()): + if child.type in {"string", "concatenated_string"}: + return child + return None + + +def _clean_python_string(value: str) -> str | None: + stripped = value.strip() + if not stripped: + return None + + try: + parsed = ast.literal_eval(stripped) + except (SyntaxError, ValueError): + parsed = _strip_wrapping_quotes(stripped) + if not isinstance(parsed, str): + return None + + cleaned = parsed.strip() + return cleaned or None + + +def _strip_wrapping_quotes(value: str) -> str: + for quote in ('"""', "'''", '"', "'"): + if value.startswith(quote) and value.endswith(quote) and len(value) >= 2: + return value[len(quote) : len(value) - len(quote)].strip() + return value diff --git a/src/specleft/discovery/miners/shared/__init__.py b/src/specleft/discovery/miners/shared/__init__.py new file mode 100644 index 0000000..8e11f15 --- /dev/null +++ b/src/specleft/discovery/miners/shared/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Shared miners used by multiple discovery workflows.""" + +from specleft.discovery.miners.shared.docstrings import DocstringMiner +from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner + +__all__ = ["DocstringMiner", "ReadmeOverviewMiner"] diff --git a/src/specleft/discovery/miners/shared/common.py b/src/specleft/discovery/miners/shared/common.py new file mode 100644 index 0000000..dd1baec --- /dev/null +++ b/src/specleft/discovery/miners/shared/common.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Shared helpers for discovery miners.""" + +from __future__ import annotations + +import fnmatch +import time +from pathlib import Path +from typing import Any + +from specleft.discovery.context import MinerContext +from specleft.discovery.models import ( + DiscoveredItem, + DocstringMeta, + ItemKind, + SupportedLanguage, +) + +CONFIDENCE = 0.8 +TEST_FILE_PATTERNS = ( + "test_*.py", + "*_test.py", + "*_tests.py", + "test_*.ts", + "*.test.ts", + "*.spec.ts", + "test_*.tsx", + "*.test.tsx", + "*.spec.tsx", + "test_*.js", + "*.test.js", + "*.spec.js", + "test_*.jsx", + "*.test.jsx", + "*.spec.jsx", + "test_*.mjs", + "*.test.mjs", + "*.spec.mjs", +) + + +def candidate_source_files(ctx: MinerContext) -> list[Path]: + """Return configured source files in deterministic order.""" + source_dirs = ctx.config.source_dirs + if not source_dirs: + return [] + + return sorted( + ctx.file_index.files_under(*source_dirs), + key=lambda value: value.as_posix(), + ) + + +def is_test_file(path: Path) -> bool: + """Return whether a path should be excluded as a test file.""" + file_name = path.name + if any(part in {"tests", "__tests__"} for part in path.parts): + return True + return any(fnmatch.fnmatch(file_name, pattern) for pattern in TEST_FILE_PATTERNS) + + +def make_docstring_item( + *, + file_path: Path, + line_number: int, + language: SupportedLanguage, + target_kind: str, + target_name: str | None, + text: str, +) -> DiscoveredItem: + """Build a typed discovery item for docstring/JSDoc output.""" + item_name = ( + f"{target_kind}:{target_name}" if target_name else f"module:{file_path.stem}" + ) + metadata = DocstringMeta( + target_kind=target_kind, + target_name=target_name, + text=text, + ) + return DiscoveredItem( + kind=ItemKind.DOCSTRING, + name=item_name, + file_path=file_path, + line_number=line_number, + language=language, + raw_text=text, + metadata=metadata.model_dump(), + confidence=CONFIDENCE, + ) + + +def elapsed_ms(started: float) -> int: + """Return elapsed milliseconds from a `time.perf_counter()` start.""" + return max(0, int((time.perf_counter() - started) * 1000)) + + +def walk_tree(node: Any) -> list[Any]: + """Return all descendant nodes in depth-first order.""" + nodes: list[Any] = [] + for child in getattr(node, "children", ()): + nodes.append(child) + nodes.extend(walk_tree(child)) + return nodes + + +def line_number(node: Any) -> int: + """Return 1-based line number for a tree-sitter node.""" + return int(node.start_point[0]) + 1 + + +def field_text(node: Any, field: str, source_bytes: bytes) -> str | None: + """Return source text for a named field on a node.""" + field_node = node.child_by_field_name(field) + if field_node is None: + return None + text = node_text(field_node, source_bytes).strip() + return text or None + + +def node_text(node: Any, source_bytes: bytes) -> str: + """Return best-effort source text for a tree-sitter node.""" + raw = getattr(node, "text", None) + if isinstance(raw, bytes): + return raw.decode("utf-8", errors="ignore") + if isinstance(raw, str): + return raw + + start_byte = getattr(node, "start_byte", None) + end_byte = getattr(node, "end_byte", None) + if isinstance(start_byte, int) and isinstance(end_byte, int): + return source_bytes[start_byte:end_byte].decode("utf-8", errors="ignore") + return "" diff --git a/src/specleft/discovery/miners/shared/docstrings.py b/src/specleft/discovery/miners/shared/docstrings.py new file mode 100644 index 0000000..554def7 --- /dev/null +++ b/src/specleft/discovery/miners/shared/docstrings.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Docstring/JSDoc miner orchestration.""" + +from __future__ import annotations + +import time +import uuid + +from specleft.discovery.context import MinerContext +from specleft.discovery.miners.python.docstrings import extract_python_items +from specleft.discovery.miners.shared.common import ( + candidate_source_files, + elapsed_ms, + is_test_file, +) +from specleft.discovery.miners.typescript.jsdoc import extract_jsdoc_items +from specleft.discovery.models import DiscoveredItem, MinerResult, SupportedLanguage + + +class DocstringMiner: + """Extract Python docstrings and TypeScript/JavaScript JSDoc comments.""" + + miner_id = uuid.UUID("dcc2e631-67e7-4af7-b8ba-ca3397ccae0b") + name = "docstrings" + languages = frozenset( + { + SupportedLanguage.PYTHON, + SupportedLanguage.TYPESCRIPT, + SupportedLanguage.JAVASCRIPT, + } + ) + + def mine(self, ctx: MinerContext) -> MinerResult: + started = time.perf_counter() + items: list[DiscoveredItem] = [] + + for rel_path in candidate_source_files(ctx): + if is_test_file(rel_path): + continue + + abs_path = ctx.root / rel_path + parsed = ctx.registry.parse(abs_path) + if parsed is None: + continue + + try: + source_bytes = abs_path.read_bytes() + except OSError: + continue + + root_node, language = parsed + if language == SupportedLanguage.PYTHON: + items.extend(extract_python_items(root_node, source_bytes, rel_path)) + elif language in ( + SupportedLanguage.TYPESCRIPT, + SupportedLanguage.JAVASCRIPT, + ): + items.extend( + extract_jsdoc_items(root_node, source_bytes, rel_path, language) + ) + + return MinerResult( + miner_id=self.miner_id, + miner_name=self.name, + items=items, + duration_ms=elapsed_ms(started), + ) diff --git a/src/specleft/discovery/miners/shared/readme.py b/src/specleft/discovery/miners/shared/readme.py new file mode 100644 index 0000000..368026f --- /dev/null +++ b/src/specleft/discovery/miners/shared/readme.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Language-agnostic README overview miner.""" + +from __future__ import annotations + +import uuid +from pathlib import Path + +from specleft.discovery.context import MinerContext +from specleft.discovery.models import ( + DiscoveredItem, + DocstringMeta, + ItemKind, + MinerResult, + SupportedLanguage, +) + + +class ReadmeOverviewMiner: + """Extract a single high-level project overview from README content.""" + + miner_id = uuid.UUID("2f87e7a5-a362-4adc-a005-84457b6abc04") + name = "readme_overview" + languages: frozenset[SupportedLanguage] = frozenset() + + def mine(self, ctx: MinerContext) -> MinerResult: + readme_paths = ( + Path("README.md"), + Path("README.rst"), + Path("README.txt"), + ) + + items: list[DiscoveredItem] = [] + for rel_path in readme_paths: + abs_path = ctx.root / rel_path + if not abs_path.is_file(): + continue + + try: + raw_text = abs_path.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + continue + + first_line = next( + (line.strip() for line in raw_text.splitlines() if line.strip()), + "Project overview", + ) + item = DiscoveredItem( + kind=ItemKind.DOCSTRING, + name="project_overview", + file_path=rel_path, + line_number=1, + language=None, + raw_text=first_line, + metadata=DocstringMeta( + target_kind="module", + target_name="README", + text=first_line, + ).model_dump(), + confidence=0.3, + ) + items.append(item) + break + + return MinerResult( + miner_id=self.miner_id, + miner_name=self.name, + items=items, + duration_ms=0, + ) diff --git a/src/specleft/discovery/miners/typescript/__init__.py b/src/specleft/discovery/miners/typescript/__init__.py new file mode 100644 index 0000000..96fabc5 --- /dev/null +++ b/src/specleft/discovery/miners/typescript/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""TypeScript/JavaScript-specific discovery miners.""" + +from specleft.discovery.miners.typescript.jsdoc import extract_jsdoc_items + +__all__ = ["extract_jsdoc_items"] diff --git a/src/specleft/discovery/miners/typescript/jsdoc.py b/src/specleft/discovery/miners/typescript/jsdoc.py new file mode 100644 index 0000000..8459703 --- /dev/null +++ b/src/specleft/discovery/miners/typescript/jsdoc.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""TypeScript/JavaScript JSDoc extraction for discovery miners.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from specleft.discovery.miners.shared.common import ( + field_text, + line_number, + make_docstring_item, + node_text, +) +from specleft.discovery.models import DiscoveredItem, SupportedLanguage + +_JSDOC_TARGET_TYPES = frozenset( + { + "function_declaration", + "class_declaration", + "method_definition", + "export_statement", + } +) + + +def extract_jsdoc_items( + root_node: Any, + source_bytes: bytes, + file_path: Path, + language: SupportedLanguage, +) -> list[DiscoveredItem]: + """Extract attached JSDoc comments for TS/JS declarations.""" + items: list[DiscoveredItem] = [] + + def walk(parent: Any) -> None: + children = list(getattr(parent, "children", ())) + for index, child in enumerate(children): + if child.type == "comment": + jsdoc_text = _normalise_jsdoc(node_text(child, source_bytes)) + if jsdoc_text: + target = _resolve_jsdoc_target(children, index, source_bytes) + if target is not None: + kind, name = _target_kind_and_name( + target, source_bytes, file_path + ) + items.append( + make_docstring_item( + file_path=file_path, + line_number=line_number(child), + language=language, + target_kind=kind, + target_name=name, + text=jsdoc_text, + ) + ) + + walk(child) + + walk(root_node) + return items + + +def _resolve_jsdoc_target( + siblings: list[Any], + comment_index: int, + source_bytes: bytes, +) -> Any | None: + comment_node = siblings[comment_index] + for candidate in siblings[comment_index + 1 :]: + if candidate.type == "comment": + continue + + target = _unwrap_export_target(candidate) + if target is None: + break + + if not _is_immediately_before(comment_node, target, source_bytes): + break + return target + return None + + +def _unwrap_export_target(node: Any) -> Any | None: + if node.type in _JSDOC_TARGET_TYPES and node.type != "export_statement": + return node + if node.type != "export_statement": + return None + + for child in getattr(node, "named_children", ()): + if child.type in _JSDOC_TARGET_TYPES and child.type != "export_statement": + return child + return node + + +def _is_immediately_before( + comment_node: Any, target_node: Any, source_bytes: bytes +) -> bool: + source_text = source_bytes.decode("utf-8", errors="ignore") + lines = source_text.splitlines() + comment_line = int(comment_node.end_point[0]) + target_line = int(target_node.start_point[0]) + if target_line < comment_line: + return False + if target_line == comment_line: + return True + for index in range(comment_line + 1, target_line): + if index >= len(lines): + break + if lines[index].strip(): + return False + return True + + +def _target_kind_and_name( + node: Any, + source_bytes: bytes, + file_path: Path, +) -> tuple[str, str | None]: + if node.type == "class_declaration": + return "class", field_text(node, "name", source_bytes) + if node.type == "method_definition": + return "method", field_text(node, "name", source_bytes) + if node.type == "function_declaration": + return "function", field_text(node, "name", source_bytes) + if node.type == "export_statement": + return "module", file_path.stem + return "module", file_path.stem + + +def _normalise_jsdoc(raw_comment: str) -> str | None: + stripped = raw_comment.strip() + if not stripped.startswith("/**"): + return None + + content = stripped + if content.startswith("/**"): + content = content[3:] + if content.endswith("*/"): + content = content[:-2] + + lines = [] + for line in content.splitlines(): + cleaned = line.lstrip() + if cleaned.startswith("*"): + cleaned = cleaned[1:] + if cleaned.startswith(" "): + cleaned = cleaned[1:] + lines.append(cleaned.rstrip()) + + text = "\n".join(lines).strip() + return text or None diff --git a/src/specleft/discovery/pipeline.py b/src/specleft/discovery/pipeline.py index ed4d9fe..71c8226 100644 --- a/src/specleft/discovery/pipeline.py +++ b/src/specleft/discovery/pipeline.py @@ -16,11 +16,9 @@ from specleft.discovery.framework_detector import FrameworkDetector from specleft.discovery.language_detect import detect_project_languages from specleft.discovery.language_registry import LanguageRegistry +from specleft.discovery.miners import default_miners from specleft.discovery.models import ( DiscoveryReport, - DiscoveredItem, - DocstringMeta, - ItemKind, MinerErrorKind, MinerResult, SupportedLanguage, @@ -185,61 +183,5 @@ def _normalize_languages( return normalized -class _ReadmeMiner: - """Minimal built-in miner used as default pipeline baseline.""" - - miner_id = uuid.UUID("2f87e7a5-a362-4adc-a005-84457b6abc04") - name = "readme_overview" - languages: frozenset[SupportedLanguage] = frozenset() - - def mine(self, ctx: MinerContext) -> MinerResult: - readme_paths = ( - Path("README.md"), - Path("README.rst"), - Path("README.txt"), - ) - - items: list[DiscoveredItem] = [] - for rel_path in readme_paths: - abs_path = ctx.root / rel_path - if not abs_path.is_file(): - continue - - try: - raw_text = abs_path.read_text(encoding="utf-8") - except OSError: - continue - except UnicodeDecodeError: - continue - - first_line = next( - (line.strip() for line in raw_text.splitlines() if line.strip()), - "Project overview", - ) - item = DiscoveredItem( - kind=ItemKind.DOCSTRING, - name="project_overview", - file_path=rel_path, - line_number=1, - language=None, - raw_text=first_line, - metadata=DocstringMeta( - target_kind="module", - target_name="README", - text=first_line, - ).model_dump(), - confidence=0.3, - ) - items.append(item) - break - - return MinerResult( - miner_id=self.miner_id, - miner_name=self.name, - items=items, - duration_ms=0, - ) - - def _default_miners() -> list[BaseMiner]: - return [_ReadmeMiner()] + return default_miners() diff --git a/tests/discovery/miners/test_docstrings.py b/tests/discovery/miners/test_docstrings.py new file mode 100644 index 0000000..89697ec --- /dev/null +++ b/tests/discovery/miners/test_docstrings.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SpecLeft Contributors + +"""Tests for discovery docstring/JSDoc miner.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from specleft.discovery.config import DiscoveryConfig +from specleft.discovery.context import MinerContext +from specleft.discovery.file_index import FileIndex +from specleft.discovery.miners.shared.docstrings import DocstringMiner +from specleft.discovery.models import DocstringMeta, SupportedLanguage + + +@dataclass +class _FakeNode: + type: str + text_value: str = "" + children: list[_FakeNode] = field(default_factory=list) + named_children: list[_FakeNode] = field(default_factory=list) + fields: dict[str, _FakeNode] = field(default_factory=dict) + start_point: tuple[int, int] = (0, 0) + end_point: tuple[int, int] = (0, 0) + + @property + def text(self) -> bytes: + return self.text_value.encode("utf-8") + + def child_by_field_name(self, name: str) -> _FakeNode | None: + return self.fields.get(name) + + +class _RegistryStub: + def __init__( + self, mapping: dict[Path, tuple[Any, SupportedLanguage] | None] + ) -> None: + self._mapping = mapping + self.calls: list[Path] = [] + + def parse(self, file_path: Path) -> tuple[Any, SupportedLanguage] | None: + self.calls.append(file_path) + return self._mapping.get(file_path) + + +def _string_expr(value: str, row: int) -> _FakeNode: + string_node = _FakeNode( + type="string", + text_value=value, + start_point=(row, 0), + end_point=(row, len(value)), + ) + return _FakeNode( + type="expression_statement", + children=[string_node], + named_children=[string_node], + start_point=(row, 0), + end_point=(row, len(value)), + ) + + +def _identifier(value: str, row: int) -> _FakeNode: + return _FakeNode( + type="identifier", + text_value=value, + start_point=(row, 0), + end_point=(row, len(value)), + ) + + +def _python_function(name: str, docstring: str, start_row: int) -> _FakeNode: + name_node = _identifier(name, start_row) + body_doc = _string_expr(docstring, start_row + 1) + body = _FakeNode( + type="block", + children=[body_doc], + named_children=[body_doc], + start_point=(start_row + 1, 0), + end_point=(start_row + 1, 0), + ) + return _FakeNode( + type="function_definition", + children=[name_node, body], + named_children=[name_node, body], + fields={"name": name_node, "body": body}, + start_point=(start_row, 0), + end_point=(start_row + 2, 0), + ) + + +def _python_class(name: str, docstring: str, start_row: int) -> _FakeNode: + name_node = _identifier(name, start_row) + body_doc = _string_expr(docstring, start_row + 1) + body = _FakeNode( + type="block", + children=[body_doc], + named_children=[body_doc], + start_point=(start_row + 1, 0), + end_point=(start_row + 1, 0), + ) + return _FakeNode( + type="class_definition", + children=[name_node, body], + named_children=[name_node, body], + fields={"name": name_node, "body": body}, + start_point=(start_row, 0), + end_point=(start_row + 2, 0), + ) + + +def _python_module_tree() -> _FakeNode: + module_doc = _string_expr('"""Auth module docs."""', 0) + class_node = _python_class("AuthService", '"""Class docs."""', 2) + init_fn = _python_function("__init__", '"""Init."""', 5) + create_user_fn = _python_function( + "create_user", + '"""Create a new user with the given credentials."""', + 8, + ) + class_body = class_node.child_by_field_name("body") + assert class_body is not None + class_body.children.extend([init_fn, create_user_fn]) + class_body.named_children.extend([init_fn, create_user_fn]) + + top_function = _python_function("validate_credentials", '"""Validate login."""', 12) + return _FakeNode( + type="module", + children=[module_doc, class_node, top_function], + named_children=[module_doc, class_node, top_function], + start_point=(0, 0), + end_point=(15, 0), + ) + + +def _jsdoc_comment(text: str, row: int) -> _FakeNode: + return _FakeNode( + type="comment", + text_value=text, + start_point=(row, 0), + end_point=(row, len(text)), + ) + + +def _typescript_function(name: str, row: int) -> _FakeNode: + name_node = _identifier(name, row) + return _FakeNode( + type="function_declaration", + children=[name_node], + named_children=[name_node], + fields={"name": name_node}, + start_point=(row, 0), + end_point=(row + 1, 0), + ) + + +def _javascript_tree(function_name: str) -> _FakeNode: + comment = _jsdoc_comment("/** Creates a new user */", 0) + function = _typescript_function(function_name, 1) + return _FakeNode( + type="program", + children=[comment, function], + named_children=[comment, function], + start_point=(0, 0), + end_point=(2, 0), + ) + + +def _context( + root: Path, + registry: _RegistryStub, + source_dirs: tuple[str, ...] = ("src",), +) -> MinerContext: + return MinerContext( + root=root, + registry=registry, # type: ignore[arg-type] + file_index=FileIndex(root), + frameworks={}, + config=DiscoveryConfig(source_dirs=source_dirs), + ) + + +def test_python_docstrings_include_module_class_and_functions(tmp_path: Path) -> None: + source_file = tmp_path / "src" / "auth.py" + source_file.parent.mkdir(parents=True) + source_file.write_text( + '"""Auth module docs."""\nclass AuthService:\n """Class docs."""\n', + encoding="utf-8", + ) + excluded_test_file = tmp_path / "src" / "test_auth.py" + excluded_test_file.write_text('"""should be skipped"""', encoding="utf-8") + + registry = _RegistryStub( + { + source_file: (_python_module_tree(), SupportedLanguage.PYTHON), + excluded_test_file: (_python_module_tree(), SupportedLanguage.PYTHON), + } + ) + miner = DocstringMiner() + + result = miner.mine(_context(tmp_path, registry)) + + assert result.error is None + assert result.error_kind is None + assert all(item.language == SupportedLanguage.PYTHON for item in result.items) + assert "module:auth" in {item.name for item in result.items} + assert "class:AuthService" in {item.name for item in result.items} + assert "function:create_user" in {item.name for item in result.items} + assert "function:validate_credentials" in {item.name for item in result.items} + assert "function:__init__" not in {item.name for item in result.items} + assert all(isinstance(item.typed_meta(), DocstringMeta) for item in result.items) + + module_item = next(item for item in result.items if item.name == "module:auth") + assert module_item.metadata["target_kind"] == "module" + assert module_item.language == SupportedLanguage.PYTHON + assert registry.calls == [source_file] + + +def test_jsdoc_uses_language_per_extension(tmp_path: Path) -> None: + ts_file = tmp_path / "src" / "auth.ts" + js_file = tmp_path / "src" / "auth.js" + ts_file.parent.mkdir(parents=True) + ts_file.write_text("/** Creates a new user */\nexport function createUser() {}\n") + js_file.write_text("/** Creates a helper */\nfunction createHelper() {}\n") + + registry = _RegistryStub( + { + ts_file: (_javascript_tree("createUser"), SupportedLanguage.TYPESCRIPT), + js_file: (_javascript_tree("createHelper"), SupportedLanguage.JAVASCRIPT), + } + ) + + result = DocstringMiner().mine(_context(tmp_path, registry)) + + by_name = {item.name: item for item in result.items} + assert by_name["function:createUser"].language == SupportedLanguage.TYPESCRIPT + assert by_name["function:createHelper"].language == SupportedLanguage.JAVASCRIPT + assert by_name["function:createUser"].metadata["text"] == "Creates a new user" + + +def test_source_dirs_scope_and_test_patterns_are_respected(tmp_path: Path) -> None: + inside_source = tmp_path / "app" / "module.py" + outside_source = tmp_path / "src" / "outside.py" + ts_test = tmp_path / "app" / "auth.test.ts" + inside_source.parent.mkdir(parents=True) + outside_source.parent.mkdir(parents=True, exist_ok=True) + + inside_source.write_text('"""inside"""') + outside_source.write_text('"""outside"""') + ts_test.write_text("/** should skip */\nfunction x() {}\n") + + registry = _RegistryStub( + { + inside_source: (_python_module_tree(), SupportedLanguage.PYTHON), + outside_source: (_python_module_tree(), SupportedLanguage.PYTHON), + ts_test: (_javascript_tree("x"), SupportedLanguage.TYPESCRIPT), + } + ) + + result = DocstringMiner().mine(_context(tmp_path, registry, source_dirs=("app",))) + + assert all(item.file_path == Path("app/module.py") for item in result.items) + assert registry.calls == [inside_source]