Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions features/feature-spec-discovery.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,22 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser
**When** I call `build_default_pipeline(root).run()`
**Then** a `DiscoveryReport` is returned with run duration, detected languages, miner results, and total item counts.

### Story 7: Shared docstring and JSDoc mining
**Scenario:** As a discovery pipeline, I need intent-rich text signals from source code comments.
**Given** configured source directories and a shared miner context
**When** `DocstringMiner` runs
**Then** it extracts Python module/class/function docstrings and TypeScript/JavaScript JSDoc comments into `DiscoveredItem(kind=DOCSTRING)` entries with typed `DocstringMeta`.

**Scenario:** As a pipeline maintainer, I need predictable mining scope and exclusions.
**Given** `source_dirs` in `DiscoveryConfig`
**When** `DocstringMiner` scans files
**Then** it reads only `ctx.file_index.files_under(*ctx.config.source_dirs)` and excludes test files — any path under a `tests/` or `__tests__/` directory, plus filenames matching patterns such as `test_*.py`, `*_test.py`, `*.test.ts`, and `*.spec.ts`.

**Scenario:** As a spec generation pipeline, I need clean signal quality.
**Given** Python `__init__` docstrings
**When** the content is trivial (10 characters or fewer after stripping whitespace)
**Then** it is skipped and not emitted as a discovery item.

## Acceptance Criteria
- Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise.
- `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input.
Expand All @@ -84,3 +100,7 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser
- Integration on the SpecLeft repository produces `report.total_items > 0`.
- Tests cover config parsing, framework detection, pipeline registration/filtering/error isolation, and default pipeline integration.
- Feature spec is updated to document the discovery layer behavior introduced in issues #125 and #126.
- `DocstringMiner` emits module/class/function Python docstrings with `DocstringMeta` and `confidence=0.8`.
- TypeScript/JavaScript JSDoc comments immediately preceding declarations are emitted with the correct `SupportedLanguage`.
- Test files are excluded from docstring mining and configured `source_dirs` scope is respected.
- Trivial `__init__` docstrings (<=10 chars) are skipped.
9 changes: 9 additions & 0 deletions src/specleft/discovery/miners/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Discovery miner implementations."""

from specleft.discovery.miners.defaults import default_miners
from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner

__all__ = ["DocstringMiner", "ReadmeOverviewMiner", "default_miners"]
18 changes: 18 additions & 0 deletions src/specleft/discovery/miners/defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Default miner registry for discovery pipeline wiring."""

from __future__ import annotations

from typing import TYPE_CHECKING

from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner

if TYPE_CHECKING:
from specleft.discovery.pipeline import BaseMiner


def default_miners() -> list[BaseMiner]:
    """Return the default miners, in deterministic execution order."""
    ordered: list[BaseMiner] = [
        ReadmeOverviewMiner(),
        DocstringMiner(),
    ]
    return ordered
8 changes: 8 additions & 0 deletions src/specleft/discovery/miners/python/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Python-specific discovery miners."""

from specleft.discovery.miners.python.docstrings import extract_python_items

__all__ = ["extract_python_items"]
153 changes: 153 additions & 0 deletions src/specleft/discovery/miners/python/docstrings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Python docstring extraction for discovery miners."""

from __future__ import annotations

import ast
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from specleft.discovery.miners.shared.common import (
field_text,
line_number,
make_docstring_item,
node_text,
walk_tree,
)
from specleft.discovery.models import DiscoveredItem, SupportedLanguage

# `__init__` docstrings whose stripped length is at or below this threshold
# are treated as boilerplate and skipped (see the spec's "clean signal" story).
_MEANINGFUL_INIT_DOCSTRING_LEN = 10


@dataclass(frozen=True)
class _DocstringMatch:
    """A single extracted docstring plus its location in the source file."""

    # Cleaned docstring text (wrapping quotes and surrounding whitespace removed).
    text: str
    # 1-based line number of the docstring expression.
    line_number: int


def extract_python_items(
    root_node: Any,
    source_bytes: bytes,
    file_path: Path,
) -> list[DiscoveredItem]:
    """Extract module/class/function docstrings from a Python source tree."""

    def emit(target_kind: str, target_name: str | None, match: _DocstringMatch) -> DiscoveredItem:
        # Wrap one docstring match in a typed discovery item for this file.
        return make_docstring_item(
            file_path=file_path,
            line_number=match.line_number,
            language=SupportedLanguage.PYTHON,
            target_kind=target_kind,
            target_name=target_name,
            text=match.text,
        )

    results: list[DiscoveredItem] = []

    module_match = _extract_python_leading_docstring(root_node, source_bytes)
    if module_match is not None:
        results.append(emit("module", file_path.stem, module_match))

    function_types = {"function_definition", "async_function_definition"}
    for candidate in walk_tree(root_node):
        is_class = candidate.type == "class_definition"
        if not is_class and candidate.type not in function_types:
            continue

        identifier = field_text(candidate, "name", source_bytes)
        match = _extract_python_body_docstring(candidate, source_bytes)
        # Skip anonymous definitions and definitions without a docstring.
        if match is None or not identifier:
            continue

        if is_class:
            results.append(emit("class", identifier, match))
            continue

        # Trivial `__init__` docstrings carry no intent signal; drop them.
        if (
            identifier == "__init__"
            and len(match.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN
        ):
            continue

        results.append(emit("function", identifier, match))

    return results


def _extract_python_leading_docstring(
    container_node: Any,
    source_bytes: bytes,
) -> _DocstringMatch | None:
    """Return the cleaned leading docstring of a module/body node, if any."""
    string_node = _first_expression_string(container_node)
    if string_node is None:
        return None
    cleaned = _clean_python_string(node_text(string_node, source_bytes))
    if not cleaned:
        return None
    return _DocstringMatch(text=cleaned, line_number=line_number(string_node))


def _extract_python_body_docstring(
    definition_node: Any,
    source_bytes: bytes,
) -> _DocstringMatch | None:
    """Return the docstring of a class/function definition's body, if any."""
    body_node = definition_node.child_by_field_name("body")
    return (
        None
        if body_node is None
        else _extract_python_leading_docstring(body_node, source_bytes)
    )


def _first_expression_string(container_node: Any) -> Any | None:
    """Return the string node of a leading expression statement, if present."""
    children = list(getattr(container_node, "named_children", ()))
    if not children:
        return None
    statement = children[0]
    if statement.type != "expression_statement":
        # A docstring must be the very first statement; anything else means none.
        return None
    return next(
        (
            grandchild
            for grandchild in getattr(statement, "named_children", ())
            if grandchild.type in {"string", "concatenated_string"}
        ),
        None,
    )


def _clean_python_string(value: str) -> str | None:
    """Normalize a raw string-literal token to its docstring text, or None."""
    candidate = value.strip()
    if not candidate:
        return None

    try:
        literal = ast.literal_eval(candidate)
    except (SyntaxError, ValueError):
        # Not a plain literal (e.g. an f-string); fall back to quote stripping.
        literal = _strip_wrapping_quotes(candidate)
    if not isinstance(literal, str):
        # literal_eval can yield non-str values (e.g. a bytes literal); reject.
        return None

    result = literal.strip()
    return result if result else None


def _strip_wrapping_quotes(value: str) -> str:
    """Best-effort removal of string-literal syntax from an unparseable literal.

    Fallback for literals that `ast.literal_eval` rejects (notably f-strings).
    Strips an optional literal prefix (``f``, ``r``, ``b``, ``u`` and their
    combinations, any case) followed by one matching pair of wrapping quotes;
    returns the input unchanged when no complete quote pair is found.
    """
    # Skip a run of prefix letters, but only treat it as a string prefix when
    # a quote follows (so a plain word like "rest" is left alone).
    body = value
    idx = 0
    while idx < len(value) and value[idx] in "rRbBuUfF":
        idx += 1
    if 0 < idx < len(value) and value[idx] in "\"'":
        body = value[idx:]

    for quote in ('"""', "'''", '"', "'"):
        if body.startswith(quote) and body.endswith(quote) and len(body) >= 2:
            return body[len(quote) : len(body) - len(quote)].strip()
    return value
9 changes: 9 additions & 0 deletions src/specleft/discovery/miners/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Shared miners used by multiple discovery workflows."""

from specleft.discovery.miners.shared.docstrings import DocstringMiner
from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner

__all__ = ["DocstringMiner", "ReadmeOverviewMiner"]
134 changes: 134 additions & 0 deletions src/specleft/discovery/miners/shared/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Shared helpers for discovery miners."""

from __future__ import annotations

import fnmatch
import time
from pathlib import Path
from typing import Any

from specleft.discovery.context import MinerContext
from specleft.discovery.models import (
DiscoveredItem,
DocstringMeta,
ItemKind,
SupportedLanguage,
)

# Fixed confidence assigned to every docstring/JSDoc discovery item
# (matches the acceptance criterion `confidence=0.8` in the feature spec).
CONFIDENCE = 0.8
# Filename globs treated as test files and excluded from docstring mining.
# Covers pytest-style Python names plus `.test`/`.spec` suffixes across the
# supported TS/JS extensions.
TEST_FILE_PATTERNS = (
    "test_*.py",
    "*_test.py",
    "*_tests.py",
    "test_*.ts",
    "*.test.ts",
    "*.spec.ts",
    "test_*.tsx",
    "*.test.tsx",
    "*.spec.tsx",
    "test_*.js",
    "*.test.js",
    "*.spec.js",
    "test_*.jsx",
    "*.test.jsx",
    "*.spec.jsx",
    "test_*.mjs",
    "*.test.mjs",
    "*.spec.mjs",
)


def candidate_source_files(ctx: MinerContext) -> list[Path]:
    """Return configured source files in deterministic (posix-path) order."""
    configured = ctx.config.source_dirs
    if not configured:
        # No scope configured: mine nothing rather than the whole tree.
        return []
    discovered = ctx.file_index.files_under(*configured)
    return sorted(discovered, key=Path.as_posix)


def is_test_file(path: Path) -> bool:
    """Return whether a path should be excluded as a test file."""
    # Anything inside a tests/ or __tests__/ directory is excluded outright.
    if {"tests", "__tests__"} & set(path.parts):
        return True
    name = path.name
    for pattern in TEST_FILE_PATTERNS:
        if fnmatch.fnmatch(name, pattern):
            return True
    return False


def make_docstring_item(
    *,
    file_path: Path,
    line_number: int,
    language: SupportedLanguage,
    target_kind: str,
    target_name: str | None,
    text: str,
) -> DiscoveredItem:
    """Build a typed discovery item for docstring/JSDoc output."""
    # Unnamed targets fall back to a module-level identity for this file.
    if target_name:
        display_name = f"{target_kind}:{target_name}"
    else:
        display_name = f"module:{file_path.stem}"

    meta = DocstringMeta(
        target_kind=target_kind,
        target_name=target_name,
        text=text,
    )
    return DiscoveredItem(
        kind=ItemKind.DOCSTRING,
        name=display_name,
        file_path=file_path,
        line_number=line_number,
        language=language,
        raw_text=text,
        metadata=meta.model_dump(),
        confidence=CONFIDENCE,
    )


def elapsed_ms(started: float) -> int:
    """Return elapsed milliseconds from a `time.perf_counter()` start.

    Clamped at zero so clock jitter can never yield a negative duration.
    """
    delta_seconds = time.perf_counter() - started
    milliseconds = int(delta_seconds * 1000)
    return milliseconds if milliseconds > 0 else 0


def walk_tree(node: Any) -> list[Any]:
    """Return all descendant nodes of *node* in depth-first preorder.

    Implemented iteratively with an explicit stack: the previous recursive
    version could exhaust Python's recursion limit (default 1000) on deeply
    nested source trees, raising RecursionError during mining.
    """
    nodes: list[Any] = []
    # Push children in reverse so the leftmost child is popped (visited) first,
    # preserving the original recursive preorder.
    stack: list[Any] = list(reversed(list(getattr(node, "children", ()))))
    while stack:
        current = stack.pop()
        nodes.append(current)
        stack.extend(reversed(list(getattr(current, "children", ()))))
    return nodes


def line_number(node: Any) -> int:
    """Return the 1-based line number for a tree-sitter node."""
    # start_point is a (row, column) pair with a 0-based row.
    zero_based_row = node.start_point[0]
    return int(zero_based_row) + 1


def field_text(node: Any, field: str, source_bytes: bytes) -> str | None:
    """Return stripped source text for a named field on a node, or None."""
    child = node.child_by_field_name(field)
    if child is None:
        return None
    stripped = node_text(child, source_bytes).strip()
    return stripped if stripped else None


def node_text(node: Any, source_bytes: bytes) -> str:
    """Return best-effort source text for a tree-sitter node.

    Prefers the node's own `text` attribute; falls back to slicing
    `source_bytes` by byte offsets, and finally to an empty string.
    """
    raw = getattr(node, "text", None)
    if isinstance(raw, str):
        return raw
    if isinstance(raw, bytes):
        return raw.decode("utf-8", errors="ignore")

    start = getattr(node, "start_byte", None)
    end = getattr(node, "end_byte", None)
    if not (isinstance(start, int) and isinstance(end, int)):
        return ""
    return source_bytes[start:end].decode("utf-8", errors="ignore")
Loading