Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions features/feature-spec-discovery.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,22 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser
**When** I call `build_default_pipeline(root).run()`
**Then** a `DiscoveryReport` is returned with run duration, detected languages, miner results, and total item counts.

### Story 7: Shared docstring and JSDoc mining
**Scenario:** As a discovery pipeline, I need intent-rich text signals from source code comments.
**Given** configured source directories and a shared miner context
**When** `DocstringMiner` runs
**Then** it extracts Python module/class/function docstrings and TypeScript/JavaScript JSDoc comments into `DiscoveredItem(kind=DOCSTRING)` entries with typed `DocstringMeta`.

**Scenario:** As a pipeline maintainer, I need predictable mining scope and exclusions.
**Given** `source_dirs` in `DiscoveryConfig`
**When** `DocstringMiner` scans files
**Then** it reads only `ctx.file_index.files_under(*ctx.config.source_dirs)` and excludes test files — any path under a `tests/` or `__tests__/` directory, plus filenames matching patterns such as `test_*.py`, `*_test.py`, `*.test.ts`, and `*.spec.ts`.

**Scenario:** As a spec generation pipeline, I need clean signal quality.
**Given** Python `__init__` docstrings
**When** the content is trivial (10 characters or fewer after stripping whitespace)
**Then** it is skipped and not emitted as a discovery item.

## Acceptance Criteria
- Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise.
- `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input.
Expand All @@ -84,3 +100,7 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser
- Integration on the SpecLeft repository produces `report.total_items > 0`.
- Tests cover config parsing, framework detection, pipeline registration/filtering/error isolation, and default pipeline integration.
- Feature spec is updated to document the discovery layer behavior introduced in issues #125 and #126.
- `DocstringMiner` emits module/class/function Python docstrings with `DocstringMeta` and `confidence=0.8`.
- TypeScript/JavaScript JSDoc comments immediately preceding declarations are emitted with the correct `SupportedLanguage`.
- Test files are excluded from docstring mining and configured `source_dirs` scope is respected.
- Trivial `__init__` docstrings (<=10 chars) are skipped.
9 changes: 9 additions & 0 deletions src/specleft/discovery/miners/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Discovery miner implementations."""

from specleft.discovery.miners.defaults import default_miners
from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner

__all__ = ["DocstringMiner", "ReadmeOverviewMiner", "default_miners"]
18 changes: 18 additions & 0 deletions src/specleft/discovery/miners/defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Default miner registry for discovery pipeline wiring."""

from __future__ import annotations

from typing import TYPE_CHECKING

from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner

if TYPE_CHECKING:
from specleft.discovery.pipeline import BaseMiner


def default_miners() -> list[BaseMiner]:
    """Return the default miners, in deterministic execution order."""
    ordered: list[BaseMiner] = [
        ReadmeOverviewMiner(),
        DocstringMiner(),
    ]
    return ordered
8 changes: 8 additions & 0 deletions src/specleft/discovery/miners/python/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Python-specific discovery miners."""

from specleft.discovery.miners.python.docstrings import extract_python_items

__all__ = ["extract_python_items"]
153 changes: 153 additions & 0 deletions src/specleft/discovery/miners/python/docstrings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Python docstring extraction for discovery miners."""

from __future__ import annotations

import ast
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from specleft.discovery.miners.shared.common import (
field_text,
line_number,
make_docstring_item,
node_text,
walk_tree,
)
from specleft.discovery.models import DiscoveredItem, SupportedLanguage

# `__init__` docstrings whose stripped length is at or below this threshold
# are treated as boilerplate and skipped (see the spec's "clean signal" story).
_MEANINGFUL_INIT_DOCSTRING_LEN = 10


@dataclass(frozen=True)
class _DocstringMatch:
    """A single extracted docstring plus its location in the source file."""

    # Cleaned docstring text (wrapping quotes and surrounding whitespace removed).
    text: str
    # 1-based line number of the docstring expression.
    line_number: int


def extract_python_items(
    root_node: Any,
    source_bytes: bytes,
    file_path: Path,
) -> list[DiscoveredItem]:
    """Extract module/class/function docstrings from a Python source tree."""

    def emit(target_kind: str, target_name: str | None, match: _DocstringMatch) -> DiscoveredItem:
        # Wrap one docstring match in a typed discovery item for this file.
        return make_docstring_item(
            file_path=file_path,
            line_number=match.line_number,
            language=SupportedLanguage.PYTHON,
            target_kind=target_kind,
            target_name=target_name,
            text=match.text,
        )

    results: list[DiscoveredItem] = []

    module_match = _extract_python_leading_docstring(root_node, source_bytes)
    if module_match is not None:
        results.append(emit("module", file_path.stem, module_match))

    function_types = {"function_definition", "async_function_definition"}
    for candidate in walk_tree(root_node):
        is_class = candidate.type == "class_definition"
        if not is_class and candidate.type not in function_types:
            continue

        identifier = field_text(candidate, "name", source_bytes)
        match = _extract_python_body_docstring(candidate, source_bytes)
        # Skip anonymous definitions and definitions without a docstring.
        if match is None or not identifier:
            continue

        if is_class:
            results.append(emit("class", identifier, match))
            continue

        # Trivial `__init__` docstrings carry no intent signal; drop them.
        if (
            identifier == "__init__"
            and len(match.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN
        ):
            continue

        results.append(emit("function", identifier, match))

    return results


def _extract_python_leading_docstring(
    container_node: Any,
    source_bytes: bytes,
) -> _DocstringMatch | None:
    """Return the cleaned leading docstring of a module/body node, if any."""
    string_node = _first_expression_string(container_node)
    if string_node is None:
        return None
    cleaned = _clean_python_string(node_text(string_node, source_bytes))
    if not cleaned:
        return None
    return _DocstringMatch(text=cleaned, line_number=line_number(string_node))


def _extract_python_body_docstring(
    definition_node: Any,
    source_bytes: bytes,
) -> _DocstringMatch | None:
    """Return the docstring of a class/function definition's body, if any."""
    body_node = definition_node.child_by_field_name("body")
    return (
        None
        if body_node is None
        else _extract_python_leading_docstring(body_node, source_bytes)
    )


def _first_expression_string(container_node: Any) -> Any | None:
    """Return the string node of a leading expression statement, if present."""
    children = list(getattr(container_node, "named_children", ()))
    if not children:
        return None
    statement = children[0]
    if statement.type != "expression_statement":
        # A docstring must be the very first statement; anything else means none.
        return None
    return next(
        (
            grandchild
            for grandchild in getattr(statement, "named_children", ())
            if grandchild.type in {"string", "concatenated_string"}
        ),
        None,
    )


def _clean_python_string(value: str) -> str | None:
    """Normalize a raw string-literal token to its docstring text, or None."""
    candidate = value.strip()
    if not candidate:
        return None

    try:
        literal = ast.literal_eval(candidate)
    except (SyntaxError, ValueError):
        # Not a plain literal (e.g. an f-string); fall back to quote stripping.
        literal = _strip_wrapping_quotes(candidate)
    if not isinstance(literal, str):
        # literal_eval can yield non-str values (e.g. a bytes literal); reject.
        return None

    result = literal.strip()
    return result if result else None


def _strip_wrapping_quotes(value: str) -> str:
    """Best-effort removal of string-literal syntax from an unparseable literal.

    Fallback for literals that `ast.literal_eval` rejects (notably f-strings).
    Strips an optional literal prefix (``f``, ``r``, ``b``, ``u`` and their
    combinations, any case) followed by one matching pair of wrapping quotes;
    returns the input unchanged when no complete quote pair is found.
    """
    # Skip a run of prefix letters, but only treat it as a string prefix when
    # a quote follows (so a plain word like "rest" is left alone).
    body = value
    idx = 0
    while idx < len(value) and value[idx] in "rRbBuUfF":
        idx += 1
    if 0 < idx < len(value) and value[idx] in "\"'":
        body = value[idx:]

    for quote in ('"""', "'''", '"', "'"):
        if body.startswith(quote) and body.endswith(quote) and len(body) >= 2:
            return body[len(quote) : len(body) - len(quote)].strip()
    return value
9 changes: 9 additions & 0 deletions src/specleft/discovery/miners/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Shared miners used by multiple discovery workflows."""

from specleft.discovery.miners.shared.docstrings import DocstringMiner
from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner

__all__ = ["DocstringMiner", "ReadmeOverviewMiner"]
134 changes: 134 additions & 0 deletions src/specleft/discovery/miners/shared/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 SpecLeft Contributors

"""Shared helpers for discovery miners."""

from __future__ import annotations

import fnmatch
import time
from pathlib import Path
from typing import Any

from specleft.discovery.context import MinerContext
from specleft.discovery.models import (
DiscoveredItem,
DocstringMeta,
ItemKind,
SupportedLanguage,
)

# Fixed confidence assigned to every docstring/JSDoc discovery item
# (matches the acceptance criterion `confidence=0.8` in the feature spec).
CONFIDENCE = 0.8
# Filename globs treated as test files and excluded from docstring mining.
# Covers pytest-style Python names plus `.test`/`.spec` suffixes across the
# supported TS/JS extensions.
TEST_FILE_PATTERNS = (
    "test_*.py",
    "*_test.py",
    "*_tests.py",
    "test_*.ts",
    "*.test.ts",
    "*.spec.ts",
    "test_*.tsx",
    "*.test.tsx",
    "*.spec.tsx",
    "test_*.js",
    "*.test.js",
    "*.spec.js",
    "test_*.jsx",
    "*.test.jsx",
    "*.spec.jsx",
    "test_*.mjs",
    "*.test.mjs",
    "*.spec.mjs",
)


def candidate_source_files(ctx: MinerContext) -> list[Path]:
    """Return configured source files in deterministic (posix-path) order."""
    configured = ctx.config.source_dirs
    if not configured:
        # No scope configured: mine nothing rather than the whole tree.
        return []
    discovered = ctx.file_index.files_under(*configured)
    return sorted(discovered, key=Path.as_posix)


def is_test_file(path: Path) -> bool:
    """Return whether a path should be excluded as a test file."""
    # Anything inside a tests/ or __tests__/ directory is excluded outright.
    if {"tests", "__tests__"} & set(path.parts):
        return True
    name = path.name
    for pattern in TEST_FILE_PATTERNS:
        if fnmatch.fnmatch(name, pattern):
            return True
    return False


def make_docstring_item(
    *,
    file_path: Path,
    line_number: int,
    language: SupportedLanguage,
    target_kind: str,
    target_name: str | None,
    text: str,
) -> DiscoveredItem:
    """Build a typed discovery item for docstring/JSDoc output."""
    # Unnamed targets fall back to a module-level identity for this file.
    if target_name:
        display_name = f"{target_kind}:{target_name}"
    else:
        display_name = f"module:{file_path.stem}"

    meta = DocstringMeta(
        target_kind=target_kind,
        target_name=target_name,
        text=text,
    )
    return DiscoveredItem(
        kind=ItemKind.DOCSTRING,
        name=display_name,
        file_path=file_path,
        line_number=line_number,
        language=language,
        raw_text=text,
        metadata=meta.model_dump(),
        confidence=CONFIDENCE,
    )


def elapsed_ms(started: float) -> int:
    """Return elapsed milliseconds from a `time.perf_counter()` start.

    Clamped at zero so clock jitter can never yield a negative duration.
    """
    delta_seconds = time.perf_counter() - started
    milliseconds = int(delta_seconds * 1000)
    return milliseconds if milliseconds > 0 else 0


def walk_tree(node: Any) -> list[Any]:
    """Return all descendant nodes of *node* in depth-first preorder.

    Implemented iteratively with an explicit stack: the previous recursive
    version could exhaust Python's recursion limit (default 1000) on deeply
    nested source trees, raising RecursionError during mining.
    """
    nodes: list[Any] = []
    # Push children in reverse so the leftmost child is popped (visited) first,
    # preserving the original recursive preorder.
    stack: list[Any] = list(reversed(list(getattr(node, "children", ()))))
    while stack:
        current = stack.pop()
        nodes.append(current)
        stack.extend(reversed(list(getattr(current, "children", ()))))
    return nodes


def line_number(node: Any) -> int:
    """Return the 1-based line number for a tree-sitter node."""
    # start_point is a (row, column) pair with a 0-based row.
    zero_based_row = node.start_point[0]
    return int(zero_based_row) + 1


def field_text(node: Any, field: str, source_bytes: bytes) -> str | None:
    """Return stripped source text for a named field on a node, or None."""
    child = node.child_by_field_name(field)
    if child is None:
        return None
    stripped = node_text(child, source_bytes).strip()
    return stripped if stripped else None


def node_text(node: Any, source_bytes: bytes) -> str:
    """Return best-effort source text for a tree-sitter node.

    Prefers the node's own `text` attribute; falls back to slicing
    `source_bytes` by byte offsets, and finally to an empty string.
    """
    raw = getattr(node, "text", None)
    if isinstance(raw, str):
        return raw
    if isinstance(raw, bytes):
        return raw.decode("utf-8", errors="ignore")

    start = getattr(node, "start_byte", None)
    end = getattr(node, "end_byte", None)
    if not (isinstance(start, int) and isinstance(end, int)):
        return ""
    return source_bytes[start:end].decode("utf-8", errors="ignore")
Loading