From a43c3b7fad50133a172f9add71f9fc48cc39a5c0 Mon Sep 17 00:00:00 2001
From: Richard-Otterli <richard.kakengi@gmail.com>
Date: Fri, 6 Mar 2026 20:20:48 +0000
Subject: [PATCH 1/5] Add discovery docstring miner and wire pipeline (#131)

---
 src/specleft/discovery/miners/__init__.py     |   8 +
 .../discovery/miners/shared/__init__.py       |   8 +
 .../discovery/miners/shared/docstrings.py     | 446 ++++++++++++++++++
 src/specleft/discovery/pipeline.py            |   3 +-
 4 files changed, 464 insertions(+), 1 deletion(-)
 create mode 100644 src/specleft/discovery/miners/__init__.py
 create mode 100644 src/specleft/discovery/miners/shared/__init__.py
 create mode 100644 src/specleft/discovery/miners/shared/docstrings.py

diff --git a/src/specleft/discovery/miners/__init__.py b/src/specleft/discovery/miners/__init__.py
new file mode 100644
index 0000000..7c6d4ea
--- /dev/null
+++ b/src/specleft/discovery/miners/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Discovery miner implementations."""
+
+from specleft.discovery.miners.shared import DocstringMiner
+
+__all__ = ["DocstringMiner"]
diff --git a/src/specleft/discovery/miners/shared/__init__.py b/src/specleft/discovery/miners/shared/__init__.py
new file mode 100644
index 0000000..85ce2aa
--- /dev/null
+++ b/src/specleft/discovery/miners/shared/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Shared miners used by multiple discovery workflows."""
+
+from specleft.discovery.miners.shared.docstrings import DocstringMiner
+
+__all__ = ["DocstringMiner"]
diff --git a/src/specleft/discovery/miners/shared/docstrings.py b/src/specleft/discovery/miners/shared/docstrings.py
new file mode 100644
index 0000000..fa59953
--- /dev/null
+++ b/src/specleft/discovery/miners/shared/docstrings.py
@@ -0,0 +1,446 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Docstring/JSDoc miner for discovery pipeline."""
+
+from __future__ import annotations
+
+import ast
+import fnmatch
+import time
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from specleft.discovery.context import MinerContext
+from specleft.discovery.models import (
+    DiscoveredItem,
+    DocstringMeta,
+    ItemKind,
+    MinerResult,
+    SupportedLanguage,
+)
+
+_CONFIDENCE = 0.8
+_MEANINGFUL_INIT_DOCSTRING_LEN = 10
+_JSDOC_TARGET_TYPES = frozenset(
+    {
+        "function_declaration",
+        "class_declaration",
+        "method_definition",
+        "export_statement",
+    }
+)
+_TEST_FILE_PATTERNS = (
+    "test_*.py",
+    "*_test.py",
+    "*_tests.py",
+    "test_*.ts",
+    "*.test.ts",
+    "*.spec.ts",
+    "test_*.tsx",
+    "*.test.tsx",
+    "*.spec.tsx",
+    "test_*.js",
+    "*.test.js",
+    "*.spec.js",
+    "test_*.jsx",
+    "*.test.jsx",
+    "*.spec.jsx",
+    "test_*.mjs",
+    "*.test.mjs",
+    "*.spec.mjs",
+)
+
+
+@dataclass(frozen=True)
+class _DocstringMatch:
+    text: str
+    line_number: int
+
+
+class DocstringMiner:
+    """Extract Python docstrings and TypeScript/JavaScript JSDoc comments."""
+
+    miner_id = uuid.UUID("dcc2e631-67e7-4af7-b8ba-ca3397ccae0b")
+    name = "docstrings"
+    languages = frozenset(
+        {
+            SupportedLanguage.PYTHON,
+            SupportedLanguage.TYPESCRIPT,
+            SupportedLanguage.JAVASCRIPT,
+        }
+    )
+
+    def mine(self, ctx: MinerContext) -> MinerResult:
+        started = time.perf_counter()
+        items: list[DiscoveredItem] = []
+
+        for rel_path in _candidate_source_files(ctx):
+            if _is_test_file(rel_path):
+                continue
+
+            abs_path = ctx.root / rel_path
+            parsed = ctx.registry.parse(abs_path)
+            if parsed is None:
+                continue
+
+            try:
+                source_bytes = abs_path.read_bytes()
+            except OSError:
+                continue
+
+            root_node, language = parsed
+            if language == SupportedLanguage.PYTHON:
+                items.extend(_extract_python_items(root_node, source_bytes, rel_path))
+            elif language in (
+                SupportedLanguage.TYPESCRIPT,
+                SupportedLanguage.JAVASCRIPT,
+            ):
+                items.extend(
+                    _extract_jsdoc_items(root_node, source_bytes, rel_path, language)
+                )
+
+        return MinerResult(
+            miner_id=self.miner_id,
+            miner_name=self.name,
+            items=items,
+            duration_ms=_elapsed_ms(started),
+        )
+
+
+def _candidate_source_files(ctx: MinerContext) -> list[Path]:
+    source_dirs = ctx.config.source_dirs
+    if not source_dirs:
+        return []
+
+    return sorted(
+        ctx.file_index.files_under(*source_dirs),
+        key=lambda value: value.as_posix(),
+    )
+
+
+def _extract_python_items(
+    root_node: Any,
+    source_bytes: bytes,
+    file_path: Path,
+) -> list[DiscoveredItem]:
+    items: list[DiscoveredItem] = []
+    module_doc = _extract_python_leading_docstring(root_node, source_bytes)
+    if module_doc is not None:
+        items.append(
+            _make_item(
+                file_path=file_path,
+                line_number=module_doc.line_number,
+                language=SupportedLanguage.PYTHON,
+                target_kind="module",
+                target_name=file_path.stem,
+                text=module_doc.text,
+            )
+        )
+
+    for node in _walk_tree(root_node):
+        if node.type == "class_definition":
+            name = _field_text(node, "name", source_bytes)
+            class_doc = _extract_python_body_docstring(node, source_bytes)
+            if class_doc is None or not name:
+                continue
+            items.append(
+                _make_item(
+                    file_path=file_path,
+                    line_number=class_doc.line_number,
+                    language=SupportedLanguage.PYTHON,
+                    target_kind="class",
+                    target_name=name,
+                    text=class_doc.text,
+                )
+            )
+            continue
+
+        if node.type not in {"function_definition", "async_function_definition"}:
+            continue
+
+        name = _field_text(node, "name", source_bytes)
+        function_doc = _extract_python_body_docstring(node, source_bytes)
+        if function_doc is None or not name:
+            continue
+        if (
+            name == "__init__"
+            and len(function_doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN
+        ):
+            continue
+
+        items.append(
+            _make_item(
+                file_path=file_path,
+                line_number=function_doc.line_number,
+                language=SupportedLanguage.PYTHON,
+                target_kind="function",
+                target_name=name,
+                text=function_doc.text,
+            )
+        )
+
+    return items
+
+
+def _extract_jsdoc_items(
+    root_node: Any,
+    source_bytes: bytes,
+    file_path: Path,
+    language: SupportedLanguage,
+) -> list[DiscoveredItem]:
+    items: list[DiscoveredItem] = []
+
+    def walk(parent: Any) -> None:
+        children = list(getattr(parent, "children", ()))
+        for index, child in enumerate(children):
+            if child.type == "comment":
+                jsdoc_text = _normalise_jsdoc(_node_text(child, source_bytes))
+                if jsdoc_text:
+                    target = _resolve_jsdoc_target(children, index, source_bytes)
+                    if target is not None:
+                        kind, name = _target_kind_and_name(
+                            target, source_bytes, file_path
+                        )
+                        items.append(
+                            _make_item(
+                                file_path=file_path,
+                                line_number=_line_number(child),
+                                language=language,
+                                target_kind=kind,
+                                target_name=name,
+                                text=jsdoc_text,
+                            )
+                        )
+
+            walk(child)
+
+    walk(root_node)
+    return items
+
+
+def _resolve_jsdoc_target(
+    siblings: list[Any],
+    comment_index: int,
+    source_bytes: bytes,
+) -> Any | None:
+    comment_node = siblings[comment_index]
+    for candidate in siblings[comment_index + 1 :]:
+        if candidate.type == "comment":
+            continue
+
+        target = _unwrap_export_target(candidate)
+        if target is None:
+            break
+
+        if not _is_immediately_before(comment_node, target, source_bytes):
+            break
+        return target
+    return None
+
+
+def _unwrap_export_target(node: Any) -> Any | None:
+    if node.type in _JSDOC_TARGET_TYPES and node.type != "export_statement":
+        return node
+    if node.type != "export_statement":
+        return None
+
+    for child in getattr(node, "named_children", ()):
+        if child.type in _JSDOC_TARGET_TYPES and child.type != "export_statement":
+            return child
+    return node
+
+
+def _is_immediately_before(
+    comment_node: Any, target_node: Any, source_bytes: bytes
+) -> bool:
+    source_text = source_bytes.decode("utf-8", errors="ignore")
+    lines = source_text.splitlines()
+    comment_line = int(comment_node.end_point[0])
+    target_line = int(target_node.start_point[0])
+    if target_line < comment_line:
+        return False
+    if target_line == comment_line:
+        return True
+    for index in range(comment_line + 1, target_line):
+        if index >= len(lines):
+            break
+        if lines[index].strip():
+            return False
+    return True
+
+
+def _target_kind_and_name(
+    node: Any,
+    source_bytes: bytes,
+    file_path: Path,
+) -> tuple[str, str | None]:
+    if node.type == "class_declaration":
+        return "class", _field_text(node, "name", source_bytes)
+    if node.type == "method_definition":
+        return "method", _field_text(node, "name", source_bytes)
+    if node.type == "function_declaration":
+        return "function", _field_text(node, "name", source_bytes)
+    if node.type == "export_statement":
+        return "module", file_path.stem
+    return "module", file_path.stem
+
+
+def _extract_python_leading_docstring(
+    container_node: Any,
+    source_bytes: bytes,
+) -> _DocstringMatch | None:
+    expression = _first_expression_string(container_node)
+    if expression is None:
+        return None
+    text = _clean_python_string(_node_text(expression, source_bytes))
+    if not text:
+        return None
+    return _DocstringMatch(text=text, line_number=_line_number(expression))
+
+
+def _extract_python_body_docstring(
+    definition_node: Any,
+    source_bytes: bytes,
+) -> _DocstringMatch | None:
+    body = definition_node.child_by_field_name("body")
+    if body is None:
+        return None
+    return _extract_python_leading_docstring(body, source_bytes)
+
+
+def _first_expression_string(container_node: Any) -> Any | None:
+    named_children = list(getattr(container_node, "named_children", ()))
+    if not named_children:
+        return None
+    first = named_children[0]
+    if first.type != "expression_statement":
+        return None
+
+    for child in getattr(first, "named_children", ()):
+        if child.type in {"string", "concatenated_string"}:
+            return child
+    return None
+
+
+def _clean_python_string(value: str) -> str | None:
+    stripped = value.strip()
+    if not stripped:
+        return None
+
+    try:
+        parsed = ast.literal_eval(stripped)
+    except (SyntaxError, ValueError):
+        parsed = _strip_wrapping_quotes(stripped)
+    if not isinstance(parsed, str):
+        return None
+
+    cleaned = parsed.strip()
+    return cleaned or None
+
+
+def _strip_wrapping_quotes(value: str) -> str:
+    for quote in ('"""', "'''", '"', "'"):
+        if value.startswith(quote) and value.endswith(quote) and len(value) >= 2:
+            return value[len(quote) : len(value) - len(quote)].strip()
+    return value
+
+
+def _normalise_jsdoc(raw_comment: str) -> str | None:
+    stripped = raw_comment.strip()
+    if not stripped.startswith("/**"):
+        return None
+
+    content = stripped
+    if content.startswith("/**"):
+        content = content[3:]
+    if content.endswith("*/"):
+        content = content[:-2]
+
+    lines = []
+    for line in content.splitlines():
+        cleaned = line.lstrip()
+        if cleaned.startswith("*"):
+            cleaned = cleaned[1:]
+            if cleaned.startswith(" "):
+                cleaned = cleaned[1:]
+        lines.append(cleaned.rstrip())
+
+    text = "\n".join(lines).strip()
+    return text or None
+
+
+def _make_item(
+    *,
+    file_path: Path,
+    line_number: int,
+    language: SupportedLanguage,
+    target_kind: str,
+    target_name: str | None,
+    text: str,
+) -> DiscoveredItem:
+    item_name = (
+        f"{target_kind}:{target_name}" if target_name else f"module:{file_path.stem}"
+    )
+    metadata = DocstringMeta(
+        target_kind=target_kind,
+        target_name=target_name,
+        text=text,
+    )
+    return DiscoveredItem(
+        kind=ItemKind.DOCSTRING,
+        name=item_name,
+        file_path=file_path,
+        line_number=line_number,
+        language=language,
+        raw_text=text,
+        metadata=metadata.model_dump(),
+        confidence=_CONFIDENCE,
+    )
+
+
+def _walk_tree(node: Any) -> list[Any]:
+    nodes: list[Any] = []
+    for child in getattr(node, "children", ()):
+        nodes.append(child)
+        nodes.extend(_walk_tree(child))
+    return nodes
+
+
+def _line_number(node: Any) -> int:
+    return int(node.start_point[0]) + 1
+
+
+def _field_text(node: Any, field: str, source_bytes: bytes) -> str | None:
+    field_node = node.child_by_field_name(field)
+    if field_node is None:
+        return None
+    text = _node_text(field_node, source_bytes).strip()
+    return text or None
+
+
+def _node_text(node: Any, source_bytes: bytes) -> str:
+    raw = getattr(node, "text", None)
+    if isinstance(raw, bytes):
+        return raw.decode("utf-8", errors="ignore")
+    if isinstance(raw, str):
+        return raw
+
+    start_byte = getattr(node, "start_byte", None)
+    end_byte = getattr(node, "end_byte", None)
+    if isinstance(start_byte, int) and isinstance(end_byte, int):
+        return source_bytes[start_byte:end_byte].decode("utf-8", errors="ignore")
+    return ""
+
+
+def _is_test_file(path: Path) -> bool:
+    file_name = path.name
+    if any(part in {"tests", "__tests__"} for part in path.parts):
+        return True
+    return any(fnmatch.fnmatch(file_name, pattern) for pattern in _TEST_FILE_PATTERNS)
+
+
+def _elapsed_ms(started: float) -> int:
+    return max(0, int((time.perf_counter() - started) * 1000))
diff --git a/src/specleft/discovery/pipeline.py b/src/specleft/discovery/pipeline.py
index ed4d9fe..a123a11 100644
--- a/src/specleft/discovery/pipeline.py
+++ b/src/specleft/discovery/pipeline.py
@@ -16,6 +16,7 @@
 from specleft.discovery.framework_detector import FrameworkDetector
 from specleft.discovery.language_detect import detect_project_languages
 from specleft.discovery.language_registry import LanguageRegistry
+from specleft.discovery.miners import DocstringMiner
 from specleft.discovery.models import (
     DiscoveryReport,
     DiscoveredItem,
@@ -242,4 +243,4 @@ def mine(self, ctx: MinerContext) -> MinerResult:
 
 
 def _default_miners() -> list[BaseMiner]:
-    return [_ReadmeMiner()]
+    return [_ReadmeMiner(), DocstringMiner()]

From c42dc7b771fad9d6f754fb45fa42a6b896bafff3 Mon Sep 17 00:00:00 2001
From: Richard-Otterli <richard.kakengi@gmail.com>
Date: Fri, 6 Mar 2026 20:20:53 +0000
Subject: [PATCH 2/5] Add docstring miner test coverage (#131)

---
 tests/discovery/miners/test_docstrings.py | 265 ++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 tests/discovery/miners/test_docstrings.py

diff --git a/tests/discovery/miners/test_docstrings.py b/tests/discovery/miners/test_docstrings.py
new file mode 100644
index 0000000..89697ec
--- /dev/null
+++ b/tests/discovery/miners/test_docstrings.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Tests for discovery docstring/JSDoc miner."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from specleft.discovery.config import DiscoveryConfig
+from specleft.discovery.context import MinerContext
+from specleft.discovery.file_index import FileIndex
+from specleft.discovery.miners.shared.docstrings import DocstringMiner
+from specleft.discovery.models import DocstringMeta, SupportedLanguage
+
+
+@dataclass
+class _FakeNode:
+    type: str
+    text_value: str = ""
+    children: list[_FakeNode] = field(default_factory=list)
+    named_children: list[_FakeNode] = field(default_factory=list)
+    fields: dict[str, _FakeNode] = field(default_factory=dict)
+    start_point: tuple[int, int] = (0, 0)
+    end_point: tuple[int, int] = (0, 0)
+
+    @property
+    def text(self) -> bytes:
+        return self.text_value.encode("utf-8")
+
+    def child_by_field_name(self, name: str) -> _FakeNode | None:
+        return self.fields.get(name)
+
+
+class _RegistryStub:
+    def __init__(
+        self, mapping: dict[Path, tuple[Any, SupportedLanguage] | None]
+    ) -> None:
+        self._mapping = mapping
+        self.calls: list[Path] = []
+
+    def parse(self, file_path: Path) -> tuple[Any, SupportedLanguage] | None:
+        self.calls.append(file_path)
+        return self._mapping.get(file_path)
+
+
+def _string_expr(value: str, row: int) -> _FakeNode:
+    string_node = _FakeNode(
+        type="string",
+        text_value=value,
+        start_point=(row, 0),
+        end_point=(row, len(value)),
+    )
+    return _FakeNode(
+        type="expression_statement",
+        children=[string_node],
+        named_children=[string_node],
+        start_point=(row, 0),
+        end_point=(row, len(value)),
+    )
+
+
+def _identifier(value: str, row: int) -> _FakeNode:
+    return _FakeNode(
+        type="identifier",
+        text_value=value,
+        start_point=(row, 0),
+        end_point=(row, len(value)),
+    )
+
+
+def _python_function(name: str, docstring: str, start_row: int) -> _FakeNode:
+    name_node = _identifier(name, start_row)
+    body_doc = _string_expr(docstring, start_row + 1)
+    body = _FakeNode(
+        type="block",
+        children=[body_doc],
+        named_children=[body_doc],
+        start_point=(start_row + 1, 0),
+        end_point=(start_row + 1, 0),
+    )
+    return _FakeNode(
+        type="function_definition",
+        children=[name_node, body],
+        named_children=[name_node, body],
+        fields={"name": name_node, "body": body},
+        start_point=(start_row, 0),
+        end_point=(start_row + 2, 0),
+    )
+
+
+def _python_class(name: str, docstring: str, start_row: int) -> _FakeNode:
+    name_node = _identifier(name, start_row)
+    body_doc = _string_expr(docstring, start_row + 1)
+    body = _FakeNode(
+        type="block",
+        children=[body_doc],
+        named_children=[body_doc],
+        start_point=(start_row + 1, 0),
+        end_point=(start_row + 1, 0),
+    )
+    return _FakeNode(
+        type="class_definition",
+        children=[name_node, body],
+        named_children=[name_node, body],
+        fields={"name": name_node, "body": body},
+        start_point=(start_row, 0),
+        end_point=(start_row + 2, 0),
+    )
+
+
+def _python_module_tree() -> _FakeNode:
+    module_doc = _string_expr('"""Auth module docs."""', 0)
+    class_node = _python_class("AuthService", '"""Class docs."""', 2)
+    init_fn = _python_function("__init__", '"""Init."""', 5)
+    create_user_fn = _python_function(
+        "create_user",
+        '"""Create a new user with the given credentials."""',
+        8,
+    )
+    class_body = class_node.child_by_field_name("body")
+    assert class_body is not None
+    class_body.children.extend([init_fn, create_user_fn])
+    class_body.named_children.extend([init_fn, create_user_fn])
+
+    top_function = _python_function("validate_credentials", '"""Validate login."""', 12)
+    return _FakeNode(
+        type="module",
+        children=[module_doc, class_node, top_function],
+        named_children=[module_doc, class_node, top_function],
+        start_point=(0, 0),
+        end_point=(15, 0),
+    )
+
+
+def _jsdoc_comment(text: str, row: int) -> _FakeNode:
+    return _FakeNode(
+        type="comment",
+        text_value=text,
+        start_point=(row, 0),
+        end_point=(row, len(text)),
+    )
+
+
+def _typescript_function(name: str, row: int) -> _FakeNode:
+    name_node = _identifier(name, row)
+    return _FakeNode(
+        type="function_declaration",
+        children=[name_node],
+        named_children=[name_node],
+        fields={"name": name_node},
+        start_point=(row, 0),
+        end_point=(row + 1, 0),
+    )
+
+
+def _javascript_tree(function_name: str) -> _FakeNode:
+    comment = _jsdoc_comment("/** Creates a new user */", 0)
+    function = _typescript_function(function_name, 1)
+    return _FakeNode(
+        type="program",
+        children=[comment, function],
+        named_children=[comment, function],
+        start_point=(0, 0),
+        end_point=(2, 0),
+    )
+
+
+def _context(
+    root: Path,
+    registry: _RegistryStub,
+    source_dirs: tuple[str, ...] = ("src",),
+) -> MinerContext:
+    return MinerContext(
+        root=root,
+        registry=registry,  # type: ignore[arg-type]
+        file_index=FileIndex(root),
+        frameworks={},
+        config=DiscoveryConfig(source_dirs=source_dirs),
+    )
+
+
+def test_python_docstrings_include_module_class_and_functions(tmp_path: Path) -> None:
+    source_file = tmp_path / "src" / "auth.py"
+    source_file.parent.mkdir(parents=True)
+    source_file.write_text(
+        '"""Auth module docs."""\nclass AuthService:\n    """Class docs."""\n',
+        encoding="utf-8",
+    )
+    excluded_test_file = tmp_path / "src" / "test_auth.py"
+    excluded_test_file.write_text('"""should be skipped"""', encoding="utf-8")
+
+    registry = _RegistryStub(
+        {
+            source_file: (_python_module_tree(), SupportedLanguage.PYTHON),
+            excluded_test_file: (_python_module_tree(), SupportedLanguage.PYTHON),
+        }
+    )
+    miner = DocstringMiner()
+
+    result = miner.mine(_context(tmp_path, registry))
+
+    assert result.error is None
+    assert result.error_kind is None
+    assert all(item.language == SupportedLanguage.PYTHON for item in result.items)
+    assert "module:auth" in {item.name for item in result.items}
+    assert "class:AuthService" in {item.name for item in result.items}
+    assert "function:create_user" in {item.name for item in result.items}
+    assert "function:validate_credentials" in {item.name for item in result.items}
+    assert "function:__init__" not in {item.name for item in result.items}
+    assert all(isinstance(item.typed_meta(), DocstringMeta) for item in result.items)
+
+    module_item = next(item for item in result.items if item.name == "module:auth")
+    assert module_item.metadata["target_kind"] == "module"
+    assert module_item.language == SupportedLanguage.PYTHON
+    assert registry.calls == [source_file]
+
+
+def test_jsdoc_uses_language_per_extension(tmp_path: Path) -> None:
+    ts_file = tmp_path / "src" / "auth.ts"
+    js_file = tmp_path / "src" / "auth.js"
+    ts_file.parent.mkdir(parents=True)
+    ts_file.write_text("/** Creates a new user */\nexport function createUser() {}\n")
+    js_file.write_text("/** Creates a helper */\nfunction createHelper() {}\n")
+
+    registry = _RegistryStub(
+        {
+            ts_file: (_javascript_tree("createUser"), SupportedLanguage.TYPESCRIPT),
+            js_file: (_javascript_tree("createHelper"), SupportedLanguage.JAVASCRIPT),
+        }
+    )
+
+    result = DocstringMiner().mine(_context(tmp_path, registry))
+
+    by_name = {item.name: item for item in result.items}
+    assert by_name["function:createUser"].language == SupportedLanguage.TYPESCRIPT
+    assert by_name["function:createHelper"].language == SupportedLanguage.JAVASCRIPT
+    assert by_name["function:createUser"].metadata["text"] == "Creates a new user"
+
+
+def test_source_dirs_scope_and_test_patterns_are_respected(tmp_path: Path) -> None:
+    inside_source = tmp_path / "app" / "module.py"
+    outside_source = tmp_path / "src" / "outside.py"
+    ts_test = tmp_path / "app" / "auth.test.ts"
+    inside_source.parent.mkdir(parents=True)
+    outside_source.parent.mkdir(parents=True, exist_ok=True)
+
+    inside_source.write_text('"""inside"""')
+    outside_source.write_text('"""outside"""')
+    ts_test.write_text("/** should skip */\nfunction x() {}\n")
+
+    registry = _RegistryStub(
+        {
+            inside_source: (_python_module_tree(), SupportedLanguage.PYTHON),
+            outside_source: (_python_module_tree(), SupportedLanguage.PYTHON),
+            ts_test: (_javascript_tree("x"), SupportedLanguage.TYPESCRIPT),
+        }
+    )
+
+    result = DocstringMiner().mine(_context(tmp_path, registry, source_dirs=("app",)))
+
+    assert all(item.file_path == Path("app/module.py") for item in result.items)
+    assert registry.calls == [inside_source]

From 9aec251ae975283ab278a2c4f950724d878c0723 Mon Sep 17 00:00:00 2001
From: Richard-Otterli <richard.kakengi@gmail.com>
Date: Fri, 6 Mar 2026 20:20:58 +0000
Subject: [PATCH 3/5] Document docstring miner discovery behavior (#131)

---
 features/feature-spec-discovery.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/features/feature-spec-discovery.md b/features/feature-spec-discovery.md
index 6e64157..5f858a6 100644
--- a/features/feature-spec-discovery.md
+++ b/features/feature-spec-discovery.md
@@ -62,6 +62,22 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser
 **When** I call `build_default_pipeline(root).run()`
 **Then** a `DiscoveryReport` is returned with run duration, detected languages, miner results, and total item counts.
 
+### Story 7: Shared docstring and JSDoc mining
+**Scenario:** As a discovery pipeline, I need intent-rich text signals from source code comments.
+**Given** configured source directories and a shared miner context
+**When** `DocstringMiner` runs
+**Then** it extracts Python module/class/function docstrings and TypeScript/JavaScript JSDoc comments into `DiscoveredItem(kind=DOCSTRING)` entries with typed `DocstringMeta`.
+
+**Scenario:** As a pipeline maintainer, I need predictable mining scope and exclusions.
+**Given** `source_dirs` in `DiscoveryConfig`
+**When** `DocstringMiner` scans files
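+**Then** it reads only `ctx.file_index.files_under(*ctx.config.source_dirs)` and excludes test files: any path containing a `tests` or `__tests__` directory, and any file whose name matches a test pattern (`test_*.py`, `*_test.py`, `*.test.ts`, `*.spec.ts`, etc.).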
+
+**Scenario:** As a spec generation pipeline, I need clean signal quality.
+**Given** Python `__init__` docstrings
+**When** the docstring text is trivial (10 characters or fewer once the quotes and surrounding whitespace are stripped)
+**Then** it is skipped and not emitted as a discovery item.
+
 ## Acceptance Criteria
 - Language abstraction returns `SupportedLanguage` members for `.py`, `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs` and `None` otherwise.
 - `LanguageRegistry().parse(path_to_py_file)` returns `(node, SupportedLanguage.PYTHON)` for valid Python input.
@@ -84,3 +100,7 @@ Add shared discovery infrastructure for Issues #125 and #126: centralized parser
 - Integration on the SpecLeft repository produces `report.total_items > 0`.
 - Tests cover config parsing, framework detection, pipeline registration/filtering/error isolation, and default pipeline integration.
 - Feature spec is updated to document the discovery layer behavior introduced in issues #125 and #126.
+- `DocstringMiner` emits module/class/function Python docstrings with `DocstringMeta` and `confidence=0.8`.
+- TypeScript/JavaScript JSDoc comments (`/** ... */`) immediately preceding a declaration, with nothing but blank lines in between, are emitted with the `SupportedLanguage` the parser reports for that file.
+- Test files are excluded from docstring mining and configured `source_dirs` scope is respected.
+- Trivial `__init__` docstrings (10 characters or fewer after stripping surrounding whitespace) are skipped.

From 55d12bb77707c5a2a8799266326deaa20b16033a Mon Sep 17 00:00:00 2001
From: Richard-Otterli <richard.kakengi@gmail.com>
Date: Fri, 6 Mar 2026 20:38:17 +0000
Subject: [PATCH 4/5] Refactor docstring miner by language modules (#131)

---
 .../discovery/miners/python/__init__.py       |   8 +
 .../discovery/miners/python/docstrings.py     | 153 +++++++
 .../discovery/miners/shared/common.py         | 134 ++++++
 .../discovery/miners/shared/docstrings.py     | 403 +-----------------
 .../discovery/miners/typescript/__init__.py   |   8 +
 .../discovery/miners/typescript/jsdoc.py      | 154 +++++++
 6 files changed, 470 insertions(+), 390 deletions(-)
 create mode 100644 src/specleft/discovery/miners/python/__init__.py
 create mode 100644 src/specleft/discovery/miners/python/docstrings.py
 create mode 100644 src/specleft/discovery/miners/shared/common.py
 create mode 100644 src/specleft/discovery/miners/typescript/__init__.py
 create mode 100644 src/specleft/discovery/miners/typescript/jsdoc.py

diff --git a/src/specleft/discovery/miners/python/__init__.py b/src/specleft/discovery/miners/python/__init__.py
new file mode 100644
index 0000000..b933028
--- /dev/null
+++ b/src/specleft/discovery/miners/python/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Python-specific discovery miners."""
+
+from specleft.discovery.miners.python.docstrings import extract_python_items
+
+__all__ = ["extract_python_items"]
diff --git a/src/specleft/discovery/miners/python/docstrings.py b/src/specleft/discovery/miners/python/docstrings.py
new file mode 100644
index 0000000..e60814c
--- /dev/null
+++ b/src/specleft/discovery/miners/python/docstrings.py
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Python docstring extraction for discovery miners."""
+
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from specleft.discovery.miners.shared.common import (
+    field_text,
+    line_number,
+    make_docstring_item,
+    node_text,
+    walk_tree,
+)
+from specleft.discovery.models import DiscoveredItem, SupportedLanguage
+
+_MEANINGFUL_INIT_DOCSTRING_LEN = 10
+
+
+@dataclass(frozen=True)
+class _DocstringMatch:  # immutable pairing of docstring text and its location
+    text: str  # cleaned docstring content
+    line_number: int  # 1-based line where the string node starts
+
+
+def extract_python_items(
+    root_node: Any,
+    source_bytes: bytes,
+    file_path: Path,
+) -> list[DiscoveredItem]:
+    """Collect module, class, and function docstrings from a parsed Python tree.
+
+    The module docstring, when present, is emitted first under the file stem;
+    class and function docstrings follow in ``walk_tree`` order.  Definitions
+    without a name or docstring are skipped, as are ``__init__`` docstrings of
+    at most ``_MEANINGFUL_INIT_DOCSTRING_LEN`` characters.
+
+    Args:
+        root_node: Root node of the parsed tree.
+        source_bytes: Source bytes handed on to the text helpers.
+        file_path: Path recorded on every emitted item.
+
+    Returns:
+        The discovered docstring items, module docstring first.
+    """
+    # Node types handled as functions; everything else except classes is ignored.
+    function_types = {"function_definition", "async_function_definition"}
+
+    def build(kind: str, target: str, doc: _DocstringMatch) -> DiscoveredItem:
+        """Create the item for one docstring of the given kind."""
+        return make_docstring_item(
+            file_path=file_path,
+            line_number=doc.line_number,
+            language=SupportedLanguage.PYTHON,
+            target_kind=kind,
+            target_name=target,
+            text=doc.text,
+        )
+
+    collected: list[DiscoveredItem] = []
+    # The module docstring is looked up on the root node itself.
+    leading = _extract_python_leading_docstring(root_node, source_bytes)
+    if leading is not None:
+        collected.append(build("module", file_path.stem, leading))
+
+    for candidate in walk_tree(root_node):
+        is_class = candidate.type == "class_definition"
+        if not is_class and candidate.type not in function_types:
+            continue
+
+        # Both a name and a docstring are required to emit an item.
+        identifier = field_text(candidate, "name", source_bytes)
+        doc = _extract_python_body_docstring(candidate, source_bytes)
+        if doc is None or not identifier:
+            continue
+        if is_class:
+            collected.append(build("class", identifier, doc))
+            continue
+
+        # Skip ``__init__`` docstrings too short to count as meaningful.
+        trivial_init = (
+            identifier == "__init__"
+            and len(doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN
+        )
+        if not trivial_init:
+            collected.append(build("function", identifier, doc))
+
+    return collected
+
+
+def _extract_python_leading_docstring(
+    container_node: Any,
+    source_bytes: bytes,
+) -> _DocstringMatch | None:
+    """Return the docstring that opens ``container_node``, or ``None``."""
+    string_node = _first_expression_string(container_node)
+    if string_node is not None:
+        cleaned = _clean_python_string(node_text(string_node, source_bytes))
+        if cleaned:
+            return _DocstringMatch(cleaned, line_number(string_node))
+    return None
+
+
+def _extract_python_body_docstring(
+    definition_node: Any,
+    source_bytes: bytes,
+) -> _DocstringMatch | None:
+    """Return the docstring leading a definition's ``body`` field, if any."""
+    block = definition_node.child_by_field_name("body")
+    found = block is not None
+    return _extract_python_leading_docstring(block, source_bytes) if found else None
+
+
+def _first_expression_string(container_node: Any) -> Any | None:
+    """Return the leading docstring's string node, ignoring comments before it."""
+    # tree-sitter yields comments as named children, so a licence header or
+    # shebang would otherwise hide a module docstring that follows it.
+    children = getattr(container_node, "named_children", ())
+    first = next((c for c in children if c.type != "comment"), None)
+    if first is None or first.type != "expression_statement":
+        return None
+    for child in getattr(first, "named_children", ()):
+        if child.type in {"string", "concatenated_string"}:
+            return child
+    return None
+
+
+def _clean_python_string(value: str) -> str | None:
+    """Turn the source text of a string literal into its stripped content.
+
+    Returns ``None`` for blank input, non-``str`` literals, and empty results.
+    """
+    literal = value.strip()
+    if literal:
+        try:
+            evaluated = ast.literal_eval(literal)
+        except (SyntaxError, ValueError):
+            evaluated = _strip_wrapping_quotes(literal)
+        if isinstance(evaluated, str):
+            return evaluated.strip() or None
+    return None
+
+
+def _strip_wrapping_quotes(value: str) -> str:
+    """Strip one pair of wrapping quotes (triple quotes tried first)."""
+    marks = ('"""', "'''", '"', "'") if len(value) >= 2 else ()
+    hit = next((m for m in marks if value.startswith(m) and value.endswith(m)), None)
+    return value if hit is None else value[len(hit) : -len(hit)].strip()
diff --git a/src/specleft/discovery/miners/shared/common.py b/src/specleft/discovery/miners/shared/common.py
new file mode 100644
index 0000000..dd1baec
--- /dev/null
+++ b/src/specleft/discovery/miners/shared/common.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Shared helpers for discovery miners."""
+
+from __future__ import annotations
+
+import fnmatch
+import time
+from pathlib import Path
+from typing import Any
+
+from specleft.discovery.context import MinerContext
+from specleft.discovery.models import (
+    DiscoveredItem,
+    DocstringMeta,
+    ItemKind,
+    SupportedLanguage,
+)
+
+CONFIDENCE = 0.8  # confidence recorded on every docstring/JSDoc item
+TEST_FILE_PATTERNS = (  # fnmatch globs compared against the bare file name
+    "test_*.py",
+    "*_test.py",
+    "*_tests.py",
+    "test_*.ts",
+    "*.test.ts",
+    "*.spec.ts",
+    "test_*.tsx",
+    "*.test.tsx",
+    "*.spec.tsx",
+    "test_*.js",
+    "*.test.js",
+    "*.spec.js",
+    "test_*.jsx",
+    "*.test.jsx",
+    "*.spec.jsx",
+    "test_*.mjs",
+    "*.test.mjs",
+    "*.spec.mjs",
+)
+
+
+def candidate_source_files(ctx: MinerContext) -> list[Path]:
+    """List files under the configured ``source_dirs``, sorted by POSIX path."""
+    roots = ctx.config.source_dirs
+    if roots:
+        # Sorting makes the order independent of how the index yields files.
+        found = list(ctx.file_index.files_under(*roots))
+        found.sort(key=lambda entry: entry.as_posix())
+        return found
+    # Nothing configured: no file is in scope.
+    return []
+
+
+def is_test_file(path: Path) -> bool:
+    """Tell whether ``path`` sits in a tests directory or has a test-style name."""
+    if not {"tests", "__tests__"}.isdisjoint(path.parts):
+        return True
+    base = path.name
+    return any(fnmatch.fnmatch(base, glob) for glob in TEST_FILE_PATTERNS)
+
+
+def make_docstring_item(
+    *,
+    file_path: Path,
+    line_number: int,
+    language: SupportedLanguage,
+    target_kind: str,
+    target_name: str | None,
+    text: str,
+) -> DiscoveredItem:
+    """Wrap one docstring/JSDoc finding in a ``DiscoveredItem``.
+
+    Named ``<target_kind>:<target_name>``, or ``module:<file stem>`` without a name.
+    """
+    if target_name:
+        label = f"{target_kind}:{target_name}"
+    else:
+        label = f"module:{file_path.stem}"
+    meta = DocstringMeta(target_kind=target_kind, target_name=target_name, text=text)
+    return DiscoveredItem(
+        kind=ItemKind.DOCSTRING,
+        name=label,
+        file_path=file_path,
+        line_number=line_number,
+        language=language,
+        raw_text=text,
+        metadata=meta.model_dump(),
+        confidence=CONFIDENCE,
+    )
+
+
+def elapsed_ms(started: float) -> int:
+    """Whole milliseconds since a `time.perf_counter()` reading, never negative."""
+    return max(int(1000 * (time.perf_counter() - started)), 0)
+
+
+def walk_tree(node: Any) -> list[Any]:
+    """Collect every descendant of ``node`` in pre-order, excluding ``node``."""
+    found: list[Any] = []
+    for kid in getattr(node, "children", ()):
+        # Each child is listed immediately before its own descendants.
+        found += [kid, *walk_tree(kid)]
+    return found
+
+
+def line_number(node: Any) -> int:
+    """Convert a node's start row into a 1-based line number."""
+    return 1 + int(node.start_point[0])
+
+
+def field_text(node: Any, field: str, source_bytes: bytes) -> str | None:
+    """Look up ``field`` on ``node`` and return its stripped text, or ``None``."""
+    child = node.child_by_field_name(field)
+    if child is not None:
+        # Whitespace-only text counts as missing.
+        return node_text(child, source_bytes).strip() or None
+    return None
+
+
+def node_text(node: Any, source_bytes: bytes) -> str:
+    """Best-effort text of a node: its ``text`` attribute, else its byte span."""
+    own_text = getattr(node, "text", None)
+    if isinstance(own_text, str):
+        return own_text
+    if isinstance(own_text, bytes):
+        return own_text.decode("utf-8", errors="ignore")
+
+    # No usable ``text`` attribute: slice the source by the node's byte offsets.
+    span = (getattr(node, "start_byte", None), getattr(node, "end_byte", None))
+    if all(isinstance(offset, int) for offset in span):
+        return source_bytes[span[0] : span[1]].decode("utf-8", errors="ignore")
+    return ""
diff --git a/src/specleft/discovery/miners/shared/docstrings.py b/src/specleft/discovery/miners/shared/docstrings.py
index fa59953..554def7 100644
--- a/src/specleft/discovery/miners/shared/docstrings.py
+++ b/src/specleft/discovery/miners/shared/docstrings.py
@@ -1,63 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright (c) 2026 SpecLeft Contributors
 
-"""Docstring/JSDoc miner for discovery pipeline."""
+"""Docstring/JSDoc miner orchestration."""
 
 from __future__ import annotations
 
-import ast
-import fnmatch
 import time
 import uuid
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
 
 from specleft.discovery.context import MinerContext
-from specleft.discovery.models import (
-    DiscoveredItem,
-    DocstringMeta,
-    ItemKind,
-    MinerResult,
-    SupportedLanguage,
+from specleft.discovery.miners.python.docstrings import extract_python_items
+from specleft.discovery.miners.shared.common import (
+    candidate_source_files,
+    elapsed_ms,
+    is_test_file,
 )
-
-_CONFIDENCE = 0.8
-_MEANINGFUL_INIT_DOCSTRING_LEN = 10
-_JSDOC_TARGET_TYPES = frozenset(
-    {
-        "function_declaration",
-        "class_declaration",
-        "method_definition",
-        "export_statement",
-    }
-)
-_TEST_FILE_PATTERNS = (
-    "test_*.py",
-    "*_test.py",
-    "*_tests.py",
-    "test_*.ts",
-    "*.test.ts",
-    "*.spec.ts",
-    "test_*.tsx",
-    "*.test.tsx",
-    "*.spec.tsx",
-    "test_*.js",
-    "*.test.js",
-    "*.spec.js",
-    "test_*.jsx",
-    "*.test.jsx",
-    "*.spec.jsx",
-    "test_*.mjs",
-    "*.test.mjs",
-    "*.spec.mjs",
-)
-
-
-@dataclass(frozen=True)
-class _DocstringMatch:
-    text: str
-    line_number: int
+from specleft.discovery.miners.typescript.jsdoc import extract_jsdoc_items
+from specleft.discovery.models import DiscoveredItem, MinerResult, SupportedLanguage
 
 
 class DocstringMiner:
@@ -77,8 +36,8 @@ def mine(self, ctx: MinerContext) -> MinerResult:
         started = time.perf_counter()
         items: list[DiscoveredItem] = []
 
-        for rel_path in _candidate_source_files(ctx):
-            if _is_test_file(rel_path):
+        for rel_path in candidate_source_files(ctx):
+            if is_test_file(rel_path):
                 continue
 
             abs_path = ctx.root / rel_path
@@ -93,354 +52,18 @@ def mine(self, ctx: MinerContext) -> MinerResult:
 
             root_node, language = parsed
             if language == SupportedLanguage.PYTHON:
-                items.extend(_extract_python_items(root_node, source_bytes, rel_path))
+                items.extend(extract_python_items(root_node, source_bytes, rel_path))
             elif language in (
                 SupportedLanguage.TYPESCRIPT,
                 SupportedLanguage.JAVASCRIPT,
             ):
                 items.extend(
-                    _extract_jsdoc_items(root_node, source_bytes, rel_path, language)
+                    extract_jsdoc_items(root_node, source_bytes, rel_path, language)
                 )
 
         return MinerResult(
             miner_id=self.miner_id,
             miner_name=self.name,
             items=items,
-            duration_ms=_elapsed_ms(started),
-        )
-
-
-def _candidate_source_files(ctx: MinerContext) -> list[Path]:
-    source_dirs = ctx.config.source_dirs
-    if not source_dirs:
-        return []
-
-    return sorted(
-        ctx.file_index.files_under(*source_dirs),
-        key=lambda value: value.as_posix(),
-    )
-
-
-def _extract_python_items(
-    root_node: Any,
-    source_bytes: bytes,
-    file_path: Path,
-) -> list[DiscoveredItem]:
-    items: list[DiscoveredItem] = []
-    module_doc = _extract_python_leading_docstring(root_node, source_bytes)
-    if module_doc is not None:
-        items.append(
-            _make_item(
-                file_path=file_path,
-                line_number=module_doc.line_number,
-                language=SupportedLanguage.PYTHON,
-                target_kind="module",
-                target_name=file_path.stem,
-                text=module_doc.text,
-            )
-        )
-
-    for node in _walk_tree(root_node):
-        if node.type == "class_definition":
-            name = _field_text(node, "name", source_bytes)
-            class_doc = _extract_python_body_docstring(node, source_bytes)
-            if class_doc is None or not name:
-                continue
-            items.append(
-                _make_item(
-                    file_path=file_path,
-                    line_number=class_doc.line_number,
-                    language=SupportedLanguage.PYTHON,
-                    target_kind="class",
-                    target_name=name,
-                    text=class_doc.text,
-                )
-            )
-            continue
-
-        if node.type not in {"function_definition", "async_function_definition"}:
-            continue
-
-        name = _field_text(node, "name", source_bytes)
-        function_doc = _extract_python_body_docstring(node, source_bytes)
-        if function_doc is None or not name:
-            continue
-        if (
-            name == "__init__"
-            and len(function_doc.text.strip()) <= _MEANINGFUL_INIT_DOCSTRING_LEN
-        ):
-            continue
-
-        items.append(
-            _make_item(
-                file_path=file_path,
-                line_number=function_doc.line_number,
-                language=SupportedLanguage.PYTHON,
-                target_kind="function",
-                target_name=name,
-                text=function_doc.text,
-            )
+            duration_ms=elapsed_ms(started),
         )
-
-    return items
-
-
-def _extract_jsdoc_items(
-    root_node: Any,
-    source_bytes: bytes,
-    file_path: Path,
-    language: SupportedLanguage,
-) -> list[DiscoveredItem]:
-    items: list[DiscoveredItem] = []
-
-    def walk(parent: Any) -> None:
-        children = list(getattr(parent, "children", ()))
-        for index, child in enumerate(children):
-            if child.type == "comment":
-                jsdoc_text = _normalise_jsdoc(_node_text(child, source_bytes))
-                if jsdoc_text:
-                    target = _resolve_jsdoc_target(children, index, source_bytes)
-                    if target is not None:
-                        kind, name = _target_kind_and_name(
-                            target, source_bytes, file_path
-                        )
-                        items.append(
-                            _make_item(
-                                file_path=file_path,
-                                line_number=_line_number(child),
-                                language=language,
-                                target_kind=kind,
-                                target_name=name,
-                                text=jsdoc_text,
-                            )
-                        )
-
-            walk(child)
-
-    walk(root_node)
-    return items
-
-
-def _resolve_jsdoc_target(
-    siblings: list[Any],
-    comment_index: int,
-    source_bytes: bytes,
-) -> Any | None:
-    comment_node = siblings[comment_index]
-    for candidate in siblings[comment_index + 1 :]:
-        if candidate.type == "comment":
-            continue
-
-        target = _unwrap_export_target(candidate)
-        if target is None:
-            break
-
-        if not _is_immediately_before(comment_node, target, source_bytes):
-            break
-        return target
-    return None
-
-
-def _unwrap_export_target(node: Any) -> Any | None:
-    if node.type in _JSDOC_TARGET_TYPES and node.type != "export_statement":
-        return node
-    if node.type != "export_statement":
-        return None
-
-    for child in getattr(node, "named_children", ()):
-        if child.type in _JSDOC_TARGET_TYPES and child.type != "export_statement":
-            return child
-    return node
-
-
-def _is_immediately_before(
-    comment_node: Any, target_node: Any, source_bytes: bytes
-) -> bool:
-    source_text = source_bytes.decode("utf-8", errors="ignore")
-    lines = source_text.splitlines()
-    comment_line = int(comment_node.end_point[0])
-    target_line = int(target_node.start_point[0])
-    if target_line < comment_line:
-        return False
-    if target_line == comment_line:
-        return True
-    for index in range(comment_line + 1, target_line):
-        if index >= len(lines):
-            break
-        if lines[index].strip():
-            return False
-    return True
-
-
-def _target_kind_and_name(
-    node: Any,
-    source_bytes: bytes,
-    file_path: Path,
-) -> tuple[str, str | None]:
-    if node.type == "class_declaration":
-        return "class", _field_text(node, "name", source_bytes)
-    if node.type == "method_definition":
-        return "method", _field_text(node, "name", source_bytes)
-    if node.type == "function_declaration":
-        return "function", _field_text(node, "name", source_bytes)
-    if node.type == "export_statement":
-        return "module", file_path.stem
-    return "module", file_path.stem
-
-
-def _extract_python_leading_docstring(
-    container_node: Any,
-    source_bytes: bytes,
-) -> _DocstringMatch | None:
-    expression = _first_expression_string(container_node)
-    if expression is None:
-        return None
-    text = _clean_python_string(_node_text(expression, source_bytes))
-    if not text:
-        return None
-    return _DocstringMatch(text=text, line_number=_line_number(expression))
-
-
-def _extract_python_body_docstring(
-    definition_node: Any,
-    source_bytes: bytes,
-) -> _DocstringMatch | None:
-    body = definition_node.child_by_field_name("body")
-    if body is None:
-        return None
-    return _extract_python_leading_docstring(body, source_bytes)
-
-
-def _first_expression_string(container_node: Any) -> Any | None:
-    named_children = list(getattr(container_node, "named_children", ()))
-    if not named_children:
-        return None
-    first = named_children[0]
-    if first.type != "expression_statement":
-        return None
-
-    for child in getattr(first, "named_children", ()):
-        if child.type in {"string", "concatenated_string"}:
-            return child
-    return None
-
-
-def _clean_python_string(value: str) -> str | None:
-    stripped = value.strip()
-    if not stripped:
-        return None
-
-    try:
-        parsed = ast.literal_eval(stripped)
-    except (SyntaxError, ValueError):
-        parsed = _strip_wrapping_quotes(stripped)
-    if not isinstance(parsed, str):
-        return None
-
-    cleaned = parsed.strip()
-    return cleaned or None
-
-
-def _strip_wrapping_quotes(value: str) -> str:
-    for quote in ('"""', "'''", '"', "'"):
-        if value.startswith(quote) and value.endswith(quote) and len(value) >= 2:
-            return value[len(quote) : len(value) - len(quote)].strip()
-    return value
-
-
-def _normalise_jsdoc(raw_comment: str) -> str | None:
-    stripped = raw_comment.strip()
-    if not stripped.startswith("/**"):
-        return None
-
-    content = stripped
-    if content.startswith("/**"):
-        content = content[3:]
-    if content.endswith("*/"):
-        content = content[:-2]
-
-    lines = []
-    for line in content.splitlines():
-        cleaned = line.lstrip()
-        if cleaned.startswith("*"):
-            cleaned = cleaned[1:]
-            if cleaned.startswith(" "):
-                cleaned = cleaned[1:]
-        lines.append(cleaned.rstrip())
-
-    text = "\n".join(lines).strip()
-    return text or None
-
-
-def _make_item(
-    *,
-    file_path: Path,
-    line_number: int,
-    language: SupportedLanguage,
-    target_kind: str,
-    target_name: str | None,
-    text: str,
-) -> DiscoveredItem:
-    item_name = (
-        f"{target_kind}:{target_name}" if target_name else f"module:{file_path.stem}"
-    )
-    metadata = DocstringMeta(
-        target_kind=target_kind,
-        target_name=target_name,
-        text=text,
-    )
-    return DiscoveredItem(
-        kind=ItemKind.DOCSTRING,
-        name=item_name,
-        file_path=file_path,
-        line_number=line_number,
-        language=language,
-        raw_text=text,
-        metadata=metadata.model_dump(),
-        confidence=_CONFIDENCE,
-    )
-
-
-def _walk_tree(node: Any) -> list[Any]:
-    nodes: list[Any] = []
-    for child in getattr(node, "children", ()):
-        nodes.append(child)
-        nodes.extend(_walk_tree(child))
-    return nodes
-
-
-def _line_number(node: Any) -> int:
-    return int(node.start_point[0]) + 1
-
-
-def _field_text(node: Any, field: str, source_bytes: bytes) -> str | None:
-    field_node = node.child_by_field_name(field)
-    if field_node is None:
-        return None
-    text = _node_text(field_node, source_bytes).strip()
-    return text or None
-
-
-def _node_text(node: Any, source_bytes: bytes) -> str:
-    raw = getattr(node, "text", None)
-    if isinstance(raw, bytes):
-        return raw.decode("utf-8", errors="ignore")
-    if isinstance(raw, str):
-        return raw
-
-    start_byte = getattr(node, "start_byte", None)
-    end_byte = getattr(node, "end_byte", None)
-    if isinstance(start_byte, int) and isinstance(end_byte, int):
-        return source_bytes[start_byte:end_byte].decode("utf-8", errors="ignore")
-    return ""
-
-
-def _is_test_file(path: Path) -> bool:
-    file_name = path.name
-    if any(part in {"tests", "__tests__"} for part in path.parts):
-        return True
-    return any(fnmatch.fnmatch(file_name, pattern) for pattern in _TEST_FILE_PATTERNS)
-
-
-def _elapsed_ms(started: float) -> int:
-    return max(0, int((time.perf_counter() - started) * 1000))
diff --git a/src/specleft/discovery/miners/typescript/__init__.py b/src/specleft/discovery/miners/typescript/__init__.py
new file mode 100644
index 0000000..96fabc5
--- /dev/null
+++ b/src/specleft/discovery/miners/typescript/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""TypeScript/JavaScript-specific discovery miners."""
+
+from specleft.discovery.miners.typescript.jsdoc import extract_jsdoc_items
+
+__all__ = ["extract_jsdoc_items"]
diff --git a/src/specleft/discovery/miners/typescript/jsdoc.py b/src/specleft/discovery/miners/typescript/jsdoc.py
new file mode 100644
index 0000000..8459703
--- /dev/null
+++ b/src/specleft/discovery/miners/typescript/jsdoc.py
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""TypeScript/JavaScript JSDoc extraction for discovery miners."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from specleft.discovery.miners.shared.common import (
+    field_text,
+    line_number,
+    make_docstring_item,
+    node_text,
+)
+from specleft.discovery.models import DiscoveredItem, SupportedLanguage
+
+_JSDOC_TARGET_TYPES = frozenset(  # node types a JSDoc comment may be attached to
+    {
+        "function_declaration",
+        "class_declaration",
+        "method_definition",
+        "export_statement",  # unwrapped to its inner declaration when it has one
+    }
+)
+
+
+def extract_jsdoc_items(
+    root_node: Any,
+    source_bytes: bytes,
+    file_path: Path,
+    language: SupportedLanguage,
+) -> list[DiscoveredItem]:
+    """Collect JSDoc comments that resolve to a following TS/JS declaration."""
+    collected: list[DiscoveredItem] = []
+
+    def emit(comment: Any, row: list[Any], at: int) -> None:
+        doc_text = _normalise_jsdoc(node_text(comment, source_bytes))
+        # Comments that are not JSDoc never reach target resolution.
+        target = _resolve_jsdoc_target(row, at, source_bytes) if doc_text else None
+        if doc_text is None or target is None:
+            return
+        kind, name = _target_kind_and_name(target, source_bytes, file_path)
+        item = make_docstring_item(
+            file_path=file_path,
+            line_number=line_number(comment),
+            language=language,
+            target_kind=kind,
+            target_name=name,
+            text=doc_text,
+        )
+        collected.append(item)
+
+    def visit(parent: Any) -> None:
+        row = list(getattr(parent, "children", ()))
+        for at, child in enumerate(row):
+            if child.type == "comment":
+                emit(child, row, at)
+            visit(child)
+
+    visit(root_node)
+    return collected
+
+
+def _resolve_jsdoc_target(
+    siblings: list[Any],
+    comment_index: int,
+    source_bytes: bytes,
+) -> Any | None:
+    """Return the declaration documented by the comment at ``comment_index``.
+
+    Only the first non-comment sibling after the comment is considered.  It
+    must unwrap to a supported declaration and pass ``_is_immediately_before``;
+    otherwise ``None`` is returned.
+    """
+    comment_node = siblings[comment_index]
+    rest = siblings[comment_index + 1 :]
+    successor = next((node for node in rest if node.type != "comment"), None)
+    target = None if successor is None else _unwrap_export_target(successor)
+    if target is None or not _is_immediately_before(comment_node, target, source_bytes):
+        return None
+    return target
+
+
+def _unwrap_export_target(node: Any) -> Any | None:
+    """Map a sibling to its declaration, looking inside ``export`` statements."""
+    declarations = _JSDOC_TARGET_TYPES - {"export_statement"}
+    if node.type == "export_statement":
+        # Prefer the exported declaration; a bare export stands for itself.
+        inner = getattr(node, "named_children", ())
+        return next((kid for kid in inner if kid.type in declarations), node)
+    if node.type in declarations:
+        return node
+    return None
+
+
+def _is_immediately_before(
+    comment_node: Any, target_node: Any, source_bytes: bytes
+) -> bool:
+    """Return whether only blank lines separate the comment from its target.
+
+    Rows are split on line feeds alone so that indices line up with the node
+    row numbers; ``str.splitlines`` would also break on form feeds, U+2028 and
+    similar characters and shift every later index.
+    """
+    comment_row = int(comment_node.end_point[0])
+    target_row = int(target_node.start_point[0])
+    if target_row < comment_row:
+        return False
+    # An empty slice (same or adjacent rows) means the target is attached.
+    rows = source_bytes.decode("utf-8", errors="ignore").split("\n")
+    # ``strip`` also discards the "\r" that CRLF endings leave on each row.
+    return not any(row.strip() for row in rows[comment_row + 1 : target_row])
+
+
+def _target_kind_and_name(
+    node: Any,
+    source_bytes: bytes,
+    file_path: Path,
+) -> tuple[str, str | None]:
+    """Classify a target as ``(kind, name)``; non-declarations map to the module."""
+    kinds = {
+        "class_declaration": "class",
+        "method_definition": "method",
+        "function_declaration": "function",
+    }
+    if node.type in kinds:
+        return kinds[node.type], field_text(node, "name", source_bytes)
+    return "module", file_path.stem
+
+
+def _normalise_jsdoc(raw_comment: str) -> str | None:
+    """Return the text of a ``/** ... */`` comment, or ``None`` if not JSDoc.
+
+    The delimiters and each line's leading ``*`` gutter (plus one following
+    space) are removed.  ``/**/`` is an empty plain block comment rather than
+    JSDoc, so it is rejected instead of yielding a stray ``/``.
+    """
+    stripped = raw_comment.strip()
+    if not stripped.startswith("/**") or stripped.startswith("/**/"):
+        return None
+
+    content = stripped[3:]
+    if content.endswith("*/"):
+        content = content[:-2]
+
+    lines = []
+    for line in content.splitlines():
+        cleaned = line.lstrip()
+        if cleaned.startswith("*"):
+            cleaned = cleaned[2:] if cleaned.startswith("* ") else cleaned[1:]
+        lines.append(cleaned.rstrip())
+    return "\n".join(lines).strip() or None

From fb536773935b54fc046d9217265ace73abc7d605 Mon Sep 17 00:00:00 2001
From: Richard-Otterli <richard.kakengi@gmail.com>
Date: Fri, 6 Mar 2026 20:42:49 +0000
Subject: [PATCH 5/5] Reorganize remaining default miner modules (#131)

---
 src/specleft/discovery/miners/__init__.py     |  5 +-
 src/specleft/discovery/miners/defaults.py     | 18 +++++
 .../discovery/miners/shared/__init__.py       |  3 +-
 .../discovery/miners/shared/readme.py         | 72 +++++++++++++++++++
 src/specleft/discovery/pipeline.py            | 63 +---------------
 5 files changed, 97 insertions(+), 64 deletions(-)
 create mode 100644 src/specleft/discovery/miners/defaults.py
 create mode 100644 src/specleft/discovery/miners/shared/readme.py

diff --git a/src/specleft/discovery/miners/__init__.py b/src/specleft/discovery/miners/__init__.py
index 7c6d4ea..0f23a79 100644
--- a/src/specleft/discovery/miners/__init__.py
+++ b/src/specleft/discovery/miners/__init__.py
@@ -3,6 +3,7 @@
 
 """Discovery miner implementations."""
 
-from specleft.discovery.miners.shared import DocstringMiner
+from specleft.discovery.miners.defaults import default_miners
+from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner
 
-__all__ = ["DocstringMiner"]
+__all__ = ["DocstringMiner", "ReadmeOverviewMiner", "default_miners"]
diff --git a/src/specleft/discovery/miners/defaults.py b/src/specleft/discovery/miners/defaults.py
new file mode 100644
index 0000000..f6af3bf
--- /dev/null
+++ b/src/specleft/discovery/miners/defaults.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Default miner registry for discovery pipeline wiring."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from specleft.discovery.miners.shared import DocstringMiner, ReadmeOverviewMiner
+
+if TYPE_CHECKING:
+    from specleft.discovery.pipeline import BaseMiner
+
+
+def default_miners() -> list[BaseMiner]:
+    """Build fresh default miners: README overview first, then docstrings."""
+    return [miner_cls() for miner_cls in (ReadmeOverviewMiner, DocstringMiner)]
diff --git a/src/specleft/discovery/miners/shared/__init__.py b/src/specleft/discovery/miners/shared/__init__.py
index 85ce2aa..8e11f15 100644
--- a/src/specleft/discovery/miners/shared/__init__.py
+++ b/src/specleft/discovery/miners/shared/__init__.py
@@ -4,5 +4,6 @@
 """Shared miners used by multiple discovery workflows."""
 
 from specleft.discovery.miners.shared.docstrings import DocstringMiner
+from specleft.discovery.miners.shared.readme import ReadmeOverviewMiner
 
-__all__ = ["DocstringMiner"]
+__all__ = ["DocstringMiner", "ReadmeOverviewMiner"]
diff --git a/src/specleft/discovery/miners/shared/readme.py b/src/specleft/discovery/miners/shared/readme.py
new file mode 100644
index 0000000..368026f
--- /dev/null
+++ b/src/specleft/discovery/miners/shared/readme.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2026 SpecLeft Contributors
+
+"""Language-agnostic README overview miner."""
+
+from __future__ import annotations
+
+import uuid
+from pathlib import Path
+
+from specleft.discovery.context import MinerContext
+from specleft.discovery.models import (
+    DiscoveredItem,
+    DocstringMeta,
+    ItemKind,
+    MinerResult,
+    SupportedLanguage,
+)
+
+
+class ReadmeOverviewMiner:
+    """Extract a single high-level project overview from README content."""
+
+    miner_id = uuid.UUID("2f87e7a5-a362-4adc-a005-84457b6abc04")
+    name = "readme_overview"
+    languages: frozenset[SupportedLanguage] = frozenset()
+
+    def mine(self, ctx: MinerContext) -> MinerResult:
+        """Emit one item for the first readable README under ``ctx.root``.
+
+        The item holds the first non-blank line and its 1-based line number.
+        """
+        readme_paths = (Path("README.md"), Path("README.rst"), Path("README.txt"))
+        items: list[DiscoveredItem] = []
+        for rel_path in readme_paths:
+            abs_path = ctx.root / rel_path
+            if not abs_path.is_file():
+                continue
+
+            try:
+                raw_text = abs_path.read_text(encoding="utf-8")
+            except (OSError, UnicodeDecodeError):
+                continue
+
+            numbered = enumerate(raw_text.splitlines(), start=1)
+            line_number, first_line = next(
+                ((row, line.strip()) for row, line in numbered if line.strip()),
+                (1, "Project overview"),
+            )
+            item = DiscoveredItem(
+                kind=ItemKind.DOCSTRING,
+                name="project_overview",
+                file_path=rel_path,
+                line_number=line_number,
+                language=None,
+                raw_text=first_line,
+                metadata=DocstringMeta(
+                    target_kind="module",
+                    target_name="README",
+                    text=first_line,
+                ).model_dump(),
+                confidence=0.3,
+            )
+            items.append(item)
+            break
+
+        return MinerResult(
+            miner_id=self.miner_id,
+            miner_name=self.name,
+            items=items,
+            duration_ms=0,  # elapsed time is not measured by this miner
+        )
diff --git a/src/specleft/discovery/pipeline.py b/src/specleft/discovery/pipeline.py
index a123a11..71c8226 100644
--- a/src/specleft/discovery/pipeline.py
+++ b/src/specleft/discovery/pipeline.py
@@ -16,12 +16,9 @@
 from specleft.discovery.framework_detector import FrameworkDetector
 from specleft.discovery.language_detect import detect_project_languages
 from specleft.discovery.language_registry import LanguageRegistry
-from specleft.discovery.miners import DocstringMiner
+from specleft.discovery.miners import default_miners
 from specleft.discovery.models import (
     DiscoveryReport,
-    DiscoveredItem,
-    DocstringMeta,
-    ItemKind,
     MinerErrorKind,
     MinerResult,
     SupportedLanguage,
@@ -186,61 +183,5 @@ def _normalize_languages(
     return normalized
 
 
-class _ReadmeMiner:
-    """Minimal built-in miner used as default pipeline baseline."""
-
-    miner_id = uuid.UUID("2f87e7a5-a362-4adc-a005-84457b6abc04")
-    name = "readme_overview"
-    languages: frozenset[SupportedLanguage] = frozenset()
-
-    def mine(self, ctx: MinerContext) -> MinerResult:
-        readme_paths = (
-            Path("README.md"),
-            Path("README.rst"),
-            Path("README.txt"),
-        )
-
-        items: list[DiscoveredItem] = []
-        for rel_path in readme_paths:
-            abs_path = ctx.root / rel_path
-            if not abs_path.is_file():
-                continue
-
-            try:
-                raw_text = abs_path.read_text(encoding="utf-8")
-            except OSError:
-                continue
-            except UnicodeDecodeError:
-                continue
-
-            first_line = next(
-                (line.strip() for line in raw_text.splitlines() if line.strip()),
-                "Project overview",
-            )
-            item = DiscoveredItem(
-                kind=ItemKind.DOCSTRING,
-                name="project_overview",
-                file_path=rel_path,
-                line_number=1,
-                language=None,
-                raw_text=first_line,
-                metadata=DocstringMeta(
-                    target_kind="module",
-                    target_name="README",
-                    text=first_line,
-                ).model_dump(),
-                confidence=0.3,
-            )
-            items.append(item)
-            break
-
-        return MinerResult(
-            miner_id=self.miner_id,
-            miner_name=self.name,
-            items=items,
-            duration_ms=0,
-        )
-
-
 def _default_miners() -> list[BaseMiner]:
-    return [_ReadmeMiner(), DocstringMiner()]
+    return default_miners()