From b4f2a5c1780116110291996a1e79b9166f0691e9 Mon Sep 17 00:00:00 2001 From: Joe <4088382+JoeStech@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:53:24 -0600 Subject: [PATCH 1/3] full rebuild of knowledge base code with hybrid search and ampere docs --- embedding-generation/Dockerfile | 14 +- embedding-generation/document_chunking.py | 520 ++++++++++++++++ embedding-generation/eval_questions.json | 85 +++ embedding-generation/evaluate_retrieval.py | 122 ++++ embedding-generation/generate-chunks.py | 554 +++++++++++------- .../local_vectorstore_creation.py | 65 +- embedding-generation/requirements.txt | 4 +- .../tests/test_generate_chunks.py | 67 ++- embedding-generation/vector-db-sources.csv | 88 +++ mcp-local/Dockerfile | 12 +- mcp-local/requirements.txt | 3 +- mcp-local/server.py | 42 +- mcp-local/utils/search_utils.py | 290 ++++++--- 13 files changed, 1539 insertions(+), 327 deletions(-) create mode 100644 embedding-generation/document_chunking.py create mode 100644 embedding-generation/eval_questions.json create mode 100644 embedding-generation/evaluate_retrieval.py diff --git a/embedding-generation/Dockerfile b/embedding-generation/Dockerfile index dc88a56..4909565 100644 --- a/embedding-generation/Dockerfile +++ b/embedding-generation/Dockerfile @@ -19,9 +19,15 @@ FROM ${EMBEDDING_BASE_IMAGE} AS intrinsic-chunks FROM ubuntu:24.04 AS builder +ARG SOURCES_FILE=vector-db-sources.csv +ARG EMBEDDING_MODEL=all-MiniLM-L6-v2 + ENV DEBIAN_FRONTEND=noninteractive \ PIP_INDEX_URL=https://download.pytorch.org/whl/cpu \ - PIP_EXTRA_INDEX_URL=https://pypi.org/simple + PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ + SENTENCE_TRANSFORMER_MODEL=${EMBEDDING_MODEL} \ + HF_HOME=/embedding-data/.cache/huggingface \ + SENTENCE_TRANSFORMERS_HOME=/embedding-data/.cache/sentence_transformers # Install Python RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -32,6 +38,7 @@ WORKDIR /embedding-data # Copy Python scripts and dependencies COPY generate-chunks.py . 
+COPY document_chunking.py . COPY local_vectorstore_creation.py . COPY vector-db-sources.csv . COPY requirements.txt . @@ -42,8 +49,11 @@ COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks # Install Python dependencies (force CPU-only torch) RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt +# Pre-download the embedding model so local/offline loads succeed later in the build. +RUN python3 -c "from sentence_transformers import SentenceTransformer; import os; SentenceTransformer(os.environ['SENTENCE_TRANSFORMER_MODEL'], cache_folder=os.environ['SENTENCE_TRANSFORMERS_HOME'])" + # Generate vector database -RUN python3 generate-chunks.py vector-db-sources.csv && \ +RUN python3 generate-chunks.py ${SOURCES_FILE} && \ python3 local_vectorstore_creation.py && \ rm -f embeddings_*.txt diff --git a/embedding-generation/document_chunking.py b/embedding-generation/document_chunking.py new file mode 100644 index 0000000..43a6c99 --- /dev/null +++ b/embedding-generation/document_chunking.py @@ -0,0 +1,520 @@ +"""Utilities for parsing documentation sources into retrieval-friendly chunks.""" + +from __future__ import annotations + +from dataclasses import dataclass +from io import BytesIO +import math +import re +from typing import Dict, Iterable, List, Optional +from urllib.parse import urlparse + +from bs4 import BeautifulSoup +from pypdf import PdfReader + + +TOKEN_PATTERN = re.compile(r"\w+|[^\w\s]", re.UNICODE) +WORD_PATTERN = re.compile(r"\S+") +SENTENCE_SPLIT_PATTERN = re.compile(r"(?<=[.!?])\s+") +MARKDOWN_HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.*)$") +MARKDOWN_FENCE_PATTERN = re.compile(r"^(```|~~~)") +HTML_HEADING_TAGS = {f"h{level}" for level in range(1, 7)} +HTML_BLOCK_TAGS = HTML_HEADING_TAGS | {"p", "li", "pre", "code", "table"} +BOILERPLATE_LINE_PATTERNS = [ + re.compile(pattern, re.IGNORECASE) + for pattern in ( + r"^register\s*login$", + r"^english\s*chinese$", + r"^about\s*\|\s*contact 
us\s*\|\s*privacy\s*\|\s*sitemap$", + r"^this site runs on ampere processors\.?$", + r"^created at\s*:", + r"^last updated at\s*:", + r"^copy$", + r"^table of contents$", + r"^on this page$", + r"^skip to content$", + r"^sign in$", + r"^sign up$", + r"^all rights reserved\.?$", + r"^ampere computing llc$", + r"^products solutions developers support resources company$", + ) +] + + +@dataclass +class Block: + kind: str + text: str + + +@dataclass +class Section: + heading_path: List[str] + blocks: List[Block] + + +@dataclass +class ParsedDocument: + source_url: str + resolved_url: str + display_title: str + content_type: str + sections: List[Section] + + +def normalize_source_url(url: str) -> str: + """Strip browser-extension wrappers and normalize trivial URL noise.""" + url = (url or "").strip() + if url.startswith("chrome-extension://") and "https:/" in url: + _, tail = url.split("https:/", 1) + url = f"https://{tail.lstrip('/')}" + return url + + +def source_to_fetch_url(url: str) -> str: + """Resolve source URLs into directly fetchable content URLs.""" + url = normalize_source_url(url) + if url == "https://learn.arm.com/migration": + return ( + "https://raw.githubusercontent.com/ArmDeveloperEcosystem/" + "arm-learning-paths/refs/heads/main/content/migration/_index.md" + ) + if "/github.com/aws/aws-graviton-getting-started/" in url: + specific_content = url.split("/main/", 1)[1] + return ( + "https://raw.githubusercontent.com/aws/aws-graviton-getting-started/" + f"refs/heads/main/{specific_content}" + ) + if url.startswith("https://github.com/") and "/blob/" in url: + owner_repo, path = url.split("/blob/", 1) + branch, relative_path = path.split("/", 1) + return owner_repo.replace("https://github.com/", "https://raw.githubusercontent.com/") + f"/{branch}/{relative_path}" + return url + + +def estimate_tokens(text: str) -> int: + """Cheap token estimator good enough for chunk sizing.""" + if not text: + return 0 + return math.ceil(len(TOKEN_PATTERN.findall(text)) 
* 0.85) + + +def clean_text(text: str) -> str: + text = text.replace("\r\n", "\n").replace("\r", "\n") + text = re.sub(r"[ \t]+", " ", text) + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() + + +def is_boilerplate_line(line: str) -> bool: + line = clean_text(line) + if not line: + return False + if re.fullmatch(r"©\s*\d{4}.*", line): + return True + if re.fullmatch(r"\d+\s*/\s*\d+", line): + return True + if re.fullmatch(r"\d+", line): + return True + return any(pattern.match(line) for pattern in BOILERPLATE_LINE_PATTERNS) + + +def strip_frontmatter(markdown: str) -> str: + markdown = markdown.lstrip("\ufeff") + if markdown.startswith("---"): + end = markdown.find("\n---", 3) + if end != -1: + return markdown[end + 4 :].lstrip() + return markdown + + +def normalize_heading_path(title: str, heading_path: List[str]) -> List[str]: + normalized = [clean_text(part) for part in heading_path if clean_text(part)] + if normalized and clean_text(normalized[0]).lower() == clean_text(title).lower(): + normalized = normalized[1:] + return normalized + + +def parse_markdown(markdown: str, source_url: str, resolved_url: str, fallback_title: str) -> ParsedDocument: + markdown = strip_frontmatter(markdown) + lines = markdown.splitlines() + heading_stack: List[str] = [] + sections: List[Section] = [] + current_blocks: List[Block] = [] + current_paragraph: List[str] = [] + current_code: List[str] = [] + in_code_block = False + document_title = fallback_title + + def flush_paragraph() -> None: + nonlocal current_paragraph + if not current_paragraph: + return + paragraph = clean_text("\n".join(current_paragraph)) + current_paragraph = [] + if paragraph and not is_boilerplate_line(paragraph): + current_blocks.append(Block("paragraph", paragraph)) + + def flush_code() -> None: + nonlocal current_code + if not current_code: + return + code = "\n".join(current_code).strip() + current_code = [] + if code: + current_blocks.append(Block("code", code)) + + def flush_section() -> 
None: + if current_blocks: + sections.append(Section(list(heading_stack), list(current_blocks))) + current_blocks.clear() + + for line in lines: + if MARKDOWN_FENCE_PATTERN.match(line.strip()): + if in_code_block: + current_code.append(line) + flush_code() + in_code_block = False + else: + flush_paragraph() + in_code_block = True + current_code = [line] + continue + if in_code_block: + current_code.append(line) + continue + heading_match = MARKDOWN_HEADING_PATTERN.match(line.strip()) + if heading_match: + flush_paragraph() + flush_section() + level = len(heading_match.group(1)) + heading_text = clean_text(heading_match.group(2)) + if level == 1 and fallback_title == document_title: + document_title = heading_text + while len(heading_stack) >= level: + heading_stack.pop() + heading_stack.append(heading_text) + continue + if not line.strip(): + flush_paragraph() + continue + current_paragraph.append(line) + + flush_paragraph() + flush_code() + flush_section() + if not sections: + sections.append(Section([], [Block("paragraph", clean_text(markdown))])) + return ParsedDocument( + source_url=source_url, + resolved_url=resolved_url, + display_title=document_title, + content_type="markdown", + sections=sections, + ) + + +def _select_html_root(soup: BeautifulSoup): + for selector in ("main", "article", "[role='main']", ".article", ".content"): + root = soup.select_one(selector) + if root: + return root + return soup.body or soup + + +def _should_skip_html_tag(tag) -> bool: + if tag.name not in HTML_BLOCK_TAGS: + return True + parent = tag.parent + while parent is not None: + if getattr(parent, "name", None) in HTML_BLOCK_TAGS: + if tag.name == "code" and parent.name == "pre": + return True + if tag.name == "li" and parent.name not in {"ul", "ol"}: + return True + if tag.name not in {"li"}: + return True + parent = parent.parent + return False + + +def parse_html(html: str, source_url: str, resolved_url: str, fallback_title: str) -> ParsedDocument: + soup = 
BeautifulSoup(html, "html.parser") + for tag in soup.find_all(["script", "style", "nav", "footer", "header", "aside", "noscript", "svg", "form"]): + tag.decompose() + root = _select_html_root(soup) + title = fallback_title + if soup.find("meta", attrs={"property": "og:title"}): + title = clean_text(soup.find("meta", attrs={"property": "og:title"}).get("content", "")) or title + elif soup.title: + title = clean_text(soup.title.get_text(" ", strip=True)) or title + + heading_stack: List[str] = [] + sections: List[Section] = [] + current_blocks: List[Block] = [] + first_h1_seen = False + + def flush_section() -> None: + if current_blocks: + sections.append(Section(list(heading_stack), list(current_blocks))) + current_blocks.clear() + + for tag in root.find_all(list(HTML_BLOCK_TAGS)): + if _should_skip_html_tag(tag): + continue + text = clean_text(tag.get_text("\n" if tag.name == "pre" else " ", strip=True)) + if not text or is_boilerplate_line(text): + continue + if tag.name in HTML_HEADING_TAGS: + flush_section() + level = int(tag.name[1]) + while len(heading_stack) >= level: + heading_stack.pop() + heading_stack.append(text) + if level == 1 and not first_h1_seen: + title = text + first_h1_seen = True + continue + if tag.name == "table": + rows = [] + for row in tag.find_all("tr"): + values = [clean_text(cell.get_text(" ", strip=True)) for cell in row.find_all(["th", "td"])] + values = [value for value in values if value] + if values: + rows.append(" | ".join(values)) + text = "\n".join(rows) + if tag.name in {"pre", "code"}: + current_blocks.append(Block("code", f"```\n{text}\n```")) + elif tag.name == "li": + current_blocks.append(Block("paragraph", f"- {text}")) + else: + current_blocks.append(Block("paragraph", text)) + + flush_section() + if not sections: + page_text = clean_text(root.get_text("\n", strip=True)) + if page_text: + sections.append(Section([], [Block("paragraph", page_text)])) + return ParsedDocument( + source_url=source_url, + 
resolved_url=resolved_url, + display_title=title, + content_type="html", + sections=sections, + ) + + +def looks_like_heading(paragraph: str) -> bool: + text = clean_text(paragraph) + if not text or len(text) > 120: + return False + if text.endswith((".", "!", "?", ":")): + return False + if len(text.split()) > 12: + return False + return text == text.title() or text == text.upper() + + +def parse_pdf(pdf_bytes: bytes, source_url: str, resolved_url: str, fallback_title: str) -> ParsedDocument: + reader = PdfReader(BytesIO(pdf_bytes)) + sections: List[Section] = [] + document_title = fallback_title + for page_number, page in enumerate(reader.pages, start=1): + raw_text = clean_text(page.extract_text() or "") + if not raw_text: + continue + paragraphs = [clean_text(chunk) for chunk in re.split(r"\n\s*\n", raw_text) if clean_text(chunk)] + heading_path = [f"Page {page_number}"] + blocks: List[Block] = [] + for paragraph in paragraphs: + if page_number == 1 and document_title == fallback_title and len(paragraph.split()) <= 12: + document_title = paragraph + continue + if looks_like_heading(paragraph): + heading_path = [f"Page {page_number}", paragraph] + continue + if is_boilerplate_line(paragraph): + continue + blocks.append(Block("paragraph", paragraph)) + if blocks: + sections.append(Section(heading_path, blocks)) + if not sections: + sections.append(Section([], [Block("paragraph", fallback_title)])) + return ParsedDocument( + source_url=source_url, + resolved_url=resolved_url, + display_title=document_title, + content_type="pdf", + sections=sections, + ) + + +def parse_document_content( + source_url: str, + resolved_url: str, + response_content: bytes, + content_type: str, + fallback_title: str, +) -> ParsedDocument: + content_type = (content_type or "").lower() + if "pdf" in content_type or resolved_url.lower().endswith(".pdf"): + return parse_pdf(response_content, source_url, resolved_url, fallback_title) + decoded = response_content.decode("utf-8", 
errors="ignore") + if "markdown" in content_type or resolved_url.lower().endswith(".md"): + return parse_markdown(decoded, source_url, resolved_url, fallback_title) + if "html" in content_type or " List[str]: + merged: List[str] = [] + i = 0 + while i < len(blocks): + block = blocks[i] + if block.kind == "code": + parts = [] + if merged: + previous = merged.pop() + if estimate_tokens(previous) <= 180: + parts.append(previous) + else: + merged.append(previous) + parts.append(block.text) + if i + 1 < len(blocks) and blocks[i + 1].kind != "code": + if estimate_tokens(blocks[i + 1].text) <= 180: + parts.append(blocks[i + 1].text) + i += 1 + merged.append("\n\n".join(part for part in parts if part)) + else: + merged.append(block.text) + i += 1 + return [clean_text(item) for item in merged if clean_text(item)] + + +def split_text_recursively(text: str, max_tokens: int) -> List[str]: + text = clean_text(text) + if not text: + return [] + if estimate_tokens(text) <= max_tokens: + return [text] + parts = [clean_text(part) for part in re.split(r"\n\s*\n", text) if clean_text(part)] + if len(parts) > 1: + flattened: List[str] = [] + for part in parts: + flattened.extend(split_text_recursively(part, max_tokens)) + return flattened + if "```" not in text: + sentences = [clean_text(part) for part in SENTENCE_SPLIT_PATTERN.split(text) if clean_text(part)] + if len(sentences) > 1: + flattened = [] + for sentence in sentences: + flattened.extend(split_text_recursively(sentence, max_tokens)) + return flattened + words = WORD_PATTERN.findall(text) + step = max(1, int(max_tokens / 0.85)) + return [" ".join(words[index : index + step]) for index in range(0, len(words), step)] + + +def overlap_tail(text: str, overlap_tokens: int) -> str: + words = WORD_PATTERN.findall(text) + if len(words) <= overlap_tokens: + return text + return " ".join(words[-overlap_tokens:]) + + +def chunk_section_units( + units: List[str], + min_tokens: int, + max_tokens: int, + overlap_tokens: int, +) -> 
List[str]: + normalized_units: List[str] = [] + for unit in units: + normalized_units.extend(split_text_recursively(unit, max_tokens)) + + chunks: List[str] = [] + current_units: List[str] = [] + current_tokens = 0 + for unit in normalized_units: + unit_tokens = estimate_tokens(unit) + if current_units and current_tokens + unit_tokens > max_tokens and current_tokens >= min_tokens: + current_text = "\n\n".join(current_units) + chunks.append(current_text.strip()) + tail = overlap_tail(current_text, overlap_tokens) + current_units = [tail] if tail else [] + current_tokens = estimate_tokens(tail) + current_units.append(unit) + current_tokens += unit_tokens + + if current_units: + current_text = "\n\n".join(current_units).strip() + if chunks and estimate_tokens(current_text) < max(80, min_tokens // 2): + chunks[-1] = f"{chunks[-1]}\n\n{current_text}".strip() + else: + chunks.append(current_text) + return [chunk for chunk in chunks if clean_text(chunk)] + + +def build_chunk_text(title: str, heading_path: List[str], body: str) -> str: + normalized_heading_path = normalize_heading_path(title, heading_path) + heading_label = " > ".join(normalized_heading_path) if normalized_heading_path else title + return clean_text(f"Document Title: {title}\nHeading Path: {heading_label}\n\n{body}") + + +def derive_version(title: str, source_url: str, content: str = "") -> str: + haystack = " ".join([title, source_url, content[:4000]]) + match = re.search(r"\b(v?\d+(?:\.\d+){0,2})\b", haystack, re.IGNORECASE) + if match: + return match.group(1) + match = re.search(r"\b(20\d{2})\b", haystack) + if match: + return match.group(1) + return "" + + +def derive_product(title: str, source_url: str, doc_type: str, keywords: Iterable[str]) -> str: + haystack = " ".join([title, source_url, doc_type, *keywords]).lower() + if "graviton" in haystack: + return "AWS Graviton" + if "ampere" in haystack or "amperecomputing.com" in source_url: + return "Ampere" + if "learn.arm.com" in source_url or "/arm-" 
in source_url or " arm " in f" {haystack} ": + return "Arm" + return clean_text(doc_type) or "Documentation" + + +def chunk_parsed_document( + parsed_document: ParsedDocument, + doc_type: str, + keywords: List[str], + min_tokens: int = 300, + max_tokens: int = 600, + overlap_tokens: int = 50, +) -> List[Dict[str, str]]: + chunks: List[Dict[str, str]] = [] + product = derive_product(parsed_document.display_title, parsed_document.source_url, doc_type, keywords) + version = derive_version(parsed_document.display_title, parsed_document.resolved_url) + for section in parsed_document.sections: + heading_path = normalize_heading_path(parsed_document.display_title, section.heading_path) + units = merge_code_context(section.blocks) + if not units: + continue + for chunk_body in chunk_section_units(units, min_tokens, max_tokens, overlap_tokens): + heading = heading_path[-1] if heading_path else parsed_document.display_title + chunks.append( + { + "title": parsed_document.display_title, + "url": parsed_document.source_url, + "resolved_url": parsed_document.resolved_url, + "heading": heading, + "heading_path": heading_path, + "doc_type": doc_type, + "product": product, + "version": version, + "content_type": parsed_document.content_type, + "content": build_chunk_text(parsed_document.display_title, heading_path, chunk_body), + } + ) + return chunks diff --git a/embedding-generation/eval_questions.json b/embedding-generation/eval_questions.json new file mode 100644 index 0000000..242f5a3 --- /dev/null +++ b/embedding-generation/eval_questions.json @@ -0,0 +1,85 @@ +[ + { + "question": "How should worker_processes, worker_connections, and keepalive settings be tuned for NGINX on Ampere processors?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/nginx-tuning-guide"] + }, + { + "question": "Which MySQL configuration and benchmarking steps are recommended to improve OLTP performance on Ampere systems?", + "expected_urls": 
["https://amperecomputing.com/tuning-guides/mysql-tuning-guide"] + }, + { + "question": "What Redis server settings and benchmark client parameters does the Ampere tuning guide focus on?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/Redis-setup-and-tuning-guide"] + }, + { + "question": "How should Kafka brokers, storage, and benchmark settings be tuned on Ampere for better throughput and latency?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/apache-kafka-tuning-guide"] + }, + { + "question": "What JVM flags, profiling workflow, and GC advice are recommended for Java on Ampere Altra family processors?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/unlocking-java-performance-tuning-guide"] + }, + { + "question": "How do locking primitives and memory ordering work on Ampere Altra, and when are barriers required?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/locking-primitives-and-memory-ordering"] + }, + { + "question": "What huge page sizes are available on Arm64, and when should larger page sizes be used for performance tuning?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/understanding-memory-page-sizes-on-arm64"] + }, + { + "question": "Which GCC compiler options and tuning recommendations are called out in the 2025 Ampere GCC guide?", + "expected_urls": ["https://amperecomputing.com/tutorials/gcc-guide-ampere-processors"] + }, + { + "question": "How do I use the Ampere Porting Advisor to inspect Arm64 migration issues before porting an application?", + "expected_urls": ["https://amperecomputing.com/tutorials/porting-advisor"] + }, + { + "question": "What are the main deployment steps in the reference architecture for running an ELK stack on Google Tau T2A?", + "expected_urls": ["https://amperecomputing.com/reference-architecture/deploying-an-elk-stack-on-google-tau-t2a"] + }, + { + "question": "How do I build and tune DPDK cryptography workloads on Ampere systems?", + 
"expected_urls": ["https://amperecomputing.com/tuning-guides/dpdk-cryptography-build-and-tuning-guide"] + }, + { + "question": "What huge page, NIC, and core-affinity setup is recommended in the DPDK setup and tuning guide for Ampere?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/DPDK-setup-and-tuning-guide"] + }, + { + "question": "What bare-metal tuning advice does the Hadoop guide provide for Ampere processors?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/hadoop-tuning-guide-on-bare-metal"] + }, + { + "question": "How should MongoDB be configured and benchmarked on Ampere processors for better performance?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/mongoDB-tuning-guide"] + }, + { + "question": "What thread-count, connection, and benchmarking guidance does the Memcached tuning guide give for Ampere?", + "expected_urls": ["https://amperecomputing.com/tuning-guides/memcached-tuning-guide"] + }, + { + "question": "How can cryptography libraries be accelerated on Ampere processors according to the Ampere tutorial?", + "expected_urls": ["https://amperecomputing.com/tutorials/cryptography"] + }, + { + "question": "What does the Azure Dpsv5 workload brief say about running AI inference workloads on Ampere-based virtual machines?", + "expected_urls": ["https://amperecomputing.com/briefs/ai-inference-on-azure-brief"] + }, + { + "question": "Which storage layout and deployment pattern is described in the MinIO single-node workload brief on Ampere?", + "expected_urls": ["https://www.amperecomputing.com/briefs/minio-on-single-node-brief"] + }, + { + "question": "How do I get started with cloud-native FreeBSD on OCI Ampere A1 using Terraform?", + "expected_urls": ["https://amperecomputing.com/blogs/getting-cloud-native-with-freebsd-on-oci-ampere-a1-with-terraform-"] + }, + { + "question": "In the AWS Graviton performance runbook, how should I define a benchmark and configure the system under test before 
optimization?", + "expected_urls": [ + "https://github.com/aws/aws-graviton-getting-started/blob/main/perfrunbook/defining_your_benchmark.md", + "https://github.com/aws/aws-graviton-getting-started/blob/main/perfrunbook/configuring_your_sut.md" + ] + } +] diff --git a/embedding-generation/evaluate_retrieval.py b/embedding-generation/evaluate_retrieval.py new file mode 100644 index 0000000..e5e62c0 --- /dev/null +++ b/embedding-generation/evaluate_retrieval.py @@ -0,0 +1,122 @@ +"""Run a small retrieval evaluation over the local metadata and index.""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +from sentence_transformers import SentenceTransformer + + +REPO_ROOT = Path(__file__).resolve().parents[1] +MCP_LOCAL_DIR = REPO_ROOT / "mcp-local" +if str(MCP_LOCAL_DIR) not in sys.path: + sys.path.insert(0, str(MCP_LOCAL_DIR)) + +from utils.search_utils import build_bm25_index, deduplicate_urls, hybrid_search, load_metadata, load_usearch_index # noqa: E402 + + +def sentence_transformer_cache_folder() -> str | None: + return os.getenv("SENTENCE_TRANSFORMERS_HOME") or None + + +def evaluate(index_path: Path, metadata_path: Path, eval_path: Path, model_name: str, top_k: int) -> int: + metadata = load_metadata(str(metadata_path)) + if not metadata: + print(f"Metadata not found or empty: {metadata_path}") + return 1 + + embedding_model = SentenceTransformer( + model_name, + cache_folder=sentence_transformer_cache_folder(), + local_files_only=True, + ) + usearch_index = load_usearch_index( + str(index_path), + embedding_model.get_sentence_embedding_dimension(), + ) + bm25_index = build_bm25_index(metadata) + + with eval_path.open() as file: + eval_rows = json.load(file) + + hits_at_1 = 0 + hits_at_3 = 0 + hits_at_5 = 0 + reciprocal_ranks = [] + misses = [] + + for row in eval_rows: + raw_results = hybrid_search( + row["question"], + usearch_index, + metadata, + embedding_model, + bm25_index, + k=top_k, 
+ ) + results = deduplicate_urls(raw_results, max_chunks_per_url=1)[:top_k] + ranked_urls = [item["metadata"].get("url") for item in results] + expected = set(row["expected_urls"]) + + match_rank = None + for index, url in enumerate(ranked_urls, start=1): + if url in expected: + match_rank = index + break + + if match_rank == 1: + hits_at_1 += 1 + if match_rank is not None and match_rank <= 3: + hits_at_3 += 1 + if match_rank is not None and match_rank <= 5: + hits_at_5 += 1 + reciprocal_ranks.append(0 if match_rank is None else 1 / match_rank) + + if match_rank is None: + misses.append( + { + "question": row["question"], + "expected_urls": row["expected_urls"], + "ranked_urls": ranked_urls, + } + ) + + total = len(eval_rows) + print(f"Questions: {total}") + print(f"Hit@1: {hits_at_1 / total:.2%}") + print(f"Hit@3: {hits_at_3 / total:.2%}") + print(f"Hit@5: {hits_at_5 / total:.2%}") + print(f"MRR: {sum(reciprocal_ranks) / total:.3f}") + print(f"Misses: {len(misses)}") + for miss in misses[:10]: + print() + print(f"Q: {miss['question']}") + print(f"Expected: {miss['expected_urls']}") + print(f"Got: {miss['ranked_urls']}") + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Evaluate retrieval over the generated local knowledge base.") + parser.add_argument("--index-path", default="usearch_index.bin") + parser.add_argument("--metadata-path", default="metadata.json") + parser.add_argument("--eval-path", default="eval_questions.json") + parser.add_argument("--model-name", default="all-MiniLM-L6-v2") + parser.add_argument("--top-k", type=int, default=5) + args = parser.parse_args() + + return evaluate( + index_path=Path(args.index_path), + metadata_path=Path(args.metadata_path), + eval_path=Path(args.eval_path), + model_name=args.model_name, + top_k=args.top_k, + ) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/embedding-generation/generate-chunks.py b/embedding-generation/generate-chunks.py index 2175820..18e56fe 
100644 --- a/embedding-generation/generate-chunks.py +++ b/embedding-generation/generate-chunks.py @@ -13,14 +13,12 @@ # limitations under the License. import argparse -import sys import os import re import uuid import yaml import csv import datetime -import json import boto3 from botocore.exceptions import NoCredentialsError, ClientError @@ -28,6 +26,16 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from urllib.parse import parse_qs, urlparse + +from document_chunking import ( + chunk_parsed_document, + derive_product, + derive_version, + normalize_source_url, + parse_document_content, + source_to_fetch_url, +) # Create a session with retry logic for resilient HTTP requests @@ -88,14 +96,18 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks', 2. Learning Path titles must come from index page...send through function along with Graviton. ''' -yaml_dir = 'yaml_data' -details_file = 'info/chunk_details.csv' +yaml_dir = os.getenv('YAML_OUTPUT_DIR', 'yaml_data') +details_file = os.getenv('CHUNK_DETAILS_FILE', 'info/chunk_details.csv') chunk_index = 1 # Global var to prevent duplication entries from cross platform learning paths cross_platform_lps_dont_duplicate = [] +# Cache the ecosystem dashboard page so package entries do not re-fetch the same +# multi-megabyte HTML document for every source row. 
+ecosystem_dashboard_entries = None + # Global tracking for vector-db-sources.csv # Set of URLs already in the CSV (for deduplication) known_source_urls = set() @@ -181,11 +193,32 @@ def save_sources_csv(csv_file): print(f"Saved {len(all_sources)} sources to '{csv_file}'") class Chunk: - def __init__(self, title, url, uuid, keywords, content): + def __init__( + self, + title, + url, + uuid, + keywords, + content, + heading="", + heading_path=None, + doc_type="", + product="", + version="", + resolved_url="", + content_type="", + ): self.title = title self.url = url self.uuid = uuid self.content = content + self.heading = heading + self.heading_path = heading_path or [] + self.doc_type = doc_type + self.product = product + self.version = version + self.resolved_url = resolved_url + self.content_type = content_type # Translate keyword list into comma-separated string, and add similar words to keywords. self.keywords = self.formatKeywords(keywords) @@ -201,88 +234,161 @@ def toDict(self): 'url': self.url, 'uuid': self.uuid, 'keywords': self.keywords, - 'content': self.content + 'content': self.content, + 'heading': self.heading, + 'heading_path': self.heading_path, + 'doc_type': self.doc_type, + 'product': self.product, + 'version': self.version, + 'resolved_url': self.resolved_url, + 'content_type': self.content_type, } def __repr__(self): - return f"Chunk(title={self.title}, focus={self.focus}, url={self.url}, uuid={self.uuid}, display_name={self.display_name}, content={self.content})" + return f"Chunk(title={self.title}, url={self.url}, uuid={self.uuid}, heading={self.heading})" -def createEcosystemDashboardChunks(): - ''' Format of Chunk text_snippet: - .NET works on Arm Linux servers starting from version 5 released in November 2020. 
+def build_ecosystem_dashboard_entries(): + """Load and cache package-level snippets from the ecosystem dashboard.""" + global ecosystem_dashboard_entries + if ecosystem_dashboard_entries is not None: + return ecosystem_dashboard_entries - [Download .NET here.](https://dotnet.microsoft.com/en-us/download/dotnet) + def create_text_snippet(main_row): + package_name = main_row.get('data-title') + download_link = main_row.find('a', class_='download-icon-a') + download_url = download_link.get('href') if download_link else None - To get started quickly, here are some helpful guides from different sources: - - [Arm guide](https://learn.arm.com/install-guides/dotnet/) - - [CSP guide](https://aws.amazon.com/blogs/dotnet/powering-net-8-with-aws-graviton3-benchmarks/) - - [Official documentation](https://learn.microsoft.com/en-us/dotnet/core/install/linux-ubuntu) - ''' - - def createTextSnippet(main_row): - package_name = row.get('data-title') - download_url = row.find('a', class_='download-icon-a').get('href') - - # Get the support statement next_row = main_row.find_next_sibling('tr') - works_on_arm_div = next_row.find('div', class_='description') - - arm_support_statement = works_on_arm_div.get_text().replace('\n',' ') - - # Get individual links to help - quick_start_links_div = works_on_arm_div.parent.find_next_sibling('section').find('div', class_='description') - li_elements = quick_start_links_div.find_all('li') - get_started_text = "" - if li_elements: - get_started_text = "\n\nTo get started quickly, here are some helpful guides from different sources:\n" - for li in quick_start_links_div.find_all('li'): - get_started_text = get_started_text + f"- [{li.find('a').get_text()}]({li.find('a').get('href')})\n" - - - - text_snippet = f"{arm_support_statement}\n\n[Download {package_name} here.]({download_url}){get_started_text}" - return text_snippet + works_on_arm_div = next_row.find('div', class_='description') if next_row else None + arm_support_statement = "" + if 
works_on_arm_div: + arm_support_statement = works_on_arm_div.get_text(" ", strip=True) + + quick_start_section = None + if works_on_arm_div and works_on_arm_div.parent: + next_section = works_on_arm_div.parent.find_next_sibling('section') + if next_section: + quick_start_section = next_section.find('div', class_='description') + + quick_start_lines = [] + if quick_start_section: + for li in quick_start_section.find_all('li'): + link = li.find('a') + if not link: + continue + link_text = link.get_text(" ", strip=True) + link_href = link.get('href') + if link_text and link_href: + quick_start_lines.append(f"- [{link_text}]({link_href})") + + snippet_parts = [] + if arm_support_statement: + snippet_parts.append(arm_support_statement) + if download_url: + snippet_parts.append(f"[Download {package_name} here.]({download_url})") + if quick_start_lines: + snippet_parts.append( + "To get started quickly, here are some helpful guides from different sources:\n" + + "\n".join(quick_start_lines) + ) + return "\n\n".join(part for part in snippet_parts if part) - # Obtain all url = "https://www.arm.com/developer-hub/ecosystem-dashboard/" response = http_session.get(url, timeout=60) + response.raise_for_status() soup = BeautifulSoup(response.text, 'html.parser') - rows = soup.find_all('tr', class_=['main-sw-row']) + rows = soup.find_all('tr', class_=['main-sw-row']) + entries = {} for row in rows: - # Obtain details for text snippet - text_snippet = createTextSnippet(row) package_name = row.get('data-title') - package_name_urlized = row.get('data-title-urlized') + package_slug = row.get('data-title-urlized') + if not package_name or not package_slug: + continue - # Keywords - keywords=[package_name] - for c in row.get('class'): + keywords = [package_name] + for c in row.get('class', []): if 'tag-' in c: keywords.append(c.replace('tag-license-','').replace('tag-category-','')) + package_url = f"{url}?package={package_slug}" + entries[package_slug] = { + "display_name": f"Ecosystem 
Dashboard - {package_name}", + "package_name": package_name, + "keywords": keywords, + "url": package_url, + "resolved_url": response.url + f"?package={package_slug}", + "content": create_text_snippet(row), + } - package_url = f"{url}?package={package_name_urlized}" - - # Register this ecosystem dashboard entry as a source + ecosystem_dashboard_entries = entries + return ecosystem_dashboard_entries + + +def ecosystem_dashboard_slug_from_url(source_url): + query = parse_qs(urlparse(source_url).query) + values = query.get("package", []) + if values: + return values[0].strip() + return "" + + +def create_ecosystem_dashboard_chunk(source_url, source_name, keywords_value): + package_slug = ecosystem_dashboard_slug_from_url(source_url) + if not package_slug: + return [] + + entry = build_ecosystem_dashboard_entries().get(package_slug) + if not entry or not entry["content"]: + return [] + + keywords = parse_keywords(keywords_value, entry["package_name"]) + return [ + createChunk( + text_snippet=entry["content"], + WEBSITE_url=normalize_source_url(source_url), + keywords=keywords, + title=entry["display_name"], + heading=entry["package_name"], + heading_path=[entry["package_name"]], + doc_type="Ecosystem Dashboard", + product=derive_product(entry["display_name"], source_url, "Ecosystem Dashboard", keywords), + version=derive_version(entry["display_name"], entry["resolved_url"], entry["content"]), + resolved_url=entry["resolved_url"], + content_type="html", + ) + ] + + +def createEcosystemDashboardChunks(emit_chunks=True): + for entry in build_ecosystem_dashboard_entries().values(): register_source( site_name='Ecosystem Dashboard', license_type='Arm Proprietary', - display_name=f'Ecosystem Dashboard - {package_name}', - url=package_url, - keywords=keywords + display_name=entry["display_name"], + url=entry["url"], + keywords=entry["keywords"] ) - + if not emit_chunks: + continue + chunk = Chunk( - title = f"Ecosystem Dashboard - {package_name}", - url = package_url, - uuid = 
str(uuid.uuid4()), - keywords = keywords, - content = text_snippet + title=entry["display_name"], + url=entry["url"], + uuid=str(uuid.uuid4()), + keywords=entry["keywords"], + content=entry["content"], + heading=entry["package_name"], + heading_path=[entry["package_name"]], + doc_type="Ecosystem Dashboard", + product=derive_product(entry["display_name"], entry["url"], "Ecosystem Dashboard", entry["keywords"]), + version=derive_version(entry["display_name"], entry["resolved_url"], entry["content"]), + resolved_url=entry["resolved_url"], + content_type="html", ) - chunkSaveAndTrack(url,chunk) + chunkSaveAndTrack(entry["url"], chunk) - return + return def createIntrinsicsDatabaseChunks(): @@ -403,30 +509,50 @@ def htmlToMarkdown(html_string): ''' -def processLearningPath(url,type): +def processLearningPath(url, type, emit_chunks=True): github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/production/content" site_link = "https://learn.arm.com" def chunkizeLearningPath(relative_url, title, keywords): + if not emit_chunks: + return if relative_url.endswith('/'): relative_url = relative_url[:-1] MARKDOWN_url = github_raw_link + relative_url + '.md' WEBSITE_url = site_link + relative_url + response = fetch_with_logging(MARKDOWN_url) + if response is None: + return + parsed_document = parse_document_content( + source_url=WEBSITE_url, + resolved_url=response.url, + response_content=response.content, + content_type=response.headers.get("content-type", "text/markdown"), + fallback_title=title, + ) + chunk_payloads = chunk_parsed_document( + parsed_document, + doc_type=type, + keywords=keywords, + ) - # 3) Extract markdown, skipping those that are 404ing - if not URLIsValidCheck(MARKDOWN_url): - return - markdown = obtainMarkdownContentFromGitHubMDFile(MARKDOWN_url) - - # 4) Get sized text snippets the markdown - text_snippets = obtainTextSnippets__Markdown(markdown) - - # 5) Create chunks for each snippet by adding metadata - 
for text_snippet in text_snippets: - chunk = createChunk(text_snippet, WEBSITE_url, keywords, title) - - chunkSaveAndTrack(WEBSITE_url,chunk) + # 5) Create chunks for each snippet by adding metadata + for payload in chunk_payloads: + chunk = createChunk( + payload["content"], + WEBSITE_url, + keywords, + payload["title"], + heading=payload["heading"], + heading_path=payload["heading_path"], + doc_type=payload["doc_type"], + product=payload["product"], + version=payload["version"], + resolved_url=payload["resolved_url"], + content_type=payload["content_type"], + ) + chunkSaveAndTrack(WEBSITE_url,chunk) if type == 'Learning Path': @@ -534,20 +660,20 @@ def chunkizeLearningPath(relative_url, title, keywords): for guide in multi_install_guides: sub_ig_rel_url = guide.get('link') - chunkizeLearningPath(sub_ig_rel_url,title, keywords) + chunkizeLearningPath(sub_ig_rel_url,title, keywords) # If not multi-install (most cases) else: chunkizeLearningPath(ig_rel_url,title, keywords) -def createLearningPathChunks(): +def createLearningPathChunks(emit_chunks=True): # Find all categories to iterate over learn_url = "https://learn.arm.com/" response = http_session.get(learn_url, timeout=60) soup = BeautifulSoup(response.text, 'html.parser') # Process Install Guides separately (directly from /install-guides page) - processLearningPath("/install-guides", "Install Guide") + processLearningPath("/install-guides", "Install Guide", emit_chunks=emit_chunks) # Find category links - main-topic-card elements are now wrapped in tags # Look for tags that contain main-topic-card divs @@ -569,7 +695,7 @@ def createLearningPathChunks(): continue lp_url = learn_url.rstrip('/') + lp_link # Chunking step - processLearningPath(lp_url, "Learning Path") + processLearningPath(lp_url, "Learning Path", emit_chunks=emit_chunks) def readInCSV(csv_file): @@ -581,7 +707,9 @@ def readInCSV(csv_file): csv_dict = { 'urls': [], 'focus': [], - 'source_names': [] + 'source_names': [], + 'site_names': [], + 
'license_types': [], } if not os.path.exists(csv_file): @@ -590,9 +718,11 @@ def readInCSV(csv_file): with open(csv_file, 'r', newline='', encoding='utf-8') as file: reader = csv.DictReader(file) for row in reader: - csv_dict['urls'].append(row.get('URL', '')) + csv_dict['urls'].append(normalize_source_url(row.get('URL', ''))) csv_dict['focus'].append(row.get('Keywords', '')) csv_dict['source_names'].append(row.get('Display Name', '')) + csv_dict['site_names'].append(row.get('Site Name', '')) + csv_dict['license_types'].append(row.get('License Type', '')) return csv_dict, len(csv_dict['urls']) @@ -601,30 +731,14 @@ def getMarkdownGitHubURLsFromPage(url): GH_urls = [] SITE_urls = [] - if url == 'https://learn.arm.com/migration': - github_raw_link = "https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/main/content" - github_md_link = github_raw_link + '/migration/_index.md' - - SITE_urls.append(url) - GH_urls.append(github_md_link) - - elif '/github.com/aws/aws-graviton-getting-started/' in url: - github_raw_link = "https://raw.githubusercontent.com/aws/aws-graviton-getting-started/refs/heads/main/" - - # Rip off part of the URL after '/main/' - specific_content = url.split('/main/')[1] - - github_md_link = github_raw_link + specific_content - - SITE_urls.append(url) - GH_urls.append(github_md_link) - + fetch_url = source_to_fetch_url(url) + if fetch_url != normalize_source_url(url): + SITE_urls.append(normalize_source_url(url)) + GH_urls.append(fetch_url) else: print('url doesnt match expected format. 
Check function and try again.') print('URL: ',url) - - return GH_urls, SITE_urls @@ -639,6 +753,25 @@ def URLIsValidCheck(url): csv_writer = csv.writer(csvfile) csv_writer.writerow([url,str(http_err)]) return False + + +def fetch_with_logging(url): + try: + response = http_session.get(url, timeout=60) + response.raise_for_status() + return response + except requests.exceptions.HTTPError as http_err: + print(f"HTTP error occurred: {http_err}") + with open('info/errors.csv', 'a', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow([url, str(http_err)]) + return None + except Exception as err: + print(f"Other error occurred: {err}") + with open('info/errors.csv', 'a', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow([url, str(err)]) + return None except Exception as err: print(f"Other error occurred: {err}") with open('info/errors.csv', 'a', newline='') as csvfile: @@ -652,106 +785,57 @@ def obtainMarkdownContentFromGitHubMDFile(gh_url): response.raise_for_status() # Ensure we got a valid response md_content = response.text - - # Remove frontmatter bounded by '---' - md_content = md_content[md_content.find('---', 3) + 3:].strip() # +3 to remove the '---' and strip to remove leading/trailing whitespace - return md_content def obtainTextSnippets__Markdown(content, min_words=300, max_words=500, min_final_words=200): - """Split content into chunks based on headers and word count constraints.""" - - # Helper function to count words - def word_count(text): - return len(text.split()) - - # Helper function to split content by a given heading level (e.g., h2, h3, h4) - def split_by_heading(content, heading_level): - pattern = re.compile(rf'(?<=\n)({heading_level} .+)', re.IGNORECASE) - return pattern.split(content) - - # Helper function to chunk content - def create_chunks(content_pieces, heading_level='##'): - """ - Create chunks from content pieces based on the word count limits. 
- """ - chunks = [] - current_chunk = "" - current_word_count = 0 - - for piece in content_pieces: - piece_word_count = word_count(piece) - - # Check if the current piece starts with the heading level, indicating the start of a new section - if re.match(rf'^{heading_level} ', piece.strip()): - # If the current chunk has enough words, finalize it and start a new chunk - if current_word_count >= min_words: - chunks.append(current_chunk.strip()) - current_chunk = "" - current_word_count = 0 - - # Add the piece to the current chunk - if current_word_count + piece_word_count > max_words and current_word_count >= min_words: - # If adding this piece exceeds max_words, finalize the current chunk - chunks.append(current_chunk.strip()) - current_chunk = piece.strip() - current_word_count = piece_word_count - else: - current_chunk += piece + "\n" - current_word_count += piece_word_count - - # Handle the last chunk - if current_chunk.strip(): - if current_word_count < min_final_words and chunks: - # If the last chunk is too small, merge it with the previous chunk - chunks[-1] += "\n" + current_chunk.strip() - else: - # Otherwise, add it as a separate chunk - chunks.append(current_chunk.strip()) - - return chunks - - # 1. Split by h2 headings - content_pieces = split_by_heading(content, '##') - chunks = create_chunks(content_pieces) - - # 2. Further split large chunks by h3 if they exceed max_words - final_chunks = [] - for chunk in chunks: - if word_count(chunk) > max_words: - sub_pieces = split_by_heading(chunk, '###') - sub_chunks = create_chunks(sub_pieces,'###') - - # 3. Further split large sub-chunks by h4 if they exceed max_words - for sub_chunk in sub_chunks: - if word_count(sub_chunk) > max_words: - sub_sub_pieces = split_by_heading(sub_chunk, '####') - sub_sub_chunks = create_chunks(sub_sub_pieces,'####') - - # 4. 
If still too large, split by paragraph - for sub_sub_chunk in sub_sub_chunks: - if word_count(sub_sub_chunk) > max_words: - paragraphs = sub_sub_chunk.split('\n\n') - paragraph_chunks = create_chunks(paragraphs) - final_chunks.extend(paragraph_chunks) - else: - final_chunks.append(sub_sub_chunk) - else: - final_chunks.append(sub_chunk) - else: - final_chunks.append(chunk) - - return final_chunks - - -def createChunk(text_snippet,WEBSITE_url,keywords,title): + """Backward-compatible wrapper that now uses structured chunking.""" + if not content or not content.strip(): + return [] + parsed_document = parse_document_content( + source_url="https://example.com", + resolved_url="https://example.com/doc.md", + response_content=content.encode("utf-8"), + content_type="text/markdown", + fallback_title="Document", + ) + chunks = chunk_parsed_document( + parsed_document, + doc_type="Markdown", + keywords=[], + min_tokens=min_words, + max_tokens=max_words, + overlap_tokens=max(0, min_final_words // 4), + ) + return [chunk["content"] for chunk in chunks] + + +def createChunk( + text_snippet, + WEBSITE_url, + keywords, + title, + heading="", + heading_path=None, + doc_type="", + product="", + version="", + resolved_url="", + content_type="", +): chunk = Chunk( title = title, url = WEBSITE_url, uuid = str(uuid.uuid4()), keywords = keywords, - content = text_snippet + content = text_snippet, + heading = heading, + heading_path = heading_path or [], + doc_type = doc_type, + product = product, + version = version, + resolved_url = resolved_url, + content_type = content_type, ) return chunk @@ -768,6 +852,48 @@ def printChunks(chunks): print('='*100) +def parse_keywords(keywords_value, title=""): + keywords = [keyword.strip() for keyword in re.split(r"[;,]", keywords_value or "") if keyword.strip()] + if title and title not in keywords: + keywords.append(title) + return keywords + + +def create_chunks_for_source(source_url, source_name, doc_type, keywords_value): + if doc_type == 
"Ecosystem Dashboard": + return create_ecosystem_dashboard_chunk(source_url, source_name, keywords_value) + + fetch_url = source_to_fetch_url(source_url) + response = fetch_with_logging(fetch_url) + if response is None: + print('not valid, ', fetch_url) + return [] + parsed_document = parse_document_content( + source_url=normalize_source_url(source_url), + resolved_url=response.url, + response_content=response.content, + content_type=response.headers.get("content-type", ""), + fallback_title=source_name, + ) + keywords = parse_keywords(keywords_value, source_name) + return [ + createChunk( + text_snippet=payload["content"], + WEBSITE_url=payload["url"], + keywords=keywords, + title=payload["title"], + heading=payload["heading"], + heading_path=payload["heading_path"], + doc_type=payload["doc_type"], + product=payload["product"], + version=payload["version"], + resolved_url=payload["resolved_url"], + content_type=payload["content_type"], + ) + for payload in chunk_parsed_document(parsed_document, doc_type=doc_type or "Documentation", keywords=keywords) + ] + + def chunkSaveAndTrack(url,chunk): def addNewRow(current_date,chunk_words,chunk_id): @@ -828,7 +954,7 @@ def recordChunk(): def main(): - + skip_discovery = os.getenv("SKIP_DISCOVERY", "").lower() in {"1", "true", "yes"} # Ensure intrinsic_chunks folder and files from S3 are present ensure_intrinsic_chunks_from_s3() @@ -853,17 +979,23 @@ def main(): # 0) Initialize files os.makedirs(yaml_dir, exist_ok=True) # create if doesn't exist - os.makedirs('info', exist_ok=True) # create if doesn't exist + details_dir = os.path.dirname(details_file) + if details_dir: + os.makedirs(details_dir, exist_ok=True) + for filename in os.listdir(yaml_dir): + if filename.startswith('chunk_') and filename.endswith('.yaml'): + os.remove(os.path.join(yaml_dir, filename)) with open(details_file, mode='w', newline='') as file: writer = csv.writer(file) writer.writerow(['URL','Date', 'Number of Words', 'Number of Chunks','Chunk IDs']) # 
0) Obtain full database information: # a) Learning Paths & Install Guides - createLearningPathChunks() + if not skip_discovery: + createLearningPathChunks(emit_chunks=False) - # b) Ecosystem Dashboard - createEcosystemDashboardChunks() + # b) Ecosystem Dashboard + createEcosystemDashboardChunks(emit_chunks=False) # c) Intrinsics #createIntrinsicsDatabaseChunks() @@ -875,29 +1007,11 @@ def main(): for i in range(csv_length): url = csv_dict['urls'][i] source_name = csv_dict['source_names'][i] + doc_type = csv_dict['site_names'][i] + keywords_value = csv_dict['focus'][i] - # 2) Translate a URL into all it's individual page URLs, if applicable, as their raw GitHub MD files --> https://raw.githubusercontent.com/ArmDeveloperEcosystem/arm-learning-paths/refs/heads/main/content/learning-paths/servers-and-cloud-computing/llama-cpu/llama-chatbot.md - MARKDOWN_urls, WEBSITE_urls = getMarkdownGitHubURLsFromPage(url) - for j in range(len(MARKDOWN_urls)): - MARKDOWN_url = MARKDOWN_urls[j] - WEBSITE_url = WEBSITE_urls[j] - - # 3) Extract markdown, skipping those that are 404ing - if not URLIsValidCheck(MARKDOWN_url): - print('not valid, ',MARKDOWN_url) - continue - markdown = obtainMarkdownContentFromGitHubMDFile(MARKDOWN_url) - - # 4) Get keywords (removing -) - keywords = [source_name.replace(" - ", " ").replace(" ", ", ")] - - # 4) Get sized text snippets the markdown - text_snippets = obtainTextSnippets__Markdown(markdown) - - # 5) Create chunks for each snippet by adding metadata - for text_snippet in text_snippets: - chunk = createChunk(text_snippet, WEBSITE_url, keywords, source_name) - chunkSaveAndTrack(url,chunk) + for chunk in create_chunks_for_source(url, source_name, doc_type, keywords_value): + chunkSaveAndTrack(url, chunk) # Save updated sources CSV with all discovered sources save_sources_csv(sources_file) @@ -906,4 +1020,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git 
a/embedding-generation/local_vectorstore_creation.py b/embedding-generation/local_vectorstore_creation.py index 08f5899..f4afeda 100644 --- a/embedding-generation/local_vectorstore_creation.py +++ b/embedding-generation/local_vectorstore_creation.py @@ -19,22 +19,27 @@ import json import os import glob -import sys import datetime from sentence_transformers import SentenceTransformer from usearch.index import Index +def sentence_transformer_cache_folder(): + return os.getenv("SENTENCE_TRANSFORMERS_HOME") or None + + def load_local_yaml_files() -> List[Dict]: """Load locally stored YAML files and return their contents as a list of dictionaries.""" print("Loading local YAML files") yaml_contents = [] + intrinsic_dir = os.getenv("INTRINSIC_CHUNKS_DIR", "intrinsic_chunks") + yaml_dir = os.getenv("YAML_DATA_DIR", "yaml_data") - intrinsic_files = glob.glob(os.path.join("intrinsic_chunks", "*.yaml")) - print(f"Found {len(intrinsic_files)} YAML files in intrinsic_chunks directory") + intrinsic_files = glob.glob(os.path.join(intrinsic_dir, "*.yaml")) + print(f"Found {len(intrinsic_files)} YAML files in {intrinsic_dir} directory") - yaml_data_files = glob.glob(os.path.join("yaml_data", "*.yaml")) - print(f"Found {len(yaml_data_files)} YAML files in yaml_data directory") + yaml_data_files = glob.glob(os.path.join(yaml_dir, "*.yaml")) + print(f"Found {len(yaml_data_files)} YAML files in {yaml_dir} directory") # Combine all files all_files = intrinsic_files + yaml_data_files @@ -42,12 +47,13 @@ def load_local_yaml_files() -> List[Dict]: print(f"Total files to process: {total_files}") for i, file_path in enumerate(all_files, 1): - print(f"Loading file {i}/{total_files}: {file_path}") + if i <= 10 or i % 1000 == 0 or i == total_files: + print(f"Loading file {i}/{total_files}: {file_path}") # Extract chunk identifier based on file location - if file_path.startswith("intrinsic_chunks"): + if os.path.normpath(file_path).startswith(os.path.normpath(intrinsic_dir)): chunk_uuid = 
f"intrinsic_{os.path.basename(file_path).replace('.yaml', '')}" - elif file_path.startswith("yaml_data"): + elif os.path.normpath(file_path).startswith(os.path.normpath(yaml_dir)): chunk_uuid = f"yaml_data_{os.path.basename(file_path).replace('.yaml', '')}" else: chunk_uuid = file_path.replace('chunk_', '').replace('.yaml', '') @@ -68,7 +74,11 @@ def load_local_yaml_files() -> List[Dict]: def create_embeddings(contents: List[str], model_name: str = 'all-MiniLM-L6-v2') -> np.ndarray: """Create embeddings for the given contents using SentenceTransformers.""" print(f"Creating embeddings using model: {model_name}") - model = SentenceTransformer(model_name) + model = SentenceTransformer( + model_name, + cache_folder=sentence_transformer_cache_folder(), + local_files_only=True, + ) embeddings = model.encode(contents, show_progress_bar=True, convert_to_numpy=True) print(f"Created embeddings with shape: {embeddings.shape}") return embeddings @@ -96,9 +106,6 @@ def create_usearch_index(embeddings: np.ndarray, metadata: List[Dict]) -> Tuple[ print(f"Adding {num_vectors} vectors to the index") for i, embedding in enumerate(embeddings): index.add(i, embedding) - - for item, vec in zip(metadata, embeddings): - item['vector'] = vec.tolist() print(f"Added {len(index)} vectors to the index") return index, metadata @@ -115,15 +122,39 @@ def main(): contents = [] metadata = [] for i, yaml_content in enumerate(yaml_contents, 1): - print(f"Processing YAML content {i}/{len(yaml_contents)}") + if i <= 10 or i % 1000 == 0 or i == len(yaml_contents): + print(f"Processing YAML content {i}/{len(yaml_contents)}") contents.append(yaml_content['content']) + heading_path = yaml_content.get('heading_path', []) or [] + search_text = " ".join( + str(value) + for value in [ + yaml_content.get('title', ''), + " ".join(heading_path), + yaml_content.get('heading', ''), + yaml_content.get('doc_type', ''), + yaml_content.get('product', ''), + yaml_content.get('version', ''), + 
yaml_content.get('keywords', ''), + yaml_content.get('content', ''), + ] + if value + ) metadata.append({ 'uuid': yaml_content['uuid'], 'url': yaml_content['url'], + 'resolved_url': yaml_content.get('resolved_url', yaml_content['url']), 'original_text': yaml_content['content'], 'title': yaml_content['title'], 'keywords': yaml_content['keywords'], - 'chunk_uuid': yaml_content['chunk_uuid'] + 'chunk_uuid': yaml_content['chunk_uuid'], + 'heading': yaml_content.get('heading', ''), + 'heading_path': heading_path, + 'doc_type': yaml_content.get('doc_type', ''), + 'product': yaml_content.get('product', ''), + 'version': yaml_content.get('version', ''), + 'content_type': yaml_content.get('content_type', ''), + 'search_text': search_text, }) # Create embeddings @@ -139,12 +170,12 @@ def main(): index, metadata = create_usearch_index(embeddings, metadata) # Save the USearch index - index_filename = 'usearch_index.bin' + index_filename = os.getenv('USEARCH_INDEX_FILENAME', 'usearch_index.bin') print(f"Saving USearch index to {index_filename}") index.save(index_filename) # Save metadata - metadata_filename = 'metadata.json' + metadata_filename = os.getenv('METADATA_FILENAME', 'metadata.json') print(f"Saving metadata to {metadata_filename}") with open(metadata_filename, 'w') as f: json.dump(metadata, f, indent=2) @@ -155,4 +186,4 @@ def main(): print(f"Metadata saved to: {os.path.abspath(metadata_filename)}") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/embedding-generation/requirements.txt b/embedding-generation/requirements.txt index fc8fd7b..f6846d7 100644 --- a/embedding-generation/requirements.txt +++ b/embedding-generation/requirements.txt @@ -3,4 +3,6 @@ beautifulsoup4 pyyaml usearch boto3 -sentence-transformers \ No newline at end of file +sentence-transformers +pypdf +rank-bm25 diff --git a/embedding-generation/tests/test_generate_chunks.py b/embedding-generation/tests/test_generate_chunks.py index 96ecfa1..86f8b35 100644 --- 
a/embedding-generation/tests/test_generate_chunks.py +++ b/embedding-generation/tests/test_generate_chunks.py @@ -73,18 +73,30 @@ def test_chunk_to_dict(self, gc): url="https://example.com", uuid="test-uuid", keywords=["key1", "key2"], - content="Test content" + content="Test content", + heading="Install", + heading_path=["Guide", "Install"], + doc_type="Tutorial", + product="Ampere", + version="2025", + resolved_url="https://example.com/resolved", + content_type="markdown", ) result = chunk.toDict() - assert result == { - 'title': "Test Title", - 'url': "https://example.com", - 'uuid': "test-uuid", - 'keywords': "key1, key2", - 'content': "Test content" - } + assert result["title"] == "Test Title" + assert result["url"] == "https://example.com" + assert result["uuid"] == "test-uuid" + assert result["keywords"] == "key1, key2" + assert result["content"] == "Test content" + assert result["heading"] == "Install" + assert result["heading_path"] == ["Guide", "Install"] + assert result["doc_type"] == "Tutorial" + assert result["product"] == "Ampere" + assert result["version"] == "2025" + assert result["resolved_url"] == "https://example.com/resolved" + assert result["content_type"] == "markdown" def test_chunk_empty_keywords(self, gc): """Test Chunk with empty keywords list.""" @@ -371,6 +383,43 @@ def test_respects_max_words(self, gc): # With headers, content should be split into multiple chunks assert len(chunks) >= 2 + def test_prepends_document_title_and_heading_path(self, gc): + """Structured chunks should carry the document title and heading path prefix.""" + content = """ +# Deployment Guide + +## Install +""" + "word " * 350 + + chunks = gc.obtainTextSnippets__Markdown(content, min_words=150, max_words=400) + + assert len(chunks) >= 1 + assert chunks[0].startswith("Document Title: Deployment Guide") + assert "Heading Path: Install" in chunks[0] + + def test_keeps_code_with_neighboring_explanation(self, gc): + """Code blocks should remain grouped with nearby 
explanatory text.""" + content = """ +# Example Guide + +## Build +First install dependencies and verify the environment is ready for compilation. + +```bash +make build +make test +``` + +Use the generated binary to verify the expected output and continue with setup. +""" + ("\n\nAdditional context. " * 120) + + chunks = gc.obtainTextSnippets__Markdown(content, min_words=100, max_words=250) + + matching = [chunk for chunk in chunks if "make build" in chunk] + assert matching + assert "First install dependencies" in matching[0] + assert "Use the generated binary" in matching[0] + class TestReadInCSV: """Tests for readInCSV function.""" @@ -390,6 +439,8 @@ def test_read_csv_basic(self, gc, tmp_path): assert csv_dict['urls'] == ['https://example.com/1', 'https://example.com/2'] assert csv_dict['source_names'] == ['Display1', 'Display2'] assert csv_dict['focus'] == ['key1', 'key2'] + assert csv_dict['site_names'] == ['Site1', 'Site2'] + assert csv_dict['license_types'] == ['MIT', 'Apache'] def test_read_csv_empty(self, gc, tmp_path): """Test reading an empty CSV (header only).""" diff --git a/embedding-generation/vector-db-sources.csv b/embedding-generation/vector-db-sources.csv index a219193..930c53d 100755 --- a/embedding-generation/vector-db-sources.csv +++ b/embedding-generation/vector-db-sources.csv @@ -1672,3 +1672,91 @@ Ecosystem Dashboard,Arm Proprietary,Ecosystem Dashboard - Zookeeper,https://www. 
Ecosystem Dashboard,Arm Proprietary,Ecosystem Dashboard - Zstandard,https://www.arm.com/developer-hub/ecosystem-dashboard/?package=zstandard,Zstandard; open-source; compression; database Ecosystem Dashboard,Arm Proprietary,Ecosystem Dashboard - Zulip,https://www.arm.com/developer-hub/ecosystem-dashboard/?package=zulip,Zulip; open-source; messaging__comms; cloud-native Ecosystem Dashboard,Arm Proprietary,Ecosystem Dashboard - Zulu OpenJDK (Azul Systems),https://www.arm.com/developer-hub/ecosystem-dashboard/?package=zulu-openjdk-azul-systems,Zulu OpenJDK (Azul Systems); open-source; runtimes; languages +Reference Architecture,,Deploying an ELK stack,https://amperecomputing.com/reference-architecture/deploying-an-elk-stack-on-google-tau-t2a, +Tuning Guide,,DPDK Cryptography Build and Tuning Guide,https://amperecomputing.com/tuning-guides/dpdk-cryptography-build-and-tuning-guide, +Tuning Guide,,DPDK Setup and Tuning Guide - Refresh,https://amperecomputing.com/tuning-guides/DPDK-setup-and-tuning-guide, +Tuning Guide,,Hadoop Tuning Guide,https://amperecomputing.com/tuning-guides/hadoop-tuning-guide-on-bare-metal, +Tuning Guide,,Kafka Tuning Guide,https://amperecomputing.com/tuning-guides/apache-kafka-tuning-guide, +Tuning Guide,,Locking primitives and memory ordering on Altra,https://amperecomputing.com/tuning-guides/locking-primitives-and-memory-ordering, +Tuning Guide,,Memcached Tuning Guide,https://amperecomputing.com/tuning-guides/memcached-tuning-guide, +Tuning Guide,,MongoDB Tuning Guide,https://amperecomputing.com/tuning-guides/mongoDB-tuning-guide, +Tuning Guide,,MySQL Tuning Guide,https://amperecomputing.com/tuning-guides/mysql-tuning-guide, +Tuning Guide,,NGINX Tuning Guide,https://amperecomputing.com/tuning-guides/nginx-tuning-guide, +Tuning Guide,,PostgreSQL Tuning Guide for Ampere Altra Processors on Oracle Cloud Infrastructure,https://amperecomputing.com/tuning-guides/postgreSQL-tuning-guide, +Tuning Guide,,Redis Tuning 
Guide,https://amperecomputing.com/tuning-guides/Redis-setup-and-tuning-guide, +Tuning Guide,,Tuning guide for video codecs,https://amperecomputing.com/tuning-guides/FFmpeg-Tuning-Guide, +Tuning Guide,,Unlocking Java Performance on Ampere® Altra® Family Processors,https://amperecomputing.com/tuning-guides/unlocking-java-performance-tuning-guide, +Tutorial,,Accelerating the Cloud Part 1: Going Cloud Native,https://amperecomputing.com/guides/accelerating-the-cloud/going-cloud-native, +Tutorial,,Accelerating the Cloud Part 2: The Investment to Go Cloud Native,https://amperecomputing.com/guides/accelerating-the-cloud/The-Investment-to-Go-Cloud-Native, +Tutorial,,Accelerating the Cloud Part 3: Redeployment Pre-Flight Checklist,https://amperecomputing.com/guides/accelerating-the-cloud/Transitioning-to-Cloud-Native-Pre-Flight-Checklist, +Tutorial,,Accelerating the Cloud Part 4: What to Expect When Going Cloud Native,https://amperecomputing.com/guides/accelerating-the-cloud/What-to-Expect-When-Going-Cloud-Native, +Tutorial,,Accelerating the Cloud Part 5: The Final Step,https://amperecomputing.com/guides/accelerating-the-cloud/the-final-steps, +Tutorial,,Ampere AI,https://amperecomputing.com/solutions/ampere-ai, +Tutorial,,Ampere AI Optimized Frameworks,https://uawartifacts.blob.core.windows.net/upload-files/Ampere_AI_Optimized_Frameworks_92851db62e.pdf?updated_at=2022-10-04T16:55:44.090Z, +Tutorial,,Ampere Porting Advisor Tutorial,https://amperecomputing.com/tutorials/porting-advisor, +Tutorial,,Arm Native,https://amperecomputing.com/solutions/arm-native, +Tutorial,,Big Data Solutions,https://amperecomputing.com/solutions/big-data, +Tutorial,,Ceph on Ampere Processors,https://uawartifacts.blob.core.windows.net/upload-files/Ampere_Arm_Processors_for_Ceph_WP_v1_00_20230222_1_fcd19200fb.pdf?updated_at=2023-03-13T18:10:32.078Z, +Tutorial,,Cloud Native Solutions,https://amperecomputing.com/solutions/cloud-native, +Tutorial,,Cryptography Library on Ampere 
Tutorial,https://amperecomputing.com/tutorials/cryptography, +Tutorial,,FP16 vs Fp32 Data Formats,https://uawartifacts.blob.core.windows.net/upload-files/Fp16_vs_Fp32_Data_Formats_b2bac45bf0.pdf?updated_at=2022-10-04T16:55:45.159Z, +Tutorial,,GCC Guide for Ampere Processors 2025 - updated,https://amperecomputing.com/tutorials/gcc-guide-ampere-processors, +Tutorial,,Getting Cloud-Native with FreeBSD in OCI with Ampere A1 and Terraform,https://amperecomputing.com/blogs/getting-cloud-native-with-freebsd-on-oci-ampere-a1-with-terraform-, +Tutorial,,Getting started on Azure Ampere VMs with Debian using Terraform,https://amperecomputing.com/tutorials/getting-started-on-azure-ampere-VMs-with-Debian-using-Terraform, +Tutorial,,Getting started on Azure Ampere VMs with Opensuse using Terraform,https://amperecomputing.com/tutorials/getting-started-on-azure-ampere-vms-with-opensuse-using-terraform, +Tutorial,,Improving the Performance of Atomic Instructions for Ampere,https://amperecomputing.com/tutorials/fixing-page-fault-performance-issue, +Tutorial,,Introducing Almalinux 9 in OCI using Ampere 1 and Terraform,https://amperecomputing.com/blogs/introducing-almalinux-9-on-oci-ampere-a1-with-terraform, +Tutorial,,Introducing OpenMandriva in OCI using Ampere A1 and Terraform,https://amperecomputing.com/blogs/introducing-openmandriva-on-oci-ampere-a1-with-terraform-, +Tutorial,,Memory Page Sizes,https://amperecomputing.com/tuning-guides/understanding-memory-page-sizes-on-arm64, +Tutorial,,On demand build infrastructure in OCI using Ampere A1 and Terraform,https://amperecomputing.com/blogs/on-demand-build-infrastructure-on-oci-ampere-a1-with-terraform, +Tutorial,,Optimizing the JVM for Ampere part 1,https://amperecomputing.com/tutorials/optimizing-java-applications-for-arm64-in-the-cloud, +Tutorial,,The First 10 Questions to Answer while running on Ampere Altra-based 
Instances,https://amperecomputing.com/tutorials/the-first-10-questions-to-answer-while-running-on-ampere-altra-based-instances, +Tutorial,,Web Services Reference Architecture,https://uawartifacts.blob.core.windows.net/upload-files/Web_Services_Efficiency_Reference_Architecture_v1_00_20230510_2d10554b8a.pdf?updated_at=2023-05-10T15:10:19.861Z, +Workload Brief,,AI Inference on Azure Dpsv5 instances,https://amperecomputing.com/briefs/ai-inference-on-azure-brief, +Workload Brief,,AmpereOne vBench on Bare Metal,https://amperecomputing.com/briefs/x264-on-ampereone-brief, +Workload Brief,,AmpereOne: DLRM (torchbench) on Bare Metal,https://amperecomputing.com/briefs/recommender-engine-ai-inference-on-ampereone, +Workload Brief,,AmpereOne: Llama-3 on Bare Metal,https://amperecomputing.com/briefs/llama-3-ai-inference-on-ampereone, +Workload Brief,,AmpereOne: Memcached on Bare Metal,https://amperecomputing.com/briefs/memcached-on-ampereone, +Workload Brief,,AmpereOne: MySQL on Bare Metal,https://amperecomputing.com/briefs/mysql-on-ampereone, +Workload Brief,,AmpereOne: NGINX on Bare Metal,https://amperecomputing.com/briefs/nginx-on-AC04-brief, +Workload Brief,,AmpereOne: PostgreSQL on Bare Metal,https://amperecomputing.com/briefs/postgresql-on-ampereone, +Workload Brief,,AmpereOne: Redis on Bare Metal,https://amperecomputing.com/briefs/redis-on-AC04-brief, +Workload Brief,,Canonical Anbox Cloud Brief,https://amperecomputing.com/briefs/anbox_solution_brief, +Workload Brief,,Cassandra on Azure,https://amperecomputing.com/briefs/cassandra-on-azure-brief, +Workload Brief,,Cassandra on Bare Metal,https://amperecomputing.com/briefs/cassandra-workload-brief, +Workload Brief,,Cassandra on Google Cloud,https://amperecomputing.com/briefs/cassandra-on-google-cloud-brief, +Workload Brief,,DSB Social Network Brief on Bare Metal,https://amperecomputing.com/briefs/dsb-sn-brief, +Workload Brief,,DSB Social Network on OCI 
Brief,https://amperecomputing.com/briefs/dsb-social-network-scale-out-brief, +Workload Brief,,ElasticSearch on Azure Workload Brief,https://amperecomputing.com/briefs/elasticsearch-on-azure-brief, +Workload Brief,,ElasticSearch on OCI Workload Brief,https://amperecomputing.com/briefs/elasticsearch-oci-brief, +Workload Brief,,Hadoop Brief,https://amperecomputing.com/briefs/hadoop-workload-brief, +Workload Brief,,Hadoop on OCI Workload Brief,https://amperecomputing.com/briefs/hadoop-on-oci-brief, +Workload Brief,,Kafka Workload Brief,https://amperecomputing.com/briefs/apache-kafka-solution-brief, +Workload Brief,,Kafka on Azure Brief,https://amperecomputing.com/briefs/kafka-on-azure-brief, +Workload Brief,,Memcached on Azure,https://amperecomputing.com/briefs/memcached-on-azure-brief, +Workload Brief,,Memcached on Bare Metal,https://amperecomputing.com/briefs/memcached-workload-brief, +Workload Brief,,MongoDB Workload Brief on Bare Metal,https://amperecomputing.com/briefs/mongodb-brief, +Workload Brief,,MySQL on Bare Metal Workload Brief,https://amperecomputing.com/briefs/mysqlserver_workload_brief, +Workload Brief,,NGINX on Azure Workload Brief - Updated replacement,https://amperecomputing.com/briefs/nginx-on-azure-brief, +Workload Brief,,NGINX on Bare Metal Workload Brief,https://amperecomputing.com/briefs/nginx-workload-brief, +Workload Brief,,NGINX on Google Cloud Workload Brief,https://amperecomputing.com/briefs/nginx-on-google-cloud-brief, +Workload Brief,,Object Storage MinIO Single Node,https://www.amperecomputing.com/briefs/minio-on-single-node-brief, +Workload Brief,,Redis on Azure Workload Brief - updated replacement,https://amperecomputing.com/briefs/redis-on-azure-brief, +Workload Brief,,Redis on Bare Metal Workload Brief,https://amperecomputing.com/briefs/redis-workload-brief, +Workload Brief,,Redis on Google Cloud Workload Brief,https://amperecomputing.com/briefs/redis-on-google-brief, +Workload Brief,,Spark on OCI Workload 
Brief,https://amperecomputing.com/briefs/spark-on-OCI-brief, +Workload Brief,,Spark on Google Cloud Brief,https://amperecomputing.com/briefs/spark-on-google-brief, +Workload Brief,,Spark on Azure Brief,https://amperecomputing.com/briefs/spark-on-azure-brief, +Workload Brief,,Spark Workload Brief,https://amperecomputing.com/briefs/spark-workload-brief, +Workload Brief,,VP9 Video Codec on Google Cloud Workload Brief,https://amperecomputing.com/briefs/vp9-on-google-brief, +Workload Brief,,x264 on Azure Workload Brief,https://amperecomputing.com/briefs/x264-on-azure-brief, +Workload Brief,,x264 on Bare Metal Workload Brief,https://amperecomputing.com/briefs/x264_workload_brief, +Workload Brief,,x264 on Google Cloud Workload Brief,https://amperecomputing.com/briefs/x264-on-google-cloud-brief, +Workload Brief,,x265 on Azure Workload Brief,https://amperecomputing.com/briefs/x265-on-azure-brief, +Workload Brief,,x265 on Bare Metal Workload Brief,https://amperecomputing.com/briefs/x265-workload-brief, +Workload Brief,,x265 on Google Cloud Workload Brief,https://amperecomputing.com/briefs/x265-on-google-cloud-brief, +Learning Paths,CC4.0,Learning Path - Monitor Azure Cobalt 100 Arm64 virtual machines using Dynatrace OneAgent,https://learn.arm.com/learning-paths/servers-and-cloud-computing/dynatrace-azure/,Containers and Virtualization; Microsoft Azure; Linux; Dynatrace; NGINX; ActiveGate +Learning Paths,CC4.0,Learning Path - Build Robot Simulation and Reinforcement Learning Workflows with Isaac Sim and Isaac Lab on DGX Spark,https://learn.arm.com/learning-paths/laptops-and-desktops/dgx_spark_isaac_robotics/,ML; Linux; Python; Bash; IsaacSim; IsaacLab +Learning Paths,CC4.0,Learning Path - Build a customer support chatbot on Android with Llama and ExecuTorch,https://learn.arm.com/learning-paths/mobile-graphics-and-gaming/customer-support-chatbot-with-llama-and-executorch-on-arm-based-mobile-devices/,ML; macOS; Linux; Android; Java; Python; ExecuTorch +Learning 
Paths,CC4.0,Learning Path - Run image classification on an Alif Ensemble E8 DevKit using ExecuTorch and Ethos-U85,https://learn.arm.com/learning-paths/embedded-and-microcontrollers/alif-image-classification/,ML; Baremetal; ExecuTorch; PyTorch; GCC; CMSIS-Toolbox; Python +Learning Paths,CC4.0,Learning Path - Deploy ExecuTorch firmware on NXP FRDM i.MX 93 for Ethos-U65 acceleration,https://learn.arm.com/learning-paths/embedded-and-microcontrollers/observing-ethos-u-on-nxp/,ML; Linux; macOS; Baremetal; Python; PyTorch; ExecuTorch; Arm Compute Library; GCC diff --git a/mcp-local/Dockerfile b/mcp-local/Dockerfile index d371376..811f982 100644 --- a/mcp-local/Dockerfile +++ b/mcp-local/Dockerfile @@ -15,16 +15,21 @@ # syntax=docker/dockerfile:1.7 ARG EMBEDDINGS_IMAGE=armlimited/arm-mcp:embeddings-latest +ARG EMBEDDING_MODEL=all-MiniLM-L6-v2 # EMBEDDINGS_IMAGE must point to an embeddings image tag (e.g., armlimited/arm-mcp:embeddings-YYYY-MM-DD). FROM --platform=linux/arm64 ${EMBEDDINGS_IMAGE} AS embeddings # Stage 1: Build main application with prebuilt vector database FROM ubuntu:24.04 AS builder +ARG EMBEDDING_MODEL=all-MiniLM-L6-v2 ENV DEBIAN_FRONTEND=noninteractive \ PYTHONUNBUFFERED=1 \ PIP_NO_CACHE_DIR=1 \ - WORKSPACE_DIR=/workspace + WORKSPACE_DIR=/workspace \ + HF_HOME=/app/.cache/huggingface \ + SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence_transformers \ + SENTENCE_TRANSFORMER_MODEL=${EMBEDDING_MODEL} RUN apt-get update && apt-get install -y --no-install-recommends \ python3 python3-venv python3-pip \ @@ -65,6 +70,9 @@ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ pip install --no-cache-dir -r requirements.txt; \ fi +RUN mkdir -p "$HF_HOME" "$SENTENCE_TRANSFORMERS_HOME" && \ + python -c "from sentence_transformers import SentenceTransformer; import os; SentenceTransformer(os.environ['SENTENCE_TRANSFORMER_MODEL'], cache_folder=os.environ['SENTENCE_TRANSFORMERS_HOME'])" + # Copy generated vector database files RUN mkdir -p ./data COPY --from=embeddings 
/embedding-data/metadata.json ./data/metadata.json @@ -82,6 +90,8 @@ ENV DEBIAN_FRONTEND=noninteractive \ PYTHONUNBUFFERED=1 \ PIP_NO_CACHE_DIR=1 \ WORKSPACE_DIR=/workspace \ + HF_HOME=/app/.cache/huggingface \ + SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence_transformers \ VIRTUAL_ENV=/app/.venv \ PATH=/app/.venv/bin:$PATH diff --git a/mcp-local/requirements.txt b/mcp-local/requirements.txt index bcad0c8..8023a60 100644 --- a/mcp-local/requirements.txt +++ b/mcp-local/requirements.txt @@ -5,4 +5,5 @@ boto3 requests mcp sentence-transformers -fastmcp \ No newline at end of file +fastmcp +rank-bm25 diff --git a/mcp-local/server.py b/mcp-local/server.py index be1d1b1..095d118 100644 --- a/mcp-local/server.py +++ b/mcp-local/server.py @@ -14,9 +14,10 @@ from fastmcp import FastMCP from typing import List, Dict, Any, Optional +import os from sentence_transformers import SentenceTransformer from utils.config import METADATA_PATH, USEARCH_INDEX_PATH, MODEL_NAME, SUPPORTED_SCANNERS, DEFAULT_ARCH -from utils.search_utils import load_metadata, load_usearch_index, embedding_search, deduplicate_urls +from utils.search_utils import build_bm25_index, deduplicate_urls, hybrid_search, load_metadata, load_usearch_index from utils.docker_utils import check_docker_image_architectures from utils.migrate_ease_utils import run_migrate_ease_scan from utils.skopeo_tool import skopeo_help, skopeo_inspect @@ -27,10 +28,35 @@ # Initialize the MCP server mcp = FastMCP("arm-mcp") + +def sentence_transformer_cache_folder() -> str | None: + return os.getenv("SENTENCE_TRANSFORMERS_HOME") or None + + +def load_embedding_model() -> SentenceTransformer: + try: + return SentenceTransformer( + MODEL_NAME, + cache_folder=sentence_transformer_cache_folder(), + local_files_only=True, + ) + except Exception as exc: + print(f"Local cache miss for embedding model '{MODEL_NAME}', retrying with network access: {exc}") + return SentenceTransformer( + MODEL_NAME, + 
cache_folder=sentence_transformer_cache_folder(), + local_files_only=False, + ) + + # Load USearch index and metadata at module load time METADATA = load_metadata(METADATA_PATH) -USEARCH_INDEX = load_usearch_index(USEARCH_INDEX_PATH, METADATA) -EMBEDDING_MODEL = SentenceTransformer(MODEL_NAME) +EMBEDDING_MODEL = load_embedding_model() +USEARCH_INDEX = load_usearch_index( + USEARCH_INDEX_PATH, + EMBEDDING_MODEL.get_sentence_embedding_dimension(), +) +BM25_INDEX = build_bm25_index(METADATA) # error formatter now lives in utils/error_handling.py @@ -56,15 +82,19 @@ def knowledge_base_search(query: str, invocation_reason: Optional[str] = None) - List of dictionaries with metadata including url and text snippets. """ try: - embedding_results = embedding_search(query, USEARCH_INDEX, METADATA, EMBEDDING_MODEL) - deduped = deduplicate_urls(embedding_results) + search_results = hybrid_search(query, USEARCH_INDEX, METADATA, EMBEDDING_MODEL, BM25_INDEX) + deduped = deduplicate_urls(search_results) # Only return the relevant fields formatted = [ { "url": item["metadata"].get("url"), "snippet": item["metadata"].get("original_text", item["metadata"].get("content", "")), "title": item["metadata"].get("title", ""), - "distance": item.get("distance") + "heading": item["metadata"].get("heading", ""), + "doc_type": item["metadata"].get("doc_type", ""), + "product": item["metadata"].get("product", ""), + "distance": item.get("distance"), + "score": item.get("rerank_score", item.get("rrf_score")), } for item in deduped ] diff --git a/mcp-local/utils/search_utils.py b/mcp-local/utils/search_utils.py index bac7fd6..71acc25 100644 --- a/mcp-local/utils/search_utils.py +++ b/mcp-local/utils/search_utils.py @@ -12,37 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Dict, Any -from usearch.index import Index +from typing import Any, Dict, List, Optional import json +import os +import re + import numpy as np +from rank_bm25 import BM25Okapi from sentence_transformers import SentenceTransformer -from .config import USEARCH_INDEX_PATH, METADATA_PATH, MODEL_NAME, DISTANCE_THRESHOLD, K_RESULTS -import os +from usearch.index import Index + +from .config import DISTANCE_THRESHOLD, K_RESULTS + + +SEARCH_TOKEN_PATTERN = re.compile(r"[a-z0-9][a-z0-9_\-+.]*", re.IGNORECASE) +RRF_K = 60 +SEARCH_STOPWORDS = { + "a", "an", "and", "are", "be", "better", "can", "configured", "configuration", "for", + "called", "how", "i", "improve", "in", "is", "it", "of", "on", "or", "out", "performance", "processor", + "processors", "recommended", "settings", "should", "step", "steps", "system", "systems", + "the", "to", "use", "what", "which", "with", "ampere", "arm", "benchmark", "benchmarking", + "benchmarked", "benchmarks", "brief", "cloud", "config", "configure", "guide", "options", + "reference", "setup", "tutorial", "tune", + "tuned", "tuning", +} +TUNING_INTENT_TOKENS = { + "benchmark", "benchmarking", "benchmarked", "benchmarks", "config", "configure", + "configured", "configuration", "latency", "oltp", "optimize", "optimized", "performance", + "throughput", "tune", "tuned", "tuning", +} +REFERENCE_ARCHITECTURE_INTENT_TOKENS = { + "architecture", "deploy", "deployment", "reference", "steps", +} +TUTORIAL_INTENT_TOKENS = { + "how", "install", "migration", "migrate", "port", "porting", "setup", "tutorial", +} + +def tokenize_for_search(text: str) -> List[str]: + return [token.lower() for token in SEARCH_TOKEN_PATTERN.findall(text or "")] -def load_usearch_index(index_path: str, metadata: List[Dict]) -> Index: + +def salient_tokens(text: str) -> List[str]: + return [token for token in tokenize_for_search(text) if token not in SEARCH_STOPWORDS] + + +def load_usearch_index(index_path: 
str, dimension: int) -> Optional[Index]: """Load USearch index from file.""" if not os.path.exists(index_path): print(f"Error: USearch index file '{index_path}' does not exist.") return None - if not metadata: - print("Error: Knowledge base metadata is missing or invalid.") + if dimension <= 0: + print("Error: Invalid embedding dimension.") return None - # Get dimension from the first metadata entry's vector - dimension = len(metadata[0]['vector']) - - # Create index with same parameters as used during creation index = Index( ndim=dimension, - metric='l2sq', # L2 squared distance - dtype='f32', + metric="l2sq", + dtype="f32", connectivity=16, expansion_add=128, - expansion_search=64 + expansion_search=64, ) - - # Load the saved index index.load(index_path) return index @@ -52,72 +82,190 @@ def load_metadata(metadata_path: str) -> List[Dict]: if not os.path.exists(metadata_path): print(f"Error: Metadata file '{metadata_path}' does not exist.") return [] - with open(metadata_path, 'r') as f: - metadata = json.load(f) - return metadata + with open(metadata_path, "r") as file: + return json.load(file) + + +def build_bm25_index(metadata: List[Dict]) -> Optional[BM25Okapi]: + corpus = [tokenize_for_search(item.get("search_text", "")) for item in metadata] + if not any(corpus): + return None + return BM25Okapi(corpus) def embedding_search( - query: str, - usearch_index: Index, - metadata: List[Dict], + query: str, + usearch_index: Optional[Index], + metadata: List[Dict], embedding_model: SentenceTransformer, - k: int = K_RESULTS + k: int = K_RESULTS, ) -> List[Dict[str, Any]]: """Search the USearch index with a text query.""" - # Create query embedding + if usearch_index is None: + return [] query_embedding = embedding_model.encode([query])[0] - - # Search in USearch index matches = usearch_index.search(query_embedding, k) - results = [] - # Robust handling of USearch Matches object, as in test_vectorstore.py - if matches is not None: - try: - # USearch Matches object can 
be accessed with .keys and .distances properties - if hasattr(matches, 'keys') and hasattr(matches, 'distances'): - labels = matches.keys - distances = matches.distances - # Alternative attribute names - elif hasattr(matches, 'labels') and hasattr(matches, 'distances'): - labels = matches.labels - distances = matches.distances - # Try converting to numpy arrays - else: - labels = np.array(matches.keys) if hasattr(matches, 'keys') else None - distances = np.array(matches.distances) if hasattr(matches, 'distances') else None - # If tuple (labels, distances) - if labels is None or distances is None: - if isinstance(matches, tuple) and len(matches) == 2: - labels, distances = matches - elif isinstance(matches, dict): - labels = matches.get('labels', matches.get('indices')) - distances = matches.get('distances') - if labels is not None and distances is not None: - labels = np.atleast_1d(labels) - distances = np.atleast_1d(distances) - for i, (idx, dist) in enumerate(zip(labels, distances)): - if idx != -1 and float(dist) < DISTANCE_THRESHOLD: - result = { - "rank": i + 1, - "distance": float(dist), - "metadata": metadata[int(idx)] - } - results.append(result) - except Exception as e: - print(f"Error processing matches: {e}") - import traceback - traceback.print_exc() + results: List[Dict[str, Any]] = [] + if matches is None: + return results + + try: + labels = getattr(matches, "keys", None) + distances = getattr(matches, "distances", None) + if labels is None or distances is None: + if isinstance(matches, tuple) and len(matches) == 2: + labels, distances = matches + elif isinstance(matches, dict): + labels = matches.get("labels", matches.get("indices")) + distances = matches.get("distances") + if labels is None or distances is None: + return results + + labels = np.atleast_1d(labels) + distances = np.atleast_1d(distances) + for rank, (idx, dist) in enumerate(zip(labels, distances), start=1): + if idx == -1: + continue + distance = float(dist) + if distance < 
DISTANCE_THRESHOLD: + results.append( + { + "rank": rank, + "distance": distance, + "metadata": metadata[int(idx)], + } + ) + except Exception as exc: + print(f"Error processing dense matches: {exc}") return results -def deduplicate_urls(embedding_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Deduplicate metadata based on the 'url' field.""" - seen_urls = set() +def bm25_search( + query: str, + metadata: List[Dict], + bm25_index: Optional[BM25Okapi], + k: int = K_RESULTS, +) -> List[Dict[str, Any]]: + if bm25_index is None: + return [] + tokens = tokenize_for_search(query) + if not tokens: + return [] + scores = bm25_index.get_scores(tokens) + ranking = np.argsort(scores)[::-1] + results: List[Dict[str, Any]] = [] + for rank, idx in enumerate(ranking[:k], start=1): + score = float(scores[idx]) + if score <= 0: + continue + results.append( + { + "rank": rank, + "bm25_score": score, + "metadata": metadata[int(idx)], + } + ) + return results + + +def rerank_candidates(query: str, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + query_tokens = set(tokenize_for_search(query)) + if not query_tokens: + return candidates + salient_query_tokens = set(salient_tokens(query)) + prefers_tuning_guide = bool(query_tokens & TUNING_INTENT_TOKENS) + prefers_reference_architecture = bool(query_tokens & REFERENCE_ARCHITECTURE_INTENT_TOKENS) + prefers_tutorial = bool(query_tokens & TUTORIAL_INTENT_TOKENS) + + reranked: List[Dict[str, Any]] = [] + for candidate in candidates: + metadata = candidate["metadata"] + full_text_tokens = set(tokenize_for_search(metadata.get("search_text", ""))) + title_tokens = set(tokenize_for_search(metadata.get("title", ""))) + heading_tokens = set(tokenize_for_search(" ".join(metadata.get("heading_path", [])))) + url_tokens = set(tokenize_for_search(metadata.get("url", ""))) + doc_type = (metadata.get("doc_type", "") or "").strip().lower() + overlap = len(query_tokens & full_text_tokens) / len(query_tokens) + title_overlap = 
len(query_tokens & title_tokens) / len(query_tokens) + heading_overlap = len(query_tokens & heading_tokens) / len(query_tokens) + entity_overlap = 0.0 + if salient_query_tokens: + entity_space = title_tokens | heading_tokens | url_tokens + entity_overlap = len(salient_query_tokens & entity_space) / len(salient_query_tokens) + exact_entity_bonus = 0.0 + if salient_query_tokens and (salient_query_tokens & (title_tokens | url_tokens)): + exact_entity_bonus = 0.18 + dense_bonus = 0.0 + if candidate.get("distance") is not None: + dense_bonus = max(0.0, (DISTANCE_THRESHOLD - candidate["distance"]) / DISTANCE_THRESHOLD) + sparse_bonus = min(1.0, candidate.get("bm25_score", 0.0) / 10.0) + doc_type_bonus = 0.0 + if prefers_tuning_guide: + if doc_type == "tuning guide": + doc_type_bonus += 0.30 + elif "brief" in doc_type: + doc_type_bonus -= 0.12 + if prefers_reference_architecture: + if doc_type == "reference architecture": + doc_type_bonus += 0.25 + elif "brief" in doc_type: + doc_type_bonus -= 0.05 + if prefers_tutorial: + if doc_type in {"tutorial", "install guide", "learning path"}: + doc_type_bonus += 0.10 + rerank_score = ( + candidate.get("rrf_score", 0.0) + + (0.35 * overlap) + + (0.20 * title_overlap) + + (0.15 * heading_overlap) + + (0.20 * entity_overlap) + + (0.15 * dense_bonus) + + (0.15 * sparse_bonus) + + exact_entity_bonus + + doc_type_bonus + ) + reranked.append({**candidate, "rerank_score": rerank_score}) + return sorted(reranked, key=lambda item: item["rerank_score"], reverse=True) + + +def hybrid_search( + query: str, + usearch_index: Optional[Index], + metadata: List[Dict], + embedding_model: SentenceTransformer, + bm25_index: Optional[BM25Okapi], + k: int = K_RESULTS, +) -> List[Dict[str, Any]]: + candidate_depth = max(k * 20, 100) + dense_results = embedding_search(query, usearch_index, metadata, embedding_model, candidate_depth) + sparse_results = bm25_search(query, metadata, bm25_index, candidate_depth) + + candidates: Dict[str, Dict[str, Any]] = {} 
+ for result in dense_results: + chunk_uuid = result["metadata"].get("chunk_uuid") or result["metadata"].get("uuid") + candidates[chunk_uuid] = {**result, "rrf_score": 1 / (RRF_K + result["rank"])} + + for result in sparse_results: + chunk_uuid = result["metadata"].get("chunk_uuid") or result["metadata"].get("uuid") + existing = candidates.get(chunk_uuid, {"metadata": result["metadata"], "rrf_score": 0.0}) + existing["rank"] = min(existing.get("rank", result["rank"]), result["rank"]) + existing["bm25_score"] = result["bm25_score"] + existing["rrf_score"] += 1 / (RRF_K + result["rank"]) + candidates[chunk_uuid] = existing + + combined = rerank_candidates(query, list(candidates.values())) + return combined[:candidate_depth] + + +def deduplicate_urls(results: List[Dict[str, Any]], max_chunks_per_url: int = 1) -> List[Dict[str, Any]]: + """Keep the highest-ranked chunk for each URL by default.""" + seen_counts: Dict[str, int] = {} deduplicated_results = [] - for item in embedding_results: + for item in results: url = item["metadata"].get("url") - if url and url not in seen_urls: - seen_urls.add(url) + if not url: + continue + seen_counts[url] = seen_counts.get(url, 0) + 1 + if seen_counts[url] <= max_chunks_per_url: deduplicated_results.append(item) - return deduplicated_results \ No newline at end of file + return deduplicated_results From 8fbcd9d45694ebf76f424314126dc89e57493c88 Mon Sep 17 00:00:00 2001 From: Joe Stech <4088382+JoeStech@users.noreply.github.com> Date: Fri, 20 Mar 2026 16:24:15 -0600 Subject: [PATCH 2/3] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mcp-local/utils/search_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcp-local/utils/search_utils.py b/mcp-local/utils/search_utils.py index 71acc25..90efa8f 100644 --- a/mcp-local/utils/search_utils.py +++ b/mcp-local/utils/search_utils.py @@ -211,7 +211,7 @@ def rerank_candidates(query: str, candidates: List[Dict[str, 
Any]]) -> List[Dict elif "brief" in doc_type: doc_type_bonus -= 0.05 if prefers_tutorial: - if doc_type in {"tutorial", "install guide", "learning path"}: + if doc_type in {"tutorial", "install guide", "learning path", "learning paths"}: doc_type_bonus += 0.10 rerank_score = ( candidate.get("rrf_score", 0.0) From be402dba7cb2ca60a31d1bdedb42d0fe3a30c9b5 Mon Sep 17 00:00:00 2001 From: Joe Stech <4088382+JoeStech@users.noreply.github.com> Date: Fri, 20 Mar 2026 16:24:58 -0600 Subject: [PATCH 3/3] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mcp-local/utils/search_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcp-local/utils/search_utils.py b/mcp-local/utils/search_utils.py index 90efa8f..ba04c1b 100644 --- a/mcp-local/utils/search_utils.py +++ b/mcp-local/utils/search_utils.py @@ -254,7 +254,7 @@ def hybrid_search( candidates[chunk_uuid] = existing combined = rerank_candidates(query, list(candidates.values())) - return combined[:candidate_depth] + return combined[:k] def deduplicate_urls(results: List[Dict[str, Any]], max_chunks_per_url: int = 1) -> List[Dict[str, Any]]: