In this demo, we will go through the process of building a RAG system using LangChain and HuggingFace.
- 🦜 LangChain: https://www.langchain.com/
- 🤗 HuggingFace: https://huggingface.co/
The vector database requires Python <= 3.12
conda create -n ragnificent python=3.12
conda activate ragnificent
pip install torch==2.8.0 torchvision==0.23.0
pip install -r requirements.txt

First, we need documents to work with. Let's explore different ways to load them.
from langchain_core.documents import Document

# Build one hand-crafted Document and collect it in a list, mirroring the
# shape that the real loaders below produce.
docs = []
docs.append(
    Document(
        page_content="Hello, world!",
        metadata={
            "source": "My custom document",
            "title": "Title of my custom document",
        },
    )
)

separator = "-" * 50
print(separator)
print("Metadata:")
for key, value in docs[0].metadata.items():
    print(f"{key}: {value}")
print(separator)
print("Page content (preview):")
content = docs[0].page_content
print(content if len(content) < 500 else content[:500] + "...")
from langchain_community.document_loaders import PyPDFLoader
pdf_path = "context/Weller et al. - 2025 - On the Theoretical Limitations of Embedding-Based Retrieval.pdf"
# mode="single" loads the entire PDF as one Document instead of one per page.
docs = PyPDFLoader(file_path=pdf_path, mode="single").load()
print(f"Loaded PDF as {len(docs):d} document(s)")
separator = "-" * 50
print(separator)
print("Metadata:")
for key, value in docs[0].metadata.items():
    print(f"{key}: {value}")
print(separator)
print("Page content (preview):")
print(docs[0].page_content[:500] + "...")
from langchain_community.document_loaders import WikipediaLoader
# Fetch up to 10 Wikipedia pages matching a natural-language query.
loader = WikipediaLoader(
    query="What is the capital of France?",
    load_max_docs=10,
)
docs = loader.load()
print(f"Loaded {len(docs):d} document(s)")
print("-" * 50)
for doc in docs:
    print(doc.metadata)
# For demonstration purposes, the WebBaseLoader can provide timely context
# that is not included in any LLM's training data. For instance, the NFL
# scores and highlights from last week:
# https://www.cbssports.com/nfl/news/nfl-week-3-grades-scores-results-highlights-browns-packers-vikings-bengals/
Note: Different websites require different parsing strategies.
import os
import re

import bs4

# Fix: set USER_AGENT BEFORE importing WebBaseLoader — langchain_community
# reads this environment variable when the loader module is first imported,
# so setting it afterwards (as the original did) has no effect and emits a
# "USER_AGENT environment variable not set" warning.
os.environ["USER_AGENT"] = "Demo"

from langchain_community.document_loaders import WebBaseLoader

# Only parse <article> tags so navigation, ads, and boilerplate are skipped.
loader = WebBaseLoader(
    web_paths=("https://www.nfl.com/news/2025-nfl-week-3-takeaways-what-we-learned-from-sunday-s-14-games/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer("article")
    ),
)
docs = loader.load()
# Clean and condense page content: strip, remove blank lines, trim lines, and join into a single line
docs[0].page_content = " ".join(
    line.strip() for line in docs[0].page_content.strip().splitlines() if line.strip()
)
# Replace sequences of two or more spaces with a single space
docs[0].page_content = re.sub(r'\s{2,}', ' ', docs[0].page_content)
print(f"Loaded Website as {len(docs):d} document(s)")
print("-" * 50)
print("Metadata:")
for k, v in docs[0].metadata.items():
    print(f"{k}: {v}")
print("-" * 50)
print("Page content (preview):")
print(docs[0].page_content if len(docs[0].page_content) < 10000 else docs[0].page_content[:10000] + "...")
# In RAG systems, it is common to split large documents into smaller chunks
# for effective retrieval.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split document into chunks for vector storage. add_start_index records each
# chunk's character offset within its parent document in the metadata.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
)
doc_chunks = splitter.split_documents(docs)
print(f"Document split into {len(doc_chunks):d} chunks.")
# Show first few chunks for verification
for number, chunk in enumerate(doc_chunks[:3], start=1):
    print(f"\n\nChunk {number}")
    print("=" * 50)
    print("Metadata:")
    for key, value in chunk.metadata.items():
        print(f"{key}: {value}")
    print("-" * 50)
    print("Page content:")
    text = chunk.page_content
    print(text if len(text) < 500 else text[:500] + "...")
# Now we'll embed our text chunks into vectors using a pre-trained embedding
# model.
There are many pre-trained embedding models available on HuggingFace, here are some examples:
- intfloat/multilingual-e5-base
- Qwen/Qwen2.5-0.5B
- Qwen/Qwen3-Embedding-0.6B
- Qwen/Qwen3-Embedding-4B
- sentence-transformers/all-MiniLM-L6-v2
Choose an embedding model and set it in the following code block:
from langchain_huggingface import HuggingFaceEmbeddings

# Pick one of the HuggingFace embedding models listed above.
embedding_model_name = "<embedding_model_name>"
# normalize_embeddings=True makes cosine similarity equivalent to dot product.
embedding_function = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embedding=embedding_function)
# Add documents to vector store
document_chunk_ids = vector_store.add_documents(documents=doc_chunks)
print(f"Added {len(document_chunk_ids):d} documents to the vector store")
# Inspect in-memory vector store
from itertools import islice

n_chunks = 5
# vector_store.store maps chunk id -> entry dict with "vector", "metadata",
# and "text" keys (as accessed below). Fixes vs the original: the loop
# variable `chunk_id` no longer shadows the builtin `id`, and islice bounds
# the iteration directly instead of an `if index < n_chunks ... else: break`
# counting pattern.
for number, (chunk_id, entry) in enumerate(islice(vector_store.store.items(), n_chunks), start=1):
    print(f"Chunk {number}")
    print("-" * 50)
    print(f"id: {chunk_id}")
    print(f"vector (length: {len(entry['vector'])}): {entry['vector']}")
    print(f"metadata: {entry['metadata']}")
    text = entry["text"]
    print(f"text:\n{text if len(text) < 100 else text[:100] + '...'}\n\n")
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

vector_store_path = "vector_store"
# Probe the embedding model once to discover the vector dimension.
embedding_dim = len(embedding_function.embed_query("test"))
# IndexFlatL2 is an exact (non-approximate) L2-distance index.
vector_store = FAISS(
    embedding_function=embedding_function,
    index=faiss.IndexFlatL2(embedding_dim),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
# Add documents to vector store
document_chunk_ids = vector_store.add_documents(documents=doc_chunks)
print(f"Added {len(document_chunk_ids):d} documents to the vector store")
# Round-trip through disk to demonstrate persistence. load_local deserializes
# a pickle, hence the explicit allow_dangerous_deserialization opt-in — only
# safe because we just wrote this file ourselves.
vector_store.save_local(vector_store_path)
vector_store = FAISS.load_local(
    vector_store_path,
    embeddings=embedding_function,
    allow_dangerous_deserialization=True,
)
# Inspect FAISS vector store
n_chunks = 5
print(f"Total documents in FAISS vector store: {vector_store.index.ntotal}")
print(f"Vector dimension: {vector_store.index.d}")
print("-" * 50)
# Get document IDs from the index_to_docstore_id mapping. The [:n_chunks]
# slice already bounds the loop, so the original's trailing
# `if index >= n_chunks - 1: break` was dead code and is removed. The loop
# variable is renamed so it does not shadow the earlier `index` name.
docstore_ids = list(vector_store.index_to_docstore_id.values())[:n_chunks]
for number, doc_id in enumerate(docstore_ids, start=1):
    doc = vector_store.docstore.search(doc_id)
    print(f"Chunk {number}")
    print("-" * 50)
    print(f"Document ID: {doc_id}")
    print(f"Metadata: {doc.metadata}")
    text = doc.page_content
    print(f"Text:\n{text if len(text) < 100 else text[:100] + '...'}\n\n")
# Let's test our vector store with a sample query. This is a useful test to
# verify that the embedding model is working correctly and to compare the
# quality of the different embedding models.
# Test vector store with a sample query
query = "Who was the Vikings' starting quarterback in week 3?"
top_k = 5
similar_document_chunks = vector_store.similarity_search_with_score(query, k=top_k)
print(f"List of {len(similar_document_chunks):d} most similar document chunks for query: '{query:s}'")
# similarity_search_with_score(k=top_k) returns at most top_k results, so the
# original's `if i < top_k ... else: break` guard was dead code and is removed.
# NOTE(review): with an IndexFlatL2-backed FAISS store this "score" is an L2
# distance (lower = more similar), not a similarity — confirm against the
# LangChain FAISS docs before interpreting magnitudes.
for rank, (doc, score) in enumerate(similar_document_chunks, start=1):
    print("\n" + "-" * 50)
    print(f"Result {rank} (Similarity Score: {score:.4f})")
    print(f"\tid: {doc.id}")
    print(f"\tmetadata: {doc.metadata}")
    text = doc.page_content
    print(f"\tpage content: {text if len(text) < 300 else text[:300] + '...'}")
# Now we need an LLM to generate answers based on retrieved context.
This ChatPromptTemplate has two placeholders. The {context}, which will be replaced with the retrieved document chunks from the vector store, and the {input}, which will be replaced with the user's question.
from langchain_core.prompts import ChatPromptTemplate

# Create prompt template for RAG system. {context} is filled with the
# retrieved document chunks and {input} with the user's question. The message
# bodies are runtime strings and are kept verbatim.
system_message = """
You are an AI assistant that answers questions based on provided context documents.
"""
human_message = """
Answer the question based on the context.
CRITICAL RULES:
- Answer concisely
- Use information from the provided context
- If the context doesn't contain enough information, state this clearly
- Cite specific details from the context when possible
CONTEXT:
{context}
QUESTION: {input}
ANSWER:
"""
chat_prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("human", human_message)]
)
# Again, there are many pre-trained language models available on HuggingFace;
# see the examples listed below.
- google/gemma-3-1b-it
- google/gemma-3-4b-it
- meta-llama/Llama-2-7b
- meta-llama/Llama-3.2-1B
- openai-community/gpt2
- Qwen/Qwen2.5-0.5B
- Qwen/Qwen2.5-3B
For the purpose of this demo, it can be interesting to try different models, even some lower-performance ones, to see the impact of the context provided through the RAG system.
Choose a language model and set it in the following code block:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Pick one of the HuggingFace causal language models listed above.
model_name = "<model_name>"
print(f"Loading llm <{model_name:s}>")
# Load tokenizer and model weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Create text generation pipeline; return_full_text=False strips the prompt
# from the generated output so only the answer is returned.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=250,       # Limit answer length for concise RAG responses
    temperature=0.2,          # Low randomness, mostly deterministic
    top_p=0.95,               # Sample from top 95% probable tokens
    repetition_penalty=1.2,   # Penalize repeated content to improve answer quality
)
# Wrap pipeline for LangChain
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
# Here we are using the create_stuff_documents_chain to combine the retrieved
# document chunks into a single context for the LLM.
It takes your docs and smooshes them all together into one big text. Then it takes your ChatPromptTemplate (the template that says "Hey AI, here's some context + my question...") and fills in the template with the smooshed-together documents. Finally, it sends everything to the LLM, which will generate a response.
from langchain.chains.combine_documents import create_stuff_documents_chain

# "Stuff" strategy: concatenate all retrieved docs into one context string,
# fill the prompt template, and send the whole thing to the LLM.
combine_docs_chain = create_stuff_documents_chain(llm, chat_prompt)
# NOTE(review): with search_type="similarity" the "score_threshold" kwarg may
# be silently ignored; LangChain enforces thresholds under
# search_type="similarity_score_threshold" — confirm against the retriever docs.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 5,
        "score_threshold": 0.7,
    },
)
# Sanity-check the combine-docs chain on its own with two tiny documents.
secret_spider_man_docs = [
    Document(
        page_content="My first name is Peter.",
        metadata={"doc-nr.": "1"},
    ),
    Document(
        page_content="My last name is Parker.",
        metadata={"doc-nr.": "2"},
    ),
]
response = combine_docs_chain.invoke(
    {"context": secret_spider_man_docs, "input": "What is the person's full name?"}
)
print("Response:")
print("-" * 50)
print(f"{response:s}")
# This combines retrieval with generation.
from langchain.chains import create_retrieval_chain

# Glue retrieval and generation together: the retriever fetches chunks, then
# the combine-docs chain stuffs them into the prompt and calls the LLM.
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)
response = retrieval_chain.invoke({"input": "What is this document about?"})
print("ANSWER")
print("-" * 50)
print(f"{response['answer']:s}")
print("\n\n")
print("CONTEXT")
print("-" * 50)
for number, doc in enumerate(response["context"], start=1):
    print(f"Document {number}:")
    for key, value in doc.metadata.items():
        print(f"\t{key}: {value}")
    text = doc.page_content
    print(f"\tPage content: {text if len(text) < 300 else text[:300] + '...'}")
    print("\n")
print("\n\n")
# Let's see the difference between an LLM with and without context. For
# demonstration purposes, we'll use the same question from earlier, to which
# the LLM can't provide an accurate answer.
question = "Who was the Vikings' starting quarterback in week 3?"

# Baseline: the bare LLM with no retrieved context.
print("LLM without context:")
print("-" * 50)
llm_response = llm.invoke(question)
print(f"Answer: {llm_response}")
print("\n\n")

# Same question through the full RAG pipeline.
print("RAG system with context:")
print("-" * 50)
rag_response = retrieval_chain.invoke({"input": question})
print(f"Answer: {rag_response['answer']}")
print("Context:")
for number, doc in enumerate(rag_response["context"], start=1):
    print(f"Document {number}:")
    for key, value in doc.metadata.items():
        print(f"\t{key}: {value}")
    text = doc.page_content
    print(f"\tPage content: {text if len(text) < 300 else text[:300] + '...'}")
    print("\n")
print("\n\n")