Building an Intelligent Agentic Retrieval-Augmented Generation (RAG) System
This guide demonstrates how to develop an advanced Agentic Retrieval-Augmented Generation (RAG) framework. Unlike traditional RAG models that simply fetch documents, our agent actively evaluates when to retrieve information, selects the optimal retrieval method, and crafts responses with contextual precision. Leveraging embeddings, FAISS indexing, and a simulated language model, this example illustrates how embedding agentic decision-making transforms a standard RAG pipeline into a more flexible and insightful system.
Establishing the Core Components: Mock LLM, Retrieval Strategies, and Document Structure
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from dataclasses import dataclass
from enum import Enum
from typing import List, Dict, Any, Optional
class SimulatedLLM:
    """Stand-in language model that answers agent prompts with canned text.

    Each recognised prompt type ("should retrieve", "select retrieval
    method", "compose answer" with context) maps to a fixed response;
    anything else falls through to a generic placeholder reply.
    """

    def generate(self, prompt: str, max_tokens: int = 150) -> str:
        """Return a canned response keyed off phrases found in *prompt*."""
        text = prompt.lower()

        # Retrieval decision: factual-sounding keywords trigger RETRIEVE.
        if "should retrieve" in text:
            factual_markers = ("specific", "latest", "facts", "when", "who", "what")
            if any(marker in text for marker in factual_markers):
                return "RETRIEVE: Query demands precise factual data."
            return "NO_RETRIEVE: General knowledge suffices for this query."

        # Strategy selection: comparison / recency cues pick the strategy.
        if "select retrieval method" in text:
            if "compare" in text or "versus" in text:
                return "STRATEGY: multi_query - Requires multiple entity retrieval for comparison."
            if "latest" in text or "recent" in text:
                return "STRATEGY: temporal - Prioritize up-to-date information."
            return "STRATEGY: semantic - Use standard semantic similarity search."

        # Answer synthesis: only fires when context was actually supplied.
        if "compose answer" in text and "context:" in text:
            return "Synthesizing a detailed response by integrating multiple sources with contextual relevance."

        return "Simulated response: replace with a real LLM for production."
class RetrievalMethod(Enum):
    """Closed set of retrieval strategies the agent can choose from."""

    SEMANTIC = "semantic"        # plain similarity search
    MULTI_QUERY = "multi_query"  # several related queries, merged and deduped
    TEMPORAL = "temporal"        # recency-weighted results
    HYBRID = "hybrid"            # combination of the above
@dataclass
class Document:
    """A single knowledge-base entry plus its (lazily computed) embedding."""

    id: str                                 # stable identifier, used for dedup
    content: str                            # raw text that gets embedded
    metadata: Dict[str, Any]                # e.g. {"topic": ..., "date": ...}
    embedding: Optional[np.ndarray] = None  # filled in during ingestion
Here, we define a simulated language model to mimic decision-making processes, enumerate retrieval strategies for clarity, and create a structured Document class to organize our knowledge base effectively.
Constructing the Agentic RAG System: Embeddings and FAISS Indexing
class AgenticRAG:
    """Agentic RAG pipeline: decides *whether*, *how*, and *what* to retrieve."""

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        """Load the sentence encoder and start with an empty knowledge base."""
        self.encoder = SentenceTransformer(embedding_model)
        self.llm = SimulatedLLM()
        self.documents: List[Document] = []
        self.index: Optional[faiss.Index] = None

    def ingest_documents(self, docs: List[Dict[str, Any]]) -> None:
        """Store raw document dicts, embed the corpus, and (re)build the index.

        NOTE(review): embeddings and the FAISS index are recomputed for the
        whole corpus on every call — fine for a demo-sized knowledge base.
        """
        print(f"Adding {len(docs)} documents to knowledge base...")
        for position, raw in enumerate(docs):
            self.documents.append(
                Document(
                    id=raw.get("id", str(position)),
                    content=raw["content"],
                    metadata=raw.get("metadata", {}),
                )
            )

        corpus = [d.content for d in self.documents]
        embeddings = self.encoder.encode(corpus, show_progress_bar=True)
        for stored, vector in zip(self.documents, embeddings):
            stored.embedding = vector

        # Inner-product index over L2-normalized vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings.astype("float32"))
        print(f"Knowledge base initialized with {len(self.documents)} documents.")
This segment initializes the embedding model and FAISS index, then encodes document contents into vector representations to enable rapid semantic retrieval.
Empowering the Agent: Decision-Making and Strategy Selection
def should_retrieve(self, query: str) -> bool:
prompt = f"""
Evaluate if the following query requires retrieval:
Query: "{query}"
Indicate if specific facts or recent data are necessary.
Reply with: RETRIEVE: [reason] or NO_RETRIEVE: [reason]
"""
response = self.llm.generate(prompt)
retrieve = response.startswith("RETRIEVE:")
print(f"🤖 Agent Decision: {'Retrieve' if retrieve else 'Answer Directly'}")
print(f" Explanation: {response.split(':', 1)[1].strip() if ':' in response else response}")
return retrieve
def select_strategy(self, query: str) -> RetrievalMethod:
    """Ask the LLM to pick a retrieval strategy for *query* (AgenticRAG method).

    Bug fix: the original prompt never contained the literal phrase
    "select retrieval method" that SimulatedLLM keys on, so the mock
    always fell through to its default reply and this method always
    returned SEMANTIC. The old option text also contained the cue word
    "recent", which would have forced TEMPORAL for every non-comparison
    query once the phrase was fixed. The rewritten prompt includes the
    trigger phrase and avoids the cue words ("compare", "versus",
    "latest", "recent"), so the strategy depends on the query itself.
    """
    prompt = f"""
    Select retrieval method for the query below.
    Query: "{query}"
    Options:
    - semantic: standard similarity search
    - multi_query: several related lookups merged together
    - temporal: favor newer documents
    - hybrid: blended approach
    Respond with: STRATEGY: [method] - [justification]
    """
    response = self.llm.generate(prompt)
    lowered = response.lower()
    if "multi_query" in lowered:
        method = RetrievalMethod.MULTI_QUERY
    elif "temporal" in lowered:
        method = RetrievalMethod.TEMPORAL
    elif "hybrid" in lowered:
        method = RetrievalMethod.HYBRID
    else:
        method = RetrievalMethod.SEMANTIC
    print(f"🎯 Retrieval Strategy: {method.value}")
    print(f"   Justification: {response.split('-', 1)[1].strip() if '-' in response else response}")
    return method
Our agent evaluates whether retrieval is necessary and then intelligently chooses the best retrieval approach, providing transparent reasoning at each step.
Executing Retrieval and Crafting Contextual Responses
def fetch_documents(self, query: str, strategy: RetrievalMethod, top_k: int = 3) -> List[Document]:
    """Retrieve up to *top_k* documents using the chosen *strategy* (AgenticRAG method)."""
    if not self.index:
        print("❌ Knowledge base is empty.")
        return []

    if strategy == RetrievalMethod.MULTI_QUERY:
        # Fan out into the base query plus pro/con variants, then dedupe by id
        # while preserving first-seen order.
        variants = [query, f"benefits of {query}", f"drawbacks of {query}"]
        pooled: List[Document] = []
        for variant in variants:
            pooled.extend(self._semantic_search(variant, k=2))
        seen_ids = set()
        deduped = []
        for candidate in pooled:
            if candidate.id in seen_ids:
                continue
            seen_ids.add(candidate.id)
            deduped.append(candidate)
        return deduped[:top_k]

    if strategy == RetrievalMethod.TEMPORAL:
        # Over-fetch, then keep the most recent by metadata date. ISO date
        # strings sort chronologically; docs without a date sink to the bottom.
        candidates = self._semantic_search(query, k=top_k * 2)
        newest_first = sorted(
            candidates,
            key=lambda doc: doc.metadata.get("date", "1900-01-01"),
            reverse=True,
        )
        return newest_first[:top_k]

    # SEMANTIC (and HYBRID, which has no dedicated path) fall back to
    # plain similarity search — matches the original behavior.
    return self._semantic_search(query, k=top_k)
def _semantic_search(self, query: str, k: int) -> List[Document]:
    """Return the *k* nearest documents to *query* by cosine similarity
    (AgenticRAG helper; the index stores L2-normalized vectors).

    NOTE(review): the original line ``if idx str:`` was garbled — the
    '<' comparison and the following ``generate_response`` signature were
    eaten, presumably by HTML stripping. Reconstructed below.
    """
    query_emb = self.encoder.encode([query])
    faiss.normalize_L2(query_emb)
    scores, indices = self.index.search(query_emb.astype("float32"), k)
    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads with -1 when fewer than k vectors exist; guard the index.
        if 0 <= idx < len(self.documents):
            results.append(self.documents[idx])
    return results

def generate_response(self, query: str, docs: List[Document]) -> str:
    """Compose an answer grounded in *docs*; answer directly when none given.

    Fixes: ``"nn".join`` was a mangled ``"\\n\\n".join`` (lost backslashes),
    and the prompt now contains the literal phrase "compose answer" that
    SimulatedLLM keys on (together with "Context:"), so the mock actually
    takes its synthesis branch instead of the default reply.
    """
    if not docs:
        return self.llm.generate(f"Provide an answer to: {query}")
    context_text = "\n\n".join(f"Doc {i+1}: {doc.content}" for i, doc in enumerate(docs))
    prompt = f"""
    Compose answer for the query using the context below. Cite sources when applicable.
    Query: {query}
    Context: {context_text}
    """
    return self.llm.generate(prompt, max_tokens=200)
This section details how the system retrieves documents based on the chosen strategy, removes duplicates, prioritizes recent data when needed, and synthesizes a comprehensive answer grounded in the retrieved context.
Integrating the Workflow: From Query to Answer
def process_query(self, query: str) -> str:
    """Run the full agentic pipeline for *query* and return the final answer
    (AgenticRAG method).

    Steps: decide whether to retrieve → pick a strategy → fetch documents
    → synthesize the response, printing a preview of retrieved context.

    Fix: the ``"n🔍"``-style string prefixes were mangled ``"\\n"`` escapes
    (lost backslashes); restored so stages print separated by blank lines.
    """
    print(f"\n🔍 Processing query: '{query}'")
    print("=" * 50)

    # Agent decision 1: is retrieval needed at all?
    if not self.should_retrieve(query):
        print("\n📝 Generating direct answer...")
        return self.llm.generate(f"Provide an answer to: {query}")

    # Agent decision 2: which retrieval strategy fits this query?
    strategy = self.select_strategy(query)
    print(f"\n📚 Retrieving documents using '{strategy.value}' strategy...")
    retrieved = self.fetch_documents(query, strategy)
    print(f"   Retrieved {len(retrieved)} documents.")

    print("\n🧠 Synthesizing final response...")
    answer = self.generate_response(query, retrieved)
    if retrieved:
        print("\n📄 Retrieved Context Preview:")
        for i, doc in enumerate(retrieved[:2], 1):
            print(f"   {i}. {doc.content[:100]}...")
    return answer
This method orchestrates the entire pipeline: deciding on retrieval, selecting the strategy, fetching relevant documents, and producing a well-informed response, all while providing clear feedback on each stage.
Demonstration: Sample Knowledge Base and Query Execution
def build_sample_knowledge():
    """Return a small list of AI-topic document dicts for the demo corpus."""
    entries = [
        ("ai_intro",
         "Artificial Intelligence (AI) encompasses systems capable of performing tasks that typically require human intelligence.",
         "AI Fundamentals", "2024-04-10"),
        ("ml_overview",
         "Machine Learning (ML) is a branch of AI focused on algorithms that improve through experience.",
         "Machine Learning", "2024-05-01"),
        ("rag_explanation",
         "Retrieval-Augmented Generation (RAG) integrates external knowledge retrieval with language models to enhance response accuracy and timeliness.",
         "RAG", "2024-05-15"),
        ("agentic_ai",
         "Agentic AI systems autonomously decide actions, improving adaptability and contextual understanding.",
         "Agentic AI", "2024-06-01"),
    ]
    return [
        {"id": doc_id, "content": text, "metadata": {"topic": topic, "date": date}}
        for doc_id, text, topic, date in entries
    ]
if __name__ == "__main__":
    # Demo driver: build the knowledge base, then run three queries chosen to
    # exercise different paths — a factual lookup, a query the agent should
    # answer directly, and a multi-entity comparison.
    #
    # Fix: the "n💬" / "n✅" string prefixes were mangled "\n" escapes
    # (lost backslashes); restored so sections print separated by blank lines.
    print("🚀 Launching Agentic RAG System...")
    rag = AgenticRAG()
    knowledge_docs = build_sample_knowledge()
    rag.ingest_documents(knowledge_docs)

    test_queries = [
        "Define artificial intelligence.",
        "How is the weather today?",
        "Compare artificial intelligence and machine learning.",
    ]
    for q in test_queries:
        result = rag.process_query(q)
        print(f"\n💬 Final Answer: {result}")
        print("\n" + "=" * 80)

    print("\n✅ Agentic RAG System demonstration complete!")
    print("Key capabilities showcased:")
    print("• Autonomous retrieval decision-making")
    print("• Adaptive retrieval strategy selection")
    print("• Multi-faceted retrieval techniques")
    print("• Transparent and interpretable reasoning")
In this final example, we assemble a concise knowledge base centered on AI topics, initialize the Agentic RAG system, and execute sample queries that illustrate the agent’s ability to decide when to retrieve, how to retrieve, and how to generate context-aware answers.
Summary
This tutorial highlights how embedding agentic reasoning into a RAG system enhances its intelligence and flexibility. By enabling the agent to autonomously determine retrieval necessity, select tailored strategies, and transparently explain its decisions, we create a more human-like and effective information retrieval workflow. This foundation paves the way for integrating real-world large language models, expanding knowledge bases, and refining retrieval techniques in future developments.
