# 基于RAG的中土世界知识问答系统构建实践
## 一、项目概述与核心架构设计
### 项目背景与目标
在《魔戒》、《霍比特人》等作品构建的中土世界,存在着庞大而复杂的知识体系。传统的问答系统难以准确理解这个奇幻世界的独特术语、地理关系和历史脉络。本文将探讨如何利用检索增强生成技术,构建一个专门针对中土世界知识的智能问答系统。
### 系统架构概览
```
中土世界知识问答系统架构:
├── 知识库构建层
│ ├── 数据采集与预处理
│ ├── 文本分割与向量化
│ └── 向量数据库存储
├── 检索增强层
│ ├── 语义检索模块
│ ├── 重排序模块
│ └── 上下文构建模块
└── 生成推理层
├── LLM集成接口
├── 提示词工程
└── 响应生成与验证
```
### 技术栈选型
```python
# requirements.txt 核心依赖
langchain==0.1.0
chromadb==0.4.22
sentence-transformers==2.2.2
faiss-cpu==1.7.4
openai==1.3.0 # 或其他本地LLM
pydantic==2.5.0
fastapi==0.104.0
```
## 二、中土世界知识库构建
### 数据源收集与处理
中土世界知识数据主要来源于多个权威渠道:
```python
import json
from typing import List, Dict
from dataclasses import dataclass
@dataclass
class MiddleEarthDocument:
    """A single Middle-earth source document."""
    title: str
    content: str
    category: str  # e.g. race, geography, history, character
    source: str
    metadata: Dict  # extra per-document fields (book/chapter/era, ...)
class KnowledgeCollector:
    """Gathers Middle-earth source documents from multiple channels."""

    def __init__(self):
        # Corpus accumulated by the various load_* methods.
        self.documents: List[MiddleEarthDocument] = []

    def load_tolkien_texts(self, file_path: str):
        """Load Tolkien's original texts from a JSON file into the corpus."""
        with open(file_path, 'r', encoding='utf-8') as f:
            entries = json.load(f)
        for entry in entries:
            extra = {
                'book': entry.get('book', ''),
                'chapter': entry.get('chapter', ''),
                'era': entry.get('era', 'Third Age'),
            }
            self.documents.append(
                MiddleEarthDocument(
                    title=entry.get('title', ''),
                    content=entry['content'],
                    category=entry.get('category', 'literature'),
                    source='tolkien_original',
                    metadata=extra,
                )
            )

    def load_wiki_data(self, wiki_dump_path: str):
        """Load Middle-earth wiki data (parse wiki markup into structured records)."""
        # Placeholder in the original article; intentionally unimplemented.
        pass
```
### 文本分割与处理策略
考虑到中土世界文本的特殊性,需要定制化的分割策略:
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
class MiddleEarthTextSplitter:
    """Splits Middle-earth documents into overlapping chunks for embedding.

    Separators are tuned for this corpus: markdown headings first, then
    paragraph breaks, then sentence punctuation (CJK and Latin).
    Fixes: removed garbled non-code text fused onto the __init__ line in the
    original, and corrected split_document's return annotation (it returns a
    list of chunk dicts, not a list of strings).
    """

    def __init__(self):
        # Separator priority, most structural to least.
        self.separators = [
            "\n## ",   # chapter headings
            "\n### ",
            "\n\n",
            "。",
            ".",
            " ",
            "",
        ]
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=self.separators,
            length_function=len,
        )

    def split_document(self, document: "MiddleEarthDocument") -> List[Dict]:
        """Split one document into chunk dicts carrying the parent metadata.

        Each dict holds the chunk text plus title/category/source, a
        sequential chunk_index, and the document's metadata keys merged in.
        """
        chunks = self.text_splitter.split_text(document.content)
        chunk_docs = []
        for i, chunk in enumerate(chunks):
            chunk_docs.append({
                "content": chunk,
                "title": document.title,
                "category": document.category,
                "chunk_index": i,
                "source": document.source,
                **document.metadata,
            })
        return chunk_docs
```
### 向量化与索引构建
```python
import chromadb
from sentence_transformers import SentenceTransformer
from typing import List
class VectorStoreManager:
    """Owns the sentence-embedding model and the Chroma vector store."""

    def __init__(self, model_name: str = "paraphrase-multilingual-MiniLM-L12-v2"):
        # Multilingual model: the corpus mixes Chinese prose and English names.
        self.embedding_model = SentenceTransformer(model_name)
        self.chroma_client = chromadb.PersistentClient(path="./middle_earth_db")

    def create_collection(self, collection_name: str = "middle_earth_knowledge"):
        """Create — or reopen — the vector collection.

        Fix: the original used create_collection, which raises when the
        collection already exists in the persistent store (i.e. on every
        restart). get_or_create_collection is idempotent.
        """
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            metadata={"description": "中土世界知识库"},
        )

    def add_documents(self, documents: List[Dict]):
        """Embed chunk dicts and add them to the collection in one batch.

        NOTE(review): ids are positional ("doc_0", "doc_1", ...), so calling
        this twice upserts over earlier entries — confirm single-shot
        ingestion is the intended usage.
        """
        ids, embeddings, metadatas = [], [], []
        for i, doc in enumerate(documents):
            embedding = self.embedding_model.encode(doc["content"]).tolist()
            ids.append(f"doc_{i}")
            embeddings.append(embedding)
            metadatas.append({
                "title": doc.get("title", ""),
                "category": doc.get("category", ""),
                "source": doc.get("source", ""),
                # Carry over any extra metadata keys (book/chapter/era, ...).
                **{k: v for k, v in doc.items()
                   if k not in ("content", "title", "category", "source")},
            })
        self.collection.add(embeddings=embeddings, metadatas=metadatas, ids=ids)

    def similarity_search(self, query: str, k: int = 5):
        """Return the k nearest chunks to the query (Chroma result dict)."""
        query_embedding = self.embedding_model.encode(query).tolist()
        return self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k,
            include=["metadatas", "documents", "distances"],
        )
```
## 三、RAG检索增强机制实现
### 混合检索策略
针对中土世界知识的特殊性,实施混合检索策略:
```python
class HybridRetriever:
    """Combines dense (vector) retrieval with keyword-index lookups.

    Fix: the original _merge_results was an empty placeholder, so hybrid
    retrieve() always returned None; it now returns the semantic result
    dict augmented with de-duplicated keyword-matched titles.
    """

    def __init__(self, vector_store: "VectorStoreManager"):
        self.vector_store = vector_store
        self.keyword_index = {}  # keyword -> list of document titles

    def build_keyword_index(self, documents: List[Dict]):
        """Build an inverted index from Middle-earth keywords to doc titles."""
        middle_earth_keywords = {
            "种族": ["霍比特人", "精灵", "矮人", "人类", "奥克", "恩特"],
            "地理": ["夏尔", "瑞文戴尔", "摩瑞亚", "刚铎", "魔多"],
            "人物": ["弗罗多", "甘道夫", "阿拉贡", "索伦", "咕噜"],
            "物品": ["魔戒", "刺叮剑", "敌击剑", "精灵宝钻"]
        }
        for doc in documents:
            for keywords in middle_earth_keywords.values():
                for keyword in keywords:
                    if keyword in doc["content"]:
                        self.keyword_index.setdefault(keyword, []).append(doc["title"])

    def retrieve(self, query: str, use_hybrid: bool = True):
        """Hybrid retrieval: semantic search, optionally keyword-augmented."""
        semantic_results = self.vector_store.similarity_search(query, k=10)
        if not use_hybrid:
            return semantic_results
        # Collect titles for every indexed keyword that appears in the query.
        keyword_docs = []
        for keyword, titles in self.keyword_index.items():
            if keyword in query:
                keyword_docs.extend(titles)
        return self._merge_results(semantic_results, keyword_docs)

    def _merge_results(self, semantic_results, keyword_docs):
        """Merge: keep the semantic result set, attach deduped keyword titles."""
        seen = set()
        unique_titles = []
        for title in keyword_docs:  # preserve first-seen order while deduping
            if title not in seen:
                seen.add(title)
                unique_titles.append(title)
        merged = dict(semantic_results)
        merged["keyword_matches"] = unique_titles
        return merged
### 上下文优化与重排序
```python
class ContextOptimizer:
def __init__(self):
self.relevance_threshold = 0.7
def optimize_context(self, retrieved_docs, query: str):
"""优化检索到的上下文"""
# 1. 相关性过滤
filtered_docs = [
doc for doc in retrieved_docs
if self._calculate_relevance(doc["content"], query) > self.relevance_threshold
]
# 2. 多样性选择
selected_docs = self._select_diverse_docs(filtered_docs, max_docs=5)
# 3. 构建上下文
context = self._construct_context(selected_docs, query)
return context
def _calculate_relevance(self, doc_content: str, query: str) -> float:
"""计算文档与查询的相关性"""
# 使用更复杂的相关性计算
query_terms = set(query.split())
doc_terms = set(doc_content.split())
intersection = query_terms.intersection(doc_terms)
return len(intersection) / len(query_terms) if query_terms else 0
def _select_diverse_docs(self, docs: List[Dict], max_docs: int):
"""选择多样化的文档"""
selected = []
categories_seen = set()
for doc in docs:
category = doc.get("category", "unknown")
if category not in categories_seen:
selected.append(doc)
categories_seen.add(category)
if len(selected) >= max_docs:
break
return selected
def _construct_context(self, docs: List[Dict], query: str) -> str:
"""构建LLM可理解的上下文"""
context_parts = []
for i, doc in enumerate(docs, 1):
context_parts.append(
f"[文档 {i}] 标题: {doc.get('title', '未知')}\n"
f"分类: {doc.get('category', '未知')}\n"
f"内容: {doc['content'][:300]}...\n"
)
context = "\n".join(context_parts)
return context
```
## 四、智能问答系统实现
### 提示词工程
针对中土世界特点设计专用提示词模板:
```python
class MiddleEarthPromptEngineer:
    """Builds the system prompt and per-question RAG prompts for the LLM."""

    def __init__(self):
        self.system_prompt = """你是中土世界知识专家,专门回答关于托尔金创造的奇幻世界的问题。
你的回答需要基于提供的中土世界资料,确保准确性和一致性。
中土世界特点:
1. 遵循托尔金原著设定
2. 注意不同种族的特性差异
3. 考虑历史时间线的准确性
4. 使用正确的专有名词和术语
请以专业但易懂的方式回答问题。"""

    def build_rag_prompt(self, query: str, context: str) -> str:
        """Compose the full RAG prompt: system text, context, question, rules."""
        assembled = (
            f"{self.system_prompt}\n"
            "相关背景资料:\n"
            f"{context}\n"
            f"用户问题:{query}\n"
            "请根据以上资料回答问题。如果资料中没有相关信息,请明确说明不清楚,不要编造信息。\n"
            "回答时请:\n"
            "1. 引用资料中的具体信息\n"
            "2. 如果涉及不同来源有冲突,指出可能的差异\n"
            "3. 保持回答结构清晰\n"
            "回答:"
        )
        return assembled
### 问答系统主流程
```python
from typing import Optional
import openai # 或使用本地LLM
class MiddleEarthQA:
    """End-to-end RAG pipeline: retrieve -> optimize context -> prompt -> generate.

    Fixes: removed garbled non-code text fused onto the class line, and
    replaced the backend check hasattr(self, 'llm_model') — which is true on
    both configuration paths, making the local-LLM branch unreachable —
    with an explicit use_openai flag.
    """

    def __init__(self,
                 retriever: "HybridRetriever",
                 llm_config: Optional[Dict] = None):
        """Wire up the pipeline components and choose the LLM backend.

        llm_config: optional {"api_key": ..., "model": ...}; when absent a
        local LLM is configured instead.
        """
        self.retriever = retriever
        self.context_optimizer = ContextOptimizer()
        self.prompt_engineer = MiddleEarthPromptEngineer()
        # Remember which backend was configured (see class docstring).
        self.use_openai = llm_config is not None
        if llm_config:
            # NOTE(review): openai.ChatCompletion is the pre-1.0 API, but the
            # article pins openai==1.3.0 which removed it — confirm version.
            openai.api_key = llm_config.get("api_key")
            self.llm_model = llm_config.get("model", "gpt-3.5-turbo")
        else:
            # _setup_local_llm is assumed to be provided elsewhere — TODO confirm.
            self.llm_model = self._setup_local_llm()

    def ask(self, question: str) -> Dict:
        """Answer one question.

        Returns {"answer", "sources", "context_used"} on success, or
        {"error", "answer"} with an apology message on failure.
        """
        try:
            # 1. Retrieve candidate documents.
            retrieved_docs = self.retriever.retrieve(question)
            # 2. Filter/diversify into an LLM-sized context.
            context = self.context_optimizer.optimize_context(
                retrieved_docs, question
            )
            # 3. Build the RAG prompt.
            prompt = self.prompt_engineer.build_rag_prompt(question, context)
            # 4. Generate the answer.
            response = self._generate_response(prompt)
            # 5. Append source attributions.
            response_with_sources = self._add_source_attribution(
                response, retrieved_docs
            )
            return {
                "answer": response_with_sources,
                "sources": retrieved_docs[:3],   # top sources only
                "context_used": context[:500]    # excerpt for debugging
            }
        except Exception as e:
            # Degrade gracefully: report the error with an apology answer.
            return {
                "error": str(e),
                "answer": "抱歉,回答问题时出现错误。"
            }

    def _generate_response(self, prompt: str) -> str:
        """Call the configured LLM backend with the assembled prompt."""
        if self.use_openai:
            response = openai.ChatCompletion.create(
                model=self.llm_model,
                messages=[
                    {"role": "system", "content": self.prompt_engineer.system_prompt},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,   # low temperature for factual consistency
                max_tokens=1000
            )
            return response.choices[0].message.content
        # Local backend; _call_local_llm assumed provided elsewhere — TODO confirm.
        return self._call_local_llm(prompt)

    def _add_source_attribution(self, response: str, sources: List[Dict]) -> str:
        """Append up to three "[n] title (category)" references to the answer."""
        source_refs = []
        for i, source in enumerate(sources[:3], 1):
            source_refs.append(
                f"[{i}] {source.get('title', '未知标题')} "
                f"({source.get('category', '未知分类')})"
            )
        if source_refs:
            response += f"\n\n---\n参考资料:\n" + "\n".join(source_refs)
        return response
```
## 五、系统部署与评估
### Web接口实现
```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# FastAPI application exposing the QA system over HTTP.
app = FastAPI(title="中土世界知识问答系统")

class QuestionRequest(BaseModel):
    # Incoming /ask payload.
    question: str
    # NOTE(review): not forwarded to qa_system.ask below — confirm intended.
    use_hybrid_search: bool = True

class AnswerResponse(BaseModel):
    # Outgoing /ask payload.
    answer: str
    sources: List[Dict]
    processing_time: float  # seconds spent inside qa_system.ask

# Global system instance, initialized by the startup hook.
qa_system = None
@app.on_event("startup")
async def startup_event():
    """Initialize the global QA pipeline when the server starts."""
    global qa_system
    # Wire up vector store -> retriever -> QA system.
    vector_store = VectorStoreManager()
    retriever = HybridRetriever(vector_store)
    qa_system = MiddleEarthQA(retriever)
@app.post("/ask", response_model=AnswerResponse)
async def ask_question(request: QuestionRequest):
    """QA endpoint: runs the RAG pipeline and reports processing time.

    Raises 503 if the system has not finished startup, 500 if the
    pipeline reports an error (the original returned the error payload
    as a 200 apology, hiding failures from clients).
    """
    if not qa_system:
        raise HTTPException(status_code=503, detail="系统未就绪")
    import time
    start_time = time.time()
    result = qa_system.ask(request.question)
    processing_time = time.time() - start_time
    # Surface pipeline failures as a server error instead of a 200 apology.
    if "error" in result:
        raise HTTPException(status_code=500, detail=result["error"])
    return AnswerResponse(
        answer=result.get("answer", ""),
        sources=result.get("sources", []),
        processing_time=processing_time
    )
@app.get("/health")
async def health_check():
    """Liveness probe for the QA service."""
    payload = {"status": "healthy", "service": "middle_earth_qa"}
    return payload
```
### 评估指标与测试
```python
class SystemEvaluator:
    """Keyword-coverage smoke tests for the QA system.

    Fix: guards against ZeroDivisionError on an empty test suite and on a
    test case with an empty expected-keyword list.
    NOTE(review): only "accuracy" is actually computed; relevance,
    completeness and response_time stay 0 as in the original — confirm
    whether they are planned or should be removed.
    """

    def __init__(self, qa_system: "MiddleEarthQA"):
        self.qa_system = qa_system
        # Built-in regression questions with keywords the answer should cover.
        self.test_cases = [
            {
                "question": "霍比特人的特点是什么?",
                "expected_keywords": ["夏尔", "热爱和平", "小个子"]
            },
            {
                "question": "魔戒的历史是怎样的?",
                "expected_keywords": ["索伦", "精灵", "铸造", "销毁"]
            },
            # More test cases...
        ]

    def evaluate(self) -> Dict:
        """Run every test case; a case passes when >50% of its keywords appear."""
        results = {
            "accuracy": 0,
            "relevance": 0,
            "completeness": 0,
            "response_time": 0
        }
        total_tests = len(self.test_cases)
        if total_tests == 0:
            # Empty suite: report zeros instead of dividing by zero.
            return results
        successful_tests = 0
        for test_case in self.test_cases:
            response = self.qa_system.ask(test_case["question"])
            # lower() normalizes any Latin-script keywords; no-op for CJK.
            answer = response.get("answer", "").lower()
            expected_keywords = [kw.lower() for kw in test_case["expected_keywords"]]
            if not expected_keywords:
                continue  # nothing to check for this case
            keyword_matches = sum(1 for kw in expected_keywords if kw in answer)
            if keyword_matches / len(expected_keywords) > 0.5:
                successful_tests += 1
        results["accuracy"] = successful_tests / total_tests
        return results
## 六、扩展与应用展望
### 多模态扩展
未来可扩展支持中土世界地图、族谱图谱、人物关系图等多模态数据:
```python
class MultimodalRAG:
    """Placeholder for future multimodal (map/image) retrieval extensions."""
    def __init__(self):
        self.image_retriever = None  # image retriever (not implemented yet)
        self.map_analyzer = None  # map analyzer (not implemented yet)
    def process_map_question(self, question: str, map_image: bytes):
        """Answer a map-related question (not implemented)."""
        # Would combine textual and visual information in the answer.
        pass
### 对话历史与上下文管理
```python
class ConversationManager:
    """Placeholder for multi-turn conversation context management."""
    def __init__(self):
        # Chronological record of prior turns.
        self.conversation_history = []
    def manage_context(self, user_input: str, chat_history: List) -> str:
        """Rewrite the current query using relevant history (not implemented)."""
        # Would analyze past turns and fold relevant info into the query.
        # Would refine the current query accordingly.
        pass
通过上述架构和实现,我们构建了一个专门针对中土世界知识的智能问答系统。该系统能够准确理解复杂的奇幻世界设定,提供基于可靠来源的准确回答。RAG技术的应用确保了回答的准确性和可追溯性,为托尔金作品的爱好者和研究者提供了一个有价值的工具。
这种垂直领域智能体的构建方法,同样可以应用于其他特定领域的知识问答系统构建,具有很好的可扩展性和借鉴意义。