section_reference_block/services/retrieval_service.py
xxy aa98ea2623 @
Initial commit

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
@
2026-06-05 18:45:29 +08:00

325 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
services/retrieval_service.py
后评价报告材料检索服务
用于从向量库中检索与后评价报告相关的材料
"""
from typing import List, Dict, Any, Optional
from langchain_core.documents import Document
from function.vector_store import VectorStore
class RetrievalService:
    """Retrieval service for post-evaluation report materials.

    Wraps one ``VectorStore`` collection and exposes free-text, category and
    chapter oriented lookups. Every lookup can be restricted to a single
    project by its UUID.
    """

    # Hits requested from the vector store for each keyword when a category
    # or chapter is expanded into several keyword queries.
    _PER_KEYWORD_TOP_K = 5

    # Upper bound on how much reference text is sent as a similarity query;
    # a conservative cap so that a whole report is never embedded as a query.
    _REFERENCE_QUERY_MAX_CHARS = 500

    # Generic query used by search_similar_report when no reference text is given.
    _DEFAULT_REPORT_QUERY = "后评价报告 项目概况 技术方案 财务评价"

    # Keywords queried for each category; an unknown category is queried by
    # its own name.
    _CATEGORY_KEYWORDS: Dict[str, List[str]] = {
        "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"],
        "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"],
        "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"],
        "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"],
        "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"],
        "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"],
    }

    # Keywords queried for each report chapter. Chapters that share a name
    # with a category reuse that category's keywords. Keyword order matters:
    # results are collected in this order and truncated to ``top_k``.
    _CHAPTER_KEYWORDS: Dict[str, List[str]] = {
        **_CATEGORY_KEYWORDS,
        "项目全过程总结与管理评价": [
            # ---- High priority: Tables 1-14 + numbered subsections ----
            "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6",
            "表1原料数量及组成对比表", "表2原料性质对比表",
            "表3前期预测和2019年实际产品对比表",
            "表4装置规模及实际运行负荷对比表",
            "表5项目规模对比表",
            "表6可研报告与基础设计阶段工程内容对比表",
            "表7项目承包商的招投标情况表",
            "表8项目设计主要进度控制情况表",
            "表9施工图设计变更情况表",
            "表10重大设计变更情况表",
            "表11主要设备采购情况表",
            "表12施工重要节点进度表",
            "表13原料性质对比表",
            "表14主要标定结果与设计指标对比表",
            # ---- Secondary priority: structural keywords ----
            "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价",
            "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备",
            "工程监理", "HSE", "竣工验收",
            "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果",
            "原料数量及组成对比", "装置规模", "负荷率",
        ],
        "项目目标和可持续性评价": [
            # High priority: chapter titles and numbering
            "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5",
            "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价",
            # Goal attainment (engineering / technical / economic)
            "工程规模", "项目进度", "工程质量", "项目功能", "投资控制",
            "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核",
            "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润",
            # Benchmarking
            "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃",
            # Sustainability (resources / products / internal factors / policy)
            "资源分析", "原料供应", "资源保障",
            "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB",
            "项目内部因素", "装置规模合理性", "工艺方案", "技术水平",
            "国家政策", "产业政策", "质量标准",
            # When materials support sustainability via safety / environmental compliance
            "个人风险", "社会风险", "可接受", "风险曲线",
            "非甲烷总烃", "无组织排放", "mg/m3", "标准值",
        ],
    }

    def __init__(self, collection_name: str = "eval_report") -> None:
        """Initialise the retrieval service.

        Args:
            collection_name: Name of the vector store collection to query.
        """
        self.collection_name = collection_name
        # drop_old=False is passed so the store is opened without asking it
        # to drop the existing collection.
        self.vector_store = VectorStore(collection_name=collection_name, drop_old=False)

    def search_by_query(self, query: str, top_k: int = 10, filter_project: Optional[str] = None) -> List["Document"]:
        """Retrieve materials relevant to a free-text query.

        Args:
            query: Query text, e.g. "项目背景", "财务评价" or "技术方案".
            top_k: Number of hits requested from the vector store.
            filter_project: Optional project UUID. When given, it is appended
                to the query text and hits whose ``project_uuid`` metadata
                differs are discarded, so fewer than ``top_k`` documents may
                be returned.

        Returns:
            The matching documents in the order returned by the vector store.
            Each document's ``metadata["score"]`` is set to the score the
            store returned for this query.
        """
        full_query = f"{query} 项目 UUID:{filter_project}" if filter_project else query
        hits = self.vector_store.similarity_search_with_score(full_query, k=top_k)

        docs = []
        for doc, score in hits:
            # Post-filter: the search itself is not restricted to the project.
            if filter_project and doc.metadata.get("project_uuid") != filter_project:
                continue
            # Keep the score with the document; the category/chapter methods
            # report metadata["score"], which was otherwise never populated here.
            doc.metadata["score"] = score
            docs.append(doc)
        return docs

    @staticmethod
    def _to_material(doc: "Document") -> Dict[str, Any]:
        """Flatten a document into the dict shape returned to callers."""
        meta = doc.metadata
        return {
            "content": doc.page_content,
            "heading": meta.get("heading", ""),
            "heading_level": meta.get("heading_level", 0),
            "doc_id": meta.get("doc_id", ""),
            "path": meta.get("path", ""),
            "score": meta.get("score", 0.0),
        }

    def _collect_keyword_materials(self, keywords: List[str], project_uuid: str, top_k: int) -> List[Dict[str, Any]]:
        """Query each keyword in order and return up to ``top_k`` unique materials."""
        if top_k <= 0:
            return []

        seen = set()
        materials: List[Dict[str, Any]] = []
        for keyword in keywords:
            docs = self.search_by_query(
                keyword,
                top_k=self._PER_KEYWORD_TOP_K,
                filter_project=project_uuid,
            )
            for doc in docs:
                # Two hits are treated as the same material when both the first
                # 100 characters of content and the heading coincide.
                key = (doc.page_content[:100], doc.metadata.get("heading", ""))
                if key in seen:
                    continue
                seen.add(key)
                materials.append(self._to_material(doc))
                # Results are kept in keyword order, so once top_k unique
                # materials exist the remaining keywords cannot change the
                # outcome; stop early to avoid needless vector searches.
                if len(materials) >= top_k:
                    return materials
        return materials

    def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve materials for a category by querying its keywords.

        Args:
            category: Category name such as "项目概况", "技术方案", "财务评价"
                or "效益分析". An unknown category is queried by its own name.
            project_uuid: Project UUID the materials must belong to.
            top_k: Maximum number of materials to return; a non-positive
                value yields an empty list.

        Returns:
            De-duplicated materials in keyword order, each a dict with the
            keys ``content``, ``heading``, ``heading_level``, ``doc_id``,
            ``path`` and ``score``.
        """
        keywords = self._CATEGORY_KEYWORDS.get(category, [category])
        return self._collect_keyword_materials(keywords, project_uuid, top_k)

    def get_project_materials(self, project_uuid: str) -> Dict[str, Any]:
        """Collect the main groups of material text for a project.

        Args:
            project_uuid: Project UUID the materials must belong to.

        Returns:
            A dict with the keys ``basic_info``, ``tech_info``,
            ``finance_info`` and ``benefit_info``, each mapping to the list
            of page contents retrieved for that group.
        """
        section_queries = {
            "basic_info": "项目概况 项目基本情况",
            "tech_info": "技术方案 工艺技术",
            "finance_info": "财务评价 经济效益",
            "benefit_info": "效益分析 社会效益",
        }
        materials: Dict[str, Any] = {}
        for section, section_query in section_queries.items():
            docs = self.search_by_query(section_query, top_k=5, filter_project=project_uuid)
            materials[section] = [doc.page_content for doc in docs]
        return materials

    def search_similar_report(self, reference_content: str, top_k: int = 5) -> List["Document"]:
        """Retrieve documents similar to a piece of reference report content.

        Args:
            reference_content: Reference report text. Its leading part (after
                stripping whitespace) is used as the similarity query; when it
                is empty or blank, a generic post-evaluation query is used.
            top_k: Number of hits requested from the vector store.

        Returns:
            The documents returned by the vector store, without any project
            filtering.
        """
        excerpt = (reference_content or "").strip()[: self._REFERENCE_QUERY_MAX_CHARS]
        query = excerpt or self._DEFAULT_REPORT_QUERY
        hits = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [doc for doc, _score in hits]

    def get_template_data(self, project_uuid: str, query: str = "项目概况 技术方案 财务评价", top_k: int = 15) -> Dict[str, Any]:
        """Retrieve materials and map them onto the report template.

        Args:
            project_uuid: Project UUID the materials must belong to.
            query: Query text used for retrieval.
            top_k: Number of hits requested from the vector store.

        Returns:
            A dict with the keys ``materials`` (documents), ``materials_text``
            (their page contents), ``template_data`` and ``key_info`` (both
            produced by ``ReportTemplate``). All four are empty when nothing
            is retrieved.
        """
        # Imported inside the method, as in the original code, so the module
        # is only loaded when this method is called.
        from report_template import ReportTemplate

        materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid)
        if not materials:
            # Same key set as the populated result so callers can index
            # uniformly.
            return {
                "materials": [],
                "materials_text": [],
                "template_data": {},
                "key_info": {},
            }

        texts = [doc.page_content for doc in materials]
        # Each call gets its own list, as before, in case a helper mutates it.
        key_info = ReportTemplate.extract_key_info(list(texts))
        template_data = ReportTemplate.map_materials_to_template(list(texts))
        return {
            "materials": list(materials),
            "materials_text": texts,
            "template_data": template_data,
            "key_info": key_info,
        }

    def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve materials for a report chapter by querying its keywords.

        Args:
            project_uuid: Project UUID the materials must belong to.
            chapter: Chapter name. An unknown chapter is queried by its own
                name.
            top_k: Maximum number of materials to return; a non-positive
                value yields an empty list.

        Returns:
            De-duplicated materials in keyword order, each a dict with the
            keys ``content``, ``heading``, ``heading_level``, ``doc_id``,
            ``path`` and ``score``.
        """
        keywords = self._CHAPTER_KEYWORDS.get(chapter, [chapter])
        return self._collect_keyword_materials(keywords, project_uuid, top_k)
# Retrieval examples
if __name__ == "__main__":
    # Build a service bound to the default collection.
    service = RetrievalService()

    # Each entry pairs the banner printed for an example with the query it runs.
    examples = (
        ("示例 1搜索项目背景", "项目背景 建设内容"),
        ("示例 2搜索财务评价", "财务评价 现金流量"),
    )
    for banner, example_query in examples:
        print(banner)
        # Show the heading and the first 200 characters of each hit.
        for hit in service.search_by_query(example_query, top_k=3):
            print(f"标题:{hit.metadata.get('heading', 'N/A')}")
            print(f"内容:{hit.page_content[:200]}...\n")