"""
services/retrieval_service.py

Material retrieval service for post-evaluation reports.

Retrieves materials related to a post-evaluation report from the vector store.
"""
|
||
|
||
from typing import Any, Dict, List, Optional, Tuple

from langchain_core.documents import Document

from function.vector_store import VectorStore
|
||
|
||
|
||
class RetrievalService:
    """Retrieval service for post-evaluation report materials.

    Wraps one vector-store collection and offers free-text, category-based
    and chapter-based lookups, optionally restricted to a single project.
    """

    # Query used by ``search_similar_report`` when no usable reference text
    # is supplied.
    _DEFAULT_REPORT_QUERY = "后评价报告 项目概况 技术方案 财务评价"

    # Upper bound on how much reference text is sent as a similarity query.
    # NOTE(review): 500 is a conservative guess to keep queries short;
    # tune it to the embedding backend's real input limit.
    _MAX_REFERENCE_QUERY_CHARS = 500

    # Number of hits requested from the vector store for each keyword.
    _PER_KEYWORD_TOP_K = 5

    # Category name -> search keywords, tried in the listed order.
    _CATEGORY_KEYWORDS: Dict[str, Tuple[str, ...]] = {
        "项目概况": ("项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"),
        "技术方案": ("技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"),
        "财务评价": ("投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"),
        "效益分析": ("经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"),
        "风险分析": ("风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"),
        "后评价结论": ("后评价结论", "经验教训", "建议措施", "综合评价"),
    }

    # Chapter name -> search keywords. Chapters that share a name with a
    # category reuse the category keywords; two chapters add their own lists.
    # Order matters: results are collected keyword by keyword and truncated
    # to ``top_k``, so earlier keywords take priority.
    _CHAPTER_KEYWORDS: Dict[str, Tuple[str, ...]] = {
        **_CATEGORY_KEYWORDS,
        "项目全过程总结与管理评价": (
            # ---- High priority: tables 1-14 and numbered subsections ----
            "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6",
            "表1原料数量及组成对比表", "表2原料性质对比表",
            "表3前期预测和2019年实际产品对比表",
            "表4装置规模及实际运行负荷对比表",
            "表5项目规模对比表",
            "表6可研报告与基础设计阶段工程内容对比表",
            "表7项目承包商的招投标情况表",
            "表8项目设计主要进度控制情况表",
            "表9施工图设计变更情况表",
            "表10重大设计变更情况表",
            "表11主要设备采购情况表",
            "表12施工重要节点进度表",
            "表13原料性质对比表",
            "表14主要标定结果与设计指标对比表",

            # ---- Lower priority: structural keywords ----
            "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价",
            "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备",
            "工程监理", "HSE", "竣工验收",
            "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果",
            "原料数量及组成对比", "装置规模", "负荷率",
        ),
        "项目目标和可持续性评价": (
            # High priority: chapter titles and section numbers
            "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5",
            "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价",

            # Goal achievement (engineering / technical / economic)
            "工程规模", "项目进度", "工程质量", "项目功能", "投资控制",
            "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核",
            "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润",

            # Benchmarking
            "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃",

            # Sustainability (resources / products / internal factors / policy)
            "资源分析", "原料供应", "资源保障",
            "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB",
            "项目内部因素", "装置规模合理性", "工艺方案", "技术水平",
            "国家政策", "产业政策", "质量标准",

            # Used when the materials support sustainability through
            # safety / environmental compliance
            "个人风险", "社会风险", "可接受", "风险曲线",
            "非甲烷总烃", "无组织排放", "mg/m3", "标准值",
        ),
    }

    def __init__(self, collection_name: str = "eval_report"):
        """Initialise the retrieval service.

        Args:
            collection_name: Name of the vector-store collection to query.
        """
        self.collection_name = collection_name
        # drop_old=False is passed so the store is opened without requesting
        # that an existing collection be dropped.
        self.vector_store = VectorStore(collection_name=collection_name, drop_old=False)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _search_with_scores(
        self,
        query: str,
        top_k: int,
        filter_project: Optional[str] = None,
    ) -> List[Tuple["Document", Any]]:
        """Run one similarity search and return ``(document, score)`` pairs.

        When ``filter_project`` is truthy, the project UUID is appended to the
        query text and hits whose ``project_uuid`` metadata differs are
        dropped afterwards.
        """
        if filter_project:
            full_query = f"{query} 项目 UUID:{filter_project}"
        else:
            full_query = query

        hits = self.vector_store.similarity_search_with_score(full_query, k=top_k)
        return [
            (doc, score)
            for doc, score in hits
            if not filter_project or doc.metadata.get("project_uuid") == filter_project
        ]

    @staticmethod
    def _coerce_score(score: Any) -> float:
        """Convert a store-provided score to a plain float (0.0 if impossible)."""
        try:
            return float(score)
        except (TypeError, ValueError):
            # Fall back to the value the result dicts used before scores
            # were propagated.
            return 0.0

    @staticmethod
    def _to_material(doc: "Document", score: Any) -> Dict[str, Any]:
        """Flatten a document and its similarity score into a result dict."""
        meta = doc.metadata
        return {
            "content": doc.page_content,
            "heading": meta.get("heading", ""),
            "heading_level": meta.get("heading_level", 0),
            "doc_id": meta.get("doc_id", ""),
            "path": meta.get("path", ""),
            # A score already stored in metadata wins (previous behaviour);
            # otherwise report the score returned by the similarity search
            # instead of a constant 0.0.
            "score": meta.get("score", RetrievalService._coerce_score(score)),
        }

    def _collect_materials(
        self,
        keywords: Tuple[str, ...],
        project_uuid: str,
        top_k: int,
    ) -> List[Dict[str, Any]]:
        """Search each keyword in order and return up to ``top_k`` unique hits.

        Hits are de-duplicated on (first 100 characters of content, heading)
        and the first occurrence is kept. Searching stops as soon as ``top_k``
        unique hits are collected; later keywords could only contribute rows
        that would be truncated anyway, so the result is unchanged while the
        number of vector-store queries drops.
        """
        if top_k <= 0:
            return []

        seen = set()
        materials: List[Dict[str, Any]] = []
        for keyword in keywords:
            hits = self._search_with_scores(keyword, self._PER_KEYWORD_TOP_K, project_uuid)
            for doc, score in hits:
                key = (doc.page_content[:100], doc.metadata.get("heading", ""))
                if key in seen:
                    continue
                seen.add(key)
                materials.append(self._to_material(doc, score))
                if len(materials) >= top_k:
                    return materials
        return materials

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def search_by_query(
        self,
        query: str,
        top_k: int = 10,
        filter_project: Optional[str] = None,
    ) -> List["Document"]:
        """Retrieve materials relevant to a free-text query.

        Args:
            query: Query text, e.g. "项目背景", "财务评价", "技术方案".
            top_k: Number of hits requested from the vector store.
            filter_project: Optional project UUID used to restrict results.

        Returns:
            The matching documents. Project filtering is applied after the
            search, so fewer than ``top_k`` documents may be returned.
        """
        return [doc for doc, _ in self._search_with_scores(query, top_k, filter_project)]

    def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve materials for a report category.

        Args:
            category: Category name such as "项目概况", "技术方案", "财务评价"
                or "效益分析". An unknown category is used as its own single
                search keyword.
            project_uuid: Project UUID used to restrict results.
            top_k: Maximum number of results to return.

        Returns:
            A list of dicts with the keys ``content``, ``heading``,
            ``heading_level``, ``doc_id``, ``path`` and ``score``.
        """
        keywords = self._CATEGORY_KEYWORDS.get(category, (category,))
        return self._collect_materials(keywords, project_uuid, top_k)

    def get_project_materials(self, project_uuid: str) -> Dict[str, Any]:
        """Fetch the main material groups of a project.

        Args:
            project_uuid: Project UUID used to restrict results.

        Returns:
            A dict with the keys ``basic_info``, ``tech_info``,
            ``finance_info`` and ``benefit_info``; each value is a list of
            document texts (at most five per group).
        """
        section_queries = {
            "basic_info": "项目概况 项目基本情况",
            "tech_info": "技术方案 工艺技术",
            "finance_info": "财务评价 经济效益",
            "benefit_info": "效益分析 社会效益",
        }
        return {
            section: [
                doc.page_content
                for doc in self.search_by_query(section_query, top_k=5, filter_project=project_uuid)
            ]
            for section, section_query in section_queries.items()
        }

    def search_similar_report(self, reference_content: str, top_k: int = 5) -> List["Document"]:
        """Retrieve documents similar to a reference report.

        Args:
            reference_content: Text of the reference report. Its leading
                portion (after stripping whitespace) is used as the query.
                When it is empty, a generic report query is used instead.
            top_k: Number of hits requested from the vector store.

        Returns:
            The documents returned by the similarity search.
        """
        # Previously the reference text was ignored and a fixed query was
        # always sent; use the supplied text, bounded in length.
        snippet = (reference_content or "").strip()[: self._MAX_REFERENCE_QUERY_CHARS]
        query = snippet or self._DEFAULT_REPORT_QUERY
        hits = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [doc for doc, _ in hits]

    def get_template_data(
        self,
        project_uuid: str,
        query: str = "项目概况 技术方案 财务评价",
        top_k: int = 15,
    ) -> Dict[str, Any]:
        """Retrieve materials and map them onto the report template.

        Args:
            project_uuid: Project UUID used to restrict results.
            query: Query text used for retrieval.
            top_k: Number of hits requested from the vector store.

        Returns:
            A dict with the keys ``materials`` (documents),
            ``materials_text`` (their texts), ``template_data`` and
            ``key_info``. All four keys are present even when nothing was
            retrieved.
        """
        # Imported inside the method, as in the original code.
        from report_template import ReportTemplate

        materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid)

        if not materials:
            # Same keys as the populated result so callers can index
            # "materials_text" unconditionally.
            return {
                "materials": [],
                "materials_text": [],
                "template_data": {},
                "key_info": {},
            }

        texts = [doc.page_content for doc in materials]

        # Each helper receives its own copy so that neither can affect the
        # other or the list returned to the caller.
        key_info = ReportTemplate.extract_key_info(list(texts))
        template_data = ReportTemplate.map_materials_to_template(list(texts))

        return {
            "materials": list(materials),
            "materials_text": texts,
            "template_data": template_data,
            "key_info": key_info,
        }

    def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve materials for a report chapter.

        Args:
            project_uuid: Project UUID used to restrict results.
            chapter: Chapter name. An unknown chapter is used as its own
                single search keyword.
            top_k: Maximum number of results to return.

        Returns:
            A list of dicts with the keys ``content``, ``heading``,
            ``heading_level``, ``doc_id``, ``path`` and ``score``.
        """
        keywords = self._CHAPTER_KEYWORDS.get(chapter, (chapter,))
        return self._collect_materials(keywords, project_uuid, top_k)
|
||
|
||
|
||
# Retrieval examples
if __name__ == "__main__":
    # Create the retrieval service instance.
    service = RetrievalService()

    # (title printed before the results, query text) for each example.
    examples = (
        ("示例 1:搜索项目背景", "项目背景 建设内容"),
        ("示例 2:搜索财务评价", "财务评价 现金流量"),
    )

    for title, example_query in examples:
        print(title)
        for hit in service.search_by_query(example_query, top_k=3):
            print(f"标题:{hit.metadata.get('heading', 'N/A')}")
            print(f"内容:{hit.page_content[:200]}...\n")
|