""" services/retrieval_service.py 后评价报告材料检索服务 用于从向量库中检索与后评价报告相关的材料 """ from typing import List, Dict, Any, Optional from langchain_core.documents import Document from function.vector_store import VectorStore class RetrievalService: """后评价报告材料检索服务""" def __init__(self, collection_name: str = "eval_report"): """ 初始化检索服务 Args: collection_name: 向量库集合名称 """ self.collection_name = collection_name self.vector_store = VectorStore(collection_name=collection_name, drop_old=False) def search_by_query(self, query: str, top_k: int = 10, filter_project: Optional[str] = None) -> List[Document]: """ 根据查询语句检索相关材料 Args: query: 查询语句,例如"项目背景"、"财务评价"、"技术方案" top_k: 返回结果数量 filter_project: 可选的项目 UUID 过滤 Returns: 检索到的文档列表 """ # 构建查询语句 if filter_project: full_query = f"{query} 项目 UUID:{filter_project}" else: full_query = query # 执行检索 results = self.vector_store.similarity_search_with_score(full_query, k=top_k) # 过滤并返回文档 docs = [] for doc, score in results: # 如果指定了项目过滤,检查文档是否属于该项目 if filter_project and doc.metadata.get("project_uuid") != filter_project: continue docs.append(doc) return docs def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]: """ 根据类别检索材料 Args: category: 类别,如"项目概况"、"技术方案"、"财务评价"、"效益分析" project_uuid: 项目 UUID top_k: 返回结果数量 Returns: 检索结果列表,包含文档内容和元数据 """ # 定义类别对应的检索关键词 category_keywords = { "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"], "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"], "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"], "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"], "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"], "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"], } # 使用多个关键词进行检索 all_docs = [] for keyword in category_keywords.get(category, [category]): docs = self.search_by_query(keyword, top_k=5, filter_project=project_uuid) all_docs.extend(docs) # 去重并返回 seen = set() unique_docs = [] for doc in all_docs: key = (doc.page_content[:100], doc.metadata.get("heading", "")) if key not in seen: seen.add(key) unique_docs.append(doc) # 转换为字典格式 result = [] for doc in unique_docs[:top_k]: result.append({ "content": doc.page_content, "heading": doc.metadata.get("heading", ""), "heading_level": doc.metadata.get("heading_level", 0), "doc_id": doc.metadata.get("doc_id", ""), "path": doc.metadata.get("path", ""), "score": doc.metadata.get("score", 0.0), }) return result def get_project_materials(self, project_uuid: str) -> Dict[str, Any]: """ 获取项目的所有相关材料 Args: project_uuid: 项目 UUID Returns: 包含项目所有材料的字典 """ # 检索项目基本信息 basic_info = self.search_by_query( "项目概况 项目基本情况", top_k=5, filter_project=project_uuid ) # 检索技术方案 tech_info = self.search_by_query( "技术方案 工艺技术", top_k=5, filter_project=project_uuid ) # 检索财务信息 finance_info = self.search_by_query( "财务评价 经济效益", top_k=5, filter_project=project_uuid ) # 检索效益分析 benefit_info = self.search_by_query( "效益分析 社会效益", top_k=5, filter_project=project_uuid ) return { "basic_info": [doc.page_content for doc in basic_info], "tech_info": [doc.page_content for doc in tech_info], "finance_info": [doc.page_content for doc in finance_info], "benefit_info": [doc.page_content for doc in benefit_info], } def search_similar_report(self, reference_content: str, top_k: int = 5) -> List[Document]: """ 根据参考内容检索相似报告 Args: reference_content: 参考报告内容 top_k: 返回结果数量 Returns: 相似报告列表 """ # 提取关键信息用于检索 query = f"后评价报告 项目概况 技术方案 财务评价" results = self.vector_store.similarity_search_with_score(query, k=top_k) docs = [] for doc, score in results: docs.append(doc) return docs def get_template_data(self, project_uuid: str, query: str = "项目概况 技术方案 财务评价", top_k: int = 15) -> Dict[str, Any]: """ 获取符合模板要求的数据 Args: project_uuid: 项目 UUID query: 检索查询语句 top_k: 检索结果数量 Returns: 符合模板字段要求的数据字典 """ from report_template import ReportTemplate # 检索材料 materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid) if not materials: return { "materials": [], "template_data": {}, "key_info": {} } # 提取关键信息 key_info = ReportTemplate.extract_key_info([doc.page_content for doc in materials]) # 映射到模板字段 template_data = ReportTemplate.map_materials_to_template([doc.page_content for doc in materials]) return { "materials": [doc for doc in materials], "materials_text": [doc.page_content for doc in materials], "template_data": template_data, "key_info": key_info } def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]: """ 获取指定章节的材料 Args: project_uuid: 项目 UUID chapter: 章节名称 top_k: 返回结果数量 Returns: 材料列表 """ # 定义章节对应的检索关键词 chapter_keywords = { "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"], "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"], "项目全过程总结与管理评价": [ # ---- 强优先:表1~表14 + 编号小节 ---- "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6", "表1原料数量及组成对比表", "表2原料性质对比表", "表3前期预测和2019年实际产品对比表", "表4装置规模及实际运行负荷对比表", "表5项目规模对比表", "表6可研报告与基础设计阶段工程内容对比表", "表7项目承包商的招投标情况表", "表8项目设计主要进度控制情况表", "表9施工图设计变更情况表", "表10重大设计变更情况表", "表11主要设备采购情况表", "表12施工重要节点进度表", "表13原料性质对比表", "表14主要标定结果与设计指标对比表", # ---- 次优先:结构性关键词 ---- "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价", "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备", "工程监理", "HSE", "竣工验收", "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果", "原料数量及组成对比", "装置规模", "负荷率", ], "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"], "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"], "项目目标和可持续性评价": [ # 强优先:章节标题与编号 "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5", "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价", # 目标实现(工程/技术/经济) "工程规模", "项目进度", "工程质量", "项目功能", "投资控制", "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核", "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润", # 对标 "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃", # 持续性(资源/产品/内部/政策) "资源分析", "原料供应", "资源保障", "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB", "项目内部因素", "装置规模合理性", "工艺方案", "技术水平", "国家政策", "产业政策", "质量标准", # 若材料以安全/环保合规支撑持续性 "个人风险", "社会风险", "可接受", "风险曲线", "非甲烷总烃", "无组织排放", "mg/m3", "标准值", ], "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"], "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"], } keywords = chapter_keywords.get(chapter, [chapter]) # 使用多个关键词进行检索 all_docs = [] for keyword in keywords: docs = self.search_by_query(keyword, top_k=5, filter_project=project_uuid) all_docs.extend(docs) # 去重并返回 seen = set() unique_docs = [] for doc in all_docs: key = (doc.page_content[:100], doc.metadata.get("heading", "")) if key not in seen: seen.add(key) unique_docs.append(doc) # 转换为字典格式 result = [] for doc in unique_docs[:top_k]: result.append({ "content": doc.page_content, "heading": doc.metadata.get("heading", ""), "heading_level": doc.metadata.get("heading_level", 0), "doc_id": doc.metadata.get("doc_id", ""), "path": doc.metadata.get("path", ""), "score": doc.metadata.get("score", 0.0), }) return result # 检索示例 if __name__ == "__main__": # 创建检索服务实例 service = RetrievalService() # 示例 1:搜索项目背景 print("示例 1:搜索项目背景") docs = service.search_by_query("项目背景 建设内容", top_k=3) for doc in docs: print(f"标题:{doc.metadata.get('heading', 'N/A')}") print(f"内容:{doc.page_content[:200]}...\n") # 示例 2:搜索财务评价 print("示例 2:搜索财务评价") docs = service.search_by_query("财务评价 现金流量", top_k=3) for doc in docs: print(f"标题:{doc.metadata.get('heading', 'N/A')}") print(f"内容:{doc.page_content[:200]}...\n")