Merge origin/main — keep local version

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
xxy 2026-06-05 18:48:31 +08:00
commit bf3d340aa8
29 changed files with 12115 additions and 0 deletions

464
database/init.sql Normal file
View File

@ -0,0 +1,464 @@
-- 智能报告生成平台 - 数据库初始化脚本
-- 数据库名建议:post_eval_report
-- 适用于 MySQL
-- 创建数据库(可选)
-- CREATE DATABASE IF NOT EXISTS post_eval_report DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
-- USE post_eval_report;
-- 项目(统一:知识库 + 撰写)
-- uuid 由应用层生成,避免 MySQL 8/9 对生成列函数限制导致初始化失败
CREATE TABLE IF NOT EXISTS projects (
id INT AUTO_INCREMENT PRIMARY KEY,
uuid VARCHAR(32) NOT NULL UNIQUE,
name VARCHAR(255) NOT NULL,
description TEXT,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
doc_count INT DEFAULT 0,
eval_reports_count INT DEFAULT 0,
total_size VARCHAR(32) DEFAULT '0 B',
tags TEXT,
status VARCHAR(16) DEFAULT 'active',
color VARCHAR(16) DEFAULT '#3b82f6',
sync_suppressed_table_names LONGTEXT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_projects_created_at ON projects(created_at);
CREATE INDEX idx_projects_updated_at ON projects(updated_at);
CREATE INDEX idx_projects_status ON projects(status);
-- 知识库目录表:project_id 关联 projects.uuid,parent_id 形成目录树
CREATE TABLE IF NOT EXISTS kb_directories (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
parent_id VARCHAR(64) NULL,
name VARCHAR(255) NOT NULL,
full_path VARCHAR(1024) NOT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (parent_id) REFERENCES kb_directories(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_kb_dirs_project ON kb_directories(project_id);
CREATE INDEX idx_kb_dirs_parent ON kb_directories(parent_id);
-- 知识库文档(status: 0=失败 2=排队中 3=处理中 4=可用)
CREATE TABLE IF NOT EXISTS kb_documents (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
directory_id VARCHAR(64) NULL,
name VARCHAR(255) NOT NULL,
size VARCHAR(32) NOT NULL,
file_path VARCHAR(512),
storage_rel_path VARCHAR(512) NULL COMMENT '项目内完整相对路径(含文件名)',
word_count INT DEFAULT 0,
uploaded_at DATETIME NOT NULL,
status INT DEFAULT 2,
error_message TEXT NULL,
factor JSON NULL COMMENT '文档要素 JSON 数组',
category VARCHAR(32) NULL DEFAULT NULL COMMENT '文件分类',
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (directory_id) REFERENCES kb_directories(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_kb_docs_project ON kb_documents(project_id);
CREATE INDEX idx_kb_docs_directory ON kb_documents(directory_id);
-- 若已有 kb_documents 表,执行以下语句添加 word_count 字段:
-- ALTER TABLE kb_documents ADD COLUMN word_count INT DEFAULT 0 AFTER file_path;
-- 撰写文档(project_id 关联 projects.uuid,与 kb_documents 一致)
CREATE TABLE IF NOT EXISTS write_documents (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
title VARCHAR(255) NOT NULL,
content LONGTEXT,
word_count INT DEFAULT 0,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
status VARCHAR(16) DEFAULT 'draft',
sort_order INT DEFAULT 0,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_write_docs_project ON write_documents(project_id);
-- 文档版本
CREATE TABLE IF NOT EXISTS doc_versions (
id VARCHAR(64) PRIMARY KEY,
document_id VARCHAR(64) NOT NULL,
version VARCHAR(32) NOT NULL,
content LONGTEXT NOT NULL,
citation_payload LONGTEXT NULL,
saved_at DATETIME NOT NULL,
author VARCHAR(64) NOT NULL,
note TEXT,
FOREIGN KEY (document_id) REFERENCES write_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_versions_doc ON doc_versions(document_id);
-- 要素表定义(全局/时间)
CREATE TABLE IF NOT EXISTS element_tables (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
table_type VARCHAR(32) NOT NULL,
table_name VARCHAR(255) NOT NULL,
year INT NULL,
is_time_dimension TINYINT(1) DEFAULT 0,
sort_order INT DEFAULT 0,
sync_suppressed_row_keys LONGTEXT NULL,
custom_row_order LONGTEXT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_element_tables_project ON element_tables(project_id);
CREATE INDEX idx_element_tables_type_year ON element_tables(table_type, year);
CREATE INDEX idx_element_tables_name ON element_tables(table_name);
-- 要素单元格
CREATE TABLE IF NOT EXISTS element_cells (
id VARCHAR(64) PRIMARY KEY,
table_id VARCHAR(64) NOT NULL,
project_id VARCHAR(32) NOT NULL,
row_key VARCHAR(255) NOT NULL,
col_key VARCHAR(255) NULL,
year INT NULL,
value LONGTEXT NULL,
source_document_id VARCHAR(64) NULL,
source_line_no INT NULL,
source_line_end INT NULL,
source_quote TEXT NULL,
confidence FLOAT NULL,
extraction_batch_id VARCHAR(64) NULL,
extraction_model VARCHAR(128) NULL,
source_type VARCHAR(16) NULL COMMENT 'extract=文档抽取, manual=手工输入',
conflict_status VARCHAR(16) DEFAULT 'none',
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE CASCADE,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_element_cells_project ON element_cells(project_id);
CREATE INDEX idx_element_cells_row_col ON element_cells(row_key, col_key);
CREATE INDEX idx_element_cells_year ON element_cells(year);
-- 抽取结果留存(table/element)
CREATE TABLE IF NOT EXISTS extraction_results (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
document_id VARCHAR(64) NOT NULL,
batch_id VARCHAR(64) NOT NULL,
result_type VARCHAR(16) NOT NULL,
table_type VARCHAR(32) NULL,
table_name VARCHAR(255) NULL,
year INT NULL,
item_key VARCHAR(255) NOT NULL,
item_value LONGTEXT NULL,
source_line_no INT NULL,
source_line_end INT NULL,
confidence FLOAT NULL,
raw_payload JSON NULL,
extracted_at DATETIME NULL,
created_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_extraction_project_doc ON extraction_results(project_id, document_id);
CREATE INDEX idx_extraction_batch ON extraction_results(batch_id);
CREATE INDEX idx_extraction_table_name ON extraction_results(table_name);
CREATE INDEX idx_extraction_key ON extraction_results(item_key);
-- 要素抽取结果明细(面向“细则章节/小节提示词 -> 项目材料”)
CREATE TABLE IF NOT EXISTS element_extraction_results (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
table_type VARCHAR(32) NOT NULL,
year INT NULL,
table_name VARCHAR(255) NOT NULL,
extracted_at DATETIME NOT NULL,
item_key VARCHAR(255) NOT NULL,
item_value LONGTEXT NULL,
source_document_id VARCHAR(64) NULL,
source_line_no INT NULL,
source_line_end INT NULL,
created_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_el_ext_project ON element_extraction_results(project_id);
CREATE INDEX idx_el_ext_table ON element_extraction_results(table_type, year, table_name);
CREATE INDEX idx_el_ext_key ON element_extraction_results(item_key);
CREATE INDEX idx_el_ext_source_doc ON element_extraction_results(source_document_id);
-- 冲突记录
CREATE TABLE IF NOT EXISTS element_conflicts (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
table_id VARCHAR(64) NULL,
cell_id VARCHAR(64) NULL,
item_key VARCHAR(255) NOT NULL,
old_value LONGTEXT NULL,
new_value LONGTEXT NULL,
selected_value LONGTEXT NULL,
source_document_id VARCHAR(64) NULL,
source_line_no INT NULL,
status VARCHAR(16) DEFAULT 'pending',
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_element_conflicts_project ON element_conflicts(project_id);
CREATE INDEX idx_element_conflicts_status ON element_conflicts(status);
-- 文档 markdown 落库
CREATE TABLE IF NOT EXISTS document_markdowns (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
document_id VARCHAR(64) NOT NULL,
extracted_filename VARCHAR(255) NULL,
markdown_content LONGTEXT NOT NULL,
content_hash VARCHAR(64) NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_markdowns_project_doc ON document_markdowns(project_id, document_id);
-- 文档段落切分
CREATE TABLE IF NOT EXISTS document_chunks (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
document_id VARCHAR(64) NOT NULL,
markdown_id VARCHAR(64) NULL,
heading VARCHAR(512) NULL,
chunk_text LONGTEXT NOT NULL,
chunk_index INT DEFAULT 0,
source_line_start INT NULL,
source_line_end INT NULL,
vector_id VARCHAR(128) NULL,
created_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE,
FOREIGN KEY (markdown_id) REFERENCES document_markdowns(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_chunks_project_doc ON document_chunks(project_id, document_id);
CREATE INDEX idx_chunks_heading ON document_chunks(heading(255));
-- 独立后台任务:pdf2md 文件处理与 element-agent 要素抽取
CREATE TABLE IF NOT EXISTS tasks (
id VARCHAR(64) PRIMARY KEY,
project VARCHAR(64) NOT NULL,
task_type INT NOT NULL,
file_id VARCHAR(64) NULL,
file_path VARCHAR(1024) NULL,
status INT NOT NULL DEFAULT 1,
payload_json JSON NULL,
result_path VARCHAR(1024) NULL,
error_message LONGTEXT NULL,
add_time DATETIME NOT NULL,
finish_time DATETIME NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_tasks_status_type_time ON tasks(status, task_type, add_time);
CREATE INDEX idx_tasks_project ON tasks(project);
CREATE INDEX idx_tasks_file_id ON tasks(file_id);
-- 模板管理
CREATE TABLE IF NOT EXISTS report_templates (
id VARCHAR(64) PRIMARY KEY,
name VARCHAR(255) NOT NULL,
description TEXT NULL,
is_default TINYINT(1) DEFAULT 0,
is_active TINYINT(1) DEFAULT 1,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_templates_default ON report_templates(is_default);
CREATE TABLE IF NOT EXISTS report_template_sections (
id VARCHAR(64) PRIMARY KEY,
template_id VARCHAR(64) NOT NULL,
section_key VARCHAR(64) NOT NULL,
section_title VARCHAR(255) NOT NULL,
section_prompt LONGTEXT NULL,
section_output_contract LONGTEXT NULL,
section_order INT DEFAULT 0,
examples LONGTEXT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_template_sections_template ON report_template_sections(template_id);
-- 报告生成任务(7 章分章异步)
CREATE TABLE IF NOT EXISTS report_generation_jobs (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
template_id VARCHAR(64) NULL,
status VARCHAR(16) DEFAULT 'pending',
progress INT DEFAULT 0,
current_section_key VARCHAR(64) NULL,
error_message TEXT NULL,
requested_by VARCHAR(64) NULL,
options JSON NULL,
snapshot JSON NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
completed_at DATETIME NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_report_jobs_project ON report_generation_jobs(project_id);
CREATE INDEX idx_report_jobs_status ON report_generation_jobs(status);
CREATE TABLE IF NOT EXISTS report_generation_chapters (
id VARCHAR(64) PRIMARY KEY,
job_id VARCHAR(64) NOT NULL,
section_key VARCHAR(64) NOT NULL,
section_title VARCHAR(255) NOT NULL,
section_order INT DEFAULT 0,
status VARCHAR(16) DEFAULT 'pending',
content LONGTEXT NULL,
prompt_text LONGTEXT NULL,
evidence_payload JSON NULL,
validation_payload JSON NULL,
error_message TEXT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
completed_at DATETIME NULL,
FOREIGN KEY (job_id) REFERENCES report_generation_jobs(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_report_chapters_job ON report_generation_chapters(job_id);
CREATE INDEX idx_report_chapters_status ON report_generation_chapters(status);
-- 最小 RBAC
CREATE TABLE IF NOT EXISTS departments (
id VARCHAR(64) PRIMARY KEY,
name VARCHAR(255) NOT NULL,
description TEXT NULL,
parent_id VARCHAR(64) NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (parent_id) REFERENCES departments(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS users (
id VARCHAR(64) PRIMARY KEY,
username VARCHAR(64) NOT NULL UNIQUE,
password_hash VARCHAR(255) NULL,
department_id VARCHAR(64) NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL,
FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_users_department ON users(department_id);
CREATE TABLE IF NOT EXISTS roles (
id VARCHAR(64) PRIMARY KEY,
name VARCHAR(64) NOT NULL UNIQUE,
description TEXT NULL,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE TABLE IF NOT EXISTS permissions (
id VARCHAR(64) PRIMARY KEY,
perm_key VARCHAR(128) NOT NULL UNIQUE,
perm_type VARCHAR(32) NOT NULL,
description TEXT NULL,
created_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_permissions_type ON permissions(perm_type);
-- Role-to-permission assignments (many-to-many join table).
CREATE TABLE IF NOT EXISTS role_permissions (
    id VARCHAR(64) PRIMARY KEY,
    role_id VARCHAR(64) NOT NULL,
    permission_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE,
    FOREIGN KEY (permission_id) REFERENCES permissions(id) ON DELETE CASCADE,
    -- A permission can be granted to a role only once; same pattern as
    -- uq_project_department on project_departments.
    UNIQUE KEY uq_role_permission (role_id, permission_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- User-to-role assignments (many-to-many join table).
CREATE TABLE IF NOT EXISTS user_roles (
    id VARCHAR(64) PRIMARY KEY,
    user_id VARCHAR(64) NOT NULL,
    role_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE,
    -- A role can be assigned to a user only once; same pattern as
    -- uq_project_department on project_departments.
    UNIQUE KEY uq_user_role (user_id, role_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Project membership: which user belongs to which project, and in what role.
CREATE TABLE IF NOT EXISTS project_members (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    user_id VARCHAR(64) NOT NULL,
    role VARCHAR(32) DEFAULT 'editor',
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
    -- A user can be a member of a project only once; same pattern as
    -- uq_project_department on project_departments.
    UNIQUE KEY uq_project_member (project_id, user_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_project_members_project ON project_members(project_id);
CREATE TABLE IF NOT EXISTS project_departments (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
department_id VARCHAR(64) NOT NULL,
created_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE CASCADE,
UNIQUE KEY uq_project_department (project_id, department_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_project_departments_project ON project_departments(project_id);
-- 回填记录:每次要素回填均留痕,支持证据追溯
CREATE TABLE IF NOT EXISTS fill_records (
id VARCHAR(64) PRIMARY KEY,
project_id VARCHAR(32) NOT NULL,
cell_id VARCHAR(64) NULL,
table_id VARCHAR(64) NULL,
row_key VARCHAR(255) NOT NULL,
col_key VARCHAR(255) NULL,
year INT NULL,
filled_value LONGTEXT NULL,
previous_value LONGTEXT NULL,
source_document_id VARCHAR(64) NULL,
source_document_name VARCHAR(255) NULL COMMENT '冗余存储文档名,文档删除后仍可追溯',
source_line_no INT NULL,
source_line_end INT NULL,
source_quote TEXT NULL COMMENT '原文摘录片段,作为回填依据',
confidence FLOAT NULL,
extraction_batch_id VARCHAR(64) NULL,
extraction_model VARCHAR(128) NULL COMMENT '使用的 LLM 模型标识',
fill_type VARCHAR(16) NOT NULL DEFAULT 'auto' COMMENT 'auto=抽取回填, manual=人工编辑, resolve=冲突解决',
created_at DATETIME NOT NULL,
FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_fill_records_project ON fill_records(project_id);
CREATE INDEX idx_fill_records_cell ON fill_records(cell_id);
CREATE INDEX idx_fill_records_batch ON fill_records(extraction_batch_id);
CREATE INDEX idx_fill_records_source_doc ON fill_records(source_document_id);
CREATE INDEX idx_fill_records_created ON fill_records(created_at);
-- ============================================================
-- report_section_references:章节参考范文
-- ============================================================
CREATE TABLE IF NOT EXISTS report_section_references (
id VARCHAR(64) PRIMARY KEY,
template_id VARCHAR(64) NULL COMMENT '关联模板IDreport_templates.id按模板过滤参考范文',
source_file VARCHAR(255) NOT NULL COMMENT '来源文件名',
section_key VARCHAR(64) NOT NULL COMMENT '章节标识,如 1.1、2.1.1',
section_title VARCHAR(255) NOT NULL COMMENT '章节标题',
section_order INT DEFAULT 0 COMMENT '章节序号',
content TEXT NOT NULL COMMENT '该章节的参考范文 Markdown',
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_ref_source_file ON report_section_references(source_file);
CREATE INDEX idx_ref_section_key ON report_section_references(section_key);
CREATE INDEX idx_ref_template_id ON report_section_references(template_id);

View File

@ -0,0 +1,3 @@
-- 为 report_section_references 增加 template_id,按模板过滤参考范文
ALTER TABLE report_section_references ADD COLUMN template_id VARCHAR(64) NULL COMMENT '关联模板IDreport_templates.id按模板过滤参考范文';
CREATE INDEX idx_ref_template_id ON report_section_references(template_id);

1
function/__init__.py Normal file
View File

@ -0,0 +1 @@
# function 包

550
function/vector_store.py Normal file
View File

@ -0,0 +1,550 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
function/vector_store.py
向量库模块 - kb_service 项目集成
已修改:drop_old 全部 = False,不会删除已有集合
已修复:413 超长 token 问题(语义友好版)
"""
import re
import json
import logging
from typing import Dict, List, Optional, Tuple
from pathlib import Path
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_milvus import Milvus, BM25BuiltInFunction
from pymilvus import MilvusClient, connections
from config import settings
logger = logging.getLogger(__name__)
# ============================================================================
# Configuration
# ============================================================================
# Collection used when a caller does not pass an explicit collection name.
COLLECTION_NAME = "eval_report"
# Embedding endpoint, its API key and the Milvus URI all come from project settings.
EMBEDDING_API_BASE = settings.EMBEDDING_API_BASE
EMBEDDING_API_KEY = settings.EMBEDDING_API_KEY
MILVUS_DB_URL = settings.MILVUS_DB_URL
# Passed to the Milvus store as consistency_level / auto_id.
CONSISTENCY_LEVEL = "Bounded"
AUTO_ID = True
# Index parameters for the dense vector field.
METRIC_TYPE = "COSINE"
INDEX_TYPE = "AUTOINDEX"
# Index parameters for the sparse (BM25) vector field.
SPARSE_METRIC_TYPE = "BM25"
SPARSE_INDEX_TYPE = "SPARSE_INVERTED_INDEX"
def _embedding_batch_limits() -> tuple[int, int, int]:
    """Return the embedding batching limits as ``(max_docs, max_chars, max_chunk)``.

    Each value is read from ``settings``; a missing or falsy setting falls back
    to its default. Results are clamped to a floor: at least 1 document per
    batch and at least 512 characters for both size limits.
    """

    def _limit(setting_name: str, default: int, floor: int) -> int:
        configured = getattr(settings, setting_name, default) or default
        return max(floor, int(configured))

    return (
        _limit("EMBEDDING_BATCH_MAX_DOCS", 4, 1),
        _limit("EMBEDDING_BATCH_MAX_CHARS", 12000, 512),
        _limit("EMBEDDING_MAX_CHUNK_CHARS", 4000, 512),
    )
def _is_embedding_backend_oom(exc: BaseException) -> bool:
msg = str(exc).lower()
return (
"out of memory" in msg
or "npu out of memory" in msg
or "cuda out of memory" in msg
or "error code: 424" in msg
or "'code': 424" in msg
)
def _add_documents_batch_with_retry(vs: Milvus, batch: List[Document]) -> List[str]:
    """Write one batch of documents, bisecting and retrying on embedding OOM.

    An empty batch returns ``[]`` without touching the store. A failure that is
    not recognised as a backend OOM, or an OOM on a single-document batch, is
    re-raised unchanged. Otherwise the batch is split in two halves which are
    written recursively, and the ids of both halves are returned in order.
    """
    if not batch:
        return []
    try:
        written = vs.add_documents(batch)
        return list(written)
    except Exception as err:
        is_oom = _is_embedding_backend_oom(err)
        if not (is_oom and len(batch) > 1):
            raise
        # len(batch) >= 2 here, so both halves are non-empty.
        split_at = max(1, len(batch) // 2)
        logger.warning(
            "embedding 批次 OOM拆分为 %s + %s 重试",
            split_at,
            len(batch) - split_at,
        )
        first_half = _add_documents_batch_with_retry(vs, batch[:split_at])
        second_half = _add_documents_batch_with_retry(vs, batch[split_at:])
        return [*first_half, *second_half]
def _register_milvus_client_for_orm(client: MilvusClient) -> None:
    """Register a ``MilvusClient`` connection with the pymilvus ORM under the same alias.

    pymilvus 2.6+ ``MilvusClient`` uses a ConnectionManager, while the ORM
    ``Collection`` still resolves ``pymilvus.orm.connections`` by
    ``client._using``. langchain-milvus touches ``Collection`` inside
    ``Milvus.__init__``, so this must run (with a bootstrap client) before a
    ``Milvus`` store is constructed. It is a no-op when the alias is already known.
    """
    alias = client._using
    if not connections.has_connection(alias):
        client_cfg = client._config
        # Relies on pymilvus private attributes; share the client's handler
        # with the ORM registry and record the address/db it points at.
        connections._alias_handlers[alias] = client._handler
        connections._alias_config[alias] = {
            "address": client_cfg.address,
            "user": "",
            "db_name": client_cfg.db_name or "default",
        }
# ============================================================================
# VectorStore 类(已全部改为 drop_old=False)
# ============================================================================
class VectorStore:
    """Wrapper around a hybrid (dense + BM25 sparse) Milvus collection.

    The ``langchain_milvus.Milvus`` store is created lazily on first use and
    cached on the instance. The collection is always opened with
    ``drop_old=False``, so this class never drops an existing collection.
    """

    def __init__(
        self,
        collection_name: str = COLLECTION_NAME,
        drop_old: bool = False,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
    ):
        """Record configuration only; no connection is opened here.

        Args:
            collection_name: Milvus collection to read from / write to.
            drop_old: Forwarded to ``_get_milvus`` on the first
                ``add_documents`` call, then reset to ``False``.
            chunk_size: Stored on the instance; not read by any method of this class.
            chunk_overlap: Stored on the instance; not read by any method of this class.
        """
        self.collection_name = collection_name
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self._drop_old = drop_old
        self._milvus = None

    def _get_embeddings(self):
        """Build the OpenAI-compatible embedding client for the ``bge-m3`` model."""
        return OpenAIEmbeddings(
            base_url=EMBEDDING_API_BASE,
            api_key=EMBEDDING_API_KEY,
            model="bge-m3",
            check_embedding_ctx_length=False,
        )

    def _get_milvus(self, drop_old: bool = False) -> Milvus:
        """Return the Milvus store, creating it on first use.

        Args:
            drop_old: When true the cached instance is bypassed and a new store
                object is built. The collection itself is still opened with
                ``drop_old=False``; existing data is never dropped.

        Raises:
            ValueError: If ``MILVUS_DB_URL`` is not configured.
            Exception: Any construction error is logged and re-raised.
        """
        if self._milvus is not None and not drop_old:
            logger.info("【VectorStore】复用已有 Milvus 实例")
            return self._milvus
        # Logged only when a store is actually being built.
        logger.info("【VectorStore】初始化 Milvus 混合向量存储dense + sparse")
        if not MILVUS_DB_URL:
            raise ValueError("MILVUS_DB_URL 未配置,请在 .env 中设置")
        embeddings = self._get_embeddings()
        logger.info("【VectorStore】Embedding 模型 bge-m3 初始化完成")
        try:
            # Register the ORM alias first: Milvus.__init__ touches Collection
            # and fails if the alias is unknown to the ORM registry.
            _register_milvus_client_for_orm(MilvusClient(uri=MILVUS_DB_URL))
            self._milvus = Milvus(
                embedding_function=embeddings,
                builtin_function=BM25BuiltInFunction(),
                vector_field=["dense", "sparse"],
                connection_args={"uri": MILVUS_DB_URL},
                collection_name=self.collection_name,
                consistency_level=CONSISTENCY_LEVEL,
                auto_id=AUTO_ID,
                drop_old=False,
                index_params=[
                    {"metric_type": METRIC_TYPE, "index_type": INDEX_TYPE},
                    {"metric_type": SPARSE_METRIC_TYPE, "index_type": SPARSE_INDEX_TYPE},
                ],
            )
            _register_milvus_client_for_orm(self._milvus.client)
            logger.info("✅ Milvus 混合向量存储初始化成功")
        except Exception as e:
            logger.error(f"❌ Milvus 初始化失败: {str(e)}", exc_info=True)
            raise
        return self._milvus

    def add_documents(self, documents: List[Document]) -> List[str]:
        """Embed and write documents in size-bounded batches; return the written ids.

        Documents longer than the configured chunk limit are re-split first.
        Writing is best-effort: a batch that fails is logged and skipped, so
        the returned list may cover only part of the input. An empty list
        means nothing was written.
        """
        if not documents:
            logger.info("【add_documents】无文档可写入")
            return []
        max_docs_per_batch, max_chars_per_batch, max_chunk_chars = _embedding_batch_limits()

        # Only documents that really exceed the limit are re-split, at
        # paragraph / line / sentence boundaries. The separators after "\n"
        # were all empty strings, which jumps straight to character-level
        # splitting; CJK sentence punctuation is used so the splitter can
        # honour sentence boundaries before that last resort.
        safe_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_chars,
            chunk_overlap=min(200, max(0, max_chunk_chars // 20)),
            separators=["\n\n", "\n", "。", "!", "?", ";", ",", ""],
        )
        safe_documents: List[Document] = []
        for doc in documents:
            if len(doc.page_content) > max_chunk_chars:
                for chunk in safe_splitter.split_text(doc.page_content):
                    if chunk.strip():
                        safe_documents.append(Document(
                            page_content=chunk,
                            metadata=doc.metadata.copy(),
                        ))
            else:
                safe_documents.append(doc)

        # The existing collection requires some metadata fields; not every
        # caller supplies them, so fill in fallbacks here.
        for idx, doc in enumerate(safe_documents):
            metadata = doc.metadata or {}
            if not metadata.get("doc_id"):
                project_uuid = metadata.get("project_uuid") or "unknown_project"
                heading = metadata.get("heading") or "chunk"
                metadata["doc_id"] = f"{project_uuid}:{heading}:{idx}"
            if "original_title" not in metadata:
                metadata["original_title"] = metadata.get("heading") or ""
            if "path" not in metadata:
                metadata["path"] = ""
            if "project_uuid" not in metadata:
                metadata["project_uuid"] = "unknown_project"
            doc.metadata = metadata
        logger.info(f"【add_documents】预处理后准备写入 {len(safe_documents)} 条文档")

        vs = self._get_milvus(drop_old=self._drop_old)
        self._drop_old = False

        ids: List[str] = []
        current_batch: List[Document] = []
        current_batch_chars = 0
        batch_num = 1
        failed_batches = 0

        def _flush_batch() -> None:
            """Write the pending batch (if any) and reset the accumulators."""
            nonlocal current_batch, current_batch_chars, batch_num, failed_batches
            if not current_batch:
                return
            logger.info(
                "【add_documents】写入批次 %s,数量:%s,约 %s 字符",
                batch_num,
                len(current_batch),
                current_batch_chars,
            )
            try:
                res = _add_documents_batch_with_retry(vs, current_batch)
                ids.extend(res)
                logger.info("✅ 批次写入成功,返回 ID 数:%s", len(res))
            except Exception as e:
                # Best-effort: skip this batch but remember that it failed.
                failed_batches += 1
                logger.error("❌ 批次写入失败: %s", e, exc_info=True)
            batch_num += 1
            current_batch = []
            current_batch_chars = 0

        for doc in safe_documents:
            doc_chars = len(doc.page_content or "")
            # Flush before adding when the pending batch is already full by
            # document count or would overflow the character budget.
            would_exceed_docs = bool(current_batch) and len(current_batch) >= max_docs_per_batch
            would_exceed_chars = bool(current_batch) and (
                current_batch_chars + doc_chars > max_chars_per_batch
            )
            if would_exceed_docs or would_exceed_chars:
                _flush_batch()
            current_batch.append(doc)
            current_batch_chars += doc_chars
        _flush_batch()

        if failed_batches:
            logger.warning(
                "【add_documents】%s of %s batches failed and were skipped",
                failed_batches,
                batch_num - 1,
            )
        logger.info(f"【add_documents】全部完成总写入 ID 数:{len(ids)}")
        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 10, filter: Optional[str] = None
    ) -> List[Tuple[Document, float]]:
        """Search through the langchain Milvus store; return ``(document, score)`` pairs.

        The query is truncated to 5000 characters. A ``None`` or blank query
        returns ``[]``, matching ``similarity_search_dense_filtered``.
        """
        q = (query or "")[:5000]
        if not q.strip():
            return []
        vs = self._get_milvus(drop_old=False)
        if filter:
            return vs.similarity_search_with_score(q, k=k, filter=filter)
        return vs.similarity_search_with_score(q, k=k)

    def similarity_search_dense_filtered(
        self,
        query: str,
        k: int,
        filter_expr: str,
    ) -> List[Tuple[Document, float]]:
        """Dense-vector ANN search combined with a Milvus scalar filter.

        On hybrid (dense + sparse) collections the langchain_milvus ``filter``
        may not take effect; the extraction-side recall uses this path to
        guarantee ``doc_id`` isolation. The query is truncated to 5000
        characters and a blank query returns ``[]``. The score is the raw
        ``distance`` reported by Milvus, or ``0.0`` when it is missing or not
        numeric.
        """
        q = (query or "")[:5000]
        if not q.strip():
            return []
        emb = self._get_embeddings().embed_query(q)
        client = MilvusClient(uri=MILVUS_DB_URL)
        try:
            raw = client.search(
                collection_name=self.collection_name,
                data=[emb],
                anns_field="dense",
                limit=max(1, int(k)),
                filter=filter_expr,
                output_fields=[
                    "text",
                    "heading",
                    "heading_level",
                    "doc_id",
                    "project_uuid",
                    "original_title",
                    "path",
                ],
            )
        finally:
            client.close()
        # One query vector was sent, so only the first result list is relevant.
        hits = raw[0] if raw else []
        out: List[Tuple[Document, float]] = []
        for hit in hits:
            ent = hit.get("entity") or {}
            doc = Document(
                page_content=str(ent.get("text") or ""),
                metadata={
                    "heading": ent.get("heading"),
                    "heading_level": ent.get("heading_level"),
                    "doc_id": ent.get("doc_id"),
                    "project_uuid": ent.get("project_uuid"),
                    "original_title": ent.get("original_title"),
                    "path": ent.get("path"),
                },
            )
            dist = hit.get("distance")
            try:
                score = float(dist) if dist is not None else 0.0
            except (TypeError, ValueError):
                score = 0.0
            out.append((doc, score))
        return out

    def delete_by_filter(self, filter_expr: str) -> int:
        """Delete every entity matching ``filter_expr``; return a best-effort count.

        The count is taken by querying the primary-key field before deleting.
        It is ``0`` when the collection does not exist, when the primary key
        cannot be determined, when counting fails, or when any error occurs
        (errors are logged, not raised). The client is always closed.
        """
        client = None
        try:
            client = MilvusClient(uri=MILVUS_DB_URL)
            if not client.has_collection(self.collection_name):
                return 0
            # The primary key is not necessarily called "id" (langchain-milvus
            # may use a custom PK / auto_id), so look it up in the schema.
            pk_field = None
            describe = client.describe_collection(self.collection_name)
            for f in describe.get("fields", []) or []:
                # Tolerate the different key spellings of the description payload.
                if f.get("is_primary") or f.get("isPrimary") or f.get("primary"):
                    pk_field = f.get("name")
                    break
            count = 0
            try:
                if pk_field:
                    res = client.query(
                        self.collection_name,
                        filter=filter_expr,
                        output_fields=[pk_field],
                    )
                    count = len(res)
            except Exception:
                # A failed count must not block the deletion itself.
                count = 0
            client.delete(self.collection_name, filter=filter_expr)
            return count
        except Exception as e:
            logger.error(f"删除失败: {e}", exc_info=True)
            return 0
        finally:
            # Release the connection on every path, including the early
            # return and the error path.
            if client is not None:
                try:
                    client.close()
                except Exception:
                    logger.debug("MilvusClient.close() failed", exc_info=True)
# ============================================================================
# Markdown 拆分
# ============================================================================
def split_markdown(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Boundaries are tried in order: blank line, CJK sentence end, line break,
    CJK clause punctuation, and finally single characters. Empty or ``None``
    input returns ``[]``.

    The separator list previously held empty strings in the punctuation slots.
    An empty separator means "split per character" and stops the search, so
    the ``"\\n"`` entry after it could never be reached.
    """
    if not text:
        return []
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "。", "!", "?", "\n", ";", ",", ""],
    )
    return splitter.split_text(text)
def split_markdown_by_headings(content: str, chunk_size=300, chunk_overlap=40) -> List[Document]:
    """Split Markdown into one ``Document`` per heading section.

    Every ATX heading line (``#`` to ``######``) starts a new section. The
    section body becomes ``page_content`` and the heading text and level go
    into the metadata. Text before the first heading gets heading ``""`` and
    level ``0``. Sections with an empty body are dropped. If no section has a
    body, the raw content is split with ``split_markdown`` using
    ``chunk_size`` / ``chunk_overlap``.
    """
    if not content:
        return []

    heading_re = re.compile(r"^(#{1,6})\s+(.+)$")
    sections: List[Document] = []
    title = ""
    depth = 0
    body: List[str] = []

    def emit() -> None:
        # Close the section collected so far under the heading currently in effect.
        text = "\n".join(body).strip()
        if text:
            sections.append(
                Document(
                    page_content=text,
                    metadata={"heading": title, "heading_level": depth},
                )
            )
        body.clear()

    for raw_line in content.split("\n"):
        stripped = raw_line.rstrip()
        found = heading_re.match(stripped)
        if found is None:
            body.append(stripped)
            continue
        emit()
        depth = len(found.group(1))
        title = found.group(2).strip()
    emit()

    if sections:
        return sections
    # No section had a body: fall back to plain size-based chunks.
    return [
        Document(
            page_content=piece,
            metadata={"chunk_index": position, "heading": "", "heading_level": 0},
        )
        for position, piece in enumerate(split_markdown(content, chunk_size, chunk_overlap))
    ]
def process_document_to_vector_store(
    doc_id: str, title: str, content: str, path: str, project_uuid: str, collection_name=COLLECTION_NAME
) -> bool:
    """Split ``content`` by headings, tag every chunk and write it to the vector store.

    Returns:
        ``True`` when the document was processed, including the case where
        there was nothing to write. ``False`` when an exception occurred, or
        when there were chunks to write but the store returned no ids (every
        batch failed). Exceptions are logged, never raised.
    """
    try:
        vs = VectorStore(collection_name=collection_name, drop_old=False)
        docs = split_markdown_by_headings(content)
        for d in docs:
            d.metadata.update(
                {
                    "doc_id": doc_id,
                    "original_title": title,
                    "path": path,
                    "project_uuid": project_uuid,
                }
            )
        ids = vs.add_documents(docs)
    except Exception as e:
        logger.error(f"处理文档失败: {e}", exc_info=True)
        return False
    # add_documents skips failed batches without raising, so an empty result
    # for a non-empty input means nothing reached the store.
    if docs and not ids:
        logger.error("处理文档失败: no chunks were written for doc_id=%s", doc_id)
        return False
    return True
# ============================================================================
# Data preprocessing
# ============================================================================
# JSONL paths for the offline preprocessing helpers below. Neither name is
# referenced by the code visible in this file — presumably consumed by a
# script entry point; confirm before removing.
INPUT_FILE = "data/articles.jsonl"
OUTPUT_CHUNK_FILE = "data/processed/eval_chunks.jsonl"
def load_jsonl(filename: str, encoding="utf-8"):
    """Lazily yield one parsed JSON value per non-blank line of ``filename``."""
    with open(filename, encoding=encoding) as handle:
        for raw in handle:
            if not raw.strip():
                continue
            yield json.loads(raw)
def write_jsonl(data, filename, append=False, ensure_ascii=False):
    """Write each item of ``data`` to ``filename`` as one JSON line (UTF-8).

    The file is truncated unless ``append`` is true.
    """
    open_mode = "w"
    if append:
        open_mode = "a"
    with open(filename, open_mode, encoding="utf-8") as sink:
        for record in data:
            serialized = json.dumps(record, ensure_ascii=ensure_ascii)
            sink.write(f"{serialized}\n")
def clean_text(text: str) -> str:
    """Normalise raw article text through a fixed sequence of regex substitutions.

    Non-string input yields ``""``. The substitutions run in the listed order
    and the result is stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        return ""
    substitutions = (
        # Control characters except "\n".
        (r"[\x00-\x09\x0B-\x1F\x7F]", ""),
        # Zero-width / directional marks and Unicode line/paragraph separators.
        (r"[\u200b-\u200f\u2028\u2029]", ""),
        (r"[:’“”•…–—]", ""),
        # Any tag-like <...> span becomes a line break.
        (r"<[^>]+>", "\n"),
        (r"\n+", "\n"),
        (r" +", " "),
        # A single punctuation mark at the very start of the text.
        (r"^[。,?!;:]", ""),
        # NOTE(review): the unescaped "[]" closes the character class right
        # after "=[", so this pattern does not act as a character whitelist.
        # Kept byte-for-byte; confirm the intended set before escaping it.
        (r'[^\u4e00-\u9fff_a-zA-Z0-9\s《》【】""''·!@#$%^&*()_+=[]{}|;:\'",./<>?-]', ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def concat_metadata_to_content(title: str, content: str, metadata: dict):
    """Prefix *content* with a one-line header built from the title and metadata.

    Header fields whose value is missing or blank are omitted. The previous
    implementation filtered with ``p.split("")``, which raises ``ValueError``
    (empty separator) on every call, and would have rendered missing
    metadata as the text "None".

    Args:
        title: Document title.
        content: Chunk text; surrounding whitespace is stripped.
        metadata: Mapping that may hold ``publish_time``, ``author`` and
            ``source``. ``None`` is treated as an empty mapping.

    Returns:
        The " | "-joined header, a "---" separator line, then the content.
    """
    meta = metadata or {}
    labelled = [
        ("标题", title),
        ("发布时间", meta.get("publish_time")),
        ("作者", meta.get("author")),
        ("来源", meta.get("source")),
    ]
    parts = [
        f"{label}:{value}"
        for label, value in labelled
        if value is not None and str(value).strip()
    ]
    return " | ".join(parts) + "\n---\n" + content.strip()
def process_all_documents(input_file, output_file, chunk_size=500, overlap=50):
    """Clean, split and re-serialise every document of a JSONL file.

    Each input record must provide ``id``, ``title`` and ``content``; an
    optional ``metadata`` mapping supplies the header fields and ``url``.
    Chunks shorter than 10 characters after cleaning are dropped.

    Args:
        input_file: JSONL file of source documents.
        output_file: JSONL file the chunk records are written to (overwritten).
        chunk_size: Target chunk size passed to the splitter.
        overlap: Chunk overlap passed to the splitter.

    Returns:
        ``{"num_docs": ..., "num_chunks": ...}`` counters.
    """
    docs = load_jsonl(input_file)
    # NOTE(review): the empty-string separators look like delimiters that were
    # lost somewhere upstream; presumably sentence punctuation was intended —
    # confirm against the splitter's documentation before relying on this list.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap,
                                              separators=["\n\n", "", "", "", "\n", "", "", ""])
    all_chunks = []
    num_docs = 0
    for doc in docs:
        num_docs += 1
        # An explicit `"metadata": null` must not break the lookups below.
        metadata = doc.get("metadata") or {}
        content = clean_text(doc["content"])
        chunks = splitter.split_text(content)
        # split_text returns plain strings, so the index has to come from
        # enumerate(); unpacking each string as (i, chunk) fails.
        for i, chunk in enumerate(chunks):
            clean_c = clean_text(chunk)
            if len(clean_c) < 10:
                continue
            all_chunks.append({
                "id": f"{doc['id']}_chunk_{i}",
                "doc_id": doc["id"],
                "title": doc["title"],
                "content": concat_metadata_to_content(doc["title"], clean_c, metadata),
                "chunk_index": i,
                "url": metadata.get("url", ""),
            })
    write_jsonl(all_chunks, output_file)
    return {"num_docs": num_docs, "num_chunks": len(all_chunks)}
def load_chunk_jsonl(path):
    """Read a UTF-8 JSONL file eagerly and return its records as a list.

    Blank (whitespace-only) lines are ignored.
    """
    with open(path, encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
def build_index(data, vs: VectorStore):
    """Turn chunk rows into documents and add them to the vector store.

    Each row's ``content`` becomes the page content and every other key
    becomes metadata. Rows whose stripped content is shorter than 10
    characters are skipped.

    The rows are no longer mutated: the previous ``row.pop("content")``
    removed the text from the caller's dictionaries, so running the function
    a second time over the same data indexed nothing.

    Args:
        data: Iterable of dict rows, each expected to carry a ``content`` key.
        vs: Store that receives the documents through ``add_documents``.
    """
    docs: List[Document] = []
    for row in data:
        c = (row.get("content") or "").strip()
        if len(c) < 10:
            continue
        # Metadata is a fresh dict holding everything except the text itself.
        metadata = {key: value for key, value in row.items() if key != "content"}
        docs.append(Document(page_content=c, metadata=metadata))
    if docs:
        vs.add_documents(docs)
def get_vector_store(drop_old=False):
    """Return what the wrapper's ``_get_milvus`` yields for the default collection.

    ``drop_old`` is forwarded both to the wrapper constructor and to
    ``_get_milvus``.
    """
    wrapper = VectorStore(collection_name=COLLECTION_NAME, drop_old=drop_old)
    milvus_handle = wrapper._get_milvus(drop_old=drop_old)
    return milvus_handle
def search_eval(query, top_k=10):
    """Run a scored similarity search and print how long it took.

    Args:
        query: Search text.
        top_k: Number of results requested (passed as ``k``).

    Returns:
        Whatever ``similarity_search_with_score`` returns.
    """
    from time import time

    store = VectorStore(drop_old=False)
    started_at = time()
    hits = store.similarity_search_with_score(query, k=top_k)
    print(f"检索耗时: {time()-started_at:.2f}s")
    return hits
# ============================================================================
# Script entry point
# ============================================================================
if __name__ == "__main__":
    logger.info("="*60)
    logger.info("【Milvus 混合向量索引构建启动】dense + sparse(BM25)")
    logger.info("="*60)
    # Step 1: clean and chunk the raw articles into the chunk file.
    process_all_documents(INPUT_FILE, OUTPUT_CHUNK_FILE)
    logger.info("✅ 文本分块处理完成")
    # Step 2: read the chunk file that was just written.
    chunk_data = load_chunk_jsonl(OUTPUT_CHUNK_FILE)
    logger.info(f"✅ 加载分块数据:{len(chunk_data)}")
    # Step 3: add the chunks to the store; drop_old=False is passed, so the
    # store is not asked to drop existing data.
    vs = VectorStore(drop_old=False)
    build_index(chunk_data, vs)
    logger.info("✅ 索引构建完成(增量写入)")
    # Step 4: run one sample query and log each hit's score and a preview.
    res = search_eval("测试检索内容")
    logger.info(f"✅ 检索完成,命中数量:{len(res)}")
    for doc, score in res:
        logger.info(f"score={score:.4f} | content={doc.page_content[:80]}...")
    logger.info("="*60)
    logger.info("【全部执行完成】")

View File

@ -0,0 +1 @@
# prompts 包

View File

@ -0,0 +1 @@
# report_generation prompts 包

View File

@ -0,0 +1,52 @@
"""Fixed markdown templates used by report generation."""
def markdown_hashes_for_section_no(section_no: str) -> str:
    """Map a dotted section number to its Markdown heading marker.

    One component gives "##", two give "###", three or more give "####".
    Empty or missing input counts as one component. The original note says
    this mirrors the frontend's markdownHashesForSectionNo /
    _heading_level_and_class.
    """
    depth = str(section_no or "").strip().count(".") + 1
    return "#" * (min(depth, 3) + 1)
def missing_child_heading_markdown(heading_no: str) -> str:
    """Build the placeholder heading-plus-body snippet for a missing child section.

    The heading marker depth is derived from *heading_no*.
    """
    marker = markdown_hashes_for_section_no(heading_no)
    heading_line = f"{marker} {heading_no} 待补充"
    return "\n\n" + heading_line + "\n\n待补充"
# Kept for older references; new code should call missing_child_heading_markdown(heading_no).
# Unlike that function, this template always uses a fixed "###" marker.
MISSING_CHILD_HEADING_TEMPLATE = "\n\n### {heading_no} 待补充\n\n待补充"
# Smallest placeholder table: a "###" heading with a {table_name} slot followed
# by a two-column Markdown table holding one placeholder row.
MINIMAL_MISSING_TABLE_TEMPLATE = (
    "\n\n### {table_name}\n\n"
    "| 项目 | 内容 |\n"
    "| --- | --- |\n"
    "| 关键数据 | 待补充 |\n"
)
# Fixed six-column Markdown table skeleton (no heading line, no trailing newline).
# Value cells are pre-filled with the placeholder text and "……" rows mark
# elided entries. Judging by the name, this is the parameter comparison table
# of appendix 8 — confirm against the report template.
APPENDIX8_PARAMETER_COMPARISON_TABLE = (
    "| 序号 | 项目名称 | 单位 | 可研报告 | 后评价报告 | 备注 |\n"
    "| --- | --- | --- | --- | --- | --- |\n"
    "| 一 | 成本参数 | | | | |\n"
    "| 1 | 原料价格 | | | | |\n"
    "| 1.1 | 氢气 | 元/吨 | 待补充 | 待补充 | |\n"
    "| 2 | 催化剂和化学药剂 | 万元 | 待补充 | 待补充 | |\n"
    "| 3 | 燃料动力价格 | | | | |\n"
    "| 3.1 | 除盐水价格 | 元/吨 | 待补充 | 待补充 | |\n"
    "| …… | …… | | | | |\n"
    "| 二 | 营业收入参数 | | | | |\n"
    "| 2.1 | 98#汽油 | 元/吨 | 待补充 | 待补充 | |\n"
    "| …… | …… | | | | |\n"
    "| 三 | 税收参数 | | | | |\n"
    "| | 增值税税率 | | | | |\n"
    "| | 汽油各品种产品 | % | 待补充 | 待补充 | |\n"
    "| …… | …… | | | | |\n"
    "| 四 | 基准收益率 | % | 待补充 | 待补充 | |"
)
# Ordered (label, title) pairs for the appendix figures.
APPENDIX_FIGURE_TARGETS: list[tuple[str, str]] = [
    ("附图1", "全厂物料平衡图"),
    ("附图2", "烷基化装置物料平衡图"),
]

View File

@ -0,0 +1 @@
你是后评价报告撰写助手。严格基于证据输出,禁止编造。示例仅可用于写作风格参考,禁止复用示例中的任何事实数据与结论。禁止输出与当前小节无关的表号/表题清单及跨节“详见表/参见表”引用。必须返回 JSON 对象,字段为 content/missingInfo/qualityChecks。

View File

@ -0,0 +1,67 @@
你正在编写后评价报告章节:{{section_title}}
【章节细则描述】
{{section_prompt}}
【章节模板】
{{section_title}}
【模板必需表格】
{{required_tables_text}}
【结构化表格证据(必须优先采用)】
{{structured_tables_text}}
【字段级已抽取结果(强约束)】
{{canonical_fields_text}}
【章节示例】
{{selected_example}}
【参考范文】
{{section_reference_block}}
【示例使用约束】
1. 以《模版.doc》同章节结构为第一优先:段落顺序、表格标题、表头字段尽量保持一致;
2. 参考范文仅用于格式与结构参考,严禁复用示例中的项目名称、年份、金额、比例、指标值与结论;
3. 所有数值必须来自证据包;如需表格,表头可沿用模板,表内数据必须替换为当前项目证据;
4. 若模板字段无证据,按字段粒度写"待补充",不得整段空泛描述。
【输出硬约束】
1. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题;
2. 若【结构化表格证据】中存在对应必需表,必须原样使用该 Markdown 表格,不得自行生成或改写表头/数值;
3. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充"
4. 若【字段级已抽取结果】中某字段为非"待补充"值,正文该字段不得写"待补充",必须使用该抽取值;
5. content 字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目;
6. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表。
{{heading_rule}}7. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题。
【表格严格管控——必须遵守】
1. **禁止凭空生成表格**:只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出 Markdown 表格;
2. **无"表格强制要求"的章节一律禁止输出任何 Markdown 表格**(即不得输出含 | 分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入;
3. **"见附表N"仅为引用语**:若合同要求写"项目建设工作程序见附表1。"等引用句,只需输出该引用句文本,附表本体在报告末尾统一输出,严禁在本节正文中展开附表的完整 Markdown 表格;
4. 表格数据必须严格来自要素管理element_tables/element_cells不得自行编造表格内容
5. 每个 Markdown 表格前须有独立一行表题(形如「表1 …」「表2-3 …」「附表8 …」等);表题紧挨表格上方单独成段,表题与表格之间最多空一行或一行注释;前端会将表题居中排版。
6. **表号与表名间距**表题中表号如「表2-4」「附表8」与表名之间须空两个全角空格U+3000例如「表2-4  原料数量及组成对比表」。
7. **表头栏单位**凡含计量单位的列名名称写第一行、单位加括号写在第二行且在同一表头单元格内Markdown 可用 `<br>`,如 `新鲜水<br>m³/h`);表题与表头均勿使用 `**` 加粗;勿将单位单独占一列,勿把「名称(单位)」横挤在同一行。
8. **公共单位写表题**若整张表各数据列所用单位相同单位应加括号写在表题行末尾如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复该单位;若各列单位不一致,则仍按列在表头内分行写单位。
9. **表格序号列**:用阿拉伯数字,层次与正文一致(如 1、1.1、1.2、2、2.1);行键或表体第一列已带层次编号时可与之对齐;否则自上而下用 1、2、3…「合计」「总计」行可用「—」。
10. **表体与数字**:表内文字、数字宜水平与垂直居中;若单元格内需换行或分段(含 `<br>`),宜左齐排列以便阅读。同一表内、同列的小数、百分比等宜保留相同的小数位数。
【检索顺序约束】
1. 优先使用要素抽取结果;
2. 要素不足时补充文档段落;
3. 最后使用关键词检索到的补充材料;
4. 无证据时写"待补充",禁止编造。
{{prior_sibling_sections_block}}
{{prior_chapters_block}}
【章节输出结构约束】
{{section_contract}}
【证据包(JSON)】
{{evidence_json}}
请仅返回 JSON{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]}

View File

@ -0,0 +1,88 @@
你正在编写后评价报告章节:{{section_title}}
本次任务:以【章节细则描述】和【参考范文】共同作为本节的写作模板,以【事实证据】作为唯一数据来源。核心原则是:**细则与范文决定写什么、怎么写;证据只负责提供可填入模板的真实数据**。生成时必须先搭模板,再填证据,严禁脱离模板自由发挥,严禁复用范文数据或自行改写证据数据。
========================= 第一部分 · 写作模板(最高优先级:决定内容范围、结构和文风)=========================
【标题编号规则】
{{heading_rule}}
【章节细则描述】
{{section_prompt}}
【参考范文(内容范围、论述维度、段落结构和行文风格的主要模板)】
{{section_reference_block}}
========================= 第二部分 · 事实证据(唯一数据来源,仅用于支撑和填充模板)=========================
【模板必需表格】
{{required_tables_text}}
【结构化表格证据(必须优先采用)】
{{structured_tables_text}}
【字段级已抽取结果(强约束)】
{{canonical_fields_text}}
【证据包(JSON)】
{{evidence_json}}
========================= 第三部分 · 上文已生成内容(只用于一致性校验,不改变本节模板)=========================
{{prior_sibling_sections_block}}
{{prior_chapters_block}}
========================= 第四部分 · 写作与输出要求(务必逐条遵守)=========================
【生成步骤】
1. 先读取【章节细则描述】和【参考范文】,抽取本节应覆盖的内容主题、论述维度、段落顺序、子标题层级、表格/列举形式和结论方式;
2. 再读取【章节输出结构约束】,确认本节是否允许/必须输出表格、附表引用或特定结构;
3. 然后只从【事实证据】中选择可支撑上述模板的数据,把证据数据填入对应位置;
4. 最后输出正文。若模板要求的某项内容在证据中没有对应数据,写"待补充",不得跳过、猜测、编造或用范文数据顶替。
【模板遵循要求——细则与范文共同决定“写什么”和“怎么写”】
1. "写什么"由【章节细则描述】与【参考范文】共同决定:细则列出的要点、子项及顺序为必写项;参考范文实际写到的内容主题、论述维度和信息点(如背景、依据、目标、措施、问题、结论等)也应覆盖。二者取并集,不得遗漏,也不得另起炉灶写无关内容;
2. "怎么写"以【参考范文】为主要模板:段落数量、段落顺序、每段主题、论述推进、句式结构、专业术语、连接词、语气口吻、详略程度和结论表达都应高度贴合范文;
3. 若【章节细则描述】与【参考范文】存在差异,优先保证细则要求完整覆盖,再用范文的结构和笔法组织表达;若二者均未要求,正文不要主动扩展。
【证据使用要求——数据必须来自证据且保持原值】
1. 所有项目名称、时间、金额、数量、比例、指标值、单位、结论依据等事实性内容,只能来自第二部分事实证据;
2. 数据必须原值引用,严禁自行修改、估算、换算单位、四舍五入、增减、归纳为新数值或编造。证据是多少就写多少;证据未给出的数据写"待补充"
3. 若【字段级已抽取结果】中某字段为非"待补充"值,正文必须原样使用该抽取值,不得写"待补充",也不得改动、换算或重新表述其数值;
4. 内容来源优先级:结构化表格证据 / 字段级已抽取结果 > 证据包(JSON)中的章节文档 > 关键词检索补充材料;
5. 禁止复用【参考范文】或【章节示例】中的任何项目名称、年份、金额、指标值、比例、结论等事实数据。
【参考范文贴合要求——高度相似但严禁照抄】
1. 逐段对照:范文有几段就尽量写几段,每段主题、先后顺序、论述角度与起承转合须与范文对应;
2. 句式与笔法对齐:尽量沿用范文的段首引导方式、常用表达、收束方式和专业语气,使本节读起来与范文出自同一类报告;
3. 篇幅与颗粒度对齐:每段篇幅、信息密度和展开程度与范文相当,不得明显更短、更空泛,也不得无端扩写;
4. 形式对齐:范文采用分条、分项、描述性子标题或表格呈现的,本节也尽量采用同类形式,但必须满足【章节输出结构约束】和下方表格规则;
5. 禁止逐字照抄不得出现与范文连续相同超过15字的句子或成段文字应在保持结构和笔法相似的前提下用本项目证据重新表述。
【输出硬约束】
1. content字段只允许写章节正文严禁出现"【缺失信息说明】""【质量检查】"及其任何条目;
2. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题;
3. 若【结构化表格证据】中存在对应必需表必须原样使用该Markdown表格不得自行生成或改写表头/数值;
4. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充"
5. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表;
6. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题;
7. 数字与汉字之间不留空格:阿拉伯数字、百分比、金额、年份等与相邻汉字之间不得插入半角或全角空格,例如写"投资1.2亿元""2023年12月""产能达95%",不得写"投资 1.2 亿元""2023 年 12 月""产能达 95 %";数字与计量单位之间也不留空格,如"30万吨"而非"30 万吨"
8. 子标题形式约束:正文段落允许使用描述性小标题,但只能采用"一、""(一)""1."或加粗短语单独成行等中文公文层级形式严禁使用Markdown标题语法`#``##``###`等)充当子标题。表格上方的表题不属于子标题;
9. 计量单位须规范:面积写"m²"不得写"m2",体积写"m³"不得写"m3",流量写"m³/h"不得写"m3/h";温度写"℃",千分号写"‰",科学计数可写"×10⁴"。正文与表格中的单位均须规范。
【表格严格管控】
1. 只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时本节才允许输出Markdown表格
2. 无"表格强制要求"的章节一律禁止输出任何Markdown表格不得输出含`|`分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入;
3. "见附表N"仅为引用语:若结构约束要求写"项目建设工作程序见附表1。"等引用句只输出引用句文本附表本体在报告末尾统一输出严禁在本节展开完整Markdown表格
4. 表格数据必须严格来自要素管理element_tables/element_cells或结构化表格证据不得自行编造、换算或改写表格内容
5. 每个Markdown表格前须有独立一行表题如「表1  ××表」「表2-3  ××表」「附表8  ××表」表题紧挨表格上方单独成段
6. 表号与表名之间须空两个全角空格U+3000例如「表2-4  原料数量及组成对比表」
7. 含计量单位的表头名称写第一行、单位加括号写第二行且在同一表头单元格内Markdown可用`<br>`,如`新鲜水<br>m³/h`);勿将单位单独占一列;
8. 若整张表各数据列所用单位相同,单位写在表题行末尾,表头栏内不再重复;若各列单位不一致,则按列在表头内分行写单位;
9. 表格序号列用阿拉伯数字,层次与正文一致;"合计""总计"行可用"—"
10. 同一表内、同列的小数、百分比等宜保留相同的小数位数,但不得因此改动证据原值。
【输出格式】
请仅返回JSON{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]}
你正在编写后评价报告章节:{{section_title}}

View File

@ -0,0 +1,14 @@
"""Heading rule prompt variables for report generation."""
# Default value for the heading-rule prompt variable. The text is one numbered
# rule ("5. ...") ending in a newline, so it can be spliced directly into a
# numbered rule list.
DEFAULT_HEADING_RULE = (
    "5. 各章节内部小标题须使用规范层级格式(如“### 1.2.1 …”);"
    "若在同一节内使用并列条目必须统一写作“1… 2… 3…”"
    "禁止使用“一、二、三、”“”或“1.”“1.2.”“3.1”等序号形式;\n"
)
# Per-section heading rules keyed by section number string. Only "1.2" has an
# entry; presumably callers fall back to DEFAULT_HEADING_RULE for other keys —
# verify against the caller.
SECTION_HEADING_RULES: dict[str, str] = {
    "1.2": (
        "5. 本节1.2)必须严格遵循【章节输出结构约束】给定的纯文本编号体结构;"
        "不得使用“###”等 Markdown 小标题语法不得将“1.2.1/1.2.2”改写为“1/2”。\n"
    ),
}

View File

@ -0,0 +1,4 @@
"""Fallback prompt fragments for report generation."""
# Fallback text for a section prompt; presumably used when a section has none — verify against the caller.
DEFAULT_SECTION_PROMPT_FALLBACK = "按后评价细则规范撰写本章节。"
# Fallback text for the selected example; presumably used when no example is available — verify against the caller.
DEFAULT_SELECTED_EXAMPLE_FALLBACK = "无示例,按规范输出。"

View File

@ -0,0 +1 @@
你是后评价报告撰写助手。任务是对既有章节做最小修改补齐缺表,禁止删除事实性内容,禁止编造。返回 JSON{"content":"..."}

View File

@ -0,0 +1,19 @@
你正在修订章节:{{section_title}}
目标:在不删除原有有效内容的前提下,补齐缺失表格。
必须出现的表标识:{{missing_tables}}
要求:
1) 每个缺失表都必须在正文中出现,并使用 Markdown 表格;
2) 若证据不足,单元格可写“待补充”;
3) 表标题必须包含对应表标识如“表2-1”
4) 仅输出修订后的完整章节 Markdown。
【原章节内容】
{{content}}
【原始章节提示词】
{{original_prompt}}
【证据包(JSON)】
{{evidence_json}}

View File

@ -0,0 +1 @@
你是后评价报告格式修订助手。仅做格式对齐修订:章节标题、表名、表头。禁止新增未证据支持的数据。返回 JSON{"content":"..."}

View File

@ -0,0 +1,25 @@
你正在修订章节:{{section_title}}
目标:对齐模板格式,不改变事实结论。
请仅修订“章节标题、表名、表头”,正文事实描述尽量保持原样。
【模板表规格(JSON)】
{{table_specs_json}}
【当前章节】
{{content}}
【证据包(JSON)】
{{evidence_json}}
修订规则:
1) 章节首行必须为标准章节标题;
2) 表名必须与模板表规格中的 token/title 对齐表题中表号与表名之间须空两个全角空格如「表2-4  原料数量及组成对比表」
3) 表头字段优先与模板一致,表内数据来自证据包,无值写待补充;
4) 必须使用 Markdown 表格;
5) 表头栏排版指标名称与计量单位分两行写在同一表头单元格内单位须加括号并写在名称正下方Markdown 可用 `<br>`,如 `新鲜水<br>m³/h`);表题与表头均勿使用 `**` 加粗;勿将单位单独拆成一列表头列,勿把「名称(单位)」横挤在同一行;
6) 若整张表各数据列所用单位相同应将单位加括号写在表题末尾如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复写该单位;
7) 表格「序号」列优先使用各行行键row_key首部已有的阿拉伯数字层次编号与正文 1、1.1、1.2、2、2.1 一致);若行键未带此类编号,则用自上而下连续阿拉伯数字 1、2、3…「合计」「总计」行序号可用「—」
8) 表体单元格内容宜居中;若有换行或分段,宜左齐。同列数值宜统一小数位数;
9) 禁止编造事实数据;
10) 仅返回修订后的完整章节 Markdown不要返回 JSON

204
routers/report.py Normal file
View File

@ -0,0 +1,204 @@
"""
routers/report.py
后评价报告核心生成路由独立抽取版
eval_report routers/write.py 摘取报告生成相关端点去除鉴权依赖
项目查询改用轻量的 services/project_service.get_project
业务逻辑在 services/report_generation_service.py
"""
from __future__ import annotations
import asyncio
import json
from typing import Optional
from fastapi import APIRouter, Depends, Header, HTTPException
from fastapi.responses import StreamingResponse
from sqlalchemy.orm import Session
from database import SessionLocal, get_db
from database.models import ReportTemplate, ReportTemplateSection
from schemas.write import (
GenerateReportJobCreate,
GenerateReportJobItem,
GenerateReportResult,
)
from services.project_service import get_project
from services.report_generation_service import (
create_report_job,
get_report_job,
get_report_result,
get_report_stream_snapshot,
retry_report_chapter,
cancel_report_job,
)
# Every route declared on this router is served under the "/write" prefix.
router = APIRouter(prefix="/write", tags=["后评价报告生成"])
@router.get("/projects/{project_id}/generate-sections", summary="按章节智能体生成提示词清单")
def generate_sections_prompt(
    project_id: str,
    template_id: Optional[str] = None,
    db: Session = Depends(get_db),
):
    # Lists the per-section prompts of a report template.
    # Template choice: the active template with the requested id if there is
    # one, otherwise the active default template; 404 when neither exists.
    # Plain comments are used instead of a docstring so the generated API
    # description stays as it was.
    prompt_prefix = "请基于2020后评价细则与本项目检索材料先查要素表再查文档段落最后生成本章节内容。\n"

    # Called only for its existence check; the returned project is unused.
    get_project(project_id, db)

    chosen = None
    if template_id:
        chosen = (
            db.query(ReportTemplate)
            .filter(ReportTemplate.id == template_id, ReportTemplate.is_active == True)  # noqa: E712
            .first()
        )
    if not chosen:
        chosen = (
            db.query(ReportTemplate)
            .filter(ReportTemplate.is_default == True, ReportTemplate.is_active == True)  # noqa: E712
            .first()
        )
    if not chosen:
        raise HTTPException(status_code=404, detail="未找到可用模板")

    ordered_sections = (
        db.query(ReportTemplateSection)
        .filter(ReportTemplateSection.template_id == chosen.id)
        .order_by(ReportTemplateSection.section_order.asc())
        .all()
    )

    section_items = []
    for section in ordered_sections:
        section_items.append(
            {
                "sectionKey": section.section_key,
                "sectionTitle": section.section_title,
                "prompt": prompt_prefix + (section.section_prompt or ""),
                "examples": section.examples or "",
            }
        )
    return {
        "templateId": chosen.id,
        "templateName": chosen.name,
        "sections": section_items,
    }
# Creates a chapter-by-chapter asynchronous report generation job.
# The optional X-User-Id request header is forwarded as `requested_by`.
@router.post(
    "/projects/{project_id}/generate-report-job",
    response_model=GenerateReportJobItem,
    summary="创建分章异步报告生成任务",
)
def create_generate_report_job(
    project_id: str,
    body: GenerateReportJobCreate,
    db: Session = Depends(get_db),
    x_user_id: Optional[str] = Header(default=None, alias="X-User-Id"),
):
    # Called only for its existence check; the returned project is unused.
    _ = get_project(project_id, db)
    return create_report_job(
        project_id,
        db,
        template_id=body.templateId,
        top_k=body.topK,
        requested_by=x_user_id,
    )
# Returns the progress record of one report generation job.
# This handler delegates straight to the service without a separate project lookup.
@router.get(
    "/projects/{project_id}/generate-report-job/{job_id}",
    response_model=GenerateReportJobItem,
    summary="查询分章异步报告任务进度",
)
def get_generate_report_job(
    project_id: str,
    job_id: str,
    db: Session = Depends(get_db),
):
    return get_report_job(project_id, job_id, db)
# Returns the result of one report generation job.
# `include_debug` is a query flag passed through to the service unchanged.
@router.get(
    "/projects/{project_id}/generate-report-job/{job_id}/result",
    response_model=GenerateReportResult,
    summary="获取分章异步报告任务结果",
)
def get_generate_report_result(
    project_id: str,
    job_id: str,
    include_debug: bool = False,
    db: Session = Depends(get_db),
):
    return get_report_result(project_id, job_id, db, include_debug=include_debug)
# Streams job snapshots to the client as Server-Sent Events.
# Emitted events: "snapshot" when the serialised state changed, "keepalive"
# after 20 unchanged polls in a row, and "end" once the job status is
# completed / failed / cancelled, after which the stream stops.
@router.get(
    "/projects/{project_id}/generate-report-job/{job_id}/events",
    summary="订阅分章异步报告任务实时事件SSE",
)
async def stream_generate_report_job_events(
    project_id: str,
    job_id: str,
    include_debug: bool = False,
):
    # Validate, then release the connection at once; the SSE loop opens
    # short-lived sessions on demand so a long stream does not tie up the pool.
    with SessionLocal() as db:
        _ = get_report_job(project_id, job_id, db)
    async def _event_stream():
        last_payload = ""
        idle_ticks = 0
        while True:
            snapshot = get_report_stream_snapshot(job_id, include_debug=include_debug)
            if not snapshot:
                # No snapshot returned: rebuild the same shape from the database.
                with SessionLocal() as db:
                    job = get_report_job(project_id, job_id, db)
                    result = get_report_result(project_id, job_id, db, include_debug=include_debug)
                    snapshot = {
                        "job": job.model_dump(),
                        "result": result.model_dump(),
                    }
            payload = json.dumps(snapshot, ensure_ascii=False, separators=(",", ":"))
            # Only push a snapshot when its serialised form actually changed.
            if payload != last_payload:
                last_payload = payload
                idle_ticks = 0
                yield f"event: snapshot\ndata: {payload}\n\n"
            else:
                idle_ticks += 1
                # 20 idle polls at 0.25 s each is roughly 5 s between keepalives.
                if idle_ticks >= 20:
                    idle_ticks = 0
                    yield "event: keepalive\ndata: ping\n\n"
            status = str(((snapshot.get("job") or {}).get("status") or "")).strip().lower()
            if status in ("completed", "failed", "cancelled"):
                yield f"event: end\ndata: {payload}\n\n"
                break
            await asyncio.sleep(0.25)
    return StreamingResponse(
        _event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
# Retries a single chapter of an existing job.
# `section_key` is not part of the path, so it arrives as a query parameter.
@router.post(
    "/projects/{project_id}/generate-report-job/{job_id}/retry-chapter",
    response_model=GenerateReportJobItem,
    summary="重试指定章节",
)
def retry_generate_report_chapter(
    project_id: str,
    job_id: str,
    section_key: str,
    db: Session = Depends(get_db),
):
    return retry_report_chapter(project_id, job_id, section_key, db)
# Cancels a report generation job and returns the job record from the service.
@router.post(
    "/projects/{project_id}/generate-report-job/{job_id}/cancel",
    response_model=GenerateReportJobItem,
    summary="取消报告生成任务",
)
def cancel_generate_report_job(
    project_id: str,
    job_id: str,
    db: Session = Depends(get_db),
):
    return cancel_report_job(project_id, job_id, db)

179
schemas/write.py Normal file
View File

@ -0,0 +1,179 @@
"""
schemas/write.py
后评价报告项目相关的 Pydantic 数据模型
"""
from __future__ import annotations
from typing import Any, List, Optional
from pydantic import BaseModel
# ---------- Versions ----------
# One saved version of a document: its content plus author and note.
class DocVersion(BaseModel):
    id: str
    version: str
    content: str
    savedAt: str
    author: str
    note: Optional[str] = ""
    # Optional free-form dict; the schema is not fixed by this model.
    citationPayload: Optional[dict[str, Any]] = None
# ---------- Documents ----------
# Full document record, including the body text and its saved versions.
class WriteDocument(BaseModel):
    id: str
    title: str
    content: str
    wordCount: int
    createdAt: str
    updatedAt: str
    projectId: str
    status: str  # draft | review | published
    versions: List[DocVersion] = []
# List-page summary of a document: same fields as WriteDocument minus `content`
# and `versions`. The class docstring is left verbatim because it may be
# surfaced in the generated schema.
class WriteDocumentSummary(BaseModel):
    """列表页只返回摘要,不含 content 正文"""
    id: str
    title: str
    wordCount: int
    createdAt: str
    updatedAt: str
    projectId: str
    status: str
# ---------- Projects ----------
# Full project record, including its documents.
class WriteProject(BaseModel):
    id: str
    uuid: str  # unique project identifier, shared with the knowledge base
    name: str
    description: Optional[str] = ""
    createdAt: str
    updatedAt: str
    docCount: int
    status: str  # active | archived
    kbProjectId: Optional[str] = None
    color: str
    documents: List[WriteDocument] = []
# List-page summary of a project, without `documents`. The class docstring is
# left verbatim because it may be surfaced in the generated schema.
class WriteProjectSummary(BaseModel):
    """列表页摘要,不含 documents"""
    id: str
    uuid: Optional[str] = None  # unique project identifier used in URL parameters; optional for legacy data
    name: str
    description: Optional[str] = ""
    createdAt: str
    updatedAt: str
    docCount: int
    status: str
    kbProjectId: Optional[str] = None
    color: str
# ---------- Create / update request bodies ----------
# Payload for creating a project; only `name` is required.
class WriteProjectCreate(BaseModel):
    name: str
    description: Optional[str] = ""
    kbProjectId: Optional[str] = None
    color: Optional[str] = "#3b82f6"
# Payload for updating a project; every field is optional and defaults to None.
class WriteProjectUpdate(BaseModel):
    name: Optional[str] = None
    description: Optional[str] = None
    status: Optional[str] = None
    kbProjectId: Optional[str] = None
    color: Optional[str] = None
# Payload for creating a document; `content` defaults to an empty string.
class WriteDocumentCreate(BaseModel):
    title: str
    content: Optional[str] = ""
# Payload for updating a document; every field is optional and defaults to None.
class WriteDocumentUpdate(BaseModel):
    title: Optional[str] = None
    content: Optional[str] = None
    status: Optional[str] = None
# Payload for saving a new document version; `content` and `author` are required.
class DocVersionCreate(BaseModel):
    version: Optional[str] = None
    content: str
    author: str
    note: Optional[str] = ""
    citationPayload: Optional[dict[str, Any]] = None
# ---------- Chapter review (agent) ----------
# Request body for a chapter review: the chapter selector and the text to
# review. The class docstring is left verbatim because it may be surfaced in
# the generated schema.
class ChapterReviewRequest(BaseModel):
    """章节智能审查请求体:选择章节 + 输入待审查文本。"""
    chapter: str  # "1"~"6"
    content: str
# Response body for a chapter review; per the class docstring, `review` carries
# a Markdown report. The docstring is left verbatim because it may be surfaced
# in the generated schema.
class ChapterReviewResponse(BaseModel):
    """章节智能审查响应体:返回 Markdown 审查报告。"""
    success: bool = True
    chapter: str
    review: str
    model: Optional[str] = None
    message: Optional[str] = ""
# Request body for creating a report generation job; both fields have defaults.
class GenerateReportJobCreate(BaseModel):
    templateId: Optional[str] = None
    topK: int = 10
# Status entry for one chapter inside a job record (no chapter content).
class GenerateReportChapterItem(BaseModel):
    sectionKey: str
    sectionTitle: str
    sectionOrder: int
    status: str
    updatedAt: Optional[str] = None
    errorMessage: Optional[str] = None
# Job record: overall status and progress plus per-chapter status entries.
class GenerateReportJobItem(BaseModel):
    jobId: str
    projectId: str
    templateId: Optional[str] = None
    status: str
    progress: int
    currentSectionKey: Optional[str] = None
    errorMessage: Optional[str] = None
    createdAt: Optional[str] = None
    updatedAt: Optional[str] = None
    completedAt: Optional[str] = None
    chapters: List[GenerateReportChapterItem] = []
# One chapter inside a job result: status, content and optional payloads.
class GenerateReportResultChapter(BaseModel):
    sectionKey: str
    sectionTitle: str
    sectionOrder: int
    status: str
    content: Optional[str] = None
    errorMessage: Optional[str] = None
    # The three fields below are optional; presumably they are filled only for
    # debug requests — verify against the service.
    promptText: Optional[str] = None
    evidencePayload: Optional[dict] = None
    validationPayload: Optional[dict] = None
# Job result: the report text, consistency notes and per-chapter results.
class GenerateReportResult(BaseModel):
    jobId: str
    status: str
    report: Optional[str] = None
    consistency: List[str] = []
    chapters: List[GenerateReportResultChapter] = []

View File

@ -0,0 +1,199 @@
"""
从项目知识库 Word.docx中提取附图1/附图2嵌入图用于报告附录
细则常见版式附图标题段落与图在同一节或相邻段落解析时合并前/当前/后段文字做关键词匹配
"""
from __future__ import annotations
import base64
import logging
from pathlib import Path
from typing import Optional
from docx import Document
from docx.oxml.ns import qn
from docx.table import Table
from docx.text.paragraph import Paragraph
logger = logging.getLogger(__name__)
# Size threshold used to filter out small decorative images (logos and the like).
_MIN_FIGURE_BYTES = 6000
# Qualified name of the attribute read from each blip element to find its related part.
R_EMBED = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
# Namespace prefixes available to the XPath lookups in this module.
_NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
}
def _compact(s: str) -> str:
return "".join(str(s or "").split())
def _classify_slot(ctx: str) -> Optional[int]:
    """Decide which appendix figure slot a piece of context text refers to.

    Returns 1 for the plant-wide material balance figure, 2 for the unit
    (e.g. alkylation) material balance figure, or None when nothing matches.
    Whitespace is ignored during matching.
    """
    squeezed = _compact(ctx)
    if not squeezed:
        return None
    # Explicit figure numbers win. "附图2" is tested first so that text
    # listing both numbers (e.g. a contents line) resolves to slot 2.
    for marker, slot in (("附图2", 2), ("附图1", 1)):
        if marker in squeezed:
            return slot
    # Every keyword rule requires the material-balance term.
    if "物料平衡" not in squeezed:
        return None
    if "全厂" in squeezed:
        return 1
    if "烷基化" in squeezed or "装置" in squeezed:
        return 2
    return None
def _content_type_to_md_subtype(content_type: str) -> str:
ct = (content_type or "").lower()
if "jpeg" in ct or ct.endswith("jpg"):
return "jpeg"
if "png" in ct:
return "png"
if "gif" in ct:
return "gif"
if "emf" in ct:
return "x-emf"
if "wmf" in ct:
return "x-wmf"
return "png"
def _blob_to_data_uri(blob: bytes, content_type: str) -> str:
    """Encode image bytes as a base64 ``data:image/...`` URI.

    The subtype is derived from *content_type*.
    """
    encoded = base64.b64encode(blob).decode("ascii")
    subtype = _content_type_to_md_subtype(content_type)
    return "data:image/" + subtype + ";base64," + encoded
def _iter_paragraphs_deep(doc: Document):
    """Yield the document's paragraphs in body order, descending into tables.

    Top-level paragraphs are yielded directly; for each top-level table the
    paragraphs of every cell are yielded row by row. Other body elements are
    ignored.
    """
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")
    for element in doc.element.body:
        tag = element.tag
        if tag == paragraph_tag:
            yield Paragraph(element, doc._body)
            continue
        if tag != table_tag:
            continue
        for row in Table(element, doc._body).rows:
            for cell in row.cells:
                yield from cell.paragraphs
def extract_appendix_figure_candidates_from_docx(path: Path) -> dict[int, list[tuple[int, bytes, str]]]:
    """
    Collect candidate figures from a single docx: slot -> [(size, blob, content_type), ...].

    content_type is read from the related part and is later used to build
    the data URI. A file that cannot be opened yields empty candidate lists
    (the failure is logged as a warning).
    """
    candidates: dict[int, list[tuple[int, bytes, str]]] = {1: [], 2: []}
    # Images whose surrounding text matched no slot, kept in document order.
    orphans_ordered: list[tuple[bytes, str]] = []
    try:
        doc = Document(str(path))
    except Exception as exc:
        logger.warning("appendix figure: open docx failed %s: %s", path, exc)
        return candidates
    paras = list(_iter_paragraphs_deep(doc))
    texts = [p.text or "" for p in paras]
    for i, p in enumerate(paras):
        blobs_with_type: list[tuple[bytes, str]] = []
        # Resolve every embedded image referenced from this paragraph.
        for blip in p._element.findall(".//a:blip", _NS):
            embed = blip.get(R_EMBED)
            if not embed:
                continue
            try:
                rel = p.part.related_parts[embed]
            except KeyError:
                continue
            blob = getattr(rel, "blob", None)
            ct = getattr(rel, "content_type", "") or "image/png"
            # Images below the size threshold are ignored.
            if blob and len(blob) >= _MIN_FIGURE_BYTES:
                blobs_with_type.append((blob, ct))
        if not blobs_with_type:
            continue
        # Classify using the previous, current and next paragraph text together.
        prev_t = texts[i - 1] if i > 0 else ""
        cur_t = texts[i]
        next_t = texts[i + 1] if i + 1 < len(texts) else ""
        ctx = f"{prev_t}\n{cur_t}\n{next_t}"
        slot = _classify_slot(ctx)
        if slot is None:
            for blob, ct in blobs_with_type:
                orphans_ordered.append((blob, ct))
            continue
        for blob, ct in blobs_with_type:
            candidates[slot].append((len(blob), blob, ct))
    def _dedupe_preserve_order(pairs: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]:
        # Identity-based: entries are duplicates only when they hold the very
        # same bytes object. NOTE(review): equal-content blobs held as distinct
        # objects are not merged — confirm that is intended.
        seen: set[int] = set()
        out: list[tuple[bytes, str]] = []
        for blob, ct in pairs:
            bid = id(blob)
            if bid in seen:
                continue
            seen.add(bid)
            out.append((blob, ct))
        return out
    orphans_ordered = _dedupe_preserve_order(orphans_ordered)
    # Drop orphans that were already classified into a slot elsewhere.
    used_ids: set[int] = set()
    for lst in candidates.values():
        for _sz, blob, _ct in lst:
            used_ids.add(id(blob))
    orphans_ordered = [(b, c) for b, c in orphans_ordered if id(b) not in used_ids]
    # Fallback: an empty slot takes the next unclassified image, slot 1 first.
    if not candidates[1] and orphans_ordered:
        b, c = orphans_ordered.pop(0)
        candidates[1].append((len(b), b, c))
    if not candidates[2] and orphans_ordered:
        b, c = orphans_ordered.pop(0)
        candidates[2].append((len(b), b, c))
    return candidates
def merge_best_appendix_figures(
    per_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]],
) -> dict[int, tuple[bytes, str, str]]:
    """
    Merge candidates from several documents, keeping the largest image per slot.

    Only slots 1 and 2 are considered. On equal size the earlier candidate
    is kept. Returns slot -> (blob, content_type, source_doc_name); slots
    without any candidate are absent.
    """
    winners: dict[int, tuple[int, bytes, str, str]] = {}
    for doc_name, cand in per_doc:
        for slot in (1, 2):
            for entry in cand.get(slot) or []:
                size, blob, ct = entry
                current = winners.get(slot)
                if current is not None and not size > current[0]:
                    continue
                winners[slot] = (size, blob, ct, doc_name)
    merged: dict[int, tuple[bytes, str, str]] = {}
    for slot, (_size, blob, ct, doc_name) in winners.items():
        merged[slot] = (blob, ct, doc_name)
    return merged
def appendix_figure_markdown_images(
    resolved: dict[int, tuple[bytes, str, str]],
    *,
    label_title: list[tuple[str, str]],
) -> dict[int, str]:
    """Render each resolved figure as a Markdown snippet.

    Each snippet is a ``###`` caption heading, the image as an inline data
    URI and, when a source name is known, an italic source note. The note
    previously opened a parenthesis it never closed; it is now balanced.

    Args:
        resolved: slot -> (blob, content_type, source_doc_name).
        label_title: (label, title) pairs; the pair at index ``i`` captions
            slot ``i + 1``.

    Returns:
        slot -> Markdown snippet. Slots without a caption pair are skipped.
    """
    out: dict[int, str] = {}
    slot_to_title = {i + 1: lt for i, lt in enumerate(label_title)}
    for slot, (blob, ct, src) in resolved.items():
        if slot not in slot_to_title:
            continue
        label, title = slot_to_title[slot]
        uri = _blob_to_data_uri(blob, ct)
        cap = f"{label} {title}"
        src_note = f"\n\n*(嵌入来源:{src})*" if src else ""
        out[slot] = f"### {cap}\n\n![{cap}]({uri}){src_note}"
    return out

View File

@ -0,0 +1,28 @@
"""
services/docx_export_service.py瘦身版
本独立服务不提供 Word 导出能力此处仅保留 report_generation_service
正文小节编号识别时懒加载依赖的 `_is_likely_section_number`以满足导入
"""
from __future__ import annotations
import re
def _is_likely_section_number(num: str) -> bool:
"""报告小节编号(如 2.1.1),非正文能耗数值(如 132.41)。"""
s = str(num or "").strip()
if not s or not re.fullmatch(r"\d+(?:\.\d+)*", s):
return False
parts = s.split(".")
if len(parts) > 4:
return False
for part in parts:
try:
n = int(part)
except ValueError:
return False
if n < 1 or n > 30:
return False
return True

80
services/kb_service.py Normal file
View File

@ -0,0 +1,80 @@
"""
services/kb_service.py瘦身版
仅保留报告生成附图提取所需的知识库文档磁盘路径解析助手
eval_report 的完整 kb_service.py 中抽取去除知识库 CRUD / 上传 / worker 等无关逻辑
"""
from __future__ import annotations
from pathlib import Path
from typing import List, Optional
from config import settings
from database.models import KbDocument as KbDocumentModel
def _normalize_rel_path(path: str) -> str:
"""'a\\b\\c' 规范为 'a/b/c',并去掉前导 '/'"""
s = str(path or "").replace("\\", "/").strip()
while s.startswith("./"):
s = s[2:]
return s.lstrip("/")
def _kb_doc_storage_rel_path(
    file_path_dir: Optional[str],
    basename: str,
    storage_rel_path: Optional[str] = None,
) -> str:
    """Relative storage path (file name included) under the project directory.

    A non-empty normalised *storage_rel_path* wins. Otherwise the directory
    and file name are joined; when only one of them is present, that one is
    returned on its own.
    """
    explicit = _normalize_rel_path(str(storage_rel_path or ""))
    if explicit:
        return explicit
    directory = _normalize_rel_path(str(file_path_dir or ""))
    file_name = str(basename or "").strip()
    if not (directory and file_name):
        return file_name or directory
    return f"{directory}/{file_name}"
def _kb_doc_path_candidates_for_model(doc_root: Path, doc: KbDocumentModel) -> List[Path]:
    """Candidate on-disk paths for a document, in priority order.

    Order: the stored relative path, then directory + name, then the bare
    name, all under ``doc_root / project_id``. When none applies, a single
    "_missing_" placeholder path is returned. Paths are resolved and
    de-duplicated by their string form, keeping the first occurrence.
    """
    rel = _kb_doc_storage_rel_path(
        doc.file_path,
        doc.name,
        getattr(doc, "storage_rel_path", None),
    )
    project_dir = doc_root / doc.project_id
    name = str(doc.name or "").strip()
    fp_dir = _normalize_rel_path(str(doc.file_path or ""))

    ordered: List[Path] = []
    if rel:
        ordered.append(project_dir / rel)
    if fp_dir and name:
        ordered.append(project_dir / fp_dir / name)
    if name:
        ordered.append(project_dir / name)
    if not ordered:
        ordered.append(project_dir / "_missing_")

    unique: dict[str, Path] = {}
    for raw in ordered:
        resolved = raw.resolve()
        unique.setdefault(str(resolved), resolved)
    return list(unique.values())
def _kb_doc_absolute_file_path_for_model(doc_root: Path, doc: KbDocumentModel) -> Path:
    """Return the first candidate path that is an existing file.

    Falls back to the highest-priority candidate when none exists on disk.
    """
    candidates = _kb_doc_path_candidates_for_model(doc_root, doc)
    existing = (candidate for candidate in candidates if candidate.is_file())
    return next(existing, candidates[0])
def _kb_doc_file_exists_for_model(doc: KbDocumentModel) -> bool:
    """Whether the document is present on disk under any candidate path.

    Several path layouts are tried so that older file_path/name combinations
    still resolve.
    """
    # NOTE(review): the settings attribute is spelled DOC_PAT here — confirm it
    # matches the configuration and is not meant to be DOC_PATH.
    root = Path(settings.DOC_PAT).resolve()
    for candidate in _kb_doc_path_candidates_for_model(root, doc):
        if candidate.is_file():
            return True
    return False

View File

@ -0,0 +1,43 @@
"""
services/project_service.py
报告生成所需的最小项目查询替代 eval_report 中重型的 write_service
仅提供按 uuid / 数字 id 查询项目并返回 WriteProject用于校验项目存在性与取项目名
"""
from __future__ import annotations
from fastapi import HTTPException
from sqlalchemy.orm import Session
from database.models import Project
from schemas.write import WriteProject
def get_project(project_id: str, db: Session) -> WriteProject:
    """Fetch a project by uuid or, failing that, by numeric id.

    The uuid lookup runs first. If it finds nothing and *project_id* parses
    as an integer, the primary key is tried.

    Columns that may be NULL in the database (document count, status, colour)
    are replaced by their column defaults, so a NULL no longer fails
    validation of the required response fields.

    Args:
        project_id: Project uuid or numeric id as text.
        db: Database session.

    Returns:
        The project as a WriteProject, with an empty ``documents`` list and
        ``kbProjectId`` set to None.

    Raises:
        HTTPException: 404 when no project matches.
    """
    project = None
    if project_id:
        project = db.query(Project).filter(Project.uuid == project_id).first()
    if not project:
        # Only the conversion is guarded, so query errors are not swallowed.
        try:
            pid = int(project_id)
        except (ValueError, TypeError):
            pid = None
        if pid is not None:
            project = db.query(Project).filter(Project.id == pid).first()
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")
    return WriteProject(
        id=str(project.id),
        uuid=project.uuid,
        name=project.name,
        description=project.description or "",
        createdAt=project.created_at.strftime("%Y-%m-%d") if project.created_at else "",
        updatedAt=project.updated_at.strftime("%Y-%m-%d") if project.updated_at else "",
        docCount=project.doc_count or 0,
        status=project.status or "active",
        kbProjectId=None,
        color=project.color or "#3b82f6",
        documents=[],
    )

View File

@ -0,0 +1,28 @@
from __future__ import annotations
import re
from pathlib import Path
from typing import Any
# The "prompts" directory that sits next to this file's parent directory.
PROMPT_ROOT = Path(__file__).resolve().parent.parent / "prompts"
# Matches {{ name }} placeholders (optional inner whitespace); group 1 is the name.
_TOKEN_RE = re.compile(r"{{\s*([A-Za-z_][A-Za-z0-9_]*)\s*}}")
def load_prompt_template(relative_path: str) -> str:
    """Read a prompt template located under the prompts directory.

    Raises:
        ValueError: When the resolved path falls outside the prompts directory.
    """
    target = (PROMPT_ROOT / relative_path).resolve()
    allowed_root = PROMPT_ROOT.resolve()
    if target.is_relative_to(allowed_root):
        return target.read_text(encoding="utf-8")
    raise ValueError(f"Invalid prompt path: {relative_path}")
def render_prompt_template(template: str, **context: Any) -> str:
    """Substitute every ``{{ name }}`` placeholder with its context value.

    Names missing from *context*, or mapped to None, render as an empty
    string; other values are converted with ``str``.
    """
    def _lookup(found: re.Match[str]) -> str:
        replacement = context.get(found.group(1), "")
        if replacement is None:
            return ""
        return str(replacement)

    return _TOKEN_RE.sub(_lookup, template)
def render_prompt(relative_path: str, **context: Any) -> str:
    """Load the template at *relative_path* and render it with *context*."""
    template_text = load_prompt_template(relative_path)
    return render_prompt_template(template_text, **context)

View File

@ -0,0 +1,292 @@
"""
services/reference_service.py
参考范文加载服务报告生成时按需加载对应章节参考范文
"""
from __future__ import annotations
import json
import logging
import re
from typing import Optional
from sqlalchemy.orm import Session
from database.models import ReportSectionReference
from services.llm_client import chat_completions_json
logger = logging.getLogger(__name__)
# System prompt for the desensitisation call: it asks the model to keep the
# structure of a reference report and replace concrete data with "xxx".
_DESENSITIZE_SYSTEM_PROMPT = """你是一个文档脱敏助手。你的任务是对后评价报告范文进行脱敏处理,只保留报告的结构骨架。
## 脱敏规则
### 必须保留的结构
1. Markdown 标题层级## 1.1、## 1.2、### 1.2.1 等)
2. 表格的表头行分隔行|--|--|
3. 段落/章节的组织顺序和逻辑关系
4. 文字的叙述逻辑先写什么再写什么
5. 表格的行数列数表头字段名"序号""项目名称""可研报告""实际值"
### 必须替换为 xxx 的内容
1. 所有具体数字金额年份百分比数量面积产能投资额等
2. 项目名称公司名称单位名称等专有名词书名号/引号内的内容
3. 表格中的数据单元格内容保留表头
4. 具体的日期时间节点
5. 财务指标的具体数值IRRNPV回收期等
### 特别注意
- 不要随意增删段落或改变段落顺序
- 不要删除整个表格只替换表格中的数据单元格
- 保持原 Markdown 格式不变
- "待补充""详见附表" 固定用语 不脱敏
- 书名号中的内容如果是不知名的规范/标准名称石油化工标准保留书名号但内容替换为 xxx"""
# User prompt for the desensitisation call; `{content}` is filled through
# str.format and sits inside a fenced block.
_DESENSITIZE_USER_PROMPT_TEMPLATE = """请对以下后评价报告章节进行脱敏处理,只保留结构骨架,所有具体数据替换为 xxx
```
{content}
```
请严格按照脱敏规则处理直接输出脱敏后的完整 Markdown 内容不要输出任何额外说明"""
def _desensitize_via_llm(content: str) -> str:
    """
    Ask the LLM to desensitise a reference text.

    At most the first 12000 characters are sent. On success the reply is
    returned with any wrapping code fence removed. If the call fails or the
    reply is empty, the original content is returned unchanged — skipping
    desensitisation is preferred over refusing service.
    """
    if not content or not content.strip():
        return content
    user_prompt = _DESENSITIZE_USER_PROMPT_TEMPLATE.format(content=content[:12000])
    logger.info("参考范文脱敏 start | content_len=%s", len(content))
    try:
        reply = chat_completions_json(
            system_prompt=_DESENSITIZE_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.0,
            max_tokens=16384,
            timeout_sec=120,
        )
        text = reply.get("content") or ""
        if isinstance(text, str) and text.strip():
            # Strip a possible ```markdown / ``` wrapper around the reply.
            without_opening = re.sub(r"^```(?:markdown)?\s*", "", text.strip(), flags=re.IGNORECASE)
            unfenced = re.sub(r"\s*```$", "", without_opening)
            logger.info("参考范文脱敏 done | original_len=%s | desensitized_len=%s", len(content), len(unfenced))
            return unfenced.strip()
    except Exception as e:
        logger.warning("LLM 脱敏失败,退回原文: %s", e)
    return content
def load_section_reference(
    db: Session,
    section_key: str,
    source_file: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the desensitized sample text stored for one section key.

    Args:
        db: Database session.
        section_key: Section identifier such as "1.1" or "2.1.1".
        source_file: Optional source file name; when omitted the most recently
            updated row for the section is used.
        max_chars: Maximum number of characters kept before a truncation
            notice is appended.

    Returns:
        The sample Markdown after LLM desensitization, or an empty string when
        no row (or no non-blank content) is found.
    """
    matches = db.query(ReportSectionReference).filter(
        ReportSectionReference.section_key == section_key
    )
    if source_file:
        matches = matches.filter(ReportSectionReference.source_file == source_file)

    newest = matches.order_by(ReportSectionReference.updated_at.desc()).first()
    stored = newest.content if newest else None
    if not stored:
        return ""

    text = stored.strip()
    if not text:
        return ""

    text = _desensitize_via_llm(text)
    if len(text) <= max_chars:
        return text

    logger.info("参考范文 %s 超出 %d 字符限制,已截断", section_key, max_chars)
    return text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
def load_section_reference_by_title(
    db: Session,
    section_title: str,
    source_file: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load a desensitized sample section by matching on the section title.

    Fallback for callers that have no exact ``section_key``. Two lookups are
    tried in order, each returning the most recently updated row:

    1. the leading dotted number of the title (e.g. "2.1.1") as ``section_key``;
    2. a substring match of the first 20 characters of the title against
       ``section_title``.

    The first lookup that yields non-blank content wins; that content is passed
    through ``_desensitize_via_llm`` and truncated to ``max_chars`` (a
    truncation notice is appended when it was cut).

    Args:
        db: Database session.
        section_title: Section title, optionally prefixed with its number.
        source_file: Optional source file name used to narrow both lookups.
        max_chars: Maximum number of characters kept before the notice.

    Returns:
        The desensitized Markdown, or an empty string when the title is blank
        or nothing matches.
    """
    title = (section_title or "").strip()
    if not title:
        # A blank needle would turn the substring match into LIKE '%%', which
        # matches every row and would return an unrelated reference.
        return ""

    refs = db.query(ReportSectionReference)
    if source_file:
        refs = refs.filter(ReportSectionReference.source_file == source_file)

    criteria = []
    numbered = re.match(r"(\d+(?:\.\d+)*)", title)
    if numbered:
        criteria.append(ReportSectionReference.section_key == numbered.group(1))
    # autoescape keeps '%' and '_' inside the title from acting as LIKE wildcards.
    criteria.append(
        ReportSectionReference.section_title.contains(title[:20], autoescape=True)
    )

    for criterion in criteria:
        ref = (
            refs.filter(criterion)
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        content = (ref.content or "").strip() if ref else ""
        if not content:
            # Missing or blank row: try the next, looser lookup.
            continue
        content = _desensitize_via_llm(content)
        if len(content) > max_chars:
            content = content[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return content
    return ""
def load_section_reference_raw(
    db: Session,
    section_key: str,
    template_id: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the stored sample text for one section key without LLM desensitization.

    Unlike ``load_section_reference`` this returns the database content as is;
    only the length guard is applied.

    Args:
        db: Database session.
        section_key: Section identifier such as "1.1".
        template_id: Optional template ID; when given, only rows linked to that
            template are considered, otherwise the most recently updated row
            for the section is used.
        max_chars: Maximum number of characters kept before a truncation
            notice is appended.

    Returns:
        The stored Markdown (possibly truncated), or an empty string when no
        row or no non-blank content exists.
    """
    matches = db.query(ReportSectionReference).filter(
        ReportSectionReference.section_key == section_key
    )
    if template_id:
        matches = matches.filter(ReportSectionReference.template_id == template_id)

    newest = matches.order_by(ReportSectionReference.updated_at.desc()).first()
    stored = newest.content if newest else None
    if not stored:
        return ""

    text = stored.strip()
    if not text:
        return ""
    if len(text) <= max_chars:
        return text

    logger.info("参考范文 %s 超出 %d 字符限制,已截断", section_key, max_chars)
    return text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
def load_section_reference_raw_by_title(
    db: Session,
    section_title: str,
    template_id: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the stored (non-desensitized) sample section by matching on its title.

    Fallback for when an exact ``section_key`` lookup missed. Two lookups are
    tried in order, each returning the most recently updated row:

    1. the leading dotted number of the title (e.g. "2.1.1") as ``section_key``;
    2. a substring match of the first 20 characters of the title against
       ``section_title``.

    The first lookup that yields non-blank content wins; the content is only
    truncated to ``max_chars`` (with a notice appended), never sent to the LLM.

    Args:
        db: Database session.
        section_title: Section title, optionally prefixed with its number.
        template_id: Optional template ID used to narrow both lookups.
        max_chars: Maximum number of characters kept before the notice.

    Returns:
        The stored Markdown, or an empty string when the title is blank or
        nothing matches.
    """
    title = (section_title or "").strip()
    if not title:
        # A blank needle would turn the substring match into LIKE '%%', which
        # matches every row and would return an unrelated reference.
        return ""

    refs = db.query(ReportSectionReference)
    if template_id:
        refs = refs.filter(ReportSectionReference.template_id == template_id)

    criteria = []
    numbered = re.match(r"(\d+(?:\.\d+)*)", title)
    if numbered:
        criteria.append(ReportSectionReference.section_key == numbered.group(1))
    # autoescape keeps '%' and '_' inside the title from acting as LIKE wildcards.
    criteria.append(
        ReportSectionReference.section_title.contains(title[:20], autoescape=True)
    )

    for criterion in criteria:
        ref = (
            refs.filter(criterion)
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        content = (ref.content or "").strip() if ref else ""
        if not content:
            # Missing or blank row: try the next, looser lookup.
            continue
        if len(content) > max_chars:
            content = content[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return content
    return ""
def list_available_source_files(db: Session) -> list[str]:
    """Return the distinct, non-empty source file names of stored sample sections, sorted by name."""
    source_column = ReportSectionReference.source_file
    rows = db.query(source_column).distinct().order_by(source_column).all()

    names: list[str] = []
    for row in rows:
        name = row[0]
        if name:  # skip NULL / empty file names
            names.append(name)
    return names

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,135 @@
from __future__ import annotations
from services.prompt_template_service import render_prompt
from prompts.report_generation.prompt_defaults import (
DEFAULT_SECTION_PROMPT_FALLBACK,
DEFAULT_SELECTED_EXAMPLE_FALLBACK,
)
def chapter_generation_system_prompt() -> str:
    """Render the system prompt template used for chapter generation."""
    template_path = "report_generation/chapter_generation_system.md"
    return render_prompt(template_path)
def repair_missing_tables_system_prompt() -> str:
    """Render the system prompt template used when repairing missing tables."""
    template_path = "report_generation/repair_missing_tables_system.md"
    return render_prompt(template_path)
def table_format_repair_system_prompt() -> str:
    """Render the system prompt template used when repairing table formatting."""
    template_path = "report_generation/table_format_repair_system.md"
    return render_prompt(template_path)
def _build_prior_sibling_sections_prompt_block(prior_sibling_sections_text: str) -> str:
body = str(prior_sibling_sections_text or "").strip()
if not body:
return ""
return (
"【同章前序小节正文(时间与金额须保持一致)】\n"
f"{body}\n\n"
"【同章一致性约束】\n"
"1. 竣工时间、开工/中交/投产/验收等关键里程碑日期,以及建设投资、总投资、营业收入、利润等各类金额数字,"
"须与本章前序小节已写明的口径完全一致(年月日表述可适度简化,但不得出现另一套矛盾日期或金额);\n"
"2. 若【证据包】或【字段级已抽取结果】中某日期/金额与前序小节矛盾,以前序小节为准写入本节,"
"不得在正文中另写一套矛盾数值;\n"
"3. 前序小节为「待补充」的字段,本节仍写「待补充」,不得自行编造;\n"
"4. 可补充本节新增信息,但不得改写或否定前序小节已确立的时间与金额。"
)
def _build_prior_chapters_prompt_block(prior_chapters_text: str) -> str:
body = str(prior_chapters_text or "").strip()
if not body:
return ""
return (
"【前序章节正文第16章本章须据此总结\n"
f"{body}\n\n"
"【前序章节使用约束】\n"
"1. 第7章各节是对第16章已生成正文的归纳、提炼与升华不得与前面章节结论矛盾\n"
"2. 可概括前文要点,禁止大段照搬;数据与结论须与前文一致;\n"
"3. 若前序章节某处为「待补充」,本节对应表述也应为「待补充」,不得编造;\n"
"4. 须由要素管理直出的表格如表7-1仍按【章节输出结构约束】执行不受本条限制。"
)
def _build_section_reference_block(section_reference: str) -> str:
body = str(section_reference or "").strip()
if not body:
return ""
return (
"【本章参考范文(本节写作蓝本:结构与行文风格须高度贴合;禁止复用数据、禁止照抄)】\n"
f"{body}\n\n"
"【参考范文使用约束】\n"
"1. 以范文为写作蓝本:段落数量与顺序、每段主题、论述逻辑、句式笔法与篇幅颗粒度均须与范文高度一致,做到逐段对应、同一笔法;\n"
"2. 严禁复用范文中的项目名称、时间、金额、指标值等任何事实数据,须全部替换为当前项目证据包的真实值;\n"
"3. 范文中的表格结构(表头、列顺序、行项)须沿用,但表内数据必须替换为当前项目证据包的值;\n"
"4. 禁止逐字照抄:不得出现与范文连续相同超过 15 字的文字,须改写措辞做到“形似而文不同”;\n"
"5. 若范文与证据包存在矛盾,以证据包为准。"
)
def build_report_chapter_prompt(
    *,
    section_title: str,
    section_prompt: str,
    required_tables_text: str,
    structured_tables_text: str,
    canonical_fields_text: str,
    selected_example: str,
    heading_rule: str,
    section_contract: str,
    evidence_json: str,
    prior_sibling_sections_text: str = "",
    prior_chapters_text: str = "",
    section_reference: str = "",
) -> str:
    """Render the user prompt for generating one report section.

    Empty ``section_prompt`` / ``selected_example`` values are replaced by the
    module defaults and an empty ``required_tables_text`` becomes "". The three
    optional texts are converted into their framed prompt blocks (each block is
    empty when its text is blank) before the template is rendered.
    """
    template_values = {
        "section_title": section_title,
        "section_prompt": section_prompt or DEFAULT_SECTION_PROMPT_FALLBACK,
        "required_tables_text": required_tables_text or "",
        "structured_tables_text": structured_tables_text,
        "canonical_fields_text": canonical_fields_text,
        "selected_example": selected_example or DEFAULT_SELECTED_EXAMPLE_FALLBACK,
        "heading_rule": heading_rule,
        "section_contract": section_contract,
        "evidence_json": evidence_json,
        "prior_sibling_sections_block": _build_prior_sibling_sections_prompt_block(
            prior_sibling_sections_text
        ),
        "prior_chapters_block": _build_prior_chapters_prompt_block(prior_chapters_text),
        "section_reference_block": _build_section_reference_block(section_reference),
    }
    return render_prompt(
        "report_generation/chapter_generation_user_ref_aligned.md",
        **template_values,
    )
def build_repair_missing_tables_prompt(
    *,
    section_title: str,
    original_prompt: str,
    content: str,
    missing_tables: list[str],
    evidence_json: str,
) -> str:
    """Render the user prompt asking the model to add tables missing from a section.

    The table names are joined with ", "; the original prompt is capped at 8000
    characters and the evidence JSON at 12000 characters before rendering.
    """
    template_values = {
        "section_title": section_title,
        "missing_tables": ", ".join(missing_tables),
        "content": content,
        "original_prompt": original_prompt[:8000],
        "evidence_json": evidence_json[:12000],
    }
    return render_prompt(
        "report_generation/repair_missing_tables_user.md",
        **template_values,
    )
def build_table_format_repair_prompt(
    *,
    section_title: str,
    table_specs_json: str,
    content: str,
    evidence_json: str,
) -> str:
    """Render the user prompt asking the model to repair table formatting.

    The evidence JSON is capped at 12000 characters before rendering; the other
    values are passed through unchanged.
    """
    template_values = {
        "section_title": section_title,
        "table_specs_json": table_specs_json,
        "content": content,
        "evidence_json": evidence_json[:12000],
    }
    return render_prompt(
        "report_generation/table_format_repair_user.md",
        **template_values,
    )

View File

@ -0,0 +1,145 @@
from __future__ import annotations
from copy import deepcopy
from datetime import datetime
import threading
from typing import Any, Optional
# Re-entrant lock taken by every function in this module before touching _JOB_STATES.
_RUNTIME_LOCK = threading.RLock()
# In-memory registry of job state dicts, keyed by job id.
_JOB_STATES: dict[str, dict[str, Any]] = {}
def _now_str() -> str:
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def _chapter_payload(
    *,
    section_key: str,
    section_title: str,
    section_order: int,
    status: str = "pending",
) -> dict[str, Any]:
    """Create the initial state dict for one chapter.

    Content, error message, prompt text and the evidence/validation payloads
    start as ``None``; ``updatedAt`` is set to the current time.
    """
    return dict(
        sectionKey=section_key,
        sectionTitle=section_title,
        sectionOrder=section_order,
        status=status,
        content=None,
        errorMessage=None,
        updatedAt=_now_str(),
        promptText=None,
        evidencePayload=None,
        validationPayload=None,
    )
def init_job_state(
    *,
    job_id: str,
    project_id: str,
    template_id: Optional[str],
    chapters: list[dict[str, Any]],
) -> None:
    """Register a fresh "pending" state for a job, replacing any existing one.

    Each item of ``chapters`` must provide ``sectionKey``, ``sectionTitle`` and
    ``sectionOrder``; an optional ``status`` defaults to "pending". Chapters are
    stored under their stringified section key.
    """
    with _RUNTIME_LOCK:
        job_state: dict[str, Any] = {
            "jobId": job_id,
            "projectId": project_id,
            "templateId": template_id,
            "status": "pending",
            "progress": 0,
            "currentSectionKey": None,
            "errorMessage": None,
            "createdAt": _now_str(),
            "updatedAt": _now_str(),
            "completedAt": None,
            "chapters": {},
        }
        chapter_states = job_state["chapters"]
        for item in chapters or []:
            chapter_states[str(item["sectionKey"])] = _chapter_payload(
                section_key=str(item["sectionKey"]),
                section_title=str(item["sectionTitle"]),
                section_order=int(item["sectionOrder"]),
                status=str(item.get("status") or "pending"),
            )
        # Publish only after every chapter was built, so a malformed item
        # leaves the registry untouched.
        _JOB_STATES[job_id] = job_state
def get_job_state(job_id: str) -> Optional[dict[str, Any]]:
    """Return a deep copy of the job's state, or ``None`` when the job is unknown."""
    with _RUNTIME_LOCK:
        stored = _JOB_STATES.get(job_id)
        if not stored:
            return None
        # Copy under the lock so callers never share live, mutable state.
        return deepcopy(stored)
def update_job_state(job_id: str, **fields: Any) -> None:
    """Merge ``fields`` into the job's state and refresh ``updatedAt``; unknown jobs are ignored."""
    with _RUNTIME_LOCK:
        job_state = _JOB_STATES.get(job_id)
        if job_state:
            job_state.update(fields)
            job_state["updatedAt"] = _now_str()
def update_chapter_state(
    job_id: str,
    section_key: str,
    **fields: Any,
) -> None:
    """Merge ``fields`` into one chapter's state and refresh both timestamps.

    Does nothing when the job or the chapter is unknown.
    """
    with _RUNTIME_LOCK:
        job_state = _JOB_STATES.get(job_id)
        chapter_state = (
            job_state.get("chapters", {}).get(section_key) if job_state else None
        )
        if chapter_state:
            chapter_state.update(fields)
            chapter_state["updatedAt"] = _now_str()
            job_state["updatedAt"] = _now_str()
def append_chapter_content(
    job_id: str,
    section_key: str,
    delta_text: str,
    *,
    stream_phase: str,
) -> None:
    """Append streamed text to a chapter and record the current stream phase.

    An empty ``delta_text`` is a no-op, as is an unknown job or chapter.
    Otherwise the text is appended to the chapter content, ``streamPhase`` is
    written into a copy of the chapter's validation payload, the chapter becomes
    the job's current section and both timestamps are refreshed.
    """
    if not delta_text:
        return
    with _RUNTIME_LOCK:
        job_state = _JOB_STATES.get(job_id)
        chapter_state = (
            job_state.get("chapters", {}).get(section_key) if job_state else None
        )
        if chapter_state:
            existing_text = str(chapter_state.get("content") or "")
            # Work on a copy so a previously stored payload object is not mutated.
            phase_payload = dict(chapter_state.get("validationPayload") or {})
            phase_payload["streamPhase"] = stream_phase
            chapter_state["content"] = existing_text + delta_text
            chapter_state["validationPayload"] = phase_payload
            chapter_state["updatedAt"] = _now_str()
            job_state["currentSectionKey"] = section_key
            job_state["updatedAt"] = _now_str()
def set_chapter_stream_phase(job_id: str, section_key: str, stream_phase: str) -> None:
    """Record the stream phase of a chapter without changing its content.

    Writes ``streamPhase`` into a copy of the chapter's validation payload,
    marks the chapter as the job's current section and refreshes both
    timestamps. Does nothing when the job or the chapter is unknown.
    """
    with _RUNTIME_LOCK:
        job_state = _JOB_STATES.get(job_id)
        chapter_state = (
            job_state.get("chapters", {}).get(section_key) if job_state else None
        )
        if chapter_state:
            # Work on a copy so a previously stored payload object is not mutated.
            phase_payload = dict(chapter_state.get("validationPayload") or {})
            phase_payload["streamPhase"] = stream_phase
            chapter_state["validationPayload"] = phase_payload
            chapter_state["updatedAt"] = _now_str()
            job_state["currentSectionKey"] = section_key
            job_state["updatedAt"] = _now_str()
def remove_job_state(job_id: str) -> None:
    """Drop the job's state from the registry; unknown jobs are ignored."""
    with _RUNTIME_LOCK:
        if job_id in _JOB_STATES:
            del _JOB_STATES[job_id]

View File

@ -0,0 +1,324 @@
"""
services/retrieval_service.py
后评价报告材料检索服务
用于从向量库中检索与后评价报告相关的材料
"""
from typing import List, Dict, Any, Optional
from langchain_core.documents import Document
from function.vector_store import VectorStore
class RetrievalService:
    """Retrieve post-evaluation report material from the vector store."""

    # Search keywords per category, used by ``search_by_category``.
    _CATEGORY_KEYWORDS = {
        "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"],
        "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"],
        "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"],
        "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"],
        "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"],
        "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"],
    }

    # Search keywords per chapter, used by ``get_chapter_materials``: every
    # category entry above plus two chapter-specific keyword lists.
    _CHAPTER_KEYWORDS = {
        **_CATEGORY_KEYWORDS,
        "项目全过程总结与管理评价": [
            # ---- High priority: tables 1-14 and numbered subsections ----
            "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6",
            "表1原料数量及组成对比表", "表2原料性质对比表",
            "表3前期预测和2019年实际产品对比表",
            "表4装置规模及实际运行负荷对比表",
            "表5项目规模对比表",
            "表6可研报告与基础设计阶段工程内容对比表",
            "表7项目承包商的招投标情况表",
            "表8项目设计主要进度控制情况表",
            "表9施工图设计变更情况表",
            "表10重大设计变更情况表",
            "表11主要设备采购情况表",
            "表12施工重要节点进度表",
            "表13原料性质对比表",
            "表14主要标定结果与设计指标对比表",
            # ---- Lower priority: structural keywords ----
            "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价",
            "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备",
            "工程监理", "HSE", "竣工验收",
            "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果",
            "原料数量及组成对比", "装置规模", "负荷率",
        ],
        "项目目标和可持续性评价": [
            # High priority: chapter titles and numbers
            "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5",
            "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价",
            # Goal achievement (engineering / technical / economic)
            "工程规模", "项目进度", "工程质量", "项目功能", "投资控制",
            "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核",
            "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润",
            # Benchmarking
            "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃",
            # Sustainability (resources / products / internal factors / policy)
            "资源分析", "原料供应", "资源保障",
            "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB",
            "项目内部因素", "装置规模合理性", "工艺方案", "技术水平",
            "国家政策", "产业政策", "质量标准",
            # For material that backs sustainability with safety / environmental compliance
            "个人风险", "社会风险", "可接受", "风险曲线",
            "非甲烷总烃", "无组织排放", "mg/m3", "标准值",
        ],
    }

    def __init__(self, collection_name: str = "eval_report"):
        """Open the vector store collection without dropping existing data.

        Args:
            collection_name: Name of the vector store collection.
        """
        self.collection_name = collection_name
        self.vector_store = VectorStore(collection_name=collection_name, drop_old=False)

    @staticmethod
    def _attach_score(doc: Any, score: Any) -> None:
        """Store the similarity score in ``doc.metadata["score"]`` as a float.

        Scores that cannot be converted to ``float`` are left out, so the
        result builders fall back to their 0.0 default.
        """
        try:
            doc.metadata["score"] = float(score)
        except (TypeError, ValueError):
            pass

    def search_by_query(self, query: str, top_k: int = 10, filter_project: Optional[str] = None) -> List[Document]:
        """Run one similarity search and optionally keep a single project's hits.

        Args:
            query: Query text, e.g. "项目背景", "财务评价", "技术方案".
            top_k: Number of hits requested from the vector store.
            filter_project: Optional project UUID. When given it is appended to
                the query text and hits whose ``project_uuid`` metadata differs
                are dropped, so fewer than ``top_k`` documents may come back.

        Returns:
            Matching documents in store order; each carries its similarity
            score in ``metadata["score"]``.
        """
        full_query = f"{query} 项目 UUID:{filter_project}" if filter_project else query
        hits = self.vector_store.similarity_search_with_score(full_query, k=top_k)

        docs = []
        for doc, score in hits:
            if filter_project and doc.metadata.get("project_uuid") != filter_project:
                continue
            # Keep the score: the dict results built elsewhere read metadata["score"].
            self._attach_score(doc, score)
            docs.append(doc)
        return docs

    def _search_keywords(self, keywords: List[str], project_uuid: str, top_k: int) -> List[Dict[str, Any]]:
        """Search each keyword (5 hits each), de-duplicate, and return up to ``top_k`` dicts.

        Documents are de-duplicated on (first 100 characters of content,
        heading), keeping the first occurrence in keyword order.
        """
        seen = set()
        unique_docs = []
        for keyword in keywords:
            for doc in self.search_by_query(keyword, top_k=5, filter_project=project_uuid):
                key = (doc.page_content[:100], doc.metadata.get("heading", ""))
                if key not in seen:
                    seen.add(key)
                    unique_docs.append(doc)
            # Only the first top_k unique documents are returned, so further
            # keyword searches cannot change the result once we have them.
            if 0 < top_k <= len(unique_docs):
                break

        return [
            {
                "content": doc.page_content,
                "heading": doc.metadata.get("heading", ""),
                "heading_level": doc.metadata.get("heading_level", 0),
                "doc_id": doc.metadata.get("doc_id", ""),
                "path": doc.metadata.get("path", ""),
                "score": doc.metadata.get("score", 0.0),
            }
            for doc in unique_docs[:top_k]
        ]

    def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve material for a category by searching its keyword list.

        Args:
            category: Category name such as "项目概况", "技术方案", "财务评价" or
                "效益分析". An unknown category is searched by its own name.
            project_uuid: Project UUID used to filter hits.
            top_k: Maximum number of results.

        Returns:
            Result dicts with content, heading, heading_level, doc_id, path and score.
        """
        keywords = self._CATEGORY_KEYWORDS.get(category, [category])
        return self._search_keywords(keywords, project_uuid, top_k)

    def get_project_materials(self, project_uuid: str) -> Dict[str, Any]:
        """Collect the text of the top 5 hits for four fixed project topics.

        Args:
            project_uuid: Project UUID used to filter hits.

        Returns:
            Dict with the keys ``basic_info``, ``tech_info``, ``finance_info``
            and ``benefit_info``, each a list of document texts.
        """
        topic_queries = {
            "basic_info": "项目概况 项目基本情况",
            "tech_info": "技术方案 工艺技术",
            "finance_info": "财务评价 经济效益",
            "benefit_info": "效益分析 社会效益",
        }
        return {
            topic: [
                doc.page_content
                for doc in self.search_by_query(text, top_k=5, filter_project=project_uuid)
            ]
            for topic, text in topic_queries.items()
        }

    def search_similar_report(self, reference_content: str, top_k: int = 5) -> List[Document]:
        """Retrieve report material using a fixed generic query.

        NOTE(review): ``reference_content`` is currently not used to build the
        query, so every call searches the same fixed text - confirm whether key
        information from the reference should be extracted into the query.

        Args:
            reference_content: Reference report content (currently unused).
            top_k: Number of hits requested from the vector store.

        Returns:
            The hit documents, each carrying its score in ``metadata["score"]``.
        """
        query = "后评价报告 项目概况 技术方案 财务评价"
        hits = self.vector_store.similarity_search_with_score(query, k=top_k)

        docs = []
        for doc, score in hits:
            self._attach_score(doc, score)
            docs.append(doc)
        return docs

    def get_template_data(self, project_uuid: str, query: str = "项目概况 技术方案 财务评价", top_k: int = 15) -> Dict[str, Any]:
        """Retrieve material and map it onto the report template fields.

        Args:
            project_uuid: Project UUID used to filter hits.
            query: Query text.
            top_k: Number of hits requested.

        Returns:
            Dict with ``materials`` (documents), ``materials_text`` (their
            texts), ``template_data`` and ``key_info``. All four keys are
            present even when nothing was found.
        """
        # Imported at call time, as in the original code.
        from report_template import ReportTemplate

        materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid)
        if not materials:
            return {
                "materials": [],
                "materials_text": [],
                "template_data": {},
                "key_info": {},
            }

        texts = [doc.page_content for doc in materials]
        # Each consumer gets its own list, in case a helper mutates its argument.
        key_info = ReportTemplate.extract_key_info(list(texts))
        template_data = ReportTemplate.map_materials_to_template(list(texts))

        return {
            "materials": list(materials),
            "materials_text": texts,
            "template_data": template_data,
            "key_info": key_info,
        }

    def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve material for a report chapter by searching its keyword list.

        Args:
            project_uuid: Project UUID used to filter hits.
            chapter: Chapter name. An unknown chapter is searched by its own name.
            top_k: Maximum number of results.

        Returns:
            Result dicts with content, heading, heading_level, doc_id, path and score.
        """
        keywords = self._CHAPTER_KEYWORDS.get(chapter, [chapter])
        return self._search_keywords(keywords, project_uuid, top_k)
# Retrieval usage examples
if __name__ == "__main__":
    # Create the retrieval service instance.
    service = RetrievalService()
    # Each example is a banner line plus the query to run (top 3 hits are printed).
    examples = (
        ("示例 1搜索项目背景", "项目背景 建设内容"),
        ("示例 2搜索财务评价", "财务评价 现金流量"),
    )
    for banner, example_query in examples:
        print(banner)
        docs = service.search_by_query(example_query, top_k=3)
        for doc in docs:
            print(f"标题:{doc.metadata.get('heading', 'N/A')}")
            print(f"内容:{doc.page_content[:200]}...\n")

File diff suppressed because it is too large Load Diff