Merge origin/main — keep local version
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
commit
bf3d340aa8
464
database/init.sql
Normal file
464
database/init.sql
Normal file
@ -0,0 +1,464 @@
|
|||||||
|
-- 智能报告生成平台 - 数据库初始化脚本
|
||||||
|
-- 数据库名建议:post_eval_report
|
||||||
|
-- 适用于 MySQL
|
||||||
|
|
||||||
|
-- 创建数据库(可选)
|
||||||
|
-- CREATE DATABASE IF NOT EXISTS post_eval_report DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
||||||
|
-- USE post_eval_report;
|
||||||
|
|
||||||
|
-- Projects (unified: knowledge base + report writing).
-- uuid is generated by the application layer, because MySQL 8/9 restrictions
-- on functions in generated columns made initialisation fail.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS projects (
    id INT AUTO_INCREMENT PRIMARY KEY,
    uuid VARCHAR(32) NOT NULL UNIQUE,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    doc_count INT DEFAULT 0,
    eval_reports_count INT DEFAULT 0,
    total_size VARCHAR(32) DEFAULT '0 B',
    tags TEXT,
    status VARCHAR(16) DEFAULT 'active',
    color VARCHAR(16) DEFAULT '#3b82f6',
    sync_suppressed_table_names LONGTEXT NULL,
    INDEX idx_projects_created_at (created_at),
    INDEX idx_projects_updated_at (updated_at),
    INDEX idx_projects_status (status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Knowledge-base directories: project_id references projects.uuid;
-- parent_id forms the directory tree.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS kb_directories (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    parent_id VARCHAR(64) NULL,
    name VARCHAR(255) NOT NULL,
    full_path VARCHAR(1024) NOT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_kb_dirs_project (project_id),
    INDEX idx_kb_dirs_parent (parent_id),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (parent_id) REFERENCES kb_directories(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Knowledge-base documents
-- (status: 0 = failed, 2 = queued, 3 = processing, 4 = available).
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS kb_documents (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    directory_id VARCHAR(64) NULL,
    name VARCHAR(255) NOT NULL,
    size VARCHAR(32) NOT NULL,
    file_path VARCHAR(512),
    storage_rel_path VARCHAR(512) NULL COMMENT '项目内完整相对路径(含文件名)',
    word_count INT DEFAULT 0,
    uploaded_at DATETIME NOT NULL,
    status INT DEFAULT 2,
    error_message TEXT NULL,
    factor JSON NULL COMMENT '文档要素 JSON 数组',
    category VARCHAR(32) NULL DEFAULT NULL COMMENT '文件分类',
    INDEX idx_kb_docs_project (project_id),
    INDEX idx_kb_docs_directory (directory_id),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (directory_id) REFERENCES kb_directories(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- 若已有 kb_documents 表,执行以下语句添加 word_count 字段:
|
||||||
|
-- ALTER TABLE kb_documents ADD COLUMN word_count INT DEFAULT 0 AFTER file_path;
|
||||||
|
|
||||||
|
-- Authored documents (project_id references projects.uuid, same as kb_documents).
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS write_documents (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    title VARCHAR(255) NOT NULL,
    content LONGTEXT,
    word_count INT DEFAULT 0,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    status VARCHAR(16) DEFAULT 'draft',
    sort_order INT DEFAULT 0,
    INDEX idx_write_docs_project (project_id),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Document versions.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS doc_versions (
    id VARCHAR(64) PRIMARY KEY,
    document_id VARCHAR(64) NOT NULL,
    version VARCHAR(32) NOT NULL,
    content LONGTEXT NOT NULL,
    citation_payload LONGTEXT NULL,
    saved_at DATETIME NOT NULL,
    author VARCHAR(64) NOT NULL,
    note TEXT,
    INDEX idx_versions_doc (document_id),
    FOREIGN KEY (document_id) REFERENCES write_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Element table definitions (global / time-based).
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS element_tables (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    table_type VARCHAR(32) NOT NULL,
    table_name VARCHAR(255) NOT NULL,
    year INT NULL,
    is_time_dimension TINYINT(1) DEFAULT 0,
    sort_order INT DEFAULT 0,
    sync_suppressed_row_keys LONGTEXT NULL,
    custom_row_order LONGTEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_element_tables_project (project_id),
    INDEX idx_element_tables_type_year (table_type, year),
    INDEX idx_element_tables_name (table_name),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Element cells.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS element_cells (
    id VARCHAR(64) PRIMARY KEY,
    table_id VARCHAR(64) NOT NULL,
    project_id VARCHAR(32) NOT NULL,
    row_key VARCHAR(255) NOT NULL,
    col_key VARCHAR(255) NULL,
    year INT NULL,
    value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_line_no INT NULL,
    source_line_end INT NULL,
    source_quote TEXT NULL,
    confidence FLOAT NULL,
    extraction_batch_id VARCHAR(64) NULL,
    extraction_model VARCHAR(128) NULL,
    source_type VARCHAR(16) NULL COMMENT 'extract=文档抽取, manual=手工输入',
    conflict_status VARCHAR(16) DEFAULT 'none',
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_element_cells_project (project_id),
    INDEX idx_element_cells_row_col (row_key, col_key),
    INDEX idx_element_cells_year (year),
    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE CASCADE,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Retained extraction results (table / element).
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS extraction_results (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    document_id VARCHAR(64) NOT NULL,
    batch_id VARCHAR(64) NOT NULL,
    result_type VARCHAR(16) NOT NULL,
    table_type VARCHAR(32) NULL,
    table_name VARCHAR(255) NULL,
    year INT NULL,
    item_key VARCHAR(255) NOT NULL,
    item_value LONGTEXT NULL,
    source_line_no INT NULL,
    source_line_end INT NULL,
    confidence FLOAT NULL,
    raw_payload JSON NULL,
    extracted_at DATETIME NULL,
    created_at DATETIME NOT NULL,
    INDEX idx_extraction_project_doc (project_id, document_id),
    INDEX idx_extraction_batch (batch_id),
    INDEX idx_extraction_table_name (table_name),
    INDEX idx_extraction_key (item_key),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Element extraction result details
-- (for the "rule chapter/section prompt -> project material" flow).
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS element_extraction_results (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    table_type VARCHAR(32) NOT NULL,
    year INT NULL,
    table_name VARCHAR(255) NOT NULL,
    extracted_at DATETIME NOT NULL,
    item_key VARCHAR(255) NOT NULL,
    item_value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_line_no INT NULL,
    source_line_end INT NULL,
    created_at DATETIME NOT NULL,
    INDEX idx_el_ext_project (project_id),
    INDEX idx_el_ext_table (table_type, year, table_name),
    INDEX idx_el_ext_key (item_key),
    INDEX idx_el_ext_source_doc (source_document_id),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Conflict records.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS element_conflicts (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    table_id VARCHAR(64) NULL,
    cell_id VARCHAR(64) NULL,
    item_key VARCHAR(255) NOT NULL,
    old_value LONGTEXT NULL,
    new_value LONGTEXT NULL,
    selected_value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_line_no INT NULL,
    status VARCHAR(16) DEFAULT 'pending',
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_element_conflicts_project (project_id),
    INDEX idx_element_conflicts_status (status),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
    FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Persisted Markdown for documents.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS document_markdowns (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    document_id VARCHAR(64) NOT NULL,
    extracted_filename VARCHAR(255) NULL,
    markdown_content LONGTEXT NOT NULL,
    content_hash VARCHAR(64) NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_markdowns_project_doc (project_id, document_id),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Document paragraph chunks.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS document_chunks (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    document_id VARCHAR(64) NOT NULL,
    markdown_id VARCHAR(64) NULL,
    heading VARCHAR(512) NULL,
    chunk_text LONGTEXT NOT NULL,
    chunk_index INT DEFAULT 0,
    source_line_start INT NULL,
    source_line_end INT NULL,
    vector_id VARCHAR(128) NULL,
    created_at DATETIME NOT NULL,
    INDEX idx_chunks_project_doc (project_id, document_id),
    -- Prefix index: only the first 255 characters of heading are indexed.
    INDEX idx_chunks_heading (heading(255)),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE,
    FOREIGN KEY (markdown_id) REFERENCES document_markdowns(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Standalone background tasks: pdf2md file processing and element-agent
-- element extraction.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS tasks (
    id VARCHAR(64) PRIMARY KEY,
    project VARCHAR(64) NOT NULL,
    task_type INT NOT NULL,
    file_id VARCHAR(64) NULL,
    file_path VARCHAR(1024) NULL,
    status INT NOT NULL DEFAULT 1,
    payload_json JSON NULL,
    result_path VARCHAR(1024) NULL,
    error_message LONGTEXT NULL,
    add_time DATETIME NOT NULL,
    finish_time DATETIME NULL,
    INDEX idx_tasks_status_type_time (status, task_type, add_time),
    INDEX idx_tasks_project (project),
    INDEX idx_tasks_file_id (file_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Template management.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS report_templates (
    id VARCHAR(64) PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    description TEXT NULL,
    is_default TINYINT(1) DEFAULT 0,
    is_active TINYINT(1) DEFAULT 1,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_templates_default (is_default)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Sections belonging to a report template; rows are removed with their template.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS report_template_sections (
    id VARCHAR(64) PRIMARY KEY,
    template_id VARCHAR(64) NOT NULL,
    section_key VARCHAR(64) NOT NULL,
    section_title VARCHAR(255) NOT NULL,
    section_prompt LONGTEXT NULL,
    section_output_contract LONGTEXT NULL,
    section_order INT DEFAULT 0,
    examples LONGTEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_template_sections_template (template_id),
    FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Report generation jobs (7 chapters, generated asynchronously per chapter).
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS report_generation_jobs (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    template_id VARCHAR(64) NULL,
    status VARCHAR(16) DEFAULT 'pending',
    progress INT DEFAULT 0,
    current_section_key VARCHAR(64) NULL,
    error_message TEXT NULL,
    requested_by VARCHAR(64) NULL,
    options JSON NULL,
    snapshot JSON NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    completed_at DATETIME NULL,
    INDEX idx_report_jobs_project (project_id),
    INDEX idx_report_jobs_status (status),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Per-chapter output of a report generation job; rows are removed with their job.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS report_generation_chapters (
    id VARCHAR(64) PRIMARY KEY,
    job_id VARCHAR(64) NOT NULL,
    section_key VARCHAR(64) NOT NULL,
    section_title VARCHAR(255) NOT NULL,
    section_order INT DEFAULT 0,
    status VARCHAR(16) DEFAULT 'pending',
    content LONGTEXT NULL,
    prompt_text LONGTEXT NULL,
    evidence_payload JSON NULL,
    validation_payload JSON NULL,
    error_message TEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    completed_at DATETIME NULL,
    INDEX idx_report_chapters_job (job_id),
    INDEX idx_report_chapters_status (status),
    FOREIGN KEY (job_id) REFERENCES report_generation_jobs(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Minimal RBAC.
-- Departments form a tree through parent_id; deleting a parent detaches its
-- children (ON DELETE SET NULL) rather than deleting them.
CREATE TABLE IF NOT EXISTS departments (
    id VARCHAR(64) PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    description TEXT NULL,
    parent_id VARCHAR(64) NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (parent_id) REFERENCES departments(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- User accounts; department_id is cleared when the department is deleted.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS users (
    id VARCHAR(64) PRIMARY KEY,
    username VARCHAR(64) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NULL,
    department_id VARCHAR(64) NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_users_department (department_id),
    FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Roles; name is unique.
CREATE TABLE IF NOT EXISTS roles (
    id VARCHAR(64) PRIMARY KEY,
    name VARCHAR(64) NOT NULL UNIQUE,
    description TEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Permissions; perm_key is unique.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS permissions (
    id VARCHAR(64) PRIMARY KEY,
    perm_key VARCHAR(128) NOT NULL UNIQUE,
    perm_type VARCHAR(32) NOT NULL,
    description TEXT NULL,
    created_at DATETIME NOT NULL,
    INDEX idx_permissions_type (perm_type)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Role <-> permission join table.
-- The unique key prevents granting the same permission to a role twice,
-- matching the uniqueness rule project_departments applies to its pair.
CREATE TABLE IF NOT EXISTS role_permissions (
    id VARCHAR(64) PRIMARY KEY,
    role_id VARCHAR(64) NOT NULL,
    permission_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE,
    FOREIGN KEY (permission_id) REFERENCES permissions(id) ON DELETE CASCADE,
    UNIQUE KEY uq_role_permission (role_id, permission_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- User <-> role join table.
-- The unique key prevents assigning the same role to a user twice,
-- matching the uniqueness rule project_departments applies to its pair.
CREATE TABLE IF NOT EXISTS user_roles (
    id VARCHAR(64) PRIMARY KEY,
    user_id VARCHAR(64) NOT NULL,
    role_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE,
    UNIQUE KEY uq_user_role (user_id, role_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Project membership: which user belongs to which project, and in what role.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
-- NOTE(review): unlike project_departments there is no unique key on
-- (project_id, user_id); confirm whether several rows per user are intended.
CREATE TABLE IF NOT EXISTS project_members (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    user_id VARCHAR(64) NOT NULL,
    role VARCHAR(32) DEFAULT 'editor',
    created_at DATETIME NOT NULL,
    INDEX idx_project_members_project (project_id),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Project <-> department join table; each pair may appear only once.
-- The index is declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
-- NOTE(review): idx_project_departments_project covers the same leading column
-- as uq_project_department; it is kept so the schema matches existing databases.
CREATE TABLE IF NOT EXISTS project_departments (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    department_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE CASCADE,
    UNIQUE KEY uq_project_department (project_id, department_id),
    INDEX idx_project_departments_project (project_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- Fill records: every element back-fill leaves a trace so the evidence can be
-- traced later.
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
CREATE TABLE IF NOT EXISTS fill_records (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    cell_id VARCHAR(64) NULL,
    table_id VARCHAR(64) NULL,
    row_key VARCHAR(255) NOT NULL,
    col_key VARCHAR(255) NULL,
    year INT NULL,
    filled_value LONGTEXT NULL,
    previous_value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_document_name VARCHAR(255) NULL COMMENT '冗余存储文档名,文档删除后仍可追溯',
    source_line_no INT NULL,
    source_line_end INT NULL,
    source_quote TEXT NULL COMMENT '原文摘录片段,作为回填依据',
    confidence FLOAT NULL,
    extraction_batch_id VARCHAR(64) NULL,
    extraction_model VARCHAR(128) NULL COMMENT '使用的 LLM 模型标识',
    fill_type VARCHAR(16) NOT NULL DEFAULT 'auto' COMMENT 'auto=抽取回填, manual=人工编辑, resolve=冲突解决',
    created_at DATETIME NOT NULL,
    INDEX idx_fill_records_project (project_id),
    INDEX idx_fill_records_cell (cell_id),
    INDEX idx_fill_records_batch (extraction_batch_id),
    INDEX idx_fill_records_source_doc (source_document_id),
    INDEX idx_fill_records_created (created_at),
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
-- ============================================================
-- report_section_references: reference sample text per report section
-- ============================================================
-- Indexes are declared inline: a separate CREATE INDEX fails with a duplicate
-- key name when this script is re-run against an existing database.
-- NOTE(review): template_id is described as pointing at report_templates.id
-- but carries no FOREIGN KEY; confirm whether that is intentional.
CREATE TABLE IF NOT EXISTS report_section_references (
    id VARCHAR(64) PRIMARY KEY,
    template_id VARCHAR(64) NULL COMMENT '关联模板ID(report_templates.id),按模板过滤参考范文',
    source_file VARCHAR(255) NOT NULL COMMENT '来源文件名',
    section_key VARCHAR(64) NOT NULL COMMENT '章节标识,如 1.1、2.1.1',
    section_title VARCHAR(255) NOT NULL COMMENT '章节标题',
    section_order INT DEFAULT 0 COMMENT '章节序号',
    content TEXT NOT NULL COMMENT '该章节的参考范文 Markdown',
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    INDEX idx_ref_source_file (source_file),
    INDEX idx_ref_section_key (section_key),
    INDEX idx_ref_template_id (template_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
3
database/migrations/add_ref_template_id.sql
Normal file
3
database/migrations/add_ref_template_id.sql
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
-- Add template_id to report_section_references so reference samples can be
-- filtered by template.
-- database/init.sql already creates this column and its index, so a plain
-- ALTER TABLE / CREATE INDEX fails ("Duplicate column name" / "Duplicate key
-- name") on a freshly initialised or already migrated database. Each step is
-- therefore guarded by an information_schema lookup and run through a
-- prepared statement; 'DO 0' is the no-op branch.
SET @ref_has_template_id := (
    SELECT COUNT(*)
    FROM information_schema.COLUMNS
    WHERE TABLE_SCHEMA = DATABASE()
      AND TABLE_NAME = 'report_section_references'
      AND COLUMN_NAME = 'template_id'
);
SET @ref_ddl := IF(
    @ref_has_template_id = 0,
    'ALTER TABLE report_section_references ADD COLUMN template_id VARCHAR(64) NULL COMMENT ''关联模板ID(report_templates.id),按模板过滤参考范文''',
    'DO 0'
);
PREPARE ref_stmt FROM @ref_ddl;
EXECUTE ref_stmt;
DEALLOCATE PREPARE ref_stmt;

SET @ref_has_template_idx := (
    SELECT COUNT(*)
    FROM information_schema.STATISTICS
    WHERE TABLE_SCHEMA = DATABASE()
      AND TABLE_NAME = 'report_section_references'
      AND INDEX_NAME = 'idx_ref_template_id'
);
SET @ref_ddl := IF(
    @ref_has_template_idx = 0,
    'CREATE INDEX idx_ref_template_id ON report_section_references(template_id)',
    'DO 0'
);
PREPARE ref_stmt FROM @ref_ddl;
EXECUTE ref_stmt;
DEALLOCATE PREPARE ref_stmt;
|
||||||
1
function/__init__.py
Normal file
1
function/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
# function 包
|
||||||
550
function/vector_store.py
Normal file
550
function/vector_store.py
Normal file
@ -0,0 +1,550 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
function/vector_store.py
|
||||||
|
向量库模块 - 与 kb_service 项目集成
|
||||||
|
已修改:drop_old 全部 = False,不会删除已有集合
|
||||||
|
✅ 已修复 413 超长 token 问题(语义友好版)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||||
|
from langchain_openai import OpenAIEmbeddings
|
||||||
|
from langchain_milvus import Milvus, BM25BuiltInFunction
|
||||||
|
from pymilvus import MilvusClient, connections
|
||||||
|
|
||||||
|
from config import settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ============================================================================
# Configuration
# ============================================================================
# Default collection used by VectorStore when no name is given.
COLLECTION_NAME = "eval_report"
# Connection settings are read once, at import time, from the project settings.
EMBEDDING_API_BASE = settings.EMBEDDING_API_BASE
EMBEDDING_API_KEY = settings.EMBEDDING_API_KEY
MILVUS_DB_URL = settings.MILVUS_DB_URL

# Options passed to the Milvus constructor in VectorStore._get_milvus.
CONSISTENCY_LEVEL = "Bounded"
AUTO_ID = True
# First index_params entry (presumably the "dense" vector field - confirm).
METRIC_TYPE = "COSINE"
INDEX_TYPE = "AUTOINDEX"
# Second index_params entry (presumably the "sparse" BM25 field - confirm).
SPARSE_METRIC_TYPE = "BM25"
SPARSE_INDEX_TYPE = "SPARSE_INVERTED_INDEX"
|
||||||
|
|
||||||
|
|
||||||
|
def _embedding_batch_limits() -> tuple[int, int, int]:
    """Return the embedding batch limits read from ``settings``.

    Returns:
        ``(max_docs, max_chars, max_chunk)``:

        * ``max_docs`` - ``EMBEDDING_BATCH_MAX_DOCS`` (default 4, at least 1)
        * ``max_chars`` - ``EMBEDDING_BATCH_MAX_CHARS`` (default 12000, at least 512)
        * ``max_chunk`` - ``EMBEDDING_MAX_CHUNK_CHARS`` (default 4000, at least 512)
    """

    def _limit(name: str, default: int, floor: int) -> int:
        # ``or default`` also covers a setting that exists but is None, 0 or "".
        return max(floor, int(getattr(settings, name, default) or default))

    return (
        _limit("EMBEDDING_BATCH_MAX_DOCS", 4, 1),
        _limit("EMBEDDING_BATCH_MAX_CHARS", 12000, 512),
        _limit("EMBEDDING_MAX_CHUNK_CHARS", 4000, 512),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _is_embedding_backend_oom(exc: BaseException) -> bool:
|
||||||
|
msg = str(exc).lower()
|
||||||
|
return (
|
||||||
|
"out of memory" in msg
|
||||||
|
or "npu out of memory" in msg
|
||||||
|
or "cuda out of memory" in msg
|
||||||
|
or "error code: 424" in msg
|
||||||
|
or "'code': 424" in msg
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_documents_batch_with_retry(vs: Milvus, batch: List[Document]) -> List[str]:
    """Write one batch of documents, bisecting the batch on embedding OOM.

    Args:
        vs: Vector store whose ``add_documents`` performs the write.
        batch: Documents to write; an empty batch is a no-op.

    Returns:
        The ids returned by ``vs.add_documents``, in batch order.

    Raises:
        Exception: Whatever ``vs.add_documents`` raised, when the error is not
            recognised as an embedding-backend OOM or when a single-document
            batch still fails.
    """
    if not batch:
        return []
    try:
        return list(vs.add_documents(batch))
    except Exception as e:
        # Splitting only helps for OOM, and a one-document batch cannot shrink.
        if not _is_embedding_backend_oom(e) or len(batch) <= 1:
            raise
        # len(batch) >= 2 here, so both halves are non-empty.
        mid = len(batch) // 2
        logger.warning(
            "embedding 批次 OOM,拆分为 %s + %s 重试",
            mid,
            len(batch) - mid,
        )
        ids: List[str] = []
        ids.extend(_add_documents_batch_with_retry(vs, batch[:mid]))
        ids.extend(_add_documents_batch_with_retry(vs, batch[mid:]))
        return ids
|
||||||
|
|
||||||
|
|
||||||
|
def _register_milvus_client_for_orm(client: MilvusClient) -> None:
    """Register *client*'s connection under its alias in the pymilvus ORM registry.

    pymilvus 2.6+ ``MilvusClient`` uses ConnectionManager, while the ORM
    ``Collection`` still resolves ``pymilvus.orm.connections`` by
    ``client._using``. langchain-milvus touches ``Collection`` during
    ``Milvus.__init__``, so this must run before constructing ``Milvus``
    (bootstrap client).

    Args:
        client: The client whose handler and config are copied into the ORM
            connection registry. Nothing happens if the alias is already
            registered.
    """
    # NOTE(review): relies on private pymilvus attributes (_using, _config,
    # _handler, _alias_handlers, _alias_config); re-check on pymilvus upgrades.
    alias = client._using
    if connections.has_connection(alias):
        return
    cfg = client._config
    connections._alias_handlers[alias] = client._handler
    connections._alias_config[alias] = {
        "address": cfg.address,
        "user": "",
        "db_name": cfg.db_name or "default",
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# VectorStore 类(已全部改为 drop_old=False)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class VectorStore:
|
||||||
|
def __init__(
    self,
    collection_name: str = COLLECTION_NAME,
    drop_old: bool = False,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
) -> None:
    """Store the configuration; no connection is opened here.

    Args:
        collection_name: Milvus collection to use.
        drop_old: Stored as ``_drop_old``.
        chunk_size: Stored as ``chunk_size``.
        chunk_overlap: Stored as ``chunk_overlap``.
    """
    self.collection_name = collection_name
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self._drop_old = drop_old
    # Created lazily by _get_milvus().
    self._milvus: Optional[Milvus] = None
|
||||||
|
|
||||||
|
def _get_embeddings(self) -> OpenAIEmbeddings:
    """Build the embedding client for the ``bge-m3`` model.

    Returns:
        An ``OpenAIEmbeddings`` instance pointed at ``EMBEDDING_API_BASE``
        and authenticated with ``EMBEDDING_API_KEY``.
    """
    return OpenAIEmbeddings(
        base_url=EMBEDDING_API_BASE,
        api_key=EMBEDDING_API_KEY,
        model="bge-m3",
        # NOTE(review): presumably disabled because the backend is not an
        # OpenAI model, so OpenAI token-length handling does not apply - confirm.
        check_embedding_ctx_length=False,
    )
|
||||||
|
|
||||||
|
def _get_milvus(self, drop_old: bool = False) -> Milvus:
    """Return the hybrid (dense + sparse) Milvus store, creating it on first use.

    Args:
        drop_old: When true, skip the cached instance and build a new one.
            The collection itself is never dropped: ``drop_old=False`` is
            always passed to ``Milvus``.

    Returns:
        The cached or newly created ``Milvus`` instance.

    Raises:
        ValueError: If ``MILVUS_DB_URL`` is not configured.
        Exception: Any error raised while constructing the store; it is
            logged with its traceback and re-raised.
    """
    if self._milvus is not None and not drop_old:
        logger.info("【VectorStore】复用已有 Milvus 实例")
        return self._milvus

    # Logged only when an instance is actually built, not on every cache hit.
    logger.info("【VectorStore】初始化 Milvus 混合向量存储(dense + sparse)")

    if not MILVUS_DB_URL:
        raise ValueError("MILVUS_DB_URL 未配置,请在 .env 中设置")

    embeddings = self._get_embeddings()
    logger.info("【VectorStore】Embedding 模型 bge-m3 初始化完成")

    try:
        # Shares the ConnectionManager with langchain's internal MilvusClient.
        # The ORM alias must be registered first, otherwise accessing
        # Collection inside Milvus.__init__ raises.
        _register_milvus_client_for_orm(MilvusClient(uri=MILVUS_DB_URL))
        self._milvus = Milvus(
            embedding_function=embeddings,
            builtin_function=BM25BuiltInFunction(),
            vector_field=["dense", "sparse"],
            connection_args={"uri": MILVUS_DB_URL},
            collection_name=self.collection_name,
            consistency_level=CONSISTENCY_LEVEL,
            auto_id=AUTO_ID,
            drop_old=False,
            index_params=[
                {"metric_type": METRIC_TYPE, "index_type": INDEX_TYPE},
                {"metric_type": SPARSE_METRIC_TYPE, "index_type": SPARSE_INDEX_TYPE},
            ],
        )
        _register_milvus_client_for_orm(self._milvus.client)
        logger.info("✅ Milvus 混合向量存储初始化成功")
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("❌ Milvus 初始化失败: %s", e)
        raise

    return self._milvus
|
||||||
|
|
||||||
|
# ========================================================================
|
||||||
|
# ✅ 修复版 add_documents:语义友好,不破坏段落,不触发413
|
||||||
|
# ========================================================================
|
||||||
|
def add_documents(self, documents: List[Document]) -> List[str]:
    """Split over-long documents, back-fill metadata and write to Milvus in batches.

    Args:
        documents: Documents to store. Documents that are not split are
            appended as-is and their ``metadata`` is updated in place.

    Returns:
        The IDs returned by the batches that were written successfully.
        A failing batch is logged and skipped, so the list can be shorter
        than the number of documents.
    """
    if not documents:
        logger.info("【add_documents】无文档可写入")
        return []

    max_docs_per_batch, max_chars_per_batch, max_chunk_chars = _embedding_batch_limits()

    # ---- Safe splitting: only touch content that is really too long ----
    # Only documents longer than max_chunk_chars are split, and the splitter
    # prefers paragraph/sentence separators.
    safe_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_chars,
        chunk_overlap=min(200, max(0, max_chunk_chars // 20)),
        separators=["\n\n", "\n", "。", "!", "?", ";", ":", ","]
    )

    safe_documents = []
    for doc in documents:
        # Split only when the limit is exceeded.
        if len(doc.page_content) > max_chunk_chars:
            chunks = safe_splitter.split_text(doc.page_content)
            for chunk in chunks:
                if chunk.strip():
                    # Each piece gets its own shallow copy of the metadata.
                    safe_documents.append(Document(
                        page_content=chunk,
                        metadata=doc.metadata.copy()
                    ))
        else:
            safe_documents.append(doc)
    # ---------------------------------------------------------------------

    # The existing Milvus collection requires some metadata fields; not every
    # historical caller supplies them, so fill in fallbacks here.
    for idx, doc in enumerate(safe_documents):
        metadata = doc.metadata or {}
        if not metadata.get("doc_id"):
            project_uuid = metadata.get("project_uuid") or "unknown_project"
            heading = metadata.get("heading") or "chunk"
            # Synthetic id built from project, heading and position in this call.
            metadata["doc_id"] = f"{project_uuid}:{heading}:{idx}"
        if "original_title" not in metadata:
            metadata["original_title"] = metadata.get("heading") or ""
        if "path" not in metadata:
            metadata["path"] = ""
        if "project_uuid" not in metadata:
            metadata["project_uuid"] = "unknown_project"
        doc.metadata = metadata

    logger.info(f"【add_documents】预处理后准备写入 {len(safe_documents)} 条文档")
    vs = self._get_milvus(drop_old=self._drop_old)
    # The drop flag is consumed by the first write only.
    self._drop_old = False

    ids = []
    current_batch: List[Document] = []
    current_batch_chars = 0
    batch_num = 1

    def _flush_batch() -> None:
        # Write the pending batch (if any) and reset the accumulators.
        nonlocal current_batch, current_batch_chars, batch_num
        if not current_batch:
            return
        logger.info(
            "【add_documents】写入批次 %s,数量:%s,约 %s 字符",
            batch_num,
            len(current_batch),
            current_batch_chars,
        )
        try:
            res = _add_documents_batch_with_retry(vs, current_batch)
            ids.extend(res)
            logger.info("✅ 批次写入成功,返回 ID 数:%s", len(res))
        except Exception as e:
            # Best effort: a failed batch is logged and dropped, later batches
            # still run. NOTE(review): callers cannot tell from the return
            # value alone which documents were lost.
            logger.error("❌ 批次写入失败: %s", e, exc_info=True)
        batch_num += 1
        current_batch = []
        current_batch_chars = 0

    for doc in safe_documents:
        doc_chars = len(doc.page_content or "")
        # Both limits are only checked for a non-empty batch, so a single
        # document larger than max_chars_per_batch still forms its own batch.
        would_exceed_docs = bool(current_batch) and len(current_batch) >= max_docs_per_batch
        would_exceed_chars = bool(current_batch) and (
            current_batch_chars + doc_chars > max_chars_per_batch
        )
        if would_exceed_docs or would_exceed_chars:
            _flush_batch()
        current_batch.append(doc)
        current_batch_chars += doc_chars

    # Write whatever is left after the loop.
    _flush_batch()

    logger.info(f"【add_documents】全部完成,总写入 ID 数:{len(ids)}")
    return ids
|
||||||
|
|
||||||
|
def similarity_search_with_score(
    self, query: str, k: int = 10, filter: Optional[str] = None
) -> List[Tuple[Document, float]]:
    """Run a scored similarity search against the Milvus store.

    Args:
        query: Search text; only its first 5000 characters are used.
        k: Number of results requested.
        filter: Optional filter expression. It is forwarded to the store only
            when truthy; otherwise the keyword is omitted entirely.

    Returns:
        Whatever the underlying store's ``similarity_search_with_score`` returns.
    """
    store = self._get_milvus(drop_old=False)
    clipped = query[:5000]
    optional_kwargs = {"filter": filter} if filter else {}
    return store.similarity_search_with_score(clipped, k=k, **optional_kwargs)
|
||||||
|
|
||||||
|
def similarity_search_dense_filtered(
    self,
    query: str,
    k: int,
    filter_expr: str,
) -> List[Tuple[Document, float]]:
    """
    Dense-vector ANN search combined with a Milvus scalar filter.

    On a hybrid (dense + sparse) collection the langchain_milvus ``filter``
    may not take effect, so extraction-side recall uses this path to keep
    results isolated by ``doc_id``.

    Args:
        query: Search text; truncated to 5000 characters. A blank query
            yields an empty list without contacting Milvus.
        k: Result limit; values below 1 are raised to 1.
        filter_expr: Expression passed to the client as ``filter``.

    Returns:
        ``(Document, score)`` pairs. The score is the hit's ``distance`` as a
        float, or ``0.0`` when it is missing or not convertible.
    """
    from pymilvus import MilvusClient

    text = (query or "")[:5000]
    if not text.strip():
        return []

    meta_keys = (
        "heading",
        "heading_level",
        "doc_id",
        "project_uuid",
        "original_title",
        "path",
    )

    vector = self._get_embeddings().embed_query(text)
    client = MilvusClient(uri=MILVUS_DB_URL)
    try:
        raw = client.search(
            collection_name=self.collection_name,
            data=[vector],
            anns_field="dense",
            limit=max(1, int(k)),
            filter=filter_expr,
            output_fields=["text", *meta_keys],
        )
    finally:
        client.close()

    # A single query vector was sent, so only the first result list matters.
    hits = raw[0] if raw else []

    results: List[Tuple[Document, float]] = []
    for hit in hits:
        entity = hit.get("entity") or {}
        document = Document(
            page_content=str(entity.get("text") or ""),
            metadata={key: entity.get(key) for key in meta_keys},
        )

        distance = hit.get("distance")
        if distance is None:
            score = 0.0
        else:
            try:
                score = float(distance)
            except (TypeError, ValueError):
                score = 0.0

        results.append((document, score))
    return results
|
||||||
|
|
||||||
|
def delete_by_filter(self, filter_expr: str) -> int:
    """Delete every entity matching ``filter_expr`` from the collection.

    Args:
        filter_expr: Filter expression handed to ``MilvusClient.query`` and
            ``MilvusClient.delete``.

    Returns:
        The number of entities the pre-delete count query returned. ``0`` is
        returned when the collection does not exist, when the count could not
        be determined, or when the deletion failed (the failure is logged).

    The client connection is always closed, including on the early return and
    on errors.
    """
    client = None
    try:
        from pymilvus import MilvusClient

        client = MilvusClient(uri=MILVUS_DB_URL)
        if not client.has_collection(self.collection_name):
            return 0

        # The primary key is not always called "id" (langchain-milvus may use
        # a custom PK / auto_id), so look it up in the collection description.
        pk_field = None
        describe = client.describe_collection(self.collection_name)
        for f in describe.get("fields", []) or []:
            # Different response shapes use is_primary / isPrimary / primary.
            if f.get("is_primary") or f.get("isPrimary") or f.get("primary"):
                pk_field = f.get("name")
                break

        # Counting is best effort: without a PK name, or if the query fails,
        # the count stays 0 and the deletion still proceeds.
        count = 0
        if pk_field:
            try:
                res = client.query(
                    self.collection_name,
                    filter=filter_expr,
                    output_fields=[pk_field],
                )
                count = len(res)
            except Exception:
                count = 0

        client.delete(self.collection_name, filter=filter_expr)
        return count
    except Exception as e:
        logger.error(f"删除失败: {e}", exc_info=True)
        return 0
    finally:
        if client is not None:
            try:
                client.close()
            except Exception as close_err:
                # A failing close must not mask the result of the deletion.
                logger.warning("Milvus client close failed: %s", close_err)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Markdown 拆分
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def split_markdown(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    """Split ``text`` into chunks with a recursive character splitter.

    Args:
        text: Text to split; a falsy value yields an empty list.
        chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter.
    """
    if not text:
        return []

    # Separators are tried in this order by the splitter.
    boundaries = ["\n\n", "。", "?", "!", "\n", ";", ":", ","]
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=boundaries,
    ).split_text(text)
|
||||||
|
|
||||||
|
def split_markdown_by_headings(content: str, chunk_size=300, chunk_overlap=40) -> List[Document]:
    """Split Markdown into one document per heading-delimited section.

    Each ``#``..``######`` heading line starts a new section. The text under
    it (trailing whitespace removed per line, whole body stripped) becomes a
    ``Document`` whose metadata holds ``heading`` and ``heading_level``.
    Sections with an empty body are not emitted. Text before the first
    heading is emitted with heading ``""`` and level ``0``.

    If no section was produced at all, the content is split with
    ``split_markdown`` instead and each chunk additionally carries
    ``chunk_index``.

    Args:
        content: Markdown source; a falsy value yields an empty list.
        chunk_size: Used only by the fallback split.
        chunk_overlap: Used only by the fallback split.

    Returns:
        The list of section (or fallback chunk) documents.
    """
    if not content:
        return []

    heading_pattern = r"^(#{1,6})\s+(.+)$"
    sections = []
    heading_text = ""
    heading_depth = 0
    body = []

    def _emit_section(title, depth, body_lines):
        # Append a document for the pending section unless its body is blank.
        joined = "\n".join(body_lines).strip()
        if joined:
            sections.append(Document(
                page_content=joined,
                metadata={"heading": title, "heading_level": depth}
            ))

    for raw_line in content.split("\n"):
        stripped = raw_line.rstrip()
        found = re.match(heading_pattern, stripped)
        if not found:
            body.append(stripped)
            continue
        # A heading closes the previous section and opens a new one.
        _emit_section(heading_text, heading_depth, body)
        body = []
        heading_depth = len(found.group(1))
        heading_text = found.group(2).strip()
    _emit_section(heading_text, heading_depth, body)

    if sections:
        return sections

    # Nothing was emitted (e.g. headings only): fall back to plain chunking.
    for position, piece in enumerate(split_markdown(content, chunk_size, chunk_overlap)):
        sections.append(
            Document(
                page_content=piece,
                metadata={"chunk_index": position, "heading": "", "heading_level": 0},
            )
        )
    return sections
|
||||||
|
|
||||||
|
def process_document_to_vector_store(
    doc_id: str, title: str, content: str, path: str, project_uuid: str, collection_name=COLLECTION_NAME
) -> bool:
    """Split a Markdown document by headings and store the sections.

    Every section is tagged with ``doc_id``, ``original_title``, ``path`` and
    ``project_uuid`` before being handed to ``VectorStore.add_documents``.

    Args:
        doc_id: Identifier stored on every section.
        title: Stored as ``original_title``.
        content: Markdown text to split.
        path: Stored as ``path``.
        project_uuid: Stored as ``project_uuid``.
        collection_name: Target collection.

    Returns:
        ``True`` when no exception was raised, ``False`` otherwise (the error
        is logged).
    """
    shared_metadata = {
        "doc_id": doc_id,
        "original_title": title,
        "path": path,
        "project_uuid": project_uuid,
    }
    try:
        store = VectorStore(collection_name=collection_name, drop_old=False)
        sections = split_markdown_by_headings(content)
        for section in sections:
            section.metadata.update(shared_metadata)
        store.add_documents(sections)
    except Exception as e:
        logger.error(f"处理文档失败: {e}")
        return False
    return True
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# 数据预处理
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Source JSONL read by process_all_documents when the module runs as a script.
INPUT_FILE = "data/articles.jsonl"

# JSONL of chunk records written by process_all_documents and read back by
# load_chunk_jsonl in the script entry point.
OUTPUT_CHUNK_FILE = "data/processed/eval_chunks.jsonl"
|
||||||
|
|
||||||
|
def load_jsonl(filename: str, encoding="utf-8"):
    """Lazily yield one parsed JSON value per non-blank line of a file.

    Args:
        filename: Path of the JSONL file.
        encoding: Text encoding used to open the file.

    Yields:
        The object decoded from each line that is not whitespace-only.
    """
    with open(filename, encoding=encoding) as handle:
        yield from (json.loads(raw) for raw in handle if raw.strip())
|
||||||
|
|
||||||
|
def write_jsonl(data, filename, append=False, ensure_ascii=False):
    """Write each item of ``data`` as one JSON line to ``filename``.

    Args:
        data: Iterable of JSON-serialisable items.
        filename: Destination path, opened as UTF-8.
        append: Append to the file instead of truncating it.
        ensure_ascii: Forwarded to ``json.dumps``.
    """
    with open(filename, "a" if append else "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=ensure_ascii) + "\n" for record in data
        )
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
    """Normalise raw article text before chunking.

    Steps, in order: drop control characters (newline is kept), drop
    zero-width / line-separator characters, drop a set of mojibake artefact
    characters, turn HTML-like tags into newlines, collapse runs of newlines
    and of spaces, drop one leading punctuation mark, remove every character
    outside the whitelist, and strip the result.

    The whitelist keeps CJK ideographs, ASCII letters/digits/underscore,
    whitespace and the listed punctuation. Previously the unescaped ``]`` in
    ``=[]{}`` ended the character class early, so this filter matched almost
    nothing; the brackets are now escaped and the filter is effective.
    Full-width forms of the listed punctuation and curly quotes are
    whitelisted too, so ordinary Chinese punctuation is not stripped now that
    the filter works.

    Args:
        text: Raw text. Any non-string value yields ``""``.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        return ""

    # Negated character class: anything NOT listed here is removed.
    disallowed = (
        r"[^\u4e00-\u9fff_a-zA-Z0-9\s"
        r",。!?;:、()《》【】·"
        # Full-width , ! ? ; : ( ) and curly double/single quotes.
        r"\uff0c\uff01\uff1f\uff1b\uff1a\uff08\uff09\u201c\u201d\u2018\u2019"
        r"!@#$%^&*()_+=\[\]{}|;:'\",./<>?-]"
    )

    text = re.sub(r"[\x00-\x09\x0B-\x1F\x7F]", "", text)
    text = re.sub(r"[\u200b-\u200f\u2028\u2029]", "", text)
    text = re.sub(r"[:’“â€â€¢â€¦â€“—]", "", text)
    text = re.sub(r"<[^>]+>", "\n", text)
    text = re.sub(r"\n+", "\n", text)
    text = re.sub(r" +", " ", text)
    text = re.sub(r"^[。,?!;:]", "", text)
    text = re.sub(disallowed, "", text)
    return text.strip()
|
||||||
|
|
||||||
|
def concat_metadata_to_content(title: str, content: str, metadata: dict):
    """Prefix chunk content with a one-line header built from its metadata.

    The header has the form ``标题:… | 发布时间:… | 作者:… | 来源:…`` followed
    by a ``---`` separator line and the stripped content. Fields whose value
    is ``None`` or blank are left out of the header.

    The previous implementation tried to drop empty fields by splitting on an
    ASCII ``:`` although the labels use a full-width ``:``, and a missing
    value was rendered as the text ``None``, so nothing was ever dropped.

    Args:
        title: Document title.
        content: Chunk text; ``None`` is treated as empty.
        metadata: Mapping read for ``publish_time``, ``author`` and
            ``source``; ``None`` is treated as empty.

    Returns:
        Header, separator and content joined into one string.
    """
    metadata = metadata or {}
    labelled_values = (
        ("标题", title),
        ("发布时间", metadata.get("publish_time")),
        ("作者", metadata.get("author")),
        ("来源", metadata.get("source")),
    )
    parts = [
        f"{label}:{value}"
        for label, value in labelled_values
        if value is not None and str(value).strip()
    ]
    return " | ".join(parts) + "\n---\n" + (content or "").strip()
|
||||||
|
|
||||||
|
def process_all_documents(input_file, output_file, chunk_size=500, overlap=50):
    """Clean, chunk and export every document of a JSONL file.

    Each input record must provide ``id``, ``title`` and ``content``;
    ``metadata`` is optional. The content is cleaned, split into chunks, each
    chunk is cleaned again, and chunks shorter than 10 characters are
    dropped. The surviving chunks are written to ``output_file`` as JSONL.

    Fixes the chunk loop, which unpacked each chunk string as ``(i, chunk)``
    instead of enumerating the chunk list.

    Args:
        input_file: Source JSONL path.
        output_file: Destination JSONL path (overwritten).
        chunk_size: Splitter chunk size.
        overlap: Splitter chunk overlap.

    Returns:
        ``{"num_docs": <documents read>, "num_chunks": <chunks written>}``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "。", "?", "!", "\n", ";", ":", ","],
    )
    all_chunks = []
    num_docs = 0
    for doc in load_jsonl(input_file):
        num_docs += 1
        meta = doc.get("metadata") or {}
        content = clean_text(doc["content"])
        # The index counts every chunk of the document, including the short
        # ones that are skipped, so ids stay tied to the split position.
        for i, chunk in enumerate(splitter.split_text(content)):
            clean_c = clean_text(chunk)
            if len(clean_c) < 10:
                continue
            all_chunks.append({
                "id": f"{doc['id']}_chunk_{i}",
                "doc_id": doc["id"],
                "title": doc["title"],
                "content": concat_metadata_to_content(doc["title"], clean_c, meta),
                "chunk_index": i,
                "url": meta.get("url", ""),
            })
    write_jsonl(all_chunks, output_file)
    return {"num_docs": num_docs, "num_chunks": len(all_chunks)}
|
||||||
|
|
||||||
|
def load_chunk_jsonl(path):
    """Read a UTF-8 JSONL file fully into a list.

    Args:
        path: Path of the JSONL file.

    Returns:
        The decoded value of every line that is not whitespace-only, in file
        order.
    """
    with open(path, encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]
|
||||||
|
|
||||||
|
def build_index(data, vs: VectorStore):
    """Turn chunk rows into documents and add them to the vector store.

    For every row the ``content`` entry is removed from the row (the row is
    mutated) and stripped. Rows whose content is shorter than 10 characters
    are skipped; the remaining keys of the row become the document metadata.
    ``vs.add_documents`` is called once, and only if at least one document
    was built.

    Args:
        data: Iterable of dict rows.
        vs: Store receiving the documents.
    """
    pending: List[Document] = []
    for record in data:
        body = record.pop("content", "").strip()
        if len(body) >= 10:
            pending.append(Document(page_content=body, metadata=record))
    if not pending:
        return
    vs.add_documents(pending)
|
||||||
|
|
||||||
|
def get_vector_store(drop_old=False):
    """Build a ``VectorStore`` for ``COLLECTION_NAME`` and return its Milvus object.

    Args:
        drop_old: Passed both to the ``VectorStore`` constructor and to
            ``_get_milvus``.

    Returns:
        The value returned by ``VectorStore._get_milvus``.
    """
    return VectorStore(
        collection_name=COLLECTION_NAME, drop_old=drop_old
    )._get_milvus(drop_old=drop_old)
|
||||||
|
|
||||||
|
def search_eval(query, top_k=10):
    """Run one scored search on a fresh ``VectorStore`` and print the elapsed time.

    Args:
        query: Search text.
        top_k: Number of results requested.

    Returns:
        The result of ``VectorStore.similarity_search_with_score``.
    """
    from time import time

    store = VectorStore(drop_old=False)
    started = time()
    hits = store.similarity_search_with_score(query, k=top_k)
    print(f"检索耗时: {time()-started:.2f}s")
    return hits
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# 运行入口
|
||||||
|
# ============================================================================
|
||||||
|
# Script entry point: chunk the input file, index the chunks, then run one
# sample search as a smoke check.
if __name__ == "__main__":
    logger.info("="*60)
    logger.info("【Milvus 混合向量索引构建启动】dense + sparse(BM25)")
    logger.info("="*60)

    # Step 1: clean and chunk INPUT_FILE into OUTPUT_CHUNK_FILE.
    process_all_documents(INPUT_FILE, OUTPUT_CHUNK_FILE)
    logger.info("✅ 文本分块处理完成")

    # Step 2: read the chunk records back from disk.
    chunk_data = load_chunk_jsonl(OUTPUT_CHUNK_FILE)
    logger.info(f"✅ 加载分块数据:{len(chunk_data)} 条")

    # Step 3: add the chunks to the store (drop_old=False, i.e. incremental).
    vs = VectorStore(drop_old=False)
    build_index(chunk_data, vs)
    logger.info("✅ 索引构建完成(增量写入)")

    # Step 4: sample query; log every hit with its score.
    res = search_eval("测试检索内容")
    logger.info(f"✅ 检索完成,命中数量:{len(res)}")
    for doc, score in res:
        logger.info(f"score={score:.4f} | content={doc.page_content[:80]}...")

    logger.info("="*60)
    logger.info("【全部执行完成】")
|
||||||
@ -0,0 +1 @@
|
|||||||
|
# prompts 包
|
||||||
@ -0,0 +1 @@
|
|||||||
|
# report_generation prompts 包
|
||||||
52
prompts/report_generation/appendix_templates.py
Normal file
52
prompts/report_generation/appendix_templates.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
"""Fixed markdown templates used by report generation."""
|
||||||
|
|
||||||
|
|
||||||
|
def markdown_hashes_for_section_no(section_no: str) -> str:
    """Map a dotted section number to its Markdown heading prefix.

    One segment (also an empty or falsy value) gives ``##``, two segments
    give ``###`` and three or more give ``####``. Stated by the original
    author to mirror the frontend ``markdownHashesForSectionNo`` /
    ``_heading_level_and_class``.

    Args:
        section_no: Section number such as ``"1"``, ``"1.2"`` or ``"1.2.3"``.

    Returns:
        The heading prefix made of ``#`` characters.
    """
    normalized = str(section_no or "").strip()
    # Number of dot-separated segments == number of dots + 1.
    depth = normalized.count(".") + 1
    return {1: "##", 2: "###"}.get(depth, "####")
|
||||||
|
|
||||||
|
|
||||||
|
def missing_child_heading_markdown(heading_no: str) -> str:
    """Build the placeholder Markdown for a missing child heading.

    Args:
        heading_no: Section number of the missing heading; it also decides
            the heading depth via ``markdown_hashes_for_section_no``.

    Returns:
        Two newlines, the heading line ``<hashes> <heading_no> 待补充``, a
        blank line and the body ``待补充``.
    """
    prefix = markdown_hashes_for_section_no(heading_no)
    return "\n\n{} {} 待补充\n\n待补充".format(prefix, heading_no)
|
||||||
|
|
||||||
|
|
||||||
|
# 兼容旧引用;新代码请用 missing_child_heading_markdown(heading_no)
|
||||||
|
MISSING_CHILD_HEADING_TEMPLATE = "\n\n### {heading_no} 待补充\n\n待补充"
|
||||||
|
|
||||||
|
MINIMAL_MISSING_TABLE_TEMPLATE = (
|
||||||
|
"\n\n### {table_name}\n\n"
|
||||||
|
"| 项目 | 内容 |\n"
|
||||||
|
"| --- | --- |\n"
|
||||||
|
"| 关键数据 | 待补充 |\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
APPENDIX8_PARAMETER_COMPARISON_TABLE = (
|
||||||
|
"| 序号 | 项目名称 | 单位 | 可研报告 | 后评价报告 | 备注 |\n"
|
||||||
|
"| --- | --- | --- | --- | --- | --- |\n"
|
||||||
|
"| 一 | 成本参数 | | | | |\n"
|
||||||
|
"| 1 | 原料价格 | | | | |\n"
|
||||||
|
"| 1.1 | 氢气 | 元/吨 | 待补充 | 待补充 | |\n"
|
||||||
|
"| 2 | 催化剂和化学药剂 | 万元 | 待补充 | 待补充 | |\n"
|
||||||
|
"| 3 | 燃料动力价格 | | | | |\n"
|
||||||
|
"| 3.1 | 除盐水价格 | 元/吨 | 待补充 | 待补充 | |\n"
|
||||||
|
"| …… | …… | | | | |\n"
|
||||||
|
"| 二 | 营业收入参数 | | | | |\n"
|
||||||
|
"| 2.1 | 98#汽油 | 元/吨 | 待补充 | 待补充 | |\n"
|
||||||
|
"| …… | …… | | | | |\n"
|
||||||
|
"| 三 | 税收参数 | | | | |\n"
|
||||||
|
"| | 增值税税率 | | | | |\n"
|
||||||
|
"| | 汽油各品种产品 | % | 待补充 | 待补充 | |\n"
|
||||||
|
"| …… | …… | | | | |\n"
|
||||||
|
"| 四 | 基准收益率 | % | 待补充 | 待补充 | |"
|
||||||
|
)
|
||||||
|
|
||||||
|
APPENDIX_FIGURE_TARGETS: list[tuple[str, str]] = [
|
||||||
|
("附图1", "全厂物料平衡图"),
|
||||||
|
("附图2", "烷基化装置物料平衡图"),
|
||||||
|
]
|
||||||
1
prompts/report_generation/chapter_generation_system.md
Normal file
1
prompts/report_generation/chapter_generation_system.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
你是后评价报告撰写助手。严格基于证据输出,禁止编造。示例仅可用于写作风格参考,禁止复用示例中的任何事实数据与结论。禁止输出与当前小节无关的表号/表题清单及跨节“详见表/参见表”引用。必须返回 JSON 对象,字段为 content/missingInfo/qualityChecks。
|
||||||
67
prompts/report_generation/chapter_generation_user.md
Normal file
67
prompts/report_generation/chapter_generation_user.md
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
你正在编写后评价报告章节:{{section_title}}
|
||||||
|
|
||||||
|
【章节细则描述】
|
||||||
|
{{section_prompt}}
|
||||||
|
|
||||||
|
【章节模板】
|
||||||
|
{{section_title}}
|
||||||
|
|
||||||
|
【模板必需表格】
|
||||||
|
{{required_tables_text}}
|
||||||
|
|
||||||
|
【结构化表格证据(必须优先采用)】
|
||||||
|
{{structured_tables_text}}
|
||||||
|
|
||||||
|
【字段级已抽取结果(强约束)】
|
||||||
|
{{canonical_fields_text}}
|
||||||
|
|
||||||
|
【章节示例】
|
||||||
|
{{selected_example}}
|
||||||
|
|
||||||
|
【参考范文】
|
||||||
|
{{section_reference_block}}
|
||||||
|
|
||||||
|
【示例使用约束】
|
||||||
|
1. 以《模版.doc》同章节结构为第一优先:段落顺序、表格标题、表头字段尽量保持一致;
|
||||||
|
2. 参考范文仅用于格式与结构参考,严禁复用示例中的项目名称、年份、金额、比例、指标值与结论;
|
||||||
|
3. 所有数值必须来自证据包;如需表格,表头可沿用模板,表内数据必须替换为当前项目证据;
|
||||||
|
4. 若模板字段无证据,按字段粒度写"待补充",不得整段空泛描述。
|
||||||
|
|
||||||
|
【输出硬约束】
|
||||||
|
1. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题;
|
||||||
|
2. 若【结构化表格证据】中存在对应必需表,必须原样使用该 Markdown 表格,不得自行生成或改写表头/数值;
|
||||||
|
3. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充";
|
||||||
|
4. 若【字段级已抽取结果】中某字段为非"待补充"值,正文该字段不得写"待补充",必须使用该抽取值;
|
||||||
|
5. content 字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目;
|
||||||
|
6. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表。
|
||||||
|
{{heading_rule}}7. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题。
|
||||||
|
|
||||||
|
【表格严格管控——必须遵守】
|
||||||
|
1. **禁止凭空生成表格**:只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出 Markdown 表格;
|
||||||
|
2. **无"表格强制要求"的章节一律禁止输出任何 Markdown 表格**(即不得输出含 | 分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入;
|
||||||
|
3. **"见附表N"仅为引用语**:若【章节输出结构约束】要求写"项目建设工作程序见附表1。"等引用句,只需输出该引用句文本,附表本体在报告末尾统一输出,严禁在本节正文中展开附表的完整 Markdown 表格;
|
||||||
|
4. 表格数据必须严格来自要素管理(element_tables/element_cells),不得自行编造表格内容;
|
||||||
|
5. 每个 Markdown 表格前须有独立一行表题(形如「表1 …」「表2-3 …」「附表8 …」等);表题紧挨表格上方单独成段,表题与表格之间最多空一行或一行注释;前端会将表题居中排版。
|
||||||
|
6. **表号与表名间距**:表题中表号(如「表2-4」「附表8」)与表名之间须空两个全角空格(U+3000),例如「表2-4 原料数量及组成对比表」。
|
||||||
|
7. **表头栏单位**:凡含计量单位的列名,名称写第一行、单位加括号写在第二行,且在同一表头单元格内(Markdown 可用 `<br>`,如 `新鲜水<br>(m³/h)`);表题与表头均勿使用 `**` 加粗;勿将单位单独占一列,勿把「名称(单位)」横挤在同一行。
|
||||||
|
8. **公共单位写表题**:若整张表各数据列所用单位相同,单位应加括号写在表题行末尾(如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复该单位;若各列单位不一致,则仍按列在表头内分行写单位。
|
||||||
|
9. **表格序号列**:用阿拉伯数字,层次与正文一致(如 1、1.1、1.2、2、2.1);行键或表体第一列已带层次编号时可与之对齐;否则自上而下用 1、2、3…;「合计」「总计」行可用「—」。
|
||||||
|
10. **表体与数字**:表内文字、数字宜水平与垂直居中;若单元格内需换行或分段(含 `<br>`),宜左齐排列以便阅读。同一表内、同列的小数、百分比等宜保留相同的小数位数。
|
||||||
|
|
||||||
|
【检索顺序约束】
|
||||||
|
1. 优先使用要素抽取结果;
|
||||||
|
2. 要素不足时补充文档段落;
|
||||||
|
3. 最后使用关键词检索到的补充材料;
|
||||||
|
4. 无证据时写"待补充",禁止编造。
|
||||||
|
|
||||||
|
{{prior_sibling_sections_block}}
|
||||||
|
|
||||||
|
{{prior_chapters_block}}
|
||||||
|
|
||||||
|
【章节输出结构约束】
|
||||||
|
{{section_contract}}
|
||||||
|
|
||||||
|
【证据包(JSON)】
|
||||||
|
{{evidence_json}}
|
||||||
|
|
||||||
|
请仅返回 JSON:{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]}
|
||||||
@ -0,0 +1,88 @@
|
|||||||
|
你正在编写后评价报告章节:{{section_title}}
|
||||||
|
|
||||||
|
本次任务:以【章节细则描述】和【参考范文】共同作为本节的写作模板,以【事实证据】作为唯一数据来源。核心原则是:**细则与范文决定写什么、怎么写;证据只负责提供可填入模板的真实数据**。生成时必须先搭模板,再填证据,严禁脱离模板自由发挥,严禁复用范文数据或自行改写证据数据。
|
||||||
|
|
||||||
|
========================= 第一部分 · 写作模板(最高优先级:决定内容范围、结构和文风)=========================
|
||||||
|
|
||||||
|
【标题编号规则】
|
||||||
|
{{heading_rule}}
|
||||||
|
|
||||||
|
【章节细则描述】
|
||||||
|
{{section_prompt}}
|
||||||
|
|
||||||
|
【参考范文(内容范围、论述维度、段落结构和行文风格的主要模板)】
|
||||||
|
{{section_reference_block}}
|
||||||
|
|
||||||
|
========================= 第二部分 · 事实证据(唯一数据来源,仅用于支撑和填充模板)=========================
|
||||||
|
|
||||||
|
【模板必需表格】
|
||||||
|
{{required_tables_text}}
|
||||||
|
|
||||||
|
【结构化表格证据(必须优先采用)】
|
||||||
|
{{structured_tables_text}}
|
||||||
|
|
||||||
|
【字段级已抽取结果(强约束)】
|
||||||
|
{{canonical_fields_text}}
|
||||||
|
|
||||||
|
【证据包(JSON)】
|
||||||
|
{{evidence_json}}
|
||||||
|
|
||||||
|
========================= 第三部分 · 上文已生成内容(只用于一致性校验,不改变本节模板)=========================
|
||||||
|
|
||||||
|
{{prior_sibling_sections_block}}
|
||||||
|
|
||||||
|
{{prior_chapters_block}}
|
||||||
|
|
||||||
|
========================= 第四部分 · 写作与输出要求(务必逐条遵守)=========================
|
||||||
|
|
||||||
|
【生成步骤】
|
||||||
|
1. 先读取【章节细则描述】和【参考范文】,抽取本节应覆盖的内容主题、论述维度、段落顺序、子标题层级、表格/列举形式和结论方式;
|
||||||
|
2. 再读取【章节输出结构约束】,确认本节是否允许/必须输出表格、附表引用或特定结构;
|
||||||
|
3. 然后只从【事实证据】中选择可支撑上述模板的数据,把证据数据填入对应位置;
|
||||||
|
4. 最后输出正文。若模板要求的某项内容在证据中没有对应数据,写"待补充",不得跳过、猜测、编造或用范文数据顶替。
|
||||||
|
|
||||||
|
【模板遵循要求——细则与范文共同决定“写什么”和“怎么写”】
|
||||||
|
1. "写什么"由【章节细则描述】与【参考范文】共同决定:细则列出的要点、子项及顺序为必写项;参考范文实际写到的内容主题、论述维度和信息点(如背景、依据、目标、措施、问题、结论等)也应覆盖。二者取并集,不得遗漏,也不得另起炉灶写无关内容;
|
||||||
|
2. "怎么写"以【参考范文】为主要模板:段落数量、段落顺序、每段主题、论述推进、句式结构、专业术语、连接词、语气口吻、详略程度和结论表达都应高度贴合范文;
|
||||||
|
3. 若【章节细则描述】与【参考范文】存在差异,优先保证细则要求完整覆盖,再用范文的结构和笔法组织表达;若二者均未要求,正文不要主动扩展。
|
||||||
|
|
||||||
|
【证据使用要求——数据必须来自证据且保持原值】
|
||||||
|
1. 所有项目名称、时间、金额、数量、比例、指标值、单位、结论依据等事实性内容,只能来自第二部分事实证据;
|
||||||
|
2. 数据必须原值引用,严禁自行修改、估算、换算单位、四舍五入、增减、归纳为新数值或编造。证据是多少就写多少;证据未给出的数据写"待补充";
|
||||||
|
3. 若【字段级已抽取结果】中某字段为非"待补充"值,正文必须原样使用该抽取值,不得写"待补充",也不得改动、换算或重新表述其数值;
|
||||||
|
4. 内容来源优先级:结构化表格证据 / 字段级已抽取结果 > 证据包(JSON)中的章节文档 > 关键词检索补充材料;
|
||||||
|
5. 禁止复用【参考范文】或【章节示例】中的任何项目名称、年份、金额、指标值、比例、结论等事实数据。
|
||||||
|
|
||||||
|
【参考范文贴合要求——高度相似但严禁照抄】
|
||||||
|
1. 逐段对照:范文有几段就尽量写几段,每段主题、先后顺序、论述角度与起承转合须与范文对应;
|
||||||
|
2. 句式与笔法对齐:尽量沿用范文的段首引导方式、常用表达、收束方式和专业语气,使本节读起来与范文出自同一类报告;
|
||||||
|
3. 篇幅与颗粒度对齐:每段篇幅、信息密度和展开程度与范文相当,不得明显更短、更空泛,也不得无端扩写;
|
||||||
|
4. 形式对齐:范文采用分条、分项、描述性子标题或表格呈现的,本节也尽量采用同类形式,但必须满足【章节输出结构约束】和下方表格规则;
|
||||||
|
5. 禁止逐字照抄:不得出现与范文连续相同超过15字的句子或成段文字;应在保持结构和笔法相似的前提下,用本项目证据重新表述。
|
||||||
|
|
||||||
|
【输出硬约束】
|
||||||
|
1. content字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目;
|
||||||
|
2. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题;
|
||||||
|
3. 若【结构化表格证据】中存在对应必需表,必须原样使用该Markdown表格,不得自行生成或改写表头/数值;
|
||||||
|
4. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充";
|
||||||
|
5. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表;
|
||||||
|
6. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题;
|
||||||
|
7. 数字与汉字之间不留空格:阿拉伯数字、百分比、金额、年份等与相邻汉字之间不得插入半角或全角空格,例如写"投资1.2亿元""2023年12月""产能达95%",不得写"投资 1.2 亿元""2023 年 12 月""产能达 95 %";数字与计量单位之间也不留空格,如"30万吨"而非"30 万吨";
|
||||||
|
8. 子标题形式约束:正文段落允许使用描述性小标题,但只能采用"一、""(一)""1."或加粗短语单独成行等中文公文层级形式;严禁使用Markdown标题语法(`#`、`##`、`###`等)充当子标题。表格上方的表题不属于子标题;
|
||||||
|
9. 计量单位须规范:面积写"m²"不得写"m2",体积写"m³"不得写"m3",流量写"m³/h"不得写"m3/h";温度写"℃",千分号写"‰",科学计数可写"×10⁴"。正文与表格中的单位均须规范。
|
||||||
|
|
||||||
|
【表格严格管控】
|
||||||
|
1. 只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出Markdown表格;
|
||||||
|
2. 无"表格强制要求"的章节一律禁止输出任何Markdown表格(不得输出含`|`分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入;
|
||||||
|
3. "见附表N"仅为引用语:若结构约束要求写"项目建设工作程序见附表1。"等引用句,只输出引用句文本,附表本体在报告末尾统一输出,严禁在本节展开完整Markdown表格;
|
||||||
|
4. 表格数据必须严格来自要素管理(element_tables/element_cells)或结构化表格证据,不得自行编造、换算或改写表格内容;
|
||||||
|
5. 每个Markdown表格前须有独立一行表题(如「表1 ××表」「表2-3 ××表」「附表8 ××表」),表题紧挨表格上方单独成段;
|
||||||
|
6. 表号与表名之间须空两个全角空格(U+3000),例如「表2-4 原料数量及组成对比表」;
|
||||||
|
7. 含计量单位的表头,名称写第一行、单位加括号写第二行,且在同一表头单元格内(Markdown可用`<br>`,如`新鲜水<br>(m³/h)`);勿将单位单独占一列;
|
||||||
|
8. 若整张表各数据列所用单位相同,单位写在表题行末尾,表头栏内不再重复;若各列单位不一致,则按列在表头内分行写单位;
|
||||||
|
9. 表格序号列用阿拉伯数字,层次与正文一致;"合计""总计"行可用"—";
|
||||||
|
10. 同一表内、同列的小数、百分比等宜保留相同的小数位数,但不得因此改动证据原值。
|
||||||
|
|
||||||
|
【输出格式】
|
||||||
|
请仅返回JSON:{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]}
|
||||||
|
你正在编写后评价报告章节:{{section_title}}
|
||||||
14
prompts/report_generation/heading_rules.py
Normal file
14
prompts/report_generation/heading_rules.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
"""Heading rule prompt variables for report generation."""
|
||||||
|
|
||||||
|
# Heading rule inserted into prompts through the {{heading_rule}} placeholder
# when a section has no entry in SECTION_HEADING_RULES (presumably; the lookup
# code is not in this module, so confirm against the caller).
# NOTE(review): the text is hard-numbered "5." and ends with a newline. In
# chapter_generation_user.md the placeholder sits directly before item "7." of
# a list that already contains items 5 and 6, so the rendered numbering
# repeats "5." there. Confirm the intended number.
DEFAULT_HEADING_RULE = (
    "5. 各章节内部小标题须使用规范层级格式(如“### 1.2.1 …”);"
    "若在同一节内使用并列条目,必须统一写作“1)… 2)… 3)…”,"
    "禁止使用“一、二、三、”“(一)(二)(三)”或“1.”“1.2.”“3.1”等序号形式;\n"
)

# Per-section overrides keyed by section number. Section 1.2 must keep the
# plain-text numbering from the section contract and must not use Markdown
# heading syntax.
SECTION_HEADING_RULES: dict[str, str] = {
    "1.2": (
        "5. 本节(1.2)必须严格遵循【章节输出结构约束】给定的纯文本编号体结构;"
        "不得使用“###”等 Markdown 小标题语法;不得将“1.2.1/1.2.2”改写为“1)/2)”。\n"
    ),
}
|
||||||
4
prompts/report_generation/prompt_defaults.py
Normal file
4
prompts/report_generation/prompt_defaults.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
"""Fallback prompt fragments for report generation."""
|
||||||
|
|
||||||
|
# Fallback text for the section prompt ("write this section following the
# post-evaluation guideline") — presumably used when a template section has
# no prompt of its own; verify against the caller.
DEFAULT_SECTION_PROMPT_FALLBACK = "按后评价细则规范撰写本章节。"

# Fallback text for the selected example ("no example, follow the standard")
# — presumably used when no example is available; verify against the caller.
DEFAULT_SELECTED_EXAMPLE_FALLBACK = "无示例,按规范输出。"
|
||||||
@ -0,0 +1 @@
|
|||||||
|
你是后评价报告撰写助手。任务是对既有章节做最小修改补齐缺表,禁止删除事实性内容,禁止编造。返回 JSON:{"content":"..."}
|
||||||
19
prompts/report_generation/repair_missing_tables_user.md
Normal file
19
prompts/report_generation/repair_missing_tables_user.md
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
你正在修订章节:{{section_title}}
|
||||||
|
|
||||||
|
目标:在不删除原有有效内容的前提下,补齐缺失表格。
|
||||||
|
必须出现的表标识:{{missing_tables}}
|
||||||
|
|
||||||
|
要求:
|
||||||
|
1) 每个缺失表都必须在正文中出现,并使用 Markdown 表格;
|
||||||
|
2) 若证据不足,单元格可写“待补充”;
|
||||||
|
3) 表标题必须包含对应表标识(如“表2-1”);
|
||||||
|
4) 仅输出修订后的完整章节 Markdown。
|
||||||
|
|
||||||
|
【原章节内容】
|
||||||
|
{{content}}
|
||||||
|
|
||||||
|
【原始章节提示词】
|
||||||
|
{{original_prompt}}
|
||||||
|
|
||||||
|
【证据包(JSON)】
|
||||||
|
{{evidence_json}}
|
||||||
1
prompts/report_generation/table_format_repair_system.md
Normal file
1
prompts/report_generation/table_format_repair_system.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
你是后评价报告格式修订助手。仅做格式对齐修订:章节标题、表名、表头。禁止新增未证据支持的数据。返回 JSON:{"content":"..."}
|
||||||
25
prompts/report_generation/table_format_repair_user.md
Normal file
25
prompts/report_generation/table_format_repair_user.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
你正在修订章节:{{section_title}}
|
||||||
|
|
||||||
|
目标:对齐模板格式,不改变事实结论。
|
||||||
|
请仅修订“章节标题、表名、表头”,正文事实描述尽量保持原样。
|
||||||
|
|
||||||
|
【模板表规格(JSON)】
|
||||||
|
{{table_specs_json}}
|
||||||
|
|
||||||
|
【当前章节】
|
||||||
|
{{content}}
|
||||||
|
|
||||||
|
【证据包(JSON)】
|
||||||
|
{{evidence_json}}
|
||||||
|
|
||||||
|
修订规则:
|
||||||
|
1) 章节首行必须为标准章节标题;
|
||||||
|
2) 表名必须与模板表规格中的 token/title 对齐;表题中表号与表名之间须空两个全角空格(如「表2-4 原料数量及组成对比表」);
|
||||||
|
3) 表头字段优先与模板一致,表内数据来自证据包,无值写待补充;
|
||||||
|
4) 必须使用 Markdown 表格;
|
||||||
|
5) 表头栏排版:指标名称与计量单位分两行写在同一表头单元格内;单位须加括号并写在名称正下方(Markdown 可用 `<br>`,如 `新鲜水<br>(m³/h)`);表题与表头均勿使用 `**` 加粗;勿将单位单独拆成一列表头列,勿把「名称(单位)」横挤在同一行;
|
||||||
|
6) 若整张表各数据列所用单位相同,应将单位加括号写在表题末尾(如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复写该单位;
|
||||||
|
7) 表格「序号」列:优先使用各行行键(row_key)首部已有的阿拉伯数字层次编号(与正文 1、1.1、1.2、2、2.1 一致);若行键未带此类编号,则用自上而下连续阿拉伯数字 1、2、3…;「合计」「总计」行序号可用「—」;
|
||||||
|
8) 表体单元格内容宜居中;若有换行或分段,宜左齐。同列数值宜统一小数位数;
|
||||||
|
9) 禁止编造事实数据;
|
||||||
|
10) 仅返回修订后的完整章节 Markdown(不要返回 JSON)。
|
||||||
204
routers/report.py
Normal file
204
routers/report.py
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
"""
|
||||||
|
routers/report.py
|
||||||
|
后评价报告「核心生成」路由(独立抽取版)。
|
||||||
|
|
||||||
|
从 eval_report 的 routers/write.py 摘取报告生成相关端点,去除鉴权依赖,
|
||||||
|
项目查询改用轻量的 services/project_service.get_project。
|
||||||
|
业务逻辑在 services/report_generation_service.py。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Header, HTTPException
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from database import SessionLocal, get_db
|
||||||
|
from database.models import ReportTemplate, ReportTemplateSection
|
||||||
|
from schemas.write import (
|
||||||
|
GenerateReportJobCreate,
|
||||||
|
GenerateReportJobItem,
|
||||||
|
GenerateReportResult,
|
||||||
|
)
|
||||||
|
from services.project_service import get_project
|
||||||
|
from services.report_generation_service import (
|
||||||
|
create_report_job,
|
||||||
|
get_report_job,
|
||||||
|
get_report_result,
|
||||||
|
get_report_stream_snapshot,
|
||||||
|
retry_report_chapter,
|
||||||
|
cancel_report_job,
|
||||||
|
)
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/write", tags=["后评价报告生成"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/projects/{project_id}/generate-sections", summary="按章节智能体生成提示词清单")
def generate_sections_prompt(
    project_id: str,
    template_id: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """List the per-section generation prompts of a report template.

    The template is the active one with ``template_id`` when given and found;
    otherwise the active default template. Sections are returned ordered by
    ``section_order``.

    Args:
        project_id: Project identifier; passed to ``get_project`` whose
            result is discarded.
        template_id: Optional template to prefer.
        db: Database session injected by FastAPI.

    Returns:
        A dict with ``templateId``, ``templateName`` and ``sections`` (each
        with ``sectionKey``, ``sectionTitle``, ``prompt`` and ``examples``).

    Raises:
        HTTPException: 404 when no usable template exists.
    """
    _ = get_project(project_id, db)

    def _first_active(criterion):
        # First active template that also satisfies the given criterion.
        return (
            db.query(ReportTemplate)
            .filter(criterion, ReportTemplate.is_active == True)  # noqa: E712
            .first()
        )

    chosen = _first_active(ReportTemplate.id == template_id) if template_id else None
    if not chosen:
        chosen = _first_active(ReportTemplate.is_default == True)  # noqa: E712
    if not chosen:
        raise HTTPException(status_code=404, detail="未找到可用模板")

    ordered_sections = (
        db.query(ReportTemplateSection)
        .filter(ReportTemplateSection.template_id == chosen.id)
        .order_by(ReportTemplateSection.section_order.asc())
        .all()
    )

    # Fixed preamble placed in front of every section's own prompt.
    preamble = "请基于2020后评价细则与本项目检索材料,先查要素表,再查文档段落,最后生成本章节内容。\n"

    section_items = []
    for section in ordered_sections:
        section_items.append(
            {
                "sectionKey": section.section_key,
                "sectionTitle": section.section_title,
                "prompt": preamble + (section.section_prompt or ""),
                "examples": section.examples or "",
            }
        )

    return {
        "templateId": chosen.id,
        "templateName": chosen.name,
        "sections": section_items,
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/generate-report-job",
    response_model=GenerateReportJobItem,
    summary="创建分章异步报告生成任务",
)
def create_generate_report_job(
    project_id: str,
    body: GenerateReportJobCreate,
    db: Session = Depends(get_db),
    x_user_id: Optional[str] = Header(default=None, alias="X-User-Id"),
):
    """Create a chapter-by-chapter report generation job.

    Args:
        project_id: Project the job belongs to.
        body: Request payload; ``templateId`` and ``topK`` are forwarded.
        db: Database session injected by FastAPI.
        x_user_id: Value of the optional ``X-User-Id`` header, forwarded as
            ``requested_by``.

    Returns:
        The value returned by ``create_report_job``.
    """
    # Result discarded — presumably called to validate that the project
    # exists; verify against services.project_service.
    _ = get_project(project_id, db)
    return create_report_job(
        project_id,
        db,
        template_id=body.templateId,
        top_k=body.topK,
        requested_by=x_user_id,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/generate-report-job/{job_id}",
    response_model=GenerateReportJobItem,
    summary="查询分章异步报告任务进度",
)
def get_generate_report_job(
    project_id: str,
    job_id: str,
    db: Session = Depends(get_db),
):
    """Return the progress record of a report generation job.

    Args:
        project_id: Project the job belongs to.
        job_id: Job identifier.
        db: Database session injected by FastAPI.

    Returns:
        The value returned by ``get_report_job``.
    """
    return get_report_job(project_id, job_id, db)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/generate-report-job/{job_id}/result",
    response_model=GenerateReportResult,
    summary="获取分章异步报告任务结果",
)
def get_generate_report_result(
    project_id: str,
    job_id: str,
    include_debug: bool = False,
    db: Session = Depends(get_db),
):
    """Return the result of a report generation job.

    Args:
        project_id: Project identifier taken from the URL path.
        job_id: Job identifier taken from the URL path.
        include_debug: Query flag forwarded unchanged to ``get_report_result``.
        db: Request-scoped database session.

    Returns:
        The result object produced by ``get_report_result``.
    """
    report_result = get_report_result(
        project_id,
        job_id,
        db,
        include_debug=include_debug,
    )
    return report_result
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/generate-report-job/{job_id}/events",
    summary="订阅分章异步报告任务实时事件(SSE)",
)
async def stream_generate_report_job_events(
    project_id: str,
    job_id: str,
    include_debug: bool = False,
):
    """Stream live snapshots of a report generation job as Server-Sent Events.

    Events emitted:
        * ``snapshot``  - whenever the serialised snapshot differs from the
          previously sent one.
        * ``keepalive`` - after 20 consecutive unchanged polls (polls are
          0.25 s apart).
        * ``end``       - once the job status is ``completed``, ``failed`` or
          ``cancelled``; the stream then stops.

    The database helpers used here are synchronous. They are executed in the
    event loop's default thread pool so that a slow query does not stall every
    other request served by the same loop.

    Args:
        project_id: Project identifier taken from the URL path.
        job_id: Job identifier taken from the URL path.
        include_debug: Forwarded to the snapshot/result helpers.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``.
    """
    loop = asyncio.get_running_loop()

    def _validate_job() -> None:
        # Validate, then release the connection immediately; the SSE loop opens
        # short-lived sessions on demand so long-lived streams do not exhaust
        # the connection pool.
        with SessionLocal() as db:
            get_report_job(project_id, job_id, db)

    def _load_snapshot_from_db() -> dict:
        # Fallback used when no in-memory stream snapshot is available.
        with SessionLocal() as db:
            job = get_report_job(project_id, job_id, db)
            result = get_report_result(project_id, job_id, db, include_debug=include_debug)
            return {
                "job": job.model_dump(),
                "result": result.model_dump(),
            }

    # An exception raised by the validation (e.g. an HTTP error for an unknown
    # job) propagates through the await, before any response has been started.
    await loop.run_in_executor(None, _validate_job)

    async def _event_stream():
        last_payload = ""
        idle_ticks = 0
        while True:
            snapshot = get_report_stream_snapshot(job_id, include_debug=include_debug)
            if not snapshot:
                snapshot = await loop.run_in_executor(None, _load_snapshot_from_db)

            payload = json.dumps(snapshot, ensure_ascii=False, separators=(",", ":"))
            if payload != last_payload:
                last_payload = payload
                idle_ticks = 0
                yield f"event: snapshot\ndata: {payload}\n\n"
            else:
                idle_ticks += 1
                if idle_ticks >= 20:
                    idle_ticks = 0
                    yield "event: keepalive\ndata: ping\n\n"

            status = str(((snapshot.get("job") or {}).get("status") or "")).strip().lower()
            if status in ("completed", "failed", "cancelled"):
                yield f"event: end\ndata: {payload}\n\n"
                break
            await asyncio.sleep(0.25)

    return StreamingResponse(
        _event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            # Ask buffering reverse proxies (nginx) to pass events through
            # immediately.
            "X-Accel-Buffering": "no",
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/generate-report-job/{job_id}/retry-chapter",
    response_model=GenerateReportJobItem,
    summary="重试指定章节",
)
def retry_generate_report_chapter(
    project_id: str,
    job_id: str,
    section_key: str,
    db: Session = Depends(get_db),
):
    """Retry the generation of a single chapter of an existing job.

    Args:
        project_id: Project identifier taken from the URL path.
        job_id: Job identifier taken from the URL path.
        section_key: Key of the chapter to retry. It is not part of the path
            template, so it is supplied outside the path.
        db: Request-scoped database session.

    Returns:
        The job item produced by ``retry_report_chapter``.
    """
    job_item = retry_report_chapter(project_id, job_id, section_key, db)
    return job_item
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/generate-report-job/{job_id}/cancel",
    response_model=GenerateReportJobItem,
    summary="取消报告生成任务",
)
def cancel_generate_report_job(
    project_id: str,
    job_id: str,
    db: Session = Depends(get_db),
):
    """Cancel a report generation job.

    Args:
        project_id: Project identifier taken from the URL path.
        job_id: Job identifier taken from the URL path.
        db: Request-scoped database session.

    Returns:
        The job item produced by ``cancel_report_job``.
    """
    job_item = cancel_report_job(project_id, job_id, db)
    return job_item
|
||||||
179
schemas/write.py
Normal file
179
schemas/write.py
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
"""
|
||||||
|
schemas/write.py
|
||||||
|
后评价报告项目相关的 Pydantic 数据模型。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Any, List, Optional
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- 版本 ----------
|
||||||
|
|
||||||
|
class DocVersion(BaseModel):
    """One saved version of a document."""

    id: str
    version: str
    content: str
    # Save timestamp, carried as a string (format not fixed by this model).
    savedAt: str
    author: str
    note: Optional[str] = ""
    # Free-form citation data attached to this version; None when absent.
    citationPayload: Optional[dict[str, Any]] = None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- 文档 ----------
|
||||||
|
|
||||||
|
class WriteDocument(BaseModel):
    """Full document record, including its body text and saved versions."""

    id: str
    title: str
    content: str
    wordCount: int
    # Timestamps are carried as strings.
    createdAt: str
    updatedAt: str
    projectId: str
    status: str  # draft | review | published
    versions: List[DocVersion] = []
|
||||||
|
|
||||||
|
|
||||||
|
class WriteDocumentSummary(BaseModel):
    """Summary for list pages; carries no ``content`` body."""

    id: str
    title: str
    wordCount: int
    createdAt: str
    updatedAt: str
    projectId: str
    status: str
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- 项目 ----------
|
||||||
|
|
||||||
|
class WriteProject(BaseModel):
    """Full project record, including its documents."""

    id: str
    uuid: str  # unique project identifier, shared with the knowledge base (kb)
    name: str
    description: Optional[str] = ""
    # Timestamps are carried as strings.
    createdAt: str
    updatedAt: str
    docCount: int
    status: str  # active | archived
    kbProjectId: Optional[str] = None
    color: str
    documents: List[WriteDocument] = []
|
||||||
|
|
||||||
|
|
||||||
|
class WriteProjectSummary(BaseModel):
    """Summary for list pages; carries no ``documents``."""

    id: str
    # Unique project identifier used as a URL parameter; optional for
    # compatibility with legacy data.
    uuid: Optional[str] = None
    name: str
    description: Optional[str] = ""
    createdAt: str
    updatedAt: str
    docCount: int
    status: str
    kbProjectId: Optional[str] = None
    color: str
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- 创建 / 更新请求体 ----------
|
||||||
|
|
||||||
|
class WriteProjectCreate(BaseModel):
    """Request body for creating a project; only ``name`` is required."""

    name: str
    description: Optional[str] = ""
    kbProjectId: Optional[str] = None
    # Same default colour as the ``projects.color`` column default.
    color: Optional[str] = "#3b82f6"
|
||||||
|
|
||||||
|
|
||||||
|
class WriteProjectUpdate(BaseModel):
    """Request body for updating a project; every field is optional."""

    # NOTE(review): None presumably means "leave unchanged" - confirm against
    # the update handler.
    name: Optional[str] = None
    description: Optional[str] = None
    status: Optional[str] = None
    kbProjectId: Optional[str] = None
    color: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class WriteDocumentCreate(BaseModel):
    """Request body for creating a document; ``content`` defaults to empty."""

    title: str
    content: Optional[str] = ""
|
||||||
|
|
||||||
|
|
||||||
|
class WriteDocumentUpdate(BaseModel):
    """Request body for updating a document; every field is optional."""

    # NOTE(review): None presumably means "leave unchanged" - confirm against
    # the update handler.
    title: Optional[str] = None
    content: Optional[str] = None
    status: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class DocVersionCreate(BaseModel):
    """Request body for saving a new document version."""

    # Optional version label; how a missing label is filled in is decided by
    # the consumer of this model.
    version: Optional[str] = None
    content: str
    author: str
    note: Optional[str] = ""
    # Free-form citation data to store with the version; None when absent.
    citationPayload: Optional[dict[str, Any]] = None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- 章节审查(智能体) ----------
|
||||||
|
|
||||||
|
|
||||||
|
class ChapterReviewRequest(BaseModel):
    """Request body for the AI chapter review: a chapter selector plus the text to review."""

    chapter: str  # "1"~"6"
    content: str
|
||||||
|
|
||||||
|
|
||||||
|
class ChapterReviewResponse(BaseModel):
    """Response body for the AI chapter review: carries the Markdown review report."""

    success: bool = True
    chapter: str
    # The review report, as Markdown.
    review: str
    model: Optional[str] = None
    message: Optional[str] = ""
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateReportJobCreate(BaseModel):
    """Request body for creating a report generation job."""

    # Template to generate from; None when the client does not pick one.
    templateId: Optional[str] = None
    # Forwarded as ``top_k`` to the job creation service.
    topK: int = 10
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateReportChapterItem(BaseModel):
    """Per-chapter status entry listed inside a report generation job."""

    sectionKey: str
    sectionTitle: str
    sectionOrder: int
    status: str
    updatedAt: Optional[str] = None
    errorMessage: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateReportJobItem(BaseModel):
    """Report generation job with its overall progress and per-chapter status."""

    jobId: str
    projectId: str
    templateId: Optional[str] = None
    status: str
    # NOTE(review): presumably a 0-100 percentage - confirm against the
    # generation service.
    progress: int
    currentSectionKey: Optional[str] = None
    errorMessage: Optional[str] = None
    # Timestamps are carried as strings; None when not set.
    createdAt: Optional[str] = None
    updatedAt: Optional[str] = None
    completedAt: Optional[str] = None
    chapters: List[GenerateReportChapterItem] = []
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateReportResultChapter(BaseModel):
    """One chapter of a report generation result."""

    sectionKey: str
    sectionTitle: str
    sectionOrder: int
    status: str
    content: Optional[str] = None
    errorMessage: Optional[str] = None
    # NOTE(review): the three fields below look like debug details (prompt,
    # evidence, validation) - confirm whether they are only filled when
    # include_debug is requested.
    promptText: Optional[str] = None
    evidencePayload: Optional[dict] = None
    validationPayload: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
|
class GenerateReportResult(BaseModel):
    """Result of a report generation job: report text plus per-chapter results."""

    jobId: str
    status: str
    # Report text; None when not available.
    report: Optional[str] = None
    # List of consistency messages; empty by default.
    consistency: List[str] = []
    chapters: List[GenerateReportResultChapter] = []
|
||||||
199
services/appendix_figure_extraction.py
Normal file
199
services/appendix_figure_extraction.py
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
"""
|
||||||
|
从项目知识库 Word(.docx)中提取「附图1/附图2」嵌入图,用于报告附录。
|
||||||
|
|
||||||
|
细则常见版式:附图标题段落与图在同一节或相邻段落;解析时合并前/当前/后段文字做关键词匹配。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from docx import Document
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
from docx.table import Table
|
||||||
|
from docx.text.paragraph import Paragraph
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 过滤装饰性小图(logo 等)
|
||||||
|
_MIN_FIGURE_BYTES = 6000
|
||||||
|
|
||||||
|
R_EMBED = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
|
||||||
|
_NS = {
|
||||||
|
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
||||||
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _compact(s: str) -> str:
|
||||||
|
return "".join(str(s or "").split())
|
||||||
|
|
||||||
|
|
||||||
|
def _classify_slot(ctx: str) -> Optional[int]:
|
||||||
|
"""
|
||||||
|
返回 1=全厂物料平衡图,2=装置(如烷基化)物料平衡图。
|
||||||
|
"""
|
||||||
|
t = _compact(ctx)
|
||||||
|
if not t:
|
||||||
|
return None
|
||||||
|
# 附图编号(先判 2,避免同段目录同时出现两个编号时误判)
|
||||||
|
if "附图2" in t:
|
||||||
|
return 2
|
||||||
|
if "附图1" in t:
|
||||||
|
return 1
|
||||||
|
if "全厂" in t and "物料平衡" in t:
|
||||||
|
return 1
|
||||||
|
if "烷基化" in t and "物料平衡" in t:
|
||||||
|
return 2
|
||||||
|
if "装置" in t and "物料平衡" in t and "全厂" not in t:
|
||||||
|
return 2
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _content_type_to_md_subtype(content_type: str) -> str:
|
||||||
|
ct = (content_type or "").lower()
|
||||||
|
if "jpeg" in ct or ct.endswith("jpg"):
|
||||||
|
return "jpeg"
|
||||||
|
if "png" in ct:
|
||||||
|
return "png"
|
||||||
|
if "gif" in ct:
|
||||||
|
return "gif"
|
||||||
|
if "emf" in ct:
|
||||||
|
return "x-emf"
|
||||||
|
if "wmf" in ct:
|
||||||
|
return "x-wmf"
|
||||||
|
return "png"
|
||||||
|
|
||||||
|
|
||||||
|
def _blob_to_data_uri(blob: bytes, content_type: str) -> str:
    """Encode image bytes as a ``data:image/<subtype>;base64,...`` URI.

    The subtype is derived from *content_type* by
    ``_content_type_to_md_subtype``.
    """
    subtype = _content_type_to_md_subtype(content_type)
    encoded = base64.b64encode(blob).decode("ascii")
    return "data:image/" + subtype + ";base64," + encoded
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_paragraphs_deep(doc: Document):
    """Yield the document's paragraphs in body order, including those in table cells.

    Only direct children of the body are visited: top-level paragraphs, and the
    paragraphs of each cell of top-level tables. Tables nested inside a cell
    are not descended into.
    """
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")
    for child in doc.element.body:
        if child.tag == paragraph_tag:
            yield Paragraph(child, doc._body)
            continue
        if child.tag != table_tag:
            continue
        for row in Table(child, doc._body).rows:
            for cell in row.cells:
                yield from cell.paragraphs
|
||||||
|
|
||||||
|
|
||||||
|
def extract_appendix_figure_candidates_from_docx(path: Path) -> dict[int, list[tuple[int, bytes, str]]]:
    """Collect candidate appendix figures from a single .docx file.

    Every embedded image of at least ``_MIN_FIGURE_BYTES`` bytes is classified
    by the text of its own paragraph and the paragraphs directly before and
    after it. Images that cannot be classified are kept in document order and
    used to fill a slot that found no classified image.

    Args:
        path: Location of the .docx file.

    Returns:
        ``{1: [...], 2: [...]}`` where each entry is
        ``(size_in_bytes, blob, content_type)``. ``content_type`` comes from
        the OPC part and is later used to build the data URI. Both lists are
        empty when the file cannot be opened.
    """
    candidates: dict[int, list[tuple[int, bytes, str]]] = {1: [], 2: []}
    try:
        doc = Document(str(path))
    except Exception as exc:
        # Best effort: an unreadable document simply contributes no figures.
        logger.warning("appendix figure: open docx failed %s: %s", path, exc)
        return candidates

    paras = list(_iter_paragraphs_deep(doc))
    texts = [p.text or "" for p in paras]
    unclassified: list[tuple[bytes, str]] = []

    for i, p in enumerate(paras):
        images: list[tuple[bytes, str]] = []
        for blip in p._element.findall(".//a:blip", _NS):
            embed = blip.get(R_EMBED)
            if not embed:
                continue
            try:
                rel = p.part.related_parts[embed]
            except KeyError:
                continue
            blob = getattr(rel, "blob", None)
            ct = getattr(rel, "content_type", "") or "image/png"
            # Skip small decorative images (logos and the like).
            if blob and len(blob) >= _MIN_FIGURE_BYTES:
                images.append((blob, ct))

        if not images:
            continue

        prev_t = texts[i - 1] if i > 0 else ""
        next_t = texts[i + 1] if i + 1 < len(texts) else ""
        slot = _classify_slot(f"{prev_t}\n{texts[i]}\n{next_t}")
        if slot is None:
            unclassified.extend(images)
        else:
            candidates[slot].extend((len(blob), blob, ct) for blob, ct in images)

    # De-duplicate by image content rather than by object identity, so the
    # result does not depend on the library handing back the same bytes object
    # and byte-identical images stored as separate parts count once. Images
    # already assigned to a slot are never reused as fallbacks.
    seen: set[bytes] = {blob for entries in candidates.values() for _size, blob, _ct in entries}
    fallbacks: list[tuple[bytes, str]] = []
    for blob, ct in unclassified:
        if blob in seen:
            continue
        seen.add(blob)
        fallbacks.append((blob, ct))

    # Fill empty slots with unclassified images, in document order.
    for slot in (1, 2):
        if not candidates[slot] and fallbacks:
            blob, ct = fallbacks.pop(0)
            candidates[slot].append((len(blob), blob, ct))

    return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def merge_best_appendix_figures(
    per_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]],
) -> dict[int, tuple[bytes, str, str]]:
    """Merge candidates from several documents, keeping the largest image per slot.

    The largest image is preferred because it is more likely to be the main
    flow diagram than a small icon. On equal size the earliest candidate wins.

    Args:
        per_doc: ``(document_name, candidates)`` pairs, where ``candidates``
            maps a slot to ``(size, blob, content_type)`` entries.

    Returns:
        ``slot -> (blob, content_type, source_doc_name)`` for every slot that
        had at least one candidate.
    """
    winners: dict[int, tuple[int, bytes, str, str]] = {}
    for source_name, by_slot in per_doc:
        for slot_no in (1, 2):
            for nbytes, blob, ctype in by_slot.get(slot_no) or []:
                current = winners.get(slot_no)
                if current is not None and nbytes <= current[0]:
                    continue
                winners[slot_no] = (nbytes, blob, ctype, source_name)

    merged: dict[int, tuple[bytes, str, str]] = {}
    for slot_no, (_nbytes, blob, ctype, source_name) in winners.items():
        merged[slot_no] = (blob, ctype, source_name)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def appendix_figure_markdown_images(
    resolved: dict[int, tuple[bytes, str, str]],
    *,
    label_title: list[tuple[str, str]],
) -> dict[int, str]:
    """Render each resolved figure as a Markdown fragment.

    Each fragment consists of a ``###`` heading, the image embedded as a data
    URI, and (when a source name is known) an italic note naming the source
    document.

    Args:
        resolved: ``slot -> (blob, content_type, source_doc_name)``.
        label_title: ``(label, title)`` pairs; the pair at index ``i`` belongs
            to slot ``i + 1``.

    Returns:
        ``slot -> markdown``. Slots without a matching ``label_title`` entry
        are omitted.
    """
    out: dict[int, str] = {}
    slot_to_title = dict(enumerate(label_title, start=1))
    for slot, (blob, ct, src) in resolved.items():
        if slot not in slot_to_title:
            continue
        label, title = slot_to_title[slot]
        uri = _blob_to_data_uri(blob, ct)
        cap = f"{label} {title}"
        src_note = f"\n\n*(嵌入来源:{src})*" if src else ""
        # The data URI was previously computed but never written out, so the
        # fragment contained a heading and no image.
        out[slot] = f"### {cap}\n\n![{cap}]({uri}){src_note}"
    return out
|
||||||
28
services/docx_export_service.py
Normal file
28
services/docx_export_service.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
"""
|
||||||
|
services/docx_export_service.py(瘦身版)
|
||||||
|
|
||||||
|
本独立服务不提供 Word 导出能力;此处仅保留 report_generation_service 在
|
||||||
|
正文小节编号识别时懒加载依赖的 `_is_likely_section_number`,以满足导入。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def _is_likely_section_number(num: str) -> bool:
|
||||||
|
"""报告小节编号(如 2.1.1),非正文能耗数值(如 132.41)。"""
|
||||||
|
s = str(num or "").strip()
|
||||||
|
if not s or not re.fullmatch(r"\d+(?:\.\d+)*", s):
|
||||||
|
return False
|
||||||
|
parts = s.split(".")
|
||||||
|
if len(parts) > 4:
|
||||||
|
return False
|
||||||
|
for part in parts:
|
||||||
|
try:
|
||||||
|
n = int(part)
|
||||||
|
except ValueError:
|
||||||
|
return False
|
||||||
|
if n < 1 or n > 30:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
80
services/kb_service.py
Normal file
80
services/kb_service.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
"""
|
||||||
|
services/kb_service.py(瘦身版)
|
||||||
|
|
||||||
|
仅保留报告生成「附图提取」所需的知识库文档磁盘路径解析助手:
|
||||||
|
从 eval_report 的完整 kb_service.py 中抽取,去除知识库 CRUD / 上传 / worker 等无关逻辑。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from config import settings
|
||||||
|
from database.models import KbDocument as KbDocumentModel
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_rel_path(path: str) -> str:
|
||||||
|
"""将 'a\\b\\c' 规范为 'a/b/c',并去掉前导 '/'。"""
|
||||||
|
s = str(path or "").replace("\\", "/").strip()
|
||||||
|
while s.startswith("./"):
|
||||||
|
s = s[2:]
|
||||||
|
return s.lstrip("/")
|
||||||
|
|
||||||
|
|
||||||
|
def _kb_doc_storage_rel_path(
    file_path_dir: Optional[str],
    basename: str,
    storage_rel_path: Optional[str] = None,
) -> str:
    """Return the storage path (file name included) relative to the project directory.

    ``storage_rel_path`` takes precedence when it is non-empty after
    normalisation. Otherwise the path is ``<file_path_dir>/<basename>``, or
    whichever of the two is non-empty.
    """
    explicit = _normalize_rel_path(str(storage_rel_path or ""))
    if explicit:
        return explicit

    directory = _normalize_rel_path(str(file_path_dir or ""))
    filename = str(basename or "").strip()
    if not (directory and filename):
        return filename or directory
    return f"{directory}/{filename}"
|
||||||
|
|
||||||
|
|
||||||
|
def _kb_doc_path_candidates_for_model(doc_root: Path, doc: KbDocumentModel) -> List[Path]:
    """Return the candidate on-disk paths for *doc*, highest priority first.

    Order: the stored relative path, then ``file_path/name``, then ``name``
    alone, all below ``doc_root/<project_id>``. When none can be built, a single
    ``_missing_`` placeholder path is returned so the list is never empty.
    Paths are resolved, and duplicates are removed keeping the first occurrence.
    """
    project_dir = doc_root / doc.project_id
    rel = _kb_doc_storage_rel_path(
        doc.file_path,
        doc.name,
        getattr(doc, "storage_rel_path", None),
    )
    name = str(doc.name or "").strip()
    fp_dir = _normalize_rel_path(str(doc.file_path or ""))

    unresolved: List[Path] = []
    if rel:
        unresolved.append(project_dir / rel)
    if fp_dir and name:
        unresolved.append(project_dir / fp_dir / name)
    if name:
        unresolved.append(project_dir / name)
    if not unresolved:
        unresolved.append(project_dir / "_missing_")

    # A dict keyed by the path string keeps insertion order and drops repeats.
    unique: dict = {}
    for raw_path in unresolved:
        resolved = raw_path.resolve()
        unique.setdefault(str(resolved), resolved)
    return list(unique.values())
|
||||||
|
|
||||||
|
|
||||||
|
def _kb_doc_absolute_file_path_for_model(doc_root: Path, doc: KbDocumentModel) -> Path:
    """Return the absolute path of *doc* on disk.

    The first candidate that is an existing file is returned. When none
    exists, the highest-priority candidate is returned anyway, so the result
    may point at a missing file.
    """
    # Build the candidate list once; it was previously rebuilt (and every path
    # re-resolved) a second time just to fetch the fallback.
    candidates = _kb_doc_path_candidates_for_model(doc_root, doc)
    for candidate in candidates:
        if candidate.is_file():
            return candidate
    return candidates[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _kb_doc_file_exists_for_model(doc: KbDocumentModel) -> bool:
    """Report whether *doc* exists on disk under any of its candidate paths.

    Several paths are tried so that historical ``file_path``/``name``
    combinations are still found.
    """
    # NOTE(review): the setting is spelled ``DOC_PAT``; confirm this matches
    # the attribute actually defined in ``config.settings`` (``DOC_PATH``?).
    doc_root = Path(settings.DOC_PAT).resolve()
    for candidate in _kb_doc_path_candidates_for_model(doc_root, doc):
        if candidate.is_file():
            return True
    return False
|
||||||
43
services/project_service.py
Normal file
43
services/project_service.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
"""
|
||||||
|
services/project_service.py
|
||||||
|
|
||||||
|
报告生成所需的最小项目查询,替代 eval_report 中重型的 write_service。
|
||||||
|
仅提供按 uuid / 数字 id 查询项目并返回 WriteProject,用于校验项目存在性与取项目名。
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from database.models import Project
|
||||||
|
from schemas.write import WriteProject
|
||||||
|
|
||||||
|
|
||||||
|
def get_project(project_id: str, db: Session) -> WriteProject:
    """Fetch a post-evaluation report project by uuid or numeric id.

    The uuid is tried first. If no row matches and *project_id* parses as an
    integer, the numeric primary key is tried.

    Args:
        project_id: Project uuid, or the numeric id as a string.
        db: Database session.

    Returns:
        The project as a ``WriteProject`` with an empty ``documents`` list.

    Raises:
        HTTPException: 404 when no project matches.
    """
    project = None
    if project_id:
        project = db.query(Project).filter(Project.uuid == project_id).first()

    if not project:
        # Keep the try body to the conversion only, so that errors raised by
        # the query itself are never mistaken for "not a number".
        try:
            numeric_id = int(project_id)
        except (ValueError, TypeError):
            numeric_id = None
        if numeric_id is not None:
            project = db.query(Project).filter(Project.id == numeric_id).first()

    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    def _as_date(value) -> str:
        return value.strftime("%Y-%m-%d") if value else ""

    # doc_count, status and color are nullable columns, while WriteProject
    # requires non-null values. Fall back to the column defaults so that a NULL
    # in a row does not surface as a validation error.
    return WriteProject(
        id=str(project.id),
        uuid=project.uuid,
        name=project.name,
        description=project.description or "",
        createdAt=_as_date(project.created_at),
        updatedAt=_as_date(project.updated_at),
        docCount=project.doc_count or 0,
        status=project.status or "active",
        kbProjectId=None,
        color=project.color or "#3b82f6",
        documents=[],
    )
|
||||||
28
services/prompt_template_service.py
Normal file
28
services/prompt_template_service.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
PROMPT_ROOT = Path(__file__).resolve().parent.parent / "prompts"
|
||||||
|
_TOKEN_RE = re.compile(r"{{\s*([A-Za-z_][A-Za-z0-9_]*)\s*}}")
|
||||||
|
|
||||||
|
|
||||||
|
def load_prompt_template(relative_path: str) -> str:
    """Read a prompt template file located under ``PROMPT_ROOT``.

    Args:
        relative_path: Path of the template relative to ``PROMPT_ROOT``.

    Returns:
        The file content decoded as UTF-8.

    Raises:
        ValueError: If the resolved path lies outside ``PROMPT_ROOT``.
    """
    root = PROMPT_ROOT.resolve()
    target = (PROMPT_ROOT / relative_path).resolve()
    if target.is_relative_to(root):
        return target.read_text(encoding="utf-8")
    raise ValueError(f"Invalid prompt path: {relative_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def render_prompt_template(template: str, **context: Any) -> str:
    """Substitute ``{{ name }}`` placeholders in *template* with values from *context*.

    Placeholders whose name is missing from *context*, or whose value is
    ``None``, are replaced with an empty string. Other values are converted
    with ``str()``.
    """
    def _substitute(token: re.Match[str]) -> str:
        replacement = context.get(token.group(1), "")
        if replacement is None:
            return ""
        return str(replacement)

    return _TOKEN_RE.sub(_substitute, template)
|
||||||
|
|
||||||
|
|
||||||
|
def render_prompt(relative_path: str, **context: Any) -> str:
    """Load the template at *relative_path* and render it with *context*."""
    template_text = load_prompt_template(relative_path)
    return render_prompt_template(template_text, **context)
|
||||||
292
services/reference_service.py
Normal file
292
services/reference_service.py
Normal file
@ -0,0 +1,292 @@
|
|||||||
|
"""
|
||||||
|
services/reference_service.py
|
||||||
|
参考范文加载服务:报告生成时按需加载对应章节参考范文
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from database.models import ReportSectionReference
|
||||||
|
from services.llm_client import chat_completions_json
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
_DESENSITIZE_SYSTEM_PROMPT = """你是一个文档脱敏助手。你的任务是对后评价报告范文进行脱敏处理,只保留报告的结构骨架。
|
||||||
|
|
||||||
|
## 脱敏规则
|
||||||
|
|
||||||
|
### 必须保留的结构
|
||||||
|
1. Markdown 标题层级(## 1.1、## 1.2、### 1.2.1 等)
|
||||||
|
2. 表格的表头行、分隔行(|--|--|)
|
||||||
|
3. 段落/章节的组织顺序和逻辑关系
|
||||||
|
4. 文字的叙述逻辑(先写什么、再写什么)
|
||||||
|
5. 表格的行数、列数、表头字段名(如"序号""项目名称""可研报告""实际值")
|
||||||
|
|
||||||
|
### 必须替换为 xxx 的内容
|
||||||
|
1. 所有具体数字:金额、年份、百分比、数量、面积、产能、投资额等
|
||||||
|
2. 项目名称、公司名称、单位名称等专有名词(书名号/引号内的内容)
|
||||||
|
3. 表格中的数据单元格内容(保留表头)
|
||||||
|
4. 具体的日期、时间节点
|
||||||
|
5. 财务指标的具体数值(IRR、NPV、回收期等)
|
||||||
|
|
||||||
|
### 特别注意
|
||||||
|
- 不要随意增删段落或改变段落顺序
|
||||||
|
- 不要删除整个表格,只替换表格中的数据单元格
|
||||||
|
- 保持原 Markdown 格式不变
|
||||||
|
- "待补充"、"详见附表"等 固定用语 不脱敏
|
||||||
|
- 书名号《》中的内容如果是不知名的规范/标准名称(如《石油化工标准》),保留书名号但内容替换为 xxx"""
|
||||||
|
|
||||||
|
|
||||||
|
_DESENSITIZE_USER_PROMPT_TEMPLATE = """请对以下后评价报告章节进行脱敏处理,只保留结构骨架,所有具体数据替换为 xxx:
|
||||||
|
|
||||||
|
```
|
||||||
|
{content}
|
||||||
|
```
|
||||||
|
|
||||||
|
请严格按照脱敏规则处理,直接输出脱敏后的完整 Markdown 内容,不要输出任何额外说明。"""
|
||||||
|
|
||||||
|
|
||||||
|
def _desensitize_via_llm(content: str) -> str:
    """Ask the LLM to desensitise a reference text.

    The model is asked to keep only the structural skeleton and to replace
    concrete data with ``xxx``. Only the first 12000 characters of *content*
    are sent.

    If the call fails or yields no usable text, the original *content* is
    returned unchanged: serving un-desensitised text is preferred over refusing
    service.

    Args:
        content: Full Markdown text of the reference.

    Returns:
        The desensitised Markdown with any surrounding code fence removed, or
        the original *content* on failure or empty input.
    """
    if not (content and content.strip()):
        return content

    user_prompt = _DESENSITIZE_USER_PROMPT_TEMPLATE.format(content=content[:12000])
    logger.info("参考范文脱敏 start | content_len=%s", len(content))

    try:
        reply = chat_completions_json(
            system_prompt=_DESENSITIZE_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.0,
            max_tokens=16384,
            timeout_sec=120,
        )
        answer = reply.get("content") or ""
        usable = isinstance(answer, str) and bool(answer.strip())
        if usable:
            # Strip a possible ```markdown / ``` wrapper around the answer.
            unfenced = re.sub(r"^```(?:markdown)?\s*", "", answer.strip(), flags=re.IGNORECASE)
            unfenced = re.sub(r"\s*```$", "", unfenced)
            logger.info("参考范文脱敏 done | original_len=%s | desensitized_len=%s", len(content), len(unfenced))
            return unfenced.strip()
    except Exception as e:
        logger.warning("LLM 脱敏失败,退回原文: %s", e)

    return content
|
||||||
|
|
||||||
|
|
||||||
|
def load_section_reference(
    db: Session,
    section_key: str,
    source_file: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the desensitised reference text for a section.

    Args:
        db: Database session.
        section_key: Section identifier (e.g. "1.1", "2.1.1").
        source_file: Optional source file name. When omitted, the most
            recently updated reference for the section is used.
        max_chars: Maximum number of characters; longer text is truncated and
            a truncation notice is appended.

    Returns:
        The reference Markdown, or an empty string when none is found.
    """
    criteria = [ReportSectionReference.section_key == section_key]
    if source_file:
        criteria.append(ReportSectionReference.source_file == source_file)

    latest = (
        db.query(ReportSectionReference)
        .filter(*criteria)
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )

    stored = (latest.content or "").strip() if latest else ""
    if not stored:
        return ""

    text = _desensitize_via_llm(stored)
    if len(text) <= max_chars:
        return text

    logger.info("参考范文 %s 超出 %d 字符限制,已截断", section_key, max_chars)
    return text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
|
||||||
|
|
||||||
|
|
||||||
|
def load_section_reference_by_title(
    db: Session,
    section_title: str,
    source_file: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the desensitised reference text by title.

    This is the fallback for when no exact ``section_key`` lookup is possible.
    A leading section number in the title (e.g. "2.1.1 ...") is tried as an
    exact ``section_key`` first. After that the first 20 characters of the
    title are matched as a substring of the stored section title.

    Args:
        db: Database session.
        section_title: Section title, optionally starting with its number.
        source_file: Optional source file name to restrict the search to.
        max_chars: Maximum number of characters; longer text is truncated and
            a truncation notice is appended.

    Returns:
        The reference Markdown, or an empty string when nothing matches or the
        title is empty.
    """
    title = str(section_title or "").strip()
    if not title:
        # An empty search term would match every stored title and return an
        # unrelated reference.
        return ""

    refs = db.query(ReportSectionReference)
    if source_file:
        refs = refs.filter(ReportSectionReference.source_file == source_file)

    def _finalize(stored: str) -> str:
        text = _desensitize_via_llm(stored)
        if len(text) > max_chars:
            text = text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return text

    # Try an exact section_key match, using the number at the start of the title.
    m = re.match(r"(\d+(?:\.\d+)*)", title)
    if m:
        exact = (
            refs.filter(ReportSectionReference.section_key == m.group(1))
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        exact_text = (exact.content or "").strip() if exact else ""
        if exact_text:
            return _finalize(exact_text)

    # Fuzzy match on the title; autoescape keeps "%" and "_" in the title
    # literal instead of letting them act as LIKE wildcards.
    ref = (
        refs.filter(ReportSectionReference.section_title.contains(title[:20], autoescape=True))
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )
    fuzzy_text = (ref.content or "").strip() if ref else ""
    if not fuzzy_text:
        return ""
    return _finalize(fuzzy_text)
|
||||||
|
|
||||||
|
|
||||||
|
def load_section_reference_raw(
    db: Session,
    section_key: str,
    template_id: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the stored reference text for a section, without LLM desensitisation.

    Unlike ``load_section_reference``, the stored content is returned as is;
    only the length cap is applied.

    Args:
        db: Database session.
        section_key: Section identifier.
        template_id: Id of the selected template. When given, only references
            linked to that template are considered; when empty, no template
            filter is applied and the most recently updated reference is used.
        max_chars: Maximum number of characters; longer text is truncated and
            a truncation notice is appended.

    Returns:
        The reference text, or an empty string when none is found.
    """
    criteria = [ReportSectionReference.section_key == section_key]
    if template_id:
        criteria.append(ReportSectionReference.template_id == template_id)

    latest = (
        db.query(ReportSectionReference)
        .filter(*criteria)
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )

    text = (latest.content or "").strip() if latest else ""
    if not text:
        return ""
    if len(text) <= max_chars:
        return text

    logger.info("参考范文 %s 超出 %d 字符限制,已截断", section_key, max_chars)
    return text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
|
||||||
|
|
||||||
|
|
||||||
|
def load_section_reference_raw_by_title(
    db: Session,
    section_title: str,
    template_id: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the stored reference text by title, without LLM desensitisation.

    This is the fallback for when the ``section_key`` lookup found nothing.
    A leading section number in the title is tried as an exact ``section_key``
    first. After that the first 20 characters of the title are matched as a
    substring of the stored section title.

    Args:
        db: Database session.
        section_title: Section title, optionally starting with its number.
        template_id: Optional template id to restrict the search to.
        max_chars: Maximum number of characters; longer text is truncated and
            a truncation notice is appended.

    Returns:
        The reference text, or an empty string when nothing matches or the
        title is empty.
    """
    title = str(section_title or "").strip()
    if not title:
        # An empty search term would match every stored title and return an
        # unrelated reference.
        return ""

    refs = db.query(ReportSectionReference)
    if template_id:
        refs = refs.filter(ReportSectionReference.template_id == template_id)

    def _cap(text: str) -> str:
        if len(text) > max_chars:
            return text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return text

    m = re.match(r"(\d+(?:\.\d+)*)", title)
    if m:
        exact = (
            refs.filter(ReportSectionReference.section_key == m.group(1))
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        exact_text = (exact.content or "").strip() if exact else ""
        if exact_text:
            return _cap(exact_text)

    # autoescape keeps "%" and "_" in the title literal instead of letting
    # them act as LIKE wildcards.
    ref = (
        refs.filter(ReportSectionReference.section_title.contains(title[:20], autoescape=True))
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )
    fuzzy_text = (ref.content or "").strip() if ref else ""
    if not fuzzy_text:
        return ""
    return _cap(fuzzy_text)
|
||||||
|
|
||||||
|
|
||||||
|
def list_available_source_files(db: Session) -> list[str]:
    """Return the distinct, non-empty source file names of the stored reference samples.

    The names are ordered by the ``source_file`` column.
    """
    source_column = ReportSectionReference.source_file
    rows = db.query(source_column).distinct().order_by(source_column).all()

    names = []
    for row in rows:
        name = row[0]
        # Skip NULL / empty source names.
        if name:
            names.append(name)
    return names
|
||||||
7771
services/report_generation_service.py
Normal file
7771
services/report_generation_service.py
Normal file
File diff suppressed because it is too large
Load Diff
135
services/report_prompt_service.py
Normal file
135
services/report_prompt_service.py
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from services.prompt_template_service import render_prompt
|
||||||
|
from prompts.report_generation.prompt_defaults import (
|
||||||
|
DEFAULT_SECTION_PROMPT_FALLBACK,
|
||||||
|
DEFAULT_SELECTED_EXAMPLE_FALLBACK,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def chapter_generation_system_prompt() -> str:
    """Render and return the system prompt used for chapter generation."""
    template_path = "report_generation/chapter_generation_system.md"
    return render_prompt(template_path)
|
||||||
|
|
||||||
|
|
||||||
|
def repair_missing_tables_system_prompt() -> str:
    """Render and return the system prompt used when repairing missing tables."""
    template_path = "report_generation/repair_missing_tables_system.md"
    return render_prompt(template_path)
|
||||||
|
|
||||||
|
|
||||||
|
def table_format_repair_system_prompt() -> str:
    """Render and return the system prompt used when repairing table formatting."""
    template_path = "report_generation/table_format_repair_system.md"
    return render_prompt(template_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_prior_sibling_sections_prompt_block(prior_sibling_sections_text: str) -> str:
    """Build the prompt block that carries earlier sibling sections of the same chapter.

    Returns an empty string when the given text is empty or whitespace-only;
    otherwise returns a heading, the stripped text, and a fixed list of
    consistency rules about dates and amounts.
    """
    sibling_text = str(prior_sibling_sections_text or "").strip()
    if sibling_text == "":
        return ""

    heading = "【同章前序小节正文(时间与金额须保持一致)】\n"
    constraints = (
        "【同章一致性约束】\n"
        "1. 竣工时间、开工/中交/投产/验收等关键里程碑日期,以及建设投资、总投资、营业收入、利润等各类金额数字,"
        "须与本章前序小节已写明的口径完全一致(年月日表述可适度简化,但不得出现另一套矛盾日期或金额);\n"
        "2. 若【证据包】或【字段级已抽取结果】中某日期/金额与前序小节矛盾,以前序小节为准写入本节,"
        "不得在正文中另写一套矛盾数值;\n"
        "3. 前序小节为「待补充」的字段,本节仍写「待补充」,不得自行编造;\n"
        "4. 可补充本节新增信息,但不得改写或否定前序小节已确立的时间与金额。"
    )
    return heading + sibling_text + "\n\n" + constraints
|
||||||
|
|
||||||
|
|
||||||
|
def _build_prior_chapters_prompt_block(prior_chapters_text: str) -> str:
    """Build the prompt block that carries the already generated earlier chapters.

    Returns an empty string when the given text is empty or whitespace-only;
    otherwise returns a heading, the stripped text, and a fixed list of rules
    on how the earlier chapters may be used.
    """
    chapters_text = str(prior_chapters_text or "").strip()
    if not chapters_text:
        return ""

    pieces = [
        "【前序章节正文(第1~6章,本章须据此总结)】\n",
        chapters_text,
        "\n\n",
        "【前序章节使用约束】\n",
        "1. 第7章各节是对第1~6章已生成正文的归纳、提炼与升华,不得与前面章节结论矛盾;\n",
        "2. 可概括前文要点,禁止大段照搬;数据与结论须与前文一致;\n",
        "3. 若前序章节某处为「待补充」,本节对应表述也应为「待补充」,不得编造;\n",
        "4. 须由要素管理直出的表格(如表7-1)仍按【章节输出结构约束】执行,不受本条限制。",
    ]
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_section_reference_block(section_reference: str) -> str:
    """Build the prompt block that carries the reference sample for a section.

    Returns an empty string when the given text is empty or whitespace-only;
    otherwise returns a heading, the stripped sample, and a fixed list of
    rules on how the sample may be used.
    """
    sample = str(section_reference or "").strip()
    if len(sample) == 0:
        return ""

    usage_rules = (
        "【参考范文使用约束】\n"
        "1. 以范文为写作蓝本:段落数量与顺序、每段主题、论述逻辑、句式笔法与篇幅颗粒度均须与范文高度一致,做到逐段对应、同一笔法;\n"
        "2. 严禁复用范文中的项目名称、时间、金额、指标值等任何事实数据,须全部替换为当前项目证据包的真实值;\n"
        "3. 范文中的表格结构(表头、列顺序、行项)须沿用,但表内数据必须替换为当前项目证据包的值;\n"
        "4. 禁止逐字照抄:不得出现与范文连续相同超过 15 字的文字,须改写措辞做到“形似而文不同”;\n"
        "5. 若范文与证据包存在矛盾,以证据包为准。"
    )
    heading = "【本章参考范文(本节写作蓝本:结构与行文风格须高度贴合;禁止复用数据、禁止照抄)】\n"
    return f"{heading}{sample}\n\n{usage_rules}"
|
||||||
|
|
||||||
|
|
||||||
|
def build_report_chapter_prompt(
    *,
    section_title: str,
    section_prompt: str,
    required_tables_text: str,
    structured_tables_text: str,
    canonical_fields_text: str,
    selected_example: str,
    heading_rule: str,
    section_contract: str,
    evidence_json: str,
    prior_sibling_sections_text: str = "",
    prior_chapters_text: str = "",
    section_reference: str = "",
) -> str:
    """Render the user prompt for generating one report chapter.

    Empty ``section_prompt`` / ``selected_example`` fall back to the module's
    default texts, and an empty ``required_tables_text`` is rendered as "无".
    The three optional texts are wrapped into their prompt blocks (each block
    is an empty string when its text is blank) before rendering.
    """
    template_args = {
        "section_title": section_title,
        "section_prompt": section_prompt or DEFAULT_SECTION_PROMPT_FALLBACK,
        "required_tables_text": required_tables_text or "无",
        "structured_tables_text": structured_tables_text,
        "canonical_fields_text": canonical_fields_text,
        "selected_example": selected_example or DEFAULT_SELECTED_EXAMPLE_FALLBACK,
        "heading_rule": heading_rule,
        "section_contract": section_contract,
        "evidence_json": evidence_json,
        "prior_sibling_sections_block": _build_prior_sibling_sections_prompt_block(
            prior_sibling_sections_text
        ),
        "prior_chapters_block": _build_prior_chapters_prompt_block(prior_chapters_text),
        "section_reference_block": _build_section_reference_block(section_reference),
    }
    return render_prompt(
        "report_generation/chapter_generation_user_ref_aligned.md",
        **template_args,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_repair_missing_tables_prompt(
    *,
    section_title: str,
    original_prompt: str,
    content: str,
    missing_tables: list[str],
    evidence_json: str,
) -> str:
    """Render the user prompt that asks for missing tables to be added to a section.

    The missing table names are joined with ", ". The original prompt is cut
    to its first 8000 characters and the evidence JSON to its first 12000
    characters before rendering.
    """
    joined_tables = ", ".join(missing_tables)
    prompt_excerpt = original_prompt[:8000]
    evidence_excerpt = evidence_json[:12000]
    return render_prompt(
        "report_generation/repair_missing_tables_user.md",
        section_title=section_title,
        missing_tables=joined_tables,
        content=content,
        original_prompt=prompt_excerpt,
        evidence_json=evidence_excerpt,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_table_format_repair_prompt(
    *,
    section_title: str,
    table_specs_json: str,
    content: str,
    evidence_json: str,
) -> str:
    """Render the user prompt that asks for a section's table formatting to be repaired.

    The evidence JSON is cut to its first 12000 characters before rendering.
    """
    evidence_excerpt = evidence_json[:12000]
    return render_prompt(
        "report_generation/table_format_repair_user.md",
        section_title=section_title,
        table_specs_json=table_specs_json,
        content=content,
        evidence_json=evidence_excerpt,
    )
|
||||||
145
services/report_runtime_store.py
Normal file
145
services/report_runtime_store.py
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from copy import deepcopy
|
||||||
|
from datetime import datetime
|
||||||
|
import threading
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
|
||||||
|
# Re-entrant lock held by the functions in this module while they read or
# modify _JOB_STATES.
_RUNTIME_LOCK = threading.RLock()
# In-memory job state, keyed by job id. Each value is the state dict built by
# init_job_state (job fields plus a "chapters" mapping keyed by section key).
_JOB_STATES: dict[str, dict[str, Any]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _now_str() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    moment = datetime.now()
    return f"{moment:%Y-%m-%d %H:%M:%S}"
|
||||||
|
|
||||||
|
|
||||||
|
def _chapter_payload(
    *,
    section_key: str,
    section_title: str,
    section_order: int,
    status: str = "pending",
) -> dict[str, Any]:
    """Build the initial state dict for one chapter.

    The identifying fields and status come from the arguments, ``updatedAt``
    is set to the current time, and every other field starts as ``None``.
    """
    payload: dict[str, Any] = {
        "sectionKey": section_key,
        "sectionTitle": section_title,
        "sectionOrder": section_order,
        "status": status,
    }
    payload["content"] = None
    payload["errorMessage"] = None
    payload["updatedAt"] = _now_str()
    # Filled in later as generation progresses.
    for empty_key in ("promptText", "evidencePayload", "validationPayload"):
        payload[empty_key] = None
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def init_job_state(
    *,
    job_id: str,
    project_id: str,
    template_id: Optional[str],
    chapters: list[dict[str, Any]],
) -> None:
    """Create (or overwrite) the in-memory state of a job.

    The job starts as ``pending`` with progress 0. Each item of ``chapters``
    must provide ``sectionKey``, ``sectionTitle`` and ``sectionOrder`` and may
    provide ``status`` (defaults to ``pending``); the chapter states are
    stored under ``"chapters"`` keyed by the stringified section key.
    """
    with _RUNTIME_LOCK:
        job_state: dict[str, Any] = {
            "jobId": job_id,
            "projectId": project_id,
            "templateId": template_id,
            "status": "pending",
            "progress": 0,
            "currentSectionKey": None,
            "errorMessage": None,
            "createdAt": _now_str(),
            "updatedAt": _now_str(),
            "completedAt": None,
        }

        chapter_states: dict[str, Any] = {}
        for item in chapters or []:
            key = str(item["sectionKey"])
            chapter_states[key] = _chapter_payload(
                section_key=key,
                section_title=str(item["sectionTitle"]),
                section_order=int(item["sectionOrder"]),
                status=str(item.get("status") or "pending"),
            )

        job_state["chapters"] = chapter_states
        # Publish only after the whole state has been built successfully.
        _JOB_STATES[job_id] = job_state
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_state(job_id: str) -> Optional[dict[str, Any]]:
    """Return a deep copy of the job's state, or ``None`` if there is no (non-empty) state."""
    with _RUNTIME_LOCK:
        found = _JOB_STATES.get(job_id)
        if not found:
            return None
        # Copy so callers cannot mutate the shared state outside the lock.
        return deepcopy(found)
|
||||||
|
|
||||||
|
|
||||||
|
def update_job_state(job_id: str, **fields: Any) -> None:
    """Merge ``fields`` into the job's state and refresh its ``updatedAt``.

    Does nothing when the job has no state.
    """
    with _RUNTIME_LOCK:
        job = _JOB_STATES.get(job_id)
        if job:
            job.update(fields)
            job["updatedAt"] = _now_str()
|
||||||
|
|
||||||
|
|
||||||
|
def update_chapter_state(
    job_id: str,
    section_key: str,
    **fields: Any,
) -> None:
    """Merge ``fields`` into one chapter's state and refresh the timestamps.

    Both the chapter's and the job's ``updatedAt`` are refreshed. Does nothing
    when the job or the chapter is unknown.
    """
    with _RUNTIME_LOCK:
        job = _JOB_STATES.get(job_id)
        chapter = job.get("chapters", {}).get(section_key) if job else None
        if chapter:
            chapter.update(fields)
            chapter["updatedAt"] = _now_str()
            job["updatedAt"] = _now_str()
|
||||||
|
|
||||||
|
|
||||||
|
def append_chapter_content(
    job_id: str,
    section_key: str,
    delta_text: str,
    *,
    stream_phase: str,
) -> None:
    """Append streamed text to a chapter's content and record the stream phase.

    The phase is stored as ``validationPayload["streamPhase"]``; the job's
    ``currentSectionKey`` is pointed at this chapter and both timestamps are
    refreshed. Does nothing when ``delta_text`` is empty or when the job or
    chapter is unknown.
    """
    if not delta_text:
        return

    with _RUNTIME_LOCK:
        job = _JOB_STATES.get(job_id)
        chapter = job.get("chapters", {}).get(section_key) if job else None
        if not chapter:
            return

        # Compute both new values before mutating anything.
        previous_text = str(chapter.get("content") or "")
        phase_info = dict(chapter.get("validationPayload") or {})
        phase_info["streamPhase"] = stream_phase

        chapter.update(
            {
                "content": previous_text + delta_text,
                "validationPayload": phase_info,
                "updatedAt": _now_str(),
            }
        )
        job["currentSectionKey"] = section_key
        job["updatedAt"] = _now_str()
|
||||||
|
|
||||||
|
|
||||||
|
def set_chapter_stream_phase(job_id: str, section_key: str, stream_phase: str) -> None:
    """Record a chapter's stream phase without touching its content.

    The phase is stored as ``validationPayload["streamPhase"]``; the job's
    ``currentSectionKey`` is pointed at this chapter and both timestamps are
    refreshed. Does nothing when the job or chapter is unknown.
    """
    with _RUNTIME_LOCK:
        job = _JOB_STATES.get(job_id)
        chapter = job.get("chapters", {}).get(section_key) if job else None
        if not chapter:
            return

        phase_info = dict(chapter.get("validationPayload") or {})
        phase_info["streamPhase"] = stream_phase

        chapter["validationPayload"] = phase_info
        chapter["updatedAt"] = _now_str()
        job["currentSectionKey"] = section_key
        job["updatedAt"] = _now_str()
|
||||||
|
|
||||||
|
|
||||||
|
def remove_job_state(job_id: str) -> None:
    """Drop the in-memory state of a job; unknown job ids are ignored."""
    with _RUNTIME_LOCK:
        if job_id in _JOB_STATES:
            del _JOB_STATES[job_id]
|
||||||
324
services/retrieval_service.py
Normal file
324
services/retrieval_service.py
Normal file
@ -0,0 +1,324 @@
|
|||||||
|
"""
|
||||||
|
services/retrieval_service.py
|
||||||
|
后评价报告材料检索服务
|
||||||
|
用于从向量库中检索与后评价报告相关的材料
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from function.vector_store import VectorStore
|
||||||
|
|
||||||
|
|
||||||
|
class RetrievalService:
    """Retrieval service for post-evaluation report source materials.

    Wraps a vector store and offers keyword-driven lookups scoped to one
    project, returning either documents or plain dict rows.
    """

    # Search keywords issued for each material category.
    _CATEGORY_KEYWORDS = {
        "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"],
        "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"],
        "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"],
        "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"],
        "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"],
        "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"],
    }

    # Chapter lookups reuse the category keywords and add two chapters with
    # their own, much longer keyword lists.
    _CHAPTER_KEYWORDS = {
        **_CATEGORY_KEYWORDS,
        "项目全过程总结与管理评价": [
            # ---- Highest priority: tables 1-14 and numbered subsections ----
            "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6",
            "表1原料数量及组成对比表", "表2原料性质对比表",
            "表3前期预测和2019年实际产品对比表",
            "表4装置规模及实际运行负荷对比表",
            "表5项目规模对比表",
            "表6可研报告与基础设计阶段工程内容对比表",
            "表7项目承包商的招投标情况表",
            "表8项目设计主要进度控制情况表",
            "表9施工图设计变更情况表",
            "表10重大设计变更情况表",
            "表11主要设备采购情况表",
            "表12施工重要节点进度表",
            "表13原料性质对比表",
            "表14主要标定结果与设计指标对比表",
            # ---- Lower priority: structural keywords ----
            "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价",
            "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备",
            "工程监理", "HSE", "竣工验收",
            "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果",
            "原料数量及组成对比", "装置规模", "负荷率",
        ],
        "项目目标和可持续性评价": [
            # Highest priority: chapter titles and numbering
            "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5",
            "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价",
            # Goal achievement (engineering / technical / economic)
            "工程规模", "项目进度", "工程质量", "项目功能", "投资控制",
            "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核",
            "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润",
            # Benchmarking
            "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃",
            # Sustainability (resources / products / internal factors / policy)
            "资源分析", "原料供应", "资源保障",
            "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB",
            "项目内部因素", "装置规模合理性", "工艺方案", "技术水平",
            "国家政策", "产业政策", "质量标准",
            # For materials that support sustainability via safety / environmental compliance
            "个人风险", "社会风险", "可接受", "风险曲线",
            "非甲烷总烃", "无组织排放", "mg/m3", "标准值",
        ],
    }

    def __init__(self, collection_name: str = "eval_report"):
        """Initialise the service.

        Args:
            collection_name: Name of the vector store collection to open
                (opened with ``drop_old=False``).
        """
        self.collection_name = collection_name
        self.vector_store = VectorStore(collection_name=collection_name, drop_old=False)

    def search_by_query(self, query: str, top_k: int = 10, filter_project: Optional[str] = None) -> List[Document]:
        """Retrieve materials related to a query string.

        Args:
            query: Query text, typically a topic phrase.
            top_k: Number of hits requested from the vector store.
            filter_project: Optional project UUID. When given it is appended
                to the query text and hits whose ``project_uuid`` metadata
                differs are dropped.

        Returns:
            The matching documents. Each document's metadata gets a ``score``
            entry holding its similarity score unless one is already present.
        """
        if filter_project:
            full_query = f"{query} 项目 UUID:{filter_project}"
        else:
            full_query = query

        results = self.vector_store.similarity_search_with_score(full_query, k=top_k)

        docs = []
        for doc, score in results:
            # NOTE: the project filter runs after the store has already
            # limited the hits to top_k, so fewer than top_k documents may
            # be returned.
            if filter_project and doc.metadata.get("project_uuid") != filter_project:
                continue
            # Keep the similarity score with the document; the row builders
            # below read it from the metadata and would otherwise always
            # report the 0.0 default.
            doc.metadata.setdefault("score", float(score))
            docs.append(doc)
        return docs

    @staticmethod
    def _doc_to_row(doc: Document) -> Dict[str, Any]:
        """Convert a document into the plain dict row returned by the keyword lookups."""
        metadata = doc.metadata
        return {
            "content": doc.page_content,
            "heading": metadata.get("heading", ""),
            "heading_level": metadata.get("heading_level", 0),
            "doc_id": metadata.get("doc_id", ""),
            "path": metadata.get("path", ""),
            "score": metadata.get("score", 0.0),
        }

    def _search_keywords(self, keywords: List[str], project_uuid: str, top_k: int) -> List[Dict[str, Any]]:
        """Search each keyword in order and return the first ``top_k`` unique hits as rows.

        Hits are considered duplicates when the first 100 characters of their
        content and their heading are both equal.
        """
        seen = set()
        unique_docs = []
        for keyword in keywords:
            # Only the first top_k unique hits are returned, so further
            # searches cannot change the result once that many are found.
            if 0 <= top_k <= len(unique_docs):
                break
            for doc in self.search_by_query(keyword, top_k=5, filter_project=project_uuid):
                key = (doc.page_content[:100], doc.metadata.get("heading", ""))
                if key in seen:
                    continue
                seen.add(key)
                unique_docs.append(doc)
        return [self._doc_to_row(doc) for doc in unique_docs[:top_k]]

    def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve materials for a category.

        Args:
            category: Category name. Known categories are expanded into
                several search keywords; an unknown one is searched as-is.
            project_uuid: Project UUID the hits must belong to.
            top_k: Maximum number of rows to return.

        Returns:
            De-duplicated rows with ``content``, ``heading``,
            ``heading_level``, ``doc_id``, ``path`` and ``score``.
        """
        keywords = self._CATEGORY_KEYWORDS.get(category, [category])
        return self._search_keywords(keywords, project_uuid, top_k)

    def get_project_materials(self, project_uuid: str) -> Dict[str, Any]:
        """Collect the text of the top materials of a project for four fixed topics.

        Args:
            project_uuid: Project UUID.

        Returns:
            A dict with the keys ``basic_info``, ``tech_info``,
            ``finance_info`` and ``benefit_info``, each a list of document
            texts (at most five were requested per topic).
        """
        topic_queries = {
            "basic_info": "项目概况 项目基本情况",
            "tech_info": "技术方案 工艺技术",
            "finance_info": "财务评价 经济效益",
            "benefit_info": "效益分析 社会效益",
        }
        return {
            key: [
                doc.page_content
                for doc in self.search_by_query(topic_query, top_k=5, filter_project=project_uuid)
            ]
            for key, topic_query in topic_queries.items()
        }

    def search_similar_report(self, reference_content: str, top_k: int = 5) -> List[Document]:
        """Retrieve report materials using a fixed generic query.

        Args:
            reference_content: Reference report content.
                NOTE(review): currently not used to build the query; the
                search always uses the same fixed phrase. Confirm whether the
                reference text was meant to drive the search.
            top_k: Number of hits to return.

        Returns:
            The documents returned by the vector store, in store order.
        """
        query = "后评价报告 项目概况 技术方案 财务评价"
        results = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [doc for doc, _score in results]

    def get_template_data(self, project_uuid: str, query: str = "项目概况 技术方案 财务评价", top_k: int = 15) -> Dict[str, Any]:
        """Retrieve materials and map them onto the report template.

        Args:
            project_uuid: Project UUID.
            query: Query text used for the retrieval.
            top_k: Number of hits requested.

        Returns:
            A dict with ``materials`` (documents), ``materials_text`` (their
            texts), ``template_data`` and ``key_info``. All four keys are
            present even when nothing is found.
        """
        from report_template import ReportTemplate

        materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid)
        if not materials:
            return {
                "materials": [],
                # Same keys as the non-empty result so callers can index
                # the result without checking which shape they received.
                "materials_text": [],
                "template_data": {},
                "key_info": {},
            }

        key_info = ReportTemplate.extract_key_info([doc.page_content for doc in materials])
        template_data = ReportTemplate.map_materials_to_template([doc.page_content for doc in materials])

        return {
            "materials": list(materials),
            "materials_text": [doc.page_content for doc in materials],
            "template_data": template_data,
            "key_info": key_info,
        }

    def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Retrieve materials for a report chapter.

        Args:
            project_uuid: Project UUID the hits must belong to.
            chapter: Chapter name. Known chapters are expanded into several
                search keywords; an unknown one is searched as-is.
            top_k: Maximum number of rows to return.

        Returns:
            De-duplicated rows with ``content``, ``heading``,
            ``heading_level``, ``doc_id``, ``path`` and ``score``.
        """
        keywords = self._CHAPTER_KEYWORDS.get(chapter, [chapter])
        return self._search_keywords(keywords, project_uuid, top_k)
|
||||||
|
|
||||||
|
|
||||||
|
# Retrieval examples
if __name__ == "__main__":
    # Build the retrieval service against the default collection.
    service = RetrievalService()

    # Each entry is (banner to print, query to run); they run in order.
    demo_runs = (
        ("示例 1:搜索项目背景", "项目背景 建设内容"),
        ("示例 2:搜索财务评价", "财务评价 现金流量"),
    )
    for banner, demo_query in demo_runs:
        print(banner)
        for doc in service.search_by_query(demo_query, top_k=3):
            print(f"标题:{doc.metadata.get('heading', 'N/A')}")
            print(f"内容:{doc.page_content[:200]}...\n")
|
||||||
1395
services/standard_elements_2020.py
Normal file
1395
services/standard_elements_2020.py
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user