commit aa98ea262395b0e4831f300819a7e992f9bc413f Author: xxy Date: Fri Jun 5 18:45:29 2026 +0800 @ Initial commit Co-Authored-By: Claude Opus 4.8 @ diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..b4ea07a --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# 复制为 .env 后按实际环境填写。 + +# 数据库(MySQL,与原 eval_report 共用同一库) +DATABASE_URL=mysql+pymysql://root:123456@127.0.0.1:3306/post_eval_report?charset=utf8mb4 + +# 文档存储根目录(附图提取按 DOC_PAT/{project_uuid}/<相对路径> 定位 .docx) +DOC_PAT=./docpath + +# LLM(OpenAI 兼容接口) +LLM_API_BASE= +LLM_API_KEY= +LLM_MODEL_NAME= +# 报告章节单次 chat 读超时(秒),长章节建议 600+ +REPORT_LLM_HTTP_TIMEOUT_SEC=600 + +# Embedding / Milvus(向量检索证据) +EMBEDDING_API_BASE= +EMBEDDING_API_KEY= +MILVUS_DB_URL= + +# 服务监听 +HOST=0.0.0.0 +PORT=8099 +RELOAD=false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2969a31 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# Python-generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# Virtual environments +.venv + +# Environment / secrets +.env + +# Local artifacts +*.log +.DS_Store +comp/ +docpath/ +docs/ +logs/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..c87567e --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ +# 报告生成服务(独立抽取版) + +从 `eval_report` 中抽取出的「后评价报告核心生成」链路,作为独立 FastAPI 服务运行。 +保留原有的证据装配(要素表 + Milvus 向量检索)、分章 LLM 生成、表格修复、报告合并与 SSE 流式进度, +连接与原项目相同的 MySQL / Milvus / LLM 服务。 + +## 范围 + +- 包含:异步分章生成任务、进度查询、结果获取、SSE 实时事件、章节重试、任务取消。 +- 不含:鉴权、知识库 worker、模板/范文管理、Word(docx) 导出(这些仍在原 `eval_report` 中)。 + +## 目录结构 + +``` +report_generation/ + main.py FastAPI 入口 + config.py 配置(DB / LLM / Embedding / Milvus / DOC_PAT) + database/ SQLAlchemy 引擎、Session、ORM 模型、建表 + schemas/ Pydantic 模型 + services/ 报告生成核心逻辑(含瘦身版 kb_service / docx_export_service / project_service) + function/vector_store.py Milvus 向量库封装 + prompts/report_generation/ 提示词模板与章节合同 + routers/report.py 报告生成 HTTP 端点 +``` + +## 快速开始 + +```bash +pip install -r requirements.txt +cp 
class Settings(BaseSettings):
    """Runtime configuration for the report-generation service.

    Every attribute is a setting with the default shown below.
    ``model_config`` additionally loads values from a UTF-8 ``.env`` file and
    ignores keys that are not declared on this class.
    """

    # Basic application metadata
    APP_TITLE: str = "智能报告生成服务 API"
    APP_VERSION: str = "0.1.0"
    APP_DESCRIPTION: str = "后评价报告分章异步生成后端服务(独立抽取版)"

    # Listening address of the service
    # NOTE(review): the README quick-start runs `uvicorn main:app --reload`
    # without --host/--port but then points readers at port 8099; uvicorn's
    # own CLI default is 8000 — confirm how HOST/PORT are actually applied.
    HOST: str = "0.0.0.0"
    PORT: int = 8099
    RELOAD: bool = False

    # Front-end origins allowed by CORS (wide open during development;
    # switch to concrete domains in production)
    CORS_ORIGINS: list[str] = ["*"]

    # Database (MySQL)
    # NOTE(review): the default DSN embeds a sample root password; it should
    # always be overridden through .env / environment in real deployments.
    DATABASE_URL: str = "mysql+pymysql://root:123456@127.0.0.1:3306/post_eval_report?charset=utf8mb4"
    DB_POOL_SIZE: int = 15
    DB_MAX_OVERFLOW: int = 25
    DB_POOL_TIMEOUT: int = 60
    DB_POOL_PRE_PING: bool = True

    # Root directory of stored documents (figure extraction locates the .docx
    # under DOC_PAT/{project_uuid}/<relative path>)
    DOC_PAT: str = "./docpath"

    # LLM (OpenAI-compatible API)
    LLM_API_BASE: str = ""
    LLM_API_KEY: str = ""
    LLM_MODEL_NAME: str = ""
    LLM_HTTP_TIMEOUT_SEC: int = 120
    LLM_CONNECT_TIMEOUT_SEC: int = 30
    LLM_RETRY_COUNT: int = 3
    LLM_RETRY_BACKOFF_SEC: float = 1.0
    LLM_RETRY_BACKOFF_MAX_SEC: float = 12.0
    # Read timeout (seconds) of a single chat call for one report chapter.
    # 0 means "fall back to LLM_HTTP_TIMEOUT_SEC"; 600+ is advised for long
    # chapters.
    REPORT_LLM_HTTP_TIMEOUT_SEC: int = 600
    # When a chapter's LLM call still fails, write placeholder text and carry
    # on with the following chapters instead of failing the whole job.
    REPORT_LLM_CONTINUE_ON_TIMEOUT: bool = True
    # Delayed re-extraction of tables (after a first-round failure the item is
    # queued and retried with a delay, round by round)
    LLM_TABLE_DELAY_RETRY_ROUNDS: int = 2
    LLM_TABLE_DELAY_RETRY_SEC: float = 8.0
    LLM_TABLE_DELAY_RETRY_BACKOFF: float = 2.0
    LLM_TABLE_DELAY_RETRY_MAX_SEC: float = 60.0

    # Embedding / Milvus (vector-retrieval evidence, L2/L3)
    EMBEDDING_API_KEY: str = ""
    EMBEDDING_API_BASE: str = ""
    EMBEDDING_BATCH_MAX_DOCS: int = 4
    EMBEDDING_BATCH_MAX_CHARS: int = 12000
    EMBEDDING_MAX_CHUNK_CHARS: int = 4000
    MILVUS_DB_URL: str = ""

    # Load overrides from a UTF-8 ".env" file; undeclared keys are ignored.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
def get_db() -> Generator[Session, None, None]:
    """
    Yield a database Session for use with FastAPI ``Depends()``.

    A new Session is created from ``SessionLocal`` for each call and is closed
    once the consumer has finished with the generator, whether or not an
    exception was raised in between.

    Usage:
        @router.get("/items")
        def list_items(db: Session = Depends(get_db)):
            ...
    """
    # The Session context manager closes the session on exit, which is the
    # same cleanup an explicit try/finally + close() performs.
    with SessionLocal() as session:
        yield session
-- Knowledge-base documents (status: 0=failed 2=queued 3=processing 4=available)
CREATE TABLE IF NOT EXISTS kb_documents (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    directory_id VARCHAR(64) NULL,
    name VARCHAR(255) NOT NULL,
    size VARCHAR(32) NOT NULL,
    file_path VARCHAR(512),
    storage_rel_path VARCHAR(512) NULL COMMENT '项目内完整相对路径(含文件名)',
    word_count INT DEFAULT 0,
    uploaded_at DATETIME NOT NULL,
    status INT DEFAULT 2,
    error_message TEXT NULL,
    factor JSON NULL COMMENT '文档要素 JSON 数组',
    category VARCHAR(32) NULL DEFAULT NULL COMMENT '文件分类',
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (directory_id) REFERENCES kb_directories(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_kb_docs_project ON kb_documents(project_id);
CREATE INDEX idx_kb_docs_directory ON kb_documents(directory_id);

-- If a kb_documents table already exists, run the statement below to add the word_count column:
-- ALTER TABLE kb_documents ADD COLUMN word_count INT DEFAULT 0 AFTER file_path;

-- Authored documents (project_id references projects.uuid, same as kb_documents)
CREATE TABLE IF NOT EXISTS write_documents (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    title VARCHAR(255) NOT NULL,
    content LONGTEXT,
    word_count INT DEFAULT 0,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    status VARCHAR(16) DEFAULT 'draft',
    sort_order INT DEFAULT 0,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_write_docs_project ON write_documents(project_id);

-- Document versions
CREATE TABLE IF NOT EXISTS doc_versions (
    id VARCHAR(64) PRIMARY KEY,
    document_id VARCHAR(64) NOT NULL,
    version VARCHAR(32) NOT NULL,
    content LONGTEXT NOT NULL,
    citation_payload LONGTEXT NULL,
    saved_at DATETIME NOT NULL,
    author VARCHAR(64) NOT NULL,
    note TEXT,
    FOREIGN KEY (document_id) REFERENCES write_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_versions_doc ON doc_versions(document_id);

-- Element table definitions (global / time-dimension)
CREATE TABLE IF NOT EXISTS element_tables (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    table_type VARCHAR(32) NOT NULL,
    table_name VARCHAR(255) NOT NULL,
    year INT NULL,
    is_time_dimension TINYINT(1) DEFAULT 0,
    sort_order INT DEFAULT 0,
    sync_suppressed_row_keys LONGTEXT NULL,
    custom_row_order LONGTEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_element_tables_project ON element_tables(project_id);
CREATE INDEX idx_element_tables_type_year ON element_tables(table_type, year);
CREATE INDEX idx_element_tables_name ON element_tables(table_name);

-- Element cells
CREATE TABLE IF NOT EXISTS element_cells (
    id VARCHAR(64) PRIMARY KEY,
    table_id VARCHAR(64) NOT NULL,
    project_id VARCHAR(32) NOT NULL,
    row_key VARCHAR(255) NOT NULL,
    col_key VARCHAR(255) NULL,
    year INT NULL,
    value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_line_no INT NULL,
    source_line_end INT NULL,
    source_quote TEXT NULL,
    confidence FLOAT NULL,
    extraction_batch_id VARCHAR(64) NULL,
    extraction_model VARCHAR(128) NULL,
    source_type VARCHAR(16) NULL COMMENT 'extract=文档抽取, manual=手工输入',
    conflict_status VARCHAR(16) DEFAULT 'none',
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE CASCADE,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_element_cells_project ON element_cells(project_id);
CREATE INDEX idx_element_cells_row_col ON element_cells(row_key, col_key);
CREATE INDEX idx_element_cells_year ON element_cells(year);

-- Retained extraction results (table / element)
CREATE TABLE IF NOT EXISTS extraction_results (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    document_id VARCHAR(64) NOT NULL,
    batch_id VARCHAR(64) NOT NULL,
    result_type VARCHAR(16) NOT NULL,
    table_type VARCHAR(32) NULL,
    table_name VARCHAR(255) NULL,
    year INT NULL,
    item_key VARCHAR(255) NOT NULL,
    item_value LONGTEXT NULL,
    source_line_no INT NULL,
    source_line_end INT NULL,
    confidence FLOAT NULL,
    raw_payload JSON NULL,
    extracted_at DATETIME NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_extraction_project_doc ON extraction_results(project_id, document_id);
CREATE INDEX idx_extraction_batch ON extraction_results(batch_id);
CREATE INDEX idx_extraction_table_name ON extraction_results(table_name);
CREATE INDEX idx_extraction_key ON extraction_results(item_key);

-- Element extraction result details (for "guideline chapter/section prompt -> project material")
CREATE TABLE IF NOT EXISTS element_extraction_results (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    table_type VARCHAR(32) NOT NULL,
    year INT NULL,
    table_name VARCHAR(255) NOT NULL,
    extracted_at DATETIME NOT NULL,
    item_key VARCHAR(255) NOT NULL,
    item_value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_line_no INT NULL,
    source_line_end INT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_el_ext_project ON element_extraction_results(project_id);
CREATE INDEX idx_el_ext_table ON element_extraction_results(table_type, year, table_name);
CREATE INDEX idx_el_ext_key ON element_extraction_results(item_key);
CREATE INDEX idx_el_ext_source_doc ON element_extraction_results(source_document_id);

-- Conflict records
CREATE TABLE IF NOT EXISTS element_conflicts (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    table_id VARCHAR(64) NULL,
    cell_id VARCHAR(64) NULL,
    item_key VARCHAR(255) NOT NULL,
    old_value LONGTEXT NULL,
    new_value LONGTEXT NULL,
    selected_value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_line_no INT NULL,
    status VARCHAR(16) DEFAULT 'pending',
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
    FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_element_conflicts_project ON element_conflicts(project_id);
CREATE INDEX idx_element_conflicts_status ON element_conflicts(status);

-- Document markdown persisted to the database
CREATE TABLE IF NOT EXISTS document_markdowns (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    document_id VARCHAR(64) NOT NULL,
    extracted_filename VARCHAR(255) NULL,
    markdown_content LONGTEXT NOT NULL,
    content_hash VARCHAR(64) NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_markdowns_project_doc ON document_markdowns(project_id, document_id);

-- Document paragraph chunks
CREATE TABLE IF NOT EXISTS document_chunks (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    document_id VARCHAR(64) NOT NULL,
    markdown_id VARCHAR(64) NULL,
    heading VARCHAR(512) NULL,
    chunk_text LONGTEXT NOT NULL,
    chunk_index INT DEFAULT 0,
    source_line_start INT NULL,
    source_line_end INT NULL,
    vector_id VARCHAR(128) NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE,
    FOREIGN KEY (markdown_id) REFERENCES document_markdowns(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_chunks_project_doc ON document_chunks(project_id, document_id);
CREATE INDEX idx_chunks_heading ON document_chunks(heading(255));

-- Standalone background tasks: pdf2md file processing and element-agent element extraction
CREATE TABLE IF NOT EXISTS tasks (
    id VARCHAR(64) PRIMARY KEY,
    project VARCHAR(64) NOT NULL,
    task_type INT NOT NULL,
    file_id VARCHAR(64) NULL,
    file_path VARCHAR(1024) NULL,
    status INT NOT NULL DEFAULT 1,
    payload_json JSON NULL,
    result_path VARCHAR(1024) NULL,
    error_message LONGTEXT NULL,
    add_time DATETIME NOT NULL,
    finish_time DATETIME NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_tasks_status_type_time ON tasks(status, task_type, add_time);
CREATE INDEX idx_tasks_project ON tasks(project);
CREATE INDEX idx_tasks_file_id ON tasks(file_id);

-- Template management
CREATE TABLE IF NOT EXISTS report_templates (
    id VARCHAR(64) PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    description TEXT NULL,
    is_default TINYINT(1) DEFAULT 0,
    is_active TINYINT(1) DEFAULT 1,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_templates_default ON report_templates(is_default);

CREATE TABLE IF NOT EXISTS report_template_sections (
    id VARCHAR(64) PRIMARY KEY,
    template_id VARCHAR(64) NOT NULL,
    section_key VARCHAR(64) NOT NULL,
    section_title VARCHAR(255) NOT NULL,
    section_prompt LONGTEXT NULL,
    section_output_contract LONGTEXT NULL,
    section_order INT DEFAULT 0,
    examples LONGTEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_template_sections_template ON report_template_sections(template_id);

-- Report generation jobs (7 chapters, generated asynchronously per chapter)
CREATE TABLE IF NOT EXISTS report_generation_jobs (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    template_id VARCHAR(64) NULL,
    status VARCHAR(16) DEFAULT 'pending',
    progress INT DEFAULT 0,
    current_section_key VARCHAR(64) NULL,
    error_message TEXT NULL,
    requested_by VARCHAR(64) NULL,
    options JSON NULL,
    snapshot JSON NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    completed_at DATETIME NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_report_jobs_project ON report_generation_jobs(project_id);
CREATE INDEX idx_report_jobs_status ON report_generation_jobs(status);

CREATE TABLE IF NOT EXISTS report_generation_chapters (
    id VARCHAR(64) PRIMARY KEY,
    job_id VARCHAR(64) NOT NULL,
    section_key VARCHAR(64) NOT NULL,
    section_title VARCHAR(255) NOT NULL,
    section_order INT DEFAULT 0,
    status VARCHAR(16) DEFAULT 'pending',
    content LONGTEXT NULL,
    prompt_text LONGTEXT NULL,
    evidence_payload JSON NULL,
    validation_payload JSON NULL,
    error_message TEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    completed_at DATETIME NULL,
    FOREIGN KEY (job_id) REFERENCES report_generation_jobs(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_report_chapters_job ON report_generation_chapters(job_id);
CREATE INDEX idx_report_chapters_status ON report_generation_chapters(status);

-- Minimal RBAC
CREATE TABLE IF NOT EXISTS departments (
    id VARCHAR(64) PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    description TEXT NULL,
    parent_id VARCHAR(64) NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (parent_id) REFERENCES departments(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE IF NOT EXISTS users (
    id VARCHAR(64) PRIMARY KEY,
    username VARCHAR(64) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NULL,
    department_id VARCHAR(64) NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL,
    FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_users_department ON users(department_id);

CREATE TABLE IF NOT EXISTS roles (
    id VARCHAR(64) PRIMARY KEY,
    name VARCHAR(64) NOT NULL UNIQUE,
    description TEXT NULL,
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE IF NOT EXISTS permissions (
    id VARCHAR(64) PRIMARY KEY,
    perm_key VARCHAR(128) NOT NULL UNIQUE,
    perm_type VARCHAR(32) NOT NULL,
    description TEXT NULL,
    created_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_permissions_type ON permissions(perm_type);

CREATE TABLE IF NOT EXISTS role_permissions (
    id VARCHAR(64) PRIMARY KEY,
    role_id VARCHAR(64) NOT NULL,
    permission_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE,
    FOREIGN KEY (permission_id) REFERENCES permissions(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE IF NOT EXISTS user_roles (
    id VARCHAR(64) PRIMARY KEY,
    user_id VARCHAR(64) NOT NULL,
    role_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE IF NOT EXISTS project_members (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    user_id VARCHAR(64) NOT NULL,
    role VARCHAR(32) DEFAULT 'editor',
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_project_members_project ON project_members(project_id);

CREATE TABLE IF NOT EXISTS project_departments (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    department_id VARCHAR(64) NOT NULL,
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE CASCADE,
    UNIQUE KEY uq_project_department (project_id, department_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_project_departments_project ON project_departments(project_id);

-- Fill records: every element back-fill leaves a trace so its evidence can be traced
CREATE TABLE IF NOT EXISTS fill_records (
    id VARCHAR(64) PRIMARY KEY,
    project_id VARCHAR(32) NOT NULL,
    cell_id VARCHAR(64) NULL,
    table_id VARCHAR(64) NULL,
    row_key VARCHAR(255) NOT NULL,
    col_key VARCHAR(255) NULL,
    year INT NULL,
    filled_value LONGTEXT NULL,
    previous_value LONGTEXT NULL,
    source_document_id VARCHAR(64) NULL,
    source_document_name VARCHAR(255) NULL COMMENT '冗余存储文档名,文档删除后仍可追溯',
    source_line_no INT NULL,
    source_line_end INT NULL,
    source_quote TEXT NULL COMMENT '原文摘录片段,作为回填依据',
    confidence FLOAT NULL,
    extraction_batch_id VARCHAR(64) NULL,
    extraction_model VARCHAR(128) NULL COMMENT '使用的 LLM 模型标识',
    fill_type VARCHAR(16) NOT NULL DEFAULT 'auto' COMMENT 'auto=抽取回填, manual=人工编辑, resolve=冲突解决',
    created_at DATETIME NOT NULL,
    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
    FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_fill_records_project ON fill_records(project_id);
CREATE INDEX idx_fill_records_cell ON fill_records(cell_id);
CREATE INDEX idx_fill_records_batch ON fill_records(extraction_batch_id);
CREATE INDEX idx_fill_records_source_doc ON fill_records(source_document_id);
CREATE INDEX idx_fill_records_created ON fill_records(created_at);

-- ============================================================
-- report_section_references: per-section reference sample texts
-- ============================================================
CREATE TABLE IF NOT EXISTS report_section_references (
    id VARCHAR(64) PRIMARY KEY,
    template_id VARCHAR(64) NULL COMMENT '关联模板ID(report_templates.id),按模板过滤参考范文',
    source_file VARCHAR(255) NOT NULL COMMENT '来源文件名',
    section_key VARCHAR(64) NOT NULL COMMENT '章节标识,如 1.1、2.1.1',
    section_title VARCHAR(255) NOT NULL COMMENT '章节标题',
    section_order INT DEFAULT 0 COMMENT '章节序号',
    content TEXT NOT NULL COMMENT '该章节的参考范文 Markdown',
    created_at DATETIME NOT NULL,
    updated_at DATETIME NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE INDEX idx_ref_source_file ON report_section_references(source_file);
CREATE INDEX idx_ref_section_key ON report_section_references(section_key);
CREATE INDEX idx_ref_template_id ON report_section_references(template_id);
+ +import re +from pathlib import Path + +from sqlalchemy import text + +from database.core import engine + +# DDL 与 init_db.py 同目录:database/init.sql +INIT_SQL_PATH = Path(__file__).resolve().parent / "init.sql" + +INIT_TABLES = [ + "projects", + "kb_directories", + "kb_documents", + "write_documents", + "doc_versions", + "element_tables", + "element_cells", + "extraction_results", + "element_extraction_results", + "element_conflicts", + "document_markdowns", + "document_chunks", + "report_templates", + "report_template_sections", + "report_generation_jobs", + "report_generation_chapters", + "departments", + "users", + "roles", + "permissions", + "role_permissions", + "user_roles", + "project_members", + "project_departments", + "fill_records", + "report_section_references", +] + + +_TARGET_TABLE_COLLATION = "utf8mb4_unicode_ci" + + +def _existing_tables(conn) -> set[str]: + return { + row[0] + for row in conn.execute( + text( + "SELECT TABLE_NAME FROM information_schema.TABLES " + "WHERE TABLE_SCHEMA = DATABASE()" + ) + ).fetchall() + } + + +def _table_collation(conn, table_name: str) -> str | None: + row = conn.execute( + text( + "SELECT TABLE_COLLATION FROM information_schema.TABLES " + "WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = :table_name" + ), + {"table_name": table_name}, + ).first() + return str(row[0]).strip() if row and row[0] else None + + +def _column_collation(conn, table_name: str, column_name: str) -> str | None: + row = conn.execute( + text( + "SELECT COLLATION_NAME FROM information_schema.COLUMNS " + "WHERE TABLE_SCHEMA = DATABASE() " + "AND TABLE_NAME = :table_name AND COLUMN_NAME = :column_name" + ), + {"table_name": table_name, "column_name": column_name}, + ).first() + return str(row[0]).strip() if row and row[0] else None + + +def _normalize_projects_table(conn) -> None: + """ + 将历史库表/列统一为 utf8mb4_unicode_ci(仅在实际不一致时执行 ALTER)。 + + 切勿在每次启动时对已迁移库重复 CONVERT:会长时间持有 metadata lock, + 阻塞所有对 projects 等表的读写,并导致连接池耗尽。 + """ + existing = 
_existing_tables(conn) + tables_to_convert = [ + name + for name in INIT_TABLES + if name in existing and _table_collation(conn, name) != _TARGET_TABLE_COLLATION + ] + projects_uuid_needs_fix = ( + "projects" in existing + and _column_collation(conn, "projects", "uuid") != _TARGET_TABLE_COLLATION + ) + if not tables_to_convert and not projects_uuid_needs_fix: + return + + conn.execute(text("SET FOREIGN_KEY_CHECKS=0")) + try: + for table_name in tables_to_convert: + conn.execute( + text( + f"ALTER TABLE `{table_name}` " + "CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci" + ) + ) + if projects_uuid_needs_fix: + conn.execute( + text( + "ALTER TABLE projects " + "MODIFY uuid VARCHAR(32) CHARACTER SET utf8mb4 " + "COLLATE utf8mb4_unicode_ci NOT NULL" + ) + ) + conn.commit() + finally: + conn.execute(text("SET FOREIGN_KEY_CHECKS=1")) + + +def _split_sql_statements(content: str) -> list[str]: + """ + 按分号拆分 SQL 语句,忽略注释和空行。 + 简单实现:不处理字符串内的分号。 + """ + # 移除单行注释 + content = re.sub(r"--[^\n]*", "", content) + # 移除多行注释 + content = re.sub(r"/\*.*?\*/", "", content, flags=re.DOTALL) + statements = [ + s.strip() + for s in content.split(";") + if s.strip() and not s.strip().startswith("--") + ] + return statements + + +def init_database() -> None: + """ + 执行 init.sql,创建表结构,并按需执行缺失字段迁移。 + + 注意:init.sql 里使用了 `CREATE TABLE IF NOT EXISTS`,因此对“已存在但缺列”的旧库, + 需要额外执行对应迁移脚本(例如补齐 `kb_documents.factor`)。 + """ + if not INIT_SQL_PATH.exists(): + return + + sql_text = INIT_SQL_PATH.read_text(encoding="utf-8") + statements = _split_sql_statements(sql_text) + + with engine.connect() as conn: + for stmt in statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + # 表/索引已存在时忽略(Duplicate key name、already exists) + err_msg = str(e).lower() + # 历史库可能缺列,导致 CREATE INDEX 报 "Key column ... 
doesn't exist in table"。 + # 这里先跳过,后续 migrate_extraction_results.sql 会补齐列并建索引。 + if ( + "already exists" in err_msg + or "duplicate" in err_msg + or ("key column" in err_msg and "doesn't exist in table" in err_msg) + or "error 1072" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + # 仅在字符集未达标时执行 ALTER(勿在每次 CREATE TABLE projects 后重复调用) + _normalize_projects_table(conn) + + # ------------------------------------------------------------------ + # Missing-column migrations (idempotent via "duplicate column" ignore) + # ------------------------------------------------------------------ + factor_migrate_path = Path(__file__).resolve().parent / "migrate_kb_documents_factor.sql" + if factor_migrate_path.exists(): + factor_sql_text = factor_migrate_path.read_text(encoding="utf-8") + factor_statements = _split_sql_statements(factor_sql_text) + + with engine.connect() as conn: + for stmt in factor_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + # MySQL: Error 1060 "Duplicate column name 'factor'" + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # Missing tables/columns migrations (kb_directories + directory_id) + # ------------------------------------------------------------------ + kb_dirs_migrate_path = Path(__file__).resolve().parent / "migrate_kb_directories.sql" + if kb_dirs_migrate_path.exists(): + kb_dirs_sql_text = kb_dirs_migrate_path.read_text(encoding="utf-8") + kb_dirs_statements = _split_sql_statements(kb_dirs_sql_text) + + with engine.connect() as conn: + for stmt in kb_dirs_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + # MySQL 常见“已存在/重复”错误:忽略以保证幂等 + if ( + "duplicate 
column" in err_msg + or "error 1060" in err_msg + or "already exists" in err_msg + or "duplicate" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # Missing tables/columns migrations (extraction_results legacy schema) + # ------------------------------------------------------------------ + extraction_migrate_path = Path(__file__).resolve().parent / "migrate_extraction_results.sql" + if extraction_migrate_path.exists(): + extraction_sql_text = extraction_migrate_path.read_text(encoding="utf-8") + extraction_statements = _split_sql_statements(extraction_sql_text) + + with engine.connect() as conn: + for stmt in extraction_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if ( + "duplicate column" in err_msg + or "error 1060" in err_msg + or "already exists" in err_msg + or "duplicate" in err_msg + or "check that column/key exists" in err_msg + or "error 1072" in err_msg + or "doesn't exist" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # extraction_results:移除历史 run_id 外键/字段(收敛到 batch_id) + # ------------------------------------------------------------------ + extraction_drop_run_id_path = ( + Path(__file__).resolve().parent / "migrate_extraction_results_drop_run_id.sql" + ) + if extraction_drop_run_id_path.exists(): + extraction_drop_run_id_sql = extraction_drop_run_id_path.read_text(encoding="utf-8") + extraction_drop_run_id_statements = _split_sql_statements(extraction_drop_run_id_sql) + + with engine.connect() as conn: + for stmt in extraction_drop_run_id_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if ( + "already exists" in err_msg + or 
"duplicate" in err_msg + or "doesn't exist" in err_msg + or "check that column/key exists" in err_msg + or "error 1091" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_conflicts:补齐 table_id / cell_id(旧库缺列导致 ORM 查询 500) + # ------------------------------------------------------------------ + ec_migrate_path = Path(__file__).resolve().parent / "migrate_element_conflicts.sql" + if ec_migrate_path.exists(): + ec_sql_text = ec_migrate_path.read_text(encoding="utf-8") + ec_statements = _split_sql_statements(ec_sql_text) + + with engine.connect() as conn: + for stmt in ec_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if ( + "duplicate column" in err_msg + or "error 1060" in err_msg + or "already exists" in err_msg + or "errno 1060" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_conflicts:兼容历史 project_element_id NOT NULL(改为 NULL) + # ------------------------------------------------------------------ + ec_project_element_id_path = ( + Path(__file__).resolve().parent / "migrate_element_conflicts_project_element_id_nullable.sql" + ) + if ec_project_element_id_path.exists(): + ec_peid_sql = ec_project_element_id_path.read_text(encoding="utf-8") + ec_peid_stmts = _split_sql_statements(ec_peid_sql) + with engine.connect() as conn: + for stmt in ec_peid_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception: + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_conflicts:兼容历史 extraction_result_id NOT NULL(改为 NULL) + # ------------------------------------------------------------------ + ec_extraction_result_id_path = ( + 
Path(__file__).resolve().parent / "migrate_element_conflicts_extraction_result_id_nullable.sql" + ) + if ec_extraction_result_id_path.exists(): + ec_erid_sql = ec_extraction_result_id_path.read_text(encoding="utf-8") + ec_erid_stmts = _split_sql_statements(ec_erid_sql) + with engine.connect() as conn: + for stmt in ec_erid_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception: + conn.rollback() + raise + + # ------------------------------------------------------------------ + # extraction_results:extracted_at / source_line_end + # ------------------------------------------------------------------ + ext_time_path = Path(__file__).resolve().parent / "migrate_extraction_results_extracted_at.sql" + if ext_time_path.exists(): + ext_sql = ext_time_path.read_text(encoding="utf-8") + ext_stmts = _split_sql_statements(ext_sql) + with engine.connect() as conn: + for stmt in ext_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if ( + "duplicate column" in err_msg + or "error 1060" in err_msg + or "already exists" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_extraction_results:要素抽取结果明细表(若旧库缺表则补齐) + # ------------------------------------------------------------------ + el_ext_path = Path(__file__).resolve().parent / "migrate_element_extraction_results.sql" + if el_ext_path.exists(): + el_ext_sql = el_ext_path.read_text(encoding="utf-8") + el_ext_stmts = _split_sql_statements(el_ext_sql) + with engine.connect() as conn: + for stmt in el_ext_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if ( + "already exists" in err_msg + or "duplicate" in err_msg + ): + conn.rollback() + continue + 
conn.rollback() + raise + + # ------------------------------------------------------------------ + # project_departments:项目可见部门 + # ------------------------------------------------------------------ + proj_dept_path = Path(__file__).resolve().parent / "migrate_project_departments.sql" + if proj_dept_path.exists(): + proj_dept_sql = proj_dept_path.read_text(encoding="utf-8") + proj_dept_stmts = _split_sql_statements(proj_dept_sql) + with engine.connect() as conn: + for stmt in proj_dept_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "already exists" in err_msg or "duplicate" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # report_template_sections:章节输出合同(section_output_contract) + # ------------------------------------------------------------------ + template_section_contract_path = ( + Path(__file__).resolve().parent / "migrations" / "add_section_output_contract.sql" + ) + if template_section_contract_path.exists(): + template_section_contract_sql = template_section_contract_path.read_text(encoding="utf-8") + template_section_contract_stmts = _split_sql_statements(template_section_contract_sql) + with engine.connect() as conn: + for stmt in template_section_contract_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # report_section_references:补齐 template_id(按模板过滤参考范文) + # ------------------------------------------------------------------ + ref_template_id_path = ( + Path(__file__).resolve().parent / "migrations" / 
"add_ref_template_id.sql" + ) + if ref_template_id_path.exists(): + ref_template_id_sql = ref_template_id_path.read_text(encoding="utf-8") + ref_template_id_stmts = _split_sql_statements(ref_template_id_sql) + with engine.connect() as conn: + for stmt in ref_template_id_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if ( + "duplicate column" in err_msg + or "error 1060" in err_msg + or "duplicate key name" in err_msg + or "error 1061" in err_msg + or "already exists" in err_msg + ): + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # users:补齐 password_hash(登录注册) + # ------------------------------------------------------------------ + users_pwd_path = Path(__file__).resolve().parent / "migrate_users_password_hash.sql" + if users_pwd_path.exists(): + users_pwd_sql = users_pwd_path.read_text(encoding="utf-8") + users_pwd_stmts = _split_sql_statements(users_pwd_sql) + with engine.connect() as conn: + for stmt in users_pwd_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # departments:补齐 description(部门描述) + # ------------------------------------------------------------------ + dept_desc_path = Path(__file__).resolve().parent / "migrate_departments_description.sql" + if dept_desc_path.exists(): + dept_desc_sql = dept_desc_path.read_text(encoding="utf-8") + dept_desc_stmts = _split_sql_statements(dept_desc_sql) + with engine.connect() as conn: + for stmt in dept_desc_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + 
conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # projects:用户删除的标准模版表不再被「同步模版」回补 + # ------------------------------------------------------------------ + proj_sup_path = Path(__file__).resolve().parent / "migrate_projects_sync_suppressed_tables.sql" + if proj_sup_path.exists(): + proj_sup_sql = proj_sup_path.read_text(encoding="utf-8") + proj_sup_stmts = _split_sql_statements(proj_sup_sql) + with engine.connect() as conn: + for stmt in proj_sup_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_tables:用户删行后不再被「同步模版」回补 + # ------------------------------------------------------------------ + et_sup_path = Path(__file__).resolve().parent / "migrate_element_tables_sync_suppressed.sql" + if et_sup_path.exists(): + et_sup_sql = et_sup_path.read_text(encoding="utf-8") + et_sup_stmts = _split_sql_statements(et_sup_sql) + with engine.connect() as conn: + for stmt in et_sup_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_tables:自定义行顺序(加行插在选中行下;刷新后仍保持) + # ------------------------------------------------------------------ + et_row_order_path = 
Path(__file__).resolve().parent / "migrate_element_tables_custom_row_order.sql" + if et_row_order_path.exists(): + et_row_order_sql = et_row_order_path.read_text(encoding="utf-8") + et_row_order_stmts = _split_sql_statements(et_row_order_sql) + with engine.connect() as conn: + for stmt in et_row_order_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # kb_documents:status 语义 v2(0/2/3/4),仅旧库(仍有 status=1 且无 status=4)时执行 + # ------------------------------------------------------------------ + status_v2_path = Path(__file__).resolve().parent / "migrate_kb_doc_status_v2.sql" + if status_v2_path.exists(): + with engine.connect() as conn: + try: + probe = conn.execute( + text( + """ + SELECT + SUM(CASE WHEN status = 1 THEN 1 ELSE 0 END) AS s1, + SUM(CASE WHEN status = 4 THEN 1 ELSE 0 END) AS s4 + FROM kb_documents + """ + ) + ).fetchone() + s1 = int((probe[0] if probe else 0) or 0) + s4 = int((probe[1] if probe else 0) or 0) + if s1 > 0 and s4 == 0: + status_v2_sql = status_v2_path.read_text(encoding="utf-8") + for stmt in _split_sql_statements(status_v2_sql): + stmt = stmt.strip() + if not stmt: + continue + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "doesn't exist" in err_msg and "kb_documents" in err_msg: + conn.rollback() + else: + conn.rollback() + raise + + # ------------------------------------------------------------------ + # kb_documents:storage_rel_path + error_message + # ------------------------------------------------------------------ + kb_storage_path = Path(__file__).resolve().parent / "migrate_kb_doc_storage_path.sql" + if kb_storage_path.exists(): + kb_storage_sql = 
kb_storage_path.read_text(encoding="utf-8") + kb_storage_stmts = _split_sql_statements(kb_storage_sql) + with engine.connect() as conn: + for stmt in kb_storage_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # kb_documents:category (文件分类) + # ------------------------------------------------------------------ + category_migrate_path = Path(__file__).resolve().parent / "migrate_kb_documents_category.sql" + if category_migrate_path.exists(): + category_sql_text = category_migrate_path.read_text(encoding="utf-8") + category_statements = _split_sql_statements(category_sql_text) + with engine.connect() as conn: + for stmt in category_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # kb_documents:旧版分类 → 资料清单六大类 + # ------------------------------------------------------------------ + category_checklist_path = Path(__file__).resolve().parent / "migrate_kb_category_checklist.sql" + if category_checklist_path.exists(): + checklist_sql_text = category_checklist_path.read_text(encoding="utf-8") + checklist_statements = _split_sql_statements(checklist_sql_text) + with engine.connect() as conn: + for stmt in checklist_statements: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception: + conn.rollback() + + # 
------------------------------------------------------------------ + # kb_documents:upload_filename(上传/解压原始文件名) + # ------------------------------------------------------------------ + upload_fn_path = Path(__file__).resolve().parent / "migrate_kb_documents_upload_filename.sql" + if upload_fn_path.exists(): + upload_fn_sql = upload_fn_path.read_text(encoding="utf-8") + upload_fn_stmts = _split_sql_statements(upload_fn_sql) + with engine.connect() as conn: + for stmt in upload_fn_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise + + # ------------------------------------------------------------------ + # element_cells:source_type(文档抽取 / 手工输入) + # ------------------------------------------------------------------ + ec_source_type_path = Path(__file__).resolve().parent / "migrate_element_cells_source_type.sql" + if ec_source_type_path.exists(): + ec_source_sql = ec_source_type_path.read_text(encoding="utf-8") + ec_source_stmts = _split_sql_statements(ec_source_sql) + with engine.connect() as conn: + for stmt in ec_source_stmts: + stmt = stmt.strip() + if not stmt: + continue + try: + conn.execute(text(stmt)) + conn.commit() + except Exception as e: + err_msg = str(e).lower() + if "duplicate column" in err_msg or "error 1060" in err_msg or "already exists" in err_msg: + conn.rollback() + continue + conn.rollback() + raise diff --git a/database/migrations/add_ref_template_id.sql b/database/migrations/add_ref_template_id.sql new file mode 100644 index 0000000..0cf11df --- /dev/null +++ b/database/migrations/add_ref_template_id.sql @@ -0,0 +1,3 @@ +-- 为 report_section_references 增加 template_id,按模板过滤参考范文 +ALTER TABLE report_section_references ADD COLUMN template_id VARCHAR(64) NULL COMMENT 
'关联模板ID(report_templates.id),按模板过滤参考范文'; +CREATE INDEX idx_ref_template_id ON report_section_references(template_id); diff --git a/database/models.py b/database/models.py new file mode 100644 index 0000000..32dd0e1 --- /dev/null +++ b/database/models.py @@ -0,0 +1,503 @@ +""" +database/models.py +SQLAlchemy ORM 模型,与 db.md / init.sql 对应。 +""" + +from datetime import datetime +from typing import Optional + +from sqlalchemy import Boolean, DateTime, Float, ForeignKey, Integer, JSON, String, Text, UniqueConstraint +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship + + +class Base(DeclarativeBase): + pass + + +class Project(Base): + """项目表(统一:知识库 + 撰写)""" + + __tablename__ = "projects" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + uuid: Mapped[str] = mapped_column( + String(32), + unique=True, + nullable=False, + ) + name: Mapped[str] = mapped_column(String(255), nullable=False) + description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + doc_count: Mapped[int] = mapped_column(Integer, default=0) + eval_reports_count: Mapped[int] = mapped_column(Integer, default=0) + total_size: Mapped[str] = mapped_column(String(32), default="0 B") + tags: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + status: Mapped[str] = mapped_column(String(16), default="active") + color: Mapped[str] = mapped_column(String(16), default="#3b82f6") + sync_suppressed_table_names: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + + kb_documents: Mapped[list["KbDocument"]] = relationship( + "KbDocument", back_populates="project", cascade="all, delete-orphan" + ) + kb_directories: Mapped[list["KbDirectory"]] = relationship( + "KbDirectory", back_populates="project", cascade="all, delete-orphan" + ) + write_documents: Mapped[list["WriteDocumentModel"]] = 
relationship( + "WriteDocumentModel", back_populates="project", cascade="all, delete-orphan" + ) + + +class WriteDocumentModel(Base): + """撰写文档表(后评价报告)。project_id 关联 projects.uuid""" + + __tablename__ = "write_documents" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + title: Mapped[str] = mapped_column(String(255), nullable=False) + content: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + word_count: Mapped[int] = mapped_column(Integer, default=0) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + status: Mapped[str] = mapped_column(String(16), default="draft") + sort_order: Mapped[int] = mapped_column(Integer, default=0) + + project: Mapped["Project"] = relationship("Project", back_populates="write_documents") + doc_versions: Mapped[list["DocumentVersion"]] = relationship( + "DocumentVersion", back_populates="document", cascade="all, delete-orphan" + ) + + +class DocumentVersion(Base): + """撰写文档版本表(对应 doc_versions)。""" + + __tablename__ = "doc_versions" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + document_id: Mapped[str] = mapped_column( + ForeignKey("write_documents.id", ondelete="CASCADE"), nullable=False + ) + version: Mapped[str] = mapped_column(String(32), nullable=False) + content: Mapped[str] = mapped_column(Text, nullable=False) + citation_payload: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + saved_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + author: Mapped[str] = mapped_column(String(64), nullable=False) + note: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + + document: Mapped["WriteDocumentModel"] = relationship("WriteDocumentModel", back_populates="doc_versions") + + +class KbDocument(Base): + """知识库文档表。project_id 关联 projects.uuid。status: 0=失败 
2=排队中 3=处理中 4=可用""" + + __tablename__ = "kb_documents" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + directory_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("kb_directories.id", ondelete="SET NULL"), nullable=True + ) + name: Mapped[str] = mapped_column(String(255), nullable=False) + upload_filename: Mapped[Optional[str]] = mapped_column( + String(255), nullable=True + ) # 上传/解压时的原始文件名(含扩展名),与智能展示名 name 区分 + size: Mapped[str] = mapped_column(String(32), nullable=False) + file_path: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) # 仅目录路径,不含文件名 + storage_rel_path: Mapped[Optional[str]] = mapped_column( + String(512), nullable=True + ) # 项目内完整相对路径(含文件名),用于精确定位磁盘文件 + word_count: Mapped[int] = mapped_column(Integer, default=0) + uploaded_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + status: Mapped[int] = mapped_column(Integer, default=2) # 0=失败 2=排队中 3=处理中 4=可用 + error_message: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + category: Mapped[Optional[str]] = mapped_column(String(32), nullable=True, default=None) + + project: Mapped["Project"] = relationship("Project", back_populates="kb_documents") + directory: Mapped[Optional["KbDirectory"]] = relationship("KbDirectory", back_populates="documents") + + +class KbDirectory(Base): + """知识库目录表。project_id 关联 projects.uuid;parent_id 形成目录树。""" + + __tablename__ = "kb_directories" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + parent_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("kb_directories.id", ondelete="CASCADE"), nullable=True + ) + name: Mapped[str] = mapped_column(String(255), nullable=False) + full_path: Mapped[str] = mapped_column(String(1024), nullable=False) + created_at: Mapped[datetime] = 
mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + project: Mapped["Project"] = relationship("Project", back_populates="kb_directories") + documents: Mapped[list["KbDocument"]] = relationship("KbDocument", back_populates="directory") + + +class Task(Base): + """独立后台任务表:pdf2md 转换和 element-agent 要素抽取。""" + + __tablename__ = "tasks" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project: Mapped[str] = mapped_column(String(64), nullable=False) + task_type: Mapped[int] = mapped_column(Integer, nullable=False) + file_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) + file_path: Mapped[Optional[str]] = mapped_column(String(1024), nullable=True) + status: Mapped[int] = mapped_column(Integer, nullable=False, default=1) + payload_json: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) + result_path: Mapped[Optional[str]] = mapped_column(String(1024), nullable=True) + error_message: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + add_time: Mapped[datetime] = mapped_column(DateTime, nullable=False) + finish_time: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True) + + +class ElementTable(Base): + __tablename__ = "element_tables" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + table_type: Mapped[str] = mapped_column(String(32), nullable=False) # global/time + table_name: Mapped[str] = mapped_column(String(255), nullable=False) + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + is_time_dimension: Mapped[bool] = mapped_column(Boolean, default=False) + sort_order: Mapped[int] = mapped_column(Integer, default=0) + # JSON 数组字符串,row_key 列表;sync 模版时跳过为这些行补格子,避免用户删行后一同步又出现 + sync_suppressed_row_keys: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + # JSON 数组:界面行键展示顺序(含用户加行) + 
custom_row_order: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ElementCell(Base): + __tablename__ = "element_cells" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + table_id: Mapped[str] = mapped_column(ForeignKey("element_tables.id", ondelete="CASCADE"), nullable=False) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + row_key: Mapped[str] = mapped_column(String(255), nullable=False) + col_key: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + source_document_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("kb_documents.id", ondelete="SET NULL"), nullable=True + ) + source_line_no: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_line_end: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_quote: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + confidence: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + extraction_batch_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) + extraction_model: Mapped[Optional[str]] = mapped_column(String(128), nullable=True) + source_type: Mapped[Optional[str]] = mapped_column(String(16), nullable=True) # extract | manual + conflict_status: Mapped[str] = mapped_column(String(16), default="none") + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ExtractionResult(Base): + __tablename__ = "extraction_results" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = 
mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + document_id: Mapped[str] = mapped_column(ForeignKey("kb_documents.id", ondelete="CASCADE"), nullable=False) + batch_id: Mapped[str] = mapped_column(String(64), nullable=False) + result_type: Mapped[str] = mapped_column(String(16), nullable=False) # table/element + table_type: Mapped[Optional[str]] = mapped_column(String(32), nullable=True) + table_name: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + item_key: Mapped[str] = mapped_column(String(255), nullable=False) + item_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + source_line_no: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_line_end: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + confidence: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + raw_payload: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) + extracted_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True) # 抽取业务时间(旧库迁移前可为空) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ElementExtractionResult(Base): + """ + 要素抽取结果明细表(面向“细则章节/小节提示词 -> 项目材料”抽取)。 + + 字段对齐(用户侧语义): + - 表类型 -> table_type + - 年份 -> year + - 表名称 -> table_name + - 时间 -> extracted_at + - 键 -> item_key + - 值 -> item_value + - 来源文档ID -> source_document_id + - 来源行数 -> source_line_no / source_line_end + """ + + __tablename__ = "element_extraction_results" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + table_type: Mapped[str] = mapped_column(String(32), nullable=False) + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + table_name: Mapped[str] = mapped_column(String(255), nullable=False) + extracted_at: Mapped[datetime] = 
mapped_column(DateTime, nullable=False) + item_key: Mapped[str] = mapped_column(String(255), nullable=False) + item_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + source_document_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("kb_documents.id", ondelete="SET NULL"), nullable=True + ) + source_line_no: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_line_end: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ElementConflict(Base): + __tablename__ = "element_conflicts" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + table_id: Mapped[Optional[str]] = mapped_column(ForeignKey("element_tables.id", ondelete="SET NULL"), nullable=True) + cell_id: Mapped[Optional[str]] = mapped_column(ForeignKey("element_cells.id", ondelete="SET NULL"), nullable=True) + item_key: Mapped[str] = mapped_column(String(255), nullable=False) + old_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + new_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + selected_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + source_document_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("kb_documents.id", ondelete="SET NULL"), nullable=True + ) + source_line_no: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + status: Mapped[str] = mapped_column(String(16), default="pending") + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class DocumentMarkdown(Base): + __tablename__ = "document_markdowns" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + 
document_id: Mapped[str] = mapped_column(ForeignKey("kb_documents.id", ondelete="CASCADE"), nullable=False) + extracted_filename: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + markdown_content: Mapped[str] = mapped_column(Text, nullable=False) + content_hash: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class DocumentChunk(Base): + __tablename__ = "document_chunks" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + document_id: Mapped[str] = mapped_column(ForeignKey("kb_documents.id", ondelete="CASCADE"), nullable=False) + markdown_id: Mapped[Optional[str]] = mapped_column(ForeignKey("document_markdowns.id", ondelete="CASCADE"), nullable=True) + heading: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + chunk_text: Mapped[str] = mapped_column(Text, nullable=False) + chunk_index: Mapped[int] = mapped_column(Integer, default=0) + source_line_start: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_line_end: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + vector_id: Mapped[Optional[str]] = mapped_column(String(128), nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ReportTemplate(Base): + __tablename__ = "report_templates" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + name: Mapped[str] = mapped_column(String(255), nullable=False) + description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + is_default: Mapped[bool] = mapped_column(Boolean, default=False) + is_active: Mapped[bool] = mapped_column(Boolean, default=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] 
= mapped_column(DateTime, nullable=False) + + +class ReportTemplateSection(Base): + __tablename__ = "report_template_sections" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + template_id: Mapped[str] = mapped_column(ForeignKey("report_templates.id", ondelete="CASCADE"), nullable=False) + section_key: Mapped[str] = mapped_column(String(64), nullable=False) + section_title: Mapped[str] = mapped_column(String(255), nullable=False) + section_prompt: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + section_output_contract: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + section_order: Mapped[int] = mapped_column(Integer, default=0) + examples: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ReportGenerationJob(Base): + __tablename__ = "report_generation_jobs" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + template_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("report_templates.id", ondelete="SET NULL"), nullable=True + ) + status: Mapped[str] = mapped_column(String(16), default="pending") # pending/running/completed/failed + progress: Mapped[int] = mapped_column(Integer, default=0) + current_section_key: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) + error_message: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + requested_by: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) + options: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) + snapshot: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + completed_at: 
Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True) + + +class ReportGenerationChapter(Base): + __tablename__ = "report_generation_chapters" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + job_id: Mapped[str] = mapped_column( + ForeignKey("report_generation_jobs.id", ondelete="CASCADE"), nullable=False + ) + section_key: Mapped[str] = mapped_column(String(64), nullable=False) + section_title: Mapped[str] = mapped_column(String(255), nullable=False) + section_order: Mapped[int] = mapped_column(Integer, default=0) + status: Mapped[str] = mapped_column(String(16), default="pending") # pending/running/completed/failed + content: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + prompt_text: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + evidence_payload: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) + validation_payload: Mapped[Optional[dict]] = mapped_column(JSON, nullable=True) + error_message: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True) + +class ReportSectionReference(Base): + """章节参考范文(独立于模板配置,用于报告生成时拼入 prompt)""" + + __tablename__ = "report_section_references" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + template_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("report_templates.id", ondelete="CASCADE"), nullable=True + ) + source_file: Mapped[str] = mapped_column(String(255), nullable=False) + section_key: Mapped[str] = mapped_column(String(64), nullable=False) + section_title: Mapped[str] = mapped_column(String(255), nullable=False) + section_order: Mapped[int] = mapped_column(Integer, default=0) + content: Mapped[str] = mapped_column(Text, nullable=False) + created_at: Mapped[datetime] = mapped_column(DateTime, 
nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class Department(Base): + + __tablename__ = "department" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + name: Mapped[str] = mapped_column(String(255), nullable=False) + parent_id: Mapped[Optional[str]] = mapped_column(ForeignKey("departments.id", ondelete="SET NULL"), nullable=True) + description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class User(Base): + __tablename__ = "users" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + username: Mapped[str] = mapped_column(String(64), nullable=False, unique=True) + password_hash: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + department_id: Mapped[Optional[str]] = mapped_column(ForeignKey("departments.id", ondelete="SET NULL"), nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class Role(Base): + __tablename__ = "roles" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + name: Mapped[str] = mapped_column(String(64), nullable=False, unique=True) + description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class Permission(Base): + __tablename__ = "permissions" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + perm_key: Mapped[str] = mapped_column(String(128), nullable=False, unique=True) + perm_type: Mapped[str] = mapped_column(String(32), nullable=False) # menu/project + description: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = 
mapped_column(DateTime, nullable=False) + + +class RolePermission(Base): + __tablename__ = "role_permissions" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + role_id: Mapped[str] = mapped_column(ForeignKey("roles.id", ondelete="CASCADE"), nullable=False) + permission_id: Mapped[str] = mapped_column(ForeignKey("permissions.id", ondelete="CASCADE"), nullable=False) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class UserRole(Base): + __tablename__ = "user_roles" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + user_id: Mapped[str] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + role_id: Mapped[str] = mapped_column(ForeignKey("roles.id", ondelete="CASCADE"), nullable=False) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ProjectMember(Base): + __tablename__ = "project_members" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + user_id: Mapped[str] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"), nullable=False) + role: Mapped[str] = mapped_column(String(32), default="editor") + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class ProjectDepartment(Base): + """项目可见部门:绑定后,仅这些部门下的用户可访问(另有管理员与 project_members 例外)。""" + + __tablename__ = "project_departments" + __table_args__ = (UniqueConstraint("project_id", "department_id", name="uq_project_department"),) + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + department_id: Mapped[str] = mapped_column(ForeignKey("departments.id", ondelete="CASCADE"), nullable=False) + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + + +class FillRecord(Base): + """回填记录:每次要素回填均留痕,支持证据追溯。""" + + 
__tablename__ = "fill_records" + + id: Mapped[str] = mapped_column(String(64), primary_key=True) + project_id: Mapped[str] = mapped_column(ForeignKey("projects.uuid", ondelete="CASCADE"), nullable=False) + cell_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("element_cells.id", ondelete="SET NULL"), nullable=True + ) + table_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("element_tables.id", ondelete="SET NULL"), nullable=True + ) + row_key: Mapped[str] = mapped_column(String(255), nullable=False) + col_key: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + year: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + filled_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + previous_value: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + + source_document_id: Mapped[Optional[str]] = mapped_column( + ForeignKey("kb_documents.id", ondelete="SET NULL"), nullable=True + ) + source_document_name: Mapped[Optional[str]] = mapped_column(String(255), nullable=True) + source_line_no: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_line_end: Mapped[Optional[int]] = mapped_column(Integer, nullable=True) + source_quote: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + + confidence: Mapped[Optional[float]] = mapped_column(Float, nullable=True) + extraction_batch_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True) + extraction_model: Mapped[Optional[str]] = mapped_column(String(128), nullable=True) + + fill_type: Mapped[str] = mapped_column(String(16), nullable=False, default="auto") + created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) diff --git a/function/__init__.py b/function/__init__.py new file mode 100644 index 0000000..4ffb595 --- /dev/null +++ b/function/__init__.py @@ -0,0 +1 @@ +# function 包 diff --git a/function/vector_store.py b/function/vector_store.py new file mode 100644 index 0000000..0413d13 --- /dev/null +++ 
b/function/vector_store.py @@ -0,0 +1,550 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +function/vector_store.py +向量库模块 - 与 kb_service 项目集成 +已修改:drop_old 全部 = False,不会删除已有集合 +✅ 已修复 413 超长 token 问题(语义友好版) +""" + +import re +import json +import logging +from typing import Dict, List, Optional, Tuple +from pathlib import Path + +from langchain_core.documents import Document +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_openai import OpenAIEmbeddings +from langchain_milvus import Milvus, BM25BuiltInFunction +from pymilvus import MilvusClient, connections + +from config import settings + +logger = logging.getLogger(__name__) + +# ============================================================================ +# 配置 +# ============================================================================ +COLLECTION_NAME = "eval_report" +EMBEDDING_API_BASE = settings.EMBEDDING_API_BASE +EMBEDDING_API_KEY = settings.EMBEDDING_API_KEY +MILVUS_DB_URL = settings.MILVUS_DB_URL + +CONSISTENCY_LEVEL = "Bounded" +AUTO_ID = True +METRIC_TYPE = "COSINE" +INDEX_TYPE = "AUTOINDEX" +SPARSE_METRIC_TYPE = "BM25" +SPARSE_INDEX_TYPE = "SPARSE_INVERTED_INDEX" + + +def _embedding_batch_limits() -> tuple[int, int, int]: + max_docs = max(1, int(getattr(settings, "EMBEDDING_BATCH_MAX_DOCS", 4) or 4)) + max_chars = max(512, int(getattr(settings, "EMBEDDING_BATCH_MAX_CHARS", 12000) or 12000)) + max_chunk = max(512, int(getattr(settings, "EMBEDDING_MAX_CHUNK_CHARS", 4000) or 4000)) + return max_docs, max_chars, max_chunk + + +def _is_embedding_backend_oom(exc: BaseException) -> bool: + msg = str(exc).lower() + return ( + "out of memory" in msg + or "npu out of memory" in msg + or "cuda out of memory" in msg + or "error code: 424" in msg + or "'code': 424" in msg + ) + + +def _add_documents_batch_with_retry(vs: Milvus, batch: List[Document]) -> List[str]: + """写入一批文档;远端 embedding OOM 时自动拆半重试。""" + if not batch: + return [] + try: + return 
list(vs.add_documents(batch)) + except Exception as e: + if not _is_embedding_backend_oom(e) or len(batch) <= 1: + raise + mid = max(1, len(batch) // 2) + logger.warning( + "embedding 批次 OOM,拆分为 %s + %s 重试", + mid, + len(batch) - mid, + ) + ids: List[str] = [] + ids.extend(_add_documents_batch_with_retry(vs, batch[:mid])) + ids.extend(_add_documents_batch_with_retry(vs, batch[mid:])) + return ids + + +def _register_milvus_client_for_orm(client: MilvusClient) -> None: + """pymilvus 2.6+ MilvusClient uses ConnectionManager; ORM Collection still resolves + pymilvus.orm.connections by client._using. langchain-milvus touches Collection during + Milvus.__init__, so register before constructing Milvus (bootstrap client).""" + alias = client._using + if connections.has_connection(alias): + return + cfg = client._config + connections._alias_handlers[alias] = client._handler + connections._alias_config[alias] = { + "address": cfg.address, + "user": "", + "db_name": cfg.db_name or "default", + } + + +# ============================================================================ +# VectorStore 类(已全部改为 drop_old=False) +# ============================================================================ + +class VectorStore: + def __init__( + self, + collection_name: str = COLLECTION_NAME, + drop_old: bool = False, + chunk_size: int = 500, + chunk_overlap: int = 50 + ): + self.collection_name = collection_name + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self._drop_old = drop_old + self._milvus = None + + def _get_embeddings(self): + return OpenAIEmbeddings( + base_url=EMBEDDING_API_BASE, + api_key=EMBEDDING_API_KEY, + model="bge-m3", + check_embedding_ctx_length=False, + ) + + def _get_milvus(self, drop_old: bool = False) -> Milvus: + logger.info("【VectorStore】初始化 Milvus 混合向量存储(dense + sparse)") + + if self._milvus is not None and not drop_old: + logger.info("【VectorStore】复用已有 Milvus 实例") + return self._milvus + + if not MILVUS_DB_URL: + raise 
ValueError("MILVUS_DB_URL 未配置,请在 .env 中设置") + + embeddings = self._get_embeddings() + logger.info("【VectorStore】Embedding 模型 bge-m3 初始化完成") + + try: + # 与 langchain 内 MilvusClient 共享 ConnectionManager,先注册 ORM alias,否则 __init__ 内访问 Collection 会报错 + _register_milvus_client_for_orm(MilvusClient(uri=MILVUS_DB_URL)) + self._milvus = Milvus( + embedding_function=embeddings, + builtin_function=BM25BuiltInFunction(), + vector_field=["dense", "sparse"], + connection_args={"uri": MILVUS_DB_URL}, + collection_name=self.collection_name, + consistency_level=CONSISTENCY_LEVEL, + auto_id=AUTO_ID, + drop_old=False, + index_params=[ + {"metric_type": METRIC_TYPE, "index_type": INDEX_TYPE}, + {"metric_type": SPARSE_METRIC_TYPE, "index_type": SPARSE_INDEX_TYPE}, + ], + ) + _register_milvus_client_for_orm(self._milvus.client) + logger.info("✅ Milvus 混合向量存储初始化成功") + except Exception as e: + logger.error(f"❌ Milvus 初始化失败: {str(e)}", exc_info=True) + raise + + return self._milvus + + # ======================================================================== + # ✅ 修复版 add_documents:语义友好,不破坏段落,不触发413 + # ======================================================================== + def add_documents(self, documents: List[Document]) -> List[str]: + if not documents: + logger.info("【add_documents】无文档可写入") + return [] + + max_docs_per_batch, max_chars_per_batch, max_chunk_chars = _embedding_batch_limits() + + # ---------------------- 语义安全切分(只修问题,不破坏结构)---------------------- + # 只处理【真的超长】的段落,在句子/段落边界分割,绝不乱切 + safe_splitter = RecursiveCharacterTextSplitter( + chunk_size=max_chunk_chars, + chunk_overlap=min(200, max(0, max_chunk_chars // 20)), + separators=["\n\n", "\n", "。", "!", "?", ";", ":", ","] + ) + + safe_documents = [] + for doc in documents: + # 超过限制才切分 + if len(doc.page_content) > max_chunk_chars: + chunks = safe_splitter.split_text(doc.page_content) + for chunk in chunks: + if chunk.strip(): + safe_documents.append(Document( + page_content=chunk, + metadata=doc.metadata.copy() + )) + 
else: + safe_documents.append(doc) + # -------------------------------------------------------------------------------- + + # Milvus 现有集合要求部分 metadata 字段必填;历史调用方未必都传这些字段,这里统一兜底补齐。 + for idx, doc in enumerate(safe_documents): + metadata = doc.metadata or {} + if not metadata.get("doc_id"): + project_uuid = metadata.get("project_uuid") or "unknown_project" + heading = metadata.get("heading") or "chunk" + metadata["doc_id"] = f"{project_uuid}:{heading}:{idx}" + if "original_title" not in metadata: + metadata["original_title"] = metadata.get("heading") or "" + if "path" not in metadata: + metadata["path"] = "" + if "project_uuid" not in metadata: + metadata["project_uuid"] = "unknown_project" + doc.metadata = metadata + + logger.info(f"【add_documents】预处理后准备写入 {len(safe_documents)} 条文档") + vs = self._get_milvus(drop_old=self._drop_old) + self._drop_old = False + + ids = [] + current_batch: List[Document] = [] + current_batch_chars = 0 + batch_num = 1 + + def _flush_batch() -> None: + nonlocal current_batch, current_batch_chars, batch_num + if not current_batch: + return + logger.info( + "【add_documents】写入批次 %s,数量:%s,约 %s 字符", + batch_num, + len(current_batch), + current_batch_chars, + ) + try: + res = _add_documents_batch_with_retry(vs, current_batch) + ids.extend(res) + logger.info("✅ 批次写入成功,返回 ID 数:%s", len(res)) + except Exception as e: + logger.error("❌ 批次写入失败: %s", e, exc_info=True) + batch_num += 1 + current_batch = [] + current_batch_chars = 0 + + for doc in safe_documents: + doc_chars = len(doc.page_content or "") + would_exceed_docs = bool(current_batch) and len(current_batch) >= max_docs_per_batch + would_exceed_chars = bool(current_batch) and ( + current_batch_chars + doc_chars > max_chars_per_batch + ) + if would_exceed_docs or would_exceed_chars: + _flush_batch() + current_batch.append(doc) + current_batch_chars += doc_chars + + _flush_batch() + + logger.info(f"【add_documents】全部完成,总写入 ID 数:{len(ids)}") + return ids + + def similarity_search_with_score( + 
self, query: str, k: int = 10, filter: Optional[str] = None + ) -> List[Tuple[Document, float]]: + vs = self._get_milvus(drop_old=False) + query = query[:5000] + if filter: + return vs.similarity_search_with_score(query, k=k, filter=filter) + return vs.similarity_search_with_score(query, k=k) + + def similarity_search_dense_filtered( + self, + query: str, + k: int, + filter_expr: str, + ) -> List[Tuple[Document, float]]: + """ + 使用 dense 向量 ANN + Milvus 标量过滤检索。 + hybrid(dense+sparse)集合上 langchain_milvus 的 filter 可能不生效,抽取侧召回用此路径保证 doc_id 隔离。 + """ + from pymilvus import MilvusClient + + q = (query or "")[:5000] + if not q.strip(): + return [] + emb = self._get_embeddings().embed_query(q) + client = MilvusClient(uri=MILVUS_DB_URL) + try: + raw = client.search( + collection_name=self.collection_name, + data=[emb], + anns_field="dense", + limit=max(1, int(k)), + filter=filter_expr, + output_fields=[ + "text", + "heading", + "heading_level", + "doc_id", + "project_uuid", + "original_title", + "path", + ], + ) + finally: + client.close() + hits = raw[0] if raw else [] + out: List[Tuple[Document, float]] = [] + for hit in hits: + ent = hit.get("entity") or {} + doc = Document( + page_content=str(ent.get("text") or ""), + metadata={ + "heading": ent.get("heading"), + "heading_level": ent.get("heading_level"), + "doc_id": ent.get("doc_id"), + "project_uuid": ent.get("project_uuid"), + "original_title": ent.get("original_title"), + "path": ent.get("path"), + }, + ) + dist = hit.get("distance") + try: + score = float(dist) if dist is not None else 0.0 + except (TypeError, ValueError): + score = 0.0 + out.append((doc, score)) + return out + + def delete_by_filter(self, filter_expr: str) -> int: + try: + from pymilvus import MilvusClient + client = MilvusClient(uri=MILVUS_DB_URL) + if not client.has_collection(self.collection_name): + return 0 + # 某些集合主键字段名不叫 id(例如 langchain-milvus 可能使用自定义 PK/auto_id)。 + # 先从集合描述里找主键字段,再用于 query 计数。 + pk_field = None + describe = 
client.describe_collection(self.collection_name) + for f in describe.get("fields", []) or []: + # 兼容不同返回结构:is_primary / isPrimary / primary + if f.get("is_primary") or f.get("isPrimary") or f.get("primary"): + pk_field = f.get("name") + break + + count = 0 + try: + if pk_field: + res = client.query( + self.collection_name, + filter=filter_expr, + output_fields=[pk_field], + ) + count = len(res) + else: + # 找不到主键字段名时也不阻断删除 + count = 0 + except Exception: + # 仅计数失败不影响删除 + count = 0 + + client.delete(self.collection_name, filter=filter_expr) + client.close() + return count + except Exception as e: + logger.error(f"删除失败: {e}") + return 0 + + +# ============================================================================ +# Markdown 拆分 +# ============================================================================ + +def split_markdown(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]: + if not text: return [] + splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap, + separators=["\n\n", "。", "?", "!", "\n", ";", ":", ","] + ) + return splitter.split_text(text) + +def split_markdown_by_headings(content: str, chunk_size=300, chunk_overlap=40) -> List[Document]: + if not content: return [] + docs = [] + lines = content.split("\n") + current_heading = "" + current_level = 0 + current_lines = [] + + def flush(): + nonlocal current_lines, current_heading, current_level + txt = "\n".join(current_lines).strip() + if txt: + docs.append(Document( + page_content=txt, + metadata={"heading": current_heading, "heading_level": current_level} + )) + current_lines = [] + + for line in lines: + line = line.rstrip() + m = re.match(r"^(#{1,6})\s+(.+)$", line) + if m: + flush() + current_level = len(m.group(1)) + current_heading = m.group(2).strip() + else: + current_lines.append(line) + flush() + + if not docs: + chunks = split_markdown(content, chunk_size, chunk_overlap) + for i, c in enumerate(chunks): + docs.append( + 
Document( + page_content=c, + metadata={"chunk_index": i, "heading": "", "heading_level": 0}, + ) + ) + return docs + +def process_document_to_vector_store( + doc_id: str, title: str, content: str, path: str, project_uuid: str, collection_name=COLLECTION_NAME +) -> bool: + try: + vs = VectorStore(collection_name=collection_name, drop_old=False) + docs = split_markdown_by_headings(content) + for d in docs: + d.metadata["doc_id"] = doc_id + d.metadata["original_title"] = title + d.metadata["path"] = path + d.metadata["project_uuid"] = project_uuid + vs.add_documents(docs) + return True + except Exception as e: + logger.error(f"处理文档失败: {e}") + return False + +# ============================================================================ +# 数据预处理 +# ============================================================================ + +INPUT_FILE = "data/articles.jsonl" +OUTPUT_CHUNK_FILE = "data/processed/eval_chunks.jsonl" + +def load_jsonl(filename: str, encoding="utf-8"): + with open(filename, encoding=encoding) as f: + for line in f: + if line.strip(): + yield json.loads(line) + +def write_jsonl(data, filename, append=False, ensure_ascii=False): + mode = "a" if append else "w" + with open(filename, mode, encoding="utf-8") as f: + for item in data: + f.write(json.dumps(item, ensure_ascii=ensure_ascii) + "\n") + +def clean_text(text: str) -> str: + if not isinstance(text, str): return "" + text = re.sub(r"[\x00-\x09\x0B-\x1F\x7F]", "", text) + text = re.sub(r"[\u200b-\u200f\u2028\u2029]", "", text) + text = re.sub(r"[:’“”•…–—]", "", text) + text = re.sub(r"<[^>]+>", "\n", text) + text = re.sub(r"\n+", "\n", text) + text = re.sub(r" +", " ", text) + text = re.sub(r"^[。,?!;:]", "", text) + text = re.sub(r'[^\u4e00-\u9fff_a-zA-Z0-9\s,。!?;:、()《》【】""''·!@#$%^&*()_+=[]{}|;:\'",./<>?-]', "", text) + return text.strip() + +def concat_metadata_to_content(title: str, content: str, metadata: dict): + parts = [ + f"标题:{title}", + f"发布时间:{metadata.get('publish_time')}", + 
f"作者:{metadata.get('author')}", + f"来源:{metadata.get('source')}", + ] + parts = [p for p in parts if p.split(":")[-1]] + return " | ".join(parts) + "\n---\n" + content.strip() + +def process_all_documents(input_file, output_file, chunk_size=500, overlap=50): + docs = load_jsonl(input_file) + splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap, + separators=["\n\n", "。", "?", "!", "\n", ";", ":", ","]) + all_chunks = [] + num_docs = 0 + for doc in docs: + num_docs +=1 + content = clean_text(doc["content"]) + chunks = splitter.split_text(content) + for i, chunk in chunks: + clean_c = clean_text(chunk) + if len(clean_c) <10: continue + all_chunks.append({ + "id": f"{doc['id']}_chunk_{i}", + "doc_id": doc["id"], + "title": doc["title"], + "content": concat_metadata_to_content(doc["title"], clean_c, doc.get("metadata",{})), + "chunk_index": i, + "url": doc.get("metadata",{}).get("url","") + }) + write_jsonl(all_chunks, output_file) + return {"num_docs":num_docs, "num_chunks":len(all_chunks)} + +def load_chunk_jsonl(path): + res = [] + with open(path, encoding="utf-8") as f: + for line in f: + if line.strip(): + res.append(json.loads(line)) + return res + +def build_index(data, vs: VectorStore): + docs: List[Document] = [] + for row in data: + c = row.pop("content", "").strip() + if len(c) < 10: + continue + docs.append(Document(page_content=c, metadata=row)) + if docs: + vs.add_documents(docs) + +def get_vector_store(drop_old=False): + vs = VectorStore(collection_name=COLLECTION_NAME, drop_old=drop_old) + return vs._get_milvus(drop_old=drop_old) + +def search_eval(query, top_k=10): + from time import time + vs = VectorStore(drop_old=False) + st = time() + results = vs.similarity_search_with_score(query, k=top_k) + print(f"检索耗时: {time()-st:.2f}s") + return results + +# ============================================================================ +# 运行入口 +# ============================================================================ +if 
__name__ == "__main__": + logger.info("="*60) + logger.info("【Milvus 混合向量索引构建启动】dense + sparse(BM25)") + logger.info("="*60) + + process_all_documents(INPUT_FILE, OUTPUT_CHUNK_FILE) + logger.info("✅ 文本分块处理完成") + + chunk_data = load_chunk_jsonl(OUTPUT_CHUNK_FILE) + logger.info(f"✅ 加载分块数据:{len(chunk_data)} 条") + + vs = VectorStore(drop_old=False) + build_index(chunk_data, vs) + logger.info("✅ 索引构建完成(增量写入)") + + res = search_eval("测试检索内容") + logger.info(f"✅ 检索完成,命中数量:{len(res)}") + for doc, score in res: + logger.info(f"score={score:.4f} | content={doc.page_content[:80]}...") + + logger.info("="*60) + logger.info("【全部执行完成】") diff --git a/log/__init__.py b/log/__init__.py new file mode 100644 index 0000000..38702e5 --- /dev/null +++ b/log/__init__.py @@ -0,0 +1,3 @@ +from .logger import configure_logging, get_logger + +__all__ = ["configure_logging", "get_logger"] diff --git a/log/logger.py b/log/logger.py new file mode 100644 index 0000000..8b195d9 --- /dev/null +++ b/log/logger.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import logging +from logging.handlers import RotatingFileHandler +from pathlib import Path + + +_CONFIGURED = False +_FILE_PROCESSING_PREFIXES = ( + "worker.document_processing", + "services.kb_service", + "services.es_docs", + "services.element_llm_extract_service", + "routers.extract", + "function.documents", + "function.vector_store", + "repo.kb_documents", + "routers.reference", + "services.doc_convert_service", + "services.reference_service", +) +_DOCUMENT_GENERATION_PREFIXES = ( + "services.write_service", + "services.report_generation_service", + "services.markdown_stream_service", + "services.llm_client", + "services.llm_runner", + "services.report_prompt_service", + "services.report_runtime_store", +) +# 生成全过程追踪:完整记录输入 prompt / 调用模型 / 模型输出 +_GENERATION_TRACE_PREFIXES = ( + "generation.trace", +) + + +class _PrefixFilter(logging.Filter): + def __init__(self, prefixes: tuple[str, ...]) -> None: + super().__init__() + 
self.prefixes = prefixes + + def filter(self, record: logging.LogRecord) -> bool: + name = str(record.name or "") + return any(name == prefix or name.startswith(prefix + ".") for prefix in self.prefixes) + + +class _OtherFilter(logging.Filter): + def filter(self, record: logging.LogRecord) -> bool: + name = str(record.name or "") + if any(name == prefix or name.startswith(prefix + ".") for prefix in _FILE_PROCESSING_PREFIXES): + return False + if any(name == prefix or name.startswith(prefix + ".") for prefix in _DOCUMENT_GENERATION_PREFIXES): + return False + if any(name == prefix or name.startswith(prefix + ".") for prefix in _GENERATION_TRACE_PREFIXES): + return False + return True + + +def configure_logging( + *, + log_dir: str | Path = "logs", + level: int = logging.INFO, +) -> Path: + global _CONFIGURED + + target_dir = Path(log_dir).resolve() + target_dir.mkdir(parents=True, exist_ok=True) + other_log_path = target_dir / "other.log" + + if _CONFIGURED: + return other_log_path + + formatter = logging.Formatter( + "%(asctime)s | %(levelname)s | %(name)s | %(message)s" + ) + + root_logger = logging.getLogger() + root_logger.setLevel(level) + + file_processing_handler = RotatingFileHandler( + target_dir / "file_processing.log", + maxBytes=10 * 1024 * 1024, + backupCount=5, + encoding="utf-8", + ) + file_processing_handler.setLevel(level) + file_processing_handler.setFormatter(formatter) + file_processing_handler.addFilter(_PrefixFilter(_FILE_PROCESSING_PREFIXES)) + + document_generation_handler = RotatingFileHandler( + target_dir / "document_generation.log", + maxBytes=10 * 1024 * 1024, + backupCount=5, + encoding="utf-8", + ) + document_generation_handler.setLevel(level) + document_generation_handler.setFormatter(formatter) + document_generation_handler.addFilter(_PrefixFilter(_DOCUMENT_GENERATION_PREFIXES)) + + other_handler = RotatingFileHandler( + other_log_path, + maxBytes=10 * 1024 * 1024, + backupCount=5, + encoding="utf-8", + ) + 
other_handler.setLevel(level) + other_handler.setFormatter(formatter) + other_handler.addFilter(_OtherFilter()) + + # ── 要素抽取独立日志 ───────────────────────────────────────────── + element_extract_handler = RotatingFileHandler( + target_dir / "element_extract.log", + maxBytes=10 * 1024 * 1024, + backupCount=10, + encoding="utf-8", + ) + element_extract_handler.setLevel(level) + element_extract_handler.setFormatter(formatter) + element_extract_handler.addFilter(_PrefixFilter(("services.element_llm_extract_service", "routers.extract"))) + + # ── 文件上传/解析独立日志 ───────────────────────────────────────── + file_upload_handler = RotatingFileHandler( + target_dir / "file_upload.log", + maxBytes=10 * 1024 * 1024, + backupCount=10, + encoding="utf-8", + ) + file_upload_handler.setLevel(level) + file_upload_handler.setFormatter(formatter) + file_upload_handler.addFilter(_PrefixFilter(("routers.reference", "routers.template", "services.doc_convert_service", "services.reference_service", "services.kb_service", "routers.kb"))) + + # ── 报告生成独立日志 ────────────────────────────────────────────── + report_generation_handler = RotatingFileHandler( + target_dir / "report_generation.log", + maxBytes=10 * 1024 * 1024, + backupCount=10, + encoding="utf-8", + ) + report_generation_handler.setLevel(level) + report_generation_handler.setFormatter(formatter) + report_generation_handler.addFilter(_PrefixFilter(("services.report_generation_service", "services.report_prompt_service", "services.report_runtime_store", "services.markdown_stream_service"))) + + # ── LLM 调用独立日志 ────────────────────────────────────────────── + llm_handler = RotatingFileHandler( + target_dir / "llm.log", + maxBytes=10 * 1024 * 1024, + backupCount=10, + encoding="utf-8", + ) + llm_handler.setLevel(level) + llm_handler.setFormatter(formatter) + llm_handler.addFilter(_PrefixFilter(("services.llm_client", "services.llm_runner"))) + + # ── 生成全过程追踪日志(输入 prompt / 模型 / 输出,单条可能较大)──────── + generation_trace_handler = 
RotatingFileHandler( + target_dir / "generation_trace.log", + maxBytes=50 * 1024 * 1024, + backupCount=10, + encoding="utf-8", + ) + generation_trace_handler.setLevel(level) + generation_trace_handler.setFormatter(formatter) + generation_trace_handler.addFilter(_PrefixFilter(_GENERATION_TRACE_PREFIXES)) + + stream_handler = logging.StreamHandler() + stream_handler.setLevel(level) + stream_handler.setFormatter(formatter) + + root_logger.handlers.clear() + root_logger.addHandler(file_processing_handler) + root_logger.addHandler(document_generation_handler) + root_logger.addHandler(other_handler) + root_logger.addHandler(element_extract_handler) + root_logger.addHandler(file_upload_handler) + root_logger.addHandler(report_generation_handler) + root_logger.addHandler(llm_handler) + root_logger.addHandler(generation_trace_handler) + root_logger.addHandler(stream_handler) + + _CONFIGURED = True + return other_log_path + + +def get_logger(name: str) -> logging.Logger: + return logging.getLogger(name) diff --git a/main.py b/main.py new file mode 100644 index 0000000..0964a6e --- /dev/null +++ b/main.py @@ -0,0 +1,66 @@ +""" +main.py +报告生成独立服务 FastAPI 入口。 + +启动方式: + uvicorn main:app --reload + 或:python main.py +""" + +import logging +from contextlib import asynccontextmanager + +import uvicorn +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from config import settings +from database import engine, init_database +from log import configure_logging +from routers import report + +configure_logging() +_log = logging.getLogger(__name__) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """应用启动与关闭时执行。""" + init_database() + yield + engine.dispose() + + +app = FastAPI( + lifespan=lifespan, + title=settings.APP_TITLE, + version=settings.APP_VERSION, + description=settings.APP_DESCRIPTION, + docs_url="/docs", + redoc_url="/redoc", +) + +app.add_middleware( + CORSMiddleware, + allow_origins=settings.CORS_ORIGINS, + 
allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(report.router, prefix="/api/v1") + + +@app.get("/health", tags=["系统"], summary="健康检查") +def health_check(): + """确认服务存活,返回版本信息。""" + return {"status": "ok", "version": settings.APP_VERSION} + + +if __name__ == "__main__": + uvicorn.run( + "main:app", + host=settings.HOST, + port=settings.PORT, + reload=settings.RELOAD, + ) diff --git a/prompts/__init__.py b/prompts/__init__.py new file mode 100644 index 0000000..407d6d0 --- /dev/null +++ b/prompts/__init__.py @@ -0,0 +1 @@ +# prompts 包 diff --git a/prompts/report_generation/__init__.py b/prompts/report_generation/__init__.py new file mode 100644 index 0000000..4272161 --- /dev/null +++ b/prompts/report_generation/__init__.py @@ -0,0 +1 @@ +# report_generation prompts 包 diff --git a/prompts/report_generation/appendix_templates.py b/prompts/report_generation/appendix_templates.py new file mode 100644 index 0000000..c9c6e21 --- /dev/null +++ b/prompts/report_generation/appendix_templates.py @@ -0,0 +1,52 @@ +"""Fixed markdown templates used by report generation.""" + + +def markdown_hashes_for_section_no(section_no: str) -> str: + """与前端 markdownHashesForSectionNo / _heading_level_and_class 对齐。""" + parts = str(section_no or "").strip().split(".") + if len(parts) == 1: + return "##" + if len(parts) == 2: + return "###" + return "####" + + +def missing_child_heading_markdown(heading_no: str) -> str: + hashes = markdown_hashes_for_section_no(heading_no) + return f"\n\n{hashes} {heading_no} 待补充\n\n待补充" + + +# 兼容旧引用;新代码请用 missing_child_heading_markdown(heading_no) +MISSING_CHILD_HEADING_TEMPLATE = "\n\n### {heading_no} 待补充\n\n待补充" + +MINIMAL_MISSING_TABLE_TEMPLATE = ( + "\n\n### {table_name}\n\n" + "| 项目 | 内容 |\n" + "| --- | --- |\n" + "| 关键数据 | 待补充 |\n" +) + +APPENDIX8_PARAMETER_COMPARISON_TABLE = ( + "| 序号 | 项目名称 | 单位 | 可研报告 | 后评价报告 | 备注 |\n" + "| --- | --- | --- | --- | --- | --- |\n" + "| 一 | 成本参数 | | | | |\n" + "| 1 | 原料价格 
| | | | |\n" + "| 1.1 | 氢气 | 元/吨 | 待补充 | 待补充 | |\n" + "| 2 | 催化剂和化学药剂 | 万元 | 待补充 | 待补充 | |\n" + "| 3 | 燃料动力价格 | | | | |\n" + "| 3.1 | 除盐水价格 | 元/吨 | 待补充 | 待补充 | |\n" + "| …… | …… | | | | |\n" + "| 二 | 营业收入参数 | | | | |\n" + "| 2.1 | 98#汽油 | 元/吨 | 待补充 | 待补充 | |\n" + "| …… | …… | | | | |\n" + "| 三 | 税收参数 | | | | |\n" + "| | 增值税税率 | | | | |\n" + "| | 汽油各品种产品 | % | 待补充 | 待补充 | |\n" + "| …… | …… | | | | |\n" + "| 四 | 基准收益率 | % | 待补充 | 待补充 | |" +) + +APPENDIX_FIGURE_TARGETS: list[tuple[str, str]] = [ + ("附图1", "全厂物料平衡图"), + ("附图2", "烷基化装置物料平衡图"), +] diff --git a/prompts/report_generation/chapter_generation_system.md b/prompts/report_generation/chapter_generation_system.md new file mode 100644 index 0000000..ffb8471 --- /dev/null +++ b/prompts/report_generation/chapter_generation_system.md @@ -0,0 +1 @@ +你是后评价报告撰写助手。严格基于证据输出,禁止编造。示例仅可用于写作风格参考,禁止复用示例中的任何事实数据与结论。禁止输出与当前小节无关的表号/表题清单及跨节“详见表/参见表”引用。必须返回 JSON 对象,字段为 content/missingInfo/qualityChecks。 diff --git a/prompts/report_generation/chapter_generation_user.md b/prompts/report_generation/chapter_generation_user.md new file mode 100644 index 0000000..4f12d7a --- /dev/null +++ b/prompts/report_generation/chapter_generation_user.md @@ -0,0 +1,67 @@ +你正在编写后评价报告章节:{{section_title}} + +【章节细则描述】 +{{section_prompt}} + +【章节模板】 +{{section_title}} + +【模板必需表格】 +{{required_tables_text}} + +【结构化表格证据(必须优先采用)】 +{{structured_tables_text}} + +【字段级已抽取结果(强约束)】 +{{canonical_fields_text}} + +【章节示例】 +{{selected_example}} + +【参考范文】 +{{section_reference_block}} + +【示例使用约束】 +1. 以《模版.doc》同章节结构为第一优先:段落顺序、表格标题、表头字段尽量保持一致; +2. 参考范文仅用于格式与结构参考,严禁复用示例中的项目名称、年份、金额、比例、指标值与结论; +3. 所有数值必须来自证据包;如需表格,表头可沿用模板,表内数据必须替换为当前项目证据; +4. 若模板字段无证据,按字段粒度写"待补充",不得整段空泛描述。 + +【输出硬约束】 +1. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题; +2. 若【结构化表格证据】中存在对应必需表,必须原样使用该 Markdown 表格,不得自行生成或改写表头/数值; +3. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充"; +4. 若【字段级已抽取结果】中某字段为非"待补充"值,正文该字段不得写"待补充",必须使用该抽取值; +5. content 字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目; +6. 
禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表。 +{{heading_rule}}7. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题。 + +【表格严格管控——必须遵守】 +1. **禁止凭空生成表格**:只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出 Markdown 表格; +2. **无"表格强制要求"的章节一律禁止输出任何 Markdown 表格**(即不得输出含 | 分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入; +3. **"见附表N"仅为引用语**:若合同要求写"项目建设工作程序见附表1。"等引用句,只需输出该引用句文本,附表本体在报告末尾统一输出,严禁在本节正文中展开附表的完整 Markdown 表格; +4. 表格数据必须严格来自要素管理(element_tables/element_cells),不得自行编造表格内容; +5. 每个 Markdown 表格前须有独立一行表题(形如「表1 …」「表2-3 …」「附表8 …」等);表题紧挨表格上方单独成段,表题与表格之间最多空一行或一行注释;前端会将表题居中排版。 +6. **表号与表名间距**:表题中表号(如「表2-4」「附表8」)与表名之间须空两个全角空格(U+3000),例如「表2-4  原料数量及组成对比表」。 +7. **表头栏单位**:凡含计量单位的列名,名称写第一行、单位加括号写在第二行,且在同一表头单元格内(Markdown 可用 `
<br>`,如 `新鲜水<br>
(m³/h)`);表题与表头均勿使用 `**` 加粗;勿将单位单独占一列,勿把「名称(单位)」横挤在同一行。 +8. **公共单位写表题**:若整张表各数据列所用单位相同,单位应加括号写在表题行末尾(如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复该单位;若各列单位不一致,则仍按列在表头内分行写单位。 +9. **表格序号列**:用阿拉伯数字,层次与正文一致(如 1、1.1、1.2、2、2.1);行键或表体第一列已带层次编号时可与之对齐;否则自上而下用 1、2、3…;「合计」「总计」行可用「—」。 +10. **表体与数字**:表内文字、数字宜水平与垂直居中;若单元格内需换行或分段(含 `<br>
`),宜左齐排列以便阅读。同一表内、同列的小数、百分比等宜保留相同的小数位数。 + +【检索顺序约束】 +1. 优先使用要素抽取结果; +2. 要素不足时补充文档段落; +3. 最后使用关键词检索到的补充材料; +4. 无证据时写"待补充",禁止编造。 + +{{prior_sibling_sections_block}} + +{{prior_chapters_block}} + +【章节输出结构约束】 +{{section_contract}} + +【证据包(JSON)】 +{{evidence_json}} + +请仅返回 JSON:{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]} diff --git a/prompts/report_generation/chapter_generation_user_ref_aligned.md b/prompts/report_generation/chapter_generation_user_ref_aligned.md new file mode 100644 index 0000000..cbd74e0 --- /dev/null +++ b/prompts/report_generation/chapter_generation_user_ref_aligned.md @@ -0,0 +1,88 @@ +你正在编写后评价报告章节:{{section_title}} + +本次任务:以【章节细则描述】和【参考范文】共同作为本节的写作模板,以【事实证据】作为唯一数据来源。核心原则是:**细则与范文决定写什么、怎么写;证据只负责提供可填入模板的真实数据**。生成时必须先搭模板,再填证据,严禁脱离模板自由发挥,严禁复用范文数据或自行改写证据数据。 + +========================= 第一部分 · 写作模板(最高优先级:决定内容范围、结构和文风)========================= + +【标题编号规则】 +{{heading_rule}} + +【章节细则描述】 +{{section_prompt}} + +【参考范文(内容范围、论述维度、段落结构和行文风格的主要模板)】 +{{section_reference_block}} + +========================= 第二部分 · 事实证据(唯一数据来源,仅用于支撑和填充模板)========================= + +【模板必需表格】 +{{required_tables_text}} + +【结构化表格证据(必须优先采用)】 +{{structured_tables_text}} + +【字段级已抽取结果(强约束)】 +{{canonical_fields_text}} + +【证据包(JSON)】 +{{evidence_json}} + +========================= 第三部分 · 上文已生成内容(只用于一致性校验,不改变本节模板)========================= + +{{prior_sibling_sections_block}} + +{{prior_chapters_block}} + +========================= 第四部分 · 写作与输出要求(务必逐条遵守)========================= + +【生成步骤】 +1. 先读取【章节细则描述】和【参考范文】,抽取本节应覆盖的内容主题、论述维度、段落顺序、子标题层级、表格/列举形式和结论方式; +2. 再读取【章节输出结构约束】,确认本节是否允许/必须输出表格、附表引用或特定结构; +3. 然后只从【事实证据】中选择可支撑上述模板的数据,把证据数据填入对应位置; +4. 最后输出正文。若模板要求的某项内容在证据中没有对应数据,写"待补充",不得跳过、猜测、编造或用范文数据顶替。 + +【模板遵循要求——细则与范文共同决定“写什么”和“怎么写”】 +1. "写什么"由【章节细则描述】与【参考范文】共同决定:细则列出的要点、子项及顺序为必写项;参考范文实际写到的内容主题、论述维度和信息点(如背景、依据、目标、措施、问题、结论等)也应覆盖。二者取并集,不得遗漏,也不得另起炉灶写无关内容; +2. "怎么写"以【参考范文】为主要模板:段落数量、段落顺序、每段主题、论述推进、句式结构、专业术语、连接词、语气口吻、详略程度和结论表达都应高度贴合范文; +3. 
若【章节细则描述】与【参考范文】存在差异,优先保证细则要求完整覆盖,再用范文的结构和笔法组织表达;若二者均未要求,正文不要主动扩展。 + +【证据使用要求——数据必须来自证据且保持原值】 +1. 所有项目名称、时间、金额、数量、比例、指标值、单位、结论依据等事实性内容,只能来自第二部分事实证据; +2. 数据必须原值引用,严禁自行修改、估算、换算单位、四舍五入、增减、归纳为新数值或编造。证据是多少就写多少;证据未给出的数据写"待补充"; +3. 若【字段级已抽取结果】中某字段为非"待补充"值,正文必须原样使用该抽取值,不得写"待补充",也不得改动、换算或重新表述其数值; +4. 内容来源优先级:结构化表格证据 / 字段级已抽取结果 > 证据包(JSON)中的章节文档 > 关键词检索补充材料; +5. 禁止复用【参考范文】或【章节示例】中的任何项目名称、年份、金额、指标值、比例、结论等事实数据。 + +【参考范文贴合要求——高度相似但严禁照抄】 +1. 逐段对照:范文有几段就尽量写几段,每段主题、先后顺序、论述角度与起承转合须与范文对应; +2. 句式与笔法对齐:尽量沿用范文的段首引导方式、常用表达、收束方式和专业语气,使本节读起来与范文出自同一类报告; +3. 篇幅与颗粒度对齐:每段篇幅、信息密度和展开程度与范文相当,不得明显更短、更空泛,也不得无端扩写; +4. 形式对齐:范文采用分条、分项、描述性子标题或表格呈现的,本节也尽量采用同类形式,但必须满足【章节输出结构约束】和下方表格规则; +5. 禁止逐字照抄:不得出现与范文连续相同超过15字的句子或成段文字;应在保持结构和笔法相似的前提下,用本项目证据重新表述。 + +【输出硬约束】 +1. content字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目; +2. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题; +3. 若【结构化表格证据】中存在对应必需表,必须原样使用该Markdown表格,不得自行生成或改写表头/数值; +4. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充"; +5. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表; +6. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题; +7. 数字与汉字之间不留空格:阿拉伯数字、百分比、金额、年份等与相邻汉字之间不得插入半角或全角空格,例如写"投资1.2亿元""2023年12月""产能达95%",不得写"投资 1.2 亿元""2023 年 12 月""产能达 95 %";数字与计量单位之间也不留空格,如"30万吨"而非"30 万吨"; +8. 子标题形式约束:正文段落允许使用描述性小标题,但只能采用"一、""(一)""1."或加粗短语单独成行等中文公文层级形式;严禁使用Markdown标题语法(`#`、`##`、`###`等)充当子标题。表格上方的表题不属于子标题; +9. 计量单位须规范:面积写"m²"不得写"m2",体积写"m³"不得写"m3",流量写"m³/h"不得写"m3/h";温度写"℃",千分号写"‰",科学计数可写"×10⁴"。正文与表格中的单位均须规范。 + +【表格严格管控】 +1. 只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出Markdown表格; +2. 无"表格强制要求"的章节一律禁止输出任何Markdown表格(不得输出含`|`分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入; +3. "见附表N"仅为引用语:若结构约束要求写"项目建设工作程序见附表1。"等引用句,只输出引用句文本,附表本体在报告末尾统一输出,严禁在本节展开完整Markdown表格; +4. 表格数据必须严格来自要素管理(element_tables/element_cells)或结构化表格证据,不得自行编造、换算或改写表格内容; +5. 每个Markdown表格前须有独立一行表题(如「表1  ××表」「表2-3  ××表」「附表8  ××表」),表题紧挨表格上方单独成段; +6. 表号与表名之间须空两个全角空格(U+3000),例如「表2-4  原料数量及组成对比表」; +7. 含计量单位的表头,名称写第一行、单位加括号写第二行,且在同一表头单元格内(Markdown可用`
<br>`,如`新鲜水<br>
(m³/h)`);勿将单位单独占一列; +8. 若整张表各数据列所用单位相同,单位写在表题行末尾,表头栏内不再重复;若各列单位不一致,则按列在表头内分行写单位; +9. 表格序号列用阿拉伯数字,层次与正文一致;"合计""总计"行可用"—"; +10. 同一表内、同列的小数、百分比等宜保留相同的小数位数,但不得因此改动证据原值。 + +【输出格式】 +请仅返回JSON:{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]} +你正在编写后评价报告章节:{{section_title}} \ No newline at end of file diff --git a/prompts/report_generation/heading_rules.py b/prompts/report_generation/heading_rules.py new file mode 100644 index 0000000..cb0df0c --- /dev/null +++ b/prompts/report_generation/heading_rules.py @@ -0,0 +1,14 @@ +"""Heading rule prompt variables for report generation.""" + +DEFAULT_HEADING_RULE = ( + "5. 各章节内部小标题须使用规范层级格式(如“### 1.2.1 …”);" + "若在同一节内使用并列条目,必须统一写作“1)… 2)… 3)…”," + "禁止使用“一、二、三、”“(一)(二)(三)”或“1.”“1.2.”“3.1”等序号形式;\n" +) + +SECTION_HEADING_RULES: dict[str, str] = { + "1.2": ( + "5. 本节(1.2)必须严格遵循【章节输出结构约束】给定的纯文本编号体结构;" + "不得使用“###”等 Markdown 小标题语法;不得将“1.2.1/1.2.2”改写为“1)/2)”。\n" + ), +} diff --git a/prompts/report_generation/prompt_defaults.py b/prompts/report_generation/prompt_defaults.py new file mode 100644 index 0000000..15392c2 --- /dev/null +++ b/prompts/report_generation/prompt_defaults.py @@ -0,0 +1,4 @@ +"""Fallback prompt fragments for report generation.""" + +DEFAULT_SECTION_PROMPT_FALLBACK = "按后评价细则规范撰写本章节。" +DEFAULT_SELECTED_EXAMPLE_FALLBACK = "无示例,按规范输出。" diff --git a/prompts/report_generation/repair_missing_tables_system.md b/prompts/report_generation/repair_missing_tables_system.md new file mode 100644 index 0000000..a3f6170 --- /dev/null +++ b/prompts/report_generation/repair_missing_tables_system.md @@ -0,0 +1 @@ +你是后评价报告撰写助手。任务是对既有章节做最小修改补齐缺表,禁止删除事实性内容,禁止编造。返回 JSON:{"content":"..."} diff --git a/prompts/report_generation/repair_missing_tables_user.md b/prompts/report_generation/repair_missing_tables_user.md new file mode 100644 index 0000000..1d65432 --- /dev/null +++ b/prompts/report_generation/repair_missing_tables_user.md @@ -0,0 +1,19 @@ +你正在修订章节:{{section_title}} + +目标:在不删除原有有效内容的前提下,补齐缺失表格。 
+必须出现的表标识:{{missing_tables}} + +要求: +1) 每个缺失表都必须在正文中出现,并使用 Markdown 表格; +2) 若证据不足,单元格可写“待补充”; +3) 表标题必须包含对应表标识(如“表2-1”); +4) 仅输出修订后的完整章节 Markdown。 + +【原章节内容】 +{{content}} + +【原始章节提示词】 +{{original_prompt}} + +【证据包(JSON)】 +{{evidence_json}} diff --git a/prompts/report_generation/section_output_contracts.py b/prompts/report_generation/section_output_contracts.py new file mode 100644 index 0000000..f7a9d0c --- /dev/null +++ b/prompts/report_generation/section_output_contracts.py @@ -0,0 +1,877 @@ +"""Section output contract prompt variables.""" + +SECTION_OUTPUT_CONTRACTS: dict[str, str] = {'1': '按章节标题自然组织内容,围绕证据包先事实后结论,缺失项写“待补充”。', + '1.1': '必须按以下顺序输出,不得缺项、不得改名:\n' + '1) 项目名称:...\n' + '2) 建设单位:...\n' + '3) 建设地点:...\n' + '4) 建设类型:...\n' + '5) 起止时间:...\n' + '6) 建设内容:...\n' + '7) 建设投资:...\n' + '8) 占地面积:...\n' + '规则:内容仅可来自证据包;缺失项写“待补充”;严禁复用示例中的事实数据。', + '1.2': '必须严格按以下固定结构输出(纯文本编号体),不得缺项、不得增项、不得使用“###”等 Markdown 标题语法:\n' + '项目决策要点\n' + '1.2.1项目背景\n' + '1)...\n' + '(要求:先用 2~4 句完整书面语概括动因与结论,再视需要附表)\n' + '2)...\n' + '(要求:同上)\n' + '3)...\n' + '(要求:第3条背景优先写“政策/标准/环保与质量升级”类动因,并给出可由证据包定位支撑的结论,但正文中不要输出“【证据依据:...】”标签)\n' + '综合上述因素,...\n' + '\n' + '1.2.2预期目标\n' + '项目实施后,...\n' + '\n' + '写作质量规则(必须遵守):\n' + '1) ' + '必须完全按上述行序与段落结构输出:只允许出现「项目决策要点」「1.2.1项目背景」「1)」「2)」「3)」「综合上述因素,...」「1.2.2预期目标」这些结构标识;不得输出额外小标题、不得输出项目之外的说明段。\n' + '2) 每条背景必须是连续自然段(可多段),禁止把证据包里的原始换行表直接粘贴成“多列对不齐”的纯文本块。\n' + '3) 若需引用对比表、物料平衡表等,必须使用 Markdown 表格(含表头分隔行),表内数字与证据包一致,可注明表号(如表1)。\n' + '4) 第3条背景:请检索证据包中国VI、汽油标准、环评、排放、清洁生产等相关表述,结论必须可由证据包定位到文档名或段落支撑,但正文中不要输出“【证据依据:...】”标签。\n' + '5) 「预期目标」必须写成一段或多段完整书面语(不要用“- 规模目标/质量目标/效益目标:”三行结构)。若证据包已出现装置规模、烷基化油产量/产能(万吨/年)、辛烷值、国VI、收入、利润、IRR ' + '等任一可核对信息,必须在该段落中明确写出对应数字/结论;不得在证据已含关键数字时仍全部写“待补充”。\n' + '6) 证据不足时,对应句子写“待补充”,不得编造数字。\n' + '7) 严禁复用【章节示例】中的项目名、金额与结论。', + '1.3': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '第一行固定为标题:"1.3 项目实施情况"。\n' + '第二段:仅用一段连续文字,按时间顺序写项目实施关键节点,覆盖可研/初设批复、开工、中交、投产试运行、竣工验收等信息;时间与事件要一一对应,可用分号分隔,禁止拆成条目。\n' + 
'第三段:仅用一段连续文字写投资执行对比,至少包含批复可研估算、批复初设概算、竣工决算;并计算与表述节余金额及比例(若证据不足则对应项写“待补充”),金额与口径仅可使用证据包。\n' + '第四段固定写法:"项目建设工作程序见附表1。"(无证据冲突时必须保留原句)。\n' + '写作约束:正文不得使用“项目实施关键节点”“建设与投资执行情况”等标签式小标题,不得编造时间、金额或比例。\n' + '【禁止输出表格】本节禁止输出任何 Markdown 表格(含附表1在内),“见附表1”仅为文字引用,附表在报告末尾统一输出。', + '1.4': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '第一行固定为标题:"1.4 项目运行情况"。\n' + '第二段:仅用一段连续文字写项目运行情况,需包含投产后运行状态、分阶段连续运行时长、停工原因(如有)、加工负荷与烷基化油产量等关键事实;按时间顺序组织,禁止拆成条目。\n' + '第三段:仅用一段连续文字写经营与财务表现,至少包含营业收入、总成本费用、利润总额等指标,并给出经营结论(如盈利能力判断);结论必须由证据支撑。\n' + '财务口径强约束:本节优先且原则上仅可使用投产后已实现的实际值(如某年实际营业收入/成本/利润);不得使用“预测值、后预测、测算值、年均值(生产期均值)”替代实际值。”。\n' + '写作约束:正文不得使用“运行负荷与产量”“经营表现/财务表现”“总体运行结论”等标签式小标题;涉及时间、负荷、产量、金额等数据时,仅可使用证据包口径,证据不足处写“待补充”,不得编造;禁止在同一段内重复抄写相同句子或同一年份财务数据。', + '2': '按章节标题自然组织内容,围绕证据包先事实后结论,缺失项写“待补充”。', + '2.1': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:2.1.1、2.1.2、2.1.3、2.1.4、2.1.5、2.1.6、2.1.7。', + '2.1.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换段落顺序。\n' + '正文必须贴近以下版式组织,使用连续自然段表达,不得再写“事实依据/评价判断/问题与建议”等标签:\n' + '1)先写一段原料来源及与可研一致性的判断,并以“原料数量及组成对比见下表”引出表1。\n' + '2)随后固定输出表题“表1原料数量及组成对比表”,并紧跟表格。\n' + '3)表1后单独一行输出“注1.……”;如果有注释就输出,无明确注释时不写。\n' + '4)表1后写一段,对可研报告、初步设计、实际生产的原料数量与组成进行对比并给出结论。\n' + '5)再写一段全厂或装置负荷、原油加工量、装置加工量等实际运行情况。\n' + '6)然后写一句“实际生产原料组成与性质与可研报告基本一致,满足装置进料要求,详见下表。”或同义表达,引出表2。\n' + '7)随后固定输出表题“表2原料性质对比表(醚后碳四)”,并紧跟表格。\n' + '8)表2后写一段,分析原料组成、性质、烷烯比等变化及其对生产的影响。\n' + '9)最后单独写“后评价认为:……”总结性结论,结论必须回扣前文和表格数据。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表,不得用安评/工艺包比选表替代。\n' + '2)表1必须使用要素管理中对应的“原料数量及组成对比表”;表题固定为“表1原料数量及组成对比表”。\n' + '3)表2必须使用要素管理中对应的“原料性质对比表(醚后碳四)”;表题固定为“表2原料性质对比表(醚后碳四)”。\n' + '4)若要素管理中存在上述表格,则优先直出其表头、分组表头、行项目和单元格内容;不得改列名、不得合并为其他样式、不得替换成其他表。\n' + '5)表1字段含义必须覆盖:序号、原料名称、规格、可研报告(数量(万吨)/占比(%))、初步设计(数量(万吨)/占比(%))、实际生产(数量(万吨)/占比(%))、备注;须保留合计行。\n' + '6)表2字段含义必须覆盖:序号、名称、可研报告、初步设计、实际生产、备注;行至少包含“密度,kg/m3”“硫含量,ppm”“氮含量,ppm”,其余行按要素管理表格直出。\n' + '\n' + '【禁止】\n' + 
'不得使用“表2.6-1”“原料选择加氢工艺技术对比”等安评/工艺包比选表作为本节主体;不得出现与本节无关的附录标题;不得把表格改写成列表、条目或非表格文本;证据不足处写“待补充”,不得编造。', + '2.1.2': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。', + '2.1.2.1': '本节必须按“产品方案评价”目的组织内容,针对全厂性炼油项目或部分化工类项目,通过对项目投产后市场对产品种类、规格、标准等方面需求的实际情况与前期工作确定的产品方案进行对比,评价前期工作确定的产品方案是否与市场实际需求相适应,评价主要产品是否为高效厚利产品以及对项目成败的影响情况。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)先写一段总述,说明本节是通过产品种类、规格、标准、产量及市场实际需求的前后对比,评价产品方案适应性及其对项目成败的影响。\n' + '2)再写一段,对项目产品前后对比情况进行分析;如产品方案与实际需求相差较大,必须分析原因。\n' + '3)随后固定写一句“项目产品方案对比表见表2-3。”\n' + '4)紧接着固定输出表题“表2-3 产品方案对比表”,并紧跟表格。\n' + '5)表格后不输出任何模板性注释(如“注.表中内容可根据项目实际需要进行增减”等套话),直接进入后评价结论。\n' + '6)最后单独写“后评价认为:。”,并基于前文与表格数据补全评价结论。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表,不得改写成列表或段落。\n' + '2)优先使用要素管理中对应“产品方案对比表”的结构化表;若存在对应表格,须直出其表头、分组表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表题固定为“表2-3 产品方案对比表”。\n' + '4)表格字段含义必须覆盖:序号、产品、可研报告规格、可研报告数量(万吨/年)、实际生产规格、实际生产数量(万吨/年)、备注。\n' + '5)表内行项目可包括但不限于:汽油、航空煤油、柴油、XX化工品、XX润滑油、XXX、轻油产品率,%、综合商品率,%、柴汽比;具体行项目按要素管理中的表格直出,可根据项目实际需要增减。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造市场需求、产品规格、产量、比率或效益判断;证据不足处写“待补充”。', + '2.1.2.2': '本节必须按“产品市场评价”目的组织内容,围绕项目产品市场需求、销售渠道、产品流向、市场风险及产品结构改善情况展开分析,通过对可研报告预期与实际生产情况进行对比,评价前期工作对产品市场的预测是否合理。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)第一段写可研报告对产品市场的判断依据,需说明市场供需现状、预测、供需平衡或标准升级等因素如何支撑项目产品需求。\n' + '2)第二段写可研报告对产品消化路径、厂内平衡、销售渠道、市场风险的预测,可结合汽油调和、统一销售、内部消化等实际口径展开。\n' + '3)第三段写实际生产情况,需说明实际产品调入去向、销售方式、销售渠道,并与可研预期进行对比。\n' + '4)随后固定写一句“前期工作预测的主要产品产量、流向与实际生产产品产量及流向对比见下表。”\n' + '5)紧接着固定输出表题“表2-4 ××年项目主要产品流向状况”,并紧跟表格;若为多年数据,应按每年分别列表;投产时间较短时可按季度或几个月列表。\n' + '6)表格后(即表格最后一行之后)单独一行固定输出“注:指装置投产到后评价时点,按每年列表,投产时间短的也可以是季或几个月。”——注必须在表格外面,严禁将“注”写入表格的任何单元格(包括备注列)中\n' + '7)最后单独写“后评价认为:……”总结性结论,必须明确判断产品流向、销售渠道、市场风险、产品结构改善等方面与可研预测的一致性或差异。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表,不得改写成列表或段落。\n' + '2)优先使用要素管理中对应“表2-4 
××年项目主要产品流向状况”或“主要产品流向状况”的结构化表;若存在对应表格,须直出其表头、分组表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表题固定为“表2-4 ××年项目主要产品流向状况”;其中“××年”应替换为要素管理表对应年份。若存在多个年份,应逐年分别输出对应表题和表格。\n' + '4)表格字段含义必须覆盖:产品名称、规格、实际产量、销量、产品实际流向、可研报告产品流向、备注。\n' + '5)表内行项目按要素管理中的表格直出,可包含“×××”“小计”等行;“小计/合计”行应放在表格末尾。\n' + '6)严禁将“注:”或注释性文字写入表格行或任何单元格中;所有注释必须在表格 Markdown 结束后另起一行输出。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造市场需求、销量、流向、销售渠道、市场风险或产品结构改善情况;证据不足处写“待补充”。', + '2.1.3': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。', + '2.1.3.1': '本节必须按“总加工方案评价”目的组织内容,通过实际情况与前期工作的对比,评价整体规模、单线规模、产品方案是否一致;如存在较大差异,必须分析变动原因及其合理性,并结合项目实际运行情况,对总加工方案的合理性、适应性作出评价。\n' + '\n' + '必须严格按以下要求输出:\n' + '1)先写前期工作确定的总体加工方案,包括整体规模、单线规模、主要产品方案等核心内容。\n' + '2)再写项目实际建设和运行情况,与前期工作逐项对比,说明一致项与差异项。\n' + '3)如整体规模、单线规模、产品方案存在较大变化,必须写明变化内容、形成原因及是否合理,不得只写结论不写依据。\n' + '4)最后单独给出总结性评价,明确判断总加工方案是否合理、是否适应实际运行需要,以及相关调整是否合理。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;可使用连续自然段表达;证据不足处写“待补充”,不得编造规模、负荷、产品方案或变动原因。', + '2.1.3.2': '本节必须按“建设规模及工艺技术方案评价”目的组织内容,通过实际情况与前期工作的对比,评价建设规模、装置规模、运行负荷与工艺技术方案是否与可研一致;如存在较大差异,必须分析原因及合理性,并结合实际运行情况,对工艺技术方案的先进性、适应性、可靠性和环保性能作出评价。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)先固定输出“1)烷基化装置工艺技术方案”作为小标题;小标题下先用一段完整文字写可研报告对不同烷基化工艺的比选过程、比选维度、拟选工艺及最终技术供应商;随后必须使用“(1)…(2)…(3)…”逐条列出可研推荐工艺的先进性与适应性,条目内容可覆盖辛烷值、选择性、酸耗、反应器与传热方式、安全环保、可靠性、运行费用等方面。\n' + '2)再固定输出“2)废酸再生单元工艺技术方案”作为小标题;小标题下先用一段完整文字写可研报告对不同废酸再生工艺的比选过程、比选对象、拟选工艺及最终技术供应商;随后必须使用“(1)…(2)…”逐条列出推荐工艺的主要特点,条目内容可覆盖流程简洁性、运行成本、尾气排放、二次污染、操作弹性、工艺适用性等方面。\n' + '3)最后直接单独写“后评价认为:……”,不得再输出“3)后评价结论”或其他总结性小标题。结论必须综合评价烷基化与废酸再生工艺选用是否合理适用、技术是否先进可靠、环保性能是否良好,以及前期工作确定的装置规模和原料条件是否在实际运行中得到验证。\n' + '\n' + '【结构硬约束】\n' + '本节正文仅允许出现上述两个编号小标题与最后“后评价认为:……”结论,不得再新增“建设规模及装置规模对比”“配套单元及附属单元”等其他编号小标题;如需说明规模、负荷、原料来源等内容,只能写入“后评价认为:……”段内,不得单列成新标题。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造装置规模、单元规模、运行负荷、技术供应商、工艺优缺点或调整方向;证据不足处写“待补充”。', + '2.1.3.3': '本节必须按“主要设备方案评价”目的组织内容,通过对可研报告、初步设计与实际运行情况的对比,评价主要设备的选型、材质、结构形式及优化调整是否合理适用,是否满足装置长周期平稳运行需要。\n' + 
'\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)固定输出“1)反应器”作为小标题,写明反应器是否采用技术专利商专利设备、设备材质、结构形式、专利内件或关键设计特点,并说明其在传质效率、混合效果、安全生产可靠性等方面的作用。\n' + '2)固定输出“2)冷剂压缩机”作为小标题,写明压缩机类型、国产化或供货来源、设计制造成熟度、运行平稳性及选型合理性。\n' + '3)固定输出“3)塔类”作为小标题,写明主要塔器的塔型、筒体材质,以及初步设计与可研在塔型或材质上的一致性与变化情况;如存在优化调整,需说明调整内容及合理性。\n' + '4)最后单独写“后评价认为:……”总结性结论,需明确判断主要设备的选型、材质与可研是否基本一致,深化设计过程中的尺寸或结构优化是否合理,以及是否满足装置长周期平稳运行需要。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造设备名称、材质、塔型、专利设备、运行效果或优化原因;证据不足处写“待补充”。', + '2.1.4': '本节必须按“厂址选择及外部条件评价”目的组织内容,对比最终选择厂址与前期各阶段是否一致;如有变化,必须分析变化原因。并结合项目实际运行情况,评价厂址及外部条件是否满足项目要求,判断推荐厂址方案的合理性。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)固定输出小标题“1)前期工作厂址方案对比”,并在该标题下写一段或多段连续文字,对比最终选择厂址与可研、前评估、初设及相关决策程序中的厂址方案是否一致;可结合前期决策程序合规性、初步设计评价、前评估意见采纳落实、厂址结论等证据展开,但不得跑题写成程序评价主节。若厂址方案发生变化,必须明确写出变化内容、原因及合理性。\n' + '2)固定输出小标题“2)外部条件满足性评价”,并在该标题下写一段或多段连续文字,结合项目实际运行情况评价厂址及外部条件是否满足项目要求;可按“厂址选择、总图与配套工程、环境保护设施、风险防控、港口、码头、铁路、公路、管道、供水、供电等方面”组织内容。实际有证据的写具体情况,无证据的对应项可单独写“待补充”,但不得在完整段落末尾附加“待补充”。\n' + '3)最后直接单独写“后评价认为:……”,不得再输出“3)后评价结论”或其他总结性小标题。结论必须明确判断厂址选择是否合理、前期工作与厂址方案是否一致、外部条件是否满足项目建设与运行需要;对缺少运行数据支撑的外部条件,可在结论中指出仍需补充完善。\n' + '\n' + '【写作约束】\n' + '除“1)前期工作厂址方案对比”“2)外部条件满足性评价”外,不得输出其他自拟小标题;正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;可使用连续自然段表达;不得编造厂址变化原因、交通运输条件、管道条件、供水供电条件或运行适应性结论;证据不足处写“待补充”。', + '2.1.5': '本节必须按“总图及系统配套工程评价”目的组织内容,评价项目总图布置、公用工程、储运工程及辅助设施配置是否合理,并区分新建部分与依托部分进行对比分析。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)固定输出“1)总图布置”作为小标题,写明可研报告中装置工艺部分、配套单元、储罐、变配电室、机柜室等占地情况,说明装置总占地面积、布置位置、是否新征用地,并评价总平面布置的合理性。\n' + '2)对总图、储运、公用工程及辅助工程等主要建设内容与可行性研究报告进行对比,说明是否有变化,如有较大变化应说明原因,并评价其变化的合理性。\n' + '3)在“2)系统工程配套”相关文字之后,必须输出“表2-5 总图、储运、公用工程及辅助工程对比”,并紧跟表格,用于呈现新建部分对比。\n' + '4)随后必须输出“表2-6 储运、公用工程及辅助工程依托对比”,并紧跟表格,用于呈现依托部分对比。\n' + '5)最后单独写“后评价认为:……”总结性结论,明确判断总图布置是否合理、公用工程及辅助设施配置是否满足需要、利用现有设施是否节约投资并取得较好效果。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表,不得改写成列表或段落。\n' + '2)必须优先使用要素管理中对应“表2-5 总图、储运、公用工程及辅助工程对比”和“表2-6 ' + 
'储运、公用工程及辅助工程依托对比”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表2-5字段含义必须覆盖:序号、项目名称、单位、可研报告、初步设计、实际实施、备注。\n' + '4)表2-6字段含义必须覆盖:序号、依托项目名称、单位、可研报告、初步设计、实际实施、备注。\n' + '5)两张表均不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表,不得只写正文不写表。\n' + '6)表后不输出任何模板性注释(如"注.表中内容可根据项目实际需要进行增减"等套话),仅保留要素管理中有实质内容的原始注释。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造占地面积、储罐数量、容积、公用工程能力、依托关系或投资效果;证据不足处写“待补充”。', + '2.1.6': '本节必须按“主要技术指标评价”目的组织内容,围绕可研报告、初步设计与实际运行的主要技术指标进行对比,评价初步设计相对可研的优化效果以及实际运行达成情况。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)先写一段引导语,明确“项目可研报告和初步设计主要设计指标对比见下表”,并说明通过对比可观察到主要技术指标变化趋势。\n' + '2)随后必须输出表题“表2-7 主要设计指标对比表”,并紧跟表格。\n' + '3)表格后可写一段分析,至少覆盖烷基化油RON、酸耗、能耗等关键指标的变化方向;如证据显示初步设计优于可研,应说明这是设计优化、细化和深化的结果。\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表,不得改写成列表或段落。\n' + '2)必须优先使用要素管理中对应“表2-7 主要设计指标对比表”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、指标名称、可研报告、初步设计、实际运行、备注。\n' + '4)行项目可包括但不限于:原油加工量、综合商品率、全厂柴汽比、全厂新鲜水耗、全厂平均电耗、能耗、其它、常减压蒸馏装置能耗;具体按要素管理表格直出,可酌情增减。\n' + '5)表后注释需保留要素管理中的原始注释;若未提取到注释,不输出任何模板性注释(如“注:根据项目的情况,可酌情增减指标”等套话),仅保留要素管理中有实质内容的原始注释。\n' + '6)表2-7不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造RON、酸耗、能耗及其他技术指标数据;证据不足处写“待补充”。', + '2.1.7': '本节必须按“风险分析评价”目的组织内容,围绕可研报告对技术、设备、施工、社会、原料及产品、安全、环保、消防、职业卫生等风险的识别、分析及应对措施进行评价,并结合实际执行情况判断风险防控措施是否有效。\n' + '\n' + '必须严格按以下格式与顺序输出,不得缺项、不得改名、不得调换顺序:\n' + '1)正文只允许两段,不得再拆分为“1)/2)/3)/4)”等子标题、不得再加编号小节。\n' + '2)第一段为风险分析主体段:综合写可研报告对技术、设备、施工、社会、原料及产品、安全、环保、消防、职业卫生等风险的识别与应对,说明工艺技术选择、设备选型、防腐与伴热等措施是否将风险降到可控范围。\n' + '3)第二段必须以“后评价认为:”开头,给出总结性结论,明确评价前期风险防控措施在后续设计、施工和生产运行中的贯彻执行情况,以及对建设实施和生产运行安全的保障效果。\n' + '4)除“后评价认为:”外,不得输出其他总结性标题(如“后评价结论”“风险防控措施评价”“生产运行风险评价”等)。\n' + '\n' + '【写作约束】\n' + '正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”等标签;不得编造工艺技术来源、风险结论、防腐措施、环保安全执行效果或事故情况;证据不足处写“待补充”;有实质内容时不得在段尾附加“待补充”。', + '2.2': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:2.2.1、2.2.2、2.2.3、2.2.4。', + '2.2.1': 
'必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.2.1 编制单位资质及选择方式评价"。\n' + '2)标题后仅输出一段连续文字,围绕“可研报告编制单位资质及选择方式评价”展开,不得再拆分小标题或编号条目。\n' + '3)该段至少应包含:编制单位全称(及简称,如有)、单位沿革或背景、资质等级与资质类别、区域/行业熟悉度、承担项目前期工作的能力评价。\n' + '4)如有明确依据,可补充选择该单位的原因(如不可替代专有技术、区域经验、既有装置熟悉度等);结尾需给出“具备承担项目前期工作的能力”或同义评价。\n' + '5)证据不足处写“待补充”,不得编造单位资质、历史沿革、能力结论或选择依据。', + '2.2.2': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.2.2 编制进度评价"。\n' + '2)标题后仅输出一段连续文字,简要说明可行性研究报告编制历程,至少包含关键时间节点(如启动、提交、审查、批复等)或阶段性进展信息。\n' + '3)该段必须明确判断编制进度是否满足项目需要及建设单位要求,并给出简要依据。\n' + '4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造时间线、进度结论或满足性判断。', + '2.2.3': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.2.3 与专项评价的结合情况"。\n' + '2)标题后仅输出一段连续文字,说明可行性研究编制与专项评价结论的结合情况。\n' + '3)该段应至少体现专项评价结论在可研中的采纳、衔接或落实情况,并给出是否结合充分、是否支撑项目决策的判断。\n' + '4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造专项评价结论或采纳落实情况。', + '2.2.4': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.2.4 可行性研究报告的质量评价"。\n' + '2)标题后仅输出一段连续文字,结合前期工作的成效,评价可行性研究报告的质量。\n' + '3)该段应至少体现可研报告在完整性、深度、可实施性、与批复/初设衔接性或风险识别等方面的质量判断,并说明其对后续建设实施的支撑作用。\n' + '4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造质量结论或前期工作成效。', + '2.3': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.3 前评估工作评价"。\n' + '2)标题后按连续自然段输出,不得拆分小标题或编号条目。\n' + '3)第一段写前评估组织单位及资质情况,明确其是否满足承担项目可研评估的资质要求。\n' + '4)第二段写前评估会议(或评估过程)时间、组织形式及评审范围,至少覆盖原料及产品方案、工艺技术、节能节水、厂址、公用工程、环保安全、投资经济性等方面。\n' + '5)第三段写评估主要结论,明确项目可行性判断依据。\n' + '6)第四段写评估意见与建议数量及落实情况,说明可研编制单位是否逐项答复、修改并采纳。\n' + '7)最后单独写“后评价认为:……”总结性结论,评价前评估结论的客观性、公正性,以及其对后续设计、建设实施和投产运行的支撑作用。\n' + '8)证据不足处写“待补充”,不得编造评估单位资质、评审时间、意见数量或落实结论。', + '2.4': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:2.4.1、2.4.2、2.4.3、2.4.4。', + '2.4.1': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.4.1 设计单位资质及选择方式评价"。\n' + '2)标题后仅输出一段连续文字,说明初步设计承担单位及其分工关系(如主体装置设计与配套公用工程设计分工)。\n' + '3)该段至少应包含:各设计单位全称(及简称,如有)、主要业务范围或专业特长、资质等级或资质类型、与项目匹配性评价。\n' + '4)结尾需明确给出资质与选择方式评价结论(如“均具有与承担项目相适应的设计资质,符合资质要求”或同义表述)。\n' + '5)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造设计单位资质、分工内容或结论。', + '2.4.2': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.4.2 初步设计进度评价"。\n' + 
'2)标题后仅输出一段连续文字,按时间顺序写明初步设计关键节点,至少包括开始时间、完成时间、审查完成时间、批复时间。\n' + '3)该段结尾必须明确判断“初步设计进度满足合同和项目总体部署的工期要求”或同义评价。\n' + '4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造时间节点、审查时间或进度评价结论。', + '2.4.3': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.4.3 初步设计质量评价"。\n' + '2)标题后仅输出一段连续文字,结合前期工作成效、要素评价内容和采取的设计手段,评价初步设计质量。\n' + '3)该段至少应覆盖:设计内容完整性、设计深度、技术水平、与相关规范/规定的符合性,并明确是否满足要求。\n' + '4)可结合优化设计、细化深化、可实施性和运行验证等信息支撑评价结论。\n' + '5)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造设计质量结论、设计手段或规范符合性。', + '2.4.4': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.4.4 初步设计审查工作评价"(与 2.4.3 等同级小节相同:纯文本编号标题,不得使用“###”等 Markdown 标题语法)。\n' + '2)标题须单独成行,正文另起一行;禁止标题与首段正文粘在同一行(如「…评价2017年12月……」)。\n' + '3)标题后先输出一段连续文字,写明初步设计审查工作的时间、组织单位、审查专业分组、意见数量、设计单位整改落实情况,以及未采纳意见数量与说明上报情况(如有)。\n' + '4)随后单独写“后评价认为:……”总结性结论,评价审查意见的客观性、公正性、指导作用,并说明未采纳少量意见的合理性判断(如有证据)。\n' + '5)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造审查时间、意见数量、采纳落实情况或结论。', + '2.5': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.5 前期决策程序评价"。\n' + '2)标题后仅输出一段连续文字,从可行性研究、初步设计等环节说明项目是否严格按国家基本建设程序运作。\n' + '3)该段应明确写出可研批复前专项评价/专项报告(如环境评价、职业病危害预评价、安全预评价等)的完成与批复情况,并给出“符合项目建设程序规定”或同义结论。\n' + '4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造批复时间、专项名称或程序合规结论。', + '2.6': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"2.6 前期工作评价结论"。\n' + '2)标题后仅输出两段连续文字,不得拆分为条目、小标题或“主要结论/主要问题/改进建议”三段结构。\n' + '3)第一段应围绕前期工作总体过程与关键决策展开,至少覆盖:项目任务背景(如国Ⅵ质量升级目标)、可研研究结论与方案选择、工艺技术比选与确定、支持性报告与可研批复情况、初步设计阶段方案优化及其合理性。\n' + '4)第二段应给出总体程序与合规性结论,明确前期工作是否执行国家和集团基本建设制度、是否按基本建设程序运作、依据是否充分、决策程序是否合规。\n' + '5)证据不足处写“待补充”,不得编造工艺路线、审批流程、批复结论或合规性判断。', + '3': '按章节标题自然组织内容,围绕证据包先事实后结论,缺失项写“待补充”。', + '3.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.1 工程建设管理模式评价"。\n' + '2)随后固定输出小标题“3.1.1管理模式”,并在该小标题下用连续文字说明项目管理模式(如“业主+监理+EPC”)、项目经理负责制、组织架构设置、职责分工及工程建设到投产试运行衔接效果。\n' + '3)再固定输出小标题“3.1.2管理效果”,并在该小标题下先写一段总体管理成效描述,再按顺序固定输出“1)加强设计、施工、采购管理,确保工程质量”“2)项目建设安全管理全面受控”“3)进度控制存在一定不足”三项内容。\n' + '4)上述三项中,前两项重点写质量管理、安全/HSE管理、体系运行与控制成效;第3项需如实写进度偏差、滞后原因及对目标进度的影响,不得回避问题。\n' + '5)不得输出与本节无关的小标题;证据不足处写“待补充”,不得编造管理模式、事故指标、验收合格率、进度偏差或原因。', + '3.10': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.10 工程竣工验收评价"。\n' 
+ '2)随后必须按顺序固定输出以下六个小标题并分别展开:\n' + ' 1)消防验收\n' + ' 2)环境保护验收\n' + ' 3)安全设施验收\n' + ' 4)职业病防护设施验收\n' + ' 5)档案验收\n' + ' 6)竣工决算审计\n' + '3)每个小标题下应写明组织方式(政府/企业自行组织)、时间节点、验收或审计结论(是否通过/同意投入使用)。\n' + '4)六个小标题后,必须再写一段总体情况说明,概述专项验收完成情况、竣工验收是否已完成、未完成原因及计划安排(如有)。\n' + '5)最后单独写“后评价认为:……”总结性结论,明确对专项验收组织情况、竣工验收进度滞后问题及改进方向的评价。\n' + '6)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”,不得编造验收时间、验收结论、审计结论、未完成原因或计划节点。', + '3.11': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.11 建设实施评价结论"。\n' + '2)标题后先写一段总体结论,概述建设管理模式、任务完成情况、质量/HSE/投资/目标达成情况。\n' + '3)随后固定输出小标题“3.11.1经验和好的做法”,并按顺序固定输出三项:\n' + ' 1)选用适宜管理模式,保证项目顺利实施\n' + ' 2)加强项目过程控制,高质量完成项目实施\n' + ' 3)生产人员提前介入,实现工程建设和投产的有效衔接\n' + '4)再固定输出小标题“3.11.2存在问题”,并按顺序固定输出两项:\n' + ' 1)施工图设计不优化,存在浪费和安全隐患\n' + ' 2)主体装置和配套单元建设不同步,进度控制待加强\n' + '5)每个分项下均应写连续文字,既要有事实依据,也要有评价判断;不得空写标题。\n' + '6)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”,不得编造合格率、工期差值、问题数量、HSE结论或投资达成情况。', + '3.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.2 招投标评价"。\n' + '2)标题后先输出连续文字,说明项目招投标总体执行情况,至少覆盖EPC总承包、监理承包商、无损检测承包商、工程质量监督单位等确定方式(招标/非招标/谈判)及合规性。\n' + '3)对于非招标确定的单位,应写明资质能力、选择理由及上报审批手续履行情况。\n' + '4)随后必须输出表题“表3-1 项目承包单位情况”,并紧跟表格。\n' + '5)表后单独写“后评价认为:……”总结性结论,明确招投标程序是否符合法律法规及公司管理制度,并如实评价合同金额与批复概算关系等不足。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表3-1 项目承包单位情况”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、单元名称、承包单位、(合同金额)(万元)、是/否招标、资质情况。\n' + '4)表3-1不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得拆分为与本节无关的小标题;证据不足处写“待补充”;不得编造承包单位名称、合同金额、招标方式、资质情况或审批结论。', + '3.3': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:3.3.1、3.3.2、3.3.3、3.3.4。', + '3.3.1': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.3.1 与批复后初步设计符合性评价"。\n' + '2)标题后仅输出一段连续文字,围绕施工图设计与批复后初步设计在范围、内容、规模方面的一致性进行评价。\n' + '3)该段应至少包含:施工图设计与初步设计一致性结论、建设单位技术人员在前期与施工图阶段的参与情况、设计联络/图纸审查/交底会审等过程控制对符合性的保障作用。\n' + '4)结尾需明确给出符合性判断(如“与初步设计一致,符合性较好”或同义表述)。\n' + 
'5)正文禁止出现任何形式的表格交叉引用:不得写“详见表…”“参见表…”“见表…”“如表…所示”等,亦不得出现表3-2、表3-3、表3-4、表2-7等表号;相关进度与变更数据只在3.3.2、3.3.4以表呈现。\n' + '6)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造一致性结论、参与过程或审查工作。', + '3.3.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.3.2 设计进度评价"。\n' + '2)标题后先输出一段连续文字,说明设计管理模式(如以设计为龙头的EPC总承包)、主体装置与公用工程设计分工、总承包单位对设计-采购-施工衔接的进度协调机制、分批次按节点出图情况以及总体进度满足性判断。\n' + '3)该段中应体现施工图设计时间区间(如有证据),并以“施工图设计进度情况见表3-2”或同义句引出表格。\n' + '4)随后必须输出表题“表3-2 施工图设计进度情况”,并紧跟表格。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表3-2 施工图设计进度情况”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、项目、设计单位、合同期限、实际执行情况、备注。\n' + '4)表3-2不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得拆分为与本节无关的小标题;证据不足处写“待补充”;不得编造设计分工、进度节点、合同期限或执行情况。', + '3.3.3': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.3.3 施工图设计水平及质量评价"。\n' + '2)标题后先写一段连续文字,说明建设单位设计管理与技术人员参与情况(前期方案比选、优化、总图与公用工程依托、设计联络、图纸审查、交底与会审等),并评价设计单位对可研评估与基础设计审查意见的采纳情况及设计满足性。\n' + '3)再写一段连续文字,客观描述后评价现场发现的设计问题与不足(如管廊/框架载荷考虑不足、设计保守、投资浪费、通行受阻或碰头等安全隐患),不得回避负面问题。\n' + '4)最后单独写“后评价认为:……”总结性结论,明确设计水平和质量总体判断,既要写成效,也要写不足及其影响。\n' + '5)本节为纯文字评价,禁止输出任何表题行或表号清单(不得以独立行/列表重复“表3-2…”“表3-3…”“表3-4…”“表2-7…”等);禁止表格交叉引用。\n' + '6)不得拆分为额外条目或无关小标题;证据不足处写“待补充”,不得编造现场问题、隐患结论或投资浪费判断。', + '3.3.4': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.3.4 施工图设计变更管理评价"。\n' + '2)标题后先输出一段连续文字,说明设计变更管理机制与控制措施,至少覆盖EPC统筹协调、初设审查意见采纳、施工图会审、三维配管设计应用、现场设计代表配合等对减少变更的作用。\n' + '3)随后写一段连续文字,明确设计变更总量、变更费用、费用占比,并以“见表3-3~表3-5”或同义表达引出表格。\n' + '4)随后必须依次输出:\n' + ' - 表3-3 施工图设计变更情况(全厂性项目)\n' + ' - 表3-4 施工图设计变更情况(单装置项目)\n' + ' - 表3-5 影响投资或工期重(较)大设计变更及原因分析\n' + ' 每个表题后必须紧跟对应表格。\n' + '5)最后单独写“后评价认为:……”总结性结论,明确施工图设计变更管理总体评价,并对是否存在因设计原因导致的重大投资/工期影响作出判断。\n' + '\n' + '【表格强制要求】\n' + '1)三张表必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表3-3”“表3-4”“表3-5”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表3-3字段必须覆盖:序号、单元名称、设计变更(份数)、设计变更金额(万元)、备注(含合计行)。\n' + 
'4)表3-4字段必须覆盖:序号、专业、设计变更(份数)、设计变更金额(万元)、备注(含合计行)。\n' + '5)表3-5字段必须覆盖:序号、单元名称、变更内容、金额(万元)、原因、备注。\n' + '6)三张表均不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得拆分为与本节无关的小标题;证据不足处写“待补充”;不得编造变更份数、费用金额、占比、重大变更结论或原因分析。', + '3.4': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:3.4.1、3.4.2。', + '3.4.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.4.1 施工准备评价"。\n' + '2)标题后先写一段总述,说明项目开工前施工准备工作总体情况(如招投标组织、承包商选择、项目部成立、人员配备、施工物资准备等)。\n' + '3)随后必须按顺序固定输出以下六个小标题并分别展开:\n' + ' 1)项目部成立、人员配备\n' + ' 2)完成总体部署并获得批复\n' + ' 3)开工报告批准\n' + ' 4)“四通一平”工作完成\n' + ' 5)EPC总承包管理组织成立\n' + ' 6)资金已准备到位\n' + '4)每个小标题下均应写对应事实与完成情况,涉及时间节点或批复信息时应按证据给出。\n' + '5)末尾再写一段总结性文字,明确项目是否满足工程施工准备基本条件。\n' + '6)本节为施工准备文字评价,禁止出现任何表格交叉引用与表题清单:不得写“详见表…”“参见表…”“见表…”“如表…所示”,不得单独成行输出“表3-2/表3-3/表3-4/表2-7”等表号或表题。\n' + '7)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”,不得编造批复时间、资金到位情况、组织机构或准备完成结论。', + '3.4.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.4.2 施工计划的执行情况"。\n' + '2)标题后先写一段连续文字,简要说明工程建设进度控制目标与实际执行情况。\n' + '3)随后必须输出表题“表3-6 施工进度情况”,并紧跟表格。\n' + '4)表后再写一段简要评价:如工程进度有较大变化,必须分析原因;如无明显偏差,应明确说明总体执行情况。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表3-6 施工进度情况”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、项目、施工单位、合同期限、实际执行情况、备注。\n' + '4)表3-6不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得拆分为与本节无关的小标题;证据不足处写“待补充”;不得编造施工单位、合同期限、执行进度或偏差原因。', + '3.5': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.5 采购工作评价"。\n' + '2)随后固定输出小标题“1)采购工作情况介绍”,并在该小标题下写连续文字,至少包括:采购分工(甲供/乙供或建设单位采购与EPC采购边界)、采购合同数量与金额、供应商审查与选商把关、入厂检验与质量控制、采购进度控制措施及关键设备采购完成节点。\n' + '3)在采购进度或关键设备采购描述后,必须以“主要设备及大型机组的采购计划见表3-7”或同义句引出表格。\n' + '4)随后必须输出表题“表3-7 采购工作情况”,并紧跟表格。\n' + '5)表格后必须保留注释:\n' + ' 注:1.采购工作评价指甲供主要材料、设备的评价;\n' + ' 2.应招标数量指合同数量,应招标金额指合同金额。\n' + '6)最后单独写“后评价认为:……”总结性结论,明确采购质量控制效果、交货进度与对总体建设部署的支撑情况。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表3-7 
采购工作情况”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、物资(类别)名称、采购方式、制造商、供货商、金额(万元)、未招标原因;并包含金额分项(单位、数量、单价、小计)以及“应招标数量/招标数量率”“应招标金额/招标金额率”等统计行。\n' + '4)表3-7不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得拆分为与本节无关的小标题;证据不足处写“待补充”;不得编造采购合同数量、金额、完成节点、检验合格率、招标率或未招标原因。', + '3.6': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.6 工程监理评价"。\n' + '2)标题后先输出一段连续文字,说明监理单位名称、资质等级、承担能力、监理组织配置(如总监、专业监理、资料员等)及人员配备是否满足合同与现场需求。\n' + '3)再输出一段连续文字,评价监理单位在进度、质量、安全、投资控制等方面的措施与执行效果;可结合监理通知单、暂停令、问题整改闭环、安全事故率等事实进行支撑。\n' + '4)结尾需明确给出总体评价结论,说明监理工作对建设目标实现的作用。\n' + '5)不得拆分为条目或无关小标题;证据不足处写“待补充”,不得编造监理人数、通知单数量、暂停令数量、事故率或控制成效。', + '3.7': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.7 工程质量评价"。\n' + '2)随后固定输出小标题“1)工程质量责任主体评价”,并按顺序展开:\n' + ' (1)明确责任主体\n' + ' (2)确保质量保证体系建立及有效运行\n' + ' (3)制定质量计划\n' + ' (4)质量保证措施\n' + ' (5)监理承包商、质量检测单位的质量管理\n' + '3)再固定输出小标题“2)工程质量管控过程评价”,并按顺序展开设备安装、管道安装、建筑工程、钢结构/防腐保温/电气仪表、“三查四定和中间交接验收”等过程质量管控情况,需体现质监点、监督记录、通知书及整改闭环情况。\n' + '4)再固定输出小标题“3)工程质量验收结果评价”,明确质量控制点、单位工程、分项工程合格率及投料试车结果等验收结论。\n' + '5)最后单独写“后评价认为:……”总结性结论,明确工程质量责任落实、体系运行、过程整改与质量目标达成情况。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造合格率、通知书数量、问题项数量、质监记录份数或质量事故结论。', + '3.8': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.8 HSE管理评价"。\n' + '2)随后固定输出小标题“1)建立HSE管理组织,强化责任落实”,并在该小标题下结合证据写具体做法与执行效果。\n' + '3)再固定输出小标题“2)落实HSE管理制度,强化日常安全监督检查”,并在该小标题下结合证据写具体做法与执行效果。\n' + '4)再固定输出小标题“3)加强承包商管理,提升安全管理意识”,并在该小标题下结合证据写具体做法与执行效果。\n' + '5)再固定输出小标题“4)强化HSE过程管控,确保全过程受控”,并在该小标题下结合证据写具体做法与执行效果。\n' + '6)各小标题下应结合证据写具体做法与执行效果,可覆盖HSE组织设置、岗位责任、属地管理、制度修订、监督检查、教育培训、JSA活动、作业许可、隐患排查、违约处理等内容。\n' + '7)结尾需单独写一段总体成效结论,明确是否实现HSE目标(如“零事故、零伤害、零污染”)及安全工时等结果(如有证据)。\n' + '\n' + '【写作约束】\n' + '四个固定小节标题的输出体例须与上一节“3.7 工程质量评价”中“1)工程质量责任主体评价”“2)工程质量管控过程评价”“3)工程质量验收结果评价”等小节标题保持一致:单独成行;行首两个全角空格缩进;纯文本编号体,与正文同字号、不得整行加粗;禁止使用“##”“###”以及成对的“**…**”等 Markdown 标题或强调语法把小节标题做成副标题样式。\n' + '不得新增无关小标题或调整上述顺序;证据不足处写“待补充”,不得编造培训次数/人数、安全工时、事故结论或HSE目标达成情况。', + '3.9': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"3.9 三查四定及中间交接"。\n' + 
'2)标题后仅输出一段连续文字,说明“三查四定”和中间交接验收监督检查情况。\n' + '3)该段应至少包含:组织时间(如有)、牵头单位与参与专业、问题发现数量、承包商整改销项机制、监理与业主联合复核确认情况。\n' + '4)不得拆分为条目或无关小标题;证据不足处写“待补充”,不得编造问题数量、整改闭环结论或参与单位。', + '4': '按章节标题自然组织内容,围绕证据包先事实后结论,缺失项写“待补充”。', + '4.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.1 生产准备评价"。\n' + '2)随后必须按顺序固定输出以下五个小标题并分别展开:\n' + ' 4.1.1投产管理组织机构成立、生产人员配备情况\n' + ' 4.1.2生产人员培训\n' + ' 4.1.3岗位规章制度、操作规程、事故应急预案情况\n' + ' 4.1.4原料、燃料、动力的供给情况\n' + ' 4.1.5环保、消防、安全及职业卫生等方面的批准文件\n' + '3)在4.1.1中应写明生产准备组织设置、人员配置、职责分工及是否满足投产需求;可体现“三查四定”、中交验收、开工准备、HSE管理、试生产审批等职责。\n' + '4)在4.1.2中应按阶段写培训安排、外部学习/现场培训、考核与持证上岗情况,并给出能力达标结论。\n' + '5)在4.1.3中应写明试车方案、操作规程、制度体系和应急预案的建立与执行情况,明确其对试车与运行的支撑作用。\n' + '6)在4.1.4中应分“原料/辅助材料供给”和“公用工程供应”描述,明确供给来源、保障措施及满足性判断。\n' + '7)在4.1.5中应列出环保、消防、安全、职业卫生等批准/审查文件及关键信息(如文号、时间),证据不足处写“待补充”。\n' + '8)最后单独写“后评价认为:……”总结性结论,明确生产准备条件是否充分可靠、是否为投产试运行奠定基础。\n' + '9)不得新增无关小标题或调整上述顺序;不得编造人员数量、培训批次、文号时间、物料来源、公用工程供给或准备充分性结论。', + '4.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.2 联合试运与试生产情况评价"。\n' + '2)随后必须按顺序固定输出以下三个小标题并分别展开:\n' + ' 4.2.1总体试车安排\n' + ' 4.2.2投料试车情况\n' + ' 4.2.3出现的问题及解决措施\n' + '3)在4.2.1中应写明试车组织机制、联动试车安排、投料组织逻辑、物料供给方式和产品质量控制要求。\n' + '4)在4.2.2中应按阶段描述试车过程(如三查四定/单机与中交、联动试车、投料试车),并分别写烷基化装置与废酸再生单元的关键时间节点和结果。\n' + '5)在4.2.3中应按问题-措施闭环方式描述问题,至少覆盖问题现象、原因或适配不足、整改措施与整改结果;不得只写问题不写处置。\n' + '6)最后单独写“后评价认为:……”总结性结论,明确联合试运与试生产组织成效、一次投产情况、问题责任归属及改进判断。\n' + '7)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造试车时间节点、产品合格结论、问题数量、责任归属或整改效果。', + '4.3': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:4.3.1、4.3.2、4.3.3、4.3.4、4.3.5、4.3.6。', + '4.3.1': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.3.1 原料供应评价"。\n' + '2)标题后先写一段连续文字,说明装置原料来源(如上游MTBE、加氢裂化、轻烃回收等)及在上游正常生产情况下的供应稳定性与对装置平稳运行的支撑作用。\n' + '3)再写一段连续文字,分析原料组成与产量控制关系(如碳四烯烃与异丁烷配比、异丁烷补充来源、关键控制因素),并结合装置负荷率与未满负荷原因进行评价。\n' + '4)不得拆分为条目或无关小标题;证据不足处写“待补充”,不得编造来源装置、负荷率、原料组成变化或原因判断。', + '4.3.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.3.2 生产运行总体情况评价"。\n' + '2)随后必须按顺序固定输出以下三个小标题并分别展开:\n' + ' 1)运行负荷逐步提高\n' + ' 2)精细管理,合理扩大原料来源\n' + ' 
3)优化运行,装置能耗逐步降低\n' + '3)在“1)运行负荷逐步提高”中应写明投产以来装置运行天数、产量、负荷率及关键产品质量指标达成情况(如有证据)。\n' + '4)在“2)精细管理,合理扩大原料来源”中应写明原料受限背景、工艺优化措施(如引入替代原料)、试用过程中的问题与调整、以及对产量和创效能力的影响。\n' + '5)在“3)优化运行,装置能耗逐步降低”中应写明能耗优化措施(如酸烃比、异丁烷循环量、分馏压力、压缩机运行优化等)及优化前后能耗变化结果。\n' + '6)随后必须输出“后评价认为:……”总结性结论,明确装置运行负荷、原料优化与能耗改善的总体评价。\n' + '7)最后必须输出表题“表4-1 投产以来运行周期统计表”,并紧跟表格。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表4-1 投产以来运行周期统计表”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、装置名称、本周期开工日期、本周期运行时间(天)、本周期非计划停工次数(次)、本周期非计划停工时数(时)、原因简要分析。\n' + '4)表4-1不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造负荷率、产量、原料加工量、增产量、能耗值或优化效果。', + '4.3.3': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.3.3 达标评价"。\n' + '2)随后必须按顺序固定输出以下三个小标题并分别展开:\n' + ' 1) 标定工作评价\n' + ' 2) 主要装置达标评价\n' + ' 3) 全厂达标评价\n' + '3)在“1) 标定工作评价”中应写明标定组织方式、方案审定流程、参与部门、标定时间及是否按规范执行。\n' + '4)在“2) 主要装置达标评价”中应基于设计值、标定值、实际值进行对比评价;如存在较大变化,必须分析原因。\n' + '5)在“2) 主要装置达标评价”内容后,必须输出表题“表4-2 烷基化装置运行分析表(考核时间:×年×月×日)”,并紧跟表格。\n' + '6)在“3) 全厂达标评价”中应针对全厂性项目,使用全年实际运行数据与设计值对比,评价全厂达标情况;如差距较大,应分析原因。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表4-2 烷基化装置运行分析表(考核时间:×年×月×日)”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、项目、单位、设计值、标定值、实际值、备注。\n' + '4)表内行项目应覆盖生产能力、主要原材料、主要产品产量、公用工程消耗(水/蒸汽/电/燃料气)、综合能耗、现金加工成本、单位毛利等(按要素管理表格直出,可增减)。\n' + '5)表后不输出任何模板性注释(如“注:表中内容可根据项目不同进行增减”等套话),仅保留要素管理中有实质内容的原始注释。\n' + '6)本节要求的表4-2不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '7)表4-2 在全节仅允许出现一次:必须在「2) 主要装置达标评价」对应文字之后输出表题与表格;不得在「3) 全厂达标评价」中再次输出表4-2,也不得重复输出同一张烷基化装置运行分析数据表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造标定时间、达标结论、对比差异或原因分析。', + '4.3.4': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.3.4 生产工艺技术评价"。\n' + '2)随后必须按顺序固定输出两个小标题并分别展开:\n' + ' 1)烷基化工艺技术\n' + ' 2)废酸再生工艺技术\n' + 
'3)在“1)烷基化工艺技术”中应写明工艺路线、关键设备/关键机理、运行特征(如选择性、酸耗、可靠性、操作灵活性)及后评价对技术适配性的判断;可包含与同类工艺的横向比较(有证据时)。\n' + '4)在“2)废酸再生工艺技术”中应写明工艺来源与核心指标(如硫回收率、热回收、尾气排放、产品硫酸浓度等),并如实评价其与常规工艺相比的优缺点(如流程复杂度、运行复杂度、投资水平等)。\n' + '5)两部分均需体现“技术优势 + 局限性/代价”的平衡评价,不得只写优点不写不足。\n' + '6)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造工艺来源、技术参数、比较结论或投资高低判断。', + '4.3.5': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.3.5 设备运行评价"。\n' + '2)标题后先写一段总体设备运行情况,至少包括设备总量及分类(如反应器、塔类、容器、换热、压缩机、空冷器、机泵等)和投产至后评价时点的总体运行结论。\n' + '3)随后按顺序固定输出两个小标题并分别展开:\n' + ' (1)静设备方面\n' + ' (2)动设备方面\n' + '4)在“(1)静设备方面”中应基于运行参数与设计参数对比,评价塔器/容器等静设备工况与长周期运行适应性。\n' + '5)在“(2)动设备方面”中应写明压缩机、机泵、阀门等关键动设备运行情况,以及开工初期问题、整改措施与整改结果(如材质更换、新增泵、安装方式调整、阀杆修复等)。\n' + '6)结尾应给出总体设备运行评价(如运行平稳、调节正常、满足长周期/安全运行要求等),若存在未完全满足项应如实写明。\n' + '7)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造设备数量、故障问题、整改措施或运行结论。', + '4.3.6': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.3.6 公用工程及辅助设施合理性评价"。\n' + '2)标题后先写一段连续文字,结合实际生产及标定结果,验证公用工程及辅助设施是否满足生产运行需要。\n' + '3)若存在问题,必须再写一段连续文字,明确问题表现、影响及整改措施或优化建议;若无明显问题,应明确说明总体满足性结论。\n' + '4)不得拆分为无关小标题或条目;证据不足处写“待补充”,不得编造标定结果、公用工程能力、设施问题或整改建议。', + '4.4': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"4.4 生产运行评价结论"。\n' + '2)标题后先写两段总体结论:\n' + ' - 第一段聚焦生产准备、试车投产、设备与公用工程保障、达标结果及预期目标实现情况;\n' + ' - 第二段聚焦投产以来原料供应、负荷变化、能耗变化、优化运行与创效提升情况。\n' + '3)随后固定输出小标题“1)主要经验”,并按顺序固定输出两项:\n' + ' (1)生产运行精细管理、合理优化,实现提质增效\n' + ' (2)结合生产实际情况优化运行,有效降低装置能耗\n' + '4)再固定输出小标题“2)存在问题”,并按顺序固定输出两项:\n' + ' (1)考核标定不规范\n' + ' (2)装置运行酸耗偏高\n' + '5)各分项下均应写连续文字,明确事实、原因和影响,不得只列标题。\n' + '6)不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造负荷率、能耗值、酸耗值、标定时间或同类对标结论。', + '5': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出章标题:"5 投资与经济效益评价"(仅此一行;与第1章「1 项目概况」同一编号体例,勿写「第5章」;勿使用 Markdown「#」前缀)。\n' + '2)标题后空一行,再写一段连续文字(2~4句),概括本章将依次从主要经济指标实现程度、投资与执行情况、经济效益、不确定性分析到本章结论展开;仅可使用证据包,缺失处写“待补充”。\n' + '3)不得输出 5.1~5.5 各小节正文或表格(各小节由后续章节单独生成)。\n' + '4)不得新增其他小标题或条目。', + '5.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.1 主要经济指标实现程度评价"。\n' + '2)标题后先写一段连续文字,明确后评价时点(如2020年12月31日)下项目效益测算结果与可研报告对比情况。\n' + '3)该段应至少写出税后财务内部收益率(后评价值与可研值)及差值判断,并给出“项目效益目标实现程度较好”或同义结论。\n' + 
'4)段末需以“见表5-1”或同义表达引出表格。\n' + '5)随后必须输出表题“表5-1 主要经济指标对比表”,并紧跟表格。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表5-1 主要经济指标对比表”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、项目名称、单位、可研值、后评价值、差值、比例(%)、备注,并保留“(1)/(2)/(2)-(1)”等分栏信息(如要素管理中存在)。\n' + '4)行项目应覆盖:项目报批总投资(含建设投资、建设期利息、铺底流动资金)、年均营业收入、年均总成本费用、年均流转税金及附加、年均利润总额、年均所得税金、年均税后利润、项目投资内部收益率、项目投资财务净现值、项目静态投资回收期。\n' + '5)表5-1不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题;证据不足处写“待补充”;不得编造IRR、净现值、回收期、投资金额、差值或比例。', + '5.2': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:5.2.1、5.2.2、5.2.3、5.2.4。', + '5.2.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.2.1 投资控制及变动原因分析"。\n' + '2)标题后先写一段连续文字,说明可研批复投资、初设批复概算、竣工决算审计投资三者对比,明确与可研/初设相比的增减额及比例,并给出投资控制评价判断。\n' + '3)再写一段连续文字,说明投资变动主要原因分析及调概推进情况(如有证据)。\n' + '4)随后必须依次输出:\n' + ' - 表5-2 投资变动情况表(单位:万元、万美元)\n' + ' - 表5-3 工程费用变动情况表(万元、万美元)\n' + ' 每个表题后必须紧跟对应表格。\n' + '5)表后应补充工程费用及其他费用、建设期利息等变动原因分析(有证据时),并说明超支/结余的主要因素。\n' + '\n' + '【表格强制要求】\n' + '1)两张表必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表5-2”“表5-3”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表5-2应保留投资估算/初设概算/竣工决算及“决算与估算比较”“决算与概算比较”的差额、比例分栏结构,并保留批准单位/批准文号等信息行(如有)。\n' + '4)表5-3应保留工程费用分解结构(设备购置费、安装工程费、建筑工程费等)及比较分栏结构,并保留“其中:外汇”等信息行(如有)。\n' + '5)表5-2、表5-3均不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题;证据不足处写“待补充”;不得编造批复文号、投资金额、增减比例、超支结余原因或调概进展。', + '5.2.2': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.2.2 投资水平分析"。\n' + '2)标题后仅输出一段连续文字,围绕单位加工能力投资(或单位投资)开展对标分析。\n' + '3)该段至少应包含:本项目烷基化装置单位加工能力投资值、与可比装置(如吉林石化、锦州石化、兰州石化等)对比结果、偏高/偏低判断;如有废酸单元投资数据,也应给出单位投资值及高低判断。\n' + '4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造单位投资数值、对标对象或高低结论。', + '5.2.3': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.2.3 资金来源及到位评价"。\n' + '2)标题后仅输出一段连续文字,说明可研阶段资金来源结构(如企业自筹与债务资金比例)与实际建设期资金来源差异。\n' + '3)该段应写明投资计划下达金额与时间范围、资金到位及时性、资金使用合规性,并明确是否存在转移、侵占、挪用或损失浪费问题。\n' + 
'4)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造资金比例、计划金额、到位情况或合规性结论。', + '5.2.4': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.2.4 投资控制的经验和教训"。\n' + '2)标题后仅输出一段连续文字,结合初步设计概算批复值与竣工决算审计值的差异,归纳投资控制方面的经验和教训。\n' + '3)该段应至少体现:超概金额与比例、主要影响因素(如漏项、单方造价偏低、主材费偏低、工程量增加、材料设备涨价、设计变更、现场签证等)以及对前期工作和投资控制能力的反思。\n' + '4)结尾需提出改进方向(如梳理流程、强化费用控制、合理确定并有效控制投资)。\n' + '5)不得拆分为条目或小标题;证据不足处写“待补充”,不得编造超概金额比例、影响因素或改进结论。', + '5.3': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:5.3.1、5.3.2。', + '5.3.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.3.1 项目投产以来生产经营及效益状况"。\n' + '2)标题后先写投产以来各年度(如2019、2020)的运行天数、加工量、主要产品产量、平均负荷率、产品率、税后利润等核心经营数据对比描述。\n' + '3)随后必须输出表题“表5-4 生产经营及效益情况对比表”,并紧跟表格。\n' + '4)表后必须按顺序固定输出三段分析:\n' + ' 1)营业收入变动分析\n' + ' 2)总成本费用变动分析\n' + ' 3)税后利润变动分析\n' + '5)三段分析中应体现与可研值的对比关系,说明增减幅度及主要原因。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表5-4 生产经营及效益情况对比表”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段应覆盖项目、单位、分年度可研报告值/实际值/增减(%)对比结构。\n' + '4)行项目应覆盖运行情况、主要原料价格、主要产品价格、主要产品年产量、主要产品年销售量、主要原料和公用工程消耗量、主要经济指标(营业收入、成本费用、利润总额、税后利润)等(按要素管理表格直出,可增减)。\n' + '5)表5-4不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造运行天数、负荷率、收入成本、利润或变动原因。', + '5.3.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.3.2 项目经济效益后评价"。\n' + '2)随后必须按顺序固定输出以下六个小标题并分别展开:\n' + ' 1)计算范围及评价方法\n' + ' 2)主要生产经营指标\n' + ' 3)总成本费用\n' + ' 4)营业收入\n' + ' 5)销售税金及附加\n' + ' 6)盈利能力分析\n' + '3)在“1)计算范围及评价方法”中应写明与可研一致的评价口径、增量法、评价期、基准收益率、实绩与预测口径、价格体系分段假设等关键参数。\n' + '4)在“2)主要生产经营指标”中必须以“见表5-5”或同义表达引出并输出“表5-5 主要生产经营指标”。\n' + '5)在“3)总成本费用”中应按分项说明测算口径(原料产品价格、工资福利、折旧、修理费、其他制造费、摊销、安全生产费、安保基金、财务费用、营业费用等)。\n' + '6)在“4)营业收入”中应说明收入计算逻辑、价格选取与贴水假设,并给出评价期关键结果(有证据时)。\n' + '7)在“5)销售税金及附加”中应说明消费税、增值税、城建税、教育费附加等计税依据与税率口径。\n' + '8)在“6)盈利能力分析”中应至少包含IRR、NPV、投资回收期与可研对比结论,并以“填写表5-6”或同义表达引出并输出“表5-6 不同因素变化对项目内部收益率的影响”。\n' + '9)在“6)盈利能力分析”末尾需补一句:后评价财务报表见附表3~附表8(如有证据),不得遗漏;该句仅为文字引用,本节不得输出附表3~附表8任一张的 Markdown 表格(附表由全书「附表」区汇总)。\n' 
+ '\n' + '【表格强制要求】\n' + '1)表5-5、表5-6必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表5-5”“表5-6”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表5-5应保留“后评价时点前实际值/后评价时点后预测值”的分年列结构;表5-6应保留“财务内部收益率、变化幅度、占比”列结构。\n' + '4)表5-5、表5-6均不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题或调整上述顺序;证据不足处写“待补充”;不得编造价格体系、税率、负荷率、IRR/NPV/回收期、附表引用或敏感性分析结论;除表5-5、表5-6外不得输出任何「附表」类 Markdown 表(尤其禁止附表8)。', + '5.4': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.4 不确定性分析"。\n' + '2)标题后先写一段连续文字,说明后评价时影响项目财务效益的主要不确定因素(至少包括生产负荷、产品价格、原材料价格等)。\n' + '3)再写一段连续文字,说明定量分析方法:以项目投资财务内部收益率达到基准收益率为约束,计算相关因素的临界点(%)或临界值。\n' + '4)随后必须输出表题“表5-7 内部收益率为基准收益率时不确定因素临界点或临界值”,并紧跟表格。\n' + '5)表后再写一段连续文字,开展定性分析:判断不确定因素可能变化趋势、潜在风险及风险应对对策(减少风险/规避风险)。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表5-7 内部收益率为基准收益率时不确定因素临界点或临界值”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段必须覆盖:序号、项目、单位、数值、备注;行项目至少覆盖生产负荷、产品价格、主要原材料价格、其它。\n' + '4)表5-7不得省略;如要素管理中未命中对应表格,也必须按模板字段输出占位表。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题;证据不足处写“待补充”;不得编造临界点、临界值、变化趋势或风险对策。', + '5.5': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"5.5 投资与经济效益评价结论"。\n' + '2)标题后先写一段连续文字,聚焦投资控制结论:至少包含竣工决算与可研估算/初设概算的差异金额与比例、投资控制是否有效、以及与同类项目单位工程费投资水平对比判断。\n' + '3)再写一段连续文字,聚焦经济效益结论:明确评价方法与可研一致,并给出税后财务内部收益率与可研值对比、差值及是否实现预期效益目标。\n' + '4)最后写一段综合结论,说明在加工负荷、价格体系变化条件下盈利能力表现,并分析主要原因(如收率变化、原料与产品价格关系等)。\n' + '5)不得拆分为条目或无关小标题;证据不足处写“待补充”,不得编造投资差异、IRR数值、同类对标结论或原因分析。', + '6': '按章节标题自然组织内容,围绕证据包先事实后结论,缺失项写“待补充”。', + '6.1': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:6.1.1、6.1.2、6.1.3、6.1.4、6.1.5。', + '6.1.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.1.1 环境影响评价"。\n' + '2)标题后先写1段总体评价,内容聚焦:是否落实国家/地方环保法律法规与环评批复要求、环保设施“三同时”落实情况、环保设施是否满足生产需要、污染物总量控制是否在地方指标以内。\n' + '3)固定输出小节标题:"6.1.1.1 环保措施",并在该小节下按顺序固定输出以下5个条目标题与正文:\n' + ' 1)废水处理措施\n' + ' 2)废气处理措施\n' + ' 3)固体废物处理措施\n' + ' 4)噪声处理措施\n' + ' 5)环境风险防范措施\n' + ' 
每个条目均需结合项目实际设施、处理路径、依托系统或管理制度进行描述。\n' + '4)固定输出小节标题:"6.1.1.2 效果及影响",先写1句监测说明,再按顺序固定输出以下3个条目标题与正文:\n' + ' 1)废气监测结果\n' + ' 2)废水监测结果\n' + ' 3)噪声监测结果\n' + ' 各条目需说明监测结论与执行标准符合性(如有标准名称可写明)。\n' + '5)末尾必须以“后评价认为:”起1段结论,综合评价环保措施落实情况、废气废水噪声达标情况、固体废物处置与环境总体影响。\n' + '【写作约束】\n' + '不得新增无关小标题;不得改变上述标题与编号顺序;不得把固定条目合并或拆分;证据不足处写“待补充”,不得编造监测结果、达标结论或法规标准。', + '6.1.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.1.2 安全影响评价"。\n' + '2)标题后先写1段总体评价,内容聚焦:是否符合相关规划要求,设计与施工是否执行国家法律法规、规章及技术标准,配套安全设施是否落实“三同时”(同时设计、同时施工、同时投入生产和使用)。\n' + '3)固定输出小节标题:"6.1.2.1 安全风险因素",写项目生产运行中的主要风险因素,至少覆盖:易燃易爆介质风险、火灾爆炸风险、腐蚀危害、噪声、机械伤害等。\n' + '4)固定输出小节标题:"6.1.2.2 防范措施",先写1句引导语“为将安全风险降到最低,采取的主要防范措施:”,再按顺序固定输出3个条目:\n' + ' 1)设计、施工过程中依法合规执行要求;\n' + ' 2)落实安全评价及安全设施设计专篇措施并落实“三同时”;\n' + ' 3)报警、联锁、快速切断等检测控制仪表与联锁设施投用情况。\n' + '5)固定输出小节标题:"6.1.2.3 效果及影响",写建设期与投产后的安全运行效果,至少包含:是否发生安全事故/人身伤害事故、运行平稳性、对周边单位与居民影响。\n' + '6)末尾必须以“后评价认为:”起1段结论,综合评价危险因素可控性、措施有效性及项目整体安全影响。\n' + '【写作约束】\n' + '不得新增无关小标题;不得改变上述标题与编号顺序;不得把固定条目合并或拆分;证据不足处写“待补充”,不得编造事故情况、联锁投用率或安全结论。', + '6.1.3': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.1.3 科技进步影响"。\n' + '2)标题后写第1段,简要评价项目对科技进步的推动作用,至少覆盖:\n' + ' 对技术开发、技术创新、技术改造、技术引进的作用;\n' + ' 对高新技术产业化、商品化和国际化的作用;\n' + ' 对国家和地方科技进步的推动作用。\n' + '3)紧接着写第2段,从项目应用的引进技术、国产技术、自有技术等方面,总结项目对企业、公司、国家科技发展和技术推广的影响。\n' + '【写作约束】\n' + '全文仅保留标题+两段正文,不得新增小标题或条目;不得改写为多级结构;证据不足处写“待补充”,不得编造技术来源、推广范围或科技成效。', + '6.1.4': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.1.4 项目社会影响评价"。\n' + '2)标题后写第1段,围绕项目社会效益进行评价,至少覆盖:\n' + ' 项目与国家及地方环保优先、油品质量提升政策导向的符合性;\n' + ' 对汽油品质改善、高标号汽油产量提升、区域机动车排放降低与空气质量改善的作用;\n' + ' 对当地及国家经济发展、社会稳定的促进作用。\n' + '3)紧接着写第2段,围绕项目实施后的成效,至少覆盖:\n' + ' 全厂汽油质量是否满足国VI B相关指标要求;\n' + ' 成品油质量升级任务完成情况;\n' + ' 对城市机动车尾气污染改善的环境效益判断。\n' + '【写作约束】\n' + '全文仅保留标题+两段正文,不得新增小标题或条目;不得改写为多级结构;证据不足处写“待补充”,不得编造油品指标、政策符合性或社会环境效益结论。', + '6.1.5': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.1.5 项目影响评价结论"。\n' + '2)标题后写第1段,聚焦风险敏感性与管理要求,至少覆盖:\n' + ' 炼化项目高温高压、易燃易爆/有毒有害介质等风险特征;\n' + ' 项目区位敏感性与事故/环保事件潜在社会关注度;\n' + ' 企业责任意识、危机意识及落实法律法规、主管部门要求、“三同时”和“环保优先”理念的情况。\n' + 
'3)紧接着写第2段,聚焦综合影响结论,至少覆盖:\n' + ' 项目选址与总体规划符合性、清洁生产符合性、污染物达标与总量控制、事故防范与环境风险可接受性;\n' + ' 项目在选址、总图、工艺技术、设备设施方面对国家法规和技术标准的符合性;\n' + ' 项目投产后对油品质量升级(如国VI标准达成)与区域清洁油品供应、机动车尾气污染改善的环境效益。\n' + '【写作约束】\n' + '全文仅保留标题+两段正文,不得新增小标题或条目;不得改写为多级结构;证据不足处写“待补充”,不得编造合规性、风险可接受性、油品标准达成或环境效益结论。', + '6.2': '本节必须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:6.2.1、6.2.2、6.2.3、6.2.4。', + '6.2.1': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.2.1 资源分析"。\n' + '2)标题后仅输出1段连续正文,围绕资源保障与持续性进行分析,至少覆盖:\n' + ' 原料来源构成(如醚后碳四、饱和液化气等);\n' + ' 投产初期负荷与一次加工负荷关联影响;\n' + ' 通过上游优化及原料补充(如醚后碳五)后的负荷提升情况;\n' + ' 后评价时点运行负荷状态及后续原料稳定供应判断;\n' + ' 项目发展持续性结论。\n' + '【写作约束】\n' + '全文仅保留标题+1段正文,不得新增小标题、条目或表格;不得改写为多级结构;证据不足处写“待补充”,不得编造原料来源、负荷数据或持续性结论。', + '6.2.2': '必须严格按以下格式输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.2.2 产品分析"。\n' + '2)标题后写第1段,说明清洁汽油发展趋势与政策标准背景,至少覆盖:低硫、低烯烃、低芳烃、低苯、低蒸汽压方向,以及国VI A/国VI B实施时间与要求。\n' + '3)紧接着写第2段,分析烷基化油的产品属性与价值,至少覆盖:低芳烃/低苯/低烯烃/低硫/低蒸汽压等品质优势、对国VI ' + 'B汽油升级作用、高辛烷值带来的高标号汽油增产价值、以及液化气向高附加值产品转化对资源利用率和经济效益的提升。\n' + '4)最后写第3段,分析区域市场与持续性,至少覆盖:项目区位与市场需求特征、汽油消费占比、就近销售与成本优势、产品销路保障及项目持续性判断。\n' + '【写作约束】\n' + '全文仅保留标题+3段正文,不得新增小标题、条目或表格;不得改写为多级结构;证据不足处写“待补充”,不得编造实施时间、产品指标、市场占比或持续性结论。', + '6.2.3': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"6.2.3 主要技术及经济指标对比"。\n' + '2)标题后先写1段引导性分析文字,说明需通过项目(装置)规模、能耗、投资及成本、工艺技术及设备等与国内外同类项目对比,判断项目运行是否达到同类水平及是否具有竞争优势,并明确写出“主要技术经济指标对比表见表6-1”。\n' + '3)紧接着必须输出表题:"表6-1 装置技术经济指标对比表"。\n' + '4)表6-1必须输出为Markdown表格(优先使用要素管理同名表),至少包含以下列:项目名称、技术来源、规模(万吨/年)、物耗(Wt)%、能耗(kgEo/t)、产品质量、产品收率(Wt)%、排名。\n' + '5)表后不输出任何模板性注释(如"注:可根据项目具体情况增减指标"等套话),仅保留要素管理中有实质内容的原始注释。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表6-1 装置技术经济指标对比表”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表头字段须覆盖合同正文所列列名;若要素管理表结构略有差异,以要素管理为准直出,不得省略整张表。\n' + '4)表6-1不得省略;如要素管理中未命中对应表格,也必须按上述列名输出占位表(单元格可填“待补充”)。\n' + '\n' + '【写作约束】\n' + '不得新增无关小标题;不得省略表题、表格或表注;证据不足处写“待补充”;不得编造对标对象数据、排名结果或竞争优势结论。', + '6.2.4': '必须严格按以下格式输出,不得缺项、不得改名:\n' + 
'1)首行固定输出标题:"6.2.4 项目持续性评价结论"。\n' + '2)标题后写第1段,聚焦市场与产销持续性,至少覆盖:\n' + ' 项目投产后油品质量升级与国家成品油质量升级任务完成情况;\n' + ' 原料来源(上游装置供给)保障情况;\n' + ' 成品油销售组织与区域流向(如省内占比)及销路保障;\n' + ' 对占领和巩固当地市场的优势判断。\n' + '3)紧接着写第2段,聚焦政策与运行持续性,至少覆盖:\n' + ' 项目与国家绿色发展、可持续发展产业政策符合性;\n' + ' 投产后通过优化措施在加工能力、产品质量、能耗、物耗等指标上的达成情况;\n' + ' 项目发展持续性总体结论。\n' + '【写作约束】\n' + '全文仅保留标题+2段正文,不得新增小标题、条目或表格;不得改写为多级结构;证据不足处写“待补充”,不得编造省内销售占比、指标达成值或持续性结论。', + '7': '本章为综合评价结论,必须基于【前序章节正文(第1~6章)】归纳提炼,是对前六章内容的总结升华,不得脱离前文另起论述。按章节标题组织,先事实后结论;结论与数据须与前文一致;缺失项写“待补充”。', + '7.1': '本节是对第1~6章的归纳性评价,必须基于【前序章节正文】撰写。须使用“事实依据—评价判断—问题与建议”三段式结构。正文不得出现“【事实依据】”“【评价判断】”“【问题与建议】”标题标签。并按顺序完整覆盖下级小节:7.1.1、7.1.2。', + '7.1.1': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"7.1.1 总体评价结论"。\n' + '2)标题后先写1段总体结论,聚焦项目定位与总体成效,至少覆盖:\n' + ' 作为国VI汽油质量升级配套项目的作用;\n' + ' 以醚后碳四等液化气为原料生产高辛烷值调和组分;\n' + ' 对解决汽油池质量问题、完成国家质量升级任务、提升高标号汽油产量与附加值、经济效益表现的综合判断。\n' + '3)随后按顺序固定输出以下5个条目标题与正文:\n' + ' 1)前期工作规范有效,及时完成质量升级\n' + ' 2)建设实施管理规范,满足总体部署要求\n' + ' 3)投产一次成功,主要技术指标达到设计要求\n' + ' 4)竣工决算投资超批复,投资回报实现预期目标\n' + ' 5)产品适应市场需求,发展持续性较好\n' + '4)上述5个条目内容至少分别覆盖:\n' + ' 前期报批与立项及投产节点;\n' + ' 管理模式(业主+监理+EPC)及建设/HSE/进度成效;\n' + ' 生产准备、一次开车成功、长周期运行与关键技术指标达标;\n' + ' 竣工决算与批复差异及投资收益率对可研预期比较;\n' + ' 市场适应性、国VI需求匹配、运行安全与持续性判断。\n' + '【写作约束】\n' + '须优先依据【前序章节正文(第1~6章)】归纳各条目,不得与前面章节结论矛盾;不得新增无关小标题;不得改变上述标题与编号顺序;不得合并或拆分5个固定条目;证据不足处写“待补充”,不得编造时间节点、百分比数据或收益指标。', + '7.1.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"7.1.2 成功度评价"。\n' + '2)标题后先写1段方法说明:说明采用要素成功度评价方法,基于前文各章评价要点形成项目综合评价结论;可写明要素评分与汇总见下表。\n' + '3)紧接着输出成功度等级划分说明,按以下顺序逐条给出4级标准:\n' + ' 优秀项目(评分≥9分):项目的各项指标都已全面实现或超过,且在经济上取得了较大效益和影响。\n' + ' 良好项目(9分>评分≥8分):项目的大部分指标都已全面实现,且在经济上取得了预期效益和影响。\n' + ' 中等项目(8分>评分≥6分):项目实现了原定的部分目标,且在经济上无明显效益和影响。\n' + ' 较差项目(评分<6分):项目实现的目标非常有限,且在经济上没有正效益和影响。\n' + '4)在上述等级标准之后,固定输出表题:"表7-1 项目综合评价评分表",并紧跟 Markdown 表格(列名、行键与要素管理一致)。\n' + '5)表格之后用1~2段自然段给出本项目成功度综合评价结论(可与前文结论一致、作归纳),证据不足处写“待补充”。\n' + '\n' + '【表格强制要求】\n' + '1)表格必须直接使用“要素管理”中的表格(element_tables/element_cells),不得自行新造表,不得用正文推断补表。\n' + '2)必须优先使用要素管理中对应“表7-1 
项目综合评价评分表”的结构化表;若存在对应表格,须直出其表头、行项目和单元格内容,不得改列名、不得替换成其他表。\n' + '3)表7-1不得省略;如要素管理中未命中对应表格,也必须按细则列结构输出占位表(单元格可填“待补充”)。\n' + '\n' + '【写作约束】\n' + '须优先依据【前序章节正文(第1~6章)】归纳成功度结论;不得新增无关小标题;不得省略四级标准、表题与表格;表内数值以要素管理直出为准;证据不足处写“待补充”;不得在正文编造与表内数据相矛盾的评分、综合得分或等级结论。', + '7.2': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"7.2 主要经验"。\n' + '2)随后固定输出主条目:"1)生产运行精细化管理,实现提质增效",并在其下按顺序固定输出2个分项:\n' + ' (1)合理扩大原料来源,提高装置运行负荷\n' + ' (2)优化生产运行,有效降低装置能耗\n' + '3)再固定输出主条目:"2)采用适宜的建设管理模式,有效实现建设目标",并在其下按顺序固定输出4个分项:\n' + ' (1)结合企业实际情况和项目特点采用“业主+监理+EPC”管理模式并完成建设任务\n' + ' (2)发挥EPC统筹管理作用,合理交叉设计采购施工并保障工期目标\n' + ' (3)加强建设过程管理,实现质量、安全、环保等事故控制目标\n' + ' (4)加强属地管理与“三查四定”问题整改,保障建设与投产衔接\n' + '4)各分项正文需结合项目已发生事实进行归纳,允许使用关键数据(如负荷、能耗、工期、问题项数)支撑经验结论。\n' + '【写作约束】\n' + '须优先依据【前序章节正文(第1~6章)】提炼主要经验,与前文已述事实一致;不得新增无关小标题;不得改变上述标题与分项顺序;不得合并或拆分固定分项;证据不足处写“待补充”,不得编造负荷、能耗、工期或整改数量数据。', + '7.3': '必须严格按以下格式与顺序输出,不得缺项、不得改名:\n' + '1)首行固定输出标题:"7.3 问题与建议"。\n' + '2)固定输出小节标题:"7.3.1主要问题"。\n' + '3)在“7.3.1主要问题”下按顺序固定输出以下3个主条目及分项:\n' + ' 1)施工图设计不优化,存在浪费和安全隐患\n' + ' 2)投资和进度控制存在不足,建设实施管理待加强\n' + ' (1)主体装置和配套单元建设不同步,进度控制存在不足\n' + ' (2)项目实际投资超批复,未能实现项目投资控制预期目标\n' + ' (3)招标管理待进一步规范\n' + ' 3)考核标定不规范,运行有待进一步优化\n' + ' (1)考核标定不及时,部分指标未按设计值标定\n' + ' (2)装置运行酸耗偏高,生产待进一步优化\n' + '4)随后固定输出小节标题:"7.3.2对策建议"。\n' + '5)在“7.3.2对策建议”下按顺序固定输出以下4个主条目及分项:\n' + ' 1)总结推广先进经验,提高集团公司同类项目运行水平\n' + ' 2)合理利用副产品开发新产品,进一步实现提质增效\n' + ' 3)上下游装置一体优化运行,实现降本增效\n' + ' (1)合理配置催化原料、提高负荷、优化运行\n' + ' (2)研究降低MTBE/轻汽油醚化装置二甲醚生成量\n' + ' (3)加强烷基化原料预处理,降低酸耗与再生负荷\n' + ' (4)优先利用低温热水伴热\n' + ' 4)尽快完成竣工验收,进入正式生产阶段\n' + '6)各条目与分项正文必须结合项目事实与数据展开,不得只写标题。\n' + '【写作约束】\n' + '须优先依据【前序章节正文(第1~6章)】归纳问题与建议,与前文已述问题一致;不得新增无关小标题;不得改变上述标题、条目与分项顺序;不得合并或拆分固定条目;证据不足处写“待补充”,不得编造时间节点、投资金额比例、酸耗或整改结论。'} + +DEFAULT_SECTION_OUTPUT_CONTRACT = '按章节标题自然组织内容,围绕证据包先事实后结论,缺失项写“待补充”。' diff --git a/prompts/report_generation/table_format_repair_system.md b/prompts/report_generation/table_format_repair_system.md new file mode 100644 index 0000000..a284f49 --- /dev/null +++ 
b/prompts/report_generation/table_format_repair_system.md @@ -0,0 +1 @@ +你是后评价报告格式修订助手。仅做格式对齐修订:章节标题、表名、表头。禁止新增未证据支持的数据。返回 JSON:{"content":"..."} diff --git a/prompts/report_generation/table_format_repair_user.md b/prompts/report_generation/table_format_repair_user.md new file mode 100644 index 0000000..d022065 --- /dev/null +++ b/prompts/report_generation/table_format_repair_user.md @@ -0,0 +1,25 @@ +你正在修订章节:{{section_title}} + +目标:对齐模板格式,不改变事实结论。 +请仅修订“章节标题、表名、表头”,正文事实描述尽量保持原样。 + +【模板表规格(JSON)】 +{{table_specs_json}} + +【当前章节】 +{{content}} + +【证据包(JSON)】 +{{evidence_json}} + +修订规则: +1) 章节首行必须为标准章节标题; +2) 表名必须与模板表规格中的 token/title 对齐;表题中表号与表名之间须空两个全角空格(如「表2-4  原料数量及组成对比表」); +3) 表头字段优先与模板一致,表内数据来自证据包,无值写待补充; +4) 必须使用 Markdown 表格; +5) 表头栏排版:指标名称与计量单位分两行写在同一表头单元格内;单位须加括号并写在名称正下方(Markdown 可用 `
<br>`,如 `新鲜水<br>
(m³/h)`);表题与表头均勿使用 `**` 加粗;勿将单位单独拆成一列表头列,勿把「名称(单位)」横挤在同一行; +6) 若整张表各数据列所用单位相同,应将单位加括号写在表题末尾(如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复写该单位; +7) 表格「序号」列:优先使用各行行键(row_key)首部已有的阿拉伯数字层次编号(与正文 1、1.1、1.2、2、2.1 一致);若行键未带此类编号,则用自上而下连续阿拉伯数字 1、2、3…;「合计」「总计」行序号可用「—」; +8) 表体单元格内容宜居中;若有换行或分段,宜左齐。同列数值宜统一小数位数; +9) 禁止编造事实数据; +10) 仅返回修订后的完整章节 Markdown(不要返回 JSON)。 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4fa2352 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,23 @@ +# Web 框架 +fastapi +uvicorn[standard] +pydantic +pydantic-settings + +# 数据库(MySQL) +sqlalchemy +pymysql +cryptography + +# HTTP(LLM / Embedding 调用) +requests + +# 附图提取(解析项目 .docx 内嵌图片) +python-docx + +# 向量检索(Milvus + Embeddings + BM25) +langchain-core +langchain-text-splitters +langchain-openai +langchain-milvus +pymilvus diff --git a/routers/__init__.py b/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/routers/report.py b/routers/report.py new file mode 100644 index 0000000..6cde0fa --- /dev/null +++ b/routers/report.py @@ -0,0 +1,204 @@ +""" +routers/report.py +后评价报告「核心生成」路由(独立抽取版)。 + +从 eval_report 的 routers/write.py 摘取报告生成相关端点,去除鉴权依赖, +项目查询改用轻量的 services/project_service.get_project。 +业务逻辑在 services/report_generation_service.py。 +""" + +from __future__ import annotations + +import asyncio +import json +from typing import Optional + +from fastapi import APIRouter, Depends, Header, HTTPException +from fastapi.responses import StreamingResponse +from sqlalchemy.orm import Session + +from database import SessionLocal, get_db +from database.models import ReportTemplate, ReportTemplateSection +from schemas.write import ( + GenerateReportJobCreate, + GenerateReportJobItem, + GenerateReportResult, +) +from services.project_service import get_project +from services.report_generation_service import ( + create_report_job, + get_report_job, + get_report_result, + get_report_stream_snapshot, + retry_report_chapter, + cancel_report_job, +) + +router = 
APIRouter(prefix="/write", tags=["后评价报告生成"]) + + +@router.get("/projects/{project_id}/generate-sections", summary="按章节智能体生成提示词清单") +def generate_sections_prompt( + project_id: str, + template_id: Optional[str] = None, + db: Session = Depends(get_db), +): + _ = get_project(project_id, db) + template = None + if template_id: + template = db.query(ReportTemplate).filter(ReportTemplate.id == template_id, ReportTemplate.is_active == True).first() # noqa: E712 + if not template: + template = db.query(ReportTemplate).filter(ReportTemplate.is_default == True, ReportTemplate.is_active == True).first() # noqa: E712 + if not template: + raise HTTPException(status_code=404, detail="未找到可用模板") + sections = ( + db.query(ReportTemplateSection) + .filter(ReportTemplateSection.template_id == template.id) + .order_by(ReportTemplateSection.section_order.asc()) + .all() + ) + return { + "templateId": template.id, + "templateName": template.name, + "sections": [ + { + "sectionKey": s.section_key, + "sectionTitle": s.section_title, + "prompt": ( + "请基于2020后评价细则与本项目检索材料,先查要素表,再查文档段落,最后生成本章节内容。\n" + + (s.section_prompt or "") + ), + "examples": s.examples or "", + } + for s in sections + ], + } + + +@router.post( + "/projects/{project_id}/generate-report-job", + response_model=GenerateReportJobItem, + summary="创建分章异步报告生成任务", +) +def create_generate_report_job( + project_id: str, + body: GenerateReportJobCreate, + db: Session = Depends(get_db), + x_user_id: Optional[str] = Header(default=None, alias="X-User-Id"), +): + _ = get_project(project_id, db) + return create_report_job( + project_id, + db, + template_id=body.templateId, + top_k=body.topK, + requested_by=x_user_id, + ) + + +@router.get( + "/projects/{project_id}/generate-report-job/{job_id}", + response_model=GenerateReportJobItem, + summary="查询分章异步报告任务进度", +) +def get_generate_report_job( + project_id: str, + job_id: str, + db: Session = Depends(get_db), +): + return get_report_job(project_id, job_id, db) + + +@router.get( + 
"/projects/{project_id}/generate-report-job/{job_id}/result", + response_model=GenerateReportResult, + summary="获取分章异步报告任务结果", +) +def get_generate_report_result( + project_id: str, + job_id: str, + include_debug: bool = False, + db: Session = Depends(get_db), +): + return get_report_result(project_id, job_id, db, include_debug=include_debug) + + +@router.get( + "/projects/{project_id}/generate-report-job/{job_id}/events", + summary="订阅分章异步报告任务实时事件(SSE)", +) +async def stream_generate_report_job_events( + project_id: str, + job_id: str, + include_debug: bool = False, +): + # 校验后立即释放连接;SSE 循环中按需短连接查询,避免长连占满连接池 + with SessionLocal() as db: + _ = get_report_job(project_id, job_id, db) + + async def _event_stream(): + last_payload = "" + idle_ticks = 0 + while True: + snapshot = get_report_stream_snapshot(job_id, include_debug=include_debug) + if not snapshot: + with SessionLocal() as db: + job = get_report_job(project_id, job_id, db) + result = get_report_result(project_id, job_id, db, include_debug=include_debug) + snapshot = { + "job": job.model_dump(), + "result": result.model_dump(), + } + payload = json.dumps(snapshot, ensure_ascii=False, separators=(",", ":")) + if payload != last_payload: + last_payload = payload + idle_ticks = 0 + yield f"event: snapshot\ndata: {payload}\n\n" + else: + idle_ticks += 1 + if idle_ticks >= 20: + idle_ticks = 0 + yield "event: keepalive\ndata: ping\n\n" + + status = str(((snapshot.get("job") or {}).get("status") or "")).strip().lower() + if status in ("completed", "failed", "cancelled"): + yield f"event: end\ndata: {payload}\n\n" + break + await asyncio.sleep(0.25) + + return StreamingResponse( + _event_stream(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + +@router.post( + "/projects/{project_id}/generate-report-job/{job_id}/retry-chapter", + response_model=GenerateReportJobItem, + summary="重试指定章节", +) +def 
retry_generate_report_chapter( + project_id: str, + job_id: str, + section_key: str, + db: Session = Depends(get_db), +): + return retry_report_chapter(project_id, job_id, section_key, db) + + +@router.post( + "/projects/{project_id}/generate-report-job/{job_id}/cancel", + response_model=GenerateReportJobItem, + summary="取消报告生成任务", +) +def cancel_generate_report_job( + project_id: str, + job_id: str, + db: Session = Depends(get_db), +): + return cancel_report_job(project_id, job_id, db) diff --git a/schemas/__init__.py b/schemas/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/schemas/write.py b/schemas/write.py new file mode 100644 index 0000000..5fbf958 --- /dev/null +++ b/schemas/write.py @@ -0,0 +1,179 @@ +""" +schemas/write.py +后评价报告项目相关的 Pydantic 数据模型。 +""" + +from __future__ import annotations +from typing import Any, List, Optional +from pydantic import BaseModel + + +# ---------- 版本 ---------- + +class DocVersion(BaseModel): + id: str + version: str + content: str + savedAt: str + author: str + note: Optional[str] = "" + citationPayload: Optional[dict[str, Any]] = None + + +# ---------- 文档 ---------- + +class WriteDocument(BaseModel): + id: str + title: str + content: str + wordCount: int + createdAt: str + updatedAt: str + projectId: str + status: str # draft | review | published + versions: List[DocVersion] = [] + + +class WriteDocumentSummary(BaseModel): + """列表页只返回摘要,不含 content 正文""" + id: str + title: str + wordCount: int + createdAt: str + updatedAt: str + projectId: str + status: str + + +# ---------- 项目 ---------- + +class WriteProject(BaseModel): + id: str + uuid: str # 项目唯一标识,与 kb 共用 + name: str + description: Optional[str] = "" + createdAt: str + updatedAt: str + docCount: int + status: str # active | archived + kbProjectId: Optional[str] = None + color: str + documents: List[WriteDocument] = [] + + +class WriteProjectSummary(BaseModel): + """列表页摘要,不含 documents""" + id: str + uuid: Optional[str] = None # 项目唯一标识,用于 URL 
参数;兼容旧数据 + name: str + description: Optional[str] = "" + createdAt: str + updatedAt: str + docCount: int + status: str + kbProjectId: Optional[str] = None + color: str + + +# ---------- 创建 / 更新请求体 ---------- + +class WriteProjectCreate(BaseModel): + name: str + description: Optional[str] = "" + kbProjectId: Optional[str] = None + color: Optional[str] = "#3b82f6" + + +class WriteProjectUpdate(BaseModel): + name: Optional[str] = None + description: Optional[str] = None + status: Optional[str] = None + kbProjectId: Optional[str] = None + color: Optional[str] = None + + +class WriteDocumentCreate(BaseModel): + title: str + content: Optional[str] = "" + + +class WriteDocumentUpdate(BaseModel): + title: Optional[str] = None + content: Optional[str] = None + status: Optional[str] = None + + +class DocVersionCreate(BaseModel): + version: Optional[str] = None + content: str + author: str + note: Optional[str] = "" + citationPayload: Optional[dict[str, Any]] = None + + +# ---------- 章节审查(智能体) ---------- + + +class ChapterReviewRequest(BaseModel): + """章节智能审查请求体:选择章节 + 输入待审查文本。""" + + chapter: str # "1"~"6" + content: str + + +class ChapterReviewResponse(BaseModel): + """章节智能审查响应体:返回 Markdown 审查报告。""" + + success: bool = True + chapter: str + review: str + model: Optional[str] = None + message: Optional[str] = "" + + +class GenerateReportJobCreate(BaseModel): + templateId: Optional[str] = None + topK: int = 10 + + +class GenerateReportChapterItem(BaseModel): + sectionKey: str + sectionTitle: str + sectionOrder: int + status: str + updatedAt: Optional[str] = None + errorMessage: Optional[str] = None + + +class GenerateReportJobItem(BaseModel): + jobId: str + projectId: str + templateId: Optional[str] = None + status: str + progress: int + currentSectionKey: Optional[str] = None + errorMessage: Optional[str] = None + createdAt: Optional[str] = None + updatedAt: Optional[str] = None + completedAt: Optional[str] = None + chapters: List[GenerateReportChapterItem] = [] + + +class 
GenerateReportResultChapter(BaseModel): + sectionKey: str + sectionTitle: str + sectionOrder: int + status: str + content: Optional[str] = None + errorMessage: Optional[str] = None + promptText: Optional[str] = None + evidencePayload: Optional[dict] = None + validationPayload: Optional[dict] = None + + +class GenerateReportResult(BaseModel): + jobId: str + status: str + report: Optional[str] = None + consistency: List[str] = [] + chapters: List[GenerateReportResultChapter] = [] diff --git a/services/__init__.py b/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/appendix_figure_extraction.py b/services/appendix_figure_extraction.py new file mode 100644 index 0000000..ef74110 --- /dev/null +++ b/services/appendix_figure_extraction.py @@ -0,0 +1,199 @@ +""" +从项目知识库 Word(.docx)中提取「附图1/附图2」嵌入图,用于报告附录。 + +细则常见版式:附图标题段落与图在同一节或相邻段落;解析时合并前/当前/后段文字做关键词匹配。 +""" + +from __future__ import annotations + +import base64 +import logging +from pathlib import Path +from typing import Optional + +from docx import Document +from docx.oxml.ns import qn +from docx.table import Table +from docx.text.paragraph import Paragraph + +logger = logging.getLogger(__name__) + +# 过滤装饰性小图(logo 等) +_MIN_FIGURE_BYTES = 6000 + +R_EMBED = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" +_NS = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", +} + + +def _compact(s: str) -> str: + return "".join(str(s or "").split()) + + +def _classify_slot(ctx: str) -> Optional[int]: + """ + 返回 1=全厂物料平衡图,2=装置(如烷基化)物料平衡图。 + """ + t = _compact(ctx) + if not t: + return None + # 附图编号(先判 2,避免同段目录同时出现两个编号时误判) + if "附图2" in t: + return 2 + if "附图1" in t: + return 1 + if "全厂" in t and "物料平衡" in t: + return 1 + if "烷基化" in t and "物料平衡" in t: + return 2 + if "装置" in t and "物料平衡" in t and "全厂" not in t: + return 2 + return None + + +def _content_type_to_md_subtype(content_type: 
str) -> str: + ct = (content_type or "").lower() + if "jpeg" in ct or ct.endswith("jpg"): + return "jpeg" + if "png" in ct: + return "png" + if "gif" in ct: + return "gif" + if "emf" in ct: + return "x-emf" + if "wmf" in ct: + return "x-wmf" + return "png" + + +def _blob_to_data_uri(blob: bytes, content_type: str) -> str: + sub = _content_type_to_md_subtype(content_type) + b64 = base64.standard_b64encode(blob).decode("ascii") + return f"data:image/{sub};base64,{b64}" + + +def _iter_paragraphs_deep(doc: Document): + body_el = doc.element.body + for el in body_el: + if el.tag == qn("w:p"): + yield Paragraph(el, doc._body) + elif el.tag == qn("w:tbl"): + table = Table(el, doc._body) + for row in table.rows: + for cell in row.cells: + for p in cell.paragraphs: + yield p + + +def extract_appendix_figure_candidates_from_docx(path: Path) -> dict[int, list[tuple[int, bytes, str]]]: + """ + 从单个 docx 收集候选图:slot -> [(size, blob, content_type), ...] + content_type 来自 OPC part,用于拼 data URI。 + """ + candidates: dict[int, list[tuple[int, bytes, str]]] = {1: [], 2: []} + orphans_ordered: list[tuple[bytes, str]] = [] + try: + doc = Document(str(path)) + except Exception as exc: + logger.warning("appendix figure: open docx failed %s: %s", path, exc) + return candidates + + paras = list(_iter_paragraphs_deep(doc)) + texts = [p.text or "" for p in paras] + + for i, p in enumerate(paras): + blobs_with_type: list[tuple[bytes, str]] = [] + for blip in p._element.findall(".//a:blip", _NS): + embed = blip.get(R_EMBED) + if not embed: + continue + try: + rel = p.part.related_parts[embed] + except KeyError: + continue + blob = getattr(rel, "blob", None) + ct = getattr(rel, "content_type", "") or "image/png" + if blob and len(blob) >= _MIN_FIGURE_BYTES: + blobs_with_type.append((blob, ct)) + + if not blobs_with_type: + continue + + prev_t = texts[i - 1] if i > 0 else "" + cur_t = texts[i] + next_t = texts[i + 1] if i + 1 < len(texts) else "" + ctx = f"{prev_t}\n{cur_t}\n{next_t}" + slot = 
_classify_slot(ctx) + if slot is None: + for blob, ct in blobs_with_type: + orphans_ordered.append((blob, ct)) + continue + + for blob, ct in blobs_with_type: + candidates[slot].append((len(blob), blob, ct)) + + def _dedupe_preserve_order(pairs: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]: + seen: set[int] = set() + out: list[tuple[bytes, str]] = [] + for blob, ct in pairs: + bid = id(blob) + if bid in seen: + continue + seen.add(bid) + out.append((blob, ct)) + return out + + orphans_ordered = _dedupe_preserve_order(orphans_ordered) + used_ids: set[int] = set() + for lst in candidates.values(): + for _sz, blob, _ct in lst: + used_ids.add(id(blob)) + orphans_ordered = [(b, c) for b, c in orphans_ordered if id(b) not in used_ids] + + if not candidates[1] and orphans_ordered: + b, c = orphans_ordered.pop(0) + candidates[1].append((len(b), b, c)) + if not candidates[2] and orphans_ordered: + b, c = orphans_ordered.pop(0) + candidates[2].append((len(b), b, c)) + + return candidates + + +def merge_best_appendix_figures( + per_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]], +) -> dict[int, tuple[bytes, str, str]]: + """ + 多文档合并:每个 slot 只保留字节最大的一张(更可能是主流程图而非小图标)。 + + 返回 slot -> (blob, content_type, source_doc_name) + """ + best: dict[int, tuple[int, bytes, str, str]] = {} + for doc_name, cand in per_doc: + for slot in (1, 2): + for size, blob, ct in cand.get(slot) or []: + prev = best.get(slot) + if prev is None or size > prev[0]: + best[slot] = (size, blob, ct, doc_name) + return {k: (v[1], v[2], v[3]) for k, v in best.items()} + + +def appendix_figure_markdown_images( + resolved: dict[int, tuple[bytes, str, str]], + *, + label_title: list[tuple[str, str]], +) -> dict[int, str]: + """slot -> markdown 片段(含 ### 标题与 ![](data:...))""" + out: dict[int, str] = {} + slot_to_title = {i + 1: lt for i, lt in enumerate(label_title)} + for slot, (blob, ct, src) in resolved.items(): + if slot not in slot_to_title: + continue + label, title = slot_to_title[slot] 
+ uri = _blob_to_data_uri(blob, ct) + cap = f"{label} {title}" + src_note = f"\n\n*(嵌入来源:{src})*" if src else "" + out[slot] = f"### {cap}\n\n![{cap}]({uri}){src_note}" + return out diff --git a/services/docx_export_service.py b/services/docx_export_service.py new file mode 100644 index 0000000..901e760 --- /dev/null +++ b/services/docx_export_service.py @@ -0,0 +1,28 @@ +""" +services/docx_export_service.py(瘦身版) + +本独立服务不提供 Word 导出能力;此处仅保留 report_generation_service 在 +正文小节编号识别时懒加载依赖的 `_is_likely_section_number`,以满足导入。 +""" + +from __future__ import annotations + +import re + + +def _is_likely_section_number(num: str) -> bool: + """报告小节编号(如 2.1.1),非正文能耗数值(如 132.41)。""" + s = str(num or "").strip() + if not s or not re.fullmatch(r"\d+(?:\.\d+)*", s): + return False + parts = s.split(".") + if len(parts) > 4: + return False + for part in parts: + try: + n = int(part) + except ValueError: + return False + if n < 1 or n > 30: + return False + return True diff --git a/services/kb_service.py b/services/kb_service.py new file mode 100644 index 0000000..7e6d744 --- /dev/null +++ b/services/kb_service.py @@ -0,0 +1,80 @@ +""" +services/kb_service.py(瘦身版) + +仅保留报告生成「附图提取」所需的知识库文档磁盘路径解析助手: +从 eval_report 的完整 kb_service.py 中抽取,去除知识库 CRUD / 上传 / worker 等无关逻辑。 +""" + +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +from config import settings +from database.models import KbDocument as KbDocumentModel + + +def _normalize_rel_path(path: str) -> str: + """将 'a\\b\\c' 规范为 'a/b/c',并去掉前导 '/'。""" + s = str(path or "").replace("\\", "/").strip() + while s.startswith("./"): + s = s[2:] + return s.lstrip("/") + + +def _kb_doc_storage_rel_path( + file_path_dir: Optional[str], + basename: str, + storage_rel_path: Optional[str] = None, +) -> str: + """项目目录下的相对存储路径(含文件名)。优先 storage_rel_path(confirm 时写入)。""" + stored = _normalize_rel_path(str(storage_rel_path or "")) + if stored: + return stored + d = _normalize_rel_path(str(file_path_dir or 
"")) + bn = str(basename or "").strip() + if d and bn: + return f"{d}/{bn}" + return bn or d + + +def _kb_doc_path_candidates_for_model(doc_root: Path, doc: KbDocumentModel) -> List[Path]: + """解析磁盘路径时的候选列表(按优先级)。""" + rel = _kb_doc_storage_rel_path( + doc.file_path, + doc.name, + getattr(doc, "storage_rel_path", None), + ) + candidates: List[Path] = [] + if rel: + candidates.append((doc_root / doc.project_id / rel).resolve()) + name = str(doc.name or "").strip() + fp_dir = _normalize_rel_path(str(doc.file_path or "")) + if fp_dir and name: + candidates.append((doc_root / doc.project_id / fp_dir / name).resolve()) + if name: + candidates.append((doc_root / doc.project_id / name).resolve()) + if not candidates: + candidates.append((doc_root / doc.project_id / "_missing_").resolve()) + deduped: List[Path] = [] + seen: set[str] = set() + for p in candidates: + key = str(p) + if key in seen: + continue + seen.add(key) + deduped.append(p) + return deduped + + +def _kb_doc_absolute_file_path_for_model(doc_root: Path, doc: KbDocumentModel) -> Path: + for p in _kb_doc_path_candidates_for_model(doc_root, doc): + if p.is_file(): + return p + return _kb_doc_path_candidates_for_model(doc_root, doc)[0] + + +def _kb_doc_file_exists_for_model(doc: KbDocumentModel) -> bool: + """文档在磁盘上是否可读(多路径回退,兼容历史 file_path/name 组合)。""" + doc_root = Path(settings.DOC_PAT).resolve() + return any(p.is_file() for p in _kb_doc_path_candidates_for_model(doc_root, doc)) diff --git a/services/llm_client.py b/services/llm_client.py new file mode 100644 index 0000000..3b846f9 --- /dev/null +++ b/services/llm_client.py @@ -0,0 +1,724 @@ +from __future__ import annotations + +import json +import logging +import random +import re +import time +import threading +from typing import Any, Optional + +import requests +from requests import RequestException +from requests.exceptions import ChunkedEncodingError + +from config import settings + +_logger = logging.getLogger(__name__) +# 生成全过程追踪:完整记录输入 prompt / 调用模型 
/ 模型输出,写入 logs/generation_trace.log +_trace_logger = logging.getLogger("generation.trace") + +_LLM_MAX_CONCURRENCY = 5 +_llm_slots = threading.BoundedSemaphore(_LLM_MAX_CONCURRENCY) + + +class _RetryableLLMError(RuntimeError): + """用于标记可安全重试的 LLM 调用异常。""" + + +class _ContentFieldStreamExtractor: + """从流式 JSON 文本中增量提取 content 字段的已解码正文。""" + + def __init__(self) -> None: + self._raw = "" + self._content_started = False + self._content_done = False + self._value_start = -1 + self._consumed_pos = 0 + + def feed(self, chunk: str) -> tuple[str, bool]: + if not chunk: + return "", False + self._raw += chunk + emitted = "" + done_now = False + + if not self._content_started: + marker = '"content"' + idx = self._raw.find(marker) + if idx == -1: + return "", False + colon = self._raw.find(":", idx + len(marker)) + if colon == -1: + return "", False + quote = self._raw.find('"', colon + 1) + if quote == -1: + return "", False + self._content_started = True + self._value_start = quote + 1 + self._consumed_pos = self._value_start + + if self._content_started and not self._content_done: + emitted, consumed_pos, done_now = self._decode_partial_json_string( + self._raw, + self._consumed_pos, + ) + self._consumed_pos = consumed_pos + if done_now: + self._content_done = True + return emitted, done_now + + @staticmethod + def _decode_partial_json_string(src: str, start: int) -> tuple[str, int, bool]: + out: list[str] = [] + i = start + n = len(src) + while i < n: + ch = src[i] + if ch == '"': + if i == 0 or src[i - 1] != "\\" or _ContentFieldStreamExtractor._is_escaped(src, i): + return "".join(out), i, True + if ch != "\\": + out.append(ch) + i += 1 + continue + if i + 1 >= n: + break + esc = src[i + 1] + mapping = { + '"': '"', + "\\": "\\", + "/": "/", + "b": "\b", + "f": "\f", + "n": "\n", + "r": "\r", + "t": "\t", + } + if esc == "u": + if i + 6 > n: + break + hex_part = src[i + 2 : i + 6] + try: + out.append(chr(int(hex_part, 16))) + except Exception: + pass + i += 6 + continue 
+ if esc in mapping: + out.append(mapping[esc]) + i += 2 + continue + out.append(esc) + i += 2 + return "".join(out), i, False + + @staticmethod + def _is_escaped(src: str, quote_index: int) -> bool: + backslashes = 0 + i = quote_index - 1 + while i >= 0 and src[i] == "\\": + backslashes += 1 + i -= 1 + return backslashes % 2 == 0 + + +def _format_exc_raw(e: Exception) -> str: + """统一输出最直接的异常原文(类型 + repr)。""" + return f"{type(e).__name__}: {e!r}" + + +def _chat_completions_stream_text( + *, + api_base: str, + api_key: str, + model_name: str, + system_prompt: str, + user_prompt: str, + temperature: float, + max_tokens: int, + extra_payload: dict[str, Any], + connect_timeout_sec: int, + read_timeout_sec: int = 300, + on_content_delta: Optional[callable] = None, +) -> str: + """ + 以 OpenAI-compat SSE 流式读取模型输出文本。 + - connect timeout 保留,避免连接阶段长时间卡死 + - read timeout 防止流式读取无限挂起(默认 300s) + """ + _logger.info( + "LLM 流式调用开始 | model=%s | temperature=%s | max_tokens=%s | timeout_connect=%s timeout_read=%s", + model_name, temperature, max_tokens, connect_timeout_sec, read_timeout_sec, + ) + resp = requests.post( + f"{api_base}/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json={ + "model": model_name, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + "temperature": temperature, + "max_tokens": max_tokens, + "response_format": {"type": "json_object"}, + "stream": True, + **extra_payload, + }, + stream=True, + timeout=(connect_timeout_sec, max(60, read_timeout_sec)), + ) + + if resp.status_code in (408, 429, 500, 502, 503, 504): + raise _RetryableLLMError(f"LLM HTTP {resp.status_code}: {(resp.text or '')[:300]}") + if resp.status_code != 200: + raise RuntimeError(f"LLM HTTP {resp.status_code}: {(resp.text or '')[:800]}") + + resp.encoding = "utf-8" + chunks: list[str] = [] + extractor = _ContentFieldStreamExtractor() + try: + for line in 
resp.iter_lines(decode_unicode=True): + if not line: + continue + s = line.strip() + if not s.startswith("data:"): + continue + payload = s[5:].strip() + if not payload or payload == "[DONE]": + break + try: + obj = json.loads(payload) + except Exception: + continue + choices = obj.get("choices") + if not isinstance(choices, list) or not choices: + continue + first = choices[0] if isinstance(choices[0], dict) else {} + delta = first.get("delta") if isinstance(first.get("delta"), dict) else {} + content = delta.get("content") + if isinstance(content, str) and content: + chunks.append(content) + # print(content, end="", flush=True) + if on_content_delta: + delta_text, done_now = extractor.feed(content) + if delta_text: + try: + on_content_delta("delta", delta_text) + except Exception: + pass + if done_now: + try: + on_content_delta("finalizing", "") + except Exception: + pass + # 兼容部分实现把最终结果放在 message.content + message = first.get("message") if isinstance(first.get("message"), dict) else {} + msg_content = message.get("content") + if isinstance(msg_content, str) and msg_content: + chunks.append(msg_content) + # print(msg_content, end="", flush=True) + if on_content_delta: + delta_text, done_now = extractor.feed(msg_content) + if delta_text: + try: + on_content_delta("delta", delta_text) + except Exception: + pass + if done_now: + try: + on_content_delta("finalizing", "") + except Exception: + pass + except ChunkedEncodingError as e: + partial_text = "".join(chunks).strip() + # 若流提前结束但已收到完整 JSON,直接使用,避免无谓重试失败。 + if partial_text: + try: + parse_json_object_from_text(partial_text) + return partial_text + except Exception: + pass + raise _RetryableLLMError(f"LLM 流中断: {_format_exc_raw(e)}") from e + + text = "".join(chunks).strip() + if not text: + raise _RetryableLLMError("LLM 返回空内容") + print() + return text + + +def chat_completions_json( + *, + system_prompt: str, + user_prompt: str, + temperature: float = 0.2, + max_tokens: int = 4096, + timeout_sec: int = 180, + 
on_content_delta: Optional[callable] = None, + log_context: str = "", +) -> dict[str, Any]: + """ + 统一的 OpenAI-compat chat/completions 调用,强制返回 JSON object。 + 复用项目现有配置:LLM_API_BASE/LLM_API_KEY/LLM_MODEL_NAME。 + + log_context: 调用来源标签(如章节编号),用于在 generation_trace.log 中区分各次生成调用。 + """ + api_base = (settings.LLM_API_BASE or "").rstrip("/") + api_key = settings.LLM_API_KEY or "" + model_name = settings.LLM_MODEL_NAME or "" + + if not api_base or not api_key or not model_name: + raise RuntimeError("LLM 未配置:请设置 LLM_API_BASE/LLM_API_KEY/LLM_MODEL_NAME") + + ctx = log_context or "-" + _trace_logger.info( + "[输入] context=%s | model=%s | temperature=%s | max_tokens=%s\n" + "----- SYSTEM PROMPT -----\n%s\n" + "----- USER PROMPT -----\n%s\n" + "----- END INPUT -----", + ctx, model_name, temperature, max_tokens, system_prompt, user_prompt, + ) + + extra_payload: dict[str, Any] = {} + # SiliconFlow 的部分 Qwen 模型默认把输出写到 reasoning_content,导致 content 为空; + # 显式关闭 thinking,确保最终输出进入 content,避免下游解析失败。 + if "siliconflow" in api_base.lower() and "qwen" in model_name.lower(): + extra_payload["enable_thinking"] = False + + final_timeout_sec = int(timeout_sec or 0) + if final_timeout_sec <= 0: + final_timeout_sec = int(getattr(settings, "LLM_HTTP_TIMEOUT_SEC", 90) or 90) + retry_count = int(getattr(settings, "LLM_RETRY_COUNT", 2) or 2) + if retry_count < 1: + retry_count = 1 + retry_backoff = float(getattr(settings, "LLM_RETRY_BACKOFF_SEC", 1.0) or 1.0) + retry_backoff_max = float(getattr(settings, "LLM_RETRY_BACKOFF_MAX_SEC", 12.0) or 12.0) + connect_timeout_sec = int(getattr(settings, "LLM_CONNECT_TIMEOUT_SEC", 20) or 20) + if connect_timeout_sec <= 0: + connect_timeout_sec = 20 + use_stream = True + + _logger.info( + "chat_completions_json 调用 | model=%s | temperature=%s | max_tokens=%s | timeout=%s | retry=%s", + model_name, temperature, max_tokens, final_timeout_sec, retry_count, + ) + + with _llm_slots: + last_err: Optional[Exception] = None + for attempt in range(retry_count): + try: + if 
use_stream: + content = _chat_completions_stream_text( + api_base=api_base, + api_key=api_key, + model_name=model_name, + system_prompt=system_prompt, + user_prompt=user_prompt, + temperature=temperature, + max_tokens=max_tokens, + extra_payload=extra_payload, + connect_timeout_sec=connect_timeout_sec, + read_timeout_sec=final_timeout_sec, + on_content_delta=on_content_delta, + ) + else: + # 分离连接超时与读超时:长生成阶段只应占用「读」时间,避免与连接握手混在一个上限里过早超时 + read_timeout = max(int(connect_timeout_sec) + 5, int(final_timeout_sec)) + resp = requests.post( + f"{api_base}/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json={ + "model": model_name, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + "temperature": temperature, + "max_tokens": max_tokens, + "response_format": {"type": "json_object"}, + **extra_payload, + }, + timeout=(connect_timeout_sec, read_timeout), + ) + if resp.status_code in (408, 429, 500, 502, 503, 504): + raise _RetryableLLMError( + f"LLM HTTP {resp.status_code}: {(resp.text or '')[:300]}" + ) + if resp.status_code != 200: + raise RuntimeError(f"LLM HTTP {resp.status_code}: {(resp.text or '')[:800]}") + + data = resp.json() + content = ( + (data.get("choices") or [{}])[0] + .get("message", {}) + .get("content", "") + ) + if not isinstance(content, str) or not content.strip(): + raise _RetryableLLMError("LLM 返回空内容") + try: + obj = parse_json_object_from_text(content) + _logger.info( + "chat_completions_json 成功 | model=%s | attempt=%d/%d | content_len=%d | keys=%s", + model_name, attempt + 1, retry_count, len(content), list(obj.keys())[:8], + ) + _trace_logger.info( + "[输出] context=%s | model=%s | attempt=%d/%d | output_len=%d\n" + "----- MODEL OUTPUT -----\n%s\n" + "----- END OUTPUT -----", + ctx, model_name, attempt + 1, retry_count, len(content), content, + ) + return obj + except ValueError as e: + raise _RetryableLLMError(f"LLM JSON 
解析失败: {e}") from e + except ( + requests.ReadTimeout, + requests.ConnectTimeout, + requests.ConnectionError, + ChunkedEncodingError, + ) as e: + last_err = e + if attempt >= retry_count - 1: + raise RuntimeError( + "LLM 请求超时/连接失败" + f"(已重试{retry_count}次,timeout={final_timeout_sec}s)" + f",endpoint={api_base}/chat/completions" + f",model={model_name}" + f",raw={_format_exc_raw(e)}" + ) from e + sleep_sec = min(retry_backoff * (2 ** attempt), retry_backoff_max) + sleep_sec += random.uniform(0, min(0.5, sleep_sec * 0.2)) + time.sleep(sleep_sec) + except _RetryableLLMError as e: + last_err = e + if attempt >= retry_count - 1: + raise RuntimeError( + f"{e}(已重试{retry_count}次,timeout={final_timeout_sec}s)" + f",endpoint={api_base}/chat/completions" + f",model={model_name}" + f",raw={_format_exc_raw(e)}" + ) from e + sleep_sec = min(retry_backoff * (2 ** attempt), retry_backoff_max) + sleep_sec += random.uniform(0, min(0.5, sleep_sec * 0.2)) + time.sleep(sleep_sec) + except RequestException as e: + resp = getattr(e, "response", None) + status = getattr(resp, "status_code", None) + body = "" + if resp is not None: + try: + body = (resp.text or "")[:800] + except Exception: + body = "" + raise RuntimeError( + "LLM 请求失败" + f",endpoint={api_base}/chat/completions" + f",model={model_name}" + f",status={status}" + + (f",body={body}" if body else "") + + f",raw={_format_exc_raw(e)}" + ) from e + else: + raw = _format_exc_raw(last_err) if isinstance(last_err, Exception) else str(last_err) + raise RuntimeError( + "LLM 请求失败" + f",endpoint={api_base}/chat/completions" + f",model={model_name}" + f",raw={raw}" + ) + + +def _repair_loose_json_object(s: str) -> str: + """常见模型输出问题:尾随逗号(, 后紧跟 } 或 ])。""" + return re.sub(r",(\s*[}\]])", r"\1", s) + + +def _extract_balanced_json_prefix(s: str) -> str: + """ + 提取以 `{` 开始的最长“可能完整”的 JSON 对象前缀。 + 会忽略字符串内的花括号,避免误判。 + """ + start = s.find("{") + if start == -1: + return s + in_string = False + escaped = False + depth = 0 + end_idx = -1 + for i, ch 
in enumerate(s[start:], start=start): + if in_string: + if escaped: + escaped = False + elif ch == "\\": + escaped = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + end_idx = i + break + if end_idx != -1: + return s[start : end_idx + 1] + return s[start:] + + +def _close_truncated_json_object(s: str) -> str: + """ + 处理模型截断导致的 JSON 残缺: + - 若字符串未闭合,补一个 `"` + - 按栈补齐缺失的 `}` / `]` + """ + out: list[str] = [] + stack: list[str] = [] + in_string = False + escaped = False + + for ch in s: + out.append(ch) + if in_string: + if escaped: + escaped = False + elif ch == "\\": + escaped = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + continue + if ch == "{": + stack.append("}") + elif ch == "[": + stack.append("]") + elif ch in ("}", "]"): + if stack and stack[-1] == ch: + stack.pop() + + if in_string: + out.append('"') + while stack: + out.append(stack.pop()) + return "".join(out) + + +def parse_json_object_from_text(text: str) -> dict[str, Any]: + """从模型输出里提取并解析 { ... 
} JSON 对象。""" + s = (text or "").strip() + s = re.sub(r"```(?:json)?", "", s, flags=re.IGNORECASE).replace("```", "").strip() + start = s.find("{") + if start == -1: + raise ValueError("未找到 JSON 对象") + chunk = s[start:] + balanced_chunk = _extract_balanced_json_prefix(chunk) + decoder = json.JSONDecoder() + last_err: Optional[Exception] = None + for candidate in ( + balanced_chunk, + _repair_loose_json_object(balanced_chunk), + _close_truncated_json_object(_repair_loose_json_object(balanced_chunk)), + _close_truncated_json_object(_repair_loose_json_object(chunk)), + ): + try: + obj, _ = decoder.raw_decode(candidate) + if not isinstance(obj, dict): + raise ValueError("JSON 根节点不是对象(dict)") + return obj + except json.JSONDecodeError as e: + last_err = e + raise ValueError(f"JSON 解析失败:{last_err}") from last_err + + +def safe_get_str(v: Any) -> Optional[str]: + if v is None: + return None + s = str(v).strip() + return s if s else None + + +# ------------------------------------------------------------------ +# Agent 多轮对话 + Tool Calling(流式生成器) +# ------------------------------------------------------------------ + +def _iter_chat_stream_events( + *, + api_base: str, + api_key: str, + model_name: str, + messages: list[dict[str, Any]], + tools: list[dict[str, Any]] | None = None, + temperature: float = 0.3, + max_tokens: int = 4096, + extra_payload: dict[str, Any] | None = None, + connect_timeout_sec: int = 20, + read_timeout_sec: int = 300, +): + """ + 流式调用 OpenAI-compat /chat/completions,逐步 yield 事件: + ("delta", str) — 文本增量 + ("tool_calls", list) — 完整 tool_calls 列表 [{id, function:{name, arguments}}] + ("done", dict) — 最终 usage 等元信息 + """ + payload: dict[str, Any] = { + "model": model_name, + "messages": messages, + "temperature": temperature, + "max_tokens": max_tokens, + "stream": True, + **(extra_payload or {}), + } + if tools: + payload["tools"] = tools + payload["tool_choice"] = "auto" + + resp = requests.post( + f"{api_base}/chat/completions", + headers={ + 
"Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json=payload, + stream=True, + timeout=(connect_timeout_sec, max(60, read_timeout_sec)), + ) + + if resp.status_code in (408, 429, 500, 502, 503, 504): + raise _RetryableLLMError(f"LLM HTTP {resp.status_code}: {(resp.text or '')[:300]}") + if resp.status_code != 200: + raise RuntimeError(f"LLM HTTP {resp.status_code}: {(resp.text or '')[:800]}") + + resp.encoding = "utf-8" + content_parts: list[str] = [] + tool_calls_map: dict[int, dict] = {} + + for line in resp.iter_lines(decode_unicode=True): + if not line: + continue + s = line.strip() + if not s.startswith("data:"): + continue + data_str = s[5:].strip() + if not data_str or data_str == "[DONE]": + break + try: + obj = json.loads(data_str) + except Exception: + continue + choices = obj.get("choices") + if not isinstance(choices, list) or not choices: + continue + first = choices[0] if isinstance(choices[0], dict) else {} + delta = first.get("delta") if isinstance(first.get("delta"), dict) else {} + + # 仅输出正文 content;忽略 reasoning_content,避免思考过程展示给用户 + _ = delta.get("reasoning_content") + + content = delta.get("content") + if isinstance(content, str) and content: + content_parts.append(content) + yield ("delta", content) + + # tool_calls delta (streamed incrementally) + tc_deltas = delta.get("tool_calls") + if isinstance(tc_deltas, list): + for tc in tc_deltas: + if not isinstance(tc, dict): + continue + idx = tc.get("index", 0) + if idx not in tool_calls_map: + tool_calls_map[idx] = { + "id": tc.get("id", ""), + "type": "function", + "function": {"name": "", "arguments": ""}, + } + entry = tool_calls_map[idx] + if tc.get("id"): + entry["id"] = tc["id"] + fn = tc.get("function") if isinstance(tc.get("function"), dict) else {} + if fn.get("name"): + entry["function"]["name"] += fn["name"] + if fn.get("arguments"): + entry["function"]["arguments"] += fn["arguments"] + + # finish_reason + finish = first.get("finish_reason") + if 
finish == "tool_calls" and tool_calls_map: + ordered = [tool_calls_map[k] for k in sorted(tool_calls_map.keys())] + yield ("tool_calls", ordered) + tool_calls_map = {} + + if tool_calls_map: + ordered = [tool_calls_map[k] for k in sorted(tool_calls_map.keys())] + yield ("tool_calls", ordered) + + yield ("done", {"content": "".join(content_parts)}) + + +def _default_disable_thinking_payload(model_name: str) -> dict[str, Any]: + """Qwen 等推理模型:关闭 thinking,仅将最终答案写入 content。""" + if not model_name or "qwen" not in str(model_name).lower(): + return {} + return { + "enable_thinking": False, + # vLLM / 部分 OpenAI 兼容网关使用 chat_template_kwargs + "chat_template_kwargs": {"enable_thinking": False}, + } + + +def chat_completions_with_tools( + *, + messages: list[dict[str, Any]], + tools: list[dict[str, Any]] | None = None, + temperature: float = 0.3, + max_tokens: int = 4096, + timeout_sec: int = 180, + extra_payload: dict[str, Any] | None = None, +): + """ + Agent 用多轮对话 + tool calling。返回生成器,yield 事件元组。 + 调用方负责工具循环编排。 + """ + api_base = (settings.LLM_API_BASE or "").rstrip("/") + api_key = settings.LLM_API_KEY or "" + model_name = settings.LLM_MODEL_NAME or "" + + if not api_base or not api_key or not model_name: + raise RuntimeError("LLM 未配置:请设置 LLM_API_BASE/LLM_API_KEY/LLM_MODEL_NAME") + + merged_extra: dict[str, Any] = dict(_default_disable_thinking_payload(model_name)) + if extra_payload: + merged_extra.update(extra_payload) + + connect_timeout_sec = int(getattr(settings, "LLM_CONNECT_TIMEOUT_SEC", 20) or 20) + if connect_timeout_sec <= 0: + connect_timeout_sec = 20 + + final_timeout_sec = int(timeout_sec or 0) + if final_timeout_sec <= 0: + final_timeout_sec = int(getattr(settings, "LLM_HTTP_TIMEOUT_SEC", 90) or 90) + + with _llm_slots: + yield from _iter_chat_stream_events( + api_base=api_base, + api_key=api_key, + model_name=model_name, + messages=messages, + tools=tools, + temperature=temperature, + max_tokens=max_tokens, + extra_payload=merged_extra or None, + 
connect_timeout_sec=connect_timeout_sec, + read_timeout_sec=final_timeout_sec, + ) diff --git a/services/project_service.py b/services/project_service.py new file mode 100644 index 0000000..e476e0c --- /dev/null +++ b/services/project_service.py @@ -0,0 +1,43 @@ +""" +services/project_service.py + +报告生成所需的最小项目查询,替代 eval_report 中重型的 write_service。 +仅提供按 uuid / 数字 id 查询项目并返回 WriteProject,用于校验项目存在性与取项目名。 +""" + +from __future__ import annotations + +from fastapi import HTTPException +from sqlalchemy.orm import Session + +from database.models import Project +from schemas.write import WriteProject + + +def get_project(project_id: str, db: Session) -> WriteProject: + """获取后评价报告项目详情。支持 uuid 或数字 id;优先 uuid。""" + project = None + if project_id: + project = db.query(Project).filter(Project.uuid == project_id).first() + if not project: + try: + pid = int(project_id) + project = db.query(Project).filter(Project.id == pid).first() + except (ValueError, TypeError): + pass + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + + return WriteProject( + id=str(project.id), + uuid=project.uuid, + name=project.name, + description=project.description or "", + createdAt=project.created_at.strftime("%Y-%m-%d") if project.created_at else "", + updatedAt=project.updated_at.strftime("%Y-%m-%d") if project.updated_at else "", + docCount=project.doc_count, + status=project.status, + kbProjectId=None, + color=project.color, + documents=[], + ) diff --git a/services/prompt_template_service.py b/services/prompt_template_service.py new file mode 100644 index 0000000..e8bae96 --- /dev/null +++ b/services/prompt_template_service.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +import re +from pathlib import Path +from typing import Any + + +PROMPT_ROOT = Path(__file__).resolve().parent.parent / "prompts" +_TOKEN_RE = re.compile(r"{{\s*([A-Za-z_][A-Za-z0-9_]*)\s*}}") + + +def load_prompt_template(relative_path: str) -> str: + path = (PROMPT_ROOT / 
relative_path).resolve() + if not path.is_relative_to(PROMPT_ROOT.resolve()): + raise ValueError(f"Invalid prompt path: {relative_path}") + return path.read_text(encoding="utf-8") + + +def render_prompt_template(template: str, **context: Any) -> str: + def _replace(match: re.Match[str]) -> str: + value = context.get(match.group(1), "") + return "" if value is None else str(value) + + return _TOKEN_RE.sub(_replace, template) + + +def render_prompt(relative_path: str, **context: Any) -> str: + return render_prompt_template(load_prompt_template(relative_path), **context) diff --git a/services/reference_service.py b/services/reference_service.py new file mode 100644 index 0000000..c8bf581 --- /dev/null +++ b/services/reference_service.py @@ -0,0 +1,292 @@ +""" +services/reference_service.py +参考范文加载服务:报告生成时按需加载对应章节参考范文 +""" +from __future__ import annotations + +import json +import logging +import re +from typing import Optional + +from sqlalchemy.orm import Session + +from database.models import ReportSectionReference +from services.llm_client import chat_completions_json + +logger = logging.getLogger(__name__) + + +_DESENSITIZE_SYSTEM_PROMPT = """你是一个文档脱敏助手。你的任务是对后评价报告范文进行脱敏处理,只保留报告的结构骨架。 + +## 脱敏规则 + +### 必须保留的结构 +1. Markdown 标题层级(## 1.1、## 1.2、### 1.2.1 等) +2. 表格的表头行、分隔行(|--|--|) +3. 段落/章节的组织顺序和逻辑关系 +4. 文字的叙述逻辑(先写什么、再写什么) +5. 表格的行数、列数、表头字段名(如"序号""项目名称""可研报告""实际值") + +### 必须替换为 xxx 的内容 +1. 所有具体数字:金额、年份、百分比、数量、面积、产能、投资额等 +2. 项目名称、公司名称、单位名称等专有名词(书名号/引号内的内容) +3. 表格中的数据单元格内容(保留表头) +4. 具体的日期、时间节点 +5. 
def _desensitize_via_llm(content: str) -> str:
    """Ask the LLM to desensitize a reference sample, keeping only its skeleton.

    The first 12000 characters of *content* are sent together with the
    desensitization system prompt. A surrounding Markdown code fence in the
    reply (bare, ``markdown`` or ``md``) is removed before returning.

    Args:
        content: Raw reference text.

    Returns:
        The desensitized Markdown. The original *content* is returned
        unchanged when it is empty/blank, when the LLM call raises, or when
        the reply is empty (including a reply that is nothing but a fence).
        This is a deliberate best-effort: an un-desensitized reference is
        preferred over failing the request.
    """
    if not content or not content.strip():
        return content

    user_prompt = _DESENSITIZE_USER_PROMPT_TEMPLATE.format(content=content[:12000])

    logger.info("参考范文脱敏 start | content_len=%s", len(content))

    try:
        result = chat_completions_json(
            system_prompt=_DESENSITIZE_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.0,
            max_tokens=16384,
            timeout_sec=120,
        )
        raw = result.get("content") or ""
        if isinstance(raw, str) and raw.strip():
            # Drop an optional ```markdown / ```md / ``` wrapper around the reply.
            cleaned = re.sub(
                r"^```(?:markdown|md)?\s*", "", raw.strip(), flags=re.IGNORECASE
            )
            cleaned = re.sub(r"\s*```$", "", cleaned).strip()
            if cleaned:
                logger.info(
                    "参考范文脱敏 done | original_len=%s | desensitized_len=%s",
                    len(content),
                    len(cleaned),
                )
                return cleaned
            # A fence-only reply carries no content; treat it like a failed call
            # instead of silently returning an empty reference.
            logger.warning("LLM 脱敏结果为空,退回原文")
    except Exception as e:
        logger.warning("LLM 脱敏失败,退回原文: %s", e)

    return content
def load_section_reference_by_title(
    db: Session,
    section_title: str,
    source_file: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load a desensitized reference sample by matching on the section title.

    Fallback for callers that have no exact ``section_key``. Lookup order:

    1. If the title starts with a dotted number (``"2.1.1 ..."``), the most
       recently updated row whose ``section_key`` equals that number.
    2. Otherwise the most recently updated row whose ``section_title``
       contains the first 20 characters of the (stripped) title.

    Args:
        db: Database session.
        section_title: Section title, optionally prefixed with its number.
        source_file: When given, only rows from this source file are used.
        max_chars: Maximum length of the returned text before the
            truncation notice is appended.

    Returns:
        The desensitized reference text, or ``""`` when the title is
        empty/blank or nothing matches.
    """
    title = (section_title or "").strip()
    if not title:
        # An empty keyword would make the ``contains`` filter below match every
        # row and return an unrelated section's reference.
        return ""

    def _finalize(raw: str) -> str:
        content = raw.strip()
        if not content:
            return ""
        content = _desensitize_via_llm(content)
        if len(content) > max_chars:
            content = content[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return content

    refs = db.query(ReportSectionReference)
    if source_file:
        refs = refs.filter(ReportSectionReference.source_file == source_file)

    # Try an exact section_key match using the number at the start of the title.
    numbered = re.match(r"(\d+(?:\.\d+)*)", title)
    if numbered:
        exact = (
            refs.filter(ReportSectionReference.section_key == numbered.group(1))
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        if exact and exact.content:
            return _finalize(exact.content)

    # Fuzzy match on the title text.
    ref = (
        refs.filter(ReportSectionReference.section_title.contains(title[:20]))
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )
    if not ref or not ref.content:
        return ""
    return _finalize(ref.content)
def load_section_reference_raw_by_title(
    db: Session,
    section_title: str,
    template_id: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the stored reference sample by title, without LLM desensitization.

    Fallback for when a ``section_key`` lookup missed. Lookup order:

    1. If the title starts with a dotted number (``"2.1.1 ..."``), the most
       recently updated row whose ``section_key`` equals that number.
    2. Otherwise the most recently updated row whose ``section_title``
       contains the first 20 characters of the (stripped) title.

    Args:
        db: Database session.
        section_title: Section title, optionally prefixed with its number.
        template_id: When given, only rows linked to this template are used.
        max_chars: Maximum length of the returned text before the
            truncation notice is appended.

    Returns:
        The stored reference text (stripped, possibly truncated), or ``""``
        when the title is empty/blank or nothing matches.
    """
    title = (section_title or "").strip()
    if not title:
        # An empty keyword would make the ``contains`` filter below match every
        # row and return an unrelated section's reference.
        return ""

    def _finalize(raw: str) -> str:
        content = raw.strip()
        if len(content) > max_chars:
            content = content[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return content

    refs = db.query(ReportSectionReference)
    if template_id:
        refs = refs.filter(ReportSectionReference.template_id == template_id)

    numbered = re.match(r"(\d+(?:\.\d+)*)", title)
    if numbered:
        exact = (
            refs.filter(ReportSectionReference.section_key == numbered.group(1))
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        if exact and exact.content:
            return _finalize(exact.content)

    ref = (
        refs.filter(ReportSectionReference.section_title.contains(title[:20]))
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )
    if not ref or not ref.content:
        return ""
    return _finalize(ref.content)
def _safe_markdown_filename(name: str, fallback: str = "section") -> str:
    """Turn *name* into a file-name stem usable on common file systems.

    Characters from ``\\ / : * ? " < > |`` become ``_``, whitespace runs
    collapse to one space, and leading/trailing spaces, dots and underscores
    are removed. The result is capped at 120 characters; the cap is applied
    before the final trailing clean-up so truncation cannot leave a dangling
    space, dot or underscore at the end.

    Args:
        name: Raw title or key; ``None`` is treated as an empty string.
        fallback: Value returned when nothing usable remains.

    Returns:
        The sanitized stem, or *fallback* if it would be empty.
    """
    safe = re.sub(r'[\\/:*?"<>|]', "_", str(name or "").strip())
    safe = re.sub(r"\s+", " ", safe).strip(" ._")
    return safe[:120].rstrip(" ._") or fallback
output_json: dict[str, Any], +) -> Optional[str]: + """在章节流式生成结束并准备入库时,将该章节最终 JSON 输出写入 markdown 文件。 + + Returns: + 写入的文件路径,文件已存在(合并写入)时返回 None。 + """ + try: + out_dir = _REPORT_OUTPUT_DUMP_ROOT / job_id + out_dir.mkdir(parents=True, exist_ok=True) + safe_title = _safe_markdown_filename(section_title, fallback=_safe_markdown_filename(section_key)) + out_path = out_dir / f"{safe_title}.md" + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + json_text = json.dumps(output_json or {}, ensure_ascii=False, indent=2, default=str) + content = ( + f"# {section_title}\n\n" + f"> job_id: `{job_id}` \n" + f"> section_key: `{section_key}` \n" + f"> 写入时间: {ts}\n\n" + "```json\n" + f"{json_text}\n" + "```\n" + ) + out_path.write_text(content, encoding="utf-8") + return str(out_path) + except Exception as exc: + logger.warning("dump report chapter json markdown failed: %s", exc) + return None +from sqlalchemy.orm import Session + +from database import SessionLocal +from database.models import ( + ElementCell, + ElementTable, + Project, + KbDocument, + ReportGenerationChapter, + ReportGenerationJob, + ReportTemplate, + ReportTemplateSection, +) +from schemas.write import ( + GenerateReportJobItem, + GenerateReportChapterItem, + GenerateReportResult, + GenerateReportResultChapter, +) +from services.llm_client import chat_completions_json +from services.report_prompt_service import ( + build_report_chapter_prompt, + build_repair_missing_tables_prompt, + build_table_format_repair_prompt, + chapter_generation_system_prompt, + repair_missing_tables_system_prompt, + table_format_repair_system_prompt, +) +from services.retrieval_service import RetrievalService +from config import settings +from services.appendix_figure_extraction import ( + appendix_figure_markdown_images, + extract_appendix_figure_candidates_from_docx, + merge_best_appendix_figures, +) +from services.kb_service import _kb_doc_absolute_file_path_for_model +from services.report_runtime_store import ( + 
# BOM and zero-width characters (U+FEFF, U+200B..U+200D) that can hide in column keys.
_TABLE54_INVISIBLE_RE = re.compile(r"[\ufeff\u200b-\u200d]")


def _table54_ck_norm(ck: str) -> str:
    """Normalize a column key so visually identical keys compare equal.

    Applies NFKC normalization (full-width letters and punctuation become
    their ASCII forms), removes BOM / zero-width characters, then strips
    surrounding whitespace. Stripping happens last, so whitespace that sat
    next to an invisible character (``"\\ufeff unit"``) is removed as well.

    Args:
        ck: Raw column key; falsy values are treated as an empty string.

    Returns:
        The normalized key.
    """
    text = unicodedata.normalize("NFKC", str(ck or ""))
    return _TABLE54_INVISIBLE_RE.sub("", text).strip()
def _pick_table54_year_markdown(
    year_items: list[tuple[str, str]],
    *,
    table_year: int | None = None,
) -> tuple[str, str] | None:
    """Choose one ``(display label, markdown)`` pair among per-year Table 5-4 renderings.

    Each candidate is scored: the label containing ``table_year`` (+120);
    "2019" in the label or "2019年" within the first 800 characters of the
    body (+80); both comparison columns present (+70); a change column
    (+25); grouped row keys (+35); a penalty for the simplified layout
    (-60); plus the number of table lines, capped at 60. The first
    candidate with the highest score wins.

    Returns:
        ``None`` for an empty input, the single item when only one exists,
        otherwise the best-scoring item.
    """
    if len(year_items or ()) <= 1:
        return year_items[0] if year_items else None

    year_token = None if table_year is None else str(table_year)

    def _rank(entry: tuple[str, str]) -> int:
        label, body = entry
        label_text = str(label)
        has_feasibility = "可研报告" in body
        weighted_checks = (
            (year_token is not None and year_token in label_text, 120),
            (
                "2019" in label_text
                or re.search(r"2019\s*年", body[:800]) is not None,
                80,
            ),
            (has_feasibility and "实际值" in body, 70),
            ("增减" in body, 25),
            ("运行情况·" in body or "主要经济指标·" in body, 35),
            ("主要经济指标-" in body and not has_feasibility, -60),
        )
        total = sum(weight for hit, weight in weighted_checks if hit)
        return total + min(body.count("\n|"), 60)

    return max(year_items, key=_rank)
_is_table54_simplified_extract_body(md): + return 0 + sc = 0 + if "可研报告" in md and "实际值" in md: + sc += 90 + if "增减" in md: + sc += 25 + if "运行情况·" in md or "主要经济指标·" in md: + sc += 40 + if "主要经济指标-" in md and "可研报告" not in md: + sc -= 70 + sc += min(md.count("\n|"), 80) + return sc + + +def _table54_body_preceded_by_element_source(text_before: str, *, max_chars: int = 600) -> bool: + """表体紧邻前是否已有要素直出注释(有则视为权威表5-4,勿删勿换)。""" + tail = str(text_before or "")[-max_chars:] + if "表格来源:要素管理" not in tail: + return False + after = tail.rsplit("表格来源:要素管理", 1)[-1] + chunk = after.split("\n", 8)[-1] + return not any( + ln.strip().startswith("|") or _is_pipe_markdown_table_row_line(ln) + for ln in chunk.splitlines()[:6] + if ln.strip() + ) + + +def _is_table54_simplified_extract_body(block: str) -> bool: + """ + 识别抽取/LLM 三行简表:仅「2019年实际值」等单列 + 少量「主要经济指标·」行, + 无「可研报告|…」与「增减」对比结构。 + """ + md = str(block or "").strip() + if not md or "|" not in md: + return False + hdr = re.sub(r"\s+", "", _extract_table_header_key(md)).lower() + hdr = re.sub(r"
.*", "", hdr, flags=re.IGNORECASE) + if not hdr: + return False + if "后评价值" in hdr or ("可研值" in hdr and "项目" not in hdr and "运行情况" not in md): + return True + has_compare_cols = ("可研报告" in hdr or ("可研" in hdr and "增减" in hdr)) and ( + "实际值" in hdr or "实际" in hdr + ) + if has_compare_cols and ("运行情况·" in md or md.count("\n|") >= 12): + return False + single_actual_year = bool( + re.search(r"\d{4}\s*年\s*实际值", hdr) or re.search(r"\d{4}年实际值", hdr) + ) + if single_actual_year and "可研" not in hdr and "增减" not in hdr: + if "主要经济指标" in md or "主要经济指标-" in md: + return True + if "主要经济指标-" in md and "可研报告" not in md and "增减" not in md: + return True + pipe_rows = [ + ln + for ln in md.splitlines() + if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln) + ] + if ( + len(pipe_rows) <= 5 + and "主要经济指标" in md + and "可研报告" not in md + and "运行情况·" not in md + ): + return True + return False + + +def _reorder_table54_col_order(col_order: list[str]) -> list[str]: + """单位列置前;其余按年度槽与「可研→实际→增减」顺序排列;剔除无效英文 unit 列与表5-1 混入列。""" + cols: list[str] = [] + for c in col_order: + s = _table54_ck_norm(c) + if not s: + continue + if s.lower() in ("unit", "__unit__"): + continue + if s in _TABLE54_DROP_COL_KEYS: + continue + cols.append(s) + cols = ["单位" if c == "指标单位" else c for c in cols] + seen: set[str] = set() + deduped: list[str] = [] + for c in cols: + if c in seen: + continue + seen.add(c) + deduped.append(c) + cols = deduped + + units = [c for c in cols if c == "单位"] + metrics = [c for c in cols if c != "单位"] + + def _metric_rank(g: str) -> int: + gs = g.strip() + if gs in ("可研报告", "指标", "可研值"): + return 0 + if gs == "实际值": + return 1 + if gs.startswith("增减"): + return 2 + return 9 + + def _sort_key(ck: str) -> tuple[str, int, str]: + if "|" not in ck: + return ("\xff", 99, ck) + g, t = ck.split("|", 1) + return (t.strip(), _metric_rank(g), ck) + + metrics = sorted(metrics, key=_sort_key) + if not units: + return ["单位"] + metrics + return units + 
def _table54_rekey_latest_col_keys(latest: dict[tuple[str, str], str]) -> None:
    """Re-key *latest* in place so column keys are in ``_table54_ck_norm`` form.

    Row keys are stringified and column keys are normalized. When several
    original keys collapse onto the same normalized key, the later value
    wins if it is meaningful (non-blank and not the "待补充" placeholder);
    otherwise the earlier meaningful value is kept; otherwise whichever
    stripped text is non-empty.
    """
    placeholder = "待补充"

    def _meaningful(text) -> tuple[str, bool]:
        trimmed = str(text or "").strip()
        return trimmed, bool(trimmed) and trimmed != placeholder

    rekeyed: dict[tuple[str, str], str] = {}
    for (row_key, col_key), value in list(latest.items()):
        target = (str(row_key), _table54_ck_norm(str(col_key)))
        if target not in rekeyed:
            rekeyed[target] = str(value or "")
            continue

        existing = rekeyed[target]
        new_trim, new_ok = _meaningful(value)
        if new_ok:
            rekeyed[target] = str(value)
            continue
        old_trim, old_ok = _meaningful(existing)
        rekeyed[target] = str(existing) if old_ok else (new_trim or old_trim or "")

    latest.clear()
    latest.update(rekeyed)
def _table54_remap_indicator_unit_latest(latest: dict[tuple[str, str], str]) -> None:
    """Fold the legacy "指标单位" column into "单位" in place.

    For every ``(row, "指标单位")`` entry, its value is copied to
    ``(row, "单位")`` when that cell is empty, or when it only holds the
    "待补充" placeholder and the legacy value is non-blank. The legacy entry
    is then removed.
    """
    legacy_col, unit_col, placeholder = "指标单位", "单位", "待补充"

    stale: list[tuple[str, str]] = []
    for (row_key, col_key), legacy_value in list(latest.items()):
        col_name = str(col_key)
        if col_name != legacy_col:
            continue
        row_name = str(row_key)
        unit_key = (row_name, unit_col)
        current = str(latest.get(unit_key, "") or "").strip()
        incoming = str(legacy_value or "").strip()
        if not current or (incoming and current == placeholder):
            latest[unit_key] = legacy_value
        stale.append((row_name, col_name))

    for key in stale:
        latest.pop(key, None)
def _table54_year_label_prefix(time_column_year: int | None) -> str:
    """Return the header prefix for a year column.

    Gives ``"<year>年"`` when *time_column_year* is within 1900..2100
    (inclusive), otherwise the generic ``"某年"``.
    """
    if time_column_year is None:
        return "某年"
    year = int(time_column_year)
    if year < 1900 or year > 2100:
        return "某年"
    return f"{year}年"
"""扁平表头:单位、{年}可研报告、{年}实际值、{年}增减;{年}来自列键或要素日历年推断。""" + out: list[str] = [] + for ck in col_order: + s = str(ck).strip() + if s == "单位": + out.append("单位") + continue + if "|" not in s: + bare_l = _table54_bare_metric_header_label(s, time_column_year=time_column_year) + if bare_l is not None: + out.append(bare_l) + continue + out.append(s.replace("|", "|")) + continue + g, t = s.split("|", 1) + g, t = g.strip(), t.strip() + g_norm = _table54_ck_norm(g) + if (g not in _TABLE54_PIPE_METRIC_PREFIXES and g_norm not in _TABLE54_PIPE_METRIC_PREFIXES) or not t: + out.append(s.replace("|", "|")) + continue + ypfx = _table54_year_prefix_for_slot_tail(t, time_column_year=time_column_year) + if g_norm in ("可研报告", "指标", "可研值") or g in ("可研报告", "指标", "可研值"): + out.append(f"{ypfx}可研报告") + elif g_norm == "实际值" or g == "实际值": + out.append(f"{ypfx}实际值") + elif g_norm.startswith("增减") or g.startswith("增减"): + rest = g[len("增减") :] + out.append(f"{ypfx}增减{rest}") + else: + out.append(s.replace("|", "|")) + return out + + +def create_report_job( + project_id: str, + db: Session, + *, + template_id: Optional[str] = None, + top_k: int = 10, + requested_by: Optional[str] = None, +) -> GenerateReportJobItem: + project = _resolve_project(db, project_id) + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + template = _resolve_template(db, template_id) + sections = _sections_for_generation(_list_template_sections(db, template.id)) + if not sections: + raise HTTPException(status_code=400, detail="模板未配置章节") + + now = datetime.now() + job = ReportGenerationJob( + id=uuid.uuid4().hex, + project_id=project.uuid, + template_id=template.id, + status="pending", + progress=0, + requested_by=requested_by, + options={"topK": max(5, min(int(top_k or 10), 20))}, + created_at=now, + updated_at=now, + ) + db.add(job) + # 先把父任务写入当前事务,确保后续章节插入满足外键约束。 + db.flush() + for s in sections: + db.add( + ReportGenerationChapter( + id=uuid.uuid4().hex, + job_id=job.id, + section_key=s.section_key, 
+ section_title=s.section_title, + section_order=s.section_order, + status="pending", + created_at=now, + updated_at=now, + ) + ) + db.commit() + init_job_state( + job_id=job.id, + project_id=project.uuid, + template_id=template.id, + chapters=[ + { + "sectionKey": s.section_key, + "sectionTitle": s.section_title, + "sectionOrder": s.section_order, + "status": "pending", + } + for s in sections + ], + ) + _start_job_worker(job.id) + return get_report_job(project.uuid, job.id, db) + + +def get_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem: + project = _resolve_project(db, project_id) + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + job = ( + db.query(ReportGenerationJob) + .filter(ReportGenerationJob.id == job_id, ReportGenerationJob.project_id == project.uuid) + .first() + ) + if not job: + raise HTTPException(status_code=404, detail="任务不存在") + _recover_stalled_job(db, job) + chapters = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id) + .order_by(ReportGenerationChapter.section_order.asc()) + .all() + ) + runtime_state = get_job_state(job.id) + runtime_chapter_map = ((runtime_state or {}).get("chapters") or {}) if isinstance(runtime_state, dict) else {} + return GenerateReportJobItem( + jobId=job.id, + projectId=job.project_id, + templateId=job.template_id, + status=(runtime_state or {}).get("status") or job.status, + progress=int((runtime_state or {}).get("progress") or job.progress or 0), + currentSectionKey=(runtime_state or {}).get("currentSectionKey") or job.current_section_key, + errorMessage=(runtime_state or {}).get("errorMessage") or job.error_message, + createdAt=_fmt_dt(job.created_at), + updatedAt=(runtime_state or {}).get("updatedAt") or _fmt_dt(job.updated_at), + completedAt=(runtime_state or {}).get("completedAt") or _fmt_dt(job.completed_at), + chapters=[ + GenerateReportChapterItem( + sectionKey=c.section_key, + sectionTitle=c.section_title, + 
def get_report_result(
    project_id: str,
    job_id: str,
    db: Session,
    *,
    include_debug: bool = False,
) -> GenerateReportResult:
    """Assemble the current report text and per-chapter results for a job.

    The chapters come from the in-memory runtime state when one exists for
    the job, otherwise from the persisted chapter rows. The merged text then
    gets appendices appended and table captions normalized, and a
    consistency check is run against the project name.

    Args:
        project_id: Project identifier accepted by ``_resolve_project``.
        job_id: Report generation job id.
        db: Database session.
        include_debug: Forwarded to the chapter builders, which use it to
            decide whether prompt/evidence/validation payloads are returned.

    Raises:
        HTTPException: 404 when the project or the job cannot be found.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    job_query = db.query(ReportGenerationJob).filter(
        ReportGenerationJob.id == job_id,
        ReportGenerationJob.project_id == project.uuid,
    )
    job = job_query.first()
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")

    # NOTE(review): presumably marks chapters/jobs whose worker died as failed —
    # confirm against _recover_stalled_job.
    _recover_stalled_job(db, job)

    persisted_chapters = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .order_by(ReportGenerationChapter.section_order.asc())
        .all()
    )
    title_lookup: dict[str, str] = (
        _build_section_title_map(_list_template_sections(db, job.template_id))
        if job.template_id
        else {}
    )

    live_state = get_job_state(job.id)
    build_kwargs = {"include_debug": include_debug, "chapter_title_map": title_lookup}
    if live_state:
        merged, chapter_items = _build_live_result_from_runtime(live_state, **build_kwargs)
    else:
        merged, chapter_items = _build_live_result_from_chapters(
            persisted_chapters, **build_kwargs
        )

    merged = _append_report_appendices(db, project.uuid, merged)
    merged = _normalize_table_captions_in_markdown(merged or "")
    consistency = _check_consistency(merged or "", project.name)

    # Runtime status takes precedence over the persisted one when available.
    effective_status = (live_state or {}).get("status") or job.status
    return GenerateReportResult(
        jobId=job.id,
        status=effective_status,
        report=merged,
        consistency=consistency,
        chapters=chapter_items,
    )
_build_live_result_from_chapters( + chapters: list[ReportGenerationChapter], + *, + include_debug: bool = False, + chapter_title_map: Optional[dict[str, str]] = None, +) -> tuple[str, list[GenerateReportResultChapter]]: + chapter_items: list[GenerateReportResultChapter] = [] + report_parts: list[str] = [] + title_map = chapter_title_map or {} + for i, c in enumerate(chapters): + normalized_content = _fix_numeric_line_breaks(str(c.content or "").strip()) if c.content else c.content + if normalized_content: + normalized_content, _ = _collapse_consecutive_text_repetitions( + str(normalized_content).strip() + ) + normalized_content = _normalize_table_captions_in_markdown(str(normalized_content).strip()) + prev_body = "" + if i > 0 and chapters[i - 1].content: + prev_body = _fix_numeric_line_breaks(str(chapters[i - 1].content).strip()) + normalized_content = _inject_missing_parent_section_headings( + str(c.section_title or ""), + str(normalized_content).strip(), + prev_body, + title_map, + ) + chapter_items.append( + GenerateReportResultChapter( + sectionKey=c.section_key, + sectionTitle=c.section_title, + sectionOrder=c.section_order, + status=c.status, + content=normalized_content, + errorMessage=c.error_message, + promptText=(c.prompt_text if include_debug else None), + evidencePayload=(c.evidence_payload if include_debug else None), + validationPayload=(c.validation_payload if include_debug else None), + ) + ) + if normalized_content: + report_parts.append(str(normalized_content).strip()) + return _fix_numeric_line_breaks("\n\n".join(report_parts).strip()), chapter_items + + +def _build_live_result_from_runtime( + runtime_state: dict, + *, + include_debug: bool = False, + chapter_title_map: Optional[dict[str, str]] = None, +) -> tuple[str, list[GenerateReportResultChapter]]: + chapter_items: list[GenerateReportResultChapter] = [] + report_parts: list[str] = [] + title_map = chapter_title_map or {} + chapter_values = list(((runtime_state or {}).get("chapters") or 
{}).values()) + chapter_values.sort(key=lambda x: int((x or {}).get("sectionOrder") or 0)) + for i, chapter in enumerate(chapter_values): + if not isinstance(chapter, dict): + continue + normalized_content = _fix_numeric_line_breaks(str(chapter.get("content") or "").strip()) + if normalized_content: + normalized_content, _ = _collapse_consecutive_text_repetitions( + str(normalized_content).strip() + ) + normalized_content = _normalize_table_captions_in_markdown(str(normalized_content).strip()) + prev_body = "" + if i > 0 and isinstance(chapter_values[i - 1], dict): + prev_body = _fix_numeric_line_breaks(str(chapter_values[i - 1].get("content") or "").strip()) + normalized_content = _inject_missing_parent_section_headings( + str(chapter.get("sectionTitle") or ""), + normalized_content, + prev_body, + title_map, + ) + chapter_items.append( + GenerateReportResultChapter( + sectionKey=str(chapter.get("sectionKey") or ""), + sectionTitle=str(chapter.get("sectionTitle") or ""), + sectionOrder=int(chapter.get("sectionOrder") or 0), + status=str(chapter.get("status") or "pending"), + content=normalized_content or None, + errorMessage=chapter.get("errorMessage"), + promptText=(chapter.get("promptText") if include_debug else None), + evidencePayload=(chapter.get("evidencePayload") if include_debug else None), + validationPayload=(chapter.get("validationPayload") if include_debug else None), + ) + ) + if normalized_content: + report_parts.append(normalized_content) + return _fix_numeric_line_breaks("\n\n".join(report_parts).strip()), chapter_items + + +def get_report_stream_snapshot( + job_id: str, + *, + include_debug: bool = False, +) -> Optional[dict[str, Any]]: + runtime_state = get_job_state(job_id) + if not runtime_state: + return None + chapter_title_map: dict[str, str] = {} + template_id = runtime_state.get("templateId") + if template_id: + with SessionLocal() as db: + chapter_title_map = _build_section_title_map( + _list_template_sections(db, str(template_id)) + ) + 
report_text, chapter_items = _build_live_result_from_runtime( + runtime_state, + include_debug=include_debug, + chapter_title_map=chapter_title_map, + ) + runtime_chapters = list(((runtime_state or {}).get("chapters") or {}).values()) + runtime_chapters.sort(key=lambda x: int((x or {}).get("sectionOrder") or 0)) + job_payload = { + "jobId": runtime_state.get("jobId"), + "projectId": runtime_state.get("projectId"), + "templateId": runtime_state.get("templateId"), + "status": runtime_state.get("status"), + "progress": int(runtime_state.get("progress") or 0), + "currentSectionKey": runtime_state.get("currentSectionKey"), + "errorMessage": runtime_state.get("errorMessage"), + "createdAt": runtime_state.get("createdAt"), + "updatedAt": runtime_state.get("updatedAt"), + "completedAt": runtime_state.get("completedAt"), + "chapters": [ + { + "sectionKey": str(c.get("sectionKey") or ""), + "sectionTitle": str(c.get("sectionTitle") or ""), + "sectionOrder": int(c.get("sectionOrder") or 0), + "status": str(c.get("status") or "pending"), + "updatedAt": c.get("updatedAt"), + "errorMessage": c.get("errorMessage"), + } + for c in runtime_chapters + ], + } + result_payload = { + "jobId": runtime_state.get("jobId"), + "status": runtime_state.get("status"), + "report": report_text, + "consistency": [], + "chapters": [c.model_dump() for c in chapter_items], + } + return { + "job": job_payload, + "result": result_payload, + } + + +def retry_report_chapter(project_id: str, job_id: str, section_key: str, db: Session) -> GenerateReportJobItem: + project = _resolve_project(db, project_id) + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + job = ( + db.query(ReportGenerationJob) + .filter(ReportGenerationJob.id == job_id, ReportGenerationJob.project_id == project.uuid) + .first() + ) + if not job: + raise HTTPException(status_code=404, detail="任务不存在") + chapter = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id, 
ReportGenerationChapter.section_key == section_key) + .first() + ) + if not chapter: + raise HTTPException(status_code=404, detail="章节不存在") + now = datetime.now() + chapter.status = "pending" + chapter.error_message = None + chapter.updated_at = now + job.status = "running" + job.updated_at = now + db.commit() + update_job_state(job.id, status="running", errorMessage=None, completedAt=None) + update_chapter_state( + job.id, + section_key, + status="pending", + content=None, + errorMessage=None, + promptText=None, + evidencePayload=None, + validationPayload=None, + ) + _start_job_worker(job.id, section_key=section_key) + return get_report_job(project.uuid, job_id, db) + + +def cancel_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem: + project = _resolve_project(db, project_id) + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + job = ( + db.query(ReportGenerationJob) + .filter(ReportGenerationJob.id == job_id, ReportGenerationJob.project_id == project.uuid) + .first() + ) + if not job: + raise HTTPException(status_code=404, detail="任务不存在") + + now = datetime.now() + if job.status in ("completed", "failed", "cancelled"): + return get_report_job(project.uuid, job_id, db) + + chapters = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id) + .all() + ) + for c in chapters: + if c.status in ("pending", "running"): + c.status = "failed" + c.error_message = "任务已由用户取消" + c.updated_at = now + + job.status = "cancelled" + job.error_message = "任务已由用户取消" + job.current_section_key = None + job.updated_at = now + job.completed_at = now + db.commit() + update_job_state( + job.id, + status="cancelled", + errorMessage="任务已由用户取消", + currentSectionKey=None, + completedAt=_fmt_dt(now), + ) + for c in chapters: + if c.status in ("failed", "cancelled") or c.error_message == "任务已由用户取消": + update_chapter_state( + job.id, + c.section_key, + status="failed", + errorMessage="任务已由用户取消", + ) + return 
get_report_job(project.uuid, job_id, db) + + +def _start_job_worker(job_id: str, section_key: Optional[str] = None) -> None: + threading.Thread( + target=_run_job_worker, + args=(job_id, section_key), + daemon=True, + name=f"report-job-{job_id[:8]}", + ).start() + + +def _run_job_worker(job_id: str, only_section_key: Optional[str] = None) -> None: + with SessionLocal() as db: + job = db.query(ReportGenerationJob).filter(ReportGenerationJob.id == job_id).first() + if not job: + return + try: + job.status = "running" + job.error_message = None + job.updated_at = datetime.now() + db.commit() + update_job_state(job.id, status="running", errorMessage=None) + + project = db.query(Project).filter(Project.uuid == job.project_id).first() + if not project: + raise RuntimeError("项目不存在") + template = _resolve_template(db, job.template_id) + all_template_sections = _list_template_sections(db, template.id) + sections = _sections_for_generation(all_template_sections) + chapter_title_map = _build_section_title_map(all_template_sections) + chapters = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id) + .order_by(ReportGenerationChapter.section_order.asc()) + .all() + ) + chapter_map = {c.section_key: c for c in chapters} + completed_section_contents: dict[str, str] = { + c.section_key: str(c.content or "").strip() + for c in chapters + if c.status == "completed" and str(c.content or "").strip() + } + target_sections = [s for s in sections if (not only_section_key or s.section_key == only_section_key)] + retrieval = RetrievalService() + top_k = int((job.options or {}).get("topK") or 10) + completed_count = 0 + pending_sections = [] + for section in target_sections: + chapter = chapter_map.get(section.section_key) + if not chapter: + continue + if not only_section_key and chapter.status == "completed": + completed_count += 1 + continue + pending_sections.append(section) + + total_count = max(1, completed_count + len(pending_sections)) + 
logger.info( + "报告生成 job start | job=%s | project=%s | total_sections=%d | pending=%d | completed=%d | top_k=%d", + job.id, project.uuid, len(sections), len(pending_sections), completed_count, top_k, + ) + for idx, section in enumerate(pending_sections, start=1): + db.refresh(job) + if job.status == "cancelled": + return + chapter = chapter_map.get(section.section_key) + if not chapter: + continue + + section_no = _extract_section_number(section.section_title or "") + logger.info( + "报告生成 start chapter | job=%s | section=%s | title=%s | section_no=%s | idx=%d/%d", + job.id, section.section_key, section.section_title, section_no, idx, total_count, + ) + + _update_chapter_status(db, job, chapter, "running", None) + update_job_state(job.id, currentSectionKey=section.section_key) + update_chapter_state( + job.id, + section.section_key, + status="running", + errorMessage=None, + content=None, + promptText=None, + evidencePayload=None, + validationPayload={"streamPhase": "waiting"}, + ) + required_tables = _extract_required_table_tokens( + section.section_prompt or "", + _extract_section_number(section.section_title or ""), + contract_text=_effective_section_output_contract(section), + ) + if _extract_section_number(section.section_title or "") == "5.3.2": + na8 = _norm_table_token("附表8") + required_tables = [ + t for t in required_tables if _norm_table_token(str(t)) != na8 + ] + evidence, retrieval_stage = _collect_evidence_progressive( + db, + retrieval, + project.uuid, + section, + top_k=top_k, + required_tables=required_tables, + ) + prior_sibling_sections_text = _build_prior_sibling_sections_text( + section, + sections, + completed_section_contents, + ) + section_reference = _load_section_reference_for_chapter( + db, + section.section_key, + section.section_title, + template_id=template.id, + ) + logger.info( + "section_reference 注入 | section=%s | template_id=%s | 命中=%s", + section.section_key, + template.id, + "是" if section_reference else "否", + ) + prompt = 
_build_chapter_prompt( + section, + evidence, + prior_sibling_sections_text=prior_sibling_sections_text, + section_reference=section_reference, + ) + _dump_runtime_prompt( + job_id=job.id, + section_key=section.section_key, + section_title=section.section_title, + system_prompt=chapter_generation_system_prompt(), + user_prompt=prompt, + ) + stream_state = { + "buffer": "", + "phase": "waiting", + } + + def _on_content_delta(event: str, delta_text: str) -> None: + if event == "delta": + if delta_text: + stream_state["phase"] = "streaming" + stream_state["buffer"] = str(stream_state.get("buffer") or "") + delta_text + append_chapter_content( + job.id, + section.section_key, + delta_text, + stream_phase="streaming", + ) + elif event == "finalizing": + stream_state["phase"] = "finalizing" + set_chapter_stream_phase(job.id, section.section_key, "finalizing") + + content, validation, model_output = _generate_chapter_content( + section, + prompt, + on_content_delta=_on_content_delta, + ) + content = _apply_canonical_field_backfill(section, evidence, content) + _cur_section_no = _extract_section_number(section.section_title or "") + _skip_table_enforcement = _cur_section_no in {"2.1.1"} + if _skip_table_enforcement: + remaining_missing_tables = [] + content_after_tables = content + else: + content, remaining_missing_tables = _enforce_required_tables( + section, + prompt, + content, + evidence, + ) + content_after_tables = content + content = _strip_tables_from_non_table_section( + section.section_title or "", content, section=section + ) + content = _strip_forbidden_tables( + section.section_title or "", content, + ) + content, format_issues = _enforce_template_format_contract( + section, + content, + evidence, + chapter_title_map=chapter_title_map, + ) + _sec_no = _extract_section_number(section.section_title or "") + _refresh_tokens: tuple[str, ...] 
= ("表5-4",) + if _sec_no == "5.1": + _refresh_tokens = ("表5-1",) + elif _sec_no == "5.2.1": + _refresh_tokens = ("表5-2", "表5-3") + content = _refresh_element_table_markdown_tokens( + content, evidence, _refresh_tokens + ) + content = _strip_bracketed_three_part_labels(content) + content = _strip_placeholder_table_notes(content) + content = _normalize_table_captions_in_markdown(content) + content = _strip_trailing_partial_missing_markers(content) + content = _fix_numeric_line_breaks(content) + content = _cleanup_section_table_artifacts( + section.section_title or "", + content, + allowed_table_tokens=required_tables, + ) + if _sec_no == "5.3.1": + content = _refresh_element_table_markdown_tokens( + content, evidence, ("表5-4",) + ) + content = _fill_required_table_caption_stubs( + content, ["表5-4"], evidence + ) + content = _strip_orphan_markdown_table_rows(content) + content = _strip_minimal_missing_table_tail(content) + content, intra_repeat_removed = _collapse_consecutive_text_repetitions(content) + content, chapter_dedupe_removed = _dedupe_long_chapter_repetition(content) + chapter_dedupe_removed += intra_repeat_removed + if chapter_dedupe_removed > 0: + warnings = validation.get("warnings") if isinstance(validation, dict) else [] + if not isinstance(warnings, list): + warnings = [] + warnings.append( + f"章节去重:已移除 {chapter_dedupe_removed} 处重复段落/表格" + ) + validation["warnings"] = warnings + validation["chapterDedupeRemoved"] = chapter_dedupe_removed + if required_tables and not _skip_table_enforcement: + content = _restore_required_tables_safety_net( + content, + required_tables, + evidence, + content_after_tables, + ) + content = _finalize_section_table_dedupe(content, required_tables) + if remaining_missing_tables: + warnings = validation.get("warnings") if isinstance(validation, dict) else [] + if not isinstance(warnings, list): + warnings = [] + warnings.append( + "部分必需表格仍缺失,已插入占位表:" + + "、".join(remaining_missing_tables) + ) + validation["warnings"] = 
warnings + if format_issues: + warnings = validation.get("warnings") if isinstance(validation, dict) else [] + if not isinstance(warnings, list): + warnings = [] + warnings.extend([f"格式验收器:{x}" for x in format_issues][:8]) + validation["warnings"] = warnings + validation["retrievalStage"] = retrieval_stage + validation["streamPhase"] = "completed" + diagnostics = _build_field_diagnostics(section, evidence, content) + if diagnostics: + validation["fieldDiagnostics"] = diagnostics + if model_output: + validation["modelOutput"] = model_output + content = _inject_missing_parent_section_headings( + section.section_title or "", + content, + _previous_completed_section_content( + section, sections, completed_section_contents + ), + chapter_title_map, + ) + now = datetime.now() + chapter.content = content + completed_section_contents[section.section_key] = content + chapter.prompt_text = prompt[:20000] + chapter.evidence_payload = evidence + chapter.validation_payload = validation + chapter.status = "completed" + chapter.error_message = None + chapter.updated_at = now + chapter.completed_at = now + if not only_section_key: + job.progress = int((completed_count + idx) * 100 / total_count) + job.current_section_key = section.section_key + job.updated_at = now + db.commit() + dump_out_path = _dump_report_chapter_json_markdown( + job_id=job.id, + section_key=section.section_key, + section_title=section.section_title, + output_json={ + "modelOutput": model_output or {}, + "persistedChapter": { + "sectionKey": section.section_key, + "sectionTitle": section.section_title, + "sectionOrder": section.section_order, + "status": "completed", + "content": content, + "promptText": prompt[:20000], + "evidencePayload": evidence, + "validationPayload": validation, + }, + }, + ) + logger.info( + "章节生成落盘 | job=%s | section=%s | prompt_len=%s | content_len=%s | output_file=%s", + job.id, section.section_key, len(prompt[:20000]), len(content), + dump_out_path or "(已存在合并写入)", + ) + 
update_chapter_state( + job.id, + section.section_key, + status="completed", + content=content, + errorMessage=None, + promptText=prompt[:20000], + evidencePayload=evidence, + validationPayload=validation, + ) + if not only_section_key: + update_job_state( + job.id, + progress=int((completed_count + idx) * 100 / total_count), + currentSectionKey=section.section_key, + ) + else: + update_job_state(job.id, currentSectionKey=section.section_key) + + db.refresh(job) + if job.status == "cancelled": + return + + db.refresh(job) + if job.status == "cancelled": + return + + if only_section_key: + # 单章重跑不应直接终结整任务,仅回写章节并刷新任务进度。 + all_chapters = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id) + .order_by(ReportGenerationChapter.section_order.asc()) + .all() + ) + done = sum(1 for c in all_chapters if c.status == "completed") + total = max(1, len(all_chapters)) + job.progress = int(done * 100 / total) + job.current_section_key = None + job.updated_at = datetime.now() + db.commit() + update_job_state(job.id, progress=int(done * 100 / total), currentSectionKey=None) + else: + job.snapshot = None + job.status = "completed" + job.progress = 100 + job.current_section_key = None + job.completed_at = datetime.now() + job.updated_at = datetime.now() + db.commit() + logger.info( + "报告生成 job completed | job=%s | project=%s | total_chapters=%d", + job.id, project.uuid, len(sections), + ) + update_job_state( + job.id, + status="completed", + progress=100, + currentSectionKey=None, + completedAt=_fmt_dt(job.completed_at), + ) + except Exception as e: + current_section_key = job.current_section_key + logger.error( + "报告生成 job failed | job=%s | project=%s | section=%s | err=%s", + job.id, project.uuid, current_section_key, e, + ) + job.status = "failed" + job.error_message = str(e) + job.updated_at = datetime.now() + db.commit() + update_job_state(job.id, status="failed", errorMessage=str(e)) + if current_section_key: + update_chapter_state( + 
job.id, + current_section_key, + status="failed", + errorMessage=str(e), + ) + + +# 这些章节号在 L1/L2 已判「证据充足」时仍继续走 L2/L3,避免仅章节定向检索就提前结束而漏掉关键词召回。 +_SECTION_NUMBERS_FORCE_L3_KEYWORD_RETRIEVAL: frozenset[str] = frozenset({"2.1.5", "3.3.3", "3.4.1", "3.6", "3.7", "3.8", "3.10", "4.3.3","5.2.3", "6.1.1.1", "6.1.1.2", "6.2.1", "6.2.4"}) + + +def _section_forces_l3_keyword_retrieval(section: ReportTemplateSection) -> bool: + return _extract_section_number(section.section_title or "") in _SECTION_NUMBERS_FORCE_L3_KEYWORD_RETRIEVAL + + +def _collect_evidence_progressive( + db: Session, + retrieval: RetrievalService, + project_uuid: str, + section: ReportTemplateSection, + *, + top_k: int, + required_tables: Optional[list[str]] = None, +) -> tuple[dict, str]: + force_l3 = _section_forces_l3_keyword_retrieval(section) + # L1: 仅要素与结构化表 + evidence = _collect_evidence( + db, + retrieval, + project_uuid, + section, + top_k=top_k, + required_tables=required_tables, + include_chapter_docs=False, + include_keyword_docs=False, + ) + if _is_evidence_sufficient(section, evidence) and not force_l3: + return evidence, "elements_only" + + # L2: 补充章节定向检索段落 + evidence = _collect_evidence( + db, + retrieval, + project_uuid, + section, + top_k=top_k, + required_tables=required_tables, + include_chapter_docs=True, + include_keyword_docs=False, + ) + if _is_evidence_sufficient(section, evidence) and not force_l3: + return evidence, "elements_plus_chapter_docs" + + # L3: 最后补充关键词兜底检索 + evidence = _collect_evidence( + db, + retrieval, + project_uuid, + section, + top_k=top_k, + required_tables=required_tables, + include_chapter_docs=True, + include_keyword_docs=True, + ) + return evidence, "elements_plus_chapter_and_keyword_docs" + + +def _latest_element_payloads_by_row_col( + db: Session, + project_uuid: str, + row_keys: list[str], + *, + non_empty_value: bool = True, +) -> list[dict[str, Any]]: + """按 ``row_key + col_key`` 去重,保留 ``updated_at`` 最新的一条(查询已按时间倒序)。""" + if not row_keys: + return [] + q = 
db.query(ElementCell).filter( + ElementCell.project_id == project_uuid, + ElementCell.row_key.in_(row_keys), + ) + if non_empty_value: + q = q.filter(ElementCell.value.isnot(None), ElementCell.value != "") + cells = q.order_by(ElementCell.updated_at.desc()).all() + picked: dict[tuple[str, str], dict[str, Any]] = {} + for cell in cells: + rk = str(cell.row_key or "") + ck = str(cell.col_key or "") + key = (rk, ck) + if key in picked: + continue + picked[key] = { + "rowKey": cell.row_key, + "colKey": cell.col_key, + "value": str(cell.value or "")[:500], + "sourceDocumentId": cell.source_document_id, + } + return list(picked.values()) + + +def _merge_section_11_forced_elements( + forced_payloads: list[dict[str, Any]], + scored_top_payloads: list[dict[str, Any]], + *, + max_additional_scored: int = 40, +) -> list[dict[str, Any]]: + """1.1 节:先发制人并入第 1 章概况要素,再追加与其它章节相同的 Top-K 打分单元格(去重)。""" + seen: set[tuple[str, str]] = set() + out: list[dict[str, Any]] = [] + for p in forced_payloads: + key = (str(p.get("rowKey") or ""), str(p.get("colKey") or "")) + if key in seen: + continue + seen.add(key) + out.append(p) + added = 0 + for p in scored_top_payloads: + key = (str(p.get("rowKey") or ""), str(p.get("colKey") or "")) + if key in seen: + continue + seen.add(key) + out.append(p) + added += 1 + if added >= max_additional_scored: + break + return out + + +def _collect_evidence( + db: Session, + retrieval: RetrievalService, + project_uuid: str, + section: ReportTemplateSection, + *, + top_k: int, + required_tables: Optional[list[str]] = None, + include_chapter_docs: bool = True, + include_keyword_docs: bool = True, +) -> dict: + section_no = _extract_section_number(section.section_title or "") + tokens = _extract_tokens(f"{section.section_title} {section.section_prompt or ''}")[:14] + if section_no == "1.2": + # 标题词过短会导致检索跑偏;补充决策类短语提高召回 + extra = " ".join( + [ + "项目决策要点 建设必要性 立项背景", + "国VI 国Ⅵ 汽油质量升级 芳烃 烯烃 环保", + "预期目标 烷基化油 产量 辛烷值 万吨", + "可研 批复 投资 效益 利润", + ] + ) + merged = 
_extract_tokens(f"{section.section_title} {section.section_prompt or ''} {extra}") + tokens = list(dict.fromkeys(merged))[:20] + cells_query = ( + db.query(ElementCell, ElementTable.table_name) + .join(ElementTable, ElementTable.id == ElementCell.table_id) + .filter( + ElementCell.project_id == project_uuid, + ElementTable.project_id == project_uuid, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .order_by(ElementCell.updated_at.desc()) + ) + candidate_cells: list[tuple[int, dict]] = [] + for cell, table_name in cells_query.limit(800).all(): + payload = { + "tableId": cell.table_id, + "tableName": table_name, + "rowKey": cell.row_key, + "colKey": cell.col_key, + "year": cell.year, + "value": str(cell.value or "")[:500], + "sourceDocumentId": cell.source_document_id, + "sourceType": cell.source_type, + } + score = _score_element_cell_relevance( + section.section_title, + tokens, + payload.get("rowKey"), + payload.get("colKey"), + payload.get("value"), + table_name=payload.get("tableName"), + section=section, + ) + # 无 token 命中但字段语义强相关时仍保留(例如 value 内存在“项目名称:xxx”)。 + if score > 0: + candidate_cells.append((score, payload)) + elif not tokens: + candidate_cells.append((1, payload)) + candidate_cells.sort(key=lambda x: x[0], reverse=True) + matched_cells = [x[1] for x in candidate_cells[:40]] + # 1.1「项目基本情况」:强制并入「章节要素-第1章项目概况」全部非空格子,避免被全局 Top40 相关性截断挤出导致建设投资等待补充。 + if _extract_section_number(section.section_title or "") == "1.1": + ch1_rows = section_table_row_keys(CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP) + forced_ch1 = _latest_element_payloads_by_row_col(db, project_uuid, ch1_rows, non_empty_value=True) + if forced_ch1: + matched_cells = _merge_section_11_forced_elements(forced_ch1, matched_cells, max_additional_scored=40) + required = [str(t) for t in (required_tables or []) if str(t).strip()] + structured_tables = _collect_structured_tables( + db, + project_uuid, + required, + section_title=str(section.section_title or ""), + section_tokens=tokens, + 
) + chapter_docs = [] + if include_chapter_docs: + chapter_docs = retrieval.get_chapter_materials(project_uuid, section.section_title, top_k=top_k) + keyword_docs: list[dict] = [] + if include_keyword_docs and tokens: + if section_no == "1.2": + queries = [ + "国VI 国Ⅵ 汽油 质量升级 芳烃 烯烃 环保 标准", + "项目 建设 必要性 决策 依据 立项", + "预期 目标 烷基化油 产量 辛烷值 效益 万吨", + " ".join(tokens[:8]), + ] + seen: set[tuple[str, str]] = set() + for q in queries: + docs = retrieval.search_by_query(q, top_k=6, filter_project=project_uuid) + for d in docs: + did = str(d.metadata.get("doc_id", "") or "") + body = str(d.page_content or "")[:2000] + key = (did, body[:240]) + if key in seen: + continue + seen.add(key) + keyword_docs.append( + { + "heading": d.metadata.get("heading", ""), + "content": body, + "docId": did, + "query": q[:120], + } + ) + if len(keyword_docs) >= 14: + break + if len(keyword_docs) >= 14: + break + else: + q = " ".join(tokens[:5]) + docs = retrieval.search_by_query(q, top_k=10, filter_project=project_uuid) + for d in docs: + keyword_docs.append( + { + "heading": d.metadata.get("heading", ""), + "content": str(d.page_content or "")[:2000], + "docId": d.metadata.get("doc_id", ""), + } + ) + return { + "tokens": tokens, + "requiredTables": required, + "structuredTables": structured_tables, + "canonicalFields": _extract_canonical_fields( + section.section_title, matched_cells, section=section + ), + "elements": matched_cells, + "chapterDocs": chapter_docs[:top_k], + "keywordDocs": keyword_docs[:14] if section_no == "1.2" else keyword_docs[:8], + } + + +def _is_evidence_sufficient(section: ReportTemplateSection, evidence: dict) -> bool: + required_tables = evidence.get("requiredTables") if isinstance(evidence, dict) else [] + structured_tables = evidence.get("structuredTables") if isinstance(evidence, dict) else [] + elements = evidence.get("elements") if isinstance(evidence, dict) else [] + chapter_docs = evidence.get("chapterDocs") if isinstance(evidence, dict) else [] + keyword_docs = 
evidence.get("keywordDocs") if isinstance(evidence, dict) else [] + + required_count = len(required_tables) if isinstance(required_tables, list) else 0 + structured_count = len(structured_tables) if isinstance(structured_tables, list) else 0 + element_count = len(elements) if isinstance(elements, list) else 0 + chapter_doc_count = len(chapter_docs) if isinstance(chapter_docs, list) else 0 + keyword_doc_count = len(keyword_docs) if isinstance(keyword_docs, list) else 0 + + # 有必需表格时优先保证结构化表匹配覆盖 + if required_count > 0 and structured_count < min(required_count, 2): + return False + + # 没有足够要素时,需要至少一种文档证据补充 + if element_count < 6 and chapter_doc_count == 0 and keyword_doc_count == 0: + return False + + # 表格相关章节通常需要更高证据密度 + title = str(section.section_title or "") + if "表" in title and (structured_count == 0 and element_count < 10): + return False + + # 关键章节按字段完整性判定,避免“有数量但没关键字段”时误判为充足 + title_norm = re.sub(r"\s+", "", title) + if "1.1项目基本情况" in title_norm: + required_groups = [ + ["建设单位", "建设单位名称"], + ["建设地点", "厂址"], + ["建设规模", "装置规模", "能力", "万吨/年"], + ["投资", "概算", "估算", "决算"], + ] + for group in required_groups: + if not _evidence_contains_any_fact(evidence, group): + return False + + if "1.2项目决策要点" in title_norm: + required_groups = [ + ["国vi", "国ⅵ", "质量升级", "汽油标准", "环保", "环评", "排放", "清洁生产"], + ["高标号", "辛烷值", "汽油池", "产品结构", "汽油"], + ["碳四", "液化气", "原料", "物料平衡", "资源利用", "附加值"], + ["杂质", "预处理", "丁二烯", "选择性加氢", "催化剂", "甲醇", "二甲醚"], + ["万吨", "产量", "烷基化油", "效益", "利润", "营业收入", "预期", "目标"], + ] + hit_count = 0 + for group in required_groups: + if _evidence_contains_any_fact(evidence, group): + hit_count += 1 + # 至少命中 2 组:安全评价里常有杂质/物料平衡,可研/环评可补环保与目标 + if hit_count < 2: + return False + + return True + + +def _score_element_cell_relevance( + section_title: str, + tokens: list[str], + row_key: Optional[str], + col_key: Optional[str], + value: Optional[str], + *, + table_name: Optional[str] = None, + section: ReportTemplateSection | None = None, +) -> int: + table = 
str(table_name or "") + row = str(row_key or "") + col = str(col_key or "") + val = str(value or "") + full_text = f"{table} {row} {col} {val}" + full_text_l = full_text.lower() + key_text_l = f"{table} {row} {col}".lower() + score = 0 + + for t in (tokens or []): + tt = str(t or "").strip() + if not tt: + continue + if tt in full_text: + score += 1 + if table and tt in table: + score += 2 + + title_norm = re.sub(r"\s+", "", str(section_title or "")) + table_norm = re.sub(r"\s+", "", table) + if title_norm and table_norm and (title_norm in table_norm or table_norm in title_norm): + score += 8 + section_no = _extract_section_number(section_title) + if section_no and table_norm and section_no.replace(".", ""): + section_no_norm = section_no.replace(".", "") + table_no_norm = re.sub(r"\D", "", table_norm[:12]) + if table_no_norm and table_no_norm.startswith(section_no_norm): + score += 3 + + # 对关键章节字段进行强加权,降低无关单元格被截断前占位的概率。 + expected = _section_expected_fields(section_title, section) + for field in expected: + aliases = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] + alias_hit = False + for alias in aliases: + a_l = alias.lower() + if a_l in key_text_l: + score += 4 + alias_hit = True + break + if alias_hit: + continue + # 若 row/col 不包含字段名,尝试 value 中“字段:值”模式。 + if _extract_value_by_alias_from_text(val, aliases): + score += 5 + continue + # 最弱相关:value 中仅出现别名关键词。 + if any(str(a).lower() in full_text_l for a in aliases): + score += 1 + + return score + + +def _evidence_contains_any_fact(evidence: dict, keywords: list[str]) -> bool: + if not isinstance(evidence, dict): + return False + lowered_keywords = [str(k).strip().lower() for k in keywords if str(k).strip()] + if not lowered_keywords: + return False + + elements = evidence.get("elements") if isinstance(evidence.get("elements"), list) else [] + for row in elements: + if not isinstance(row, dict): + continue + row_key = str(row.get("rowKey") or "").lower() + col_key = str(row.get("colKey") or 
"").lower() + value = str(row.get("value") or "").strip() + value_l = value.lower() + if _is_missing_like(value): + continue + if any(k in row_key or k in col_key or k in value_l for k in lowered_keywords): + return True + + for doc_field in ("chapterDocs", "keywordDocs"): + docs = evidence.get(doc_field) if isinstance(evidence.get(doc_field), list) else [] + for d in docs[:12]: + if not isinstance(d, dict): + continue + text = (str(d.get("heading") or "") + " " + str(d.get("content") or "")).lower() + if any(k in text for k in lowered_keywords): + return True + return False + + +def _recover_stalled_job(db: Session, job: ReportGenerationJob) -> None: + if not job or job.status != "running": + return + now = datetime.now() + running_chapter = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id, ReportGenerationChapter.status == "running") + .order_by(ReportGenerationChapter.updated_at.asc()) + .first() + ) + if not running_chapter or not running_chapter.updated_at: + return + stale_seconds = (now - running_chapter.updated_at).total_seconds() + if stale_seconds < RUNNING_CHAPTER_STALE_SECONDS: + return + + running_chapter.status = "pending" + running_chapter.error_message = "检测到章节长时间未更新,已自动回收并重试" + running_chapter.updated_at = now + job.status = "pending" + job.error_message = None + job.current_section_key = None + job.updated_at = now + db.commit() + update_job_state(job.id, status="pending", errorMessage=None, currentSectionKey=None) + update_chapter_state( + job.id, + running_chapter.section_key, + status="pending", + errorMessage="检测到章节长时间未更新,已自动回收并重试", + content=None, + promptText=None, + evidencePayload=None, + validationPayload=None, + ) + _start_job_worker(job.id) + + +def _load_section_reference_for_chapter( + db: Session, + section_key: str, + section_title: str, + *, + template_id: Optional[str] = None, + max_chars: int = 8000, +) -> str: + """ + 从 report_section_references 表加载当前章节存储的原始章节内容(content), + 直接用于填充 
user-prompt 的 section_reference_block,不做 LLM 脱敏。 + 优先按 section_key 精确匹配,其次从标题中提取编号匹配,最后按标题模糊匹配。 + + template_id: 选中模板的 ID。传入后只注入与该模板关联(report_section_references.template_id) + 的参考范文,实现“按模板过滤参考范文”;为空则不做模板过滤(取最新一条)。 + """ + from services.reference_service import ( + load_section_reference_raw, + load_section_reference_raw_by_title, + ) + + tid = (template_id or "").strip() or None + + content = load_section_reference_raw( + db, section_key, template_id=tid, max_chars=max_chars + ) + if content: + return content + + # 兜底:按标题匹配(仍限定在同一模板内) + return load_section_reference_raw_by_title( + db, section_title, template_id=tid, max_chars=max_chars + ) + + +def _build_chapter_prompt( + section: ReportTemplateSection, + evidence: dict, + *, + prior_sibling_sections_text: str = "", + section_reference: str = "", +) -> str: + selected_example = _select_chapter_example( + section.section_title, + section.examples, + evidence, + ) + section_contract = _effective_section_output_contract(section) + section_no = _extract_section_number(section.section_title) + heading_rule = SECTION_HEADING_RULES.get(section_no, DEFAULT_HEADING_RULE) + expected_fields = _section_expected_fields(section.section_title, section) + return build_report_chapter_prompt( + section_title=section.section_title, + section_prompt=_effective_section_prompt_for_generation(section, section_contract), + required_tables_text="、".join(evidence.get("requiredTables") or []) or "无", + structured_tables_text=_render_structured_tables_for_prompt(evidence), + canonical_fields_text=_render_canonical_fields_for_prompt( + evidence, allowed_fields=expected_fields or None + ), + selected_example=selected_example, + heading_rule=heading_rule, + section_contract=section_contract, + evidence_json=json.dumps(evidence, ensure_ascii=False), + prior_sibling_sections_text=prior_sibling_sections_text, + section_reference=section_reference, + ) + + +def _generate_chapter_content( + section: ReportTemplateSection, + prompt: str, + 
on_content_delta: Optional[callable] = None, +) -> tuple[str, dict, dict]: + section_no = _extract_section_number(section.section_title or "") + logger.info( + "LLM 章节生成 start | section=%s | section_no=%s | max_tokens=%s", + section.section_key, section_no, _chapter_generation_max_tokens(section_no), + ) + obj = chat_completions_json( + system_prompt=chapter_generation_system_prompt(), + user_prompt=prompt, + temperature=0.1, + max_tokens=_chapter_generation_max_tokens(section_no), + timeout_sec=120, + on_content_delta=on_content_delta, + log_context=f"章节生成 section_key={section.section_key} | {section.section_title}", + ) + content = str(obj.get("content") or "").strip() + if not content: + content = f"{section.section_title}\n\n待补充" + # 不对章节编号/条目序号做“统一编号归一化”改写,避免破坏模板章节层级(如 2.1.1、3.4.2 等)。 + # 仅清理证据标签/引用编号等噪声。 + content = _strip_inline_evidence_labels(content) + if section_no == "1.2": + content = re.sub( + r"(?m)^[\s\u3000]*1[\s\u3000]*[)\)][\s\u3000]*项目背景[\s\u3000]*$", + "1.2.1项目背景", + content, + ) + content = re.sub( + r"(?m)^[\s\u3000]*2[\s\u3000]*[)\)][\s\u3000]*预期目标[\s\u3000]*$", + "1.2.2预期目标", + content, + ) + content = _normalize_section_12_content(content) + missing = obj.get("missingInfo") if isinstance(obj.get("missingInfo"), list) else [] + checks = obj.get("qualityChecks") if isinstance(obj.get("qualityChecks"), list) else [] + validation = { + "missingInfo": [str(x) for x in missing][:20], + "qualityChecks": [str(x) for x in checks][:20], + "warnings": _basic_warnings(section.section_title, content), + } + return content, validation, obj + + +def _normalize_ordered_item_markers(content: str) -> str: + text = _strip_inline_evidence_labels(str(content or "")) + if not text: + return text + cn_num_to_idx = { + "一": 1, + "二": 2, + "三": 3, + "四": 4, + "五": 5, + "六": 6, + "七": 7, + "八": 8, + "九": 9, + "十": 10, + } + + # Keep the first non-empty line unchanged, to avoid mutating the section title. 
+ lines = text.splitlines() + first_non_empty_idx = -1 + for i, ln in enumerate(lines): + if ln.strip(): + first_non_empty_idx = i + break + + # Convert line-leading markers such as: + # - Chinese numerals: "一、" / "(一)" + # - Arabic numerals: "1." / "2." / "1.2." / "3.1" + # into a unified "n)" style. + cn_pattern = re.compile(r"^(\s*(?:#+\s*)?(?:[-*]\s*)?)(?:(([一二三四五六七八九十]))|([一二三四五六七八九十])、)\s*") + ar_pattern = re.compile(r"^(\s*(?:#+\s*)?(?:[-*]\s*)?)(\d+(?:\.\d+)*)(?:\.)?\s+") + + def _replace_line(ln: str) -> str: + m_cn = cn_pattern.match(ln) + if m_cn: + prefix = m_cn.group(1) or "" + cn = m_cn.group(2) or m_cn.group(3) or "" + idx = cn_num_to_idx.get(cn) + if idx: + return cn_pattern.sub(f"{prefix}{idx})", ln, count=1) + return ln + + m_ar = ar_pattern.match(ln) + if m_ar: + prefix = m_ar.group(1) or "" + seq = m_ar.group(2) or "" + parts = [p for p in seq.split(".") if p] + # Use the last segment as list index: 1.2 -> 2), 3.1 -> 1) + idx = parts[-1] if parts else "" + if idx.isdigit(): + return ar_pattern.sub(f"{prefix}{int(idx)}) ", ln, count=1) + return ln + + out: list[str] = [] + for i, ln in enumerate(lines): + if i == first_non_empty_idx: + out.append(ln) + continue + out.append(_replace_line(ln)) + return "\n".join(out) + + +def _strip_inline_evidence_labels(text: str) -> str: + src = str(text or "") + if not src: + return src + cleaned = re.sub(r"【\s*证据依据\s*[::]\s*[0-9a-fA-F]{16,}\s*】", "", src) + cleaned = re.sub(r"\[\s*证据依据\s*[::]\s*[0-9a-fA-F]{16,}\s*\]", "", cleaned) + # Remove simple inline numeric citations like [1], [2] that often leak from evidence. + cleaned = re.sub(r"\[\s*\d{1,3}\s*\]", "", cleaned) + # Strip leaked meta sections from model JSON fields when they are accidentally merged into content. 
+ cleaned = re.sub( + r"(?is)\n*【\s*缺失信息说明\s*】[\s\S]*?(?=\n【\s*质量检查\s*】|\Z)", + "\n", + cleaned, + ) + cleaned = re.sub(r"(?is)\n*【\s*质量检查\s*】[\s\S]*$", "\n", cleaned) + return re.sub(r"[ \t]{2,}", " ", cleaned) + + +def _markdown_hashes_for_section_no(section_no: str) -> str: + parts = str(section_no or "").strip().split(".") + if len(parts) == 1: + return "##" + if len(parts) == 2: + return "###" + return "####" + + +def _normalize_numbered_heading_spacing(content: str) -> str: + """编号与题名之间补空格,便于前后端一致识别为标题。""" + text = str(content or "") + if not text: + return text + + def _line_repl(m: re.Match[str]) -> str: + indent, num, title = m.group(1), m.group(2), str(m.group(3) or "").strip() + parts = num.split(".") + if len(parts) < 2 or len(parts) > 4: + return m.group(0) + for part in parts: + if not part.isdigit() or int(part) < 1 or int(part) > 30: + return m.group(0) + if not title or len(title) > 36 or re.search(r"[,。;:!?]", title): + return m.group(0) + return f"{indent}{num} {title}" + + return re.sub( + r"(?m)^([\s\u3000]*)(\d+(?:\.\d+)+)\s*([\u4e00-\u9fff][^\n]{0,40})\s*$", + _line_repl, + text, + ) + + +def _normalize_markdown_heading_levels(content: str) -> str: + """三节及以上编号统一为 ####,避免 ### 与 #### 混用导致同级标题字号不一致。""" + text = str(content or "") + if not text: + return text + + def _line_repl(m: re.Match[str]) -> str: + hashes, num, tail = m.group(1), m.group(2), m.group(3) + parts = num.split(".") + if len(parts) < 3: + return m.group(0) + want = _markdown_hashes_for_section_no(num) + if hashes == want: + return m.group(0) + return f"{want} {num} {tail}" + + return re.sub( + r"(?m)^(#{1,6})\s+(\d+(?:\.\d+)+)\s+([\u4e00-\u9fff].*)$", + _line_repl, + text, + ) + + +def _normalize_section_heading_markdown(content: str) -> str: + return _normalize_markdown_heading_levels( + _normalize_numbered_heading_spacing(content) + ) + + +def _normalize_section_12_content(content: str) -> str: + """1.2 合同为纯文本编号体:首行「项目决策要点」、小节「1.2.1项目背景」无空格。 + 与模板/标题验收叠加后会重复节标题,且前端/导出无法识别为 
h4。""" + text = str(content or "").strip() + if not text: + return text + text = re.sub( + r"(?m)^[\s\u3000]*项目决策要点[\s\u3000]*(?:[::])?[\s\u3000]*\n+", + "", + text, + count=1, + ) + text = re.sub( + r"(?m)^([\s\u3000]*)(1\.2\.[12])(项目背景|预期目标)[\s\u3000]*$", + r"\1\2 \3", + text, + ) + return re.sub(r"\n{3,}", "\n\n", text).strip() + + +_CONTRACT_FIELD_LINE_RE = re.compile( + r"^\s*(\d+)[))]\s*(?P[^::\n]+)[::]\s*(?P.*)$", + re.MULTILINE, +) +_CONTRACT_FIELD_SKIP_RE = re.compile( + r"必须|不得|禁止|严禁|应|需|写|输出|背景|规则|约束|表\d|后评价|若|当|正文|首行|写作|请", +) + + +def _parse_expected_fields_from_contract(contract: str | None) -> list[str]: + """从模版输出合同解析「1) 字段名:...」连续编号字段;无则返回空。""" + text = str(contract or "").strip() + if not text: + return [] + fields: list[str] = [] + nums: list[int] = [] + for m in _CONTRACT_FIELD_LINE_RE.finditer(text): + field = str(m.group("field") or "").strip() + tail = str(m.group("tail") or "").strip() + if not field or len(field) > 10 or _CONTRACT_FIELD_SKIP_RE.search(field): + continue + if tail and not re.fullmatch(r"\.{2,}|待补充", tail) and len(tail) > 6: + continue + fields.append(field) + nums.append(int(m.group(1))) + if len(fields) < 3 or not nums or nums[0] != 1: + return [] + for i in range(1, len(nums)): + if nums[i] != nums[i - 1] + 1: + return [] + return fields + + +def _section_expected_fields( + section_title: str, + section: ReportTemplateSection | None = None, +) -> list[str]: + """从模版输出合同解析应输出字段;无编号/枚举字段时返回空(不再写死 1.1 八项)。""" + if section is not None: + contract = _effective_section_output_contract(section) + else: + contract = _section_output_contract(section_title) + parsed = _parse_expected_fields_from_contract(contract) + if parsed: + return parsed + title_norm = re.sub(r"\s+", "", str(section_title or "")) + if "1.2项目决策要点" in title_norm: + return ["规模目标", "质量目标", "效益目标"] + return [] + + +def _effective_section_prompt_for_generation( + section: ReportTemplateSection, + contract: str, +) -> str: + """模版合同为结构权威;与合同重复的 section_prompt 
不再注入,避免双源冲突。""" + stored = str(section.section_prompt or "").strip() + contract_text = str(contract or "").strip() + if stored and stored != contract_text: + return stored + return "" + + +def _field_aliases(field: str) -> list[str]: + base = str(field or "").strip() + aliases: dict[str, list[str]] = { + "项目名称": ["项目名称", "工程名称", "装置名称"], + "建设单位": ["建设单位", "业主单位", "实施单位"], + "建设地点": ["建设地点", "建设地址", "厂址", "所在地"], + "建设类型": ["建设类型", "项目类型", "新建", "改扩建"], + "起止时间": ["起止时间", "工作起止时间", "开工时间", "完工时间", "建设工期", "建设期限"], + # 抽取/填表侧常将“建设内容”写作“项目内容/工程内容/装置内容”,需兼容回填。 + "建设内容": ["建设内容", "主要建设内容", "建设范围", "项目内容", "工程内容", "装置内容"], + "建设投资": ["建设投资", "总投资", "投资估算", "项目总投资", "概算"], + "占地面积": ["占地面积", "用地面积"], + "规模目标": ["规模目标", "产量", "规模", "万吨"], + "质量目标": ["质量目标", "辛烷值", "质量升级", "国VI", "国Ⅵ"], + "效益目标": ["效益目标", "利润", "收益", "营业收入", "内部收益率", "IRR"], + } + out = aliases.get(base, []) + if base and base not in out: + out.insert(0, base) + return out[:8] + + +def _is_missing_like(value: str) -> bool: + text = str(value or "").strip() + if not text: + return True + lowered = text.lower() + missing_like = { + "待补充", + "无", + "n/a", + "na", + "-", + "—", + "——", + "暂无", + "未知", + "未提供", + } + return lowered in missing_like + + +def _normalize_land_area_value(value: str) -> str: + """ + 规范化“占地面积”字段: + - 遇到“84m×187m=15708m2”这类表达时,仅保留等号后的结果; + - 将 m2/m^2/m²/㎡ 统一为 ㎡,避免导出时出现单位显示异常。 + """ + text = str(value or "").strip() + if not text: + return text + core = text + if "=" in core: + core = core.split("=")[-1].strip() + # 中文全角等号兼容 + if "=" in core: + core = core.split("=")[-1].strip() + unified = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "㎡", core) + unified = unified.replace("m²", "㎡").replace("M²", "㎡") + unified = re.sub(r"\s*㎡", "㎡", unified) + return unified or text + + +def _normalize_canonical_field_value(field: str, value: str) -> str: + f = str(field or "").strip() + v = str(value or "").strip() + if not v: + return v + if f == "占地面积": + return _normalize_land_area_value(v) + return v + + 
+def _extract_value_by_alias_from_text(text: str, aliases: list[str]) -> str: + src = str(text or "").strip() + if not src: + return "" + for alias in aliases: + a = str(alias or "").strip() + if not a: + continue + # 支持“字段名:值”或“字段名:值”,值截取到常见分隔符前。 + pattern = rf"{re.escape(a)}\s*[::]\s*([^\n;;,,。]+)" + m = re.search(pattern, src, flags=re.IGNORECASE) + if m: + val = str(m.group(1) or "").strip() + if val and not _is_missing_like(val): + return val + return "" + + +def _is_valid_value_for_field(field: str, value: str, row_key: str = "", col_key: str = "") -> bool: + f = str(field or "").strip() + v = str(value or "").strip() + rk = str(row_key or "").strip().lower() + ck = str(col_key or "").strip().lower() + if not v or _is_missing_like(v): + return False + if f != "建设投资": + return True + + # “建设投资”仅接受金额口径,过滤收益率/回收期等财务指标,避免把 2.89 这类比率误填入。 + key_text = f"{rk} {ck}" + if any(x in key_text for x in ["收益率", "irr", "回收期", "净现值", "百分点", "利润率"]): + return False + if re.search(r"%|%|‰", v): + return False + if re.search(r"(收益率|回收期|净现值|利润率|irr)", v, flags=re.IGNORECASE): + return False + + has_amount_unit = bool(re.search(r"(万元|万|亿元|亿元人民币|元)", v)) + number_match = re.search(r"\d+(?:\.\d+)?", v) + if has_amount_unit: + return True + if not number_match: + return False + + # 无单位纯数字时,过小值大概率是比率而非投资金额(如 2.89)。 + num = float(number_match.group(0)) + return num >= 100 + + +def _extract_canonical_fields( + section_title: str, + elements: list[dict], + *, + section: ReportTemplateSection | None = None, +) -> dict[str, str]: + expected = _section_expected_fields(section_title, section) + if not expected: + return {} + rows = elements if isinstance(elements, list) else [] + out: dict[str, str] = {} + for field in expected: + aliases_raw = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] + aliases = [a.lower() for a in aliases_raw] + best_value = "" + for row in rows: + if not isinstance(row, dict): + continue + row_key = str(row.get("rowKey") or "") + col_key = 
str(row.get("colKey") or "") + value = str(row.get("value") or "").strip() + if _is_missing_like(value): + continue + key_text = f"{row_key} {col_key}".lower() + if any(a in key_text for a in aliases): + if _is_valid_value_for_field(field, value, row_key=row_key, col_key=col_key): + best_value = value + break + # 兼容 row/col 泛化时,直接从 value 文本中解析“字段: 值”。 + from_value = _extract_value_by_alias_from_text(value, aliases_raw) + if from_value and _is_valid_value_for_field(field, from_value, row_key=row_key, col_key=col_key): + best_value = from_value + break + normalized = _normalize_canonical_field_value(field, best_value) + out[field] = normalized or "待补充" + return out + + +def _render_canonical_fields_for_prompt( + evidence: dict, + *, + allowed_fields: list[str] | None = None, +) -> str: + canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else None + if not isinstance(canonical, dict) or not canonical: + return "无字段级已抽取结果。" + allowed_set = {str(f).strip() for f in (allowed_fields or []) if str(f).strip()} + lines: list[str] = [] + for field, value in canonical.items(): + f = str(field or "").strip() + if allowed_set and f not in allowed_set: + continue + v = _normalize_canonical_field_value(f, str(value or "").strip()) or "待补充" + if not f: + continue + lines.append(f"- {f}: {v}") + return "\n".join(lines) if lines else "无字段级已抽取结果。" + + +def _extract_field_value_from_docs(field: str, docs: list[dict]) -> str: + aliases = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] + if not aliases or not isinstance(docs, list): + return "" + texts: list[str] = [] + for doc in docs: + if not isinstance(doc, dict): + continue + heading = str(doc.get("heading") or "").strip() + content = str(doc.get("content") or "").strip() + merged = f"{heading}\n{content}".strip() + if merged: + texts.append(merged[:8000]) + + # 先尝试“字段: 值”类型,命中率高且更稳。 + for text in texts: + val = _extract_value_by_alias_from_text(text, aliases) + if val and not 
_is_missing_like(val): + return _normalize_canonical_field_value(field, val) + + # “建设内容”常写成段落而非冒号键值,补充宽松句式抽取。 + if field == "建设内容": + for text in texts: + for alias in aliases: + pattern = rf"{re.escape(alias)}\s*(?:为|包括|包含|主要包括)\s*([^\n。]{{12,420}})" + m = re.search(pattern, text, flags=re.IGNORECASE) + if m: + val = str(m.group(1) or "").strip(" ::;;,,") + if val and not _is_missing_like(val): + return _normalize_canonical_field_value(field, val) + return "" + + +def _merge_canonical_fields_from_docs( + section_title: str, + evidence: dict, + canonical: dict[str, str], + *, + section: ReportTemplateSection | None = None, +) -> dict[str, str]: + expected = _section_expected_fields(section_title, section) + if not expected or not isinstance(evidence, dict): + return canonical + merged = { + str(k): _normalize_canonical_field_value(str(k), str(v)) + for k, v in dict(canonical or {}).items() + } + docs: list[dict] = [] + chapter_docs = evidence.get("chapterDocs") + keyword_docs = evidence.get("keywordDocs") + if isinstance(chapter_docs, list): + docs.extend(chapter_docs) + # 1.1 项目基本情况:必须优先使用“要素管理-章节要素-第一章项目概况”的表格要素。 + # 仅当章节要素表整体为空/极少时,才允许使用 keywordDocs 做跨文档回退匹配, + # 避免将其它章节的“投资/总投资”等金额误回填到 1.1(例如建设投资被污染)。 + title_norm = re.sub(r"\s+", "", str(section_title or "")) + allow_keyword_fallback = True + if "1.1项目基本情况" in title_norm: + elements = evidence.get("elements") if isinstance(evidence.get("elements"), list) else [] + non_missing_elements = 0 + for row in elements[:80]: + if not isinstance(row, dict): + continue + v = str(row.get("value") or "").strip() + if v and not _is_missing_like(v): + non_missing_elements += 1 + if non_missing_elements >= 4: + break + # “有一定数量的非空单元格”即认为章节要素不空:禁止 keywordDocs 参与回填。 + allow_keyword_fallback = non_missing_elements < 4 + if allow_keyword_fallback and isinstance(keyword_docs, list): + docs.extend(keyword_docs) + if not docs: + return merged + for field in expected: + current = str(merged.get(field) or "").strip() + if current and 
not _is_missing_like(current): + continue + from_docs = _extract_field_value_from_docs(field, docs) + if from_docs and not _is_missing_like(from_docs): + merged[field] = _normalize_canonical_field_value(field, from_docs) + return merged + + +def _apply_canonical_field_backfill( + section: ReportTemplateSection, + evidence: dict, + content: str, +) -> str: + text = str(content or "") + canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else {} + if not isinstance(canonical, dict) or not canonical: + elements = evidence.get("elements") if isinstance(evidence, dict) else [] + canonical = _extract_canonical_fields( + section.section_title, + elements if isinstance(elements, list) else [], + section=section, + ) + canonical = _merge_canonical_fields_from_docs( + section.section_title, evidence, canonical, section=section + ) + if not canonical: + return text + repaired = text + for field in _section_expected_fields(section.section_title, section): + value = str(canonical.get(field) or "").strip() + if _is_missing_like(value): + continue + # 先按“字段名: 待补充”进行宽松替换,兼容编号/加粗等格式包装。 + broad_pattern = rf"(^.*{re.escape(field)}.*?[::]\s*)待补充(?:\s|$)" + repaired = re.sub( + broad_pattern, + rf"\g<1>{value}\n", + repaired, + flags=re.MULTILINE, + ) + labels = list(dict.fromkeys([x for x in _field_aliases(field) if str(x).strip()])) + for label in labels: + pattern = rf"({re.escape(label)}\s*[::]\s*)待补充\b" + repaired = re.sub(pattern, rf"\g<1>{value}", repaired) + # 若正文还没有落入该字段值,则追加一行显式键值,避免模型遗漏。 + if value not in repaired and re.search(rf"{re.escape(field)}\s*[::]", repaired): + repaired += f"\n{field}:{value}" + return repaired + + +def _build_field_diagnostics(section: ReportTemplateSection, evidence: dict, content: str) -> list[dict[str, Any]]: + expected = _section_expected_fields(section.section_title, section) + if not expected: + return [] + elements = evidence.get("elements") if isinstance(evidence, dict) else [] + if not isinstance(elements, list): + 
elements = [] + content_text = str(content or "") + out: list[dict[str, Any]] = [] + for field in expected: + aliases_raw = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] + aliases = [a.lower() for a in aliases_raw] + hits: list[str] = [] + for row in elements: + if not isinstance(row, dict): + continue + row_key = str(row.get("rowKey") or "") + col_key = str(row.get("colKey") or "") + value = str(row.get("value") or "").strip() + if _is_missing_like(value): + continue + key_text = f"{row_key} {col_key}".lower() + if any(a in key_text for a in aliases): + hits.append(value[:120]) + else: + from_value = _extract_value_by_alias_from_text(value, aliases_raw) + if from_value: + hits.append(from_value[:120]) + if len(hits) >= 5: + break + unique_hits = list(dict.fromkeys(hits)) + content_has_value = any((not _is_missing_like(v)) and v in content_text for v in unique_hits) + content_marked_missing = bool( + re.search( + rf"{re.escape(field)}\s*[::].*?待补充", + content_text, + flags=re.IGNORECASE | re.DOTALL, + ) + ) + status = "unknown" + if unique_hits and content_has_value: + status = "used" + elif unique_hits and content_marked_missing: + status = "extracted_but_missing_in_content" + elif unique_hits: + status = "extracted_but_not_matched" + elif content_marked_missing: + status = "not_extracted_and_missing" + out.append( + { + "field": field, + "extractedValues": unique_hits, + "contentHasExtractedValue": content_has_value, + "contentMarkedMissing": content_marked_missing, + "status": status, + } + ) + return out + + +def _section_output_contract(section_title: str) -> str: + section_no = _extract_section_number(str(section_title or "")) + if section_no in SECTION_OUTPUT_CONTRACTS: + return SECTION_OUTPUT_CONTRACTS[section_no] + return DEFAULT_SECTION_OUTPUT_CONTRACT + + +def _effective_section_output_contract(section: ReportTemplateSection) -> str: + raw = getattr(section, "section_output_contract", None) + if isinstance(raw, str) and raw.strip(): + 
return raw.strip() + return _section_output_contract(section.section_title or "") + + +def _section_requires_tables(section_title: str, *, contract_text: str | None = None) -> bool: + """判断章节合同是否包含【表格强制要求】,决定该节是否允许出现表格。""" + c = (str(contract_text or "").strip() or _section_output_contract(section_title)) + return "表格强制要求" in c + + +def _strip_tables_from_non_table_section( + section_title: str, + content: str, + *, + section: ReportTemplateSection | None = None, +) -> str: + """对无表格需求的章节,移除模型可能自行生成的 Markdown 表格。""" + if section is not None: + contract = _effective_section_output_contract(section) + else: + contract = _section_output_contract(section_title) + if "表格强制要求" in contract: + return content + if not content: + return content + + lines = content.split("\n") + out: list[str] = [] + in_table = False + for line in lines: + stripped = line.strip() + is_table_line = stripped.startswith("|") and stripped.endswith("|") + is_separator = bool(re.match(r"^\|[\s\-:|]+\|$", stripped)) if stripped else False + if is_table_line or is_separator: + if not in_table: + in_table = True + if out and out[-1].strip().startswith("###") and "表" in out[-1]: + out.pop() + continue + else: + if in_table: + in_table = False + if stripped.startswith("[ \t]*\n)*" + r"(?:[ \t]*\|[^\n]*\|[ \t]*\n)+)", + flags=re.IGNORECASE, + ) + m = pat.search(content) + return m.group(0).strip() if m else "" + + +def _find_table_insert_position(content: str, token: str, required_tables: list[str]) -> int | None: + """在 content 中找到 token 对应表应插入的位置。 + + 规则:插入到下一个必需表的表题行之前;若没有后续表,返回 None(追加到末尾)。 + """ + token_idx = None + for i, t in enumerate(required_tables): + if _norm_table_token(t) == _norm_table_token(token): + token_idx = i + break + if token_idx is None: + return None + for later_token in required_tables[token_idx + 1:]: + later_plain = re.sub(r"\s+", "", str(later_token or "")) + if not later_plain: + continue + later_re = re.escape(later_plain).replace(r"\-", r"[--—–]") + later_pat = re.compile( 
+ r"(?:^|\n)([^\n]*?" + later_re + r"[^\n]*)\n", + flags=re.IGNORECASE, + ) + m = later_pat.search(content) + if m: + pos = m.start() + if pos > 0 and content[pos] == "\n": + pos += 1 + return pos + return None + + +def _enforce_required_tables( + section: ReportTemplateSection, + prompt: str, + content: str, + evidence: dict, +) -> tuple[str, list[str]]: + required = _extract_required_table_tokens( + section.section_prompt or "", + _extract_section_number(section.section_title or ""), + contract_text=_effective_section_output_contract(section), + ) + if not required: + return content, [] + # 模板必需表优先“要素表直出”,避免模型改写结构化表中的真实数据。 + repaired = _append_authoritative_required_tables(content, required, evidence) + missing = [t for t in required if not _table_token_exists(repaired, t)] + if missing: + repaired = _append_structured_missing_tables(repaired, missing, evidence) + still_missing = [t for t in required if not _table_token_exists(repaired, t)] + if still_missing: + repaired = _repair_missing_tables(section, prompt, repaired, still_missing, evidence) + still_missing = [t for t in required if not _table_token_exists(repaired, t)] + if still_missing: + repaired = _append_minimal_missing_tables(repaired, still_missing) + # 章节间串表清理:4.3.2 仅保留运行周期统计表;4.3.3 仅保留装置运行分析表。 + repaired = _remove_cross_section_table_pollution(section.section_title or "", repaired) + # 末尾兜底:若必需表已“存在”但表体残缺(仅分隔行/缺数据行), + # 仍要强制回填要素管理中的完整结构化表。 + repaired = _ensure_required_structured_tables_integrity(repaired, required, evidence) + # 即使 missing 为空(如 5.1 已由 LLM 写出表5-1),仍须去重,避免 LLM 表 + 要素直出表并存。 + repaired = _finalize_section_table_dedupe(repaired, required) + repaired = _fill_required_table_caption_stubs(repaired, required, evidence) + repaired = _finalize_section_table_dedupe(repaired, required) + final_missing = [t for t in required if not _table_token_exists(repaired, t)] + return repaired, final_missing + + +def _extract_required_table_tokens( + section_prompt: str, + section_no: str = "", + *, + 
contract_text: Optional[str] = None, +) -> list[str]: + """ + 从模板 section_prompt 与章节输出合同(section_output_contracts)中抽取「表 x-x / 附表 x」, + 使合同内写死的「见表2-3」等也能触发 _append_authoritative_required_tables 要素直出。 + + 严格规则: + - 仅当合同中包含「【表格强制要求】」标签时,才提取正文表(表x-x)。 + - 「见附表N」「附表N~附表M」等仅为引用语,不视为本节必需内嵌的表格(含区间端点及中间附表)。 + - 附图与附表在正文之后由 _append_report_appendices 统一汇总(附图在上、附表在下)。 + """ + parts = [str(section_prompt or "").strip(), str(contract_text or "").strip()] + text = "\n".join(p for p in parts if p) + if not text: + return [] + + has_table_mandate = "表格强制要求" in text + + if not has_table_mandate: + return [] + + # 剔除「【禁止】」段落,避免将禁止示例中的表号(如"表2.6-1")误判为必需表。 + text_for_extraction = re.sub( + r"【禁止】.*?(?=【|$)", "", text, flags=re.DOTALL, + ) + + raw = re.findall( + r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*|表\s*\d+(?:\s*[.\--]\s*\d+)*)", + text_for_extraction, + ) + out: list[str] = [] + seen = set() + chapter_no = "" + m_sec = re.match(r"^\s*(\d+)", str(section_no or "")) + if m_sec: + chapter_no = m_sec.group(1) + + _REF_ONLY_PATTERN = re.compile( + r"(?:见|详见|参见|参照|详)\s*附表\s*\d+", + ) + ref_only_appendices: set[str] = set() + for m in _REF_ONLY_PATTERN.finditer(text): + tok_in_ref = re.findall(r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*)", m.group()) + for t in tok_in_ref: + ref_only_appendices.add(re.sub(r"\s+", "", t)) + + for tok in raw: + norm = re.sub(r"\s+", "", tok) + if norm.startswith("附表") and norm in ref_only_appendices: + continue + if chapter_no: + m_tok = re.match(r"^(?:附表|表)\s*(\d+)", norm) + if m_tok: + tail = norm[m_tok.end() :] + # 「表1」「表2」等为节内顺序号,首位数字不等于章号(如第二章下的表1);仅对「表2-3」「表2.6-1」等带子级编号的表号按章首数字过滤。 + if tail and tail[0] in ".--—–": + if m_tok.group(1) != chapter_no: + continue + if norm in seen: + continue + seen.add(norm) + out.append(norm) + + # 5.3.2 合同正文仅允许表5-5、表5-6;模板示例里若夹带「附表8」等,一律不纳入必需表,避免要素直出串表。 + if str(section_no or "").strip() == "5.3.2": + allow_532 = {_norm_table_token("表5-5"), _norm_table_token("表5-6")} + out = [t for t in out if _norm_table_token(t) in allow_532] + # 7.1.2 
仅内嵌表7-1;合同/模板中若夹带其他章表号,不纳入本节必需表。 + if str(section_no or "").strip() == "7.1.2": + allow_712 = {_norm_table_token("表7-1")} + out = [t for t in out if _norm_table_token(t) in allow_712] + return out[:20] + + +def _repair_missing_tables( + section: ReportTemplateSection, + prompt: str, + content: str, + missing_tables: list[str], + evidence: dict, +) -> str: + fix_prompt = build_repair_missing_tables_prompt( + section_title=section.section_title, + original_prompt=prompt, + content=content, + missing_tables=missing_tables, + evidence_json=json.dumps(evidence, ensure_ascii=False), + ) + obj = chat_completions_json( + system_prompt=repair_missing_tables_system_prompt(), + user_prompt=fix_prompt, + temperature=0.1, + max_tokens=2200, + timeout_sec=120, + log_context=f"补缺失表格 section_key={section.section_key} | {section.section_title}", + ) + new_content = str(obj.get("content") or "").strip() + return new_content or content + + +def _append_minimal_missing_tables(content: str, missing_tables: list[str]) -> str: + blocks = [content.rstrip()] + for t in missing_tables: + blocks.append( + MINIMAL_MISSING_TABLE_TEMPLATE.format( + table_name=_normalize_table_caption_number_name_gap(str(t or "").strip()) + ) + ) + return "".join(blocks).strip() + + +def _remove_cross_section_table_pollution(section_title: str, content: str) -> str: + """ + 清理 4.3.2 / 4.3.3 的跨节串表: + - 4.3.2 不允许出现“装置运行分析”表 + - 4.3.3 不允许出现“投产以来运行周期统计表” + """ + text = str(content or "") + section_no = _extract_section_number(section_title) + if section_no not in {"4.3.2", "4.3.3"}: + return text + + if section_no == "4.3.2": + forbidden_kw = "装置运行分析" + else: + forbidden_kw = "投产以来运行周期统计表" + + # 表题行 + Markdown 表格(允许表题与表格之间有空行/注释行) + md_pat = re.compile( + rf"(?:^|\n)[^\n]*{re.escape(forbidden_kw)}[^\n]*\n" + rf"(?:\s*\n|\s*\n)*" + rf"(?:\s*\|[^\n]+\|\s*\n)+", + flags=re.IGNORECASE, + ) + text = md_pat.sub("\n", text) + + # 表题行 + HTML 表格(允许表题与表格之间有空行/注释行) + html_pat = re.compile( + 
rf"(?:^|\n)[^\n]*{re.escape(forbidden_kw)}[^\n]*\n" + rf"(?:\s*\n|\s*\n)*" + rf"\s*[\s\S]*?
", + flags=re.IGNORECASE, + ) + text = html_pat.sub("\n", text) + + # 残留单独表题行(无表体)也移除,避免视觉噪音 + title_only_pat = re.compile( + rf"(?:^|\n)\s*[#>*\-\d\.\)()\s]*[^\n]*{re.escape(forbidden_kw)}[^\n]*(?=\n|$)", + flags=re.IGNORECASE, + ) + text = title_only_pat.sub("\n", text) + # 折叠多余空行 + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() + + +def _title_compare_norm(s: str) -> str: + """标题宽松比较:折叠空白,并去掉中英括号两侧多余空格。""" + t = re.sub(r"\s+", " ", str(s or "")).strip() + t = re.sub(r"\s*([((])\s*", r"\1", t) + t = re.sub(r"\s*([))])\s*", r"\1", t) + return t + + +def _heading_line_section_number(line: str) -> str: + normalized = str(line or "").strip().lstrip("#").strip() + m = re.match(r"^(\d+(?:\.\d+)*)", normalized) + return m.group(1) if m else "" + + +def _is_heading_line_for_section(line: str, section_no: str) -> bool: + if not section_no: + return False + return _heading_line_section_number(line) == section_no + + +def _strip_leading_section_heading_lines(lines: list[str], section_no: str) -> list[str]: + """去掉正文开头连续的、与 section_no 同编号的标题行(避免 prepend 后重复)。""" + trimmed = list(lines) + while trimmed: + first = trimmed[0] + if not str(first).strip(): + trimmed.pop(0) + continue + if _is_heading_line_for_section(first, section_no): + trimmed.pop(0) + while trimmed and not str(trimmed[0]).strip(): + trimmed.pop(0) + continue + break + return trimmed + + +def _replace_first_section_heading_line(content: str, section_no: str, canonical_title: str) -> str: + lines_list = content.splitlines() + for idx_l, ln in enumerate(lines_list): + if not ln.strip(): + continue + if not _is_heading_line_for_section(ln, section_no): + break + stripped = ln.strip() + section_no_heading = section_no + plain_numbered_24x = section_no_heading in { + "2.4.1", + "2.4.2", + "2.4.3", + "2.4.4", + } + if stripped.startswith("#") and not plain_numbered_24x: + hm = re.match(r"^(#+\s*)", stripped) + prefix = hm.group(1) if hm else "" + lines_list[idx_l] = (prefix + canonical_title).rstrip() 
+ else: + lines_list[idx_l] = canonical_title + break + return "\n".join(lines_list) + + +def _enforce_template_format_contract( + section: ReportTemplateSection, + content: str, + evidence: dict, + *, + chapter_title_map: Optional[dict[str, str]] = None, +) -> tuple[str, list[str]]: + issues: list[str] = [] + # 不对编号样式做统一归一化,避免破坏章节层级编号(如 1.2.1 / 2.1.3 / 3.4.2)。 + repaired = _strip_inline_evidence_labels(str(content or "").strip()) + repaired = _normalize_section_heading_markdown(repaired) + if _extract_section_number(str(section.section_title or "")) == "1.2": + repaired = _normalize_section_12_content(repaired) + + # 0) 先拆行再做标题验收,否则步骤 1 会因首行 != 标准标题而重复插入标题。 + # 0.1) 上一段正文末尾与下一小节编号粘在同一行(如「……证明了2.4.4 初步……」)。 + repaired = _split_inline_template_headings(repaired, chapter_title_map or {}) + # 0.2) 小节标题后与正文首字粘在同行(如「2.4.4 初步设计审查工作评价2017年……」)。 + repaired = _split_glued_template_heading_body(repaired, chapter_title_map or {}) + + # 1) 标题验收: + # - 普通节:首行为当前节标题(允许附加 # 前缀) + # - 每章第一节(x.1):首行为章标题,且必须包含当前节标题 + title = str(section.section_title or "").strip() + section_no_heading = _extract_section_number(title) + title_norm = _title_compare_norm(title) + non_empty_lines: list[str] = [] + for line in repaired.splitlines(): + if line.strip(): + non_empty_lines.append(line.strip().lstrip("#").strip()) + first_non_empty = non_empty_lines[0] if non_empty_lines else "" + first_matches_title = bool( + title and first_non_empty and _title_compare_norm(first_non_empty) == title_norm + ) + first_is_section_heading = bool( + title + and section_no_heading + and first_non_empty + and _is_heading_line_for_section(first_non_empty, section_no_heading) + ) + + ancestor_titles = _resolve_ancestor_titles_for_section( + section, + chapter_title_map or {}, + ) + if ancestor_titles: + required_titles = ancestor_titles + ([title] if title else []) + required_norms = [_title_compare_norm(t) for t in required_titles] + existing_lines = repaired.splitlines() + trimmed_lines = list(existing_lines) + 
existing_title_chain: list[str] = [] + while trimmed_lines: + first_line = trimmed_lines[0] + normalized = _title_compare_norm(first_line.strip().lstrip("#").strip()) + if not normalized: + trimmed_lines.pop(0) + continue + if normalized in required_norms: + existing_title_chain.append(normalized) + trimmed_lines.pop(0) + while trimmed_lines and not trimmed_lines[0].strip(): + trimmed_lines.pop(0) + continue + if ( + title + and section_no_heading + and len(existing_title_chain) == len(required_norms) - 1 + and _is_heading_line_for_section(first_line, section_no_heading) + ): + existing_title_chain.append(required_norms[-1]) + trimmed_lines.pop(0) + while trimmed_lines and not trimmed_lines[0].strip(): + trimmed_lines.pop(0) + continue + break + if existing_title_chain != required_norms: + body_lines = _strip_leading_section_heading_lines(trimmed_lines, section_no_heading) + body = "\n".join(body_lines).strip() + repaired = "\n\n".join(required_titles + ([body] if body else [])).strip() + issues.append("章节缺少父级标题链,已自动补齐") + elif title and first_is_section_heading and first_non_empty != title: + repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) + elif title and not first_matches_title and first_is_section_heading: + repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) + if first_non_empty != title: + issues.append("章节标题与模板不一致,已规范为标准标题行") + elif title and not first_matches_title: + repaired = f"{title}\n\n{repaired}".strip() + issues.append("章节标题与模板不一致,已自动补齐标准标题行") + elif title and first_matches_title and first_non_empty != title: + repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) + + # 1.42) 2.4.1~2.4.4:首行可能是「### 2.4.x …」且去 # 后与模板标题一致,此时不会进入上一分支,须去掉 Markdown 前缀。 + if ( + _extract_section_number(title) in {"2.4.1", "2.4.2", "2.4.3", "2.4.4"} + and title + ): + lns_strip = repaired.splitlines() + for _is, ln_s in enumerate(lns_strip): + if not ln_s.strip(): + continue + 
sh = ln_s.strip() + if sh.startswith("#") and _title_compare_norm(sh.lstrip("#").strip()) == title_norm: + lns_strip[_is] = title + break + repaired = "\n".join(lns_strip) + + # 1.5) 确保标题行(如 "2.4.4 初步设计审查工作评价")后面有空行, + # 否则前端 Markdown 渲染或 DOCX 导出时可能无法识别为标题。 + repaired = _ensure_heading_lines_separated(repaired) + + # 2) 标题树验收:若该节定义了固定子节顺序,缺失则自动补齐占位小节。 + section_no = _extract_section_number(title) + repaired, missing_children = _auto_append_missing_child_headings(repaired, section_no) + if missing_children: + issues.append("缺失下级小节已自动补齐:" + "、".join(missing_children[:10])) + + # 3) 仅当章节合同显式要求表格时,才做模板表格规格验收与修复。 + # 否则像 5.2.2/5.2.3/5.2.4 这类纯文字章节会被示例表误触发补表,产生脏表格。 + if not _section_requires_tables( + title, contract_text=_effective_section_output_contract(section) + ): + return _strip_inline_evidence_labels(repaired), issues + + # 4) 解析模板示例中的表规格(表名 + 表头关键字) + table_specs = _extract_template_table_specs(section.examples) + if not table_specs: + return repaired, issues + + # 4.1) 第 5 章共用示例里同时出现「表5-1/表5-2」宁夏样例与各小节真实合同(如 5.3.1 仅表5-4)。 + # 若不按合同过滤,_find_table_format_issues 会误报缺表5-1,_repair_table_format_by_template 会把表5-4「修」成样例表头。 + contract_required = _extract_required_table_tokens( + section.section_prompt or "", + section_no, + contract_text=_effective_section_output_contract(section), + ) + if contract_required: + allow = {_norm_table_token(t) for t in contract_required if _norm_table_token(t)} + narrowed = [ + s + for s in table_specs + if _norm_table_token(str(s.get("token") or "")) in allow + ] + if narrowed: + table_specs = narrowed + + table_issues = _find_table_format_issues(repaired, table_specs) + if table_issues: + issues.extend(table_issues) + repaired = _repair_table_format_by_template(section, repaired, table_specs, evidence) + # 二次验收,仍不通过则提示但不循环重试 + still = _find_table_format_issues(repaired, table_specs) + if still: + issues.extend([f"二次修正后仍存在:{x}" for x in still[:4]]) + return _strip_inline_evidence_labels(repaired), issues + + +def 
def _extract_template_table_specs(raw_examples: Optional[str]) -> list[dict]:
    """Parse table captions and header hints out of a template's example text.

    The text is scanned line by line.  A line that starts with a caption such
    as ``表3``, ``表 5-4`` or ``附表 2`` opens a table spec.  The lines after it
    are inspected for column-header hints until one of these is reached: the
    next caption, a numbered section heading, a line starting with ``注``, a
    pipe-delimited row (its first eight cells are taken as hints), a blank
    line once at least one hint exists, or eight collected hints.

    Args:
        raw_examples: Raw example text of a template section.  ``None`` and
            blank strings are accepted.

    Returns:
        At most 12 dicts with the keys ``token`` (caption with whitespace
        removed), ``title`` (caption followed by its trailing title text) and
        ``headerKeywords`` (at most 8 de-duplicated hints in source order).
        When a token occurs more than once only its first spec is kept.
    """
    caption_head = r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*"
    caption_line_re = re.compile(caption_head + r"(.*)$")
    next_caption_re = re.compile(caption_head)
    section_heading_re = re.compile(r"^\d+(?:\.\d+)*\s+")
    numeric_only_re = re.compile(r"[0-9.%()()\-~~:/\s]+")

    source = str(raw_examples or "").strip()
    if not source:
        return []
    rows = [raw.rstrip() for raw in source.splitlines()]
    total = len(rows)

    collected: list[dict] = []
    pos = 0
    while pos < total:
        caption = caption_line_re.match(rows[pos].strip())
        if caption is None:
            pos += 1
            continue

        # Walk forward from the caption gathering header hints.
        hints: list[str] = []
        cursor = pos + 1
        while cursor < total:
            current = rows[cursor].strip()
            if not current:
                cursor += 1
                if hints:
                    break
                continue
            if (
                next_caption_re.match(current)
                or section_heading_re.match(current)
                or current.startswith("注")
            ):
                break
            if "|" in current:
                # Markdown header row: take its cells, skipping separators.
                cells = [part.strip() for part in current.split("|") if part.strip()]
                hints.extend(cell for cell in cells[:8] if cell not in ("---", "—"))
                break
            # Short, non-numeric plain-text line: treat it as a header hint.
            if len(current) <= 24 and not numeric_only_re.fullmatch(current):
                hints.append(current)
            if len(hints) >= 8:
                break
            cursor += 1

        raw_token = caption.group(1)
        tail = str(caption.group(2) or "").strip()
        collected.append(
            {
                "token": re.sub(r"\s+", "", raw_token),
                "title": f"{raw_token} {tail}".strip(),
                "headerKeywords": list(dict.fromkeys(hints))[:8],
            }
        )
        # Resume scanning where the hint walk stopped.
        pos = cursor

    # Keep only the first spec seen for each token.
    unique: dict[str, dict] = {}
    for spec in collected:
        key = str(spec.get("token") or "")
        if key and key not in unique:
            unique[key] = spec
    return list(unique.values())[:12]
def _render_structured_tables_for_prompt(evidence: dict) -> str:
    """Render the structured-table evidence as Markdown for an LLM prompt.

    Reads ``evidence["structuredTables"]`` and looks at its first eight
    entries.  Every dict entry that has both a non-blank ``tableName`` and a
    non-blank ``markdown`` becomes a ``### <caption>`` heading (the caption is
    passed through ``_normalize_table_caption_number_name_gap``) followed by
    the table Markdown.

    Args:
        evidence: Evidence payload; anything that is not a dict is treated as
            having no tables.

    Returns:
        The rendered blocks joined by blank lines, or the fixed placeholder
        text when no usable table is present.
    """
    placeholder = "无结构化表格证据"
    if not isinstance(evidence, dict):
        return placeholder

    tables = evidence.get("structuredTables")
    if not isinstance(tables, list) or not tables:
        return placeholder

    rendered: list[str] = []
    for entry in tables[:8]:
        if not isinstance(entry, dict):
            continue
        caption = str(entry.get("tableName") or "").strip()
        body = str(entry.get("markdown") or "").strip()
        if caption and body:
            rendered.append(
                f"### {_normalize_table_caption_number_name_gap(caption)}\n\n{body}"
            )
    return "\n\n".join(rendered) or placeholder
def _strip_trailing_partial_missing_markers(content: str) -> str:
    """Drop a dangling trailing ``待补充`` marker from otherwise substantive lines.

    A line is rewritten only when all of the following hold:

    * it is not a bare placeholder (``待补充``, ``-待补充`` or ``*待补充`` after
      whitespace removal) -- those are kept verbatim;
    * it ends with ``待补充``, optionally followed by a ``[n]`` citation;
    * what remains after removing the marker contains at least 12 CJK/ASCII
      alphanumeric characters.

    Lines that do not end with the marker are passed through untouched.
    Previously whitespace runs were collapsed *before* testing whether
    anything had been removed, so any long line with indentation or double
    spaces was rewritten even without a marker, which flattened nested
    Markdown list indentation.

    Args:
        content: Section text; ``None`` is treated as an empty string.

    Returns:
        The cleaned text with runs of three or more newlines reduced to two
        and surrounding whitespace stripped.
    """
    text = str(content or "")
    if not text:
        return text

    trailing_marker_re = re.compile(r"\s*待补充\s*(?:\[\s*\d{1,3}\s*\])?\s*$")
    bare_placeholders = {"待补充", "-待补充", "*待补充"}

    cleaned_lines: list[str] = []
    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        if re.sub(r"\s+", "", line) in bare_placeholders:
            cleaned_lines.append(line)
            continue

        candidate, removed = trailing_marker_re.subn("", line)
        if not removed:
            # No marker on this line: leave its whitespace exactly as written.
            cleaned_lines.append(line)
            continue

        # Same tidy-up as before, now limited to lines that lost a marker.
        candidate = re.sub(r"\s{2,}", " ", candidate).rstrip()
        remainder = candidate.strip()
        core_len = len(re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", remainder))

        # Short remainders keep the marker: it is likely the only content.
        if remainder and core_len >= 12:
            cleaned_lines.append(candidate)
        else:
            cleaned_lines.append(line)

    return re.sub(r"\n{3,}", "\n\n", "\n".join(cleaned_lines)).strip()
def _is_likely_table_caption_line(line: str) -> bool:
    """Heuristically decide whether a line looks like a table caption.

    A stripped line of at most 120 characters qualifies when it starts with
    ``#``, contains ``表`` followed by a table number (digits, Chinese
    numerals or dashes), or contains ``表`` and is at most 80 characters long.

    The dash alternatives are written as explicit code points.  The earlier
    class ``[...\\--—–]`` was parsed by ``re`` as a range from U+002D to
    U+2014 and therefore also accepted letters and most punctuation.
    """
    candidate = str(line or "").strip()
    if not candidate or len(candidate) > 120:
        return False
    if candidate.startswith("#"):
        return True
    # hyphen-minus, fullwidth hyphen-minus, em dash, en dash
    if re.search(r"表\s*[\d一二三四五六七八九十\-\uFF0D\u2014\u2013]+", candidate):
        return True
    return "表" in candidate and len(candidate) <= 80


def _split_chapter_blocks_for_dedupe(content: str) -> list[str]:
    """Split section text into paragraph blocks and Markdown-table blocks.

    Blank lines separate blocks.  A run of consecutive pipe rows forms one
    table block.  Caption-like lines that belong to a table travel with it:

    * a single caption-like paragraph followed, after blank lines, by a table;
    * the trailing caption-like lines of a paragraph that directly abuts a
      table (no blank line in between).

    Each source line ends up in exactly one block.  The previous version
    scanned backwards from the table and re-inserted caption lines that had
    already been consumed as a pending caption or emitted as a paragraph.
    A caption directly above a table was therefore duplicated once the
    blocks were joined back together.

    Args:
        content: Section text; ``None`` is treated as an empty string.

    Returns:
        The blocks in document order, each as a newline-joined string.
    """
    lines = str(content or "").splitlines()
    total = len(lines)
    blocks: list[str] = []
    pending_caption: list[str] = []

    def _flush_pending_caption() -> None:
        # A caption that never met its table becomes a block of its own.
        nonlocal pending_caption
        if pending_caption:
            blocks.append("\n".join(pending_caption))
            pending_caption = []

    i = 0
    while i < total:
        if not lines[i].strip():
            i += 1
            continue

        if _is_pipe_markdown_table_row_line(lines[i]):
            table_lines: list[str] = []
            while (
                i < total
                and lines[i].strip()
                and _is_pipe_markdown_table_row_line(lines[i])
            ):
                table_lines.append(lines[i])
                i += 1
            blocks.append("\n".join(pending_caption + table_lines))
            pending_caption = []
            continue

        para_lines: list[str] = []
        abuts_table = False
        while i < total:
            if not lines[i].strip():
                i += 1
                break
            if _is_pipe_markdown_table_row_line(lines[i]):
                abuts_table = True
                break
            para_lines.append(lines[i])
            i += 1

        if abuts_table:
            # Peel caption-like lines off the paragraph tail so they are
            # attached to the table once instead of appearing in both blocks.
            cut = len(para_lines)
            while cut > 0 and _is_likely_table_caption_line(para_lines[cut - 1]):
                cut -= 1
            _flush_pending_caption()
            if cut:
                blocks.append("\n".join(para_lines[:cut]))
            pending_caption = para_lines[cut:]
            continue

        if len(para_lines) == 1 and _is_likely_table_caption_line(para_lines[0]):
            _flush_pending_caption()
            pending_caption = para_lines
            continue

        _flush_pending_caption()
        blocks.append("\n".join(para_lines))

    _flush_pending_caption()
    return blocks
str(content or "") + if len(text) <= _CHAPTER_CONTENT_DEDUPE_MIN_CHARS: + return text, 0 + + blocks = _split_chapter_blocks_for_dedupe(text) + if len(blocks) < 2: + return text, 0 + + kept: list[str] = [] + seen_table_hdr: set[str] = set() + seen_table_fp: set[str] = set() + seen_text_fp: set[str] = set() + kept_text_samples: list[str] = [] + removed = 0 + + for block in blocks: + core_len = _chapter_block_core_len(block) + kind, key = _chapter_block_duplicate_key(block) + is_dup = False + + if kind == "table": + hdr = _extract_table_header_key(block) if key else "" + fp = _markdown_table_body_fingerprint(block) + if hdr and hdr in seen_table_hdr: + is_dup = True + elif fp and fp in seen_table_fp: + is_dup = True + elif key and key in seen_text_fp: + is_dup = True + elif core_len >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN: + for prev in kept_text_samples: + if _chapter_blocks_near_duplicate(block, prev): + is_dup = True + break + + if is_dup: + removed += 1 + continue + + if kind == "table": + hdr = _extract_table_header_key(block) + fp = _markdown_table_body_fingerprint(block) + if hdr: + seen_table_hdr.add(hdr) + if fp: + seen_table_fp.add(fp) + elif key: + seen_text_fp.add(key) + if core_len >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN: + kept_text_samples.append(block) + + kept.append(block) + + if removed <= 0: + return text, 0 + + merged = "\n\n".join(b.strip() for b in kept if b.strip()) + return re.sub(r"\n{3,}", "\n\n", merged).strip(), removed + + +def _dedupe_structured_table_hits(hits: list[dict]) -> list[dict]: + """ + 同一必需表 token 可能命中多张历史表或重复 markdown;合并输出会导致章节内连续两张相同表。 + 按 tableId 与表体指纹去重,保留表体最完整的一条(表5-4 优先可研/实际/增减结构)。 + """ + ranked: list[tuple[int, dict]] = [] + for hit in hits: + if not isinstance(hit, dict): + continue + md = str(hit.get("markdown") or "").strip() + if not md: + continue + fp = _markdown_table_body_fingerprint(md) + if not fp: + continue + ranked.append((_score_structured_table_hit_dict(hit), hit)) + ranked.sort(key=lambda x: (-x[0], 
def _433_op_analysis_table_has_canonical_caption(text: str, table_block_start: int) -> bool:
    """Tell whether the canonical 表4-2 caption appears just above a table.

    Looks at the last 18 lines of ``text`` before ``table_block_start`` and
    requires both the phrase ``烷基化装置运行分析`` and, after all whitespace
    is removed, ``表4`` + dash + ``2``.

    The dash is matched against an explicit list: hyphen-minus, fullwidth
    hyphen-minus, em dash and en dash.  The former class ``[--—–]`` was
    parsed as the range U+002D..U+2014 plus the en dash.  It therefore
    accepted ``表412`` or ``表4.2`` as if they were 表4-2, while missing the
    fullwidth hyphen (U+FF0D) that the four listed alternatives presumably
    intended to cover.

    Args:
        text: Full section text.
        table_block_start: Character offset at which the table block starts.

    Returns:
        ``True`` when the canonical caption is found in the look-behind window.
    """
    before = str(text or "")[: int(table_block_start)].rstrip()
    window = "\n".join(before.split("\n")[-18:])
    if "烷基化装置运行分析" not in window:
        return False
    compact = re.sub(r"\s+", "", window)
    return bool(re.search(r"表4[-\uFF0D\u2014\u2013]2", compact))
re.compile(r"(?m)(?:^\s*\|.+\|\s*\n){3,}") + matches = list(pat.finditer(text)) + if len(matches) < 2: + return text + + items: list[dict] = [] + for m in matches: + block = m.group(0) + hdr = _extract_table_header_key(block) + if not _is_433_operation_analysis_table_header(hdr): + continue + fp = _markdown_table_body_fingerprint(block) + items.append( + { + "m": m, + "hdr": hdr, + "fp": fp, + "cap": _433_op_analysis_table_has_canonical_caption(text, m.start()), + } + ) + + n = len(items) + if n < 2: + return text + + parent = list(range(n)) + + def find(x: int) -> int: + if parent[x] != x: + parent[x] = find(parent[x]) + return parent[x] + + def union(x: int, y: int) -> None: + rx, ry = find(x), find(y) + if rx != ry: + parent[ry] = rx + + for i in range(n): + for j in range(i + 1, n): + a, b = items[i], items[j] + same_hdr = bool(a["hdr"] and a["hdr"] == b["hdr"]) + same_fp = bool(a["fp"] and a["fp"] == b["fp"]) + if same_hdr or same_fp: + union(i, j) + + clusters: dict[int, list[int]] = {} + for i in range(n): + r = find(i) + clusters.setdefault(r, []).append(i) + + remove_spans: list[tuple[int, int]] = [] + for _root, idxs in clusters.items(): + if len(idxs) < 2: + continue + idxs_sorted = sorted(idxs, key=lambda ii: items[ii]["m"].start()) + caps = [ii for ii in idxs_sorted if items[ii]["cap"]] + keep_idx = caps[0] if caps else idxs_sorted[0] + for ii in idxs_sorted: + if ii == keep_idx: + continue + m = items[ii]["m"] + start = m.start() + prefix = text[:start].rstrip("\n") + last_nl = prefix.rfind("\n") + title_line = prefix[last_nl + 1 :] if last_nl >= 0 else prefix + tl = title_line.strip() + if "烷基化装置运行分析" in tl and re.search( + r"表4[--—–]2", re.sub(r"\s+", "", tl) + ): + start = last_nl + 1 if last_nl >= 0 else 0 + before = text[:start] + if before.rstrip().endswith("-->"): + comment_start = before.rstrip().rfind(""): + comment_start = before.rstrip().rfind(""): + comment_start = before.rstrip().rfind("\s*\n)*" + r"(?:\s*\|[^\n]+\|\s*\n)+", + 
flags=re.IGNORECASE, + ) + text = md1.sub("\n", text) + # 「###」独占行后再起表题(与 DOCX 导出兼容) + md2 = re.compile( + rf"(?:^|\n)(?:\s*#{{1,6}}\s*\n)+(?:\s*\n)*" + rf"(?:[^\n]*{kw8}[^\n]*\n(?:\s*[^\n]*{kwname}[^\n]*\n)?)" + r"(?:\s*\n|\s*\n)*" + r"(?:\s*\|[^\n]+\|\s*\n)+", + flags=re.IGNORECASE, + ) + text = md2.sub("\n", text) + html_pat = re.compile( + rf"(?:^|\n)[^\n]*{kw8}[^\n]*{kwname}[^\n]*\n" + r"(?:\s*\n|\s*\n)*" + r"\s*[\s\S]*?
", + flags=re.IGNORECASE, + ) + text = html_pat.sub("\n", text) + title_only = re.compile( + rf"(?:^|\n)(?:\s*#{{1,6}}\s*\n)+(?:\s*\n)*[^\n]*{kw8}[^\n]*(?:{kwname})?[^\n]*(?=\n|$)", + flags=re.IGNORECASE, + ) + text = title_only.sub("\n", text) + title_only2 = re.compile( + rf"(?:^|\n)\s*[#>*\-\d\.\)()\s]*[^\n]*{kw8}[^\n]*{kwname}[^\n]*(?=\n|$)", + flags=re.IGNORECASE, + ) + text = title_only2.sub("\n", text) + return re.sub(r"\n{3,}", "\n\n", text).strip() + + +def _pipe_markdown_row_cells(line: str) -> list[str]: + raw = str(line or "").rstrip("\n") + s = raw.strip() + if not s.startswith("|") or not s.endswith("|"): + return [] + inner = s[1:-1] + return [p.strip() for p in inner.split("|")] + + +def _strip_md_cell_noise(s: str) -> str: + t = re.sub(r"\*+", "", str(s or "")) + t = re.sub(r"", "", t, flags=re.I) + return t.strip() + + +def _strip_532_table55_bad_markdown_columns(content: str) -> str: + """去掉正文中「表5-5 主要生产经营指标」Markdown 表的多余列(如「后评价-时点点后预测值」及冗余裸预测列)。""" + text = str(content or "") + if not text or "主要生产经营指标" not in text: + return text + fc = "后评价时点后预测值" + + def _bad_header_indices(header_cells: list[str]) -> set[int]: + bad: set[int] = set() + comp_cells = [_compact_zh_ident(_strip_md_cell_noise(h)) for h in header_cells] + has_slot = False + for i, h in enumerate(header_cells): + hs = str(h or "") + parts = _split_group_year_col_key(hs) + if parts and parts[0] == fc: + tail = parts[1].strip() + if _appendix_norm_year_tail(tail) or ( + _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(tail) and 1900 <= int(tail) <= 2100 + ): + has_slot = True + break + if re.search(r"后评价时点后预测值\s*[--—–]\s*\d{4}", hs): + has_slot = True + break + fc_c = _compact_zh_ident(fc) + for i, c in enumerate(comp_cells): + if "时点点后" in c: + bad.add(i) + if has_slot and c == fc_c: + bad.add(i) + return bad + + def _drop_cols_from_pipe_block(block: str, drop_idx: set[int]) -> str: + if not drop_idx: + return block + out_lines: list[str] = [] + for ln in block.splitlines(): + if not 
ln.strip().startswith("|"): + out_lines.append(ln) + continue + cells = _pipe_markdown_row_cells(ln) + if not cells: + out_lines.append(ln) + continue + new_cells = [c for j, c in enumerate(cells) if j not in drop_idx] + if len(new_cells) == len(cells): + out_lines.append(ln) + else: + out_lines.append("| " + " | ".join(new_cells) + " |") + return "\n".join(out_lines) + + rx = re.compile( + r"((?:^|\n)[^\n]*表\s*5\s*[--\..·]\s*5[^\n]*主要生产经营指标[^\n]*\n)" + r"(?:\s*\n|\s*\n)*" + r"((?:^[ \t]*\|[^\n]+\|\s*\n)+)", + flags=re.MULTILINE | re.IGNORECASE, + ) + + def _repl(m: re.Match) -> str: + prefix, body = m.group(1), m.group(2) + tbl_lines = [ + ln + for ln in body.splitlines() + if ln.strip().startswith("|") and ln.strip().endswith("|") + ] + if len(tbl_lines) < 2: + return m.group(0) + hdr = _pipe_markdown_row_cells(tbl_lines[0]) + if not hdr: + return m.group(0) + drop = _bad_header_indices(hdr) + if not drop: + return m.group(0) + return prefix + _drop_cols_from_pipe_block(body, drop) + + return rx.sub(_repl, text) + + +def _cleanup_section_table_artifacts( + section_title: str, + content: str, + *, + allowed_table_tokens: Optional[list[str]] = None, +) -> str: + section_no = _extract_section_number(str(section_title or "")) + text = _strip_unallowed_table_references( + str(content or ""), + allowed_table_tokens=allowed_table_tokens, + ) + # 末尾兜底:防止中间步骤再次引入 4.3.2/4.3.3 串表。 + if section_no in {"4.3.2", "4.3.3"}: + text = _remove_cross_section_table_pollution(section_title, text) + if section_no == "4.3.3": + text = _dedupe_433_alkylation_operation_analysis_markdown_tables(text) + if section_no == "5.3.2": + text = _strip_532_embedded_appendix8_table(text) + text = _strip_532_table55_bad_markdown_columns(text) + if section_no == "3.3.1": + return _strip_331_table_crossrefs(text) + if section_no in {"2.1.5", "3.3.2", "3.3.4", "5.1", "5.3.1", "5.3.2"}: + text = _finalize_section_table_dedupe(text, allowed_table_tokens) + if section_no == "3.3.3": + return 
def _chapter5_opening_heading_present(text: str) -> bool:
    """Tell whether the text already opens with the chapter-5 title line.

    Only the first line of the stripped text is examined, with any leading
    ``#`` markers removed.  Both the ``第5章 …投资与经济效益评价`` form and the
    ``5 投资与经济效益评价`` form count.
    """
    trimmed = str(text or "").strip()
    if not trimmed:
        return False
    opening = trimmed.split("\n", 1)[0].strip()
    if opening.startswith("#"):
        opening = opening.lstrip("#").strip()
    long_form = "第5章" in opening and "投资与经济效益评价" in opening
    return long_form or bool(re.match(r"^5\s+投资与经济效益评价", opening))


def _canonicalize_chapter5_shell_heading_line(text: str) -> str:
    """Rewrite the first ``第5章 …投资与经济效益评价`` line as ``5 投资与经济效益评价``.

    Lines are compared after stripping whitespace and leading ``#`` markers.
    Only the first matching non-blank line is replaced; every other line is
    returned unchanged.
    """
    rows = str(text or "").split("\n")
    for index, row in enumerate(rows):
        if not row.strip():
            continue
        bare = row.strip().lstrip("#").strip()
        if bare.startswith("第5章") and "投资与经济效益评价" in bare:
            rows[index] = "5 投资与经济效益评价"
            break
    return "\n".join(rows)


def _section_heading_present_in_text(text: str, heading_title: str) -> bool:
    """Tell whether the given section heading already occurs in the text.

    A non-blank line counts as a hit when, after dropping leading ``#``
    markers, its normalised form equals the normalised ``heading_title`` or
    it is a heading line for the same section number.  For section ``5`` an
    opening chapter-5 title line also counts.
    """
    if not str(text or "").strip() or not str(heading_title or "").strip():
        return False

    wanted_norm = _title_compare_norm(heading_title)
    number = _extract_section_number(heading_title)
    for raw in str(text).splitlines():
        bare = raw.strip()
        if not bare:
            continue
        plain = bare.lstrip("#").strip()
        if _title_compare_norm(plain) == wanted_norm:
            return True
        if number and _is_heading_line_for_section(plain, number):
            return True
    return bool(number == "5" and _chapter5_opening_heading_present(text))


def _inject_missing_parent_section_headings(
    section_title: str,
    content: str,
    previous_section_content: str,
    chapter_title_map: dict[str, str],
) -> str:
    """Prepend ancestor section titles that have not been emitted yet.

    The ancestors of ``section_title`` are resolved through
    ``_resolve_ancestor_titles_for_section``.  Each ancestor heading found in
    neither the current body nor ``previous_section_content`` is placed in
    front of the body, separated by blank lines.  ``content`` is returned
    unchanged when the title map is empty, no ancestors are resolved or the
    body is blank.
    """
    if not chapter_title_map:
        return content

    probe = SimpleNamespace(section_title=str(section_title or "").strip())
    parents = _resolve_ancestor_titles_for_section(probe, chapter_title_map)
    if not parents:
        return content

    body = str(content or "").strip()
    if not body:
        return content
    if _extract_section_number(section_title or "") == "5.1":
        body = _canonicalize_chapter5_shell_heading_line(body)

    earlier = str(previous_section_content or "")
    absent = [
        parent
        for parent in parents
        if not (
            _section_heading_present_in_text(body, parent)
            or _section_heading_present_in_text(earlier, parent)
        )
    ]
    if not absent:
        return body
    return "\n\n".join([*absent, body]).strip()


def _inject_chapter5_title_before_section_51(
    section_key: str,
    content: str,
    previous_section_content: str,
    *,
    section_title: str = "",
    chapter_title_map: Optional[dict[str, str]] = None,
) -> str:
    """Legacy entry point; defers to the generic parent-heading injection.

    With both ``chapter_title_map`` and ``section_title`` supplied the call is
    forwarded to ``_inject_missing_parent_section_headings``.  Otherwise only
    section key ``5-1`` is handled: the chapter-5 title is prepended unless
    the body or the previous section already opens with it.
    """
    if chapter_title_map and section_title:
        return _inject_missing_parent_section_headings(
            section_title, content, previous_section_content, chapter_title_map
        )
    if str(section_key or "").strip() != "5-1":
        return content

    body = _canonicalize_chapter5_shell_heading_line(str(content or "").strip())
    if not body:
        return content
    already_titled = _chapter5_opening_heading_present(
        body
    ) or _chapter5_opening_heading_present(previous_section_content)
    return body if already_titled else f"5 投资与经济效益评价\n\n{body}"
def _previous_completed_section_content(
    section: ReportTemplateSection,
    sections: list[ReportTemplateSection],
    completed_contents: dict[str, str],
) -> str:
    """Return the nearest earlier section body that has already been generated.

    ``section`` is located in ``sections`` by ``section_key``.  The sections
    before it are then walked backwards, and the first non-blank entry found
    in ``completed_contents`` is returned stripped.  An empty string is
    returned when the section is not listed or nothing earlier has content.
    """
    ordered = list(sections or [])
    position = None
    for index, candidate in enumerate(ordered):
        if candidate.section_key == section.section_key:
            position = index
            break
    if position is None:
        return ""

    for earlier in reversed(ordered[:position]):
        text = str(completed_contents.get(earlier.section_key) or "").strip()
        if text:
            return text
    return ""


def _prev_line_invites_metric_continuation(prev_line: str) -> bool:
    """Tell whether a line looks cut off just before a metric value.

    Table rows and numbered section headings never qualify.  Otherwise the
    line qualifies when it ends with citation markers such as ``[50]``, with
    a metric noun (optionally followed by ``为``), with a verb such as
    ``升至`` / ``达到``, or with one of ``至为是到``.
    """
    tail = str(prev_line or "").strip()
    if not tail or tail.startswith("|"):
        return False
    if re.match(r"^\s{0,3}#{0,6}\s*\d+(?:\.\d+)+\s+[\u4e00-\u9fff]", tail):
        return False

    continuation_patterns = (
        r"(?:\[\d+\]\s*)+$",
        r"(?:单耗|电耗|能耗|水耗|物耗|损失|运行值|设计值|加工量|负荷|占比)为?"
        r"\s*(?:\[\d+\]\s*)*$",
        r"(上升至|升至|降至|下降为|提高为|降低为|为|达到|至)\s*(?:\[\d+\]\s*)*$",
        r"[至为是到]$",
    )
    return any(re.search(pattern, tail) for pattern in continuation_patterns)


def _merge_orphan_energy_metric_lines(text: str) -> str:
    """Fold stray metric-value lines back into the line they continue.

    A line qualifies when, after removing a leading ``#`` heading prefix, it
    starts with a number followed by one of the units ``MJ/t``, ``kWh/t``,
    ``kgce/t`` or ``t产品``, and the number is not judged to be a section
    number.  It is appended to the closest preceding non-blank output line if
    that line invites a continuation; otherwise it is kept as is.
    """
    from services.docx_export_service import _is_likely_section_number

    metric_re = re.compile(
        r"^\s*(?:#{1,6}\s+)?(\d+(?:\.\d+)?)\s+(MJ/t|kWh/t|kgce/t|t产品)",
        re.IGNORECASE,
    )
    merged: list[str] = []
    for raw in str(text or "").split("\n"):
        bare = re.sub(r"^#{1,6}\s+", "", str(raw or "").strip())
        hit = metric_re.match(bare)
        if hit is not None and not _is_likely_section_number(hit.group(1)):
            # Find the closest earlier line that is not blank.
            anchor = len(merged) - 1
            while anchor >= 0 and not str(merged[anchor] or "").strip():
                anchor -= 1
            if anchor >= 0 and _prev_line_invites_metric_continuation(
                str(merged[anchor] or "")
            ):
                merged[anchor] = merged[anchor].rstrip() + bare
                continue
        merged.append(raw)
    return "\n".join(merged)
def _fix_numeric_line_breaks(content: str) -> str:
    """Rejoin numbers, units and dates that were split across a line break.

    Only breaks with obvious numeric continuity are merged, so ordinary
    paragraph breaks are meant to stay untouched.  Newlines around Markdown
    table rows and in front of numbered section headings are swapped for
    placeholder tokens before the merge rules run, and restored at the end.

    Args:
        content: Section text; ``None`` is treated as an empty string.

    Returns:
        The text after the merge rules, with ``m2``/``m3`` rewritten as
        ``m²``/``m³``.  Empty input is returned as an empty string.
    """
    text = str(content or "")
    if not text:
        return text
    # Normalise every line-separator variant to "\n" so the rules below see
    # one kind of break (covers \r\n, \r, \u2028 and \u2029).
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\u2028", "\n").replace("\u2029", "\n")

    # Protect the newlines between table rows, and between a table row and
    # the surrounding text; otherwise the merge rules could glue the last row
    # to the following line and create a spurious column.
    # A line counts as a table row when it starts or ends with "|".  Each such
    # line is wrapped in a token, the adjacent newlines are folded into the
    # token, and the token is turned back into "\n" at the end.
    # NOTE(review): because every token becomes "\n" on restore, a table row
    # on the first or last line gains a leading/trailing newline.
    table_nl_token = "\u0000TABLE_NL\u0000"
    _lines = text.split("\n")
    for _li in range(len(_lines)):
        _stripped = _lines[_li].strip()
        _is_table = _stripped.startswith("|") or _stripped.endswith("|")
        if _is_table:
            _lines[_li] = table_nl_token + _lines[_li] + table_nl_token
    text = "\n".join(_lines)
    text = text.replace(table_nl_token + "\n" + table_nl_token, table_nl_token)
    text = text.replace("\n" + table_nl_token, table_nl_token)
    text = text.replace(table_nl_token + "\n", table_nl_token)

    # Protect the newline in front of a numbered heading (a section number
    # followed by at least two CJK characters) from the numeric merge rules.
    heading_nl_token = "\u0000HEADING_NL\u0000"
    text = re.sub(
        r"\n(?=\s*\d+(?:\.\d+)*\s+[\u4e00-\u9fff]{2,}(?:\s|$))",
        heading_nl_token,
        text,
    )

    # Around the break only horizontal whitespace is allowed (no "\n").  With
    # \s* the first newline of a blank line would be consumed, which defeats
    # the \n(?!\n) guard and glues a heading to the paragraph after it.
    _hsp = r"[ \t\u3000]*"
    # Digit, then a break, then a digit / date-or-unit character / letter.
    text = re.sub(
        rf"(?<=\d){_hsp}\n(?!\n){_hsp}(?=(?:\d|[年月日时分秒度%%℃吨米台套项]|[A-Za-z]))",
        "",
        text,
    )
    # CJK text, then a break, then a number with a unit.  The negative
    # lookahead keeps list markers such as "1)" or "1." from being merged.
    text = re.sub(
        rf"(?<=[\u4e00-\u9fff]){_hsp}\n(?!\n){_hsp}(?=\d+(?:\.\d+)?\s*(?![))、.])(?:[年月日时分秒度%%℃吨米台套项个级亩万亿千百十gGlLmMkKvVaAwWhHzHPp]|[A-Za-z]{{1,4}}\b))",
        "",
        text,
    )
    # Digit or magnitude character, then a break, then a unit word.
    text = re.sub(
        rf"(?<=[\d万亿千百十]){_hsp}\n(?!\n){_hsp}(?=(?:工时|吨/年|万吨/年|亿元|万元|万人|m2|m3|m²|m³|项|台|套|个|座|处|条|次|年|月|日))",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # Units split in the middle: letter or "/" before a digit, and letter
    # before letter.  These two rules use \s*, so they also span blank lines.
    text = re.sub(r"(?<=[A-Za-z/])\s*\n\s*(?=\d)", "", text)
    text = re.sub(r"(?<=[A-Za-z])\s*\n\s*(?=[A-Za-z])", "", text)
    # Energy/consumption figures pushed onto their own line, either after
    # citation markers like "[93][94]" or after CJK text / a closing bracket.
    _metric_num = r"\d+(?:\.\d+)?\s*(?:MJ/t|kWh/t|kgce/t|t产品)"
    text = re.sub(
        rf"((?:\[\d+\]\s*)+)\s*\n+\s*({_metric_num})",
        r"\1 \2",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(
        rf"(?<=[\u4e00-\u9fff)\])])\s*\n+\s*({_metric_num})",
        r" \1",
        text,
        flags=re.IGNORECASE,
    )
    text = _merge_orphan_energy_metric_lines(text)

    # Normalise area/volume units: m2/m3 -> m²/m³ (tolerates spaces, case
    # differences and the "^" notation).
    text = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "m²", text)
    text = re.sub(r"(?i)\bm\s*(?:\^?\s*3)\b", "m³", text)
    # Restore the protected newlines.
    text = text.replace(heading_nl_token, "\n")
    text = text.replace(table_nl_token, "\n")
    return text
row_order: + return "" + col_order = [str(c).strip() for c in (spec[0] or []) if str(c).strip()] + if not col_order: + return "" + md, _ = _render_markdown_table(full_name, row_order, col_order, {}) + return str(md or "").strip() + + +def _authoritative_block_for_required_table(token: str, evidence: dict) -> str | None: + """要素直出整块:优先 structuredTables 中的 markdown,否则用模版骨架表。""" + table_rows = evidence.get("structuredTables") if isinstance(evidence, dict) else [] + title = str(token or "").strip() + md = "" + if isinstance(table_rows, list): + best_row: dict | None = None + best_row_score = -1 + for row in table_rows: + if not isinstance(row, dict): + continue + token_hit = str(row.get("token") or "") + table_name_hit = str(row.get("tableName") or "") + if _table_token_matches_name(token, token_hit) or _table_token_matches_name( + token, table_name_hit + ): + row_md = str(row.get("markdown") or "").strip() + if not row_md: + continue + row_score = ( + _score_structured_table_hit_dict(row) + if _table_token_matches_name(token, "表5-4") + else len(row_md) + ) + if row_score > best_row_score: + best_row_score = row_score + best_row = row + if best_row: + md = str(best_row.get("markdown") or "").strip() + title = str(best_row.get("tableName") or token).strip() or token + if not md: + sk = _skeleton_markdown_for_table_token(token, table_name=title) + if sk: + md = sk + canon = _canonical_global_table_name_for_token(token) + if canon: + title = canon + if not md: + return None + return ( + f"{title}\n\n" + "\n" + f"{md}" + ) + + +def _fill_required_table_caption_stubs( + content: str, required_tables: list[str], evidence: dict +) -> str: + """将仅有表题、段内无 Markdown 表体的必需表替换为要素直出或模版骨架。""" + text = str(content or "") + changed = False + for token in required_tables or []: + if not _table_token_caption_line_re(token).search(text): + continue + seg = _segment_after_table_caption(text, token) + if _segment_has_markdown_table_body(seg): + seg_tbl = 
re.search(r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", seg) + if not ( + _table_token_matches_name(token, "表5-4") + and seg_tbl + and _is_table54_simplified_extract_body(seg_tbl.group(0)) + ): + continue + block = _authoritative_block_for_required_table(token, evidence) + if not block: + continue + text = _replace_caption_stub_with_authoritative_table(text, token, block) + changed = True + return text if changed else content + + +def _append_structured_missing_tables(content: str, missing_tables: list[str], evidence: dict) -> str: + out_content = str(content or "").rstrip() + used = False + for token in missing_tables: + block = _authoritative_block_for_required_table(token, evidence) + if not block: + continue + if _table_token_caption_line_re(token).search(out_content): + out_content = _replace_caption_stub_with_authoritative_table( + out_content, token, block + ) + else: + out_content = out_content + "\n\n" + block + used = True + return out_content.strip() if used else content + + +def _replace_llm_table_with_authoritative(content: str, token: str, replacement_md: str) -> str: + """将 LLM 自行生成的同 token 表格(表题行 + 表格体)替换为要素管理直出内容。 + + 关键:管道行匹配使用 ``[ \\t]*`` 而非 ``\\s*``,防止 ``\\s`` 跨越空行 + 把分析文字中的 token 引用误关联到远处另一张表的管道行。 + 表题行与首条管道行之间允许至多一个空行(``\\n?``)。 + """ + token_plain = re.sub(r"\s+", "", str(token or "")) + if not token_plain or not replacement_md: + return content + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + md_table_pat = re.compile( + r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n)" + r"(\n?(?:[ \t]*\|[^\n]+\|[ \t]*\n)+)", + flags=re.IGNORECASE, + ) + m = md_table_pat.search(content) + if m: + return content[:m.start()] + "\n" + replacement_md + "\n\n" + content[m.end():].lstrip("\n") + html_table_pat = re.compile( + r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n)" + r"(\s*[\s\S]*?
)", + flags=re.IGNORECASE, + ) + m = html_table_pat.search(content) + if m: + return content[:m.start()] + "\n" + replacement_md + "\n\n" + content[m.end():].lstrip("\n") + return content + + +def _caption_followed_by_element_table_comment(content: str, token: str) -> bool: + """仅当「本表表题行后」紧跟要素直出注释时,才视为已权威化,避免全篇任一注释误伤其它表的替换。""" + token_plain = re.sub(r"\s+", "", str(token or "")) + if not token_plain: + return False + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + return bool( + re.search( + r"(?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n" + r"(?:[ \t]*\n)?[ \t]*\n" + f"{hit_md}" + ) + if _table_token_exists(out, token_n): + out = _replace_llm_table_with_authoritative(out, token_n, rep) + return out + + +def _append_authoritative_required_tables(content: str, required_tables: list[str], evidence: dict) -> str: + """ + 为模板必需表追加"要素表直出"块,确保表格数据直接来自结构化要素表。 + 若 LLM 已自行生成了同 token 的表格,用要素管理数据替换之。 + """ + if not required_tables: + return content + + out_content = str(content or "") + used = False + for token in required_tables: + already_authoritative = ( + _caption_followed_by_element_table_comment(out_content, token) + and _table_token_exists(out_content, token) + ) + if already_authoritative: + continue + combined_md = _authoritative_block_for_required_table(token, evidence) + if not combined_md: + continue + if _table_token_exists(out_content, token): + replaced = _replace_llm_table_with_authoritative(out_content, token, combined_md) + out_content = ( + replaced + if replaced != out_content + else _replace_caption_stub_with_authoritative_table( + out_content, token, combined_md + ) + ) + elif _table_token_caption_line_re(token).search(out_content): + out_content = _replace_caption_stub_with_authoritative_table( + out_content, token, combined_md + ) + else: + out_content = out_content.rstrip() + "\n\n" + combined_md + used = True + return out_content.strip() if used else content + + +def _is_effective_markdown_table_block(md_block: str) -> bool: + lines = 
[str(ln or "").strip() for ln in str(md_block or "").splitlines() if str(ln or "").strip()] + if len(lines) < 3: + return False + if _is_pipe_markdown_table_separator_line(lines[0]): + return False + if not _is_pipe_markdown_table_separator_line(lines[1]): + return False + data_rows = [ + ln for ln in lines[2:] + if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln) + ] + return bool(data_rows) + + +def _ensure_required_structured_tables_integrity(content: str, required_tables: list[str], evidence: dict) -> str: + if not required_tables: + return content + repaired = str(content or "") + + for token in required_tables: + authoritative_block = _authoritative_block_for_required_table(token, evidence) + if not authoritative_block: + continue + + if _table_token_caption_line_re(token).search(repaired): + seg = _segment_after_table_caption(repaired, token) + if not _segment_has_markdown_table_body(seg): + repaired = _replace_caption_stub_with_authoritative_table( + repaired, token, authoritative_block + ) + continue + if _table_token_matches_name(token, "表5-4"): + m_seg = re.search( + r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", seg + ) + if m_seg and _is_table54_simplified_extract_body(m_seg.group(0)): + repaired = _replace_caption_stub_with_authoritative_table( + repaired, token, authoritative_block + ) + continue + + token_plain = re.sub(r"\s+", "", str(token or "")) + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + table_pat = re.compile( + r"((?:^|\n)[^\n]*?" 
+ token_re + r"[^\n]*\n(?:\n|[ \t]*[ \t]*\n)*)" + r"((?:[ \t]*\|[^\n]*\|[ \t]*\n)+)", + flags=re.IGNORECASE, + ) + m = table_pat.search(repaired) + if m: + cur_table = str(m.group(2) or "") + need_replace = not _is_effective_markdown_table_block(cur_table) + if _table_token_matches_name(token, "表5-4") and _is_table54_simplified_extract_body( + cur_table + ): + need_replace = True + if need_replace: + repaired = ( + repaired[:m.start()] + + "\n" + + authoritative_block + + "\n\n" + + repaired[m.end():].lstrip("\n") + ) + elif not _table_token_exists(repaired, token): + repaired = repaired.rstrip() + "\n\n" + authoritative_block + + return repaired.strip() + + +def _collect_structured_tables( + db: Session, + project_uuid: str, + required_tables: list[str], + *, + section_title: str, + section_tokens: list[str], +) -> list[dict]: + """ + 报告生成阶段的结构化表来源必须与“要素管理”一致(element_tables/element_cells)。 + + 规则: + - 若模板 prompt 中声明了必需表(如 表2-1/附表8),优先按 token 精准匹配; + - 若未声明必需表,或声明了但匹配不到,则按章节标题/关键词从要素管理中选取最相关的表直出, + 避免模型自行编造表格。 + """ + + def _table_relevance_score(table_name: str) -> int: + name = str(table_name or "").strip() + if not name: + return 0 + name_l = name.lower() + score = 0 + # 章节标题强相关加权 + t = str(section_title or "").strip() + if t and t in name: + score += 10 + # token 命中加分 + for tok in (section_tokens or [])[:20]: + tt = str(tok or "").strip() + if not tt: + continue + if tt.lower() in name_l: + score += 2 + # 常见表名关键字(表/附表/对比/评价)做轻微加权,便于优先输出真正的表 + if any(k in name for k in ("表", "附表", "对比", "评价", "评分")): + score += 1 + return score + + tables: list[ElementTable] = ( + db.query(ElementTable) + .filter(ElementTable.project_id == project_uuid) + .order_by(ElementTable.sort_order.asc(), ElementTable.updated_at.desc()) + .all() + ) + if not tables: + return [] + + # 5.3.2:正文仅需表5-5/表5-6;附表8 归入全书「## 附表」,勿纳入本节结构化证据。 + if _extract_section_number(str(section_title or "")) == "5.3.2": + tables = [ + t + for t in tables + if not ( + ("附表8" in str(t.table_name or "")) + and 
("可研报告和后评价参数对比表" in str(t.table_name or "")) + ) + ] + if not tables: + return [] + + # 1) 必需表:按 token/表名匹配(尽量“直接用要素管理中的表”) + required_norm = [_norm_table_token(t) for t in (required_tables or []) if _norm_table_token(t)] + required_hits: list[ElementTable] = [] + if required_norm: + for t in tables: + name_norm = _norm_table_token(t.table_name) + if not name_norm: + continue + if any(req and _table_token_matches_name(req, name_norm, normalized=True) for req in required_norm): + required_hits.append(t) + if required_hits: + req_hint_words: list[str] = [] + for req in required_norm: + req_hint_words.extend(_TABLE_TOKEN_PREFERRED_NAME_HINTS.get(req, ())) + + def _required_hit_score(tb: ElementTable) -> tuple[int, int, int]: + tb_name = str(tb.table_name or "").strip() + hint_hit = 0 + if req_hint_words: + for hint in req_hint_words: + if hint and hint in tb_name: + hint_hit += 1 + collect = 0 + if any(_table_token_matches_name(req, "表5-4") for req in required_norm): + collect = _element_table_collect_score(db, tb, "表5-4") + return (collect, hint_hit, _table_relevance_score(tb_name)) + + required_hits.sort(key=_required_hit_score, reverse=True) + # 保留靠前若干张,避免同 token 多张历史表导致提示词爆炸 + required_hits = required_hits[:8] + + # 2) 回退:未声明必需表,或声明了但没匹配上时,按相关性挑选 + selected: list[ElementTable] = list(required_hits) + if not selected: + scored: list[tuple[int, ElementTable]] = [] + for t in tables: + s = _table_relevance_score(t.table_name) + if s > 0: + scored.append((s, t)) + scored.sort(key=lambda x: x[0], reverse=True) + selected = [t for _, t in scored[:4]] + + selected_has_time54 = any( + _is_table54_operating_benefit(str(t.table_name or "")) + and str(t.table_type or "").strip() == "time" + for t in selected + ) + + out: list[dict] = [] + for table in selected: + is_time = str(table.table_type or "").strip() == "time" + if is_time: + ty_row = db.query(ElementTable.year).filter(ElementTable.id == table.id).first() + tbl_y = ( + int(ty_row[0]) + if ty_row and ty_row[0] is not 
None and int(ty_row[0]) > 0 + else None + ) + year_items = _build_time_table_markdowns_by_year(db, table.id, table.table_name) + if _is_table54_operating_benefit(table.table_name): + picked = _pick_table54_year_markdown(year_items, table_year=tbl_y) + if picked: + year_items = [picked] + for display_name, md in year_items: + if not md: + continue + token = _extract_table_short_token(display_name) + out.append( + { + "tableId": table.id, + "tableName": display_name, + "token": token, + "markdown": md, + } + ) + if len(out) >= 12: + break + else: + if ( + selected_has_time54 + and _is_table54_operating_benefit(table.table_name) + ): + continue + md, common_unit = _build_structured_table_markdown(db, table.id, table.table_name) + display_name = _merge_table_title_with_common_unit(str(table.table_name or "").strip(), common_unit) + token = _extract_table_short_token(table.table_name) + if not md: + md = _skeleton_markdown_for_table_token( + token or display_name, table_name=display_name + ) + if not md: + continue + hit = { + "tableId": table.id, + "tableName": display_name, + "token": token, + "markdown": md, + } + if _is_table54_operating_benefit(table.table_name) and _is_table54_simplified_extract_body(md): + continue + out.append(hit) + if len(out) >= 12: + break + + t54_norm = _norm_table_token("表5-4") + t54_hits = [h for h in out if _norm_table_token(str(h.get("token") or "")) == t54_norm] + if t54_hits: + rest = [h for h in out if _norm_table_token(str(h.get("token") or "")) != t54_norm] + ranked = _dedupe_structured_table_hits(t54_hits) + out = rest + (ranked[:1] if ranked else []) + return out[:12] + + +def _table_2_5_general_layout_comparison_name(table_name: str) -> bool: + """与要素管理 quick-fill 表2-5 判定一致(表头用「项目名称」,不含依托对比)。""" + n = str(table_name or "") + if "依托" in n: + return False + return "表2-5" in n or "总图、储运、公用工程及辅助工程对比" in n + + +def _table_2_6_reliance_comparison_name(table_name: str) -> bool: + """与要素管理 quick-fill 
表2-6判定一致(行展示去「依托·」等类别前缀、表头用依托项目名称)。""" + n = str(table_name or "") + return ( + "表2-6" in n + or "储运、公用工程及辅助工程依托对比" in n + or "辅助工程依托对比" in n + ) + + +def _table_3_1_contracting_units_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-1判定一致(行展示去「承包单元·」前缀、表头用单元名称)。""" + n = str(table_name or "") + return "表3-1" in n or "项目承包单位情况" in n + + +def _table_3_3_plantwide_design_change_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-3 判定一致(表头用「单元名称」)。""" + n = str(table_name or "") + return ("表3-3" in n or "施工图设计变更情况" in n) and "全厂" in n + + +def _table_3_4_single_unit_design_change_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-4 判定一致(表头用「专业」)。""" + n = str(table_name or "") + return ("表3-4" in n or "施工图设计变更情况" in n) and "单装置" in n + + +def _table_3_5_major_design_change_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-5判定一致(行展示去「重大变更·」前缀、表头用单元名称)。""" + n = str(table_name or "") + return "表3-5" in n or "影响投资或工期" in n + + +def _table_3_7_procurement_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-7判定一致(行展示去「采购物资·」前缀、表头用物资(类别)名称)。""" + n = str(table_name or "") + return "表3-7" in n or "采购工作情况" in n + + +def _table_4_2_alkylation_operation_analysis_name(table_name: str) -> bool: + """烷基化装置运行分析表(含历史误标为表4-1、用户改写考核日期后的表题)。""" + n = re.sub(r"\s+", "", str(table_name or "")) + return ("烷基化装置运行分析" in n) and ("考核时间" in n) + + +_TABLE42_ANALYSIS_TEMPLATE_NAME = "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)" + + +def _multi_column_global_spec_for_table(table_name: str): + """按表名取多列模版;表4-2 槽位改名后仍套用标准列序(单位/设计值/标定值/实际值)。""" + tn = str(table_name or "").strip() + spec = MULTI_COLUMN_GLOBAL_SPECS.get(tn) + if spec: + return spec + if _table_4_2_alkylation_operation_analysis_name(table_name): + return MULTI_COLUMN_GLOBAL_SPECS.get(_TABLE42_ANALYSIS_TEMPLATE_NAME) + return None + + +def _element_manage_row_label_after_first_dot(label: str) -> str: + """与 quick-fill.js parseRowKeyForDisplay 一致:去掉行键第一个「…·」段(仅作展示)。""" + s = str(label or "").strip() + if "\u00b7" in s: + 
rest = "\u00b7".join(s.split("\u00b7", 1)[1:]).strip() + return rest if rest else s + return s + + +def _element_manage_table_row_display_label(table_name: str, label: str) -> str: + """表2-6/表3-1/表3-5/表3-7/表4-2 等与要素管理行名展示对齐(库内 row_key 仍保留类别前缀)。""" + if ( + _table_2_6_reliance_comparison_name(table_name) + or _table_3_1_contracting_units_name(table_name) + or _table_3_5_major_design_change_name(table_name) + or _table_3_7_procurement_name(table_name) + or _table_4_2_alkylation_operation_analysis_name(table_name) + ): + return _element_manage_row_label_after_first_dot(label) + return str(label or "").strip() + + +def _row_header_name_for_table(table_name: str) -> str: + name = str(table_name or "") + if "产品方案对比表" in name: + return "产品" + if "原料数量及组成对比表" in name: + return "原料名称" + if "原料)性质对比表" in name or "原料性质对比表" in name: + return "名称" + if _table_2_5_general_layout_comparison_name(name): + return "项目名称" + if _table_2_6_reliance_comparison_name(name): + return "依托项目名称" + if _table_3_3_plantwide_design_change_name(name): + return "单元名称" + if _table_3_4_single_unit_design_change_name(name): + return "专业" + if _table_3_1_contracting_units_name(name) or _table_3_5_major_design_change_name(name): + return "单元名称" + if _table_3_7_procurement_name(name): + return "物资(类别)名称" + if _table_4_2_alkylation_operation_analysis_name(name): + return "项目名称" + if ( + _table52_investment_change_name(name) + or _table53_engineering_cost_change_name(name) + or _appendix2_investment_structure_name(name) + ): + return "工程或费用名称" + return "项目" + + +_GROUP_HEADER_PREFIXES = [ + "可研报告", "可研", "初步设计", "实际生产", "实际运行", "实际实施", "后评价", +] +# 含完整时点组名,避免「后评价时点后预测值」被前缀「后评价」误拆成两行表头 +_GROUP_HEADER_EXACT = {p for p in _GROUP_HEADER_PREFIXES} | { + "后评价时点前实际值", + "后评价时点后预测值", +} + + +def _group_column_headers(col_order: list[str]) -> tuple[list[str], list[str]] | None: + """识别多级列头。仅当列名形如"可研报告数量(万吨)"时拆分为 top=可研报告 sub=数量(万吨)。 + 独立列名(如"可研报告""初步设计""实际实施")不视为分组,避免误拆。""" + top_headers: list[str] = [] + 
sub_headers: list[str] = [] + has_group = False + for col in col_order: + text = str(col or "").strip() + if not text: + top_headers.append("") + sub_headers.append("") + continue + # 列键笔误「…时点点后…」勿按「后评价」前缀拆分,否则 Markdown 展平成「后评价-时点点后…」。 + if "时点点后" in text: + has_group = True + top_headers.append("") + sub_headers.append(text.replace("时点点后", "时点后", 1)) + continue + # 附表3~7、表5-5:列键为「后评价时点后预测值|2021」等,必须在「后评价」前缀规则之前处理, + # 否则会拆成 top=后评价、sub=时点后预测值|2021,Markdown 单行表头与按 col_key 取值的列错位。 + pipe_sep = "|" if "|" in text else ("\uff5c" if "\uff5c" in text else None) + if pipe_sep is not None: + group, tail = text.split(pipe_sep, 1) + group, tail = group.strip(), tail.strip() + if group in _APPENDIX_TIME_SLOT_GROUPS and tail: + has_group = True + top_headers.append(group) + sub_headers.append(tail) + continue + # 表5-4:「可研报告|××年#1」等不得按「可研报告」前缀拆成「可研报告-|××年#1」 + if group in _TABLE54_PIPE_METRIC_PREFIXES and tail: + has_group = True + top_headers.append("") + sub_headers.append(text) + continue + if text in _GROUP_HEADER_EXACT: + top_headers.append("") + sub_headers.append(text) + continue + matched = False + for prefix in _GROUP_HEADER_PREFIXES: + if text.startswith(prefix) and len(text) > len(prefix): + suffix = text[len(prefix):].strip() + if suffix: + has_group = True + top_headers.append(prefix) + sub_headers.append(suffix) + matched = True + break + if matched: + continue + if "·" in text: + has_group = True + left, right = [part.strip() for part in text.split("·", 1)] + top_headers.append(left) + sub_headers.append(right) + continue + top_headers.append("") + sub_headers.append(text) + return (top_headers, sub_headers) if has_group else None + + +def _table51_main_economic_indicators_name(table_name: str) -> bool: + n = str(table_name or "") + return "表5-1" in n and "主要经济指标对比" in n + + +def _table52_investment_change_name(table_name: str) -> bool: + n = str(table_name or "") + return "表5-2" in n and "投资变动情况表" in n + + +def _table53_engineering_cost_change_name(table_name: 
str) -> bool: + n = str(table_name or "") + return "表5-3" in n and "工程费用变动情况表" in n + + +def _appendix2_investment_structure_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表2" in n and "项目竣工决算投资构成表" in n + + +def _appendix3_cashflow_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表3" in n and "项目投资财务现金流量表" in n + + +def _appendix4_profit_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表4" in n and "利润与利润分配计算表" in n + + +def _appendix5_revenue_tax_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表5" in n and "营业收入与营业税金及附加计算表" in n + + +def _appendix6_cost_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表6" in n and "总成本费用计算表" in n + + +def _appendix7_materials_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表7" in n and "原材料、燃料及动力费用计算表" in n + + +def _appendix8_param_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表8" in n and "可研报告和后评价参数对比表" in n + + +def _appendix_time_table_name(table_name: str) -> bool: + return ( + _appendix3_cashflow_name(table_name) + or _appendix4_profit_name(table_name) + or _appendix5_revenue_tax_name(table_name) + or _appendix6_cost_name(table_name) + or _appendix7_materials_name(table_name) + ) + + +def _table_row_seq_name_split_display(table_name: str) -> bool: + """投资/附表类表:项目列仅展示名称(序号另列,与要素管理一致)。""" + return ( + _table51_main_economic_indicators_name(table_name) + or _table52_investment_change_name(table_name) + or _table53_engineering_cost_change_name(table_name) + or _appendix2_investment_structure_name(table_name) + or _appendix_time_table_name(table_name) + or _appendix8_param_name(table_name) + ) + + +def _pick_row_key_with_legacy( + canon: str, row_set: set[str], legacy_map: dict[str, str] +) -> str | None: + """在库内实际 row_key 中选取规范键或其旧版别名(优先规范键)。""" + if canon in row_set: + return canon + for legacy, normalized in legacy_map.items(): + if normalized == canon and legacy in 
row_set: + return legacy + return None + + +def _order_rows_by_preferred( + row_order: list[str], + preferred: list[str], + *, + legacy_map: dict[str, str] | None = None, +) -> list[str]: + row_set = set(row_order) + ordered: list[str] = [] + seen: set[str] = set() + for canon in preferred: + picked: str | None + if legacy_map: + picked = _pick_row_key_with_legacy(canon, row_set, legacy_map) + else: + picked = canon if canon in row_set else None + if picked and picked not in seen: + seen.add(picked) + ordered.append(picked) + extras = sorted(rk for rk in row_order if rk not in seen) + return ordered + extras + + +def _legacy_map_for_table(table_name: str) -> dict[str, str] | None: + if _appendix2_investment_structure_name(table_name): + return APPENDIX2_LEGACY_ROW_KEY_MAP + if _appendix8_param_name(table_name): + return APPENDIX8_LEGACY_ROW_KEY_MAP + return None + + +def _apply_global_table_standard_row_order(table_name: str, row_order: list[str]) -> list[str]: + """表5-1/5-2/5-3、附表2~8:与要素管理、标准模版一致的标准行序。""" + if not row_order: + return row_order + tn = str(table_name or "") + row_set = set(row_order) + + preferred = canonical_row_order_for_table(tn) + if preferred is not None: + return _order_rows_by_preferred(row_order, preferred, legacy_map=_legacy_map_for_table(tn)) + + if _table53_engineering_cost_change_name(tn): + ordered: list[str] = [] + seen: set[str] = set() + for alts in TABLE_5_3_ROW_KEY_ALTERNATES: + picked: str | None = None + for rk in alts: + if rk in row_set: + picked = rk + break + if picked and picked not in seen: + seen.add(picked) + ordered.append(picked) + extras = sorted(rk for rk in row_order if rk not in seen) + return ordered + extras + + preferred: list[str] | None = None + if ( + _table51_main_economic_indicators_name(tn) + or _table52_investment_change_name(tn) + ): + preferred = global_table_row_keys(tn) + + if not preferred: + return row_order + + ordered = [rk for rk in preferred if rk in row_set] + seen = set(ordered) + extras = 
sorted(rk for rk in row_order if rk not in seen) + return ordered + extras + + +def _normalize_table_row_order(row_order: list[str], *, table_name: str = "") -> list[str]: + row_order = _apply_global_table_standard_row_order(table_name, row_order) + normal_rows: list[str] = [] + total_rows: list[str] = [] + for row in row_order: + text = str(row or "").strip() + compact = re.sub(r"\s+", "", text) + if compact in {"合计", "总计"}: + total_rows.append(row) + else: + normal_rows.append(row) + return normal_rows + total_rows + + +_BARE_FOUR_DIGIT_YEAR_COL = re.compile(r"^\d{4}$") +_APPENDIX_YEAR_TAIL_NORM = re.compile(r"^(\d{4})年?$") + + +def _appendix_norm_year_tail(tail: str) -> str | None: + """将列键尾部「2020」「2020年」规范为四位年字符串;非日历年返回 None。""" + t = str(tail or "").strip() + m = _APPENDIX_YEAR_TAIL_NORM.fullmatch(t) + if not m: + return None + y = m.group(1) + try: + yi = int(y) + except ValueError: + return None + if 1900 <= yi <= 2100: + return y + return None + + +def _filter_redundant_bare_year_columns(col_order: list[str]) -> list[str]: + """去掉与「组|年度」列重复的旧版纯年份列键(如模板历史同步遗留的 2019 / 2020)。""" + if not col_order: + return col_order + cols = [str(c).strip() for c in col_order if str(c or "").strip()] + if not cols: + return list(col_order) + years_from_piped: set[str] = set() + for c in cols: + if "|" not in c: + continue + tail = c.rsplit("|", 1)[-1].strip() + ny = _appendix_norm_year_tail(tail) + if ny: + years_from_piped.add(ny) + for tok in re.split(r"[\s,,、/-]+", tail): + t = tok.strip() + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t) and 1900 <= int(t) <= 2100: + years_from_piped.add(t) + if not years_from_piped: + return list(col_order) + out: list[str] = [] + for c in col_order: + cs = str(c or "").strip() + ny_bare = _appendix_norm_year_tail(cs) + if ny_bare and ny_bare in years_from_piped: + continue + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(cs) and cs in years_from_piped: + continue + out.append(c) + return out if out else list(col_order) + + +_APPENDIX_TIME_SLOT_GROUPS 
= frozenset({"建设期", "后评价时点前实际值", "后评价时点后预测值"}) + + +def _appendix_time_slot_group_tail_is_real(tail: str) -> bool: + """附表时间分组下子列是否为真实年份(YYYY / YYYY年)。""" + if _appendix_norm_year_tail(tail): + return True + t = (tail or "").strip() + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t): + try: + return 1900 <= int(t) <= 2100 + except ValueError: + return False + return False + + +def _bare_appendix_year_placeholder_col_key(s: str) -> bool: + """无竖线列键:末栏「××年」「xx年」等占位列(兼容 x/×/全角拉丁混写)。""" + t = str(s or "").strip() + if not t: + return False + if t in ("…", "..."): + return True + buf: list[str] = [] + for ch in t: + if ch in "xXxX": + buf.append("×") + elif ch == "\u00d7": + buf.append("×") + else: + buf.append(ch) + u = "".join(buf) + return bool(re.fullmatch(r"×{2}年(?:#\d+)?", u)) + + +def _filter_appendix_placeholder_slot_columns(col_order: list[str]) -> list[str]: + """某组下已有真实年份列时,该组内只保留 YYYY / YYYY年 子列,并去掉裸组名列与裸「××年」占位列。""" + if not col_order: + return col_order + groups_with_real_year: set[str] = set() + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + group, tail = cs.split("|", 1) + group, tail = group.strip(), tail.strip() + if group not in _APPENDIX_TIME_SLOT_GROUPS: + continue + if _appendix_time_slot_group_tail_is_real(tail): + groups_with_real_year.add(group) + if not groups_with_real_year: + return list(col_order) + out: list[str] = [] + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + if cs in groups_with_real_year: + continue + if _bare_appendix_year_placeholder_col_key(cs): + continue + out.append(c) + continue + group, tail = cs.split("|", 1) + group, tail = group.strip(), tail.strip() + if group in groups_with_real_year: + if _appendix_time_slot_group_tail_is_real(tail): + out.append(c) + continue + out.append(c) + return out if out else list(col_order) + + +def _filter_appendix3_summary_duplicate_forecast_years(table_name: str, col_order: list[str]) -> list[str]: + 
"""附表3:「建设期」「时点前」下与「后评价时点后预测值」同年栏重复时去掉,避免表尾多出 2019/2020 等重复列。""" + tn = str(table_name or "").strip() + if "附表3" not in tn or "项目投资财务现金流量" not in tn: + return col_order + forecast_g = "后评价时点后预测值" + summary_gs = frozenset({"建设期", "后评价时点前实际值"}) + fy: set[str] = set() + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + g, tail = cs.split("|", 1) + if g.strip() != forecast_g: + continue + ny = _appendix_norm_year_tail(tail.strip()) + if ny: + fy.add(ny) + if not fy: + return col_order + drop: set[str] = set() + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + g, tail = cs.split("|", 1) + g, tail = g.strip(), tail.strip() + if g not in summary_gs: + continue + ny = _appendix_norm_year_tail(tail) + if ny and ny in fy: + drop.add(cs) + if not drop: + return col_order + out = [c for c in col_order if str(c).strip() not in drop] + return out if out else list(col_order) + + +def _filter_appendix3_placeholders_when_forecast_has_real_year(table_name: str, col_order: list[str]) -> list[str]: + """附表3:后评价时点后预测值已有 YYYY 列时,三组内所有「××年#n」占位列均剔除(含建设期/时点前仅余占位的情况)。""" + tn = str(table_name or "").strip() + if "附表3" not in tn or "项目投资财务现金流量" not in tn: + return list(col_order) + forecast_g = "后评价时点后预测值" + has_forecast_real = False + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + g, tail = cs.split("|", 1) + if g.strip() != forecast_g: + continue + if _appendix_norm_year_tail(tail.strip()): + has_forecast_real = True + break + if not has_forecast_real: + return list(col_order) + out: list[str] = [] + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + out.append(c) + continue + g, tail = cs.split("|", 1) + g, tail = g.strip(), tail.strip() + if g in _APPENDIX_TIME_SLOT_GROUPS and not _appendix_time_slot_group_tail_is_real(tail): + continue + out.append(c) + return out if out else list(col_order) + + +def _filter_appendix5_orphan_price_unit_column(table_name: str, col_order: 
list[str]) -> list[str]: + """附表5:去掉与「价格(元/t)」重复的独立列键「(元/t)」(多为表头拆行误入数据列)。""" + tn = str(table_name or "") + if "附表5" not in tn or "营业收入与营业税金" not in tn: + return col_order + if not any("价格" in str(c) and "元/t" in str(c) for c in col_order): + return col_order + orphans = {"(元/t)", "(元/t)"} + out = [c for c in col_order if str(c).strip() not in orphans] + return out if out else list(col_order) + + +# 表5-5:仅按表号匹配(不要求表名含「主要生产经营指标」,避免要素表标题变体导致过滤未生效) +_TABLE_55_TITLE_RX = re.compile(r"表\s*5\s*[--\..·]\s*5") +_TABLE_55_FORECAST_GROUP = "后评价时点后预测值" +_TABLE_55_FORECAST_HYPHEN_YEAR = re.compile( + rf"^{re.escape(_TABLE_55_FORECAST_GROUP)}\s*[--—–]\s*(\d{{4}})(?:年)?$" +) + + +def _compact_zh_ident(s: str) -> str: + return re.sub(r"\s+", "", unicodedata.normalize("NFKC", str(s or ""))) + + +def _split_group_year_col_key(col: str) -> tuple[str, str] | None: + """解析「组|子列」;支持半角/全角竖线。""" + st = str(col or "").strip() + if not st: + return None + for sep in ("|", "\uff5c"): # U+FF5C 全角竖线 + if sep in st: + a, b = st.split(sep, 1) + return a.strip(), b.strip() + return None + + +def _table55_has_forecast_year_slot_columns(col_order: list[str]) -> bool: + """是否存在「后评价时点后预测值」下的分年列(|、| 或 后缀 -YYYY)。""" + for c in col_order: + parts = _split_group_year_col_key(str(c or "")) + if parts: + g, tail = parts + if g != _TABLE_55_FORECAST_GROUP: + continue + ts = tail.strip() + if _appendix_norm_year_tail(ts): + return True + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(ts): + try: + if 1900 <= int(ts) <= 2100: + return True + except ValueError: + pass + continue + st = str(c or "").strip() + m = _TABLE_55_FORECAST_HYPHEN_YEAR.match(st) + if m: + try: + if 1900 <= int(m.group(1)) <= 2100: + return True + except ValueError: + pass + return False + + +def _table55_col_should_drop(col: str, *, has_forecast_year_slots: bool) -> bool: + c0 = _compact_zh_ident(col) + if "时点点后" in c0: + return True + if has_forecast_year_slots and c0 == _compact_zh_ident(_TABLE_55_FORECAST_GROUP): + return True + return 
False + + +def _filter_table55_redundant_malformed_forecast_column(table_name: str, col_order: list[str]) -> list[str]: + """表5-5:去掉笔误列「…时点点后…」及在有分年预测列时多余的裸「后评价时点后预测值」列。""" + if not col_order: + return col_order + if not _TABLE_55_TITLE_RX.search(str(table_name or "")): + return list(col_order) + has_slots = _table55_has_forecast_year_slot_columns(col_order) + out = [c for c in col_order if not _table55_col_should_drop(str(c), has_forecast_year_slots=has_slots)] + return out if out else list(col_order) + + +_APPENDIX_TIME_GROUP_YEAR_HYPHEN_RE = re.compile( + r"^(建设期|后评价时点前实际值|后评价时点后预测值)\s*[--—–]\s*(.+)$" +) + + +def _appendix_time_col_group_and_tail(col: str) -> tuple[str | None, str | None]: + """解析附表时间列键为 (组名, 子列);支持「组|年」「组-年」及裸组名列。""" + st = str(col or "").strip() + if not st: + return None, None + parts = _split_group_year_col_key(st) + if parts: + return parts[0], parts[1] + m = _APPENDIX_TIME_GROUP_YEAR_HYPHEN_RE.match(st) + if m: + return m.group(1).strip(), m.group(2).strip() + if st in _APPENDIX_TIME_SLOT_GROUPS or st == "价格(元/t)": + return st, "" + return None, None + + +def _appendix_time_tail_sort_key(tail: str | None) -> tuple[int, int, str]: + """组内子列排序:裸组名 < 分年列(年份升序) < 占位列 < 其它。""" + t = str(tail or "").strip() + if not t: + return (0, -1, "") + ny = _appendix_norm_year_tail(t) + if ny: + return (1, int(ny), "") + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t): + try: + yi = int(t) + if 1900 <= yi <= 2100: + return (1, yi, "") + except ValueError: + pass + buf: list[str] = [] + for ch in t: + if ch in "xXxX": + buf.append("×") + elif ch == "\u00d7": + buf.append("×") + else: + buf.append(ch) + pm = re.fullmatch(r"×{2}年#(\d+)", "".join(buf)) + if pm: + return (2, int(pm.group(1)), "") + return (3, 0, t) + + +def _reorder_appendix_time_col_order(table_name: str, col_order: list[str]) -> list[str]: + """附表3~7:按细则组序排列列,组内年份从小到大。""" + if not col_order or not _appendix_time_table_name(table_name): + return list(col_order) + spec_groups = 
time_table_default_columns_for_name(table_name) or [] + group_rank: dict[str, int] = {g: i for i, g in enumerate(spec_groups)} + by_group: dict[str, list[str]] = {} + ungrouped: list[str] = [] + for col in col_order: + cs = str(col or "").strip() + if not cs: + continue + g, _ = _appendix_time_col_group_and_tail(cs) + if g == "价格(元/t)" or g in _APPENDIX_TIME_SLOT_GROUPS: + by_group.setdefault(g, []).append(cs) + if g not in group_rank: + group_rank[g] = len(group_rank) + 100 + else: + ungrouped.append(cs) + + def _sort_group_cols(cols: list[str]) -> list[str]: + return sorted( + cols, + key=lambda c: _appendix_time_tail_sort_key(_appendix_time_col_group_and_tail(c)[1]), + ) + + ordered_groups = list(spec_groups) + for g in sorted(by_group.keys(), key=lambda x: group_rank.get(x, 999)): + if g not in ordered_groups: + ordered_groups.append(g) + out: list[str] = [] + seen: set[str] = set() + for g in ordered_groups: + cols = by_group.get(g) + if not cols: + continue + for c in _sort_group_cols(cols): + if c not in seen: + out.append(c) + seen.add(c) + for c in ungrouped: + if c not in seen: + out.append(c) + seen.add(c) + return out if out else list(col_order) + + +def _build_structured_table_html( + table_name: str, + row_order: list[str], + col_order: list[str], + latest: dict[tuple[str, str], str], +) -> str: + row_order = _normalize_table_row_order(row_order, table_name=table_name) + row_header = _row_header_name_for_table(table_name) + grouped = _group_column_headers(col_order) + + lines: list[str] = ["", " "] + if grouped: + top_headers, sub_headers = grouped + lines.append(" ") + lines.append(' ') + lines.append(f' ') + idx = 0 + while idx < len(top_headers): + group = top_headers[idx] + if not group: + lines.append(f' ') + idx += 1 + continue + span = 1 + while idx + span < len(top_headers) and top_headers[idx + span] == group: + span += 1 + lines.append(f' ') + idx += span + lines.append(" ") + lines.append(" ") + for top, sub in zip(top_headers, 
sub_headers): + if top: + lines.append(f" ") + lines.append(" ") + else: + lines.append(" ") + lines.append(" ") + lines.append(f" ") + for col in col_order: + lines.append(f" ") + lines.append(" ") + lines.append(" ") + lines.append(" ") + max_rows = min(120, len(row_order)) if _is_table54_operating_benefit(table_name) else min(24, len(row_order)) + serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows) + for idx, rk in enumerate(row_order[:max_rows], start=1): + display_rk = _project_column_row_label( + table_name, rk, latest, serial_col=serial_col + ) + serial_cell = _serial_cell_for_report_table( + table_name, rk, idx, serial_col, serial_idx=idx - 1 + ) + lines.append(" ") + lines.append(f" ") + lines.append(f" ") + for ck in col_order: + val = latest.get((rk, ck), "待补充") or "待补充" + lines.append(f" ") + lines.append(" ") + lines.append(" ") + lines.append("
序号{row_header}{sub_headers[idx]}{group}
{sub}
序号{row_header}{col}
{serial_cell}{display_rk}{val}
") + return "\n".join(lines) + + +_RE_MD_HEADER_NAME_UNIT = re.compile(r"^(.+?)\s*([((][^))]+[))])$") + + +def _strip_md_bold_markup(text: str) -> str: + """去掉 Markdown 加粗标记 **,保留其余内容。""" + s = str(text or "") + while True: + new = re.sub(r"\*\*([^*]+?)\*\*", r"\1", s) + if new == s: + break + s = new + return s + + +def _markdown_table_header_cell_display(col_label: str, *, plain: bool = False) -> str: + """表头栏 Markdown:量纲写在名称下方,单位加括号(同一单元格内用
换行);不加粗。""" + del plain # 保留参数以兼容旧调用;表头一律不加 ** 包裹 + s = _strip_md_bold_markup(str(col_label or "").strip()).replace("|", "|") + if not s: + return "" + if re.search(r"", s, re.I): + parts = [ + _strip_md_bold_markup(p).strip() + for p in re.split(r"", s, flags=re.I) + ] + out_parts = [p for p in parts if p] + return "
".join(out_parts) if out_parts else s + m = _RE_MD_HEADER_NAME_UNIT.match(s) + if m: + name, unit = m.group(1).strip(), m.group(2).strip() + if name: + return f"{name}
{unit}" if unit else name + return s + + +def _common_trailing_parenthetical_unit_from_flat_labels( + flat_cols: list[str], +) -> tuple[str | None, list[str]]: + """当合并后的列表头列名末尾「(单位)」在各列一致时,返回该单位及去掉单位后的表头文案。""" + stripped: list[str] = [] + units: list[str | None] = [] + for lab in flat_cols: + s = str(lab or "").strip() + m = _RE_MD_HEADER_NAME_UNIT.match(s) + if m: + stripped.append(m.group(1).strip()) + units.append(m.group(2).strip()) + else: + stripped.append(s) + units.append(None) + present = [u for u in units if u] + if not present: + return None, list(flat_cols) + u0 = present[0] + if any(units[i] is not None and units[i] != u0 for i in range(len(units))): + return None, list(flat_cols) + return u0, stripped + + +# 表号与表名之间空两格:采用两个全角空格(与公文「空两格」习惯一致) +_TABLE_CAPTION_NUMBER_NAME_GAP = "\u3000\u3000" +_TABLE52_INVESTMENT_CHANGE_CAPTION = ( + f"表5-2{_TABLE_CAPTION_NUMBER_NAME_GAP}投资变动情况表(单位:万元、万美元)" +) +_RE_TABLE_CAPTION_LEADING_TOKEN = re.compile( + r"^(附表\s*\d+(?:\s*[.\--.]\s*\d+)*|表\s*\d+(?:\s*[.\--.]\s*\d+)*)\s*(.*)$", + re.DOTALL, +) + + +def _fix_521_table52_wrong_caption(content: str) -> str: + """5.2.1 若表题误用「表5-2 同类烷基化…」等,改回标准投资变动情况表表题。""" + text = str(content or "") + if not text.strip(): + return text + caption_re = re.compile( + r"^(\s*(?:#{1,6}\s+)?)(表\s*5\s*[--.]\s*2\s*(.*))$", + re.IGNORECASE, + ) + out: list[str] = [] + for line in text.split("\n"): + m = caption_re.match(line) + if m: + tail = (m.group(3) or "").strip() + if "投资变动情况表" not in tail: + out.append(f"{m.group(1)}{_TABLE52_INVESTMENT_CHANGE_CAPTION}") + continue + out.append(line) + return "\n".join(out) + + +def _normalize_table_caption_number_name_gap(title: str) -> str: + """将「表2-4xxx」「表 2 - 4 xxx」规范为「表2-4」+ 两全角空格 + 表名。""" + s = str(title or "").strip() + if not s: + return s + m = _RE_TABLE_CAPTION_LEADING_TOKEN.match(s) + if not m: + return s + token_compact = re.sub(r"\s+", "", (m.group(1) or "").strip()) + rest = (m.group(2) or "").strip() + if not rest: + return token_compact + 
return f"{token_compact}{_TABLE_CAPTION_NUMBER_NAME_GAP}{rest}" + + +def _rewrite_table_caption_line_for_number_name_gap(line: str) -> str: + """修正独立表题行(非表格管道行)中表号与表名间距。""" + if "|" in line or not line.strip(): + return line + if line.strip().startswith("```"): + return line + m = re.match(r"^(\s*)(.*)$", line) + if not m: + return line + indent, rest = m.group(1), m.group(2) + h = "" + m2 = re.match(r"^(#{1,6}\s+)(.*)$", rest) + if m2: + h, rest = m2.group(1), m2.group(2) + rest_st = rest.strip() + if not rest_st: + return line + if rest_st.startswith("**") and rest_st.endswith("**") and len(rest_st) >= 4: + inner = _strip_md_bold_markup(rest_st[2:-2]).strip() + n = _normalize_table_caption_number_name_gap(inner) + return f"{indent}{h}{n}" + n2 = _normalize_table_caption_number_name_gap(_strip_md_bold_markup(rest_st)) + if n2 != rest_st: + return f"{indent}{h}{n2}" + return line + + +def _debold_md_table_row(line: str) -> str: + if "|" not in line: + return line + return "|".join(_strip_md_bold_markup(part) for part in line.split("|")) + + +def _debold_markdown_table_blocks_in_content(content: str) -> str: + """去掉 Markdown 管道表表头行中的 ** 加粗(含双行表头)。""" + lines = str(content or "").split("\n") + if not lines: + return str(content or "") + out: list[str] = [] + i = 0 + while i < len(lines): + ln = lines[i] + if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln): + header_rows: list[str] = [] + j = i + while j < len(lines) and _is_pipe_markdown_table_row_line(lines[j]) and not _is_pipe_markdown_table_separator_line(lines[j]): + header_rows.append(lines[j]) + j += 1 + if j < len(lines) and _is_pipe_markdown_table_separator_line(lines[j]): + out.extend(_debold_md_table_row(hr) for hr in header_rows) + out.append(lines[j]) + j += 1 + while j < len(lines) and ( + _is_pipe_markdown_table_row_line(lines[j]) + or _is_pipe_markdown_table_separator_line(lines[j]) + ): + out.append(lines[j]) + j += 1 + i = j + continue + out.extend(header_rows) 
+ i += len(header_rows) + continue + out.append(ln) + i += 1 + return "\n".join(out) + + +def _normalize_table_captions_in_markdown(content: str) -> str: + if not str(content or "").strip(): + return str(content or "") + lines = str(content).split("\n") + text = "\n".join(_rewrite_table_caption_line_for_number_name_gap(ln) for ln in lines) + return _debold_markdown_table_blocks_in_content(text) + + +def _merge_table_title_with_common_unit(base_title: str, unit: str | None) -> str: + """表题末尾追加各列相同的公共单位(括号形式);表题已含该单位则不重复。""" + b = str(base_title or "").strip() + if not unit or not str(unit).strip(): + return _normalize_table_caption_number_name_gap(b) + u = str(unit).strip() + bc = re.sub(r"\s+", "", b) + uc = re.sub(r"\s+", "", u) + if uc and bc.endswith(uc): + return _normalize_table_caption_number_name_gap(b) + return _normalize_table_caption_number_name_gap(f"{b} {u}") + + +def _render_table_7_1_markdown( + row_order: list[str], + col_order: list[str], + latest: dict[tuple[str, str], str], +) -> str: + """表7-1 行键为「指标·要素」或「综合得分」;输出合同要求的「指标」「要素」分列。""" + def esc(v: str) -> str: + return str(v or "").replace("|", "|") + + data_cols: list[str] = [] + for c in TABLE_7_1_COLUMN_KEYS: + if c in col_order: + data_cols.append(c) + for c in col_order: + if c not in data_cols: + data_cols.append(c) + + preferred = [rk for rk, _ in TABLE_7_1_ROW_CELL_DEFAULTS] + preferred_set = set(preferred) + ordered_rows = [rk for rk in preferred if rk in row_order] + for rk in row_order: + if rk not in preferred_set: + ordered_rows.append(rk) + + header = "| " + " | ".join( + [_markdown_table_header_cell_display("指标"), _markdown_table_header_cell_display("要素")] + + [_markdown_table_header_cell_display(c) for c in data_cols] + ) + " |" + sep = "| " + " | ".join(["---"] * (2 + len(data_cols))) + " |" + lines = [header, sep] + for rk in ordered_rows: + rk_s = str(rk or "").strip() + if rk_s == "综合得分": + ind, elem = "综合得分", "" + elif "·" in rk_s: + left, right = rk_s.split("·", 1) + ind, elem 
= left.strip(), right.strip() + else: + ind, elem = rk_s, "" + vals: list[str] = [] + for ck in data_cols: + raw = str(latest.get((rk_s, ck), "") or "").strip() + vals.append(esc(raw if raw else "待补充")) + lines.append("| " + " | ".join([esc(ind), esc(elem)] + vals) + " |") + return "\n".join(lines) + "\n" + + +def _build_structured_table_markdown(db: Session, table_id: str, table_name: str = "") -> tuple[str, str | None]: + cells = ( + db.query(ElementCell) + .filter( + ElementCell.table_id == table_id, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .order_by(ElementCell.updated_at.desc()) + .all() + ) + if not cells: + return "", None + latest: dict[tuple[str, str], str] = {} + row_order: list[str] = [] + col_order: list[str] = [] + for cell in cells: + row_key = str(cell.row_key or "").strip() + col_key = str(cell.col_key or "内容").strip() or "内容" + if not row_key: + continue + key = (row_key, col_key) + if key not in latest: + latest[key] = str(cell.value or "").strip() + if row_key not in row_order: + row_order.append(row_key) + if col_key not in col_order: + col_order.append(col_key) + if not row_order: + return "", None + row_order = _normalize_table_row_order(row_order, table_name=table_name) + spec = _multi_column_global_spec_for_table(table_name) + if spec: + spec_cols = [str(col).strip() for col in (spec[0] or []) if str(col).strip()] + ordered = [col for col in spec_cols if col in col_order] + extras = [col for col in col_order if col not in ordered] + col_order = ordered + extras + col_order = _filter_redundant_bare_year_columns(col_order) + col_order = _filter_appendix_placeholder_slot_columns(col_order) + col_order = _filter_appendix3_summary_duplicate_forecast_years(table_name, col_order) + col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(table_name, col_order) + col_order = _filter_appendix5_orphan_price_unit_column(table_name, col_order) + col_order = 
_filter_table55_redundant_malformed_forecast_column(table_name, col_order) + col_order = _reorder_appendix_time_col_order(table_name, col_order) + inferred_ty: int | None = None + if _is_table54_operating_benefit(str(table_name or "").strip()): + ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first() + tbl_y = int(ty_row[0]) if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 else None + inferred_ty = _infer_time_column_year_for_table54(col_order, cells, tbl_y) + if str(table_name or "").strip() == TABLE_7_1_SCORING_TABLE_NAME: + return _render_table_7_1_markdown(row_order, col_order, latest), None + return _render_markdown_table( + table_name, row_order, col_order, latest, time_column_year=inferred_ty + ) + + +_PLACEHOLDER_ROW_PREFIX = re.compile(r"^(产品名称|原料名称|项目名称|名称|产品|项目)\s*[·.]\s*") + + +def _display_row_key(table_name: str, rk: str, latest: dict[tuple[str, str], str]) -> str: + """将模板占位行名替换为真实名称(优先使用该行单元格中的项目/产品名称)。""" + text = str(rk or "").strip() + m = _PLACEHOLDER_ROW_PREFIX.match(text) + if not m: + return text + + preferred_cols: list[str] = [] + tn = str(table_name or "") + if "表2-4" in tn or "产品流向" in tn: + preferred_cols.extend(["项目名称", "产品名称", "规格"]) + preferred_cols.extend(["项目名称", "产品名称", "名称", "规格"]) + + for col in preferred_cols: + v = str(latest.get((rk, col), "") or "").strip() + if v and v != "待补充": + return v + + suffix = text[m.end():].strip() + return f"产品{suffix}" if suffix else text + + +# 与正文层次编号一致:row_key 形如「1.1 建设投资」「1.2.3 工艺」「3原料」 +_ROWKEY_OUTLINE_PREFIX = re.compile( + r"^\s*(\d+(?:\.\d+)*)(?:\s*[、..]?\s+(?=\S)|(?=[\u4e00-\u9fffA-Za-z((]))" +) +_CN_OUTLINE_ROWKEY_PREFIX = re.compile(r"^\s*([一二三四五六七八九十百千]+)\s+(.+)$") +_APPENDIX5_PRODUCT_TRIPLE = re.compile( + r"^(\d+(?:\.\d+)*)\s+(.+?)·(销量|营业收入|销项税)$" +) +_APPENDIX7_DETAIL_ROW = re.compile( + r"^(\d+(?:\.\d+)*)\s+([^·]+?)(?:·(单价|数量|进项税额|……))?$" +) + +_TABLE_53_LEGACY_ROW_DISPLAY: dict[str, str] = { + "工程费用变动·批准单位": "批准单位", + "工程费用变动·批准文号": "批准文号", + 
"工程费用变动·工程费用合计": "工程费用合计", + "工程费用变动·工艺生产装置": "1 工艺生产装置", + "工程费用变动·装置·设备购置费": "1.1.1 设备购置费", + "工程费用变动·装置·安装工程费": "1.1.2 安装工程费", + "工程费用变动·装置·建筑工程费": "1.1.3 建筑工程费", + "工程费用变动·总图运输": "2 总图运输", + "工程费用变动·储运工程": "3 储运工程", + "工程费用变动·其它分项(可增删)": "其它分项(可增删)", +} + + +def _strip_table_prefix_from_row_key(rk: str) -> str: + s = str(rk or "").strip() + if "\u00b7" in s: + return "\u00b7".join(s.split("\u00b7")[1:]).strip() + return s + + +def _parse_row_key_seq_and_name(rk: str, *, table_name: str = "") -> tuple[str, str]: + """与 quick-fill.js ``parseRowKeyForDisplay`` 一致。""" + s = str(rk or "").strip() + if not s: + return "", "" + legacy = _legacy_map_for_table(table_name) + if legacy: + s = legacy.get(s, s) + if _table53_engineering_cost_change_name(table_name): + s = _TABLE_53_LEGACY_ROW_DISPLAY.get(s, s) + if _appendix_time_table_name(table_name) or _appendix8_param_name(table_name): + s = _strip_table_prefix_from_row_key(s) + elif "\u00b7" in s: + s = "\u00b7".join(s.split("\u00b7")[1:]).strip() + m = _ROWKEY_OUTLINE_PREFIX.match(s) + if m: + rest = s[m.end():].strip() + return m.group(1), rest if rest else s + m_cn = _CN_OUTLINE_ROWKEY_PREFIX.match(s) + if m_cn: + return m_cn.group(1), m_cn.group(2).strip() + return "", s + + +def _row_display_name_for_table(table_name: str, rk: str) -> str: + """项目/工程名称列展示文案(去表内前缀与层次编号,附表5/7 明细行单独处理)。""" + s0 = str(rk or "").strip() + if not s0: + return "" + if _appendix5_revenue_tax_name(table_name): + s = _strip_table_prefix_from_row_key(s0) + m = _APPENDIX5_PRODUCT_TRIPLE.match(s) + if m: + return m.group(3) + if _appendix7_materials_name(table_name): + s = _strip_table_prefix_from_row_key(s0) + m = _APPENDIX7_DETAIL_ROW.match(s) + if m and m.group(3): + return m.group(3) + if m: + return m.group(2).strip() + if _table_row_seq_name_split_display(table_name): + _, name = _parse_row_key_seq_and_name(s0, table_name=table_name) + return name or s0 + return s0 + + +def _serial_cell_for_report_table( + table_name: str, + rk: str, + 
idx: int, + serial_col: list[str] | None, + *, + serial_idx: int, +) -> str: + """表5-2/5-3、附表2~8 用连续 1..n;表5-1 用层次编号;其余表沿用原逻辑。""" + if _table_row_seq_name_split_display(table_name) and not _table51_main_economic_indicators_name( + table_name + ): + return str(idx) + if _table51_main_economic_indicators_name(table_name): + seq, _ = _parse_row_key_seq_and_name(rk, table_name=table_name) + return seq if seq else str(idx) + if serial_col is not None: + return serial_col[serial_idx] + return str(idx) + + +def _project_column_row_label( + table_name: str, + rk: str, + latest: dict[tuple[str, str], str], + *, + serial_col: list[str] | None, +) -> str: + if _table_row_seq_name_split_display(table_name): + label = _row_display_name_for_table(table_name, rk) + elif serial_col is not None: + label = _strip_row_key_leading_outline_for_display(rk) + if not str(label or "").strip(): + label = rk + else: + label = rk + return _element_manage_table_row_display_label( + table_name, _display_row_key(table_name, label, latest) + ) + + +def _outline_serial_from_row_key(rk: str) -> str | None: + """若 row_key 以阿拉伯数字层次编号开头,返回该编号字符串。""" + rk_s = str(rk or "").strip() + if not rk_s: + return None + compact = re.sub(r"\s+", "", rk_s) + if compact in ("合计", "总计"): + return None + m = _ROWKEY_OUTLINE_PREFIX.match(rk_s) + if not m: + return None + num = m.group(1) + if re.fullmatch(r"\d{4}", num): + try: + yi = int(num) + except ValueError: + return None + if 1900 <= yi <= 2100: + return None + return num + + +def _table_row_outline_serial_column(row_order: list[str], *, max_rows: int) -> list[str] | None: + """当每一数据行(合计/总计除外)的 row_key 均带层次编号时,序号列采用该编号。""" + rows = row_order[:max_rows] + if not rows: + return None + serials: list[str] = [] + for rk in rows: + compact = re.sub(r"\s+", "", str(rk or "")) + if compact in ("合计", "总计") or str(rk or "").strip() in ("合计", "总计"): + serials.append("—") + continue + s = _outline_serial_from_row_key(str(rk) or "") + if s is None: + return None + 
serials.append(s) + return serials + + +def _strip_row_key_leading_outline_for_display(rk: str) -> str: + """去掉 row_key 首部层次编号,避免第二列与序号列重复。""" + rk_s = str(rk or "").strip() + m = _ROWKEY_OUTLINE_PREFIX.match(rk_s) + if not m: + return rk_s + rest = rk_s[m.end():].strip() + return rest if rest else rk_s + + +def _render_markdown_table( + table_name: str, + row_order: list[str], + col_order: list[str], + latest: dict[tuple[str, str], str], + *, + time_column_year: int | None = None, +) -> tuple[str, str | None]: + col_order = list(col_order) + tn = str(table_name or "").strip() + table54 = _is_table54_operating_benefit(tn) + if table54: + _table54_remap_indicator_unit_latest(latest) + _table54_rekey_latest_col_keys(latest) + col_order = _reorder_table54_col_order(col_order) + grouped = _group_column_headers(col_order) + if grouped: + top_headers, sub_headers = grouped + flat_cols: list[str] = [] + for top, sub in zip(top_headers, sub_headers): + if top and sub: + flat_cols.append(f"{top}-{sub}") + elif top: + flat_cols.append(top) + else: + flat_cols.append(sub) + else: + flat_cols = list(col_order) + + if table54: + flat_cols = list(col_order) + flat_header_labels = _table54_markdown_header_labels( + col_order, time_column_year=time_column_year + ) + common_unit = None + else: + common_unit, flat_header_labels = _common_trailing_parenthetical_unit_from_flat_labels(flat_cols) + if common_unit is None: + flat_header_labels = flat_cols + + def _esc_pipe(v: str) -> str: + return str(v or "").replace("|", "|") + + row_header = _row_header_name_for_table(table_name) + if table54: + row_header = "项目" + + header = ( + "| " + + " | ".join( + [ + _markdown_table_header_cell_display("序号"), + _markdown_table_header_cell_display(row_header), + ] + + [_markdown_table_header_cell_display(c) for c in flat_header_labels] + ) + + " |" + ) + split = "| --- | --- | " + " | ".join(["---"] * len(flat_cols)) + " |" + lines = [header, split] + max_rows = min(120, len(row_order)) if 
table54 else min(24, len(row_order)) + serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows) + for idx, rk in enumerate(row_order[:max_rows], start=1): + vals = [_esc_pipe(latest.get((rk, ck), "待补充") or "待补充") for ck in col_order] + display_rk = _project_column_row_label( + table_name, rk, latest, serial_col=serial_col + ) + serial_cell = _serial_cell_for_report_table( + table_name, rk, idx, serial_col, serial_idx=idx - 1 + ) + lines.append("| " + serial_cell + " | " + _esc_pipe(display_rk) + " | " + " | ".join(vals) + " |") + return "\n".join(lines) + "\n", common_unit + + +def _build_time_table_markdowns_by_year( + db: Session, table_id: str, table_name: str = "", +) -> list[tuple[str, str]]: + """为时间要素表按 year 拆分,返回 [(display_table_name, markdown), ...] 列表。 + + 时间表的 ElementCell 通过 year 字段区分不同年份的数据;前端用 col_key + "|" + year + 渲染多级表头。本函数按年份分别聚合 cell,为每个年份生成独立的 Markdown 表格, + 表名中的「××年」替换为实际年份。 + """ + cells = ( + db.query(ElementCell) + .filter( + ElementCell.table_id == table_id, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .order_by(ElementCell.updated_at.desc()) + .all() + ) + if not cells: + return [] + + from collections import defaultdict + year_cells: dict[int | None, list[ElementCell]] = defaultdict(list) + for cell in cells: + year_cells[cell.year].append(cell) + + ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first() + tbl_y = ( + int(ty_row[0]) + if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 + else None + ) + base_name = str(table_name or "").strip() + if _is_table54_operating_benefit(base_name): + year_cells, real_years = _table54_merge_year_cells_for_table_year( + year_cells, table_year=tbl_y + ) + else: + real_years = sorted(y for y in year_cells if y is not None) + if not real_years: + md, common_unit = _build_structured_table_markdown(db, table_id, table_name) + if not md: + return [] + disp = _merge_table_title_with_common_unit(str(table_name or "").strip(), 
common_unit) + return [(disp, md)] + + results: list[tuple[str, str]] = [] + for year in real_years: + year_cell_list = year_cells[year] + latest: dict[tuple[str, str], str] = {} + row_order: list[str] = [] + col_order: list[str] = [] + for cell in year_cell_list: + row_key = str(cell.row_key or "").strip() + col_key = str(cell.col_key or "内容").strip() or "内容" + if not row_key: + continue + key = (row_key, col_key) + if key not in latest: + latest[key] = str(cell.value or "").strip() + if row_key not in row_order: + row_order.append(row_key) + if col_key not in col_order: + col_order.append(col_key) + if not row_order: + continue + if _is_table54_operating_benefit(base_name): + _table54_coalesce_legacy_bare_metric_cols(latest, row_order) + row_order = _normalize_table_row_order(row_order, table_name=base_name) + time_spec_cols = time_table_default_columns_for_name(base_name) + if time_spec_cols and _is_table54_operating_benefit(base_name): + col_order = ["单位"] + [c for c in time_spec_cols if c != "单位"] + elif time_spec_cols: + ordered = [col for col in time_spec_cols if col in col_order] + extras = [col for col in col_order if col not in ordered] + col_order = ordered + extras + col_order = _filter_redundant_bare_year_columns(col_order) + col_order = _filter_appendix_placeholder_slot_columns(col_order) + col_order = _filter_appendix3_summary_duplicate_forecast_years(base_name, col_order) + col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(base_name, col_order) + col_order = _filter_appendix5_orphan_price_unit_column(base_name, col_order) + col_order = _filter_table55_redundant_malformed_forecast_column(base_name, col_order) + col_order = _reorder_appendix_time_col_order(base_name, col_order) + display_name = re.sub(r"××年", f"{year}年", base_name) + md, common_unit = _render_markdown_table( + display_name, row_order, col_order, latest, time_column_year=year + ) + if md: + results.append((_merge_table_title_with_common_unit(display_name, 
common_unit), md)) + + if not results and None in year_cells: + md, common_unit = _build_structured_table_markdown(db, table_id, table_name) + if md: + results.append((_merge_table_title_with_common_unit(str(table_name or "").strip(), common_unit), md)) + return results + + +def _extract_table_short_token(table_name: str) -> str: + text = str(table_name or "") + m = re.search(r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*|表\s*\d+(?:\s*[.\--]\s*\d+)*)", text) + return re.sub(r"\s+", "", m.group(1)) if m else "" + + +def _norm_table_token(token: str) -> str: + text = re.sub(r"\s+", "", str(token or "")).lower() + return text.replace("-", "-").replace("—", "-").replace("–", "-") + + +def _table_token_matches_name(token: str, name: str, *, normalized: bool = False) -> bool: + """ + 表号精确匹配,避免“表1”误命中“表10”。 + - normalized=True: token/name 已经是 _norm_table_token 结果。 + """ + t = token if normalized else _norm_table_token(token) + n = name if normalized else _norm_table_token(name) + if not t or not n: + return False + if t == n: + return True + # 兼容历史项目:4.3.3 的“烷基化装置运行分析”可能仍存为表4-1,仍应视为表4-2 的同义候选。 + if t == _norm_table_token("表4-2"): + raw_name = str(name or "") + name_plain = re.sub(r"\s+", "", raw_name) + if ("烷基化装置运行分析" in name_plain) and ("考核时间" in name_plain): + if ("表4-1" in name_plain) or ("表4-2" in name_plain) or ("表41" in _norm_table_token(name_plain)): + return True + # 后面不能紧跟 1-2 位数字后即结束或遇到非数字(避免 表1→表10、表2-4→表2-40), + # 但允许紧跟 4 位年份(如 表2-42019年…)或非数字字符(如 表2-4××年…)。 + pattern = re.compile(rf"{re.escape(t)}(?!\d{{1,2}}(?!\d))") + return bool(pattern.search(n)) + + +def _table_token_caption_line_re(token: str) -> re.Pattern[str]: + token_plain = re.sub(r"\s+", "", str(token or "")) + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + return re.compile( + r"(?:^|\n)([^\n]*?" 
+ token_re + r"[^\n]*)\n", + flags=re.IGNORECASE, + ) + + +def _segment_after_table_caption(content: str, token: str) -> str: + """本表表题行之后、下一张「表 x-x …」表题之前的内容(不含引用语中的表号)。""" + text = str(content or "") + cap = _table_token_caption_line_re(token).search(text) + if not cap: + return "" + rest = text[cap.end() :] + next_cap = re.search( + r"\n[^\n]*?表\s*\d+(?:\s*[--.]\s*\d+)*\s+[\u4e00-\u9fff]", + rest, + flags=re.IGNORECASE, + ) + if next_cap: + return rest[: next_cap.start()] + return rest + + +def _segment_has_markdown_table_body(segment: str) -> bool: + seg = str(segment or "") + if not seg.strip(): + return False + return bool( + re.search( + r"(?:|(?:\n[ \t]*\|[^\n]+\|[ \t]*\n[ \t]*\|[-:\s|]+\|))", + seg, + flags=re.IGNORECASE, + ) + ) + + +def _replace_caption_stub_with_authoritative_table( + content: str, token: str, authoritative_block: str +) -> str: + """将「仅有表题/注释、无表体」的占位段替换为要素直出整块(用于 3.3.4 表3-4 等)。""" + text = str(content or "") + block = str(authoritative_block or "").strip() + if not block: + return text + cap = _table_token_caption_line_re(token).search(text) + if not cap: + return text.rstrip() + "\n\n" + block + region_start = cap.start() + if region_start > 0 and text[region_start] == "\n": + region_start += 1 + rest = text[cap.end() :] + next_cap = re.search( + r"\n[^\n]*?表\s*\d+(?:\s*[--.]\s*\d+)*\s+[\u4e00-\u9fff]", + rest, + flags=re.IGNORECASE, + ) + region_end = cap.end() + (next_cap.start() if next_cap else len(rest)) + head = text[:region_start].rstrip("\n") + tail = text[region_end:].lstrip("\n") + if head: + return f"{head}\n\n{block}\n\n{tail}".strip() if tail else f"{head}\n\n{block}".strip() + return f"{block}\n\n{tail}".strip() if tail else block + + +def _table_token_exists(content: str, token: str) -> bool: + text = str(content or "") + t = _norm_table_token(token) + if not text or not t: + return False + # 须有独立表题行;正文「见表3-3~表3-5」等引用不算。 + if not _table_token_caption_line_re(token).search(text): + return False + # 
表体必须紧跟在本表表题与下一张表题之间,不得借用后续表的 Markdown 块(如 3.3.4 仅表3-4 题、表3-5 有体)。 + return _segment_has_markdown_table_body(_segment_after_table_caption(text, token)) + + +def _basic_warnings(section_title: str, content: str) -> list[str]: + warnings: list[str] = [] + if len(content.strip()) < 80: + warnings.append("章节内容过短,建议补充证据后重试") + title_norm = re.sub(r"\s+", "", str(section_title or "")) + if "1.2项目决策要点" in title_norm: + if "1.2.1项目背景" not in content or "1.2.2预期目标" not in content: + warnings.append("1.2 未按固定结构输出(缺少“1.2.1项目背景/1.2.2预期目标”小节)") + if "2.1.1资源与原料评价" in title_norm: + if "原料数量及组成对比表" not in content: + warnings.append("2.1.1 缺少模版规定的「原料数量及组成对比表」标题") + if "原料性质对比表(醚后碳四)" not in content and "原料性质对比表" not in content: + warnings.append("2.1.1 缺少模版规定的「原料性质对比表(醚后碳四)」标题") + if "原料选择加氢工艺技术对比" in content or ( + "表2.6-1" in content + and "原料数量及组成对比" not in content + and "原料选择加氢" in content + ): + warnings.append("2.1.1 不应出现安评类「表2.6-1 原料选择加氢工艺技术对比」等内容,本节仅允许模版主表") + if "附录:原料预处理工艺方案比选" in content or "(非模版主表)" in content: + warnings.append("2.1.1 不应出现附录或“非模版主表”字样,请仅保留模版两张主表") + if "表" in section_title and "|" not in content: + warnings.append("章节标题疑似要求表格,但输出未包含 Markdown 表格") + if "待补充" in content and len(content.strip()) < 140: + warnings.append("缺失信息较多,建议补充材料后重跑") + return warnings + + +def _check_consistency(report: str, project_name: str) -> list[str]: + out: list[str] = [] + if project_name and project_name not in report: + out.append("正文未显式出现项目名称,请检查第一章基本信息。") + amounts = re.findall(r"(\d+(?:\.\d+)?)\s*(亿元|万元|万)", report) + if amounts: + normalized = [f"{v}-{u}" for v, u in amounts] + if len(normalized) >= 4 and len(set(normalized[:10])) >= 6: + out.append("金额口径较分散,建议统一投资/决算/效益统计口径。") + unit_lines = re.findall(r"(?:单位|计量单位)\s*[::]\s*([^\n]{1,40})", report) + if unit_lines and len(set(unit_lines)) > 1: + out.append("检测到多个计量单位定义,建议统一单位说明(如万元、吨/年)。") + years = [int(y) for y in re.findall(r"(20\d{2})年", report)] + if years: + min_y, max_y = min(years), max(years) + if max_y - 
min_y >= 12: + out.append("年份跨度较大,建议复核建设期与运营期时间线是否混写。") + if "待补充" in report: + missing_count = report.count("待补充") + if missing_count >= 10: + out.append(f"全篇“待补充”出现 {missing_count} 次,建议补充关键材料后重跑。") + if _has_conflict_terms(report): + out.append("发现同一指标存在“增加/下降”等相反表述,建议人工复核结论口径。") + return out + + +def _append_report_appendices(db: Session, project_uuid: str, report_text: str) -> str: + """ + 为最终报告追加「附图/附表」(细则顺序:附图在上,附表在下)。 + + 说明: + - 附图:从项目知识库 .docx 中解析嵌入图(全厂/装置物料平衡等),以 Markdown 内嵌图输出; + 解析不到则不输出该项(无占位说明)。 + - 附表:从结构化表(element_tables/element_cells)汇总,优先抓取表名包含「附表」的表。 + """ + base = (report_text or "").strip() + if not base: + base = "" + + appendix_tables = _build_appendix_tables_markdown(db, project_uuid) + appendix_figures = _build_appendix_figures_markdown(db, project_uuid) + + parts = [base] if base else [] + if appendix_figures: + parts.append(appendix_figures) + if appendix_tables: + parts.append(appendix_tables) + return "\n\n".join([p for p in parts if str(p).strip()]).strip() + + +def _build_appendix_tables_markdown(db: Session, project_uuid: str) -> str: + tables = ( + db.query(ElementTable) + .filter(ElementTable.project_id == project_uuid) + .order_by(ElementTable.table_name.asc(), ElementTable.updated_at.desc()) + .all() + ) + appendix = [t for t in tables if "附表" in (t.table_name or "")] + if not appendix: + return "" + + blocks: list[str] = ["## 附表"] + used = 0 + for t in appendix: + md, common_unit = _build_structured_table_markdown(db, t.id, t.table_name) + title = str(t.table_name or "").strip() or f"附表({t.id})" + title = _merge_table_title_with_common_unit(title, common_unit) + if not md: + md = _build_appendix_table_fallback_markdown(title) + if not md: + continue + blocks.append(f"### {title}\n\n{md}") + used += 1 + if used >= 30: + break + return "\n\n".join(blocks).strip() if used else "" + + +def _build_appendix_table_fallback_markdown(table_name: str) -> str: + """ + 当 element_cells 暂无有效数据时,按固定模板输出占位附表,避免附表缺失。 + 当前优先支持:附表8 可研报告和后评价参数对比表。 
+ """ + name = str(table_name or "").replace(" ", "") + if ("附表8" in name) and ("可研报告和后评价参数对比表" in name): + return APPENDIX8_PARAMETER_COMPARISON_TABLE + return "" + + +def _resolve_appendix_figure_blobs_from_kb(db: Session, project_uuid: str) -> dict[int, tuple[bytes, str, str]]: + """自知识库 docx 抽取附图嵌入图:slot -> (blob, content_type, source_filename)。""" + doc_root = Path(settings.DOC_PAT).resolve() + rows = ( + db.query(KbDocument) + .filter(KbDocument.project_id == project_uuid) + .order_by(KbDocument.uploaded_at.desc()) + .all() + ) + per_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]] = [] + for d in rows: + name = str(d.name or "") + if not name.lower().endswith(".docx"): + continue + full = _kb_doc_absolute_file_path_for_model(doc_root, d) + if not full.is_file(): + continue + try: + cand = extract_appendix_figure_candidates_from_docx(full) + except Exception as exc: + logger.warning("appendix figure extraction failed %s: %s", full, exc) + continue + per_doc.append((name, cand)) + return merge_best_appendix_figures(per_doc) + + +def _build_appendix_figures_markdown(db: Session, project_uuid: str) -> str: + """ + 附图固定两项(细则): + - 附图1 全厂物料平衡图 + - 附图2 烷基化装置物料平衡图(常见为装置物料平衡图) + + 仅从知识库 .docx 嵌入对象抽取真实图片;解析不到则不在报告中展示该项(不输出占位说明)。 + """ + targets = APPENDIX_FIGURE_TARGETS + resolved = _resolve_appendix_figure_blobs_from_kb(db, project_uuid) + md_by_slot = appendix_figure_markdown_images(resolved, label_title=list(targets)) + + figure_parts: list[str] = [] + for slot in range(1, len(targets) + 1): + md = md_by_slot.get(slot) + if md and str(md).strip(): + figure_parts.append(str(md).strip()) + if not figure_parts: + return "" + return "## 附图\n\n" + "\n\n".join(figure_parts) + + +def _update_chapter_status( + db: Session, + job: ReportGenerationJob, + chapter: ReportGenerationChapter, + status: str, + error_message: Optional[str], +) -> None: + now = datetime.now() + chapter.status = status + chapter.error_message = error_message + chapter.updated_at = now + 
job.current_section_key = chapter.section_key + job.updated_at = now + db.commit() + + +def _resolve_template(db: Session, template_id: Optional[str]) -> ReportTemplate: + template = None + if template_id: + template = db.query(ReportTemplate).filter(ReportTemplate.id == template_id).first() + if not template: + template = ( + db.query(ReportTemplate) + .filter(ReportTemplate.is_default == True, ReportTemplate.is_active == True) # noqa: E712 + .first() + ) + if not template: + raise HTTPException(status_code=404, detail="未找到可用模板") + return template + + +def _list_template_sections(db: Session, template_id: str) -> list[ReportTemplateSection]: + return ( + db.query(ReportTemplateSection) + .filter(ReportTemplateSection.template_id == template_id) + .order_by(ReportTemplateSection.section_order.asc()) + .all() + ) + + +def _sections_for_generation(sections: list[ReportTemplateSection]) -> list[ReportTemplateSection]: + items = list(sections or []) + if not items: + return [] + + section_nos = { + _extract_section_number(section.section_title or "") + for section in items + if _extract_section_number(section.section_title or "") + } + + filtered: list[ReportTemplateSection] = [] + for section in items: + section_no = _extract_section_number(section.section_title or "") + # 无法解析编号时保持兼容,继续参与生成。 + if not section_no: + filtered.append(section) + continue + # 仅生成叶子节:若存在任一后续子节(前缀匹配 x.y.z ...),则当前节跳过。 + has_children = any(no.startswith(f"{section_no}.") for no in section_nos) + if not has_children: + filtered.append(section) + return filtered + + +def _extract_tokens(text: str) -> list[str]: + src = str(text or "") + zh = re.findall(r"[\u4e00-\u9fa5]{2,8}", src) + en = re.findall(r"[A-Za-z]{3,20}", src.lower()) + raw = zh + en + out: list[str] = [] + seen = set() + for t in raw: + if t in seen: + continue + seen.add(t) + out.append(t) + return out + + +def _fmt_dt(dt: Optional[datetime]) -> Optional[str]: + if not dt: + return None + return dt.strftime("%Y-%m-%d %H:%M:%S") + 
# ---------------------------------------------------------------------------
# Chapter-example selection and small report helpers
# ---------------------------------------------------------------------------

# Section numbers for which no chapter example is injected into the prompt:
#   1.2   - sensitive to "narrative + evidence anchors"; samples taken from
#           other projects (Ningxia Petrochemical style) skew the narrative.
#   2.1.1 - must align strictly with the two raw-material tables of the
#           template; table layouts from other projects interfere.
#   5.1   - Table 5-1 is emitted directly from the contract and the elements;
#           the chapter-5 sample is a simplified three-column table that tends
#           to be output a second time next to the element table.
#   5.2.1 - Tables 5-2/5-3 are fixed by the output contract; the shared
#           chapter-5 sample once mislabelled a table and skews table titles.
#   5.3.1 - only Table 5-4 (from elements) is allowed; the simplified sample
#           table induces duplicate tables.
#   5.4   - only Table 5-7 is allowed; the sample (Table 5-1 comparison)
#           induces duplicate simplified tables.
_NO_EXAMPLE_SECTION_NUMBERS = frozenset({"1.2", "2.1.1", "5.1", "5.2.1", "5.3.1", "5.4"})

# At most this many examples are kept from the raw example text.
_MAX_EXAMPLES = 3

# Explicit separators between examples; "\n====\n" also matches the
# blank-line-padded form "\n\n====\n\n".
_EXAMPLE_SEPARATORS = ("\n---EXAMPLE---\n", "\n====\n")

# "示例1:" / "示例2:" / "示例3:" paragraph markers, accepting the ASCII and the
# full-width colon.
_NUMBERED_EXAMPLE_RE = re.compile(r"\n\s*示例\s*[1-3][::]\s*")

# (keywords looked up in the lower-cased sample, keywords looked up in the title)
_EXAMPLE_TOPIC_RULES = (
    (("万元", "亿元", "投资", "收益"), ("投资", "财务")),
    (("环保", "安全", "排放"), ("影响", "持续")),
    (("结论", "建议", "经验"), ("结论", "综合")),
)

# Term pairs whose co-occurrence in one report is treated as a contradiction.
_CONFLICT_TERM_PAIRS = (
    ("增加", "下降"),
    ("达标", "未达标"),
    ("盈利", "亏损"),
    ("改善", "恶化"),
)


def _select_chapter_example(section_title: str, raw_examples: Optional[str], evidence: dict) -> str:
    """Pick the single best-matching example text for a section.

    Returns "" for the section numbers in ``_NO_EXAMPLE_SECTION_NUMBERS`` and
    when no example can be parsed. Otherwise every parsed example is scored
    (table need, topic keywords shared with the title, length bonus) and the
    first example with the highest score is returned.
    """
    # The section number is resolved once instead of once per excluded section.
    if _extract_section_number(section_title or "") in _NO_EXAMPLE_SECTION_NUMBERS:
        return ""

    samples = _parse_examples(raw_examples)
    if not samples:
        return ""

    title = str(section_title or "")
    has_table_need = ("表" in title) or _evidence_has_table_signal(evidence)

    def _score(sample: str) -> int:
        lowered = sample.lower()
        score = 0
        if has_table_need and ("|" in sample or "表" in sample):
            score += 4
        for sample_terms, title_terms in _EXAMPLE_TOPIC_RULES:
            if any(k in lowered for k in sample_terms) and any(k in title for k in title_terms):
                score += 3
        # Longer samples get a small bonus, capped at 2.
        return score + min(len(sample) // 300, 2)

    # max() returns the first maximal element, i.e. the same pick as a stable
    # descending sort followed by taking element 0.
    return max(samples, key=_score)


def _parse_examples(raw_examples: Optional[str]) -> list[str]:
    """Split the raw example text into at most three example strings.

    Accepted layouts, tried in order: a JSON array of strings, blocks divided
    by ``---EXAMPLE---`` or ``====`` lines, paragraphs introduced by
    "示例1:/示例2:/示例3:". Anything else is returned as a single example.
    """
    text = str(raw_examples or "").strip()
    if not text:
        return []

    # JSON array form: ["example 1", "example 2", "example 3"]
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = json.loads(text)
        except (ValueError, RecursionError):
            # Not valid JSON after all: fall through to the plain-text layouts.
            parsed = None
        if isinstance(parsed, list):
            cleaned = [str(item).strip() for item in parsed if str(item).strip()]
            return cleaned[:_MAX_EXAMPLES]

    for separator in _EXAMPLE_SEPARATORS:
        if separator in text:
            return [part.strip() for part in text.split(separator) if part.strip()][:_MAX_EXAMPLES]

    # A leading "\n" lets a marker at the very start of the text match too.
    blocks = [part.strip() for part in _NUMBERED_EXAMPLE_RE.split("\n" + text) if part.strip()]
    if len(blocks) >= 2:
        return blocks[:_MAX_EXAMPLES]
    return [text]


def _evidence_has_table_signal(evidence: dict) -> bool:
    """Return True when one of the first 8 ``chapterDocs`` looks like a table.

    A document counts when its content contains "|" anywhere or "表" within
    its first 200 characters. Non-dict evidence or a non-list ``chapterDocs``
    yields False.
    """
    docs = evidence.get("chapterDocs") if isinstance(evidence, dict) else []
    if not isinstance(docs, list):
        return False
    for doc in docs[:8]:
        if not isinstance(doc, dict):
            continue
        content = str(doc.get("content") or "")
        if "|" in content or "表" in content[:200]:
            return True
    return False


def _has_conflict_terms(report: str) -> bool:
    """Return True when the report contains both terms of a conflicting pair.

    Occurrences of the second term are masked before the first term is looked
    up, so that "达标" inside "未达标" is not counted as a separate statement.
    """
    text = str(report or "")
    for first, second in _CONFLICT_TERM_PAIRS:
        if second not in text:
            continue
        # "\x00" keeps the neighbours of a removed term from fusing into a
        # spurious match.
        if first in text.replace(second, "\x00"):
            return True
    return False


def _resolve_project(db: Session, project_id: str) -> Optional[Project]:
    """Look a project up by UUID first, then by integer primary key.

    Returns None for an empty id, for an id that matches no UUID and is not an
    integer, and when no row matches.
    """
    if not project_id:
        return None
    by_uuid = db.query(Project).filter(Project.uuid == project_id).first()
    if by_uuid:
        return by_uuid
    try:
        numeric_id = int(project_id)
    except (TypeError, ValueError):
        return None
    return db.query(Project).filter(Project.id == numeric_id).first()


# ---------------------------------------------------------------------------
# services/report_prompt_service.py - prompt builders
# Uses render_prompt, DEFAULT_SECTION_PROMPT_FALLBACK and
# DEFAULT_SELECTED_EXAMPLE_FALLBACK from that module's import header.
# ---------------------------------------------------------------------------

# Truncation limits applied to the repair prompts.
_REPAIR_ORIGINAL_PROMPT_LIMIT = 8000
_REPAIR_EVIDENCE_JSON_LIMIT = 12000

_PRIOR_SIBLING_HEADING = "【同章前序小节正文(时间与金额须保持一致)】"
_PRIOR_SIBLING_RULES = (
    "【同章一致性约束】\n"
    "1. 竣工时间、开工/中交/投产/验收等关键里程碑日期,以及建设投资、总投资、营业收入、利润等各类金额数字,"
    "须与本章前序小节已写明的口径完全一致(年月日表述可适度简化,但不得出现另一套矛盾日期或金额);\n"
    "2. 若【证据包】或【字段级已抽取结果】中某日期/金额与前序小节矛盾,以前序小节为准写入本节,"
    "不得在正文中另写一套矛盾数值;\n"
    "3. 前序小节为「待补充」的字段,本节仍写「待补充」,不得自行编造;\n"
    "4. 可补充本节新增信息,但不得改写或否定前序小节已确立的时间与金额。"
)

_PRIOR_CHAPTERS_HEADING = "【前序章节正文(第1~6章,本章须据此总结)】"
_PRIOR_CHAPTERS_RULES = (
    "【前序章节使用约束】\n"
    "1. 第7章各节是对第1~6章已生成正文的归纳、提炼与升华,不得与前面章节结论矛盾;\n"
    "2. 可概括前文要点,禁止大段照搬;数据与结论须与前文一致;\n"
    "3. 若前序章节某处为「待补充」,本节对应表述也应为「待补充」,不得编造;\n"
    "4. 须由要素管理直出的表格(如表7-1)仍按【章节输出结构约束】执行,不受本条限制。"
)

_SECTION_REFERENCE_HEADING = "【本章参考范文(本节写作蓝本:结构与行文风格须高度贴合;禁止复用数据、禁止照抄)】"
_SECTION_REFERENCE_RULES = (
    "【参考范文使用约束】\n"
    "1. 以范文为写作蓝本:段落数量与顺序、每段主题、论述逻辑、句式笔法与篇幅颗粒度均须与范文高度一致,做到逐段对应、同一笔法;\n"
    "2. 严禁复用范文中的项目名称、时间、金额、指标值等任何事实数据,须全部替换为当前项目证据包的真实值;\n"
    "3. 范文中的表格结构(表头、列顺序、行项)须沿用,但表内数据必须替换为当前项目证据包的值;\n"
    "4. 禁止逐字照抄:不得出现与范文连续相同超过 15 字的文字,须改写措辞做到“形似而文不同”;\n"
    "5. 若范文与证据包存在矛盾,以证据包为准。"
)


def chapter_generation_system_prompt() -> str:
    """Render the system prompt used for chapter generation."""
    return render_prompt("report_generation/chapter_generation_system.md")


def repair_missing_tables_system_prompt() -> str:
    """Render the system prompt used when repairing missing tables."""
    return render_prompt("report_generation/repair_missing_tables_system.md")


def table_format_repair_system_prompt() -> str:
    """Render the system prompt used when repairing table formatting."""
    return render_prompt("report_generation/table_format_repair_system.md")


def _compose_prompt_block(raw_text: str, heading: str, rules: str) -> str:
    """Return ``heading / body / blank line / rules``, or "" for blank text."""
    body = str(raw_text or "").strip()
    if not body:
        return ""
    return f"{heading}\n{body}\n\n{rules}"


def _build_prior_sibling_sections_prompt_block(prior_sibling_sections_text: str) -> str:
    """Wrap the earlier sibling sections of the same chapter with their consistency rules."""
    return _compose_prompt_block(
        prior_sibling_sections_text, _PRIOR_SIBLING_HEADING, _PRIOR_SIBLING_RULES
    )


def _build_prior_chapters_prompt_block(prior_chapters_text: str) -> str:
    """Wrap the text of the preceding chapters with the rules for summarising them."""
    return _compose_prompt_block(
        prior_chapters_text, _PRIOR_CHAPTERS_HEADING, _PRIOR_CHAPTERS_RULES
    )


def _build_section_reference_block(section_reference: str) -> str:
    """Wrap the reference sample of a section with the rules for using it."""
    return _compose_prompt_block(
        section_reference, _SECTION_REFERENCE_HEADING, _SECTION_REFERENCE_RULES
    )


def build_report_chapter_prompt(
    *,
    section_title: str,
    section_prompt: str,
    required_tables_text: str,
    structured_tables_text: str,
    canonical_fields_text: str,
    selected_example: str,
    heading_rule: str,
    section_contract: str,
    evidence_json: str,
    prior_sibling_sections_text: str = "",
    prior_chapters_text: str = "",
    section_reference: str = "",
) -> str:
    """Render the user prompt for generating one report section.

    Empty ``section_prompt`` / ``selected_example`` fall back to the module
    defaults, an empty ``required_tables_text`` is rendered as "无", and the
    three optional context texts are wrapped into their prompt blocks (each
    becomes "" when blank).
    """
    return render_prompt(
        "report_generation/chapter_generation_user_ref_aligned.md",
        section_title=section_title,
        section_prompt=section_prompt or DEFAULT_SECTION_PROMPT_FALLBACK,
        required_tables_text=required_tables_text or "无",
        structured_tables_text=structured_tables_text,
        canonical_fields_text=canonical_fields_text,
        selected_example=selected_example or DEFAULT_SELECTED_EXAMPLE_FALLBACK,
        heading_rule=heading_rule,
        section_contract=section_contract,
        evidence_json=evidence_json,
        prior_sibling_sections_block=_build_prior_sibling_sections_prompt_block(
            prior_sibling_sections_text
        ),
        prior_chapters_block=_build_prior_chapters_prompt_block(prior_chapters_text),
        section_reference_block=_build_section_reference_block(section_reference),
    )


def build_repair_missing_tables_prompt(
    *,
    section_title: str,
    original_prompt: str,
    content: str,
    missing_tables: list[str],
    evidence_json: str,
) -> str:
    """Render the user prompt asking the model to add the missing tables.

    ``original_prompt`` and ``evidence_json`` are truncated to keep the repair
    prompt bounded; empty or missing values are rendered as "".
    """
    return render_prompt(
        "report_generation/repair_missing_tables_user.md",
        section_title=section_title,
        missing_tables=", ".join(missing_tables or []),
        content=content,
        original_prompt=(original_prompt or "")[:_REPAIR_ORIGINAL_PROMPT_LIMIT],
        evidence_json=(evidence_json or "")[:_REPAIR_EVIDENCE_JSON_LIMIT],
    )


def build_table_format_repair_prompt(
    *,
    section_title: str,
    table_specs_json: str,
    content: str,
    evidence_json: str,
) -> str:
    """Render the user prompt asking the model to repair table formatting.

    ``evidence_json`` is truncated to keep the repair prompt bounded.
    """
    return render_prompt(
        "report_generation/table_format_repair_user.md",
        section_title=section_title,
        table_specs_json=table_specs_json,
        content=content,
        evidence_json=(evidence_json or "")[:_REPAIR_EVIDENCE_JSON_LIMIT],
    )
# In-memory registry of report-generation job state, keyed by job id.
# Every read and write goes through _RUNTIME_LOCK.
_RUNTIME_LOCK = threading.RLock()
_JOB_STATES: dict[str, dict[str, Any]] = {}

_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"


def _now_str() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.now().strftime(_TIMESTAMP_FORMAT)


def _chapter_payload(
    *,
    section_key: str,
    section_title: str,
    section_order: int,
    status: str = "pending",
    updated_at: Optional[str] = None,
) -> dict[str, Any]:
    """Build the initial state dict of one chapter.

    ``updated_at`` lets the caller stamp several chapters with one shared
    timestamp; when omitted the current time is used.
    """
    return {
        "sectionKey": section_key,
        "sectionTitle": section_title,
        "sectionOrder": section_order,
        "status": status,
        "content": None,
        "errorMessage": None,
        "updatedAt": updated_at if updated_at is not None else _now_str(),
        "promptText": None,
        "evidencePayload": None,
        "validationPayload": None,
    }


def _find_chapter(job_id: str, section_key: str) -> Optional[tuple[dict[str, Any], dict[str, Any]]]:
    """Return ``(job_state, chapter_state)`` or None when either is missing.

    Must be called with ``_RUNTIME_LOCK`` held; the returned dicts are the
    live objects, not copies.
    """
    state = _JOB_STATES.get(job_id)
    if not state:
        return None
    chapter = state.get("chapters", {}).get(section_key)
    if not chapter:
        return None
    return state, chapter


def _apply_stream_phase(
    state: dict[str, Any],
    chapter: dict[str, Any],
    section_key: str,
    stream_phase: str,
) -> None:
    """Record the stream phase on a chapter and mark it as the current section.

    Job and chapter receive the same ``updatedAt`` value. Must be called with
    ``_RUNTIME_LOCK`` held.
    """
    # Copy so a payload object shared with another holder is never mutated.
    validation_payload = dict(chapter.get("validationPayload") or {})
    validation_payload["streamPhase"] = stream_phase
    stamp = _now_str()
    chapter["validationPayload"] = validation_payload
    chapter["updatedAt"] = stamp
    state["currentSectionKey"] = section_key
    state["updatedAt"] = stamp


def init_job_state(
    *,
    job_id: str,
    project_id: str,
    template_id: Optional[str],
    chapters: list[dict[str, Any]],
) -> None:
    """Create (or replace) the state of a job with all chapters pending.

    Each chapter item must provide ``sectionKey``, ``sectionTitle`` and
    ``sectionOrder``; ``status`` is optional and defaults to "pending".
    The job and all its chapters share one creation timestamp.

    Raises:
        KeyError: a chapter item lacks one of the required keys.
    """
    stamp = _now_str()
    chapter_states: dict[str, dict[str, Any]] = {}
    for item in chapters or []:
        key = str(item["sectionKey"])
        chapter_states[key] = _chapter_payload(
            section_key=key,
            section_title=str(item["sectionTitle"]),
            section_order=int(item["sectionOrder"]),
            status=str(item.get("status") or "pending"),
            updated_at=stamp,
        )
    with _RUNTIME_LOCK:
        _JOB_STATES[job_id] = {
            "jobId": job_id,
            "projectId": project_id,
            "templateId": template_id,
            "status": "pending",
            "progress": 0,
            "currentSectionKey": None,
            "errorMessage": None,
            "createdAt": stamp,
            "updatedAt": stamp,
            "completedAt": None,
            "chapters": chapter_states,
        }


def get_job_state(job_id: str) -> Optional[dict[str, Any]]:
    """Return a deep copy of the job state, or None for an unknown job."""
    with _RUNTIME_LOCK:
        state = _JOB_STATES.get(job_id)
        return deepcopy(state) if state else None


def update_job_state(job_id: str, **fields: Any) -> None:
    """Merge ``fields`` into the job state and refresh ``updatedAt``.

    Unknown job ids are ignored.
    """
    with _RUNTIME_LOCK:
        state = _JOB_STATES.get(job_id)
        if not state:
            return
        state.update(fields)
        state["updatedAt"] = _now_str()


def update_chapter_state(
    job_id: str,
    section_key: str,
    **fields: Any,
) -> None:
    """Merge ``fields`` into one chapter and refresh both ``updatedAt`` values.

    Unknown job ids or section keys are ignored.
    """
    with _RUNTIME_LOCK:
        found = _find_chapter(job_id, section_key)
        if found is None:
            return
        state, chapter = found
        chapter.update(fields)
        stamp = _now_str()
        chapter["updatedAt"] = stamp
        state["updatedAt"] = stamp


def append_chapter_content(
    job_id: str,
    section_key: str,
    delta_text: str,
    *,
    stream_phase: str,
) -> None:
    """Append streamed text to a chapter and record the stream phase.

    An empty ``delta_text`` is a no-op (the phase is not recorded either);
    unknown job ids or section keys are ignored.
    """
    if not delta_text:
        return
    with _RUNTIME_LOCK:
        found = _find_chapter(job_id, section_key)
        if found is None:
            return
        state, chapter = found
        chapter["content"] = str(chapter.get("content") or "") + delta_text
        _apply_stream_phase(state, chapter, section_key, stream_phase)


def set_chapter_stream_phase(job_id: str, section_key: str, stream_phase: str) -> None:
    """Record the stream phase of a chapter without touching its content.

    Unknown job ids or section keys are ignored.
    """
    with _RUNTIME_LOCK:
        found = _find_chapter(job_id, section_key)
        if found is None:
            return
        state, chapter = found
        _apply_stream_phase(state, chapter, section_key, stream_phase)


def remove_job_state(job_id: str) -> None:
    """Drop the state of a job; unknown job ids are ignored."""
    with _RUNTIME_LOCK:
        _JOB_STATES.pop(job_id, None)
class RetrievalService:
    """Retrieval of post-evaluation report material from the vector store."""

    # Number of hits requested from the vector store for each single keyword.
    _PER_KEYWORD_TOP_K = 5

    # Longest excerpt of a reference report that is sent as a similarity query.
    _REFERENCE_QUERY_MAX_CHARS = 500

    # Query used by search_similar_report when no reference text is supplied.
    _DEFAULT_SIMILAR_REPORT_QUERY = "后评价报告 项目概况 技术方案 财务评价"

    # Category -> search keywords.
    _CATEGORY_KEYWORDS = {
        "项目概况": ("项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"),
        "技术方案": ("技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"),
        "财务评价": ("投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"),
        "效益分析": ("经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"),
        "风险分析": ("风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"),
        "后评价结论": ("后评价结论", "经验教训", "建议措施", "综合评价"),
    }

    # Chapter -> search keywords: the category table plus two chapter-specific
    # lists. Keywords are tried in order, so the earliest ones win.
    _CHAPTER_KEYWORDS = {
        **_CATEGORY_KEYWORDS,
        "项目全过程总结与管理评价": (
            # ---- highest priority: tables 1-14 and numbered subsections ----
            "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6",
            "表1原料数量及组成对比表", "表2原料性质对比表",
            "表3前期预测和2019年实际产品对比表",
            "表4装置规模及实际运行负荷对比表",
            "表5项目规模对比表",
            "表6可研报告与基础设计阶段工程内容对比表",
            "表7项目承包商的招投标情况表",
            "表8项目设计主要进度控制情况表",
            "表9施工图设计变更情况表",
            "表10重大设计变更情况表",
            "表11主要设备采购情况表",
            "表12施工重要节点进度表",
            "表13原料性质对比表",
            "表14主要标定结果与设计指标对比表",
            # ---- second priority: structural keywords ----
            "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价",
            "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备",
            "工程监理", "HSE", "竣工验收",
            "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果",
            "原料数量及组成对比", "装置规模", "负荷率",
        ),
        "项目目标和可持续性评价": (
            # highest priority: chapter titles and numbers
            "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5",
            "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价",
            # goal achievement (engineering / technical / economic)
            "工程规模", "项目进度", "工程质量", "项目功能", "投资控制",
            "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核",
            "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润",
            # benchmarking
            "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃",
            # sustainability (resources / products / internal factors / policy)
            "资源分析", "原料供应", "资源保障",
            "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB",
            "项目内部因素", "装置规模合理性", "工艺方案", "技术水平",
            "国家政策", "产业政策", "质量标准",
            # when sustainability is supported by safety / environmental compliance
            "个人风险", "社会风险", "可接受", "风险曲线",
            "非甲烷总烃", "无组织排放", "mg/m3", "标准值",
        ),
    }

    def __init__(self, collection_name: str = "eval_report"):
        """Open the vector-store collection used for retrieval.

        Args:
            collection_name: name of the vector-store collection.
        """
        self.collection_name = collection_name
        self.vector_store = VectorStore(collection_name=collection_name, drop_old=False)

    def search_by_query(self, query: str, top_k: int = 10, filter_project: Optional[str] = None) -> List[Document]:
        """Search material by a free-text query.

        Args:
            query: query text, e.g. "项目背景", "财务评价", "技术方案".
            top_k: number of hits requested from the vector store.
            filter_project: optional project UUID; hits whose
                ``project_uuid`` metadata differs are dropped.

        Returns:
            The matching documents in store order. Each document gets its
            similarity score stored under ``metadata["score"]`` unless that
            key is already present.
        """
        full_query = f"{query} 项目 UUID:{filter_project}" if filter_project else query
        hits = self.vector_store.similarity_search_with_score(full_query, k=top_k)

        # NOTE(review): the project filter runs after the top-k cut, so fewer
        # than top_k documents may be returned even when the project has more
        # matching material.
        docs = []
        for doc, score in hits:
            if filter_project and doc.metadata.get("project_uuid") != filter_project:
                continue
            try:
                # Coerce to a plain float so the score stays JSON-serialisable.
                doc.metadata.setdefault("score", float(score))
            except (TypeError, ValueError):
                pass  # a non-numeric score is simply not recorded
            docs.append(doc)
        return docs

    def _collect_keyword_hits(self, keywords, project_uuid: str, top_k: int) -> List[Dict[str, Any]]:
        """Search each keyword in order and return the first ``top_k`` unique hits.

        Two hits are the same when the first 100 characters of their content
        and their heading agree. Searching stops once ``top_k`` unique hits
        are collected, because later keywords cannot change that result.
        """
        if top_k <= 0:
            return []

        seen = set()
        unique_docs = []
        for keyword in keywords:
            found = self.search_by_query(
                keyword, top_k=self._PER_KEYWORD_TOP_K, filter_project=project_uuid
            )
            for doc in found:
                key = (doc.page_content[:100], doc.metadata.get("heading", ""))
                if key in seen:
                    continue
                seen.add(key)
                unique_docs.append(doc)
            if len(unique_docs) >= top_k:
                break

        return [
            {
                "content": doc.page_content,
                "heading": doc.metadata.get("heading", ""),
                "heading_level": doc.metadata.get("heading_level", 0),
                "doc_id": doc.metadata.get("doc_id", ""),
                "path": doc.metadata.get("path", ""),
                "score": doc.metadata.get("score", 0.0),
            }
            for doc in unique_docs[:top_k]
        ]

    def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Search material for a category.

        Args:
            category: category name such as "项目概况", "技术方案",
                "财务评价" or "效益分析"; an unknown name is used as the
                single keyword.
            project_uuid: project UUID.
            top_k: maximum number of results.

        Returns:
            Result dicts with ``content``, ``heading``, ``heading_level``,
            ``doc_id``, ``path`` and ``score``.
        """
        keywords = self._CATEGORY_KEYWORDS.get(category, (category,))
        return self._collect_keyword_hits(keywords, project_uuid, top_k)

    def get_project_materials(self, project_uuid: str) -> Dict[str, Any]:
        """Fetch the basic, technical, financial and benefit material of a project.

        Args:
            project_uuid: project UUID.

        Returns:
            A dict with the keys ``basic_info``, ``tech_info``,
            ``finance_info`` and ``benefit_info``, each a list of document
            texts (at most 5 per key).
        """
        queries = {
            "basic_info": "项目概况 项目基本情况",
            "tech_info": "技术方案 工艺技术",
            "finance_info": "财务评价 经济效益",
            "benefit_info": "效益分析 社会效益",
        }
        return {
            name: [
                doc.page_content
                for doc in self.search_by_query(query, top_k=5, filter_project=project_uuid)
            ]
            for name, query in queries.items()
        }

    def search_similar_report(self, reference_content: str, top_k: int = 5) -> List[Document]:
        """Search reports similar to a reference text.

        Args:
            reference_content: reference report text; its leading excerpt is
                used as the similarity query. When blank, a generic
                report-outline query is used instead.
            top_k: number of results.

        Returns:
            The documents returned by the vector store.
        """
        excerpt = str(reference_content or "").strip()[: self._REFERENCE_QUERY_MAX_CHARS]
        query = excerpt or self._DEFAULT_SIMILAR_REPORT_QUERY
        hits = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [doc for doc, _score in hits]

    def get_template_data(self, project_uuid: str, query: str = "项目概况 技术方案 财务评价", top_k: int = 15) -> Dict[str, Any]:
        """Fetch material and map it onto the report-template fields.

        Args:
            project_uuid: project UUID.
            query: search query.
            top_k: number of hits requested.

        Returns:
            Without any hit: ``materials`` (empty list), ``template_data``
            and ``key_info`` (empty dicts). Otherwise ``materials``,
            ``materials_text``, ``template_data`` and ``key_info``.
        """
        # NOTE(review): imported lazily from a top-level `report_template`
        # module; confirm that module is available to this service.
        from report_template import ReportTemplate

        materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid)
        if not materials:
            return {
                "materials": [],
                "template_data": {},
                "key_info": {}
            }

        texts = [doc.page_content for doc in materials]
        key_info = ReportTemplate.extract_key_info(list(texts))
        template_data = ReportTemplate.map_materials_to_template(list(texts))

        return {
            "materials": list(materials),
            "materials_text": texts,
            "template_data": template_data,
            "key_info": key_info
        }

    def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Fetch the material of one chapter.

        Args:
            project_uuid: project UUID.
            chapter: chapter name; an unknown name is used as the single
                keyword.
            top_k: maximum number of results.

        Returns:
            Result dicts with ``content``, ``heading``, ``heading_level``,
            ``doc_id``, ``path`` and ``score``.
        """
        keywords = self._CHAPTER_KEYWORDS.get(chapter, (chapter,))
        return self._collect_keyword_hits(keywords, project_uuid, top_k)


# Retrieval demo
if __name__ == "__main__":
    service = RetrievalService()

    # Example 1: project background
    print("示例 1:搜索项目背景")
    for doc in service.search_by_query("项目背景 建设内容", top_k=3):
        print(f"标题:{doc.metadata.get('heading', 'N/A')}")
        print(f"内容:{doc.page_content[:200]}...\n")

    # Example 2: financial evaluation
    print("示例 2:搜索财务评价")
    for doc in service.search_by_query("财务评价 现金流量", top_k=3):
        print(f"标题:{doc.metadata.get('heading', 'N/A')}")
        print(f"内容:{doc.page_content[:200]}...\n")
b/services/standard_elements_2020.py new file mode 100644 index 0000000..09a43f3 --- /dev/null +++ b/services/standard_elements_2020.py @@ -0,0 +1,1395 @@ +""" +2020 版《炼油化工建设项目后评价报告编制细则(修订)》— 附件/附表结构。 + +依据工作区《炼油化工建设项目后评价报告编制细则(修订).doc》抽取的**正文表格**与**附表1~8**表题、表头整理: +- **全局要素表**:第1章项目概况(文字要素)+正文各章**非按年**对比表(表2-1~表3-7、表4-1、表5-1~5-3/5-6/5-7、表6-1、表7-1)+ **附表1、附表2、附表8**。 +- **时间要素表**:**附表3~附表7**(利润表、税金表等按「建设期 / 后评价时点前、后」及年度栏填报)+ 正文**按年(或按评价期)**列表:**表2-4** 产品流向、**表5-4** 生产经营及效益对比、**表5-5** 主要生产经营指标;库中按「**表名 × 日历年**」各建一张时间表,列名与细则表头一致(见 TIME_TABLE_MULTI_COLUMNS)。 + +附表3~7 各行在库中使用「现金流·」「利润·」等**表内前缀**避免不同附表中同名行(如「营业收入」)在抽取路由上冲突;展示时仍以细则原文行名为 lvl3/说明。 + +规则抽取(build_rule_factor_items)仅包含概况 + 附表1/2/8 行 + 少量高频时间指标;完整模版见 ALL_GLOBAL_TABLES + ALL_TIME_TABLE_SPECS。 +""" + +from __future__ import annotations + +def _S(src: str, names: list[str]) -> list[tuple[str, str, str]]: + """章节要素行:source 作为提示词路径,lvl3 默认与 name 一致。""" + return [(n, src, n) for n in names] + + +# 细则“报告摘要、前言、第1~7章”章节要素(非表格字段),用于新建项目预置空值并供 LLM 回填。 +GLOBAL_SECTION_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = [ + ( + "章节要素-摘要与前言", + 100, + _S( + "报告摘要与前言", + [ + "摘要·项目基本情况", + "摘要·总体评价结论", + "摘要·成功度评价结果", + "摘要·主要经验", + "摘要·主要问题", + "摘要·主要建议", + "前言·评价依据", + "前言·评价范围与时点", + "前言·组织方式与过程", + "前言·基础资料清单", + "前言·需解释问题清单", + ], + ), + ), + ( + "章节要素-第1章项目概况", + 110, + _S( + "第1章 项目概况", + [ + "第1章·项目名称", + "第1章·建设单位", + "第1章·建设地点", + "第1章·建设类型", + "第1章·起止时间", + "第1章·建设内容", + "第1章·建设投资", + "第1章·占地面积", + ], + ), + ), + ( + "章节要素-第2章前期工作评价", + 120, + _S( + "第2章 前期工作评价", + [ + "第2章·资源原料评价结论", + "第2章·产品方案评价结论", + "第2章·产品市场评价结论", + "第2章·工艺技术方案评价结论", + "第2章·设备方案评价结论", + "第2章·厂址与外部条件评价结论", + "第2章·总图与配套工程评价结论", + "第2章·技术指标评价结论", + "第2章·风险分析评价结论", + "第2章·可研编制单位资质与选择评价", + "第2章·可研进度评价", + "第2章·可研质量评价", + "第2章·前评估意见采纳落实评价", + "第2章·初步设计评价结论", + "第2章·前期决策程序合规性", + "第2章·前期工作总体结论", + ], + ), + ), + ( + "章节要素-第3章建设实施评价", + 130, + _S( + "第3章 建设实施评价", + [ + "第3章·建设管理模式评价结论", + "第3章·招投标评价结论", + "第3章·施工图设计符合性评价", + "第3章·施工图设计进度评价", 
+ "第3章·施工图设计质量评价", + "第3章·设计变更管理评价", + "第3章·施工准备评价", + "第3章·施工计划执行评价", + "第3章·采购工作评价结论", + "第3章·工程监理评价结论", + "第3章·工程质量评价结论", + "第3章·HSE管理评价结论", + "第3章·三查四定与中间交接评价", + "第3章·竣工验收评价结论", + "第3章·建设实施总体结论", + ], + ), + ), + ( + "章节要素-第4章生产运行评价", + 140, + _S( + "第4章 生产运行评价", + [ + "第4章·生产准备评价结论", + "第4章·联合试运与试生产评价结论", + "第4章·原料供应评价结论", + "第4章·生产运行总体评价", + "第4章·达标评价结论", + "第4章·工艺技术评价结论", + "第4章·设备运行评价结论", + "第4章·公用工程与辅助设施评价结论", + "第4章·生产运行总体结论", + ], + ), + ), + ( + "章节要素-第5章投资与经济效益评价", + 150, + _S( + "第5章 投资与经济效益评价", + [ + "第5章·主要经济指标实现程度评价", + "第5章·投资控制及变动原因结论", + "第5章·投资水平分析结论", + "第5章·资金来源及到位评价结论", + "第5章·投资控制经验教训", + "第5章·营业收入变动原因", + "第5章·总成本费用变动原因", + "第5章·税后利润变动原因", + "第5章·财务后评价IRR", + "第5章·财务后评价NPV", + "第5章·财务后评价回收期", + "第5章·不确定性分析结论", + "第5章·投资与经济效益总体结论", + ], + ), + ), + ( + "章节要素-第6章影响与持续性评价", + 160, + _S( + "第6章 影响与持续性评价", + [ + "第6章·环境影响评价结论", + "第6章·安全影响评价结论", + "第6章·科技进步影响评价结论", + "第6章·社会影响评价结论", + "第6章·项目影响总体结论", + "第6章·资源持续性评价结论", + "第6章·产品持续性评价结论", + "第6章·技术经济竞争力评价结论", + "第6章·项目持续性总体结论", + ], + ), + ), +] + + +def section_table_row_keys(table_group_name: str) -> list[str]: + """返回 ``GLOBAL_SECTION_TABLES`` 中某「章节要素-*」分组的全部行键(与要素库 ``row_key`` 一致)。""" + for name, _, rows in GLOBAL_SECTION_TABLES: + if name == table_group_name: + return [str(r[0]) for r in rows if r and str(r[0]).strip()] + return [] + + +CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP = "章节要素-第1章项目概况" + +# 细则「附表1」项目建设工作程序表(全局,单行键 = 程序节点) +APPENDIX1_PROGRAM_ROWS: list[tuple[str, str, str]] = [ + ("项目建议书(预可研)批复", "附表1 项目建设工作程序表", "项目建议书"), + ("可行性研究报告编制", "附表1 项目建设工作程序表", "可行性研究报告编制"), + ("环境影响报告编制", "附表1 项目建设工作程序表", "环境影响报告编制"), + ("环境影响报告批复", "附表1 项目建设工作程序表", "环境影响报告批复"), + ("安全评价报告编制", "附表1 项目建设工作程序表", "安全评价报告编制"), + ("安全评价报告批复", "附表1 项目建设工作程序表", "安全评价报告批复"), + ("节能评估报告编制", "附表1 项目建设工作程序表", "节能评估报告编制"), + ("节能评估报告批复", "附表1 项目建设工作程序表", "节能评估报告批复"), + ("可行性研究报告评估", "附表1 项目建设工作程序表", "可行性研究报告评估"), + ("可行性研究报告批复", "附表1 项目建设工作程序表", "可行性研究报告批复"), + ("核准报告批复", "附表1 项目建设工作程序表", "核准报告批复"), + ("初步设计编制", "附表1 
项目建设工作程序表", "初步设计编制"), + ("初步设计审查", "附表1 项目建设工作程序表", "初步设计审查"), + ("初步设计批复", "附表1 项目建设工作程序表", "初步设计批复"), + ("施工图设计编制", "附表1 项目建设工作程序表", "施工图设计编制"), + ("开工报告批复", "附表1 项目建设工作程序表", "开工报告批复"), + ("开工建设", "附表1 项目建设工作程序表", "开工建设"), + ("投产运行", "附表1 项目建设工作程序表", "投产运行"), + ("竣工验收", "附表1 项目建设工作程序表", "竣工验收"), +] + +# 细则「附表2」竣工决算投资构成(全局) +APPENDIX2_INVESTMENT_ROWS: list[tuple[str, str, str]] = [ + ("建设投资", "附表2 项目竣工决算投资构成表(万元)", "建设投资"), + ("固定资产投资", "附表2 项目竣工决算投资构成表(万元)", "固定资产投资"), + ("工程费用", "附表2 项目竣工决算投资构成表(万元)", "工程费用"), + ("工艺生产装置", "附表2 项目竣工决算投资构成表(万元)", "工艺生产装置"), + ("总图运输", "附表2 项目竣工决算投资构成表(万元)", "总图运输"), + ("储运工程", "附表2 项目竣工决算投资构成表(万元)", "储运工程"), + ("辅助设施", "附表2 项目竣工决算投资构成表(万元)", "辅助设施"), + ("公用工程", "附表2 项目竣工决算投资构成表(万元)", "公用工程"), + ("生产管理设施", "附表2 项目竣工决算投资构成表(万元)", "生产管理设施"), + ("厂外工程", "附表2 项目竣工决算投资构成表(万元)", "厂外工程"), + ("工器具及生产家具购置费", "附表2 项目竣工决算投资构成表(万元)", "工器具及生产家具购置费"), + ("固定资产其他费用", "附表2 项目竣工决算投资构成表(万元)", "固定资产其他费用"), + ("无形资产费用", "附表2 项目竣工决算投资构成表(万元)", "无形资产费用"), + ("递延资产费用", "附表2 项目竣工决算投资构成表(万元)", "递延资产费用"), + ("固定资产投资方向调节税", "附表2 项目竣工决算投资构成表(万元)", "固定资产投资方向调节税"), + ("建设期利息", "附表2 项目竣工决算投资构成表(万元)", "建设期利息"), + ("铺底流动资金", "附表2 项目竣工决算投资构成表(万元)", "铺底流动资金"), + ("报批项目总投资", "附表2 项目竣工决算投资构成表(万元)", "报批项目总投资"), +] + +# 附表2 规范行键顺序(与 templates/js/quick-fill.js preferredA2Rows 一致) +APPENDIX2_CANONICAL_ROW_ORDER: list[str] = [ + "一 建设投资", + "1 固定资产投资", + "1.1 工程费用", + "1.1.1 工艺生产装置", + "1.1.2 总图运输", + "1.1.3 储运工程", + "1.1.4 辅助设施", + "1.1.5 公用工程", + "1.1.6 生产管理设施", + "1.1.7 厂外工程", + "1.1.8 工器具及生产家具购置费", + "1.2 固定资产其他费用", + "1.2.1 ×××费用", + "1.2.2 ×××费用", + "2 无形资产费用", + "2.1 ×××费用", + "3 递延资产费用", + "3.1 ×××费用", + "二 固定资产投资方向调节税", + "三 建设期利息", + "四 铺底流动资金", + "报批项目总投资", +] + +# 附表2 旧版短行键 → 规范行键(与 quick-fill.js getLegacyAppendix2RowKeyMap 一致) +APPENDIX2_LEGACY_ROW_KEY_MAP: dict[str, str] = { + "建设投资": "一 建设投资", + "固定资产投资": "1 固定资产投资", + "工程费用": "1.1 工程费用", + "工艺生产装置": "1.1.1 工艺生产装置", + "总图运输": "1.1.2 总图运输", + "储运工程": "1.1.3 储运工程", + "辅助设施": "1.1.4 辅助设施", + "公用工程": "1.1.5 
公用工程", + "生产管理设施": "1.1.6 生产管理设施", + "厂外工程": "1.1.7 厂外工程", + "工器具及生产家具购置费": "1.1.8 工器具及生产家具购置费", + "固定资产其他费用": "1.2 固定资产其他费用", + "无形资产费用": "2 无形资产费用", + "递延资产费用": "3 递延资产费用", + "固定资产投资方向调节税": "二 固定资产投资方向调节税", + "建设期利息": "三 建设期利息", + "铺底流动资金": "四 铺底流动资金", + "报批项目总投资": "报批项目总投资", +} + +# 细则「附表8」可研与后评价参数对比(全局;行键与要素表编辑/回填一致;税收明细用「税类·子项」避免与成本 3.x 序号冲突) +_A8 = "附表8 可研报告和后评价参数对比表" +APPENDIX8_PARAM_ROWS: list[tuple[str, str, str]] = [ + ("一 成本参数", _A8, "一 成本参数"), + ("1 原料价格", _A8, "1 原料价格"), + ("1.1 氢气", _A8, "1.1 氢气"), + ("2 催化剂和化学药剂", _A8, "2 催化剂和化学药剂"), + ("3 燃料动力价格", _A8, "3 燃料动力价格"), + ("3.1 除盐水价格", _A8, "3.1 除盐水价格"), + ("3.2 除氧水价格", _A8, "3.2 除氧水价格"), + ("3.3 循环水价格", _A8, "3.3 循环水价格"), + ("3.4 1.0MPa蒸汽价格", _A8, "3.4 1.0MPa蒸汽价格"), + ("3.5 3.5MPa蒸汽价格", _A8, "3.5 3.5MPa蒸汽价格"), + ("3.6 新鲜水", _A8, "3.6 新鲜水"), + ("3.7 电价格", _A8, "3.7 电价格"), + ("3.8 净化风价格", _A8, "3.8 净化风价格"), + ("3.9 氮气价格", _A8, "3.9 氮气价格"), + ("3.10 凝结水", _A8, "3.10 凝结水"), + ("3.11 废渣处置", _A8, "3.11 废渣处置"), + ("4 人员费用", _A8, "4 人员费用"), + ("5 折旧年限", _A8, "5 折旧年限"), + ("6 无形资产摊销年限", _A8, "6 无形资产摊销年限"), + ("7 其他资产摊销年限", _A8, "7 其他资产摊销年限"), + ("8 安全生产费用", _A8, "8 安全生产费用"), + ("9 安保基金", _A8, "9 安保基金"), + ("10 其他制造费用", _A8, "10 其他制造费用"), + ("11 其他管理费用", _A8, "11 其他管理费用"), + ("12 长期贷款利息", _A8, "12 长期贷款利息"), + ("13 短期贷款/流动资产贷款利率", _A8, "13 短期贷款/流动资产贷款利率"), + ("14 其他销售费用", _A8, "14 其他销售费用"), + ("二 营业收入参数", _A8, "二 营业收入参数"), + ("2.1 98#汽油", _A8, "2.1 98#汽油"), + ("2.2 95#汽油", _A8, "2.2 95#汽油"), + ("2.3 92#汽油", _A8, "2.3 92#汽油"), + ("2.4 异丁烷", _A8, "2.4 异丁烷"), + ("2.5 正丁烷", _A8, "2.5 正丁烷"), + ("2.6 燃料气", _A8, "2.6 燃料气"), + ("2.7 液化气", _A8, "2.7 液化气"), + ("三 税收参数", _A8, "三 税收参数"), + ("增值税税率", _A8, "增值税税率"), + ("增值税·汽油各品种产品", _A8, "汽油各品种产品"), + ("增值税·异丁烷", _A8, "异丁烷"), + ("增值税·正丁烷", _A8, "正丁烷"), + ("增值税·燃料气", _A8, "燃料气"), + ("增值税·液化气", _A8, "液化气"), + ("消费税税率", _A8, "消费税税率"), + ("消费税·产品汽油税率", _A8, "产品汽油税率"), + ("城市维护建设税税率", _A8, "城市维护建设税税率"), + ("教育费附加", _A8, "教育费附加"), + ("所得税税率", _A8, "所得税税率"), + ("四 基准收益率", _A8, "四 
基准收益率"), +] + +APPENDIX8_CANONICAL_ROW_ORDER: list[str] = [str(r[0]) for r in APPENDIX8_PARAM_ROWS] + +# 与 quick-fill.js legacyA8Map 一致(旧 row_key → 规范 row_key) +APPENDIX8_LEGACY_ROW_KEY_MAP: dict[str, str] = { + "3.1.1 增值税·产品A税率": "增值税·汽油各品种产品", + "3.1.2 增值税·产品B税率": "增值税·异丁烷", + "3.2.1 消费税·产品A税率": "消费税·产品汽油税率", + "3.2.2 消费税·产品B税率": "消费税·产品汽油税率", + "3.1 增值税税率": "增值税税率", + "3.2 消费税税率": "消费税税率", + "3.3 城市维护建设税税率": "城市维护建设税税率", + "3.4 教育费附加": "教育费附加", + "3.5 所得税税率": "所得税税率", +} + +GLOBAL_APPENDIX_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = [ + ("附表1 项目建设工作程序表", 600, APPENDIX1_PROGRAM_ROWS), + ("附表2 项目竣工决算投资构成表(万元)", 700, APPENDIX2_INVESTMENT_ROWS), + ("附表8 可研报告和后评价参数对比表", 800, APPENDIX8_PARAM_ROWS), +] + + +def _T(src: str, names: list[str]) -> list[tuple[str, str, str]]: + """细则表格行:source 用于检索路径前缀,lvl3 与行名一致便于规则抽取。""" + return [(n, src, n) for n in names] + + +# --------------------------------------------------------------------------- +# 表7-1 项目综合评价评分表:多列(要素权重/评分/得分、指标评分/权重/得分),行键=「指标·要素」 +# --------------------------------------------------------------------------- +TABLE_7_1_SCORING_TABLE_NAME = "表7-1 项目综合评价评分表" + +TABLE_7_1_COLUMN_KEYS: list[str] = [ + "要素权重", + "要素评分", + "要素得分", + "指标评分", + "指标权重", + "指标得分", +] + +# 每行预置的列值(细则给定权重;评分/得分类由填报时录入,默认不写库或 None) +TABLE_7_1_ROW_CELL_DEFAULTS: list[tuple[str, dict[str, str]]] = [ + ("前期工作·资料完备性", {"要素权重": "0.4", "指标权重": "0.2"}), + ("前期工作·程序规范性", {"要素权重": "0.4", "指标权重": "0.2"}), + ("前期工作·前期工作质量", {"要素权重": "0.2", "指标权重": "0.2"}), + ("建设实施·施工图设计质量", {"要素权重": "0.2", "指标权重": "0.2"}), + ("建设实施·管理规范性", {"要素权重": "0.2", "指标权重": "0.2"}), + ("建设实施·合同、招投标及采购", {"要素权重": "0.2", "指标权重": "0.2"}), + ("建设实施·工程质量及进度", {"要素权重": "0.2", "指标权重": "0.2"}), + ("建设实施·施工安全", {"要素权重": "0.1", "指标权重": "0.2"}), + ("建设实施·竣工验收", {"要素权重": "0.1", "指标权重": "0.2"}), + ("生产运行·生产准备", {"要素权重": "0.1", "指标权重": "0.2"}), + ("生产运行·生产装置负荷率", {"要素权重": "0.3", "指标权重": "0.2"}), + ("生产运行·生产达标率", {"要素权重": "0.3", "指标权重": "0.2"}), + ("生产运行·生产运行周期", {"要素权重": 
"0.2", "指标权重": "0.2"}), + ("生产运行·安全环保达标情况", {"要素权重": "0.1", "指标权重": "0.2"}), + ("投资与经济效益·投资控制", {"要素权重": "0.5", "指标权重": "0.2"}), + ("投资与经济效益·经济效益", {"要素权重": "0.5", "指标权重": "0.2"}), + ("影响与持续性·装置规模和技术竞争力", {"要素权重": "0.4", "指标权重": "0.2"}), + ("影响与持续性·安全环保节能等政策影响", {"要素权重": "0.3", "指标权重": "0.2"}), + ("影响与持续性·科技进步和社会影响", {"要素权重": "0.1", "指标权重": "0.2"}), + ("影响与持续性·资源持续性", {"要素权重": "0.2", "指标权重": "0.2"}), + ("综合得分", {"要素权重": "1.0"}), +] + +TABLE_7_1_FIELDS: list[tuple[str, str, str]] = [ + (rk, "7.1.2 成功度评价", rk) for rk, _ in TABLE_7_1_ROW_CELL_DEFAULTS +] + +# --------------------------------------------------------------------------- +# 多列表格:与 ALL_GLOBAL_TABLES 中 table_name 一致。 +# 元组为 (数据列名列表, 行级默认单元格值);行键仍为「项目/要素名称」列(与单列表相同)。 +# --------------------------------------------------------------------------- +MULTI_COLUMN_GLOBAL_SPECS: dict[str, tuple[list[str], Optional[dict[str, dict[str, str]]]]] = { + "附表1 项目建设工作程序表": ( + ["开始时间", "完成时间", "文号", "部门/单位", "备注"], + None, + ), + "附表2 项目竣工决算投资构成表(万元)": ( + [ + "设备购置", + "安装工程", + "建筑工程", + "其他费用", + "合计", + "其中外汇", + "占建设投资的比例(%)", + "备注", + ], + None, + ), + "附表8 可研报告和后评价参数对比表": ( + ["单位", "可研报告", "后评价报告", "备注"], + None, + ), + TABLE_7_1_SCORING_TABLE_NAME: ( + TABLE_7_1_COLUMN_KEYS, + dict(TABLE_7_1_ROW_CELL_DEFAULTS), + ), + # 正文表(细则列结构,便于多列采集与回填) + # 细则表2-1:行键=原料名称;列与 Word 表头一致(序号仅展示用,不入库为列) + "表2-1 资源(原料)组成、数量对比表": ( + [ + "规格", + "可研报告数量(万吨)", + "可研报告占比(%)", + "初步设计数量(万吨)", + "初步设计占比(%)", + "实际生产数量(万吨)", + "实际生产占比(%)", + "备注", + ], + None, + ), + "表2-2 资源(原料)性质对比表": ( + ["可研报告", "初步设计", "实际生产", "备注"], + None, + ), + "表2-3 产品方案对比表": ( + ["可研规格", "可研数量(万吨/年)", "实际规格", "实际数量(万吨/年)", "备注"], + None, + ), + "表2-5 总图、储运、公用工程及辅助工程对比": ( + ["单位", "可研报告", "初步设计", "实际实施", "备注"], + None, + ), + "表2-6 储运、公用工程及辅助工程依托对比": ( + ["单位", "可研报告", "初步设计", "实际实施", "备注"], + None, + ), + "表2-7 主要设计指标对比表": ( + ["单位", "可研报告", "初步设计", "实际运行", "备注"], + None, + ), + "表5-1 主要经济指标对比表": ( + ["单位", "可研值", "后评价值", "差值", "比例(%)", "备注"], + None, + ), + 
"表3-1 项目承包单位情况": ( + ["承包单位", "(合同金额)(万元)", "是/否招标", "资质情况"], + None, + ), + "表3-2 施工图设计进度情况": ( + ["设计单位", "合同期限", "实际执行情况", "备注"], + None, + ), + "表3-3 施工图设计变更情况(全厂性项目)": ( + ["设计变更(份数)", "设计变更金额(万元)", "备注"], + None, + ), + "表3-4 施工图设计变更情况(单装置项目)": ( + ["设计变更(份数)", "设计变更金额(万元)", "备注"], + None, + ), + "表3-5 影响投资或工期重(较)大设计变更及原因分析": ( + ["变更内容", "金额(万元)", "原因", "备注"], + None, + ), + "表3-6 施工进度情况": ( + ["施工单位", "合同期限", "实际执行情况", "备注"], + None, + ), + "表3-7 采购工作情况": ( + ["采购方式", "制造商", "供货商", "金额(万元)", "未招标原因"], + None, + ), + "表4-1 投产以来运行周期统计表": ( + [ + "本周期开工日期", + "本周期运行时间(天)", + "非计划停工·次数(次)", + "非计划停工·时数(时)", + "原因简要分析", + ], + None, + ), + "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)": ( + ["单位", "设计值", "标定值", "实际值", "备注"], + None, + ), + "表5-2 投资变动情况表(单位:万元、万美元)": ( + [ + "投资估算", + "初设概算", + "竣工决算", + "决算较估算·差额", + "决算较估算·比例(%)", + "决算较概算·差额", + "决算较概算·比例(%)", + ], + None, + ), + "表5-3 工程费用变动情况表(万元、万美元)": ( + [ + "投资估算", + "初设概算", + "竣工决算", + "决算较估算·差额", + "决算较估算·比例(%)", + "决算较概算·差额", + "决算较概算·比例(%)", + ], + None, + ), + "表5-6 不同因素变化对项目内部收益率的影响": ( + ["财务内部收益率(%)", "变化幅度", "占比"], + None, + ), + "表5-7 内部收益率为基准收益率时不确定因素临界点或临界值": ( + ["单位", "数值", "备注"], + None, + ), + "表6-1 装置技术经济指标对比表": ( + [ + "技术来源", + "规模(万吨/年)", + "物耗(Wt)%", + "能耗(kgEo/t)", + "产品质量", + "产品收率(Wt)%", + "排名", + ], + None, + ), +} + +# 时间表默认列(细则附表4/6/7、表5-5 等以「后评价时点前/后」分栏;具体见 TIME_TABLE_MULTI_COLUMNS)。 +TIME_APPENDIX_MULTI_COLUMNS: list[str] = ["后评价时点前实际值", "后评价时点后预测值"] + +TIME_TABLE_MULTI_COLUMNS: dict[str, list[str]] = { + "附表3 项目投资财务现金流量表(万元)": ["建设期", "后评价时点前实际值", "后评价时点后预测值"], + "附表4 利润与利润分配计算表(万元)": ["后评价时点前实际值", "后评价时点后预测值"], + "附表5 营业收入与营业税金及附加计算表(万元)": [ + "价格(元/t)", + "后评价时点前实际值", + "后评价时点后预测值", + ], + "附表6 总成本费用计算表(万元)": ["后评价时点前实际值", "后评价时点后预测值"], + "附表7 原材料、燃料及动力费用计算表(万元)": ["后评价时点前实际值", "后评价时点后预测值"], + "表2-4 ××年项目主要产品流向状况": [ + "规格", + "实际产量", + "销量", + "产品实际流向", + "可研报告产品流向", + "备注", + ], + # 与前端多年栏一致:每年三列,首年槽位「××年#1」,避免多栏共用一个 col_key + "表5-4 生产经营及效益情况对比表": [ + "可研报告|××年#1", + "实际值|××年#1", + 
"增减(%)|××年#1", + ], + "表5-5 主要生产经营指标": ["后评价时点前实际值", "后评价时点后预测值"], +} + + +def _norm_time_table_lookup_key(name: str) -> str: + """空白/全半角括号/连字符差异下稳定匹配 TIME_TABLE_MULTI_COLUMNS 键(如「(万元)」与「(万元)」)。""" + t = "".join(str(name or "").split()) + t = ( + t.replace("(", "(") + .replace(")", ")") + .replace("-", "-") + .replace("—", "-") + .replace("–", "-") + ) + return t.casefold() + + +def time_table_default_columns_for_name(table_name: str) -> list[str] | None: + """时间表默认列顺序:先精确命中,再规范化表名后模糊命中。""" + raw = str(table_name or "").strip() + if not raw: + return None + hit = TIME_TABLE_MULTI_COLUMNS.get(raw) + if hit is not None: + return list(hit) + n = _norm_time_table_lookup_key(raw) + for k, v in TIME_TABLE_MULTI_COLUMNS.items(): + if _norm_time_table_lookup_key(k) == n: + return list(v) + return None + + +# 细则正文「第2章~第7章」表格(与 Word 中表题一致;行键在可能冲突处加类别前缀) +GLOBAL_CHAPTER_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = [ + ( + "表2-1 资源(原料)组成、数量对比表", + 850, + _T("2.1.1 资源与原料评价", ["气分重碳四", "MTBE醚后碳四", "氢气", "合计"]), + ), + ( + "表2-2 资源(原料)性质对比表", + 851, + _T( + "2.1.1 资源与原料评价", + ["密度(kg/m³)", "硫含量(ppm)", "氮含量(ppm)", "其它指标(可增删)"], + ), + ), + ( + "表2-3 产品方案对比表", + 852, + _T( + "2.1.2.1 产品方案评价", + [ + "汽油", + "航空煤油", + "柴油", + "XX化工品", + "XX润滑油", + "其它产品", + "轻油产品率(%)", + "综合商品率(%)", + "柴汽比", + ], + ), + ), + ( + "表2-5 总图、储运、公用工程及辅助工程对比", + 854, + _T( + "2.1.5 总图及系统配套工程评价", + [ + "占地面积", + "建筑面积", + "铁路专用线", + "产品仓库面积", + "产品储罐总容积", + "原料储罐总容积", + "净水厂总能力", + "循环水厂总能力", + "污水处理厂总能力", + "总变电所总容量", + "锅炉供热总能力", + "辅助设施", + "其它(可增删)", + ], + ), + ), + ( + "表2-6 储运、公用工程及辅助工程依托对比", + 855, + _T( + "2.1.5 总图及系统配套工程评价", + [ + "依托·铁路专用线", + "依托·产品仓库面积", + "依托·原料储罐容积", + "依托·产品储罐容积", + "依托·净化水厂能力", + "依托·循环水厂能力", + "依托·污水处理厂能力", + "依托·总变电所容量", + "依托·锅炉供热能力", + "依托·辅助设施", + "依托·其它(可增删)", + ], + ), + ), + ( + "表2-7 主要设计指标对比表", + 856, + _T( + "2.1.7 主要技术指标评价", + [ + "原油加工量", + "综合商品率", + "全厂柴汽比", + "全厂新鲜水耗", + "全厂平均电耗", + "能耗", + "其它综合指标", + "常减压蒸馏装置能耗", + "其它装置指标(可增删)", + ], + ), + 
), + ( + "表3-1 项目承包单位情况", + 860, + _T("3.2 招投标评价", ["承包单元·示例1", "承包单元·示例2", "承包单元·示例3"]), + ), + ( + "表3-2 施工图设计进度情况", + 861, + _T("3.3.2 设计进度评价", ["工艺装置", "公用工程", "辅助设施"]), + ), + ( + "表3-3 施工图设计变更情况(全厂性项目)", + 862, + _T("3.3.4 施工图设计变更管理评价", ["工艺装置", "公用工程", "辅助设施", "合计"]), + ), + ( + "表3-4 施工图设计变更情况(单装置项目)", + 863, + _T("3.3.4 施工图设计变更管理评价", ["工艺", "电气", "其它专业(可增删)", "合计"]), + ), + ( + "表3-5 影响投资或工期重(较)大设计变更及原因分析", + 864, + _T("3.3.4 施工图设计变更管理评价", ["重大变更·示例1", "重大变更·示例2", "重大变更·示例3"]), + ), + ( + "表3-6 施工进度情况", + 865, + _T("3.4.2 施工计划的执行情况", ["工艺装置", "公用工程", "辅助设施"]), + ), + ( + "表3-7 采购工作情况", + 866, + _T( + "3.5 采购工作评价", + [ + "采购物资·示例1", + "采购物资·示例2", + "采购物资·示例3", + "应招标数量(个)", + "招标数量率(%)", + "应招标金额(万元)", + "招标金额率(%)", + ], + ), + ), + ( + "表4-1 投产以来运行周期统计表", + 870, + _T("4.3.2 生产运行总体情况评价", ["运行周期·装置示例1", "运行周期·装置示例2"]), + ), + ( + "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)", + 871, + _T( + "4.3.3 达标评价", + [ + "生产能力", + "主要原材料(代表物料)", + "主要产品产量(代表产品)", + "公用工程消耗·水", + "公用工程消耗·蒸汽", + "公用工程消耗·电", + "公用工程消耗·燃料气", + "综合能耗", + "现金加工成本", + "单位毛利", + "其它(可增删)", + ], + ), + ), + ( + "表5-1 主要经济指标对比表", + 880, + _T( + "5.1 主要经济指标实现程度评价", + [ + "1 项目报批总投资", + "1.1 建设投资", + "1.2 建设期利息", + "1.3 铺底流动资金", + "2 年均营业收入", + "3 年均总成本费用", + "4 年均流转税金及附加", + "5 年均利润总额", + "6 年均所得税金", + "7 年均税后利润", + "8 项目投资内部收益率(税后)", + "9 项目投资财务净现值(税后)", + "10 项目静态投资回收期(含建设期)", + ], + ), + ), + ( + "表5-2 投资变动情况表(单位:万元、万美元)", + 881, + _T( + "5.2.1 投资控制及变动原因分析", + [ + "批准单位", + "批准文号", + "一 建设投资", + "1 固定资产投资", + "1.1 工程费用", + "1.1.1 工艺生产装置", + "1.1.2 总图运输", + "1.1.3 储运工程", + "1.1.4 辅助设施", + "1.1.5 公用工程", + "1.1.6 生产管理设施", + "1.1.7 厂外工程", + "1.1.8 工器具及生产家具购置费", + "1.2 固定资产其它费用", + "1.2.1 ×××费用", + "1.2.2 ×××费用", + "2 无形资产费用", + "2.1 ×××费用", + "3 递延资产费用", + "3.1 ×××费用", + "4 预备费用", + "4.1 基本预备费", + "4.2 价差预备费", + "二 固定资产投资方向调节税", + "三 建设期利息", + "四 铺底流动资金", + # 与附表2「报批项目总投资」行键区分,避免全局回填串表 + "报批项目总投资(投资变动表)", + "其中:外汇(投资变动表)", + ], + ), + ), + ( + "表5-3 工程费用变动情况表(万元、万美元)", + 882, + _T( + "5.2.1 投资控制及变动原因分析", + [ + 
"批准单位", + "批准文号", + "工程费用", + "工程费用·其中:外汇", + "1 工艺生产装置", + "1 工艺生产装置·其中:外汇", + "1.1 ×××装置", + "1.1 ×××装置·其中:外汇", + "1.1.1 设备购置费", + "1.1.1 设备购置费·其中:外汇", + "1.1.2 安装工程费", + "1.1.2 安装工程费·其中:外汇", + "1.1.3 建筑工程费", + "1.2 ×××装置", + "1.2 ×××装置·其中:外汇", + "1.2.1 设备购置费", + "1.2.1 设备购置费·其中:外汇", + "1.2.2 安装工程费", + "1.2.2 安装工程费·其中:外汇", + "1.2.3 建筑工程费", + "2 总图运输", + "3 储运工程", + "其它分项(可增删)", + "工程费用合计", + ], + ), + ), + ( + "表5-6 不同因素变化对项目内部收益率的影响", + 890, + _T( + "5.3.2 项目经济效益后评价", + [ + "可研报告(基准)", + "后评价报告", + "建设投资变动", + "价格体系变动", + "生产负荷变动", + "建设周期变动", + "其它因素(可增删)", + ], + ), + ), + ( + "表5-7 内部收益率为基准收益率时不确定因素临界点或临界值", + 891, + _T( + "5.4 不确定性分析", + ["生产负荷临界点", "产品价格临界值", "主要原材料价格临界值", "其它不确定因素"], + ), + ), + ( + "表6-1 装置技术经济指标对比表", + 895, + _T( + "6.2.3 主要技术及经济指标对比", + [ + "XX装置·示例1", + "XX装置·示例2", + "XX装置·示例3", + "XX装置·示例4", + "XX装置·示例5", + ], + ), + ), + ( + TABLE_7_1_SCORING_TABLE_NAME, + 896, + TABLE_7_1_FIELDS, + ), +] + +ALL_GLOBAL_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = ( + GLOBAL_SECTION_TABLES + GLOBAL_CHAPTER_TABLES + GLOBAL_APPENDIX_TABLES +) + +# 表5-3 新旧 row_key 同义组(与 templates/js/quick-fill.js preferred53Specs 一致;报告/要素展示优先新键) +TABLE_5_3_ROW_KEY_ALTERNATES: tuple[tuple[str, ...], ...] 
= ( + ("批准单位", "工程费用变动·批准单位"), + ("批准文号", "工程费用变动·批准文号"), + ("工程费用",), + ("工程费用·其中:外汇",), + ("工程费用合计", "工程费用变动·工程费用合计"), + ("1 工艺生产装置", "工程费用变动·工艺生产装置"), + ("1 工艺生产装置·其中:外汇",), + ("1.1 ×××装置",), + ("1.1 ×××装置·其中:外汇",), + ("1.1.1 设备购置费", "工程费用变动·装置·设备购置费"), + ("1.1.1 设备购置费·其中:外汇",), + ("1.1.2 安装工程费", "工程费用变动·装置·安装工程费"), + ("1.1.2 安装工程费·其中:外汇",), + ("1.1.3 建筑工程费", "工程费用变动·装置·建筑工程费"), + ("1.2 ×××装置",), + ("1.2 ×××装置·其中:外汇",), + ("1.2.1 设备购置费",), + ("1.2.1 设备购置费·其中:外汇",), + ("1.2.2 安装工程费",), + ("1.2.2 安装工程费·其中:外汇",), + ("1.2.3 建筑工程费",), + ("2 总图运输", "工程费用变动·总图运输"), + ("3 储运工程", "工程费用变动·储运工程"), + ("其它分项(可增删)", "工程费用变动·其它分项(可增删)"), +) + + +def _norm_global_table_lookup_key(name: str) -> str: + """空白/全半角括号/连字符差异下稳定匹配 ALL_GLOBAL_TABLES 表名。""" + return _norm_time_table_lookup_key(name) + + +def global_table_row_keys(table_name: str) -> list[str]: + """返回 ``ALL_GLOBAL_TABLES`` 中某张全局表的 canonical 行键顺序(与要素库 ``row_key`` 一致)。""" + raw = str(table_name or "").strip() + if not raw: + return [] + for name, _, fields in ALL_GLOBAL_TABLES: + if name == raw: + return [str(r[0]) for r in fields if r and str(r[0]).strip()] + n = _norm_global_table_lookup_key(raw) + for name, _, fields in ALL_GLOBAL_TABLES: + if _norm_global_table_lookup_key(name) == n: + return [str(r[0]) for r in fields if r and str(r[0]).strip()] + return [] + + +def canonical_row_order_for_table(table_name: str) -> list[str] | None: + """附表2~8 及时间附表3~7 的标准行键顺序(与 quick-fill.js preferred*Rows / TIME_APPENDIX_SPECS 一致)。""" + raw = str(table_name or "").strip() + if not raw: + return None + n = _norm_global_table_lookup_key(raw) + if "附表2" in raw and "项目竣工决算投资构成表" in raw: + return list(APPENDIX2_CANONICAL_ROW_ORDER) + for spec_name, rows in TIME_APPENDIX_SPECS: + if _norm_global_table_lookup_key(spec_name) == n: + return list(rows) + for spec_name, rows in TIME_BODY_SPECS: + if _norm_global_table_lookup_key(spec_name) == n: + return list(rows) + if "附表8" in raw and "可研报告和后评价参数对比表" in raw: + return 
list(APPENDIX8_CANONICAL_ROW_ORDER) + return None + + +# 规则 /factors 抽取用:仅概况 + 附表1/2/8,避免正文几十张表拖慢超时;正文表仍完整预置在 ALL_GLOBAL_TABLES。 +RULE_GLOBAL_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = ( + GLOBAL_SECTION_TABLES + GLOBAL_APPENDIX_TABLES +) + +# --------------------------------------------------------------------------- +# 时间维度:附表3~7 + 正文按年表;每(表名, 年)一张 ElementTable,行键与细则「项目名称」一致并加表前缀防冲突。 +# --------------------------------------------------------------------------- +_CF = "现金流量·" +_PL = "利润表·" +_TX = "税金表·" +_CT = "成本表·" +_MT = "料燃动·" + +TIME_APPENDIX_SPECS: list[tuple[str, list[str]]] = [ + ( + "附表3 项目投资财务现金流量表(万元)", + [ + _CF + "1 现金流入", + _CF + "1.1 营业收入", + _CF + "1.2 回收固定资产余值", + _CF + "1.3 回收流动资金", + _CF + "2 现金流出", + _CF + "2.1 建设投资", + _CF + "2.2 流动资金", + _CF + "2.3 经营成本", + _CF + "2.4 营业税金及附加", + _CF + "2.5 调整所得税", + _CF + "3 净现金流量", + _CF + "计算指标·所得税后财务内部收益率(%)", + _CF + "计算指标·所得税后财务净现值(万元)", + _CF + "计算指标·所得税后静态投资回收期(年)", + ], + ), + ( + "附表4 利润与利润分配计算表(万元)", + [ + _PL + "1 营业收入", + _PL + "2 总成本费用", + _PL + "3 营业税金及附加", + _PL + "4 利润总额", + _PL + "5 弥补以前年度亏损", + _PL + "6 应纳税所得额", + _PL + "7 所得税", + _PL + "8 净利润", + _PL + "9 盈余公积及公益金", + _PL + "10 可供分配利润", + _PL + "11 息税前利润总额", + _PL + "12 调整所得税", + ], + ), + ( + "附表5 营业收入与营业税金及附加计算表(万元)", + [ + _TX + "1 营业收入", + _TX + "1.1 产品A·销量", + _TX + "1.1 产品A·营业收入", + _TX + "1.1 产品A·销项税", + _TX + "1.2 产品B·销量", + _TX + "1.2 产品B·营业收入", + _TX + "1.2 产品B·销项税", + _TX + "1.3 产品·……", + _TX + "3 增值税", + _TX + "3.1 销项税", + _TX + "3.2 进项税", + _TX + "3.3 设备材料进项税", + _TX + "4 消费税", + _TX + "4.1 汽油", + _TX + "4.2 柴油", + _TX + "4.3 ……", + _TX + "5 城建税", + _TX + "6 教育费附加", + _TX + "7 营业税金及附加", + ], + ), + ( + "附表6 总成本费用计算表(万元)", + [ + _CT + "1 生产成本", + _CT + "1.1 原材料", + _CT + "1.2 辅助材料", + _CT + "1.3 燃料", + _CT + "1.4 动力", + _CT + "1.5 员工工资及福利", + _CT + "1.6 制造费用", + _CT + "1.6.1 折旧费", + _CT + "1.6.2 修理费", + _CT + "1.6.3 其他制造费用", + _CT + "2 期间费用", + _CT + "2.1 无形资产摊销", + _CT + "2.2 递延资产摊销", + _CT + "2.3 安全生产费用", + 
_CT + "2.4 安保基金", + _CT + "2.5 其他管理费", + _CT + "3 财务费用", + _CT + "3.1 长期借款利息", + _CT + "3.2 流动资金借款利息", + _CT + "4 总成本费用", + _CT + "4.1 固定成本", + _CT + "4.2 可变成本", + _CT + "5 经营成本", + _CT + "6 单位加工成本", + _CT + "7 单位产品生产成本(化工项目)", + ], + ), + ( + "附表7 原材料、燃料及动力费用计算表(万元)", + [ + _MT + "1 原材料费用", + _MT + "1.1 原料A", + _MT + "1.1 原料A·单价", + _MT + "1.1 原料A·数量", + _MT + "1.1 原料A·进项税额", + _MT + "1.2 原料B", + _MT + "1.2 原料B·……", + _MT + "2 辅助材料费用", + _MT + "2.1 辅助材料A", + _MT + "2.1 辅助材料A·单价", + _MT + "2.1 辅助材料A·数量", + _MT + "2.1 辅助材料A·进项税额", + _MT + "2.2 辅助材料B", + _MT + "2.2 辅助材料B·……", + _MT + "3 燃料费", + _MT + "3.1 燃料A", + _MT + "3.1 燃料A·单价", + _MT + "3.1 燃料A·数量", + _MT + "3.1 燃料A·进项税额", + _MT + "3.2 燃料B", + _MT + "3.2 燃料B·……", + _MT + "4 动力费", + _MT + "4.1 动力A", + _MT + "4.1 动力A·单价", + _MT + "4.1 动力A·数量", + _MT + "4.1 动力A·进项税额", + _MT + "4.2 动力B", + _MT + "4.2 动力B·……", + _MT + "5 进项税合计", + ], + ), +] + +# 细则正文要求按年(或评价年)填报的表格,与附表3~7 相同按「年 × 表」预置 +TIME_BODY_SPECS: list[tuple[str, list[str]]] = [ + ( + "表2-4 ××年项目主要产品流向状况", + [ + "产品名称·1", + "产品名称·2", + "产品名称·3", + "小计", + ], + ), + ( + "表5-4 生产经营及效益情况对比表", + [ + "运行情况·生产天数", + "运行情况·负荷率", + "主要原料价格·氢气", + "主要产品年产量·98#汽油", + "主要产品年产量·95#汽油", + "主要产品年产量·92#汽油", + "主要产品年产量·异丁烷", + "主要产品年产量·正丁烷", + "主要产品年产量·燃料气", + "主要产品年产量·液化气", + "主要产品年销售量·98#汽油", + "主要产品年销售量·95#汽油", + "主要产品年销售量·92#汽油", + "主要产品年销售量·异丁烷", + "主要产品年销售量·正丁烷", + "主要产品年销售量·燃料气", + "主要产品年销售量·液化气", + "主要原料和公用工程消耗量·氢气", + "主要原料和公用工程消耗量·辅助材料", + "主要原料和公用工程消耗量·电", + "主要原料和公用工程消耗量·净化风", + "主要原料和公用工程消耗量·循环水", + "主要原料和公用工程消耗量·除盐水", + "主要原料和公用工程消耗量·除氧水", + "主要原料和公用工程消耗量·蒸汽1.0MPa", + "主要原料和公用工程消耗量·蒸汽3.5MPa", + "主要原料和公用工程消耗量·氮气", + "主要原料和公用工程消耗量·废渣处理", + "主要原料和公用工程消耗量·新鲜水", + "主要原料和公用工程消耗量·凝结水", + "主要经济指标·营业收入", + "主要经济指标·成本费用", + "主要经济指标·利润总额", + "主要经济指标·税后利润", + ], + ), + ( + "表5-5 主要生产经营指标", + [ + "生产负荷", + "原料消耗量", + "燃料消耗量", + "动力消耗量", + "产品产量", + "其它", + ], + ), +] + +ALL_TIME_TABLE_SPECS: list[tuple[str, list[str]]] = TIME_APPENDIX_SPECS + TIME_BODY_SPECS + +# 
哈尔滨石化分公司烷基化装置建设项目 — 表5-4 生产经营及效益情况对比(可研预测 vs 时点前实际值 2019 年) +# 通过 element_service.apply_harbin_alkylation_table54_preset 或 POST .../presets/harbin-alkylation-table54 写入指定项目的要素表。 +# 列键须与 TIME_TABLE_MULTI_COLUMNS 中「可研报告|××年#1」一致,避免与模板同步占位列并存时出现双「××年」空栏。 +HARBIN_ALKYLATION_TABLE54_CELL_VALUES: dict[str, dict[str, str]] = { + "运行情况·生产天数": {"可研报告|××年#1": "365", "实际值|××年#1": "334", "增减(%)|××年#1": "-8.49"}, + "运行情况·负荷率": {"可研报告|××年#1": "100.00%", "实际值|××年#1": "42.87%", "增减(%)|××年#1": "-57.13"}, + "主要原料价格·氢气": {"可研报告|××年#1": "4000", "实际值|××年#1": "4376.74", "增减(%)|××年#1": "9.42"}, + "主要产品年产量·98#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.30", "增减(%)|××年#1": "-72.16"}, + "主要产品年产量·95#汽油": {"可研报告|××年#1": "14.03", "实际值|××年#1": "6.13", "增减(%)|××年#1": "-56.34"}, + "主要产品年产量·92#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.00", "增减(%)|××年#1": "-100.00"}, + "主要产品年产量·异丁烷": {"可研报告|××年#1": "", "实际值|××年#1": "1.38", "增减(%)|××年#1": "/"}, + "主要产品年产量·正丁烷": {"可研报告|××年#1": "5.64", "实际值|××年#1": "1.17", "增减(%)|××年#1": "-79.27"}, + "主要产品年产量·燃料气": {"可研报告|××年#1": "", "实际值|××年#1": "0.12", "增减(%)|××年#1": "/"}, + "主要产品年产量·液化气": {"可研报告|××年#1": "-21.93", "实际值|××年#1": "-9.24", "增减(%)|××年#1": "-57.85"}, + "主要产品年销售量·98#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.30", "增减(%)|××年#1": "-72.16"}, + "主要产品年销售量·95#汽油": {"可研报告|××年#1": "14.03", "实际值|××年#1": "6.13", "增减(%)|××年#1": "-56.34"}, + "主要产品年销售量·92#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.00", "增减(%)|××年#1": "-100.00"}, + "主要产品年销售量·异丁烷": {"可研报告|××年#1": "", "实际值|××年#1": "1.38", "增减(%)|××年#1": "/"}, + "主要产品年销售量·正丁烷": {"可研报告|××年#1": "5.64", "实际值|××年#1": "1.17", "增减(%)|××年#1": "-79.27"}, + "主要产品年销售量·燃料气": {"可研报告|××年#1": "", "实际值|××年#1": "0.12", "增减(%)|××年#1": "/"}, + "主要产品年销售量·液化气": {"可研报告|××年#1": "-21.93", "实际值|××年#1": "-9.24", "增减(%)|××年#1": "-57.85"}, + "主要原料和公用工程消耗量·氢气": {"可研报告|××年#1": "0.02", "实际值|××年#1": "0.01", "增减(%)|××年#1": "-51.49"}, + "主要原料和公用工程消耗量·辅助材料": {"可研报告|××年#1": "3054", "实际值|××年#1": "796.66", "增减(%)|××年#1": "-73.91"}, + "主要原料和公用工程消耗量·电": 
# Row keys already carry a per-table prefix, so overrides are normally not
# needed; the dict is kept as a hook for future special cases.
TIME_ROW_TABLE_OVERRIDE: dict[str, str] = {}

# Row key -> owning time table.  The first table (in ALL_TIME_TABLE_SPECS
# order) that declares a row key keeps it; explicit overrides are applied last.
TIME_ROW_PRIMARY_TABLE: dict[str, str] = {}
for _tname, _rows in ALL_TIME_TABLE_SPECS:
    for _rk in _rows:
        TIME_ROW_PRIMARY_TABLE.setdefault(_rk, _tname)
TIME_ROW_PRIMARY_TABLE.update(TIME_ROW_TABLE_OVERRIDE)

# Every row key declared by any time table.
TIME_KEY_SET: set[str] = set().union(
    *(spec_rows for _, spec_rows in ALL_TIME_TABLE_SPECS)
)
def build_rule_factor_items() -> list[dict[str, str]]:
    """Build the trimmed item list used by rule-based extraction.

    One dict is produced per row of ``RULE_GLOBAL_TABLES`` (``table_type``
    ``"global"``), followed by one per entry of ``RULE_EXTRACT_EXTRA_TIME``
    (``table_type`` ``"time"``).  The full element template lives in
    ``ALL_GLOBAL_TABLES`` and ``ALL_TIME_TABLE_SPECS``.

    Returns:
        Dicts with the keys ``name``, ``source``, ``lvl3``, ``table_name``
        and ``table_type``, in that order.
    """
    global_items = [
        {
            "name": name,
            "source": source,
            "lvl3": lvl3,
            "table_name": table_name,
            "table_type": "global",
        }
        for table_name, _base, fields in RULE_GLOBAL_TABLES
        for name, source, lvl3 in fields
    ]
    time_items = [
        {
            "name": name,
            "source": source,
            "lvl3": lvl3,
            "table_name": table_name,
            "table_type": "time",
        }
        for table_name, name, source, lvl3 in RULE_EXTRACT_EXTRA_TIME
    ]
    return global_items + time_items


# Row key -> owning global table.  When several tables declare the same row
# key, the table that appears last in ALL_GLOBAL_TABLES wins.  Built with a
# comprehension so that no loop variables are bound in the module namespace
# (a plain module-level loop would export names such as ``fields`` and ``key``).
ROW_KEY_TO_GLOBAL_TABLE: dict[str, str] = {
    _row_key: _table_name
    for _table_name, _base, _fields in ALL_GLOBAL_TABLES
    for _row_key, _source, _lvl3 in _fields
}

# Every row key declared by any global table.
GLOBAL_KEY_SET: set[str] = set(ROW_KEY_TO_GLOBAL_TABLE)


def all_global_row_specs() -> list[tuple[str, str, str, str]]:
    """Flatten ``ALL_GLOBAL_TABLES`` into row tuples.

    Returns:
        One ``(table_name, name, source, lvl3)`` tuple per declared row, in
        table order and then in row order.
    """
    return [
        (table_name, name, source, lvl3)
        for table_name, _base, fields in ALL_GLOBAL_TABLES
        for name, source, lvl3 in fields
    ]
def source_doc_keywords_for(src: str) -> list[str]:
    """Return the document-name keywords configured for a source group.

    ``SOURCE_DOC_KEYWORDS_MAP`` is consulted for an exact key first.  If there
    is none, the map is walked in insertion order and the first key that is a
    prefix of ``src`` is used.

    Args:
        src: Source group name (typically a table name); may be ``None``.

    Returns:
        The stripped, non-empty keywords of the matching entry, or ``[]`` when
        ``src`` is blank or nothing matches.
    """

    def _cleaned(values) -> list[str]:
        # Stringify and strip each keyword, dropping the ones that end up empty.
        return [text for text in (str(value).strip() for value in values) if text]

    wanted = str(src or "").strip()
    if not wanted:
        return []

    direct = SOURCE_DOC_KEYWORDS_MAP.get(wanted)
    if direct is not None:
        return _cleaned(direct)

    for group, keywords in SOURCE_DOC_KEYWORDS_MAP.items():
        prefix = str(group or "").strip()
        if prefix and wanted.startswith(prefix):
            return _cleaned(keywords)
    return []