diff --git a/database/init.sql b/database/init.sql
new file mode 100644
index 0000000..2087a36
--- /dev/null
+++ b/database/init.sql
@@ -0,0 +1,464 @@
+-- Intelligent report generation platform - database initialization script
+-- Suggested database name: post_eval_report
+-- Targets MySQL
+
+-- Create the database (optional)
+-- CREATE DATABASE IF NOT EXISTS post_eval_report DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+-- USE post_eval_report;
+
+-- Projects (unified: knowledge base + authoring)
+-- uuid is generated by the application layer so that MySQL 8/9 restrictions on functions in generated columns cannot break initialization
+CREATE TABLE IF NOT EXISTS projects (
+    id INT AUTO_INCREMENT PRIMARY KEY,
+    uuid VARCHAR(32) NOT NULL UNIQUE,
+    name VARCHAR(255) NOT NULL,
+    description TEXT,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    doc_count INT DEFAULT 0,
+    eval_reports_count INT DEFAULT 0,
+    total_size VARCHAR(32) DEFAULT '0 B',
+    tags TEXT,
+    status VARCHAR(16) DEFAULT 'active',
+    color VARCHAR(16) DEFAULT '#3b82f6',
+    sync_suppressed_table_names LONGTEXT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_projects_created_at ON projects(created_at);
+CREATE INDEX idx_projects_updated_at ON projects(updated_at);
+CREATE INDEX idx_projects_status ON projects(status);
+
+-- Knowledge-base directories: project_id references projects.uuid; parent_id forms the directory tree
+CREATE TABLE IF NOT EXISTS kb_directories (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    parent_id VARCHAR(64) NULL,
+    name VARCHAR(255) NOT NULL,
+    full_path VARCHAR(1024) NOT NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (parent_id) REFERENCES kb_directories(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_kb_dirs_project ON kb_directories(project_id);
+CREATE INDEX idx_kb_dirs_parent ON kb_directories(parent_id);
+
+-- Knowledge-base documents (status: 0=failed 2=queued 3=processing 4=available)
+CREATE TABLE IF NOT EXISTS kb_documents (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    directory_id VARCHAR(64) NULL,
+    name VARCHAR(255) NOT NULL,
+    size VARCHAR(32) NOT NULL,
+    file_path VARCHAR(512),
+    storage_rel_path VARCHAR(512) NULL COMMENT '项目内完整相对路径(含文件名)',
+    word_count INT DEFAULT 0,
+    uploaded_at DATETIME NOT NULL,
+    status INT DEFAULT 2,
+    error_message TEXT NULL,
+    factor JSON NULL COMMENT '文档要素 JSON 数组',
+    category VARCHAR(32) NULL DEFAULT NULL COMMENT '文件分类',
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (directory_id) REFERENCES kb_directories(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_kb_docs_project ON kb_documents(project_id);
+CREATE INDEX idx_kb_docs_directory ON kb_documents(directory_id);
+
+-- If kb_documents already exists, run the following statement to add the word_count column:
+-- ALTER TABLE kb_documents ADD COLUMN word_count INT DEFAULT 0 AFTER file_path;
+
+-- Authored documents (project_id references projects.uuid, same as kb_documents)
+CREATE TABLE IF NOT EXISTS write_documents (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    title VARCHAR(255) NOT NULL,
+    content LONGTEXT,
+    word_count INT DEFAULT 0,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    status VARCHAR(16) DEFAULT 'draft',
+    sort_order INT DEFAULT 0,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_write_docs_project ON write_documents(project_id);
+
+-- Document versions
+CREATE TABLE IF NOT EXISTS doc_versions (
+    id VARCHAR(64) PRIMARY KEY,
+    document_id VARCHAR(64) NOT NULL,
+    version VARCHAR(32) NOT NULL,
+    content LONGTEXT NOT NULL,
+    citation_payload LONGTEXT NULL,
+    saved_at DATETIME NOT NULL,
+    author VARCHAR(64) NOT NULL,
+    note TEXT,
+    FOREIGN KEY (document_id) REFERENCES write_documents(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_versions_doc ON doc_versions(document_id);
+
+-- Element table definitions (global / time-based)
+CREATE TABLE IF NOT EXISTS element_tables (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    table_type VARCHAR(32) NOT NULL,
+    table_name VARCHAR(255) NOT NULL,
+    year INT NULL,
+    is_time_dimension TINYINT(1) DEFAULT 0,
+    sort_order INT DEFAULT 0,
+    sync_suppressed_row_keys LONGTEXT NULL,
+    custom_row_order LONGTEXT NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_element_tables_project ON element_tables(project_id);
+CREATE INDEX idx_element_tables_type_year ON element_tables(table_type, year);
+CREATE INDEX idx_element_tables_name ON element_tables(table_name);
+
+-- Element cells
+CREATE TABLE IF NOT EXISTS element_cells (
+    id VARCHAR(64) PRIMARY KEY,
+    table_id VARCHAR(64) NOT NULL,
+    project_id VARCHAR(32) NOT NULL,
+    row_key VARCHAR(255) NOT NULL,
+    col_key VARCHAR(255) NULL,
+    year INT NULL,
+    value LONGTEXT NULL,
+    source_document_id VARCHAR(64) NULL,
+    source_line_no INT NULL,
+    source_line_end INT NULL,
+    source_quote TEXT NULL,
+    confidence FLOAT NULL,
+    extraction_batch_id VARCHAR(64) NULL,
+    extraction_model VARCHAR(128) NULL,
+    source_type VARCHAR(16) NULL COMMENT 'extract=文档抽取, manual=手工输入',
+    conflict_status VARCHAR(16) DEFAULT 'none',
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE CASCADE,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_element_cells_project ON element_cells(project_id);
+CREATE INDEX idx_element_cells_row_col ON element_cells(row_key, col_key);
+CREATE INDEX idx_element_cells_year ON element_cells(year);
+
+-- Retained extraction results (table/element)
+CREATE TABLE IF NOT EXISTS extraction_results (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    document_id VARCHAR(64) NOT NULL,
+    batch_id VARCHAR(64) NOT NULL,
+    result_type VARCHAR(16) NOT NULL,
+    table_type VARCHAR(32) NULL,
+    table_name VARCHAR(255) NULL,
+    year INT NULL,
+    item_key VARCHAR(255) NOT NULL,
+    item_value LONGTEXT NULL,
+    source_line_no INT NULL,
+    source_line_end INT NULL,
+    confidence FLOAT NULL,
+    raw_payload JSON NULL,
+    extracted_at DATETIME NULL,
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_extraction_project_doc ON extraction_results(project_id, document_id);
+CREATE INDEX idx_extraction_batch ON extraction_results(batch_id);
+CREATE INDEX idx_extraction_table_name ON extraction_results(table_name);
+CREATE INDEX idx_extraction_key ON extraction_results(item_key);
+
+-- Element extraction result details (for "rule chapter/section prompt -> project material")
+CREATE TABLE IF NOT EXISTS element_extraction_results (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    table_type VARCHAR(32) NOT NULL,
+    year INT NULL,
+    table_name VARCHAR(255) NOT NULL,
+    extracted_at DATETIME NOT NULL,
+    item_key VARCHAR(255) NOT NULL,
+    item_value LONGTEXT NULL,
+    source_document_id VARCHAR(64) NULL,
+    source_line_no INT NULL,
+    source_line_end INT NULL,
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_el_ext_project ON element_extraction_results(project_id);
+CREATE INDEX idx_el_ext_table ON element_extraction_results(table_type, year, table_name);
+CREATE INDEX idx_el_ext_key ON element_extraction_results(item_key);
+CREATE INDEX idx_el_ext_source_doc ON element_extraction_results(source_document_id);
+
+-- Conflict records
+CREATE TABLE IF NOT EXISTS element_conflicts (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    table_id VARCHAR(64) NULL,
+    cell_id VARCHAR(64) NULL,
+    item_key VARCHAR(255) NOT NULL,
+    old_value LONGTEXT NULL,
+    new_value LONGTEXT NULL,
+    selected_value LONGTEXT NULL,
+    source_document_id VARCHAR(64) NULL,
+    source_line_no INT NULL,
+    status VARCHAR(16) DEFAULT 'pending',
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
+    FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
+    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_element_conflicts_project ON element_conflicts(project_id);
+CREATE INDEX idx_element_conflicts_status ON element_conflicts(status);
+
+-- Persisted document markdown
+CREATE TABLE IF NOT EXISTS document_markdowns (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    document_id VARCHAR(64) NOT NULL,
+    extracted_filename VARCHAR(255) NULL,
+    markdown_content LONGTEXT NOT NULL,
+    content_hash VARCHAR(64) NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_markdowns_project_doc ON document_markdowns(project_id, document_id);
+
+-- Document paragraph chunks
+CREATE TABLE IF NOT EXISTS document_chunks (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    document_id VARCHAR(64) NOT NULL,
+    markdown_id VARCHAR(64) NULL,
+    heading VARCHAR(512) NULL,
+    chunk_text LONGTEXT NOT NULL,
+    chunk_index INT DEFAULT 0,
+    source_line_start INT NULL,
+    source_line_end INT NULL,
+    vector_id VARCHAR(128) NULL,
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE,
+    FOREIGN KEY (markdown_id) REFERENCES document_markdowns(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_chunks_project_doc ON document_chunks(project_id, document_id);
+CREATE INDEX idx_chunks_heading ON document_chunks(heading(255));
+
+-- Standalone background tasks: pdf2md file processing and element-agent element extraction
+CREATE TABLE IF NOT EXISTS tasks (
+    id VARCHAR(64) PRIMARY KEY,
+    project VARCHAR(64) NOT NULL,
+    task_type INT NOT NULL,
+    file_id VARCHAR(64) NULL,
+    file_path VARCHAR(1024) NULL,
+    status INT NOT NULL DEFAULT 1,
+    payload_json JSON NULL,
+    result_path VARCHAR(1024) NULL,
+    error_message LONGTEXT NULL,
+    add_time DATETIME NOT NULL,
+    finish_time DATETIME NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_tasks_status_type_time ON tasks(status, task_type, add_time);
+CREATE INDEX idx_tasks_project ON tasks(project);
+CREATE INDEX idx_tasks_file_id ON tasks(file_id);
+
+-- Template management
+CREATE TABLE IF NOT EXISTS report_templates (
+    id VARCHAR(64) PRIMARY KEY,
+    name VARCHAR(255) NOT NULL,
+    description TEXT NULL,
+    is_default TINYINT(1) DEFAULT 0,
+    is_active TINYINT(1) DEFAULT 1,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_templates_default ON report_templates(is_default);
+
+CREATE TABLE IF NOT EXISTS report_template_sections (
+    id VARCHAR(64) PRIMARY KEY,
+    template_id VARCHAR(64) NOT NULL,
+    section_key VARCHAR(64) NOT NULL,
+    section_title VARCHAR(255) NOT NULL,
+    section_prompt LONGTEXT NULL,
+    section_output_contract LONGTEXT NULL,
+    section_order INT DEFAULT 0,
+    examples LONGTEXT NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_template_sections_template ON report_template_sections(template_id);
+
+-- Report generation jobs (7 chapters, generated asynchronously per chapter)
+CREATE TABLE IF NOT EXISTS report_generation_jobs (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    template_id VARCHAR(64) NULL,
+    status VARCHAR(16) DEFAULT 'pending',
+    progress INT DEFAULT 0,
+    current_section_key VARCHAR(64) NULL,
+    error_message TEXT NULL,
+    requested_by VARCHAR(64) NULL,
+    options JSON NULL,
+    snapshot JSON NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    completed_at DATETIME NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (template_id) REFERENCES report_templates(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_report_jobs_project ON report_generation_jobs(project_id);
+CREATE INDEX idx_report_jobs_status ON report_generation_jobs(status);
+
+CREATE TABLE IF NOT EXISTS report_generation_chapters (
+    id VARCHAR(64) PRIMARY KEY,
+    job_id VARCHAR(64) NOT NULL,
+    section_key VARCHAR(64) NOT NULL,
+    section_title VARCHAR(255) NOT NULL,
+    section_order INT DEFAULT 0,
+    status VARCHAR(16) DEFAULT 'pending',
+    content LONGTEXT NULL,
+    prompt_text LONGTEXT NULL,
+    evidence_payload JSON NULL,
+    validation_payload JSON NULL,
+    error_message TEXT NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    completed_at DATETIME NULL,
+    FOREIGN KEY (job_id) REFERENCES report_generation_jobs(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_report_chapters_job ON report_generation_chapters(job_id);
+CREATE INDEX idx_report_chapters_status ON report_generation_chapters(status);
+
+-- Minimal RBAC
+CREATE TABLE IF NOT EXISTS departments (
+    id VARCHAR(64) PRIMARY KEY,
+    name VARCHAR(255) NOT NULL,
+    description TEXT NULL,
+    parent_id VARCHAR(64) NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (parent_id) REFERENCES departments(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS users (
+    id VARCHAR(64) PRIMARY KEY,
+    username VARCHAR(64) NOT NULL UNIQUE,
+    password_hash VARCHAR(255) NULL,
+    department_id VARCHAR(64) NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL,
+    FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_users_department ON users(department_id);
+
+CREATE TABLE IF NOT EXISTS roles (
+    id VARCHAR(64) PRIMARY KEY,
+    name VARCHAR(64) NOT NULL UNIQUE,
+    description TEXT NULL,
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS permissions (
+    id VARCHAR(64) PRIMARY KEY,
+    perm_key VARCHAR(128) NOT NULL UNIQUE,
+    perm_type VARCHAR(32) NOT NULL,
+    description TEXT NULL,
+    created_at DATETIME NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_permissions_type ON permissions(perm_type);
+
+CREATE TABLE IF NOT EXISTS role_permissions (
+    id VARCHAR(64) PRIMARY KEY,
+    role_id VARCHAR(64) NOT NULL,
+    permission_id VARCHAR(64) NOT NULL,
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE,
+    FOREIGN KEY (permission_id) REFERENCES permissions(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS user_roles (
+    id VARCHAR(64) PRIMARY KEY,
+    user_id VARCHAR(64) NOT NULL,
+    role_id VARCHAR(64) NOT NULL,
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
+    FOREIGN KEY (role_id) REFERENCES roles(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS project_members (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    user_id VARCHAR(64) NOT NULL,
+    role VARCHAR(32) DEFAULT 'editor',
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_project_members_project ON project_members(project_id);
+
+CREATE TABLE IF NOT EXISTS project_departments (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    department_id VARCHAR(64) NOT NULL,
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (department_id) REFERENCES departments(id) ON DELETE CASCADE,
+    UNIQUE KEY uq_project_department (project_id, department_id)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_project_departments_project ON project_departments(project_id);
+
+-- Fill records: every element back-fill leaves a trace, enabling evidence traceability
+CREATE TABLE IF NOT EXISTS fill_records (
+    id VARCHAR(64) PRIMARY KEY,
+    project_id VARCHAR(32) NOT NULL,
+    cell_id VARCHAR(64) NULL,
+    table_id VARCHAR(64) NULL,
+    row_key VARCHAR(255) NOT NULL,
+    col_key VARCHAR(255) NULL,
+    year INT NULL,
+    filled_value LONGTEXT NULL,
+    previous_value LONGTEXT NULL,
+    source_document_id VARCHAR(64) NULL,
+    source_document_name VARCHAR(255) NULL COMMENT '冗余存储文档名,文档删除后仍可追溯',
+    source_line_no INT NULL,
+    source_line_end INT NULL,
+    source_quote TEXT NULL COMMENT '原文摘录片段,作为回填依据',
+    confidence FLOAT NULL,
+    extraction_batch_id VARCHAR(64) NULL,
+    extraction_model VARCHAR(128) NULL COMMENT '使用的 LLM 模型标识',
+    fill_type VARCHAR(16) NOT NULL DEFAULT 'auto' COMMENT 'auto=抽取回填, manual=人工编辑, resolve=冲突解决',
+    created_at DATETIME NOT NULL,
+    FOREIGN KEY (project_id) REFERENCES projects(uuid) ON DELETE CASCADE,
+    FOREIGN KEY (cell_id) REFERENCES element_cells(id) ON DELETE SET NULL,
+    FOREIGN KEY (table_id) REFERENCES element_tables(id) ON DELETE SET NULL,
+    FOREIGN KEY (source_document_id) REFERENCES kb_documents(id) ON DELETE SET NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_fill_records_project ON fill_records(project_id);
+CREATE INDEX idx_fill_records_cell ON fill_records(cell_id);
+CREATE INDEX idx_fill_records_batch ON fill_records(extraction_batch_id);
+CREATE INDEX idx_fill_records_source_doc ON fill_records(source_document_id);
+CREATE INDEX idx_fill_records_created ON fill_records(created_at);
+
+-- ============================================================
+-- report_section_references: per-section reference sample texts
+-- ============================================================
+CREATE TABLE IF NOT EXISTS report_section_references (
+    id VARCHAR(64) PRIMARY KEY,
+    template_id VARCHAR(64) NULL COMMENT '关联模板ID(report_templates.id),按模板过滤参考范文',
+    source_file VARCHAR(255) NOT NULL COMMENT '来源文件名',
+    section_key VARCHAR(64) NOT NULL COMMENT '章节标识,如 1.1、2.1.1',
+    section_title VARCHAR(255) NOT NULL COMMENT '章节标题',
+    section_order INT DEFAULT 0 COMMENT '章节序号',
+    content TEXT NOT NULL COMMENT '该章节的参考范文 Markdown',
+    created_at DATETIME NOT NULL,
+    updated_at DATETIME NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+CREATE INDEX idx_ref_source_file ON report_section_references(source_file);
+CREATE INDEX idx_ref_section_key ON report_section_references(section_key);
+CREATE INDEX idx_ref_template_id ON report_section_references(template_id);
diff --git a/database/migrations/add_ref_template_id.sql b/database/migrations/add_ref_template_id.sql
new file mode 100644
index 0000000..0cf11df
--- /dev/null
+++ b/database/migrations/add_ref_template_id.sql
@@ -0,0 +1,3 @@
+-- Add template_id to report_section_references so reference texts can be filtered by template
+ALTER TABLE report_section_references ADD COLUMN template_id VARCHAR(64) NULL COMMENT '关联模板ID(report_templates.id),按模板过滤参考范文';
+CREATE INDEX idx_ref_template_id ON report_section_references(template_id);
diff --git a/function/__init__.py b/function/__init__.py
new file mode 100644
index 0000000..4ffb595
--- /dev/null
+++ b/function/__init__.py
@@ -0,0 +1 @@
+# function package
diff --git a/function/vector_store.py b/function/vector_store.py
new file mode 100644
index 0000000..0413d13
--- /dev/null
+++ b/function/vector_store.py
@@ -0,0 +1,573 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+function/vector_store.py
+Vector store module - integrated with the kb_service project.
+Changed: drop_old is always False, so existing collections are never dropped.
+Fixed: HTTP 413 "too many tokens" errors (semantics-friendly splitting).
+"""
+
+import re
+import json
+import logging
+from typing import Dict, List, Optional, Tuple
+from pathlib import Path
+
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_milvus import Milvus, BM25BuiltInFunction
+from pymilvus import MilvusClient, connections
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# Configuration
+# ============================================================================
+COLLECTION_NAME = "eval_report"
+EMBEDDING_API_BASE = settings.EMBEDDING_API_BASE
+EMBEDDING_API_KEY = settings.EMBEDDING_API_KEY
+MILVUS_DB_URL = settings.MILVUS_DB_URL
+
+CONSISTENCY_LEVEL = "Bounded"
+AUTO_ID = True
+METRIC_TYPE = "COSINE"
+INDEX_TYPE = "AUTOINDEX"
+SPARSE_METRIC_TYPE = "BM25"
+SPARSE_INDEX_TYPE = "SPARSE_INVERTED_INDEX"
+
+
+def _embedding_batch_limits() -> tuple[int, int, int]:
+    """Return (max docs per batch, max chars per batch, max chars per chunk) read from settings with floors applied."""
+    max_docs = max(1, int(getattr(settings, "EMBEDDING_BATCH_MAX_DOCS", 4) or 4))
+    max_chars = max(512, int(getattr(settings, "EMBEDDING_BATCH_MAX_CHARS", 12000) or 12000))
+    max_chunk = max(512, int(getattr(settings, "EMBEDDING_MAX_CHUNK_CHARS", 4000) or 4000))
+    return max_docs, max_chars, max_chunk
+
+
+def _is_embedding_backend_oom(exc: BaseException) -> bool:
+    """Return True when the exception text matches one of the known out-of-memory / code-424 markers."""
+    msg = str(exc).lower()
+    return (
+        "out of memory" in msg
+        or "npu out of memory" in msg
+        or "cuda out of memory" in msg
+        or "error code: 424" in msg
+        or "'code': 424" in msg
+    )
+
+
+def _add_documents_batch_with_retry(vs: Milvus, batch: List[Document]) -> List[str]:
+    """Write one batch of documents; on a remote embedding OOM, split the batch in half and retry."""
+    if not batch:
+        return []
+    try:
+        return list(vs.add_documents(batch))
+    except Exception as e:
+        if not _is_embedding_backend_oom(e) or len(batch) <= 1:
+            raise
+        mid = max(1, len(batch) // 2)
+        logger.warning(
+            "embedding 批次 OOM,拆分为 %s + %s 重试",
+            mid,
+            len(batch) - mid,
+        )
+        ids: List[str] = []
+        ids.extend(_add_documents_batch_with_retry(vs, batch[:mid]))
+        ids.extend(_add_documents_batch_with_retry(vs, batch[mid:]))
+        return ids
+
+
+def _register_milvus_client_for_orm(client: MilvusClient) -> None:
+    """pymilvus 2.6+ MilvusClient uses ConnectionManager; ORM Collection still resolves
+    pymilvus.orm.connections by client._using. langchain-milvus touches Collection during
+    Milvus.__init__, so register before constructing Milvus (bootstrap client)."""
+    alias = client._using
+    if connections.has_connection(alias):
+        return
+    cfg = client._config
+    connections._alias_handlers[alias] = client._handler
+    connections._alias_config[alias] = {
+        "address": cfg.address,
+        "user": "",
+        "db_name": cfg.db_name or "default",
+    }
+
+
+# ============================================================================
+# VectorStore class (drop_old is forced to False everywhere)
+# ============================================================================
+
+class VectorStore:
+    """Wrapper around a hybrid (dense + BM25 sparse) Milvus collection accessed through langchain-milvus."""
+    def __init__(
+        self,
+        collection_name: str = COLLECTION_NAME,
+        drop_old: bool = False,
+        chunk_size: int = 500,
+        chunk_overlap: int = 50
+    ):
+        self.collection_name = collection_name
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self._drop_old = drop_old
+        self._milvus = None
+
+    def _get_embeddings(self):
+        """Build the OpenAI-compatible embeddings client for the bge-m3 model."""
+        return OpenAIEmbeddings(
+            base_url=EMBEDDING_API_BASE,
+            api_key=EMBEDDING_API_KEY,
+            model="bge-m3",
+            check_embedding_ctx_length=False,
+        )
+
+    def _get_milvus(self, drop_old: bool = False) -> Milvus:
+        """Return the cached Milvus store, creating it on first use; the collection is never dropped (drop_old=False is hard-coded)."""
+        logger.info("【VectorStore】初始化 Milvus 混合向量存储(dense + sparse)")
+
+        if self._milvus is not None and not drop_old:
+            logger.info("【VectorStore】复用已有 Milvus 实例")
+            return self._milvus
+
+        if not MILVUS_DB_URL:
+            raise ValueError("MILVUS_DB_URL 未配置,请在 .env 中设置")
+
+        embeddings = self._get_embeddings()
+        logger.info("【VectorStore】Embedding 模型 bge-m3 初始化完成")
+
+        try:
+            # Shares the ConnectionManager with langchain's internal MilvusClient; register the ORM alias first, otherwise touching Collection inside __init__ raises
+            _register_milvus_client_for_orm(MilvusClient(uri=MILVUS_DB_URL))
+            self._milvus = Milvus(
+                embedding_function=embeddings,
+                builtin_function=BM25BuiltInFunction(),
+                vector_field=["dense", "sparse"],
+                connection_args={"uri": MILVUS_DB_URL},
+                collection_name=self.collection_name,
+                consistency_level=CONSISTENCY_LEVEL,
+                auto_id=AUTO_ID,
+                drop_old=False,
+                index_params=[
+                    {"metric_type": METRIC_TYPE, "index_type": INDEX_TYPE},
+                    {"metric_type": SPARSE_METRIC_TYPE, "index_type": SPARSE_INDEX_TYPE},
+                ],
+            )
+            _register_milvus_client_for_orm(self._milvus.client)
+            logger.info("✅ Milvus 混合向量存储初始化成功")
+        except Exception as e:
+            logger.error(f"❌ Milvus 初始化失败: {str(e)}", exc_info=True)
+            raise
+
+        return self._milvus
+
+    # ========================================================================
+    # Fixed add_documents: semantics-friendly, keeps paragraphs intact, avoids HTTP 413
+    # ========================================================================
+    def add_documents(self, documents: List[Document]) -> List[str]:
+        """Split over-long documents, fill required metadata, write in size-bounded batches; return IDs of the batches that succeeded."""
+        if not documents:
+            logger.info("【add_documents】无文档可写入")
+            return []
+
+        max_docs_per_batch, max_chars_per_batch, max_chunk_chars = _embedding_batch_limits()
+
+        # ---------------------- Semantically safe splitting (fix the problem, keep the structure) ----------------------
+        # Only genuinely over-long passages are split, at sentence/paragraph boundaries, never arbitrarily
+        safe_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=max_chunk_chars,
+            chunk_overlap=min(200, max(0, max_chunk_chars // 20)),
+            separators=["\n\n", "\n", "。", "!", "?", ";", ":", ","]
+        )
+
+        safe_documents = []
+        for doc in documents:
+            # Split only when the limit is exceeded
+            if len(doc.page_content) > max_chunk_chars:
+                chunks = safe_splitter.split_text(doc.page_content)
+                for chunk in chunks:
+                    if chunk.strip():
+                        safe_documents.append(Document(
+                            page_content=chunk,
+                            metadata=doc.metadata.copy()
+                        ))
+            else:
+                safe_documents.append(doc)
+        # --------------------------------------------------------------------------------
+
+        # The existing Milvus collection requires some metadata fields; legacy callers may not pass them all, so fill in fallbacks here.
+        for idx, doc in enumerate(safe_documents):
+            metadata = doc.metadata or {}
+            if not metadata.get("doc_id"):
+                project_uuid = metadata.get("project_uuid") or "unknown_project"
+                heading = metadata.get("heading") or "chunk"
+                metadata["doc_id"] = f"{project_uuid}:{heading}:{idx}"
+            if "original_title" not in metadata:
+                metadata["original_title"] = metadata.get("heading") or ""
+            if "path" not in metadata:
+                metadata["path"] = ""
+            if "project_uuid" not in metadata:
+                metadata["project_uuid"] = "unknown_project"
+            doc.metadata = metadata
+
+        logger.info(f"【add_documents】预处理后准备写入 {len(safe_documents)} 条文档")
+        vs = self._get_milvus(drop_old=self._drop_old)
+        self._drop_old = False
+
+        ids = []
+        current_batch: List[Document] = []
+        current_batch_chars = 0
+        batch_num = 1
+
+        def _flush_batch() -> None:
+            nonlocal current_batch, current_batch_chars, batch_num
+            if not current_batch:
+                return
+            logger.info(
+                "【add_documents】写入批次 %s,数量:%s,约 %s 字符",
+                batch_num,
+                len(current_batch),
+                current_batch_chars,
+            )
+            try:
+                res = _add_documents_batch_with_retry(vs, current_batch)
+                ids.extend(res)
+                logger.info("✅ 批次写入成功,返回 ID 数:%s", len(res))
+            except Exception as e:  # best effort: a failed batch is logged and the remaining batches still run
+                logger.error("❌ 批次写入失败: %s", e, exc_info=True)
+            batch_num += 1
+            current_batch = []
+            current_batch_chars = 0
+
+        for doc in safe_documents:
+            doc_chars = len(doc.page_content or "")
+            would_exceed_docs = bool(current_batch) and len(current_batch) >= max_docs_per_batch
+            would_exceed_chars = bool(current_batch) and (
+                current_batch_chars + doc_chars > max_chars_per_batch
+            )
+            if would_exceed_docs or would_exceed_chars:
+                _flush_batch()
+            current_batch.append(doc)
+            current_batch_chars += doc_chars
+
+        _flush_batch()
+
+        logger.info(f"【add_documents】全部完成,总写入 ID 数:{len(ids)}")
+        return ids
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 10, filter: Optional[str] = None
+    ) -> List[Tuple[Document, float]]:
+        """Run the langchain-milvus similarity search on the query (truncated to 5000 chars), forwarding ``filter`` when given."""
+        vs = self._get_milvus(drop_old=False)
+        query = query[:5000]
+        if filter:
+            return vs.similarity_search_with_score(query, k=k, filter=filter)
+        return vs.similarity_search_with_score(query, k=k)
+
+    def similarity_search_dense_filtered(
+        self,
+        query: str,
+        k: int,
+        filter_expr: str,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Search with dense-vector ANN plus Milvus scalar filtering.
+        On hybrid (dense+sparse) collections the langchain_milvus filter may not take effect; extraction-side recall uses this path to guarantee doc_id isolation.
+        """
+        from pymilvus import MilvusClient
+
+        q = (query or "")[:5000]
+        if not q.strip():
+            return []
+        emb = self._get_embeddings().embed_query(q)
+        client = MilvusClient(uri=MILVUS_DB_URL)
+        try:
+            raw = client.search(
+                collection_name=self.collection_name,
+                data=[emb],
+                anns_field="dense",
+                limit=max(1, int(k)),
+                filter=filter_expr,
+                output_fields=[
+                    "text",
+                    "heading",
+                    "heading_level",
+                    "doc_id",
+                    "project_uuid",
+                    "original_title",
+                    "path",
+                ],
+            )
+        finally:
+            client.close()
+        hits = raw[0] if raw else []
+        out: List[Tuple[Document, float]] = []
+        for hit in hits:
+            ent = hit.get("entity") or {}
+            doc = Document(
+                page_content=str(ent.get("text") or ""),
+                metadata={
+                    "heading": ent.get("heading"),
+                    "heading_level": ent.get("heading_level"),
+                    "doc_id": ent.get("doc_id"),
+                    "project_uuid": ent.get("project_uuid"),
+                    "original_title": ent.get("original_title"),
+                    "path": ent.get("path"),
+                },
+            )
+            dist = hit.get("distance")
+            try:
+                score = float(dist) if dist is not None else 0.0
+            except (TypeError, ValueError):
+                score = 0.0
+            out.append((doc, score))
+        return out
+
+    def delete_by_filter(self, filter_expr: str) -> int:
+        """Delete entities matching ``filter_expr``; return the match count queried before the delete (0 on any failure)."""
+        try:
+            from pymilvus import MilvusClient
+            client = MilvusClient(uri=MILVUS_DB_URL)
+            try:
+                if not client.has_collection(self.collection_name):
+                    return 0
+                # Some collections do not name their primary key "id" (e.g. langchain-milvus may use a custom PK/auto_id).
+                # Look up the primary-key field in the collection description, then use it for the counting query.
+                pk_field = None
+                describe = client.describe_collection(self.collection_name)
+                for f in describe.get("fields", []) or []:
+                    # Tolerate different response shapes: is_primary / isPrimary / primary
+                    if f.get("is_primary") or f.get("isPrimary") or f.get("primary"):
+                        pk_field = f.get("name")
+                        break
+
+                count = 0
+                try:
+                    if pk_field:
+                        res = client.query(
+                            self.collection_name,
+                            filter=filter_expr,
+                            output_fields=[pk_field],
+                        )
+                        count = len(res)
+                    else:
+                        # A missing primary-key name must not block the delete
+                        count = 0
+                except Exception:
+                    # A failed count alone must not prevent the delete
+                    count = 0
+
+                client.delete(self.collection_name, filter=filter_expr)
+                return count
+            finally:
+                client.close()  # always release the connection, including the early return and error paths
+        except Exception as e:
+            logger.error(f"删除失败: {e}")
+            return 0
+
+
+# ============================================================================
+# Markdown splitting
+# ============================================================================
+
+def split_markdown(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
+    """Split text into chunks with a recursive character splitter; empty input yields an empty list."""
+    if not text: return []
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap,
+        separators=["\n\n", "。", "?", "!", "\n", ";", ":", ","]
+    )
+    return splitter.split_text(text)
+
+def split_markdown_by_headings(content: str, chunk_size=300, chunk_overlap=40) -> List[Document]:
+    """Produce one Document per '#'-heading section; fall back to split_markdown when no section has body text."""
+    if not content: return []
+    docs = []
+    lines = content.split("\n")
+    current_heading = ""
+    current_level = 0
+    current_lines = []
+
+    def flush():
+        nonlocal current_lines, current_heading, current_level
+        txt = "\n".join(current_lines).strip()
+        if txt:
+            docs.append(Document(
+                page_content=txt,
+                metadata={"heading": current_heading, "heading_level": current_level}
+            ))
+        current_lines = []
+
+    for line in lines:
+        line = line.rstrip()
+        m = re.match(r"^(#{1,6})\s+(.+)$", line)
+        if m:
+            flush()
+            current_level = len(m.group(1))
+            current_heading = m.group(2).strip()
+        else:
+            current_lines.append(line)
+    flush()
+
+    if not docs:
+        chunks = split_markdown(content, chunk_size, chunk_overlap)
+        for i, c in enumerate(chunks):
+            docs.append(
+                Document(
+                    page_content=c,
+                    metadata={"chunk_index": i, "heading": "", "heading_level": 0},
+                )
+            )
+    return docs
+
+def process_document_to_vector_store(
+    doc_id: str, title: str, content: str, path: str, project_uuid: str, collection_name=COLLECTION_NAME
+) -> bool:
+    """Split a markdown document by headings, tag each chunk with its identifiers and write it; return False if an exception escapes."""
+    try:
+        vs = VectorStore(collection_name=collection_name, drop_old=False)
+        docs = split_markdown_by_headings(content)
+        for d in docs:
+            d.metadata["doc_id"] = doc_id
+            d.metadata["original_title"] = title
+            d.metadata["path"] = path
+            d.metadata["project_uuid"] = project_uuid
+        vs.add_documents(docs)
+        return True
+    except Exception as e:
+        logger.error(f"处理文档失败: {e}")
+        return False
+
+# ============================================================================
+# Data preprocessing
+# ============================================================================
+
+INPUT_FILE = "data/articles.jsonl"
+OUTPUT_CHUNK_FILE = "data/processed/eval_chunks.jsonl"
+
+def load_jsonl(filename: str, encoding="utf-8"):
+    """Yield one parsed JSON object per non-blank line of the file."""
+    with open(filename, encoding=encoding) as f:
+        for line in f:
+            if line.strip():
+                yield json.loads(line)
+
+def write_jsonl(data, filename, append=False, ensure_ascii=False):
+    """Write each item of ``data`` as one JSON line, appending or overwriting depending on ``append``."""
+    mode = "a" if append else "w"
+    with open(filename, mode, encoding="utf-8") as f:
+        for item in data:
+            f.write(json.dumps(item, ensure_ascii=ensure_ascii) + "\n")
+
+def clean_text(text: str) -> str:
+    """Strip control/zero-width characters and HTML tags, collapse whitespace, then drop every character outside the whitelist class."""
+    if not isinstance(text, str): return ""
+    text = re.sub(r"[\x00-\x09\x0B-\x1F\x7F]", "", text)
+    text = re.sub(r"[\u200b-\u200f\u2028\u2029]", "", text)
+    text = re.sub(r"[:’“”•…–—]", "", text)
+    text = re.sub(r"<[^>]+>", "\n", text)
+    text = re.sub(r"\n+", "\n", text)
+    text = re.sub(r" +", " ", text)
+    text = re.sub(r"^[。,?!;:]", "", text)
+    text = re.sub(r'[^\u4e00-\u9fff_a-zA-Z0-9\s,。!?;:、()《》【】"' r"'·!@#$%^&*()_+=\[\]{}|;:,./<>?-]", "", text)  # brackets escaped: a bare "[]" ended the class early
+    return text.strip()
+
+def concat_metadata_to_content(title: str, content: str, metadata: dict):
+    """Prefix the content with a 'label:value | ...' header line, skipping labels whose value is empty or missing."""
+    parts = [
+        f"标题:{title}",
+        f"发布时间:{metadata.get('publish_time') or ''}",
+        f"作者:{metadata.get('author') or ''}",
+        f"来源:{metadata.get('source') or ''}",
+    ]
+    parts = [p for p in parts if p.split(":", 1)[-1]]  # split once so colons inside the value do not affect the emptiness test
+    return " | ".join(parts) + "\n---\n" + content.strip()
+
+def process_all_documents(input_file, output_file, chunk_size=500, overlap=50):
+    """Clean and chunk every document of the input JSONL, write the chunks to output_file, and return document/chunk counts."""
+    docs = load_jsonl(input_file)
+    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap,
+                                              separators=["\n\n", "。", "?", "!", "\n", ";", ":", ","])
+    all_chunks = []
+    num_docs = 0
+    for doc in docs:
+        num_docs +=1
+        content = clean_text(doc["content"])
+        chunks = splitter.split_text(content)
+        for i, chunk in enumerate(chunks):
+            clean_c = clean_text(chunk)
+            if len(clean_c) <10: continue
+            all_chunks.append({
+                "id": f"{doc['id']}_chunk_{i}",
+                "doc_id": doc["id"],
+                "title": doc["title"],
+                "content": concat_metadata_to_content(doc["title"], clean_c, doc.get("metadata",{})),
+                "chunk_index": i,
+                "url": doc.get("metadata",{}).get("url","")
+            })
+    write_jsonl(all_chunks, output_file)
+    return {"num_docs":num_docs, "num_chunks":len(all_chunks)}
+
+def load_chunk_jsonl(path):
+    """Load every non-blank line of a JSONL file into a list of parsed objects."""
+    res = []
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                res.append(json.loads(line))
+    return res
+
+def build_index(data, vs: VectorStore):
+    """Turn chunk rows into Documents (rows with content shorter than 10 chars are skipped) and write them; input rows are not mutated."""
+    docs: List[Document] = []
+    for row in data:
+        c = row.get("content", "").strip()
+        if len(c) < 10:
+            continue
+        docs.append(Document(page_content=c, metadata={k: v for k, v in row.items() if k != "content"}))
+    if docs:
+        vs.add_documents(docs)
+
+def get_vector_store(drop_old=False):
+    """Return the underlying langchain Milvus store for the default collection."""
+    vs = VectorStore(collection_name=COLLECTION_NAME, drop_old=drop_old)
+    return vs._get_milvus(drop_old=drop_old)
+
+def search_eval(query, top_k=10):
+    """Run a similarity search on the default collection, print the elapsed time, and return the (Document, score) results."""
+    from time import time
+    vs = VectorStore(drop_old=False)
+    st = time()
+    results = vs.similarity_search_with_score(query, k=top_k)
+    print(f"检索耗时: {time()-st:.2f}s")
+    return results
+
+# ============================================================================
+# Script entry point
+# ============================================================================
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)  # without a configured handler the INFO progress logs below are dropped
+    logger.info("="*60)
+    logger.info("【Milvus 混合向量索引构建启动】dense + sparse(BM25)")
+    logger.info("="*60)
+
+    process_all_documents(INPUT_FILE, OUTPUT_CHUNK_FILE)
+    logger.info("✅ 文本分块处理完成")
+
+    chunk_data = load_chunk_jsonl(OUTPUT_CHUNK_FILE)
+    logger.info(f"✅ 加载分块数据:{len(chunk_data)} 条")
+
+    vs = VectorStore(drop_old=False)
+    build_index(chunk_data, vs)
+    logger.info("✅ 索引构建完成(增量写入)")
+
+    res = search_eval("测试检索内容")
+    logger.info(f"✅ 检索完成,命中数量:{len(res)}")
+    for doc, score in res:
+        logger.info(f"score={score:.4f} | content={doc.page_content[:80]}...")
+
+    logger.info("="*60)
+    logger.info("【全部执行完成】")
diff --git a/prompts/__init__.py b/prompts/__init__.py
index e69de29..407d6d0 100644
--- a/prompts/__init__.py
+++ b/prompts/__init__.py
@@ -0,0 +1 @@
+# prompts package
diff --git a/prompts/report_generation/__init__.py b/prompts/report_generation/__init__.py
index e69de29..4272161 100644
--- a/prompts/report_generation/__init__.py
+++ b/prompts/report_generation/__init__.py
@@ -0,0 +1 @@
+# report_generation prompts package
diff --git a/prompts/report_generation/appendix_templates.py b/prompts/report_generation/appendix_templates.py
new file mode 100644
index 0000000..c9c6e21
--- /dev/null
+++ b/prompts/report_generation/appendix_templates.py
@@ -0,0 +1,52 @@
+"""Fixed markdown templates used by report generation."""
+
+
+def markdown_hashes_for_section_no(section_no: str) -> str:
+    """Kept in line with the frontend's markdownHashesForSectionNo / _heading_level_and_class."""
+    parts = str(section_no or "").strip().split(".")
+    if len(parts) == 1:
+        return "##"
+    if len(parts) == 2:
+        return "###"
+    return "####"
+
+
+def 
missing_child_heading_markdown(heading_no: str) -> str: + hashes = markdown_hashes_for_section_no(heading_no) + return f"\n\n{hashes} {heading_no} 待补充\n\n待补充" + + +# 兼容旧引用;新代码请用 missing_child_heading_markdown(heading_no) +MISSING_CHILD_HEADING_TEMPLATE = "\n\n### {heading_no} 待补充\n\n待补充" + +MINIMAL_MISSING_TABLE_TEMPLATE = ( + "\n\n### {table_name}\n\n" + "| 项目 | 内容 |\n" + "| --- | --- |\n" + "| 关键数据 | 待补充 |\n" +) + +APPENDIX8_PARAMETER_COMPARISON_TABLE = ( + "| 序号 | 项目名称 | 单位 | 可研报告 | 后评价报告 | 备注 |\n" + "| --- | --- | --- | --- | --- | --- |\n" + "| 一 | 成本参数 | | | | |\n" + "| 1 | 原料价格 | | | | |\n" + "| 1.1 | 氢气 | 元/吨 | 待补充 | 待补充 | |\n" + "| 2 | 催化剂和化学药剂 | 万元 | 待补充 | 待补充 | |\n" + "| 3 | 燃料动力价格 | | | | |\n" + "| 3.1 | 除盐水价格 | 元/吨 | 待补充 | 待补充 | |\n" + "| …… | …… | | | | |\n" + "| 二 | 营业收入参数 | | | | |\n" + "| 2.1 | 98#汽油 | 元/吨 | 待补充 | 待补充 | |\n" + "| …… | …… | | | | |\n" + "| 三 | 税收参数 | | | | |\n" + "| | 增值税税率 | | | | |\n" + "| | 汽油各品种产品 | % | 待补充 | 待补充 | |\n" + "| …… | …… | | | | |\n" + "| 四 | 基准收益率 | % | 待补充 | 待补充 | |" +) + +APPENDIX_FIGURE_TARGETS: list[tuple[str, str]] = [ + ("附图1", "全厂物料平衡图"), + ("附图2", "烷基化装置物料平衡图"), +] diff --git a/prompts/report_generation/chapter_generation_system.md b/prompts/report_generation/chapter_generation_system.md new file mode 100644 index 0000000..ffb8471 --- /dev/null +++ b/prompts/report_generation/chapter_generation_system.md @@ -0,0 +1 @@ +你是后评价报告撰写助手。严格基于证据输出,禁止编造。示例仅可用于写作风格参考,禁止复用示例中的任何事实数据与结论。禁止输出与当前小节无关的表号/表题清单及跨节“详见表/参见表”引用。必须返回 JSON 对象,字段为 content/missingInfo/qualityChecks。 diff --git a/prompts/report_generation/chapter_generation_user.md b/prompts/report_generation/chapter_generation_user.md new file mode 100644 index 0000000..4f12d7a --- /dev/null +++ b/prompts/report_generation/chapter_generation_user.md @@ -0,0 +1,67 @@ +你正在编写后评价报告章节:{{section_title}} + +【章节细则描述】 +{{section_prompt}} + +【章节模板】 +{{section_title}} + +【模板必需表格】 +{{required_tables_text}} + +【结构化表格证据(必须优先采用)】 +{{structured_tables_text}} + +【字段级已抽取结果(强约束)】 
+{{canonical_fields_text}} + +【章节示例】 +{{selected_example}} + +【参考范文】 +{{section_reference_block}} + +【示例使用约束】 +1. 以《模版.doc》同章节结构为第一优先:段落顺序、表格标题、表头字段尽量保持一致; +2. 参考范文仅用于格式与结构参考,严禁复用示例中的项目名称、年份、金额、比例、指标值与结论; +3. 所有数值必须来自证据包;如需表格,表头可沿用模板,表内数据必须替换为当前项目证据; +4. 若模板字段无证据,按字段粒度写"待补充",不得整段空泛描述。 + +【输出硬约束】 +1. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题; +2. 若【结构化表格证据】中存在对应必需表,必须原样使用该 Markdown 表格,不得自行生成或改写表头/数值; +3. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充"; +4. 若【字段级已抽取结果】中某字段为非"待补充"值,正文该字段不得写"待补充",必须使用该抽取值; +5. content 字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目; +6. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表。 +{{heading_rule}}7. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题。 + +【表格严格管控——必须遵守】 +1. **禁止凭空生成表格**:只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出 Markdown 表格; +2. **无"表格强制要求"的章节一律禁止输出任何 Markdown 表格**(即不得输出含 | 分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入; +3. **"见附表N"仅为引用语**:若合同要求写"项目建设工作程序见附表1。"等引用句,只需输出该引用句文本,附表本体在报告末尾统一输出,严禁在本节正文中展开附表的完整 Markdown 表格; +4. 表格数据必须严格来自要素管理(element_tables/element_cells),不得自行编造表格内容; +5. 每个 Markdown 表格前须有独立一行表题(形如「表1 …」「表2-3 …」「附表8 …」等);表题紧挨表格上方单独成段,表题与表格之间最多空一行或一行注释;前端会将表题居中排版。 +6. **表号与表名间距**:表题中表号(如「表2-4」「附表8」)与表名之间须空两个全角空格(U+3000),例如「表2-4  原料数量及组成对比表」。 +7. **表头栏单位**:凡含计量单位的列名,名称写第一行、单位加括号写在第二行,且在同一表头单元格内(Markdown 可用 `
<br>`,如 `新鲜水<br>
(m³/h)`);表题与表头均勿使用 `**` 加粗;勿将单位单独占一列,勿把「名称(单位)」横挤在同一行。 +8. **公共单位写表题**:若整张表各数据列所用单位相同,单位应加括号写在表题行末尾(如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复该单位;若各列单位不一致,则仍按列在表头内分行写单位。 +9. **表格序号列**:用阿拉伯数字,层次与正文一致(如 1、1.1、1.2、2、2.1);行键或表体第一列已带层次编号时可与之对齐;否则自上而下用 1、2、3…;「合计」「总计」行可用「—」。 +10. **表体与数字**:表内文字、数字宜水平与垂直居中;若单元格内需换行或分段(含 `<br>
`),宜左齐排列以便阅读。同一表内、同列的小数、百分比等宜保留相同的小数位数。 + +【检索顺序约束】 +1. 优先使用要素抽取结果; +2. 要素不足时补充文档段落; +3. 最后使用关键词检索到的补充材料; +4. 无证据时写"待补充",禁止编造。 + +{{prior_sibling_sections_block}} + +{{prior_chapters_block}} + +【章节输出结构约束】 +{{section_contract}} + +【证据包(JSON)】 +{{evidence_json}} + +请仅返回 JSON:{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]} diff --git a/prompts/report_generation/chapter_generation_user_ref_aligned.md b/prompts/report_generation/chapter_generation_user_ref_aligned.md new file mode 100644 index 0000000..cbd74e0 --- /dev/null +++ b/prompts/report_generation/chapter_generation_user_ref_aligned.md @@ -0,0 +1,88 @@ +你正在编写后评价报告章节:{{section_title}} + +本次任务:以【章节细则描述】和【参考范文】共同作为本节的写作模板,以【事实证据】作为唯一数据来源。核心原则是:**细则与范文决定写什么、怎么写;证据只负责提供可填入模板的真实数据**。生成时必须先搭模板,再填证据,严禁脱离模板自由发挥,严禁复用范文数据或自行改写证据数据。 + +========================= 第一部分 · 写作模板(最高优先级:决定内容范围、结构和文风)========================= + +【标题编号规则】 +{{heading_rule}} + +【章节细则描述】 +{{section_prompt}} + +【参考范文(内容范围、论述维度、段落结构和行文风格的主要模板)】 +{{section_reference_block}} + +========================= 第二部分 · 事实证据(唯一数据来源,仅用于支撑和填充模板)========================= + +【模板必需表格】 +{{required_tables_text}} + +【结构化表格证据(必须优先采用)】 +{{structured_tables_text}} + +【字段级已抽取结果(强约束)】 +{{canonical_fields_text}} + +【证据包(JSON)】 +{{evidence_json}} + +========================= 第三部分 · 上文已生成内容(只用于一致性校验,不改变本节模板)========================= + +{{prior_sibling_sections_block}} + +{{prior_chapters_block}} + +========================= 第四部分 · 写作与输出要求(务必逐条遵守)========================= + +【生成步骤】 +1. 先读取【章节细则描述】和【参考范文】,抽取本节应覆盖的内容主题、论述维度、段落顺序、子标题层级、表格/列举形式和结论方式; +2. 再读取【章节输出结构约束】,确认本节是否允许/必须输出表格、附表引用或特定结构; +3. 然后只从【事实证据】中选择可支撑上述模板的数据,把证据数据填入对应位置; +4. 最后输出正文。若模板要求的某项内容在证据中没有对应数据,写"待补充",不得跳过、猜测、编造或用范文数据顶替。 + +【模板遵循要求——细则与范文共同决定“写什么”和“怎么写”】 +1. "写什么"由【章节细则描述】与【参考范文】共同决定:细则列出的要点、子项及顺序为必写项;参考范文实际写到的内容主题、论述维度和信息点(如背景、依据、目标、措施、问题、结论等)也应覆盖。二者取并集,不得遗漏,也不得另起炉灶写无关内容; +2. "怎么写"以【参考范文】为主要模板:段落数量、段落顺序、每段主题、论述推进、句式结构、专业术语、连接词、语气口吻、详略程度和结论表达都应高度贴合范文; +3. 
若【章节细则描述】与【参考范文】存在差异,优先保证细则要求完整覆盖,再用范文的结构和笔法组织表达;若二者均未要求,正文不要主动扩展。 + +【证据使用要求——数据必须来自证据且保持原值】 +1. 所有项目名称、时间、金额、数量、比例、指标值、单位、结论依据等事实性内容,只能来自第二部分事实证据; +2. 数据必须原值引用,严禁自行修改、估算、换算单位、四舍五入、增减、归纳为新数值或编造。证据是多少就写多少;证据未给出的数据写"待补充"; +3. 若【字段级已抽取结果】中某字段为非"待补充"值,正文必须原样使用该抽取值,不得写"待补充",也不得改动、换算或重新表述其数值; +4. 内容来源优先级:结构化表格证据 / 字段级已抽取结果 > 证据包(JSON)中的章节文档 > 关键词检索补充材料; +5. 禁止复用【参考范文】或【章节示例】中的任何项目名称、年份、金额、指标值、比例、结论等事实数据。 + +【参考范文贴合要求——高度相似但严禁照抄】 +1. 逐段对照:范文有几段就尽量写几段,每段主题、先后顺序、论述角度与起承转合须与范文对应; +2. 句式与笔法对齐:尽量沿用范文的段首引导方式、常用表达、收束方式和专业语气,使本节读起来与范文出自同一类报告; +3. 篇幅与颗粒度对齐:每段篇幅、信息密度和展开程度与范文相当,不得明显更短、更空泛,也不得无端扩写; +4. 形式对齐:范文采用分条、分项、描述性子标题或表格呈现的,本节也尽量采用同类形式,但必须满足【章节输出结构约束】和下方表格规则; +5. 禁止逐字照抄:不得出现与范文连续相同超过15字的句子或成段文字;应在保持结构和笔法相似的前提下,用本项目证据重新表述。 + +【输出硬约束】 +1. content字段只允许写章节正文,严禁出现"【缺失信息说明】""【质量检查】"及其任何条目; +2. 若存在【模板必需表格】,正文必须出现同名(或同编号)表格标题; +3. 若【结构化表格证据】中存在对应必需表,必须原样使用该Markdown表格,不得自行生成或改写表头/数值; +4. 仅在单元格级别缺失时写"待补充",避免整段反复"待补充"; +5. 禁止输出与本节无关的表号/表题清单,禁止出现跨节表格引用(如"详见表X-X/参见表X-X/见表X-X/如表X-X所示");仅当【章节输出结构约束】明确要求时,才允许引用或输出对应表; +6. 禁止使用"关键里程碑时间线""建设/投资执行情况"等突兀标签式标题; +7. 数字与汉字之间不留空格:阿拉伯数字、百分比、金额、年份等与相邻汉字之间不得插入半角或全角空格,例如写"投资1.2亿元""2023年12月""产能达95%",不得写"投资 1.2 亿元""2023 年 12 月""产能达 95 %";数字与计量单位之间也不留空格,如"30万吨"而非"30 万吨"; +8. 子标题形式约束:正文段落允许使用描述性小标题,但只能采用"一、""(一)""1."或加粗短语单独成行等中文公文层级形式;严禁使用Markdown标题语法(`#`、`##`、`###`等)充当子标题。表格上方的表题不属于子标题; +9. 计量单位须规范:面积写"m²"不得写"m2",体积写"m³"不得写"m3",流量写"m³/h"不得写"m3/h";温度写"℃",千分号写"‰",科学计数可写"×10⁴"。正文与表格中的单位均须规范。 + +【表格严格管控】 +1. 只有当【章节输出结构约束】中明确包含"【表格强制要求】"标签时,本节才允许输出Markdown表格; +2. 无"表格强制要求"的章节一律禁止输出任何Markdown表格(不得输出含`|`分隔符的表格行),即使证据包中有结构化表格数据也不得在正文中嵌入; +3. "见附表N"仅为引用语:若结构约束要求写"项目建设工作程序见附表1。"等引用句,只输出引用句文本,附表本体在报告末尾统一输出,严禁在本节展开完整Markdown表格; +4. 表格数据必须严格来自要素管理(element_tables/element_cells)或结构化表格证据,不得自行编造、换算或改写表格内容; +5. 每个Markdown表格前须有独立一行表题(如「表1  ××表」「表2-3  ××表」「附表8  ××表」),表题紧挨表格上方单独成段; +6. 表号与表名之间须空两个全角空格(U+3000),例如「表2-4  原料数量及组成对比表」; +7. 含计量单位的表头,名称写第一行、单位加括号写第二行,且在同一表头单元格内(Markdown可用`
<br>`,如`新鲜水<br>
(m³/h)`);勿将单位单独占一列; +8. 若整张表各数据列所用单位相同,单位写在表题行末尾,表头栏内不再重复;若各列单位不一致,则按列在表头内分行写单位; +9. 表格序号列用阿拉伯数字,层次与正文一致;"合计""总计"行可用"—"; +10. 同一表内、同列的小数、百分比等宜保留相同的小数位数,但不得因此改动证据原值。 + +【输出格式】 +请仅返回JSON:{"content":"章节Markdown正文","missingInfo":["缺失项"],"qualityChecks":["校验结论"]} +你正在编写后评价报告章节:{{section_title}} \ No newline at end of file diff --git a/prompts/report_generation/heading_rules.py b/prompts/report_generation/heading_rules.py new file mode 100644 index 0000000..cb0df0c --- /dev/null +++ b/prompts/report_generation/heading_rules.py @@ -0,0 +1,14 @@ +"""Heading rule prompt variables for report generation.""" + +DEFAULT_HEADING_RULE = ( + "5. 各章节内部小标题须使用规范层级格式(如“### 1.2.1 …”);" + "若在同一节内使用并列条目,必须统一写作“1)… 2)… 3)…”," + "禁止使用“一、二、三、”“(一)(二)(三)”或“1.”“1.2.”“3.1”等序号形式;\n" +) + +SECTION_HEADING_RULES: dict[str, str] = { + "1.2": ( + "5. 本节(1.2)必须严格遵循【章节输出结构约束】给定的纯文本编号体结构;" + "不得使用“###”等 Markdown 小标题语法;不得将“1.2.1/1.2.2”改写为“1)/2)”。\n" + ), +} diff --git a/prompts/report_generation/prompt_defaults.py b/prompts/report_generation/prompt_defaults.py new file mode 100644 index 0000000..15392c2 --- /dev/null +++ b/prompts/report_generation/prompt_defaults.py @@ -0,0 +1,4 @@ +"""Fallback prompt fragments for report generation.""" + +DEFAULT_SECTION_PROMPT_FALLBACK = "按后评价细则规范撰写本章节。" +DEFAULT_SELECTED_EXAMPLE_FALLBACK = "无示例,按规范输出。" diff --git a/prompts/report_generation/repair_missing_tables_system.md b/prompts/report_generation/repair_missing_tables_system.md new file mode 100644 index 0000000..a3f6170 --- /dev/null +++ b/prompts/report_generation/repair_missing_tables_system.md @@ -0,0 +1 @@ +你是后评价报告撰写助手。任务是对既有章节做最小修改补齐缺表,禁止删除事实性内容,禁止编造。返回 JSON:{"content":"..."} diff --git a/prompts/report_generation/repair_missing_tables_user.md b/prompts/report_generation/repair_missing_tables_user.md new file mode 100644 index 0000000..1d65432 --- /dev/null +++ b/prompts/report_generation/repair_missing_tables_user.md @@ -0,0 +1,19 @@ +你正在修订章节:{{section_title}} + +目标:在不删除原有有效内容的前提下,补齐缺失表格。 
+必须出现的表标识:{{missing_tables}} + +要求: +1) 每个缺失表都必须在正文中出现,并使用 Markdown 表格; +2) 若证据不足,单元格可写“待补充”; +3) 表标题必须包含对应表标识(如“表2-1”); +4) 仅输出修订后的完整章节 Markdown。 + +【原章节内容】 +{{content}} + +【原始章节提示词】 +{{original_prompt}} + +【证据包(JSON)】 +{{evidence_json}} diff --git a/prompts/report_generation/table_format_repair_system.md b/prompts/report_generation/table_format_repair_system.md new file mode 100644 index 0000000..a284f49 --- /dev/null +++ b/prompts/report_generation/table_format_repair_system.md @@ -0,0 +1 @@ +你是后评价报告格式修订助手。仅做格式对齐修订:章节标题、表名、表头。禁止新增未证据支持的数据。返回 JSON:{"content":"..."} diff --git a/prompts/report_generation/table_format_repair_user.md b/prompts/report_generation/table_format_repair_user.md new file mode 100644 index 0000000..d022065 --- /dev/null +++ b/prompts/report_generation/table_format_repair_user.md @@ -0,0 +1,25 @@ +你正在修订章节:{{section_title}} + +目标:对齐模板格式,不改变事实结论。 +请仅修订“章节标题、表名、表头”,正文事实描述尽量保持原样。 + +【模板表规格(JSON)】 +{{table_specs_json}} + +【当前章节】 +{{content}} + +【证据包(JSON)】 +{{evidence_json}} + +修订规则: +1) 章节首行必须为标准章节标题; +2) 表名必须与模板表规格中的 token/title 对齐;表题中表号与表名之间须空两个全角空格(如「表2-4  原料数量及组成对比表」); +3) 表头字段优先与模板一致,表内数据来自证据包,无值写待补充; +4) 必须使用 Markdown 表格; +5) 表头栏排版:指标名称与计量单位分两行写在同一表头单元格内;单位须加括号并写在名称正下方(Markdown 可用 `
<br>`,如 `新鲜水<br>
(m³/h)`);表题与表头均勿使用 `**` 加粗;勿将单位单独拆成一列表头列,勿把「名称(单位)」横挤在同一行; +6) 若整张表各数据列所用单位相同,应将单位加括号写在表题末尾(如「表3 ××公司储罐能力 (m³)」),表头栏内不再重复写该单位; +7) 表格「序号」列:优先使用各行行键(row_key)首部已有的阿拉伯数字层次编号(与正文 1、1.1、1.2、2、2.1 一致);若行键未带此类编号,则用自上而下连续阿拉伯数字 1、2、3…;「合计」「总计」行序号可用「—」; +8) 表体单元格内容宜居中;若有换行或分段,宜左齐。同列数值宜统一小数位数; +9) 禁止编造事实数据; +10) 仅返回修订后的完整章节 Markdown(不要返回 JSON)。 diff --git a/routers/report.py b/routers/report.py new file mode 100644 index 0000000..6cde0fa --- /dev/null +++ b/routers/report.py @@ -0,0 +1,204 @@ +""" +routers/report.py +后评价报告「核心生成」路由(独立抽取版)。 + +从 eval_report 的 routers/write.py 摘取报告生成相关端点,去除鉴权依赖, +项目查询改用轻量的 services/project_service.get_project。 +业务逻辑在 services/report_generation_service.py。 +""" + +from __future__ import annotations + +import asyncio +import json +from typing import Optional + +from fastapi import APIRouter, Depends, Header, HTTPException +from fastapi.responses import StreamingResponse +from sqlalchemy.orm import Session + +from database import SessionLocal, get_db +from database.models import ReportTemplate, ReportTemplateSection +from schemas.write import ( + GenerateReportJobCreate, + GenerateReportJobItem, + GenerateReportResult, +) +from services.project_service import get_project +from services.report_generation_service import ( + create_report_job, + get_report_job, + get_report_result, + get_report_stream_snapshot, + retry_report_chapter, + cancel_report_job, +) + +router = APIRouter(prefix="/write", tags=["后评价报告生成"]) + + +@router.get("/projects/{project_id}/generate-sections", summary="按章节智能体生成提示词清单") +def generate_sections_prompt( + project_id: str, + template_id: Optional[str] = None, + db: Session = Depends(get_db), +): + _ = get_project(project_id, db) + template = None + if template_id: + template = db.query(ReportTemplate).filter(ReportTemplate.id == template_id, ReportTemplate.is_active == True).first() # noqa: E712 + if not template: + template = db.query(ReportTemplate).filter(ReportTemplate.is_default == True, ReportTemplate.is_active == 
True).first() # noqa: E712 + if not template: + raise HTTPException(status_code=404, detail="未找到可用模板") + sections = ( + db.query(ReportTemplateSection) + .filter(ReportTemplateSection.template_id == template.id) + .order_by(ReportTemplateSection.section_order.asc()) + .all() + ) + return { + "templateId": template.id, + "templateName": template.name, + "sections": [ + { + "sectionKey": s.section_key, + "sectionTitle": s.section_title, + "prompt": ( + "请基于2020后评价细则与本项目检索材料,先查要素表,再查文档段落,最后生成本章节内容。\n" + + (s.section_prompt or "") + ), + "examples": s.examples or "", + } + for s in sections + ], + } + + +@router.post( + "/projects/{project_id}/generate-report-job", + response_model=GenerateReportJobItem, + summary="创建分章异步报告生成任务", +) +def create_generate_report_job( + project_id: str, + body: GenerateReportJobCreate, + db: Session = Depends(get_db), + x_user_id: Optional[str] = Header(default=None, alias="X-User-Id"), +): + _ = get_project(project_id, db) + return create_report_job( + project_id, + db, + template_id=body.templateId, + top_k=body.topK, + requested_by=x_user_id, + ) + + +@router.get( + "/projects/{project_id}/generate-report-job/{job_id}", + response_model=GenerateReportJobItem, + summary="查询分章异步报告任务进度", +) +def get_generate_report_job( + project_id: str, + job_id: str, + db: Session = Depends(get_db), +): + return get_report_job(project_id, job_id, db) + + +@router.get( + "/projects/{project_id}/generate-report-job/{job_id}/result", + response_model=GenerateReportResult, + summary="获取分章异步报告任务结果", +) +def get_generate_report_result( + project_id: str, + job_id: str, + include_debug: bool = False, + db: Session = Depends(get_db), +): + return get_report_result(project_id, job_id, db, include_debug=include_debug) + + +@router.get( + "/projects/{project_id}/generate-report-job/{job_id}/events", + summary="订阅分章异步报告任务实时事件(SSE)", +) +async def stream_generate_report_job_events( + project_id: str, + job_id: str, + include_debug: bool = False, +): + # 
校验后立即释放连接;SSE 循环中按需短连接查询,避免长连占满连接池 + with SessionLocal() as db: + _ = get_report_job(project_id, job_id, db) + + async def _event_stream(): + last_payload = "" + idle_ticks = 0 + while True: + snapshot = get_report_stream_snapshot(job_id, include_debug=include_debug) + if not snapshot: + with SessionLocal() as db: + job = get_report_job(project_id, job_id, db) + result = get_report_result(project_id, job_id, db, include_debug=include_debug) + snapshot = { + "job": job.model_dump(), + "result": result.model_dump(), + } + payload = json.dumps(snapshot, ensure_ascii=False, separators=(",", ":")) + if payload != last_payload: + last_payload = payload + idle_ticks = 0 + yield f"event: snapshot\ndata: {payload}\n\n" + else: + idle_ticks += 1 + if idle_ticks >= 20: + idle_ticks = 0 + yield "event: keepalive\ndata: ping\n\n" + + status = str(((snapshot.get("job") or {}).get("status") or "")).strip().lower() + if status in ("completed", "failed", "cancelled"): + yield f"event: end\ndata: {payload}\n\n" + break + await asyncio.sleep(0.25) + + return StreamingResponse( + _event_stream(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + +@router.post( + "/projects/{project_id}/generate-report-job/{job_id}/retry-chapter", + response_model=GenerateReportJobItem, + summary="重试指定章节", +) +def retry_generate_report_chapter( + project_id: str, + job_id: str, + section_key: str, + db: Session = Depends(get_db), +): + return retry_report_chapter(project_id, job_id, section_key, db) + + +@router.post( + "/projects/{project_id}/generate-report-job/{job_id}/cancel", + response_model=GenerateReportJobItem, + summary="取消报告生成任务", +) +def cancel_generate_report_job( + project_id: str, + job_id: str, + db: Session = Depends(get_db), +): + return cancel_report_job(project_id, job_id, db) diff --git a/schemas/write.py b/schemas/write.py new file mode 100644 index 0000000..5fbf958 --- /dev/null +++ 
b/schemas/write.py @@ -0,0 +1,179 @@ +""" +schemas/write.py +后评价报告项目相关的 Pydantic 数据模型。 +""" + +from __future__ import annotations +from typing import Any, List, Optional +from pydantic import BaseModel + + +# ---------- 版本 ---------- + +class DocVersion(BaseModel): + id: str + version: str + content: str + savedAt: str + author: str + note: Optional[str] = "" + citationPayload: Optional[dict[str, Any]] = None + + +# ---------- 文档 ---------- + +class WriteDocument(BaseModel): + id: str + title: str + content: str + wordCount: int + createdAt: str + updatedAt: str + projectId: str + status: str # draft | review | published + versions: List[DocVersion] = [] + + +class WriteDocumentSummary(BaseModel): + """列表页只返回摘要,不含 content 正文""" + id: str + title: str + wordCount: int + createdAt: str + updatedAt: str + projectId: str + status: str + + +# ---------- 项目 ---------- + +class WriteProject(BaseModel): + id: str + uuid: str # 项目唯一标识,与 kb 共用 + name: str + description: Optional[str] = "" + createdAt: str + updatedAt: str + docCount: int + status: str # active | archived + kbProjectId: Optional[str] = None + color: str + documents: List[WriteDocument] = [] + + +class WriteProjectSummary(BaseModel): + """列表页摘要,不含 documents""" + id: str + uuid: Optional[str] = None # 项目唯一标识,用于 URL 参数;兼容旧数据 + name: str + description: Optional[str] = "" + createdAt: str + updatedAt: str + docCount: int + status: str + kbProjectId: Optional[str] = None + color: str + + +# ---------- 创建 / 更新请求体 ---------- + +class WriteProjectCreate(BaseModel): + name: str + description: Optional[str] = "" + kbProjectId: Optional[str] = None + color: Optional[str] = "#3b82f6" + + +class WriteProjectUpdate(BaseModel): + name: Optional[str] = None + description: Optional[str] = None + status: Optional[str] = None + kbProjectId: Optional[str] = None + color: Optional[str] = None + + +class WriteDocumentCreate(BaseModel): + title: str + content: Optional[str] = "" + + +class WriteDocumentUpdate(BaseModel): + title: 
Optional[str] = None + content: Optional[str] = None + status: Optional[str] = None + + +class DocVersionCreate(BaseModel): + version: Optional[str] = None + content: str + author: str + note: Optional[str] = "" + citationPayload: Optional[dict[str, Any]] = None + + +# ---------- 章节审查(智能体) ---------- + + +class ChapterReviewRequest(BaseModel): + """章节智能审查请求体:选择章节 + 输入待审查文本。""" + + chapter: str # "1"~"6" + content: str + + +class ChapterReviewResponse(BaseModel): + """章节智能审查响应体:返回 Markdown 审查报告。""" + + success: bool = True + chapter: str + review: str + model: Optional[str] = None + message: Optional[str] = "" + + +class GenerateReportJobCreate(BaseModel): + templateId: Optional[str] = None + topK: int = 10 + + +class GenerateReportChapterItem(BaseModel): + sectionKey: str + sectionTitle: str + sectionOrder: int + status: str + updatedAt: Optional[str] = None + errorMessage: Optional[str] = None + + +class GenerateReportJobItem(BaseModel): + jobId: str + projectId: str + templateId: Optional[str] = None + status: str + progress: int + currentSectionKey: Optional[str] = None + errorMessage: Optional[str] = None + createdAt: Optional[str] = None + updatedAt: Optional[str] = None + completedAt: Optional[str] = None + chapters: List[GenerateReportChapterItem] = [] + + +class GenerateReportResultChapter(BaseModel): + sectionKey: str + sectionTitle: str + sectionOrder: int + status: str + content: Optional[str] = None + errorMessage: Optional[str] = None + promptText: Optional[str] = None + evidencePayload: Optional[dict] = None + validationPayload: Optional[dict] = None + + +class GenerateReportResult(BaseModel): + jobId: str + status: str + report: Optional[str] = None + consistency: List[str] = [] + chapters: List[GenerateReportResultChapter] = [] diff --git a/services/appendix_figure_extraction.py b/services/appendix_figure_extraction.py new file mode 100644 index 0000000..ef74110 --- /dev/null +++ b/services/appendix_figure_extraction.py @@ -0,0 +1,199 @@ +""" 
+从项目知识库 Word(.docx)中提取「附图1/附图2」嵌入图,用于报告附录。 + +细则常见版式:附图标题段落与图在同一节或相邻段落;解析时合并前/当前/后段文字做关键词匹配。 +""" + +from __future__ import annotations + +import base64 +import logging +from pathlib import Path +from typing import Optional + +from docx import Document +from docx.oxml.ns import qn +from docx.table import Table +from docx.text.paragraph import Paragraph + +logger = logging.getLogger(__name__) + +# 过滤装饰性小图(logo 等) +_MIN_FIGURE_BYTES = 6000 + +R_EMBED = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" +_NS = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", +} + + +def _compact(s: str) -> str: + return "".join(str(s or "").split()) + + +def _classify_slot(ctx: str) -> Optional[int]: + """ + 返回 1=全厂物料平衡图,2=装置(如烷基化)物料平衡图。 + """ + t = _compact(ctx) + if not t: + return None + # 附图编号(先判 2,避免同段目录同时出现两个编号时误判) + if "附图2" in t: + return 2 + if "附图1" in t: + return 1 + if "全厂" in t and "物料平衡" in t: + return 1 + if "烷基化" in t and "物料平衡" in t: + return 2 + if "装置" in t and "物料平衡" in t and "全厂" not in t: + return 2 + return None + + +def _content_type_to_md_subtype(content_type: str) -> str: + ct = (content_type or "").lower() + if "jpeg" in ct or ct.endswith("jpg"): + return "jpeg" + if "png" in ct: + return "png" + if "gif" in ct: + return "gif" + if "emf" in ct: + return "x-emf" + if "wmf" in ct: + return "x-wmf" + return "png" + + +def _blob_to_data_uri(blob: bytes, content_type: str) -> str: + sub = _content_type_to_md_subtype(content_type) + b64 = base64.standard_b64encode(blob).decode("ascii") + return f"data:image/{sub};base64,{b64}" + + +def _iter_paragraphs_deep(doc: Document): + body_el = doc.element.body + for el in body_el: + if el.tag == qn("w:p"): + yield Paragraph(el, doc._body) + elif el.tag == qn("w:tbl"): + table = Table(el, doc._body) + for row in table.rows: + for cell in row.cells: + for p in cell.paragraphs: + yield p + + +def 
extract_appendix_figure_candidates_from_docx(path: Path) -> dict[int, list[tuple[int, bytes, str]]]: + """ + 从单个 docx 收集候选图:slot -> [(size, blob, content_type), ...] + content_type 来自 OPC part,用于拼 data URI。 + """ + candidates: dict[int, list[tuple[int, bytes, str]]] = {1: [], 2: []} + orphans_ordered: list[tuple[bytes, str]] = [] + try: + doc = Document(str(path)) + except Exception as exc: + logger.warning("appendix figure: open docx failed %s: %s", path, exc) + return candidates + + paras = list(_iter_paragraphs_deep(doc)) + texts = [p.text or "" for p in paras] + + for i, p in enumerate(paras): + blobs_with_type: list[tuple[bytes, str]] = [] + for blip in p._element.findall(".//a:blip", _NS): + embed = blip.get(R_EMBED) + if not embed: + continue + try: + rel = p.part.related_parts[embed] + except KeyError: + continue + blob = getattr(rel, "blob", None) + ct = getattr(rel, "content_type", "") or "image/png" + if blob and len(blob) >= _MIN_FIGURE_BYTES: + blobs_with_type.append((blob, ct)) + + if not blobs_with_type: + continue + + prev_t = texts[i - 1] if i > 0 else "" + cur_t = texts[i] + next_t = texts[i + 1] if i + 1 < len(texts) else "" + ctx = f"{prev_t}\n{cur_t}\n{next_t}" + slot = _classify_slot(ctx) + if slot is None: + for blob, ct in blobs_with_type: + orphans_ordered.append((blob, ct)) + continue + + for blob, ct in blobs_with_type: + candidates[slot].append((len(blob), blob, ct)) + + def _dedupe_preserve_order(pairs: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]: + seen: set[int] = set() + out: list[tuple[bytes, str]] = [] + for blob, ct in pairs: + bid = id(blob) + if bid in seen: + continue + seen.add(bid) + out.append((blob, ct)) + return out + + orphans_ordered = _dedupe_preserve_order(orphans_ordered) + used_ids: set[int] = set() + for lst in candidates.values(): + for _sz, blob, _ct in lst: + used_ids.add(id(blob)) + orphans_ordered = [(b, c) for b, c in orphans_ordered if id(b) not in used_ids] + + if not candidates[1] and 
orphans_ordered: + b, c = orphans_ordered.pop(0) + candidates[1].append((len(b), b, c)) + if not candidates[2] and orphans_ordered: + b, c = orphans_ordered.pop(0) + candidates[2].append((len(b), b, c)) + + return candidates + + +def merge_best_appendix_figures( + per_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]], +) -> dict[int, tuple[bytes, str, str]]: + """ + 多文档合并:每个 slot 只保留字节最大的一张(更可能是主流程图而非小图标)。 + + 返回 slot -> (blob, content_type, source_doc_name) + """ + best: dict[int, tuple[int, bytes, str, str]] = {} + for doc_name, cand in per_doc: + for slot in (1, 2): + for size, blob, ct in cand.get(slot) or []: + prev = best.get(slot) + if prev is None or size > prev[0]: + best[slot] = (size, blob, ct, doc_name) + return {k: (v[1], v[2], v[3]) for k, v in best.items()} + + +def appendix_figure_markdown_images( + resolved: dict[int, tuple[bytes, str, str]], + *, + label_title: list[tuple[str, str]], +) -> dict[int, str]: + """slot -> markdown 片段(含 ### 标题与 ![](data:...))""" + out: dict[int, str] = {} + slot_to_title = {i + 1: lt for i, lt in enumerate(label_title)} + for slot, (blob, ct, src) in resolved.items(): + if slot not in slot_to_title: + continue + label, title = slot_to_title[slot] + uri = _blob_to_data_uri(blob, ct) + cap = f"{label} {title}" + src_note = f"\n\n*(嵌入来源:{src})*" if src else "" + out[slot] = f"### {cap}\n\n![{cap}]({uri}){src_note}" + return out diff --git a/services/docx_export_service.py b/services/docx_export_service.py new file mode 100644 index 0000000..901e760 --- /dev/null +++ b/services/docx_export_service.py @@ -0,0 +1,28 @@ +""" +services/docx_export_service.py(瘦身版) + +本独立服务不提供 Word 导出能力;此处仅保留 report_generation_service 在 +正文小节编号识别时懒加载依赖的 `_is_likely_section_number`,以满足导入。 +""" + +from __future__ import annotations + +import re + + +def _is_likely_section_number(num: str) -> bool: + """报告小节编号(如 2.1.1),非正文能耗数值(如 132.41)。""" + s = str(num or "").strip() + if not s or not re.fullmatch(r"\d+(?:\.\d+)*", s): + return False 
+ parts = s.split(".") + if len(parts) > 4: + return False + for part in parts: + try: + n = int(part) + except ValueError: + return False + if n < 1 or n > 30: + return False + return True diff --git a/services/kb_service.py b/services/kb_service.py new file mode 100644 index 0000000..7e6d744 --- /dev/null +++ b/services/kb_service.py @@ -0,0 +1,80 @@ +""" +services/kb_service.py(瘦身版) + +仅保留报告生成「附图提取」所需的知识库文档磁盘路径解析助手: +从 eval_report 的完整 kb_service.py 中抽取,去除知识库 CRUD / 上传 / worker 等无关逻辑。 +""" + +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +from config import settings +from database.models import KbDocument as KbDocumentModel + + +def _normalize_rel_path(path: str) -> str: + """将 'a\\b\\c' 规范为 'a/b/c',并去掉前导 '/'。""" + s = str(path or "").replace("\\", "/").strip() + while s.startswith("./"): + s = s[2:] + return s.lstrip("/") + + +def _kb_doc_storage_rel_path( + file_path_dir: Optional[str], + basename: str, + storage_rel_path: Optional[str] = None, +) -> str: + """项目目录下的相对存储路径(含文件名)。优先 storage_rel_path(confirm 时写入)。""" + stored = _normalize_rel_path(str(storage_rel_path or "")) + if stored: + return stored + d = _normalize_rel_path(str(file_path_dir or "")) + bn = str(basename or "").strip() + if d and bn: + return f"{d}/{bn}" + return bn or d + + +def _kb_doc_path_candidates_for_model(doc_root: Path, doc: KbDocumentModel) -> List[Path]: + """解析磁盘路径时的候选列表(按优先级)。""" + rel = _kb_doc_storage_rel_path( + doc.file_path, + doc.name, + getattr(doc, "storage_rel_path", None), + ) + candidates: List[Path] = [] + if rel: + candidates.append((doc_root / doc.project_id / rel).resolve()) + name = str(doc.name or "").strip() + fp_dir = _normalize_rel_path(str(doc.file_path or "")) + if fp_dir and name: + candidates.append((doc_root / doc.project_id / fp_dir / name).resolve()) + if name: + candidates.append((doc_root / doc.project_id / name).resolve()) + if not candidates: + candidates.append((doc_root / doc.project_id / 
"_missing_").resolve()) + deduped: List[Path] = [] + seen: set[str] = set() + for p in candidates: + key = str(p) + if key in seen: + continue + seen.add(key) + deduped.append(p) + return deduped + + +def _kb_doc_absolute_file_path_for_model(doc_root: Path, doc: KbDocumentModel) -> Path: + for p in _kb_doc_path_candidates_for_model(doc_root, doc): + if p.is_file(): + return p + return _kb_doc_path_candidates_for_model(doc_root, doc)[0] + + +def _kb_doc_file_exists_for_model(doc: KbDocumentModel) -> bool: + """文档在磁盘上是否可读(多路径回退,兼容历史 file_path/name 组合)。""" + doc_root = Path(settings.DOC_PAT).resolve() + return any(p.is_file() for p in _kb_doc_path_candidates_for_model(doc_root, doc)) diff --git a/services/project_service.py b/services/project_service.py new file mode 100644 index 0000000..e476e0c --- /dev/null +++ b/services/project_service.py @@ -0,0 +1,43 @@ +""" +services/project_service.py + +报告生成所需的最小项目查询,替代 eval_report 中重型的 write_service。 +仅提供按 uuid / 数字 id 查询项目并返回 WriteProject,用于校验项目存在性与取项目名。 +""" + +from __future__ import annotations + +from fastapi import HTTPException +from sqlalchemy.orm import Session + +from database.models import Project +from schemas.write import WriteProject + + +def get_project(project_id: str, db: Session) -> WriteProject: + """获取后评价报告项目详情。支持 uuid 或数字 id;优先 uuid。""" + project = None + if project_id: + project = db.query(Project).filter(Project.uuid == project_id).first() + if not project: + try: + pid = int(project_id) + project = db.query(Project).filter(Project.id == pid).first() + except (ValueError, TypeError): + pass + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + + return WriteProject( + id=str(project.id), + uuid=project.uuid, + name=project.name, + description=project.description or "", + createdAt=project.created_at.strftime("%Y-%m-%d") if project.created_at else "", + updatedAt=project.updated_at.strftime("%Y-%m-%d") if project.updated_at else "", + docCount=project.doc_count, + 
def render_prompt_template(template: str, **context: Any) -> str:
    """Fill every token matched by ``_TOKEN_RE`` in ``template`` from ``context``.

    The first capture group of each match is used as the lookup key. A key
    that is missing from ``context`` or whose value is ``None`` renders as an
    empty string; any other value is rendered with ``str()``.

    Args:
        template: Template text containing placeholder tokens.
        **context: Values available for substitution, keyed by token name.

    Returns:
        The template with all matched tokens substituted.
    """

    def _substitute(hit: re.Match[str]) -> str:
        replacement = context.get(hit.group(1), "")
        if replacement is None:
            return ""
        return str(replacement)

    return _TOKEN_RE.sub(_substitute, template)
def _desensitize_via_llm(content: str) -> str:
    """Return an LLM-desensitised copy of a reference sample.

    The first 12000 characters of ``content`` are inserted into
    ``_DESENSITIZE_USER_PROMPT_TEMPLATE`` and sent through
    ``chat_completions_json`` together with ``_DESENSITIZE_SYSTEM_PROMPT``.
    A leading code fence (optionally tagged ``markdown``) and a trailing
    fence are removed from the reply before it is returned.

    Args:
        content: Reference text to desensitise.

    Returns:
        The cleaned model reply. ``content`` itself is returned unchanged when
        it is empty or blank, when the reply has no non-blank string under
        the ``"content"`` key, or when the call raises. The last case is a
        deliberate best effort: the failure is logged and the original text
        is used rather than failing the caller.
    """
    if not (content and content.strip()):
        return content

    # Only a bounded prefix of the sample is sent to the model.
    prompt_text = _DESENSITIZE_USER_PROMPT_TEMPLATE.format(content=content[:12000])

    logger.info("参考范文脱敏 start | content_len=%s", len(content))

    try:
        reply = chat_completions_json(
            system_prompt=_DESENSITIZE_SYSTEM_PROMPT,
            user_prompt=prompt_text,
            temperature=0.0,
            max_tokens=16384,
            timeout_sec=120,
        )
        answer = reply.get("content") or ""
        if not isinstance(answer, str) or not answer.strip():
            return content
        # Drop a wrapping ```markdown / ``` fence if the model added one.
        unfenced = re.sub(r"^```(?:markdown)?\s*", "", answer.strip(), flags=re.IGNORECASE)
        unfenced = re.sub(r"\s*```$", "", unfenced)
        logger.info("参考范文脱敏 done | original_len=%s | desensitized_len=%s", len(content), len(unfenced))
        return unfenced.strip()
    except Exception as e:
        logger.warning("LLM 脱敏失败,退回原文: %s", e)
        return content
def load_section_reference_by_title(
    db: Session,
    section_title: str,
    source_file: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load a desensitised reference sample by section title (fallback lookup).

    Lookup order:
      1. If the title starts with a dotted section number (e.g. ``2.1.1``),
         the newest row whose ``section_key`` equals that number.
      2. Otherwise, or when step 1 yields no usable text, the newest row
         whose ``section_title`` contains the first 20 characters of the
         stripped title.

    Differences from the previous implementation:
      * A blank title returns ``""`` immediately. It used to run
        ``contains("")``, which matches every row and returned an unrelated
        reference.
      * The title is stripped before it is sliced for the substring match.
      * A whitespace-only exact-match row now falls through to the title
        match instead of returning ``""``.
      * The redundant function-local ``import re`` is gone; the module
        already imports ``re``.

    Args:
        db: Database session.
        section_title: Section title, optionally prefixed with its number.
        source_file: When given, only rows from this source file are used.
        max_chars: Maximum length of the returned text before the truncation
            notice is appended.

    Returns:
        Desensitised reference text, or ``""`` when nothing matches.
    """

    def _prepare(raw_text: str) -> str:
        # Desensitise first, then cap the length of what is returned.
        text = _desensitize_via_llm(raw_text)
        if len(text) > max_chars:
            text = text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return text

    title = str(section_title or "").strip()
    if not title:
        return ""

    refs = db.query(ReportSectionReference)
    if source_file:
        refs = refs.filter(ReportSectionReference.source_file == source_file)

    # Prefer an exact section_key match using the number at the title's start.
    m = re.match(r"(\d+(?:\.\d+)*)", title)
    if m:
        exact = (
            refs.filter(ReportSectionReference.section_key == m.group(1))
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        if exact and exact.content:
            content = exact.content.strip()
            if content:
                return _prepare(content)

    # Fall back to a substring match on the stored title.
    ref = (
        refs.filter(ReportSectionReference.section_title.contains(title[:20]))
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )
    if not ref or not ref.content:
        return ""

    content = ref.content.strip()
    if not content:
        return ""
    return _prepare(content)
def load_section_reference_raw_by_title(
    db: Session,
    section_title: str,
    template_id: Optional[str] = None,
    *,
    max_chars: int = 8000,
) -> str:
    """Load the stored reference sample by section title, without desensitising.

    Lookup order:
      1. If the title starts with a dotted section number (e.g. ``2.1.1``),
         the newest row whose ``section_key`` equals that number.
      2. Otherwise, or when step 1 yields no usable text, the newest row
         whose ``section_title`` contains the first 20 characters of the
         stripped title.

    Differences from the previous implementation:
      * A blank title returns ``""`` immediately. It used to run
        ``contains("")``, which matches every row and returned an unrelated
        reference.
      * The title is stripped before it is sliced for the substring match.
      * A whitespace-only exact-match row now falls through to the title
        match instead of returning ``""``.
      * The redundant function-local ``import re`` is gone; the module
        already imports ``re``.

    Args:
        db: Database session.
        section_title: Section title, optionally prefixed with its number.
        template_id: When given, only rows linked to this template are used.
        max_chars: Maximum length of the returned text before the truncation
            notice is appended.

    Returns:
        The stored reference text (length-capped), or ``""`` when nothing
        matches.
    """

    def _clip(text: str) -> str:
        if len(text) > max_chars:
            return text[:max_chars] + "\n\n(参考范文超出长度限制,已截断)"
        return text

    title = str(section_title or "").strip()
    if not title:
        return ""

    refs = db.query(ReportSectionReference)
    if template_id:
        refs = refs.filter(ReportSectionReference.template_id == template_id)

    # Prefer an exact section_key match using the number at the title's start.
    m = re.match(r"(\d+(?:\.\d+)*)", title)
    if m:
        exact = (
            refs.filter(ReportSectionReference.section_key == m.group(1))
            .order_by(ReportSectionReference.updated_at.desc())
            .first()
        )
        if exact and exact.content:
            content = exact.content.strip()
            if content:
                return _clip(content)

    # Fall back to a substring match on the stored title.
    ref = (
        refs.filter(ReportSectionReference.section_title.contains(title[:20]))
        .order_by(ReportSectionReference.updated_at.desc())
        .first()
    )
    if not ref or not ref.content:
        return ""

    content = ref.content.strip()
    if not content:
        return ""
    return _clip(content)
db.query(ReportSectionReference.source_file) + .distinct() + .order_by(ReportSectionReference.source_file) + .all() + ) + return [r[0] for r in results if r[0]] \ No newline at end of file diff --git a/services/report_generation_service.py b/services/report_generation_service.py new file mode 100644 index 0000000..f23591e --- /dev/null +++ b/services/report_generation_service.py @@ -0,0 +1,7771 @@ +from __future__ import annotations + +import json +import logging +import re +import threading +from difflib import SequenceMatcher +import unicodedata +import uuid +from datetime import datetime +from pathlib import Path +from types import SimpleNamespace +from typing import Any, Optional + +from fastapi import HTTPException + +logger = logging.getLogger(__name__) + +# ── 运行时提示词落盘 ────────────────────────────────────────────────────────── +_PROMPT_DUMP_ROOT = Path(__file__).resolve().parent.parent / "comp" / "runtime" +_REPORT_OUTPUT_DUMP_ROOT = Path(__file__).resolve().parent.parent / "comp" / "report_outputs" + + +def _safe_markdown_filename(name: str, fallback: str = "section") -> str: + safe = re.sub(r'[\\/:*?"<>|]', "_", str(name or "").strip()) + safe = re.sub(r"\s+", " ", safe).strip(" ._") + return safe[:120] or fallback + + +def _dump_runtime_prompt( + job_id: str, + section_key: str, + section_title: str, + system_prompt: str, + user_prompt: str, +) -> None: + """将本次实际调用大模型的完整提示词(含证据)写入 comp/runtime//.md。""" + try: + out_dir = _PROMPT_DUMP_ROOT / job_id + out_dir.mkdir(parents=True, exist_ok=True) + safe_key = re.sub(r'[\\/:*?"<>|]', "_", section_key) + out_path = out_dir / f"{safe_key}.md" + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + content = ( + f"# {section_title}\n\n" + f"> job_id: `{job_id}` \n" + f"> section_key: `{section_key}` \n" + f"> 生成时间: {ts}\n\n" + "---\n\n" + "## System Prompt\n\n" + f"```\n{system_prompt}\n```\n\n" + "---\n\n" + "## User Prompt\n\n" + f"```\n{user_prompt}\n```\n" + ) + out_path.write_text(content, encoding="utf-8") + 
def _dump_report_chapter_json_markdown(
    *,
    job_id: str,
    section_key: str,
    section_title: str,
    output_json: dict[str, Any],
) -> Optional[str]:
    """Write a chapter's final JSON output to a Markdown file for inspection.

    The file is created at ``_REPORT_OUTPUT_DUMP_ROOT / job_id`` and named
    after the sanitised section title (falling back to the sanitised section
    key). It contains a small header followed by the JSON in a fenced block.
    The file is written unconditionally, so an existing file with the same
    name is overwritten.

    Args:
        job_id: Job identifier; used as the sub-directory name.
        section_key: Section identifier; used in the header and as the
            file-name fallback.
        section_title: Section title; used in the header and the file name.
        output_json: Chapter output to serialise; a falsy value is written
            as ``{}``.

    Returns:
        The written file path as a string, or ``None`` when any step raised.
        The failure is logged as a warning and not propagated.
    """
    try:
        out_dir = _REPORT_OUTPUT_DUMP_ROOT / job_id
        out_dir.mkdir(parents=True, exist_ok=True)
        safe_title = _safe_markdown_filename(section_title, fallback=_safe_markdown_filename(section_key))
        out_path = out_dir / f"{safe_title}.md"
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # default=str: values json cannot encode natively are written via str().
        json_text = json.dumps(output_json or {}, ensure_ascii=False, indent=2, default=str)
        content = (
            f"# {section_title}\n\n"
            f"> job_id: `{job_id}` \n"
            f"> section_key: `{section_key}` \n"
            f"> 写入时间: {ts}\n\n"
            "```json\n"
            f"{json_text}\n"
            "```\n"
        )
        out_path.write_text(content, encoding="utf-8")
        return str(out_path)
    except Exception as exc:
        # Best-effort debug dump: never let a failed write break the caller.
        logger.warning("dump report chapter json markdown failed: %s", exc)
        return None
extract_appendix_figure_candidates_from_docx, + merge_best_appendix_figures, +) +from services.kb_service import _kb_doc_absolute_file_path_for_model +from services.report_runtime_store import ( + append_chapter_content, + get_job_state, + init_job_state, + set_chapter_stream_phase, + update_chapter_state, + update_job_state, +) +from services.standard_elements_2020 import ( + CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP, + MULTI_COLUMN_GLOBAL_SPECS, + APPENDIX2_CANONICAL_ROW_ORDER, + APPENDIX2_LEGACY_ROW_KEY_MAP, + APPENDIX8_LEGACY_ROW_KEY_MAP, + TABLE_5_3_ROW_KEY_ALTERNATES, + canonical_row_order_for_table, + TABLE_7_1_COLUMN_KEYS, + TABLE_7_1_ROW_CELL_DEFAULTS, + TABLE_7_1_SCORING_TABLE_NAME, + global_table_row_keys, + section_table_row_keys, + time_table_default_columns_for_name, +) +from prompts.report_generation.section_output_contracts import ( + DEFAULT_SECTION_OUTPUT_CONTRACT, + SECTION_OUTPUT_CONTRACTS, +) +from prompts.report_generation.heading_rules import ( + DEFAULT_HEADING_RULE, + SECTION_HEADING_RULES, +) +from prompts.report_generation.appendix_templates import ( + APPENDIX8_PARAMETER_COMPARISON_TABLE, + APPENDIX_FIGURE_TARGETS, + MINIMAL_MISSING_TABLE_TEMPLATE, + missing_child_heading_markdown, +) + +RUNNING_CHAPTER_STALE_SECONDS = 180 + +# 同一表号存在多张历史/别名表时,优先命中该表号的标准表名关键词,避免误选。 +_TABLE_TOKEN_PREFERRED_NAME_HINTS: dict[str, tuple[str, ...]] = { + "表2-5": ("总图、储运、公用工程及辅助工程对比",), + "表2-6": ("储运、公用工程及辅助工程依托对比", "依托"), + "表3-3": ("施工图设计变更情况", "全厂性项目"), + "表3-4": ("施工图设计变更情况", "单装置项目"), + "表3-5": ("影响投资或工期", "重大设计变更"), + "表5-4": ("生产经营及效益情况对比表",), + "表5-5": ("主要生产经营指标",), + "表5-6": ("不同因素变化对项目内部收益率的影响",), + "表5-7": ("内部收益率为基准收益率时不确定因素临界点或临界值",), +} + +# 表5-4 列键形如「可研报告|××年#1」:须与附表时间槽区分,且不可走「可研报告」前缀拆行,否则会生成「可研报告-|××年#1」错位表头。 +_TABLE54_PIPE_METRIC_PREFIXES = frozenset( + {"可研报告", "可研值", "实际值", "增减(%)", "增减", "指标"} +) +# 与表5-1 等混同步入的非细则列,直出时剔除 +_TABLE54_DROP_COL_KEYS = frozenset({"后评价值", "后评价报告"}) +_TABLE54_INVISIBLE_RE = re.compile(r"[\ufeff\u200b-\u200d]") + + 
+def _table54_ck_norm(ck: str) -> str: + """列键 NFKC 与去空白、BOM,便于识别误写入的「unit」全角变体等。""" + t = unicodedata.normalize("NFKC", str(ck or "")).strip() + return _TABLE54_INVISIBLE_RE.sub("", t) + + +def _is_table54_operating_benefit(table_name: str) -> bool: + tn = str(table_name or "").strip() + return "表5-4" in tn and "生产经营及效益情况对比表" in tn + + +def _element_table_collect_score(db: Session, table: ElementTable, token: str) -> int: + """报告生成选表:表5-4 须优先时间表且列键为「可研报告|××年#1」结构,避免误选抽取简表。""" + if not _table_token_matches_name(token, "表5-4"): + return 0 + score = 0 + if str(table.table_type or "").strip() == "time": + score += 200 + name = str(table.table_name or "") + if "生产经营及效益" in name: + score += 40 + sample = ( + db.query(ElementCell.row_key, ElementCell.col_key) + .filter( + ElementCell.table_id == table.id, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .limit(48) + .all() + ) + for rk, ck in sample: + rk_s, ck_s = str(rk or ""), str(ck or "") + if "|" in ck_s and any( + p in ck_s for p in ("可研报告", "实际值", "增减") + ): + score += 8 + if "·" in rk_s: + score += 2 + if "年份未识别" in rk_s or "年份未识别" in ck_s: + score -= 40 + return score + + +def _pick_table54_year_markdown( + year_items: list[tuple[str, str]], + *, + table_year: int | None = None, +) -> tuple[str, str] | None: + """多张按年拆分的表5-4 取评价年(优先 element_tables.year / 2019)且表体最完整的一张。""" + if not year_items: + return None + if len(year_items) == 1: + return year_items[0] + + def _item_score(item: tuple[str, str]) -> int: + disp, md = item + sc = 0 + if table_year is not None and str(table_year) in str(disp): + sc += 120 + if "2019" in str(disp) or re.search(r"2019\s*年", md[:800]): + sc += 80 + if "可研报告" in md and "实际值" in md: + sc += 70 + if "增减" in md: + sc += 25 + if "运行情况·" in md or "主要经济指标·" in md: + sc += 35 + if "主要经济指标-" in md and "可研报告" not in md: + sc -= 60 + sc += min(md.count("\n|"), 60) + return sc + + return max(year_items, key=_item_score) + + +def _score_structured_table_hit_dict(hit: dict) -> 
int: + """structuredTables 条目评分:完整表5-4 对比表优先于 LLM 三行简表。""" + if not isinstance(hit, dict): + return 0 + md = str(hit.get("markdown") or "") + if not md: + return 0 + if _is_table54_simplified_extract_body(md): + return 0 + sc = 0 + if "可研报告" in md and "实际值" in md: + sc += 90 + if "增减" in md: + sc += 25 + if "运行情况·" in md or "主要经济指标·" in md: + sc += 40 + if "主要经济指标-" in md and "可研报告" not in md: + sc -= 70 + sc += min(md.count("\n|"), 80) + return sc + + +def _table54_body_preceded_by_element_source(text_before: str, *, max_chars: int = 600) -> bool: + """表体紧邻前是否已有要素直出注释(有则视为权威表5-4,勿删勿换)。""" + tail = str(text_before or "")[-max_chars:] + if "表格来源:要素管理" not in tail: + return False + after = tail.rsplit("表格来源:要素管理", 1)[-1] + chunk = after.split("\n", 8)[-1] + return not any( + ln.strip().startswith("|") or _is_pipe_markdown_table_row_line(ln) + for ln in chunk.splitlines()[:6] + if ln.strip() + ) + + +def _is_table54_simplified_extract_body(block: str) -> bool: + """ + 识别抽取/LLM 三行简表:仅「2019年实际值」等单列 + 少量「主要经济指标·」行, + 无「可研报告|…」与「增减」对比结构。 + """ + md = str(block or "").strip() + if not md or "|" not in md: + return False + hdr = re.sub(r"\s+", "", _extract_table_header_key(md)).lower() + hdr = re.sub(r"
def _reorder_table54_col_order(col_order: list[str]) -> list[str]:
    """Return the Table 5-4 column keys in display order.

    Each key is normalised with ``_table54_ck_norm``. Empty keys, keys equal
    (case-insensitively) to ``unit`` or ``__unit__``, and keys listed in
    ``_TABLE54_DROP_COL_KEYS`` are discarded. The key 指标单位 is renamed to
    单位, and duplicates keep their first occurrence.

    The result always starts with 单位, even when the input had no unit
    column. The remaining keys are sorted: keys of the form ``group|slot``
    are ordered by slot, then by group rank (可研报告/指标/可研值, then 实际值,
    then anything starting with 增减, then the rest), then by the key itself.
    Keys without ``|`` use ``"\\xff"`` as their slot value.

    Args:
        col_order: Column keys in their stored order.

    Returns:
        The cleaned, reordered list of column keys.
    """
    kept: dict[str, None] = {}
    for raw in col_order:
        key = _table54_ck_norm(raw)
        if not key or key.lower() in ("unit", "__unit__") or key in _TABLE54_DROP_COL_KEYS:
            continue
        # Dict keys keep insertion order, so the first occurrence wins.
        kept.setdefault("单位" if key == "指标单位" else key, None)

    rank_by_group = {"可研报告": 0, "指标": 0, "可研值": 0, "实际值": 1}

    def _order(key: str) -> tuple[str, int, str]:
        group, sep, slot = key.partition("|")
        if not sep:
            return ("\xff", 99, key)
        label = group.strip()
        rank = rank_by_group.get(label)
        if rank is None:
            rank = 2 if label.startswith("增减") else 9
        return (slot.strip(), rank, key)

    metrics = sorted((key for key in kept if key != "单位"), key=_order)
    return ["单位", *metrics]
def _parse_real_year_from_table54_slot_tail(tail: str) -> int | None:
    """Extract a real calendar year from the tail of a Table 5-4 column key.

    Accepted forms are ``2019``, ``2019#1``, ``2019年``, ``2019年#1`` and any
    other tail that starts with four digits followed by ``年``. The bare
    ``2019`` form was documented before but rejected, because both patterns
    required ``年``. The two earlier patterns are merged into one; the strict
    one was fully covered by the loose one.

    Args:
        tail: Text after the ``|`` in a column key; ``None`` or empty is
            treated as ``""``.

    Returns:
        The four-digit year when it lies in 1900..2100. ``None`` for
        placeholder tails (as decided by ``_table54_placeholder_year_tail``),
        for tails that do not start with a year, and for years out of range.
    """
    t = (tail or "").strip()
    if _table54_placeholder_year_tail(t):
        return None
    # Four digits, then either 年 (anything may follow) or an optional "#n" at the end.
    m = re.match(r"(\d{4})(?:年|(?:#\d+)?$)", t)
    if not m:
        return None
    year = int(m.group(1))
    return year if 1900 <= year <= 2100 else None
1: + return ys[0] + if table_year is not None and int(table_year) > 0: + return int(table_year) + parsed: list[int] = [] + for ck in col_order: + s = str(ck or "").strip() + if "|" not in s: + continue + _, tail = s.split("|", 1) + cy = _parse_real_year_from_table54_slot_tail(tail.strip()) + if cy is not None: + parsed.append(cy) + uniq = sorted(set(parsed)) + if len(uniq) == 1: + return uniq[0] + # 列键正文含四位年(如「2019年可研报告」类裸列名);多列多年份时不武断取第一个 + text_years: list[int] = [] + for ck in col_order: + m = re.search(r"(19|20)\d{2}", str(ck or "")) + if m: + yi = int(m.group(0)) + if 1900 <= yi <= 2100: + text_years.append(yi) + ty_uniq = sorted(set(text_years)) + if len(ty_uniq) == 1: + return ty_uniq[0] + return None + + +def _table54_year_label_prefix(time_column_year: int | None) -> str: + if time_column_year is not None and 1900 <= int(time_column_year) <= 2100: + return f"{int(time_column_year)}年" + return "某年" + + +def _table54_year_prefix_for_slot_tail(tail: str, *, time_column_year: int | None) -> str: + """表头「{年}可研报告」中的「{年}」:优先列键自带年份,否则用推断的日历年,最后退回「某年」。""" + cy = _parse_real_year_from_table54_slot_tail(tail) + if cy is not None: + return f"{cy}年" + if _table54_placeholder_year_tail(tail): + return _table54_year_label_prefix(time_column_year) + t = (tail or "").strip() + if not t: + return "某年" + if re.match(r"^\d{4}年", t): + return t.split("#", 1)[0] + return f"{t}年" if not t.endswith("年") else t + + +def _table54_bare_metric_header_label(col_key: str, *, time_column_year: int | None) -> str | None: + """无「指标|年度槽」时的列键:表头带评价年。""" + s = str(col_key or "").strip() + if not s: + return None + ypfx = _table54_year_label_prefix(time_column_year) + if s == "实际值": + return f"{ypfx}实际值" + if s in ("可研值", "可研报告"): + return f"{ypfx}可研报告" + if s.startswith("增减"): + rest = s[len("增减") :] + return f"{ypfx}增减{rest}" + return None + + +def _table54_markdown_header_labels( + col_order: list[str], + *, + time_column_year: int | None = None, +) -> list[str]: + 
"""扁平表头:单位、{年}可研报告、{年}实际值、{年}增减;{年}来自列键或要素日历年推断。""" + out: list[str] = [] + for ck in col_order: + s = str(ck).strip() + if s == "单位": + out.append("单位") + continue + if "|" not in s: + bare_l = _table54_bare_metric_header_label(s, time_column_year=time_column_year) + if bare_l is not None: + out.append(bare_l) + continue + out.append(s.replace("|", "|")) + continue + g, t = s.split("|", 1) + g, t = g.strip(), t.strip() + g_norm = _table54_ck_norm(g) + if (g not in _TABLE54_PIPE_METRIC_PREFIXES and g_norm not in _TABLE54_PIPE_METRIC_PREFIXES) or not t: + out.append(s.replace("|", "|")) + continue + ypfx = _table54_year_prefix_for_slot_tail(t, time_column_year=time_column_year) + if g_norm in ("可研报告", "指标", "可研值") or g in ("可研报告", "指标", "可研值"): + out.append(f"{ypfx}可研报告") + elif g_norm == "实际值" or g == "实际值": + out.append(f"{ypfx}实际值") + elif g_norm.startswith("增减") or g.startswith("增减"): + rest = g[len("增减") :] + out.append(f"{ypfx}增减{rest}") + else: + out.append(s.replace("|", "|")) + return out + + +def create_report_job( + project_id: str, + db: Session, + *, + template_id: Optional[str] = None, + top_k: int = 10, + requested_by: Optional[str] = None, +) -> GenerateReportJobItem: + project = _resolve_project(db, project_id) + if not project: + raise HTTPException(status_code=404, detail="项目不存在") + template = _resolve_template(db, template_id) + sections = _sections_for_generation(_list_template_sections(db, template.id)) + if not sections: + raise HTTPException(status_code=400, detail="模板未配置章节") + + now = datetime.now() + job = ReportGenerationJob( + id=uuid.uuid4().hex, + project_id=project.uuid, + template_id=template.id, + status="pending", + progress=0, + requested_by=requested_by, + options={"topK": max(5, min(int(top_k or 10), 20))}, + created_at=now, + updated_at=now, + ) + db.add(job) + # 先把父任务写入当前事务,确保后续章节插入满足外键约束。 + db.flush() + for s in sections: + db.add( + ReportGenerationChapter( + id=uuid.uuid4().hex, + job_id=job.id, + section_key=s.section_key, 
def get_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem:
    """Return the current status of a report generation job and its chapters.

    The project is resolved with ``_resolve_project`` and the job must belong
    to it. ``_recover_stalled_job`` is called before the chapter rows are
    read. In-memory runtime state from ``get_job_state`` overlays the
    persisted columns: for each overlaid field, a truthy runtime value wins
    and the database value is the fallback.

    Args:
        project_id: Project identifier accepted by ``_resolve_project``.
        job_id: Identifier of the generation job.
        db: Database session.

    Returns:
        The job with its chapters ordered by ``section_order``.

    Raises:
        HTTPException: 404 when the project or the job does not exist.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    job_query = db.query(ReportGenerationJob).filter(
        ReportGenerationJob.id == job_id,
        ReportGenerationJob.project_id == project.uuid,
    )
    job = job_query.first()
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")

    _recover_stalled_job(db, job)
    chapter_rows = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .order_by(ReportGenerationChapter.section_order.asc())
        .all()
    )

    runtime_state = get_job_state(job.id)
    live = runtime_state or {}
    live_chapters = (live.get("chapters") or {}) if isinstance(runtime_state, dict) else {}

    chapter_items = []
    for row in chapter_rows:
        # Per-chapter runtime overlay, keyed by section key.
        overlay = live_chapters.get(row.section_key) or {}
        chapter_items.append(
            GenerateReportChapterItem(
                sectionKey=row.section_key,
                sectionTitle=row.section_title,
                sectionOrder=row.section_order,
                status=overlay.get("status") or row.status,
                updatedAt=overlay.get("updatedAt") or _fmt_dt(row.updated_at),
                errorMessage=overlay.get("errorMessage") or row.error_message,
            )
        )

    return GenerateReportJobItem(
        jobId=job.id,
        projectId=job.project_id,
        templateId=job.template_id,
        status=live.get("status") or job.status,
        progress=int(live.get("progress") or job.progress or 0),
        currentSectionKey=live.get("currentSectionKey") or job.current_section_key,
        errorMessage=live.get("errorMessage") or job.error_message,
        createdAt=_fmt_dt(job.created_at),
        updatedAt=live.get("updatedAt") or _fmt_dt(job.updated_at),
        completedAt=live.get("completedAt") or _fmt_dt(job.completed_at),
        chapters=chapter_items,
    )
_build_live_result_from_chapters( + chapters: list[ReportGenerationChapter], + *, + include_debug: bool = False, + chapter_title_map: Optional[dict[str, str]] = None, +) -> tuple[str, list[GenerateReportResultChapter]]: + chapter_items: list[GenerateReportResultChapter] = [] + report_parts: list[str] = [] + title_map = chapter_title_map or {} + for i, c in enumerate(chapters): + normalized_content = _fix_numeric_line_breaks(str(c.content or "").strip()) if c.content else c.content + if normalized_content: + normalized_content, _ = _collapse_consecutive_text_repetitions( + str(normalized_content).strip() + ) + normalized_content = _normalize_table_captions_in_markdown(str(normalized_content).strip()) + prev_body = "" + if i > 0 and chapters[i - 1].content: + prev_body = _fix_numeric_line_breaks(str(chapters[i - 1].content).strip()) + normalized_content = _inject_missing_parent_section_headings( + str(c.section_title or ""), + str(normalized_content).strip(), + prev_body, + title_map, + ) + chapter_items.append( + GenerateReportResultChapter( + sectionKey=c.section_key, + sectionTitle=c.section_title, + sectionOrder=c.section_order, + status=c.status, + content=normalized_content, + errorMessage=c.error_message, + promptText=(c.prompt_text if include_debug else None), + evidencePayload=(c.evidence_payload if include_debug else None), + validationPayload=(c.validation_payload if include_debug else None), + ) + ) + if normalized_content: + report_parts.append(str(normalized_content).strip()) + return _fix_numeric_line_breaks("\n\n".join(report_parts).strip()), chapter_items + + +def _build_live_result_from_runtime( + runtime_state: dict, + *, + include_debug: bool = False, + chapter_title_map: Optional[dict[str, str]] = None, +) -> tuple[str, list[GenerateReportResultChapter]]: + chapter_items: list[GenerateReportResultChapter] = [] + report_parts: list[str] = [] + title_map = chapter_title_map or {} + chapter_values = list(((runtime_state or {}).get("chapters") or 
def get_report_stream_snapshot(
    job_id: str,
    *,
    include_debug: bool = False,
) -> Optional[dict[str, Any]]:
    """Build a streaming snapshot of a job purely from its runtime state.

    Args:
        job_id: Identifier of the report generation job.
        include_debug: Forwarded to ``_build_live_result_from_runtime``.

    Returns:
        ``None`` when no runtime state exists for the job; otherwise a dict with
        a ``"job"`` payload (job fields plus per-chapter status) and a
        ``"result"`` payload (report text plus chapter items). The
        ``"consistency"`` list in the result payload is always empty here.
    """
    state = get_job_state(job_id)
    if not state:
        return None

    title_map: dict[str, str] = {}
    template_id = state.get("templateId")
    if template_id:
        # A short-lived session is opened only to read the template sections.
        with SessionLocal() as db:
            title_map = _build_section_title_map(
                _list_template_sections(db, str(template_id))
            )

    report_text, chapter_items = _build_live_result_from_runtime(
        state,
        include_debug=include_debug,
        chapter_title_map=title_map,
    )

    def _section_order(entry: Any) -> int:
        return int((entry or {}).get("sectionOrder") or 0)

    ordered_chapters = sorted((state.get("chapters") or {}).values(), key=_section_order)

    chapter_summaries = []
    for entry in ordered_chapters:
        chapter_summaries.append(
            {
                "sectionKey": str(entry.get("sectionKey") or ""),
                "sectionTitle": str(entry.get("sectionTitle") or ""),
                "sectionOrder": int(entry.get("sectionOrder") or 0),
                "status": str(entry.get("status") or "pending"),
                "updatedAt": entry.get("updatedAt"),
                "errorMessage": entry.get("errorMessage"),
            }
        )

    # Assemble the job payload incrementally, keeping the key order stable.
    job_payload: dict[str, Any] = {
        field: state.get(field)
        for field in ("jobId", "projectId", "templateId", "status")
    }
    job_payload["progress"] = int(state.get("progress") or 0)
    for field in (
        "currentSectionKey",
        "errorMessage",
        "createdAt",
        "updatedAt",
        "completedAt",
    ):
        job_payload[field] = state.get(field)
    job_payload["chapters"] = chapter_summaries

    result_payload = {
        "jobId": state.get("jobId"),
        "status": state.get("status"),
        "report": report_text,
        "consistency": [],
        "chapters": [item.model_dump() for item in chapter_items],
    }
    return {"job": job_payload, "result": result_payload}
def cancel_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem:
    """Cancel a report generation job on behalf of the user.

    Chapters that are still pending or running are marked as failed with the
    user-cancel message; the job itself becomes ``cancelled``. Jobs already in
    a terminal state (completed / failed / cancelled) are returned unchanged.

    Only chapters carrying the user-cancel message are mirrored into the
    runtime state. Chapters that failed earlier with a genuine error keep
    their own runtime error message instead of having it overwritten.

    Args:
        project_id: Project identifier as accepted by ``_resolve_project``.
        job_id: Identifier of the job to cancel.
        db: Active database session.

    Returns:
        The refreshed job item as produced by ``get_report_job``.

    Raises:
        HTTPException: 404 when the project, or the job within it, is not found.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")
    job = (
        db.query(ReportGenerationJob)
        .filter(ReportGenerationJob.id == job_id, ReportGenerationJob.project_id == project.uuid)
        .first()
    )
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")

    # Terminal jobs are not modified, which makes cancelling idempotent.
    if job.status in ("completed", "failed", "cancelled"):
        return get_report_job(project.uuid, job_id, db)

    now = datetime.now()
    cancel_message = "任务已由用户取消"

    chapters = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .all()
    )
    # Section keys are collected before commit so the runtime-state sync below
    # does not need to re-read ORM attributes afterwards, and so that chapters
    # which failed for other reasons are left alone.
    cancelled_section_keys: list[str] = []
    for chapter in chapters:
        if chapter.status in ("pending", "running"):
            chapter.status = "failed"
            chapter.error_message = cancel_message
            chapter.updated_at = now
        if chapter.error_message == cancel_message:
            cancelled_section_keys.append(chapter.section_key)

    job.status = "cancelled"
    job.error_message = cancel_message
    job.current_section_key = None
    job.updated_at = now
    job.completed_at = now
    db.commit()

    update_job_state(
        job.id,
        status="cancelled",
        errorMessage=cancel_message,
        currentSectionKey=None,
        completedAt=_fmt_dt(now),
    )
    for section_key in cancelled_section_keys:
        update_chapter_state(
            job.id,
            section_key,
            status="failed",
            errorMessage=cancel_message,
        )
    return get_report_job(project.uuid, job_id, db)
logger.info( + "报告生成 job start | job=%s | project=%s | total_sections=%d | pending=%d | completed=%d | top_k=%d", + job.id, project.uuid, len(sections), len(pending_sections), completed_count, top_k, + ) + for idx, section in enumerate(pending_sections, start=1): + db.refresh(job) + if job.status == "cancelled": + return + chapter = chapter_map.get(section.section_key) + if not chapter: + continue + + section_no = _extract_section_number(section.section_title or "") + logger.info( + "报告生成 start chapter | job=%s | section=%s | title=%s | section_no=%s | idx=%d/%d", + job.id, section.section_key, section.section_title, section_no, idx, total_count, + ) + + _update_chapter_status(db, job, chapter, "running", None) + update_job_state(job.id, currentSectionKey=section.section_key) + update_chapter_state( + job.id, + section.section_key, + status="running", + errorMessage=None, + content=None, + promptText=None, + evidencePayload=None, + validationPayload={"streamPhase": "waiting"}, + ) + required_tables = _extract_required_table_tokens( + section.section_prompt or "", + _extract_section_number(section.section_title or ""), + contract_text=_effective_section_output_contract(section), + ) + if _extract_section_number(section.section_title or "") == "5.3.2": + na8 = _norm_table_token("附表8") + required_tables = [ + t for t in required_tables if _norm_table_token(str(t)) != na8 + ] + evidence, retrieval_stage = _collect_evidence_progressive( + db, + retrieval, + project.uuid, + section, + top_k=top_k, + required_tables=required_tables, + ) + prior_sibling_sections_text = _build_prior_sibling_sections_text( + section, + sections, + completed_section_contents, + ) + section_reference = _load_section_reference_for_chapter( + db, + section.section_key, + section.section_title, + template_id=template.id, + ) + logger.info( + "section_reference 注入 | section=%s | template_id=%s | 命中=%s", + section.section_key, + template.id, + "是" if section_reference else "否", + ) + prompt = 
_build_chapter_prompt( + section, + evidence, + prior_sibling_sections_text=prior_sibling_sections_text, + section_reference=section_reference, + ) + _dump_runtime_prompt( + job_id=job.id, + section_key=section.section_key, + section_title=section.section_title, + system_prompt=chapter_generation_system_prompt(), + user_prompt=prompt, + ) + stream_state = { + "buffer": "", + "phase": "waiting", + } + + def _on_content_delta(event: str, delta_text: str) -> None: + if event == "delta": + if delta_text: + stream_state["phase"] = "streaming" + stream_state["buffer"] = str(stream_state.get("buffer") or "") + delta_text + append_chapter_content( + job.id, + section.section_key, + delta_text, + stream_phase="streaming", + ) + elif event == "finalizing": + stream_state["phase"] = "finalizing" + set_chapter_stream_phase(job.id, section.section_key, "finalizing") + + content, validation, model_output = _generate_chapter_content( + section, + prompt, + on_content_delta=_on_content_delta, + ) + content = _apply_canonical_field_backfill(section, evidence, content) + _cur_section_no = _extract_section_number(section.section_title or "") + _skip_table_enforcement = _cur_section_no in {"2.1.1"} + if _skip_table_enforcement: + remaining_missing_tables = [] + content_after_tables = content + else: + content, remaining_missing_tables = _enforce_required_tables( + section, + prompt, + content, + evidence, + ) + content_after_tables = content + content = _strip_tables_from_non_table_section( + section.section_title or "", content, section=section + ) + content = _strip_forbidden_tables( + section.section_title or "", content, + ) + content, format_issues = _enforce_template_format_contract( + section, + content, + evidence, + chapter_title_map=chapter_title_map, + ) + _sec_no = _extract_section_number(section.section_title or "") + _refresh_tokens: tuple[str, ...] 
= ("表5-4",) + if _sec_no == "5.1": + _refresh_tokens = ("表5-1",) + elif _sec_no == "5.2.1": + _refresh_tokens = ("表5-2", "表5-3") + content = _refresh_element_table_markdown_tokens( + content, evidence, _refresh_tokens + ) + content = _strip_bracketed_three_part_labels(content) + content = _strip_placeholder_table_notes(content) + content = _normalize_table_captions_in_markdown(content) + content = _strip_trailing_partial_missing_markers(content) + content = _fix_numeric_line_breaks(content) + content = _cleanup_section_table_artifacts( + section.section_title or "", + content, + allowed_table_tokens=required_tables, + ) + if _sec_no == "5.3.1": + content = _refresh_element_table_markdown_tokens( + content, evidence, ("表5-4",) + ) + content = _fill_required_table_caption_stubs( + content, ["表5-4"], evidence + ) + content = _strip_orphan_markdown_table_rows(content) + content = _strip_minimal_missing_table_tail(content) + content, intra_repeat_removed = _collapse_consecutive_text_repetitions(content) + content, chapter_dedupe_removed = _dedupe_long_chapter_repetition(content) + chapter_dedupe_removed += intra_repeat_removed + if chapter_dedupe_removed > 0: + warnings = validation.get("warnings") if isinstance(validation, dict) else [] + if not isinstance(warnings, list): + warnings = [] + warnings.append( + f"章节去重:已移除 {chapter_dedupe_removed} 处重复段落/表格" + ) + validation["warnings"] = warnings + validation["chapterDedupeRemoved"] = chapter_dedupe_removed + if required_tables and not _skip_table_enforcement: + content = _restore_required_tables_safety_net( + content, + required_tables, + evidence, + content_after_tables, + ) + content = _finalize_section_table_dedupe(content, required_tables) + if remaining_missing_tables: + warnings = validation.get("warnings") if isinstance(validation, dict) else [] + if not isinstance(warnings, list): + warnings = [] + warnings.append( + "部分必需表格仍缺失,已插入占位表:" + + "、".join(remaining_missing_tables) + ) + validation["warnings"] = 
warnings + if format_issues: + warnings = validation.get("warnings") if isinstance(validation, dict) else [] + if not isinstance(warnings, list): + warnings = [] + warnings.extend([f"格式验收器:{x}" for x in format_issues][:8]) + validation["warnings"] = warnings + validation["retrievalStage"] = retrieval_stage + validation["streamPhase"] = "completed" + diagnostics = _build_field_diagnostics(section, evidence, content) + if diagnostics: + validation["fieldDiagnostics"] = diagnostics + if model_output: + validation["modelOutput"] = model_output + content = _inject_missing_parent_section_headings( + section.section_title or "", + content, + _previous_completed_section_content( + section, sections, completed_section_contents + ), + chapter_title_map, + ) + now = datetime.now() + chapter.content = content + completed_section_contents[section.section_key] = content + chapter.prompt_text = prompt[:20000] + chapter.evidence_payload = evidence + chapter.validation_payload = validation + chapter.status = "completed" + chapter.error_message = None + chapter.updated_at = now + chapter.completed_at = now + if not only_section_key: + job.progress = int((completed_count + idx) * 100 / total_count) + job.current_section_key = section.section_key + job.updated_at = now + db.commit() + dump_out_path = _dump_report_chapter_json_markdown( + job_id=job.id, + section_key=section.section_key, + section_title=section.section_title, + output_json={ + "modelOutput": model_output or {}, + "persistedChapter": { + "sectionKey": section.section_key, + "sectionTitle": section.section_title, + "sectionOrder": section.section_order, + "status": "completed", + "content": content, + "promptText": prompt[:20000], + "evidencePayload": evidence, + "validationPayload": validation, + }, + }, + ) + logger.info( + "章节生成落盘 | job=%s | section=%s | prompt_len=%s | content_len=%s | output_file=%s", + job.id, section.section_key, len(prompt[:20000]), len(content), + dump_out_path or "(已存在合并写入)", + ) + 
update_chapter_state( + job.id, + section.section_key, + status="completed", + content=content, + errorMessage=None, + promptText=prompt[:20000], + evidencePayload=evidence, + validationPayload=validation, + ) + if not only_section_key: + update_job_state( + job.id, + progress=int((completed_count + idx) * 100 / total_count), + currentSectionKey=section.section_key, + ) + else: + update_job_state(job.id, currentSectionKey=section.section_key) + + db.refresh(job) + if job.status == "cancelled": + return + + db.refresh(job) + if job.status == "cancelled": + return + + if only_section_key: + # 单章重跑不应直接终结整任务,仅回写章节并刷新任务进度。 + all_chapters = ( + db.query(ReportGenerationChapter) + .filter(ReportGenerationChapter.job_id == job.id) + .order_by(ReportGenerationChapter.section_order.asc()) + .all() + ) + done = sum(1 for c in all_chapters if c.status == "completed") + total = max(1, len(all_chapters)) + job.progress = int(done * 100 / total) + job.current_section_key = None + job.updated_at = datetime.now() + db.commit() + update_job_state(job.id, progress=int(done * 100 / total), currentSectionKey=None) + else: + job.snapshot = None + job.status = "completed" + job.progress = 100 + job.current_section_key = None + job.completed_at = datetime.now() + job.updated_at = datetime.now() + db.commit() + logger.info( + "报告生成 job completed | job=%s | project=%s | total_chapters=%d", + job.id, project.uuid, len(sections), + ) + update_job_state( + job.id, + status="completed", + progress=100, + currentSectionKey=None, + completedAt=_fmt_dt(job.completed_at), + ) + except Exception as e: + current_section_key = job.current_section_key + logger.error( + "报告生成 job failed | job=%s | project=%s | section=%s | err=%s", + job.id, project.uuid, current_section_key, e, + ) + job.status = "failed" + job.error_message = str(e) + job.updated_at = datetime.now() + db.commit() + update_job_state(job.id, status="failed", errorMessage=str(e)) + if current_section_key: + update_chapter_state( + 
def _collect_evidence_progressive(
    db: Session,
    retrieval: RetrievalService,
    project_uuid: str,
    section: ReportTemplateSection,
    *,
    top_k: int,
    required_tables: Optional[list[str]] = None,
) -> tuple[dict, str]:
    """Collect evidence in up to three escalating stages.

    Stage 1 gathers evidence without chapter or keyword documents, stage 2 adds
    chapter documents, and stage 3 adds keyword documents as well. The first
    stage whose evidence is judged sufficient is returned, unless the section
    is flagged by ``_section_forces_l3_keyword_retrieval``, in which case all
    stages run and the stage-3 result is returned.

    Returns:
        A ``(evidence, stage_name)`` tuple where ``stage_name`` is one of
        ``"elements_only"``, ``"elements_plus_chapter_docs"`` or
        ``"elements_plus_chapter_and_keyword_docs"``.
    """
    always_escalate = _section_forces_l3_keyword_retrieval(section)

    def _gather(*, chapter_docs: bool, keyword_docs: bool) -> dict:
        return _collect_evidence(
            db,
            retrieval,
            project_uuid,
            section,
            top_k=top_k,
            required_tables=required_tables,
            include_chapter_docs=chapter_docs,
            include_keyword_docs=keyword_docs,
        )

    # Stages that may end the escalation early: (include chapter docs?, label).
    early_stages = (
        (False, "elements_only"),
        (True, "elements_plus_chapter_docs"),
    )
    for with_chapter_docs, stage_name in early_stages:
        evidence = _gather(chapter_docs=with_chapter_docs, keyword_docs=False)
        if _is_evidence_sufficient(section, evidence) and not always_escalate:
            return evidence, stage_name

    # Final fallback: chapter documents plus keyword retrieval.
    evidence = _gather(chapter_docs=True, keyword_docs=True)
    return evidence, "elements_plus_chapter_and_keyword_docs"
def _merge_section_11_forced_elements(
    forced_payloads: list[dict[str, Any]],
    scored_top_payloads: list[dict[str, Any]],
    *,
    max_additional_scored: int = 40,
) -> list[dict[str, Any]]:
    """Merge forced element payloads with relevance-scored payloads (section 1.1).

    Forced payloads come first, de-duplicated by ``(rowKey, colKey)``. Scored
    payloads whose key has not been seen yet are then appended in order, up to
    ``max_additional_scored`` entries.

    Args:
        forced_payloads: Payload dicts that must always be included.
        scored_top_payloads: Payload dicts already ordered by relevance.
        max_additional_scored: Upper bound on how many scored payloads are
            appended. Zero or a negative value appends none.

    Returns:
        A new list; the input lists are not modified.
    """

    def _cell_key(payload: dict[str, Any]) -> tuple[str, str]:
        return (str(payload.get("rowKey") or ""), str(payload.get("colKey") or ""))

    seen: set[tuple[str, str]] = set()
    merged: list[dict[str, Any]] = []
    for payload in forced_payloads:
        key = _cell_key(payload)
        if key in seen:
            continue
        seen.add(key)
        merged.append(payload)

    # The cap is checked before appending so that a cap of zero really
    # means "no scored payloads at all".
    remaining = max(0, max_additional_scored)
    for payload in scored_top_payloads:
        if remaining <= 0:
            break
        key = _cell_key(payload)
        if key in seen:
            continue
        seen.add(key)
        merged.append(payload)
        remaining -= 1
    return merged
_extract_tokens(f"{section.section_title} {section.section_prompt or ''} {extra}") + tokens = list(dict.fromkeys(merged))[:20] + cells_query = ( + db.query(ElementCell, ElementTable.table_name) + .join(ElementTable, ElementTable.id == ElementCell.table_id) + .filter( + ElementCell.project_id == project_uuid, + ElementTable.project_id == project_uuid, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .order_by(ElementCell.updated_at.desc()) + ) + candidate_cells: list[tuple[int, dict]] = [] + for cell, table_name in cells_query.limit(800).all(): + payload = { + "tableId": cell.table_id, + "tableName": table_name, + "rowKey": cell.row_key, + "colKey": cell.col_key, + "year": cell.year, + "value": str(cell.value or "")[:500], + "sourceDocumentId": cell.source_document_id, + "sourceType": cell.source_type, + } + score = _score_element_cell_relevance( + section.section_title, + tokens, + payload.get("rowKey"), + payload.get("colKey"), + payload.get("value"), + table_name=payload.get("tableName"), + section=section, + ) + # 无 token 命中但字段语义强相关时仍保留(例如 value 内存在“项目名称:xxx”)。 + if score > 0: + candidate_cells.append((score, payload)) + elif not tokens: + candidate_cells.append((1, payload)) + candidate_cells.sort(key=lambda x: x[0], reverse=True) + matched_cells = [x[1] for x in candidate_cells[:40]] + # 1.1「项目基本情况」:强制并入「章节要素-第1章项目概况」全部非空格子,避免被全局 Top40 相关性截断挤出导致建设投资等待补充。 + if _extract_section_number(section.section_title or "") == "1.1": + ch1_rows = section_table_row_keys(CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP) + forced_ch1 = _latest_element_payloads_by_row_col(db, project_uuid, ch1_rows, non_empty_value=True) + if forced_ch1: + matched_cells = _merge_section_11_forced_elements(forced_ch1, matched_cells, max_additional_scored=40) + required = [str(t) for t in (required_tables or []) if str(t).strip()] + structured_tables = _collect_structured_tables( + db, + project_uuid, + required, + section_title=str(section.section_title or ""), + section_tokens=tokens, + 
def _is_evidence_sufficient(section: ReportTemplateSection, evidence: dict) -> bool:
    """Decide whether the collected evidence is enough to stop escalating retrieval.

    The checks run in order and the first failing one returns ``False``:
    structured-table coverage for required tables, a minimum amount of
    element or document evidence, a higher bar for titles containing "表",
    and keyword-group coverage for the 1.1 and 1.2 sections.

    Args:
        section: Template section; only ``section_title`` is read here.
        evidence: Evidence dict; a non-dict value is treated as empty.

    Returns:
        ``True`` when every applicable check passes.
    """

    def _size_of(field: str) -> int:
        payload = evidence.get(field) if isinstance(evidence, dict) else []
        return len(payload) if isinstance(payload, list) else 0

    required_total = _size_of("requiredTables")
    structured_total = _size_of("structuredTables")
    element_total = _size_of("elements")
    chapter_doc_total = _size_of("chapterDocs")
    keyword_doc_total = _size_of("keywordDocs")

    # Required tables must be covered by structured tables (at most 2 needed).
    if required_total > 0 and structured_total < min(required_total, 2):
        return False

    # With few elements, at least one kind of document evidence is needed.
    if element_total < 6 and chapter_doc_total == 0 and keyword_doc_total == 0:
        return False

    # Table-related titles need a structured table or a denser element set.
    heading = str(section.section_title or "")
    if "表" in heading and structured_total == 0 and element_total < 10:
        return False

    # Key sections are judged by field coverage rather than raw counts.
    compact_heading = re.sub(r"\s+", "", heading)
    if "1.1项目基本情况" in compact_heading:
        mandatory_groups = [
            ["建设单位", "建设单位名称"],
            ["建设地点", "厂址"],
            ["建设规模", "装置规模", "能力", "万吨/年"],
            ["投资", "概算", "估算", "决算"],
        ]
        if not all(_evidence_contains_any_fact(evidence, group) for group in mandatory_groups):
            return False

    if "1.2项目决策要点" in compact_heading:
        topic_groups = [
            ["国vi", "国ⅵ", "质量升级", "汽油标准", "环保", "环评", "排放", "清洁生产"],
            ["高标号", "辛烷值", "汽油池", "产品结构", "汽油"],
            ["碳四", "液化气", "原料", "物料平衡", "资源利用", "附加值"],
            ["杂质", "预处理", "丁二烯", "选择性加氢", "催化剂", "甲醇", "二甲醚"],
            ["万吨", "产量", "烷基化油", "效益", "利润", "营业收入", "预期", "目标"],
        ]
        matched_groups = sum(
            1 for group in topic_groups if _evidence_contains_any_fact(evidence, group)
        )
        # At least two of the topic groups must be backed by evidence.
        if matched_groups < 2:
            return False

    return True
def _evidence_contains_any_fact(evidence: dict, keywords: list[str]) -> bool:
    """Return True when any keyword occurs in the evidence, case-insensitively.

    Element rows are searched first (row key, column key and value), skipping
    rows whose value is judged missing by ``_is_missing_like``. Then the first
    12 entries of ``chapterDocs`` and of ``keywordDocs`` are searched by
    heading plus content.

    Args:
        evidence: Evidence dict; any non-dict value yields ``False``.
        keywords: Candidate keywords; blank entries are ignored.
    """
    if not isinstance(evidence, dict):
        return False
    needles = [kw for kw in (str(k).strip().lower() for k in keywords) if kw]
    if not needles:
        return False

    def _mentions(*haystacks: str) -> bool:
        return any(needle in hay for needle in needles for hay in haystacks)

    element_rows = evidence.get("elements")
    if not isinstance(element_rows, list):
        element_rows = []
    for cell in element_rows:
        if not isinstance(cell, dict):
            continue
        row_text = str(cell.get("rowKey") or "").lower()
        col_text = str(cell.get("colKey") or "").lower()
        raw_value = str(cell.get("value") or "").strip()
        if _is_missing_like(raw_value):
            continue
        if _mentions(row_text, col_text, raw_value.lower()):
            return True

    for field_name in ("chapterDocs", "keywordDocs"):
        doc_list = evidence.get(field_name)
        if not isinstance(doc_list, list):
            continue
        # Only the leading 12 documents of each kind are inspected.
        for doc in doc_list[:12]:
            if not isinstance(doc, dict):
                continue
            heading = str(doc.get("heading") or "")
            body = str(doc.get("content") or "")
            if _mentions((heading + " " + body).lower()):
                return True
    return False
def _build_chapter_prompt(
    section: ReportTemplateSection,
    evidence: dict,
    *,
    prior_sibling_sections_text: str = "",
    section_reference: str = "",
) -> str:
    """Assemble the user prompt used to generate one chapter.

    Collects the per-section inputs (selected example, output contract,
    heading rule, expected fields, rendered tables and canonical fields) and
    forwards them to ``build_report_chapter_prompt``.

    Args:
        section: Template section being generated.
        evidence: Evidence payload; it is also serialised verbatim as JSON.
        prior_sibling_sections_text: Passed through unchanged.
        section_reference: Passed through unchanged.

    Returns:
        Whatever ``build_report_chapter_prompt`` returns.
    """
    title = section.section_title
    example = _select_chapter_example(title, section.examples, evidence)
    contract = _effective_section_output_contract(section)
    number = _extract_section_number(title)
    rule = SECTION_HEADING_RULES.get(number, DEFAULT_HEADING_RULE)
    fields = _section_expected_fields(title, section)

    prompt_kwargs = {
        "section_title": title,
        "section_prompt": _effective_section_prompt_for_generation(section, contract),
        # Fall back to the "none" marker when no required table is listed.
        "required_tables_text": "、".join(evidence.get("requiredTables") or []) or "无",
        "structured_tables_text": _render_structured_tables_for_prompt(evidence),
        # An empty expected-field list means "do not filter".
        "canonical_fields_text": _render_canonical_fields_for_prompt(
            evidence, allowed_fields=fields or None
        ),
        "selected_example": example,
        "heading_rule": rule,
        "section_contract": contract,
        "evidence_json": json.dumps(evidence, ensure_ascii=False),
        "prior_sibling_sections_text": prior_sibling_sections_text,
        "section_reference": section_reference,
    }
    return build_report_chapter_prompt(**prompt_kwargs)
def _normalize_ordered_item_markers(content: str) -> str:
    """Rewrite line-leading list markers into a uniform ``n)`` form.

    The text is first passed through ``_strip_inline_evidence_labels``. The
    first non-empty line is left untouched so the section title is never
    renumbered. On every other line a leading marker (optionally preceded by
    Markdown ``#`` / ``-`` / ``*`` prefixes) is converted:

    * Chinese numerals written as ``一、`` or as a parenthesised numeral
      (ASCII or full-width parentheses) become ``1)``;
    * Arabic markers ``1.`` / ``1.2.`` / ``3.1`` followed by whitespace become
      ``n) `` where ``n`` is the last dotted segment (``1.2`` -> ``2)``).

    Args:
        content: Chapter text; ``None`` is treated as an empty string.

    Returns:
        The text with markers normalised, lines re-joined with ``\\n``.
    """
    text = _strip_inline_evidence_labels(str(content or ""))
    if not text:
        return text
    cn_num_to_idx = {
        "一": 1,
        "二": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
        "十": 10,
    }

    lines = text.splitlines()
    # Index of the first non-empty line (the section title), or -1 if none.
    first_non_empty_idx = next((i for i, ln in enumerate(lines) if ln.strip()), -1)

    # Group 2: numeral wrapped in parentheses; group 3: numeral followed by "、".
    # The parentheses must be literal characters. Without them a bare leading
    # numeral (ordinary prose) matches and "一、" is rewritten to "1)、".
    cn_pattern = re.compile(
        r"^(\s*(?:#+\s*)?(?:[-*]\s*)?)"
        r"(?:[(\uff08]([一二三四五六七八九十])[)\uff09]|([一二三四五六七八九十])、)\s*"
    )
    ar_pattern = re.compile(r"^(\s*(?:#+\s*)?(?:[-*]\s*)?)(\d+(?:\.\d+)*)(?:\.)?\s+")

    def _replace_line(ln: str) -> str:
        m_cn = cn_pattern.match(ln)
        if m_cn:
            prefix = m_cn.group(1) or ""
            cn = m_cn.group(2) or m_cn.group(3) or ""
            idx = cn_num_to_idx.get(cn)
            if idx:
                # Both patterns are anchored at the line start, so splicing
                # after the match is the same as a count=1 substitution.
                return f"{prefix}{idx})" + ln[m_cn.end():]
            return ln

        m_ar = ar_pattern.match(ln)
        if m_ar:
            prefix = m_ar.group(1) or ""
            seq = m_ar.group(2) or ""
            parts = [p for p in seq.split(".") if p]
            # Use the last segment as list index: 1.2 -> 2), 3.1 -> 1)
            idx = parts[-1] if parts else ""
            if idx.isdigit():
                return f"{prefix}{int(idx)}) " + ln[m_ar.end():]
        return ln

    out: list[str] = [
        ln if i == first_non_empty_idx else _replace_line(ln)
        for i, ln in enumerate(lines)
    ]
    return "\n".join(out)
+ cleaned = re.sub( + r"(?is)\n*【\s*缺失信息说明\s*】[\s\S]*?(?=\n【\s*质量检查\s*】|\Z)", + "\n", + cleaned, + ) + cleaned = re.sub(r"(?is)\n*【\s*质量检查\s*】[\s\S]*$", "\n", cleaned) + return re.sub(r"[ \t]{2,}", " ", cleaned) + + +def _markdown_hashes_for_section_no(section_no: str) -> str: + parts = str(section_no or "").strip().split(".") + if len(parts) == 1: + return "##" + if len(parts) == 2: + return "###" + return "####" + + +def _normalize_numbered_heading_spacing(content: str) -> str: + """编号与题名之间补空格,便于前后端一致识别为标题。""" + text = str(content or "") + if not text: + return text + + def _line_repl(m: re.Match[str]) -> str: + indent, num, title = m.group(1), m.group(2), str(m.group(3) or "").strip() + parts = num.split(".") + if len(parts) < 2 or len(parts) > 4: + return m.group(0) + for part in parts: + if not part.isdigit() or int(part) < 1 or int(part) > 30: + return m.group(0) + if not title or len(title) > 36 or re.search(r"[,。;:!?]", title): + return m.group(0) + return f"{indent}{num} {title}" + + return re.sub( + r"(?m)^([\s\u3000]*)(\d+(?:\.\d+)+)\s*([\u4e00-\u9fff][^\n]{0,40})\s*$", + _line_repl, + text, + ) + + +def _normalize_markdown_heading_levels(content: str) -> str: + """三节及以上编号统一为 ####,避免 ### 与 #### 混用导致同级标题字号不一致。""" + text = str(content or "") + if not text: + return text + + def _line_repl(m: re.Match[str]) -> str: + hashes, num, tail = m.group(1), m.group(2), m.group(3) + parts = num.split(".") + if len(parts) < 3: + return m.group(0) + want = _markdown_hashes_for_section_no(num) + if hashes == want: + return m.group(0) + return f"{want} {num} {tail}" + + return re.sub( + r"(?m)^(#{1,6})\s+(\d+(?:\.\d+)+)\s+([\u4e00-\u9fff].*)$", + _line_repl, + text, + ) + + +def _normalize_section_heading_markdown(content: str) -> str: + return _normalize_markdown_heading_levels( + _normalize_numbered_heading_spacing(content) + ) + + +def _normalize_section_12_content(content: str) -> str: + """1.2 合同为纯文本编号体:首行「项目决策要点」、小节「1.2.1项目背景」无空格。 + 与模板/标题验收叠加后会重复节标题,且前端/导出无法识别为 
h4。""" + text = str(content or "").strip() + if not text: + return text + text = re.sub( + r"(?m)^[\s\u3000]*项目决策要点[\s\u3000]*(?:[::])?[\s\u3000]*\n+", + "", + text, + count=1, + ) + text = re.sub( + r"(?m)^([\s\u3000]*)(1\.2\.[12])(项目背景|预期目标)[\s\u3000]*$", + r"\1\2 \3", + text, + ) + return re.sub(r"\n{3,}", "\n\n", text).strip() + + +_CONTRACT_FIELD_LINE_RE = re.compile( + r"^\s*(\d+)[))]\s*(?P[^::\n]+)[::]\s*(?P.*)$", + re.MULTILINE, +) +_CONTRACT_FIELD_SKIP_RE = re.compile( + r"必须|不得|禁止|严禁|应|需|写|输出|背景|规则|约束|表\d|后评价|若|当|正文|首行|写作|请", +) + + +def _parse_expected_fields_from_contract(contract: str | None) -> list[str]: + """从模版输出合同解析「1) 字段名:...」连续编号字段;无则返回空。""" + text = str(contract or "").strip() + if not text: + return [] + fields: list[str] = [] + nums: list[int] = [] + for m in _CONTRACT_FIELD_LINE_RE.finditer(text): + field = str(m.group("field") or "").strip() + tail = str(m.group("tail") or "").strip() + if not field or len(field) > 10 or _CONTRACT_FIELD_SKIP_RE.search(field): + continue + if tail and not re.fullmatch(r"\.{2,}|待补充", tail) and len(tail) > 6: + continue + fields.append(field) + nums.append(int(m.group(1))) + if len(fields) < 3 or not nums or nums[0] != 1: + return [] + for i in range(1, len(nums)): + if nums[i] != nums[i - 1] + 1: + return [] + return fields + + +def _section_expected_fields( + section_title: str, + section: ReportTemplateSection | None = None, +) -> list[str]: + """从模版输出合同解析应输出字段;无编号/枚举字段时返回空(不再写死 1.1 八项)。""" + if section is not None: + contract = _effective_section_output_contract(section) + else: + contract = _section_output_contract(section_title) + parsed = _parse_expected_fields_from_contract(contract) + if parsed: + return parsed + title_norm = re.sub(r"\s+", "", str(section_title or "")) + if "1.2项目决策要点" in title_norm: + return ["规模目标", "质量目标", "效益目标"] + return [] + + +def _effective_section_prompt_for_generation( + section: ReportTemplateSection, + contract: str, +) -> str: + """模版合同为结构权威;与合同重复的 section_prompt 
不再注入,避免双源冲突。""" + stored = str(section.section_prompt or "").strip() + contract_text = str(contract or "").strip() + if stored and stored != contract_text: + return stored + return "" + + +def _field_aliases(field: str) -> list[str]: + base = str(field or "").strip() + aliases: dict[str, list[str]] = { + "项目名称": ["项目名称", "工程名称", "装置名称"], + "建设单位": ["建设单位", "业主单位", "实施单位"], + "建设地点": ["建设地点", "建设地址", "厂址", "所在地"], + "建设类型": ["建设类型", "项目类型", "新建", "改扩建"], + "起止时间": ["起止时间", "工作起止时间", "开工时间", "完工时间", "建设工期", "建设期限"], + # 抽取/填表侧常将“建设内容”写作“项目内容/工程内容/装置内容”,需兼容回填。 + "建设内容": ["建设内容", "主要建设内容", "建设范围", "项目内容", "工程内容", "装置内容"], + "建设投资": ["建设投资", "总投资", "投资估算", "项目总投资", "概算"], + "占地面积": ["占地面积", "用地面积"], + "规模目标": ["规模目标", "产量", "规模", "万吨"], + "质量目标": ["质量目标", "辛烷值", "质量升级", "国VI", "国Ⅵ"], + "效益目标": ["效益目标", "利润", "收益", "营业收入", "内部收益率", "IRR"], + } + out = aliases.get(base, []) + if base and base not in out: + out.insert(0, base) + return out[:8] + + +def _is_missing_like(value: str) -> bool: + text = str(value or "").strip() + if not text: + return True + lowered = text.lower() + missing_like = { + "待补充", + "无", + "n/a", + "na", + "-", + "—", + "——", + "暂无", + "未知", + "未提供", + } + return lowered in missing_like + + +def _normalize_land_area_value(value: str) -> str: + """ + 规范化“占地面积”字段: + - 遇到“84m×187m=15708m2”这类表达时,仅保留等号后的结果; + - 将 m2/m^2/m²/㎡ 统一为 ㎡,避免导出时出现单位显示异常。 + """ + text = str(value or "").strip() + if not text: + return text + core = text + if "=" in core: + core = core.split("=")[-1].strip() + # 中文全角等号兼容 + if "=" in core: + core = core.split("=")[-1].strip() + unified = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "㎡", core) + unified = unified.replace("m²", "㎡").replace("M²", "㎡") + unified = re.sub(r"\s*㎡", "㎡", unified) + return unified or text + + +def _normalize_canonical_field_value(field: str, value: str) -> str: + f = str(field or "").strip() + v = str(value or "").strip() + if not v: + return v + if f == "占地面积": + return _normalize_land_area_value(v) + return v + + 
+def _extract_value_by_alias_from_text(text: str, aliases: list[str]) -> str: + src = str(text or "").strip() + if not src: + return "" + for alias in aliases: + a = str(alias or "").strip() + if not a: + continue + # 支持“字段名:值”或“字段名:值”,值截取到常见分隔符前。 + pattern = rf"{re.escape(a)}\s*[::]\s*([^\n;;,,。]+)" + m = re.search(pattern, src, flags=re.IGNORECASE) + if m: + val = str(m.group(1) or "").strip() + if val and not _is_missing_like(val): + return val + return "" + + +def _is_valid_value_for_field(field: str, value: str, row_key: str = "", col_key: str = "") -> bool: + f = str(field or "").strip() + v = str(value or "").strip() + rk = str(row_key or "").strip().lower() + ck = str(col_key or "").strip().lower() + if not v or _is_missing_like(v): + return False + if f != "建设投资": + return True + + # “建设投资”仅接受金额口径,过滤收益率/回收期等财务指标,避免把 2.89 这类比率误填入。 + key_text = f"{rk} {ck}" + if any(x in key_text for x in ["收益率", "irr", "回收期", "净现值", "百分点", "利润率"]): + return False + if re.search(r"%|%|‰", v): + return False + if re.search(r"(收益率|回收期|净现值|利润率|irr)", v, flags=re.IGNORECASE): + return False + + has_amount_unit = bool(re.search(r"(万元|万|亿元|亿元人民币|元)", v)) + number_match = re.search(r"\d+(?:\.\d+)?", v) + if has_amount_unit: + return True + if not number_match: + return False + + # 无单位纯数字时,过小值大概率是比率而非投资金额(如 2.89)。 + num = float(number_match.group(0)) + return num >= 100 + + +def _extract_canonical_fields( + section_title: str, + elements: list[dict], + *, + section: ReportTemplateSection | None = None, +) -> dict[str, str]: + expected = _section_expected_fields(section_title, section) + if not expected: + return {} + rows = elements if isinstance(elements, list) else [] + out: dict[str, str] = {} + for field in expected: + aliases_raw = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] + aliases = [a.lower() for a in aliases_raw] + best_value = "" + for row in rows: + if not isinstance(row, dict): + continue + row_key = str(row.get("rowKey") or "") + col_key = 
str(row.get("colKey") or "") + value = str(row.get("value") or "").strip() + if _is_missing_like(value): + continue + key_text = f"{row_key} {col_key}".lower() + if any(a in key_text for a in aliases): + if _is_valid_value_for_field(field, value, row_key=row_key, col_key=col_key): + best_value = value + break + # 兼容 row/col 泛化时,直接从 value 文本中解析“字段: 值”。 + from_value = _extract_value_by_alias_from_text(value, aliases_raw) + if from_value and _is_valid_value_for_field(field, from_value, row_key=row_key, col_key=col_key): + best_value = from_value + break + normalized = _normalize_canonical_field_value(field, best_value) + out[field] = normalized or "待补充" + return out + + +def _render_canonical_fields_for_prompt( + evidence: dict, + *, + allowed_fields: list[str] | None = None, +) -> str: + canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else None + if not isinstance(canonical, dict) or not canonical: + return "无字段级已抽取结果。" + allowed_set = {str(f).strip() for f in (allowed_fields or []) if str(f).strip()} + lines: list[str] = [] + for field, value in canonical.items(): + f = str(field or "").strip() + if allowed_set and f not in allowed_set: + continue + v = _normalize_canonical_field_value(f, str(value or "").strip()) or "待补充" + if not f: + continue + lines.append(f"- {f}: {v}") + return "\n".join(lines) if lines else "无字段级已抽取结果。" + + +def _extract_field_value_from_docs(field: str, docs: list[dict]) -> str: + aliases = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] + if not aliases or not isinstance(docs, list): + return "" + texts: list[str] = [] + for doc in docs: + if not isinstance(doc, dict): + continue + heading = str(doc.get("heading") or "").strip() + content = str(doc.get("content") or "").strip() + merged = f"{heading}\n{content}".strip() + if merged: + texts.append(merged[:8000]) + + # 先尝试“字段: 值”类型,命中率高且更稳。 + for text in texts: + val = _extract_value_by_alias_from_text(text, aliases) + if val and not 
_is_missing_like(val): + return _normalize_canonical_field_value(field, val) + + # “建设内容”常写成段落而非冒号键值,补充宽松句式抽取。 + if field == "建设内容": + for text in texts: + for alias in aliases: + pattern = rf"{re.escape(alias)}\s*(?:为|包括|包含|主要包括)\s*([^\n。]{{12,420}})" + m = re.search(pattern, text, flags=re.IGNORECASE) + if m: + val = str(m.group(1) or "").strip(" ::;;,,") + if val and not _is_missing_like(val): + return _normalize_canonical_field_value(field, val) + return "" + + +def _merge_canonical_fields_from_docs( + section_title: str, + evidence: dict, + canonical: dict[str, str], + *, + section: ReportTemplateSection | None = None, +) -> dict[str, str]: + expected = _section_expected_fields(section_title, section) + if not expected or not isinstance(evidence, dict): + return canonical + merged = { + str(k): _normalize_canonical_field_value(str(k), str(v)) + for k, v in dict(canonical or {}).items() + } + docs: list[dict] = [] + chapter_docs = evidence.get("chapterDocs") + keyword_docs = evidence.get("keywordDocs") + if isinstance(chapter_docs, list): + docs.extend(chapter_docs) + # 1.1 项目基本情况:必须优先使用“要素管理-章节要素-第一章项目概况”的表格要素。 + # 仅当章节要素表整体为空/极少时,才允许使用 keywordDocs 做跨文档回退匹配, + # 避免将其它章节的“投资/总投资”等金额误回填到 1.1(例如建设投资被污染)。 + title_norm = re.sub(r"\s+", "", str(section_title or "")) + allow_keyword_fallback = True + if "1.1项目基本情况" in title_norm: + elements = evidence.get("elements") if isinstance(evidence.get("elements"), list) else [] + non_missing_elements = 0 + for row in elements[:80]: + if not isinstance(row, dict): + continue + v = str(row.get("value") or "").strip() + if v and not _is_missing_like(v): + non_missing_elements += 1 + if non_missing_elements >= 4: + break + # “有一定数量的非空单元格”即认为章节要素不空:禁止 keywordDocs 参与回填。 + allow_keyword_fallback = non_missing_elements < 4 + if allow_keyword_fallback and isinstance(keyword_docs, list): + docs.extend(keyword_docs) + if not docs: + return merged + for field in expected: + current = str(merged.get(field) or "").strip() + if current and 
not _is_missing_like(current): + continue + from_docs = _extract_field_value_from_docs(field, docs) + if from_docs and not _is_missing_like(from_docs): + merged[field] = _normalize_canonical_field_value(field, from_docs) + return merged + + +def _apply_canonical_field_backfill( + section: ReportTemplateSection, + evidence: dict, + content: str, +) -> str: + text = str(content or "") + canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else {} + if not isinstance(canonical, dict) or not canonical: + elements = evidence.get("elements") if isinstance(evidence, dict) else [] + canonical = _extract_canonical_fields( + section.section_title, + elements if isinstance(elements, list) else [], + section=section, + ) + canonical = _merge_canonical_fields_from_docs( + section.section_title, evidence, canonical, section=section + ) + if not canonical: + return text + repaired = text + for field in _section_expected_fields(section.section_title, section): + value = str(canonical.get(field) or "").strip() + if _is_missing_like(value): + continue + # 先按“字段名: 待补充”进行宽松替换,兼容编号/加粗等格式包装。 + broad_pattern = rf"(^.*{re.escape(field)}.*?[::]\s*)待补充(?:\s|$)" + repaired = re.sub( + broad_pattern, + rf"\g<1>{value}\n", + repaired, + flags=re.MULTILINE, + ) + labels = list(dict.fromkeys([x for x in _field_aliases(field) if str(x).strip()])) + for label in labels: + pattern = rf"({re.escape(label)}\s*[::]\s*)待补充\b" + repaired = re.sub(pattern, rf"\g<1>{value}", repaired) + # 若正文还没有落入该字段值,则追加一行显式键值,避免模型遗漏。 + if value not in repaired and re.search(rf"{re.escape(field)}\s*[::]", repaired): + repaired += f"\n{field}:{value}" + return repaired + + +def _build_field_diagnostics(section: ReportTemplateSection, evidence: dict, content: str) -> list[dict[str, Any]]: + expected = _section_expected_fields(section.section_title, section) + if not expected: + return [] + elements = evidence.get("elements") if isinstance(evidence, dict) else [] + if not isinstance(elements, list): + 
def _section_output_contract(section_title: str) -> str:
    """Look up the built-in output contract for a section title.

    The section number is extracted from the title and used as the key into
    ``SECTION_OUTPUT_CONTRACTS``.

    Args:
        section_title: Section title; ``None`` is treated as an empty string.

    Returns:
        The contract registered for that section number, or
        ``DEFAULT_SECTION_OUTPUT_CONTRACT`` when none is registered.
    """
    number = _extract_section_number(str(section_title or ""))
    return (
        SECTION_OUTPUT_CONTRACTS[number]
        if number in SECTION_OUTPUT_CONTRACTS
        else DEFAULT_SECTION_OUTPUT_CONTRACT
    )
return raw.strip() + return _section_output_contract(section.section_title or "") + + +def _section_requires_tables(section_title: str, *, contract_text: str | None = None) -> bool: + """判断章节合同是否包含【表格强制要求】,决定该节是否允许出现表格。""" + c = (str(contract_text or "").strip() or _section_output_contract(section_title)) + return "表格强制要求" in c + + +def _strip_tables_from_non_table_section( + section_title: str, + content: str, + *, + section: ReportTemplateSection | None = None, +) -> str: + """对无表格需求的章节,移除模型可能自行生成的 Markdown 表格。""" + if section is not None: + contract = _effective_section_output_contract(section) + else: + contract = _section_output_contract(section_title) + if "表格强制要求" in contract: + return content + if not content: + return content + + lines = content.split("\n") + out: list[str] = [] + in_table = False + for line in lines: + stripped = line.strip() + is_table_line = stripped.startswith("|") and stripped.endswith("|") + is_separator = bool(re.match(r"^\|[\s\-:|]+\|$", stripped)) if stripped else False + if is_table_line or is_separator: + if not in_table: + in_table = True + if out and out[-1].strip().startswith("###") and "表" in out[-1]: + out.pop() + continue + else: + if in_table: + in_table = False + if stripped.startswith("[ \t]*\n)*" + r"(?:[ \t]*\|[^\n]*\|[ \t]*\n)+)", + flags=re.IGNORECASE, + ) + m = pat.search(content) + return m.group(0).strip() if m else "" + + +def _find_table_insert_position(content: str, token: str, required_tables: list[str]) -> int | None: + """在 content 中找到 token 对应表应插入的位置。 + + 规则:插入到下一个必需表的表题行之前;若没有后续表,返回 None(追加到末尾)。 + """ + token_idx = None + for i, t in enumerate(required_tables): + if _norm_table_token(t) == _norm_table_token(token): + token_idx = i + break + if token_idx is None: + return None + for later_token in required_tables[token_idx + 1:]: + later_plain = re.sub(r"\s+", "", str(later_token or "")) + if not later_plain: + continue + later_re = re.escape(later_plain).replace(r"\-", r"[--—–]") + later_pat = re.compile( 
def _enforce_required_tables(
    section: ReportTemplateSection,
    prompt: str,
    content: str,
    evidence: dict,
) -> tuple[str, list[str]]:
    """Make sure every table the template requires is present in the chapter.

    Required table tokens are derived from the section prompt and its
    effective output contract. Missing tables are supplied by progressively
    weaker means, each stage running only for tables still absent after the
    previous one, followed by clean-up passes.

    Args:
        section: Template section being generated.
        prompt: Original generation prompt, reused by the model repair stage.
        content: Generated chapter text.
        evidence: Evidence payload holding the structured tables.

    Returns:
        ``(repaired_content, still_missing_tokens)``. The content is returned
        unchanged with an empty list when the section requires no tables.
    """
    required = _extract_required_table_tokens(
        section.section_prompt or "",
        _extract_section_number(section.section_title or ""),
        contract_text=_effective_section_output_contract(section),
    )
    if not required:
        return content, []

    def _absent(text: str) -> list[str]:
        return [token for token in required if not _table_token_exists(text, token)]

    # Stage 1: emit required tables straight from the structured element
    # tables, so the model cannot rewrite real data.
    repaired = _append_authoritative_required_tables(content, required, evidence)
    pending = _absent(repaired)
    # Stage 2: structured fallback for tables that are still missing.
    if pending:
        repaired = _append_structured_missing_tables(repaired, pending, evidence)
        pending = _absent(repaired)
    # Stage 3: ask the model to repair what is left.
    if pending:
        repaired = _repair_missing_tables(section, prompt, repaired, pending, evidence)
        pending = _absent(repaired)
    # Stage 4: last resort, append minimal placeholder tables.
    if pending:
        repaired = _append_minimal_missing_tables(repaired, pending)

    # Remove tables that leaked in from the neighbouring 4.3.2 / 4.3.3 sections.
    repaired = _remove_cross_section_table_pollution(section.section_title or "", repaired)
    # A table may "exist" with a broken body (separator row only, no data
    # rows), so the complete structured table is enforced again.
    repaired = _ensure_required_structured_tables_integrity(repaired, required, evidence)
    # De-duplicate even when nothing was missing: a model-written table and a
    # directly emitted one must not coexist.
    repaired = _finalize_section_table_dedupe(repaired, required)
    repaired = _fill_required_table_caption_stubs(repaired, required, evidence)
    repaired = _finalize_section_table_dedupe(repaired, required)
    return repaired, _absent(repaired)
contract_text: Optional[str] = None, +) -> list[str]: + """ + 从模板 section_prompt 与章节输出合同(section_output_contracts)中抽取「表 x-x / 附表 x」, + 使合同内写死的「见表2-3」等也能触发 _append_authoritative_required_tables 要素直出。 + + 严格规则: + - 仅当合同中包含「【表格强制要求】」标签时,才提取正文表(表x-x)。 + - 「见附表N」「附表N~附表M」等仅为引用语,不视为本节必需内嵌的表格(含区间端点及中间附表)。 + - 附图与附表在正文之后由 _append_report_appendices 统一汇总(附图在上、附表在下)。 + """ + parts = [str(section_prompt or "").strip(), str(contract_text or "").strip()] + text = "\n".join(p for p in parts if p) + if not text: + return [] + + has_table_mandate = "表格强制要求" in text + + if not has_table_mandate: + return [] + + # 剔除「【禁止】」段落,避免将禁止示例中的表号(如"表2.6-1")误判为必需表。 + text_for_extraction = re.sub( + r"【禁止】.*?(?=【|$)", "", text, flags=re.DOTALL, + ) + + raw = re.findall( + r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*|表\s*\d+(?:\s*[.\--]\s*\d+)*)", + text_for_extraction, + ) + out: list[str] = [] + seen = set() + chapter_no = "" + m_sec = re.match(r"^\s*(\d+)", str(section_no or "")) + if m_sec: + chapter_no = m_sec.group(1) + + _REF_ONLY_PATTERN = re.compile( + r"(?:见|详见|参见|参照|详)\s*附表\s*\d+", + ) + ref_only_appendices: set[str] = set() + for m in _REF_ONLY_PATTERN.finditer(text): + tok_in_ref = re.findall(r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*)", m.group()) + for t in tok_in_ref: + ref_only_appendices.add(re.sub(r"\s+", "", t)) + + for tok in raw: + norm = re.sub(r"\s+", "", tok) + if norm.startswith("附表") and norm in ref_only_appendices: + continue + if chapter_no: + m_tok = re.match(r"^(?:附表|表)\s*(\d+)", norm) + if m_tok: + tail = norm[m_tok.end() :] + # 「表1」「表2」等为节内顺序号,首位数字不等于章号(如第二章下的表1);仅对「表2-3」「表2.6-1」等带子级编号的表号按章首数字过滤。 + if tail and tail[0] in ".--—–": + if m_tok.group(1) != chapter_no: + continue + if norm in seen: + continue + seen.add(norm) + out.append(norm) + + # 5.3.2 合同正文仅允许表5-5、表5-6;模板示例里若夹带「附表8」等,一律不纳入必需表,避免要素直出串表。 + if str(section_no or "").strip() == "5.3.2": + allow_532 = {_norm_table_token("表5-5"), _norm_table_token("表5-6")} + out = [t for t in out if _norm_table_token(t) in allow_532] + # 7.1.2 
def _append_minimal_missing_tables(content: str, missing_tables: list[str]) -> str:
    """Append one minimal placeholder table per missing table name.

    Each name is stripped, passed through
    ``_normalize_table_caption_number_name_gap`` and substituted into
    ``MINIMAL_MISSING_TABLE_TEMPLATE``.

    Args:
        content: Chapter text; trailing whitespace is removed first.
        missing_tables: Table names that still have no table in the text.

    Returns:
        The text followed by the placeholder tables, stripped at both ends.
    """
    head = content.rstrip()
    stubs = [
        MINIMAL_MISSING_TABLE_TEMPLATE.format(
            table_name=_normalize_table_caption_number_name_gap(str(name or "").strip())
        )
        for name in missing_tables
    ]
    return (head + "".join(stubs)).strip()
rf"(?:^|\n)[^\n]*{re.escape(forbidden_kw)}[^\n]*\n" + rf"(?:\s*\n|\s*\n)*" + rf"\s*[\s\S]*?
", + flags=re.IGNORECASE, + ) + text = html_pat.sub("\n", text) + + # 残留单独表题行(无表体)也移除,避免视觉噪音 + title_only_pat = re.compile( + rf"(?:^|\n)\s*[#>*\-\d\.\)()\s]*[^\n]*{re.escape(forbidden_kw)}[^\n]*(?=\n|$)", + flags=re.IGNORECASE, + ) + text = title_only_pat.sub("\n", text) + # 折叠多余空行 + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() + + +def _title_compare_norm(s: str) -> str: + """标题宽松比较:折叠空白,并去掉中英括号两侧多余空格。""" + t = re.sub(r"\s+", " ", str(s or "")).strip() + t = re.sub(r"\s*([((])\s*", r"\1", t) + t = re.sub(r"\s*([))])\s*", r"\1", t) + return t + + +def _heading_line_section_number(line: str) -> str: + normalized = str(line or "").strip().lstrip("#").strip() + m = re.match(r"^(\d+(?:\.\d+)*)", normalized) + return m.group(1) if m else "" + + +def _is_heading_line_for_section(line: str, section_no: str) -> bool: + if not section_no: + return False + return _heading_line_section_number(line) == section_no + + +def _strip_leading_section_heading_lines(lines: list[str], section_no: str) -> list[str]: + """去掉正文开头连续的、与 section_no 同编号的标题行(避免 prepend 后重复)。""" + trimmed = list(lines) + while trimmed: + first = trimmed[0] + if not str(first).strip(): + trimmed.pop(0) + continue + if _is_heading_line_for_section(first, section_no): + trimmed.pop(0) + while trimmed and not str(trimmed[0]).strip(): + trimmed.pop(0) + continue + break + return trimmed + + +def _replace_first_section_heading_line(content: str, section_no: str, canonical_title: str) -> str: + lines_list = content.splitlines() + for idx_l, ln in enumerate(lines_list): + if not ln.strip(): + continue + if not _is_heading_line_for_section(ln, section_no): + break + stripped = ln.strip() + section_no_heading = section_no + plain_numbered_24x = section_no_heading in { + "2.4.1", + "2.4.2", + "2.4.3", + "2.4.4", + } + if stripped.startswith("#") and not plain_numbered_24x: + hm = re.match(r"^(#+\s*)", stripped) + prefix = hm.group(1) if hm else "" + lines_list[idx_l] = (prefix + canonical_title).rstrip() 
+ else: + lines_list[idx_l] = canonical_title + break + return "\n".join(lines_list) + + +def _enforce_template_format_contract( + section: ReportTemplateSection, + content: str, + evidence: dict, + *, + chapter_title_map: Optional[dict[str, str]] = None, +) -> tuple[str, list[str]]: + issues: list[str] = [] + # 不对编号样式做统一归一化,避免破坏章节层级编号(如 1.2.1 / 2.1.3 / 3.4.2)。 + repaired = _strip_inline_evidence_labels(str(content or "").strip()) + repaired = _normalize_section_heading_markdown(repaired) + if _extract_section_number(str(section.section_title or "")) == "1.2": + repaired = _normalize_section_12_content(repaired) + + # 0) 先拆行再做标题验收,否则步骤 1 会因首行 != 标准标题而重复插入标题。 + # 0.1) 上一段正文末尾与下一小节编号粘在同一行(如「……证明了2.4.4 初步……」)。 + repaired = _split_inline_template_headings(repaired, chapter_title_map or {}) + # 0.2) 小节标题后与正文首字粘在同行(如「2.4.4 初步设计审查工作评价2017年……」)。 + repaired = _split_glued_template_heading_body(repaired, chapter_title_map or {}) + + # 1) 标题验收: + # - 普通节:首行为当前节标题(允许附加 # 前缀) + # - 每章第一节(x.1):首行为章标题,且必须包含当前节标题 + title = str(section.section_title or "").strip() + section_no_heading = _extract_section_number(title) + title_norm = _title_compare_norm(title) + non_empty_lines: list[str] = [] + for line in repaired.splitlines(): + if line.strip(): + non_empty_lines.append(line.strip().lstrip("#").strip()) + first_non_empty = non_empty_lines[0] if non_empty_lines else "" + first_matches_title = bool( + title and first_non_empty and _title_compare_norm(first_non_empty) == title_norm + ) + first_is_section_heading = bool( + title + and section_no_heading + and first_non_empty + and _is_heading_line_for_section(first_non_empty, section_no_heading) + ) + + ancestor_titles = _resolve_ancestor_titles_for_section( + section, + chapter_title_map or {}, + ) + if ancestor_titles: + required_titles = ancestor_titles + ([title] if title else []) + required_norms = [_title_compare_norm(t) for t in required_titles] + existing_lines = repaired.splitlines() + trimmed_lines = list(existing_lines) + 
existing_title_chain: list[str] = [] + while trimmed_lines: + first_line = trimmed_lines[0] + normalized = _title_compare_norm(first_line.strip().lstrip("#").strip()) + if not normalized: + trimmed_lines.pop(0) + continue + if normalized in required_norms: + existing_title_chain.append(normalized) + trimmed_lines.pop(0) + while trimmed_lines and not trimmed_lines[0].strip(): + trimmed_lines.pop(0) + continue + if ( + title + and section_no_heading + and len(existing_title_chain) == len(required_norms) - 1 + and _is_heading_line_for_section(first_line, section_no_heading) + ): + existing_title_chain.append(required_norms[-1]) + trimmed_lines.pop(0) + while trimmed_lines and not trimmed_lines[0].strip(): + trimmed_lines.pop(0) + continue + break + if existing_title_chain != required_norms: + body_lines = _strip_leading_section_heading_lines(trimmed_lines, section_no_heading) + body = "\n".join(body_lines).strip() + repaired = "\n\n".join(required_titles + ([body] if body else [])).strip() + issues.append("章节缺少父级标题链,已自动补齐") + elif title and first_is_section_heading and first_non_empty != title: + repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) + elif title and not first_matches_title and first_is_section_heading: + repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) + if first_non_empty != title: + issues.append("章节标题与模板不一致,已规范为标准标题行") + elif title and not first_matches_title: + repaired = f"{title}\n\n{repaired}".strip() + issues.append("章节标题与模板不一致,已自动补齐标准标题行") + elif title and first_matches_title and first_non_empty != title: + repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) + + # 1.42) 2.4.1~2.4.4:首行可能是「### 2.4.x …」且去 # 后与模板标题一致,此时不会进入上一分支,须去掉 Markdown 前缀。 + if ( + _extract_section_number(title) in {"2.4.1", "2.4.2", "2.4.3", "2.4.4"} + and title + ): + lns_strip = repaired.splitlines() + for _is, ln_s in enumerate(lns_strip): + if not ln_s.strip(): + continue + 
sh = ln_s.strip() + if sh.startswith("#") and _title_compare_norm(sh.lstrip("#").strip()) == title_norm: + lns_strip[_is] = title + break + repaired = "\n".join(lns_strip) + + # 1.5) 确保标题行(如 "2.4.4 初步设计审查工作评价")后面有空行, + # 否则前端 Markdown 渲染或 DOCX 导出时可能无法识别为标题。 + repaired = _ensure_heading_lines_separated(repaired) + + # 2) 标题树验收:若该节定义了固定子节顺序,缺失则自动补齐占位小节。 + section_no = _extract_section_number(title) + repaired, missing_children = _auto_append_missing_child_headings(repaired, section_no) + if missing_children: + issues.append("缺失下级小节已自动补齐:" + "、".join(missing_children[:10])) + + # 3) 仅当章节合同显式要求表格时,才做模板表格规格验收与修复。 + # 否则像 5.2.2/5.2.3/5.2.4 这类纯文字章节会被示例表误触发补表,产生脏表格。 + if not _section_requires_tables( + title, contract_text=_effective_section_output_contract(section) + ): + return _strip_inline_evidence_labels(repaired), issues + + # 4) 解析模板示例中的表规格(表名 + 表头关键字) + table_specs = _extract_template_table_specs(section.examples) + if not table_specs: + return repaired, issues + + # 4.1) 第 5 章共用示例里同时出现「表5-1/表5-2」宁夏样例与各小节真实合同(如 5.3.1 仅表5-4)。 + # 若不按合同过滤,_find_table_format_issues 会误报缺表5-1,_repair_table_format_by_template 会把表5-4「修」成样例表头。 + contract_required = _extract_required_table_tokens( + section.section_prompt or "", + section_no, + contract_text=_effective_section_output_contract(section), + ) + if contract_required: + allow = {_norm_table_token(t) for t in contract_required if _norm_table_token(t)} + narrowed = [ + s + for s in table_specs + if _norm_table_token(str(s.get("token") or "")) in allow + ] + if narrowed: + table_specs = narrowed + + table_issues = _find_table_format_issues(repaired, table_specs) + if table_issues: + issues.extend(table_issues) + repaired = _repair_table_format_by_template(section, repaired, table_specs, evidence) + # 二次验收,仍不通过则提示但不循环重试 + still = _find_table_format_issues(repaired, table_specs) + if still: + issues.extend([f"二次修正后仍存在:{x}" for x in still[:4]]) + return _strip_inline_evidence_labels(repaired), issues + + +def 
_extract_template_table_specs(raw_examples: Optional[str]) -> list[dict]: + text = str(raw_examples or "").strip() + if not text: + return [] + lines = [ln.rstrip() for ln in text.splitlines()] + specs: list[dict] = [] + i = 0 + while i < len(lines): + line = lines[i].strip() + m = re.match(r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*(.*)$", line) + if not m: + i += 1 + continue + token = re.sub(r"\s+", "", m.group(1)) + title_tail = str(m.group(2) or "").strip() + title = f"{m.group(1)} {title_tail}".strip() + + j = i + 1 + header_keywords: list[str] = [] + # 采集该表后面的头部字段线索 + while j < len(lines): + cur = lines[j].strip() + if not cur: + j += 1 + if header_keywords: + break + continue + if re.match(r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*", cur): + break + if re.match(r"^\d+(?:\.\d+)*\s+", cur): # 下一个章节 + break + if cur.startswith("注"): + break + if "|" in cur: + # Markdown 表头 + cells = [c.strip() for c in cur.split("|") if c.strip()] + for c in cells[:8]: + if c and c not in ("---", "—"): + header_keywords.append(c) + break + # 普通文本表头行 + if len(cur) <= 24 and not re.fullmatch(r"[0-9.%()()\-~~:/\s]+", cur): + header_keywords.append(cur) + if len(header_keywords) >= 8: + break + j += 1 + + specs.append( + { + "token": token, + "title": title, + "headerKeywords": list(dict.fromkeys(header_keywords))[:8], + } + ) + i = j + # 去重同 token + dedup: dict[str, dict] = {} + for s in specs: + tk = str(s.get("token") or "") + if not tk or tk in dedup: + continue + dedup[tk] = s + return list(dedup.values())[:12] + + +def _find_table_format_issues(content: str, table_specs: list[dict]) -> list[str]: + issues: list[str] = [] + c = str(content or "") + c_norm = _norm_table_token(c) + for spec in table_specs: + token = str(spec.get("token") or "") + title = str(spec.get("title") or token) + token_norm = _norm_table_token(token) + if token_norm and token_norm not in c_norm: + issues.append(f"缺少模板表名:{title}") + continue + # 若模板存在表头关键词,则要求至少命中2个(或全部,如果少于2) + headers = [str(h).strip() for h 
in (spec.get("headerKeywords") or []) if str(h).strip()] + if not headers: + continue + hit = sum(1 for h in headers if h in c) + need = min(2, len(headers)) + if hit < need: + issues.append(f"表头与模板不一致:{title}") + return issues + + +def _repair_table_format_by_template( + section: ReportTemplateSection, + content: str, + table_specs: list[dict], + evidence: dict, +) -> str: + specs_text = json.dumps(table_specs, ensure_ascii=False) + fix_prompt = build_table_format_repair_prompt( + section_title=section.section_title, + table_specs_json=specs_text, + content=content, + evidence_json=json.dumps(evidence, ensure_ascii=False), + ) + obj = chat_completions_json( + system_prompt=table_format_repair_system_prompt(), + user_prompt=fix_prompt, + temperature=0.1, + max_tokens=2600, + timeout_sec=120, + log_context=f"表格格式修复 section_key={section.section_key} | {section.section_title}", + ) + fixed = str(obj.get("content") or "").strip() + return fixed or content + + +def _render_structured_tables_for_prompt(evidence: dict) -> str: + rows = evidence.get("structuredTables") if isinstance(evidence, dict) else [] + if not isinstance(rows, list) or not rows: + return "无结构化表格证据" + blocks: list[str] = [] + for row in rows[:8]: + if not isinstance(row, dict): + continue + name = str(row.get("tableName") or "").strip() + md = str(row.get("markdown") or "").strip() + if not name or not md: + continue + blocks.append(f"### {_normalize_table_caption_number_name_gap(name)}\n\n{md}") + return "\n\n".join(blocks) if blocks else "无结构化表格证据" + + +def _strip_bracketed_three_part_labels(content: str) -> str: + text = str(content or "") + if not text: + return text + # 全章统一移除方括号三段式标题,保留其后正文内容。 + patterns = [ + r"^\s{0,3}#{0,6}\s*【事实依据】\s*$", + r"^\s{0,3}#{0,6}\s*【评价判断】\s*$", + r"^\s{0,3}#{0,6}\s*【问题与建议】\s*$", + r"^\s{0,3}#{0,6}\s*事实依据\s*[::、]?\s*$", + r"^\s{0,3}#{0,6}\s*评价判断\s*[::、]?\s*$", + r"^\s{0,3}#{0,6}\s*问题与建议\s*[::、]?\s*$", + r"【事实依据】", + r"【评价判断】", + r"【问题与建议】", + ] + for p in patterns: + 
text = re.sub(p, "", text, flags=re.MULTILINE) + text = re.sub(r"\n{3,}", "\n\n", text).strip() + return text + + +def _strip_placeholder_table_notes(content: str) -> str: + text = str(content or "") + if not text: + return text + placeholder_note_pattern = re.compile( + r"^\s{0,3}(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*\d+\s*[\.。::、]?\s*待补充\s*(?:\*\*|__)?\s*$", + flags=re.IGNORECASE, + ) + boilerplate_note_line_pattern = re.compile( + r"可酌情增减指标|可酌情增减|根据项目的情况|根据项目实际需要进行增减|根据项目不同进行增减|根据项目具体情况增减|表中内容可根据", + flags=re.IGNORECASE, + ) + boilerplate_full_line_pattern = re.compile( + r"^\s{0,3}(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*[\.。::]?\s*(?:\d+\s*[\.。、::]?\s*)?(?:表中内容)?可根据项目.{0,20}(?:增减|调整)", + flags=re.IGNORECASE, + ) + note_header_pattern = re.compile( + r"^\s*(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*[::]\s*(?:\*\*|__)?\s*$", + flags=re.IGNORECASE, + ) + + src_lines = text.splitlines() + lines: list[str] = [] + i = 0 + while i < len(src_lines): + line = src_lines[i] + if placeholder_note_pattern.match(line): + i += 1 + continue + if boilerplate_full_line_pattern.match(line): + i += 1 + continue + if boilerplate_note_line_pattern.search(line): + i += 1 + continue + if note_header_pattern.match(line): + j = i + 1 + skipped_boilerplate = False + while j < len(src_lines): + nxt = src_lines[j] + if not str(nxt).strip(" \t\u3000"): + j += 1 + continue + if boilerplate_note_line_pattern.search(nxt): + skipped_boilerplate = True + j += 1 + break + if skipped_boilerplate: + i = j + continue + lines.append(line) + i += 1 + return re.sub(r"\n{3,}", "\n\n", "\n".join(lines)).strip() + + +def _strip_trailing_partial_missing_markers(content: str) -> str: + text = str(content or "") + if not text: + return text + + cleaned_lines: list[str] = [] + for raw_line in text.splitlines(): + line = raw_line.rstrip() + compact = re.sub(r"\s+", "", line) + if compact in {"待补充", "-待补充", "*待补充"}: + cleaned_lines.append(line) + continue + + updated = 
re.sub(r"\s*待补充\s*(?:\[\s*\d{1,3}\s*\])?\s*$", "", line) + updated = re.sub(r"\s{2,}", " ", updated).rstrip() + stripped = updated.strip() + core_len = len(re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", stripped)) + + if stripped and stripped != line.strip() and core_len >= 12: + cleaned_lines.append(updated) + else: + cleaned_lines.append(line) + + return re.sub(r"\n{3,}", "\n\n", "\n".join(cleaned_lines)).strip() + + +def _is_pipe_markdown_table_row_line(line: str) -> bool: + s = line.strip() + return len(s) >= 2 and s.startswith("|") and s.endswith("|") + + +def _is_pipe_markdown_table_separator_line(line: str) -> bool: + s = line.strip() + return bool(re.match(r"^\|[\s\-:|]+\|$", s)) if s else False + + +def _markdown_table_body_fingerprint(md: str) -> str: + """用于判断两张 Markdown 表是否实质相同(忽略行间空行与首尾空白)。""" + lines = [ + re.sub(r"\s+", "", ln.strip()) + for ln in str(md or "").splitlines() + if ln.strip().startswith("|") or ln.strip().startswith("|") + ] + return "\n".join(lines) + + +_INTRA_REPEAT_MIN_FRAGMENT_LEN = 18 +_INTRA_REPEAT_MIN_CONSECUTIVE_COUNT = 3 + + +def _collapse_consecutive_repetitions_in_string(text: str) -> tuple[str, int]: + """ + 折叠同一行/字符串内连续重复片段(如模型将同一句财务描述拼接数十次)。 + 仅处理紧邻重复,避免误伤正常文中偶然出现的相同短语。 + """ + s = str(text or "") + min_len = _INTRA_REPEAT_MIN_FRAGMENT_LEN + min_count = _INTRA_REPEAT_MIN_CONSECUTIVE_COUNT + if len(s) < min_len * min_count: + return s, 0 + + removed = 0 + out: list[str] = [] + i = 0 + n = len(s) + while i < n: + best_plen = 0 + best_count = 0 + max_plen = (n - i) // min_count + for plen in range(min_len, max_plen + 1): + pat = s[i : i + plen] + if not pat.strip(): + continue + count = 1 + j = i + plen + while j + plen <= n and s[j : j + plen] == pat: + count += 1 + j += plen + if count >= min_count: + span = plen * count + if span > best_plen * best_count: + best_plen = plen + best_count = count + if best_plen: + out.append(s[i : i + best_plen]) + removed += best_count - 1 + i += best_plen * best_count + else: + out.append(s[i]) 
+ i += 1 + return "".join(out), removed + + +def _collapse_consecutive_text_repetitions(content: str) -> tuple[str, int]: + """按行折叠段内连续重复;返回 (正文, 移除的重复次数)。""" + lines = str(content or "").splitlines() + if not lines: + return str(content or ""), 0 + total_removed = 0 + collapsed_lines: list[str] = [] + for line in lines: + collapsed, removed = _collapse_consecutive_repetitions_in_string(line) + total_removed += removed + collapsed_lines.append(collapsed) + return "\n".join(collapsed_lines), total_removed + + +_CHAPTER_CONTENT_DEDUPE_MIN_CHARS = 2000 +_CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN = 48 +_CHAPTER_DEDUPE_NEAR_MATCH_RATIO = 0.90 + + +def _chapter_block_core_len(text: str) -> int: + return len(re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", _strip_inline_evidence_labels(str(text or "")))) + + +def _chapter_text_block_fingerprint(text: str) -> str: + t = _strip_inline_evidence_labels(str(text or "")) + t = re.sub(r"\s+", "", t) + return t.casefold() + + +def _is_likely_table_caption_line(line: str) -> bool: + prev = str(line or "").strip() + if not prev or len(prev) > 120: + return False + if prev.startswith("#"): + return True + if re.search(r"表\s*[\d一二三四五六七八九十\--—–]+", prev): + return True + return "表" in prev and len(prev) <= 80 + + +def _chapter_block_is_table(block: str) -> bool: + pipe_rows = [ + ln for ln in str(block or "").splitlines() if ln.strip() and _is_pipe_markdown_table_row_line(ln) + ] + return len(pipe_rows) >= 2 + + +def _split_chapter_blocks_for_dedupe(content: str) -> list[str]: + """将章节正文拆成段落块与 Markdown 表块,便于做重复检测。""" + lines = str(content or "").splitlines() + blocks: list[str] = [] + i = 0 + n = len(lines) + pending_caption: list[str] = [] + + def _flush_pending_caption() -> None: + nonlocal pending_caption + if pending_caption: + blocks.append("\n".join(pending_caption)) + pending_caption = [] + + while i < n: + if not lines[i].strip(): + i += 1 + continue + if _is_pipe_markdown_table_row_line(lines[i]): + table_lines: list[str] = [] + while i < n 
and lines[i].strip() and _is_pipe_markdown_table_row_line(lines[i]): + table_lines.append(lines[i]) + i += 1 + if table_lines: + block_lines = list(pending_caption) + table_lines + pending_caption = [] + start = i - len(table_lines) + scan = start - 1 + while scan >= 0 and lines[scan].strip(): + if _is_pipe_markdown_table_row_line(lines[scan]): + break + if _is_likely_table_caption_line(lines[scan]): + block_lines.insert(0, lines[scan]) + scan -= 1 + continue + break + blocks.append("\n".join(block_lines)) + continue + para_lines: list[str] = [] + while i < n: + if not lines[i].strip(): + i += 1 + break + if _is_pipe_markdown_table_row_line(lines[i]): + break + para_lines.append(lines[i]) + i += 1 + if not para_lines: + continue + if len(para_lines) == 1 and _is_likely_table_caption_line(para_lines[0]): + _flush_pending_caption() + pending_caption = para_lines + continue + _flush_pending_caption() + blocks.append("\n".join(para_lines)) + _flush_pending_caption() + return blocks + + +def _chapter_blocks_near_duplicate(a: str, b: str) -> bool: + fa = _chapter_text_block_fingerprint(a) + fb = _chapter_text_block_fingerprint(b) + if not fa or not fb: + return False + if fa == fb: + return True + short, long = (fa, fb) if len(fa) <= len(fb) else (fb, fa) + if len(short) >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN and short in long: + if len(short) / max(len(long), 1) >= 0.82: + return True + if min(len(fa), len(fb)) < 80: + return False + return SequenceMatcher(None, fa, fb).ratio() >= _CHAPTER_DEDUPE_NEAR_MATCH_RATIO + + +def _chapter_block_duplicate_key(block: str) -> tuple[str, str]: + text = str(block or "") + if _chapter_block_is_table(text): + hdr = _extract_table_header_key(text) + fp = _markdown_table_body_fingerprint(text) + return ("table", hdr or fp) + return ("text", _chapter_text_block_fingerprint(text)) + + +def _dedupe_long_chapter_repetition(content: str) -> tuple[str, int]: + """ + 单章字数超过阈值时,对段落/表格块做去重,缓解模型整段或整表重复输出。 + 保留首次出现块,删除后续实质重复块。 + """ + text = 
str(content or "") + if len(text) <= _CHAPTER_CONTENT_DEDUPE_MIN_CHARS: + return text, 0 + + blocks = _split_chapter_blocks_for_dedupe(text) + if len(blocks) < 2: + return text, 0 + + kept: list[str] = [] + seen_table_hdr: set[str] = set() + seen_table_fp: set[str] = set() + seen_text_fp: set[str] = set() + kept_text_samples: list[str] = [] + removed = 0 + + for block in blocks: + core_len = _chapter_block_core_len(block) + kind, key = _chapter_block_duplicate_key(block) + is_dup = False + + if kind == "table": + hdr = _extract_table_header_key(block) if key else "" + fp = _markdown_table_body_fingerprint(block) + if hdr and hdr in seen_table_hdr: + is_dup = True + elif fp and fp in seen_table_fp: + is_dup = True + elif key and key in seen_text_fp: + is_dup = True + elif core_len >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN: + for prev in kept_text_samples: + if _chapter_blocks_near_duplicate(block, prev): + is_dup = True + break + + if is_dup: + removed += 1 + continue + + if kind == "table": + hdr = _extract_table_header_key(block) + fp = _markdown_table_body_fingerprint(block) + if hdr: + seen_table_hdr.add(hdr) + if fp: + seen_table_fp.add(fp) + elif key: + seen_text_fp.add(key) + if core_len >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN: + kept_text_samples.append(block) + + kept.append(block) + + if removed <= 0: + return text, 0 + + merged = "\n\n".join(b.strip() for b in kept if b.strip()) + return re.sub(r"\n{3,}", "\n\n", merged).strip(), removed + + +def _dedupe_structured_table_hits(hits: list[dict]) -> list[dict]: + """ + 同一必需表 token 可能命中多张历史表或重复 markdown;合并输出会导致章节内连续两张相同表。 + 按 tableId 与表体指纹去重,保留表体最完整的一条(表5-4 优先可研/实际/增减结构)。 + """ + ranked: list[tuple[int, dict]] = [] + for hit in hits: + if not isinstance(hit, dict): + continue + md = str(hit.get("markdown") or "").strip() + if not md: + continue + fp = _markdown_table_body_fingerprint(md) + if not fp: + continue + ranked.append((_score_structured_table_hit_dict(hit), hit)) + ranked.sort(key=lambda x: (-x[0], 
str(x[1].get("tableId") or ""))) + + out: list[dict] = [] + seen_ids: set[str] = set() + seen_fp: set[str] = set() + for _score, hit in ranked: + tid = str(hit.get("tableId") or "").strip() + md = str(hit.get("markdown") or "").strip() + fp = _markdown_table_body_fingerprint(md) + if tid and tid in seen_ids: + continue + if fp in seen_fp: + continue + if tid: + seen_ids.add(tid) + seen_fp.add(fp) + out.append(hit) + return out + + +def _is_likely_table1_raw_material_caption(line: str) -> bool: + s = re.sub(r"\s+", "", str(line or "")) + if not s: + return False + if "原料数量及组成对比" in s: + return True + if re.search(r"表\s*1", s) and "原料" in s: + return True + return False + + +def _extract_table_header_key(table_block: str) -> str: + """提取表格的表头行(第一条 pipe 行),归一化后作为同表判断依据。""" + for ln in str(table_block or "").splitlines(): + s = ln.strip() + if s.startswith("|") and s.endswith("|") and not re.match(r"^\|[\s\-:|]+\|$", s): + return re.sub(r"\s+", "", s) + return "" + + +def _is_433_operation_analysis_table_header(header_norm: str) -> bool: + """4.3.3 表4-2「烷基化装置运行分析」常见 Markdown 表头(全列或仅实际值列)。""" + h = str(header_norm or "") + if "序号" not in h or "项目" not in h: + return False + if "实际值" in h: + return True + if "设计值" in h and "标定值" in h: + return True + return False + + +def _433_op_analysis_table_has_canonical_caption(text: str, table_block_start: int) -> bool: + """表前若干行内是否出现合同规定的表4-2 烷基化装置运行分析表题(用于保留规范副本、去掉无表题重复表)。""" + before = str(text or "")[: int(table_block_start)].rstrip() + lines = before.split("\n") + tail = "\n".join(lines[-18:]) + if "烷基化装置运行分析" not in tail: + return False + n = re.sub(r"\s+", "", tail) + return bool(re.search(r"表4[--—–]2", n)) + + +def _dedupe_433_alkylation_operation_analysis_markdown_tables(content: str) -> str: + """ + 4.3.3 常见故障:模型在「2) 主要装置达标评价」下先输出无表题的同结构表, + 又在「3) 全厂达标评价」下重复输出带「表4-2 …烷基化装置运行分析…」表题的同一表。 + 对表头/表体指纹相同的重复表:优先保留表前带规范表4-2 表题的一张;否则保留文档中第一张。 + """ + text = str(content or "") + if not text.strip(): + return text + + pat = 
re.compile(r"(?m)(?:^\s*\|.+\|\s*\n){3,}") + matches = list(pat.finditer(text)) + if len(matches) < 2: + return text + + items: list[dict] = [] + for m in matches: + block = m.group(0) + hdr = _extract_table_header_key(block) + if not _is_433_operation_analysis_table_header(hdr): + continue + fp = _markdown_table_body_fingerprint(block) + items.append( + { + "m": m, + "hdr": hdr, + "fp": fp, + "cap": _433_op_analysis_table_has_canonical_caption(text, m.start()), + } + ) + + n = len(items) + if n < 2: + return text + + parent = list(range(n)) + + def find(x: int) -> int: + if parent[x] != x: + parent[x] = find(parent[x]) + return parent[x] + + def union(x: int, y: int) -> None: + rx, ry = find(x), find(y) + if rx != ry: + parent[ry] = rx + + for i in range(n): + for j in range(i + 1, n): + a, b = items[i], items[j] + same_hdr = bool(a["hdr"] and a["hdr"] == b["hdr"]) + same_fp = bool(a["fp"] and a["fp"] == b["fp"]) + if same_hdr or same_fp: + union(i, j) + + clusters: dict[int, list[int]] = {} + for i in range(n): + r = find(i) + clusters.setdefault(r, []).append(i) + + remove_spans: list[tuple[int, int]] = [] + for _root, idxs in clusters.items(): + if len(idxs) < 2: + continue + idxs_sorted = sorted(idxs, key=lambda ii: items[ii]["m"].start()) + caps = [ii for ii in idxs_sorted if items[ii]["cap"]] + keep_idx = caps[0] if caps else idxs_sorted[0] + for ii in idxs_sorted: + if ii == keep_idx: + continue + m = items[ii]["m"] + start = m.start() + prefix = text[:start].rstrip("\n") + last_nl = prefix.rfind("\n") + title_line = prefix[last_nl + 1 :] if last_nl >= 0 else prefix + tl = title_line.strip() + if "烷基化装置运行分析" in tl and re.search( + r"表4[--—–]2", re.sub(r"\s+", "", tl) + ): + start = last_nl + 1 if last_nl >= 0 else 0 + before = text[:start] + if before.rstrip().endswith("-->"): + comment_start = before.rstrip().rfind(""): + comment_start = before.rstrip().rfind(""): + comment_start = before.rstrip().rfind("\s*\n)*" + r"(?:\s*\|[^\n]+\|\s*\n)+", + 
flags=re.IGNORECASE, + ) + text = md1.sub("\n", text) + # 「###」独占行后再起表题(与 DOCX 导出兼容) + md2 = re.compile( + rf"(?:^|\n)(?:\s*#{{1,6}}\s*\n)+(?:\s*\n)*" + rf"(?:[^\n]*{kw8}[^\n]*\n(?:\s*[^\n]*{kwname}[^\n]*\n)?)" + r"(?:\s*\n|\s*\n)*" + r"(?:\s*\|[^\n]+\|\s*\n)+", + flags=re.IGNORECASE, + ) + text = md2.sub("\n", text) + html_pat = re.compile( + rf"(?:^|\n)[^\n]*{kw8}[^\n]*{kwname}[^\n]*\n" + r"(?:\s*\n|\s*\n)*" + r"\s*[\s\S]*?
", + flags=re.IGNORECASE, + ) + text = html_pat.sub("\n", text) + title_only = re.compile( + rf"(?:^|\n)(?:\s*#{{1,6}}\s*\n)+(?:\s*\n)*[^\n]*{kw8}[^\n]*(?:{kwname})?[^\n]*(?=\n|$)", + flags=re.IGNORECASE, + ) + text = title_only.sub("\n", text) + title_only2 = re.compile( + rf"(?:^|\n)\s*[#>*\-\d\.\)()\s]*[^\n]*{kw8}[^\n]*{kwname}[^\n]*(?=\n|$)", + flags=re.IGNORECASE, + ) + text = title_only2.sub("\n", text) + return re.sub(r"\n{3,}", "\n\n", text).strip() + + +def _pipe_markdown_row_cells(line: str) -> list[str]: + raw = str(line or "").rstrip("\n") + s = raw.strip() + if not s.startswith("|") or not s.endswith("|"): + return [] + inner = s[1:-1] + return [p.strip() for p in inner.split("|")] + + +def _strip_md_cell_noise(s: str) -> str: + t = re.sub(r"\*+", "", str(s or "")) + t = re.sub(r"", "", t, flags=re.I) + return t.strip() + + +def _strip_532_table55_bad_markdown_columns(content: str) -> str: + """去掉正文中「表5-5 主要生产经营指标」Markdown 表的多余列(如「后评价-时点点后预测值」及冗余裸预测列)。""" + text = str(content or "") + if not text or "主要生产经营指标" not in text: + return text + fc = "后评价时点后预测值" + + def _bad_header_indices(header_cells: list[str]) -> set[int]: + bad: set[int] = set() + comp_cells = [_compact_zh_ident(_strip_md_cell_noise(h)) for h in header_cells] + has_slot = False + for i, h in enumerate(header_cells): + hs = str(h or "") + parts = _split_group_year_col_key(hs) + if parts and parts[0] == fc: + tail = parts[1].strip() + if _appendix_norm_year_tail(tail) or ( + _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(tail) and 1900 <= int(tail) <= 2100 + ): + has_slot = True + break + if re.search(r"后评价时点后预测值\s*[--—–]\s*\d{4}", hs): + has_slot = True + break + fc_c = _compact_zh_ident(fc) + for i, c in enumerate(comp_cells): + if "时点点后" in c: + bad.add(i) + if has_slot and c == fc_c: + bad.add(i) + return bad + + def _drop_cols_from_pipe_block(block: str, drop_idx: set[int]) -> str: + if not drop_idx: + return block + out_lines: list[str] = [] + for ln in block.splitlines(): + if not 
ln.strip().startswith("|"): + out_lines.append(ln) + continue + cells = _pipe_markdown_row_cells(ln) + if not cells: + out_lines.append(ln) + continue + new_cells = [c for j, c in enumerate(cells) if j not in drop_idx] + if len(new_cells) == len(cells): + out_lines.append(ln) + else: + out_lines.append("| " + " | ".join(new_cells) + " |") + return "\n".join(out_lines) + + rx = re.compile( + r"((?:^|\n)[^\n]*表\s*5\s*[--\..·]\s*5[^\n]*主要生产经营指标[^\n]*\n)" + r"(?:\s*\n|\s*\n)*" + r"((?:^[ \t]*\|[^\n]+\|\s*\n)+)", + flags=re.MULTILINE | re.IGNORECASE, + ) + + def _repl(m: re.Match) -> str: + prefix, body = m.group(1), m.group(2) + tbl_lines = [ + ln + for ln in body.splitlines() + if ln.strip().startswith("|") and ln.strip().endswith("|") + ] + if len(tbl_lines) < 2: + return m.group(0) + hdr = _pipe_markdown_row_cells(tbl_lines[0]) + if not hdr: + return m.group(0) + drop = _bad_header_indices(hdr) + if not drop: + return m.group(0) + return prefix + _drop_cols_from_pipe_block(body, drop) + + return rx.sub(_repl, text) + + +def _cleanup_section_table_artifacts( + section_title: str, + content: str, + *, + allowed_table_tokens: Optional[list[str]] = None, +) -> str: + section_no = _extract_section_number(str(section_title or "")) + text = _strip_unallowed_table_references( + str(content or ""), + allowed_table_tokens=allowed_table_tokens, + ) + # 末尾兜底:防止中间步骤再次引入 4.3.2/4.3.3 串表。 + if section_no in {"4.3.2", "4.3.3"}: + text = _remove_cross_section_table_pollution(section_title, text) + if section_no == "4.3.3": + text = _dedupe_433_alkylation_operation_analysis_markdown_tables(text) + if section_no == "5.3.2": + text = _strip_532_embedded_appendix8_table(text) + text = _strip_532_table55_bad_markdown_columns(text) + if section_no == "3.3.1": + return _strip_331_table_crossrefs(text) + if section_no in {"2.1.5", "3.3.2", "3.3.4", "5.1", "5.3.1", "5.3.2"}: + text = _finalize_section_table_dedupe(text, allowed_table_tokens) + if section_no == "3.3.3": + return 
_strip_333_trailing_table_caption_lines(text) + if section_no == "3.4.1": + return _strip_341_table_artifacts(text) + if section_no == "2.1.1": + text = _strip_211_stray_table_261(text) + text = _dedupe_211_duplicate_markdown_tables(text) + if section_no == "5.2.1": + text = _fix_521_table52_wrong_caption(text) + text = _strip_521_spurious_llm_table52(text) + text = _finalize_section_table_dedupe(text, allowed_table_tokens) + if section_no == "5.3.1": + text = _strip_531_spurious_llm_table(text) + if section_no == "5.4": + text = _strip_54_spurious_llm_table(text) + return text + + +def _chapter5_opening_heading_present(text: str) -> bool: + """判断正文块是否已以第5章章题开头(「5 投资…」与第1章「1 项目概况」同体例,便于前端提升为 ##)。""" + t = str(text or "").strip() + if not t: + return False + first = t.split("\n", 1)[0].strip() + if first.startswith("#"): + first = first.lstrip("#").strip() + if "第5章" in first and "投资与经济效益评价" in first: + return True + return bool(re.match(r"^5\s+投资与经济效益评价", first)) + + +def _canonicalize_chapter5_shell_heading_line(text: str) -> str: + """ + 将独立行的「第5章 投资与经济效益评价」规范为「5 投资与经济效益评价」, + 与模板第1章及 promoteNumberedHeadingLinesToMarkdown(## 章级)一致。 + """ + lines = str(text or "").split("\n") + out: list[str] = [] + replaced = False + for line in lines: + if not replaced and line.strip(): + stripped = line.strip().lstrip("#").strip() + if stripped == "第5章 投资与经济效益评价" or ( + stripped.startswith("第5章") and "投资与经济效益评价" in stripped + ): + out.append("5 投资与经济效益评价") + replaced = True + continue + out.append(line) + return "\n".join(out) + + +def _section_heading_present_in_text(text: str, heading_title: str) -> bool: + """判断正文中是否已出现指定节标题行。""" + if not str(text or "").strip() or not str(heading_title or "").strip(): + return False + target_norm = _title_compare_norm(heading_title) + section_no = _extract_section_number(heading_title) + for line in str(text).splitlines(): + stripped = line.strip() + if not stripped: + continue + plain = stripped.lstrip("#").strip() + if 
_title_compare_norm(plain) == target_norm: + return True + if section_no and _is_heading_line_for_section(plain, section_no): + return True + if section_no == "5" and _chapter5_opening_heading_present(text): + return True + return False + + +def _inject_missing_parent_section_headings( + section_title: str, + content: str, + previous_section_content: str, + chapter_title_map: dict[str, str], +) -> str: + """ + 仅生成叶子节时,父节壳(如 5.2、5.3、5)不会单独落库;在首个子节(x.y.1)前补足父节标题。 + """ + if not chapter_title_map: + return content + + stub = SimpleNamespace(section_title=str(section_title or "").strip()) + ancestors = _resolve_ancestor_titles_for_section(stub, chapter_title_map) + if not ancestors: + return content + + body = str(content or "").strip() + if not body: + return content + if _extract_section_number(section_title or "") == "5.1": + body = _canonicalize_chapter5_shell_heading_line(body) + + prior = str(previous_section_content or "") + missing: list[str] = [] + for anc in ancestors: + if _section_heading_present_in_text(body, anc): + continue + if _section_heading_present_in_text(prior, anc): + continue + missing.append(anc) + if not missing: + return body + return "\n\n".join(missing + [body]).strip() + + +def _inject_chapter5_title_before_section_51( + section_key: str, + content: str, + previous_section_content: str, + *, + section_title: str = "", + chapter_title_map: Optional[dict[str, str]] = None, +) -> str: + """兼容旧调用;优先走通用父节标题注入。""" + if chapter_title_map and section_title: + return _inject_missing_parent_section_headings( + section_title, content, previous_section_content, chapter_title_map + ) + if str(section_key or "").strip() != "5-1": + return content + body = _canonicalize_chapter5_shell_heading_line(str(content or "").strip()) + if not body: + return content + if _chapter5_opening_heading_present(body): + return body + if _chapter5_opening_heading_present(previous_section_content): + return body + return f"5 投资与经济效益评价\n\n{body}" + + +def 
_previous_completed_section_content( + section: ReportTemplateSection, + sections: list[ReportTemplateSection], + completed_contents: dict[str, str], +) -> str: + """按模板顺序取当前节之前最近一节已生成正文(用于父节标题是否已出现)。""" + ordered = list(sections or []) + try: + idx = next(i for i, s in enumerate(ordered) if s.section_key == section.section_key) + except StopIteration: + return "" + for j in range(idx - 1, -1, -1): + body = str(completed_contents.get(ordered[j].section_key) or "").strip() + if body: + return body + return "" + + +def _prev_line_invites_metric_continuation(prev_line: str) -> bool: + """上一行是否像在句中被截断、下一行应以能耗/物耗数值续写。""" + s = str(prev_line or "").strip() + if not s: + return False + if s.startswith("|"): + return False + if re.match(r"^\s{0,3}#{0,6}\s*\d+(?:\.\d+)+\s+[\u4e00-\u9fff]", s): + return False + if re.search(r"(?:\[\d+\]\s*)+$", s): + return True + if re.search( + r"(?:单耗|电耗|能耗|水耗|物耗|损失|运行值|设计值|加工量|负荷|占比)为?" + r"\s*(?:\[\d+\]\s*)*$", + s, + ): + return True + if re.search( + r"(上升至|升至|降至|下降为|提高为|降低为|为|达到|至)\s*(?:\[\d+\]\s*)*$", + s, + ): + return True + return bool(re.search(r"[至为是到]$", s)) + + +def _merge_orphan_energy_metric_lines(text: str) -> str: + """ + 将误断成独立行的能耗/物耗数值片段并回上一行正文。 + 例:「…可研报告 [50]\\n132.41 MJ/t产品及初步设计」→ 合并为一行,避免前端误判为 ### 标题。 + """ + from services.docx_export_service import _is_likely_section_number + + metric_re = re.compile( + r"^\s*(?:#{1,6}\s+)?(\d+(?:\.\d+)?)\s+(MJ/t|kWh/t|kgce/t|t产品)", + re.IGNORECASE, + ) + lines = str(text or "").split("\n") + out: list[str] = [] + for line in lines: + stripped = re.sub(r"^#{1,6}\s+", "", str(line or "").strip()) + m = metric_re.match(stripped) + if m and not _is_likely_section_number(m.group(1)): + prev = len(out) - 1 + while prev >= 0 and not str(out[prev] or "").strip(): + prev -= 1 + if prev >= 0 and _prev_line_invites_metric_continuation(str(out[prev] or "")): + out[prev] = out[prev].rstrip() + stripped + continue + out.append(line) + return "\n".join(out) + + +def 
_fix_numeric_line_breaks(content: str) -> str: + """ + 修复数字与单位/日期在换行处被意外拆分的问题。 + 仅合并明显数字语义连续场景,尽量不影响正常段落换行。 + """ + text = str(content or "") + if not text: + return text + # 统一各种换行分隔符,避免 \u2028/\u2029 导致规则失效 + text = text.replace("\r\n", "\n").replace("\r", "\n") + text = text.replace("\u2028", "\n").replace("\u2029", "\n") + + # 保护 Markdown 表格行之间及表格行与后续正文之间的换行, + # 否则数字合并规则会把表格末行和下一行粘在一起变成多余列。 + # 策略:按行拆分,识别所有表格行(以 | 开头或以 | 结尾), + # 将其前后换行替换为保护占位符,合并规则处理完毕后再恢复。 + table_nl_token = "\u0000TABLE_NL\u0000" + _lines = text.split("\n") + for _li in range(len(_lines)): + _stripped = _lines[_li].strip() + _is_table = _stripped.startswith("|") or _stripped.endswith("|") + if _is_table: + _lines[_li] = table_nl_token + _lines[_li] + table_nl_token + text = "\n".join(_lines) + text = text.replace(table_nl_token + "\n" + table_nl_token, table_nl_token) + text = text.replace("\n" + table_nl_token, table_nl_token) + text = text.replace(table_nl_token + "\n", table_nl_token) + + # 先保护“章节标题换行”(如:1 项目概况\n1.1 项目基本情况),避免被数字合并规则误伤。 + heading_nl_token = "\u0000HEADING_NL\u0000" + text = re.sub( + r"\n(?=\s*\d+(?:\.\d+)*\s+[\u4e00-\u9fff]{2,}(?:\s|$))", + heading_nl_token, + text, + ) + + # 数字/中文与下一行之间的合并:换行两侧仅允许水平空白(不含 \\n), + # 否则 \\s* 会吞掉段落空行的第一个 \\n,使 \\n(?!\\n) 失效,误把「标题\\n\\n2017年…」粘回一行。 + _hsp = r"[ \t\u3000]*" + # 例:2018 年 11 月\n4 日、24.48\n%、1906\nm2、0.05\ng + text = re.sub( + rf"(?<=\d){_hsp}\n(?!\n){_hsp}(?=(?:\d|[年月日时分秒度%%℃吨米台套项]|[A-Za-z]))", + "", + text, + ) + # 例:烈度\n7 度、规模\n15 万吨/年(中文描述后接数字) + # 仅在“下一行是数字 + 常见单位/量纲”时合并,避免误伤编号列表(如 1)/1.) 
+ text = re.sub( + rf"(?<=[\u4e00-\u9fff]){_hsp}\n(?!\n){_hsp}(?=\d+(?:\.\d+)?\s*(?![))、.])(?:[年月日时分秒度%%℃吨米台套项个级亩万亿千百十gGlLmMkKvVaAwWhHzHPp]|[A-Za-z]{{1,4}}\b))", + "", + text, + ) + # 例:106万\n工时、15万吨/年\n烷基化项目(数量级后接中文语义单位) + text = re.sub( + rf"(?<=[\d万亿千百十]){_hsp}\n(?!\n){_hsp}(?=(?:工时|吨/年|万吨/年|亿元|万元|万人|m2|m3|m²|m³|项|台|套|个|座|处|条|次|年|月|日))", + "", + text, + flags=re.IGNORECASE, + ) + # 例:kgEo/\nt、m\n2 等单位被拆分 + text = re.sub(r"(?<=[A-Za-z/])\s*\n\s*(?=\d)", "", text) + text = re.sub(r"(?<=[A-Za-z])\s*\n\s*(?=[A-Za-z])", "", text) + # 例:实际运行值为\n137.88 MJ/t;…单耗为 [93][94]\n\n1.38 MJ/t(2.1.1/2.1.6 常见断行) + _metric_num = r"\d+(?:\.\d+)?\s*(?:MJ/t|kWh/t|kgce/t|t产品)" + text = re.sub( + rf"((?:\[\d+\]\s*)+)\s*\n+\s*({_metric_num})", + r"\1 \2", + text, + flags=re.IGNORECASE, + ) + text = re.sub( + rf"(?<=[\u4e00-\u9fff)\])])\s*\n+\s*({_metric_num})", + r" \1", + text, + flags=re.IGNORECASE, + ) + text = _merge_orphan_energy_metric_lines(text) + + # 统一面积/体积单位写法:m2/m3 -> m²/m³(兼容空格、大小写、^ 写法) + text = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "m²", text) + text = re.sub(r"(?i)\bm\s*(?:\^?\s*3)\b", "m³", text) + text = text.replace(heading_nl_token, "\n") + text = text.replace(table_nl_token, "\n") + return text + + +def _canonical_global_table_name_for_token(token: str) -> str | None: + t = str(token or "").strip() + if not t: + return None + for name in MULTI_COLUMN_GLOBAL_SPECS: + if _table_token_matches_name(t, name): + return name + return None + + +def _skeleton_markdown_for_table_token(token: str, *, table_name: str = "") -> str: + """按细则模版生成占位 Markdown 表体(要素无有效单元格时仍保证表3-4 等有表体)。""" + full_name = _canonical_global_table_name_for_token(token) or "" + if not full_name: + tn = str(table_name or "").strip() + if _multi_column_global_spec_for_table(tn): + full_name = tn + elif tn: + full_name = _canonical_global_table_name_for_token(tn) or tn + spec = _multi_column_global_spec_for_table(full_name) + if not spec: + return "" + row_order = global_table_row_keys(full_name) + if not 
row_order: + return "" + col_order = [str(c).strip() for c in (spec[0] or []) if str(c).strip()] + if not col_order: + return "" + md, _ = _render_markdown_table(full_name, row_order, col_order, {}) + return str(md or "").strip() + + +def _authoritative_block_for_required_table(token: str, evidence: dict) -> str | None: + """要素直出整块:优先 structuredTables 中的 markdown,否则用模版骨架表。""" + table_rows = evidence.get("structuredTables") if isinstance(evidence, dict) else [] + title = str(token or "").strip() + md = "" + if isinstance(table_rows, list): + best_row: dict | None = None + best_row_score = -1 + for row in table_rows: + if not isinstance(row, dict): + continue + token_hit = str(row.get("token") or "") + table_name_hit = str(row.get("tableName") or "") + if _table_token_matches_name(token, token_hit) or _table_token_matches_name( + token, table_name_hit + ): + row_md = str(row.get("markdown") or "").strip() + if not row_md: + continue + row_score = ( + _score_structured_table_hit_dict(row) + if _table_token_matches_name(token, "表5-4") + else len(row_md) + ) + if row_score > best_row_score: + best_row_score = row_score + best_row = row + if best_row: + md = str(best_row.get("markdown") or "").strip() + title = str(best_row.get("tableName") or token).strip() or token + if not md: + sk = _skeleton_markdown_for_table_token(token, table_name=title) + if sk: + md = sk + canon = _canonical_global_table_name_for_token(token) + if canon: + title = canon + if not md: + return None + return ( + f"{title}\n\n" + "\n" + f"{md}" + ) + + +def _fill_required_table_caption_stubs( + content: str, required_tables: list[str], evidence: dict +) -> str: + """将仅有表题、段内无 Markdown 表体的必需表替换为要素直出或模版骨架。""" + text = str(content or "") + changed = False + for token in required_tables or []: + if not _table_token_caption_line_re(token).search(text): + continue + seg = _segment_after_table_caption(text, token) + if _segment_has_markdown_table_body(seg): + seg_tbl = 
re.search(r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", seg) + if not ( + _table_token_matches_name(token, "表5-4") + and seg_tbl + and _is_table54_simplified_extract_body(seg_tbl.group(0)) + ): + continue + block = _authoritative_block_for_required_table(token, evidence) + if not block: + continue + text = _replace_caption_stub_with_authoritative_table(text, token, block) + changed = True + return text if changed else content + + +def _append_structured_missing_tables(content: str, missing_tables: list[str], evidence: dict) -> str: + out_content = str(content or "").rstrip() + used = False + for token in missing_tables: + block = _authoritative_block_for_required_table(token, evidence) + if not block: + continue + if _table_token_caption_line_re(token).search(out_content): + out_content = _replace_caption_stub_with_authoritative_table( + out_content, token, block + ) + else: + out_content = out_content + "\n\n" + block + used = True + return out_content.strip() if used else content + + +def _replace_llm_table_with_authoritative(content: str, token: str, replacement_md: str) -> str: + """将 LLM 自行生成的同 token 表格(表题行 + 表格体)替换为要素管理直出内容。 + + 关键:管道行匹配使用 ``[ \\t]*`` 而非 ``\\s*``,防止 ``\\s`` 跨越空行 + 把分析文字中的 token 引用误关联到远处另一张表的管道行。 + 表题行与首条管道行之间允许至多一个空行(``\\n?``)。 + """ + token_plain = re.sub(r"\s+", "", str(token or "")) + if not token_plain or not replacement_md: + return content + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + md_table_pat = re.compile( + r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n)" + r"(\n?(?:[ \t]*\|[^\n]+\|[ \t]*\n)+)", + flags=re.IGNORECASE, + ) + m = md_table_pat.search(content) + if m: + return content[:m.start()] + "\n" + replacement_md + "\n\n" + content[m.end():].lstrip("\n") + html_table_pat = re.compile( + r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n)" + r"(\s*[\s\S]*?
)", + flags=re.IGNORECASE, + ) + m = html_table_pat.search(content) + if m: + return content[:m.start()] + "\n" + replacement_md + "\n\n" + content[m.end():].lstrip("\n") + return content + + +def _caption_followed_by_element_table_comment(content: str, token: str) -> bool: + """仅当「本表表题行后」紧跟要素直出注释时,才视为已权威化,避免全篇任一注释误伤其它表的替换。""" + token_plain = re.sub(r"\s+", "", str(token or "")) + if not token_plain: + return False + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + return bool( + re.search( + r"(?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n" + r"(?:[ \t]*\n)?[ \t]*\n" + f"{hit_md}" + ) + if _table_token_exists(out, token_n): + out = _replace_llm_table_with_authoritative(out, token_n, rep) + return out + + +def _append_authoritative_required_tables(content: str, required_tables: list[str], evidence: dict) -> str: + """ + 为模板必需表追加"要素表直出"块,确保表格数据直接来自结构化要素表。 + 若 LLM 已自行生成了同 token 的表格,用要素管理数据替换之。 + """ + if not required_tables: + return content + + out_content = str(content or "") + used = False + for token in required_tables: + already_authoritative = ( + _caption_followed_by_element_table_comment(out_content, token) + and _table_token_exists(out_content, token) + ) + if already_authoritative: + continue + combined_md = _authoritative_block_for_required_table(token, evidence) + if not combined_md: + continue + if _table_token_exists(out_content, token): + replaced = _replace_llm_table_with_authoritative(out_content, token, combined_md) + out_content = ( + replaced + if replaced != out_content + else _replace_caption_stub_with_authoritative_table( + out_content, token, combined_md + ) + ) + elif _table_token_caption_line_re(token).search(out_content): + out_content = _replace_caption_stub_with_authoritative_table( + out_content, token, combined_md + ) + else: + out_content = out_content.rstrip() + "\n\n" + combined_md + used = True + return out_content.strip() if used else content + + +def _is_effective_markdown_table_block(md_block: str) -> bool: + lines = 
[str(ln or "").strip() for ln in str(md_block or "").splitlines() if str(ln or "").strip()] + if len(lines) < 3: + return False + if _is_pipe_markdown_table_separator_line(lines[0]): + return False + if not _is_pipe_markdown_table_separator_line(lines[1]): + return False + data_rows = [ + ln for ln in lines[2:] + if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln) + ] + return bool(data_rows) + + +def _ensure_required_structured_tables_integrity(content: str, required_tables: list[str], evidence: dict) -> str: + if not required_tables: + return content + repaired = str(content or "") + + for token in required_tables: + authoritative_block = _authoritative_block_for_required_table(token, evidence) + if not authoritative_block: + continue + + if _table_token_caption_line_re(token).search(repaired): + seg = _segment_after_table_caption(repaired, token) + if not _segment_has_markdown_table_body(seg): + repaired = _replace_caption_stub_with_authoritative_table( + repaired, token, authoritative_block + ) + continue + if _table_token_matches_name(token, "表5-4"): + m_seg = re.search( + r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", seg + ) + if m_seg and _is_table54_simplified_extract_body(m_seg.group(0)): + repaired = _replace_caption_stub_with_authoritative_table( + repaired, token, authoritative_block + ) + continue + + token_plain = re.sub(r"\s+", "", str(token or "")) + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + table_pat = re.compile( + r"((?:^|\n)[^\n]*?" 
+ token_re + r"[^\n]*\n(?:\n|[ \t]*[ \t]*\n)*)" + r"((?:[ \t]*\|[^\n]*\|[ \t]*\n)+)", + flags=re.IGNORECASE, + ) + m = table_pat.search(repaired) + if m: + cur_table = str(m.group(2) or "") + need_replace = not _is_effective_markdown_table_block(cur_table) + if _table_token_matches_name(token, "表5-4") and _is_table54_simplified_extract_body( + cur_table + ): + need_replace = True + if need_replace: + repaired = ( + repaired[:m.start()] + + "\n" + + authoritative_block + + "\n\n" + + repaired[m.end():].lstrip("\n") + ) + elif not _table_token_exists(repaired, token): + repaired = repaired.rstrip() + "\n\n" + authoritative_block + + return repaired.strip() + + +def _collect_structured_tables( + db: Session, + project_uuid: str, + required_tables: list[str], + *, + section_title: str, + section_tokens: list[str], +) -> list[dict]: + """ + 报告生成阶段的结构化表来源必须与“要素管理”一致(element_tables/element_cells)。 + + 规则: + - 若模板 prompt 中声明了必需表(如 表2-1/附表8),优先按 token 精准匹配; + - 若未声明必需表,或声明了但匹配不到,则按章节标题/关键词从要素管理中选取最相关的表直出, + 避免模型自行编造表格。 + """ + + def _table_relevance_score(table_name: str) -> int: + name = str(table_name or "").strip() + if not name: + return 0 + name_l = name.lower() + score = 0 + # 章节标题强相关加权 + t = str(section_title or "").strip() + if t and t in name: + score += 10 + # token 命中加分 + for tok in (section_tokens or [])[:20]: + tt = str(tok or "").strip() + if not tt: + continue + if tt.lower() in name_l: + score += 2 + # 常见表名关键字(表/附表/对比/评价)做轻微加权,便于优先输出真正的表 + if any(k in name for k in ("表", "附表", "对比", "评价", "评分")): + score += 1 + return score + + tables: list[ElementTable] = ( + db.query(ElementTable) + .filter(ElementTable.project_id == project_uuid) + .order_by(ElementTable.sort_order.asc(), ElementTable.updated_at.desc()) + .all() + ) + if not tables: + return [] + + # 5.3.2:正文仅需表5-5/表5-6;附表8 归入全书「## 附表」,勿纳入本节结构化证据。 + if _extract_section_number(str(section_title or "")) == "5.3.2": + tables = [ + t + for t in tables + if not ( + ("附表8" in str(t.table_name or "")) + and 
("可研报告和后评价参数对比表" in str(t.table_name or "")) + ) + ] + if not tables: + return [] + + # 1) 必需表:按 token/表名匹配(尽量“直接用要素管理中的表”) + required_norm = [_norm_table_token(t) for t in (required_tables or []) if _norm_table_token(t)] + required_hits: list[ElementTable] = [] + if required_norm: + for t in tables: + name_norm = _norm_table_token(t.table_name) + if not name_norm: + continue + if any(req and _table_token_matches_name(req, name_norm, normalized=True) for req in required_norm): + required_hits.append(t) + if required_hits: + req_hint_words: list[str] = [] + for req in required_norm: + req_hint_words.extend(_TABLE_TOKEN_PREFERRED_NAME_HINTS.get(req, ())) + + def _required_hit_score(tb: ElementTable) -> tuple[int, int, int]: + tb_name = str(tb.table_name or "").strip() + hint_hit = 0 + if req_hint_words: + for hint in req_hint_words: + if hint and hint in tb_name: + hint_hit += 1 + collect = 0 + if any(_table_token_matches_name(req, "表5-4") for req in required_norm): + collect = _element_table_collect_score(db, tb, "表5-4") + return (collect, hint_hit, _table_relevance_score(tb_name)) + + required_hits.sort(key=_required_hit_score, reverse=True) + # 保留靠前若干张,避免同 token 多张历史表导致提示词爆炸 + required_hits = required_hits[:8] + + # 2) 回退:未声明必需表,或声明了但没匹配上时,按相关性挑选 + selected: list[ElementTable] = list(required_hits) + if not selected: + scored: list[tuple[int, ElementTable]] = [] + for t in tables: + s = _table_relevance_score(t.table_name) + if s > 0: + scored.append((s, t)) + scored.sort(key=lambda x: x[0], reverse=True) + selected = [t for _, t in scored[:4]] + + selected_has_time54 = any( + _is_table54_operating_benefit(str(t.table_name or "")) + and str(t.table_type or "").strip() == "time" + for t in selected + ) + + out: list[dict] = [] + for table in selected: + is_time = str(table.table_type or "").strip() == "time" + if is_time: + ty_row = db.query(ElementTable.year).filter(ElementTable.id == table.id).first() + tbl_y = ( + int(ty_row[0]) + if ty_row and ty_row[0] is not 
None and int(ty_row[0]) > 0 + else None + ) + year_items = _build_time_table_markdowns_by_year(db, table.id, table.table_name) + if _is_table54_operating_benefit(table.table_name): + picked = _pick_table54_year_markdown(year_items, table_year=tbl_y) + if picked: + year_items = [picked] + for display_name, md in year_items: + if not md: + continue + token = _extract_table_short_token(display_name) + out.append( + { + "tableId": table.id, + "tableName": display_name, + "token": token, + "markdown": md, + } + ) + if len(out) >= 12: + break + else: + if ( + selected_has_time54 + and _is_table54_operating_benefit(table.table_name) + ): + continue + md, common_unit = _build_structured_table_markdown(db, table.id, table.table_name) + display_name = _merge_table_title_with_common_unit(str(table.table_name or "").strip(), common_unit) + token = _extract_table_short_token(table.table_name) + if not md: + md = _skeleton_markdown_for_table_token( + token or display_name, table_name=display_name + ) + if not md: + continue + hit = { + "tableId": table.id, + "tableName": display_name, + "token": token, + "markdown": md, + } + if _is_table54_operating_benefit(table.table_name) and _is_table54_simplified_extract_body(md): + continue + out.append(hit) + if len(out) >= 12: + break + + t54_norm = _norm_table_token("表5-4") + t54_hits = [h for h in out if _norm_table_token(str(h.get("token") or "")) == t54_norm] + if t54_hits: + rest = [h for h in out if _norm_table_token(str(h.get("token") or "")) != t54_norm] + ranked = _dedupe_structured_table_hits(t54_hits) + out = rest + (ranked[:1] if ranked else []) + return out[:12] + + +def _table_2_5_general_layout_comparison_name(table_name: str) -> bool: + """与要素管理 quick-fill 表2-5 判定一致(表头用「项目名称」,不含依托对比)。""" + n = str(table_name or "") + if "依托" in n: + return False + return "表2-5" in n or "总图、储运、公用工程及辅助工程对比" in n + + +def _table_2_6_reliance_comparison_name(table_name: str) -> bool: + """与要素管理 quick-fill 
表2-6判定一致(行展示去「依托·」等类别前缀、表头用依托项目名称)。""" + n = str(table_name or "") + return ( + "表2-6" in n + or "储运、公用工程及辅助工程依托对比" in n + or "辅助工程依托对比" in n + ) + + +def _table_3_1_contracting_units_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-1判定一致(行展示去「承包单元·」前缀、表头用单元名称)。""" + n = str(table_name or "") + return "表3-1" in n or "项目承包单位情况" in n + + +def _table_3_3_plantwide_design_change_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-3 判定一致(表头用「单元名称」)。""" + n = str(table_name or "") + return ("表3-3" in n or "施工图设计变更情况" in n) and "全厂" in n + + +def _table_3_4_single_unit_design_change_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-4 判定一致(表头用「专业」)。""" + n = str(table_name or "") + return ("表3-4" in n or "施工图设计变更情况" in n) and "单装置" in n + + +def _table_3_5_major_design_change_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-5判定一致(行展示去「重大变更·」前缀、表头用单元名称)。""" + n = str(table_name or "") + return "表3-5" in n or "影响投资或工期" in n + + +def _table_3_7_procurement_name(table_name: str) -> bool: + """与要素管理 quick-fill 表3-7判定一致(行展示去「采购物资·」前缀、表头用物资(类别)名称)。""" + n = str(table_name or "") + return "表3-7" in n or "采购工作情况" in n + + +def _table_4_2_alkylation_operation_analysis_name(table_name: str) -> bool: + """烷基化装置运行分析表(含历史误标为表4-1、用户改写考核日期后的表题)。""" + n = re.sub(r"\s+", "", str(table_name or "")) + return ("烷基化装置运行分析" in n) and ("考核时间" in n) + + +_TABLE42_ANALYSIS_TEMPLATE_NAME = "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)" + + +def _multi_column_global_spec_for_table(table_name: str): + """按表名取多列模版;表4-2 槽位改名后仍套用标准列序(单位/设计值/标定值/实际值)。""" + tn = str(table_name or "").strip() + spec = MULTI_COLUMN_GLOBAL_SPECS.get(tn) + if spec: + return spec + if _table_4_2_alkylation_operation_analysis_name(table_name): + return MULTI_COLUMN_GLOBAL_SPECS.get(_TABLE42_ANALYSIS_TEMPLATE_NAME) + return None + + +def _element_manage_row_label_after_first_dot(label: str) -> str: + """与 quick-fill.js parseRowKeyForDisplay 一致:去掉行键第一个「…·」段(仅作展示)。""" + s = str(label or "").strip() + if "\u00b7" in s: + 
rest = "\u00b7".join(s.split("\u00b7", 1)[1:]).strip() + return rest if rest else s + return s + + +def _element_manage_table_row_display_label(table_name: str, label: str) -> str: + """表2-6/表3-1/表3-5/表3-7/表4-2 等与要素管理行名展示对齐(库内 row_key 仍保留类别前缀)。""" + if ( + _table_2_6_reliance_comparison_name(table_name) + or _table_3_1_contracting_units_name(table_name) + or _table_3_5_major_design_change_name(table_name) + or _table_3_7_procurement_name(table_name) + or _table_4_2_alkylation_operation_analysis_name(table_name) + ): + return _element_manage_row_label_after_first_dot(label) + return str(label or "").strip() + + +def _row_header_name_for_table(table_name: str) -> str: + name = str(table_name or "") + if "产品方案对比表" in name: + return "产品" + if "原料数量及组成对比表" in name: + return "原料名称" + if "原料)性质对比表" in name or "原料性质对比表" in name: + return "名称" + if _table_2_5_general_layout_comparison_name(name): + return "项目名称" + if _table_2_6_reliance_comparison_name(name): + return "依托项目名称" + if _table_3_3_plantwide_design_change_name(name): + return "单元名称" + if _table_3_4_single_unit_design_change_name(name): + return "专业" + if _table_3_1_contracting_units_name(name) or _table_3_5_major_design_change_name(name): + return "单元名称" + if _table_3_7_procurement_name(name): + return "物资(类别)名称" + if _table_4_2_alkylation_operation_analysis_name(name): + return "项目名称" + if ( + _table52_investment_change_name(name) + or _table53_engineering_cost_change_name(name) + or _appendix2_investment_structure_name(name) + ): + return "工程或费用名称" + return "项目" + + +_GROUP_HEADER_PREFIXES = [ + "可研报告", "可研", "初步设计", "实际生产", "实际运行", "实际实施", "后评价", +] +# 含完整时点组名,避免「后评价时点后预测值」被前缀「后评价」误拆成两行表头 +_GROUP_HEADER_EXACT = {p for p in _GROUP_HEADER_PREFIXES} | { + "后评价时点前实际值", + "后评价时点后预测值", +} + + +def _group_column_headers(col_order: list[str]) -> tuple[list[str], list[str]] | None: + """识别多级列头。仅当列名形如"可研报告数量(万吨)"时拆分为 top=可研报告 sub=数量(万吨)。 + 独立列名(如"可研报告""初步设计""实际实施")不视为分组,避免误拆。""" + top_headers: list[str] = [] + 
sub_headers: list[str] = [] + has_group = False + for col in col_order: + text = str(col or "").strip() + if not text: + top_headers.append("") + sub_headers.append("") + continue + # 列键笔误「…时点点后…」勿按「后评价」前缀拆分,否则 Markdown 展平成「后评价-时点点后…」。 + if "时点点后" in text: + has_group = True + top_headers.append("") + sub_headers.append(text.replace("时点点后", "时点后", 1)) + continue + # 附表3~7、表5-5:列键为「后评价时点后预测值|2021」等,必须在「后评价」前缀规则之前处理, + # 否则会拆成 top=后评价、sub=时点后预测值|2021,Markdown 单行表头与按 col_key 取值的列错位。 + pipe_sep = "|" if "|" in text else ("\uff5c" if "\uff5c" in text else None) + if pipe_sep is not None: + group, tail = text.split(pipe_sep, 1) + group, tail = group.strip(), tail.strip() + if group in _APPENDIX_TIME_SLOT_GROUPS and tail: + has_group = True + top_headers.append(group) + sub_headers.append(tail) + continue + # 表5-4:「可研报告|××年#1」等不得按「可研报告」前缀拆成「可研报告-|××年#1」 + if group in _TABLE54_PIPE_METRIC_PREFIXES and tail: + has_group = True + top_headers.append("") + sub_headers.append(text) + continue + if text in _GROUP_HEADER_EXACT: + top_headers.append("") + sub_headers.append(text) + continue + matched = False + for prefix in _GROUP_HEADER_PREFIXES: + if text.startswith(prefix) and len(text) > len(prefix): + suffix = text[len(prefix):].strip() + if suffix: + has_group = True + top_headers.append(prefix) + sub_headers.append(suffix) + matched = True + break + if matched: + continue + if "·" in text: + has_group = True + left, right = [part.strip() for part in text.split("·", 1)] + top_headers.append(left) + sub_headers.append(right) + continue + top_headers.append("") + sub_headers.append(text) + return (top_headers, sub_headers) if has_group else None + + +def _table51_main_economic_indicators_name(table_name: str) -> bool: + n = str(table_name or "") + return "表5-1" in n and "主要经济指标对比" in n + + +def _table52_investment_change_name(table_name: str) -> bool: + n = str(table_name or "") + return "表5-2" in n and "投资变动情况表" in n + + +def _table53_engineering_cost_change_name(table_name: 
str) -> bool: + n = str(table_name or "") + return "表5-3" in n and "工程费用变动情况表" in n + + +def _appendix2_investment_structure_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表2" in n and "项目竣工决算投资构成表" in n + + +def _appendix3_cashflow_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表3" in n and "项目投资财务现金流量表" in n + + +def _appendix4_profit_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表4" in n and "利润与利润分配计算表" in n + + +def _appendix5_revenue_tax_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表5" in n and "营业收入与营业税金及附加计算表" in n + + +def _appendix6_cost_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表6" in n and "总成本费用计算表" in n + + +def _appendix7_materials_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表7" in n and "原材料、燃料及动力费用计算表" in n + + +def _appendix8_param_name(table_name: str) -> bool: + n = str(table_name or "") + return "附表8" in n and "可研报告和后评价参数对比表" in n + + +def _appendix_time_table_name(table_name: str) -> bool: + return ( + _appendix3_cashflow_name(table_name) + or _appendix4_profit_name(table_name) + or _appendix5_revenue_tax_name(table_name) + or _appendix6_cost_name(table_name) + or _appendix7_materials_name(table_name) + ) + + +def _table_row_seq_name_split_display(table_name: str) -> bool: + """投资/附表类表:项目列仅展示名称(序号另列,与要素管理一致)。""" + return ( + _table51_main_economic_indicators_name(table_name) + or _table52_investment_change_name(table_name) + or _table53_engineering_cost_change_name(table_name) + or _appendix2_investment_structure_name(table_name) + or _appendix_time_table_name(table_name) + or _appendix8_param_name(table_name) + ) + + +def _pick_row_key_with_legacy( + canon: str, row_set: set[str], legacy_map: dict[str, str] +) -> str | None: + """在库内实际 row_key 中选取规范键或其旧版别名(优先规范键)。""" + if canon in row_set: + return canon + for legacy, normalized in legacy_map.items(): + if normalized == canon and legacy in 
row_set: + return legacy + return None + + +def _order_rows_by_preferred( + row_order: list[str], + preferred: list[str], + *, + legacy_map: dict[str, str] | None = None, +) -> list[str]: + row_set = set(row_order) + ordered: list[str] = [] + seen: set[str] = set() + for canon in preferred: + picked: str | None + if legacy_map: + picked = _pick_row_key_with_legacy(canon, row_set, legacy_map) + else: + picked = canon if canon in row_set else None + if picked and picked not in seen: + seen.add(picked) + ordered.append(picked) + extras = sorted(rk for rk in row_order if rk not in seen) + return ordered + extras + + +def _legacy_map_for_table(table_name: str) -> dict[str, str] | None: + if _appendix2_investment_structure_name(table_name): + return APPENDIX2_LEGACY_ROW_KEY_MAP + if _appendix8_param_name(table_name): + return APPENDIX8_LEGACY_ROW_KEY_MAP + return None + + +def _apply_global_table_standard_row_order(table_name: str, row_order: list[str]) -> list[str]: + """表5-1/5-2/5-3、附表2~8:与要素管理、标准模版一致的标准行序。""" + if not row_order: + return row_order + tn = str(table_name or "") + row_set = set(row_order) + + preferred = canonical_row_order_for_table(tn) + if preferred is not None: + return _order_rows_by_preferred(row_order, preferred, legacy_map=_legacy_map_for_table(tn)) + + if _table53_engineering_cost_change_name(tn): + ordered: list[str] = [] + seen: set[str] = set() + for alts in TABLE_5_3_ROW_KEY_ALTERNATES: + picked: str | None = None + for rk in alts: + if rk in row_set: + picked = rk + break + if picked and picked not in seen: + seen.add(picked) + ordered.append(picked) + extras = sorted(rk for rk in row_order if rk not in seen) + return ordered + extras + + preferred: list[str] | None = None + if ( + _table51_main_economic_indicators_name(tn) + or _table52_investment_change_name(tn) + ): + preferred = global_table_row_keys(tn) + + if not preferred: + return row_order + + ordered = [rk for rk in preferred if rk in row_set] + seen = set(ordered) + extras = 
sorted(rk for rk in row_order if rk not in seen) + return ordered + extras + + +def _normalize_table_row_order(row_order: list[str], *, table_name: str = "") -> list[str]: + row_order = _apply_global_table_standard_row_order(table_name, row_order) + normal_rows: list[str] = [] + total_rows: list[str] = [] + for row in row_order: + text = str(row or "").strip() + compact = re.sub(r"\s+", "", text) + if compact in {"合计", "总计"}: + total_rows.append(row) + else: + normal_rows.append(row) + return normal_rows + total_rows + + +_BARE_FOUR_DIGIT_YEAR_COL = re.compile(r"^\d{4}$") +_APPENDIX_YEAR_TAIL_NORM = re.compile(r"^(\d{4})年?$") + + +def _appendix_norm_year_tail(tail: str) -> str | None: + """将列键尾部「2020」「2020年」规范为四位年字符串;非日历年返回 None。""" + t = str(tail or "").strip() + m = _APPENDIX_YEAR_TAIL_NORM.fullmatch(t) + if not m: + return None + y = m.group(1) + try: + yi = int(y) + except ValueError: + return None + if 1900 <= yi <= 2100: + return y + return None + + +def _filter_redundant_bare_year_columns(col_order: list[str]) -> list[str]: + """去掉与「组|年度」列重复的旧版纯年份列键(如模板历史同步遗留的 2019 / 2020)。""" + if not col_order: + return col_order + cols = [str(c).strip() for c in col_order if str(c or "").strip()] + if not cols: + return list(col_order) + years_from_piped: set[str] = set() + for c in cols: + if "|" not in c: + continue + tail = c.rsplit("|", 1)[-1].strip() + ny = _appendix_norm_year_tail(tail) + if ny: + years_from_piped.add(ny) + for tok in re.split(r"[\s,,、/-]+", tail): + t = tok.strip() + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t) and 1900 <= int(t) <= 2100: + years_from_piped.add(t) + if not years_from_piped: + return list(col_order) + out: list[str] = [] + for c in col_order: + cs = str(c or "").strip() + ny_bare = _appendix_norm_year_tail(cs) + if ny_bare and ny_bare in years_from_piped: + continue + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(cs) and cs in years_from_piped: + continue + out.append(c) + return out if out else list(col_order) + + +_APPENDIX_TIME_SLOT_GROUPS 
= frozenset({"建设期", "后评价时点前实际值", "后评价时点后预测值"}) + + +def _appendix_time_slot_group_tail_is_real(tail: str) -> bool: + """附表时间分组下子列是否为真实年份(YYYY / YYYY年)。""" + if _appendix_norm_year_tail(tail): + return True + t = (tail or "").strip() + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t): + try: + return 1900 <= int(t) <= 2100 + except ValueError: + return False + return False + + +def _bare_appendix_year_placeholder_col_key(s: str) -> bool: + """无竖线列键:末栏「××年」「xx年」等占位列(兼容 x/×/全角拉丁混写)。""" + t = str(s or "").strip() + if not t: + return False + if t in ("…", "..."): + return True + buf: list[str] = [] + for ch in t: + if ch in "xXxX": + buf.append("×") + elif ch == "\u00d7": + buf.append("×") + else: + buf.append(ch) + u = "".join(buf) + return bool(re.fullmatch(r"×{2}年(?:#\d+)?", u)) + + +def _filter_appendix_placeholder_slot_columns(col_order: list[str]) -> list[str]: + """某组下已有真实年份列时,该组内只保留 YYYY / YYYY年 子列,并去掉裸组名列与裸「××年」占位列。""" + if not col_order: + return col_order + groups_with_real_year: set[str] = set() + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + group, tail = cs.split("|", 1) + group, tail = group.strip(), tail.strip() + if group not in _APPENDIX_TIME_SLOT_GROUPS: + continue + if _appendix_time_slot_group_tail_is_real(tail): + groups_with_real_year.add(group) + if not groups_with_real_year: + return list(col_order) + out: list[str] = [] + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + if cs in groups_with_real_year: + continue + if _bare_appendix_year_placeholder_col_key(cs): + continue + out.append(c) + continue + group, tail = cs.split("|", 1) + group, tail = group.strip(), tail.strip() + if group in groups_with_real_year: + if _appendix_time_slot_group_tail_is_real(tail): + out.append(c) + continue + out.append(c) + return out if out else list(col_order) + + +def _filter_appendix3_summary_duplicate_forecast_years(table_name: str, col_order: list[str]) -> list[str]: + 
"""附表3:「建设期」「时点前」下与「后评价时点后预测值」同年栏重复时去掉,避免表尾多出 2019/2020 等重复列。""" + tn = str(table_name or "").strip() + if "附表3" not in tn or "项目投资财务现金流量" not in tn: + return col_order + forecast_g = "后评价时点后预测值" + summary_gs = frozenset({"建设期", "后评价时点前实际值"}) + fy: set[str] = set() + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + g, tail = cs.split("|", 1) + if g.strip() != forecast_g: + continue + ny = _appendix_norm_year_tail(tail.strip()) + if ny: + fy.add(ny) + if not fy: + return col_order + drop: set[str] = set() + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + g, tail = cs.split("|", 1) + g, tail = g.strip(), tail.strip() + if g not in summary_gs: + continue + ny = _appendix_norm_year_tail(tail) + if ny and ny in fy: + drop.add(cs) + if not drop: + return col_order + out = [c for c in col_order if str(c).strip() not in drop] + return out if out else list(col_order) + + +def _filter_appendix3_placeholders_when_forecast_has_real_year(table_name: str, col_order: list[str]) -> list[str]: + """附表3:后评价时点后预测值已有 YYYY 列时,三组内所有「××年#n」占位列均剔除(含建设期/时点前仅余占位的情况)。""" + tn = str(table_name or "").strip() + if "附表3" not in tn or "项目投资财务现金流量" not in tn: + return list(col_order) + forecast_g = "后评价时点后预测值" + has_forecast_real = False + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + continue + g, tail = cs.split("|", 1) + if g.strip() != forecast_g: + continue + if _appendix_norm_year_tail(tail.strip()): + has_forecast_real = True + break + if not has_forecast_real: + return list(col_order) + out: list[str] = [] + for c in col_order: + cs = str(c or "").strip() + if "|" not in cs: + out.append(c) + continue + g, tail = cs.split("|", 1) + g, tail = g.strip(), tail.strip() + if g in _APPENDIX_TIME_SLOT_GROUPS and not _appendix_time_slot_group_tail_is_real(tail): + continue + out.append(c) + return out if out else list(col_order) + + +def _filter_appendix5_orphan_price_unit_column(table_name: str, col_order: 
list[str]) -> list[str]: + """附表5:去掉与「价格(元/t)」重复的独立列键「(元/t)」(多为表头拆行误入数据列)。""" + tn = str(table_name or "") + if "附表5" not in tn or "营业收入与营业税金" not in tn: + return col_order + if not any("价格" in str(c) and "元/t" in str(c) for c in col_order): + return col_order + orphans = {"(元/t)", "(元/t)"} + out = [c for c in col_order if str(c).strip() not in orphans] + return out if out else list(col_order) + + +# 表5-5:仅按表号匹配(不要求表名含「主要生产经营指标」,避免要素表标题变体导致过滤未生效) +_TABLE_55_TITLE_RX = re.compile(r"表\s*5\s*[--\..·]\s*5") +_TABLE_55_FORECAST_GROUP = "后评价时点后预测值" +_TABLE_55_FORECAST_HYPHEN_YEAR = re.compile( + rf"^{re.escape(_TABLE_55_FORECAST_GROUP)}\s*[--—–]\s*(\d{{4}})(?:年)?$" +) + + +def _compact_zh_ident(s: str) -> str: + return re.sub(r"\s+", "", unicodedata.normalize("NFKC", str(s or ""))) + + +def _split_group_year_col_key(col: str) -> tuple[str, str] | None: + """解析「组|子列」;支持半角/全角竖线。""" + st = str(col or "").strip() + if not st: + return None + for sep in ("|", "\uff5c"): # U+FF5C 全角竖线 + if sep in st: + a, b = st.split(sep, 1) + return a.strip(), b.strip() + return None + + +def _table55_has_forecast_year_slot_columns(col_order: list[str]) -> bool: + """是否存在「后评价时点后预测值」下的分年列(|、| 或 后缀 -YYYY)。""" + for c in col_order: + parts = _split_group_year_col_key(str(c or "")) + if parts: + g, tail = parts + if g != _TABLE_55_FORECAST_GROUP: + continue + ts = tail.strip() + if _appendix_norm_year_tail(ts): + return True + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(ts): + try: + if 1900 <= int(ts) <= 2100: + return True + except ValueError: + pass + continue + st = str(c or "").strip() + m = _TABLE_55_FORECAST_HYPHEN_YEAR.match(st) + if m: + try: + if 1900 <= int(m.group(1)) <= 2100: + return True + except ValueError: + pass + return False + + +def _table55_col_should_drop(col: str, *, has_forecast_year_slots: bool) -> bool: + c0 = _compact_zh_ident(col) + if "时点点后" in c0: + return True + if has_forecast_year_slots and c0 == _compact_zh_ident(_TABLE_55_FORECAST_GROUP): + return True + return 
False + + +def _filter_table55_redundant_malformed_forecast_column(table_name: str, col_order: list[str]) -> list[str]: + """表5-5:去掉笔误列「…时点点后…」及在有分年预测列时多余的裸「后评价时点后预测值」列。""" + if not col_order: + return col_order + if not _TABLE_55_TITLE_RX.search(str(table_name or "")): + return list(col_order) + has_slots = _table55_has_forecast_year_slot_columns(col_order) + out = [c for c in col_order if not _table55_col_should_drop(str(c), has_forecast_year_slots=has_slots)] + return out if out else list(col_order) + + +_APPENDIX_TIME_GROUP_YEAR_HYPHEN_RE = re.compile( + r"^(建设期|后评价时点前实际值|后评价时点后预测值)\s*[--—–]\s*(.+)$" +) + + +def _appendix_time_col_group_and_tail(col: str) -> tuple[str | None, str | None]: + """解析附表时间列键为 (组名, 子列);支持「组|年」「组-年」及裸组名列。""" + st = str(col or "").strip() + if not st: + return None, None + parts = _split_group_year_col_key(st) + if parts: + return parts[0], parts[1] + m = _APPENDIX_TIME_GROUP_YEAR_HYPHEN_RE.match(st) + if m: + return m.group(1).strip(), m.group(2).strip() + if st in _APPENDIX_TIME_SLOT_GROUPS or st == "价格(元/t)": + return st, "" + return None, None + + +def _appendix_time_tail_sort_key(tail: str | None) -> tuple[int, int, str]: + """组内子列排序:裸组名 < 分年列(年份升序) < 占位列 < 其它。""" + t = str(tail or "").strip() + if not t: + return (0, -1, "") + ny = _appendix_norm_year_tail(t) + if ny: + return (1, int(ny), "") + if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t): + try: + yi = int(t) + if 1900 <= yi <= 2100: + return (1, yi, "") + except ValueError: + pass + buf: list[str] = [] + for ch in t: + if ch in "xXxX": + buf.append("×") + elif ch == "\u00d7": + buf.append("×") + else: + buf.append(ch) + pm = re.fullmatch(r"×{2}年#(\d+)", "".join(buf)) + if pm: + return (2, int(pm.group(1)), "") + return (3, 0, t) + + +def _reorder_appendix_time_col_order(table_name: str, col_order: list[str]) -> list[str]: + """附表3~7:按细则组序排列列,组内年份从小到大。""" + if not col_order or not _appendix_time_table_name(table_name): + return list(col_order) + spec_groups = 
time_table_default_columns_for_name(table_name) or [] + group_rank: dict[str, int] = {g: i for i, g in enumerate(spec_groups)} + by_group: dict[str, list[str]] = {} + ungrouped: list[str] = [] + for col in col_order: + cs = str(col or "").strip() + if not cs: + continue + g, _ = _appendix_time_col_group_and_tail(cs) + if g == "价格(元/t)" or g in _APPENDIX_TIME_SLOT_GROUPS: + by_group.setdefault(g, []).append(cs) + if g not in group_rank: + group_rank[g] = len(group_rank) + 100 + else: + ungrouped.append(cs) + + def _sort_group_cols(cols: list[str]) -> list[str]: + return sorted( + cols, + key=lambda c: _appendix_time_tail_sort_key(_appendix_time_col_group_and_tail(c)[1]), + ) + + ordered_groups = list(spec_groups) + for g in sorted(by_group.keys(), key=lambda x: group_rank.get(x, 999)): + if g not in ordered_groups: + ordered_groups.append(g) + out: list[str] = [] + seen: set[str] = set() + for g in ordered_groups: + cols = by_group.get(g) + if not cols: + continue + for c in _sort_group_cols(cols): + if c not in seen: + out.append(c) + seen.add(c) + for c in ungrouped: + if c not in seen: + out.append(c) + seen.add(c) + return out if out else list(col_order) + + +def _build_structured_table_html( + table_name: str, + row_order: list[str], + col_order: list[str], + latest: dict[tuple[str, str], str], +) -> str: + row_order = _normalize_table_row_order(row_order, table_name=table_name) + row_header = _row_header_name_for_table(table_name) + grouped = _group_column_headers(col_order) + + lines: list[str] = ["", " "] + if grouped: + top_headers, sub_headers = grouped + lines.append(" ") + lines.append(' ') + lines.append(f' ') + idx = 0 + while idx < len(top_headers): + group = top_headers[idx] + if not group: + lines.append(f' ') + idx += 1 + continue + span = 1 + while idx + span < len(top_headers) and top_headers[idx + span] == group: + span += 1 + lines.append(f' ') + idx += span + lines.append(" ") + lines.append(" ") + for top, sub in zip(top_headers, 
sub_headers): + if top: + lines.append(f" ") + lines.append(" ") + else: + lines.append(" ") + lines.append(" ") + lines.append(f" ") + for col in col_order: + lines.append(f" ") + lines.append(" ") + lines.append(" ") + lines.append(" ") + max_rows = min(120, len(row_order)) if _is_table54_operating_benefit(table_name) else min(24, len(row_order)) + serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows) + for idx, rk in enumerate(row_order[:max_rows], start=1): + display_rk = _project_column_row_label( + table_name, rk, latest, serial_col=serial_col + ) + serial_cell = _serial_cell_for_report_table( + table_name, rk, idx, serial_col, serial_idx=idx - 1 + ) + lines.append(" ") + lines.append(f" ") + lines.append(f" ") + for ck in col_order: + val = latest.get((rk, ck), "待补充") or "待补充" + lines.append(f" ") + lines.append(" ") + lines.append(" ") + lines.append("
序号{row_header}{sub_headers[idx]}{group}
{sub}
序号{row_header}{col}
{serial_cell}{display_rk}{val}
") + return "\n".join(lines) + + +_RE_MD_HEADER_NAME_UNIT = re.compile(r"^(.+?)\s*([((][^))]+[))])$") + + +def _strip_md_bold_markup(text: str) -> str: + """去掉 Markdown 加粗标记 **,保留其余内容。""" + s = str(text or "") + while True: + new = re.sub(r"\*\*([^*]+?)\*\*", r"\1", s) + if new == s: + break + s = new + return s + + +def _markdown_table_header_cell_display(col_label: str, *, plain: bool = False) -> str: + """表头栏 Markdown:量纲写在名称下方,单位加括号(同一单元格内用
换行);不加粗。""" + del plain # 保留参数以兼容旧调用;表头一律不加 ** 包裹 + s = _strip_md_bold_markup(str(col_label or "").strip()).replace("|", "|") + if not s: + return "" + if re.search(r"", s, re.I): + parts = [ + _strip_md_bold_markup(p).strip() + for p in re.split(r"", s, flags=re.I) + ] + out_parts = [p for p in parts if p] + return "
".join(out_parts) if out_parts else s + m = _RE_MD_HEADER_NAME_UNIT.match(s) + if m: + name, unit = m.group(1).strip(), m.group(2).strip() + if name: + return f"{name}
{unit}" if unit else name + return s + + +def _common_trailing_parenthetical_unit_from_flat_labels( + flat_cols: list[str], +) -> tuple[str | None, list[str]]: + """当合并后的列表头列名末尾「(单位)」在各列一致时,返回该单位及去掉单位后的表头文案。""" + stripped: list[str] = [] + units: list[str | None] = [] + for lab in flat_cols: + s = str(lab or "").strip() + m = _RE_MD_HEADER_NAME_UNIT.match(s) + if m: + stripped.append(m.group(1).strip()) + units.append(m.group(2).strip()) + else: + stripped.append(s) + units.append(None) + present = [u for u in units if u] + if not present: + return None, list(flat_cols) + u0 = present[0] + if any(units[i] is not None and units[i] != u0 for i in range(len(units))): + return None, list(flat_cols) + return u0, stripped + + +# 表号与表名之间空两格:采用两个全角空格(与公文「空两格」习惯一致) +_TABLE_CAPTION_NUMBER_NAME_GAP = "\u3000\u3000" +_TABLE52_INVESTMENT_CHANGE_CAPTION = ( + f"表5-2{_TABLE_CAPTION_NUMBER_NAME_GAP}投资变动情况表(单位:万元、万美元)" +) +_RE_TABLE_CAPTION_LEADING_TOKEN = re.compile( + r"^(附表\s*\d+(?:\s*[.\--.]\s*\d+)*|表\s*\d+(?:\s*[.\--.]\s*\d+)*)\s*(.*)$", + re.DOTALL, +) + + +def _fix_521_table52_wrong_caption(content: str) -> str: + """5.2.1 若表题误用「表5-2 同类烷基化…」等,改回标准投资变动情况表表题。""" + text = str(content or "") + if not text.strip(): + return text + caption_re = re.compile( + r"^(\s*(?:#{1,6}\s+)?)(表\s*5\s*[--.]\s*2\s*(.*))$", + re.IGNORECASE, + ) + out: list[str] = [] + for line in text.split("\n"): + m = caption_re.match(line) + if m: + tail = (m.group(3) or "").strip() + if "投资变动情况表" not in tail: + out.append(f"{m.group(1)}{_TABLE52_INVESTMENT_CHANGE_CAPTION}") + continue + out.append(line) + return "\n".join(out) + + +def _normalize_table_caption_number_name_gap(title: str) -> str: + """将「表2-4xxx」「表 2 - 4 xxx」规范为「表2-4」+ 两全角空格 + 表名。""" + s = str(title or "").strip() + if not s: + return s + m = _RE_TABLE_CAPTION_LEADING_TOKEN.match(s) + if not m: + return s + token_compact = re.sub(r"\s+", "", (m.group(1) or "").strip()) + rest = (m.group(2) or "").strip() + if not rest: + return token_compact + 
return f"{token_compact}{_TABLE_CAPTION_NUMBER_NAME_GAP}{rest}" + + +def _rewrite_table_caption_line_for_number_name_gap(line: str) -> str: + """修正独立表题行(非表格管道行)中表号与表名间距。""" + if "|" in line or not line.strip(): + return line + if line.strip().startswith("```"): + return line + m = re.match(r"^(\s*)(.*)$", line) + if not m: + return line + indent, rest = m.group(1), m.group(2) + h = "" + m2 = re.match(r"^(#{1,6}\s+)(.*)$", rest) + if m2: + h, rest = m2.group(1), m2.group(2) + rest_st = rest.strip() + if not rest_st: + return line + if rest_st.startswith("**") and rest_st.endswith("**") and len(rest_st) >= 4: + inner = _strip_md_bold_markup(rest_st[2:-2]).strip() + n = _normalize_table_caption_number_name_gap(inner) + return f"{indent}{h}{n}" + n2 = _normalize_table_caption_number_name_gap(_strip_md_bold_markup(rest_st)) + if n2 != rest_st: + return f"{indent}{h}{n2}" + return line + + +def _debold_md_table_row(line: str) -> str: + if "|" not in line: + return line + return "|".join(_strip_md_bold_markup(part) for part in line.split("|")) + + +def _debold_markdown_table_blocks_in_content(content: str) -> str: + """去掉 Markdown 管道表表头行中的 ** 加粗(含双行表头)。""" + lines = str(content or "").split("\n") + if not lines: + return str(content or "") + out: list[str] = [] + i = 0 + while i < len(lines): + ln = lines[i] + if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln): + header_rows: list[str] = [] + j = i + while j < len(lines) and _is_pipe_markdown_table_row_line(lines[j]) and not _is_pipe_markdown_table_separator_line(lines[j]): + header_rows.append(lines[j]) + j += 1 + if j < len(lines) and _is_pipe_markdown_table_separator_line(lines[j]): + out.extend(_debold_md_table_row(hr) for hr in header_rows) + out.append(lines[j]) + j += 1 + while j < len(lines) and ( + _is_pipe_markdown_table_row_line(lines[j]) + or _is_pipe_markdown_table_separator_line(lines[j]) + ): + out.append(lines[j]) + j += 1 + i = j + continue + out.extend(header_rows) 
def _merge_table_title_with_common_unit(base_title: str, unit: str | None) -> str:
    """Append the columns' shared unit to a table title unless it is already there.

    The title is compared with the unit after removing all whitespace from
    both; when the title already ends with the unit (or no unit is given) the
    title is left as is. The result is always passed through
    ``_normalize_table_caption_number_name_gap``.
    """
    title = str(base_title or "").strip()
    unit_text = str(unit).strip() if unit else ""
    if unit_text:
        compact_title = re.sub(r"\s+", "", title)
        compact_unit = re.sub(r"\s+", "", unit_text)
        already_present = bool(compact_unit) and compact_title.endswith(compact_unit)
        if not already_present:
            title = f"{title} {unit_text}"
    return _normalize_table_caption_number_name_gap(title)
= left.strip(), right.strip() + else: + ind, elem = rk_s, "" + vals: list[str] = [] + for ck in data_cols: + raw = str(latest.get((rk_s, ck), "") or "").strip() + vals.append(esc(raw if raw else "待补充")) + lines.append("| " + " | ".join([esc(ind), esc(elem)] + vals) + " |") + return "\n".join(lines) + "\n" + + +def _build_structured_table_markdown(db: Session, table_id: str, table_name: str = "") -> tuple[str, str | None]: + cells = ( + db.query(ElementCell) + .filter( + ElementCell.table_id == table_id, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .order_by(ElementCell.updated_at.desc()) + .all() + ) + if not cells: + return "", None + latest: dict[tuple[str, str], str] = {} + row_order: list[str] = [] + col_order: list[str] = [] + for cell in cells: + row_key = str(cell.row_key or "").strip() + col_key = str(cell.col_key or "内容").strip() or "内容" + if not row_key: + continue + key = (row_key, col_key) + if key not in latest: + latest[key] = str(cell.value or "").strip() + if row_key not in row_order: + row_order.append(row_key) + if col_key not in col_order: + col_order.append(col_key) + if not row_order: + return "", None + row_order = _normalize_table_row_order(row_order, table_name=table_name) + spec = _multi_column_global_spec_for_table(table_name) + if spec: + spec_cols = [str(col).strip() for col in (spec[0] or []) if str(col).strip()] + ordered = [col for col in spec_cols if col in col_order] + extras = [col for col in col_order if col not in ordered] + col_order = ordered + extras + col_order = _filter_redundant_bare_year_columns(col_order) + col_order = _filter_appendix_placeholder_slot_columns(col_order) + col_order = _filter_appendix3_summary_duplicate_forecast_years(table_name, col_order) + col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(table_name, col_order) + col_order = _filter_appendix5_orphan_price_unit_column(table_name, col_order) + col_order = 
def _display_row_key(table_name: str, rk: str, latest: dict[tuple[str, str], str]) -> str:
    """Replace a template placeholder row name with a real name when one exists.

    Row keys that do not start with a prefix matched by
    ``_PLACEHOLDER_ROW_PREFIX`` are returned stripped. Otherwise the row's
    cells are probed in a preferred column order and the first non-empty value
    other than "待补充" is returned; failing that, the text after the prefix is
    returned with "产品" prepended (or the stripped key when nothing follows).
    """
    text = str(rk or "").strip()
    prefix = _PLACEHOLDER_ROW_PREFIX.match(text)
    if prefix is None:
        return text

    name = str(table_name or "")
    # Product-flow tables probe their own column order before the generic one.
    leading = ("项目名称", "产品名称", "规格") if ("表2-4" in name or "产品流向" in name) else ()
    candidates = leading + ("项目名称", "产品名称", "名称", "规格")

    for column in candidates:
        value = str(latest.get((rk, column), "") or "").strip()
        if value and value != "待补充":
            return value

    remainder = text[prefix.end():].strip()
    if not remainder:
        return text
    return f"产品{remainder}"
"工程费用变动·工程费用合计": "工程费用合计", + "工程费用变动·工艺生产装置": "1 工艺生产装置", + "工程费用变动·装置·设备购置费": "1.1.1 设备购置费", + "工程费用变动·装置·安装工程费": "1.1.2 安装工程费", + "工程费用变动·装置·建筑工程费": "1.1.3 建筑工程费", + "工程费用变动·总图运输": "2 总图运输", + "工程费用变动·储运工程": "3 储运工程", + "工程费用变动·其它分项(可增删)": "其它分项(可增删)", +} + + +def _strip_table_prefix_from_row_key(rk: str) -> str: + s = str(rk or "").strip() + if "\u00b7" in s: + return "\u00b7".join(s.split("\u00b7")[1:]).strip() + return s + + +def _parse_row_key_seq_and_name(rk: str, *, table_name: str = "") -> tuple[str, str]: + """与 quick-fill.js ``parseRowKeyForDisplay`` 一致。""" + s = str(rk or "").strip() + if not s: + return "", "" + legacy = _legacy_map_for_table(table_name) + if legacy: + s = legacy.get(s, s) + if _table53_engineering_cost_change_name(table_name): + s = _TABLE_53_LEGACY_ROW_DISPLAY.get(s, s) + if _appendix_time_table_name(table_name) or _appendix8_param_name(table_name): + s = _strip_table_prefix_from_row_key(s) + elif "\u00b7" in s: + s = "\u00b7".join(s.split("\u00b7")[1:]).strip() + m = _ROWKEY_OUTLINE_PREFIX.match(s) + if m: + rest = s[m.end():].strip() + return m.group(1), rest if rest else s + m_cn = _CN_OUTLINE_ROWKEY_PREFIX.match(s) + if m_cn: + return m_cn.group(1), m_cn.group(2).strip() + return "", s + + +def _row_display_name_for_table(table_name: str, rk: str) -> str: + """项目/工程名称列展示文案(去表内前缀与层次编号,附表5/7 明细行单独处理)。""" + s0 = str(rk or "").strip() + if not s0: + return "" + if _appendix5_revenue_tax_name(table_name): + s = _strip_table_prefix_from_row_key(s0) + m = _APPENDIX5_PRODUCT_TRIPLE.match(s) + if m: + return m.group(3) + if _appendix7_materials_name(table_name): + s = _strip_table_prefix_from_row_key(s0) + m = _APPENDIX7_DETAIL_ROW.match(s) + if m and m.group(3): + return m.group(3) + if m: + return m.group(2).strip() + if _table_row_seq_name_split_display(table_name): + _, name = _parse_row_key_seq_and_name(s0, table_name=table_name) + return name or s0 + return s0 + + +def _serial_cell_for_report_table( + table_name: str, + rk: str, + 
idx: int, + serial_col: list[str] | None, + *, + serial_idx: int, +) -> str: + """表5-2/5-3、附表2~8 用连续 1..n;表5-1 用层次编号;其余表沿用原逻辑。""" + if _table_row_seq_name_split_display(table_name) and not _table51_main_economic_indicators_name( + table_name + ): + return str(idx) + if _table51_main_economic_indicators_name(table_name): + seq, _ = _parse_row_key_seq_and_name(rk, table_name=table_name) + return seq if seq else str(idx) + if serial_col is not None: + return serial_col[serial_idx] + return str(idx) + + +def _project_column_row_label( + table_name: str, + rk: str, + latest: dict[tuple[str, str], str], + *, + serial_col: list[str] | None, +) -> str: + if _table_row_seq_name_split_display(table_name): + label = _row_display_name_for_table(table_name, rk) + elif serial_col is not None: + label = _strip_row_key_leading_outline_for_display(rk) + if not str(label or "").strip(): + label = rk + else: + label = rk + return _element_manage_table_row_display_label( + table_name, _display_row_key(table_name, label, latest) + ) + + +def _outline_serial_from_row_key(rk: str) -> str | None: + """若 row_key 以阿拉伯数字层次编号开头,返回该编号字符串。""" + rk_s = str(rk or "").strip() + if not rk_s: + return None + compact = re.sub(r"\s+", "", rk_s) + if compact in ("合计", "总计"): + return None + m = _ROWKEY_OUTLINE_PREFIX.match(rk_s) + if not m: + return None + num = m.group(1) + if re.fullmatch(r"\d{4}", num): + try: + yi = int(num) + except ValueError: + return None + if 1900 <= yi <= 2100: + return None + return num + + +def _table_row_outline_serial_column(row_order: list[str], *, max_rows: int) -> list[str] | None: + """当每一数据行(合计/总计除外)的 row_key 均带层次编号时,序号列采用该编号。""" + rows = row_order[:max_rows] + if not rows: + return None + serials: list[str] = [] + for rk in rows: + compact = re.sub(r"\s+", "", str(rk or "")) + if compact in ("合计", "总计") or str(rk or "").strip() in ("合计", "总计"): + serials.append("—") + continue + s = _outline_serial_from_row_key(str(rk) or "") + if s is None: + return None + 
def _strip_row_key_leading_outline_for_display(rk: str) -> str:
    """Remove the leading outline number from a row key for display.

    The stripped key is returned unchanged when ``_ROWKEY_OUTLINE_PREFIX`` does
    not match or when nothing would remain after removing the prefix.
    """
    key = str(rk or "").strip()
    prefix = _ROWKEY_OUTLINE_PREFIX.match(key)
    if prefix is None:
        return key
    return key[prefix.end():].strip() or key
table54 else min(24, len(row_order)) + serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows) + for idx, rk in enumerate(row_order[:max_rows], start=1): + vals = [_esc_pipe(latest.get((rk, ck), "待补充") or "待补充") for ck in col_order] + display_rk = _project_column_row_label( + table_name, rk, latest, serial_col=serial_col + ) + serial_cell = _serial_cell_for_report_table( + table_name, rk, idx, serial_col, serial_idx=idx - 1 + ) + lines.append("| " + serial_cell + " | " + _esc_pipe(display_rk) + " | " + " | ".join(vals) + " |") + return "\n".join(lines) + "\n", common_unit + + +def _build_time_table_markdowns_by_year( + db: Session, table_id: str, table_name: str = "", +) -> list[tuple[str, str]]: + """为时间要素表按 year 拆分,返回 [(display_table_name, markdown), ...] 列表。 + + 时间表的 ElementCell 通过 year 字段区分不同年份的数据;前端用 col_key + "|" + year + 渲染多级表头。本函数按年份分别聚合 cell,为每个年份生成独立的 Markdown 表格, + 表名中的「××年」替换为实际年份。 + """ + cells = ( + db.query(ElementCell) + .filter( + ElementCell.table_id == table_id, + ElementCell.value.isnot(None), + ElementCell.value != "", + ) + .order_by(ElementCell.updated_at.desc()) + .all() + ) + if not cells: + return [] + + from collections import defaultdict + year_cells: dict[int | None, list[ElementCell]] = defaultdict(list) + for cell in cells: + year_cells[cell.year].append(cell) + + ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first() + tbl_y = ( + int(ty_row[0]) + if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 + else None + ) + base_name = str(table_name or "").strip() + if _is_table54_operating_benefit(base_name): + year_cells, real_years = _table54_merge_year_cells_for_table_year( + year_cells, table_year=tbl_y + ) + else: + real_years = sorted(y for y in year_cells if y is not None) + if not real_years: + md, common_unit = _build_structured_table_markdown(db, table_id, table_name) + if not md: + return [] + disp = _merge_table_title_with_common_unit(str(table_name or "").strip(), 
common_unit) + return [(disp, md)] + + results: list[tuple[str, str]] = [] + for year in real_years: + year_cell_list = year_cells[year] + latest: dict[tuple[str, str], str] = {} + row_order: list[str] = [] + col_order: list[str] = [] + for cell in year_cell_list: + row_key = str(cell.row_key or "").strip() + col_key = str(cell.col_key or "内容").strip() or "内容" + if not row_key: + continue + key = (row_key, col_key) + if key not in latest: + latest[key] = str(cell.value or "").strip() + if row_key not in row_order: + row_order.append(row_key) + if col_key not in col_order: + col_order.append(col_key) + if not row_order: + continue + if _is_table54_operating_benefit(base_name): + _table54_coalesce_legacy_bare_metric_cols(latest, row_order) + row_order = _normalize_table_row_order(row_order, table_name=base_name) + time_spec_cols = time_table_default_columns_for_name(base_name) + if time_spec_cols and _is_table54_operating_benefit(base_name): + col_order = ["单位"] + [c for c in time_spec_cols if c != "单位"] + elif time_spec_cols: + ordered = [col for col in time_spec_cols if col in col_order] + extras = [col for col in col_order if col not in ordered] + col_order = ordered + extras + col_order = _filter_redundant_bare_year_columns(col_order) + col_order = _filter_appendix_placeholder_slot_columns(col_order) + col_order = _filter_appendix3_summary_duplicate_forecast_years(base_name, col_order) + col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(base_name, col_order) + col_order = _filter_appendix5_orphan_price_unit_column(base_name, col_order) + col_order = _filter_table55_redundant_malformed_forecast_column(base_name, col_order) + col_order = _reorder_appendix_time_col_order(base_name, col_order) + display_name = re.sub(r"××年", f"{year}年", base_name) + md, common_unit = _render_markdown_table( + display_name, row_order, col_order, latest, time_column_year=year + ) + if md: + results.append((_merge_table_title_with_common_unit(display_name, 
common_unit), md)) + + if not results and None in year_cells: + md, common_unit = _build_structured_table_markdown(db, table_id, table_name) + if md: + results.append((_merge_table_title_with_common_unit(str(table_name or "").strip(), common_unit), md)) + return results + + +def _extract_table_short_token(table_name: str) -> str: + text = str(table_name or "") + m = re.search(r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*|表\s*\d+(?:\s*[.\--]\s*\d+)*)", text) + return re.sub(r"\s+", "", m.group(1)) if m else "" + + +def _norm_table_token(token: str) -> str: + text = re.sub(r"\s+", "", str(token or "")).lower() + return text.replace("-", "-").replace("—", "-").replace("–", "-") + + +def _table_token_matches_name(token: str, name: str, *, normalized: bool = False) -> bool: + """ + 表号精确匹配,避免“表1”误命中“表10”。 + - normalized=True: token/name 已经是 _norm_table_token 结果。 + """ + t = token if normalized else _norm_table_token(token) + n = name if normalized else _norm_table_token(name) + if not t or not n: + return False + if t == n: + return True + # 兼容历史项目:4.3.3 的“烷基化装置运行分析”可能仍存为表4-1,仍应视为表4-2 的同义候选。 + if t == _norm_table_token("表4-2"): + raw_name = str(name or "") + name_plain = re.sub(r"\s+", "", raw_name) + if ("烷基化装置运行分析" in name_plain) and ("考核时间" in name_plain): + if ("表4-1" in name_plain) or ("表4-2" in name_plain) or ("表41" in _norm_table_token(name_plain)): + return True + # 后面不能紧跟 1-2 位数字后即结束或遇到非数字(避免 表1→表10、表2-4→表2-40), + # 但允许紧跟 4 位年份(如 表2-42019年…)或非数字字符(如 表2-4××年…)。 + pattern = re.compile(rf"{re.escape(t)}(?!\d{{1,2}}(?!\d))") + return bool(pattern.search(n)) + + +def _table_token_caption_line_re(token: str) -> re.Pattern[str]: + token_plain = re.sub(r"\s+", "", str(token or "")) + token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") + return re.compile( + r"(?:^|\n)([^\n]*?" 
def _segment_after_table_caption(content: str, token: str) -> str:
    """Return the text between this table's caption line and the next table caption.

    The caption is located with ``_table_token_caption_line_re(token)``; when
    it is not found the result is "". The segment ends where the next line
    containing a "表 <number> <CJK text>" caption begins, or at the end of the
    content when there is none.
    """
    body = str(content or "")
    caption = _table_token_caption_line_re(token).search(body)
    if caption is None:
        return ""
    following = body[caption.end() :]
    following_caption = re.search(
        r"\n[^\n]*?表\s*\d+(?:\s*[--.]\s*\d+)*\s+[\u4e00-\u9fff]",
        following,
        flags=re.IGNORECASE,
    )
    if following_caption is None:
        return following
    return following[: following_caption.start()]
表体必须紧跟在本表表题与下一张表题之间,不得借用后续表的 Markdown 块(如 3.3.4 仅表3-4 题、表3-5 有体)。 + return _segment_has_markdown_table_body(_segment_after_table_caption(text, token)) + + +def _basic_warnings(section_title: str, content: str) -> list[str]: + warnings: list[str] = [] + if len(content.strip()) < 80: + warnings.append("章节内容过短,建议补充证据后重试") + title_norm = re.sub(r"\s+", "", str(section_title or "")) + if "1.2项目决策要点" in title_norm: + if "1.2.1项目背景" not in content or "1.2.2预期目标" not in content: + warnings.append("1.2 未按固定结构输出(缺少“1.2.1项目背景/1.2.2预期目标”小节)") + if "2.1.1资源与原料评价" in title_norm: + if "原料数量及组成对比表" not in content: + warnings.append("2.1.1 缺少模版规定的「原料数量及组成对比表」标题") + if "原料性质对比表(醚后碳四)" not in content and "原料性质对比表" not in content: + warnings.append("2.1.1 缺少模版规定的「原料性质对比表(醚后碳四)」标题") + if "原料选择加氢工艺技术对比" in content or ( + "表2.6-1" in content + and "原料数量及组成对比" not in content + and "原料选择加氢" in content + ): + warnings.append("2.1.1 不应出现安评类「表2.6-1 原料选择加氢工艺技术对比」等内容,本节仅允许模版主表") + if "附录:原料预处理工艺方案比选" in content or "(非模版主表)" in content: + warnings.append("2.1.1 不应出现附录或“非模版主表”字样,请仅保留模版两张主表") + if "表" in section_title and "|" not in content: + warnings.append("章节标题疑似要求表格,但输出未包含 Markdown 表格") + if "待补充" in content and len(content.strip()) < 140: + warnings.append("缺失信息较多,建议补充材料后重跑") + return warnings + + +def _check_consistency(report: str, project_name: str) -> list[str]: + out: list[str] = [] + if project_name and project_name not in report: + out.append("正文未显式出现项目名称,请检查第一章基本信息。") + amounts = re.findall(r"(\d+(?:\.\d+)?)\s*(亿元|万元|万)", report) + if amounts: + normalized = [f"{v}-{u}" for v, u in amounts] + if len(normalized) >= 4 and len(set(normalized[:10])) >= 6: + out.append("金额口径较分散,建议统一投资/决算/效益统计口径。") + unit_lines = re.findall(r"(?:单位|计量单位)\s*[::]\s*([^\n]{1,40})", report) + if unit_lines and len(set(unit_lines)) > 1: + out.append("检测到多个计量单位定义,建议统一单位说明(如万元、吨/年)。") + years = [int(y) for y in re.findall(r"(20\d{2})年", report)] + if years: + min_y, max_y = min(years), max(years) + if max_y - 
def _append_report_appendices(db: Session, project_uuid: str, report_text: str) -> str:
    """Append the appendix sections to the final report text.

    The result is the stripped report body, then the figures appendix, then the
    tables appendix, joined by blank lines; empty or whitespace-only pieces are
    left out.

    Args:
        db: Database session handed to both appendix builders.
        project_uuid: Project whose appendices are built.
        report_text: Report body generated so far (may be empty or ``None``).

    Returns:
        The combined Markdown text, stripped of surrounding whitespace.
    """
    body = (report_text or "").strip()

    # Tables are built before figures (same call order as before); only the
    # output order places figures first.
    tables_md = _build_appendix_tables_markdown(db, project_uuid)
    figures_md = _build_appendix_figures_markdown(db, project_uuid)

    kept = [
        piece
        for piece in (body, figures_md, tables_md)
        if piece and str(piece).strip()
    ]
    return "\n\n".join(kept).strip()
def _build_appendix_figures_markdown(db: Session, project_uuid: str) -> str:
    """Render the figures appendix as Markdown.

    One slot exists per entry of ``APPENDIX_FIGURE_TARGETS`` (slots are numbered
    from 1).  Images are resolved from the project's knowledge-base documents; a
    slot without a rendered image is omitted, with no placeholder text.

    Returns:
        ``"## 附图"`` followed by the rendered images, or ``""`` when no slot
        produced an image.
    """
    slot_titles = APPENDIX_FIGURE_TARGETS
    blobs = _resolve_appendix_figure_blobs_from_kb(db, project_uuid)
    rendered = appendix_figure_markdown_images(blobs, label_title=list(slot_titles))

    per_slot = (rendered.get(slot) for slot in range(1, len(slot_titles) + 1))
    images = [str(item).strip() for item in per_slot if item and str(item).strip()]
    return "## 附图\n\n" + "\n\n".join(images) if images else ""
job.current_section_key = chapter.section_key + job.updated_at = now + db.commit() + + +def _resolve_template(db: Session, template_id: Optional[str]) -> ReportTemplate: + template = None + if template_id: + template = db.query(ReportTemplate).filter(ReportTemplate.id == template_id).first() + if not template: + template = ( + db.query(ReportTemplate) + .filter(ReportTemplate.is_default == True, ReportTemplate.is_active == True) # noqa: E712 + .first() + ) + if not template: + raise HTTPException(status_code=404, detail="未找到可用模板") + return template + + +def _list_template_sections(db: Session, template_id: str) -> list[ReportTemplateSection]: + return ( + db.query(ReportTemplateSection) + .filter(ReportTemplateSection.template_id == template_id) + .order_by(ReportTemplateSection.section_order.asc()) + .all() + ) + + +def _sections_for_generation(sections: list[ReportTemplateSection]) -> list[ReportTemplateSection]: + items = list(sections or []) + if not items: + return [] + + section_nos = { + _extract_section_number(section.section_title or "") + for section in items + if _extract_section_number(section.section_title or "") + } + + filtered: list[ReportTemplateSection] = [] + for section in items: + section_no = _extract_section_number(section.section_title or "") + # 无法解析编号时保持兼容,继续参与生成。 + if not section_no: + filtered.append(section) + continue + # 仅生成叶子节:若存在任一后续子节(前缀匹配 x.y.z ...),则当前节跳过。 + has_children = any(no.startswith(f"{section_no}.") for no in section_nos) + if not has_children: + filtered.append(section) + return filtered + + +def _extract_tokens(text: str) -> list[str]: + src = str(text or "") + zh = re.findall(r"[\u4e00-\u9fa5]{2,8}", src) + en = re.findall(r"[A-Za-z]{3,20}", src.lower()) + raw = zh + en + out: list[str] = [] + seen = set() + for t in raw: + if t in seen: + continue + seen.add(t) + out.append(t) + return out + + +def _fmt_dt(dt: Optional[datetime]) -> Optional[str]: + if not dt: + return None + return dt.strftime("%Y-%m-%d %H:%M:%S") + 
def _select_chapter_example(section_title: str, raw_examples: Optional[str], evidence: dict) -> str:
    """Pick the stored example that best fits a section, or ``""`` for none.

    Some sections never receive an example because their output is fixed by the
    template or emitted from element data.  For the rest, every parsed example
    is scored and the best one is returned; on a tie the earliest example wins.

    Scoring: +4 when a table is expected and the example contains a table,
    +3 for each topic rule whose keywords occur in both the example and the
    title, and up to +2 for length (one point per 300 characters).

    Args:
        section_title: Title of the section being generated.
        raw_examples: Raw example text stored with the template section.
        evidence: Evidence package; only inspected for table signals.

    Returns:
        The chosen example text, or ``""`` when examples are disabled or absent.
    """
    # The section number is parsed once and tested against a set, replacing six
    # identical guards that each re-parsed the title.
    example_free_sections = {
        "1.2",    # narrative with evidence anchors; examples from another project skew the emphasis
        "2.1.1",  # must match the template's two feedstock tables exactly
        "5.1",    # table 5-1 comes from element data; the chapter-5 sample invites a duplicate table
        "5.2.1",  # tables 5-2/5-3 come from element data; the shared sample carries a misleading caption
        "5.3.1",  # only the element-driven table 5-4 is allowed
        "5.4",    # only table 5-7 is allowed
    }
    if _extract_section_number(section_title or "") in example_free_sections:
        return ""

    samples = _parse_examples(raw_examples)
    if not samples:
        return ""

    title = str(section_title or "")
    wants_table = ("表" in title) or _evidence_has_table_signal(evidence)

    # (keywords looked up in the example, keywords looked up in the title)
    topic_rules = (
        (("万元", "亿元", "投资", "收益"), ("投资", "财务")),
        (("环保", "安全", "排放"), ("影响", "持续")),
        (("结论", "建议", "经验"), ("结论", "综合")),
    )

    def score(sample: str) -> int:
        lowered = sample.lower()
        points = 0
        if wants_table and ("|" in sample or "表" in sample):
            points += 4
        for sample_keys, title_keys in topic_rules:
            if any(k in lowered for k in sample_keys) and any(k in title for k in title_keys):
                points += 3
        return points + min(len(sample) // 300, 2)

    # max() returns the first maximal item, matching the former stable
    # descending sort followed by taking element 0.
    return max(samples, key=score)
支持分隔符:---EXAMPLE--- 或 \n\n====\n\n + for sep in ("\n---EXAMPLE---\n", "\n====\n"): + if sep in text: + return [x.strip() for x in text.split(sep) if x.strip()][:3] + # 兼容“示例1/示例2/示例3”文本段 + blocks = re.split(r"\n\s*示例\s*[1-3][::]\s*", "\n" + text) + blocks = [b.strip() for b in blocks if b.strip()] + if len(blocks) >= 2: + return blocks[:3] + return [text] + + +def _evidence_has_table_signal(evidence: dict) -> bool: + docs = evidence.get("chapterDocs") if isinstance(evidence, dict) else [] + if not isinstance(docs, list): + return False + for d in docs[:8]: + if not isinstance(d, dict): + continue + content = str(d.get("content") or "") + if "|" in content or "表" in content[:200]: + return True + return False + + +def _has_conflict_terms(report: str) -> bool: + pairs = [ + ("增加", "下降"), + ("达标", "未达标"), + ("盈利", "亏损"), + ("改善", "恶化"), + ] + for a, b in pairs: + if a in report and b in report: + return True + return False + + +def _resolve_project(db: Session, project_id: str) -> Optional[Project]: + if not project_id: + return None + p = db.query(Project).filter(Project.uuid == project_id).first() + if p: + return p + try: + pid = int(project_id) + except Exception: + return None + return db.query(Project).filter(Project.id == pid).first() diff --git a/services/report_prompt_service.py b/services/report_prompt_service.py new file mode 100644 index 0000000..5c1f445 --- /dev/null +++ b/services/report_prompt_service.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from services.prompt_template_service import render_prompt +from prompts.report_generation.prompt_defaults import ( + DEFAULT_SECTION_PROMPT_FALLBACK, + DEFAULT_SELECTED_EXAMPLE_FALLBACK, +) + + +def chapter_generation_system_prompt() -> str: + return render_prompt("report_generation/chapter_generation_system.md") + + +def repair_missing_tables_system_prompt() -> str: + return render_prompt("report_generation/repair_missing_tables_system.md") + + +def table_format_repair_system_prompt() -> str: + 
return render_prompt("report_generation/table_format_repair_system.md") + + +def _build_prior_sibling_sections_prompt_block(prior_sibling_sections_text: str) -> str: + body = str(prior_sibling_sections_text or "").strip() + if not body: + return "" + return ( + "【同章前序小节正文(时间与金额须保持一致)】\n" + f"{body}\n\n" + "【同章一致性约束】\n" + "1. 竣工时间、开工/中交/投产/验收等关键里程碑日期,以及建设投资、总投资、营业收入、利润等各类金额数字," + "须与本章前序小节已写明的口径完全一致(年月日表述可适度简化,但不得出现另一套矛盾日期或金额);\n" + "2. 若【证据包】或【字段级已抽取结果】中某日期/金额与前序小节矛盾,以前序小节为准写入本节," + "不得在正文中另写一套矛盾数值;\n" + "3. 前序小节为「待补充」的字段,本节仍写「待补充」,不得自行编造;\n" + "4. 可补充本节新增信息,但不得改写或否定前序小节已确立的时间与金额。" + ) + + +def _build_prior_chapters_prompt_block(prior_chapters_text: str) -> str: + body = str(prior_chapters_text or "").strip() + if not body: + return "" + return ( + "【前序章节正文(第1~6章,本章须据此总结)】\n" + f"{body}\n\n" + "【前序章节使用约束】\n" + "1. 第7章各节是对第1~6章已生成正文的归纳、提炼与升华,不得与前面章节结论矛盾;\n" + "2. 可概括前文要点,禁止大段照搬;数据与结论须与前文一致;\n" + "3. 若前序章节某处为「待补充」,本节对应表述也应为「待补充」,不得编造;\n" + "4. 须由要素管理直出的表格(如表7-1)仍按【章节输出结构约束】执行,不受本条限制。" + ) + + +def _build_section_reference_block(section_reference: str) -> str: + body = str(section_reference or "").strip() + if not body: + return "" + return ( + "【本章参考范文(本节写作蓝本:结构与行文风格须高度贴合;禁止复用数据、禁止照抄)】\n" + f"{body}\n\n" + "【参考范文使用约束】\n" + "1. 以范文为写作蓝本:段落数量与顺序、每段主题、论述逻辑、句式笔法与篇幅颗粒度均须与范文高度一致,做到逐段对应、同一笔法;\n" + "2. 严禁复用范文中的项目名称、时间、金额、指标值等任何事实数据,须全部替换为当前项目证据包的真实值;\n" + "3. 范文中的表格结构(表头、列顺序、行项)须沿用,但表内数据必须替换为当前项目证据包的值;\n" + "4. 禁止逐字照抄:不得出现与范文连续相同超过 15 字的文字,须改写措辞做到“形似而文不同”;\n" + "5. 
def build_repair_missing_tables_prompt(
    *,
    section_title: str,
    original_prompt: str,
    content: str,
    missing_tables: list[str],
    evidence_json: str,
) -> str:
    """Render the user prompt that asks the model to add missing tables.

    The missing table names are joined with ``", "``.  The original prompt is
    cut to its first 8000 characters and the evidence JSON to its first 12000
    before they are passed to the template.
    """
    missing_joined = ", ".join(missing_tables)
    prompt_head = original_prompt[:8000]
    evidence_head = evidence_json[:12000]
    return render_prompt(
        "report_generation/repair_missing_tables_user.md",
        section_title=section_title,
        missing_tables=missing_joined,
        content=content,
        original_prompt=prompt_head,
        evidence_json=evidence_head,
    )
def init_job_state(
    *,
    job_id: str,
    project_id: str,
    template_id: Optional[str],
    chapters: list[dict[str, Any]],
) -> None:
    """Create (or overwrite) the in-memory runtime state of a generation job.

    The state starts as ``pending`` with zero progress and one chapter payload
    per entry of *chapters*, keyed by the stringified ``sectionKey``.  The state
    is registered only after it has been fully built, all under the runtime lock.
    """
    with _RUNTIME_LOCK:
        state: dict[str, Any] = {
            "jobId": job_id,
            "projectId": project_id,
            "templateId": template_id,
            "status": "pending",
            "progress": 0,
            "currentSectionKey": None,
            "errorMessage": None,
            "createdAt": _now_str(),
            "updatedAt": _now_str(),
            "completedAt": None,
            "chapters": {},
        }
        chapter_map = state["chapters"]
        for item in chapters or []:
            chapter_map[str(item["sectionKey"])] = _chapter_payload(
                section_key=str(item["sectionKey"]),
                section_title=str(item["sectionTitle"]),
                section_order=int(item["sectionOrder"]),
                status=str(item.get("status") or "pending"),
            )
        _JOB_STATES[job_id] = state
def append_chapter_content(
    job_id: str,
    section_key: str,
    delta_text: str,
    *,
    stream_phase: str,
) -> None:
    """Append a streamed text fragment to a chapter of a running job.

    Does nothing when *delta_text* is empty or when the job or chapter is
    unknown.  Otherwise the fragment is appended to the chapter content, the
    chapter's ``validationPayload["streamPhase"]`` is set, the job's current
    section is pointed at this chapter, and both timestamps are refreshed.
    """
    if not delta_text:
        return
    with _RUNTIME_LOCK:
        job_state = _JOB_STATES.get(job_id)
        if not job_state:
            return
        chapter_state = job_state.get("chapters", {}).get(section_key)
        if not chapter_state:
            return

        previous_text = str(chapter_state.get("content") or "")
        # Work on a copy so a payload object shared elsewhere is not mutated.
        payload = dict(chapter_state.get("validationPayload") or {})
        payload["streamPhase"] = stream_phase

        chapter_state["content"] = previous_text + delta_text
        chapter_state["validationPayload"] = payload
        chapter_state["updatedAt"] = _now_str()
        job_state["currentSectionKey"] = section_key
        job_state["updatedAt"] = _now_str()
class RetrievalService:
    """Retrieve post-evaluation report material from the vector store."""

    # Keywords searched, in order, for each category in ``search_by_category``.
    # Kept at class level so the tables are not rebuilt on every call.
    _CATEGORY_KEYWORDS: Dict[str, List[str]] = {
        "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"],
        "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"],
        "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"],
        "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"],
        "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"],
        "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"],
    }

    # Keywords searched, in order, for each chapter in ``get_chapter_materials``.
    _CHAPTER_KEYWORDS: Dict[str, List[str]] = {
        "项目概况": ["项目背景", "建设内容", "项目规模", "建设地点", "建设单位", "项目决策", "立项依据"],
        "技术方案": ["技术方案", "工艺技术", "设备选型", "工程设计", "施工安装", "调试运行", "专利技术"],
        "项目全过程总结与管理评价": [
            # ---- highest priority: tables 1-14 and numbered sub-sections ----
            "2.1", "2.1.1", "2.1.1.3", "2.1.6", "2.2", "2.2.1", "2.2.10", "2.3", "2.3.1", "2.3.6",
            "表1原料数量及组成对比表", "表2原料性质对比表",
            "表3前期预测和2019年实际产品对比表",
            "表4装置规模及实际运行负荷对比表",
            "表5项目规模对比表",
            "表6可研报告与基础设计阶段工程内容对比表",
            "表7项目承包商的招投标情况表",
            "表8项目设计主要进度控制情况表",
            "表9施工图设计变更情况表",
            "表10重大设计变更情况表",
            "表11主要设备采购情况表",
            "表12施工重要节点进度表",
            "表13原料性质对比表",
            "表14主要标定结果与设计指标对比表",

            # ---- lower priority: structural keywords ----
            "可行性研究", "可研编制", "可研报告", "评估会", "可研批复", "资源与原料评价",
            "基础设计", "设计审查", "审查意见", "设计变更", "施工图设计", "招投标", "施工准备",
            "工程监理", "HSE", "竣工验收",
            "投产管理", "生产准备", "联合试运", "试生产", "生产运行评价", "原料供应评价", "标定结果",
            "原料数量及组成对比", "装置规模", "负荷率",
        ],
        "财务评价": ["投资估算", "资金筹措", "财务分析", "现金流量", "利润计算", "成本分析", "经济效益"],
        "效益分析": ["经济效益", "社会效益", "环境效益", "环境影响", "资源利用", "节能降耗"],
        "项目目标和可持续性评价": [
            # highest priority: chapter titles and numbers
            "5", "5.1", "5.1.1", "5.1.2", "5.1.3", "5.2", "5.3", "5.3.1", "5.3.2", "5.3.3", "5.3.4", "5.3.5",
            "项目目标实现程度评价", "项目绩效对标分析", "项目持续性评价",

            # goal achievement (engineering / technical / economic)
            "工程规模", "项目进度", "工程质量", "项目功能", "投资控制",
            "加工量", "负荷", "产品产量", "产品质量", "技术指标", "标定", "设计值", "考核",
            "主要经济指标", "IRR", "内部收益率", "净现值", "NPV", "投资回收期", "营业收入", "成本费用", "税后利润",

            # benchmarking
            "对标", "横向对比", "同类装置", "单位投资", "单位能耗", "蒸汽能耗", "综合能耗", "辛烷值", "收率", "烯烃",

            # sustainability (resources / products / internal factors / policy)
            "资源分析", "原料供应", "资源保障",
            "产品分析", "市场需求", "国Ⅵ", "国ⅥA", "国ⅥB",
            "项目内部因素", "装置规模合理性", "工艺方案", "技术水平",
            "国家政策", "产业政策", "质量标准",

            # safety / environmental compliance used to support sustainability
            "个人风险", "社会风险", "可接受", "风险曲线",
            "非甲烷总烃", "无组织排放", "mg/m3", "标准值",
        ],
        "风险分析": ["风险分析", "风险识别", "风险评价", "风险对策", "不确定性分析"],
        "后评价结论": ["后评价结论", "经验教训", "建议措施", "综合评价"],
    }

    def __init__(self, collection_name: str = "eval_report"):
        """
        Create the service on top of an existing collection.

        Args:
            collection_name: Name of the vector-store collection.
        """
        self.collection_name = collection_name
        self.vector_store = VectorStore(collection_name=collection_name, drop_old=False)

    def search_by_query(self, query: str, top_k: int = 10, filter_project: Optional[str] = None) -> List[Document]:
        """
        Retrieve documents similar to a query.

        Args:
            query: Query text, e.g. "项目背景", "财务评价", "技术方案".
            top_k: Number of hits requested from the vector store.
            filter_project: Optional project UUID; when given, only documents
                whose ``project_uuid`` metadata equals it are returned.

        Returns:
            Matching documents in the order returned by the vector store.
        """
        # The project UUID is appended to the query text as a retrieval hint.
        full_query = f"{query} 项目 UUID:{filter_project}" if filter_project else query
        results = self.vector_store.similarity_search_with_score(full_query, k=top_k)

        # NOTE(review): the project filter runs after the top_k cut, so fewer
        # than top_k documents may come back; scores are dropped here.
        return [
            doc
            for doc, _score in results
            if not filter_project or doc.metadata.get("project_uuid") == filter_project
        ]

    @staticmethod
    def _doc_to_dict(doc: Document) -> Dict[str, Any]:
        """Flatten one document into the dictionary shape returned to callers."""
        meta = doc.metadata
        return {
            "content": doc.page_content,
            "heading": meta.get("heading", ""),
            "heading_level": meta.get("heading_level", 0),
            "doc_id": meta.get("doc_id", ""),
            "path": meta.get("path", ""),
            # NOTE(review): nothing in this class writes "score" into the
            # metadata, so this is 0.0 unless the vector store sets it.
            "score": meta.get("score", 0.0),
        }

    def _search_keywords(self, keywords: List[str], project_uuid: str, top_k: int) -> List[Dict[str, Any]]:
        """
        Search each keyword in order and return the first *top_k* unique hits.

        Hits are de-duplicated on (first 100 characters of content, heading).
        Searching stops once *top_k* unique hits are collected: later keywords
        could only add hits past the cut, so the result is unchanged while the
        extra vector-store queries are saved.
        """
        seen = set()
        unique_docs: List[Document] = []
        for keyword in keywords:
            # A negative top_k keeps the original "all but the last n" slice
            # semantics, so the early exit only applies to non-negative values.
            if 0 <= top_k <= len(unique_docs):
                break
            for doc in self.search_by_query(keyword, top_k=5, filter_project=project_uuid):
                key = (doc.page_content[:100], doc.metadata.get("heading", ""))
                if key in seen:
                    continue
                seen.add(key)
                unique_docs.append(doc)
        return [self._doc_to_dict(doc) for doc in unique_docs[:top_k]]

    def search_by_category(self, category: str, project_uuid: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """
        Retrieve material for a category.

        Args:
            category: Category name, e.g. "项目概况", "技术方案", "财务评价", "效益分析".
                An unknown category is searched as a single keyword.
            project_uuid: Project UUID.
            top_k: Maximum number of results.

        Returns:
            Result dictionaries with content and metadata.
        """
        keywords = self._CATEGORY_KEYWORDS.get(category, [category])
        return self._search_keywords(keywords, project_uuid, top_k)

    def get_project_materials(self, project_uuid: str) -> Dict[str, Any]:
        """
        Retrieve the standard material groups of a project.

        Args:
            project_uuid: Project UUID.

        Returns:
            Dictionary with the keys ``basic_info``, ``tech_info``,
            ``finance_info`` and ``benefit_info``, each a list of page contents
            (up to five hits requested per group).
        """
        group_queries = {
            "basic_info": "项目概况 项目基本情况",
            "tech_info": "技术方案 工艺技术",
            "finance_info": "财务评价 经济效益",
            "benefit_info": "效益分析 社会效益",
        }
        return {
            group: [
                doc.page_content
                for doc in self.search_by_query(text, top_k=5, filter_project=project_uuid)
            ]
            for group, text in group_queries.items()
        }

    def search_similar_report(self, reference_content: str, top_k: int = 5) -> List[Document]:
        """
        Retrieve reports similar to a reference.

        Args:
            reference_content: Reference report content.
            top_k: Number of results.

        Returns:
            Documents returned by the vector store for a fixed query.
        """
        # NOTE(review): reference_content is not used; the query is a fixed
        # string. Confirm whether key terms should be derived from it.
        query = "后评价报告 项目概况 技术方案 财务评价"
        results = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [doc for doc, _score in results]

    def get_template_data(self, project_uuid: str, query: str = "项目概况 技术方案 财务评价", top_k: int = 15) -> Dict[str, Any]:
        """
        Retrieve material and map it onto the report template.

        Args:
            project_uuid: Project UUID.
            query: Retrieval query.
            top_k: Number of hits requested.

        Returns:
            Dictionary with ``materials``, ``materials_text``, ``template_data``
            and ``key_info``.  The same four keys are present when nothing was
            found (the empty branch previously lacked ``materials_text``).
        """
        from report_template import ReportTemplate

        materials = self.search_by_query(query, top_k=top_k, filter_project=project_uuid)

        if not materials:
            return {
                "materials": [],
                "materials_text": [],
                "template_data": {},
                "key_info": {},
            }

        texts = [doc.page_content for doc in materials]
        # Each helper receives its own copy, as before, in case one mutates it.
        key_info = ReportTemplate.extract_key_info(list(texts))
        template_data = ReportTemplate.map_materials_to_template(list(texts))

        return {
            "materials": list(materials),
            "materials_text": texts,
            "template_data": template_data,
            "key_info": key_info,
        }

    def get_chapter_materials(self, project_uuid: str, chapter: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """
        Retrieve material for a report chapter.

        Args:
            project_uuid: Project UUID.
            chapter: Chapter name; an unknown chapter is searched as a single
                keyword.
            top_k: Maximum number of results.

        Returns:
            Result dictionaries with content and metadata.
        """
        keywords = self._CHAPTER_KEYWORDS.get(chapter, [chapter])
        return self._search_keywords(keywords, project_uuid, top_k)
b/services/standard_elements_2020.py new file mode 100644 index 0000000..09a43f3 --- /dev/null +++ b/services/standard_elements_2020.py @@ -0,0 +1,1395 @@ +""" +2020 版《炼油化工建设项目后评价报告编制细则(修订)》— 附件/附表结构。 + +依据工作区《炼油化工建设项目后评价报告编制细则(修订).doc》抽取的**正文表格**与**附表1~8**表题、表头整理: +- **全局要素表**:第1章项目概况(文字要素)+正文各章**非按年**对比表(表2-1~表3-7、表4-1、表5-1~5-3/5-6/5-7、表6-1、表7-1)+ **附表1、附表2、附表8**。 +- **时间要素表**:**附表3~附表7**(利润表、税金表等按「建设期 / 后评价时点前、后」及年度栏填报)+ 正文**按年(或按评价期)**列表:**表2-4** 产品流向、**表5-4** 生产经营及效益对比、**表5-5** 主要生产经营指标;库中按「**表名 × 日历年**」各建一张时间表,列名与细则表头一致(见 TIME_TABLE_MULTI_COLUMNS)。 + +附表3~7 各行在库中使用「现金流·」「利润·」等**表内前缀**避免不同附表中同名行(如「营业收入」)在抽取路由上冲突;展示时仍以细则原文行名为 lvl3/说明。 + +规则抽取(build_rule_factor_items)仅包含概况 + 附表1/2/8 行 + 少量高频时间指标;完整模版见 ALL_GLOBAL_TABLES + ALL_TIME_TABLE_SPECS。 +""" + +from __future__ import annotations + +def _S(src: str, names: list[str]) -> list[tuple[str, str, str]]: + """章节要素行:source 作为提示词路径,lvl3 默认与 name 一致。""" + return [(n, src, n) for n in names] + + +# 细则“报告摘要、前言、第1~7章”章节要素(非表格字段),用于新建项目预置空值并供 LLM 回填。 +GLOBAL_SECTION_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = [ + ( + "章节要素-摘要与前言", + 100, + _S( + "报告摘要与前言", + [ + "摘要·项目基本情况", + "摘要·总体评价结论", + "摘要·成功度评价结果", + "摘要·主要经验", + "摘要·主要问题", + "摘要·主要建议", + "前言·评价依据", + "前言·评价范围与时点", + "前言·组织方式与过程", + "前言·基础资料清单", + "前言·需解释问题清单", + ], + ), + ), + ( + "章节要素-第1章项目概况", + 110, + _S( + "第1章 项目概况", + [ + "第1章·项目名称", + "第1章·建设单位", + "第1章·建设地点", + "第1章·建设类型", + "第1章·起止时间", + "第1章·建设内容", + "第1章·建设投资", + "第1章·占地面积", + ], + ), + ), + ( + "章节要素-第2章前期工作评价", + 120, + _S( + "第2章 前期工作评价", + [ + "第2章·资源原料评价结论", + "第2章·产品方案评价结论", + "第2章·产品市场评价结论", + "第2章·工艺技术方案评价结论", + "第2章·设备方案评价结论", + "第2章·厂址与外部条件评价结论", + "第2章·总图与配套工程评价结论", + "第2章·技术指标评价结论", + "第2章·风险分析评价结论", + "第2章·可研编制单位资质与选择评价", + "第2章·可研进度评价", + "第2章·可研质量评价", + "第2章·前评估意见采纳落实评价", + "第2章·初步设计评价结论", + "第2章·前期决策程序合规性", + "第2章·前期工作总体结论", + ], + ), + ), + ( + "章节要素-第3章建设实施评价", + 130, + _S( + "第3章 建设实施评价", + [ + "第3章·建设管理模式评价结论", + "第3章·招投标评价结论", + "第3章·施工图设计符合性评价", + "第3章·施工图设计进度评价", 
def section_table_row_keys(table_group_name: str) -> list[str]:
    """Return the row keys of one group in ``GLOBAL_SECTION_TABLES``.

    The first group whose name equals *table_group_name* is used.  Empty rows
    and rows whose key is blank are skipped; an unknown group yields ``[]``.
    """
    matched_rows = next(
        (rows for name, _order, rows in GLOBAL_SECTION_TABLES if name == table_group_name),
        None,
    )
    if matched_rows is None:
        return []
    return [str(row[0]) for row in matched_rows if row and str(row[0]).strip()]
项目建设工作程序表", "初步设计编制"), + ("初步设计审查", "附表1 项目建设工作程序表", "初步设计审查"), + ("初步设计批复", "附表1 项目建设工作程序表", "初步设计批复"), + ("施工图设计编制", "附表1 项目建设工作程序表", "施工图设计编制"), + ("开工报告批复", "附表1 项目建设工作程序表", "开工报告批复"), + ("开工建设", "附表1 项目建设工作程序表", "开工建设"), + ("投产运行", "附表1 项目建设工作程序表", "投产运行"), + ("竣工验收", "附表1 项目建设工作程序表", "竣工验收"), +] + +# 细则「附表2」竣工决算投资构成(全局) +APPENDIX2_INVESTMENT_ROWS: list[tuple[str, str, str]] = [ + ("建设投资", "附表2 项目竣工决算投资构成表(万元)", "建设投资"), + ("固定资产投资", "附表2 项目竣工决算投资构成表(万元)", "固定资产投资"), + ("工程费用", "附表2 项目竣工决算投资构成表(万元)", "工程费用"), + ("工艺生产装置", "附表2 项目竣工决算投资构成表(万元)", "工艺生产装置"), + ("总图运输", "附表2 项目竣工决算投资构成表(万元)", "总图运输"), + ("储运工程", "附表2 项目竣工决算投资构成表(万元)", "储运工程"), + ("辅助设施", "附表2 项目竣工决算投资构成表(万元)", "辅助设施"), + ("公用工程", "附表2 项目竣工决算投资构成表(万元)", "公用工程"), + ("生产管理设施", "附表2 项目竣工决算投资构成表(万元)", "生产管理设施"), + ("厂外工程", "附表2 项目竣工决算投资构成表(万元)", "厂外工程"), + ("工器具及生产家具购置费", "附表2 项目竣工决算投资构成表(万元)", "工器具及生产家具购置费"), + ("固定资产其他费用", "附表2 项目竣工决算投资构成表(万元)", "固定资产其他费用"), + ("无形资产费用", "附表2 项目竣工决算投资构成表(万元)", "无形资产费用"), + ("递延资产费用", "附表2 项目竣工决算投资构成表(万元)", "递延资产费用"), + ("固定资产投资方向调节税", "附表2 项目竣工决算投资构成表(万元)", "固定资产投资方向调节税"), + ("建设期利息", "附表2 项目竣工决算投资构成表(万元)", "建设期利息"), + ("铺底流动资金", "附表2 项目竣工决算投资构成表(万元)", "铺底流动资金"), + ("报批项目总投资", "附表2 项目竣工决算投资构成表(万元)", "报批项目总投资"), +] + +# 附表2 规范行键顺序(与 templates/js/quick-fill.js preferredA2Rows 一致) +APPENDIX2_CANONICAL_ROW_ORDER: list[str] = [ + "一 建设投资", + "1 固定资产投资", + "1.1 工程费用", + "1.1.1 工艺生产装置", + "1.1.2 总图运输", + "1.1.3 储运工程", + "1.1.4 辅助设施", + "1.1.5 公用工程", + "1.1.6 生产管理设施", + "1.1.7 厂外工程", + "1.1.8 工器具及生产家具购置费", + "1.2 固定资产其他费用", + "1.2.1 ×××费用", + "1.2.2 ×××费用", + "2 无形资产费用", + "2.1 ×××费用", + "3 递延资产费用", + "3.1 ×××费用", + "二 固定资产投资方向调节税", + "三 建设期利息", + "四 铺底流动资金", + "报批项目总投资", +] + +# 附表2 旧版短行键 → 规范行键(与 quick-fill.js getLegacyAppendix2RowKeyMap 一致) +APPENDIX2_LEGACY_ROW_KEY_MAP: dict[str, str] = { + "建设投资": "一 建设投资", + "固定资产投资": "1 固定资产投资", + "工程费用": "1.1 工程费用", + "工艺生产装置": "1.1.1 工艺生产装置", + "总图运输": "1.1.2 总图运输", + "储运工程": "1.1.3 储运工程", + "辅助设施": "1.1.4 辅助设施", + "公用工程": "1.1.5 
公用工程", + "生产管理设施": "1.1.6 生产管理设施", + "厂外工程": "1.1.7 厂外工程", + "工器具及生产家具购置费": "1.1.8 工器具及生产家具购置费", + "固定资产其他费用": "1.2 固定资产其他费用", + "无形资产费用": "2 无形资产费用", + "递延资产费用": "3 递延资产费用", + "固定资产投资方向调节税": "二 固定资产投资方向调节税", + "建设期利息": "三 建设期利息", + "铺底流动资金": "四 铺底流动资金", + "报批项目总投资": "报批项目总投资", +} + +# 细则「附表8」可研与后评价参数对比(全局;行键与要素表编辑/回填一致;税收明细用「税类·子项」避免与成本 3.x 序号冲突) +_A8 = "附表8 可研报告和后评价参数对比表" +APPENDIX8_PARAM_ROWS: list[tuple[str, str, str]] = [ + ("一 成本参数", _A8, "一 成本参数"), + ("1 原料价格", _A8, "1 原料价格"), + ("1.1 氢气", _A8, "1.1 氢气"), + ("2 催化剂和化学药剂", _A8, "2 催化剂和化学药剂"), + ("3 燃料动力价格", _A8, "3 燃料动力价格"), + ("3.1 除盐水价格", _A8, "3.1 除盐水价格"), + ("3.2 除氧水价格", _A8, "3.2 除氧水价格"), + ("3.3 循环水价格", _A8, "3.3 循环水价格"), + ("3.4 1.0MPa蒸汽价格", _A8, "3.4 1.0MPa蒸汽价格"), + ("3.5 3.5MPa蒸汽价格", _A8, "3.5 3.5MPa蒸汽价格"), + ("3.6 新鲜水", _A8, "3.6 新鲜水"), + ("3.7 电价格", _A8, "3.7 电价格"), + ("3.8 净化风价格", _A8, "3.8 净化风价格"), + ("3.9 氮气价格", _A8, "3.9 氮气价格"), + ("3.10 凝结水", _A8, "3.10 凝结水"), + ("3.11 废渣处置", _A8, "3.11 废渣处置"), + ("4 人员费用", _A8, "4 人员费用"), + ("5 折旧年限", _A8, "5 折旧年限"), + ("6 无形资产摊销年限", _A8, "6 无形资产摊销年限"), + ("7 其他资产摊销年限", _A8, "7 其他资产摊销年限"), + ("8 安全生产费用", _A8, "8 安全生产费用"), + ("9 安保基金", _A8, "9 安保基金"), + ("10 其他制造费用", _A8, "10 其他制造费用"), + ("11 其他管理费用", _A8, "11 其他管理费用"), + ("12 长期贷款利息", _A8, "12 长期贷款利息"), + ("13 短期贷款/流动资产贷款利率", _A8, "13 短期贷款/流动资产贷款利率"), + ("14 其他销售费用", _A8, "14 其他销售费用"), + ("二 营业收入参数", _A8, "二 营业收入参数"), + ("2.1 98#汽油", _A8, "2.1 98#汽油"), + ("2.2 95#汽油", _A8, "2.2 95#汽油"), + ("2.3 92#汽油", _A8, "2.3 92#汽油"), + ("2.4 异丁烷", _A8, "2.4 异丁烷"), + ("2.5 正丁烷", _A8, "2.5 正丁烷"), + ("2.6 燃料气", _A8, "2.6 燃料气"), + ("2.7 液化气", _A8, "2.7 液化气"), + ("三 税收参数", _A8, "三 税收参数"), + ("增值税税率", _A8, "增值税税率"), + ("增值税·汽油各品种产品", _A8, "汽油各品种产品"), + ("增值税·异丁烷", _A8, "异丁烷"), + ("增值税·正丁烷", _A8, "正丁烷"), + ("增值税·燃料气", _A8, "燃料气"), + ("增值税·液化气", _A8, "液化气"), + ("消费税税率", _A8, "消费税税率"), + ("消费税·产品汽油税率", _A8, "产品汽油税率"), + ("城市维护建设税税率", _A8, "城市维护建设税税率"), + ("教育费附加", _A8, "教育费附加"), + ("所得税税率", _A8, "所得税税率"), + ("四 基准收益率", _A8, "四 
def _T(src: str, names: list[str]) -> list[tuple[str, str, str]]:
    """Build the ``(row_key, source, lvl3)`` triples for one guideline table.

    Args:
        src: Source label shared by every row; per the original note it is
            used as the retrieval path prefix.
        names: Row names, in display order.

    Returns:
        One triple per name. ``lvl3`` repeats the row name so rule-based
        extraction can match on it directly.
    """
    rows: list[tuple[str, str, str]] = []
    for row_name in names:
        rows.append((row_name, src, row_name))
    return rows
"0.2", "指标权重": "0.2"}), + ("生产运行·安全环保达标情况", {"要素权重": "0.1", "指标权重": "0.2"}), + ("投资与经济效益·投资控制", {"要素权重": "0.5", "指标权重": "0.2"}), + ("投资与经济效益·经济效益", {"要素权重": "0.5", "指标权重": "0.2"}), + ("影响与持续性·装置规模和技术竞争力", {"要素权重": "0.4", "指标权重": "0.2"}), + ("影响与持续性·安全环保节能等政策影响", {"要素权重": "0.3", "指标权重": "0.2"}), + ("影响与持续性·科技进步和社会影响", {"要素权重": "0.1", "指标权重": "0.2"}), + ("影响与持续性·资源持续性", {"要素权重": "0.2", "指标权重": "0.2"}), + ("综合得分", {"要素权重": "1.0"}), +] + +TABLE_7_1_FIELDS: list[tuple[str, str, str]] = [ + (rk, "7.1.2 成功度评价", rk) for rk, _ in TABLE_7_1_ROW_CELL_DEFAULTS +] + +# --------------------------------------------------------------------------- +# 多列表格:与 ALL_GLOBAL_TABLES 中 table_name 一致。 +# 元组为 (数据列名列表, 行级默认单元格值);行键仍为「项目/要素名称」列(与单列表相同)。 +# --------------------------------------------------------------------------- +MULTI_COLUMN_GLOBAL_SPECS: dict[str, tuple[list[str], Optional[dict[str, dict[str, str]]]]] = { + "附表1 项目建设工作程序表": ( + ["开始时间", "完成时间", "文号", "部门/单位", "备注"], + None, + ), + "附表2 项目竣工决算投资构成表(万元)": ( + [ + "设备购置", + "安装工程", + "建筑工程", + "其他费用", + "合计", + "其中外汇", + "占建设投资的比例(%)", + "备注", + ], + None, + ), + "附表8 可研报告和后评价参数对比表": ( + ["单位", "可研报告", "后评价报告", "备注"], + None, + ), + TABLE_7_1_SCORING_TABLE_NAME: ( + TABLE_7_1_COLUMN_KEYS, + dict(TABLE_7_1_ROW_CELL_DEFAULTS), + ), + # 正文表(细则列结构,便于多列采集与回填) + # 细则表2-1:行键=原料名称;列与 Word 表头一致(序号仅展示用,不入库为列) + "表2-1 资源(原料)组成、数量对比表": ( + [ + "规格", + "可研报告数量(万吨)", + "可研报告占比(%)", + "初步设计数量(万吨)", + "初步设计占比(%)", + "实际生产数量(万吨)", + "实际生产占比(%)", + "备注", + ], + None, + ), + "表2-2 资源(原料)性质对比表": ( + ["可研报告", "初步设计", "实际生产", "备注"], + None, + ), + "表2-3 产品方案对比表": ( + ["可研规格", "可研数量(万吨/年)", "实际规格", "实际数量(万吨/年)", "备注"], + None, + ), + "表2-5 总图、储运、公用工程及辅助工程对比": ( + ["单位", "可研报告", "初步设计", "实际实施", "备注"], + None, + ), + "表2-6 储运、公用工程及辅助工程依托对比": ( + ["单位", "可研报告", "初步设计", "实际实施", "备注"], + None, + ), + "表2-7 主要设计指标对比表": ( + ["单位", "可研报告", "初步设计", "实际运行", "备注"], + None, + ), + "表5-1 主要经济指标对比表": ( + ["单位", "可研值", "后评价值", "差值", "比例(%)", "备注"], + None, + ), + 
"表3-1 项目承包单位情况": ( + ["承包单位", "(合同金额)(万元)", "是/否招标", "资质情况"], + None, + ), + "表3-2 施工图设计进度情况": ( + ["设计单位", "合同期限", "实际执行情况", "备注"], + None, + ), + "表3-3 施工图设计变更情况(全厂性项目)": ( + ["设计变更(份数)", "设计变更金额(万元)", "备注"], + None, + ), + "表3-4 施工图设计变更情况(单装置项目)": ( + ["设计变更(份数)", "设计变更金额(万元)", "备注"], + None, + ), + "表3-5 影响投资或工期重(较)大设计变更及原因分析": ( + ["变更内容", "金额(万元)", "原因", "备注"], + None, + ), + "表3-6 施工进度情况": ( + ["施工单位", "合同期限", "实际执行情况", "备注"], + None, + ), + "表3-7 采购工作情况": ( + ["采购方式", "制造商", "供货商", "金额(万元)", "未招标原因"], + None, + ), + "表4-1 投产以来运行周期统计表": ( + [ + "本周期开工日期", + "本周期运行时间(天)", + "非计划停工·次数(次)", + "非计划停工·时数(时)", + "原因简要分析", + ], + None, + ), + "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)": ( + ["单位", "设计值", "标定值", "实际值", "备注"], + None, + ), + "表5-2 投资变动情况表(单位:万元、万美元)": ( + [ + "投资估算", + "初设概算", + "竣工决算", + "决算较估算·差额", + "决算较估算·比例(%)", + "决算较概算·差额", + "决算较概算·比例(%)", + ], + None, + ), + "表5-3 工程费用变动情况表(万元、万美元)": ( + [ + "投资估算", + "初设概算", + "竣工决算", + "决算较估算·差额", + "决算较估算·比例(%)", + "决算较概算·差额", + "决算较概算·比例(%)", + ], + None, + ), + "表5-6 不同因素变化对项目内部收益率的影响": ( + ["财务内部收益率(%)", "变化幅度", "占比"], + None, + ), + "表5-7 内部收益率为基准收益率时不确定因素临界点或临界值": ( + ["单位", "数值", "备注"], + None, + ), + "表6-1 装置技术经济指标对比表": ( + [ + "技术来源", + "规模(万吨/年)", + "物耗(Wt)%", + "能耗(kgEo/t)", + "产品质量", + "产品收率(Wt)%", + "排名", + ], + None, + ), +} + +# 时间表默认列(细则附表4/6/7、表5-5 等以「后评价时点前/后」分栏;具体见 TIME_TABLE_MULTI_COLUMNS)。 +TIME_APPENDIX_MULTI_COLUMNS: list[str] = ["后评价时点前实际值", "后评价时点后预测值"] + +TIME_TABLE_MULTI_COLUMNS: dict[str, list[str]] = { + "附表3 项目投资财务现金流量表(万元)": ["建设期", "后评价时点前实际值", "后评价时点后预测值"], + "附表4 利润与利润分配计算表(万元)": ["后评价时点前实际值", "后评价时点后预测值"], + "附表5 营业收入与营业税金及附加计算表(万元)": [ + "价格(元/t)", + "后评价时点前实际值", + "后评价时点后预测值", + ], + "附表6 总成本费用计算表(万元)": ["后评价时点前实际值", "后评价时点后预测值"], + "附表7 原材料、燃料及动力费用计算表(万元)": ["后评价时点前实际值", "后评价时点后预测值"], + "表2-4 ××年项目主要产品流向状况": [ + "规格", + "实际产量", + "销量", + "产品实际流向", + "可研报告产品流向", + "备注", + ], + # 与前端多年栏一致:每年三列,首年槽位「××年#1」,避免多栏共用一个 col_key + "表5-4 生产经营及效益情况对比表": [ + "可研报告|××年#1", + "实际值|××年#1", + 
"增减(%)|××年#1", + ], + "表5-5 主要生产经营指标": ["后评价时点前实际值", "后评价时点后预测值"], +} + + +def _norm_time_table_lookup_key(name: str) -> str: + """空白/全半角括号/连字符差异下稳定匹配 TIME_TABLE_MULTI_COLUMNS 键(如「(万元)」与「(万元)」)。""" + t = "".join(str(name or "").split()) + t = ( + t.replace("(", "(") + .replace(")", ")") + .replace("-", "-") + .replace("—", "-") + .replace("–", "-") + ) + return t.casefold() + + +def time_table_default_columns_for_name(table_name: str) -> list[str] | None: + """时间表默认列顺序:先精确命中,再规范化表名后模糊命中。""" + raw = str(table_name or "").strip() + if not raw: + return None + hit = TIME_TABLE_MULTI_COLUMNS.get(raw) + if hit is not None: + return list(hit) + n = _norm_time_table_lookup_key(raw) + for k, v in TIME_TABLE_MULTI_COLUMNS.items(): + if _norm_time_table_lookup_key(k) == n: + return list(v) + return None + + +# 细则正文「第2章~第7章」表格(与 Word 中表题一致;行键在可能冲突处加类别前缀) +GLOBAL_CHAPTER_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = [ + ( + "表2-1 资源(原料)组成、数量对比表", + 850, + _T("2.1.1 资源与原料评价", ["气分重碳四", "MTBE醚后碳四", "氢气", "合计"]), + ), + ( + "表2-2 资源(原料)性质对比表", + 851, + _T( + "2.1.1 资源与原料评价", + ["密度(kg/m³)", "硫含量(ppm)", "氮含量(ppm)", "其它指标(可增删)"], + ), + ), + ( + "表2-3 产品方案对比表", + 852, + _T( + "2.1.2.1 产品方案评价", + [ + "汽油", + "航空煤油", + "柴油", + "XX化工品", + "XX润滑油", + "其它产品", + "轻油产品率(%)", + "综合商品率(%)", + "柴汽比", + ], + ), + ), + ( + "表2-5 总图、储运、公用工程及辅助工程对比", + 854, + _T( + "2.1.5 总图及系统配套工程评价", + [ + "占地面积", + "建筑面积", + "铁路专用线", + "产品仓库面积", + "产品储罐总容积", + "原料储罐总容积", + "净水厂总能力", + "循环水厂总能力", + "污水处理厂总能力", + "总变电所总容量", + "锅炉供热总能力", + "辅助设施", + "其它(可增删)", + ], + ), + ), + ( + "表2-6 储运、公用工程及辅助工程依托对比", + 855, + _T( + "2.1.5 总图及系统配套工程评价", + [ + "依托·铁路专用线", + "依托·产品仓库面积", + "依托·原料储罐容积", + "依托·产品储罐容积", + "依托·净化水厂能力", + "依托·循环水厂能力", + "依托·污水处理厂能力", + "依托·总变电所容量", + "依托·锅炉供热能力", + "依托·辅助设施", + "依托·其它(可增删)", + ], + ), + ), + ( + "表2-7 主要设计指标对比表", + 856, + _T( + "2.1.7 主要技术指标评价", + [ + "原油加工量", + "综合商品率", + "全厂柴汽比", + "全厂新鲜水耗", + "全厂平均电耗", + "能耗", + "其它综合指标", + "常减压蒸馏装置能耗", + "其它装置指标(可增删)", + ], + ), + 
), + ( + "表3-1 项目承包单位情况", + 860, + _T("3.2 招投标评价", ["承包单元·示例1", "承包单元·示例2", "承包单元·示例3"]), + ), + ( + "表3-2 施工图设计进度情况", + 861, + _T("3.3.2 设计进度评价", ["工艺装置", "公用工程", "辅助设施"]), + ), + ( + "表3-3 施工图设计变更情况(全厂性项目)", + 862, + _T("3.3.4 施工图设计变更管理评价", ["工艺装置", "公用工程", "辅助设施", "合计"]), + ), + ( + "表3-4 施工图设计变更情况(单装置项目)", + 863, + _T("3.3.4 施工图设计变更管理评价", ["工艺", "电气", "其它专业(可增删)", "合计"]), + ), + ( + "表3-5 影响投资或工期重(较)大设计变更及原因分析", + 864, + _T("3.3.4 施工图设计变更管理评价", ["重大变更·示例1", "重大变更·示例2", "重大变更·示例3"]), + ), + ( + "表3-6 施工进度情况", + 865, + _T("3.4.2 施工计划的执行情况", ["工艺装置", "公用工程", "辅助设施"]), + ), + ( + "表3-7 采购工作情况", + 866, + _T( + "3.5 采购工作评价", + [ + "采购物资·示例1", + "采购物资·示例2", + "采购物资·示例3", + "应招标数量(个)", + "招标数量率(%)", + "应招标金额(万元)", + "招标金额率(%)", + ], + ), + ), + ( + "表4-1 投产以来运行周期统计表", + 870, + _T("4.3.2 生产运行总体情况评价", ["运行周期·装置示例1", "运行周期·装置示例2"]), + ), + ( + "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)", + 871, + _T( + "4.3.3 达标评价", + [ + "生产能力", + "主要原材料(代表物料)", + "主要产品产量(代表产品)", + "公用工程消耗·水", + "公用工程消耗·蒸汽", + "公用工程消耗·电", + "公用工程消耗·燃料气", + "综合能耗", + "现金加工成本", + "单位毛利", + "其它(可增删)", + ], + ), + ), + ( + "表5-1 主要经济指标对比表", + 880, + _T( + "5.1 主要经济指标实现程度评价", + [ + "1 项目报批总投资", + "1.1 建设投资", + "1.2 建设期利息", + "1.3 铺底流动资金", + "2 年均营业收入", + "3 年均总成本费用", + "4 年均流转税金及附加", + "5 年均利润总额", + "6 年均所得税金", + "7 年均税后利润", + "8 项目投资内部收益率(税后)", + "9 项目投资财务净现值(税后)", + "10 项目静态投资回收期(含建设期)", + ], + ), + ), + ( + "表5-2 投资变动情况表(单位:万元、万美元)", + 881, + _T( + "5.2.1 投资控制及变动原因分析", + [ + "批准单位", + "批准文号", + "一 建设投资", + "1 固定资产投资", + "1.1 工程费用", + "1.1.1 工艺生产装置", + "1.1.2 总图运输", + "1.1.3 储运工程", + "1.1.4 辅助设施", + "1.1.5 公用工程", + "1.1.6 生产管理设施", + "1.1.7 厂外工程", + "1.1.8 工器具及生产家具购置费", + "1.2 固定资产其它费用", + "1.2.1 ×××费用", + "1.2.2 ×××费用", + "2 无形资产费用", + "2.1 ×××费用", + "3 递延资产费用", + "3.1 ×××费用", + "4 预备费用", + "4.1 基本预备费", + "4.2 价差预备费", + "二 固定资产投资方向调节税", + "三 建设期利息", + "四 铺底流动资金", + # 与附表2「报批项目总投资」行键区分,避免全局回填串表 + "报批项目总投资(投资变动表)", + "其中:外汇(投资变动表)", + ], + ), + ), + ( + "表5-3 工程费用变动情况表(万元、万美元)", + 882, + _T( + "5.2.1 投资控制及变动原因分析", + [ + 
"批准单位", + "批准文号", + "工程费用", + "工程费用·其中:外汇", + "1 工艺生产装置", + "1 工艺生产装置·其中:外汇", + "1.1 ×××装置", + "1.1 ×××装置·其中:外汇", + "1.1.1 设备购置费", + "1.1.1 设备购置费·其中:外汇", + "1.1.2 安装工程费", + "1.1.2 安装工程费·其中:外汇", + "1.1.3 建筑工程费", + "1.2 ×××装置", + "1.2 ×××装置·其中:外汇", + "1.2.1 设备购置费", + "1.2.1 设备购置费·其中:外汇", + "1.2.2 安装工程费", + "1.2.2 安装工程费·其中:外汇", + "1.2.3 建筑工程费", + "2 总图运输", + "3 储运工程", + "其它分项(可增删)", + "工程费用合计", + ], + ), + ), + ( + "表5-6 不同因素变化对项目内部收益率的影响", + 890, + _T( + "5.3.2 项目经济效益后评价", + [ + "可研报告(基准)", + "后评价报告", + "建设投资变动", + "价格体系变动", + "生产负荷变动", + "建设周期变动", + "其它因素(可增删)", + ], + ), + ), + ( + "表5-7 内部收益率为基准收益率时不确定因素临界点或临界值", + 891, + _T( + "5.4 不确定性分析", + ["生产负荷临界点", "产品价格临界值", "主要原材料价格临界值", "其它不确定因素"], + ), + ), + ( + "表6-1 装置技术经济指标对比表", + 895, + _T( + "6.2.3 主要技术及经济指标对比", + [ + "XX装置·示例1", + "XX装置·示例2", + "XX装置·示例3", + "XX装置·示例4", + "XX装置·示例5", + ], + ), + ), + ( + TABLE_7_1_SCORING_TABLE_NAME, + 896, + TABLE_7_1_FIELDS, + ), +] + +ALL_GLOBAL_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = ( + GLOBAL_SECTION_TABLES + GLOBAL_CHAPTER_TABLES + GLOBAL_APPENDIX_TABLES +) + +# 表5-3 新旧 row_key 同义组(与 templates/js/quick-fill.js preferred53Specs 一致;报告/要素展示优先新键) +TABLE_5_3_ROW_KEY_ALTERNATES: tuple[tuple[str, ...], ...] 
= ( + ("批准单位", "工程费用变动·批准单位"), + ("批准文号", "工程费用变动·批准文号"), + ("工程费用",), + ("工程费用·其中:外汇",), + ("工程费用合计", "工程费用变动·工程费用合计"), + ("1 工艺生产装置", "工程费用变动·工艺生产装置"), + ("1 工艺生产装置·其中:外汇",), + ("1.1 ×××装置",), + ("1.1 ×××装置·其中:外汇",), + ("1.1.1 设备购置费", "工程费用变动·装置·设备购置费"), + ("1.1.1 设备购置费·其中:外汇",), + ("1.1.2 安装工程费", "工程费用变动·装置·安装工程费"), + ("1.1.2 安装工程费·其中:外汇",), + ("1.1.3 建筑工程费", "工程费用变动·装置·建筑工程费"), + ("1.2 ×××装置",), + ("1.2 ×××装置·其中:外汇",), + ("1.2.1 设备购置费",), + ("1.2.1 设备购置费·其中:外汇",), + ("1.2.2 安装工程费",), + ("1.2.2 安装工程费·其中:外汇",), + ("1.2.3 建筑工程费",), + ("2 总图运输", "工程费用变动·总图运输"), + ("3 储运工程", "工程费用变动·储运工程"), + ("其它分项(可增删)", "工程费用变动·其它分项(可增删)"), +) + + +def _norm_global_table_lookup_key(name: str) -> str: + """空白/全半角括号/连字符差异下稳定匹配 ALL_GLOBAL_TABLES 表名。""" + return _norm_time_table_lookup_key(name) + + +def global_table_row_keys(table_name: str) -> list[str]: + """返回 ``ALL_GLOBAL_TABLES`` 中某张全局表的 canonical 行键顺序(与要素库 ``row_key`` 一致)。""" + raw = str(table_name or "").strip() + if not raw: + return [] + for name, _, fields in ALL_GLOBAL_TABLES: + if name == raw: + return [str(r[0]) for r in fields if r and str(r[0]).strip()] + n = _norm_global_table_lookup_key(raw) + for name, _, fields in ALL_GLOBAL_TABLES: + if _norm_global_table_lookup_key(name) == n: + return [str(r[0]) for r in fields if r and str(r[0]).strip()] + return [] + + +def canonical_row_order_for_table(table_name: str) -> list[str] | None: + """附表2~8 及时间附表3~7 的标准行键顺序(与 quick-fill.js preferred*Rows / TIME_APPENDIX_SPECS 一致)。""" + raw = str(table_name or "").strip() + if not raw: + return None + n = _norm_global_table_lookup_key(raw) + if "附表2" in raw and "项目竣工决算投资构成表" in raw: + return list(APPENDIX2_CANONICAL_ROW_ORDER) + for spec_name, rows in TIME_APPENDIX_SPECS: + if _norm_global_table_lookup_key(spec_name) == n: + return list(rows) + for spec_name, rows in TIME_BODY_SPECS: + if _norm_global_table_lookup_key(spec_name) == n: + return list(rows) + if "附表8" in raw and "可研报告和后评价参数对比表" in raw: + return 
list(APPENDIX8_CANONICAL_ROW_ORDER) + return None + + +# 规则 /factors 抽取用:仅概况 + 附表1/2/8,避免正文几十张表拖慢超时;正文表仍完整预置在 ALL_GLOBAL_TABLES。 +RULE_GLOBAL_TABLES: list[tuple[str, int, list[tuple[str, str, str]]]] = ( + GLOBAL_SECTION_TABLES + GLOBAL_APPENDIX_TABLES +) + +# --------------------------------------------------------------------------- +# 时间维度:附表3~7 + 正文按年表;每(表名, 年)一张 ElementTable,行键与细则「项目名称」一致并加表前缀防冲突。 +# --------------------------------------------------------------------------- +_CF = "现金流量·" +_PL = "利润表·" +_TX = "税金表·" +_CT = "成本表·" +_MT = "料燃动·" + +TIME_APPENDIX_SPECS: list[tuple[str, list[str]]] = [ + ( + "附表3 项目投资财务现金流量表(万元)", + [ + _CF + "1 现金流入", + _CF + "1.1 营业收入", + _CF + "1.2 回收固定资产余值", + _CF + "1.3 回收流动资金", + _CF + "2 现金流出", + _CF + "2.1 建设投资", + _CF + "2.2 流动资金", + _CF + "2.3 经营成本", + _CF + "2.4 营业税金及附加", + _CF + "2.5 调整所得税", + _CF + "3 净现金流量", + _CF + "计算指标·所得税后财务内部收益率(%)", + _CF + "计算指标·所得税后财务净现值(万元)", + _CF + "计算指标·所得税后静态投资回收期(年)", + ], + ), + ( + "附表4 利润与利润分配计算表(万元)", + [ + _PL + "1 营业收入", + _PL + "2 总成本费用", + _PL + "3 营业税金及附加", + _PL + "4 利润总额", + _PL + "5 弥补以前年度亏损", + _PL + "6 应纳税所得额", + _PL + "7 所得税", + _PL + "8 净利润", + _PL + "9 盈余公积及公益金", + _PL + "10 可供分配利润", + _PL + "11 息税前利润总额", + _PL + "12 调整所得税", + ], + ), + ( + "附表5 营业收入与营业税金及附加计算表(万元)", + [ + _TX + "1 营业收入", + _TX + "1.1 产品A·销量", + _TX + "1.1 产品A·营业收入", + _TX + "1.1 产品A·销项税", + _TX + "1.2 产品B·销量", + _TX + "1.2 产品B·营业收入", + _TX + "1.2 产品B·销项税", + _TX + "1.3 产品·……", + _TX + "3 增值税", + _TX + "3.1 销项税", + _TX + "3.2 进项税", + _TX + "3.3 设备材料进项税", + _TX + "4 消费税", + _TX + "4.1 汽油", + _TX + "4.2 柴油", + _TX + "4.3 ……", + _TX + "5 城建税", + _TX + "6 教育费附加", + _TX + "7 营业税金及附加", + ], + ), + ( + "附表6 总成本费用计算表(万元)", + [ + _CT + "1 生产成本", + _CT + "1.1 原材料", + _CT + "1.2 辅助材料", + _CT + "1.3 燃料", + _CT + "1.4 动力", + _CT + "1.5 员工工资及福利", + _CT + "1.6 制造费用", + _CT + "1.6.1 折旧费", + _CT + "1.6.2 修理费", + _CT + "1.6.3 其他制造费用", + _CT + "2 期间费用", + _CT + "2.1 无形资产摊销", + _CT + "2.2 递延资产摊销", + _CT + "2.3 安全生产费用", + 
_CT + "2.4 安保基金", + _CT + "2.5 其他管理费", + _CT + "3 财务费用", + _CT + "3.1 长期借款利息", + _CT + "3.2 流动资金借款利息", + _CT + "4 总成本费用", + _CT + "4.1 固定成本", + _CT + "4.2 可变成本", + _CT + "5 经营成本", + _CT + "6 单位加工成本", + _CT + "7 单位产品生产成本(化工项目)", + ], + ), + ( + "附表7 原材料、燃料及动力费用计算表(万元)", + [ + _MT + "1 原材料费用", + _MT + "1.1 原料A", + _MT + "1.1 原料A·单价", + _MT + "1.1 原料A·数量", + _MT + "1.1 原料A·进项税额", + _MT + "1.2 原料B", + _MT + "1.2 原料B·……", + _MT + "2 辅助材料费用", + _MT + "2.1 辅助材料A", + _MT + "2.1 辅助材料A·单价", + _MT + "2.1 辅助材料A·数量", + _MT + "2.1 辅助材料A·进项税额", + _MT + "2.2 辅助材料B", + _MT + "2.2 辅助材料B·……", + _MT + "3 燃料费", + _MT + "3.1 燃料A", + _MT + "3.1 燃料A·单价", + _MT + "3.1 燃料A·数量", + _MT + "3.1 燃料A·进项税额", + _MT + "3.2 燃料B", + _MT + "3.2 燃料B·……", + _MT + "4 动力费", + _MT + "4.1 动力A", + _MT + "4.1 动力A·单价", + _MT + "4.1 动力A·数量", + _MT + "4.1 动力A·进项税额", + _MT + "4.2 动力B", + _MT + "4.2 动力B·……", + _MT + "5 进项税合计", + ], + ), +] + +# 细则正文要求按年(或评价年)填报的表格,与附表3~7 相同按「年 × 表」预置 +TIME_BODY_SPECS: list[tuple[str, list[str]]] = [ + ( + "表2-4 ××年项目主要产品流向状况", + [ + "产品名称·1", + "产品名称·2", + "产品名称·3", + "小计", + ], + ), + ( + "表5-4 生产经营及效益情况对比表", + [ + "运行情况·生产天数", + "运行情况·负荷率", + "主要原料价格·氢气", + "主要产品年产量·98#汽油", + "主要产品年产量·95#汽油", + "主要产品年产量·92#汽油", + "主要产品年产量·异丁烷", + "主要产品年产量·正丁烷", + "主要产品年产量·燃料气", + "主要产品年产量·液化气", + "主要产品年销售量·98#汽油", + "主要产品年销售量·95#汽油", + "主要产品年销售量·92#汽油", + "主要产品年销售量·异丁烷", + "主要产品年销售量·正丁烷", + "主要产品年销售量·燃料气", + "主要产品年销售量·液化气", + "主要原料和公用工程消耗量·氢气", + "主要原料和公用工程消耗量·辅助材料", + "主要原料和公用工程消耗量·电", + "主要原料和公用工程消耗量·净化风", + "主要原料和公用工程消耗量·循环水", + "主要原料和公用工程消耗量·除盐水", + "主要原料和公用工程消耗量·除氧水", + "主要原料和公用工程消耗量·蒸汽1.0MPa", + "主要原料和公用工程消耗量·蒸汽3.5MPa", + "主要原料和公用工程消耗量·氮气", + "主要原料和公用工程消耗量·废渣处理", + "主要原料和公用工程消耗量·新鲜水", + "主要原料和公用工程消耗量·凝结水", + "主要经济指标·营业收入", + "主要经济指标·成本费用", + "主要经济指标·利润总额", + "主要经济指标·税后利润", + ], + ), + ( + "表5-5 主要生产经营指标", + [ + "生产负荷", + "原料消耗量", + "燃料消耗量", + "动力消耗量", + "产品产量", + "其它", + ], + ), +] + +ALL_TIME_TABLE_SPECS: list[tuple[str, list[str]]] = TIME_APPENDIX_SPECS + TIME_BODY_SPECS + +# 
哈尔滨石化分公司烷基化装置建设项目 — 表5-4 生产经营及效益情况对比(可研预测 vs 时点前实际值 2019 年) +# 通过 element_service.apply_harbin_alkylation_table54_preset 或 POST .../presets/harbin-alkylation-table54 写入指定项目的要素表。 +# 列键须与 TIME_TABLE_MULTI_COLUMNS 中「可研报告|××年#1」一致,避免与模板同步占位列并存时出现双「××年」空栏。 +HARBIN_ALKYLATION_TABLE54_CELL_VALUES: dict[str, dict[str, str]] = { + "运行情况·生产天数": {"可研报告|××年#1": "365", "实际值|××年#1": "334", "增减(%)|××年#1": "-8.49"}, + "运行情况·负荷率": {"可研报告|××年#1": "100.00%", "实际值|××年#1": "42.87%", "增减(%)|××年#1": "-57.13"}, + "主要原料价格·氢气": {"可研报告|××年#1": "4000", "实际值|××年#1": "4376.74", "增减(%)|××年#1": "9.42"}, + "主要产品年产量·98#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.30", "增减(%)|××年#1": "-72.16"}, + "主要产品年产量·95#汽油": {"可研报告|××年#1": "14.03", "实际值|××年#1": "6.13", "增减(%)|××年#1": "-56.34"}, + "主要产品年产量·92#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.00", "增减(%)|××年#1": "-100.00"}, + "主要产品年产量·异丁烷": {"可研报告|××年#1": "", "实际值|××年#1": "1.38", "增减(%)|××年#1": "/"}, + "主要产品年产量·正丁烷": {"可研报告|××年#1": "5.64", "实际值|××年#1": "1.17", "增减(%)|××年#1": "-79.27"}, + "主要产品年产量·燃料气": {"可研报告|××年#1": "", "实际值|××年#1": "0.12", "增减(%)|××年#1": "/"}, + "主要产品年产量·液化气": {"可研报告|××年#1": "-21.93", "实际值|××年#1": "-9.24", "增减(%)|××年#1": "-57.85"}, + "主要产品年销售量·98#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.30", "增减(%)|××年#1": "-72.16"}, + "主要产品年销售量·95#汽油": {"可研报告|××年#1": "14.03", "实际值|××年#1": "6.13", "增减(%)|××年#1": "-56.34"}, + "主要产品年销售量·92#汽油": {"可研报告|××年#1": "1.08", "实际值|××年#1": "0.00", "增减(%)|××年#1": "-100.00"}, + "主要产品年销售量·异丁烷": {"可研报告|××年#1": "", "实际值|××年#1": "1.38", "增减(%)|××年#1": "/"}, + "主要产品年销售量·正丁烷": {"可研报告|××年#1": "5.64", "实际值|××年#1": "1.17", "增减(%)|××年#1": "-79.27"}, + "主要产品年销售量·燃料气": {"可研报告|××年#1": "", "实际值|××年#1": "0.12", "增减(%)|××年#1": "/"}, + "主要产品年销售量·液化气": {"可研报告|××年#1": "-21.93", "实际值|××年#1": "-9.24", "增减(%)|××年#1": "-57.85"}, + "主要原料和公用工程消耗量·氢气": {"可研报告|××年#1": "0.02", "实际值|××年#1": "0.01", "增减(%)|××年#1": "-51.49"}, + "主要原料和公用工程消耗量·辅助材料": {"可研报告|××年#1": "3054", "实际值|××年#1": "796.66", "增减(%)|××年#1": "-73.91"}, + "主要原料和公用工程消耗量·电": 
{"可研报告|××年#1": "1346", "实际值|××年#1": "669.75", "增减(%)|××年#1": "-50.24"}, + "主要原料和公用工程消耗量·净化风": {"可研报告|××年#1": "294", "实际值|××年#1": "235.02", "增减(%)|××年#1": "-20.06"}, + "主要原料和公用工程消耗量·循环水": {"可研报告|××年#1": "483", "实际值|××年#1": "231.96", "增减(%)|××年#1": "-51.97"}, + "主要原料和公用工程消耗量·除盐水": {"可研报告|××年#1": "10.50", "实际值|××年#1": "4.01", "增减(%)|××年#1": "-61.80"}, + "主要原料和公用工程消耗量·除氧水": {"可研报告|××年#1": "", "实际值|××年#1": "0.01", "增减(%)|××年#1": "/"}, + "主要原料和公用工程消耗量·蒸汽1.0MPa": {"可研报告|××年#1": "-8.99", "实际值|××年#1": "-6.11", "增减(%)|××年#1": "-31.98"}, + "主要原料和公用工程消耗量·蒸汽3.5MPa": {"可研报告|××年#1": "28.31", "实际值|××年#1": "19.67", "增减(%)|××年#1": "-30.52"}, + "主要原料和公用工程消耗量·氮气": {"可研报告|××年#1": "", "实际值|××年#1": "288.17", "增减(%)|××年#1": "/"}, + "主要原料和公用工程消耗量·废渣处理": {"可研报告|××年#1": "0.04", "实际值|××年#1": "0.0121", "增减(%)|××年#1": "-67.73"}, + "主要原料和公用工程消耗量·新鲜水": {"可研报告|××年#1": "", "实际值|××年#1": "0.15", "增减(%)|××年#1": "/"}, + "主要原料和公用工程消耗量·凝结水": {"可研报告|××年#1": "19.32", "实际值|××年#1": "0.00", "增减(%)|××年#1": "-100.00"}, + "主要经济指标·营业收入": {"可研报告|××年#1": "64278", "实际值|××年#1": "30610", "增减(%)|××年#1": "-52.38"}, + "主要经济指标·成本费用": {"可研报告|××年#1": "10627", "实际值|××年#1": "7332", "增减(%)|××年#1": "-31.01"}, + "主要经济指标·利润总额": {"可研报告|××年#1": "13785", "实际值|××年#1": "7876", "增减(%)|××年#1": "-42.87"}, + "主要经济指标·税后利润": {"可研报告|××年#1": "10339", "实际值|××年#1": "5907", "增减(%)|××年#1": "-42.87"}, +} + +# 行键已按表加前缀,一般无需覆盖;保留字典供日后特例。 +TIME_ROW_TABLE_OVERRIDE: dict[str, str] = {} + +TIME_ROW_PRIMARY_TABLE: dict[str, str] = {} +for _tname, _rows in ALL_TIME_TABLE_SPECS: + for _rk in _rows: + if _rk not in TIME_ROW_PRIMARY_TABLE: + TIME_ROW_PRIMARY_TABLE[_rk] = _tname +for _k, _t in TIME_ROW_TABLE_OVERRIDE.items(): + TIME_ROW_PRIMARY_TABLE[_k] = _t + +TIME_KEY_SET: set[str] = {r for _, rows in ALL_TIME_TABLE_SPECS for r in rows} + +# 规则抽取用:全局 + 少量时间字段(name 须与时间表行键一致) +RULE_EXTRACT_EXTRA_TIME: list[tuple[str, str, str, str]] = [ + ("附表4 利润与利润分配计算表(万元)", _PL + "1 营业收入", "5.3.1 项目投产以来生产经营及效益状况", "营业收入"), + ("附表4 利润与利润分配计算表(万元)", _PL + "2 总成本费用", 
"5.3.1 项目投产以来生产经营及效益状况", "总成本费用"), + ("附表4 利润与利润分配计算表(万元)", _PL + "8 净利润", "5.3.1 项目投产以来生产经营及效益状况", "净利润"), + ("附表5 营业收入与营业税金及附加计算表(万元)", _TX + "7 营业税金及附加", "5.3.2 项目经济效益后评价", "营业税金及附加"), + ("附表3 项目投资财务现金流量表(万元)", _CF + "3 净现金流量", "5.3.2 项目经济效益后评价", "净现金流量"), +] + + +def build_rule_factor_items() -> list[dict[str, str]]: + """规则抽取条目(精简);完整要素模版见 ALL_GLOBAL_TABLES + ALL_TIME_TABLE_SPECS。""" + out: list[dict[str, str]] = [] + for table_name, _base, fields in RULE_GLOBAL_TABLES: + for name, source, lvl3 in fields: + out.append( + { + "name": name, + "source": source, + "lvl3": lvl3, + "table_name": table_name, + "table_type": "global", + } + ) + for table_name, name, source, lvl3 in RULE_EXTRACT_EXTRA_TIME: + out.append( + { + "name": name, + "source": source, + "lvl3": lvl3, + "table_name": table_name, + "table_type": "time", + } + ) + return out + + +ROW_KEY_TO_GLOBAL_TABLE: dict[str, str] = {} +for _tn, _b, fields in ALL_GLOBAL_TABLES: + for key, _, _ in fields: + ROW_KEY_TO_GLOBAL_TABLE[key] = _tn + +GLOBAL_KEY_SET = {key for key in ROW_KEY_TO_GLOBAL_TABLE} + + +def all_global_row_specs() -> list[tuple[str, str, str, str]]: + rows: list[tuple[str, str, str, str]] = [] + for table_name, _b, fields in ALL_GLOBAL_TABLES: + for name, source, lvl3 in fields: + rows.append((table_name, name, source, lvl3)) + return rows + + +# 章节/表到“材料文档关键词”硬映射: +# key: source 分组名(通常为 table_name,如“章节要素-第2章前期工作评价”“附表4 ...”) +# value: 该分组允许命中的文档名关键词(按 document_markdowns.extracted_filename/kb_documents.name 匹配) +# +# 说明: +# 1) 这里不做打分,命中即纳入; +# 2) 未配置的分组默认不过滤(避免因未补齐映射导致漏抽); +# 3) 关键词建议用 2~8 字的稳定片段,如“可研”“初设”“竣工决算”“后评价报告”。 +SOURCE_DOC_KEYWORDS_MAP: dict[str, list[str]] = { + # 摘要与正文章节(从摘要开始) + "章节要素-摘要与前言": ["可研"], + "章节要素-第1章项目概况": ["可研", "竣工验收"], + "章节要素-第2章前期工作评价": ["可研", "初设"], + "章节要素-第3章建设实施评价": ["初设", "施工", "监理", "竣工验收"], + "章节要素-第4章生产运行评价": ["生产运行", "标定", "运行月报", "可研"], + "章节要素-第5章投资与经济效益评价": ["竣工决算", "财务", "可研"], + "章节要素-第6章影响与持续性评价": [ + "可研", + "环评", + "环境影响", + "环保", + "环境保护", + 
"验收监测", + "安评", + "安全评价", + "安全预评价", + "安全验收", + "安全设施", + ], + + # 第2章相关表 + "表2-1 资源(原料)组成、数量对比表": ["可研"], + "表2-2 资源(原料)性质对比表": ["可研"], + "表2-3 产品方案对比表": ["可研"], + "表2-4 ××年项目主要产品流向状况": ["可研"], + "表2-5 总图、储运、公用工程及辅助工程对比": ["可研"], + "表2-6 储运、公用工程及辅助工程依托对比": ["可研"], + "表2-7 主要设计指标对比表": ["可研"], + + # 第3章相关表 + "表3-1 项目承包单位情况": ["施工", "监理", "可研"], + "表3-2 施工图设计进度情况": ["施工图", "初设", "可研"], + "表3-3 施工图设计变更情况(全厂性项目)": ["施工图", "设计变更", "可研"], + "表3-4 施工图设计变更情况(单装置项目)": ["施工图", "设计变更", "可研"], + "表3-5 影响投资或工期重(较)大设计变更及原因分析": ["设计变更", "可研"], + "表3-6 施工进度情况": ["施工进度", "可研"], + "表3-7 采购工作情况": ["采购", "可研"], + + # 第4章相关表 + "表4-1 投产以来运行周期统计表": ["生产运行", "可研"], + "表4-2 烷基化装置运行分析表(考核时间:×年×月×日)": ["生产运行", "标定", "可研"], + + # 第5章相关表 + "表5-1 主要经济指标对比表": ["财务", "竣工决算", "可研"], + "表5-2 投资变动情况表(单位:万元、万美元)": ["竣工决算", "投资", "可研"], + "表5-3 工程费用变动情况表(万元、万美元)": ["竣工决算", "工程费用", "可研"], + "表5-4 生产经营及效益情况对比表": ["生产经营", "财务", "可研"], + "表5-5 主要生产经营指标": ["生产经营", "财务", "可研"], + "表5-6 不同因素变化对项目内部收益率的影响": ["财务", "可研"], + "表5-7 内部收益率为基准收益率时不确定因素临界点或临界值": ["财务", "可研"], + + # 第6/7章相关表 + "表6-1 装置技术经济指标对比表": [ + "可研", + "环评", + "环境影响", + "环保", + "安评", + "安全评价", + "安全设施", + ], + + # 附表 + "附表1 项目建设工作程序表": ["可研", "前期", "批复"], + "附表2 项目竣工决算投资构成表(万元)": ["竣工决算"], + "附表3 项目投资财务现金流量表(万元)": ["财务", "可研"], + "附表4 利润与利润分配计算表(万元)": ["财务", "可研"], + "附表5 营业收入与营业税金及附加计算表(万元)": ["财务", "可研"], + "附表6 总成本费用计算表(万元)": ["财务", "可研"], + "附表7 原材料、燃料及动力费用计算表(万元)": ["财务", "可研"], + "附表8 可研报告和后评价参数对比表": ["可研"], +} + + +def source_doc_keywords_for(src: str) -> list[str]: + """ + 获取某个 source 分组对应的文档关键词(支持前缀键匹配)。 + """ + key = str(src or "").strip() + if not key: + return [] + exact = SOURCE_DOC_KEYWORDS_MAP.get(key) + if exact is not None: + return [str(x).strip() for x in exact if str(x).strip()] + for k, vals in SOURCE_DOC_KEYWORDS_MAP.items(): + ks = str(k or "").strip() + if ks and key.startswith(ks): + return [str(x).strip() for x in vals if str(x).strip()] + return []