from __future__ import annotations import json import logging import re import threading from difflib import SequenceMatcher import unicodedata import uuid from datetime import datetime from pathlib import Path from types import SimpleNamespace from typing import Any, Optional from fastapi import HTTPException logger = logging.getLogger(__name__) # ── 运行时提示词落盘 ────────────────────────────────────────────────────────── _PROMPT_DUMP_ROOT = Path(__file__).resolve().parent.parent / "comp" / "runtime" _REPORT_OUTPUT_DUMP_ROOT = Path(__file__).resolve().parent.parent / "comp" / "report_outputs" def _safe_markdown_filename(name: str, fallback: str = "section") -> str: safe = re.sub(r'[\\/:*?"<>|]', "_", str(name or "").strip()) safe = re.sub(r"\s+", " ", safe).strip(" ._") return safe[:120] or fallback def _dump_runtime_prompt( job_id: str, section_key: str, section_title: str, system_prompt: str, user_prompt: str, ) -> None: """将本次实际调用大模型的完整提示词(含证据)写入 comp/runtime//.md。""" try: out_dir = _PROMPT_DUMP_ROOT / job_id out_dir.mkdir(parents=True, exist_ok=True) safe_key = re.sub(r'[\\/:*?"<>|]', "_", section_key) out_path = out_dir / f"{safe_key}.md" ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") content = ( f"# {section_title}\n\n" f"> job_id: `{job_id}` \n" f"> section_key: `{section_key}` \n" f"> 生成时间: {ts}\n\n" "---\n\n" "## System Prompt\n\n" f"```\n{system_prompt}\n```\n\n" "---\n\n" "## User Prompt\n\n" f"```\n{user_prompt}\n```\n" ) out_path.write_text(content, encoding="utf-8") except Exception as exc: logger.warning("dump runtime prompt failed: %s", exc) def _dump_report_chapter_json_markdown( *, job_id: str, section_key: str, section_title: str, output_json: dict[str, Any], ) -> Optional[str]: """在章节流式生成结束并准备入库时,将该章节最终 JSON 输出写入 markdown 文件。 Returns: 写入的文件路径,文件已存在(合并写入)时返回 None。 """ try: out_dir = _REPORT_OUTPUT_DUMP_ROOT / job_id out_dir.mkdir(parents=True, exist_ok=True) safe_title = _safe_markdown_filename(section_title, 
fallback=_safe_markdown_filename(section_key)) out_path = out_dir / f"{safe_title}.md" ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") json_text = json.dumps(output_json or {}, ensure_ascii=False, indent=2, default=str) content = ( f"# {section_title}\n\n" f"> job_id: `{job_id}` \n" f"> section_key: `{section_key}` \n" f"> 写入时间: {ts}\n\n" "```json\n" f"{json_text}\n" "```\n" ) out_path.write_text(content, encoding="utf-8") return str(out_path) except Exception as exc: logger.warning("dump report chapter json markdown failed: %s", exc) return None from sqlalchemy.orm import Session from database import SessionLocal from database.models import ( ElementCell, ElementTable, Project, KbDocument, ReportGenerationChapter, ReportGenerationJob, ReportTemplate, ReportTemplateSection, ) from schemas.write import ( GenerateReportJobItem, GenerateReportChapterItem, GenerateReportResult, GenerateReportResultChapter, ) from services.llm_client import chat_completions_json from services.report_prompt_service import ( build_report_chapter_prompt, build_repair_missing_tables_prompt, build_table_format_repair_prompt, chapter_generation_system_prompt, repair_missing_tables_system_prompt, table_format_repair_system_prompt, ) from services.retrieval_service import RetrievalService from config import settings from services.appendix_figure_extraction import ( appendix_figure_markdown_images, extract_appendix_figure_candidates_from_docx, merge_best_appendix_figures, ) from services.kb_service import _kb_doc_absolute_file_path_for_model from services.report_runtime_store import ( append_chapter_content, get_job_state, init_job_state, set_chapter_stream_phase, update_chapter_state, update_job_state, ) from services.standard_elements_2020 import ( CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP, MULTI_COLUMN_GLOBAL_SPECS, APPENDIX2_CANONICAL_ROW_ORDER, APPENDIX2_LEGACY_ROW_KEY_MAP, APPENDIX8_LEGACY_ROW_KEY_MAP, TABLE_5_3_ROW_KEY_ALTERNATES, canonical_row_order_for_table, TABLE_7_1_COLUMN_KEYS, 
TABLE_7_1_ROW_CELL_DEFAULTS, TABLE_7_1_SCORING_TABLE_NAME, global_table_row_keys, section_table_row_keys, time_table_default_columns_for_name, ) from prompts.report_generation.section_output_contracts import ( DEFAULT_SECTION_OUTPUT_CONTRACT, SECTION_OUTPUT_CONTRACTS, ) from prompts.report_generation.heading_rules import ( DEFAULT_HEADING_RULE, SECTION_HEADING_RULES, ) from prompts.report_generation.appendix_templates import ( APPENDIX8_PARAMETER_COMPARISON_TABLE, APPENDIX_FIGURE_TARGETS, MINIMAL_MISSING_TABLE_TEMPLATE, missing_child_heading_markdown, ) RUNNING_CHAPTER_STALE_SECONDS = 180 # 同一表号存在多张历史/别名表时,优先命中该表号的标准表名关键词,避免误选。 _TABLE_TOKEN_PREFERRED_NAME_HINTS: dict[str, tuple[str, ...]] = { "表2-5": ("总图、储运、公用工程及辅助工程对比",), "表2-6": ("储运、公用工程及辅助工程依托对比", "依托"), "表3-3": ("施工图设计变更情况", "全厂性项目"), "表3-4": ("施工图设计变更情况", "单装置项目"), "表3-5": ("影响投资或工期", "重大设计变更"), "表5-4": ("生产经营及效益情况对比表",), "表5-5": ("主要生产经营指标",), "表5-6": ("不同因素变化对项目内部收益率的影响",), "表5-7": ("内部收益率为基准收益率时不确定因素临界点或临界值",), } # 表5-4 列键形如「可研报告|××年#1」:须与附表时间槽区分,且不可走「可研报告」前缀拆行,否则会生成「可研报告-|××年#1」错位表头。 _TABLE54_PIPE_METRIC_PREFIXES = frozenset( {"可研报告", "可研值", "实际值", "增减(%)", "增减", "指标"} ) # 与表5-1 等混同步入的非细则列,直出时剔除 _TABLE54_DROP_COL_KEYS = frozenset({"后评价值", "后评价报告"}) _TABLE54_INVISIBLE_RE = re.compile(r"[\ufeff\u200b-\u200d]") def _table54_ck_norm(ck: str) -> str: """列键 NFKC 与去空白、BOM,便于识别误写入的「unit」全角变体等。""" t = unicodedata.normalize("NFKC", str(ck or "")).strip() return _TABLE54_INVISIBLE_RE.sub("", t) def _is_table54_operating_benefit(table_name: str) -> bool: tn = str(table_name or "").strip() return "表5-4" in tn and "生产经营及效益情况对比表" in tn def _element_table_collect_score(db: Session, table: ElementTable, token: str) -> int: """报告生成选表:表5-4 须优先时间表且列键为「可研报告|××年#1」结构,避免误选抽取简表。""" if not _table_token_matches_name(token, "表5-4"): return 0 score = 0 if str(table.table_type or "").strip() == "time": score += 200 name = str(table.table_name or "") if "生产经营及效益" in name: score += 40 sample = ( db.query(ElementCell.row_key, 
ElementCell.col_key) .filter( ElementCell.table_id == table.id, ElementCell.value.isnot(None), ElementCell.value != "", ) .limit(48) .all() ) for rk, ck in sample: rk_s, ck_s = str(rk or ""), str(ck or "") if "|" in ck_s and any( p in ck_s for p in ("可研报告", "实际值", "增减") ): score += 8 if "·" in rk_s: score += 2 if "年份未识别" in rk_s or "年份未识别" in ck_s: score -= 40 return score def _pick_table54_year_markdown( year_items: list[tuple[str, str]], *, table_year: int | None = None, ) -> tuple[str, str] | None: """多张按年拆分的表5-4 取评价年(优先 element_tables.year / 2019)且表体最完整的一张。""" if not year_items: return None if len(year_items) == 1: return year_items[0] def _item_score(item: tuple[str, str]) -> int: disp, md = item sc = 0 if table_year is not None and str(table_year) in str(disp): sc += 120 if "2019" in str(disp) or re.search(r"2019\s*年", md[:800]): sc += 80 if "可研报告" in md and "实际值" in md: sc += 70 if "增减" in md: sc += 25 if "运行情况·" in md or "主要经济指标·" in md: sc += 35 if "主要经济指标-" in md and "可研报告" not in md: sc -= 60 sc += min(md.count("\n|"), 60) return sc return max(year_items, key=_item_score) def _score_structured_table_hit_dict(hit: dict) -> int: """structuredTables 条目评分:完整表5-4 对比表优先于 LLM 三行简表。""" if not isinstance(hit, dict): return 0 md = str(hit.get("markdown") or "") if not md: return 0 if _is_table54_simplified_extract_body(md): return 0 sc = 0 if "可研报告" in md and "实际值" in md: sc += 90 if "增减" in md: sc += 25 if "运行情况·" in md or "主要经济指标·" in md: sc += 40 if "主要经济指标-" in md and "可研报告" not in md: sc -= 70 sc += min(md.count("\n|"), 80) return sc def _table54_body_preceded_by_element_source(text_before: str, *, max_chars: int = 600) -> bool: """表体紧邻前是否已有要素直出注释(有则视为权威表5-4,勿删勿换)。""" tail = str(text_before or "")[-max_chars:] if "表格来源:要素管理" not in tail: return False after = tail.rsplit("表格来源:要素管理", 1)[-1] chunk = after.split("\n", 8)[-1] return not any( ln.strip().startswith("|") or _is_pipe_markdown_table_row_line(ln) for ln in chunk.splitlines()[:6] if ln.strip() ) def 
_is_table54_simplified_extract_body(block: str) -> bool: """ 识别抽取/LLM 三行简表:仅「2019年实际值」等单列 + 少量「主要经济指标·」行, 无「可研报告|…」与「增减」对比结构。 """ md = str(block or "").strip() if not md or "|" not in md: return False hdr = re.sub(r"\s+", "", _extract_table_header_key(md)).lower() hdr = re.sub(r"
def _reorder_table54_col_order(col_order: list[str]) -> list[str]:
    """Return the Table 5-4 column keys in canonical display order.

    Every key is passed through ``_table54_ck_norm``. Empty keys, the
    English ``unit`` / ``__unit__`` keys and the keys listed in
    ``_TABLE54_DROP_COL_KEYS`` are discarded. ``指标单位`` is renamed to
    ``单位`` and duplicates are removed, keeping the first occurrence.

    The result always starts with ``单位``. The remaining columns are sorted
    by the text after ``|`` (the year slot), then by metric group
    (feasibility, then actual, then change, then anything else), then by the
    full key. Keys without ``|`` use a fixed ``"\\xff"`` slot value.
    """
    normalized = (_table54_ck_norm(raw) for raw in col_order)
    kept = [
        key
        for key in normalized
        if key
        and key.lower() not in ("unit", "__unit__")
        and key not in _TABLE54_DROP_COL_KEYS
    ]
    # dict.fromkeys drops duplicates while preserving first-seen order.
    unique = list(dict.fromkeys("单位" if key == "指标单位" else key for key in kept))
    metric_cols = [key for key in unique if key != "单位"]

    group_rank = {"可研报告": 0, "指标": 0, "可研值": 0, "实际值": 1}

    def _ordering(key: str) -> tuple[str, int, str]:
        group, sep, slot = key.partition("|")
        if not sep:
            return ("\xff", 99, key)
        group = group.strip()
        fallback_rank = 2 if group.startswith("增减") else 9
        return (slot.strip(), group_rank.get(group, fallback_rank), key)

    metric_cols.sort(key=_ordering)
    # After de-duplication there is at most one unit column, so the result
    # is the unit column followed by the ordered metrics in every case.
    return ["单位", *metric_cols]
等规范化,否则「增减(%)」与「增减(%)」无法对齐。""" tmp: dict[tuple[str, str], str] = {} def _prefer_val(cur: str, new: str) -> str: s_new = str(new or "").strip() if s_new and s_new != "待补充": return str(new) s_cur = str(cur or "").strip() if s_cur and s_cur != "待补充": return str(cur) return s_new or s_cur or "" for (rk, ck), v in list(latest.items()): rk_s = str(rk) nk = _table54_ck_norm(str(ck)) key = (rk_s, nk) if key in tmp: tmp[key] = _prefer_val(tmp[key], v) else: tmp[key] = str(v or "") latest.clear() latest.update(tmp) def _table54_coalesce_legacy_bare_metric_cols( latest: dict[tuple[str, str], str], row_order: list[str] ) -> None: """ 要素管理常见:数据写在裸列「可研报告/实际值/增减(%)」, 列定义仍为「可研报告|××年#1」等;合并到槽位列以便与 UI 一致。 """ slot_map = { "可研报告": "可研报告|××年#1", "实际值": "实际值|××年#1", "增减(%)": "增减(%)|××年#1", "增减(%)": "增减(%)|××年#1", } def _prefer(a: str, b: str) -> str: sa, sb = str(a or "").strip(), str(b or "").strip() if sa and sa != "待补充": return sa if sb and sb != "待补充": return sb return sa or sb for rk in row_order: for bare, slot in slot_map.items(): merged = _prefer(latest.get((rk, slot), ""), latest.get((rk, bare), "")) if merged: latest[(rk, slot)] = merged def _table54_merge_year_cells_for_table_year( year_cells: dict[int | None, list], *, table_year: int | None, ) -> tuple[dict[int | None, list], list[int]]: """ 表5-4:同一张 element_tables(year=2019)下多数格子 element_cells.year 为空, 须与 year=2019 的少量格子合并后再渲染,否则只剩单列「实际值」简表。 """ if table_year is None or int(table_year) <= 0: real = sorted(y for y in year_cells if y is not None) return year_cells, real ty = int(table_year) merged: list = list(year_cells.get(None, [])) for cy in sorted(y for y in year_cells if y is not None): if cy == ty: merged.extend(year_cells.get(cy, [])) if not merged: return year_cells, sorted(y for y in year_cells if y is not None) return {ty: merged}, [ty] def _table54_remap_indicator_unit_latest(latest: dict[tuple[str, str], str]) -> None: """将历史列键「指标单位」的值并入「单位」,避免列键规范为「单位」后取不到数。""" touched: list[tuple[str, str]] = [] for (rk, ck), 
v in list(latest.items()): if str(ck) != "指标单位": continue rk_s = str(rk) k_unit = (rk_s, "单位") cur = str(latest.get(k_unit, "") or "").strip() nv = str(v or "").strip() if nv and (not cur or cur == "待补充"): latest[k_unit] = v elif not cur: latest[k_unit] = v touched.append((rk_s, str(ck))) for pair in touched: latest.pop(pair, None) _TABLE54_SLOT_YEAR_RE = re.compile(r"^(\d{4})年(?:#\d+)?$") def _norm_table54_placeholder_year_tail(tail: str) -> str: buf: list[str] = [] for ch in (tail or "").strip(): if ch in "xXxX": buf.append("×") elif ch == "\u00d7": buf.append("×") else: buf.append(ch) return "".join(buf) def _table54_placeholder_year_tail(tail: str) -> bool: """列键尾部为「××年#n」等占位列(与前端 isEmTable54YearSlotColKey 一致)。""" u = _norm_table54_placeholder_year_tail(tail) return bool(re.fullmatch(r"×{2}年(?:#\d+)?", u)) def _parse_real_year_from_table54_slot_tail(tail: str) -> int | None: """列键尾部为「2019」「2019年」「2019年#1」等真实日历时返回四位年。""" t = (tail or "").strip() if _table54_placeholder_year_tail(t): return None m = _TABLE54_SLOT_YEAR_RE.fullmatch(t) if m: y = int(m.group(1)) if 1900 <= y <= 2100: return y m2 = re.match(r"^(\d{4})年", t) if m2: y = int(m2.group(1)) if 1900 <= y <= 2100: return y return None def _infer_time_column_year_for_table54( col_order: list[str], cells: list[Any], table_year: int | None, ) -> int | None: """ 从单元格 year、时间表 element_tables.year、或列键「…|2019年」推断表5-4 年度栏对应的日历年。 无法唯一确定时返回 None(表头占位列退回「某年」)。 """ ys = sorted( { int(c.year) for c in cells if getattr(c, "year", None) is not None and int(c.year) > 0 } ) if len(ys) == 1: return ys[0] if table_year is not None and int(table_year) > 0: return int(table_year) parsed: list[int] = [] for ck in col_order: s = str(ck or "").strip() if "|" not in s: continue _, tail = s.split("|", 1) cy = _parse_real_year_from_table54_slot_tail(tail.strip()) if cy is not None: parsed.append(cy) uniq = sorted(set(parsed)) if len(uniq) == 1: return uniq[0] # 列键正文含四位年(如「2019年可研报告」类裸列名);多列多年份时不武断取第一个 text_years: list[int] = [] for ck 
in col_order: m = re.search(r"(19|20)\d{2}", str(ck or "")) if m: yi = int(m.group(0)) if 1900 <= yi <= 2100: text_years.append(yi) ty_uniq = sorted(set(text_years)) if len(ty_uniq) == 1: return ty_uniq[0] return None def _table54_year_label_prefix(time_column_year: int | None) -> str: if time_column_year is not None and 1900 <= int(time_column_year) <= 2100: return f"{int(time_column_year)}年" return "某年" def _table54_year_prefix_for_slot_tail(tail: str, *, time_column_year: int | None) -> str: """表头「{年}可研报告」中的「{年}」:优先列键自带年份,否则用推断的日历年,最后退回「某年」。""" cy = _parse_real_year_from_table54_slot_tail(tail) if cy is not None: return f"{cy}年" if _table54_placeholder_year_tail(tail): return _table54_year_label_prefix(time_column_year) t = (tail or "").strip() if not t: return "某年" if re.match(r"^\d{4}年", t): return t.split("#", 1)[0] return f"{t}年" if not t.endswith("年") else t def _table54_bare_metric_header_label(col_key: str, *, time_column_year: int | None) -> str | None: """无「指标|年度槽」时的列键:表头带评价年。""" s = str(col_key or "").strip() if not s: return None ypfx = _table54_year_label_prefix(time_column_year) if s == "实际值": return f"{ypfx}实际值" if s in ("可研值", "可研报告"): return f"{ypfx}可研报告" if s.startswith("增减"): rest = s[len("增减") :] return f"{ypfx}增减{rest}" return None def _table54_markdown_header_labels( col_order: list[str], *, time_column_year: int | None = None, ) -> list[str]: """扁平表头:单位、{年}可研报告、{年}实际值、{年}增减;{年}来自列键或要素日历年推断。""" out: list[str] = [] for ck in col_order: s = str(ck).strip() if s == "单位": out.append("单位") continue if "|" not in s: bare_l = _table54_bare_metric_header_label(s, time_column_year=time_column_year) if bare_l is not None: out.append(bare_l) continue out.append(s.replace("|", "|")) continue g, t = s.split("|", 1) g, t = g.strip(), t.strip() g_norm = _table54_ck_norm(g) if (g not in _TABLE54_PIPE_METRIC_PREFIXES and g_norm not in _TABLE54_PIPE_METRIC_PREFIXES) or not t: out.append(s.replace("|", "|")) continue ypfx = 
def create_report_job(
    project_id: str,
    db: Session,
    *,
    template_id: Optional[str] = None,
    top_k: int = 10,
    requested_by: Optional[str] = None,
) -> GenerateReportJobItem:
    """Create a report-generation job with its chapters and start the worker.

    One ``ReportGenerationJob`` row and one ``ReportGenerationChapter`` row
    per generatable template section are inserted with status ``pending``.
    The runtime store is then initialised with the same chapters and a
    worker is started through ``_start_job_worker``.

    Args:
        project_id: Identifier passed to ``_resolve_project``.
        db: Open database session; it is committed by this function.
        template_id: Template to use; resolved by ``_resolve_template``.
        top_k: Retrieval depth, stored in the job options clamped to 5..20
            (a falsy value counts as 10).
        requested_by: Stored on the job row as given.

    Returns:
        The job view produced by ``get_report_job``.

    Raises:
        HTTPException: 404 when the project is not found, 400 when the
            template yields no sections to generate.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    template = _resolve_template(db, template_id)
    template_sections = _list_template_sections(db, template.id)
    sections = _sections_for_generation(template_sections)
    if not sections:
        raise HTTPException(status_code=400, detail="模板未配置章节")

    requested_top_k = int(top_k or 10)
    clamped_top_k = max(5, min(requested_top_k, 20))
    created = datetime.now()

    job = ReportGenerationJob(
        id=uuid.uuid4().hex,
        project_id=project.uuid,
        template_id=template.id,
        status="pending",
        progress=0,
        requested_by=requested_by,
        options={"topK": clamped_top_k},
        created_at=created,
        updated_at=created,
    )
    db.add(job)
    # Flush the parent row first so the chapter inserts below satisfy the
    # foreign-key constraint within the same transaction.
    db.flush()

    for section in sections:
        chapter_row = ReportGenerationChapter(
            id=uuid.uuid4().hex,
            job_id=job.id,
            section_key=section.section_key,
            section_title=section.section_title,
            section_order=section.section_order,
            status="pending",
            created_at=created,
            updated_at=created,
        )
        db.add(chapter_row)
    db.commit()

    runtime_chapters = [
        {
            "sectionKey": section.section_key,
            "sectionTitle": section.section_title,
            "sectionOrder": section.section_order,
            "status": "pending",
        }
        for section in sections
    ]
    init_job_state(
        job_id=job.id,
        project_id=project.uuid,
        template_id=template.id,
        chapters=runtime_chapters,
    )
    _start_job_worker(job.id)
    return get_report_job(project.uuid, job.id, db)
def get_report_result(
    project_id: str,
    job_id: str,
    db: Session,
    *,
    include_debug: bool = False,
) -> GenerateReportResult:
    """Assemble the current report text and per-chapter results for a job.

    When the runtime store holds state for the job, the result is built from
    that state; otherwise it is built from the persisted chapter rows. The
    appendices are then appended, table captions are normalised and the
    consistency check is run on the final text.

    Args:
        project_id: Identifier passed to ``_resolve_project``.
        job_id: Job to read; it must belong to the resolved project.
        db: Open database session.
        include_debug: Forwarded to the result builders; when true the
            chapters carry prompt, evidence and validation payloads.

    Raises:
        HTTPException: 404 when the project or the job is not found.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    job_query = db.query(ReportGenerationJob).filter(
        ReportGenerationJob.id == job_id,
        ReportGenerationJob.project_id == project.uuid,
    )
    job = job_query.first()
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")
    _recover_stalled_job(db, job)

    persisted_chapters = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .order_by(ReportGenerationChapter.section_order.asc())
        .all()
    )

    title_map: dict[str, str] = {}
    if job.template_id:
        template_sections = _list_template_sections(db, job.template_id)
        title_map = _build_section_title_map(template_sections)

    live_state = get_job_state(job.id)
    if live_state:
        built = _build_live_result_from_runtime(
            live_state,
            include_debug=include_debug,
            chapter_title_map=title_map,
        )
    else:
        built = _build_live_result_from_chapters(
            persisted_chapters,
            include_debug=include_debug,
            chapter_title_map=title_map,
        )
    report_text, chapter_items = built

    report_text = _append_report_appendices(db, project.uuid, report_text)
    report_text = _normalize_table_captions_in_markdown(report_text or "")
    consistency = _check_consistency(report_text or "", project.name)

    # The runtime status, when present, takes precedence over the stored one.
    effective_status = (live_state or {}).get("status") or job.status
    return GenerateReportResult(
        jobId=job.id,
        status=effective_status,
        report=report_text,
        consistency=consistency,
        chapters=chapter_items,
    )
def _build_live_result_from_runtime(
    runtime_state: dict,
    *,
    include_debug: bool = False,
    chapter_title_map: Optional[dict[str, str]] = None,
) -> tuple[str, list[GenerateReportResultChapter]]:
    """Build the report text and chapter items from the in-memory job state.

    Chapters are read from ``runtime_state["chapters"]`` and ordered by
    ``sectionOrder``. Non-empty chapter content is cleaned (numeric line
    breaks, consecutive repetitions, table captions) and missing parent
    headings are injected, using the previous chapter's content as context.

    Args:
        runtime_state: Job state as returned by the runtime store.
        include_debug: When true, prompt, evidence and validation payloads
            are copied onto the chapter items; otherwise they are ``None``.
        chapter_title_map: Passed to the heading-injection helper.

    Returns:
        ``(report_text, chapter_items)``. ``report_text`` joins the
        non-empty chapter contents with blank lines.
    """
    title_map = chapter_title_map or {}
    raw_chapters = ((runtime_state or {}).get("chapters") or {}).values()
    # Filter out non-dict entries before sorting: the sort key reads
    # "sectionOrder" via .get, which would raise AttributeError on a stray
    # non-dict value before the per-chapter guard could skip it.
    ordered_chapters = sorted(
        (entry for entry in raw_chapters if isinstance(entry, dict)),
        key=lambda entry: int(entry.get("sectionOrder") or 0),
    )

    chapter_items: list[GenerateReportResultChapter] = []
    report_parts: list[str] = []
    previous_chapter: Optional[dict] = None
    for chapter in ordered_chapters:
        content = _fix_numeric_line_breaks(str(chapter.get("content") or "").strip())
        if content:
            content, _ = _collapse_consecutive_text_repetitions(str(content).strip())
            content = _normalize_table_captions_in_markdown(str(content).strip())
            prev_body = ""
            if previous_chapter is not None:
                prev_body = _fix_numeric_line_breaks(
                    str(previous_chapter.get("content") or "").strip()
                )
            content = _inject_missing_parent_section_headings(
                str(chapter.get("sectionTitle") or ""),
                content,
                prev_body,
                title_map,
            )
        chapter_items.append(
            GenerateReportResultChapter(
                sectionKey=str(chapter.get("sectionKey") or ""),
                sectionTitle=str(chapter.get("sectionTitle") or ""),
                sectionOrder=int(chapter.get("sectionOrder") or 0),
                status=str(chapter.get("status") or "pending"),
                content=content or None,
                errorMessage=chapter.get("errorMessage"),
                promptText=(chapter.get("promptText") if include_debug else None),
                evidencePayload=(chapter.get("evidencePayload") if include_debug else None),
                validationPayload=(chapter.get("validationPayload") if include_debug else None),
            )
        )
        if content:
            report_parts.append(content)
        previous_chapter = chapter
    return _fix_numeric_line_breaks("\n\n".join(report_parts).strip()), chapter_items
def cancel_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem:
    """Cancel a report job and mark its unfinished chapters as failed.

    A job whose status is already ``completed``, ``failed`` or ``cancelled``
    is returned unchanged. Otherwise every ``pending`` or ``running``
    chapter becomes ``failed`` with the cancellation message, the job
    becomes ``cancelled``, and the same changes are mirrored into the
    runtime store.

    Args:
        project_id: Identifier passed to ``_resolve_project``.
        job_id: Job to cancel; it must belong to the resolved project.
        db: Open database session; it is committed by this function.

    Returns:
        The job view produced by ``get_report_job``.

    Raises:
        HTTPException: 404 when the project or the job is not found.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")
    job = (
        db.query(ReportGenerationJob)
        .filter(ReportGenerationJob.id == job_id, ReportGenerationJob.project_id == project.uuid)
        .first()
    )
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")
    if job.status in ("completed", "failed", "cancelled"):
        return get_report_job(project.uuid, job_id, db)

    cancel_message = "任务已由用户取消"
    now = datetime.now()
    chapters = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .all()
    )
    # Remember exactly which chapters this call cancels. Chapters that had
    # already failed keep their own error message in the runtime store
    # instead of being overwritten with the cancellation message.
    cancelled_section_keys: list[str] = []
    for chapter in chapters:
        if chapter.status in ("pending", "running"):
            chapter.status = "failed"
            chapter.error_message = cancel_message
            chapter.updated_at = now
            cancelled_section_keys.append(chapter.section_key)

    job.status = "cancelled"
    job.error_message = cancel_message
    job.current_section_key = None
    job.updated_at = now
    job.completed_at = now
    db.commit()

    update_job_state(
        job.id,
        status="cancelled",
        errorMessage=cancel_message,
        currentSectionKey=None,
        completedAt=_fmt_dt(now),
    )
    for section_key in cancelled_section_keys:
        update_chapter_state(
            job.id,
            section_key,
            status="failed",
            errorMessage=cancel_message,
        )
    return get_report_job(project.uuid, job_id, db)
if (not only_section_key or s.section_key == only_section_key)] retrieval = RetrievalService() top_k = int((job.options or {}).get("topK") or 10) completed_count = 0 pending_sections = [] for section in target_sections: chapter = chapter_map.get(section.section_key) if not chapter: continue if not only_section_key and chapter.status == "completed": completed_count += 1 continue pending_sections.append(section) total_count = max(1, completed_count + len(pending_sections)) logger.info( "报告生成 job start | job=%s | project=%s | total_sections=%d | pending=%d | completed=%d | top_k=%d", job.id, project.uuid, len(sections), len(pending_sections), completed_count, top_k, ) for idx, section in enumerate(pending_sections, start=1): db.refresh(job) if job.status == "cancelled": return chapter = chapter_map.get(section.section_key) if not chapter: continue section_no = _extract_section_number(section.section_title or "") logger.info( "报告生成 start chapter | job=%s | section=%s | title=%s | section_no=%s | idx=%d/%d", job.id, section.section_key, section.section_title, section_no, idx, total_count, ) _update_chapter_status(db, job, chapter, "running", None) update_job_state(job.id, currentSectionKey=section.section_key) update_chapter_state( job.id, section.section_key, status="running", errorMessage=None, content=None, promptText=None, evidencePayload=None, validationPayload={"streamPhase": "waiting"}, ) required_tables = _extract_required_table_tokens( section.section_prompt or "", _extract_section_number(section.section_title or ""), contract_text=_effective_section_output_contract(section), ) if _extract_section_number(section.section_title or "") == "5.3.2": na8 = _norm_table_token("附表8") required_tables = [ t for t in required_tables if _norm_table_token(str(t)) != na8 ] evidence, retrieval_stage = _collect_evidence_progressive( db, retrieval, project.uuid, section, top_k=top_k, required_tables=required_tables, ) prior_sibling_sections_text = 
_build_prior_sibling_sections_text( section, sections, completed_section_contents, ) section_reference = _load_section_reference_for_chapter( db, section.section_key, section.section_title, template_id=template.id, ) logger.info( "section_reference 注入 | section=%s | template_id=%s | 命中=%s", section.section_key, template.id, "是" if section_reference else "否", ) prompt = _build_chapter_prompt( section, evidence, prior_sibling_sections_text=prior_sibling_sections_text, section_reference=section_reference, ) _dump_runtime_prompt( job_id=job.id, section_key=section.section_key, section_title=section.section_title, system_prompt=chapter_generation_system_prompt(), user_prompt=prompt, ) stream_state = { "buffer": "", "phase": "waiting", } def _on_content_delta(event: str, delta_text: str) -> None: if event == "delta": if delta_text: stream_state["phase"] = "streaming" stream_state["buffer"] = str(stream_state.get("buffer") or "") + delta_text append_chapter_content( job.id, section.section_key, delta_text, stream_phase="streaming", ) elif event == "finalizing": stream_state["phase"] = "finalizing" set_chapter_stream_phase(job.id, section.section_key, "finalizing") content, validation, model_output = _generate_chapter_content( section, prompt, on_content_delta=_on_content_delta, ) content = _apply_canonical_field_backfill(section, evidence, content) _cur_section_no = _extract_section_number(section.section_title or "") _skip_table_enforcement = _cur_section_no in {"2.1.1"} if _skip_table_enforcement: remaining_missing_tables = [] content_after_tables = content else: content, remaining_missing_tables = _enforce_required_tables( section, prompt, content, evidence, ) content_after_tables = content content = _strip_tables_from_non_table_section( section.section_title or "", content, section=section ) content = _strip_forbidden_tables( section.section_title or "", content, ) content, format_issues = _enforce_template_format_contract( section, content, evidence, 
chapter_title_map=chapter_title_map, ) _sec_no = _extract_section_number(section.section_title or "") _refresh_tokens: tuple[str, ...] = ("表5-4",) if _sec_no == "5.1": _refresh_tokens = ("表5-1",) elif _sec_no == "5.2.1": _refresh_tokens = ("表5-2", "表5-3") content = _refresh_element_table_markdown_tokens( content, evidence, _refresh_tokens ) content = _strip_bracketed_three_part_labels(content) content = _strip_placeholder_table_notes(content) content = _normalize_table_captions_in_markdown(content) content = _strip_trailing_partial_missing_markers(content) content = _fix_numeric_line_breaks(content) content = _cleanup_section_table_artifacts( section.section_title or "", content, allowed_table_tokens=required_tables, ) if _sec_no == "5.3.1": content = _refresh_element_table_markdown_tokens( content, evidence, ("表5-4",) ) content = _fill_required_table_caption_stubs( content, ["表5-4"], evidence ) content = _strip_orphan_markdown_table_rows(content) content = _strip_minimal_missing_table_tail(content) content, intra_repeat_removed = _collapse_consecutive_text_repetitions(content) content, chapter_dedupe_removed = _dedupe_long_chapter_repetition(content) chapter_dedupe_removed += intra_repeat_removed if chapter_dedupe_removed > 0: warnings = validation.get("warnings") if isinstance(validation, dict) else [] if not isinstance(warnings, list): warnings = [] warnings.append( f"章节去重:已移除 {chapter_dedupe_removed} 处重复段落/表格" ) validation["warnings"] = warnings validation["chapterDedupeRemoved"] = chapter_dedupe_removed if required_tables and not _skip_table_enforcement: content = _restore_required_tables_safety_net( content, required_tables, evidence, content_after_tables, ) content = _finalize_section_table_dedupe(content, required_tables) if remaining_missing_tables: warnings = validation.get("warnings") if isinstance(validation, dict) else [] if not isinstance(warnings, list): warnings = [] warnings.append( "部分必需表格仍缺失,已插入占位表:" + "、".join(remaining_missing_tables) ) 
validation["warnings"] = warnings if format_issues: warnings = validation.get("warnings") if isinstance(validation, dict) else [] if not isinstance(warnings, list): warnings = [] warnings.extend([f"格式验收器:{x}" for x in format_issues][:8]) validation["warnings"] = warnings validation["retrievalStage"] = retrieval_stage validation["streamPhase"] = "completed" diagnostics = _build_field_diagnostics(section, evidence, content) if diagnostics: validation["fieldDiagnostics"] = diagnostics if model_output: validation["modelOutput"] = model_output content = _inject_missing_parent_section_headings( section.section_title or "", content, _previous_completed_section_content( section, sections, completed_section_contents ), chapter_title_map, ) now = datetime.now() chapter.content = content completed_section_contents[section.section_key] = content chapter.prompt_text = prompt[:20000] chapter.evidence_payload = evidence chapter.validation_payload = validation chapter.status = "completed" chapter.error_message = None chapter.updated_at = now chapter.completed_at = now if not only_section_key: job.progress = int((completed_count + idx) * 100 / total_count) job.current_section_key = section.section_key job.updated_at = now db.commit() dump_out_path = _dump_report_chapter_json_markdown( job_id=job.id, section_key=section.section_key, section_title=section.section_title, output_json={ "modelOutput": model_output or {}, "persistedChapter": { "sectionKey": section.section_key, "sectionTitle": section.section_title, "sectionOrder": section.section_order, "status": "completed", "content": content, "promptText": prompt[:20000], "evidencePayload": evidence, "validationPayload": validation, }, }, ) logger.info( "章节生成落盘 | job=%s | section=%s | prompt_len=%s | content_len=%s | output_file=%s", job.id, section.section_key, len(prompt[:20000]), len(content), dump_out_path or "(已存在合并写入)", ) update_chapter_state( job.id, section.section_key, status="completed", content=content, errorMessage=None, 
promptText=prompt[:20000], evidencePayload=evidence, validationPayload=validation, ) if not only_section_key: update_job_state( job.id, progress=int((completed_count + idx) * 100 / total_count), currentSectionKey=section.section_key, ) else: update_job_state(job.id, currentSectionKey=section.section_key) db.refresh(job) if job.status == "cancelled": return db.refresh(job) if job.status == "cancelled": return if only_section_key: # 单章重跑不应直接终结整任务,仅回写章节并刷新任务进度。 all_chapters = ( db.query(ReportGenerationChapter) .filter(ReportGenerationChapter.job_id == job.id) .order_by(ReportGenerationChapter.section_order.asc()) .all() ) done = sum(1 for c in all_chapters if c.status == "completed") total = max(1, len(all_chapters)) job.progress = int(done * 100 / total) job.current_section_key = None job.updated_at = datetime.now() db.commit() update_job_state(job.id, progress=int(done * 100 / total), currentSectionKey=None) else: job.snapshot = None job.status = "completed" job.progress = 100 job.current_section_key = None job.completed_at = datetime.now() job.updated_at = datetime.now() db.commit() logger.info( "报告生成 job completed | job=%s | project=%s | total_chapters=%d", job.id, project.uuid, len(sections), ) update_job_state( job.id, status="completed", progress=100, currentSectionKey=None, completedAt=_fmt_dt(job.completed_at), ) except Exception as e: current_section_key = job.current_section_key logger.error( "报告生成 job failed | job=%s | project=%s | section=%s | err=%s", job.id, project.uuid, current_section_key, e, ) job.status = "failed" job.error_message = str(e) job.updated_at = datetime.now() db.commit() update_job_state(job.id, status="failed", errorMessage=str(e)) if current_section_key: update_chapter_state( job.id, current_section_key, status="failed", errorMessage=str(e), ) # 这些章节号在 L1/L2 已判「证据充足」时仍继续走 L2/L3,避免仅章节定向检索就提前结束而漏掉关键词召回。 _SECTION_NUMBERS_FORCE_L3_KEYWORD_RETRIEVAL: frozenset[str] = frozenset({"2.1.5", "3.3.3", "3.4.1", "3.6", "3.7", "3.8", "3.10", 
def _collect_evidence_progressive(
    db: Session,
    retrieval: RetrievalService,
    project_uuid: str,
    section: ReportTemplateSection,
    *,
    top_k: int,
    required_tables: Optional[list[str]] = None,
) -> tuple[dict, str]:
    """Collect evidence for a section, widening the retrieval scope step by step.

    Three levels are tried in order: elements and structured tables only,
    plus chapter-targeted documents, plus keyword-search documents.  The first
    level judged sufficient wins, unless the section is flagged to always run
    the keyword level.

    Returns:
        ``(evidence, stage)`` where ``stage`` names the level that produced
        the returned evidence.
    """
    keyword_pass_forced = _section_forces_l3_keyword_retrieval(section)

    # (include_chapter_docs, include_keyword_docs, stage label), cheapest first.
    early_levels = (
        (False, False, "elements_only"),
        (True, False, "elements_plus_chapter_docs"),
    )
    for with_chapter_docs, with_keyword_docs, stage in early_levels:
        gathered = _collect_evidence(
            db,
            retrieval,
            project_uuid,
            section,
            top_k=top_k,
            required_tables=required_tables,
            include_chapter_docs=with_chapter_docs,
            include_keyword_docs=with_keyword_docs,
        )
        if _is_evidence_sufficient(section, gathered) and not keyword_pass_forced:
            return gathered, stage

    # Last resort: add the keyword fallback search on top of everything else.
    widest = _collect_evidence(
        db,
        retrieval,
        project_uuid,
        section,
        top_k=top_k,
        required_tables=required_tables,
        include_chapter_docs=True,
        include_keyword_docs=True,
    )
    return widest, "elements_plus_chapter_and_keyword_docs"
def _merge_section_11_forced_elements(
    forced_payloads: list[dict[str, Any]],
    scored_top_payloads: list[dict[str, Any]],
    *,
    max_additional_scored: int = 40,
) -> list[dict[str, Any]]:
    """Merge the forced chapter-1 cells with the relevance-ranked cells.

    Forced payloads come first, followed by at most ``max_additional_scored``
    ranked payloads.  Payloads are de-duplicated on ``(rowKey, colKey)``; the
    first occurrence wins.

    Args:
        forced_payloads: Cells that must always be present.
        scored_top_payloads: Ranked cells, best first.
        max_additional_scored: Upper bound on ranked cells appended after the
            forced ones.  Zero or a negative value appends none.

    Returns:
        A new list; the inputs are not modified.
    """

    def _cell_key(payload: dict[str, Any]) -> tuple[str, str]:
        return (str(payload.get("rowKey") or ""), str(payload.get("colKey") or ""))

    seen: set[tuple[str, str]] = set()
    merged: list[dict[str, Any]] = []
    for payload in forced_payloads:
        key = _cell_key(payload)
        if key in seen:
            continue
        seen.add(key)
        merged.append(payload)

    remaining = max_additional_scored
    for payload in scored_top_payloads:
        # Check the budget before appending so that a cap of 0 really adds nothing.
        if remaining <= 0:
            break
        key = _cell_key(payload)
        if key in seen:
            continue
        seen.add(key)
        merged.append(payload)
        remaining -= 1
    return merged
{ "tableId": cell.table_id, "tableName": table_name, "rowKey": cell.row_key, "colKey": cell.col_key, "year": cell.year, "value": str(cell.value or "")[:500], "sourceDocumentId": cell.source_document_id, "sourceType": cell.source_type, } score = _score_element_cell_relevance( section.section_title, tokens, payload.get("rowKey"), payload.get("colKey"), payload.get("value"), table_name=payload.get("tableName"), section=section, ) # 无 token 命中但字段语义强相关时仍保留(例如 value 内存在“项目名称:xxx”)。 if score > 0: candidate_cells.append((score, payload)) elif not tokens: candidate_cells.append((1, payload)) candidate_cells.sort(key=lambda x: x[0], reverse=True) matched_cells = [x[1] for x in candidate_cells[:40]] # 1.1「项目基本情况」:强制并入「章节要素-第1章项目概况」全部非空格子,避免被全局 Top40 相关性截断挤出导致建设投资等待补充。 if _extract_section_number(section.section_title or "") == "1.1": ch1_rows = section_table_row_keys(CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP) forced_ch1 = _latest_element_payloads_by_row_col(db, project_uuid, ch1_rows, non_empty_value=True) if forced_ch1: matched_cells = _merge_section_11_forced_elements(forced_ch1, matched_cells, max_additional_scored=40) required = [str(t) for t in (required_tables or []) if str(t).strip()] structured_tables = _collect_structured_tables( db, project_uuid, required, section_title=str(section.section_title or ""), section_tokens=tokens, ) chapter_docs = [] if include_chapter_docs: chapter_docs = retrieval.get_chapter_materials(project_uuid, section.section_title, top_k=top_k) keyword_docs: list[dict] = [] if include_keyword_docs and tokens: if section_no == "1.2": queries = [ "国VI 国Ⅵ 汽油 质量升级 芳烃 烯烃 环保 标准", "项目 建设 必要性 决策 依据 立项", "预期 目标 烷基化油 产量 辛烷值 效益 万吨", " ".join(tokens[:8]), ] seen: set[tuple[str, str]] = set() for q in queries: docs = retrieval.search_by_query(q, top_k=6, filter_project=project_uuid) for d in docs: did = str(d.metadata.get("doc_id", "") or "") body = str(d.page_content or "")[:2000] key = (did, body[:240]) if key in seen: continue seen.add(key) 
def _is_evidence_sufficient(section: ReportTemplateSection, evidence: dict) -> bool:
    """Decide whether the collected evidence is enough to write the section.

    Checks, in order: coverage of required tables by structured tables, a
    minimum amount of element/document evidence, a higher bar for titles that
    mention tables, and keyword-group coverage for sections 1.1 and 1.2.
    """

    def _list_size(field: str) -> int:
        payload = evidence.get(field) if isinstance(evidence, dict) else []
        return len(payload) if isinstance(payload, list) else 0

    required_count = _list_size("requiredTables")
    structured_count = _list_size("structuredTables")
    element_count = _list_size("elements")
    chapter_doc_count = _list_size("chapterDocs")
    keyword_doc_count = _list_size("keywordDocs")

    # Required tables must be (at least partly) matched by structured tables.
    if required_count > 0 and structured_count < min(required_count, 2):
        return False

    # With few elements, at least one kind of document evidence is needed.
    no_documents = chapter_doc_count == 0 and keyword_doc_count == 0
    if element_count < 6 and no_documents:
        return False

    # Table-oriented sections need denser evidence.
    title = str(section.section_title or "")
    if "表" in title and structured_count == 0 and element_count < 10:
        return False

    # Key sections are judged on field coverage, not just on volume.
    compact_title = re.sub(r"\s+", "", title)
    if "1.1项目基本情况" in compact_title:
        mandatory_groups = [
            ["建设单位", "建设单位名称"],
            ["建设地点", "厂址"],
            ["建设规模", "装置规模", "能力", "万吨/年"],
            ["投资", "概算", "估算", "决算"],
        ]
        if not all(_evidence_contains_any_fact(evidence, group) for group in mandatory_groups):
            return False
    if "1.2项目决策要点" in compact_title:
        topic_groups = [
            ["国vi", "国ⅵ", "质量升级", "汽油标准", "环保", "环评", "排放", "清洁生产"],
            ["高标号", "辛烷值", "汽油池", "产品结构", "汽油"],
            ["碳四", "液化气", "原料", "物料平衡", "资源利用", "附加值"],
            ["杂质", "预处理", "丁二烯", "选择性加氢", "催化剂", "甲醇", "二甲醚"],
            ["万吨", "产量", "烷基化油", "效益", "利润", "营业收入", "预期", "目标"],
        ]
        hits = sum(1 for group in topic_groups if _evidence_contains_any_fact(evidence, group))
        # At least two topic groups must be covered.
        if hits < 2:
            return False
    return True
def _evidence_contains_any_fact(evidence: dict, keywords: list[str]) -> bool:
    """Return True when any keyword occurs in the evidence (case-insensitive).

    Element cells are searched first (row key, column key and value; cells
    whose value is a placeholder are ignored), then the first 12 entries of
    ``chapterDocs`` and of ``keywordDocs`` (heading plus content).
    """
    if not isinstance(evidence, dict):
        return False
    needles = [str(k).strip().lower() for k in keywords if str(k).strip()]
    if not needles:
        return False

    def _mentions(*haystacks: str) -> bool:
        return any(needle in hay for needle in needles for hay in haystacks)

    cells = evidence.get("elements")
    if isinstance(cells, list):
        for cell in cells:
            if not isinstance(cell, dict):
                continue
            raw_value = str(cell.get("value") or "").strip()
            if _is_missing_like(raw_value):
                continue
            if _mentions(
                str(cell.get("rowKey") or "").lower(),
                str(cell.get("colKey") or "").lower(),
                raw_value.lower(),
            ):
                return True

    for bucket in ("chapterDocs", "keywordDocs"):
        documents = evidence.get(bucket)
        if not isinstance(documents, list):
            continue
        for doc in documents[:12]:
            if not isinstance(doc, dict):
                continue
            blob = " ".join(
                (str(doc.get("heading") or ""), str(doc.get("content") or ""))
            ).lower()
            if _mentions(blob):
                return True
    return False
def _load_section_reference_for_chapter(
    db: Session,
    section_key: str,
    section_title: str,
    *,
    template_id: Optional[str] = None,
    max_chars: int = 8000,
) -> str:
    """Load the stored reference text for a chapter, to be injected into the prompt.

    The lookup is done by ``section_key`` first and falls back to a lookup by
    ``section_title`` when that yields nothing.  Both lookups are delegated to
    ``services.reference_service``.

    Args:
        db: Database session handed to the lookup helpers.
        section_key: Key of the section being generated.
        section_title: Title used for the fallback lookup.
        template_id: Template whose references should be used.  Blank or
            missing values are passed on as ``None`` (presumably meaning "no
            template filter" -- confirm in ``reference_service``).
        max_chars: Size limit passed on to the lookup helpers.
    """
    from services.reference_service import (
        load_section_reference_raw,
        load_section_reference_raw_by_title,
    )

    template_filter = (template_id or "").strip() or None
    by_key = load_section_reference_raw(
        db, section_key, template_id=template_filter, max_chars=max_chars
    )
    if not by_key:
        # Fallback: match on the title, still with the same template filter.
        return load_section_reference_raw_by_title(
            db, section_title, template_id=template_filter, max_chars=max_chars
        )
    return by_key
def _generate_chapter_content(
    section: ReportTemplateSection,
    prompt: str,
    on_content_delta: Optional[callable] = None,
) -> tuple[str, dict, dict]:
    """Call the LLM for one chapter and lightly clean up its answer.

    Args:
        section: Template section being generated.
        prompt: Fully rendered user prompt.
        on_content_delta: Optional streaming callback, forwarded unchanged to
            ``chat_completions_json``.

    Returns:
        ``(content, validation, raw_model_output)``.  ``content`` falls back
        to the section title plus a placeholder when the model returned no
        text; ``validation`` holds the model's ``missingInfo`` and
        ``qualityChecks`` lists (at most 20 entries each) and basic warnings.
    """
    section_no = _extract_section_number(section.section_title or "")
    max_tokens = _chapter_generation_max_tokens(section_no)
    logger.info(
        "LLM 章节生成 start | section=%s | section_no=%s | max_tokens=%s",
        section.section_key,
        section_no,
        max_tokens,
    )
    obj = chat_completions_json(
        system_prompt=chapter_generation_system_prompt(),
        user_prompt=prompt,
        temperature=0.1,
        max_tokens=max_tokens,
        timeout_sec=120,
        on_content_delta=on_content_delta,
        log_context=f"章节生成 section_key={section.section_key} | {section.section_title}",
    )
    if not isinstance(obj, dict):
        # Defensive: anything but a JSON object is treated as an empty answer.
        obj = {}
    content = str(obj.get("content") or "").strip()
    if not content:
        content = f"{section.section_title}\n\n待补充"
    # Section numbers and list markers are deliberately not renumbered, so the
    # template hierarchy (2.1.1, 3.4.2, ...) stays intact; only evidence labels
    # and citation noise are removed.
    content = _strip_inline_evidence_labels(content)
    if section_no == "1.2":
        # Turn "1) 项目背景" / "2) 预期目标" lines into numbered sub-headings.
        # Only horizontal whitespace is allowed around the marker: ``\s`` would
        # also swallow the neighbouring newlines and collapse the blank lines
        # around the heading.  Both ASCII and full-width ")" are accepted.
        content = re.sub(
            r"(?m)^[ \t\u3000]*1[ \t\u3000]*[)\uff09][ \t\u3000]*项目背景[ \t\u3000]*$",
            "1.2.1项目背景",
            content,
        )
        content = re.sub(
            r"(?m)^[ \t\u3000]*2[ \t\u3000]*[)\uff09][ \t\u3000]*预期目标[ \t\u3000]*$",
            "1.2.2预期目标",
            content,
        )
        content = _normalize_section_12_content(content)
    missing = obj.get("missingInfo") if isinstance(obj.get("missingInfo"), list) else []
    checks = obj.get("qualityChecks") if isinstance(obj.get("qualityChecks"), list) else []
    validation = {
        "missingInfo": [str(x) for x in missing][:20],
        "qualityChecks": [str(x) for x in checks][:20],
        "warnings": _basic_warnings(section.section_title, content),
    }
    return content, validation, obj
def _normalize_numbered_heading_spacing(content: str) -> str:
    """Insert a single space between a section number and its title.

    A line is rewritten only when it looks like a numbered heading: two to
    four dot-separated numbers (each 1-30) followed by a title that starts
    with a CJK character, is at most 36 characters long and contains no
    sentence punctuation.  All other lines are returned untouched.
    """
    text = str(content or "")
    if not text:
        return text

    def _line_repl(m: re.Match[str]) -> str:
        indent, num, title = m.group(1), m.group(2), str(m.group(3) or "").strip()
        parts = num.split(".")
        if len(parts) < 2 or len(parts) > 4:
            return m.group(0)
        for part in parts:
            if not part.isdigit() or int(part) < 1 or int(part) > 30:
                return m.group(0)
        # Sentence punctuation (ASCII or full-width) means body text, not a title.
        if (
            not title
            or len(title) > 36
            or re.search(r"[,。;:!?\uff0c\uff1b\uff1a\uff01\uff1f]", title)
        ):
            return m.group(0)
        return f"{indent}{num} {title}"

    # The trailing class is limited to horizontal whitespace on purpose: with
    # ``\s*`` the match swallowed the newline after the heading, so the blank
    # line between a heading and the following paragraph was lost.
    return re.sub(
        r"(?m)^([\s\u3000]*)(\d+(?:\.\d+)+)\s*([\u4e00-\u9fff][^\n]{0,40})[ \t\u3000]*$",
        _line_repl,
        text,
    )
# One contract line of the form "<n>) <field>: <tail>".  Both ASCII and
# full-width ")" / ":" are accepted.  The named groups "field" and "tail" are
# read by _parse_expected_fields_from_contract.
_CONTRACT_FIELD_LINE_RE = re.compile(
    r"^\s*(\d+)[)\uff09]\s*(?P<field>[^:\uff1a\n]+)[:\uff1a]\s*(?P<tail>.*)$",
    re.MULTILINE,
)
# Words that mark a numbered line as an instruction rather than a field name.
_CONTRACT_FIELD_SKIP_RE = re.compile(
    r"必须|不得|禁止|严禁|应|需|写|输出|背景|规则|约束|表\d|后评价|若|当|正文|首行|写作|请",
)


def _parse_expected_fields_from_contract(contract: str | None) -> list[str]:
    """Extract the numbered field names ("1) name: ...") from an output contract.

    A line counts as a field when its name is at most 10 characters, contains
    none of the instruction words and its tail is empty, an ellipsis, the
    placeholder, or at most 6 characters long.

    Returns:
        The field names in order, or an empty list unless there are at least
        three fields numbered consecutively starting at 1.
    """
    text = str(contract or "").strip()
    if not text:
        return []

    fields: list[str] = []
    nums: list[int] = []
    for m in _CONTRACT_FIELD_LINE_RE.finditer(text):
        field = str(m.group("field") or "").strip()
        tail = str(m.group("tail") or "").strip()
        if not field or len(field) > 10 or _CONTRACT_FIELD_SKIP_RE.search(field):
            continue
        # A long free-text tail means the line is prose, not a field template.
        if tail and not re.fullmatch(r"\.{2,}|待补充", tail) and len(tail) > 6:
            continue
        fields.append(field)
        nums.append(int(m.group(1)))

    if len(fields) < 3 or not nums or nums[0] != 1:
        return []
    # Numbering must be gap-free: 1, 2, 3, ...
    if any(later != earlier + 1 for earlier, later in zip(nums, nums[1:])):
        return []
    return fields
def _normalize_land_area_value(value: str) -> str:
    """Normalise a land-area value.

    - For expressions such as "84m×187m=15708m2" only the part after the last
      equals sign (ASCII or full-width) is kept.
    - The unit spellings m2 / m^2 / m² are unified to "㎡", and whitespace in
      front of the unit is removed.

    Returns the stripped input unchanged when normalisation would leave
    nothing.
    """
    text = str(value or "").strip()
    if not text:
        return text

    core = text
    # "\uff1d" is the full-width equals sign.
    for equals_sign in ("=", "\uff1d"):
        if equals_sign in core:
            core = core.split(equals_sign)[-1].strip()

    # The unit normally follows the number directly ("15708m2"), where there
    # is no word boundary in front of "m"; so only require that "m" is not
    # preceded by another letter (which also leaves "mm2" alone).
    unified = re.sub(r"(?i)(?<![a-z])m\s*\^?\s*2\b", "㎡", core)
    unified = unified.replace("m²", "㎡").replace("M²", "㎡")
    unified = re.sub(r"\s*㎡", "㎡", unified)
    return unified or text
def _is_valid_value_for_field(field: str, value: str, row_key: str = "", col_key: str = "") -> bool:
    """Check whether ``value`` is an acceptable value for ``field``.

    Any non-placeholder value is accepted, except for the field "建设投资"
    (construction investment), which must look like a money amount: ratios,
    payback periods and similar financial indicators are rejected so that a
    figure such as 2.89 is not mistaken for an investment.

    Args:
        field: Canonical field name.
        value: Candidate value.
        row_key: Row key of the source cell (used to spot indicator rows).
        col_key: Column key of the source cell.
    """
    f = str(field or "").strip()
    v = str(value or "").strip()
    rk = str(row_key or "").strip().lower()
    ck = str(col_key or "").strip().lower()
    if not v or _is_missing_like(v):
        return False
    if f != "建设投资":
        return True

    # Cells that belong to an indicator row/column are never investment amounts.
    key_text = f"{rk} {ck}"
    indicator_markers = ("收益率", "irr", "回收期", "净现值", "百分点", "利润率")
    if any(marker in key_text for marker in indicator_markers):
        return False
    # Percent / per-mille values are ratios; "\uff05" is the full-width percent sign.
    if re.search(r"[%\uff05‰]", v):
        return False
    if re.search(r"(收益率|回收期|净现值|利润率|irr)", v, flags=re.IGNORECASE):
        return False

    # Any money unit (万 / 万元 / 亿元 / 元 ...) marks the value as an amount.
    if re.search(r"[万元]", v):
        return True
    number_match = re.search(r"\d+(?:\.\d+)?", v)
    if not number_match:
        return False
    # A bare small number is most likely a ratio (e.g. 2.89), not an amount.
    return float(number_match.group(0)) >= 100
from_value = _extract_value_by_alias_from_text(value, aliases_raw) if from_value and _is_valid_value_for_field(field, from_value, row_key=row_key, col_key=col_key): best_value = from_value break normalized = _normalize_canonical_field_value(field, best_value) out[field] = normalized or "待补充" return out def _render_canonical_fields_for_prompt( evidence: dict, *, allowed_fields: list[str] | None = None, ) -> str: canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else None if not isinstance(canonical, dict) or not canonical: return "无字段级已抽取结果。" allowed_set = {str(f).strip() for f in (allowed_fields or []) if str(f).strip()} lines: list[str] = [] for field, value in canonical.items(): f = str(field or "").strip() if allowed_set and f not in allowed_set: continue v = _normalize_canonical_field_value(f, str(value or "").strip()) or "待补充" if not f: continue lines.append(f"- {f}: {v}") return "\n".join(lines) if lines else "无字段级已抽取结果。" def _extract_field_value_from_docs(field: str, docs: list[dict]) -> str: aliases = [str(a).strip() for a in _field_aliases(field) if str(a).strip()] if not aliases or not isinstance(docs, list): return "" texts: list[str] = [] for doc in docs: if not isinstance(doc, dict): continue heading = str(doc.get("heading") or "").strip() content = str(doc.get("content") or "").strip() merged = f"{heading}\n{content}".strip() if merged: texts.append(merged[:8000]) # 先尝试“字段: 值”类型,命中率高且更稳。 for text in texts: val = _extract_value_by_alias_from_text(text, aliases) if val and not _is_missing_like(val): return _normalize_canonical_field_value(field, val) # “建设内容”常写成段落而非冒号键值,补充宽松句式抽取。 if field == "建设内容": for text in texts: for alias in aliases: pattern = rf"{re.escape(alias)}\s*(?:为|包括|包含|主要包括)\s*([^\n。]{{12,420}})" m = re.search(pattern, text, flags=re.IGNORECASE) if m: val = str(m.group(1) or "").strip(" ::;;,,") if val and not _is_missing_like(val): return _normalize_canonical_field_value(field, val) return "" def 
def _apply_canonical_field_backfill(
    section: ReportTemplateSection,
    evidence: dict,
    content: str,
) -> str:
    """Replace "待补充" placeholders in *content* with already-extracted values.

    The canonical field values come from ``evidence["canonicalFields"]`` or,
    when that is absent or empty, are extracted from ``evidence["elements"]``.
    They are then completed from the evidence documents.  For every expected
    field of the section that has a usable value:

    1. any line mentioning the field and ending in ``: 待补充`` is filled in;
    2. ``<alias>: 待补充`` is filled in for every alias of the field;
    3. if the value still does not occur in the text although the field label
       does, an explicit ``field:value`` line is appended.

    Args:
        section: Template section; ``section_title`` is read here.
        evidence: Evidence mapping for the section.
        content: Generated section text.

    Returns:
        The text with placeholders back-filled (unchanged if no canonical
        values are available).
    """
    text = str(content or "")
    canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else {}
    if not isinstance(canonical, dict) or not canonical:
        elements = evidence.get("elements") if isinstance(evidence, dict) else []
        canonical = _extract_canonical_fields(
            section.section_title,
            elements if isinstance(elements, list) else [],
            section=section,
        )
    # NOTE(review): the merge only fills fields that are still missing, so it
    # is harmless to run it for pre-populated canonicalFields as well; confirm
    # against the original layout whether it was meant to be fallback-only.
    canonical = _merge_canonical_fields_from_docs(
        section.section_title, evidence, canonical, section=section
    )
    if not canonical:
        return text

    repaired = text
    for field in _section_expected_fields(section.section_title, section):
        value = str(canonical.get(field) or "").strip()
        if _is_missing_like(value):
            continue

        # Replacement callables are used instead of template strings: the
        # value is document text and may contain backslashes, which a template
        # would interpret as escapes/group references (or reject outright).
        # Broad pass first: tolerate numbering/bold wrappers around the label.
        broad_pattern = rf"(^.*{re.escape(field)}.*?[::]\s*)待补充(?:\s|$)"
        repaired = re.sub(
            broad_pattern,
            lambda m, v=value: f"{m.group(1)}{v}\n",
            repaired,
            flags=re.MULTILINE,
        )

        labels = list(dict.fromkeys(x for x in _field_aliases(field) if str(x).strip()))
        for label in labels:
            pattern = rf"({re.escape(label)}\s*[::]\s*)待补充\b"
            repaired = re.sub(pattern, lambda m, v=value: m.group(1) + v, repaired)

        # If the value still has not landed in the text, append an explicit
        # key/value line so the extracted fact is not lost.
        if value not in repaired and re.search(rf"{re.escape(field)}\s*[::]", repaired):
            repaired += f"\n{field}:{value}"
    return repaired
hits.append(from_value[:120]) if len(hits) >= 5: break unique_hits = list(dict.fromkeys(hits)) content_has_value = any((not _is_missing_like(v)) and v in content_text for v in unique_hits) content_marked_missing = bool( re.search( rf"{re.escape(field)}\s*[::].*?待补充", content_text, flags=re.IGNORECASE | re.DOTALL, ) ) status = "unknown" if unique_hits and content_has_value: status = "used" elif unique_hits and content_marked_missing: status = "extracted_but_missing_in_content" elif unique_hits: status = "extracted_but_not_matched" elif content_marked_missing: status = "not_extracted_and_missing" out.append( { "field": field, "extractedValues": unique_hits, "contentHasExtractedValue": content_has_value, "contentMarkedMissing": content_marked_missing, "status": status, } ) return out def _section_output_contract(section_title: str) -> str: section_no = _extract_section_number(str(section_title or "")) if section_no in SECTION_OUTPUT_CONTRACTS: return SECTION_OUTPUT_CONTRACTS[section_no] return DEFAULT_SECTION_OUTPUT_CONTRACT def _effective_section_output_contract(section: ReportTemplateSection) -> str: raw = getattr(section, "section_output_contract", None) if isinstance(raw, str) and raw.strip(): return raw.strip() return _section_output_contract(section.section_title or "") def _section_requires_tables(section_title: str, *, contract_text: str | None = None) -> bool: """判断章节合同是否包含【表格强制要求】,决定该节是否允许出现表格。""" c = (str(contract_text or "").strip() or _section_output_contract(section_title)) return "表格强制要求" in c def _strip_tables_from_non_table_section( section_title: str, content: str, *, section: ReportTemplateSection | None = None, ) -> str: """对无表格需求的章节,移除模型可能自行生成的 Markdown 表格。""" if section is not None: contract = _effective_section_output_contract(section) else: contract = _section_output_contract(section_title) if "表格强制要求" in contract: return content if not content: return content lines = content.split("\n") out: list[str] = [] in_table = False for line in lines: 
stripped = line.strip() is_table_line = stripped.startswith("|") and stripped.endswith("|") is_separator = bool(re.match(r"^\|[\s\-:|]+\|$", stripped)) if stripped else False if is_table_line or is_separator: if not in_table: in_table = True if out and out[-1].strip().startswith("###") and "表" in out[-1]: out.pop() continue else: if in_table: in_table = False if stripped.startswith("[ \t]*\n)*" r"(?:[ \t]*\|[^\n]*\|[ \t]*\n)+)", flags=re.IGNORECASE, ) m = pat.search(content) return m.group(0).strip() if m else "" def _find_table_insert_position(content: str, token: str, required_tables: list[str]) -> int | None: """在 content 中找到 token 对应表应插入的位置。 规则:插入到下一个必需表的表题行之前;若没有后续表,返回 None(追加到末尾)。 """ token_idx = None for i, t in enumerate(required_tables): if _norm_table_token(t) == _norm_table_token(token): token_idx = i break if token_idx is None: return None for later_token in required_tables[token_idx + 1:]: later_plain = re.sub(r"\s+", "", str(later_token or "")) if not later_plain: continue later_re = re.escape(later_plain).replace(r"\-", r"[--—–]") later_pat = re.compile( r"(?:^|\n)([^\n]*?" 
+ later_re + r"[^\n]*)\n", flags=re.IGNORECASE, ) m = later_pat.search(content) if m: pos = m.start() if pos > 0 and content[pos] == "\n": pos += 1 return pos return None def _enforce_required_tables( section: ReportTemplateSection, prompt: str, content: str, evidence: dict, ) -> tuple[str, list[str]]: required = _extract_required_table_tokens( section.section_prompt or "", _extract_section_number(section.section_title or ""), contract_text=_effective_section_output_contract(section), ) if not required: return content, [] # 模板必需表优先“要素表直出”,避免模型改写结构化表中的真实数据。 repaired = _append_authoritative_required_tables(content, required, evidence) missing = [t for t in required if not _table_token_exists(repaired, t)] if missing: repaired = _append_structured_missing_tables(repaired, missing, evidence) still_missing = [t for t in required if not _table_token_exists(repaired, t)] if still_missing: repaired = _repair_missing_tables(section, prompt, repaired, still_missing, evidence) still_missing = [t for t in required if not _table_token_exists(repaired, t)] if still_missing: repaired = _append_minimal_missing_tables(repaired, still_missing) # 章节间串表清理:4.3.2 仅保留运行周期统计表;4.3.3 仅保留装置运行分析表。 repaired = _remove_cross_section_table_pollution(section.section_title or "", repaired) # 末尾兜底:若必需表已“存在”但表体残缺(仅分隔行/缺数据行), # 仍要强制回填要素管理中的完整结构化表。 repaired = _ensure_required_structured_tables_integrity(repaired, required, evidence) # 即使 missing 为空(如 5.1 已由 LLM 写出表5-1),仍须去重,避免 LLM 表 + 要素直出表并存。 repaired = _finalize_section_table_dedupe(repaired, required) repaired = _fill_required_table_caption_stubs(repaired, required, evidence) repaired = _finalize_section_table_dedupe(repaired, required) final_missing = [t for t in required if not _table_token_exists(repaired, t)] return repaired, final_missing def _extract_required_table_tokens( section_prompt: str, section_no: str = "", *, contract_text: Optional[str] = None, ) -> list[str]: """ 从模板 section_prompt 与章节输出合同(section_output_contracts)中抽取「表 x-x / 附表 x」, 
使合同内写死的「见表2-3」等也能触发 _append_authoritative_required_tables 要素直出。 严格规则: - 仅当合同中包含「【表格强制要求】」标签时,才提取正文表(表x-x)。 - 「见附表N」「附表N~附表M」等仅为引用语,不视为本节必需内嵌的表格(含区间端点及中间附表)。 - 附图与附表在正文之后由 _append_report_appendices 统一汇总(附图在上、附表在下)。 """ parts = [str(section_prompt or "").strip(), str(contract_text or "").strip()] text = "\n".join(p for p in parts if p) if not text: return [] has_table_mandate = "表格强制要求" in text if not has_table_mandate: return [] # 剔除「【禁止】」段落,避免将禁止示例中的表号(如"表2.6-1")误判为必需表。 text_for_extraction = re.sub( r"【禁止】.*?(?=【|$)", "", text, flags=re.DOTALL, ) raw = re.findall( r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*|表\s*\d+(?:\s*[.\--]\s*\d+)*)", text_for_extraction, ) out: list[str] = [] seen = set() chapter_no = "" m_sec = re.match(r"^\s*(\d+)", str(section_no or "")) if m_sec: chapter_no = m_sec.group(1) _REF_ONLY_PATTERN = re.compile( r"(?:见|详见|参见|参照|详)\s*附表\s*\d+", ) ref_only_appendices: set[str] = set() for m in _REF_ONLY_PATTERN.finditer(text): tok_in_ref = re.findall(r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*)", m.group()) for t in tok_in_ref: ref_only_appendices.add(re.sub(r"\s+", "", t)) for tok in raw: norm = re.sub(r"\s+", "", tok) if norm.startswith("附表") and norm in ref_only_appendices: continue if chapter_no: m_tok = re.match(r"^(?:附表|表)\s*(\d+)", norm) if m_tok: tail = norm[m_tok.end() :] # 「表1」「表2」等为节内顺序号,首位数字不等于章号(如第二章下的表1);仅对「表2-3」「表2.6-1」等带子级编号的表号按章首数字过滤。 if tail and tail[0] in ".--—–": if m_tok.group(1) != chapter_no: continue if norm in seen: continue seen.add(norm) out.append(norm) # 5.3.2 合同正文仅允许表5-5、表5-6;模板示例里若夹带「附表8」等,一律不纳入必需表,避免要素直出串表。 if str(section_no or "").strip() == "5.3.2": allow_532 = {_norm_table_token("表5-5"), _norm_table_token("表5-6")} out = [t for t in out if _norm_table_token(t) in allow_532] # 7.1.2 仅内嵌表7-1;合同/模板中若夹带其他章表号,不纳入本节必需表。 if str(section_no or "").strip() == "7.1.2": allow_712 = {_norm_table_token("表7-1")} out = [t for t in out if _norm_table_token(t) in allow_712] return out[:20] def _repair_missing_tables( section: ReportTemplateSection, prompt: 
str, content: str, missing_tables: list[str], evidence: dict, ) -> str: fix_prompt = build_repair_missing_tables_prompt( section_title=section.section_title, original_prompt=prompt, content=content, missing_tables=missing_tables, evidence_json=json.dumps(evidence, ensure_ascii=False), ) obj = chat_completions_json( system_prompt=repair_missing_tables_system_prompt(), user_prompt=fix_prompt, temperature=0.1, max_tokens=2200, timeout_sec=120, log_context=f"补缺失表格 section_key={section.section_key} | {section.section_title}", ) new_content = str(obj.get("content") or "").strip() return new_content or content def _append_minimal_missing_tables(content: str, missing_tables: list[str]) -> str: blocks = [content.rstrip()] for t in missing_tables: blocks.append( MINIMAL_MISSING_TABLE_TEMPLATE.format( table_name=_normalize_table_caption_number_name_gap(str(t or "").strip()) ) ) return "".join(blocks).strip() def _remove_cross_section_table_pollution(section_title: str, content: str) -> str: """ 清理 4.3.2 / 4.3.3 的跨节串表: - 4.3.2 不允许出现“装置运行分析”表 - 4.3.3 不允许出现“投产以来运行周期统计表” """ text = str(content or "") section_no = _extract_section_number(section_title) if section_no not in {"4.3.2", "4.3.3"}: return text if section_no == "4.3.2": forbidden_kw = "装置运行分析" else: forbidden_kw = "投产以来运行周期统计表" # 表题行 + Markdown 表格(允许表题与表格之间有空行/注释行) md_pat = re.compile( rf"(?:^|\n)[^\n]*{re.escape(forbidden_kw)}[^\n]*\n" rf"(?:\s*\n|\s*\n)*" rf"(?:\s*\|[^\n]+\|\s*\n)+", flags=re.IGNORECASE, ) text = md_pat.sub("\n", text) # 表题行 + HTML 表格(允许表题与表格之间有空行/注释行) html_pat = re.compile( rf"(?:^|\n)[^\n]*{re.escape(forbidden_kw)}[^\n]*\n" rf"(?:\s*\n|\s*\n)*" rf"\s*[\s\S]*?
", flags=re.IGNORECASE, ) text = html_pat.sub("\n", text) # 残留单独表题行(无表体)也移除,避免视觉噪音 title_only_pat = re.compile( rf"(?:^|\n)\s*[#>*\-\d\.\)()\s]*[^\n]*{re.escape(forbidden_kw)}[^\n]*(?=\n|$)", flags=re.IGNORECASE, ) text = title_only_pat.sub("\n", text) # 折叠多余空行 text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def _title_compare_norm(s: str) -> str: """标题宽松比较:折叠空白,并去掉中英括号两侧多余空格。""" t = re.sub(r"\s+", " ", str(s or "")).strip() t = re.sub(r"\s*([((])\s*", r"\1", t) t = re.sub(r"\s*([))])\s*", r"\1", t) return t def _heading_line_section_number(line: str) -> str: normalized = str(line or "").strip().lstrip("#").strip() m = re.match(r"^(\d+(?:\.\d+)*)", normalized) return m.group(1) if m else "" def _is_heading_line_for_section(line: str, section_no: str) -> bool: if not section_no: return False return _heading_line_section_number(line) == section_no def _strip_leading_section_heading_lines(lines: list[str], section_no: str) -> list[str]: """去掉正文开头连续的、与 section_no 同编号的标题行(避免 prepend 后重复)。""" trimmed = list(lines) while trimmed: first = trimmed[0] if not str(first).strip(): trimmed.pop(0) continue if _is_heading_line_for_section(first, section_no): trimmed.pop(0) while trimmed and not str(trimmed[0]).strip(): trimmed.pop(0) continue break return trimmed def _replace_first_section_heading_line(content: str, section_no: str, canonical_title: str) -> str: lines_list = content.splitlines() for idx_l, ln in enumerate(lines_list): if not ln.strip(): continue if not _is_heading_line_for_section(ln, section_no): break stripped = ln.strip() section_no_heading = section_no plain_numbered_24x = section_no_heading in { "2.4.1", "2.4.2", "2.4.3", "2.4.4", } if stripped.startswith("#") and not plain_numbered_24x: hm = re.match(r"^(#+\s*)", stripped) prefix = hm.group(1) if hm else "" lines_list[idx_l] = (prefix + canonical_title).rstrip() else: lines_list[idx_l] = canonical_title break return "\n".join(lines_list) def _enforce_template_format_contract( section: 
ReportTemplateSection, content: str, evidence: dict, *, chapter_title_map: Optional[dict[str, str]] = None, ) -> tuple[str, list[str]]: issues: list[str] = [] # 不对编号样式做统一归一化,避免破坏章节层级编号(如 1.2.1 / 2.1.3 / 3.4.2)。 repaired = _strip_inline_evidence_labels(str(content or "").strip()) repaired = _normalize_section_heading_markdown(repaired) if _extract_section_number(str(section.section_title or "")) == "1.2": repaired = _normalize_section_12_content(repaired) # 0) 先拆行再做标题验收,否则步骤 1 会因首行 != 标准标题而重复插入标题。 # 0.1) 上一段正文末尾与下一小节编号粘在同一行(如「……证明了2.4.4 初步……」)。 repaired = _split_inline_template_headings(repaired, chapter_title_map or {}) # 0.2) 小节标题后与正文首字粘在同行(如「2.4.4 初步设计审查工作评价2017年……」)。 repaired = _split_glued_template_heading_body(repaired, chapter_title_map or {}) # 1) 标题验收: # - 普通节:首行为当前节标题(允许附加 # 前缀) # - 每章第一节(x.1):首行为章标题,且必须包含当前节标题 title = str(section.section_title or "").strip() section_no_heading = _extract_section_number(title) title_norm = _title_compare_norm(title) non_empty_lines: list[str] = [] for line in repaired.splitlines(): if line.strip(): non_empty_lines.append(line.strip().lstrip("#").strip()) first_non_empty = non_empty_lines[0] if non_empty_lines else "" first_matches_title = bool( title and first_non_empty and _title_compare_norm(first_non_empty) == title_norm ) first_is_section_heading = bool( title and section_no_heading and first_non_empty and _is_heading_line_for_section(first_non_empty, section_no_heading) ) ancestor_titles = _resolve_ancestor_titles_for_section( section, chapter_title_map or {}, ) if ancestor_titles: required_titles = ancestor_titles + ([title] if title else []) required_norms = [_title_compare_norm(t) for t in required_titles] existing_lines = repaired.splitlines() trimmed_lines = list(existing_lines) existing_title_chain: list[str] = [] while trimmed_lines: first_line = trimmed_lines[0] normalized = _title_compare_norm(first_line.strip().lstrip("#").strip()) if not normalized: trimmed_lines.pop(0) continue if normalized in 
required_norms: existing_title_chain.append(normalized) trimmed_lines.pop(0) while trimmed_lines and not trimmed_lines[0].strip(): trimmed_lines.pop(0) continue if ( title and section_no_heading and len(existing_title_chain) == len(required_norms) - 1 and _is_heading_line_for_section(first_line, section_no_heading) ): existing_title_chain.append(required_norms[-1]) trimmed_lines.pop(0) while trimmed_lines and not trimmed_lines[0].strip(): trimmed_lines.pop(0) continue break if existing_title_chain != required_norms: body_lines = _strip_leading_section_heading_lines(trimmed_lines, section_no_heading) body = "\n".join(body_lines).strip() repaired = "\n\n".join(required_titles + ([body] if body else [])).strip() issues.append("章节缺少父级标题链,已自动补齐") elif title and first_is_section_heading and first_non_empty != title: repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) elif title and not first_matches_title and first_is_section_heading: repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) if first_non_empty != title: issues.append("章节标题与模板不一致,已规范为标准标题行") elif title and not first_matches_title: repaired = f"{title}\n\n{repaired}".strip() issues.append("章节标题与模板不一致,已自动补齐标准标题行") elif title and first_matches_title and first_non_empty != title: repaired = _replace_first_section_heading_line(repaired, section_no_heading, title) # 1.42) 2.4.1~2.4.4:首行可能是「### 2.4.x …」且去 # 后与模板标题一致,此时不会进入上一分支,须去掉 Markdown 前缀。 if ( _extract_section_number(title) in {"2.4.1", "2.4.2", "2.4.3", "2.4.4"} and title ): lns_strip = repaired.splitlines() for _is, ln_s in enumerate(lns_strip): if not ln_s.strip(): continue sh = ln_s.strip() if sh.startswith("#") and _title_compare_norm(sh.lstrip("#").strip()) == title_norm: lns_strip[_is] = title break repaired = "\n".join(lns_strip) # 1.5) 确保标题行(如 "2.4.4 初步设计审查工作评价")后面有空行, # 否则前端 Markdown 渲染或 DOCX 导出时可能无法识别为标题。 repaired = _ensure_heading_lines_separated(repaired) # 2) 
标题树验收:若该节定义了固定子节顺序,缺失则自动补齐占位小节。 section_no = _extract_section_number(title) repaired, missing_children = _auto_append_missing_child_headings(repaired, section_no) if missing_children: issues.append("缺失下级小节已自动补齐:" + "、".join(missing_children[:10])) # 3) 仅当章节合同显式要求表格时,才做模板表格规格验收与修复。 # 否则像 5.2.2/5.2.3/5.2.4 这类纯文字章节会被示例表误触发补表,产生脏表格。 if not _section_requires_tables( title, contract_text=_effective_section_output_contract(section) ): return _strip_inline_evidence_labels(repaired), issues # 4) 解析模板示例中的表规格(表名 + 表头关键字) table_specs = _extract_template_table_specs(section.examples) if not table_specs: return repaired, issues # 4.1) 第 5 章共用示例里同时出现「表5-1/表5-2」宁夏样例与各小节真实合同(如 5.3.1 仅表5-4)。 # 若不按合同过滤,_find_table_format_issues 会误报缺表5-1,_repair_table_format_by_template 会把表5-4「修」成样例表头。 contract_required = _extract_required_table_tokens( section.section_prompt or "", section_no, contract_text=_effective_section_output_contract(section), ) if contract_required: allow = {_norm_table_token(t) for t in contract_required if _norm_table_token(t)} narrowed = [ s for s in table_specs if _norm_table_token(str(s.get("token") or "")) in allow ] if narrowed: table_specs = narrowed table_issues = _find_table_format_issues(repaired, table_specs) if table_issues: issues.extend(table_issues) repaired = _repair_table_format_by_template(section, repaired, table_specs, evidence) # 二次验收,仍不通过则提示但不循环重试 still = _find_table_format_issues(repaired, table_specs) if still: issues.extend([f"二次修正后仍存在:{x}" for x in still[:4]]) return _strip_inline_evidence_labels(repaired), issues def _extract_template_table_specs(raw_examples: Optional[str]) -> list[dict]: text = str(raw_examples or "").strip() if not text: return [] lines = [ln.rstrip() for ln in text.splitlines()] specs: list[dict] = [] i = 0 while i < len(lines): line = lines[i].strip() m = re.match(r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*(.*)$", line) if not m: i += 1 continue token = re.sub(r"\s+", "", m.group(1)) title_tail = str(m.group(2) or "").strip() 
title = f"{m.group(1)} {title_tail}".strip() j = i + 1 header_keywords: list[str] = [] # 采集该表后面的头部字段线索 while j < len(lines): cur = lines[j].strip() if not cur: j += 1 if header_keywords: break continue if re.match(r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*", cur): break if re.match(r"^\d+(?:\.\d+)*\s+", cur): # 下一个章节 break if cur.startswith("注"): break if "|" in cur: # Markdown 表头 cells = [c.strip() for c in cur.split("|") if c.strip()] for c in cells[:8]: if c and c not in ("---", "—"): header_keywords.append(c) break # 普通文本表头行 if len(cur) <= 24 and not re.fullmatch(r"[0-9.%()()\-~~:/\s]+", cur): header_keywords.append(cur) if len(header_keywords) >= 8: break j += 1 specs.append( { "token": token, "title": title, "headerKeywords": list(dict.fromkeys(header_keywords))[:8], } ) i = j # 去重同 token dedup: dict[str, dict] = {} for s in specs: tk = str(s.get("token") or "") if not tk or tk in dedup: continue dedup[tk] = s return list(dedup.values())[:12] def _find_table_format_issues(content: str, table_specs: list[dict]) -> list[str]: issues: list[str] = [] c = str(content or "") c_norm = _norm_table_token(c) for spec in table_specs: token = str(spec.get("token") or "") title = str(spec.get("title") or token) token_norm = _norm_table_token(token) if token_norm and token_norm not in c_norm: issues.append(f"缺少模板表名:{title}") continue # 若模板存在表头关键词,则要求至少命中2个(或全部,如果少于2) headers = [str(h).strip() for h in (spec.get("headerKeywords") or []) if str(h).strip()] if not headers: continue hit = sum(1 for h in headers if h in c) need = min(2, len(headers)) if hit < need: issues.append(f"表头与模板不一致:{title}") return issues def _repair_table_format_by_template( section: ReportTemplateSection, content: str, table_specs: list[dict], evidence: dict, ) -> str: specs_text = json.dumps(table_specs, ensure_ascii=False) fix_prompt = build_table_format_repair_prompt( section_title=section.section_title, table_specs_json=specs_text, content=content, evidence_json=json.dumps(evidence, 
ensure_ascii=False), ) obj = chat_completions_json( system_prompt=table_format_repair_system_prompt(), user_prompt=fix_prompt, temperature=0.1, max_tokens=2600, timeout_sec=120, log_context=f"表格格式修复 section_key={section.section_key} | {section.section_title}", ) fixed = str(obj.get("content") or "").strip() return fixed or content def _render_structured_tables_for_prompt(evidence: dict) -> str: rows = evidence.get("structuredTables") if isinstance(evidence, dict) else [] if not isinstance(rows, list) or not rows: return "无结构化表格证据" blocks: list[str] = [] for row in rows[:8]: if not isinstance(row, dict): continue name = str(row.get("tableName") or "").strip() md = str(row.get("markdown") or "").strip() if not name or not md: continue blocks.append(f"### {_normalize_table_caption_number_name_gap(name)}\n\n{md}") return "\n\n".join(blocks) if blocks else "无结构化表格证据" def _strip_bracketed_three_part_labels(content: str) -> str: text = str(content or "") if not text: return text # 全章统一移除方括号三段式标题,保留其后正文内容。 patterns = [ r"^\s{0,3}#{0,6}\s*【事实依据】\s*$", r"^\s{0,3}#{0,6}\s*【评价判断】\s*$", r"^\s{0,3}#{0,6}\s*【问题与建议】\s*$", r"^\s{0,3}#{0,6}\s*事实依据\s*[::、]?\s*$", r"^\s{0,3}#{0,6}\s*评价判断\s*[::、]?\s*$", r"^\s{0,3}#{0,6}\s*问题与建议\s*[::、]?\s*$", r"【事实依据】", r"【评价判断】", r"【问题与建议】", ] for p in patterns: text = re.sub(p, "", text, flags=re.MULTILINE) text = re.sub(r"\n{3,}", "\n\n", text).strip() return text def _strip_placeholder_table_notes(content: str) -> str: text = str(content or "") if not text: return text placeholder_note_pattern = re.compile( r"^\s{0,3}(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*\d+\s*[\.。::、]?\s*待补充\s*(?:\*\*|__)?\s*$", flags=re.IGNORECASE, ) boilerplate_note_line_pattern = re.compile( r"可酌情增减指标|可酌情增减|根据项目的情况|根据项目实际需要进行增减|根据项目不同进行增减|根据项目具体情况增减|表中内容可根据", flags=re.IGNORECASE, ) boilerplate_full_line_pattern = re.compile( r"^\s{0,3}(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*[\.。::]?\s*(?:\d+\s*[\.。、::]?\s*)?(?:表中内容)?可根据项目.{0,20}(?:增减|调整)", flags=re.IGNORECASE, ) 
note_header_pattern = re.compile( r"^\s*(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*[::]\s*(?:\*\*|__)?\s*$", flags=re.IGNORECASE, ) src_lines = text.splitlines() lines: list[str] = [] i = 0 while i < len(src_lines): line = src_lines[i] if placeholder_note_pattern.match(line): i += 1 continue if boilerplate_full_line_pattern.match(line): i += 1 continue if boilerplate_note_line_pattern.search(line): i += 1 continue if note_header_pattern.match(line): j = i + 1 skipped_boilerplate = False while j < len(src_lines): nxt = src_lines[j] if not str(nxt).strip(" \t\u3000"): j += 1 continue if boilerplate_note_line_pattern.search(nxt): skipped_boilerplate = True j += 1 break if skipped_boilerplate: i = j continue lines.append(line) i += 1 return re.sub(r"\n{3,}", "\n\n", "\n".join(lines)).strip() def _strip_trailing_partial_missing_markers(content: str) -> str: text = str(content or "") if not text: return text cleaned_lines: list[str] = [] for raw_line in text.splitlines(): line = raw_line.rstrip() compact = re.sub(r"\s+", "", line) if compact in {"待补充", "-待补充", "*待补充"}: cleaned_lines.append(line) continue updated = re.sub(r"\s*待补充\s*(?:\[\s*\d{1,3}\s*\])?\s*$", "", line) updated = re.sub(r"\s{2,}", " ", updated).rstrip() stripped = updated.strip() core_len = len(re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", stripped)) if stripped and stripped != line.strip() and core_len >= 12: cleaned_lines.append(updated) else: cleaned_lines.append(line) return re.sub(r"\n{3,}", "\n\n", "\n".join(cleaned_lines)).strip() def _is_pipe_markdown_table_row_line(line: str) -> bool: s = line.strip() return len(s) >= 2 and s.startswith("|") and s.endswith("|") def _is_pipe_markdown_table_separator_line(line: str) -> bool: s = line.strip() return bool(re.match(r"^\|[\s\-:|]+\|$", s)) if s else False def _markdown_table_body_fingerprint(md: str) -> str: """用于判断两张 Markdown 表是否实质相同(忽略行间空行与首尾空白)。""" lines = [ re.sub(r"\s+", "", ln.strip()) for ln in str(md or "").splitlines() if ln.strip().startswith("|") 
or ln.strip().startswith("|") ] return "\n".join(lines) _INTRA_REPEAT_MIN_FRAGMENT_LEN = 18 _INTRA_REPEAT_MIN_CONSECUTIVE_COUNT = 3 def _collapse_consecutive_repetitions_in_string(text: str) -> tuple[str, int]: """ 折叠同一行/字符串内连续重复片段(如模型将同一句财务描述拼接数十次)。 仅处理紧邻重复,避免误伤正常文中偶然出现的相同短语。 """ s = str(text or "") min_len = _INTRA_REPEAT_MIN_FRAGMENT_LEN min_count = _INTRA_REPEAT_MIN_CONSECUTIVE_COUNT if len(s) < min_len * min_count: return s, 0 removed = 0 out: list[str] = [] i = 0 n = len(s) while i < n: best_plen = 0 best_count = 0 max_plen = (n - i) // min_count for plen in range(min_len, max_plen + 1): pat = s[i : i + plen] if not pat.strip(): continue count = 1 j = i + plen while j + plen <= n and s[j : j + plen] == pat: count += 1 j += plen if count >= min_count: span = plen * count if span > best_plen * best_count: best_plen = plen best_count = count if best_plen: out.append(s[i : i + best_plen]) removed += best_count - 1 i += best_plen * best_count else: out.append(s[i]) i += 1 return "".join(out), removed def _collapse_consecutive_text_repetitions(content: str) -> tuple[str, int]: """按行折叠段内连续重复;返回 (正文, 移除的重复次数)。""" lines = str(content or "").splitlines() if not lines: return str(content or ""), 0 total_removed = 0 collapsed_lines: list[str] = [] for line in lines: collapsed, removed = _collapse_consecutive_repetitions_in_string(line) total_removed += removed collapsed_lines.append(collapsed) return "\n".join(collapsed_lines), total_removed _CHAPTER_CONTENT_DEDUPE_MIN_CHARS = 2000 _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN = 48 _CHAPTER_DEDUPE_NEAR_MATCH_RATIO = 0.90 def _chapter_block_core_len(text: str) -> int: return len(re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", _strip_inline_evidence_labels(str(text or "")))) def _chapter_text_block_fingerprint(text: str) -> str: t = _strip_inline_evidence_labels(str(text or "")) t = re.sub(r"\s+", "", t) return t.casefold() def _is_likely_table_caption_line(line: str) -> bool: prev = str(line or "").strip() if not prev or len(prev) > 120: 
def _split_chapter_blocks_for_dedupe(content: str) -> list[str]:
    """Split chapter text into paragraph blocks and Markdown table blocks.

    Blank lines separate blocks.  A run of consecutive pipe-table rows forms
    one table block.  A single-line paragraph that looks like a table caption
    is held back and, if a table comes next, becomes the first line of that
    table block; otherwise it is emitted as a block of its own.

    Every input line ends up in exactly one block.  The table branch does not
    look backwards for caption lines: whatever precedes a table has already
    been consumed as a pending caption or as a paragraph, so scanning back
    would copy those lines into the table block a second time and duplicate
    them when the blocks are re-joined.

    Args:
        content: Chapter text; ``None`` is treated as an empty string.

    Returns:
        The blocks in document order, each joined with ``"\\n"``.
    """
    lines = str(content or "").splitlines()
    blocks: list[str] = []
    pending_caption: list[str] = []
    i = 0
    n = len(lines)

    def _flush_pending_caption() -> None:
        nonlocal pending_caption
        if pending_caption:
            blocks.append("\n".join(pending_caption))
            pending_caption = []

    while i < n:
        if not lines[i].strip():
            i += 1
            continue

        if _is_pipe_markdown_table_row_line(lines[i]):
            table_lines: list[str] = []
            while i < n and lines[i].strip() and _is_pipe_markdown_table_row_line(lines[i]):
                table_lines.append(lines[i])
                i += 1
            # The held-back caption (if any) belongs to this table.
            blocks.append("\n".join(pending_caption + table_lines))
            pending_caption = []
            continue

        para_lines: list[str] = []
        while i < n:
            if not lines[i].strip():
                i += 1
                break
            if _is_pipe_markdown_table_row_line(lines[i]):
                break
            para_lines.append(lines[i])
            i += 1
        if not para_lines:
            continue

        # A previously held caption was not followed by a table.
        _flush_pending_caption()
        if len(para_lines) == 1 and _is_likely_table_caption_line(para_lines[0]):
            pending_caption = para_lines
            continue
        blocks.append("\n".join(para_lines))

    _flush_pending_caption()
    return blocks
def _chapter_block_duplicate_key(block: str) -> tuple[str, str]:
    """Return ``(kind, key)`` used to detect repeated chapter blocks.

    Text blocks are keyed by their text fingerprint. Table blocks are keyed
    by the normalized header row, falling back to the table-body fingerprint
    when no header key is available.
    """
    raw = str(block or "")
    if not _chapter_block_is_table(raw):
        return ("text", _chapter_text_block_fingerprint(raw))

    header_key = _extract_table_header_key(raw)
    body_fingerprint = _markdown_table_body_fingerprint(raw)
    table_key = header_key if header_key else body_fingerprint
    return ("table", table_key)
def _433_op_analysis_table_has_canonical_caption(text: str, table_block_start: int) -> bool:
    """Tell whether a canonical "表4-2 烷基化装置运行分析" caption precedes a table.

    Looks at the last 18 lines of ``text`` before ``table_block_start``
    (trailing whitespace ignored). The caption counts as present when that
    window contains the table name and, after removing all whitespace, the
    label ``表4-2`` written with any of the accepted dashes.

    The dash class lists its members as explicit code points. Written as two
    adjacent hyphens followed by a dash, the class forms a range from U+002D
    to U+2014, which accepts labels such as ``表432`` or ``表4x2``.

    Args:
        text: Full section text.
        table_block_start: Offset in ``text`` where the table block begins.

    Returns:
        ``True`` when the canonical caption is found in the window.
    """
    before = str(text or "")[: int(table_block_start)].rstrip()
    tail = "\n".join(before.split("\n")[-18:])
    if "烷基化装置运行分析" not in tail:
        return False
    compact = re.sub(r"\s+", "", tail)
    # ASCII hyphen, fullwidth hyphen, em dash, en dash.
    return bool(re.search(r"表4[\-\uFF0D\u2014\u2013]2", compact))
start = last_nl + 1 if last_nl >= 0 else 0 before = text[:start] if before.rstrip().endswith("-->"): comment_start = before.rstrip().rfind(""): comment_start = before.rstrip().rfind(""): comment_start = before.rstrip().rfind("\s*\n)*" r"(?:\s*\|[^\n]+\|\s*\n)+", flags=re.IGNORECASE, ) text = md1.sub("\n", text) # 「###」独占行后再起表题(与 DOCX 导出兼容) md2 = re.compile( rf"(?:^|\n)(?:\s*#{{1,6}}\s*\n)+(?:\s*\n)*" rf"(?:[^\n]*{kw8}[^\n]*\n(?:\s*[^\n]*{kwname}[^\n]*\n)?)" r"(?:\s*\n|\s*\n)*" r"(?:\s*\|[^\n]+\|\s*\n)+", flags=re.IGNORECASE, ) text = md2.sub("\n", text) html_pat = re.compile( rf"(?:^|\n)[^\n]*{kw8}[^\n]*{kwname}[^\n]*\n" r"(?:\s*\n|\s*\n)*" r"\s*[\s\S]*?
def _pipe_markdown_row_cells(line: str) -> list[str]:
    """Split one Markdown pipe-table row into its trimmed cell texts.

    The row must start and end with ``|`` once surrounding whitespace is
    removed; anything else yields an empty list.
    """
    row = str(line or "").strip()
    if not (row.startswith("|") and row.endswith("|")):
        return []
    # Drop the outer pipes, then split on the inner ones.
    return [cell.strip() for cell in row[1:-1].split("|")]
new_cells = [c for j, c in enumerate(cells) if j not in drop_idx] if len(new_cells) == len(cells): out_lines.append(ln) else: out_lines.append("| " + " | ".join(new_cells) + " |") return "\n".join(out_lines) rx = re.compile( r"((?:^|\n)[^\n]*表\s*5\s*[--\..·]\s*5[^\n]*主要生产经营指标[^\n]*\n)" r"(?:\s*\n|\s*\n)*" r"((?:^[ \t]*\|[^\n]+\|\s*\n)+)", flags=re.MULTILINE | re.IGNORECASE, ) def _repl(m: re.Match) -> str: prefix, body = m.group(1), m.group(2) tbl_lines = [ ln for ln in body.splitlines() if ln.strip().startswith("|") and ln.strip().endswith("|") ] if len(tbl_lines) < 2: return m.group(0) hdr = _pipe_markdown_row_cells(tbl_lines[0]) if not hdr: return m.group(0) drop = _bad_header_indices(hdr) if not drop: return m.group(0) return prefix + _drop_cols_from_pipe_block(body, drop) return rx.sub(_repl, text) def _cleanup_section_table_artifacts( section_title: str, content: str, *, allowed_table_tokens: Optional[list[str]] = None, ) -> str: section_no = _extract_section_number(str(section_title or "")) text = _strip_unallowed_table_references( str(content or ""), allowed_table_tokens=allowed_table_tokens, ) # 末尾兜底:防止中间步骤再次引入 4.3.2/4.3.3 串表。 if section_no in {"4.3.2", "4.3.3"}: text = _remove_cross_section_table_pollution(section_title, text) if section_no == "4.3.3": text = _dedupe_433_alkylation_operation_analysis_markdown_tables(text) if section_no == "5.3.2": text = _strip_532_embedded_appendix8_table(text) text = _strip_532_table55_bad_markdown_columns(text) if section_no == "3.3.1": return _strip_331_table_crossrefs(text) if section_no in {"2.1.5", "3.3.2", "3.3.4", "5.1", "5.3.1", "5.3.2"}: text = _finalize_section_table_dedupe(text, allowed_table_tokens) if section_no == "3.3.3": return _strip_333_trailing_table_caption_lines(text) if section_no == "3.4.1": return _strip_341_table_artifacts(text) if section_no == "2.1.1": text = _strip_211_stray_table_261(text) text = _dedupe_211_duplicate_markdown_tables(text) if section_no == "5.2.1": text = 
def _canonicalize_chapter5_shell_heading_line(text: str) -> str:
    """Rewrite the chapter-5 shell heading to the numbered form.

    The first non-blank line that, once surrounding whitespace and leading
    ``#`` marks are removed, starts with "第5章" and contains
    "投资与经济效益评价" is replaced by "5 投资与经济效益评价". All other
    lines, including later matches, are returned untouched.
    """
    rows = str(text or "").split("\n")
    for idx, row in enumerate(rows):
        if not row.strip():
            continue
        bare = row.strip().lstrip("#").strip()
        is_shell_heading = bare == "第5章 投资与经济效益评价" or (
            bare.startswith("第5章") and "投资与经济效益评价" in bare
        )
        if is_shell_heading:
            rows[idx] = "5 投资与经济效益评价"
            break  # only the first occurrence is canonicalized
    return "\n".join(rows)
def _previous_completed_section_content(
    section: ReportTemplateSection,
    sections: list[ReportTemplateSection],
    completed_contents: dict[str, str],
) -> str:
    """Return the nearest earlier section body that is already generated.

    Walks backwards from ``section`` in template order and returns the first
    non-blank (stripped) entry of ``completed_contents``. Returns ``""`` when
    ``section`` is not part of ``sections`` or nothing earlier has content.
    """
    ordered = list(sections or [])

    position = -1
    for pos, candidate in enumerate(ordered):
        if candidate.section_key == section.section_key:
            position = pos
            break
    if position < 0:
        return ""

    for earlier in reversed(ordered[:position]):
        text = str(completed_contents.get(earlier.section_key) or "").strip()
        if text:
            return text
    return ""
def _merge_orphan_energy_metric_lines(text: str) -> str:
    """Re-attach energy/material metric fragments that were split onto their own line.

    Example: "…可研报告 [50]\\n132.41 MJ/t产品及初步设计" becomes one line, so
    the number is not later mistaken for a heading.

    A line is merged into the closest preceding non-blank line when it starts
    with a number followed by one of the known units, the number is not
    judged to be a section number, and that preceding line looks unfinished
    according to ``_prev_line_invites_metric_continuation``.
    """
    from services.docx_export_service import _is_likely_section_number

    metric_re = re.compile(
        r"^\s*(?:#{1,6}\s+)?(\d+(?:\.\d+)?)\s+(MJ/t|kWh/t|kgce/t|t产品)",
        re.IGNORECASE,
    )

    merged: list[str] = []
    for raw_line in str(text or "").split("\n"):
        # Compare without surrounding whitespace and any leading "#" marks.
        candidate = re.sub(r"^#{1,6}\s+", "", str(raw_line or "").strip())
        hit = metric_re.match(candidate)
        if hit is None or _is_likely_section_number(hit.group(1)):
            merged.append(raw_line)
            continue

        # Locate the closest earlier line that carries text.
        anchor = len(merged) - 1
        while anchor >= 0 and not str(merged[anchor] or "").strip():
            anchor -= 1

        if anchor < 0 or not _prev_line_invites_metric_continuation(str(merged[anchor] or "")):
            merged.append(raw_line)
            continue

        merged[anchor] = merged[anchor].rstrip() + candidate
    return "\n".join(merged)
"\n".join(_lines) text = text.replace(table_nl_token + "\n" + table_nl_token, table_nl_token) text = text.replace("\n" + table_nl_token, table_nl_token) text = text.replace(table_nl_token + "\n", table_nl_token) # 先保护“章节标题换行”(如:1 项目概况\n1.1 项目基本情况),避免被数字合并规则误伤。 heading_nl_token = "\u0000HEADING_NL\u0000" text = re.sub( r"\n(?=\s*\d+(?:\.\d+)*\s+[\u4e00-\u9fff]{2,}(?:\s|$))", heading_nl_token, text, ) # 数字/中文与下一行之间的合并:换行两侧仅允许水平空白(不含 \\n), # 否则 \\s* 会吞掉段落空行的第一个 \\n,使 \\n(?!\\n) 失效,误把「标题\\n\\n2017年…」粘回一行。 _hsp = r"[ \t\u3000]*" # 例:2018 年 11 月\n4 日、24.48\n%、1906\nm2、0.05\ng text = re.sub( rf"(?<=\d){_hsp}\n(?!\n){_hsp}(?=(?:\d|[年月日时分秒度%%℃吨米台套项]|[A-Za-z]))", "", text, ) # 例:烈度\n7 度、规模\n15 万吨/年(中文描述后接数字) # 仅在“下一行是数字 + 常见单位/量纲”时合并,避免误伤编号列表(如 1)/1.) text = re.sub( rf"(?<=[\u4e00-\u9fff]){_hsp}\n(?!\n){_hsp}(?=\d+(?:\.\d+)?\s*(?![))、.])(?:[年月日时分秒度%%℃吨米台套项个级亩万亿千百十gGlLmMkKvVaAwWhHzHPp]|[A-Za-z]{{1,4}}\b))", "", text, ) # 例:106万\n工时、15万吨/年\n烷基化项目(数量级后接中文语义单位) text = re.sub( rf"(?<=[\d万亿千百十]){_hsp}\n(?!\n){_hsp}(?=(?:工时|吨/年|万吨/年|亿元|万元|万人|m2|m3|m²|m³|项|台|套|个|座|处|条|次|年|月|日))", "", text, flags=re.IGNORECASE, ) # 例:kgEo/\nt、m\n2 等单位被拆分 text = re.sub(r"(?<=[A-Za-z/])\s*\n\s*(?=\d)", "", text) text = re.sub(r"(?<=[A-Za-z])\s*\n\s*(?=[A-Za-z])", "", text) # 例:实际运行值为\n137.88 MJ/t;…单耗为 [93][94]\n\n1.38 MJ/t(2.1.1/2.1.6 常见断行) _metric_num = r"\d+(?:\.\d+)?\s*(?:MJ/t|kWh/t|kgce/t|t产品)" text = re.sub( rf"((?:\[\d+\]\s*)+)\s*\n+\s*({_metric_num})", r"\1 \2", text, flags=re.IGNORECASE, ) text = re.sub( rf"(?<=[\u4e00-\u9fff)\])])\s*\n+\s*({_metric_num})", r" \1", text, flags=re.IGNORECASE, ) text = _merge_orphan_energy_metric_lines(text) # 统一面积/体积单位写法:m2/m3 -> m²/m³(兼容空格、大小写、^ 写法) text = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "m²", text) text = re.sub(r"(?i)\bm\s*(?:\^?\s*3)\b", "m³", text) text = text.replace(heading_nl_token, "\n") text = text.replace(table_nl_token, "\n") return text def _canonical_global_table_name_for_token(token: str) -> str | None: t = str(token or "").strip() if not t: 
def _skeleton_markdown_for_table_token(token: str, *, table_name: str = "") -> str:
    """Render an empty placeholder Markdown table from the template spec.

    The template table name is resolved from ``token`` first; if that fails,
    ``table_name`` is used (directly when it has a multi-column spec,
    otherwise via its canonical name, otherwise as given). Returns ``""``
    when no spec, no row keys or no column labels are available.
    """
    resolved = _canonical_global_table_name_for_token(token) or ""
    if not resolved:
        fallback = str(table_name or "").strip()
        if _multi_column_global_spec_for_table(fallback):
            resolved = fallback
        elif fallback:
            resolved = _canonical_global_table_name_for_token(fallback) or fallback

    template = _multi_column_global_spec_for_table(resolved)
    if not template:
        return ""

    row_keys = global_table_row_keys(resolved)
    if not row_keys:
        return ""

    columns: list[str] = []
    for col in template[0] or []:
        label = str(col).strip()
        if label:
            columns.append(label)
    if not columns:
        return ""

    # Rendering with an empty cell mapping yields the table without values.
    markdown, _unused = _render_markdown_table(resolved, row_keys, columns, {})
    return str(markdown or "").strip()
def _fill_required_table_caption_stubs(
    content: str, required_tables: list[str], evidence: dict
) -> str:
    """Replace caption-only required tables with an authoritative table block.

    For each required token whose caption line occurs in the text, the
    segment after the caption is inspected. A segment that already has a
    Markdown table body is left alone, except for 表5-4 when that body is the
    simplified extract. Otherwise the caption stub is replaced by the block
    from ``_authoritative_block_for_required_table``.

    Returns:
        The rewritten text, or the original ``content`` object when nothing
        was replaced.
    """
    working = str(content or "")
    replaced_any = False
    for token in required_tables or []:
        if not _table_token_caption_line_re(token).search(working):
            continue

        segment = _segment_after_table_caption(working, token)
        if _segment_has_markdown_table_body(segment):
            body_match = re.search(r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", segment)
            needs_rebuild = bool(
                _table_token_matches_name(token, "表5-4")
                and body_match
                and _is_table54_simplified_extract_body(body_match.group(0))
            )
            if not needs_rebuild:
                continue

        authoritative = _authoritative_block_for_required_table(token, evidence)
        if not authoritative:
            continue

        working = _replace_caption_stub_with_authoritative_table(working, token, authoritative)
        replaced_any = True

    if replaced_any:
        return working
    return content
+ token_re + r"[^\n]*\n)" r"(\n?(?:[ \t]*\|[^\n]+\|[ \t]*\n)+)", flags=re.IGNORECASE, ) m = md_table_pat.search(content) if m: return content[:m.start()] + "\n" + replacement_md + "\n\n" + content[m.end():].lstrip("\n") html_table_pat = re.compile( r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n)" r"(\s*[\s\S]*?
def _is_effective_markdown_table_block(md_block: str) -> bool:
    """Check that a Markdown block is a usable table.

    Blank lines are ignored. The block must have at least three lines, a
    first line that is not a separator, a second line that is a separator,
    and at least one later pipe row that is not itself a separator.
    """
    rows: list[str] = []
    for raw in str(md_block or "").splitlines():
        trimmed = str(raw or "").strip()
        if trimmed:
            rows.append(trimmed)

    if len(rows) < 3:
        return False

    header, divider = rows[0], rows[1]
    if _is_pipe_markdown_table_separator_line(header):
        return False
    if not _is_pipe_markdown_table_separator_line(divider):
        return False

    for row in rows[2:]:
        if _is_pipe_markdown_table_row_line(row) and not _is_pipe_markdown_table_separator_line(row):
            return True
    return False
+ token_re + r"[^\n]*\n(?:\n|[ \t]*[ \t]*\n)*)" r"((?:[ \t]*\|[^\n]*\|[ \t]*\n)+)", flags=re.IGNORECASE, ) m = table_pat.search(repaired) if m: cur_table = str(m.group(2) or "") need_replace = not _is_effective_markdown_table_block(cur_table) if _table_token_matches_name(token, "表5-4") and _is_table54_simplified_extract_body( cur_table ): need_replace = True if need_replace: repaired = ( repaired[:m.start()] + "\n" + authoritative_block + "\n\n" + repaired[m.end():].lstrip("\n") ) elif not _table_token_exists(repaired, token): repaired = repaired.rstrip() + "\n\n" + authoritative_block return repaired.strip() def _collect_structured_tables( db: Session, project_uuid: str, required_tables: list[str], *, section_title: str, section_tokens: list[str], ) -> list[dict]: """ 报告生成阶段的结构化表来源必须与“要素管理”一致(element_tables/element_cells)。 规则: - 若模板 prompt 中声明了必需表(如 表2-1/附表8),优先按 token 精准匹配; - 若未声明必需表,或声明了但匹配不到,则按章节标题/关键词从要素管理中选取最相关的表直出, 避免模型自行编造表格。 """ def _table_relevance_score(table_name: str) -> int: name = str(table_name or "").strip() if not name: return 0 name_l = name.lower() score = 0 # 章节标题强相关加权 t = str(section_title or "").strip() if t and t in name: score += 10 # token 命中加分 for tok in (section_tokens or [])[:20]: tt = str(tok or "").strip() if not tt: continue if tt.lower() in name_l: score += 2 # 常见表名关键字(表/附表/对比/评价)做轻微加权,便于优先输出真正的表 if any(k in name for k in ("表", "附表", "对比", "评价", "评分")): score += 1 return score tables: list[ElementTable] = ( db.query(ElementTable) .filter(ElementTable.project_id == project_uuid) .order_by(ElementTable.sort_order.asc(), ElementTable.updated_at.desc()) .all() ) if not tables: return [] # 5.3.2:正文仅需表5-5/表5-6;附表8 归入全书「## 附表」,勿纳入本节结构化证据。 if _extract_section_number(str(section_title or "")) == "5.3.2": tables = [ t for t in tables if not ( ("附表8" in str(t.table_name or "")) and ("可研报告和后评价参数对比表" in str(t.table_name or "")) ) ] if not tables: return [] # 1) 必需表:按 token/表名匹配(尽量“直接用要素管理中的表”) required_norm = [_norm_table_token(t) for t in 
(required_tables or []) if _norm_table_token(t)] required_hits: list[ElementTable] = [] if required_norm: for t in tables: name_norm = _norm_table_token(t.table_name) if not name_norm: continue if any(req and _table_token_matches_name(req, name_norm, normalized=True) for req in required_norm): required_hits.append(t) if required_hits: req_hint_words: list[str] = [] for req in required_norm: req_hint_words.extend(_TABLE_TOKEN_PREFERRED_NAME_HINTS.get(req, ())) def _required_hit_score(tb: ElementTable) -> tuple[int, int, int]: tb_name = str(tb.table_name or "").strip() hint_hit = 0 if req_hint_words: for hint in req_hint_words: if hint and hint in tb_name: hint_hit += 1 collect = 0 if any(_table_token_matches_name(req, "表5-4") for req in required_norm): collect = _element_table_collect_score(db, tb, "表5-4") return (collect, hint_hit, _table_relevance_score(tb_name)) required_hits.sort(key=_required_hit_score, reverse=True) # 保留靠前若干张,避免同 token 多张历史表导致提示词爆炸 required_hits = required_hits[:8] # 2) 回退:未声明必需表,或声明了但没匹配上时,按相关性挑选 selected: list[ElementTable] = list(required_hits) if not selected: scored: list[tuple[int, ElementTable]] = [] for t in tables: s = _table_relevance_score(t.table_name) if s > 0: scored.append((s, t)) scored.sort(key=lambda x: x[0], reverse=True) selected = [t for _, t in scored[:4]] selected_has_time54 = any( _is_table54_operating_benefit(str(t.table_name or "")) and str(t.table_type or "").strip() == "time" for t in selected ) out: list[dict] = [] for table in selected: is_time = str(table.table_type or "").strip() == "time" if is_time: ty_row = db.query(ElementTable.year).filter(ElementTable.id == table.id).first() tbl_y = ( int(ty_row[0]) if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 else None ) year_items = _build_time_table_markdowns_by_year(db, table.id, table.table_name) if _is_table54_operating_benefit(table.table_name): picked = _pick_table54_year_markdown(year_items, table_year=tbl_y) if picked: year_items = [picked] for 
def _table_2_5_general_layout_comparison_name(table_name: str) -> bool:
    """Recognize table 2-5 (general layout / storage / utilities comparison).

    Names containing "依托" are excluded; otherwise the name matches when it
    contains either the "表2-5" label or the table title.
    """
    name = str(table_name or "")
    markers = ("表2-5", "总图、储运、公用工程及辅助工程对比")
    return "依托" not in name and any(marker in name for marker in markers)
def _element_manage_row_label_after_first_dot(label: str) -> str:
    """Drop the leading "…·" category segment of a row key, for display only.

    The label is stripped, then everything up to and including the first
    middle dot (U+00B7) is removed. If there is no middle dot, or nothing
    but whitespace follows it, the stripped label is returned unchanged.

    Per the original note, this matches ``parseRowKeyForDisplay`` in
    quick-fill.js.
    """
    text = str(label or "").strip()
    _head, separator, remainder = text.partition("\u00b7")
    remainder = remainder.strip()
    if separator and remainder:
        return remainder
    return text
def _row_header_name_for_table(table_name: str) -> str:
    """Pick the header text of the row-name column for a given table title.

    Rules are evaluated in order and the first match wins: literal caption
    fragments first, then the table-specific name predicates. Titles that
    match nothing get the generic header "项目".
    """
    title = str(table_name or "")

    # Rules keyed on caption fragments contained in the title.
    fragment_rules = (
        (("产品方案对比表",), "产品"),
        (("原料数量及组成对比表",), "原料名称"),
        (("原料)性质对比表", "原料性质对比表"), "名称"),
    )
    for fragments, header in fragment_rules:
        if any(fragment in title for fragment in fragments):
            return header

    # Rules keyed on the per-table predicates; order matters (first hit wins).
    predicate_rules = (
        ((_table_2_5_general_layout_comparison_name,), "项目名称"),
        ((_table_2_6_reliance_comparison_name,), "依托项目名称"),
        ((_table_3_3_plantwide_design_change_name,), "单元名称"),
        ((_table_3_4_single_unit_design_change_name,), "专业"),
        (
            (_table_3_1_contracting_units_name, _table_3_5_major_design_change_name),
            "单元名称",
        ),
        ((_table_3_7_procurement_name,), "物资(类别)名称"),
        ((_table_4_2_alkylation_operation_analysis_name,), "项目名称"),
        (
            (
                _table52_investment_change_name,
                _table53_engineering_cost_change_name,
                _appendix2_investment_structure_name,
            ),
            "工程或费用名称",
        ),
    )
    for predicates, header in predicate_rules:
        if any(predicate(title) for predicate in predicates):
            return header
    return "项目"
def _table51_main_economic_indicators_name(table_name: str) -> bool:
    """Return True when the title is Table 5-1 (main economic indicators comparison).

    Both the "表5-1" token and the caption fragment "主要经济指标对比" must be
    present. Falsy input is treated as an empty title.
    """
    title = str(table_name or "")
    return all(marker in title for marker in ("表5-1", "主要经济指标对比"))
def _pick_row_key_with_legacy(
    canon: str, row_set: set[str], legacy_map: dict[str, str]
) -> str | None:
    """Resolve a canonical row key to the key actually stored for the table.

    Args:
        canon: Canonical row key to look for.
        row_set: Row keys that actually exist.
        legacy_map: Mapping of legacy row key -> canonical row key.

    Returns:
        ``canon`` when it exists in ``row_set``; otherwise the first legacy
        alias (in ``legacy_map`` iteration order) that maps to ``canon`` and
        exists in ``row_set``; ``None`` when neither is present.
    """
    if canon in row_set:
        return canon
    present_aliases = (
        old_key
        for old_key, new_key in legacy_map.items()
        if new_key == canon and old_key in row_set
    )
    return next(present_aliases, None)
_APPENDIX_YEAR_TAIL_NORM = re.compile(r"^(\d{4})年?$")


def _appendix_norm_year_tail(tail: str) -> str | None:
    """Normalize a column-key tail such as "2020" or "2020年" to a 4-digit year.

    Args:
        tail: Text after the group separator of a column key; falsy values
            are treated as empty.

    Returns:
        The year as four ASCII digits when the tail is a calendar year in
        the inclusive range 1900-2100 (optionally followed by "年"),
        otherwise ``None``.
    """
    match = _APPENDIX_YEAR_TAIL_NORM.fullmatch(str(tail or "").strip())
    if match is None:
        return None
    # ``\d`` also matches non-ASCII decimal digits (e.g. full-width "２０２０").
    # ``int`` parses those, and returning ``str(year)`` yields one canonical
    # ASCII spelling so equal years always compare equal as strings.
    year = int(match.group(1))
    if not 1900 <= year <= 2100:
        return None
    return str(year)
def _bare_appendix_year_placeholder_col_key(s: str) -> bool:
    """Detect a pipe-less placeholder column key such as "××年" or "xx年#2".

    Args:
        s: Column key; falsy values are treated as empty.

    Returns:
        True for an ellipsis key ("…" or "...") or for exactly two
        x-like characters followed by "年" and an optional "#<digits>"
        suffix. ASCII x/X, full-width x/X (U+FF58/U+FF38) and the
        multiplication sign "×" all count as x-like.
    """
    text = str(s or "").strip()
    if not text:
        return False
    if text in ("…", "..."):
        return True
    # Fold every x-like character to "×" so one pattern covers all spellings.
    # Full-width forms are written as escapes so they cannot be silently
    # folded to ASCII by tooling that normalizes the source text.
    x_like = "xX\uff58\uff38"
    folded = "".join("×" if ch in x_like else ch for ch in text)
    return re.fullmatch(r"×{2}年(?:#\d+)?", folded) is not None
def _filter_appendix3_summary_duplicate_forecast_years(table_name: str, col_order: list[str]) -> list[str]:
    """Appendix 3 only: drop summary-group year columns repeated in the forecast group.

    A column "建设期|YYYY" or "后评价时点前实际值|YYYY" is removed when
    "后评价时点后预测值|YYYY" exists for the same year. Only the half-width
    "|" separator is recognized here.

    Args:
        table_name: Table title; the filter applies only when it contains
            both "附表3" and "项目投资财务现金流量".
        col_order: Column keys in display order.

    Returns:
        ``col_order`` itself when nothing is filtered, otherwise a new list.
        If filtering would remove every column, a copy of ``col_order`` is
        returned instead.
    """
    title = str(table_name or "").strip()
    if not ("附表3" in title and "项目投资财务现金流量" in title):
        return col_order

    def piped_parts(col):
        """Return (stripped key, group, tail) for a "group|tail" key, else None."""
        key = str(col or "").strip()
        if "|" not in key:
            return None
        group, tail = key.split("|", 1)
        return key, group.strip(), tail.strip()

    # Pass 1: calendar years present under the forecast group.
    forecast_years: set[str] = set()
    for col in col_order:
        parts = piped_parts(col)
        if parts is None or parts[1] != "后评价时点后预测值":
            continue
        year = _appendix_norm_year_tail(parts[2])
        if year:
            forecast_years.add(year)
    if not forecast_years:
        return col_order

    # Pass 2: summary-group columns whose year is already in the forecast group.
    summary_groups = ("建设期", "后评价时点前实际值")
    duplicates: set[str] = set()
    for col in col_order:
        parts = piped_parts(col)
        if parts is None:
            continue
        key, group, tail = parts
        if group in summary_groups and _appendix_norm_year_tail(tail) in forecast_years:
            duplicates.add(key)
    if not duplicates:
        return col_order

    kept = [col for col in col_order if str(col).strip() not in duplicates]
    return kept or list(col_order)
def _filter_appendix5_orphan_price_unit_column(table_name: str, col_order: list[str]) -> list[str]:
    """Appendix 5 only: drop a stand-alone "(元/t)" column when a price column exists.

    The stray key duplicates the unit already carried by a column such as
    "价格(元/t)".

    Args:
        table_name: Table title; the filter applies only when it contains
            both "附表5" and "营业收入与营业税金".
        col_order: Column keys in display order.

    Returns:
        ``col_order`` itself when the table does not qualify or no column
        contains both "价格" and "元/t"; otherwise the columns without the
        orphan unit key. If that would remove every column, a copy of
        ``col_order`` is returned instead.
    """
    title = str(table_name or "")
    if "附表5" not in title or "营业收入与营业税金" not in title:
        return col_order
    if not any("价格" in str(c) and "元/t" in str(c) for c in col_order):
        return col_order
    # Half-width and full-width parenthesis spellings of the orphan key. The
    # full-width one is escaped so both members stay distinct even if the
    # source text is width-normalized.
    orphans = {"(元/t)", "\uff08元/t\uff09"}
    kept = [c for c in col_order if str(c).strip() not in orphans]
    return kept or list(col_order)
def _appendix_time_tail_sort_key(tail: str | None) -> tuple[int, int, str]:
    """Build the in-group sort key for an appendix time column's tail.

    Ordering: bare group column (empty tail) < year columns (ascending) <
    "××年#n" placeholders (ascending n) < everything else (by text).

    Args:
        tail: Part of the column key after the group name; ``None`` or
            blank means the bare group column.

    Returns:
        A ``(rank, number, text)`` tuple suitable as a ``sorted`` key.
    """
    text = str(tail or "").strip()
    if not text:
        return (0, -1, "")
    # ``_appendix_norm_year_tail`` already accepts both "YYYY" and "YYYY年"
    # within 1900-2100, so a separate bare-four-digit check could never fire
    # after it and has been removed.
    year = _appendix_norm_year_tail(text)
    if year:
        return (1, int(year), "")
    # Fold ASCII and full-width x/X to "×" before matching the placeholder
    # (full-width forms are escaped to survive width normalization).
    x_like = "xX\uff58\uff38"
    folded = "".join("×" if ch in x_like else ch for ch in text)
    placeholder = re.fullmatch(r"×{2}年#(\d+)", folded)
    if placeholder:
        return (2, int(placeholder.group(1)), "")
    return (3, 0, text)
lines.append(" ") lines.append(" ") for top, sub in zip(top_headers, sub_headers): if top: lines.append(f" ") lines.append(" ") else: lines.append(" ") lines.append(" ") lines.append(f" ") for col in col_order: lines.append(f" ") lines.append(" ") lines.append(" ") lines.append(" ") max_rows = min(120, len(row_order)) if _is_table54_operating_benefit(table_name) else min(24, len(row_order)) serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows) for idx, rk in enumerate(row_order[:max_rows], start=1): display_rk = _project_column_row_label( table_name, rk, latest, serial_col=serial_col ) serial_cell = _serial_cell_for_report_table( table_name, rk, idx, serial_col, serial_idx=idx - 1 ) lines.append(" ") lines.append(f" ") lines.append(f" ") for ck in col_order: val = latest.get((rk, ck), "待补充") or "待补充" lines.append(f" ") lines.append(" ") lines.append(" ") lines.append("
序号{row_header}{sub_headers[idx]}{group}
{sub}
序号{row_header}{col}
{serial_cell}{display_rk}{val}
def _strip_md_bold_markup(text: str) -> str:
    """Remove Markdown ``**bold**`` markers while keeping the wrapped text.

    Substitution is repeated until a pass replaces nothing, so markers left
    adjacent by an earlier pass (e.g. ``****a****``) are removed too.
    Falsy input yields an empty string.
    """
    bold_span = re.compile(r"\*\*([^*]+?)\*\*")
    result = str(text or "")
    replaced = 1
    while replaced:
        result, replaced = bold_span.subn(r"\1", result)
    return result
换行);不加粗。""" del plain # 保留参数以兼容旧调用;表头一律不加 ** 包裹 s = _strip_md_bold_markup(str(col_label or "").strip()).replace("|", "|") if not s: return "" if re.search(r"", s, re.I): parts = [ _strip_md_bold_markup(p).strip() for p in re.split(r"", s, flags=re.I) ] out_parts = [p for p in parts if p] return "
".join(out_parts) if out_parts else s m = _RE_MD_HEADER_NAME_UNIT.match(s) if m: name, unit = m.group(1).strip(), m.group(2).strip() if name: return f"{name}
def _fix_521_table52_wrong_caption(content: str) -> str:
    """Restore the standard Table 5-2 caption where section 5.2.1 uses a wrong one.

    Every line that starts (after optional indentation and an optional
    Markdown heading marker) with the Table 5-2 token, and whose remaining
    text lacks "投资变动情况表", is replaced by the standard caption. The
    original indentation/heading prefix is kept.

    Args:
        content: Section Markdown; falsy values are treated as empty.

    Returns:
        The text with offending caption lines rewritten. Blank input is
        returned as-is (as a string).
    """
    text = str(content or "")
    if not text.strip():
        return text
    # ``(?!\d)`` stops the token at "5-2": without it, captions of Tables
    # 5-20..5-29 would match with the extra digit as "tail" and be overwritten.
    # NOTE(review): any line beginning with the token is treated as a caption,
    # including body text that opens with "表5-2…" — confirm that is intended.
    caption_re = re.compile(
        r"^(\s*(?:#{1,6}\s+)?)(表\s*5\s*[--.]\s*2(?!\d)\s*(.*))$",
        re.IGNORECASE,
    )
    out: list[str] = []
    for line in text.split("\n"):
        m = caption_re.match(line)
        if m and "投资变动情况表" not in (m.group(3) or "").strip():
            out.append(f"{m.group(1)}{_TABLE52_INVESTMENT_CHANGE_CAPTION}")
        else:
            out.append(line)
    return "\n".join(out)
def _debold_markdown_table_blocks_in_content(content: str) -> str:
    """Strip ``**`` bold markers from the header rows of Markdown pipe tables.

    A table is a run of one or more non-separator pipe rows immediately
    followed by a separator row. Only those header rows are de-bolded; the
    separator and the following body rows are copied unchanged. A run of
    pipe rows with no separator after it is copied unchanged as well.
    """
    source = str(content or "")
    rows = source.split("\n")
    if not rows:
        return source

    def is_header_candidate(idx: int) -> bool:
        """True for a pipe-table row that is not a separator row."""
        row = rows[idx]
        return _is_pipe_markdown_table_row_line(row) and not _is_pipe_markdown_table_separator_line(row)

    total = len(rows)
    result: list[str] = []
    pos = 0
    while pos < total:
        if not is_header_candidate(pos):
            result.append(rows[pos])
            pos += 1
            continue

        # Collect the consecutive candidate header rows starting at ``pos``.
        end = pos
        while end < total and is_header_candidate(end):
            end += 1
        candidates = rows[pos:end]

        if end < total and _is_pipe_markdown_table_separator_line(rows[end]):
            # Real table: de-bold the header rows, keep separator and body as-is.
            result.extend(_debold_md_table_row(row) for row in candidates)
            result.append(rows[end])
            end += 1
            while end < total and (
                _is_pipe_markdown_table_row_line(rows[end])
                or _is_pipe_markdown_table_separator_line(rows[end])
            ):
                result.append(rows[end])
                end += 1
        else:
            # No separator follows, so these rows are not a table header.
            result.extend(candidates)
        pos = end
    return "\n".join(result)
def _render_table_7_1_markdown(
    row_order: list[str],
    col_order: list[str],
    latest: dict[tuple[str, str], str],
) -> str:
    """Render Table 7-1 as a Markdown pipe table with "指标" and "要素" columns.

    Row keys are either "indicator·element" or the literal "综合得分"; each
    key is split into the two leading columns.

    Args:
        row_order: Row keys present in the table.
        col_order: Column keys present in the table.
        latest: Cell values keyed by ``(row_key, col_key)``.

    Returns:
        Markdown text ending with a newline. Columns listed in
        ``TABLE_7_1_COLUMN_KEYS`` come first (in that order), then remaining
        columns in ``col_order`` order. Rows listed in
        ``TABLE_7_1_ROW_CELL_DEFAULTS`` come first, then remaining rows.
        Empty cells are rendered as "待补充".
    """

    def esc(v: str) -> str:
        # A literal "|" inside a cell would be read as a column delimiter;
        # escape it so the row keeps its column count.
        return str(v or "").replace("|", "\\|")

    available_cols = set(col_order)
    data_cols: list[str] = [c for c in TABLE_7_1_COLUMN_KEYS if c in available_cols]
    seen_cols = set(data_cols)
    for c in col_order:
        if c not in seen_cols:
            seen_cols.add(c)
            data_cols.append(c)

    preferred = [rk for rk, _ in TABLE_7_1_ROW_CELL_DEFAULTS]
    preferred_set = set(preferred)
    available_rows = set(row_order)
    ordered_rows = [rk for rk in preferred if rk in available_rows]
    ordered_rows.extend(rk for rk in row_order if rk not in preferred_set)

    header_cells = [
        _markdown_table_header_cell_display("指标"),
        _markdown_table_header_cell_display("要素"),
    ]
    header_cells.extend(_markdown_table_header_cell_display(c) for c in data_cols)
    header = "| " + " | ".join(header_cells) + " |"
    sep = "| " + " | ".join(["---"] * (2 + len(data_cols))) + " |"

    lines = [header, sep]
    for rk in ordered_rows:
        rk_s = str(rk or "").strip()
        if rk_s == "综合得分":
            ind, elem = "综合得分", ""
        elif "·" in rk_s:
            left, right = rk_s.split("·", 1)
            ind, elem = left.strip(), right.strip()
        else:
            ind, elem = rk_s, ""
        vals: list[str] = []
        for ck in data_cols:
            raw = str(latest.get((rk_s, ck), "") or "").strip()
            vals.append(esc(raw if raw else "待补充"))
        lines.append("| " + " | ".join([esc(ind), esc(elem)] + vals) + " |")
    return "\n".join(lines) + "\n"
table_name: str = "") -> tuple[str, str | None]: cells = ( db.query(ElementCell) .filter( ElementCell.table_id == table_id, ElementCell.value.isnot(None), ElementCell.value != "", ) .order_by(ElementCell.updated_at.desc()) .all() ) if not cells: return "", None latest: dict[tuple[str, str], str] = {} row_order: list[str] = [] col_order: list[str] = [] for cell in cells: row_key = str(cell.row_key or "").strip() col_key = str(cell.col_key or "内容").strip() or "内容" if not row_key: continue key = (row_key, col_key) if key not in latest: latest[key] = str(cell.value or "").strip() if row_key not in row_order: row_order.append(row_key) if col_key not in col_order: col_order.append(col_key) if not row_order: return "", None row_order = _normalize_table_row_order(row_order, table_name=table_name) spec = _multi_column_global_spec_for_table(table_name) if spec: spec_cols = [str(col).strip() for col in (spec[0] or []) if str(col).strip()] ordered = [col for col in spec_cols if col in col_order] extras = [col for col in col_order if col not in ordered] col_order = ordered + extras col_order = _filter_redundant_bare_year_columns(col_order) col_order = _filter_appendix_placeholder_slot_columns(col_order) col_order = _filter_appendix3_summary_duplicate_forecast_years(table_name, col_order) col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(table_name, col_order) col_order = _filter_appendix5_orphan_price_unit_column(table_name, col_order) col_order = _filter_table55_redundant_malformed_forecast_column(table_name, col_order) col_order = _reorder_appendix_time_col_order(table_name, col_order) inferred_ty: int | None = None if _is_table54_operating_benefit(str(table_name or "").strip()): ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first() tbl_y = int(ty_row[0]) if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 else None inferred_ty = _infer_time_column_year_for_table54(col_order, cells, tbl_y) if str(table_name or "").strip() 
def _strip_table_prefix_from_row_key(rk: str) -> str:
    """Remove the first "…·" segment (table prefix) from a row key.

    The key is stripped; when it contains a middle dot (U+00B7), the text
    after the first one is returned, stripped — which may be empty. Keys
    without a middle dot are returned stripped and otherwise unchanged.
    """
    key = str(rk or "").strip()
    _prefix, separator, remainder = key.partition("\u00b7")
    return remainder.strip() if separator else key
def _serial_cell_for_report_table(
    table_name: str,
    rk: str,
    idx: int,
    serial_col: list[str] | None,
    *,
    serial_idx: int,
) -> str:
    """Choose the text of the serial-number cell for one report-table row.

    * Table 5-1: the outline number parsed from the row key, falling back
      to the running index when the key has none.
    * Other tables that show number and name in separate columns: the
      running index ``idx``.
    * Any other table: ``serial_col[serial_idx]`` when a precomputed serial
      column is supplied, else the running index.
    """
    running = str(idx)
    if _table51_main_economic_indicators_name(table_name):
        outline, _name = _parse_row_key_seq_and_name(rk, table_name=table_name)
        return outline or running
    if _table_row_seq_name_split_display(table_name):
        return running
    if serial_col is None:
        return running
    return serial_col[serial_idx]
not None: label = _strip_row_key_leading_outline_for_display(rk) if not str(label or "").strip(): label = rk else: label = rk return _element_manage_table_row_display_label( table_name, _display_row_key(table_name, label, latest) ) def _outline_serial_from_row_key(rk: str) -> str | None: """若 row_key 以阿拉伯数字层次编号开头,返回该编号字符串。""" rk_s = str(rk or "").strip() if not rk_s: return None compact = re.sub(r"\s+", "", rk_s) if compact in ("合计", "总计"): return None m = _ROWKEY_OUTLINE_PREFIX.match(rk_s) if not m: return None num = m.group(1) if re.fullmatch(r"\d{4}", num): try: yi = int(num) except ValueError: return None if 1900 <= yi <= 2100: return None return num def _table_row_outline_serial_column(row_order: list[str], *, max_rows: int) -> list[str] | None: """当每一数据行(合计/总计除外)的 row_key 均带层次编号时,序号列采用该编号。""" rows = row_order[:max_rows] if not rows: return None serials: list[str] = [] for rk in rows: compact = re.sub(r"\s+", "", str(rk or "")) if compact in ("合计", "总计") or str(rk or "").strip() in ("合计", "总计"): serials.append("—") continue s = _outline_serial_from_row_key(str(rk) or "") if s is None: return None serials.append(s) return serials def _strip_row_key_leading_outline_for_display(rk: str) -> str: """去掉 row_key 首部层次编号,避免第二列与序号列重复。""" rk_s = str(rk or "").strip() m = _ROWKEY_OUTLINE_PREFIX.match(rk_s) if not m: return rk_s rest = rk_s[m.end():].strip() return rest if rest else rk_s def _render_markdown_table( table_name: str, row_order: list[str], col_order: list[str], latest: dict[tuple[str, str], str], *, time_column_year: int | None = None, ) -> tuple[str, str | None]: col_order = list(col_order) tn = str(table_name or "").strip() table54 = _is_table54_operating_benefit(tn) if table54: _table54_remap_indicator_unit_latest(latest) _table54_rekey_latest_col_keys(latest) col_order = _reorder_table54_col_order(col_order) grouped = _group_column_headers(col_order) if grouped: top_headers, sub_headers = grouped flat_cols: list[str] = [] for top, sub in zip(top_headers, 
sub_headers): if top and sub: flat_cols.append(f"{top}-{sub}") elif top: flat_cols.append(top) else: flat_cols.append(sub) else: flat_cols = list(col_order) if table54: flat_cols = list(col_order) flat_header_labels = _table54_markdown_header_labels( col_order, time_column_year=time_column_year ) common_unit = None else: common_unit, flat_header_labels = _common_trailing_parenthetical_unit_from_flat_labels(flat_cols) if common_unit is None: flat_header_labels = flat_cols def _esc_pipe(v: str) -> str: return str(v or "").replace("|", "|") row_header = _row_header_name_for_table(table_name) if table54: row_header = "项目" header = ( "| " + " | ".join( [ _markdown_table_header_cell_display("序号"), _markdown_table_header_cell_display(row_header), ] + [_markdown_table_header_cell_display(c) for c in flat_header_labels] ) + " |" ) split = "| --- | --- | " + " | ".join(["---"] * len(flat_cols)) + " |" lines = [header, split] max_rows = min(120, len(row_order)) if table54 else min(24, len(row_order)) serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows) for idx, rk in enumerate(row_order[:max_rows], start=1): vals = [_esc_pipe(latest.get((rk, ck), "待补充") or "待补充") for ck in col_order] display_rk = _project_column_row_label( table_name, rk, latest, serial_col=serial_col ) serial_cell = _serial_cell_for_report_table( table_name, rk, idx, serial_col, serial_idx=idx - 1 ) lines.append("| " + serial_cell + " | " + _esc_pipe(display_rk) + " | " + " | ".join(vals) + " |") return "\n".join(lines) + "\n", common_unit def _build_time_table_markdowns_by_year( db: Session, table_id: str, table_name: str = "", ) -> list[tuple[str, str]]: """为时间要素表按 year 拆分,返回 [(display_table_name, markdown), ...] 
列表。 时间表的 ElementCell 通过 year 字段区分不同年份的数据;前端用 col_key + "|" + year 渲染多级表头。本函数按年份分别聚合 cell,为每个年份生成独立的 Markdown 表格, 表名中的「××年」替换为实际年份。 """ cells = ( db.query(ElementCell) .filter( ElementCell.table_id == table_id, ElementCell.value.isnot(None), ElementCell.value != "", ) .order_by(ElementCell.updated_at.desc()) .all() ) if not cells: return [] from collections import defaultdict year_cells: dict[int | None, list[ElementCell]] = defaultdict(list) for cell in cells: year_cells[cell.year].append(cell) ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first() tbl_y = ( int(ty_row[0]) if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 else None ) base_name = str(table_name or "").strip() if _is_table54_operating_benefit(base_name): year_cells, real_years = _table54_merge_year_cells_for_table_year( year_cells, table_year=tbl_y ) else: real_years = sorted(y for y in year_cells if y is not None) if not real_years: md, common_unit = _build_structured_table_markdown(db, table_id, table_name) if not md: return [] disp = _merge_table_title_with_common_unit(str(table_name or "").strip(), common_unit) return [(disp, md)] results: list[tuple[str, str]] = [] for year in real_years: year_cell_list = year_cells[year] latest: dict[tuple[str, str], str] = {} row_order: list[str] = [] col_order: list[str] = [] for cell in year_cell_list: row_key = str(cell.row_key or "").strip() col_key = str(cell.col_key or "内容").strip() or "内容" if not row_key: continue key = (row_key, col_key) if key not in latest: latest[key] = str(cell.value or "").strip() if row_key not in row_order: row_order.append(row_key) if col_key not in col_order: col_order.append(col_key) if not row_order: continue if _is_table54_operating_benefit(base_name): _table54_coalesce_legacy_bare_metric_cols(latest, row_order) row_order = _normalize_table_row_order(row_order, table_name=base_name) time_spec_cols = time_table_default_columns_for_name(base_name) if time_spec_cols and 
_is_table54_operating_benefit(base_name): col_order = ["单位"] + [c for c in time_spec_cols if c != "单位"] elif time_spec_cols: ordered = [col for col in time_spec_cols if col in col_order] extras = [col for col in col_order if col not in ordered] col_order = ordered + extras col_order = _filter_redundant_bare_year_columns(col_order) col_order = _filter_appendix_placeholder_slot_columns(col_order) col_order = _filter_appendix3_summary_duplicate_forecast_years(base_name, col_order) col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(base_name, col_order) col_order = _filter_appendix5_orphan_price_unit_column(base_name, col_order) col_order = _filter_table55_redundant_malformed_forecast_column(base_name, col_order) col_order = _reorder_appendix_time_col_order(base_name, col_order) display_name = re.sub(r"××年", f"{year}年", base_name) md, common_unit = _render_markdown_table( display_name, row_order, col_order, latest, time_column_year=year ) if md: results.append((_merge_table_title_with_common_unit(display_name, common_unit), md)) if not results and None in year_cells: md, common_unit = _build_structured_table_markdown(db, table_id, table_name) if md: results.append((_merge_table_title_with_common_unit(str(table_name or "").strip(), common_unit), md)) return results def _extract_table_short_token(table_name: str) -> str: text = str(table_name or "") m = re.search(r"(附表\s*\d+(?:\s*[.\--]\s*\d+)*|表\s*\d+(?:\s*[.\--]\s*\d+)*)", text) return re.sub(r"\s+", "", m.group(1)) if m else "" def _norm_table_token(token: str) -> str: text = re.sub(r"\s+", "", str(token or "")).lower() return text.replace("-", "-").replace("—", "-").replace("–", "-") def _table_token_matches_name(token: str, name: str, *, normalized: bool = False) -> bool: """ 表号精确匹配,避免“表1”误命中“表10”。 - normalized=True: token/name 已经是 _norm_table_token 结果。 """ t = token if normalized else _norm_table_token(token) n = name if normalized else _norm_table_token(name) if not t or not n: return False if 
t == n: return True # 兼容历史项目:4.3.3 的“烷基化装置运行分析”可能仍存为表4-1,仍应视为表4-2 的同义候选。 if t == _norm_table_token("表4-2"): raw_name = str(name or "") name_plain = re.sub(r"\s+", "", raw_name) if ("烷基化装置运行分析" in name_plain) and ("考核时间" in name_plain): if ("表4-1" in name_plain) or ("表4-2" in name_plain) or ("表41" in _norm_table_token(name_plain)): return True # 后面不能紧跟 1-2 位数字后即结束或遇到非数字(避免 表1→表10、表2-4→表2-40), # 但允许紧跟 4 位年份(如 表2-42019年…)或非数字字符(如 表2-4××年…)。 pattern = re.compile(rf"{re.escape(t)}(?!\d{{1,2}}(?!\d))") return bool(pattern.search(n)) def _table_token_caption_line_re(token: str) -> re.Pattern[str]: token_plain = re.sub(r"\s+", "", str(token or "")) token_re = re.escape(token_plain).replace(r"\-", r"[--—–]") return re.compile( r"(?:^|\n)([^\n]*?" + token_re + r"[^\n]*)\n", flags=re.IGNORECASE, ) def _segment_after_table_caption(content: str, token: str) -> str: """本表表题行之后、下一张「表 x-x …」表题之前的内容(不含引用语中的表号)。""" text = str(content or "") cap = _table_token_caption_line_re(token).search(text) if not cap: return "" rest = text[cap.end() :] next_cap = re.search( r"\n[^\n]*?表\s*\d+(?:\s*[--.]\s*\d+)*\s+[\u4e00-\u9fff]", rest, flags=re.IGNORECASE, ) if next_cap: return rest[: next_cap.start()] return rest def _segment_has_markdown_table_body(segment: str) -> bool: seg = str(segment or "") if not seg.strip(): return False return bool( re.search( r"(?:|(?:\n[ \t]*\|[^\n]+\|[ \t]*\n[ \t]*\|[-:\s|]+\|))", seg, flags=re.IGNORECASE, ) ) def _replace_caption_stub_with_authoritative_table( content: str, token: str, authoritative_block: str ) -> str: """将「仅有表题/注释、无表体」的占位段替换为要素直出整块(用于 3.3.4 表3-4 等)。""" text = str(content or "") block = str(authoritative_block or "").strip() if not block: return text cap = _table_token_caption_line_re(token).search(text) if not cap: return text.rstrip() + "\n\n" + block region_start = cap.start() if region_start > 0 and text[region_start] == "\n": region_start += 1 rest = text[cap.end() :] next_cap = re.search( 
r"\n[^\n]*?表\s*\d+(?:\s*[--.]\s*\d+)*\s+[\u4e00-\u9fff]", rest, flags=re.IGNORECASE, ) region_end = cap.end() + (next_cap.start() if next_cap else len(rest)) head = text[:region_start].rstrip("\n") tail = text[region_end:].lstrip("\n") if head: return f"{head}\n\n{block}\n\n{tail}".strip() if tail else f"{head}\n\n{block}".strip() return f"{block}\n\n{tail}".strip() if tail else block def _table_token_exists(content: str, token: str) -> bool: text = str(content or "") t = _norm_table_token(token) if not text or not t: return False # 须有独立表题行;正文「见表3-3~表3-5」等引用不算。 if not _table_token_caption_line_re(token).search(text): return False # 表体必须紧跟在本表表题与下一张表题之间,不得借用后续表的 Markdown 块(如 3.3.4 仅表3-4 题、表3-5 有体)。 return _segment_has_markdown_table_body(_segment_after_table_caption(text, token)) def _basic_warnings(section_title: str, content: str) -> list[str]: warnings: list[str] = [] if len(content.strip()) < 80: warnings.append("章节内容过短,建议补充证据后重试") title_norm = re.sub(r"\s+", "", str(section_title or "")) if "1.2项目决策要点" in title_norm: if "1.2.1项目背景" not in content or "1.2.2预期目标" not in content: warnings.append("1.2 未按固定结构输出(缺少“1.2.1项目背景/1.2.2预期目标”小节)") if "2.1.1资源与原料评价" in title_norm: if "原料数量及组成对比表" not in content: warnings.append("2.1.1 缺少模版规定的「原料数量及组成对比表」标题") if "原料性质对比表(醚后碳四)" not in content and "原料性质对比表" not in content: warnings.append("2.1.1 缺少模版规定的「原料性质对比表(醚后碳四)」标题") if "原料选择加氢工艺技术对比" in content or ( "表2.6-1" in content and "原料数量及组成对比" not in content and "原料选择加氢" in content ): warnings.append("2.1.1 不应出现安评类「表2.6-1 原料选择加氢工艺技术对比」等内容,本节仅允许模版主表") if "附录:原料预处理工艺方案比选" in content or "(非模版主表)" in content: warnings.append("2.1.1 不应出现附录或“非模版主表”字样,请仅保留模版两张主表") if "表" in section_title and "|" not in content: warnings.append("章节标题疑似要求表格,但输出未包含 Markdown 表格") if "待补充" in content and len(content.strip()) < 140: warnings.append("缺失信息较多,建议补充材料后重跑") return warnings def _check_consistency(report: str, project_name: str) -> list[str]: out: list[str] = [] if project_name and project_name not in 
report: out.append("正文未显式出现项目名称,请检查第一章基本信息。") amounts = re.findall(r"(\d+(?:\.\d+)?)\s*(亿元|万元|万)", report) if amounts: normalized = [f"{v}-{u}" for v, u in amounts] if len(normalized) >= 4 and len(set(normalized[:10])) >= 6: out.append("金额口径较分散,建议统一投资/决算/效益统计口径。") unit_lines = re.findall(r"(?:单位|计量单位)\s*[::]\s*([^\n]{1,40})", report) if unit_lines and len(set(unit_lines)) > 1: out.append("检测到多个计量单位定义,建议统一单位说明(如万元、吨/年)。") years = [int(y) for y in re.findall(r"(20\d{2})年", report)] if years: min_y, max_y = min(years), max(years) if max_y - min_y >= 12: out.append("年份跨度较大,建议复核建设期与运营期时间线是否混写。") if "待补充" in report: missing_count = report.count("待补充") if missing_count >= 10: out.append(f"全篇“待补充”出现 {missing_count} 次,建议补充关键材料后重跑。") if _has_conflict_terms(report): out.append("发现同一指标存在“增加/下降”等相反表述,建议人工复核结论口径。") return out def _append_report_appendices(db: Session, project_uuid: str, report_text: str) -> str: """ 为最终报告追加「附图/附表」(细则顺序:附图在上,附表在下)。 说明: - 附图:从项目知识库 .docx 中解析嵌入图(全厂/装置物料平衡等),以 Markdown 内嵌图输出; 解析不到则不输出该项(无占位说明)。 - 附表:从结构化表(element_tables/element_cells)汇总,优先抓取表名包含「附表」的表。 """ base = (report_text or "").strip() if not base: base = "" appendix_tables = _build_appendix_tables_markdown(db, project_uuid) appendix_figures = _build_appendix_figures_markdown(db, project_uuid) parts = [base] if base else [] if appendix_figures: parts.append(appendix_figures) if appendix_tables: parts.append(appendix_tables) return "\n\n".join([p for p in parts if str(p).strip()]).strip() def _build_appendix_tables_markdown(db: Session, project_uuid: str) -> str: tables = ( db.query(ElementTable) .filter(ElementTable.project_id == project_uuid) .order_by(ElementTable.table_name.asc(), ElementTable.updated_at.desc()) .all() ) appendix = [t for t in tables if "附表" in (t.table_name or "")] if not appendix: return "" blocks: list[str] = ["## 附表"] used = 0 for t in appendix: md, common_unit = _build_structured_table_markdown(db, t.id, t.table_name) title = str(t.table_name or "").strip() or 
f"附表({t.id})" title = _merge_table_title_with_common_unit(title, common_unit) if not md: md = _build_appendix_table_fallback_markdown(title) if not md: continue blocks.append(f"### {title}\n\n{md}") used += 1 if used >= 30: break return "\n\n".join(blocks).strip() if used else "" def _build_appendix_table_fallback_markdown(table_name: str) -> str: """ 当 element_cells 暂无有效数据时,按固定模板输出占位附表,避免附表缺失。 当前优先支持:附表8 可研报告和后评价参数对比表。 """ name = str(table_name or "").replace(" ", "") if ("附表8" in name) and ("可研报告和后评价参数对比表" in name): return APPENDIX8_PARAMETER_COMPARISON_TABLE return "" def _resolve_appendix_figure_blobs_from_kb(db: Session, project_uuid: str) -> dict[int, tuple[bytes, str, str]]: """自知识库 docx 抽取附图嵌入图:slot -> (blob, content_type, source_filename)。""" doc_root = Path(settings.DOC_PAT).resolve() rows = ( db.query(KbDocument) .filter(KbDocument.project_id == project_uuid) .order_by(KbDocument.uploaded_at.desc()) .all() ) per_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]] = [] for d in rows: name = str(d.name or "") if not name.lower().endswith(".docx"): continue full = _kb_doc_absolute_file_path_for_model(doc_root, d) if not full.is_file(): continue try: cand = extract_appendix_figure_candidates_from_docx(full) except Exception as exc: logger.warning("appendix figure extraction failed %s: %s", full, exc) continue per_doc.append((name, cand)) return merge_best_appendix_figures(per_doc) def _build_appendix_figures_markdown(db: Session, project_uuid: str) -> str: """ 附图固定两项(细则): - 附图1 全厂物料平衡图 - 附图2 烷基化装置物料平衡图(常见为装置物料平衡图) 仅从知识库 .docx 嵌入对象抽取真实图片;解析不到则不在报告中展示该项(不输出占位说明)。 """ targets = APPENDIX_FIGURE_TARGETS resolved = _resolve_appendix_figure_blobs_from_kb(db, project_uuid) md_by_slot = appendix_figure_markdown_images(resolved, label_title=list(targets)) figure_parts: list[str] = [] for slot in range(1, len(targets) + 1): md = md_by_slot.get(slot) if md and str(md).strip(): figure_parts.append(str(md).strip()) if not figure_parts: return "" return "## 
附图\n\n" + "\n\n".join(figure_parts) def _update_chapter_status( db: Session, job: ReportGenerationJob, chapter: ReportGenerationChapter, status: str, error_message: Optional[str], ) -> None: now = datetime.now() chapter.status = status chapter.error_message = error_message chapter.updated_at = now job.current_section_key = chapter.section_key job.updated_at = now db.commit() def _resolve_template(db: Session, template_id: Optional[str]) -> ReportTemplate: template = None if template_id: template = db.query(ReportTemplate).filter(ReportTemplate.id == template_id).first() if not template: template = ( db.query(ReportTemplate) .filter(ReportTemplate.is_default == True, ReportTemplate.is_active == True) # noqa: E712 .first() ) if not template: raise HTTPException(status_code=404, detail="未找到可用模板") return template def _list_template_sections(db: Session, template_id: str) -> list[ReportTemplateSection]: return ( db.query(ReportTemplateSection) .filter(ReportTemplateSection.template_id == template_id) .order_by(ReportTemplateSection.section_order.asc()) .all() ) def _sections_for_generation(sections: list[ReportTemplateSection]) -> list[ReportTemplateSection]: items = list(sections or []) if not items: return [] section_nos = { _extract_section_number(section.section_title or "") for section in items if _extract_section_number(section.section_title or "") } filtered: list[ReportTemplateSection] = [] for section in items: section_no = _extract_section_number(section.section_title or "") # 无法解析编号时保持兼容,继续参与生成。 if not section_no: filtered.append(section) continue # 仅生成叶子节:若存在任一后续子节(前缀匹配 x.y.z ...),则当前节跳过。 has_children = any(no.startswith(f"{section_no}.") for no in section_nos) if not has_children: filtered.append(section) return filtered def _extract_tokens(text: str) -> list[str]: src = str(text or "") zh = re.findall(r"[\u4e00-\u9fa5]{2,8}", src) en = re.findall(r"[A-Za-z]{3,20}", src.lower()) raw = zh + en out: list[str] = [] seen = set() for t in raw: if t in seen: 
continue seen.add(t) out.append(t) return out def _fmt_dt(dt: Optional[datetime]) -> Optional[str]: if not dt: return None return dt.strftime("%Y-%m-%d %H:%M:%S") def _select_chapter_example(section_title: str, raw_examples: Optional[str], evidence: dict) -> str: # 1.2 对“叙事+证据锚点”敏感,宁夏石化类示例易带偏叙述重点,故不使用章节示例 if _extract_section_number(section_title or "") == "1.2": return "" # 2.1.1 须严格对齐《模版.doc》两张原料表,章节示例中的其他项目表结构易干扰 if _extract_section_number(section_title or "") == "2.1.1": return "" # 5.1 表5-1 由合同与要素直出约束;第5章样例为简化「指标|可研值|后评价值」表,易与要素表重复输出 if _extract_section_number(section_title or "") == "5.1": return "" # 5.2.1 表5-2/表5-3 由输出合同与要素直出约束;第5章共用样例曾误标「表5-2 同类对标」易带偏表题 if _extract_section_number(section_title or "") == "5.2.1": return "" # 5.3.1 仅允许要素直出表5-4;第5章样例含「指标|可研值|后评价值」简表易诱发重复表 if _extract_section_number(section_title or "") == "5.3.1": return "" # 5.4 仅允许表5-7;第5章样例(表5-1 可研/后评对比)易诱发重复简表 if _extract_section_number(section_title or "") == "5.4": return "" samples = _parse_examples(raw_examples) if not samples: return "" title = str(section_title or "") has_table_need = ("表" in title) or _evidence_has_table_signal(evidence) scored: list[tuple[int, str]] = [] for s in samples: score = 0 txt = s.lower() if has_table_need and ("|" in s or "表" in s): score += 4 if any(k in txt for k in ("万元", "亿元", "投资", "收益")) and ("投资" in title or "财务" in title): score += 3 if any(k in txt for k in ("环保", "安全", "排放")) and ("影响" in title or "持续" in title): score += 3 if any(k in txt for k in ("结论", "建议", "经验")) and ("结论" in title or "综合" in title): score += 3 score += min(len(s) // 300, 2) scored.append((score, s)) scored.sort(key=lambda x: x[0], reverse=True) return scored[0][1] def _parse_examples(raw_examples: Optional[str]) -> list[str]: text = str(raw_examples or "").strip() if not text: return [] # 支持 JSON 数组格式:["示例1","示例2","示例3"] if text.startswith("[") and text.endswith("]"): try: arr = json.loads(text) if isinstance(arr, list): out = [str(x).strip() for x in arr if 
str(x).strip()] return out[:3] except Exception: pass # 支持分隔符:---EXAMPLE--- 或 \n\n====\n\n for sep in ("\n---EXAMPLE---\n", "\n====\n"): if sep in text: return [x.strip() for x in text.split(sep) if x.strip()][:3] # 兼容“示例1/示例2/示例3”文本段 blocks = re.split(r"\n\s*示例\s*[1-3][::]\s*", "\n" + text) blocks = [b.strip() for b in blocks if b.strip()] if len(blocks) >= 2: return blocks[:3] return [text] def _evidence_has_table_signal(evidence: dict) -> bool: docs = evidence.get("chapterDocs") if isinstance(evidence, dict) else [] if not isinstance(docs, list): return False for d in docs[:8]: if not isinstance(d, dict): continue content = str(d.get("content") or "") if "|" in content or "表" in content[:200]: return True return False def _has_conflict_terms(report: str) -> bool: pairs = [ ("增加", "下降"), ("达标", "未达标"), ("盈利", "亏损"), ("改善", "恶化"), ] for a, b in pairs: if a in report and b in report: return True return False def _resolve_project(db: Session, project_id: str) -> Optional[Project]: if not project_id: return None p = db.query(Project).filter(Project.uuid == project_id).first() if p: return p try: pid = int(project_id) except Exception: return None return db.query(Project).filter(Project.id == pid).first()