# report_generation/services/report_generation_service.py

from __future__ import annotations

import json
import logging
import re
import threading
from difflib import SequenceMatcher
import unicodedata
import uuid
from datetime import datetime
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Optional

from fastapi import HTTPException

logger = logging.getLogger(__name__)

# ── Runtime prompt dumps ─────────────────────────────────────────────────────
# Both roots are resolved relative to this file: the directory two levels above
# it, followed by "comp/runtime" (prompt dumps) or "comp/report_outputs"
# (final per-chapter JSON dumps).
_PROMPT_DUMP_ROOT = Path(__file__).resolve().parent.parent / "comp" / "runtime"
_REPORT_OUTPUT_DUMP_ROOT = Path(__file__).resolve().parent.parent / "comp" / "report_outputs"


def _safe_markdown_filename(name: str, fallback: str = "section") -> str:
    safe = re.sub(r'[\\/:*?"<>|]', "_", str(name or "").strip())
    safe = re.sub(r"\s+", " ", safe).strip(" ._")
    return safe[:120] or fallback


def _dump_runtime_prompt(
    job_id: str,
    section_key: str,
    section_title: str,
    system_prompt: str,
    user_prompt: str,
) -> None:
    """Persist the exact prompts sent to the LLM for one section.

    The system and user prompts (evidence included) are written to
    ``<_PROMPT_DUMP_ROOT>/<job_id>/<section_key>.md``. This is best effort:
    any failure is logged as a warning and swallowed.
    """
    try:
        target_dir = _PROMPT_DUMP_ROOT / job_id
        target_dir.mkdir(parents=True, exist_ok=True)
        # Reserved path characters in the section key are replaced for the file name only.
        file_stem = re.sub(r'[\\/:*?"<>|]', "_", section_key)
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        pieces = [
            f"# {section_title}\n\n",
            f"> job_id: `{job_id}`  \n",
            f"> section_key: `{section_key}`  \n",
            f"> 生成时间: {stamp}\n\n",
            "---\n\n",
            "## System Prompt\n\n",
            f"```\n{system_prompt}\n```\n\n",
            "---\n\n",
            "## User Prompt\n\n",
            f"```\n{user_prompt}\n```\n",
        ]
        (target_dir / f"{file_stem}.md").write_text("".join(pieces), encoding="utf-8")
    except Exception as exc:
        logger.warning("dump runtime prompt failed: %s", exc)


def _dump_report_chapter_json_markdown(
    *,
    job_id: str,
    section_key: str,
    section_title: str,
    output_json: dict[str, Any],
) -> Optional[str]:
    """Write a chapter's final JSON output to a markdown file.

    The file is ``<_REPORT_OUTPUT_DUMP_ROOT>/<job_id>/<safe title>.md``; the
    file name is derived from ``section_title`` and falls back to the
    sanitized ``section_key``. An existing file is overwritten.

    Returns:
        The written file path as a string, or ``None`` when writing failed
        (the failure is logged as a warning, never raised).
    """
    try:
        target_dir = _REPORT_OUTPUT_DUMP_ROOT / job_id
        target_dir.mkdir(parents=True, exist_ok=True)
        key_based_name = _safe_markdown_filename(section_key)
        file_stem = _safe_markdown_filename(section_title, fallback=key_based_name)
        target_path = target_dir / f"{file_stem}.md"
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # default=str keeps the dump from failing on non-JSON-native values.
        payload_text = json.dumps(output_json or {}, ensure_ascii=False, indent=2, default=str)
        pieces = [
            f"# {section_title}\n\n",
            f"> job_id: `{job_id}`  \n",
            f"> section_key: `{section_key}`  \n",
            f"> 写入时间: {stamp}\n\n",
            "```json\n",
            f"{payload_text}\n",
            "```\n",
        ]
        target_path.write_text("".join(pieces), encoding="utf-8")
    except Exception as exc:
        logger.warning("dump report chapter json markdown failed: %s", exc)
        return None
    return str(target_path)
from sqlalchemy.orm import Session

from database import SessionLocal
from database.models import (
    ElementCell,
    ElementTable,
    Project,
    KbDocument,
    ReportGenerationChapter,
    ReportGenerationJob,
    ReportTemplate,
    ReportTemplateSection,
)
from schemas.write import (
    GenerateReportJobItem,
    GenerateReportChapterItem,
    GenerateReportResult,
    GenerateReportResultChapter,
)
from services.llm_client import chat_completions_json
from services.report_prompt_service import (
    build_report_chapter_prompt,
    build_repair_missing_tables_prompt,
    build_table_format_repair_prompt,
    chapter_generation_system_prompt,
    repair_missing_tables_system_prompt,
    table_format_repair_system_prompt,
)
from services.retrieval_service import RetrievalService
from config import settings
from services.appendix_figure_extraction import (
    appendix_figure_markdown_images,
    extract_appendix_figure_candidates_from_docx,
    merge_best_appendix_figures,
)
from services.kb_service import _kb_doc_absolute_file_path_for_model
from services.report_runtime_store import (
    append_chapter_content,
    get_job_state,
    init_job_state,
    set_chapter_stream_phase,
    update_chapter_state,
    update_job_state,
)
from services.standard_elements_2020 import (
    CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP,
    MULTI_COLUMN_GLOBAL_SPECS,
    APPENDIX2_CANONICAL_ROW_ORDER,
    APPENDIX2_LEGACY_ROW_KEY_MAP,
    APPENDIX8_LEGACY_ROW_KEY_MAP,
    TABLE_5_3_ROW_KEY_ALTERNATES,
    canonical_row_order_for_table,
    TABLE_7_1_COLUMN_KEYS,
    TABLE_7_1_ROW_CELL_DEFAULTS,
    TABLE_7_1_SCORING_TABLE_NAME,
    global_table_row_keys,
    section_table_row_keys,
    time_table_default_columns_for_name,
)
from prompts.report_generation.section_output_contracts import (
    DEFAULT_SECTION_OUTPUT_CONTRACT,
    SECTION_OUTPUT_CONTRACTS,
)
from prompts.report_generation.heading_rules import (
    DEFAULT_HEADING_RULE,
    SECTION_HEADING_RULES,
)
from prompts.report_generation.appendix_templates import (
    APPENDIX8_PARAMETER_COMPARISON_TABLE,
    APPENDIX_FIGURE_TARGETS,
    MINIMAL_MISSING_TABLE_TEMPLATE,
    missing_child_heading_markdown,
)

# NOTE(review): presumably the age (in seconds) after which a "running" chapter
# is treated as stalled — confirm against _recover_stalled_job.
RUNNING_CHAPTER_STALE_SECONDS = 180

# When several historical/alias tables share one table number, prefer the one
# whose name contains the standard-name keywords for that number, to avoid
# picking the wrong table.
_TABLE_TOKEN_PREFERRED_NAME_HINTS: dict[str, tuple[str, ...]] = {
    "表2-5": ("总图、储运、公用工程及辅助工程对比",),
    "表2-6": ("储运、公用工程及辅助工程依托对比", "依托"),
    "表3-3": ("施工图设计变更情况", "全厂性项目"),
    "表3-4": ("施工图设计变更情况", "单装置项目"),
    "表3-5": ("影响投资或工期", "重大设计变更"),
    "表5-4": ("生产经营及效益情况对比表",),
    "表5-5": ("主要生产经营指标",),
    "表5-6": ("不同因素变化对项目内部收益率的影响",),
    "表5-7": ("内部收益率为基准收益率时不确定因素临界点或临界值",),
}

# Table 5-4 column keys look like "可研报告|××年#1". They must be kept apart from
# appendix time slots and must not go through the "可研报告" prefix row split,
# otherwise a misaligned header such as "可研报告-|××年#1" is produced.
_TABLE54_PIPE_METRIC_PREFIXES = frozenset(
    {"可研报告", "可研值", "实际值", "增减（%）", "增减", "指标"}
)
# Columns that leaked in from table 5-1 and similar tables and are not part of
# the table 5-4 layout; they are dropped when the table is rendered directly.
_TABLE54_DROP_COL_KEYS = frozenset({"后评价值", "后评价报告"})
# BOM (U+FEFF) and zero-width characters U+200B..U+200D.
_TABLE54_INVISIBLE_RE = re.compile(r"[\ufeff\u200b-\u200d]")


def _table54_ck_norm(ck: str) -> str:
    """Normalize a column key: NFKC-fold, strip whitespace, drop BOM/zero-width chars.

    This makes variants such as a full-width "unit" comparable with the plain form.
    """
    folded = unicodedata.normalize("NFKC", str(ck or ""))
    return _TABLE54_INVISIBLE_RE.sub("", folded.strip())


def _is_table54_operating_benefit(table_name: str) -> bool:
    tn = str(table_name or "").strip()
    return "表5-4" in tn and "生产经营及效益情况对比表" in tn


def _element_table_collect_score(db: Session, table: ElementTable, token: str) -> int:
    """Score an element table as a candidate for table 5-4 during report generation.

    Returns 0 when ``token`` does not match 表5-4. Otherwise time-type tables
    and names containing 「生产经营及效益」 score higher, and up to 48 non-empty
    cells are sampled: pipe-style comparison column keys and 「·」 row keys add
    points, while 「年份未识别」 in either key subtracts points. This steers
    selection away from the simplified extracted table.
    """
    if not _table_token_matches_name(token, "表5-4"):
        return 0

    total = 200 if str(table.table_type or "").strip() == "time" else 0
    if "生产经营及效益" in str(table.table_name or ""):
        total += 40

    # Only cells that actually hold a value are sampled.
    populated_filters = (
        ElementCell.table_id == table.id,
        ElementCell.value.isnot(None),
        ElementCell.value != "",
    )
    sampled_keys = (
        db.query(ElementCell.row_key, ElementCell.col_key)
        .filter(*populated_filters)
        .limit(48)
        .all()
    )

    compare_markers = ("可研报告", "实际值", "增减")
    unrecognised_year = "年份未识别"
    for row_key, col_key in sampled_keys:
        row_text = str(row_key or "")
        col_text = str(col_key or "")
        if "|" in col_text and any(marker in col_text for marker in compare_markers):
            total += 8
        if "·" in row_text:
            total += 2
        if unrecognised_year in row_text or unrecognised_year in col_text:
            total -= 40
    return total


def _pick_table54_year_markdown(
    year_items: list[tuple[str, str]],
    *,
    table_year: int | None = None,
) -> tuple[str, str] | None:
    """多张按年拆分的表5-4 取评价年（优先 element_tables.year / 2019）且表体最完整的一张。"""
    if not year_items:
        return None
    if len(year_items) == 1:
        return year_items[0]

    def _item_score(item: tuple[str, str]) -> int:
        disp, md = item
        sc = 0
        if table_year is not None and str(table_year) in str(disp):
            sc += 120
        if "2019" in str(disp) or re.search(r"2019\s*年", md[:800]):
            sc += 80
        if "可研报告" in md and "实际值" in md:
            sc += 70
        if "增减" in md:
            sc += 25
        if "运行情况·" in md or "主要经济指标·" in md:
            sc += 35
        if "主要经济指标-" in md and "可研报告" not in md:
            sc -= 60
        sc += min(md.count("\n|"), 60)
        return sc

    return max(year_items, key=_item_score)


def _score_structured_table_hit_dict(hit: dict) -> int:
    """Score a structuredTables entry so a full table 5-4 comparison beats a short extract.

    Non-dict input, an empty ``markdown`` field, or a body recognised as the
    simplified extract all score 0.
    """
    if not isinstance(hit, dict):
        return 0
    body = str(hit.get("markdown") or "")
    if not body or _is_table54_simplified_extract_body(body):
        return 0

    # Row count contributes at most 80 points.
    points = min(body.count("\n|"), 80)
    has_feasibility = "可研报告" in body
    if has_feasibility and "实际值" in body:
        points += 90
    if "增减" in body:
        points += 25
    if "运行情况·" in body or "主要经济指标·" in body:
        points += 40
    if "主要经济指标-" in body and not has_feasibility:
        points -= 70
    return points


def _table54_body_preceded_by_element_source(text_before: str, *, max_chars: int = 600) -> bool:
    """Tell whether the text right before a table body carries the element-source note.

    Only the last ``max_chars`` characters of ``text_before`` are inspected.
    The result is True when the marker 「表格来源：要素管理」 is present and no
    pipe-table line is found in the inspected chunk after its last occurrence.
    Callers then treat the table as the authoritative table 5-4.

    NOTE(review): ``split("\\n", 8)[-1]`` keeps only the text after the eighth
    newline following the marker (or just the last line when there are fewer
    newlines) — confirm this window is the intended one.
    """
    marker = "表格来源：要素管理"
    window = str(text_before or "")[-max_chars:]
    if marker not in window:
        return False

    trailing = window.rpartition(marker)[2]
    chunk = trailing.split("\n", 8)[-1]
    for line in chunk.splitlines()[:6]:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("|") or _is_pipe_markdown_table_row_line(line):
            return False
    return True


def _is_table54_simplified_extract_body(block: str) -> bool:
    """
    Detect the short table 5-4 produced by extraction / the LLM.

    Such a table has a single column like 「2019年实际值」 plus a few
    「主要经济指标·」 rows, and lacks the 「可研报告|…」 / 「增减」 comparison
    structure. Returns False for empty input, input without any ``|``, or
    when no header key can be extracted.
    """
    md = str(block or "").strip()
    if not md or "|" not in md:
        return False
    # Header key with all whitespace removed, lower-cased, and cut at the first "<br>".
    hdr = re.sub(r"\s+", "", _extract_table_header_key(md)).lower()
    hdr = re.sub(r"<br>.*", "", hdr, flags=re.IGNORECASE)
    if not hdr:
        return False
    # Columns that do not belong to the table 5-4 layout mark the body as simplified.
    if "后评价值" in hdr or ("可研值" in hdr and "项目" not in hdr and "运行情况" not in md):
        return True
    # ("实际" in hdr) already covers ("实际值" in hdr); the first test is redundant.
    has_compare_cols = ("可研报告" in hdr or ("可研" in hdr and "增减" in hdr)) and (
        "实际值" in hdr or "实际" in hdr
    )
    # A comparison header with operating rows or at least 12 pipe lines is a full table.
    if has_compare_cols and ("运行情况·" in md or md.count("\n|") >= 12):
        return False
    # The second pattern is a special case of the first (hdr has no whitespace left).
    single_actual_year = bool(
        re.search(r"\d{4}\s*年\s*实际值", hdr) or re.search(r"\d{4}年实际值", hdr)
    )
    if single_actual_year and "可研" not in hdr and "增减" not in hdr:
        if "主要经济指标" in md or "主要经济指标-" in md:
            return True
    if "主要经济指标-" in md and "可研报告" not in md and "增减" not in md:
        return True
    # Data/header rows only; separator rows are not counted.
    pipe_rows = [
        ln
        for ln in md.splitlines()
        if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln)
    ]
    # Very short tables with indicator rows and no comparison content are simplified too.
    if (
        len(pipe_rows) <= 5
        and "主要经济指标" in md
        and "可研报告" not in md
        and "运行情况·" not in md
    ):
        return True
    return False


def _reorder_table54_col_order(col_order: list[str]) -> list[str]:
    """Return table 5-4 column keys with the unit column first and metrics sorted.

    Keys are normalized; empty keys, English ``unit`` / ``__unit__`` keys and
    columns leaked from table 5-1 are dropped; 「指标单位」 is renamed to
    「单位」; duplicates keep their first position. Pipe-style metric keys are
    ordered by their slot text, then feasibility → actual → delta; keys
    without a pipe go last. A 「单位」 column is added when none is present.
    """
    cleaned: list[str] = []
    for raw_key in col_order:
        key = _table54_ck_norm(raw_key)
        if not key:
            continue
        if key.lower() in ("unit", "__unit__") or key in _TABLE54_DROP_COL_KEYS:
            continue
        cleaned.append("单位" if key == "指标单位" else key)

    # dict.fromkeys keeps first-seen order while removing duplicates.
    unique_keys = list(dict.fromkeys(cleaned))
    unit_cols = [key for key in unique_keys if key == "单位"]
    metric_cols = [key for key in unique_keys if key != "单位"]

    fixed_ranks = {"可研报告": 0, "指标": 0, "可研值": 0, "实际值": 1}

    def _group_rank(group: str) -> int:
        name = group.strip()
        if name in fixed_ranks:
            return fixed_ranks[name]
        return 2 if name.startswith("增减") else 9

    def _order_key(col_key: str) -> tuple[str, int, str]:
        group, separator, slot = col_key.partition("|")
        if not separator:
            # Keys without a slot sort after every slotted key.
            return ("\xff", 99, col_key)
        return (slot.strip(), _group_rank(group), col_key)

    metric_cols.sort(key=_order_key)
    return (unit_cols or ["单位"]) + metric_cols


def _table54_rekey_latest_col_keys(latest: dict[tuple[str, str], str]) -> None:
    """Re-key ``latest`` in place so column keys use the same normalization as the column order.

    Without this, keys such as 「增减（%）」 and 「增减(%)」 would not line up.
    When two entries collapse onto one key, a real value wins over an empty
    value or the 「待补充」 placeholder; between two real values the later
    one wins.
    """
    placeholder = "待补充"

    def _choose(existing: str, incoming: str) -> str:
        incoming_text = str(incoming or "").strip()
        if incoming_text not in ("", placeholder):
            return str(incoming)
        existing_text = str(existing or "").strip()
        if existing_text not in ("", placeholder):
            return str(existing)
        return incoming_text or existing_text or ""

    rebuilt: dict[tuple[str, str], str] = {}
    for (row_key, col_key), value in list(latest.items()):
        target = (str(row_key), _table54_ck_norm(str(col_key)))
        if target in rebuilt:
            rebuilt[target] = _choose(rebuilt[target], value)
        else:
            rebuilt[target] = str(value or "")

    latest.clear()
    latest.update(rebuilt)


def _table54_coalesce_legacy_bare_metric_cols(
    latest: dict[tuple[str, str], str], row_order: list[str]
) -> None:
    """
    要素管理常见：数据写在裸列「可研报告/实际值/增减（%）」，
    列定义仍为「可研报告|××年#1」等；合并到槽位列以便与 UI 一致。
    """
    slot_map = {
        "可研报告": "可研报告|××年#1",
        "实际值": "实际值|××年#1",
        "增减（%）": "增减（%）|××年#1",
        "增减(%)": "增减（%）|××年#1",
    }

    def _prefer(a: str, b: str) -> str:
        sa, sb = str(a or "").strip(), str(b or "").strip()
        if sa and sa != "待补充":
            return sa
        if sb and sb != "待补充":
            return sb
        return sa or sb

    for rk in row_order:
        for bare, slot in slot_map.items():
            merged = _prefer(latest.get((rk, slot), ""), latest.get((rk, bare), ""))
            if merged:
                latest[(rk, slot)] = merged


def _table54_merge_year_cells_for_table_year(
    year_cells: dict[int | None, list],
    *,
    table_year: int | None,
) -> tuple[dict[int | None, list], list[int]]:
    """
    表5-4：同一张 element_tables（year=2019）下多数格子 element_cells.year 为空，
    须与 year=2019 的少量格子合并后再渲染，否则只剩单列「实际值」简表。
    """
    if table_year is None or int(table_year) <= 0:
        real = sorted(y for y in year_cells if y is not None)
        return year_cells, real
    ty = int(table_year)
    merged: list = list(year_cells.get(None, []))
    for cy in sorted(y for y in year_cells if y is not None):
        if cy == ty:
            merged.extend(year_cells.get(cy, []))
    if not merged:
        return year_cells, sorted(y for y in year_cells if y is not None)
    return {ty: merged}, [ty]


def _table54_remap_indicator_unit_latest(latest: dict[tuple[str, str], str]) -> None:
    """将历史列键「指标单位」的值并入「单位」，避免列键规范为「单位」后取不到数。"""
    touched: list[tuple[str, str]] = []
    for (rk, ck), v in list(latest.items()):
        if str(ck) != "指标单位":
            continue
        rk_s = str(rk)
        k_unit = (rk_s, "单位")
        cur = str(latest.get(k_unit, "") or "").strip()
        nv = str(v or "").strip()
        if nv and (not cur or cur == "待补充"):
            latest[k_unit] = v
        elif not cur:
            latest[k_unit] = v
        touched.append((rk_s, str(ck)))
    for pair in touched:
        latest.pop(pair, None)


# Slot tail made of a four-digit year, 「年」, and an optional "#n" suffix,
# e.g. "2019年" or "2019年#1"; group 1 captures the year digits.
_TABLE54_SLOT_YEAR_RE = re.compile(r"^(\d{4})年(?:#\d+)?$")


def _norm_table54_placeholder_year_tail(tail: str) -> str:
    buf: list[str] = []
    for ch in (tail or "").strip():
        if ch in "xXｘＸ":
            buf.append("×")
        elif ch == "\u00d7":
            buf.append("×")
        else:
            buf.append(ch)
    return "".join(buf)


def _table54_placeholder_year_tail(tail: str) -> bool:
    """Return True when the key tail is a placeholder slot such as 「××年#n」.

    The original comment states this mirrors the front-end check
    ``isEmTable54YearSlotColKey``.
    """
    normalized = _norm_table54_placeholder_year_tail(tail)
    matched = re.fullmatch(r"×{2}年(?:#\d+)?", normalized)
    return matched is not None


def _parse_real_year_from_table54_slot_tail(tail: str) -> int | None:
    """Extract a real calendar year from a table 5-4 column-key tail.

    Accepted forms are a bare 「2019」, 「2019年」, 「2019年#1」, or any tail
    that starts with four digits followed by 「年」. Placeholder tails
    (「××年#n」) and years outside 1900..2100 yield ``None``.

    The bare four-digit form was documented but previously rejected, because
    every pattern required the 「年」 suffix.
    """
    t = (tail or "").strip()
    if _table54_placeholder_year_tail(t):
        return None
    # All three patterns capture the same leading four digits, so one range check suffices.
    m = (
        _TABLE54_SLOT_YEAR_RE.fullmatch(t)
        or re.match(r"^(\d{4})年", t)
        or re.fullmatch(r"(\d{4})", t)
    )
    if m:
        y = int(m.group(1))
        if 1900 <= y <= 2100:
            return y
    return None


def _infer_time_column_year_for_table54(
    col_order: list[str],
    cells: list[Any],
    table_year: int | None,
) -> int | None:
    """Infer the calendar year behind the year columns of table 5-4.

    Sources are tried in order: the single positive ``year`` shared by the
    cells, a positive ``table_year``, a single year parsed from the
    「…|2019年」 slot tails, and finally a single four-digit year found in the
    column key text. Returns ``None`` when none of them is unambiguous; the
    header then falls back to 「某年」.
    """
    cell_years: set[int] = set()
    for cell in cells:
        if getattr(cell, "year", None) is None:
            continue
        if int(cell.year) > 0:
            cell_years.add(int(cell.year))
    if len(cell_years) == 1:
        return next(iter(cell_years))

    if table_year is not None and int(table_year) > 0:
        return int(table_year)

    slot_years: set[int] = set()
    for col_key in col_order:
        key_text = str(col_key or "").strip()
        _, separator, slot_tail = key_text.partition("|")
        if not separator:
            continue
        slot_year = _parse_real_year_from_table54_slot_tail(slot_tail.strip())
        if slot_year is not None:
            slot_years.add(slot_year)
    if len(slot_years) == 1:
        return next(iter(slot_years))

    # Bare column names may embed the year (e.g. "2019年可研报告"); with several
    # different years present, no year is picked arbitrarily.
    embedded_years: set[int] = set()
    for col_key in col_order:
        found = re.search(r"(19|20)\d{2}", str(col_key or ""))
        if found is None:
            continue
        value = int(found.group(0))
        if 1900 <= value <= 2100:
            embedded_years.add(value)
    if len(embedded_years) == 1:
        return next(iter(embedded_years))
    return None


def _table54_year_label_prefix(time_column_year: int | None) -> str:
    if time_column_year is not None and 1900 <= int(time_column_year) <= 2100:
        return f"{int(time_column_year)}年"
    return "某年"


def _table54_year_prefix_for_slot_tail(tail: str, *, time_column_year: int | None) -> str:
    """Build the 「{year}」 part of a header such as 「{year}可研报告」.

    A year carried by the key tail wins; placeholder tails use the inferred
    ``time_column_year``; an empty tail gives 「某年」; any other tail is used
    as text, with 「年」 appended when missing.
    """
    parsed_year = _parse_real_year_from_table54_slot_tail(tail)
    if parsed_year is not None:
        return f"{parsed_year}年"
    if _table54_placeholder_year_tail(tail):
        return _table54_year_label_prefix(time_column_year)

    text = (tail or "").strip()
    if not text:
        return "某年"
    if re.match(r"^\d{4}年", text):
        # Drop a trailing "#n" slot index.
        return text.partition("#")[0]
    if text.endswith("年"):
        return text
    return f"{text}年"


def _table54_bare_metric_header_label(col_key: str, *, time_column_year: int | None) -> str | None:
    """Header label for a metric column key that has no 「metric|slot」 structure.

    Recognised keys get the evaluation-year prefix; ``None`` is returned for
    empty or unrecognised keys.
    """
    key = str(col_key or "").strip()
    if not key:
        return None

    year_prefix = _table54_year_label_prefix(time_column_year)
    if key in ("可研值", "可研报告"):
        return f"{year_prefix}可研报告"
    if key == "实际值":
        return f"{year_prefix}实际值"
    if key.startswith("增减"):
        # Keep whatever follows 「增减」, e.g. a "（%）" suffix.
        suffix = key[len("增减") :]
        return f"{year_prefix}增减{suffix}"
    return None


def _table54_markdown_header_labels(
    col_order: list[str],
    *,
    time_column_year: int | None = None,
) -> list[str]:
    """Flatten table 5-4 column keys into markdown header labels.

    Produces 「单位」, 「{year}可研报告」, 「{year}实际值」 and 「{year}增减…」;
    the year comes from the column key or from ``time_column_year``. Keys
    that are not recognised are emitted as-is, with ``|`` replaced by the
    full-width 「｜」 so they cannot break the markdown table.
    """
    feasibility_groups = ("可研报告", "指标", "可研值")

    def _label_for(col_key: str) -> str:
        key = str(col_key).strip()
        if key == "单位":
            return "单位"
        escaped = key.replace("|", "｜")

        if "|" not in key:
            bare_label = _table54_bare_metric_header_label(key, time_column_year=time_column_year)
            return escaped if bare_label is None else bare_label

        group, slot = (part.strip() for part in key.split("|", 1))
        group_norm = _table54_ck_norm(group)
        is_metric_group = (
            group in _TABLE54_PIPE_METRIC_PREFIXES
            or group_norm in _TABLE54_PIPE_METRIC_PREFIXES
        )
        if not is_metric_group or not slot:
            return escaped

        year_prefix = _table54_year_prefix_for_slot_tail(slot, time_column_year=time_column_year)
        if group_norm in feasibility_groups or group in feasibility_groups:
            return f"{year_prefix}可研报告"
        if group_norm == "实际值" or group == "实际值":
            return f"{year_prefix}实际值"
        if group_norm.startswith("增减") or group.startswith("增减"):
            # The suffix is taken from the raw (non-normalized) group text.
            suffix = group[len("增减") :]
            return f"{year_prefix}增减{suffix}"
        return escaped

    return [_label_for(col_key) for col_key in col_order]


def create_report_job(
    project_id: str,
    db: Session,
    *,
    template_id: Optional[str] = None,
    top_k: int = 10,
    requested_by: Optional[str] = None,
) -> GenerateReportJobItem:
    """Create a report generation job with one pending chapter per template section.

    The job and its chapter rows are committed, the in-memory runtime state is
    initialised, a background worker is started, and the job is returned in
    the same shape as ``get_report_job``.

    Args:
        project_id: Identifier passed to ``_resolve_project``.
        db: Active SQLAlchemy session.
        template_id: Optional template identifier passed to ``_resolve_template``.
        top_k: Stored under ``options["topK"]``; a falsy value becomes 10 and
            the result is clamped to the range 5..20.
        requested_by: Optional requester stored on the job.

    Raises:
        HTTPException: 404 when the project is not found; 400 when the
            template yields no sections to generate.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")
    # NOTE(review): the template is used without a None check — presumably
    # _resolve_template raises when nothing matches; confirm.
    template = _resolve_template(db, template_id)
    sections = _sections_for_generation(_list_template_sections(db, template.id))
    if not sections:
        raise HTTPException(status_code=400, detail="模板未配置章节")

    now = datetime.now()
    job = ReportGenerationJob(
        id=uuid.uuid4().hex,
        project_id=project.uuid,
        template_id=template.id,
        status="pending",
        progress=0,
        requested_by=requested_by,
        options={"topK": max(5, min(int(top_k or 10), 20))},
        created_at=now,
        updated_at=now,
    )
    db.add(job)
    # Flush the parent job into the current transaction first so the chapter
    # inserts below satisfy the foreign-key constraint.
    db.flush()
    for s in sections:
        db.add(
            ReportGenerationChapter(
                id=uuid.uuid4().hex,
                job_id=job.id,
                section_key=s.section_key,
                section_title=s.section_title,
                section_order=s.section_order,
                status="pending",
                created_at=now,
                updated_at=now,
            )
        )
    db.commit()
    # Runtime state and the worker are set up only after the rows are committed.
    init_job_state(
        job_id=job.id,
        project_id=project.uuid,
        template_id=template.id,
        chapters=[
            {
                "sectionKey": s.section_key,
                "sectionTitle": s.section_title,
                "sectionOrder": s.section_order,
                "status": "pending",
            }
            for s in sections
        ],
    )
    _start_job_worker(job.id)
    return get_report_job(project.uuid, job.id, db)


def get_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem:
    """Return the status of a job and its chapters, preferring live runtime state.

    Every field is taken from the in-memory runtime state when it holds a
    truthy value and falls back to the persisted row otherwise.

    Raises:
        HTTPException: 404 when the project or the job (within that project)
            does not exist.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    job_query = db.query(ReportGenerationJob).filter(
        ReportGenerationJob.id == job_id,
        ReportGenerationJob.project_id == project.uuid,
    )
    job = job_query.first()
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")

    _recover_stalled_job(db, job)
    chapter_rows = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .order_by(ReportGenerationChapter.section_order.asc())
        .all()
    )

    runtime_state = get_job_state(job.id)
    live = runtime_state or {}
    live_chapters = (live.get("chapters") or {}) if isinstance(runtime_state, dict) else {}

    def _chapter_item(row: ReportGenerationChapter) -> GenerateReportChapterItem:
        overlay = live_chapters.get(row.section_key) or {}
        return GenerateReportChapterItem(
            sectionKey=row.section_key,
            sectionTitle=row.section_title,
            sectionOrder=row.section_order,
            status=overlay.get("status") or row.status,
            updatedAt=overlay.get("updatedAt") or _fmt_dt(row.updated_at),
            errorMessage=overlay.get("errorMessage") or row.error_message,
        )

    return GenerateReportJobItem(
        jobId=job.id,
        projectId=job.project_id,
        templateId=job.template_id,
        status=live.get("status") or job.status,
        progress=int(live.get("progress") or job.progress or 0),
        currentSectionKey=live.get("currentSectionKey") or job.current_section_key,
        errorMessage=live.get("errorMessage") or job.error_message,
        createdAt=_fmt_dt(job.created_at),
        updatedAt=live.get("updatedAt") or _fmt_dt(job.updated_at),
        completedAt=live.get("completedAt") or _fmt_dt(job.completed_at),
        chapters=[_chapter_item(row) for row in chapter_rows],
    )


def get_report_result(
    project_id: str,
    job_id: str,
    db: Session,
    *,
    include_debug: bool = False,
) -> GenerateReportResult:
    """Assemble the full report text and per-chapter results for a job.

    Chapter content comes from the in-memory runtime state when one exists,
    otherwise from the persisted chapter rows. Appendices are then appended,
    table captions normalized, and a consistency check is run on the text.

    Raises:
        HTTPException: 404 when the project or the job (within that project)
            does not exist.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")

    job_query = db.query(ReportGenerationJob).filter(
        ReportGenerationJob.id == job_id,
        ReportGenerationJob.project_id == project.uuid,
    )
    job = job_query.first()
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")

    _recover_stalled_job(db, job)
    chapter_rows = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .order_by(ReportGenerationChapter.section_order.asc())
        .all()
    )
    chapter_title_map: dict[str, str] = (
        _build_section_title_map(_list_template_sections(db, job.template_id))
        if job.template_id
        else {}
    )

    runtime_state = get_job_state(job.id)
    # Live runtime content takes precedence over the persisted rows.
    if runtime_state:
        build_result, source = _build_live_result_from_runtime, runtime_state
    else:
        build_result, source = _build_live_result_from_chapters, chapter_rows
    report_text, chapter_items = build_result(
        source,
        include_debug=include_debug,
        chapter_title_map=chapter_title_map,
    )

    report_text = _append_report_appendices(db, project.uuid, report_text)
    report_text = _normalize_table_captions_in_markdown(report_text or "")
    consistency = _check_consistency(report_text or "", project.name)
    return GenerateReportResult(
        jobId=job.id,
        status=(runtime_state or {}).get("status") or job.status,
        report=report_text,
        consistency=consistency,
        chapters=chapter_items,
    )


def _build_live_result_from_chapters(
    chapters: list[ReportGenerationChapter],
    *,
    include_debug: bool = False,
    chapter_title_map: Optional[dict[str, str]] = None,
) -> tuple[str, list[GenerateReportResultChapter]]:
    """Build the report text and chapter results from persisted chapter rows.

    Non-empty content is cleaned (numeric line breaks, consecutive
    repetitions, table captions) and missing parent headings are injected
    using the previous chapter's body. Debug payloads are included only when
    ``include_debug`` is set.

    Returns:
        ``(report_text, chapter_items)``; the report text joins all non-empty
        chapter bodies with blank lines.
    """
    title_map = chapter_title_map or {}
    chapter_items: list[GenerateReportResultChapter] = []
    report_parts: list[str] = []
    previous_row: Optional[ReportGenerationChapter] = None

    for row in chapters:
        body = row.content
        if body:
            body = _fix_numeric_line_breaks(str(body).strip())
        if body:
            body, _ = _collapse_consecutive_text_repetitions(str(body).strip())
            body = _normalize_table_captions_in_markdown(str(body).strip())
            previous_body = ""
            if previous_row is not None and previous_row.content:
                previous_body = _fix_numeric_line_breaks(str(previous_row.content).strip())
            body = _inject_missing_parent_section_headings(
                str(row.section_title or ""),
                str(body).strip(),
                previous_body,
                title_map,
            )

        chapter_items.append(
            GenerateReportResultChapter(
                sectionKey=row.section_key,
                sectionTitle=row.section_title,
                sectionOrder=row.section_order,
                status=row.status,
                content=body,
                errorMessage=row.error_message,
                promptText=(row.prompt_text if include_debug else None),
                evidencePayload=(row.evidence_payload if include_debug else None),
                validationPayload=(row.validation_payload if include_debug else None),
            )
        )
        if body:
            report_parts.append(str(body).strip())
        previous_row = row

    joined = "\n\n".join(report_parts).strip()
    return _fix_numeric_line_breaks(joined), chapter_items


def _build_live_result_from_runtime(
    runtime_state: dict,
    *,
    include_debug: bool = False,
    chapter_title_map: Optional[dict[str, str]] = None,
) -> tuple[str, list[GenerateReportResultChapter]]:
    """Build the report text and chapter results from the in-memory runtime state.

    Chapters are ordered by ``sectionOrder``; entries that are not dicts are
    skipped. Content is cleaned the same way as for persisted chapters, and
    missing parent headings are injected using the body of the entry directly
    before the current one in that order.

    Returns:
        ``(report_text, chapter_items)``; the report text joins all non-empty
        chapter bodies with blank lines.
    """
    title_map = chapter_title_map or {}
    chapter_items: list[GenerateReportResultChapter] = []
    report_parts: list[str] = []

    ordered = sorted(
        ((runtime_state or {}).get("chapters") or {}).values(),
        key=lambda entry: int((entry or {}).get("sectionOrder") or 0),
    )
    # Pair every entry with its predecessor in the ordered list (None for the first).
    predecessors = [None, *ordered[:-1]]

    for previous, chapter in zip(predecessors, ordered):
        if not isinstance(chapter, dict):
            continue
        body = _fix_numeric_line_breaks(str(chapter.get("content") or "").strip())
        if body:
            body, _ = _collapse_consecutive_text_repetitions(str(body).strip())
            body = _normalize_table_captions_in_markdown(str(body).strip())
            previous_body = ""
            if isinstance(previous, dict):
                previous_body = _fix_numeric_line_breaks(
                    str(previous.get("content") or "").strip()
                )
            body = _inject_missing_parent_section_headings(
                str(chapter.get("sectionTitle") or ""),
                body,
                previous_body,
                title_map,
            )

        chapter_items.append(
            GenerateReportResultChapter(
                sectionKey=str(chapter.get("sectionKey") or ""),
                sectionTitle=str(chapter.get("sectionTitle") or ""),
                sectionOrder=int(chapter.get("sectionOrder") or 0),
                status=str(chapter.get("status") or "pending"),
                content=body or None,
                errorMessage=chapter.get("errorMessage"),
                promptText=(chapter.get("promptText") if include_debug else None),
                evidencePayload=(chapter.get("evidencePayload") if include_debug else None),
                validationPayload=(chapter.get("validationPayload") if include_debug else None),
            )
        )
        if body:
            report_parts.append(body)

    joined = "\n\n".join(report_parts).strip()
    return _fix_numeric_line_breaks(joined), chapter_items


def get_report_stream_snapshot(
    job_id: str,
    *,
    include_debug: bool = False,
) -> Optional[dict[str, Any]]:
    """Return a ``{"job": ..., "result": ...}`` snapshot built only from runtime state.

    A short-lived DB session is opened solely to load the template's section
    titles. The ``consistency`` list in the result is always empty here.

    Args:
        job_id: Identifier of the job whose runtime state is read.
        include_debug: When true, prompt/evidence/validation payloads are
            included in the chapter results.

    Returns:
        The snapshot dict, or ``None`` when no runtime state exists for the job.
    """
    runtime_state = get_job_state(job_id)
    if not runtime_state:
        return None
    chapter_title_map: dict[str, str] = {}
    template_id = runtime_state.get("templateId")
    if template_id:
        with SessionLocal() as db:
            chapter_title_map = _build_section_title_map(
                _list_template_sections(db, str(template_id))
            )
    report_text, chapter_items = _build_live_result_from_runtime(
        runtime_state,
        include_debug=include_debug,
        chapter_title_map=chapter_title_map,
    )
    # Skip non-dict entries, as the result builder does; previously a None
    # entry raised AttributeError while the job payload was being built.
    runtime_chapters = [
        c
        for c in ((runtime_state or {}).get("chapters") or {}).values()
        if isinstance(c, dict)
    ]
    runtime_chapters.sort(key=lambda x: int(x.get("sectionOrder") or 0))
    job_payload = {
        "jobId": runtime_state.get("jobId"),
        "projectId": runtime_state.get("projectId"),
        "templateId": runtime_state.get("templateId"),
        "status": runtime_state.get("status"),
        "progress": int(runtime_state.get("progress") or 0),
        "currentSectionKey": runtime_state.get("currentSectionKey"),
        "errorMessage": runtime_state.get("errorMessage"),
        "createdAt": runtime_state.get("createdAt"),
        "updatedAt": runtime_state.get("updatedAt"),
        "completedAt": runtime_state.get("completedAt"),
        "chapters": [
            {
                "sectionKey": str(c.get("sectionKey") or ""),
                "sectionTitle": str(c.get("sectionTitle") or ""),
                "sectionOrder": int(c.get("sectionOrder") or 0),
                "status": str(c.get("status") or "pending"),
                "updatedAt": c.get("updatedAt"),
                "errorMessage": c.get("errorMessage"),
            }
            for c in runtime_chapters
        ],
    }
    result_payload = {
        "jobId": runtime_state.get("jobId"),
        "status": runtime_state.get("status"),
        "report": report_text,
        "consistency": [],
        "chapters": [c.model_dump() for c in chapter_items],
    }
    return {
        "job": job_payload,
        "result": result_payload,
    }


def retry_report_chapter(project_id: str, job_id: str, section_key: str, db: Session) -> GenerateReportJobItem:
    """Reset one chapter of a job to ``pending`` and regenerate it in the background.

    The chapter row and the job row are updated and committed first, then the
    runtime state is reset to match, and finally a worker thread restricted to
    ``section_key`` is started.

    Raises:
        HTTPException: 404 when the project, the job (within that project) or
            the chapter (within that job) cannot be found.

    Returns:
        Whatever ``get_report_job`` reports for the job after the reset.
    """
    owner_project = _resolve_project(db, project_id)
    if not owner_project:
        raise HTTPException(status_code=404, detail="项目不存在")

    job_query = db.query(ReportGenerationJob).filter(
        ReportGenerationJob.id == job_id,
        ReportGenerationJob.project_id == owner_project.uuid,
    )
    target_job = job_query.first()
    if not target_job:
        raise HTTPException(status_code=404, detail="任务不存在")

    chapter_query = db.query(ReportGenerationChapter).filter(
        ReportGenerationChapter.job_id == target_job.id,
        ReportGenerationChapter.section_key == section_key,
    )
    target_chapter = chapter_query.first()
    if not target_chapter:
        raise HTTPException(status_code=404, detail="章节不存在")

    # Persist the reset before touching the runtime state or starting the worker.
    stamp = datetime.now()
    target_chapter.status = "pending"
    target_chapter.error_message = None
    target_chapter.updated_at = stamp
    target_job.status = "running"
    target_job.updated_at = stamp
    db.commit()

    update_job_state(
        target_job.id,
        status="running",
        errorMessage=None,
        completedAt=None,
    )
    # Every per-chapter runtime field produced by a previous run is cleared.
    cleared_fields = {
        "content": None,
        "errorMessage": None,
        "promptText": None,
        "evidencePayload": None,
        "validationPayload": None,
    }
    update_chapter_state(
        target_job.id,
        section_key,
        status="pending",
        **cleared_fields,
    )

    _start_job_worker(target_job.id, section_key=section_key)
    return get_report_job(owner_project.uuid, job_id, db)


def cancel_report_job(project_id: str, job_id: str, db: Session) -> GenerateReportJobItem:
    """Cancel a report-generation job on behalf of the user.

    A job that is already ``completed``, ``failed`` or ``cancelled`` is returned
    unchanged. Otherwise every ``pending``/``running`` chapter is marked
    ``failed`` with the cancel message, the job is marked ``cancelled``, the
    change is committed, and the runtime state is updated to match.

    Only chapters that carry the cancel marker are pushed to the runtime state.
    Chapters that failed earlier for another reason keep their own error
    message there instead of having it overwritten by the cancel message.

    Raises:
        HTTPException: 404 when the project or the job (within that project)
            cannot be found.

    Returns:
        Whatever ``get_report_job`` reports for the job afterwards.
    """
    project = _resolve_project(db, project_id)
    if not project:
        raise HTTPException(status_code=404, detail="项目不存在")
    job = (
        db.query(ReportGenerationJob)
        .filter(ReportGenerationJob.id == job_id, ReportGenerationJob.project_id == project.uuid)
        .first()
    )
    if not job:
        raise HTTPException(status_code=404, detail="任务不存在")

    # Terminal jobs are left untouched.
    if job.status in ("completed", "failed", "cancelled"):
        return get_report_job(project.uuid, job_id, db)

    cancel_message = "任务已由用户取消"
    now = datetime.now()

    chapters = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id)
        .all()
    )
    for c in chapters:
        if c.status in ("pending", "running"):
            c.status = "failed"
            c.error_message = cancel_message
            c.updated_at = now

    job.status = "cancelled"
    job.error_message = cancel_message
    job.current_section_key = None
    job.updated_at = now
    job.completed_at = now
    db.commit()
    update_job_state(
        job.id,
        status="cancelled",
        errorMessage=cancel_message,
        currentSectionKey=None,
        completedAt=_fmt_dt(now),
    )
    for c in chapters:
        # Sync only chapters stopped by a cancel; a chapter whose status is
        # "failed" with a different message failed for a real reason and must
        # keep that reason in the runtime state.
        if c.status == "cancelled" or c.error_message == cancel_message:
            update_chapter_state(
                job.id,
                c.section_key,
                status="failed",
                errorMessage=cancel_message,
            )
    return get_report_job(project.uuid, job_id, db)


def _start_job_worker(job_id: str, section_key: Optional[str] = None) -> None:
    """Run ``_run_job_worker(job_id, section_key)`` on a new daemon thread.

    The thread is named ``report-job-`` followed by the first 8 characters of
    ``job_id``.
    """
    worker = threading.Thread(
        name=f"report-job-{job_id[:8]}",
        target=_run_job_worker,
        args=(job_id, section_key),
        daemon=True,
    )
    worker.start()


def _run_job_worker(job_id: str, only_section_key: Optional[str] = None) -> None:
    """Generate the chapters of a report job; used as the worker-thread target.

    Opens its own ``SessionLocal`` session, marks the job ``running`` and then,
    for every section that still needs generating, collects evidence, builds
    the prompt, calls the model, post-processes the markdown, persists the
    chapter and mirrors the result into the runtime state.

    Args:
        job_id: Id of the ``ReportGenerationJob`` to process. The function
            returns silently when no such job exists.
        only_section_key: When given, only that section is (re)generated, even
            if it is already completed, and the job is not finalized — only its
            progress is refreshed. When omitted, already completed chapters are
            skipped and the job is marked ``completed`` at the end.

    The job status is re-read from the database before and after each chapter;
    if it has become ``cancelled`` the worker stops without further writes.
    Any exception marks the job ``failed`` (database and runtime state) and,
    when the job has a current section key, marks that chapter ``failed`` in
    the runtime state.
    """
    with SessionLocal() as db:
        job = db.query(ReportGenerationJob).filter(ReportGenerationJob.id == job_id).first()
        if not job:
            return
        try:
            job.status = "running"
            job.error_message = None
            job.updated_at = datetime.now()
            db.commit()
            update_job_state(job.id, status="running", errorMessage=None)

            project = db.query(Project).filter(Project.uuid == job.project_id).first()
            if not project:
                raise RuntimeError("项目不存在")
            template = _resolve_template(db, job.template_id)
            all_template_sections = _list_template_sections(db, template.id)
            sections = _sections_for_generation(all_template_sections)
            chapter_title_map = _build_section_title_map(all_template_sections)
            chapters = (
                db.query(ReportGenerationChapter)
                .filter(ReportGenerationChapter.job_id == job.id)
                .order_by(ReportGenerationChapter.section_order.asc())
                .all()
            )
            chapter_map = {c.section_key: c for c in chapters}
            # Non-empty content of chapters that are already completed; extended
            # below as chapters finish.
            completed_section_contents: dict[str, str] = {
                c.section_key: str(c.content or "").strip()
                for c in chapters
                if c.status == "completed" and str(c.content or "").strip()
            }
            target_sections = [s for s in sections if (not only_section_key or s.section_key == only_section_key)]
            retrieval = RetrievalService()
            top_k = int((job.options or {}).get("topK") or 10)
            completed_count = 0
            pending_sections = []
            for section in target_sections:
                chapter = chapter_map.get(section.section_key)
                if not chapter:
                    continue
                # On a full run, completed chapters are counted but not regenerated.
                if not only_section_key and chapter.status == "completed":
                    completed_count += 1
                    continue
                pending_sections.append(section)

            total_count = max(1, completed_count + len(pending_sections))
            logger.info(
                "报告生成 job start | job=%s | project=%s | total_sections=%d | pending=%d | completed=%d | top_k=%d",
                job.id, project.uuid, len(sections), len(pending_sections), completed_count, top_k,
            )
            for idx, section in enumerate(pending_sections, start=1):
                # Stop before starting a chapter if the job was cancelled meanwhile.
                db.refresh(job)
                if job.status == "cancelled":
                    return
                chapter = chapter_map.get(section.section_key)
                if not chapter:
                    continue

                section_no = _extract_section_number(section.section_title or "")
                logger.info(
                    "报告生成 start chapter | job=%s | section=%s | title=%s | section_no=%s | idx=%d/%d",
                    job.id, section.section_key, section.section_title, section_no, idx, total_count,
                )

                _update_chapter_status(db, job, chapter, "running", None)
                update_job_state(job.id, currentSectionKey=section.section_key)
                update_chapter_state(
                    job.id,
                    section.section_key,
                    status="running",
                    errorMessage=None,
                    content=None,
                    promptText=None,
                    evidencePayload=None,
                    validationPayload={"streamPhase": "waiting"},
                )
                required_tables = _extract_required_table_tokens(
                    section.section_prompt or "",
                    _extract_section_number(section.section_title or ""),
                    contract_text=_effective_section_output_contract(section),
                )
                # Section 5.3.2: "附表8" is removed from the required tables.
                if _extract_section_number(section.section_title or "") == "5.3.2":
                    na8 = _norm_table_token("附表8")
                    required_tables = [
                        t for t in required_tables if _norm_table_token(str(t)) != na8
                    ]
                evidence, retrieval_stage = _collect_evidence_progressive(
                    db,
                    retrieval,
                    project.uuid,
                    section,
                    top_k=top_k,
                    required_tables=required_tables,
                )
                prior_sibling_sections_text = _build_prior_sibling_sections_text(
                    section,
                    sections,
                    completed_section_contents,
                )
                section_reference = _load_section_reference_for_chapter(
                    db,
                    section.section_key,
                    section.section_title,
                    template_id=template.id,
                )
                logger.info(
                    "section_reference 注入 | section=%s | template_id=%s | 命中=%s",
                    section.section_key,
                    template.id,
                    "是" if section_reference else "否",
                )
                prompt = _build_chapter_prompt(
                    section,
                    evidence,
                    prior_sibling_sections_text=prior_sibling_sections_text,
                    section_reference=section_reference,
                )
                _dump_runtime_prompt(
                    job_id=job.id,
                    section_key=section.section_key,
                    section_title=section.section_title,
                    system_prompt=chapter_generation_system_prompt(),
                    user_prompt=prompt,
                )
                stream_state = {
                    "buffer": "",
                    "phase": "waiting",
                }

                def _on_content_delta(event: str, delta_text: str) -> None:
                    # Forward streamed text and phase changes to the runtime state
                    # of the chapter currently being generated.
                    if event == "delta":
                        if delta_text:
                            stream_state["phase"] = "streaming"
                            stream_state["buffer"] = str(stream_state.get("buffer") or "") + delta_text
                            append_chapter_content(
                                job.id,
                                section.section_key,
                                delta_text,
                                stream_phase="streaming",
                            )
                    elif event == "finalizing":
                        stream_state["phase"] = "finalizing"
                        set_chapter_stream_phase(job.id, section.section_key, "finalizing")

                content, validation, model_output = _generate_chapter_content(
                    section,
                    prompt,
                    on_content_delta=_on_content_delta,
                )
                content = _apply_canonical_field_backfill(section, evidence, content)
                _cur_section_no = _extract_section_number(section.section_title or "")
                # Section 2.1.1 is exempt from required-table enforcement.
                _skip_table_enforcement = _cur_section_no in {"2.1.1"}
                if _skip_table_enforcement:
                    remaining_missing_tables = []
                    content_after_tables = content
                else:
                    content, remaining_missing_tables = _enforce_required_tables(
                        section,
                        prompt,
                        content,
                        evidence,
                    )
                    content_after_tables = content
                content = _strip_tables_from_non_table_section(
                    section.section_title or "", content, section=section
                )
                content = _strip_forbidden_tables(
                    section.section_title or "", content,
                )
                content, format_issues = _enforce_template_format_contract(
                    section,
                    content,
                    evidence,
                    chapter_title_map=chapter_title_map,
                )
                # Table tokens to refresh depend on the section number; the
                # default is 表5-4.
                _sec_no = _extract_section_number(section.section_title or "")
                _refresh_tokens: tuple[str, ...] = ("表5-4",)
                if _sec_no == "5.1":
                    _refresh_tokens = ("表5-1",)
                elif _sec_no == "5.2.1":
                    _refresh_tokens = ("表5-2", "表5-3")
                content = _refresh_element_table_markdown_tokens(
                    content, evidence, _refresh_tokens
                )
                content = _strip_bracketed_three_part_labels(content)
                content = _strip_placeholder_table_notes(content)
                content = _normalize_table_captions_in_markdown(content)
                content = _strip_trailing_partial_missing_markers(content)
                content = _fix_numeric_line_breaks(content)
                content = _cleanup_section_table_artifacts(
                    section.section_title or "",
                    content,
                    allowed_table_tokens=required_tables,
                )
                if _sec_no == "5.3.1":
                    content = _refresh_element_table_markdown_tokens(
                        content, evidence, ("表5-4",)
                    )
                    content = _fill_required_table_caption_stubs(
                        content, ["表5-4"], evidence
                    )
                content = _strip_orphan_markdown_table_rows(content)
                content = _strip_minimal_missing_table_tail(content)
                content, intra_repeat_removed = _collapse_consecutive_text_repetitions(content)
                content, chapter_dedupe_removed = _dedupe_long_chapter_repetition(content)
                chapter_dedupe_removed += intra_repeat_removed
                if chapter_dedupe_removed > 0:
                    warnings = validation.get("warnings") if isinstance(validation, dict) else []
                    if not isinstance(warnings, list):
                        warnings = []
                    warnings.append(
                        f"章节去重：已移除 {chapter_dedupe_removed} 处重复段落/表格"
                    )
                    validation["warnings"] = warnings
                    validation["chapterDedupeRemoved"] = chapter_dedupe_removed
                if required_tables and not _skip_table_enforcement:
                    # Runs after the cleanup passes above, with the content as it
                    # was right after table enforcement passed in as well.
                    content = _restore_required_tables_safety_net(
                        content,
                        required_tables,
                        evidence,
                        content_after_tables,
                    )
                    content = _finalize_section_table_dedupe(content, required_tables)
                if remaining_missing_tables:
                    warnings = validation.get("warnings") if isinstance(validation, dict) else []
                    if not isinstance(warnings, list):
                        warnings = []
                    warnings.append(
                        "部分必需表格仍缺失，已插入占位表："
                        + "、".join(remaining_missing_tables)
                    )
                    validation["warnings"] = warnings
                if format_issues:
                    warnings = validation.get("warnings") if isinstance(validation, dict) else []
                    if not isinstance(warnings, list):
                        warnings = []
                    # At most 8 format issues are reported.
                    warnings.extend([f"格式验收器：{x}" for x in format_issues][:8])
                    validation["warnings"] = warnings
                validation["retrievalStage"] = retrieval_stage
                validation["streamPhase"] = "completed"
                diagnostics = _build_field_diagnostics(section, evidence, content)
                if diagnostics:
                    validation["fieldDiagnostics"] = diagnostics
                if model_output:
                    validation["modelOutput"] = model_output
                content = _inject_missing_parent_section_headings(
                    section.section_title or "",
                    content,
                    _previous_completed_section_content(
                        section, sections, completed_section_contents
                    ),
                    chapter_title_map,
                )
                now = datetime.now()
                chapter.content = content
                completed_section_contents[section.section_key] = content
                # The stored prompt is truncated to 20000 characters.
                chapter.prompt_text = prompt[:20000]
                chapter.evidence_payload = evidence
                chapter.validation_payload = validation
                chapter.status = "completed"
                chapter.error_message = None
                chapter.updated_at = now
                chapter.completed_at = now
                if not only_section_key:
                    job.progress = int((completed_count + idx) * 100 / total_count)
                job.current_section_key = section.section_key
                job.updated_at = now
                db.commit()
                dump_out_path = _dump_report_chapter_json_markdown(
                    job_id=job.id,
                    section_key=section.section_key,
                    section_title=section.section_title,
                    output_json={
                        "modelOutput": model_output or {},
                        "persistedChapter": {
                            "sectionKey": section.section_key,
                            "sectionTitle": section.section_title,
                            "sectionOrder": section.section_order,
                            "status": "completed",
                            "content": content,
                            "promptText": prompt[:20000],
                            "evidencePayload": evidence,
                            "validationPayload": validation,
                        },
                    },
                )
                logger.info(
                    "章节生成落盘 | job=%s | section=%s | prompt_len=%s | content_len=%s | output_file=%s",
                    job.id, section.section_key, len(prompt[:20000]), len(content),
                    dump_out_path or "(已存在合并写入)",
                )
                update_chapter_state(
                    job.id,
                    section.section_key,
                    status="completed",
                    content=content,
                    errorMessage=None,
                    promptText=prompt[:20000],
                    evidencePayload=evidence,
                    validationPayload=validation,
                )
                if not only_section_key:
                    update_job_state(
                        job.id,
                        progress=int((completed_count + idx) * 100 / total_count),
                        currentSectionKey=section.section_key,
                    )
                else:
                    update_job_state(job.id, currentSectionKey=section.section_key)

                db.refresh(job)
                if job.status == "cancelled":
                    return

            db.refresh(job)
            if job.status == "cancelled":
                return

            if only_section_key:
                # A single-chapter rerun must not finalize the whole job: only
                # write the chapter back and refresh the job progress.
                all_chapters = (
                    db.query(ReportGenerationChapter)
                    .filter(ReportGenerationChapter.job_id == job.id)
                    .order_by(ReportGenerationChapter.section_order.asc())
                    .all()
                )
                done = sum(1 for c in all_chapters if c.status == "completed")
                total = max(1, len(all_chapters))
                job.progress = int(done * 100 / total)
                job.current_section_key = None
                job.updated_at = datetime.now()
                db.commit()
                update_job_state(job.id, progress=int(done * 100 / total), currentSectionKey=None)
            else:
                job.snapshot = None
                job.status = "completed"
                job.progress = 100
                job.current_section_key = None
                job.completed_at = datetime.now()
                job.updated_at = datetime.now()
                db.commit()
                logger.info(
                    "报告生成 job completed | job=%s | project=%s | total_chapters=%d",
                    job.id, project.uuid, len(sections),
                )
                update_job_state(
                    job.id,
                    status="completed",
                    progress=100,
                    currentSectionKey=None,
                    completedAt=_fmt_dt(job.completed_at),
                )
        except Exception as e:
            current_section_key = job.current_section_key
            # Log job.project_id rather than project.uuid: `project` is None when
            # the failure is "项目不存在" and unbound if the failure happened
            # before the project lookup, which would make this handler raise and
            # leave the job stuck in "running". The two values are equal
            # whenever the project exists (it is looked up by job.project_id).
            logger.error(
                "报告生成 job failed | job=%s | project=%s | section=%s | err=%s",
                job.id, job.project_id, current_section_key, e,
                exc_info=True,
            )
            job.status = "failed"
            job.error_message = str(e)
            job.updated_at = datetime.now()
            db.commit()
            update_job_state(job.id, status="failed", errorMessage=str(e))
            if current_section_key:
                update_chapter_state(
                    job.id,
                    current_section_key,
                    status="failed",
                    errorMessage=str(e),
                )


# 这些章节号在 L1/L2 已判「证据充足」时仍继续走 L2/L3，避免仅章节定向检索就提前结束而漏掉关键词召回。
_SECTION_NUMBERS_FORCE_L3_KEYWORD_RETRIEVAL: frozenset[str] = frozenset({"2.1.5", "3.3.3", "3.4.1", "3.6", "3.7", "3.8", "3.10", "4.3.3","5.2.3", "6.1.1.1", "6.1.1.2", "6.2.1", "6.2.4"})


def _section_forces_l3_keyword_retrieval(section: ReportTemplateSection) -> bool:
    """Tell whether the section's number is in the forced-L3 retrieval set."""
    section_no = _extract_section_number(section.section_title or "")
    return section_no in _SECTION_NUMBERS_FORCE_L3_KEYWORD_RETRIEVAL


def _collect_evidence_progressive(
    db: Session,
    retrieval: RetrievalService,
    project_uuid: str,
    section: ReportTemplateSection,
    *,
    top_k: int,
    required_tables: Optional[list[str]] = None,
) -> tuple[dict, str]:
    """Collect evidence in up to three increasingly broad stages.

    Stages, in order:
      1. ``elements_only`` — elements and structured tables only.
      2. ``elements_plus_chapter_docs`` — adds chapter-targeted documents.
      3. ``elements_plus_chapter_and_keyword_docs`` — adds the keyword fallback.

    The first stage whose evidence ``_is_evidence_sufficient`` accepts is
    returned, unless the section forces L3 keyword retrieval, in which case
    collection always runs through to stage 3.

    Returns:
        ``(evidence, stage_name)`` for the stage that produced the evidence.
    """
    force_l3 = _section_forces_l3_keyword_retrieval(section)

    def _gather(*, chapter_docs: bool, keyword_docs: bool) -> dict:
        # Each stage is a full, independent collection with different switches.
        return _collect_evidence(
            db,
            retrieval,
            project_uuid,
            section,
            top_k=top_k,
            required_tables=required_tables,
            include_chapter_docs=chapter_docs,
            include_keyword_docs=keyword_docs,
        )

    # Stages that may end the escalation early: (stage name, include chapter docs).
    early_stages = (
        ("elements_only", False),
        ("elements_plus_chapter_docs", True),
    )
    for stage_name, with_chapter_docs in early_stages:
        evidence = _gather(chapter_docs=with_chapter_docs, keyword_docs=False)
        if _is_evidence_sufficient(section, evidence) and not force_l3:
            return evidence, stage_name

    # Last resort: chapter documents plus the keyword fallback retrieval.
    evidence = _gather(chapter_docs=True, keyword_docs=True)
    return evidence, "elements_plus_chapter_and_keyword_docs"


def _latest_element_payloads_by_row_col(
    db: Session,
    project_uuid: str,
    row_keys: list[str],
    *,
    non_empty_value: bool = True,
) -> list[dict[str, Any]]:
    """Return one payload per ``(row_key, col_key)`` pair, keeping the newest cell.

    Cells of ``project_uuid`` whose ``row_key`` is in ``row_keys`` are queried
    ordered by ``updated_at`` descending, so the first cell seen for a pair is
    the most recently updated one and later ones are ignored. With
    ``non_empty_value`` set, cells whose value is NULL or ``""`` are excluded.
    Values are stringified and truncated to 500 characters.
    """
    if not row_keys:
        return []

    query = db.query(ElementCell).filter(
        ElementCell.project_id == project_uuid,
        ElementCell.row_key.in_(row_keys),
    )
    if non_empty_value:
        query = query.filter(ElementCell.value.isnot(None), ElementCell.value != "")

    latest: dict[tuple[str, str], dict[str, Any]] = {}
    for cell in query.order_by(ElementCell.updated_at.desc()).all():
        identity = (str(cell.row_key or ""), str(cell.col_key or ""))
        if identity not in latest:
            latest[identity] = {
                "rowKey": cell.row_key,
                "colKey": cell.col_key,
                "value": str(cell.value or "")[:500],
                "sourceDocumentId": cell.source_document_id,
            }
    return list(latest.values())


def _merge_section_11_forced_elements(
    forced_payloads: list[dict[str, Any]],
    scored_top_payloads: list[dict[str, Any]],
    *,
    max_additional_scored: int = 40,
) -> list[dict[str, Any]]:
    """1.1 节：先发制人并入第 1 章概况要素，再追加与其它章节相同的 Top-K 打分单元格（去重）。"""
    seen: set[tuple[str, str]] = set()
    out: list[dict[str, Any]] = []
    for p in forced_payloads:
        key = (str(p.get("rowKey") or ""), str(p.get("colKey") or ""))
        if key in seen:
            continue
        seen.add(key)
        out.append(p)
    added = 0
    for p in scored_top_payloads:
        key = (str(p.get("rowKey") or ""), str(p.get("colKey") or ""))
        if key in seen:
            continue
        seen.add(key)
        out.append(p)
        added += 1
        if added >= max_additional_scored:
            break
    return out


def _collect_evidence(
    db: Session,
    retrieval: RetrievalService,
    project_uuid: str,
    section: ReportTemplateSection,
    *,
    top_k: int,
    required_tables: Optional[list[str]] = None,
    include_chapter_docs: bool = True,
    include_keyword_docs: bool = True,
) -> dict:
    """Assemble the evidence payload used to generate one section.

    Steps:
      * derive search tokens from the section title and prompt (section 1.2
        gets extra decision-related phrases and a larger token budget);
      * score up to 800 of the project's most recently updated non-empty
        element cells and keep the 40 best (section 1.1 additionally gets every
        chapter-1 overview cell merged in first);
      * collect structured tables for the non-blank ``required_tables``;
      * optionally fetch chapter-targeted documents and keyword documents.

    Returns:
        A dict with the keys ``tokens``, ``requiredTables``,
        ``structuredTables``, ``canonicalFields``, ``elements``,
        ``chapterDocs`` and ``keywordDocs``.
    """
    section_no = _extract_section_number(section.section_title or "")
    title_and_prompt = f"{section.section_title} {section.section_prompt or ''}"
    tokens = _extract_tokens(title_and_prompt)[:14]
    if section_no == "1.2":
        # The title alone is too short and makes retrieval drift; add
        # decision-related phrases to improve recall.
        decision_phrases = (
            "项目决策要点 建设必要性 立项背景",
            "国VI 国Ⅵ 汽油质量升级 芳烃 烯烃 环保",
            "预期目标 烷基化油 产量 辛烷值 万吨",
            "可研 批复 投资 效益 利润",
        )
        extra = " ".join(decision_phrases)
        widened = _extract_tokens(f"{title_and_prompt} {extra}")
        # dict.fromkeys drops duplicates while keeping first-seen order.
        tokens = list(dict.fromkeys(widened))[:20]

    cells_query = (
        db.query(ElementCell, ElementTable.table_name)
        .join(ElementTable, ElementTable.id == ElementCell.table_id)
        .filter(
            ElementCell.project_id == project_uuid,
            ElementTable.project_id == project_uuid,
            ElementCell.value.isnot(None),
            ElementCell.value != "",
        )
        .order_by(ElementCell.updated_at.desc())
    )
    scored_cells: list[tuple[int, dict]] = []
    for cell, table_name in cells_query.limit(800).all():
        cell_payload = {
            "tableId": cell.table_id,
            "tableName": table_name,
            "rowKey": cell.row_key,
            "colKey": cell.col_key,
            "year": cell.year,
            "value": str(cell.value or "")[:500],
            "sourceDocumentId": cell.source_document_id,
            "sourceType": cell.source_type,
        }
        relevance = _score_element_cell_relevance(
            section.section_title,
            tokens,
            cell_payload.get("rowKey"),
            cell_payload.get("colKey"),
            cell_payload.get("value"),
            table_name=cell_payload.get("tableName"),
            section=section,
        )
        # A cell with no positive score is kept (with score 1) only when there
        # are no tokens to match against at all; otherwise it is dropped.
        if not relevance > 0:
            if tokens:
                continue
            relevance = 1
        scored_cells.append((relevance, cell_payload))
    scored_cells.sort(key=lambda pair: pair[0], reverse=True)
    matched_cells = [cell_payload for _, cell_payload in scored_cells[:40]]

    # Section 1.1 (project basics): force in every non-empty cell of the
    # chapter-1 project-overview element group, so they are not squeezed out by
    # the global top-40 relevance cut.
    if _extract_section_number(section.section_title or "") == "1.1":
        ch1_rows = section_table_row_keys(CHAPTER1_PROJECT_OVERVIEW_TABLE_GROUP)
        forced_ch1 = _latest_element_payloads_by_row_col(db, project_uuid, ch1_rows, non_empty_value=True)
        if forced_ch1:
            matched_cells = _merge_section_11_forced_elements(forced_ch1, matched_cells, max_additional_scored=40)

    required = [str(t) for t in (required_tables or []) if str(t).strip()]
    structured_tables = _collect_structured_tables(
        db,
        project_uuid,
        required,
        section_title=str(section.section_title or ""),
        section_tokens=tokens,
    )

    chapter_docs = []
    if include_chapter_docs:
        chapter_docs = retrieval.get_chapter_materials(project_uuid, section.section_title, top_k=top_k)

    keyword_docs: list[dict] = []
    if include_keyword_docs and tokens:
        if section_no == "1.2":
            # Several targeted queries, de-duplicated on (doc id, first 240
            # characters of the body) and capped at 14 documents overall.
            queries = [
                "国VI 国Ⅵ 汽油 质量升级 芳烃 烯烃 环保 标准",
                "项目 建设 必要性 决策 依据 立项",
                "预期 目标 烷基化油 产量 辛烷值 效益 万吨",
                " ".join(tokens[:8]),
            ]
            seen: set[tuple[str, str]] = set()
            for query_text in queries:
                if len(keyword_docs) >= 14:
                    break
                hits = retrieval.search_by_query(query_text, top_k=6, filter_project=project_uuid)
                for hit in hits:
                    did = str(hit.metadata.get("doc_id", "") or "")
                    body = str(hit.page_content or "")[:2000]
                    fingerprint = (did, body[:240])
                    if fingerprint in seen:
                        continue
                    seen.add(fingerprint)
                    keyword_docs.append(
                        {
                            "heading": hit.metadata.get("heading", ""),
                            "content": body,
                            "docId": did,
                            "query": query_text[:120],
                        }
                    )
                    if len(keyword_docs) >= 14:
                        break
        else:
            # Generic sections: one query built from the first five tokens.
            query_text = " ".join(tokens[:5])
            hits = retrieval.search_by_query(query_text, top_k=10, filter_project=project_uuid)
            keyword_docs.extend(
                {
                    "heading": hit.metadata.get("heading", ""),
                    "content": str(hit.page_content or "")[:2000],
                    "docId": hit.metadata.get("doc_id", ""),
                }
                for hit in hits
            )

    keyword_cap = 14 if section_no == "1.2" else 8
    return {
        "tokens": tokens,
        "requiredTables": required,
        "structuredTables": structured_tables,
        "canonicalFields": _extract_canonical_fields(
            section.section_title, matched_cells, section=section
        ),
        "elements": matched_cells,
        "chapterDocs": chapter_docs[:top_k],
        "keywordDocs": keyword_docs[:keyword_cap],
    }


def _is_evidence_sufficient(section: ReportTemplateSection, evidence: dict) -> bool:
    required_tables = evidence.get("requiredTables") if isinstance(evidence, dict) else []
    structured_tables = evidence.get("structuredTables") if isinstance(evidence, dict) else []
    elements = evidence.get("elements") if isinstance(evidence, dict) else []
    chapter_docs = evidence.get("chapterDocs") if isinstance(evidence, dict) else []
    keyword_docs = evidence.get("keywordDocs") if isinstance(evidence, dict) else []

    required_count = len(required_tables) if isinstance(required_tables, list) else 0
    structured_count = len(structured_tables) if isinstance(structured_tables, list) else 0
    element_count = len(elements) if isinstance(elements, list) else 0
    chapter_doc_count = len(chapter_docs) if isinstance(chapter_docs, list) else 0
    keyword_doc_count = len(keyword_docs) if isinstance(keyword_docs, list) else 0

    # 有必需表格时优先保证结构化表匹配覆盖
    if required_count > 0 and structured_count < min(required_count, 2):
        return False

    # 没有足够要素时，需要至少一种文档证据补充
    if element_count < 6 and chapter_doc_count == 0 and keyword_doc_count == 0:
        return False

    # 表格相关章节通常需要更高证据密度
    title = str(section.section_title or "")
    if "表" in title and (structured_count == 0 and element_count < 10):
        return False

    # 关键章节按字段完整性判定，避免“有数量但没关键字段”时误判为充足
    title_norm = re.sub(r"\s+", "", title)
    if "1.1项目基本情况" in title_norm:
        required_groups = [
            ["建设单位", "建设单位名称"],
            ["建设地点", "厂址"],
            ["建设规模", "装置规模", "能力", "万吨/年"],
            ["投资", "概算", "估算", "决算"],
        ]
        for group in required_groups:
            if not _evidence_contains_any_fact(evidence, group):
                return False

    if "1.2项目决策要点" in title_norm:
        required_groups = [
            ["国vi", "国ⅵ", "质量升级", "汽油标准", "环保", "环评", "排放", "清洁生产"],
            ["高标号", "辛烷值", "汽油池", "产品结构", "汽油"],
            ["碳四", "液化气", "原料", "物料平衡", "资源利用", "附加值"],
            ["杂质", "预处理", "丁二烯", "选择性加氢", "催化剂", "甲醇", "二甲醚"],
            ["万吨", "产量", "烷基化油", "效益", "利润", "营业收入", "预期", "目标"],
        ]
        hit_count = 0
        for group in required_groups:
            if _evidence_contains_any_fact(evidence, group):
                hit_count += 1
        # 至少命中 2 组：安全评价里常有杂质/物料平衡，可研/环评可补环保与目标
        if hit_count < 2:
            return False

    return True


def _score_element_cell_relevance(
    section_title: str,
    tokens: list[str],
    row_key: Optional[str],
    col_key: Optional[str],
    value: Optional[str],
    *,
    table_name: Optional[str] = None,
    section: ReportTemplateSection | None = None,
) -> int:
    """Return a heuristic relevance score of one element cell for a section.

    Points are additive:
      * +1 per token found anywhere in "table row col value", +2 more when the
        token also occurs in the table name (case-sensitive substring tests);
      * +8 when the whitespace-stripped section title and table name contain
        one another;
      * +3 when the digits found in the first 12 characters of the table name
        start with the section number (dots removed);
      * per expected field: +4 when an alias occurs in the table/row/col keys,
        otherwise +5 when the value holds an "alias: value" pair, otherwise +1
        when an alias merely occurs somewhere in the cell text.
    """
    table_text = str(table_name or "")
    cell_value = str(value or "")
    keys_text = f"{table_text} {str(row_key or '')} {str(col_key or '')}"
    haystack = f"{keys_text} {cell_value}"
    keys_lower = keys_text.lower()
    haystack_lower = haystack.lower()

    points = 0
    for raw_token in tokens or []:
        token = str(raw_token or "").strip()
        if not token:
            continue
        if token in haystack:
            points += 1
        if table_text and token in table_text:
            points += 2

    compact_title = re.sub(r"\s+", "", str(section_title or ""))
    compact_table = re.sub(r"\s+", "", table_text)
    if compact_title and compact_table:
        if compact_title in compact_table or compact_table in compact_title:
            points += 8

    section_digits = _extract_section_number(section_title).replace(".", "")
    if section_digits and compact_table:
        leading_digits = re.sub(r"\D", "", compact_table[:12])
        if leading_digits.startswith(section_digits):
            points += 3

    # Weight the section's expected fields heavily so that unrelated cells are
    # less likely to occupy slots before the evidence list is truncated.
    for field in _section_expected_fields(section_title, section):
        labels = [str(a).strip() for a in _field_aliases(field) if str(a).strip()]
        if any(label.lower() in keys_lower for label in labels):
            points += 4
        elif _extract_value_by_alias_from_text(cell_value, labels):
            # Keys do not name the field, but the value carries "field: value".
            points += 5
        elif any(str(label).lower() in haystack_lower for label in labels):
            # Weakest signal: the alias only appears as a keyword.
            points += 1

    return points


def _evidence_contains_any_fact(evidence: dict, keywords: list[str]) -> bool:
    if not isinstance(evidence, dict):
        return False
    lowered_keywords = [str(k).strip().lower() for k in keywords if str(k).strip()]
    if not lowered_keywords:
        return False

    elements = evidence.get("elements") if isinstance(evidence.get("elements"), list) else []
    for row in elements:
        if not isinstance(row, dict):
            continue
        row_key = str(row.get("rowKey") or "").lower()
        col_key = str(row.get("colKey") or "").lower()
        value = str(row.get("value") or "").strip()
        value_l = value.lower()
        if _is_missing_like(value):
            continue
        if any(k in row_key or k in col_key or k in value_l for k in lowered_keywords):
            return True

    for doc_field in ("chapterDocs", "keywordDocs"):
        docs = evidence.get(doc_field) if isinstance(evidence.get(doc_field), list) else []
        for d in docs[:12]:
            if not isinstance(d, dict):
                continue
            text = (str(d.get("heading") or "") + " " + str(d.get("content") or "")).lower()
            if any(k in text for k in lowered_keywords):
                return True
    return False


def _recover_stalled_job(db: Session, job: ReportGenerationJob) -> None:
    if not job or job.status != "running":
        return
    now = datetime.now()
    running_chapter = (
        db.query(ReportGenerationChapter)
        .filter(ReportGenerationChapter.job_id == job.id, ReportGenerationChapter.status == "running")
        .order_by(ReportGenerationChapter.updated_at.asc())
        .first()
    )
    if not running_chapter or not running_chapter.updated_at:
        return
    stale_seconds = (now - running_chapter.updated_at).total_seconds()
    if stale_seconds < RUNNING_CHAPTER_STALE_SECONDS:
        return

    running_chapter.status = "pending"
    running_chapter.error_message = "检测到章节长时间未更新，已自动回收并重试"
    running_chapter.updated_at = now
    job.status = "pending"
    job.error_message = None
    job.current_section_key = None
    job.updated_at = now
    db.commit()
    update_job_state(job.id, status="pending", errorMessage=None, currentSectionKey=None)
    update_chapter_state(
        job.id,
        running_chapter.section_key,
        status="pending",
        errorMessage="检测到章节长时间未更新，已自动回收并重试",
        content=None,
        promptText=None,
        evidencePayload=None,
        validationPayload=None,
    )
    _start_job_worker(job.id)


def _load_section_reference_for_chapter(
    db: Session,
    section_key: str,
    section_title: str,
    *,
    template_id: Optional[str] = None,
    max_chars: int = 8000,
) -> str:
    """Load the stored reference text used for a chapter's user prompt.

    The lookup goes through ``load_section_reference_raw`` by ``section_key``
    first; when that yields nothing, ``load_section_reference_raw_by_title``
    is tried with the section title. Both lookups receive the same template
    filter and ``max_chars``.

    template_id: ID of the selected template. A blank/None value is passed on
    as ``None``. Per the original author's note, ``None`` means no template
    filtering (latest row wins) and the text is injected as stored, without
    LLM desensitisation; any number-based matching happens inside the helpers.
    NOTE(review): those details live in ``services.reference_service`` and
    are not visible here.
    """
    from services.reference_service import (
        load_section_reference_raw,
        load_section_reference_raw_by_title,
    )

    template_filter = (template_id or "").strip() or None

    by_key = load_section_reference_raw(
        db, section_key, template_id=template_filter, max_chars=max_chars
    )
    # Fall back to a title lookup, still restricted to the same template.
    return by_key or load_section_reference_raw_by_title(
        db, section_title, template_id=template_filter, max_chars=max_chars
    )


def _build_chapter_prompt(
    section: ReportTemplateSection,
    evidence: dict,
    *,
    prior_sibling_sections_text: str = "",
    section_reference: str = "",
) -> str:
    """Assemble the user prompt for one chapter from its template and evidence.

    Collects the selected example, the effective output contract, the heading
    rule for the section number, the rendered structured tables and canonical
    fields, and the evidence serialised as JSON, then hands everything to
    ``build_report_chapter_prompt``.
    """
    title = section.section_title
    example = _select_chapter_example(title, section.examples, evidence)
    contract = _effective_section_output_contract(section)
    heading_rule = SECTION_HEADING_RULES.get(
        _extract_section_number(title), DEFAULT_HEADING_RULE
    )
    expected_fields = _section_expected_fields(title, section)
    required_tables = evidence.get("requiredTables") or []

    prompt_kwargs = {
        "section_title": title,
        "section_prompt": _effective_section_prompt_for_generation(section, contract),
        "required_tables_text": "、".join(required_tables) or "无",
        "structured_tables_text": _render_structured_tables_for_prompt(evidence),
        # An empty expected-field list means "no filtering" for the renderer.
        "canonical_fields_text": _render_canonical_fields_for_prompt(
            evidence, allowed_fields=expected_fields or None
        ),
        "selected_example": example,
        "heading_rule": heading_rule,
        "section_contract": contract,
        "evidence_json": json.dumps(evidence, ensure_ascii=False),
        "prior_sibling_sections_text": prior_sibling_sections_text,
        "section_reference": section_reference,
    }
    return build_report_chapter_prompt(**prompt_kwargs)


def _generate_chapter_content(
    section: ReportTemplateSection,
    prompt: str,
    on_content_delta: Optional[callable] = None,
) -> tuple[str, dict, dict]:
    """Call the LLM for one chapter and return ``(content, validation, raw)``.

    ``content`` is the response's "content" field, stripped, falling back to
    "<title>\\n\\n待补充" when empty, with inline evidence labels removed.
    ``validation`` holds up to 20 stringified "missingInfo" and "qualityChecks"
    entries plus the warnings from ``_basic_warnings``. ``raw`` is the object
    returned by ``chat_completions_json``.
    """
    section_no = _extract_section_number(section.section_title or "")
    token_budget = _chapter_generation_max_tokens(section_no)
    logger.info(
        "LLM 章节生成 start | section=%s | section_no=%s | max_tokens=%s",
        section.section_key, section_no, token_budget,
    )
    response = chat_completions_json(
        system_prompt=chapter_generation_system_prompt(),
        user_prompt=prompt,
        temperature=0.1,
        max_tokens=token_budget,
        timeout_sec=120,
        on_content_delta=on_content_delta,
        log_context=f"章节生成 section_key={section.section_key} | {section.section_title}",
    )

    body = str(response.get("content") or "").strip() or f"{section.section_title}\n\n待补充"
    # Section numbers and item ordinals are deliberately NOT renumbered here, so
    # the template hierarchy (2.1.1, 3.4.2, ...) survives; only evidence labels
    # and citation markers are cleaned away.
    body = _strip_inline_evidence_labels(body)
    if section_no == "1.2":
        # Promote bare "1）项目背景" / "2）预期目标" lines to numbered sub-headings.
        subsection_fixes = (
            (
                r"(?m)^[\s\u3000]*1[\s\u3000]*[）\)][\s\u3000]*项目背景[\s\u3000]*$",
                "1.2.1项目背景",
            ),
            (
                r"(?m)^[\s\u3000]*2[\s\u3000]*[）\)][\s\u3000]*预期目标[\s\u3000]*$",
                "1.2.2预期目标",
            ),
        )
        for pattern, heading in subsection_fixes:
            body = re.sub(pattern, heading, body)
        body = _normalize_section_12_content(body)

    def _listed(key: str) -> list[str]:
        raw = response.get(key)
        return [str(item) for item in raw][:20] if isinstance(raw, list) else []

    validation = {
        "missingInfo": _listed("missingInfo"),
        "qualityChecks": _listed("qualityChecks"),
        "warnings": _basic_warnings(section.section_title, body),
    }
    return body, validation, response


def _normalize_ordered_item_markers(content: str) -> str:
    """Rewrite line-leading list markers into a unified "n）" style.

    Handles Chinese numerals ("一、", "（一）") and Arabic numbering ("1.",
    "1.2.", "3.1" followed by whitespace); for dotted Arabic numbers the last
    segment becomes the index (1.2 -> 2）). The first non-empty line is left
    untouched so the section title is never altered. Inline evidence labels
    are stripped before processing.
    """
    text = _strip_inline_evidence_labels(str(content or ""))
    if not text:
        return text

    numeral_values = {ch: pos for pos, ch in enumerate("一二三四五六七八九十", start=1)}
    chinese_marker = re.compile(r"^(\s*(?:#+\s*)?(?:[-*]\s*)?)(?:（([一二三四五六七八九十])）|([一二三四五六七八九十])、)\s*")
    arabic_marker = re.compile(r"^(\s*(?:#+\s*)?(?:[-*]\s*)?)(\d+(?:\.\d+)*)(?:\.)?\s+")

    def _rewrite(line: str) -> str:
        cn_hit = chinese_marker.match(line)
        if cn_hit is not None:
            lead = cn_hit.group(1) or ""
            ordinal = numeral_values.get(cn_hit.group(2) or cn_hit.group(3) or "")
            if not ordinal:
                return line
            return chinese_marker.sub(f"{lead}{ordinal}）", line, count=1)

        ar_hit = arabic_marker.match(line)
        if ar_hit is None:
            return line
        lead = ar_hit.group(1) or ""
        segments = [part for part in (ar_hit.group(2) or "").split(".") if part]
        last = segments[-1] if segments else ""
        if not last.isdigit():
            return line
        return arabic_marker.sub(f"{lead}{int(last)}） ", line, count=1)

    lines = text.splitlines()
    title_idx = next((i for i, ln in enumerate(lines) if ln.strip()), -1)
    return "\n".join(
        ln if i == title_idx else _rewrite(ln) for i, ln in enumerate(lines)
    )


def _strip_inline_evidence_labels(text: str) -> str:
    src = str(text or "")
    if not src:
        return src
    cleaned = re.sub(r"【\s*证据依据\s*[：:]\s*[0-9a-fA-F]{16,}\s*】", "", src)
    cleaned = re.sub(r"\[\s*证据依据\s*[：:]\s*[0-9a-fA-F]{16,}\s*\]", "", cleaned)
    # Remove simple inline numeric citations like [1], [2] that often leak from evidence.
    cleaned = re.sub(r"\[\s*\d{1,3}\s*\]", "", cleaned)
    # Strip leaked meta sections from model JSON fields when they are accidentally merged into content.
    cleaned = re.sub(
        r"(?is)\n*【\s*缺失信息说明\s*】[\s\S]*?(?=\n【\s*质量检查\s*】|\Z)",
        "\n",
        cleaned,
    )
    cleaned = re.sub(r"(?is)\n*【\s*质量检查\s*】[\s\S]*$", "\n", cleaned)
    return re.sub(r"[ \t]{2,}", " ", cleaned)


def _markdown_hashes_for_section_no(section_no: str) -> str:
    parts = str(section_no or "").strip().split(".")
    if len(parts) == 1:
        return "##"
    if len(parts) == 2:
        return "###"
    return "####"


def _normalize_numbered_heading_spacing(content: str) -> str:
    """编号与题名之间补空格，便于前后端一致识别为标题。"""
    text = str(content or "")
    if not text:
        return text

    def _line_repl(m: re.Match[str]) -> str:
        indent, num, title = m.group(1), m.group(2), str(m.group(3) or "").strip()
        parts = num.split(".")
        if len(parts) < 2 or len(parts) > 4:
            return m.group(0)
        for part in parts:
            if not part.isdigit() or int(part) < 1 or int(part) > 30:
                return m.group(0)
        if not title or len(title) > 36 or re.search(r"[，。；：！？]", title):
            return m.group(0)
        return f"{indent}{num} {title}"

    return re.sub(
        r"(?m)^([\s\u3000]*)(\d+(?:\.\d+)+)\s*([\u4e00-\u9fff][^\n]{0,40})\s*$",
        _line_repl,
        text,
    )


def _normalize_markdown_heading_levels(content: str) -> str:
    """三节及以上编号统一为 ####，避免 ### 与 #### 混用导致同级标题字号不一致。"""
    text = str(content or "")
    if not text:
        return text

    def _line_repl(m: re.Match[str]) -> str:
        hashes, num, tail = m.group(1), m.group(2), m.group(3)
        parts = num.split(".")
        if len(parts) < 3:
            return m.group(0)
        want = _markdown_hashes_for_section_no(num)
        if hashes == want:
            return m.group(0)
        return f"{want} {num} {tail}"

    return re.sub(
        r"(?m)^(#{1,6})\s+(\d+(?:\.\d+)+)\s+([\u4e00-\u9fff].*)$",
        _line_repl,
        text,
    )


def _normalize_section_heading_markdown(content: str) -> str:
    """Normalise number/title spacing first, then Markdown heading levels."""
    spaced = _normalize_numbered_heading_spacing(content)
    return _normalize_markdown_heading_levels(spaced)


def _normalize_section_12_content(content: str) -> str:
    """1.2 合同为纯文本编号体：首行「项目决策要点」、小节「1.2.1项目背景」无空格。
    与模板/标题验收叠加后会重复节标题，且前端/导出无法识别为 h4。"""
    text = str(content or "").strip()
    if not text:
        return text
    text = re.sub(
        r"(?m)^[\s\u3000]*项目决策要点[\s\u3000]*(?:[:：])?[\s\u3000]*\n+",
        "",
        text,
        count=1,
    )
    text = re.sub(
        r"(?m)^([\s\u3000]*)(1\.2\.[12])(项目背景|预期目标)[\s\u3000]*$",
        r"\1\2 \3",
        text,
    )
    return re.sub(r"\n{3,}", "\n\n", text).strip()


_CONTRACT_FIELD_LINE_RE = re.compile(
    r"^\s*(\d+)[）)]\s*(?P<field>[^：:\n]+)[：:]\s*(?P<tail>.*)$",
    re.MULTILINE,
)
_CONTRACT_FIELD_SKIP_RE = re.compile(
    r"必须|不得|禁止|严禁|应|需|写|输出|背景|规则|约束|表\d|后评价|若|当|正文|首行|写作|请",
)


def _parse_expected_fields_from_contract(contract: str | None) -> list[str]:
    """从模版输出合同解析「1) 字段名：...」连续编号字段；无则返回空。"""
    text = str(contract or "").strip()
    if not text:
        return []
    fields: list[str] = []
    nums: list[int] = []
    for m in _CONTRACT_FIELD_LINE_RE.finditer(text):
        field = str(m.group("field") or "").strip()
        tail = str(m.group("tail") or "").strip()
        if not field or len(field) > 10 or _CONTRACT_FIELD_SKIP_RE.search(field):
            continue
        if tail and not re.fullmatch(r"\.{2,}|待补充", tail) and len(tail) > 6:
            continue
        fields.append(field)
        nums.append(int(m.group(1)))
    if len(fields) < 3 or not nums or nums[0] != 1:
        return []
    for i in range(1, len(nums)):
        if nums[i] != nums[i - 1] + 1:
            return []
    return fields


def _section_expected_fields(
    section_title: str,
    section: ReportTemplateSection | None = None,
) -> list[str]:
    """Return the fields a section is expected to output.

    The fields are parsed from the section's output contract (the section's
    effective contract when ``section`` is given, otherwise the contract looked
    up by title). When the contract lists no numbered fields, only the
    "1.2项目决策要点" section gets a built-in list; every other section gets [].
    """
    if section is None:
        contract = _section_output_contract(section_title)
    else:
        contract = _effective_section_output_contract(section)

    from_contract = _parse_expected_fields_from_contract(contract)
    if from_contract:
        return from_contract

    compact_title = re.sub(r"\s+", "", str(section_title or ""))
    return ["规模目标", "质量目标", "效益目标"] if "1.2项目决策要点" in compact_title else []


def _effective_section_prompt_for_generation(
    section: ReportTemplateSection,
    contract: str,
) -> str:
    """模版合同为结构权威；与合同重复的 section_prompt 不再注入，避免双源冲突。"""
    stored = str(section.section_prompt or "").strip()
    contract_text = str(contract or "").strip()
    if stored and stored != contract_text:
        return stored
    return ""


def _field_aliases(field: str) -> list[str]:
    base = str(field or "").strip()
    aliases: dict[str, list[str]] = {
        "项目名称": ["项目名称", "工程名称", "装置名称"],
        "建设单位": ["建设单位", "业主单位", "实施单位"],
        "建设地点": ["建设地点", "建设地址", "厂址", "所在地"],
        "建设类型": ["建设类型", "项目类型", "新建", "改扩建"],
        "起止时间": ["起止时间", "工作起止时间", "开工时间", "完工时间", "建设工期", "建设期限"],
        # 抽取/填表侧常将“建设内容”写作“项目内容/工程内容/装置内容”，需兼容回填。
        "建设内容": ["建设内容", "主要建设内容", "建设范围", "项目内容", "工程内容", "装置内容"],
        "建设投资": ["建设投资", "总投资", "投资估算", "项目总投资", "概算"],
        "占地面积": ["占地面积", "用地面积"],
        "规模目标": ["规模目标", "产量", "规模", "万吨"],
        "质量目标": ["质量目标", "辛烷值", "质量升级", "国VI", "国Ⅵ"],
        "效益目标": ["效益目标", "利润", "收益", "营业收入", "内部收益率", "IRR"],
    }
    out = aliases.get(base, [])
    if base and base not in out:
        out.insert(0, base)
    return out[:8]


def _is_missing_like(value: str) -> bool:
    text = str(value or "").strip()
    if not text:
        return True
    lowered = text.lower()
    missing_like = {
        "待补充",
        "无",
        "n/a",
        "na",
        "-",
        "—",
        "——",
        "暂无",
        "未知",
        "未提供",
    }
    return lowered in missing_like


def _normalize_land_area_value(value: str) -> str:
    """
    规范化“占地面积”字段：
    - 遇到“84m×187m=15708m2”这类表达时，仅保留等号后的结果；
    - 将 m2/m^2/m²/㎡ 统一为 ㎡，避免导出时出现单位显示异常。
    """
    text = str(value or "").strip()
    if not text:
        return text
    core = text
    if "=" in core:
        core = core.split("=")[-1].strip()
    # 中文全角等号兼容
    if "＝" in core:
        core = core.split("＝")[-1].strip()
    unified = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "㎡", core)
    unified = unified.replace("m²", "㎡").replace("M²", "㎡")
    unified = re.sub(r"\s*㎡", "㎡", unified)
    return unified or text


def _normalize_canonical_field_value(field: str, value: str) -> str:
    f = str(field or "").strip()
    v = str(value or "").strip()
    if not v:
        return v
    if f == "占地面积":
        return _normalize_land_area_value(v)
    return v


def _extract_value_by_alias_from_text(text: str, aliases: list[str]) -> str:
    src = str(text or "").strip()
    if not src:
        return ""
    for alias in aliases:
        a = str(alias or "").strip()
        if not a:
            continue
        # 支持“字段名：值”或“字段名:值”，值截取到常见分隔符前。
        pattern = rf"{re.escape(a)}\s*[：:]\s*([^\n；;，,。]+)"
        m = re.search(pattern, src, flags=re.IGNORECASE)
        if m:
            val = str(m.group(1) or "").strip()
            if val and not _is_missing_like(val):
                return val
    return ""


def _is_valid_value_for_field(field: str, value: str, row_key: str = "", col_key: str = "") -> bool:
    f = str(field or "").strip()
    v = str(value or "").strip()
    rk = str(row_key or "").strip().lower()
    ck = str(col_key or "").strip().lower()
    if not v or _is_missing_like(v):
        return False
    if f != "建设投资":
        return True

    # “建设投资”仅接受金额口径，过滤收益率/回收期等财务指标，避免把 2.89 这类比率误填入。
    key_text = f"{rk} {ck}"
    if any(x in key_text for x in ["收益率", "irr", "回收期", "净现值", "百分点", "利润率"]):
        return False
    if re.search(r"%|％|‰", v):
        return False
    if re.search(r"(收益率|回收期|净现值|利润率|irr)", v, flags=re.IGNORECASE):
        return False

    has_amount_unit = bool(re.search(r"(万元|万|亿元|亿元人民币|元)", v))
    number_match = re.search(r"\d+(?:\.\d+)?", v)
    if has_amount_unit:
        return True
    if not number_match:
        return False

    # 无单位纯数字时，过小值大概率是比率而非投资金额（如 2.89）。
    num = float(number_match.group(0))
    return num >= 100


def _extract_canonical_fields(
    section_title: str,
    elements: list[dict],
    *,
    section: ReportTemplateSection | None = None,
) -> dict[str, str]:
    """Pick one value per expected field from the element rows.

    For each expected field the rows are scanned in order; the first row that
    yields a valid value wins, either because its row/col key mentions an
    alias or because its value text embeds an "alias: value" pair. Fields
    without a hit are reported as "待补充". Returns {} when the section has no
    expected fields.
    """
    expected = _section_expected_fields(section_title, section)
    if not expected:
        return {}
    rows = elements if isinstance(elements, list) else []

    def _pick(field: str) -> str:
        labels = [str(a).strip() for a in _field_aliases(field) if str(a).strip()]
        labels_lower = [label.lower() for label in labels]
        for row in rows:
            if not isinstance(row, dict):
                continue
            row_key = str(row.get("rowKey") or "")
            col_key = str(row.get("colKey") or "")
            cell = str(row.get("value") or "").strip()
            if _is_missing_like(cell):
                continue
            keys = f"{row_key} {col_key}".lower()
            key_hit = any(label in keys for label in labels_lower)
            if key_hit and _is_valid_value_for_field(field, cell, row_key=row_key, col_key=col_key):
                return cell
            # Generic row/col keys: try "field: value" inside the value text.
            embedded = _extract_value_by_alias_from_text(cell, labels)
            if embedded and _is_valid_value_for_field(field, embedded, row_key=row_key, col_key=col_key):
                return embedded
        return ""

    return {
        field: _normalize_canonical_field_value(field, _pick(field)) or "待补充"
        for field in expected
    }


def _render_canonical_fields_for_prompt(
    evidence: dict,
    *,
    allowed_fields: list[str] | None = None,
) -> str:
    canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else None
    if not isinstance(canonical, dict) or not canonical:
        return "无字段级已抽取结果。"
    allowed_set = {str(f).strip() for f in (allowed_fields or []) if str(f).strip()}
    lines: list[str] = []
    for field, value in canonical.items():
        f = str(field or "").strip()
        if allowed_set and f not in allowed_set:
            continue
        v = _normalize_canonical_field_value(f, str(value or "").strip()) or "待补充"
        if not f:
            continue
        lines.append(f"- {f}: {v}")
    return "\n".join(lines) if lines else "无字段级已抽取结果。"


def _extract_field_value_from_docs(field: str, docs: list[dict]) -> str:
    """Try to recover a field value from document snippets.

    Each dict document contributes "heading\\ncontent" (capped at 8000
    characters). First an "alias: value" pair is searched across all
    documents; for "建设内容" only, a looser "alias 为/包括/包含 ..." sentence
    pattern (12-420 characters) is tried afterwards. Returns the normalised
    value or "".
    """
    labels = [str(a).strip() for a in _field_aliases(field) if str(a).strip()]
    if not labels or not isinstance(docs, list):
        return ""

    corpus: list[str] = []
    for doc in docs:
        if not isinstance(doc, dict):
            continue
        heading = str(doc.get("heading") or "").strip()
        body = str(doc.get("content") or "").strip()
        joined = f"{heading}\n{body}".strip()
        if joined:
            corpus.append(joined[:8000])

    # "field: value" pairs first: they hit often and are the most reliable.
    for passage in corpus:
        direct = _extract_value_by_alias_from_text(passage, labels)
        if direct and not _is_missing_like(direct):
            return _normalize_canonical_field_value(field, direct)

    if field != "建设内容":
        return ""

    # Construction content is usually written as a sentence, not a key/value.
    for passage in corpus:
        for label in labels:
            hit = re.search(
                rf"{re.escape(label)}\s*(?:为|包括|包含|主要包括)\s*([^\n。]{{12,420}})",
                passage,
                flags=re.IGNORECASE,
            )
            if not hit:
                continue
            loose = str(hit.group(1) or "").strip(" ：:;；，,")
            if loose and not _is_missing_like(loose):
                return _normalize_canonical_field_value(field, loose)
    return ""


def _merge_canonical_fields_from_docs(
    section_title: str,
    evidence: dict,
    canonical: dict[str, str],
    *,
    section: ReportTemplateSection | None = None,
) -> dict[str, str]:
    """Fill still-missing canonical fields from the evidence documents.

    Returns ``canonical`` unchanged when the section has no expected fields or
    the evidence is not a dict. Otherwise a normalised copy is returned in
    which every expected field that is missing gets the value recovered from
    "chapterDocs" and, where permitted, "keywordDocs".

    For "1.1项目基本情况" keyword documents are used only when fewer than 4 of
    the first 80 element rows carry a real value: per the original note the
    chapter's own element table has priority, and cross-document matches could
    otherwise pollute fields such as the investment amount.
    """
    expected = _section_expected_fields(section_title, section)
    if not expected or not isinstance(evidence, dict):
        return canonical

    merged = {
        str(key): _normalize_canonical_field_value(str(key), str(val))
        for key, val in dict(canonical or {}).items()
    }

    chapter_docs = evidence.get("chapterDocs")
    keyword_docs = evidence.get("keywordDocs")
    sources: list[dict] = list(chapter_docs) if isinstance(chapter_docs, list) else []

    def _has_real_value(cell: Any) -> bool:
        if not isinstance(cell, dict):
            return False
        cell_value = str(cell.get("value") or "").strip()
        return bool(cell_value) and not _is_missing_like(cell_value)

    use_keyword_docs = True
    if "1.1项目基本情况" in re.sub(r"\s+", "", str(section_title or "")):
        cells = evidence.get("elements") if isinstance(evidence.get("elements"), list) else []
        filled = sum(1 for cell in cells[:80] if _has_real_value(cell))
        # Enough populated cells means the element table is not empty, so
        # keyword documents must not take part in the backfill.
        use_keyword_docs = filled < 4

    if use_keyword_docs and isinstance(keyword_docs, list):
        sources.extend(keyword_docs)
    if not sources:
        return merged

    for field in expected:
        existing = str(merged.get(field) or "").strip()
        if existing and not _is_missing_like(existing):
            continue
        recovered = _extract_field_value_from_docs(field, sources)
        if recovered and not _is_missing_like(recovered):
            merged[field] = _normalize_canonical_field_value(field, recovered)
    return merged


def _apply_canonical_field_backfill(
    section: ReportTemplateSection,
    evidence: dict,
    content: str,
) -> str:
    """Replace "待补充" placeholders in the content with extracted field values.

    Canonical fields come from evidence["canonicalFields"] (or are extracted
    from the element rows when absent) and are then completed from the
    evidence documents. For every expected field with a real value:

      1. lines containing the field name followed by a colon and "待补充" are
         rewritten with the value (tolerates numbering/bold wrappers);
      2. "label: 待补充" is rewritten for each alias of the field;
      3. if the value still does not occur in the text although the field has
         a "field:" line, an explicit "field：value" line is appended.

    Replacements are performed with callables rather than replacement
    templates: ``re.sub`` interprets backslashes and group references in a
    string replacement, so an evidence value containing a backslash used to
    raise ``re.error`` or be rewritten incorrectly.
    """
    text = str(content or "")
    canonical = evidence.get("canonicalFields") if isinstance(evidence, dict) else {}
    if not isinstance(canonical, dict) or not canonical:
        elements = evidence.get("elements") if isinstance(evidence, dict) else []
        canonical = _extract_canonical_fields(
            section.section_title,
            elements if isinstance(elements, list) else [],
            section=section,
        )
    canonical = _merge_canonical_fields_from_docs(
        section.section_title, evidence, canonical, section=section
    )
    if not canonical:
        return text

    repaired = text
    for field in _section_expected_fields(section.section_title, section):
        value = str(canonical.get(field) or "").strip()
        if _is_missing_like(value):
            continue

        # Broad pass: "<anything>field<anything>: 待补充" on one line.
        broad_pattern = rf"(^.*{re.escape(field)}.*?[：:]\s*)待补充(?:\s|$)"
        repaired = re.sub(
            broad_pattern,
            lambda m, v=value: f"{m.group(1)}{v}\n",
            repaired,
            flags=re.MULTILINE,
        )

        # Narrow pass: exact "label: 待补充" for every alias of the field.
        labels = list(dict.fromkeys([x for x in _field_aliases(field) if str(x).strip()]))
        for label in labels:
            pattern = rf"({re.escape(label)}\s*[：:]\s*)待补充\b"
            repaired = re.sub(pattern, lambda m, v=value: m.group(1) + v, repaired)

        # The model kept a "field:" line but the value never made it into the
        # text: append an explicit key/value line so it is not lost.
        if value not in repaired and re.search(rf"{re.escape(field)}\s*[：:]", repaired):
            repaired += f"\n{field}：{value}"
    return repaired


def _build_field_diagnostics(section: ReportTemplateSection, evidence: dict, content: str) -> list[dict[str, Any]]:
    """Report, per expected field, what was extracted and whether it was used.

    For each expected field up to 5 candidate values are collected from the
    element rows (by key alias, or by an "alias: value" pair in the value),
    truncated to 120 characters and de-duplicated. The status is one of
    "used", "extracted_but_missing_in_content", "extracted_but_not_matched",
    "not_extracted_and_missing" or "unknown". Returns [] when the section has
    no expected fields.

    NOTE(review): the "待补充" probe runs with DOTALL, so any later "待补充" in
    the content marks an earlier field as missing — confirm that is intended.
    """
    expected = _section_expected_fields(section.section_title, section)
    if not expected:
        return []
    rows = evidence.get("elements") if isinstance(evidence, dict) else []
    if not isinstance(rows, list):
        rows = []
    body = str(content or "")

    report: list[dict[str, Any]] = []
    for field in expected:
        labels = [str(a).strip() for a in _field_aliases(field) if str(a).strip()]
        labels_lower = [label.lower() for label in labels]

        found: list[str] = []
        for row in rows:
            if not isinstance(row, dict):
                continue
            cell = str(row.get("value") or "").strip()
            if _is_missing_like(cell):
                continue
            keys = f"{str(row.get('rowKey') or '')} {str(row.get('colKey') or '')}".lower()
            if any(label in keys for label in labels_lower):
                found.append(cell[:120])
            else:
                embedded = _extract_value_by_alias_from_text(cell, labels)
                if embedded:
                    found.append(embedded[:120])
            if len(found) >= 5:
                break

        distinct = list(dict.fromkeys(found))
        present = any((not _is_missing_like(v)) and v in body for v in distinct)
        flagged_missing = bool(
            re.search(
                rf"{re.escape(field)}\s*[：:].*?待补充",
                body,
                flags=re.IGNORECASE | re.DOTALL,
            )
        )

        if distinct:
            if present:
                status = "used"
            elif flagged_missing:
                status = "extracted_but_missing_in_content"
            else:
                status = "extracted_but_not_matched"
        elif flagged_missing:
            status = "not_extracted_and_missing"
        else:
            status = "unknown"

        report.append(
            {
                "field": field,
                "extractedValues": distinct,
                "contentHasExtractedValue": present,
                "contentMarkedMissing": flagged_missing,
                "status": status,
            }
        )
    return report


def _section_output_contract(section_title: str) -> str:
    """Look up the built-in output contract for a section title.

    The section number parsed from the title selects an entry of
    ``SECTION_OUTPUT_CONTRACTS``; unknown numbers get the default contract.
    """
    number = _extract_section_number(str(section_title or ""))
    return (
        SECTION_OUTPUT_CONTRACTS[number]
        if number in SECTION_OUTPUT_CONTRACTS
        else DEFAULT_SECTION_OUTPUT_CONTRACT
    )


def _effective_section_output_contract(section: ReportTemplateSection) -> str:
    raw = getattr(section, "section_output_contract", None)
    if isinstance(raw, str) and raw.strip():
        return raw.strip()
    return _section_output_contract(section.section_title or "")


def _section_requires_tables(section_title: str, *, contract_text: str | None = None) -> bool:
    """判断章节合同是否包含【表格强制要求】，决定该节是否允许出现表格。"""
    c = (str(contract_text or "").strip() or _section_output_contract(section_title))
    return "表格强制要求" in c


def _strip_tables_from_non_table_section(
    section_title: str,
    content: str,
    *,
    section: ReportTemplateSection | None = None,
) -> str:
    """Remove Markdown tables from a section whose contract does not ask for any.

    Content is returned untouched when the contract contains "表格强制要求" or
    when the content is empty. Otherwise every line that starts and ends with
    "|" is dropped; a "###" heading mentioning "表" directly above a table and
    a "<!-- 表格来源" comment directly below it are dropped as well. The
    result is stripped.
    """
    if section is None:
        contract = _section_output_contract(section_title)
    else:
        contract = _effective_section_output_contract(section)
    if "表格强制要求" in contract or not content:
        return content

    kept: list[str] = []
    inside_table = False
    for raw_line in content.split("\n"):
        bare = raw_line.strip()
        # Separator rows (|---|:--|) also start and end with "|", so this one
        # test covers both table rows and separators.
        if bare.startswith("|") and bare.endswith("|"):
            if not inside_table:
                inside_table = True
                # Drop the table caption heading sitting right above the table.
                if kept and kept[-1].strip().startswith("###") and "表" in kept[-1]:
                    kept.pop()
            continue
        if inside_table:
            inside_table = False
            # First line after a table: skip the table-source comment.
            if bare.startswith("<!-- 表格来源"):
                continue
        kept.append(raw_line)
    return "\n".join(kept).strip()


# Table blocks that must be removed from generated content, keyed by section
# number. Each entry is (section number, compiled pattern); the pattern is
# applied only to the section whose number equals the first element.
_FORBIDDEN_TABLE_PATTERNS: list[tuple[str, re.Pattern]] = [
    (
        "2.1.1",
        re.compile(
            # A whole line mentioning "表2.6-1" (optional spaces/dots between the
            # digits, several dash variants), including its line break ...
            r"(?:^|\n)[^\n]*?表\s*2[\.\s]*6[-－—–]\s*1[^\n]*\n"
            # ... optionally followed by a run of Markdown pipe rows or by one
            # <table>...</table> block.
            r"(?:(?:\s*\|[^\n]+\|\s*\n)+|(?:\s*<table>[\s\S]*?</table>))?",
            flags=re.IGNORECASE,
        ),
    ),
]


def _strip_forbidden_tables(section_title: str, content: str) -> str:
    """移除 2.1.1 中误粘贴的安评类「表2.6-1 + 原料选择加氢/工艺技术对比」整段（不误删「原料数量及组成」模版主表）。"""
    if not content:
        return content
    section_no = _extract_section_number(section_title)
    for match_section, pat in _FORBIDDEN_TABLE_PATTERNS:
        if section_no == match_section:
            content = pat.sub("", content)
    return content.strip()


def _extract_section_number(section_title: str) -> str:
    m = re.match(r"^\s*(\d+(?:\.\d+)*)", str(section_title or ""))
    return m.group(1) if m else ""


def _major_chapter_number(section_no: str) -> str:
    no = str(section_no or "").strip()
    if not no:
        return ""
    return no.split(".", 1)[0]


def _build_prior_sibling_sections_text(
    section: ReportTemplateSection,
    ordered_sections: list[ReportTemplateSection],
    completed_contents: dict[str, str],
    *,
    max_chars_per_section: int = 2500,
    max_total_chars: int = 10000,
) -> str:
    """Concatenate already-generated text of earlier sections in the same major chapter.

    ``ordered_sections`` is walked until an entry whose ``section_order`` is not
    lower than the current section's. Entries with the same major chapter number
    and non-empty text in ``completed_contents`` (looked up by ``section_key``)
    are emitted as ``### <title>`` blocks. Each body is cut at
    ``max_chars_per_section`` characters (a truncation marker is appended), and
    collection stops before the summed block lengths would exceed
    ``max_total_chars``.

    Returns the blocks joined by blank lines, or ``""`` when the current section
    has no number.
    """
    own_no = _extract_section_number(section.section_title or "")
    own_major = _major_chapter_number(own_no)
    if not (own_major and own_no):
        return ""

    excerpts: list[str] = []
    budget_used = 0
    for earlier in ordered_sections or []:
        if earlier.section_order >= section.section_order:
            break
        earlier_no = _extract_section_number(earlier.section_title or "")
        same_chapter = bool(earlier_no) and _major_chapter_number(earlier_no) == own_major
        if not same_chapter:
            continue
        text = str(completed_contents.get(earlier.section_key) or "").strip()
        if not text:
            continue
        if len(text) > max_chars_per_section:
            excerpt = text[:max_chars_per_section] + "\n…（下文已截断）"
        else:
            excerpt = text
        heading = str(earlier.section_title or earlier_no).strip()
        entry = f"### {heading}\n{excerpt}"
        if budget_used + len(entry) > max_total_chars:
            break
        excerpts.append(entry)
        budget_used += len(entry)
    return "\n\n".join(excerpts)


def _build_section_title_map(sections: list[ReportTemplateSection]) -> dict[str, str]:
    """Map each section number to its stripped title.

    Sections whose title has no leading number are left out; when a number
    appears more than once, the last title wins.
    """
    title_by_number: dict[str, str] = {}
    for item in sections or []:
        clean_title = str(item.section_title or "").strip()
        number = _extract_section_number(clean_title)
        if number:
            title_by_number[number] = clean_title
    return title_by_number


def _resolve_ancestor_titles_for_section(
    section: ReportTemplateSection,
    chapter_title_map: dict[str, str],
) -> list[str]:
    """Return the parent titles to place above a section, outermost first.

    Only a section whose last number component is ``1`` gets ancestors. The walk
    climbs one level at a time, collects each parent title found in
    ``chapter_title_map``, and stops after the first parent that is not itself
    a ``.1`` entry.
    """
    own_title = str(section.section_title or "").strip()
    segments = _extract_section_number(own_title).split(".")
    # A single segment means "no number" or a top-level chapter: nothing to add.
    if len(segments) < 2 or segments[-1] != "1":
        return []

    chain: list[str] = []
    depth = len(segments) - 1
    while depth >= 1:
        parent_no = ".".join(segments[:depth])
        parent_title = str(chapter_title_map.get(parent_no) or "").strip()
        if parent_title:
            chain.append(parent_title)
        if segments[depth - 1] != "1":
            break
        depth -= 1
    chain.reverse()
    return chain


def _expected_child_headings(section_no: str) -> list[str]:
    n = str(section_no or "").strip()
    mapping: dict[str, list[str]] = {
        "2": ["2.1", "2.2", "2.3", "2.4", "2.5", "2.6"],
        "2.1": ["2.1.1", "2.1.2", "2.1.3", "2.1.4", "2.1.5", "2.1.6", "2.1.7"],
        "2.2": ["2.2.1", "2.2.2", "2.2.3", "2.2.4"],
        "2.4": ["2.4.1", "2.4.2", "2.4.3", "2.4.4"],
        "3": ["3.1", "3.2", "3.3", "3.4", "3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11"],
        "3.3": ["3.3.1", "3.3.2", "3.3.3", "3.3.4"],
        "3.4": ["3.4.1", "3.4.2"],
        "4": ["4.1", "4.2", "4.3", "4.4"],
        "4.3": ["4.3.1", "4.3.2", "4.3.3", "4.3.4", "4.3.5", "4.3.6"],
        "5": ["5.1", "5.2", "5.3", "5.4", "5.5"],
        "5.2": ["5.2.1", "5.2.2", "5.2.3", "5.2.4"],
        "5.3": ["5.3.1", "5.3.2"],
        "6": ["6.1", "6.2"],
        "6.1": ["6.1.1", "6.1.2", "6.1.3", "6.1.4", "6.1.5"],
        "6.2": ["6.2.1", "6.2.2", "6.2.3", "6.2.4"],
        "7": ["7.1", "7.2", "7.3"],
        "7.1": ["7.1.1", "7.1.2"],
    }
    return mapping.get(n, [])


def _chapter_generation_max_tokens(section_no: str) -> int:
    """
    默认输出上限略高于历史 3500，避免长段中文在尾部截断后与下一小节标题粘在一段内。
    三节编号小节（如 2.4.3）常与多要素叙述叠加，更易触顶。
    """
    if not section_no:
        return 4096
    parts = section_no.split(".")
    if len(parts) >= 3:
        return 6000
    return 4096


def _split_inline_template_headings(content: str, chapter_title_map: dict[str, str]) -> str:
    """
    将「正文末尾与下一小节标题挤在同一物理行」的情况拆开（常见于输出触顶截断或模型漏换行）。
    仅拆分 chapter_title_map 中存在的三节及以上编号（如 2.4.4），且要求标题后文字与模板标题首字一致，降低误判。
    """
    if not content or not chapter_title_map:
        return content
    valid = {
        no
        for no in chapter_title_map
        if no and no.count(".") >= 2 and re.fullmatch(r"\d+(?:\.\d+)*", no)
    }
    if not valid:
        return content

    def split_line_once(line: str) -> tuple[str, bool]:
        # 小节编号前一版可能挤在句号/括号后且无空格（如「证明了2.4.4 初步……」）。
        ms = list(
            re.finditer(
                r"(?:^|(?<=[\u4e00-\u9fff。；;：:）\)」』\]\.\!\?]))\s*"
                r"([1-9]\d{0,2}\.\d{1,2}\.\d{1,3})(?:[ \t\u3000]|　)+",
                line,
            )
        )
        for m in ms:
            num = m.group(1)
            if num not in valid:
                continue
            start = m.start(1)
            prefix = line[:start]
            if not prefix.strip():
                continue
            full_title = str(chapter_title_map.get(num) or "").strip()
            if not full_title:
                continue
            tail_m = re.match(rf"^\s*{re.escape(num)}\s+(.+)$", full_title)
            name_part = tail_m.group(1).strip() if tail_m else ""
            if len(name_part) < 2:
                continue
            after = line[m.end() : m.end() + min(48, len(name_part) + 8)]
            if after and after[0] != name_part[0]:
                continue
            head = prefix.rstrip()
            rest = line[start:].lstrip()
            return f"{head}\n\n{rest}", True
        return line, False

    new_lines: list[str] = []
    for raw_line in content.split("\n"):
        cur = raw_line
        while True:
            nxt, changed = split_line_once(cur)
            if not changed:
                new_lines.append(cur)
                break
            chunks = nxt.split("\n\n", 1)
            new_lines.append(chunks[0])
            if len(chunks) > 1:
                new_lines.append("")
                cur = chunks[1]
            else:
                cur = ""
    return "\n".join(new_lines)


def _split_glued_template_heading_body(content: str, chapter_title_map: dict[str, str]) -> str:
    """
    小节标题与正文挤在同一行、中间无换行（如「2.4.4 初步设计审查工作评价2017年12月……」）时，
    在模板规定的标题尾部与后续正文之间插入空行，便于 Markdown 将标题行与正文分开渲染。
    """
    if not content or not chapter_title_map:
        return content
    valid = sorted(
        (
            no
            for no in chapter_title_map
            if no and no.count(".") >= 1 and re.fullmatch(r"\d+(?:\.\d+)*", no)
        ),
        key=len,
        reverse=True,
    )
    if not valid:
        return content

    def split_one_line(line: str) -> tuple[str, bool]:
        for num in valid:
            full_title = str(chapter_title_map.get(num) or "").strip()
            if not full_title:
                continue
            tail_m = re.match(rf"^\s*{re.escape(num)}\s+(.+)$", full_title)
            if not tail_m:
                continue
            name_part = tail_m.group(1).strip()
            if len(name_part) < 2:
                continue
            m = re.match(
                rf"^\s*(?:#\s*){{0,6}}"
                rf"{re.escape(num)}(?:\s+|　)+{re.escape(name_part)}",
                line,
            )
            if not m:
                continue
            rest = line[m.end() :].lstrip(" \t\u3000")
            if not rest:
                continue
            if rest[0] in "\r\n":
                continue
            if not (rest[0].isdigit() or "\u4e00" <= rest[0] <= "\u9fff"):
                continue
            return line[: m.end()].rstrip() + "\n\n" + rest, True
        return line, False

    out_lines: list[str] = []
    for raw in content.split("\n"):
        cur = raw
        while True:
            nxt, ok = split_one_line(cur)
            if not ok:
                out_lines.append(cur)
                break
            parts = nxt.split("\n\n", 1)
            out_lines.append(parts[0])
            if len(parts) > 1:
                out_lines.append("")
                cur = parts[1]
            else:
                cur = ""
    return "\n".join(out_lines)


def _ensure_heading_lines_separated(content: str) -> str:
    """确保编号标题行（如 '2.4.4  初步设计审查工作评价'）前后各有空行。"""
    if not content:
        return content
    lines = content.split("\n")
    result: list[str] = []
    heading_re = re.compile(r"^\s{0,3}#{0,6}\s*\d+(?:\.\d+)+\s+[\u4e00-\u9fff]")
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped and heading_re.match(stripped):
            if result and result[-1].strip() != "":
                result.append("")
            result.append(line)
            if i + 1 < len(lines) and lines[i + 1].strip() != "":
                result.append("")
        else:
            result.append(line)
    return "\n".join(result)


def _auto_append_missing_child_headings(content: str, section_no: str) -> tuple[str, list[str]]:
    """Append placeholder blocks for expected child headings that are absent.

    Args:
        content: Section text to inspect.
        section_no: Number of the section; it selects the expected child
            heading numbers via ``_expected_child_headings``.

    Returns:
        ``(text, missing)``. When the section has no expected children the
        original ``content`` is returned untouched with an empty list. When all
        children are present, the right-stripped text and an empty list are
        returned. Otherwise the text with one ``missing_child_heading_markdown``
        block per missing number appended, plus the list of missing numbers.
    """
    expected = _expected_child_headings(section_no)
    if not expected:
        return content, []
    text = str(content or "").rstrip()

    def heading_present(heading_no: str) -> bool:
        # Accepts "# 3.1 xxx", "3.1 xxx", "### 3.1.1 xxx" and similar forms.
        # The quantifier braces must be doubled inside the f-string: a bare
        # "{0,6}" is evaluated as the tuple (0, 6) and yields the pattern
        # "#(0, 6)", which never matches a real heading.
        return bool(
            re.search(
                rf"(?m)^\s{{0,3}}#{{0,6}}\s*{re.escape(heading_no)}(?:\s|$)",
                text,
            )
        )

    missing = [heading_no for heading_no in expected if not heading_present(heading_no)]
    if not missing:
        return text, []
    blocks = [text] if text else []
    # Blocks are joined without a separator, as before.
    # NOTE(review): presumably missing_child_heading_markdown supplies its own
    # leading newlines - confirm against its definition.
    blocks.extend(missing_child_heading_markdown(heading_no) for heading_no in missing)
    repaired = "".join(blocks).strip()
    return repaired, missing


def _restore_required_tables_safety_net(
    content: str,
    required_tables: list[str],
    evidence: dict,
    pre_postprocess_content: str,
) -> str:
    """Put back required tables that are no longer in the content.

    For each required token that is not found in the current text:

    * if an authoritative block exists for it and the text still has a caption
      line for the token, the caption stub is replaced by that block;
    * otherwise, if the pre-post-processing snapshot contains the token, its
      table block is extracted from the snapshot and inserted at the position
      given by ``_find_table_insert_position``, or appended when there is none.

    The input is returned unchanged when there are no required tables or when
    either text is empty. Otherwise runs of three or more newlines are folded
    to two and the result is stripped.
    """
    if not required_tables:
        return content
    current = str(content or "")
    snapshot = str(pre_postprocess_content or "")
    if not (current and snapshot):
        return content

    for required_token in required_tables:
        if _table_token_exists(current, required_token):
            continue
        authoritative = _authoritative_block_for_required_table(required_token, evidence)
        if authoritative and _table_token_caption_line_re(required_token).search(current):
            current = _replace_caption_stub_with_authoritative_table(
                current, required_token, authoritative
            )
            continue
        if not _table_token_exists(snapshot, required_token):
            continue
        recovered = _extract_table_block_from_content(snapshot, required_token)
        if not recovered:
            continue
        anchor = _find_table_insert_position(current, required_token, required_tables)
        if anchor is None:
            current = current.rstrip() + "\n\n" + recovered
        else:
            current = "".join((current[:anchor], "\n\n", recovered, "\n\n", current[anchor:]))
    return re.sub(r"\n{3,}", "\n\n", current).strip()


def _extract_table_block_from_content(content: str, token: str) -> str:
    """从内容中提取 token 对应的表块（表题行 + 可选注释行 + 管道行）。"""
    token_plain = re.sub(r"\s+", "", str(token or ""))
    if not token_plain:
        return ""
    token_re = re.escape(token_plain).replace(r"\-", r"[-－—–]")
    pat = re.compile(
        r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n"
        r"(?:\n|[ \t]*<!--[^\n]*-->[ \t]*\n)*"
        r"(?:[ \t]*\|[^\n]*\|[ \t]*\n)+)",
        flags=re.IGNORECASE,
    )
    m = pat.search(content)
    return m.group(0).strip() if m else ""


def _find_table_insert_position(content: str, token: str, required_tables: list[str]) -> int | None:
    """Find the offset in ``content`` where the table for ``token`` should go.

    The table goes directly before the caption line of the first later entry
    of ``required_tables`` that occurs in ``content``. ``None`` is returned
    when the token is not in the list or no later caption is found.
    """
    own_index = next(
        (
            idx
            for idx, candidate in enumerate(required_tables)
            if _norm_table_token(candidate) == _norm_table_token(token)
        ),
        None,
    )
    if own_index is None:
        return None

    for successor in required_tables[own_index + 1:]:
        compact = re.sub(r"\s+", "", str(successor or ""))
        if not compact:
            continue
        successor_re = re.escape(compact).replace(r"\-", r"[-－—–]")
        caption = re.search(
            r"(?:^|\n)([^\n]*?" + successor_re + r"[^\n]*)\n",
            content,
            flags=re.IGNORECASE,
        )
        if caption is None:
            continue
        offset = caption.start()
        # Step past the newline that opens the match so the insert lands at
        # the start of the caption line.
        if offset > 0 and content[offset] == "\n":
            offset += 1
        return offset
    return None


def _enforce_required_tables(
    section: ReportTemplateSection,
    prompt: str,
    content: str,
    evidence: dict,
) -> tuple[str, list[str]]:
    """Make sure every table required by the section is present in the content.

    Required tokens come from the section prompt and its effective output
    contract. The repair steps run in a fixed order: authoritative tables,
    structured tables, an LLM repair call, then minimal placeholder tables.
    Each later step only runs when tokens are still missing. Cross-section
    cleanup, an integrity check, caption-stub filling and two dedupe passes
    follow.

    Returns:
        ``(content, missing)`` where ``missing`` lists the required tokens that
        are still not found. With no required tokens the content is returned
        unchanged with an empty list.
    """
    required = _extract_required_table_tokens(
        section.section_prompt or "",
        _extract_section_number(section.section_title or ""),
        contract_text=_effective_section_output_contract(section),
    )
    if not required:
        return content, []

    def absent_from(text: str) -> list[str]:
        return [token for token in required if not _table_token_exists(text, token)]

    # Authoritative tables go first so the model cannot rewrite structured data.
    result = _append_authoritative_required_tables(content, required, evidence)
    pending = absent_from(result)
    if pending:
        result = _append_structured_missing_tables(result, pending, evidence)
        pending = absent_from(result)
        if pending:
            result = _repair_missing_tables(section, prompt, result, pending, evidence)
        pending = absent_from(result)
        if pending:
            result = _append_minimal_missing_tables(result, pending)

    # Remove tables that leaked in from a neighbouring section (4.3.2 / 4.3.3).
    result = _remove_cross_section_table_pollution(section.section_title or "", result)
    # A table can be "present" yet have an incomplete body; re-check integrity.
    result = _ensure_required_structured_tables_integrity(result, required, evidence)
    # Dedupe even when nothing was missing, so a model-written table and the
    # authoritative one do not coexist.
    result = _finalize_section_table_dedupe(result, required)
    result = _fill_required_table_caption_stubs(result, required, evidence)
    result = _finalize_section_table_dedupe(result, required)
    return result, absent_from(result)


def _extract_required_table_tokens(
    section_prompt: str,
    section_no: str = "",
    *,
    contract_text: Optional[str] = None,
) -> list[str]:
    """
    从模板 section_prompt 与章节输出合同（section_output_contracts）中抽取「表 x-x / 附表 x」，
    使合同内写死的「见表2-3」等也能触发 _append_authoritative_required_tables 要素直出。

    严格规则：
    - 仅当合同中包含「【表格强制要求】」标签时，才提取正文表（表x-x）。
    - 「见附表N」「附表N~附表M」等仅为引用语，不视为本节必需内嵌的表格（含区间端点及中间附表）。
    - 附图与附表在正文之后由 _append_report_appendices 统一汇总（附图在上、附表在下）。
    """
    parts = [str(section_prompt or "").strip(), str(contract_text or "").strip()]
    text = "\n".join(p for p in parts if p)
    if not text:
        return []

    has_table_mandate = "表格强制要求" in text

    if not has_table_mandate:
        return []

    # 剔除「【禁止】」段落，避免将禁止示例中的表号（如"表2.6-1"）误判为必需表。
    text_for_extraction = re.sub(
        r"【禁止】.*?(?=【|$)", "", text, flags=re.DOTALL,
    )

    raw = re.findall(
        r"(附表\s*\d+(?:\s*[.\-－]\s*\d+)*|表\s*\d+(?:\s*[.\-－]\s*\d+)*)",
        text_for_extraction,
    )
    out: list[str] = []
    seen = set()
    chapter_no = ""
    m_sec = re.match(r"^\s*(\d+)", str(section_no or ""))
    if m_sec:
        chapter_no = m_sec.group(1)

    _REF_ONLY_PATTERN = re.compile(
        r"(?:见|详见|参见|参照|详)\s*附表\s*\d+",
    )
    ref_only_appendices: set[str] = set()
    for m in _REF_ONLY_PATTERN.finditer(text):
        tok_in_ref = re.findall(r"(附表\s*\d+(?:\s*[.\-－]\s*\d+)*)", m.group())
        for t in tok_in_ref:
            ref_only_appendices.add(re.sub(r"\s+", "", t))

    for tok in raw:
        norm = re.sub(r"\s+", "", tok)
        if norm.startswith("附表") and norm in ref_only_appendices:
            continue
        if chapter_no:
            m_tok = re.match(r"^(?:附表|表)\s*(\d+)", norm)
            if m_tok:
                tail = norm[m_tok.end() :]
                # 「表1」「表2」等为节内顺序号，首位数字不等于章号（如第二章下的表1）；仅对「表2-3」「表2.6-1」等带子级编号的表号按章首数字过滤。
                if tail and tail[0] in ".-－—–":
                    if m_tok.group(1) != chapter_no:
                        continue
        if norm in seen:
            continue
        seen.add(norm)
        out.append(norm)

    # 5.3.2 合同正文仅允许表5-5、表5-6；模板示例里若夹带「附表8」等，一律不纳入必需表，避免要素直出串表。
    if str(section_no or "").strip() == "5.3.2":
        allow_532 = {_norm_table_token("表5-5"), _norm_table_token("表5-6")}
        out = [t for t in out if _norm_table_token(t) in allow_532]
    # 7.1.2 仅内嵌表7-1；合同/模板中若夹带其他章表号，不纳入本节必需表。
    if str(section_no or "").strip() == "7.1.2":
        allow_712 = {_norm_table_token("表7-1")}
        out = [t for t in out if _norm_table_token(t) in allow_712]
    return out[:20]


def _repair_missing_tables(
    section: ReportTemplateSection,
    prompt: str,
    content: str,
    missing_tables: list[str],
    evidence: dict,
) -> str:
    """Request a rewrite of the content that includes the missing tables.

    A repair prompt is built from the section title, the original prompt, the
    current content, the missing table tokens and the JSON-serialized evidence,
    then sent through ``chat_completions_json``.

    Returns:
        The stripped ``content`` field of the response, or the input content
        when that field is empty.
    """
    repair_request = build_repair_missing_tables_prompt(
        section_title=section.section_title,
        original_prompt=prompt,
        content=content,
        missing_tables=missing_tables,
        evidence_json=json.dumps(evidence, ensure_ascii=False),
    )
    call_options = {"temperature": 0.1, "max_tokens": 2200, "timeout_sec": 120}
    response = chat_completions_json(
        system_prompt=repair_missing_tables_system_prompt(),
        user_prompt=repair_request,
        log_context=f"补缺失表格 section_key={section.section_key} | {section.section_title}",
        **call_options,
    )
    candidate = str(response.get("content") or "").strip()
    if candidate:
        return candidate
    return content


def _append_minimal_missing_tables(content: str, missing_tables: list[str]) -> str:
    """Append one ``MINIMAL_MISSING_TABLE_TEMPLATE`` rendering per missing table.

    The right-stripped content and the rendered templates are concatenated
    without a separator, and the result is stripped.
    """
    base = content.rstrip()
    rendered = [
        MINIMAL_MISSING_TABLE_TEMPLATE.format(
            table_name=_normalize_table_caption_number_name_gap(str(name or "").strip())
        )
        for name in missing_tables
    ]
    return "".join([base, *rendered]).strip()


def _remove_cross_section_table_pollution(section_title: str, content: str) -> str:
    """Strip tables that belong to the sibling section from 4.3.2 / 4.3.3.

    Section 4.3.2 must not contain "装置运行分析" tables and 4.3.3 must not
    contain "投产以来运行周期统计表". For the matching keyword three passes
    run in order, each replacing its match with a newline: caption plus
    Markdown table, caption plus HTML table, then any remaining line that
    contains the keyword. Other sections get the content back as a string,
    untouched.
    """
    cleaned = str(content or "")
    banned_by_section = {
        "4.3.2": "装置运行分析",
        "4.3.3": "投产以来运行周期统计表",
    }
    banned = banned_by_section.get(_extract_section_number(section_title))
    if banned is None:
        return cleaned

    removal_patterns = (
        # Caption line + Markdown table (blank/comment lines may sit between).
        rf"(?:^|\n)[^\n]*{re.escape(banned)}[^\n]*\n"
        rf"(?:\s*\n|<!--[\s\S]*?-->\s*\n)*"
        rf"(?:\s*\|[^\n]+\|\s*\n)+",
        # Caption line + HTML table (blank/comment lines may sit between).
        rf"(?:^|\n)[^\n]*{re.escape(banned)}[^\n]*\n"
        rf"(?:\s*\n|<!--[\s\S]*?-->\s*\n)*"
        rf"\s*<table>[\s\S]*?</table>",
        # Leftover lines containing the keyword without a table body.
        rf"(?:^|\n)\s*[#>*\-\d\.\)（）\s]*[^\n]*{re.escape(banned)}[^\n]*(?=\n|$)",
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "\n", cleaned, flags=re.IGNORECASE)
    # Fold the blank lines left behind.
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()


def _title_compare_norm(s: str) -> str:
    """标题宽松比较：折叠空白，并去掉中英括号两侧多余空格。"""
    t = re.sub(r"\s+", " ", str(s or "")).strip()
    t = re.sub(r"\s*([（(])\s*", r"\1", t)
    t = re.sub(r"\s*([）)])\s*", r"\1", t)
    return t


def _heading_line_section_number(line: str) -> str:
    normalized = str(line or "").strip().lstrip("#").strip()
    m = re.match(r"^(\d+(?:\.\d+)*)", normalized)
    return m.group(1) if m else ""


def _is_heading_line_for_section(line: str, section_no: str) -> bool:
    if not section_no:
        return False
    return _heading_line_section_number(line) == section_no


def _strip_leading_section_heading_lines(lines: list[str], section_no: str) -> list[str]:
    """去掉正文开头连续的、与 section_no 同编号的标题行（避免 prepend 后重复）。"""
    trimmed = list(lines)
    while trimmed:
        first = trimmed[0]
        if not str(first).strip():
            trimmed.pop(0)
            continue
        if _is_heading_line_for_section(first, section_no):
            trimmed.pop(0)
            while trimmed and not str(trimmed[0]).strip():
                trimmed.pop(0)
            continue
        break
    return trimmed


def _replace_first_section_heading_line(content: str, section_no: str, canonical_title: str) -> str:
    lines_list = content.splitlines()
    for idx_l, ln in enumerate(lines_list):
        if not ln.strip():
            continue
        if not _is_heading_line_for_section(ln, section_no):
            break
        stripped = ln.strip()
        section_no_heading = section_no
        plain_numbered_24x = section_no_heading in {
            "2.4.1",
            "2.4.2",
            "2.4.3",
            "2.4.4",
        }
        if stripped.startswith("#") and not plain_numbered_24x:
            hm = re.match(r"^(#+\s*)", stripped)
            prefix = hm.group(1) if hm else ""
            lines_list[idx_l] = (prefix + canonical_title).rstrip()
        else:
            lines_list[idx_l] = canonical_title
        break
    return "\n".join(lines_list)


def _enforce_template_format_contract(
    section: ReportTemplateSection,
    content: str,
    evidence: dict,
    *,
    chapter_title_map: Optional[dict[str, str]] = None,
) -> tuple[str, list[str]]:
    """Bring generated section content in line with the template's format rules.

    Steps, in order: strip inline evidence labels and normalize heading
    Markdown; split headings glued to body text; make sure the section title
    (and, for the first sub-section of a level, its ancestor titles) opens the
    content; separate heading lines with blank lines; append missing child
    headings; and, only when the section contract requires tables, validate
    the tables against the specs parsed from ``section.examples`` and run one
    repair attempt.

    Args:
        section: Template section being generated.
        content: Generated text for the section.
        evidence: Evidence payload forwarded to the table repair helper.
        chapter_title_map: Section number -> template title; ``None`` is
            treated as an empty map.

    Returns:
        ``(repaired_content, issues)`` where ``issues`` holds human-readable
        notes about each correction or remaining problem.
    """
    issues: list[str] = []
    # Numbering style is deliberately not normalized, so hierarchical numbers
    # (e.g. 1.2.1 / 2.1.3 / 3.4.2) stay intact.
    repaired = _strip_inline_evidence_labels(str(content or "").strip())
    repaired = _normalize_section_heading_markdown(repaired)
    if _extract_section_number(str(section.section_title or "")) == "1.2":
        repaired = _normalize_section_12_content(repaired)

    # 0) Split lines before the title check; otherwise step 1 would insert a
    #    duplicate title because the first line differs from the standard title.
    # 0.1) The end of the previous paragraph and the next sub-section number
    #      share one line (e.g. "……证明了2.4.4 初步……").
    repaired = _split_inline_template_headings(repaired, chapter_title_map or {})
    # 0.2) A sub-section title is followed by body text on the same line
    #      (e.g. "2.4.4 初步设计审查工作评价2017年……").
    repaired = _split_glued_template_heading_body(repaired, chapter_title_map or {})

    # 1) Title check:
    # - regular section: the first line is this section's title (a leading # prefix is allowed)
    # - first section of a chapter (x.1): the first line is the chapter title,
    #   and this section's title must be present as well
    title = str(section.section_title or "").strip()
    section_no_heading = _extract_section_number(title)
    title_norm = _title_compare_norm(title)
    non_empty_lines: list[str] = []
    for line in repaired.splitlines():
        if line.strip():
            non_empty_lines.append(line.strip().lstrip("#").strip())
    first_non_empty = non_empty_lines[0] if non_empty_lines else ""
    first_matches_title = bool(
        title and first_non_empty and _title_compare_norm(first_non_empty) == title_norm
    )
    first_is_section_heading = bool(
        title
        and section_no_heading
        and first_non_empty
        and _is_heading_line_for_section(first_non_empty, section_no_heading)
    )

    ancestor_titles = _resolve_ancestor_titles_for_section(
        section,
        chapter_title_map or {},
    )
    if ancestor_titles:
        required_titles = ancestor_titles + ([title] if title else [])
        required_norms = [_title_compare_norm(t) for t in required_titles]
        existing_lines = repaired.splitlines()
        trimmed_lines = list(existing_lines)
        existing_title_chain: list[str] = []
        # Consume the leading lines that already form (part of) the required
        # title chain, remembering which titles were found and in what order.
        while trimmed_lines:
            first_line = trimmed_lines[0]
            normalized = _title_compare_norm(first_line.strip().lstrip("#").strip())
            if not normalized:
                trimmed_lines.pop(0)
                continue
            if normalized in required_norms:
                existing_title_chain.append(normalized)
                trimmed_lines.pop(0)
                while trimmed_lines and not trimmed_lines[0].strip():
                    trimmed_lines.pop(0)
                continue
            # Every ancestor was found and this line carries the section's own
            # number with different wording: count it as the section title.
            if (
                title
                and section_no_heading
                and len(existing_title_chain) == len(required_norms) - 1
                and _is_heading_line_for_section(first_line, section_no_heading)
            ):
                existing_title_chain.append(required_norms[-1])
                trimmed_lines.pop(0)
                while trimmed_lines and not trimmed_lines[0].strip():
                    trimmed_lines.pop(0)
                continue
            break
        if existing_title_chain != required_norms:
            # Chain missing or out of order: rebuild it in front of the body.
            body_lines = _strip_leading_section_heading_lines(trimmed_lines, section_no_heading)
            body = "\n".join(body_lines).strip()
            repaired = "\n\n".join(required_titles + ([body] if body else [])).strip()
            issues.append("章节缺少父级标题链，已自动补齐")
        elif title and first_is_section_heading and first_non_empty != title:
            repaired = _replace_first_section_heading_line(repaired, section_no_heading, title)
    elif title and not first_matches_title and first_is_section_heading:
        repaired = _replace_first_section_heading_line(repaired, section_no_heading, title)
        if first_non_empty != title:
            issues.append("章节标题与模板不一致，已规范为标准标题行")
    elif title and not first_matches_title:
        repaired = f"{title}\n\n{repaired}".strip()
        issues.append("章节标题与模板不一致，已自动补齐标准标题行")
    elif title and first_matches_title and first_non_empty != title:
        repaired = _replace_first_section_heading_line(repaired, section_no_heading, title)

    # 1.42) 2.4.1-2.4.4: the first line may be "### 2.4.x …" and equal the
    #       template title once the # marks are removed; no branch above handles
    #       that case, so the Markdown prefix is removed here.
    if (
        _extract_section_number(title) in {"2.4.1", "2.4.2", "2.4.3", "2.4.4"}
        and title
    ):
        lns_strip = repaired.splitlines()
        for _is, ln_s in enumerate(lns_strip):
            if not ln_s.strip():
                continue
            sh = ln_s.strip()
            if sh.startswith("#") and _title_compare_norm(sh.lstrip("#").strip()) == title_norm:
                lns_strip[_is] = title
            # Only the first non-blank line is inspected.
            break
        repaired = "\n".join(lns_strip)

    # 1.5) Make sure a heading line (e.g. "2.4.4  初步设计审查工作评价") is followed
    #       by a blank line; otherwise front-end Markdown rendering or DOCX export
    #       may not recognize it as a heading.
    repaired = _ensure_heading_lines_separated(repaired)

    # 2) Heading-tree check: when the section defines a fixed child order,
    #    missing children are appended as placeholder sub-sections.
    section_no = _extract_section_number(title)
    repaired, missing_children = _auto_append_missing_child_headings(repaired, section_no)
    if missing_children:
        issues.append("缺失下级小节已自动补齐：" + "、".join(missing_children[:10]))

    # 3) Table spec validation and repair only run when the section contract
    #    explicitly requires tables. Otherwise text-only sections such as
    #    5.2.2/5.2.3/5.2.4 would get tables added because of example tables.
    if not _section_requires_tables(
        title, contract_text=_effective_section_output_contract(section)
    ):
        return _strip_inline_evidence_labels(repaired), issues

    # 4) Parse table specs (caption + header keywords) from the template examples.
    table_specs = _extract_template_table_specs(section.examples)
    if not table_specs:
        # NOTE(review): unlike the other return paths, this one does not pass
        # the text through _strip_inline_evidence_labels again - confirm
        # whether that is intended.
        return repaired, issues

    # 4.1) The shared chapter-5 examples contain sample tables 表5-1/表5-2 next to
    #      each sub-section's real contract (e.g. 5.3.1 only has 表5-4). Without
    #      filtering by contract, _find_table_format_issues would report 表5-1 as
    #      missing and _repair_table_format_by_template would rewrite 表5-4 with
    #      the sample header.
    contract_required = _extract_required_table_tokens(
        section.section_prompt or "",
        section_no,
        contract_text=_effective_section_output_contract(section),
    )
    if contract_required:
        allow = {_norm_table_token(t) for t in contract_required if _norm_table_token(t)}
        narrowed = [
            s
            for s in table_specs
            if _norm_table_token(str(s.get("token") or "")) in allow
        ]
        # Narrow only when at least one spec survives the filter.
        if narrowed:
            table_specs = narrowed

    table_issues = _find_table_format_issues(repaired, table_specs)
    if table_issues:
        issues.extend(table_issues)
        repaired = _repair_table_format_by_template(section, repaired, table_specs, evidence)
        # Second check: remaining problems are reported, without another retry.
        still = _find_table_format_issues(repaired, table_specs)
        if still:
            issues.extend([f"二次修正后仍存在：{x}" for x in still[:4]])
    return _strip_inline_evidence_labels(repaired), issues


def _extract_template_table_specs(raw_examples: Optional[str]) -> list[dict]:
    text = str(raw_examples or "").strip()
    if not text:
        return []
    lines = [ln.rstrip() for ln in text.splitlines()]
    specs: list[dict] = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        m = re.match(r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*(.*)$", line)
        if not m:
            i += 1
            continue
        token = re.sub(r"\s+", "", m.group(1))
        title_tail = str(m.group(2) or "").strip()
        title = f"{m.group(1)} {title_tail}".strip()

        j = i + 1
        header_keywords: list[str] = []
        # 采集该表后面的头部字段线索
        while j < len(lines):
            cur = lines[j].strip()
            if not cur:
                j += 1
                if header_keywords:
                    break
                continue
            if re.match(r"^(附表\s*\d+|表\s*\d+(?:\s*-\s*\d+)?)\s*", cur):
                break
            if re.match(r"^\d+(?:\.\d+)*\s+", cur):  # 下一个章节
                break
            if cur.startswith("注"):
                break
            if "|" in cur:
                # Markdown 表头
                cells = [c.strip() for c in cur.split("|") if c.strip()]
                for c in cells[:8]:
                    if c and c not in ("---", "—"):
                        header_keywords.append(c)
                break
            # 普通文本表头行
            if len(cur) <= 24 and not re.fullmatch(r"[0-9.%（）()\-～~:/\s]+", cur):
                header_keywords.append(cur)
            if len(header_keywords) >= 8:
                break
            j += 1

        specs.append(
            {
                "token": token,
                "title": title,
                "headerKeywords": list(dict.fromkeys(header_keywords))[:8],
            }
        )
        i = j
    # 去重同 token
    dedup: dict[str, dict] = {}
    for s in specs:
        tk = str(s.get("token") or "")
        if not tk or tk in dedup:
            continue
        dedup[tk] = s
    return list(dedup.values())[:12]


def _find_table_format_issues(content: str, table_specs: list[dict]) -> list[str]:
    """List the template tables that are missing or whose headers do not match.

    A spec is reported as missing when its normalized token is not part of the
    normalized content. Otherwise, if the spec has header keywords, at least
    two of them (or the only one, if there is just one) must occur in the
    content verbatim.
    """
    body = str(content or "")
    body_norm = _norm_table_token(body)
    problems: list[str] = []
    for spec in table_specs:
        token = str(spec.get("token") or "")
        caption = str(spec.get("title") or token)
        token_norm = _norm_table_token(token)
        if token_norm and token_norm not in body_norm:
            problems.append(f"缺少模板表名：{caption}")
            continue
        keywords = [
            keyword
            for keyword in (str(raw).strip() for raw in (spec.get("headerKeywords") or []))
            if keyword
        ]
        if not keywords:
            continue
        matched = len([keyword for keyword in keywords if keyword in body])
        if matched < min(2, len(keywords)):
            problems.append(f"表头与模板不一致：{caption}")
    return problems


def _repair_table_format_by_template(
    section: ReportTemplateSection,
    content: str,
    table_specs: list[dict],
    evidence: dict,
) -> str:
    """Request a rewrite of the content whose tables follow the template specs.

    The specs and the evidence are JSON-serialized into a repair prompt that is
    sent through ``chat_completions_json``.

    Returns:
        The stripped ``content`` field of the response, or the input content
        when that field is empty.
    """
    serialized_specs = json.dumps(table_specs, ensure_ascii=False)
    repair_request = build_table_format_repair_prompt(
        section_title=section.section_title,
        table_specs_json=serialized_specs,
        content=content,
        evidence_json=json.dumps(evidence, ensure_ascii=False),
    )
    call_options = {"temperature": 0.1, "max_tokens": 2600, "timeout_sec": 120}
    response = chat_completions_json(
        system_prompt=table_format_repair_system_prompt(),
        user_prompt=repair_request,
        log_context=f"表格格式修复 section_key={section.section_key} | {section.section_title}",
        **call_options,
    )
    candidate = str(response.get("content") or "").strip()
    if candidate:
        return candidate
    return content


def _render_structured_tables_for_prompt(evidence: dict) -> str:
    rows = evidence.get("structuredTables") if isinstance(evidence, dict) else []
    if not isinstance(rows, list) or not rows:
        return "无结构化表格证据"
    blocks: list[str] = []
    for row in rows[:8]:
        if not isinstance(row, dict):
            continue
        name = str(row.get("tableName") or "").strip()
        md = str(row.get("markdown") or "").strip()
        if not name or not md:
            continue
        blocks.append(f"### {_normalize_table_caption_number_name_gap(name)}\n\n{md}")
    return "\n\n".join(blocks) if blocks else "无结构化表格证据"


def _strip_bracketed_three_part_labels(content: str) -> str:
    text = str(content or "")
    if not text:
        return text
    # 全章统一移除方括号三段式标题，保留其后正文内容。
    patterns = [
        r"^\s{0,3}#{0,6}\s*【事实依据】\s*$",
        r"^\s{0,3}#{0,6}\s*【评价判断】\s*$",
        r"^\s{0,3}#{0,6}\s*【问题与建议】\s*$",
        r"^\s{0,3}#{0,6}\s*事实依据\s*[：:、]?\s*$",
        r"^\s{0,3}#{0,6}\s*评价判断\s*[：:、]?\s*$",
        r"^\s{0,3}#{0,6}\s*问题与建议\s*[：:、]?\s*$",
        r"【事实依据】",
        r"【评价判断】",
        r"【问题与建议】",
    ]
    for p in patterns:
        text = re.sub(p, "", text, flags=re.MULTILINE)
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text


def _strip_placeholder_table_notes(content: str) -> str:
    text = str(content or "")
    if not text:
        return text
    placeholder_note_pattern = re.compile(
        r"^\s{0,3}(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*\d+\s*[\.。:：、]?\s*待补充\s*(?:\*\*|__)?\s*$",
        flags=re.IGNORECASE,
    )
    boilerplate_note_line_pattern = re.compile(
        r"可酌情增减指标|可酌情增减|根据项目的情况|根据项目实际需要进行增减|根据项目不同进行增减|根据项目具体情况增减|表中内容可根据",
        flags=re.IGNORECASE,
    )
    boilerplate_full_line_pattern = re.compile(
        r"^\s{0,3}(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*[\.。:：]?\s*(?:\d+\s*[\.。、:：]?\s*)?(?:表中内容)?可根据项目.{0,20}(?:增减|调整)",
        flags=re.IGNORECASE,
    )
    note_header_pattern = re.compile(
        r"^\s*(?:[-*+]\s*)?(?:>\s*)?(?:\*\*|__)?\s*注\s*[：:]\s*(?:\*\*|__)?\s*$",
        flags=re.IGNORECASE,
    )

    src_lines = text.splitlines()
    lines: list[str] = []
    i = 0
    while i < len(src_lines):
        line = src_lines[i]
        if placeholder_note_pattern.match(line):
            i += 1
            continue
        if boilerplate_full_line_pattern.match(line):
            i += 1
            continue
        if boilerplate_note_line_pattern.search(line):
            i += 1
            continue
        if note_header_pattern.match(line):
            j = i + 1
            skipped_boilerplate = False
            while j < len(src_lines):
                nxt = src_lines[j]
                if not str(nxt).strip(" \t\u3000"):
                    j += 1
                    continue
                if boilerplate_note_line_pattern.search(nxt):
                    skipped_boilerplate = True
                    j += 1
                break
            if skipped_boilerplate:
                i = j
                continue
        lines.append(line)
        i += 1
    return re.sub(r"\n{3,}", "\n\n", "\n".join(lines)).strip()


def _strip_trailing_partial_missing_markers(content: str) -> str:
    text = str(content or "")
    if not text:
        return text

    cleaned_lines: list[str] = []
    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        compact = re.sub(r"\s+", "", line)
        if compact in {"待补充", "-待补充", "*待补充"}:
            cleaned_lines.append(line)
            continue

        updated = re.sub(r"\s*待补充\s*(?:\[\s*\d{1,3}\s*\])?\s*$", "", line)
        updated = re.sub(r"\s{2,}", " ", updated).rstrip()
        stripped = updated.strip()
        core_len = len(re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", stripped))

        if stripped and stripped != line.strip() and core_len >= 12:
            cleaned_lines.append(updated)
        else:
            cleaned_lines.append(line)

    return re.sub(r"\n{3,}", "\n\n", "\n".join(cleaned_lines)).strip()


def _is_pipe_markdown_table_row_line(line: str) -> bool:
    s = line.strip()
    return len(s) >= 2 and s.startswith("|") and s.endswith("|")


def _is_pipe_markdown_table_separator_line(line: str) -> bool:
    s = line.strip()
    return bool(re.match(r"^\|[\s\-:|]+\|$", s)) if s else False


def _markdown_table_body_fingerprint(md: str) -> str:
    """用于判断两张 Markdown 表是否实质相同（忽略行间空行与首尾空白）。"""
    lines = [
        re.sub(r"\s+", "", ln.strip())
        for ln in str(md or "").splitlines()
        if ln.strip().startswith("|") or ln.strip().startswith("｜")
    ]
    return "\n".join(lines)


_INTRA_REPEAT_MIN_FRAGMENT_LEN = 18
_INTRA_REPEAT_MIN_CONSECUTIVE_COUNT = 3


def _collapse_consecutive_repetitions_in_string(text: str) -> tuple[str, int]:
    """
    折叠同一行/字符串内连续重复片段（如模型将同一句财务描述拼接数十次）。
    仅处理紧邻重复，避免误伤正常文中偶然出现的相同短语。
    """
    s = str(text or "")
    min_len = _INTRA_REPEAT_MIN_FRAGMENT_LEN
    min_count = _INTRA_REPEAT_MIN_CONSECUTIVE_COUNT
    if len(s) < min_len * min_count:
        return s, 0

    removed = 0
    out: list[str] = []
    i = 0
    n = len(s)
    while i < n:
        best_plen = 0
        best_count = 0
        max_plen = (n - i) // min_count
        for plen in range(min_len, max_plen + 1):
            pat = s[i : i + plen]
            if not pat.strip():
                continue
            count = 1
            j = i + plen
            while j + plen <= n and s[j : j + plen] == pat:
                count += 1
                j += plen
            if count >= min_count:
                span = plen * count
                if span > best_plen * best_count:
                    best_plen = plen
                    best_count = count
        if best_plen:
            out.append(s[i : i + best_plen])
            removed += best_count - 1
            i += best_plen * best_count
        else:
            out.append(s[i])
            i += 1
    return "".join(out), removed


def _collapse_consecutive_text_repetitions(content: str) -> tuple[str, int]:
    """按行折叠段内连续重复；返回 (正文, 移除的重复次数)。"""
    lines = str(content or "").splitlines()
    if not lines:
        return str(content or ""), 0
    total_removed = 0
    collapsed_lines: list[str] = []
    for line in lines:
        collapsed, removed = _collapse_consecutive_repetitions_in_string(line)
        total_removed += removed
        collapsed_lines.append(collapsed)
    return "\n".join(collapsed_lines), total_removed


# Chapters whose text length is at most this many characters are returned
# unchanged by _dedupe_long_chapter_repetition.
_CHAPTER_CONTENT_DEDUPE_MIN_CHARS = 2000
# Minimum size a text block must reach before near-duplicate matching is tried
# (measured on its CJK/alphanumeric core or on its fingerprint, depending on the caller).
_CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN = 48
# SequenceMatcher ratio at or above which two block fingerprints count as near duplicates.
_CHAPTER_DEDUPE_NEAR_MATCH_RATIO = 0.90


def _chapter_block_core_len(text: str) -> int:
    """Count the CJK ideographs and ASCII letters/digits left after inline evidence labels are stripped."""
    without_labels = _strip_inline_evidence_labels(str(text or ""))
    core_only = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9]", "", without_labels)
    return len(core_only)


def _chapter_text_block_fingerprint(text: str) -> str:
    """Return a comparison key for a text block: evidence labels stripped, all whitespace removed, case-folded."""
    without_labels = _strip_inline_evidence_labels(str(text or ""))
    return re.sub(r"\s+", "", without_labels).casefold()


def _is_likely_table_caption_line(line: str) -> bool:
    prev = str(line or "").strip()
    if not prev or len(prev) > 120:
        return False
    if prev.startswith("#"):
        return True
    if re.search(r"表\s*[\d一二三四五六七八九十\-－—–]+", prev):
        return True
    return "表" in prev and len(prev) <= 80


def _chapter_block_is_table(block: str) -> bool:
    pipe_rows = [
        ln for ln in str(block or "").splitlines() if ln.strip() and _is_pipe_markdown_table_row_line(ln)
    ]
    return len(pipe_rows) >= 2


def _split_chapter_blocks_for_dedupe(content: str) -> list[str]:
    """将章节正文拆成段落块与 Markdown 表块，便于做重复检测。"""
    lines = str(content or "").splitlines()
    blocks: list[str] = []
    i = 0
    n = len(lines)
    pending_caption: list[str] = []

    def _flush_pending_caption() -> None:
        nonlocal pending_caption
        if pending_caption:
            blocks.append("\n".join(pending_caption))
            pending_caption = []

    while i < n:
        if not lines[i].strip():
            i += 1
            continue
        if _is_pipe_markdown_table_row_line(lines[i]):
            table_lines: list[str] = []
            while i < n and lines[i].strip() and _is_pipe_markdown_table_row_line(lines[i]):
                table_lines.append(lines[i])
                i += 1
            if table_lines:
                block_lines = list(pending_caption) + table_lines
                pending_caption = []
                start = i - len(table_lines)
                scan = start - 1
                while scan >= 0 and lines[scan].strip():
                    if _is_pipe_markdown_table_row_line(lines[scan]):
                        break
                    if _is_likely_table_caption_line(lines[scan]):
                        block_lines.insert(0, lines[scan])
                        scan -= 1
                        continue
                    break
                blocks.append("\n".join(block_lines))
            continue
        para_lines: list[str] = []
        while i < n:
            if not lines[i].strip():
                i += 1
                break
            if _is_pipe_markdown_table_row_line(lines[i]):
                break
            para_lines.append(lines[i])
            i += 1
        if not para_lines:
            continue
        if len(para_lines) == 1 and _is_likely_table_caption_line(para_lines[0]):
            _flush_pending_caption()
            pending_caption = para_lines
            continue
        _flush_pending_caption()
        blocks.append("\n".join(para_lines))
    _flush_pending_caption()
    return blocks


def _chapter_blocks_near_duplicate(a: str, b: str) -> bool:
    """Decide whether two text blocks are duplicates or near duplicates.

    Works on the blocks' fingerprints. Blocks match when the fingerprints are
    equal; when the shorter one is long enough, is contained in the longer one
    and covers at least 82% of it; or when both are at least 80 characters and
    their SequenceMatcher ratio reaches the configured threshold. An empty
    fingerprint never matches.
    """
    left = _chapter_text_block_fingerprint(a)
    right = _chapter_text_block_fingerprint(b)
    if not (left and right):
        return False
    if left == right:
        return True

    # Stable sort keeps (left, right) order on equal lengths.
    shorter, longer = sorted((left, right), key=len)
    contained = len(shorter) >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN and shorter in longer
    if contained and len(shorter) / max(len(longer), 1) >= 0.82:
        return True

    # Fuzzy matching on short fingerprints is too noisy to trust.
    if len(shorter) < 80:
        return False
    similarity = SequenceMatcher(None, left, right).ratio()
    return similarity >= _CHAPTER_DEDUPE_NEAR_MATCH_RATIO


def _chapter_block_duplicate_key(block: str) -> tuple[str, str]:
    """Classify a block and return ``(kind, key)`` for exact-duplicate lookup.

    Table blocks yield ``("table", header_key or body_fingerprint)``; every
    other block yields ``("text", text_fingerprint)``.
    """
    body = str(block or "")
    if not _chapter_block_is_table(body):
        return ("text", _chapter_text_block_fingerprint(body))
    header_key = _extract_table_header_key(body)
    body_fingerprint = _markdown_table_body_fingerprint(body)
    return ("table", header_key or body_fingerprint)


def _dedupe_long_chapter_repetition(content: str) -> tuple[str, int]:
    """Drop repeated paragraph/table blocks from an over-long chapter.

    Only chapters longer than the configured character threshold are
    processed. The text is split into blocks; the first occurrence of each
    block is kept and later ones are removed when they are duplicates:

      * tables: same header key or same body fingerprint as a kept table;
      * text: same fingerprint as a kept block, or - for blocks with enough
        core characters - a near duplicate of a kept long block.

    Returns ``(text, removed)``. When nothing is removed the original text is
    returned untouched; otherwise the kept blocks are re-joined with blank
    lines.
    """
    text = str(content or "")
    if len(text) <= _CHAPTER_CONTENT_DEDUPE_MIN_CHARS:
        return text, 0

    blocks = _split_chapter_blocks_for_dedupe(text)
    if len(blocks) < 2:
        return text, 0

    survivors: list[str] = []
    table_headers_seen: set[str] = set()
    table_bodies_seen: set[str] = set()
    text_keys_seen: set[str] = set()
    long_text_survivors: list[str] = []
    removed = 0

    for block in blocks:
        core_len = _chapter_block_core_len(block)
        kind, key = _chapter_block_duplicate_key(block)

        if kind == "table":
            header_key = _extract_table_header_key(block)
            body_key = _markdown_table_body_fingerprint(block)
            header_repeat = bool(header_key) and header_key in table_headers_seen
            body_repeat = bool(body_key) and body_key in table_bodies_seen
            if header_repeat or body_repeat:
                removed += 1
                continue
            if header_key:
                table_headers_seen.add(header_key)
            if body_key:
                table_bodies_seen.add(body_key)
            survivors.append(block)
            continue

        is_long = core_len >= _CHAPTER_DEDUPE_MIN_BLOCK_CORE_LEN
        if key and key in text_keys_seen:
            duplicate = True
        elif is_long:
            # Fuzzy comparison only against previously kept long blocks.
            duplicate = any(
                _chapter_blocks_near_duplicate(block, earlier) for earlier in long_text_survivors
            )
        else:
            duplicate = False

        if duplicate:
            removed += 1
            continue

        if key:
            text_keys_seen.add(key)
            if is_long:
                long_text_survivors.append(block)
        survivors.append(block)

    if removed <= 0:
        return text, 0

    merged = "\n\n".join(piece.strip() for piece in survivors if piece.strip())
    return re.sub(r"\n{3,}", "\n\n", merged).strip(), removed


def _dedupe_structured_table_hits(hits: list[dict]) -> list[dict]:
    """
    同一必需表 token 可能命中多张历史表或重复 markdown；合并输出会导致章节内连续两张相同表。
    按 tableId 与表体指纹去重，保留表体最完整的一条（表5-4 优先可研/实际/增减结构）。
    """
    ranked: list[tuple[int, dict]] = []
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        md = str(hit.get("markdown") or "").strip()
        if not md:
            continue
        fp = _markdown_table_body_fingerprint(md)
        if not fp:
            continue
        ranked.append((_score_structured_table_hit_dict(hit), hit))
    ranked.sort(key=lambda x: (-x[0], str(x[1].get("tableId") or "")))

    out: list[dict] = []
    seen_ids: set[str] = set()
    seen_fp: set[str] = set()
    for _score, hit in ranked:
        tid = str(hit.get("tableId") or "").strip()
        md = str(hit.get("markdown") or "").strip()
        fp = _markdown_table_body_fingerprint(md)
        if tid and tid in seen_ids:
            continue
        if fp in seen_fp:
            continue
        if tid:
            seen_ids.add(tid)
        seen_fp.add(fp)
        out.append(hit)
    return out


def _is_likely_table1_raw_material_caption(line: str) -> bool:
    s = re.sub(r"\s+", "", str(line or ""))
    if not s:
        return False
    if "原料数量及组成对比" in s:
        return True
    if re.search(r"表\s*1", s) and "原料" in s:
        return True
    return False


def _extract_table_header_key(table_block: str) -> str:
    """提取表格的表头行（第一条 pipe 行），归一化后作为同表判断依据。"""
    for ln in str(table_block or "").splitlines():
        s = ln.strip()
        if s.startswith("|") and s.endswith("|") and not re.match(r"^\|[\s\-:|]+\|$", s):
            return re.sub(r"\s+", "", s)
    return ""


def _is_433_operation_analysis_table_header(header_norm: str) -> bool:
    """4.3.3 表4-2「烷基化装置运行分析」常见 Markdown 表头（全列或仅实际值列）。"""
    h = str(header_norm or "")
    if "序号" not in h or "项目" not in h:
        return False
    if "实际值" in h:
        return True
    if "设计值" in h and "标定值" in h:
        return True
    return False


def _433_op_analysis_table_has_canonical_caption(text: str, table_block_start: int) -> bool:
    """表前若干行内是否出现合同规定的表4-2 烷基化装置运行分析表题（用于保留规范副本、去掉无表题重复表）。"""
    before = str(text or "")[: int(table_block_start)].rstrip()
    lines = before.split("\n")
    tail = "\n".join(lines[-18:])
    if "烷基化装置运行分析" not in tail:
        return False
    n = re.sub(r"\s+", "", tail)
    return bool(re.search(r"表4[-－—–]2", n))


def _dedupe_433_alkylation_operation_analysis_markdown_tables(content: str) -> str:
    """
    4.3.3 常见故障：模型在「2) 主要装置达标评价」下先输出无表题的同结构表，
    又在「3) 全厂达标评价」下重复输出带「表4-2 …烷基化装置运行分析…」表题的同一表。
    对表头/表体指纹相同的重复表：优先保留表前带规范表4-2 表题的一张；否则保留文档中第一张。
    """
    text = str(content or "")
    if not text.strip():
        return text

    pat = re.compile(r"(?m)(?:^\s*\|.+\|\s*\n){3,}")
    matches = list(pat.finditer(text))
    if len(matches) < 2:
        return text

    items: list[dict] = []
    for m in matches:
        block = m.group(0)
        hdr = _extract_table_header_key(block)
        if not _is_433_operation_analysis_table_header(hdr):
            continue
        fp = _markdown_table_body_fingerprint(block)
        items.append(
            {
                "m": m,
                "hdr": hdr,
                "fp": fp,
                "cap": _433_op_analysis_table_has_canonical_caption(text, m.start()),
            }
        )

    n = len(items)
    if n < 2:
        return text

    parent = list(range(n))

    def find(x: int) -> int:
        if parent[x] != x:
            parent[x] = find(parent[x])
        return parent[x]

    def union(x: int, y: int) -> None:
        rx, ry = find(x), find(y)
        if rx != ry:
            parent[ry] = rx

    for i in range(n):
        for j in range(i + 1, n):
            a, b = items[i], items[j]
            same_hdr = bool(a["hdr"] and a["hdr"] == b["hdr"])
            same_fp = bool(a["fp"] and a["fp"] == b["fp"])
            if same_hdr or same_fp:
                union(i, j)

    clusters: dict[int, list[int]] = {}
    for i in range(n):
        r = find(i)
        clusters.setdefault(r, []).append(i)

    remove_spans: list[tuple[int, int]] = []
    for _root, idxs in clusters.items():
        if len(idxs) < 2:
            continue
        idxs_sorted = sorted(idxs, key=lambda ii: items[ii]["m"].start())
        caps = [ii for ii in idxs_sorted if items[ii]["cap"]]
        keep_idx = caps[0] if caps else idxs_sorted[0]
        for ii in idxs_sorted:
            if ii == keep_idx:
                continue
            m = items[ii]["m"]
            start = m.start()
            prefix = text[:start].rstrip("\n")
            last_nl = prefix.rfind("\n")
            title_line = prefix[last_nl + 1 :] if last_nl >= 0 else prefix
            tl = title_line.strip()
            if "烷基化装置运行分析" in tl and re.search(
                r"表4[-－—–]2", re.sub(r"\s+", "", tl)
            ):
                start = last_nl + 1 if last_nl >= 0 else 0
            before = text[:start]
            if before.rstrip().endswith("-->"):
                comment_start = before.rstrip().rfind("<!--")
                if comment_start >= 0:
                    nl_before_comment = before.rfind("\n", 0, comment_start)
                    start = nl_before_comment + 1 if nl_before_comment >= 0 else comment_start
            remove_spans.append((start, m.end()))

    if not remove_spans:
        return text

    remove_spans.sort(key=lambda x: x[0], reverse=True)
    for a, b in remove_spans:
        text = text[:a] + text[b:]
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _nearest_table_caption_token_before(text_before: str) -> str:
    """表块前若干行内最近的「表 x-x」表号（归一化，如 表3-2）。"""
    lines = [ln for ln in str(text_before or "").splitlines() if ln.strip()]
    for ln in reversed(lines[-14:]):
        if "|" in ln or ln.strip().startswith("<!--"):
            continue
        m = re.search(r"表\s*\d+\s*[-－．]\s*\d+", ln, flags=re.IGNORECASE)
        if m:
            return re.sub(r"\s+", "", m.group(0))
    return ""


def _table_block_has_element_source_comment(text_before: str) -> bool:
    return "表格来源：要素管理" in str(text_before or "")[-800:]


def _score_table_block_for_keep(text_before: str, block: str) -> int:
    """Score a table block; among duplicates the highest-scoring copy is kept.

    +100 when the element-management source marker precedes the block, +20
    when the block passes the effective-table check, plus one point per line
    starting with ``|`` (capped at 30).
    """
    points = 100 if _table_block_has_element_source_comment(text_before) else 0
    if _is_effective_markdown_table_block(block):
        points += 20
    pipe_line_count = sum(1 for ln in block.splitlines() if ln.strip().startswith("|"))
    return points + min(pipe_line_count, 30)


def _span_for_duplicate_table_removal(text: str, match: re.Match[str]) -> tuple[int, int]:
    """重复表删除范围：含紧贴表题行与要素直出注释行。"""
    start = match.start()
    prefix = text[:start]
    stripped_prefix = prefix.rstrip("\n")
    last_nl = stripped_prefix.rfind("\n")
    title_line = stripped_prefix[last_nl + 1 :] if last_nl >= 0 else stripped_prefix
    if re.search(r"表\s*\d+\s*[-－．]\s*\d+", title_line, flags=re.IGNORECASE):
        start = last_nl + 1 if last_nl >= 0 else 0
    before = text[:start]
    if before.rstrip().endswith("-->"):
        comment_start = before.rstrip().rfind("<!--")
        if comment_start >= 0:
            nl_before = before.rfind("\n", 0, comment_start)
            start = nl_before + 1 if nl_before >= 0 else comment_start
    end = match.end()
    while end < len(text) and text[end] == "\n":
        end += 1
    return start, end


# 至少 3 行管道表；末行可无换行（否则要素直出表尾缺 \n 时无法参与去重）
_PIPE_MARKDOWN_TABLE_BLOCK_RE = re.compile(
    r"(?m)(?:(?:^\s*\|[^\n]+\|\s*)(?:\n|$)){3,}"
)


def _iter_pipe_markdown_table_blocks(text: str) -> list[re.Match[str]]:
    return list(_PIPE_MARKDOWN_TABLE_BLOCK_RE.finditer(str(text or "")))


def _table_token_in_recent_context(text_before: str, token: str, *, max_lines: int = 16) -> bool:
    """表块前若干行（或块内表题行）是否出现给定表号。"""
    if not token:
        return False
    lines = [ln for ln in str(text_before or "").splitlines() if ln.strip()]
    for ln in reversed(lines[-max_lines:]):
        if "|" in ln or ln.strip().startswith("<!--"):
            continue
        if _table_token_matches_name(token, ln):
            return True
    return False


def _dedupe_key_for_markdown_table_block(
    text: str, match: re.Match[str], *, block_index: int
) -> str:
    """Compute the grouping key used to detect copies of the same table.

    Preference order: the normalized table number from the nearest caption
    before the block; a table number found on a non-pipe line among the block's
    first four lines; ``hdr:<header key>``; ``fp:<body fingerprint>``; and as a
    last resort a key unique to this block (``__idx_<block_index>``).
    """
    caption_token = _nearest_table_caption_token_before(text[: match.start()])
    if caption_token:
        return _norm_table_token(caption_token)

    block = match.group(0)
    for candidate in block.splitlines()[:4]:
        if "|" in candidate:
            continue
        found = re.search(r"表\s*\d+(?:\s*[-－．]\s*\d+)*", candidate, flags=re.IGNORECASE)
        if found:
            return _norm_table_token(re.sub(r"\s+", "", found.group(0)))

    header_key = _extract_table_header_key(block)
    if header_key:
        return f"hdr:{header_key}"
    body_key = _markdown_table_body_fingerprint(block)
    return f"fp:{body_key}" if body_key else f"__idx_{block_index}"


def _apply_markdown_table_block_removals(text: str, remove_spans: list[tuple[int, int]]) -> str:
    if not remove_spans:
        return text
    remove_spans.sort(key=lambda x: x[0], reverse=True)
    for a, b in remove_spans:
        text = text[:a] + text[b:]
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _table_block_matches_required_token_spec(block: str, token: str) -> bool:
    """Decide, from column names alone, whether an uncaptioned table is the required table ``token``.

    Used to merge orphan model-generated tables that carry no caption. The
    table's header row is compared with the column names of the multi-column
    spec registered for the token's canonical table name: enough spec columns
    must occur in the header, the row-header column (if one is defined) must be
    present, and a few table-specific exclusions separate tables that share the
    same data columns.

    The ``<br>`` suffix is stripped per header cell. Stripping it across the
    whole row (``<br>.*``) would cut off every column after the first cell that
    contains ``<br>`` and make the column and exclusion checks below miss them.
    """
    canon = _canonical_global_table_name_for_token(token) or ""
    spec = _multi_column_global_spec_for_table(canon)
    if not spec:
        return False
    hdr = re.sub(r"\s+", "", _extract_table_header_key(block)).lower()
    # Drop "<br>..." only up to the end of its own cell, keeping later columns.
    hdr = re.sub(r"<br>[^|]*", "", hdr, flags=re.IGNORECASE)
    if not hdr:
        return False
    cols = [str(c).strip() for c in (spec[0] or []) if str(c).strip()]
    if not cols:
        return False
    hit = sum(
        1
        for col in cols
        if re.sub(r"\s+|<br>.*", "", col, flags=re.IGNORECASE).lower() in hdr
    )
    if hit < max(1, min(2, len(cols) // 2 + 1)):
        return False
    row_hdr = _row_header_name_for_table(canon)
    if row_hdr:
        row_norm = re.sub(r"\s+", "", row_hdr).lower()
        if row_norm not in hdr:
            return False
    # Tables 3-3 and 3-4 share their data columns; tell them apart by the
    # row-header column so that orphan tables are not merged into the wrong one.
    if _table_token_matches_name(token, "表3-3") and "专业" in hdr:
        return False
    if _table_token_matches_name(token, "表3-4") and (
        "单元名称" in hdr or ("项目" in hdr and "专业" not in hdr)
    ):
        return False
    if _table_token_matches_name(token, "表3-5") and not any(
        k in hdr for k in ("变更内容", "原因")
    ):
        return False
    # Tables 2-5 and 2-6 share their data columns; a plain "项目" test would
    # also hit "依托项目名称", so that column is checked explicitly.
    if _table_token_matches_name(token, "表2-5"):
        if "依托项目名称" in hdr or "依托项目" in hdr:
            return False
        if "项目名称" not in hdr and "项目" not in hdr:
            return False
    if _table_token_matches_name(token, "表2-6") and "依托项目名称" not in hdr:
        return False
    return True


def _markdown_table_block_belongs_to_required_token(
    before: str,
    block: str,
    token: str,
    all_tokens: list[str],
) -> bool:
    """Decide whether a pipe-table block belongs to the required table ``token``.

    When a captioned table number precedes the block, only that nearest caption
    counts - where several tables sit close together, a wider window would
    merge neighbours by mistake. Without a caption, the token may appear in the
    block's own first lines; failing that, the block's columns must match the
    token's spec and must not match the spec of any other required token.
    """
    caption_token = _nearest_table_caption_token_before(before)
    if caption_token:
        return _table_token_matches_name(token, caption_token)

    if _table_token_in_recent_context(block, token, max_lines=6):
        return True

    if not _table_block_matches_required_token_spec(block, token):
        return False
    # Ambiguous column layout: refuse if another required table fits as well.
    return not any(
        _table_block_matches_required_token_spec(block, other)
        for other in all_tokens
        if other != token
    )


def _dedupe_duplicate_tables_for_required_tokens(
    content: str, required_tables: list[str]
) -> str:
    """
    按合同必需表号强制去重：同节内同一「表 x-x」只保留一张（优先要素直出/较完整表体）。
    解决 LLM 表与要素直出表头不一致、或无前表题 orphan 表导致通用去重未合并的情况。
    """
    text = str(content or "")
    tokens = [
        str(t or "").strip()
        for t in (required_tables or [])
        if str(t or "").strip()
    ]
    if not text.strip() or not tokens:
        return text
    matches = _iter_pipe_markdown_table_blocks(text)
    if len(matches) < 2:
        return text

    remove_spans: list[tuple[int, int]] = []
    for token in tokens:
        hits: list[tuple[re.Match[str], int]] = []
        for m in matches:
            before = text[: m.start()]
            block = m.group(0)
            belongs = _markdown_table_block_belongs_to_required_token(
                before, block, token, tokens
            )
            if belongs:
                hits.append(
                    (m, _score_table_block_for_keep(before, block))
                )
        if len(hits) < 2:
            continue
        hits.sort(key=lambda x: (-x[1], x[0].start()))
        for dup, _score in hits[1:]:
            remove_spans.append(_span_for_duplicate_table_removal(text, dup))

    return _apply_markdown_table_block_removals(text, remove_spans)


def _finalize_section_table_dedupe(content: str, required_tables: list[str] | None) -> str:
    """Run the final table de-duplication for a section.

    First merges duplicates by table number / header, then - only when
    required table numbers are given - enforces a single copy per required
    table.
    """
    deduped = _dedupe_duplicate_captioned_markdown_tables(content)
    if not required_tables:
        return deduped
    return _dedupe_duplicate_tables_for_required_tokens(deduped, required_tables)


def _dedupe_duplicate_captioned_markdown_tables(content: str) -> str:
    """
    同节内重复输出「表题 + 表体」（如 3.3.2 表3-2 出现两次：LLM 表 + 要素直出表）。
    按表号或表头指纹分组，保留要素直出或较完整的一张。
    """
    text = str(content or "")
    if not text.strip():
        return text
    matches = _iter_pipe_markdown_table_blocks(text)
    if len(matches) < 2:
        return text

    groups: dict[str, list[dict]] = {}
    for idx, m in enumerate(matches):
        before = text[: m.start()]
        block = m.group(0)
        key = _dedupe_key_for_markdown_table_block(text, m, block_index=idx)
        groups.setdefault(key, []).append(
            {
                "idx": idx,
                "m": m,
                "score": _score_table_block_for_keep(before, block),
            }
        )

    remove_spans: list[tuple[int, int]] = []
    for _key, items in groups.items():
        if len(items) < 2:
            continue
        items.sort(key=lambda x: (-x["score"], x["idx"]))
        for dup in items[1:]:
            remove_spans.append(_span_for_duplicate_table_removal(text, dup["m"]))

    return _apply_markdown_table_block_removals(text, remove_spans)


def _dedupe_211_duplicate_markdown_tables(content: str) -> str:
    """
    2.1.1 常见故障：模型在段落间重复输出同一张「原料数量及组成」表，或要素直出与模型表并存。
    判断规则：表头行归一化后一致即视为同表（兼容 LLM 表与要素管理表数据略不同的情况）。
    若表头也不同，则回退到全表体指纹一致判断。
    保留每组重复中的第一张，删除后续副本（可含紧贴的表题行）。
    """
    text = str(content or "")
    if not text.strip():
        return text

    pat = re.compile(r"(?m)(?:^\s*\|.+\|\s*\n){3,}")
    matches = list(pat.finditer(text))
    if len(matches) < 2:
        return text

    headers: list[str] = []
    fingerprints: list[str] = []
    for m in matches:
        headers.append(_extract_table_header_key(m.group(0)))
        fingerprints.append(_markdown_table_body_fingerprint(m.group(0)))

    seen_headers: dict[str, int] = {}
    seen_fps: dict[str, int] = {}
    remove_spans: list[tuple[int, int]] = []

    for idx, m in enumerate(matches):
        hdr = headers[idx]
        fp = fingerprints[idx]
        is_dup = False
        if hdr and hdr in seen_headers:
            is_dup = True
        elif fp and fp in seen_fps:
            is_dup = True

        if not is_dup:
            if hdr:
                seen_headers[hdr] = idx
            if fp:
                seen_fps[fp] = idx
            continue

        start = m.start()
        prefix = text[:start]
        stripped_prefix = prefix.rstrip("\n")
        last_nl = stripped_prefix.rfind("\n")
        title_line = stripped_prefix[last_nl + 1 :] if last_nl >= 0 else stripped_prefix
        if _is_likely_table1_raw_material_caption(title_line):
            start = last_nl + 1 if last_nl >= 0 else 0
        # 同时去掉紧接在表前的 HTML 注释行（要素管理标记）
        before = text[:start]
        if before.rstrip().endswith("-->"):
            comment_start = before.rstrip().rfind("<!--")
            if comment_start >= 0:
                nl_before_comment = before.rfind("\n", 0, comment_start)
                start = nl_before_comment + 1 if nl_before_comment >= 0 else comment_start
        remove_spans.append((start, m.end()))

    if not remove_spans:
        return text

    remove_spans.sort(key=lambda x: x[0], reverse=True)
    for a, b in remove_spans:
        text = text[:a] + text[b:]
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _strip_minimal_missing_table_tail(content: str) -> str:
    """
    移除 _append_minimal_missing_tables 产生的占位表残留：整段或仅余「| 关键数据 | 待补充 |」一行。
    兼容全角竖线「｜」。
    """
    t = str(content or "").replace("｜", "|")
    if not t.strip():
        return t

    changed = True
    while changed:
        changed = False
        old = t
        # 完整三行占位表（可选上一行 ### 表题）
        t = re.sub(
            r"\n(?:#{1,6}\s*[^\n]+\n\n)?\|\s*项目\s*\|\s*内容\s*\|\s*\n\|\s*---\s*\|\s*---\s*\|\s*\n\|\s*关键数据\s*\|\s*待补充\s*\|\s*$",
            "",
            t,
            flags=re.IGNORECASE,
        )
        # 仅余数据行（表头已被其它步骤剥掉时）
        t = re.sub(
            r"\n\|\s*关键数据\s*\|\s*待补充\s*\|\s*$",
            "",
            t,
            flags=re.IGNORECASE,
        )
        if t != old:
            changed = True
    return t.rstrip()


def _strip_orphan_markdown_table_rows(content: str) -> str:
    """Delete a stray pipe row that the model emits after a finished Markdown table.

    After each run of table lines, blank lines are skipped. If the next line
    is a pipe row (not a separator) and is not itself followed - blank lines
    aside - by a separator row, it cannot be the header of a new table and is
    dropped. Everything else, including the blank lines, is kept.
    """

    def _is_table_line(candidate: str) -> bool:
        return _is_pipe_markdown_table_row_line(
            candidate
        ) or _is_pipe_markdown_table_separator_line(candidate)

    source = content.split("\n")
    total = len(source)
    result: list[str] = []
    pos = 0
    while pos < total:
        if not _is_table_line(source[pos]):
            result.append(source[pos])
            pos += 1
            continue

        # Copy the whole run of table lines.
        table_end = pos
        while table_end < total and _is_table_line(source[table_end]):
            table_end += 1
        result.extend(source[pos:table_end])

        # Copy the blank lines that follow the table.
        blank_end = table_end
        while blank_end < total and not source[blank_end].strip():
            blank_end += 1
        result.extend(source[table_end:blank_end])
        pos = blank_end

        follows_with_row = (
            pos < total
            and _is_pipe_markdown_table_row_line(source[pos])
            and not _is_pipe_markdown_table_separator_line(source[pos])
        )
        if follows_with_row:
            peek = pos + 1
            while peek < total and not source[peek].strip():
                peek += 1
            opens_new_table = peek < total and _is_pipe_markdown_table_separator_line(
                source[peek]
            )
            if not opens_new_table:
                # Orphan row: skip it.
                pos += 1

    return "\n".join(result).strip()


# Regex fragment for a table number such as "表3-2" (ASCII/full-width/em dash).
_TABLE_NO_TOKEN = r"表\s*\d+\s*[\-－—]\s*\d+"


def _strip_331_table_crossrefs(content: str) -> str:
    """Remove table cross-references ("详见表…", "参见表…", "见表…", "如表…所示")
    from section 3.3.1, which is a text-only conformity evaluation.

    The substitutions run in a fixed order; afterwards doubled full stops and
    commas are collapsed, runs of 3+ newlines are reduced to two, and the
    result is stripped. Empty input is returned unchanged (as "").
    """
    text = str(content or "")
    if not text:
        return text

    chain = rf"{_TABLE_NO_TOKEN}(?:\s*[、,，]\s*{_TABLE_NO_TOKEN})*"
    comma_lead = r"[，,；;、]"

    # (pattern, replacement, flags) applied strictly in this order.
    steps = (
        (rf"[（(]\s*详见\s*{chain}\s*[）)]", "", 0),
        (rf"[（(]\s*参见\s*{chain}\s*[）)]", "", 0),
        # Deleting ", 详见…" outright tends to glue neighbouring sentences,
        # so after a CJK character it is replaced by a full stop instead.
        (rf"([\u4e00-\u9fff]){comma_lead}\s*详见\s*{chain}\s*[。．]*", r"\1。", 0),
        (rf"{comma_lead}\s*详见\s*{chain}\s*[。．]*", "", 0),
        (rf"([\u4e00-\u9fff]){comma_lead}\s*参见\s*{chain}\s*[。．]*", r"\1。", 0),
        (rf"{comma_lead}\s*参见\s*{chain}\s*[。．]*", "", 0),
        (
            rf"{comma_lead}\s*见\s*{chain}(?:\s*[~～\-至到]\s*{chain})?(?:\s*所示)?\s*[。．]*",
            "",
            0,
        ),
        (rf"{comma_lead}\s*如\s*{chain}\s*所示\s*[。．]*", "", 0),
        (rf"([\u4e00-\u9fff])(?:详见|参见)\s*{chain}\s*[。．]?", r"\1。", 0),
        (
            rf"(?:^|(?<=[。\n]))[\s\u3000]*详见\s*{chain}\s*[。．]+",
            "",
            re.MULTILINE,
        ),
        (r"[。．]{2,}", "。", 0),
        (r"[，,]{2,}", "，", 0),
    )
    for pattern, replacement, flags in steps:
        text = re.sub(pattern, replacement, text, flags=flags)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _strip_333_trailing_table_caption_lines(content: str) -> str:
    """Remove table-caption lines wrongly pasted at the end of section 3.3.3.

    Trailing blank lines and trailing lines that start (optionally after
    whitespace or ``#``) with 表3-2 / 表3-3 / 表3-4 / 表2-7 are popped until a
    regular line is reached. The result is stripped; ``None`` is treated as
    an empty string.
    """
    lines = str(content or "").splitlines()
    # ``(?!\d)`` instead of ``\b``: ``\b`` is Unicode-aware, so a caption whose
    # title follows the number directly ("表3-2设备一览表") has no word
    # boundary after the digit and was never removed. The lookahead still
    # keeps longer numbers such as 表3-21 from matching.
    cap = re.compile(
        r"^[\s#]*表\s*(?:3\s*[\-－—]\s*[234]|2\s*[\-－—]\s*7)(?!\d).*$",
        flags=re.IGNORECASE,
    )
    while lines:
        tail = lines[-1].strip()
        if tail and not cap.match(tail):
            break
        lines.pop()
    return "\n".join(lines).strip()


def _strip_341_table_artifacts(content: str) -> str:
    """Clean section 3.4.1 (text-only construction-preparation evaluation).

    Removes table cross-references ("详见/参见/见/如 表…") and any trailing
    caption lines for 表3-2 / 表3-3 / 表3-4 / 表2-7, then collapses doubled
    punctuation and runs of 3+ newlines. Empty input is returned as "".
    """
    s = str(content or "")
    if not s:
        return s

    chain = rf"{_TABLE_NO_TOKEN}(?:\s*[、,，]\s*{_TABLE_NO_TOKEN})*"
    comma_lead = r"[，,；;、]"
    s = re.sub(rf"[（(]\s*(?:详见|参见)\s*{chain}\s*[）)]", "", s)
    # After a CJK character the reference becomes a full stop so that the
    # neighbouring sentences are not glued together.
    s = re.sub(rf"([\u4e00-\u9fff]){comma_lead}\s*详见\s*{chain}\s*[。．]*", r"\1。", s)
    s = re.sub(rf"{comma_lead}\s*详见\s*{chain}\s*[。．]*", "", s)
    s = re.sub(rf"([\u4e00-\u9fff]){comma_lead}\s*参见\s*{chain}\s*[。．]*", r"\1。", s)
    s = re.sub(rf"{comma_lead}\s*参见\s*{chain}\s*[。．]*", "", s)
    s = re.sub(
        rf"{comma_lead}\s*见\s*{chain}(?:\s*[~～\-至到]\s*{chain})?(?:\s*所示)?\s*[。．]*",
        "",
        s,
    )
    s = re.sub(rf"{comma_lead}\s*如\s*{chain}\s*所示\s*[。．]*", "", s)

    # ``(?!\d)`` instead of ``\b``: ``\b`` is Unicode-aware, so a caption such
    # as "表3-2设备一览表" (CJK right after the digit) has no word boundary and
    # was never removed. The lookahead still rejects 表3-21 and the like.
    cap = re.compile(
        r"^[\s#]*表\s*(?:3\s*[\-－—]\s*[234]|2\s*[\-－—]\s*7)(?!\d).*$",
        flags=re.IGNORECASE,
    )
    lines = s.splitlines()
    while lines:
        cur = lines[-1]
        if not cur.strip():
            lines.pop()
            continue
        if cap.match(cur.strip()):
            lines.pop()
            continue
        break
    s = "\n".join(lines)
    s = re.sub(r"[。．]{2,}", "。", s)
    s = re.sub(r"[，,]{2,}", "，", s)
    return re.sub(r"\n{3,}", "\n\n", s).strip()


def _strip_unallowed_table_references(
    content: str,
    *,
    allowed_table_tokens: Optional[list[str]] = None,
) -> str:
    """
    Generic cleanup of table references that do not belong to this section.

    - Deletes "详见/参见/见/如 表…" references whose table token is not in
      ``allowed_table_tokens``.
    - Drops whole lines that start with a table number when none of the table
      tokens on that line is allowed (a common shape of mis-pasted captions).

    Each surviving line also loses trailing list punctuation and has runs of
    whitespace collapsed. The joined result is stripped.
    """
    text = str(content or "")
    if not text:
        return text
    whitelist = {
        _norm_table_token(tok) for tok in (allowed_table_tokens or []) if str(tok).strip()
    }

    def _permitted(tok: str) -> bool:
        normalized = _norm_table_token(tok)
        return bool(normalized) and normalized in whitelist

    def _keep_or_drop(m: re.Match) -> str:
        # Keep the full reference only when its first table token is allowed.
        return m.group(0) if _permitted(str(m.group("tok") or "")) else ""

    ref_pat = re.compile(
        r"(?P<prefix>详见|参见|见|如)\s*"
        r"(?P<tok>(?:附表|表)\s*\d+(?:\s*[.\-－—–]\s*\d+)*)"
        r"(?P<tail>(?:\s*[~～\-至到]\s*(?:附表|表)\s*\d+(?:\s*[.\-－—–]\s*\d+)*)?(?:\s*所示)?)"
    )
    title_line_start_pat = re.compile(r"^[#>*\-\d\.\)（）\s]*(?:附表|表)\s*\d+", flags=re.IGNORECASE)
    title_line_token_pat = re.compile(r"(?:附表|表)\s*\d+(?:\s*[.\-－—–]\s*\d+)*")

    kept: list[str] = []
    for raw in text.splitlines():
        line = str(raw or "")
        if title_line_start_pat.match(line.strip()):
            found = title_line_token_pat.findall(line)
            if found and not any(_permitted(tok) for tok in found):
                continue

        line = ref_pat.sub(_keep_or_drop, line)
        line = re.sub(r"[，,；;、]\s*$", "", line)
        kept.append(re.sub(r"\s{2,}", " ", line).rstrip())

    merged = "\n".join(kept)
    for pattern, replacement in ((r"[。．]{2,}", "。"), (r"[，,]{2,}", "，")):
        merged = re.sub(pattern, replacement, merged)
    return re.sub(r"\n{3,}", "\n\n", merged).strip()


# Caption line for "表2.6-1" (optionally wrapped in #/* markup), any blank
# lines after it, and an optional pipe table that follows. ``(?!\d)`` keeps
# 表2.6-10 … 表2.6-19 from being mistaken for 表2.6-1.
_RE_TABLE_261 = re.compile(
    r"(?:^|\n)\s*[#*]*\s*\**\s*表\s*2[.\s]*6\s*[-－—–.]\s*1(?!\d)\s*\**[^\n]*\n"
    r"(?:\s*\n)*"
    r"(?:(?:\s*\|[^\n]+\|\s*\n)+)?",
)


def _strip_211_stray_table_261(content: str) -> str:
    """Remove a leftover "表2.6-1" caption line and the small table right after
    it (e.g. an empty two-column 项目/内容 shell) from section 2.1.1.

    The match swallows the newline before the caption and every newline after
    the table, so it is replaced by a paragraph break rather than by nothing;
    otherwise the text before and after the removed block would be fused into
    one line. Falsy content, or content without the character "2", is
    returned unchanged.
    """
    if not content or "2" not in content:
        return content
    return re.sub(r"\n{3,}", "\n\n", _RE_TABLE_261.sub("\n\n", content)).strip()


def _table54_caption_in_preceding_lines(text_before: str, *, max_lines: int = 10) -> bool:
    """Return True when a "表5-4 …生产经营及效益…" caption appears shortly before
    a table block.

    Only the last ``max_lines`` non-blank lines of ``text_before`` are
    inspected. They must contain the table number 表5-4 (whitespace ignored)
    and either "生产经营" or "效益情况对比".
    """
    # ``lines[-0:]`` would select every line, so a non-positive window is
    # handled explicitly as "inspect nothing".
    if max_lines <= 0:
        return False
    lines = [ln for ln in str(text_before or "").splitlines() if ln.strip()]
    tail = "\n".join(lines[-max_lines:])
    compact = re.sub(r"\s+", "", tail)
    # Accept the em/en dashes that the token regexes elsewhere in this file
    # ([-－—–]) also accept, in addition to the original "-", "－" and "．".
    if not re.search(r"表\s*5\s*[-－—–．]\s*4", compact, flags=re.I):
        return False
    return "生产经营" in tail or "效益情况对比" in tail


def _is_531_spurious_simple_benefit_table(block: str, text_before: str) -> bool:
    """
    Decide whether a table block in section 5.3.1 is a model-made simple table.

    The model often invents a three-row table under the 表5-4 caption, or
    appends a sample "可研值|后评价值" table after the prose. A full 表5-4 that
    is preceded by the element-source marker is never treated as spurious.
    """
    if _table54_body_preceded_by_element_source(text_before):
        return False
    if _is_table54_simplified_extract_body(block):
        return True

    header = _extract_table_header_key(block)
    if not header:
        return False
    if any(marker in header for marker in ("后评价值", "后评价报告")):
        return True
    # "指标 + 可研值" without a 项目 / 不确定因素 column is the sample layout.
    looks_like_sample = "指标" in header and "可研值" in header
    has_subject_column = "项目" in header or "不确定因素" in header
    return looks_like_sample and not has_subject_column


def _first_table57_caption_pos(content: str) -> int | None:
    """5.4 正文中首张「表5-7」表题行的起始位置（无则 None）。"""
    text = str(content or "")
    m = re.search(
        r"(?:^|\n)([^\n]*?表\s*5\s*[-－．]\s*7[^\n]*)\n",
        text,
        flags=re.IGNORECASE,
    )
    if not m:
        return None
    pos = m.start()
    if pos > 0 and text[pos] == "\n":
        pos += 1
    return pos


def _is_57_authoritative_table_preceding(text_before: str) -> bool:
    """Tell whether a table block comes right after a 表5-7 caption or after the
    element-source comment.

    Looks for the marker "表格来源：要素管理" in the last 1200 characters, then
    for the table number 表5-7 (whitespace ignored) in the last 400.
    """
    recent = str(text_before or "")[-1200:]
    if "表格来源：要素管理" in recent:
        return True
    squeezed = re.sub(r"\s+", "", recent[-400:])
    hit = re.search(r"表\s*5\s*[-－．]\s*7", squeezed, flags=re.IGNORECASE)
    return hit is not None


def _is_54_spurious_table(block: str, text_before: str, *, before_table57: bool) -> bool:
    """
    Decide whether a table block in section 5.4 should be removed.

    Only 表5-7 is allowed in 5.4. The model tends to invent a
    "指标|可研值|后评价值|差值" table (mostly copied from the chapter 5 sample),
    or to repeat 表5-7 after the prose. Any table located before the first
    表5-7 caption is spurious; a table directly preceded by the caption or by
    the element-source comment is kept. Otherwise the header decides.
    """
    if before_table57:
        return True
    if _is_57_authoritative_table_preceding(text_before):
        return False

    header = _extract_table_header_key(block)
    if not header:
        return True

    def _has_any(*keys: str) -> bool:
        return any(key in header for key in keys)

    has_feasibility = "可研值" in header
    has_subject = _has_any("项目", "不确定因素")

    if _has_any("后评价值", "后评价报告"):
        return True
    if has_feasibility and _has_any("差值", "增减"):
        return True
    if "指标" in header and has_feasibility and not has_subject:
        return True
    # A legitimate 表5-7 header usually has 项目/不确定因素 plus 数值/临界点 and
    # no separate feasibility / post-evaluation columns.
    if has_subject and _has_any("数值", "临界点", "临界值"):
        return False
    return has_feasibility


def _strip_54_spurious_llm_table(content: str) -> str:
    """Remove model-made tables from section 5.4.

    Drops every pipe table (3+ rows) that ``_is_54_spurious_table`` flags:
    tables placed before the first 表5-7 caption and non-authoritative
    "可研/后评" comparison tables. Blank or empty input is returned as is.
    """
    text = str(content or "")
    if not text.strip():
        return text
    first_57 = _first_table57_caption_pos(text)
    table_pat = re.compile(r"(?m)((?:^\s*\|.+\|\s*\n){3,})")
    remove_ranges: list[tuple[int, int]] = []
    for m in table_pat.finditer(text):
        before_57 = first_57 is not None and m.start() < first_57
        if not _is_54_spurious_table(
            m.group(0), text[: m.start()], before_table57=before_57
        ):
            continue
        # Widen the span over the surrounding newlines so no blank gap is left.
        start = m.start()
        while start > 0 and text[start - 1] == "\n":
            start -= 1
        end = m.end()
        while end < len(text) and text[end] == "\n":
            end += 1
        remove_ranges.append((start, end))
    if not remove_ranges:
        return text
    kept: list[str] = []
    pos = 0
    for start, end in sorted(remove_ranges):
        kept.append(text[pos:start])
        pos = end
    kept.append(text[pos:])
    # The widened spans swallow every newline around a removed table, so the
    # surviving pieces are rejoined with a paragraph break. Plain
    # concatenation fused the paragraph before the table with the one after.
    merged = "\n\n".join(piece for piece in kept if piece)
    return re.sub(r"\n{3,}", "\n\n", merged).strip()


def _is_521_proper_investment_table52_block(block: str) -> bool:
    """Tell whether a table block is the official "投资变动情况表" body: its header
    names a 工程或费用(名称) column plus at least one of 投资估算 / 初设概算 /
    竣工决算."""
    header = re.sub(r"\s+", "", _extract_table_header_key(block))
    if not header:
        return False
    names_cost_item = any(key in header for key in ("工程或费用名称", "工程或费用"))
    has_stage_column = any(key in header for key in ("投资估算", "初设概算", "竣工决算"))
    return names_cost_item and has_stage_column


def _strip_521_spurious_llm_table52(content: str) -> str:
    """
    Section 5.2.1: remove benchmark-style simple tables (scale, unit cost, …,
    mostly copied from the chapter 5 sample) that sit under a "表5-2" caption
    before the first properly structured investment-change table.

    Nothing is removed when fewer than two table blocks exist or when no
    proper investment table is found.
    """
    text = str(content or "")
    blocks = _iter_pipe_markdown_table_blocks(text)
    if len(blocks) < 2:
        return text

    first_proper = None
    for position, candidate in enumerate(blocks):
        if _is_521_proper_investment_table52_block(candidate.group(0)):
            first_proper = position
            break
    if first_proper is None:
        return text

    # Only tables ahead of the proper one that have 表5-2 in their recent
    # context are removed.
    spans: list[tuple[int, int]] = [
        _span_for_duplicate_table_removal(text, candidate)
        for candidate in blocks[:first_proper]
        if not _is_521_proper_investment_table52_block(candidate.group(0))
        and _table_token_in_recent_context(text[: candidate.start()], "表5-2")
    ]
    return _apply_markdown_table_block_removals(text, spans)


def _strip_531_spurious_llm_table(content: str) -> str:
    """Remove simplified 表5-4 tables from section 5.3.1 that are not element
    output, including the three-row model/extract table placed under the
    caption. Blank or empty input is returned as is.
    """
    text = str(content or "")
    if not text.strip():
        return text
    table_pat = re.compile(r"(?m)(?:^\s*\|.+\|\s*\n){3,}")
    remove_ranges: list[tuple[int, int]] = []
    for m in table_pat.finditer(text):
        if not _is_531_spurious_simple_benefit_table(m.group(0), text[: m.start()]):
            continue
        # Widen the span over the surrounding newlines so no blank gap is left.
        start = m.start()
        while start > 0 and text[start - 1] == "\n":
            start -= 1
        end = m.end()
        while end < len(text) and text[end] == "\n":
            end += 1
        remove_ranges.append((start, end))
    if not remove_ranges:
        return text
    kept: list[str] = []
    pos = 0
    for start, end in sorted(remove_ranges):
        kept.append(text[pos:start])
        pos = end
    kept.append(text[pos:])
    # The widened spans swallow every newline around a removed table, so the
    # surviving pieces are rejoined with a paragraph break. Plain
    # concatenation fused the paragraph before the table with the one after.
    merged = "\n\n".join(piece for piece in kept if piece)
    return re.sub(r"\n{3,}", "\n\n", merged).strip()


def _strip_532_embedded_appendix8_table(content: str) -> str:
    """Section 5.3.2 must not embed "附表8 可研报告和后评价参数对比表".

    Removes the caption together with its Markdown or HTML table, covering the
    variants where a bare "###" line precedes the caption or the caption is
    split over two lines, and finally any caption left on its own. Each match
    is replaced by a single newline; the result is stripped.
    """
    text = str(content or "")
    if not text:
        return text

    kw8 = r"附表\s*8"
    kwname = r"可研报告和后评价参数对比表"
    # Shared pattern fragments.
    caption_line = rf"(?:^|\n)[^\n]*{kw8}[^\n]*{kwname}[^\n]*\n"
    gap = r"(?:\s*\n|<!--[\s\S]*?-->\s*\n)*"
    pipe_rows = r"(?:\s*\|[^\n]+\|\s*\n)+"
    bare_hash_lines = rf"(?:^|\n)(?:\s*#{{1,6}}\s*\n)+(?:\s*\n)*"

    patterns = (
        # Caption and table body in the same physical paragraph.
        caption_line + gap + pipe_rows,
        # A line holding only "###", then the caption (DOCX-export friendly).
        bare_hash_lines
        + rf"(?:[^\n]*{kw8}[^\n]*\n(?:\s*[^\n]*{kwname}[^\n]*\n)?)"
        + gap
        + pipe_rows,
        # Caption followed by an HTML table.
        caption_line + gap + r"\s*<table>[\s\S]*?</table>",
        # Caption only, after bare "###" lines.
        bare_hash_lines + rf"[^\n]*{kw8}[^\n]*(?:{kwname})?[^\n]*(?=\n|$)",
        # Caption only, with optional list/heading markers.
        rf"(?:^|\n)\s*[#>*\-\d\.\)（）\s]*[^\n]*{kw8}[^\n]*{kwname}[^\n]*(?=\n|$)",
    )
    for pattern in patterns:
        text = re.sub(pattern, "\n", text, flags=re.IGNORECASE)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _pipe_markdown_row_cells(line: str) -> list[str]:
    """Split one pipe-delimited Markdown table row into its trimmed cells.

    Returns an empty list when the trimmed line does not both start and end
    with "|".
    """
    trimmed = str(line or "").rstrip("\n").strip()
    if trimmed.startswith("|") and trimmed.endswith("|"):
        return [cell.strip() for cell in trimmed[1:-1].split("|")]
    return []


def _strip_md_cell_noise(s: str) -> str:
    """Remove Markdown emphasis asterisks and ``<br>`` tags from a cell, then
    trim surrounding whitespace."""
    cleaned = str(s or "")
    for pattern, flags in ((r"\*+", 0), (r"<br\s*/?>", re.I)):
        cleaned = re.sub(pattern, "", cleaned, flags=flags)
    return cleaned.strip()


def _strip_532_table55_bad_markdown_columns(content: str) -> str:
    """Drop surplus columns from the "表5-5 主要生产经营指标" Markdown table.

    Removed columns are those whose compacted header contains the typo
    "时点点后", and a bare "后评价时点后预测值" column when year-slotted forecast
    columns are also present. Text without "主要生产经营指标" is returned
    unchanged.

    Everything between the caption and the table (blank lines, HTML comments)
    and the table's trailing newline(s) are preserved in the rewritten text.
    """
    text = str(content or "")
    if not text or "主要生产经营指标" not in text:
        return text
    fc = "后评价时点后预测值"

    def _bad_header_indices(header_cells: list[str]) -> set[int]:
        """Return the indices of header cells whose columns must be dropped."""
        bad: set[int] = set()
        comp_cells = [_compact_zh_ident(_strip_md_cell_noise(h)) for h in header_cells]
        # has_slot: at least one forecast column carries a year suffix.
        has_slot = False
        for h in header_cells:
            hs = str(h or "")
            parts = _split_group_year_col_key(hs)
            if parts and parts[0] == fc:
                tail = parts[1].strip()
                if _appendix_norm_year_tail(tail) or (
                    _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(tail) and 1900 <= int(tail) <= 2100
                ):
                    has_slot = True
                    break
            if re.search(r"后评价时点后预测值\s*[-－—–]\s*\d{4}", hs):
                has_slot = True
                break
        fc_c = _compact_zh_ident(fc)
        for i, c in enumerate(comp_cells):
            if "时点点后" in c:
                bad.add(i)
            if has_slot and c == fc_c:
                bad.add(i)
        return bad

    def _drop_cols_from_pipe_block(block: str, drop_idx: set[int]) -> str:
        """Rebuild the pipe rows of ``block`` without the columns in ``drop_idx``."""
        if not drop_idx:
            return block
        # splitlines() + join() discards the trailing newline(s); keep them
        # aside so the last table row is not fused with the text that follows.
        core = block.rstrip("\n")
        trailing = block[len(core):]
        out_lines: list[str] = []
        for ln in core.splitlines():
            if not ln.strip().startswith("|"):
                out_lines.append(ln)
                continue
            cells = _pipe_markdown_row_cells(ln)
            if not cells:
                out_lines.append(ln)
                continue
            new_cells = [c for j, c in enumerate(cells) if j not in drop_idx]
            if len(new_cells) == len(cells):
                out_lines.append(ln)
            else:
                out_lines.append("| " + " | ".join(new_cells) + " |")
        return "\n".join(out_lines) + trailing

    rx = re.compile(
        r"((?:^|\n)[^\n]*表\s*5\s*[-－\.．·]\s*5[^\n]*主要生产经营指标[^\n]*\n)"
        r"(?:\s*\n|<!--[^\n]*-->\s*\n)*"
        r"((?:^[ \t]*\|[^\n]+\|\s*\n)+)",
        flags=re.MULTILINE | re.IGNORECASE,
    )

    def _repl(m: re.Match) -> str:
        body = m.group(2)
        tbl_lines = [
            ln
            for ln in body.splitlines()
            if ln.strip().startswith("|") and ln.strip().endswith("|")
        ]
        if len(tbl_lines) < 2:
            return m.group(0)
        hdr = _pipe_markdown_row_cells(tbl_lines[0])
        if not hdr:
            return m.group(0)
        drop = _bad_header_indices(hdr)
        if not drop:
            return m.group(0)
        # Keep everything ahead of the table body: the caption plus the
        # uncaptured gap (blank lines / comments) between caption and table.
        head = m.group(0)[: m.start(2) - m.start()]
        return head + _drop_cols_from_pipe_block(body, drop)

    return rx.sub(_repl, text)


def _cleanup_section_table_artifacts(
    section_title: str,
    content: str,
    *,
    allowed_table_tokens: Optional[list[str]] = None,
) -> str:
    """Run the table-artifact cleanup that applies to one section.

    Table references outside ``allowed_table_tokens`` are always removed
    first. Section-specific cleaners are then applied according to the
    section number extracted from ``section_title``.
    """
    section_no = _extract_section_number(str(section_title or ""))
    cleaned = _strip_unallowed_table_references(
        str(content or ""),
        allowed_table_tokens=allowed_table_tokens,
    )

    # Text-only sections: a single dedicated cleaner, nothing else applies.
    if section_no == "3.3.1":
        return _strip_331_table_crossrefs(cleaned)
    if section_no == "3.3.3":
        return _strip_333_trailing_table_caption_lines(cleaned)
    if section_no == "3.4.1":
        return _strip_341_table_artifacts(cleaned)

    # Final safety net: earlier steps may re-introduce cross-section tables
    # into 4.3.2 / 4.3.3.
    if section_no in {"4.3.2", "4.3.3"}:
        cleaned = _remove_cross_section_table_pollution(section_title, cleaned)
    if section_no == "4.3.3":
        cleaned = _dedupe_433_alkylation_operation_analysis_markdown_tables(cleaned)
    if section_no == "5.3.2":
        cleaned = _strip_532_embedded_appendix8_table(cleaned)
        cleaned = _strip_532_table55_bad_markdown_columns(cleaned)
    if section_no in {"2.1.5", "3.3.2", "3.3.4", "5.1", "5.3.1", "5.3.2"}:
        cleaned = _finalize_section_table_dedupe(cleaned, allowed_table_tokens)

    if section_no == "2.1.1":
        cleaned = _strip_211_stray_table_261(cleaned)
        cleaned = _dedupe_211_duplicate_markdown_tables(cleaned)
    elif section_no == "5.2.1":
        cleaned = _fix_521_table52_wrong_caption(cleaned)
        cleaned = _strip_521_spurious_llm_table52(cleaned)
        cleaned = _finalize_section_table_dedupe(cleaned, allowed_table_tokens)
    elif section_no == "5.3.1":
        cleaned = _strip_531_spurious_llm_table(cleaned)
    elif section_no == "5.4":
        cleaned = _strip_54_spurious_llm_table(cleaned)
    return cleaned


def _chapter5_opening_heading_present(text: str) -> bool:
    """Tell whether the text already opens with the chapter 5 title.

    The first line (leading ``#`` removed) qualifies when it contains both
    "第5章" and "投资与经济效益评价", or starts with "5 投资与经济效益评价" —
    the same form as chapter 1's "1 项目概况".
    """
    stripped = str(text or "").strip()
    if not stripped:
        return False
    heading = stripped.partition("\n")[0].strip().lstrip("#").strip()
    if "第5章" in heading and "投资与经济效益评价" in heading:
        return True
    return re.match(r"^5\s+投资与经济效益评价", heading) is not None


def _canonicalize_chapter5_shell_heading_line(text: str) -> str:
    """
    Rewrite the first standalone "第5章 投资与经济效益评价" line to
    "5 投资与经济效益评价", the form used by chapter 1 in the template.

    Only the first matching non-blank line is replaced; all other lines are
    kept verbatim.
    """
    rows = str(text or "").split("\n")
    for position, row in enumerate(rows):
        if not row.strip():
            continue
        bare = row.strip().lstrip("#").strip()
        is_chapter_title = bare == "第5章 投资与经济效益评价" or (
            bare.startswith("第5章") and "投资与经济效益评价" in bare
        )
        if is_chapter_title:
            rows[position] = "5 投资与经济效益评价"
            break
    return "\n".join(rows)


def _section_heading_present_in_text(text: str, heading_title: str) -> bool:
    """Tell whether the given section heading already occurs as a line in the text.

    A line matches when its normalized form equals the normalized heading, or
    when it is recognised as the heading line for the same section number.
    For section "5" the chapter-title check on the opening line also counts.
    """
    if not str(text or "").strip() or not str(heading_title or "").strip():
        return False
    wanted = _title_compare_norm(heading_title)
    section_no = _extract_section_number(heading_title)

    def _line_is_heading(raw: str) -> bool:
        trimmed = raw.strip()
        if not trimmed:
            return False
        plain = trimmed.lstrip("#").strip()
        if _title_compare_norm(plain) == wanted:
            return True
        return bool(section_no and _is_heading_line_for_section(plain, section_no))

    if any(_line_is_heading(raw) for raw in str(text).splitlines()):
        return True
    return section_no == "5" and _chapter5_opening_heading_present(text)


def _inject_missing_parent_section_headings(
    section_title: str,
    content: str,
    previous_section_content: str,
    chapter_title_map: dict[str, str],
) -> str:
    """
    Prepend the parent-section headings that are still missing.

    When only leaf sections are generated, parent shells (e.g. 5.2, 5.3, 5)
    are never stored on their own, so their titles are added in front of the
    first child. A title is missing when it is found neither in this body nor
    in the previous section's content.

    Returns ``content`` unchanged when there is no title map, no ancestor or
    no body; otherwise the stripped body, with missing titles prepended.
    """
    if not chapter_title_map:
        return content

    stub = SimpleNamespace(section_title=str(section_title or "").strip())
    ancestors = _resolve_ancestor_titles_for_section(stub, chapter_title_map)
    if not ancestors:
        return content

    body = str(content or "").strip()
    if not body:
        return content
    if _extract_section_number(section_title or "") == "5.1":
        body = _canonicalize_chapter5_shell_heading_line(body)

    prior = str(previous_section_content or "")
    absent = [
        title
        for title in ancestors
        if not _section_heading_present_in_text(body, title)
        and not _section_heading_present_in_text(prior, title)
    ]
    if not absent:
        return body
    return "\n\n".join([*absent, body]).strip()


def _inject_chapter5_title_before_section_51(
    section_key: str,
    content: str,
    previous_section_content: str,
    *,
    section_title: str = "",
    chapter_title_map: Optional[dict[str, str]] = None,
) -> str:
    """Legacy entry point; delegates to the generic parent-heading injection
    whenever both a title map and a section title are supplied.

    Otherwise only section key "5-1" is handled: the chapter 5 title is
    prepended unless the body or the previous section already opens with it.
    """
    if chapter_title_map and section_title:
        return _inject_missing_parent_section_headings(
            section_title, content, previous_section_content, chapter_title_map
        )
    if str(section_key or "").strip() != "5-1":
        return content

    body = _canonicalize_chapter5_shell_heading_line(str(content or "").strip())
    if not body:
        return content
    title_already_shown = _chapter5_opening_heading_present(
        body
    ) or _chapter5_opening_heading_present(previous_section_content)
    if title_already_shown:
        return body
    return "5 投资与经济效益评价\n\n" + body


def _previous_completed_section_content(
    section: ReportTemplateSection,
    sections: list[ReportTemplateSection],
    completed_contents: dict[str, str],
) -> str:
    """Return the nearest non-empty generated body that precedes ``section`` in
    template order (used to check whether a parent heading was already shown).

    Returns "" when the section is not in ``sections`` or no earlier section
    has content.
    """
    ordered = list(sections or [])
    own_index = next(
        (i for i, s in enumerate(ordered) if s.section_key == section.section_key),
        None,
    )
    if own_index is None:
        return ""
    # Walk backwards from the section just before the current one.
    for earlier in reversed(ordered[:own_index]):
        body = str(completed_contents.get(earlier.section_key) or "").strip()
        if body:
            return body
    return ""


def _prev_line_invites_metric_continuation(prev_line: str) -> bool:
    """Tell whether the previous line looks cut off mid-sentence, so that the
    next line should continue it with an energy/material consumption figure.

    Blank lines, table rows and numbered section headings never qualify.
    """
    line = str(prev_line or "").strip()
    if not line or line.startswith("|"):
        return False
    if re.match(r"^\s{0,3}#{0,6}\s*\d+(?:\.\d+)+\s+[\u4e00-\u9fff]", line):
        return False

    # Any one of these endings marks a dangling sentence.
    dangling_endings = (
        # trailing citation markers such as "[50]"
        r"(?:\[\d+\]\s*)+$",
        # a metric noun, optionally followed by "为" and citations
        r"(?:单耗|电耗|能耗|水耗|物耗|损失|运行值|设计值|加工量|负荷|占比)为?"
        r"\s*(?:\[\d+\]\s*)*$",
        # a verb that expects a value, optionally followed by citations
        r"(上升至|升至|降至|下降为|提高为|降低为|为|达到|至)\s*(?:\[\d+\]\s*)*$",
        r"[至为是到]$",
    )
    return any(re.search(pattern, line) for pattern in dangling_endings)


def _merge_orphan_energy_metric_lines(text: str) -> str:
    """
    Merge an energy/material figure that was broken onto its own line back
    into the preceding prose line.

    Example: "…可研报告 [50]\\n132.41 MJ/t产品及初步设计" becomes one line, so
    the front end does not mistake the figure for a ### heading. The figure is
    appended to the nearest non-blank earlier line only when that line looks
    unfinished; blank lines in between are left in place.
    """
    from services.docx_export_service import _is_likely_section_number

    metric_re = re.compile(
        r"^\s*(?:#{1,6}\s+)?(\d+(?:\.\d+)?)\s+(MJ/t|kWh/t|kgce/t|t产品)",
        re.IGNORECASE,
    )
    merged: list[str] = []
    for raw in str(text or "").split("\n"):
        bare = re.sub(r"^#{1,6}\s+", "", str(raw or "").strip())
        hit = metric_re.match(bare)
        if hit is None or _is_likely_section_number(hit.group(1)):
            merged.append(raw)
            continue

        # Find the closest earlier line that is not blank.
        anchor = next(
            (k for k in range(len(merged) - 1, -1, -1) if str(merged[k] or "").strip()),
            -1,
        )
        if anchor >= 0 and _prev_line_invites_metric_continuation(str(merged[anchor] or "")):
            merged[anchor] = merged[anchor].rstrip() + bare
            continue
        merged.append(raw)
    return "\n".join(merged)


def _fix_numeric_line_breaks(content: str) -> str:
    """
    Repair numbers that were accidentally separated from their unit or date
    part by a line break.

    Only clearly numeric continuations are merged, so ordinary paragraph
    breaks are left alone as far as possible. Line breaks around Markdown
    table rows and in front of numbered section headings are shielded with
    placeholder tokens while the merge rules run and restored at the end.
    The function also normalizes m2/m3 to m²/m³. Empty input is returned as "".
    """
    text = str(content or "")
    if not text:
        return text
    # Normalize every line separator first; \u2028/\u2029 would otherwise
    # defeat the rules below.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\u2028", "\n").replace("\u2029", "\n")

    # Protect the line breaks between Markdown table rows, and between a table
    # row and the text around it; otherwise the numeric merge rules would glue
    # the last table row to the next line and create a bogus extra column.
    # Approach: split into lines, treat every line that starts or ends with
    # "|" as a table row, replace the newlines around it with a placeholder,
    # and restore them once the merge rules have run.
    table_nl_token = "\u0000TABLE_NL\u0000"
    _lines = text.split("\n")
    for _li in range(len(_lines)):
        _stripped = _lines[_li].strip()
        _is_table = _stripped.startswith("|") or _stripped.endswith("|")
        if _is_table:
            _lines[_li] = table_nl_token + _lines[_li] + table_nl_token
    text = "\n".join(_lines)
    # Collapse token/newline combinations so each protected break is one token.
    text = text.replace(table_nl_token + "\n" + table_nl_token, table_nl_token)
    text = text.replace("\n" + table_nl_token, table_nl_token)
    text = text.replace(table_nl_token + "\n", table_nl_token)

    # Protect the newline in front of a section heading
    # (e.g. "1 项目概况\n1.1 项目基本情况") from the numeric merge rules.
    heading_nl_token = "\u0000HEADING_NL\u0000"
    text = re.sub(
        r"\n(?=\s*\d+(?:\.\d+)*\s+[\u4e00-\u9fff]{2,}(?:\s|$))",
        heading_nl_token,
        text,
    )

    # Merging a digit/CJK character with the next line: only horizontal
    # whitespace (no newline) is allowed on either side of the break.
    # With \s* the first newline of a blank paragraph gap would be swallowed,
    # the \n(?!\n) guard would stop working, and "heading\n\n2017年…" would be
    # pulled back onto one line.
    _hsp = r"[ \t\u3000]*"
    # e.g. "2018 年 11 月\n4 日", "24.48\n%", "1906\nm2", "0.05\ng"
    text = re.sub(
        rf"(?<=\d){_hsp}\n(?!\n){_hsp}(?=(?:\d|[年月日时分秒度%％℃吨米台套项]|[A-Za-z]))",
        "",
        text,
    )
    # e.g. "烈度\n7 度", "规模\n15 万吨/年" (CJK text followed by a number).
    # Merged only when the next line is a number plus a common unit, so that
    # numbered list items such as "1）" or "1." are not affected.
    text = re.sub(
        rf"(?<=[\u4e00-\u9fff]){_hsp}\n(?!\n){_hsp}(?=\d+(?:\.\d+)?\s*(?![）)、.])(?:[年月日时分秒度%％℃吨米台套项个级亩万亿千百十gGlLmMkKvVaAwWhHzHPp]|[A-Za-z]{{1,4}}\b))",
        "",
        text,
    )
    # e.g. "106万\n工时", "15万吨/年\n烷基化项目" (magnitude followed by a CJK unit).
    text = re.sub(
        rf"(?<=[\d万亿千百十]){_hsp}\n(?!\n){_hsp}(?=(?:工时|吨/年|万吨/年|亿元|万元|万人|m2|m3|m²|m³|项|台|套|个|座|处|条|次|年|月|日))",
        "",
        text,
        flags=re.IGNORECASE,
    )
    # e.g. split units such as "kgEo/\nt" or "m\n2".
    # NOTE(review): these two rules use \s*, which also spans blank lines, and
    # the second one joins any two lines that meet letter-to-letter — confirm
    # that this is intended for Latin-script text.
    text = re.sub(r"(?<=[A-Za-z/])\s*\n\s*(?=\d)", "", text)
    text = re.sub(r"(?<=[A-Za-z])\s*\n\s*(?=[A-Za-z])", "", text)
    # e.g. "实际运行值为\n137.88 MJ/t" and "…单耗为 [93][94]\n\n1.38 MJ/t"
    # (line breaks commonly seen in sections 2.1.1 / 2.1.6).
    _metric_num = r"\d+(?:\.\d+)?\s*(?:MJ/t|kWh/t|kgce/t|t产品)"
    text = re.sub(
        rf"((?:\[\d+\]\s*)+)\s*\n+\s*({_metric_num})",
        r"\1 \2",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(
        rf"(?<=[\u4e00-\u9fff）\]）])\s*\n+\s*({_metric_num})",
        r" \1",
        text,
        flags=re.IGNORECASE,
    )
    text = _merge_orphan_energy_metric_lines(text)

    # Normalize area/volume units: m2/m3 -> m²/m³ (tolerates spaces, letter
    # case and the "^" notation).
    text = re.sub(r"(?i)\bm\s*(?:\^?\s*2)\b", "m²", text)
    text = re.sub(r"(?i)\bm\s*(?:\^?\s*3)\b", "m³", text)
    # Restore the protected line breaks.
    text = text.replace(heading_nl_token, "\n")
    text = text.replace(table_nl_token, "\n")
    return text


def _canonical_global_table_name_for_token(token: str) -> str | None:
    """Return the first name in ``MULTI_COLUMN_GLOBAL_SPECS`` that matches the
    table token, or ``None`` when the token is blank or nothing matches."""
    wanted = str(token or "").strip()
    if not wanted:
        return None
    return next(
        (name for name in MULTI_COLUMN_GLOBAL_SPECS if _table_token_matches_name(wanted, name)),
        None,
    )


def _skeleton_markdown_for_table_token(token: str, *, table_name: str = "") -> str:
    """Render a placeholder Markdown table body from the guideline template, so
    a required table (e.g. 表3-4) still has a body when the element data holds
    no usable cell.

    The table name is resolved from ``token`` first and from ``table_name`` as
    a fallback. Returns "" when no spec, row keys or columns are available.
    """
    resolved = _canonical_global_table_name_for_token(token) or ""
    if not resolved:
        fallback = str(table_name or "").strip()
        if _multi_column_global_spec_for_table(fallback):
            resolved = fallback
        elif fallback:
            resolved = _canonical_global_table_name_for_token(fallback) or fallback

    spec = _multi_column_global_spec_for_table(resolved)
    if not spec:
        return ""
    row_keys = global_table_row_keys(resolved)
    if not row_keys:
        return ""
    columns = [str(col).strip() for col in (spec[0] or []) if str(col).strip()]
    if not columns:
        return ""
    # Render with no cell values: only the skeleton is wanted.
    markdown, _ = _render_markdown_table(resolved, row_keys, columns, {})
    return str(markdown or "").strip()


def _authoritative_block_for_required_table(token: str, evidence: dict) -> str | None:
    """Build the element-sourced block (title, source comment, table) for a
    required table.

    The Markdown of the best matching ``structuredTables`` entry is preferred;
    when none carries Markdown, the template skeleton table is used. Returns
    ``None`` if neither source yields a table.
    """
    candidates = evidence.get("structuredTables") if isinstance(evidence, dict) else []
    title = str(token or "").strip()
    markdown = ""

    if isinstance(candidates, list):
        winner: dict | None = None
        top_score = -1
        for entry in candidates:
            if not isinstance(entry, dict):
                continue
            matches = _table_token_matches_name(
                token, str(entry.get("token") or "")
            ) or _table_token_matches_name(token, str(entry.get("tableName") or ""))
            if not matches:
                continue
            entry_md = str(entry.get("markdown") or "").strip()
            if not entry_md:
                continue
            # 表5-4 uses the dedicated scorer; other tables prefer longer Markdown.
            if _table_token_matches_name(token, "表5-4"):
                score = _score_structured_table_hit_dict(entry)
            else:
                score = len(entry_md)
            if score > top_score:
                top_score, winner = score, entry
        if winner:
            markdown = str(winner.get("markdown") or "").strip()
            title = str(winner.get("tableName") or token).strip() or token

    if not markdown:
        skeleton = _skeleton_markdown_for_table_token(token, table_name=title)
        if skeleton:
            markdown = skeleton
            canonical = _canonical_global_table_name_for_token(token)
            if canonical:
                title = canonical
    if not markdown:
        return None
    return "".join(
        (
            f"{title}\n\n",
            "<!-- 表格来源：要素管理（结构化表直出） -->\n",
            f"{markdown}",
        )
    )


def _fill_required_table_caption_stubs(
    content: str, required_tables: list[str], evidence: dict
) -> str:
    """Replace required tables that have only a caption (no Markdown body in
    their segment) with the element-sourced block or the template skeleton.

    A 表5-4 whose body is only the simplified extract is replaced as well.
    Returns the original ``content`` object when nothing was replaced.
    """
    working = str(content or "")
    replaced_any = False
    for token in required_tables or []:
        if not _table_token_caption_line_re(token).search(working):
            continue
        segment = _segment_after_table_caption(working, token)
        if _segment_has_markdown_table_body(segment):
            body_match = re.search(r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", segment)
            simplified_54 = (
                _table_token_matches_name(token, "表5-4")
                and body_match
                and _is_table54_simplified_extract_body(body_match.group(0))
            )
            if not simplified_54:
                continue  # a real body is already there
        authoritative = _authoritative_block_for_required_table(token, evidence)
        if not authoritative:
            continue
        working = _replace_caption_stub_with_authoritative_table(
            working, token, authoritative
        )
        replaced_any = True
    if replaced_any:
        return working
    return content


def _append_structured_missing_tables(content: str, missing_tables: list[str], evidence: dict) -> str:
    """Add the element-sourced block for every missing table.

    A table whose caption already exists has its caption stub replaced; the
    others are appended after a blank line. Returns the original ``content``
    when no block could be built, otherwise the stripped result.
    """
    result = str(content or "").rstrip()
    touched = False
    for token in missing_tables:
        authoritative = _authoritative_block_for_required_table(token, evidence)
        if not authoritative:
            continue
        touched = True
        if _table_token_caption_line_re(token).search(result):
            result = _replace_caption_stub_with_authoritative_table(
                result, token, authoritative
            )
            continue
        result = f"{result}\n\n{authoritative}"
    if not touched:
        return content
    return result.strip()


def _replace_llm_table_with_authoritative(content: str, token: str, replacement_md: str) -> str:
    """Replace the model-written table for ``token`` (caption line plus table
    body) with the element-sourced content.

    A Markdown pipe table is tried first, then an HTML ``<table>``. Only the
    first match is replaced; ``content`` is returned unchanged when there is
    no match or when any argument is empty.

    The pipe-row pattern uses ``[ \\t]*`` rather than ``\\s*`` so it cannot
    cross blank lines and tie a token mentioned in prose to the rows of a
    distant table. At most one blank line (``\\n?``) is allowed between the
    caption and the first pipe row.
    """
    token_plain = re.sub(r"\s+", "", str(token or ""))
    if not content or not token_plain or not replacement_md:
        return content
    token_re = re.escape(token_plain).replace(r"\-", r"[-－—–]")
    if token_plain[-1].isdigit():
        # Without this guard token "表5-1" also matches the caption
        # "表5-10 …" and the wrong table gets replaced.
        token_re += r"(?!\d)"
    caption = r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n)"
    table_bodies = (
        r"(\n?(?:[ \t]*\|[^\n]+\|[ \t]*\n)+)",  # Markdown pipe table
        r"(\s*<table>[\s\S]*?</table>)",  # HTML table
    )
    for body in table_bodies:
        m = re.search(caption + body, content, flags=re.IGNORECASE)
        if m:
            rest = content[m.end():].lstrip("\n")
            return content[: m.start()] + "\n" + replacement_md + "\n\n" + rest
    return content


def _caption_followed_by_element_table_comment(content: str, token: str) -> bool:
    """仅当「本表表题行后」紧跟要素直出注释时，才视为已权威化，避免全篇任一注释误伤其它表的替换。"""
    token_plain = re.sub(r"\s+", "", str(token or ""))
    if not token_plain:
        return False
    token_re = re.escape(token_plain).replace(r"\-", r"[-－—–]")
    return bool(
        re.search(
            r"(?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n"
            r"(?:[ \t]*\n)?[ \t]*<!--\s*表格来源：要素管理",
            str(content or ""),
            flags=re.IGNORECASE | re.MULTILINE,
        )
    )


def _refresh_element_table_markdown_tokens(
    content: str,
    evidence: dict,
    tokens: tuple[str, ...],
) -> str:
    """Re-apply the structured Markdown from the evidence package for the given table tokens.

    For each token, the best entry in ``evidence["structuredTables"]`` whose
    ``token`` or ``tableName`` matches is picked. The score is
    ``_score_structured_table_hit_dict`` when the token matches "表5-4",
    otherwise the Markdown length. If the token already exists in the text,
    its table is replaced by "title + source comment + Markdown".
    Returns ``content`` unchanged when there are no structured tables.
    """
    structured = evidence.get("structuredTables") if isinstance(evidence, dict) else []
    if not isinstance(structured, list) or not structured:
        return content

    text = str(content or "")
    for raw_token in tokens:
        wanted = str(raw_token or "").strip()
        if not wanted:
            continue

        best_md = None
        best_title = None
        best_score = -1
        for entry in structured:
            if not isinstance(entry, dict):
                continue
            entry_token = str(entry.get("token") or "")
            entry_name = str(entry.get("tableName") or "")
            if not (
                _table_token_matches_name(wanted, entry_token)
                or _table_token_matches_name(wanted, entry_name)
            ):
                continue
            body = str(entry.get("markdown") or "").strip()
            if not body:
                continue
            if _table_token_matches_name(wanted, "表5-4"):
                score = _score_structured_table_hit_dict(entry)
            else:
                score = len(body)
            if score > best_score:
                best_score = score
                best_md = body
                best_title = str(entry.get("tableName") or wanted).strip() or wanted

        if not best_md or not best_title:
            continue
        if not _table_token_exists(text, wanted):
            continue
        authoritative = "".join(
            (
                f"{best_title}\n\n",
                "<!-- 表格来源：要素管理（结构化表直出） -->\n",
                f"{best_md}",
            )
        )
        text = _replace_llm_table_with_authoritative(text, wanted, authoritative)
    return text


def _append_authoritative_required_tables(content: str, required_tables: list[str], evidence: dict) -> str:
    """Ensure each template-required table is backed by its authoritative block.

    Per token:
    - skipped when its caption is already followed by the element-table source
      comment and the token exists in the text;
    - skipped when no authoritative block is available;
    - if the token exists, the LLM-written table is replaced, and if that
      changes nothing the caption stub is replaced instead;
    - if only a caption line exists, the caption stub is replaced;
    - otherwise the block is appended at the end.

    Returns the stripped result when any block was used, else ``content``.
    """
    if not required_tables:
        return content

    text = str(content or "")
    touched = False
    for token in required_tables:
        if _caption_followed_by_element_table_comment(text, token) and _table_token_exists(text, token):
            # Already authoritative for this token.
            continue
        block = _authoritative_block_for_required_table(token, evidence)
        if not block:
            continue
        touched = True
        if _table_token_exists(text, token):
            swapped = _replace_llm_table_with_authoritative(text, token, block)
            if swapped == text:
                swapped = _replace_caption_stub_with_authoritative_table(text, token, block)
            text = swapped
        elif _table_token_caption_line_re(token).search(text):
            text = _replace_caption_stub_with_authoritative_table(text, token, block)
        else:
            text = text.rstrip() + "\n\n" + block
    if not touched:
        return content
    return text.strip()


def _is_effective_markdown_table_block(md_block: str) -> bool:
    """Check that a Markdown block is a table with a header, a separator and at least one data row.

    Blank lines are ignored. The first non-blank line must not be a separator,
    the second must be one, and at least one later line must be a pipe row
    that is not a separator.
    """
    rows = [
        stripped
        for stripped in (str(raw or "").strip() for raw in str(md_block or "").splitlines())
        if stripped
    ]
    if len(rows) < 3:
        return False
    header, separator, *body = rows
    if _is_pipe_markdown_table_separator_line(header) or not _is_pipe_markdown_table_separator_line(separator):
        return False
    return any(
        _is_pipe_markdown_table_row_line(row) and not _is_pipe_markdown_table_separator_line(row)
        for row in body
    )


def _ensure_required_structured_tables_integrity(content: str, required_tables: list[str], evidence: dict) -> str:
    """Repair required tables whose body is missing, malformed or a simplified 表5-4 extract.

    For each required token that has an authoritative block:
    1. If a caption line exists but the segment after it has no Markdown table
       body, the caption stub is replaced by the authoritative block.
    2. For 表5-4, a run of three or more pipe rows recognised as a simplified
       extract is also replaced.
    3. Otherwise the "caption + optional blank/comment lines + pipe rows" span
       is located. If the pipe rows are not an effective table (or are a
       simplified 表5-4 extract), the whole span is replaced.
    4. If no such span exists and the token is absent from the text, the
       authoritative block is appended.

    The final pipe row may end at end-of-text instead of a newline. Without
    that, a table closing the document would be captured without its last
    row: a valid one-data-row table would be judged ineffective, and the
    replacement would leave the uncaptured row behind.

    Returns ``content`` unchanged when ``required_tables`` is empty, otherwise
    the stripped repaired text.
    """
    if not required_tables:
        return content
    repaired = str(content or "")

    for token in required_tables:
        authoritative_block = _authoritative_block_for_required_table(token, evidence)
        if not authoritative_block:
            continue

        if _table_token_caption_line_re(token).search(repaired):
            seg = _segment_after_table_caption(repaired, token)
            if not _segment_has_markdown_table_body(seg):
                repaired = _replace_caption_stub_with_authoritative_table(
                    repaired, token, authoritative_block
                )
                continue
            if _table_token_matches_name(token, "表5-4"):
                m_seg = re.search(
                    r"(?m)(?:^\s*\|[^\n]+\|\s*\n){3,}", seg
                )
                if m_seg and _is_table54_simplified_extract_body(m_seg.group(0)):
                    repaired = _replace_caption_stub_with_authoritative_table(
                        repaired, token, authoritative_block
                    )
                    continue

        token_plain = re.sub(r"\s+", "", str(token or ""))
        # "-" in the token also matches full-width / em / en dashes.
        token_re = re.escape(token_plain).replace(r"\-", r"[-－—–]")
        table_pat = re.compile(
            r"((?:^|\n)[^\n]*?" + token_re + r"[^\n]*\n(?:\n|[ \t]*<!--[^\n]*-->[ \t]*\n)*)"
            # Each pipe row ends with a newline or, for the last row, end-of-text.
            r"((?:[ \t]*\|[^\n]*\|[ \t]*(?:\n|\Z))+)",
            flags=re.IGNORECASE,
        )
        m = table_pat.search(repaired)
        if m:
            cur_table = str(m.group(2) or "")
            need_replace = not _is_effective_markdown_table_block(cur_table)
            if _table_token_matches_name(token, "表5-4") and _is_table54_simplified_extract_body(
                cur_table
            ):
                need_replace = True
            if need_replace:
                repaired = (
                    repaired[:m.start()]
                    + "\n"
                    + authoritative_block
                    + "\n\n"
                    + repaired[m.end():].lstrip("\n")
                )
        elif not _table_token_exists(repaired, token):
            repaired = repaired.rstrip() + "\n\n" + authoritative_block

    return repaired.strip()


def _collect_structured_tables(
    db: Session,
    project_uuid: str,
    required_tables: list[str],
    *,
    section_title: str,
    section_tokens: list[str],
) -> list[dict]:
    """
    Collect structured tables for one report section from the project's ElementTable rows.

    During report generation the structured-table source must be the same as
    "element management" (element_tables / element_cells).

    Rules:
    - If the template prompt declares required tables (e.g. 表2-1 / 附表8),
      match them precisely by token first.
    - If no required table is declared, or none of the declared ones match,
      pick the most relevant tables by section title / keywords instead, so the
      model does not invent tables.

    Returns:
        At most 12 dicts with the keys ``tableId``, ``tableName``, ``token``
        and ``markdown``. At most one 表5-4 entry is kept, placed last.
        An empty list is returned when the project has no tables.
    """

    def _table_relevance_score(table_name: str) -> int:
        # Heuristic relevance of a table name to the current section (0 = unrelated).
        name = str(table_name or "").strip()
        if not name:
            return 0
        name_l = name.lower()
        score = 0
        # Strong weight when the section title appears in the table name.
        t = str(section_title or "").strip()
        if t and t in name:
            score += 10
        # Bonus for each of the first 20 section tokens found in the name (case-insensitive).
        for tok in (section_tokens or [])[:20]:
            tt = str(tok or "").strip()
            if not tt:
                continue
            if tt.lower() in name_l:
                score += 2
        # Small bonus for common table-name keywords (table / appendix table /
        # comparison / evaluation / scoring) so real tables are preferred.
        if any(k in name for k in ("表", "附表", "对比", "评价", "评分")):
            score += 1
        return score

    tables: list[ElementTable] = (
        db.query(ElementTable)
        .filter(ElementTable.project_id == project_uuid)
        .order_by(ElementTable.sort_order.asc(), ElementTable.updated_at.desc())
        .all()
    )
    if not tables:
        return []

    # Section 5.3.2: the body only needs 表5-5 / 表5-6; 附表8 belongs to the
    # book-level "## 附表" part, so keep it out of this section's structured evidence.
    if _extract_section_number(str(section_title or "")) == "5.3.2":
        tables = [
            t
            for t in tables
            if not (
                ("附表8" in str(t.table_name or ""))
                and ("可研报告和后评价参数对比表" in str(t.table_name or ""))
            )
        ]
        if not tables:
            return []

    # 1) Required tables: match by token / table name (use the element-management tables directly where possible).
    required_norm = [_norm_table_token(t) for t in (required_tables or []) if _norm_table_token(t)]
    required_hits: list[ElementTable] = []
    if required_norm:
        for t in tables:
            name_norm = _norm_table_token(t.table_name)
            if not name_norm:
                continue
            if any(req and _table_token_matches_name(req, name_norm, normalized=True) for req in required_norm):
                required_hits.append(t)
        if required_hits:
            # Preferred-name hint words registered for the required tokens.
            req_hint_words: list[str] = []
            for req in required_norm:
                req_hint_words.extend(_TABLE_TOKEN_PREFERRED_NAME_HINTS.get(req, ()))

            def _required_hit_score(tb: ElementTable) -> tuple[int, int, int]:
                # Sort key: (表5-4 collect score, number of hint words in the name, relevance score).
                tb_name = str(tb.table_name or "").strip()
                hint_hit = 0
                if req_hint_words:
                    for hint in req_hint_words:
                        if hint and hint in tb_name:
                            hint_hit += 1
                collect = 0
                if any(_table_token_matches_name(req, "表5-4") for req in required_norm):
                    collect = _element_table_collect_score(db, tb, "表5-4")
                return (collect, hint_hit, _table_relevance_score(tb_name))

            required_hits.sort(key=_required_hit_score, reverse=True)
        # Keep only the first few so several historical tables with the same token do not blow up the prompt.
        required_hits = required_hits[:8]

    # 2) Fallback: no required table declared, or none matched -> choose by relevance (top 4 with score > 0).
    selected: list[ElementTable] = list(required_hits)
    if not selected:
        scored: list[tuple[int, ElementTable]] = []
        for t in tables:
            s = _table_relevance_score(t.table_name)
            if s > 0:
                scored.append((s, t))
        scored.sort(key=lambda x: x[0], reverse=True)
        selected = [t for _, t in scored[:4]]

    # True when the selection holds a 表5-4 operating-benefit table of type "time";
    # non-time 表5-4 tables are then skipped below.
    selected_has_time54 = any(
        _is_table54_operating_benefit(str(t.table_name or ""))
        and str(t.table_type or "").strip() == "time"
        for t in selected
    )

    out: list[dict] = []
    for table in selected:
        is_time = str(table.table_type or "").strip() == "time"
        if is_time:
            # Read the table's year column; only a positive value is used.
            ty_row = db.query(ElementTable.year).filter(ElementTable.id == table.id).first()
            tbl_y = (
                int(ty_row[0])
                if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0
                else None
            )
            # Iterated below as (display_name, markdown) pairs.
            year_items = _build_time_table_markdowns_by_year(db, table.id, table.table_name)
            if _is_table54_operating_benefit(table.table_name):
                # For 表5-4 keep only the picked item, when one is picked.
                picked = _pick_table54_year_markdown(year_items, table_year=tbl_y)
                if picked:
                    year_items = [picked]
            for display_name, md in year_items:
                if not md:
                    continue
                token = _extract_table_short_token(display_name)
                out.append(
                    {
                        "tableId": table.id,
                        "tableName": display_name,
                        "token": token,
                        "markdown": md,
                    }
                )
                if len(out) >= 12:
                    break
        else:
            if (
                selected_has_time54
                and _is_table54_operating_benefit(table.table_name)
            ):
                continue
            md, common_unit = _build_structured_table_markdown(db, table.id, table.table_name)
            display_name = _merge_table_title_with_common_unit(str(table.table_name or "").strip(), common_unit)
            token = _extract_table_short_token(table.table_name)
            if not md:
                # No Markdown built: fall back to a skeleton table for the token.
                md = _skeleton_markdown_for_table_token(
                    token or display_name, table_name=display_name
                )
            if not md:
                continue
            hit = {
                "tableId": table.id,
                "tableName": display_name,
                "token": token,
                "markdown": md,
            }
            # Drop 表5-4 tables whose body is only a simplified extract.
            if _is_table54_operating_benefit(table.table_name) and _is_table54_simplified_extract_body(md):
                continue
            out.append(hit)
        # Overall cap of 12 entries (the inner break only leaves the per-year loop).
        if len(out) >= 12:
            break

    # Keep at most one 表5-4 entry (first after de-duplication) and move it to the end.
    t54_norm = _norm_table_token("表5-4")
    t54_hits = [h for h in out if _norm_table_token(str(h.get("token") or "")) == t54_norm]
    if t54_hits:
        rest = [h for h in out if _norm_table_token(str(h.get("token") or "")) != t54_norm]
        ranked = _dedupe_structured_table_hits(t54_hits)
        out = rest + (ranked[:1] if ranked else [])
    return out[:12]


def _table_2_5_general_layout_comparison_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表2-5 判定一致（表头用「项目名称」，不含依托对比）。"""
    n = str(table_name or "")
    if "依托" in n:
        return False
    return "表2-5" in n or "总图、储运、公用工程及辅助工程对比" in n


def _table_2_6_reliance_comparison_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表2-6判定一致（行展示去「依托·」等类别前缀、表头用依托项目名称）。"""
    n = str(table_name or "")
    return (
        "表2-6" in n
        or "储运、公用工程及辅助工程依托对比" in n
        or "辅助工程依托对比" in n
    )


def _table_3_1_contracting_units_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表3-1判定一致（行展示去「承包单元·」前缀、表头用单元名称）。"""
    n = str(table_name or "")
    return "表3-1" in n or "项目承包单位情况" in n


def _table_3_3_plantwide_design_change_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表3-3 判定一致（表头用「单元名称」）。"""
    n = str(table_name or "")
    return ("表3-3" in n or "施工图设计变更情况" in n) and "全厂" in n


def _table_3_4_single_unit_design_change_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表3-4 判定一致（表头用「专业」）。"""
    n = str(table_name or "")
    return ("表3-4" in n or "施工图设计变更情况" in n) and "单装置" in n


def _table_3_5_major_design_change_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表3-5判定一致（行展示去「重大变更·」前缀、表头用单元名称）。"""
    n = str(table_name or "")
    return "表3-5" in n or "影响投资或工期" in n


def _table_3_7_procurement_name(table_name: str) -> bool:
    """与要素管理 quick-fill 表3-7判定一致（行展示去「采购物资·」前缀、表头用物资（类别）名称）。"""
    n = str(table_name or "")
    return "表3-7" in n or "采购工作情况" in n


def _table_4_2_alkylation_operation_analysis_name(table_name: str) -> bool:
    """烷基化装置运行分析表（含历史误标为表4-1、用户改写考核日期后的表题）。"""
    n = re.sub(r"\s+", "", str(table_name or ""))
    return ("烷基化装置运行分析" in n) and ("考核时间" in n)


# Template title of 表4-2; used as the MULTI_COLUMN_GLOBAL_SPECS lookup key when a
# renamed alkylation operation analysis table has no spec under its own name.
_TABLE42_ANALYSIS_TEMPLATE_NAME = "表4-2  烷基化装置运行分析表（考核时间：×年×月×日）"


def _multi_column_global_spec_for_table(table_name: str):
    """Look up the multi-column template spec for a table name.

    The stripped name is tried first. If it has no (truthy) spec and the name
    is recognised as the alkylation operation analysis table, the spec under
    the standard 表4-2 template title is used, so a renamed 表4-2 keeps the
    standard column order (unit / design value / calibration value / actual
    value). Returns None otherwise.
    """
    key = str(table_name or "").strip()
    direct = MULTI_COLUMN_GLOBAL_SPECS.get(key)
    if direct:
        return direct
    if not _table_4_2_alkylation_operation_analysis_name(table_name):
        return None
    return MULTI_COLUMN_GLOBAL_SPECS.get(_TABLE42_ANALYSIS_TEMPLATE_NAME)


def _element_manage_row_label_after_first_dot(label: str) -> str:
    """与 quick-fill.js parseRowKeyForDisplay 一致：去掉行键第一个「…·」段（仅作展示）。"""
    s = str(label or "").strip()
    if "\u00b7" in s:
        rest = "\u00b7".join(s.split("\u00b7", 1)[1:]).strip()
        return rest if rest else s
    return s


def _element_manage_table_row_display_label(table_name: str, label: str) -> str:
    """Return the row label as it is displayed in element management.

    For 表2-6 / 表3-1 / 表3-5 / 表3-7 / 表4-2 the first 「…·」 category prefix is
    removed (the stored row_key keeps it). Other tables only get the label
    stripped.
    """
    prefix_stripping_tables = (
        _table_2_6_reliance_comparison_name,
        _table_3_1_contracting_units_name,
        _table_3_5_major_design_change_name,
        _table_3_7_procurement_name,
        _table_4_2_alkylation_operation_analysis_name,
    )
    if any(matches(table_name) for matches in prefix_stripping_tables):
        return _element_manage_row_label_after_first_dot(label)
    return str(label or "").strip()


def _row_header_name_for_table(table_name: str) -> str:
    """Choose the header text of the row-label column for a table.

    Substring rules on the title are checked first, then the table-name
    predicates in a fixed order. The first match wins and 「项目」 is the default.
    """
    name = str(table_name or "")

    substring_rules = (
        (("产品方案对比表",), "产品"),
        (("原料数量及组成对比表",), "原料名称"),
        (("原料）性质对比表", "原料性质对比表"), "名称"),
    )
    for needles, header in substring_rules:
        if any(needle in name for needle in needles):
            return header

    # Order matters: it reproduces the precedence of the table-specific checks.
    predicate_rules = (
        (_table_2_5_general_layout_comparison_name, "项目名称"),
        (_table_2_6_reliance_comparison_name, "依托项目名称"),
        (_table_3_3_plantwide_design_change_name, "单元名称"),
        (_table_3_4_single_unit_design_change_name, "专业"),
        (_table_3_1_contracting_units_name, "单元名称"),
        (_table_3_5_major_design_change_name, "单元名称"),
        (_table_3_7_procurement_name, "物资（类别）名称"),
        (_table_4_2_alkylation_operation_analysis_name, "项目名称"),
        (_table52_investment_change_name, "工程或费用名称"),
        (_table53_engineering_cost_change_name, "工程或费用名称"),
        (_appendix2_investment_structure_name, "工程或费用名称"),
    )
    for matches, header in predicate_rules:
        if matches(name):
            return header
    return "项目"


# Column-name prefixes that _group_column_headers may split off as a top-level
# group header. They are tried in list order and the first match wins, so
# "可研报告" is listed before its own prefix "可研".
_GROUP_HEADER_PREFIXES = [
    "可研报告", "可研", "初步设计", "实际生产", "实际运行", "实际实施", "后评价",
]
# Column names that are never split. Includes the full time-slot group names so
# that "后评价时点后预测值" is not wrongly split into a two-row header by the
# "后评价" prefix.
_GROUP_HEADER_EXACT = {p for p in _GROUP_HEADER_PREFIXES} | {
    "后评价时点前实际值",
    "后评价时点后预测值",
}


def _group_column_headers(col_order: list[str]) -> tuple[list[str], list[str]] | None:
    """Detect multi-level column headers.

    A column such as "可研报告数量（万吨）" is split into top="可研报告" and
    sub="数量（万吨）". Stand-alone names (e.g. "可研报告", "初步设计", "实际实施")
    are not treated as groups, to avoid wrong splits.

    Returns ``(top_headers, sub_headers)``, aligned with ``col_order``, when at
    least one column is flagged as grouped; otherwise ``None``.
    """

    def _split(text: str) -> tuple[str, str, bool]:
        """Return (top, sub, grouped) for one stripped column key."""
        if not text:
            return "", "", False
        # Typo key "…时点点后…": do not split on the "后评价" prefix, otherwise the
        # flattened Markdown header would read "后评价-时点点后…".
        if "时点点后" in text:
            return "", text.replace("时点点后", "时点后", 1), True
        # Appendix tables 3-7 and 表5-5 use keys like "后评价时点后预测值|2021".
        # These must be handled before the "后评价" prefix rule; otherwise they
        # would split into top=后评价 / sub=时点后预测值|2021 and the single-row
        # Markdown header would misalign with the values fetched by col_key.
        if "|" in text:
            pipe = "|"
        elif "\uff5c" in text:
            pipe = "\uff5c"
        else:
            pipe = None
        if pipe is not None:
            group, tail = (part.strip() for part in text.split(pipe, 1))
            if tail and group in _APPENDIX_TIME_SLOT_GROUPS:
                return group, tail, True
            # 表5-4: keys such as "可研报告|××年#1" must not be split on the
            # "可研报告" prefix into "可研报告-|××年#1".
            if tail and group in _TABLE54_PIPE_METRIC_PREFIXES:
                return "", text, True
        if text in _GROUP_HEADER_EXACT:
            return "", text, False
        for prefix in _GROUP_HEADER_PREFIXES:
            if text.startswith(prefix) and len(text) > len(prefix):
                remainder = text[len(prefix):].strip()
                if remainder:
                    return prefix, remainder, True
        if "·" in text:
            left, right = (part.strip() for part in text.split("·", 1))
            return left, right, True
        return "", text, False

    tops: list[str] = []
    subs: list[str] = []
    grouped_any = False
    for col in col_order:
        top, sub, grouped = _split(str(col or "").strip())
        tops.append(top)
        subs.append(sub)
        grouped_any = grouped_any or grouped
    if not grouped_any:
        return None
    return tops, subs


def _table51_main_economic_indicators_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "表5-1" in n and "主要经济指标对比" in n


def _table52_investment_change_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "表5-2" in n and "投资变动情况表" in n


def _table53_engineering_cost_change_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "表5-3" in n and "工程费用变动情况表" in n


def _appendix2_investment_structure_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表2" in n and "项目竣工决算投资构成表" in n


def _appendix3_cashflow_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表3" in n and "项目投资财务现金流量表" in n


def _appendix4_profit_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表4" in n and "利润与利润分配计算表" in n


def _appendix5_revenue_tax_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表5" in n and "营业收入与营业税金及附加计算表" in n


def _appendix6_cost_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表6" in n and "总成本费用计算表" in n


def _appendix7_materials_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表7" in n and "原材料、燃料及动力费用计算表" in n


def _appendix8_param_name(table_name: str) -> bool:
    n = str(table_name or "")
    return "附表8" in n and "可研报告和后评价参数对比表" in n


def _appendix_time_table_name(table_name: str) -> bool:
    """Return True when the title matches any of the appendix tables 3 to 7."""
    appendix_checks = (
        _appendix3_cashflow_name,
        _appendix4_profit_name,
        _appendix5_revenue_tax_name,
        _appendix6_cost_name,
        _appendix7_materials_name,
    )
    return any(check(table_name) for check in appendix_checks)


def _table_row_seq_name_split_display(table_name: str) -> bool:
    """Investment / appendix tables: the item column shows only the name.

    The sequence number is shown in its own column, consistent with element
    management. True for 表5-1, 表5-2, 表5-3, 附表2, 附表3 to 附表7 and 附表8.
    """
    split_display_checks = (
        _table51_main_economic_indicators_name,
        _table52_investment_change_name,
        _table53_engineering_cost_change_name,
        _appendix2_investment_structure_name,
        _appendix_time_table_name,
        _appendix8_param_name,
    )
    return any(check(table_name) for check in split_display_checks)


def _pick_row_key_with_legacy(
    canon: str, row_set: set[str], legacy_map: dict[str, str]
) -> str | None:
    """在库内实际 row_key 中选取规范键或其旧版别名（优先规范键）。"""
    if canon in row_set:
        return canon
    for legacy, normalized in legacy_map.items():
        if normalized == canon and legacy in row_set:
            return legacy
    return None


def _order_rows_by_preferred(
    row_order: list[str],
    preferred: list[str],
    *,
    legacy_map: dict[str, str] | None = None,
) -> list[str]:
    row_set = set(row_order)
    ordered: list[str] = []
    seen: set[str] = set()
    for canon in preferred:
        picked: str | None
        if legacy_map:
            picked = _pick_row_key_with_legacy(canon, row_set, legacy_map)
        else:
            picked = canon if canon in row_set else None
        if picked and picked not in seen:
            seen.add(picked)
            ordered.append(picked)
    extras = sorted(rk for rk in row_order if rk not in seen)
    return ordered + extras


def _legacy_map_for_table(table_name: str) -> dict[str, str] | None:
    """Return the legacy row-key map for 附表2 or 附表8, or None for any other table."""
    if _appendix2_investment_structure_name(table_name):
        legacy = APPENDIX2_LEGACY_ROW_KEY_MAP
    elif _appendix8_param_name(table_name):
        legacy = APPENDIX8_LEGACY_ROW_KEY_MAP
    else:
        legacy = None
    return legacy


def _apply_global_table_standard_row_order(table_name: str, row_order: list[str]) -> list[str]:
    """Apply the standard row order for 表5-1 / 5-2 / 5-3 and 附表2 to 附表8.

    The order follows element management and the standard templates.
    Resolution order:
    1. A canonical order from ``canonical_row_order_for_table`` (with the
       table's legacy key map), when one is defined.
    2. For 表5-3, the first present key of each alternate group in
       ``TABLE_5_3_ROW_KEY_ALTERNATES``.
    3. For 表5-1 / 表5-2, the keys from ``global_table_row_keys``.
    Rows not covered by the standard order are appended in sorted order.
    ``row_order`` is returned unchanged when it is empty or no standard order
    applies.
    """
    if not row_order:
        return row_order
    tn = str(table_name or "")
    present = set(row_order)

    canonical = canonical_row_order_for_table(tn)
    if canonical is not None:
        return _order_rows_by_preferred(row_order, canonical, legacy_map=_legacy_map_for_table(tn))

    if _table53_engineering_cost_change_name(tn):
        head: list[str] = []
        placed: set[str] = set()
        for alternates in TABLE_5_3_ROW_KEY_ALTERNATES:
            first_hit = next((rk for rk in alternates if rk in present), None)
            if first_hit and first_hit not in placed:
                placed.add(first_hit)
                head.append(first_hit)
        return head + sorted(rk for rk in row_order if rk not in placed)

    if not (
        _table51_main_economic_indicators_name(tn)
        or _table52_investment_change_name(tn)
    ):
        return row_order
    standard = global_table_row_keys(tn)
    if not standard:
        return row_order

    head = [rk for rk in standard if rk in present]
    placed = set(head)
    return head + sorted(rk for rk in row_order if rk not in placed)


def _normalize_table_row_order(row_order: list[str], *, table_name: str = "") -> list[str]:
    """Apply the standard row order, then move total rows (「合计」/「总计」) to the end.

    A row is a total row when its label, with all whitespace removed, is
    exactly 「合计」 or 「总计」. Relative order inside both groups is kept.
    """
    ordered = _apply_global_table_standard_row_order(table_name, row_order)
    total_labels = {"合计", "总计"}

    def _is_total(row) -> bool:
        return re.sub(r"\s+", "", str(row or "").strip()) in total_labels

    body = [row for row in ordered if not _is_total(row)]
    totals = [row for row in ordered if _is_total(row)]
    return body + totals


_BARE_FOUR_DIGIT_YEAR_COL = re.compile(r"^\d{4}$")
_APPENDIX_YEAR_TAIL_NORM = re.compile(r"^(\d{4})年?$")


def _appendix_norm_year_tail(tail: str) -> str | None:
    """将列键尾部「2020」「2020年」规范为四位年字符串；非日历年返回 None。"""
    t = str(tail or "").strip()
    m = _APPENDIX_YEAR_TAIL_NORM.fullmatch(t)
    if not m:
        return None
    y = m.group(1)
    try:
        yi = int(y)
    except ValueError:
        return None
    if 1900 <= yi <= 2100:
        return y
    return None


def _filter_redundant_bare_year_columns(col_order: list[str]) -> list[str]:
    """Drop legacy bare-year column keys that duplicate a "group|year" column.

    Example: a bare 2019 / 2020 left over from an older template sync. Years
    are collected from the part after the last ``|`` of every piped key,
    either as a whole normalised tail or as four-digit pieces inside it. The
    original list content is returned when nothing can be collected or when
    filtering would remove every column.
    """
    if not col_order:
        return col_order
    cleaned = [str(c).strip() for c in col_order if str(c or "").strip()]
    if not cleaned:
        return list(col_order)

    piped_years: set[str] = set()
    for key in cleaned:
        if "|" not in key:
            continue
        tail = key.rsplit("|", 1)[-1].strip()
        normalized = _appendix_norm_year_tail(tail)
        if normalized:
            piped_years.add(normalized)
        pieces = (piece.strip() for piece in re.split(r"[\s,，、/-]+", tail))
        piped_years.update(
            piece
            for piece in pieces
            if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(piece) and 1900 <= int(piece) <= 2100
        )
    if not piped_years:
        return list(col_order)

    def _is_redundant(col) -> bool:
        key = str(col or "").strip()
        year = _appendix_norm_year_tail(key)
        if year and year in piped_years:
            return True
        return bool(_BARE_FOUR_DIGIT_YEAR_COL.fullmatch(key)) and key in piped_years

    kept = [col for col in col_order if not _is_redundant(col)]
    return kept or list(col_order)


# Group names that may appear before "|" in appendix time-slot column keys
# (e.g. "后评价时点后预测值|2021").
_APPENDIX_TIME_SLOT_GROUPS = frozenset({"建设期", "后评价时点前实际值", "后评价时点后预测值"})


def _appendix_time_slot_group_tail_is_real(tail: str) -> bool:
    """Tell whether a sub-column under an appendix time group is a real year (YYYY / YYYY年)."""
    if _appendix_norm_year_tail(tail):
        return True
    candidate = (tail or "").strip()
    if not _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(candidate):
        return False
    try:
        year = int(candidate)
    except ValueError:
        return False
    return 1900 <= year <= 2100


def _bare_appendix_year_placeholder_col_key(s: str) -> bool:
    """无竖线列键：末栏「××年」「xx年」等占位列（兼容 x/×/全角拉丁混写）。"""
    t = str(s or "").strip()
    if not t:
        return False
    if t in ("…", "..."):
        return True
    buf: list[str] = []
    for ch in t:
        if ch in "xXｘＸ":
            buf.append("×")
        elif ch == "\u00d7":
            buf.append("×")
        else:
            buf.append(ch)
    u = "".join(buf)
    return bool(re.fullmatch(r"×{2}年(?:#\d+)?", u))


def _filter_appendix_placeholder_slot_columns(col_order: list[str]) -> list[str]:
    """Remove placeholder columns once a time group has a real year column.

    For every time-slot group that has at least one real year sub-column
    (YYYY / YYYY年), only that group's real year sub-columns are kept. Bare
    columns named exactly like such a group, and bare 「××年」 placeholder
    columns, are removed too. The original list content is returned when no
    group has a real year or when filtering would remove every column.
    """
    if not col_order:
        return col_order

    def _parse(col):
        """Return (stripped key, group, tail); group and tail are None without "|"."""
        key = str(col or "").strip()
        if "|" not in key:
            return key, None, None
        group, tail = key.split("|", 1)
        return key, group.strip(), tail.strip()

    parsed = [(col, *_parse(col)) for col in col_order]
    real_groups = {
        group
        for _, _, group, tail in parsed
        if group is not None
        and group in _APPENDIX_TIME_SLOT_GROUPS
        and _appendix_time_slot_group_tail_is_real(tail)
    }
    if not real_groups:
        return list(col_order)

    kept: list[str] = []
    for col, key, group, tail in parsed:
        if group is None:
            if key in real_groups or _bare_appendix_year_placeholder_col_key(key):
                continue
            kept.append(col)
        elif group not in real_groups or _appendix_time_slot_group_tail_is_real(tail):
            kept.append(col)
    return kept or list(col_order)


def _filter_appendix3_summary_duplicate_forecast_years(table_name: str, col_order: list[str]) -> list[str]:
    """附表3: drop 「建设期」 / 「后评价时点前实际值」 year columns that repeat a forecast year.

    A column under one of those two groups is removed when the same year also
    exists under 「后评价时点后预测值」, so the table does not end with duplicated
    2019 / 2020 columns. For other tables, or when nothing is duplicated,
    ``col_order`` is returned as is.
    """
    tn = str(table_name or "").strip()
    if not ("附表3" in tn and "项目投资财务现金流量" in tn):
        return col_order

    forecast_years: set[str] = set()
    for col in col_order:
        key = str(col or "").strip()
        group, sep, tail = key.partition("|")
        if not sep or group.strip() != "后评价时点后预测值":
            continue
        year = _appendix_norm_year_tail(tail.strip())
        if year:
            forecast_years.add(year)
    if not forecast_years:
        return col_order

    summary_groups = ("建设期", "后评价时点前实际值")
    duplicates: set[str] = set()
    for col in col_order:
        key = str(col or "").strip()
        group, sep, tail = key.partition("|")
        if not sep or group.strip() not in summary_groups:
            continue
        year = _appendix_norm_year_tail(tail.strip())
        if year and year in forecast_years:
            duplicates.add(key)
    if not duplicates:
        return col_order

    kept = [col for col in col_order if str(col).strip() not in duplicates]
    return kept or list(col_order)


def _filter_appendix3_placeholders_when_forecast_has_real_year(table_name: str, col_order: list[str]) -> list[str]:
    """附表3: once the forecast group has a real YYYY column, drop all non-year slots in the three time groups.

    This removes 「××年#n」 placeholders under every time-slot group, including
    「建设期」 / 「后评价时点前实际值」 when they hold nothing but placeholders.
    A copy of ``col_order`` is returned when the table is not 附表3, when the
    forecast group has no real year, or when filtering would remove every
    column.
    """
    tn = str(table_name or "").strip()
    if "附表3" not in tn or "项目投资财务现金流量" not in tn:
        return list(col_order)

    def _split(col):
        """Return (group, has_pipe, tail) of the stripped column key."""
        group, sep, tail = str(col or "").strip().partition("|")
        return group.strip(), bool(sep), tail.strip()

    forecast_has_real_year = False
    for col in col_order:
        group, has_pipe, tail = _split(col)
        if has_pipe and group == "后评价时点后预测值" and _appendix_norm_year_tail(tail):
            forecast_has_real_year = True
            break
    if not forecast_has_real_year:
        return list(col_order)

    kept: list[str] = []
    for col in col_order:
        group, has_pipe, tail = _split(col)
        is_placeholder_slot = (
            has_pipe
            and group in _APPENDIX_TIME_SLOT_GROUPS
            and not _appendix_time_slot_group_tail_is_real(tail)
        )
        if not is_placeholder_slot:
            kept.append(col)
    return kept or list(col_order)


def _filter_appendix5_orphan_price_unit_column(table_name: str, col_order: list[str]) -> list[str]:
    """附表5：去掉与「价格（元/t）」重复的独立列键「（元/t）」（多为表头拆行误入数据列）。"""
    tn = str(table_name or "")
    if "附表5" not in tn or "营业收入与营业税金" not in tn:
        return col_order
    if not any("价格" in str(c) and "元/t" in str(c) for c in col_order):
        return col_order
    orphans = {"（元/t）", "(元/t)"}
    out = [c for c in col_order if str(c).strip() not in orphans]
    return out if out else list(col_order)


# 表5-5: matched by table number only. The title is not required to contain
# 「主要生产经营指标」, so element-table title variants cannot bypass the filter.
_TABLE_55_TITLE_RX = re.compile(r"表\s*5\s*[-－\.．·]\s*5")
# Name of the forecast column group for 表5-5.
_TABLE_55_FORECAST_GROUP = "后评价时点后预测值"
# Forecast group followed by a dash variant and a four-digit year (optional 「年」);
# group 1 captures the year digits.
_TABLE_55_FORECAST_HYPHEN_YEAR = re.compile(
    rf"^{re.escape(_TABLE_55_FORECAST_GROUP)}\s*[-－—–]\s*(\d{{4}})(?:年)?$"
)


def _compact_zh_ident(s: str) -> str:
    return re.sub(r"\s+", "", unicodedata.normalize("NFKC", str(s or "")))


def _split_group_year_col_key(col: str) -> tuple[str, str] | None:
    """解析「组|子列」；支持半角/全角竖线。"""
    st = str(col or "").strip()
    if not st:
        return None
    for sep in ("|", "\uff5c"):  # U+FF5C 全角竖线
        if sep in st:
            a, b = st.split(sep, 1)
            return a.strip(), b.strip()
    return None


def _table55_has_forecast_year_slot_columns(col_order: list[str]) -> bool:
    """是否存在「后评价时点后预测值」下的分年列（|、｜ 或 后缀 -YYYY）。"""
    for c in col_order:
        parts = _split_group_year_col_key(str(c or ""))
        if parts:
            g, tail = parts
            if g != _TABLE_55_FORECAST_GROUP:
                continue
            ts = tail.strip()
            if _appendix_norm_year_tail(ts):
                return True
            if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(ts):
                try:
                    if 1900 <= int(ts) <= 2100:
                        return True
                except ValueError:
                    pass
            continue
        st = str(c or "").strip()
        m = _TABLE_55_FORECAST_HYPHEN_YEAR.match(st)
        if m:
            try:
                if 1900 <= int(m.group(1)) <= 2100:
                    return True
            except ValueError:
                pass
    return False


def _table55_col_should_drop(col: str, *, has_forecast_year_slots: bool) -> bool:
    """Decide whether a table 5-5 column key should be removed.

    A column is dropped when its compacted key contains the typo 「时点点后」, or
    when per-year forecast columns exist and the key is exactly the bare
    forecast group name.
    """
    compact = _compact_zh_ident(col)
    if "时点点后" in compact:
        return True
    return bool(has_forecast_year_slots) and compact == _compact_zh_ident(_TABLE_55_FORECAST_GROUP)


def _filter_table55_redundant_malformed_forecast_column(table_name: str, col_order: list[str]) -> list[str]:
    """表5-5：去掉笔误列「…时点点后…」及在有分年预测列时多余的裸「后评价时点后预测值」列。"""
    if not col_order:
        return col_order
    if not _TABLE_55_TITLE_RX.search(str(table_name or "")):
        return list(col_order)
    has_slots = _table55_has_forecast_year_slot_columns(col_order)
    out = [c for c in col_order if not _table55_col_should_drop(str(c), has_forecast_year_slots=has_slots)]
    return out if out else list(col_order)


# Flattened "<time group>-<tail>" column key for the three appendix time groups;
# group(1) is the group name, group(2) the remainder after the hyphen/dash.
_APPENDIX_TIME_GROUP_YEAR_HYPHEN_RE = re.compile(
    r"^(建设期|后评价时点前实际值|后评价时点后预测值)\s*[-－—–]\s*(.+)$"
)


def _appendix_time_col_group_and_tail(col: str) -> tuple[str | None, str | None]:
    """解析附表时间列键为 (组名, 子列)；支持「组|年」「组-年」及裸组名列。"""
    st = str(col or "").strip()
    if not st:
        return None, None
    parts = _split_group_year_col_key(st)
    if parts:
        return parts[0], parts[1]
    m = _APPENDIX_TIME_GROUP_YEAR_HYPHEN_RE.match(st)
    if m:
        return m.group(1).strip(), m.group(2).strip()
    if st in _APPENDIX_TIME_SLOT_GROUPS or st == "价格（元/t）":
        return st, ""
    return None, None


def _appendix_time_tail_sort_key(tail: str | None) -> tuple[int, int, str]:
    """组内子列排序：裸组名 < 分年列（年份升序） < 占位列 < 其它。"""
    t = str(tail or "").strip()
    if not t:
        return (0, -1, "")
    ny = _appendix_norm_year_tail(t)
    if ny:
        return (1, int(ny), "")
    if _BARE_FOUR_DIGIT_YEAR_COL.fullmatch(t):
        try:
            yi = int(t)
            if 1900 <= yi <= 2100:
                return (1, yi, "")
        except ValueError:
            pass
    buf: list[str] = []
    for ch in t:
        if ch in "xXｘＸ":
            buf.append("×")
        elif ch == "\u00d7":
            buf.append("×")
        else:
            buf.append(ch)
    pm = re.fullmatch(r"×{2}年#(\d+)", "".join(buf))
    if pm:
        return (2, int(pm.group(1)), "")
    return (3, 0, t)


def _reorder_appendix_time_col_order(table_name: str, col_order: list[str]) -> list[str]:
    """附表3～7：按细则组序排列列，组内年份从小到大。"""
    if not col_order or not _appendix_time_table_name(table_name):
        return list(col_order)
    spec_groups = time_table_default_columns_for_name(table_name) or []
    group_rank: dict[str, int] = {g: i for i, g in enumerate(spec_groups)}
    by_group: dict[str, list[str]] = {}
    ungrouped: list[str] = []
    for col in col_order:
        cs = str(col or "").strip()
        if not cs:
            continue
        g, _ = _appendix_time_col_group_and_tail(cs)
        if g == "价格（元/t）" or g in _APPENDIX_TIME_SLOT_GROUPS:
            by_group.setdefault(g, []).append(cs)
            if g not in group_rank:
                group_rank[g] = len(group_rank) + 100
        else:
            ungrouped.append(cs)

    def _sort_group_cols(cols: list[str]) -> list[str]:
        return sorted(
            cols,
            key=lambda c: _appendix_time_tail_sort_key(_appendix_time_col_group_and_tail(c)[1]),
        )

    ordered_groups = list(spec_groups)
    for g in sorted(by_group.keys(), key=lambda x: group_rank.get(x, 999)):
        if g not in ordered_groups:
            ordered_groups.append(g)
    out: list[str] = []
    seen: set[str] = set()
    for g in ordered_groups:
        cols = by_group.get(g)
        if not cols:
            continue
        for c in _sort_group_cols(cols):
            if c not in seen:
                out.append(c)
                seen.add(c)
    for c in ungrouped:
        if c not in seen:
            out.append(c)
            seen.add(c)
    return out if out else list(col_order)


def _build_structured_table_html(
    table_name: str,
    row_order: list[str],
    col_order: list[str],
    latest: dict[tuple[str, str], str],
) -> str:
    """Render a structured table as an HTML ``<table>`` string.

    Args:
        table_name: Table title; selects the row-header label, the row cap and
            the serial-number scheme through project helpers.
        row_order: Row keys in display order (re-normalized here).
        col_order: Column keys in display order.
        latest: Cell values keyed by ``(row_key, col_key)``; missing or empty
            cells are shown as 「待补充」.

    Returns:
        The HTML markup, one element per line, joined with ``\\n``.

    NOTE(review): header labels, row labels and cell values are interpolated
    into the markup without HTML escaping — confirm upstream values cannot
    contain ``<`` / ``&`` or escape them before calling.
    """
    row_order = _normalize_table_row_order(row_order, table_name=table_name)
    row_header = _row_header_name_for_table(table_name)
    grouped = _group_column_headers(col_order)

    lines: list[str] = ["<table>", "  <thead>"]
    if grouped:
        # Two-row header: grouped columns get a colspan cell on top and their
        # sub-headers below; ungrouped columns span both rows.
        top_headers, sub_headers = grouped
        lines.append("    <tr>")
        lines.append('      <th rowspan="2">序号</th>')
        lines.append(f'      <th rowspan="2">{row_header}</th>')
        idx = 0
        while idx < len(top_headers):
            group = top_headers[idx]
            if not group:
                lines.append(f'      <th rowspan="2">{sub_headers[idx]}</th>')
                idx += 1
                continue
            # Count how many consecutive columns share this group header.
            span = 1
            while idx + span < len(top_headers) and top_headers[idx + span] == group:
                span += 1
            lines.append(f'      <th colspan="{span}">{group}</th>')
            idx += span
        lines.append("    </tr>")
        lines.append("    <tr>")
        for top, sub in zip(top_headers, sub_headers):
            if top:
                lines.append(f"      <th>{sub}</th>")
        lines.append("    </tr>")
    else:
        # Single-row header: one <th> per column key.
        lines.append("    <tr>")
        lines.append("      <th>序号</th>")
        lines.append(f"      <th>{row_header}</th>")
        for col in col_order:
            lines.append(f"      <th>{col}</th>")
        lines.append("    </tr>")
    lines.append("  </thead>")
    lines.append("  <tbody>")
    # Row cap: 120 rows for the table recognised by _is_table54_operating_benefit, 24 otherwise.
    max_rows = min(120, len(row_order)) if _is_table54_operating_benefit(table_name) else min(24, len(row_order))
    serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows)
    for idx, rk in enumerate(row_order[:max_rows], start=1):
        display_rk = _project_column_row_label(
            table_name, rk, latest, serial_col=serial_col
        )
        serial_cell = _serial_cell_for_report_table(
            table_name, rk, idx, serial_col, serial_idx=idx - 1
        )
        lines.append("    <tr>")
        lines.append(f"      <td>{serial_cell}</td>")
        lines.append(f"      <td>{display_rk}</td>")
        for ck in col_order:
            # Both a missing key and an empty stored value fall back to the placeholder.
            val = latest.get((rk, ck), "待补充") or "待补充"
            lines.append(f"      <td>{val}</td>")
        lines.append("    </tr>")
    lines.append("  </tbody>")
    lines.append("</table>")
    return "\n".join(lines)


# Header label of the form "<name><(unit)>" with a trailing parenthetical in
# half- or full-width brackets; group(1) is the name, group(2) the bracketed unit.
_RE_MD_HEADER_NAME_UNIT = re.compile(r"^(.+?)\s*([（(][^）)]+[）)])$")


def _strip_md_bold_markup(text: str) -> str:
    """去掉 Markdown 加粗标记 **，保留其余内容。"""
    s = str(text or "")
    while True:
        new = re.sub(r"\*\*([^*]+?)\*\*", r"\1", s)
        if new == s:
            break
        s = new
    return s


def _markdown_table_header_cell_display(col_label: str, *, plain: bool = False) -> str:
    """Format a Markdown table header cell.

    Bold markers are removed and ``|`` is replaced by the full-width ``｜``.
    A trailing parenthetical unit is moved below the name inside the same cell
    (``name<br>(unit)``). Labels that already contain ``<br>`` only have their
    parts cleaned and re-joined with ``<br>``. ``plain`` is accepted for
    backward compatibility and ignored; headers are never bolded.
    """
    del plain  # accepted for old call sites only
    label = _strip_md_bold_markup(str(col_label or "").strip()).replace("|", "｜")
    if not label:
        return ""

    br_rx = r"<br\s*/?>"
    if re.search(br_rx, label, re.I):
        pieces: list[str] = []
        for fragment in re.split(br_rx, label, flags=re.I):
            cleaned = _strip_md_bold_markup(fragment).strip()
            if cleaned:
                pieces.append(cleaned)
        return "<br>".join(pieces) if pieces else label

    unit_match = _RE_MD_HEADER_NAME_UNIT.match(label)
    if unit_match is None:
        return label
    name = unit_match.group(1).strip()
    unit = unit_match.group(2).strip()
    if not name:
        return label
    return f"{name}<br>{unit}" if unit else name


def _common_trailing_parenthetical_unit_from_flat_labels(
    flat_cols: list[str],
) -> tuple[str | None, list[str]]:
    """当合并后的列表头列名末尾「（单位）」在各列一致时，返回该单位及去掉单位后的表头文案。"""
    stripped: list[str] = []
    units: list[str | None] = []
    for lab in flat_cols:
        s = str(lab or "").strip()
        m = _RE_MD_HEADER_NAME_UNIT.match(s)
        if m:
            stripped.append(m.group(1).strip())
            units.append(m.group(2).strip())
        else:
            stripped.append(s)
            units.append(None)
    present = [u for u in units if u]
    if not present:
        return None, list(flat_cols)
    u0 = present[0]
    if any(units[i] is not None and units[i] != u0 for i in range(len(units))):
        return None, list(flat_cols)
    return u0, stripped


# Gap between a table number and the table name: two full-width spaces (U+3000),
# following the "leave two blank cells" convention of official documents.
_TABLE_CAPTION_NUMBER_NAME_GAP = "\u3000\u3000"
# Standard caption of table 5-2 (investment change table), including its unit note.
_TABLE52_INVESTMENT_CHANGE_CAPTION = (
    f"表5-2{_TABLE_CAPTION_NUMBER_NAME_GAP}投资变动情况表（单位：万元、万美元）"
)
# Leading table token of a caption: 「附表N」 or 「表N」 with optional "-"/"." separated
# sub-numbers; group(1) is the token, group(2) the rest of the caption.
_RE_TABLE_CAPTION_LEADING_TOKEN = re.compile(
    r"^(附表\s*\d+(?:\s*[.\-－．]\s*\d+)*|表\s*\d+(?:\s*[.\-－．]\s*\d+)*)\s*(.*)$",
    re.DOTALL,
)


def _fix_521_table52_wrong_caption(content: str) -> str:
    """5.2.1 若表题误用「表5-2 同类烷基化…」等，改回标准投资变动情况表表题。"""
    text = str(content or "")
    if not text.strip():
        return text
    caption_re = re.compile(
        r"^(\s*(?:#{1,6}\s+)?)(表\s*5\s*[-－．]\s*2\s*(.*))$",
        re.IGNORECASE,
    )
    out: list[str] = []
    for line in text.split("\n"):
        m = caption_re.match(line)
        if m:
            tail = (m.group(3) or "").strip()
            if "投资变动情况表" not in tail:
                out.append(f"{m.group(1)}{_TABLE52_INVESTMENT_CHANGE_CAPTION}")
                continue
        out.append(line)
    return "\n".join(out)


def _normalize_table_caption_number_name_gap(title: str) -> str:
    """将「表2-4xxx」「表 2 - 4  xxx」规范为「表2-4」+ 两全角空格 + 表名。"""
    s = str(title or "").strip()
    if not s:
        return s
    m = _RE_TABLE_CAPTION_LEADING_TOKEN.match(s)
    if not m:
        return s
    token_compact = re.sub(r"\s+", "", (m.group(1) or "").strip())
    rest = (m.group(2) or "").strip()
    if not rest:
        return token_compact
    return f"{token_compact}{_TABLE_CAPTION_NUMBER_NAME_GAP}{rest}"


def _rewrite_table_caption_line_for_number_name_gap(line: str) -> str:
    """修正独立表题行（非表格管道行）中表号与表名间距。"""
    if "|" in line or not line.strip():
        return line
    if line.strip().startswith("```"):
        return line
    m = re.match(r"^(\s*)(.*)$", line)
    if not m:
        return line
    indent, rest = m.group(1), m.group(2)
    h = ""
    m2 = re.match(r"^(#{1,6}\s+)(.*)$", rest)
    if m2:
        h, rest = m2.group(1), m2.group(2)
    rest_st = rest.strip()
    if not rest_st:
        return line
    if rest_st.startswith("**") and rest_st.endswith("**") and len(rest_st) >= 4:
        inner = _strip_md_bold_markup(rest_st[2:-2]).strip()
        n = _normalize_table_caption_number_name_gap(inner)
        return f"{indent}{h}{n}"
    n2 = _normalize_table_caption_number_name_gap(_strip_md_bold_markup(rest_st))
    if n2 != rest_st:
        return f"{indent}{h}{n2}"
    return line


def _debold_md_table_row(line: str) -> str:
    if "|" not in line:
        return line
    return "|".join(_strip_md_bold_markup(part) for part in line.split("|"))


def _debold_markdown_table_blocks_in_content(content: str) -> str:
    """Remove ``**`` bold markers from the header rows of Markdown pipe tables.

    A table is recognised as one or more consecutive pipe rows (multi-row
    headers are supported) immediately followed by a separator row. Only the
    header rows are de-bolded; the separator and the body rows are copied
    unchanged. Pipe rows that are not followed by a separator are left as-is.
    """
    lines = str(content or "").split("\n")
    if not lines:
        return str(content or "")
    out: list[str] = []
    i = 0
    while i < len(lines):
        ln = lines[i]
        if _is_pipe_markdown_table_row_line(ln) and not _is_pipe_markdown_table_separator_line(ln):
            # Collect the run of candidate header rows starting at i.
            header_rows: list[str] = []
            j = i
            while j < len(lines) and _is_pipe_markdown_table_row_line(lines[j]) and not _is_pipe_markdown_table_separator_line(lines[j]):
                header_rows.append(lines[j])
                j += 1
            if j < len(lines) and _is_pipe_markdown_table_separator_line(lines[j]):
                # Confirmed table: de-bold the header rows, keep the separator verbatim.
                out.extend(_debold_md_table_row(hr) for hr in header_rows)
                out.append(lines[j])
                j += 1
                # Copy the table body (rows and any further separators) untouched.
                while j < len(lines) and (
                    _is_pipe_markdown_table_row_line(lines[j])
                    or _is_pipe_markdown_table_separator_line(lines[j])
                ):
                    out.append(lines[j])
                    j += 1
                i = j
                continue
            # No separator followed: not a table header, emit the rows unchanged.
            out.extend(header_rows)
            i += len(header_rows)
            continue
        out.append(ln)
        i += 1
    return "\n".join(out)


def _normalize_table_captions_in_markdown(content: str) -> str:
    if not str(content or "").strip():
        return str(content or "")
    lines = str(content).split("\n")
    text = "\n".join(_rewrite_table_caption_line_for_number_name_gap(ln) for ln in lines)
    return _debold_markdown_table_blocks_in_content(text)


def _merge_table_title_with_common_unit(base_title: str, unit: str | None) -> str:
    """Append the columns' shared unit (already parenthesised) to a table title.

    The unit is not appended when it is empty or when the title, compared with
    all whitespace removed, already ends with it. The result is always passed
    through ``_normalize_table_caption_number_name_gap``.
    """
    title = str(base_title or "").strip()
    unit_text = str(unit).strip() if unit else ""
    if unit_text:
        squeezed_unit = re.sub(r"\s+", "", unit_text)
        squeezed_title = re.sub(r"\s+", "", title)
        already_present = bool(squeezed_unit) and squeezed_title.endswith(squeezed_unit)
        if not already_present:
            title = f"{title} {unit_text}"
    return _normalize_table_caption_number_name_gap(title)


def _render_table_7_1_markdown(
    row_order: list[str],
    col_order: list[str],
    latest: dict[tuple[str, str], str],
) -> str:
    """Render table 7-1 as a Markdown pipe table with separate 「指标」 / 「要素」 columns.

    Row keys have the form ``indicator·element`` or are the literal 「综合得分」.
    Columns listed in ``TABLE_7_1_COLUMN_KEYS`` come first (in that order),
    followed by any remaining columns; rows listed in
    ``TABLE_7_1_ROW_CELL_DEFAULTS`` come first, followed by the rest. Empty
    cells are shown as 「待补充」 and ``|`` in any cell is replaced by ``｜``.
    """

    def _pipe_safe(value: str) -> str:
        return str(value or "").replace("|", "｜")

    data_cols: list[str] = [key for key in TABLE_7_1_COLUMN_KEYS if key in col_order]
    for key in col_order:
        if key not in data_cols:
            data_cols.append(key)

    preferred = [row_key for row_key, _ in TABLE_7_1_ROW_CELL_DEFAULTS]
    preferred_lookup = set(preferred)
    ordered_rows = [row_key for row_key in preferred if row_key in row_order]
    ordered_rows += [row_key for row_key in row_order if row_key not in preferred_lookup]

    header_cells = [
        _markdown_table_header_cell_display("指标"),
        _markdown_table_header_cell_display("要素"),
    ]
    header_cells += [_markdown_table_header_cell_display(key) for key in data_cols]
    rendered = [
        "| " + " | ".join(header_cells) + " |",
        "| " + " | ".join(["---"] * (2 + len(data_cols))) + " |",
    ]

    for row_key in ordered_rows:
        row_text = str(row_key or "").strip()
        if row_text != "综合得分" and "·" in row_text:
            left, right = row_text.split("·", 1)
            indicator, element = left.strip(), right.strip()
        else:
            indicator, element = row_text, ""
        cells = [_pipe_safe(indicator), _pipe_safe(element)]
        for key in data_cols:
            raw = str(latest.get((row_text, key), "") or "").strip()
            cells.append(_pipe_safe(raw if raw else "待补充"))
        rendered.append("| " + " | ".join(cells) + " |")
    return "\n".join(rendered) + "\n"


def _build_structured_table_markdown(db: Session, table_id: str, table_name: str = "") -> tuple[str, str | None]:
    """Load the non-empty cells of one element table and render them as Markdown.

    Args:
        db: Database session used to query ``ElementCell`` / ``ElementTable``.
        table_id: Id of the element table whose cells are rendered.
        table_name: Table title; drives column ordering, filtering and the
            choice of renderer.

    Returns:
        ``(markdown, common_unit)``. ``("", None)`` when the table has no
        usable cells. ``common_unit`` is whatever ``_render_markdown_table``
        reports and is always ``None`` for the table 7-1 renderer.
    """
    cells = (
        db.query(ElementCell)
        .filter(
            ElementCell.table_id == table_id,
            ElementCell.value.isnot(None),
            ElementCell.value != "",
        )
        .order_by(ElementCell.updated_at.desc())
        .all()
    )
    if not cells:
        return "", None
    latest: dict[tuple[str, str], str] = {}
    row_order: list[str] = []
    col_order: list[str] = []
    for cell in cells:
        row_key = str(cell.row_key or "").strip()
        # A missing/blank column key is stored under the default column 「内容」.
        col_key = str(cell.col_key or "内容").strip() or "内容"
        if not row_key:
            continue
        key = (row_key, col_key)
        # Cells arrive newest-first, so the first value seen per key is the latest one.
        if key not in latest:
            latest[key] = str(cell.value or "").strip()
        if row_key not in row_order:
            row_order.append(row_key)
        if col_key not in col_order:
            col_order.append(col_key)
    if not row_order:
        return "", None
    row_order = _normalize_table_row_order(row_order, table_name=table_name)
    spec = _multi_column_global_spec_for_table(table_name)
    if spec:
        # Spec columns first (in spec order), then any columns the spec does not list.
        spec_cols = [str(col).strip() for col in (spec[0] or []) if str(col).strip()]
        ordered = [col for col in spec_cols if col in col_order]
        extras = [col for col in col_order if col not in ordered]
        col_order = ordered + extras
    # Column clean-up pipeline; each filter is a no-op for tables it does not target.
    col_order = _filter_redundant_bare_year_columns(col_order)
    col_order = _filter_appendix_placeholder_slot_columns(col_order)
    col_order = _filter_appendix3_summary_duplicate_forecast_years(table_name, col_order)
    col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(table_name, col_order)
    col_order = _filter_appendix5_orphan_price_unit_column(table_name, col_order)
    col_order = _filter_table55_redundant_malformed_forecast_column(table_name, col_order)
    col_order = _reorder_appendix_time_col_order(table_name, col_order)
    inferred_ty: int | None = None
    if _is_table54_operating_benefit(str(table_name or "").strip()):
        # Use the table-level year (when positive) as a hint for inferring the time column's year.
        ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first()
        tbl_y = int(ty_row[0]) if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0 else None
        inferred_ty = _infer_time_column_year_for_table54(col_order, cells, tbl_y)
    if str(table_name or "").strip() == TABLE_7_1_SCORING_TABLE_NAME:
        # Table 7-1 has its own layout (separate indicator / element columns).
        return _render_table_7_1_markdown(row_order, col_order, latest), None
    return _render_markdown_table(
        table_name, row_order, col_order, latest, time_column_year=inferred_ty
    )


_PLACEHOLDER_ROW_PREFIX = re.compile(r"^(产品名称|原料名称|项目名称|名称|产品|项目)\s*[·.]\s*")


def _display_row_key(table_name: str, rk: str, latest: dict[tuple[str, str], str]) -> str:
    """将模板占位行名替换为真实名称（优先使用该行单元格中的项目/产品名称）。"""
    text = str(rk or "").strip()
    m = _PLACEHOLDER_ROW_PREFIX.match(text)
    if not m:
        return text

    preferred_cols: list[str] = []
    tn = str(table_name or "")
    if "表2-4" in tn or "产品流向" in tn:
        preferred_cols.extend(["项目名称", "产品名称", "规格"])
    preferred_cols.extend(["项目名称", "产品名称", "名称", "规格"])

    for col in preferred_cols:
        v = str(latest.get((rk, col), "") or "").strip()
        if v and v != "待补充":
            return v

    suffix = text[m.end():].strip()
    return f"产品{suffix}" if suffix else text


# Matches the same outline numbering used in the report body at the start of a
# row key, e.g. 「1.1 建设投资」, 「1.2.3 工艺」, 「3原料」; group(1) is the number.
_ROWKEY_OUTLINE_PREFIX = re.compile(
    r"^\s*(\d+(?:\.\d+)*)(?:\s*[、．.]?\s+(?=\S)|(?=[\u4e00-\u9fffA-Za-z（(]))"
)
# Chinese-numeral outline prefix followed by whitespace; group(1) numeral, group(2) name.
_CN_OUTLINE_ROWKEY_PREFIX = re.compile(r"^\s*([一二三四五六七八九十百千]+)\s+(.+)$")
# Appendix 5 product rows: "<number> <product>·<销量|营业收入|销项税>".
_APPENDIX5_PRODUCT_TRIPLE = re.compile(
    r"^(\d+(?:\.\d+)*)\s+(.+?)·(销量|营业收入|销项税)$"
)
# Appendix 7 detail rows: "<number> <item>" with an optional "·<单价|数量|进项税额|……>" suffix.
_APPENDIX7_DETAIL_ROW = re.compile(
    r"^(\d+(?:\.\d+)*)\s+([^·]+?)(?:·(单价|数量|进项税额|……))?$"
)

# Legacy table 5-3 row keys mapped to the labels displayed for them.
_TABLE_53_LEGACY_ROW_DISPLAY: dict[str, str] = {
    "工程费用变动·批准单位": "批准单位",
    "工程费用变动·批准文号": "批准文号",
    "工程费用变动·工程费用合计": "工程费用合计",
    "工程费用变动·工艺生产装置": "1 工艺生产装置",
    "工程费用变动·装置·设备购置费": "1.1.1 设备购置费",
    "工程费用变动·装置·安装工程费": "1.1.2 安装工程费",
    "工程费用变动·装置·建筑工程费": "1.1.3 建筑工程费",
    "工程费用变动·总图运输": "2 总图运输",
    "工程费用变动·储运工程": "3 储运工程",
    "工程费用变动·其它分项（可增删）": "其它分项（可增删）",
}


def _strip_table_prefix_from_row_key(rk: str) -> str:
    s = str(rk or "").strip()
    if "\u00b7" in s:
        return "\u00b7".join(s.split("\u00b7")[1:]).strip()
    return s


def _parse_row_key_seq_and_name(rk: str, *, table_name: str = "") -> tuple[str, str]:
    """与 quick-fill.js ``parseRowKeyForDisplay`` 一致。"""
    s = str(rk or "").strip()
    if not s:
        return "", ""
    legacy = _legacy_map_for_table(table_name)
    if legacy:
        s = legacy.get(s, s)
    if _table53_engineering_cost_change_name(table_name):
        s = _TABLE_53_LEGACY_ROW_DISPLAY.get(s, s)
    if _appendix_time_table_name(table_name) or _appendix8_param_name(table_name):
        s = _strip_table_prefix_from_row_key(s)
    elif "\u00b7" in s:
        s = "\u00b7".join(s.split("\u00b7")[1:]).strip()
    m = _ROWKEY_OUTLINE_PREFIX.match(s)
    if m:
        rest = s[m.end():].strip()
        return m.group(1), rest if rest else s
    m_cn = _CN_OUTLINE_ROWKEY_PREFIX.match(s)
    if m_cn:
        return m_cn.group(1), m_cn.group(2).strip()
    return "", s


def _row_display_name_for_table(table_name: str, rk: str) -> str:
    """项目/工程名称列展示文案（去表内前缀与层次编号，附表5/7 明细行单独处理）。"""
    s0 = str(rk or "").strip()
    if not s0:
        return ""
    if _appendix5_revenue_tax_name(table_name):
        s = _strip_table_prefix_from_row_key(s0)
        m = _APPENDIX5_PRODUCT_TRIPLE.match(s)
        if m:
            return m.group(3)
    if _appendix7_materials_name(table_name):
        s = _strip_table_prefix_from_row_key(s0)
        m = _APPENDIX7_DETAIL_ROW.match(s)
        if m and m.group(3):
            return m.group(3)
        if m:
            return m.group(2).strip()
    if _table_row_seq_name_split_display(table_name):
        _, name = _parse_row_key_seq_and_name(s0, table_name=table_name)
        return name or s0
    return s0


def _serial_cell_for_report_table(
    table_name: str,
    rk: str,
    idx: int,
    serial_col: list[str] | None,
    *,
    serial_idx: int,
) -> str:
    """Choose the serial-number cell for a report table row.

    Table 5-1 uses the outline number parsed from the row key (falling back to
    ``idx``). Other tables that split number and name use the running index
    ``idx``. All remaining tables use ``serial_col[serial_idx]`` when a serial
    column was computed, else ``idx``.
    """
    if _table51_main_economic_indicators_name(table_name):
        outline_no = _parse_row_key_seq_and_name(rk, table_name=table_name)[0]
        return outline_no or str(idx)
    if _table_row_seq_name_split_display(table_name):
        return str(idx)
    if serial_col is None:
        return str(idx)
    return serial_col[serial_idx]


def _project_column_row_label(
    table_name: str,
    rk: str,
    latest: dict[tuple[str, str], str],
    *,
    serial_col: list[str] | None,
) -> str:
    """Build the label shown in the item/project column for one row.

    Tables that split number and name use ``_row_display_name_for_table``.
    Otherwise, when a serial column exists, the leading outline number is
    removed from the key so it is not repeated next to the serial cell. The
    result is then resolved through ``_display_row_key`` and
    ``_element_manage_table_row_display_label``.
    """
    if _table_row_seq_name_split_display(table_name):
        shown = _row_display_name_for_table(table_name, rk)
    elif serial_col is None:
        shown = rk
    else:
        shown = _strip_row_key_leading_outline_for_display(rk)
        if not str(shown or "").strip():
            shown = rk
    resolved = _display_row_key(table_name, shown, latest)
    return _element_manage_table_row_display_label(table_name, resolved)


def _outline_serial_from_row_key(rk: str) -> str | None:
    """若 row_key 以阿拉伯数字层次编号开头，返回该编号字符串。"""
    rk_s = str(rk or "").strip()
    if not rk_s:
        return None
    compact = re.sub(r"\s+", "", rk_s)
    if compact in ("合计", "总计"):
        return None
    m = _ROWKEY_OUTLINE_PREFIX.match(rk_s)
    if not m:
        return None
    num = m.group(1)
    if re.fullmatch(r"\d{4}", num):
        try:
            yi = int(num)
        except ValueError:
            return None
        if 1900 <= yi <= 2100:
            return None
    return num


def _table_row_outline_serial_column(row_order: list[str], *, max_rows: int) -> list[str] | None:
    """当每一数据行（合计/总计除外）的 row_key 均带层次编号时，序号列采用该编号。"""
    rows = row_order[:max_rows]
    if not rows:
        return None
    serials: list[str] = []
    for rk in rows:
        compact = re.sub(r"\s+", "", str(rk or ""))
        if compact in ("合计", "总计") or str(rk or "").strip() in ("合计", "总计"):
            serials.append("—")
            continue
        s = _outline_serial_from_row_key(str(rk) or "")
        if s is None:
            return None
        serials.append(s)
    return serials


def _strip_row_key_leading_outline_for_display(rk: str) -> str:
    """Remove a leading outline number from a row key so it does not repeat the serial column.

    If nothing would remain after the number, the stripped key is returned whole.
    """
    text = str(rk or "").strip()
    hit = _ROWKEY_OUTLINE_PREFIX.match(text)
    if hit is None:
        return text
    return text[hit.end():].strip() or text


def _render_markdown_table(
    table_name: str,
    row_order: list[str],
    col_order: list[str],
    latest: dict[tuple[str, str], str],
    *,
    time_column_year: int | None = None,
) -> tuple[str, str | None]:
    """Render a structured table as a Markdown pipe table.

    Args:
        table_name: Table title; selects table-5-4 handling, the row-header
            label and the serial-number scheme through project helpers.
        row_order: Row keys in display order.
        col_order: Column keys in display order (copied, not mutated).
        latest: Cell values keyed by ``(row_key, col_key)``. For the table
            recognised by ``_is_table54_operating_benefit`` it is handed to
            remap/rekey helpers first — presumably modified in place; verify
            against those helpers.
        time_column_year: Year forwarded to the table-5-4 header labels.

    Returns:
        ``(markdown, common_unit)`` where ``common_unit`` is the trailing unit
        shared by the column headers (removed from them), or ``None``.
    """
    col_order = list(col_order)
    tn = str(table_name or "").strip()
    table54 = _is_table54_operating_benefit(tn)
    if table54:
        _table54_remap_indicator_unit_latest(latest)
        _table54_rekey_latest_col_keys(latest)
        col_order = _reorder_table54_col_order(col_order)
    grouped = _group_column_headers(col_order)
    if grouped:
        # Markdown has no multi-row headers: flatten "group|sub" into "group-sub".
        top_headers, sub_headers = grouped
        flat_cols: list[str] = []
        for top, sub in zip(top_headers, sub_headers):
            if top and sub:
                flat_cols.append(f"{top}-{sub}")
            elif top:
                flat_cols.append(top)
            else:
                flat_cols.append(sub)
    else:
        flat_cols = list(col_order)

    if table54:
        # Table 5-4 builds its own header labels and never reports a common unit.
        flat_cols = list(col_order)
        flat_header_labels = _table54_markdown_header_labels(
            col_order, time_column_year=time_column_year
        )
        common_unit = None
    else:
        common_unit, flat_header_labels = _common_trailing_parenthetical_unit_from_flat_labels(flat_cols)
        if common_unit is None:
            flat_header_labels = flat_cols

    def _esc_pipe(v: str) -> str:
        # A literal "|" would break the pipe table; use the full-width bar instead.
        return str(v or "").replace("|", "｜")

    row_header = _row_header_name_for_table(table_name)
    if table54:
        row_header = "项目"

    header = (
        "| "
        + " | ".join(
            [
                _markdown_table_header_cell_display("序号"),
                _markdown_table_header_cell_display(row_header),
            ]
            + [_markdown_table_header_cell_display(c) for c in flat_header_labels]
        )
        + " |"
    )
    split = "| --- | --- | " + " | ".join(["---"] * len(flat_cols)) + " |"
    lines = [header, split]
    # Row cap: 120 rows for table 5-4, 24 for every other table.
    max_rows = min(120, len(row_order)) if table54 else min(24, len(row_order))
    serial_col = _table_row_outline_serial_column(row_order, max_rows=max_rows)
    for idx, rk in enumerate(row_order[:max_rows], start=1):
        # Missing keys and empty stored values both render as the placeholder.
        vals = [_esc_pipe(latest.get((rk, ck), "待补充") or "待补充") for ck in col_order]
        display_rk = _project_column_row_label(
            table_name, rk, latest, serial_col=serial_col
        )
        serial_cell = _serial_cell_for_report_table(
            table_name, rk, idx, serial_col, serial_idx=idx - 1
        )
        lines.append("| " + serial_cell + " | " + _esc_pipe(display_rk) + " | " + " | ".join(vals) + " |")
    return "\n".join(lines) + "\n", common_unit


def _build_time_table_markdowns_by_year(
    db: Session, table_id: str, table_name: str = "",
) -> list[tuple[str, str]]:
    """Split a time element table by year and render one Markdown table per year.

    ``ElementCell`` rows of a time table carry a ``year`` field that separates
    the data of different years (the front end renders multi-level headers from
    ``col_key + "|" + year``). Cells are grouped by year, each year is rendered
    as its own Markdown table, and 「××年」 in the table name is replaced by the
    actual year.

    Returns:
        A list of ``(display_table_name, markdown)`` tuples; empty when the
        table has no non-empty cells. When no cell has a year, the table is
        rendered once through ``_build_structured_table_markdown``.
    """
    cells = (
        db.query(ElementCell)
        .filter(
            ElementCell.table_id == table_id,
            ElementCell.value.isnot(None),
            ElementCell.value != "",
        )
        .order_by(ElementCell.updated_at.desc())
        .all()
    )
    if not cells:
        return []

    from collections import defaultdict
    # Group cells by their year; cells without a year end up under the key None.
    year_cells: dict[int | None, list[ElementCell]] = defaultdict(list)
    for cell in cells:
        year_cells[cell.year].append(cell)

    # Table-level year, used only when it is a positive integer.
    ty_row = db.query(ElementTable.year).filter(ElementTable.id == table_id).first()
    tbl_y = (
        int(ty_row[0])
        if ty_row and ty_row[0] is not None and int(ty_row[0]) > 0
        else None
    )
    base_name = str(table_name or "").strip()
    if _is_table54_operating_benefit(base_name):
        year_cells, real_years = _table54_merge_year_cells_for_table_year(
            year_cells, table_year=tbl_y
        )
    else:
        real_years = sorted(y for y in year_cells if y is not None)
    if not real_years:
        # No per-year data at all: render the table once, without a year split.
        md, common_unit = _build_structured_table_markdown(db, table_id, table_name)
        if not md:
            return []
        disp = _merge_table_title_with_common_unit(str(table_name or "").strip(), common_unit)
        return [(disp, md)]

    results: list[tuple[str, str]] = []
    for year in real_years:
        year_cell_list = year_cells[year]
        latest: dict[tuple[str, str], str] = {}
        row_order: list[str] = []
        col_order: list[str] = []
        for cell in year_cell_list:
            row_key = str(cell.row_key or "").strip()
            # A missing/blank column key is stored under the default column 「内容」.
            col_key = str(cell.col_key or "内容").strip() or "内容"
            if not row_key:
                continue
            key = (row_key, col_key)
            # The query orders cells newest-first, so the first value seen per key wins.
            if key not in latest:
                latest[key] = str(cell.value or "").strip()
            if row_key not in row_order:
                row_order.append(row_key)
            if col_key not in col_order:
                col_order.append(col_key)
        if not row_order:
            continue
        if _is_table54_operating_benefit(base_name):
            _table54_coalesce_legacy_bare_metric_cols(latest, row_order)
        row_order = _normalize_table_row_order(row_order, table_name=base_name)
        time_spec_cols = time_table_default_columns_for_name(base_name)
        if time_spec_cols and _is_table54_operating_benefit(base_name):
            # Table 5-4 always shows the full spec column set, with 「单位」 first.
            col_order = ["单位"] + [c for c in time_spec_cols if c != "单位"]
        elif time_spec_cols:
            # Spec columns first (in spec order), then columns the spec does not list.
            ordered = [col for col in time_spec_cols if col in col_order]
            extras = [col for col in col_order if col not in ordered]
            col_order = ordered + extras
        # Same column clean-up pipeline as the non-split renderer.
        col_order = _filter_redundant_bare_year_columns(col_order)
        col_order = _filter_appendix_placeholder_slot_columns(col_order)
        col_order = _filter_appendix3_summary_duplicate_forecast_years(base_name, col_order)
        col_order = _filter_appendix3_placeholders_when_forecast_has_real_year(base_name, col_order)
        col_order = _filter_appendix5_orphan_price_unit_column(base_name, col_order)
        col_order = _filter_table55_redundant_malformed_forecast_column(base_name, col_order)
        col_order = _reorder_appendix_time_col_order(base_name, col_order)
        display_name = re.sub(r"××年", f"{year}年", base_name)
        md, common_unit = _render_markdown_table(
            display_name, row_order, col_order, latest, time_column_year=year
        )
        if md:
            results.append((_merge_table_title_with_common_unit(display_name, common_unit), md))

    # Every year produced nothing but year-less cells exist: fall back to a single table.
    if not results and None in year_cells:
        md, common_unit = _build_structured_table_markdown(db, table_id, table_name)
        if md:
            results.append((_merge_table_title_with_common_unit(str(table_name or "").strip(), common_unit), md))
    return results


def _extract_table_short_token(table_name: str) -> str:
    text = str(table_name or "")
    m = re.search(r"(附表\s*\d+(?:\s*[.\-－]\s*\d+)*|表\s*\d+(?:\s*[.\-－]\s*\d+)*)", text)
    return re.sub(r"\s+", "", m.group(1)) if m else ""


def _norm_table_token(token: str) -> str:
    text = re.sub(r"\s+", "", str(token or "")).lower()
    return text.replace("－", "-").replace("—", "-").replace("–", "-")


def _table_token_matches_name(token: str, name: str, *, normalized: bool = False) -> bool:
    """
    表号精确匹配，避免“表1”误命中“表10”。
    - normalized=True: token/name 已经是 _norm_table_token 结果。
    """
    t = token if normalized else _norm_table_token(token)
    n = name if normalized else _norm_table_token(name)
    if not t or not n:
        return False
    if t == n:
        return True
    # 兼容历史项目：4.3.3 的“烷基化装置运行分析”可能仍存为表4-1，仍应视为表4-2 的同义候选。
    if t == _norm_table_token("表4-2"):
        raw_name = str(name or "")
        name_plain = re.sub(r"\s+", "", raw_name)
        if ("烷基化装置运行分析" in name_plain) and ("考核时间" in name_plain):
            if ("表4-1" in name_plain) or ("表4-2" in name_plain) or ("表41" in _norm_table_token(name_plain)):
                return True
    # 后面不能紧跟 1-2 位数字后即结束或遇到非数字（避免 表1→表10、表2-4→表2-40），
    # 但允许紧跟 4 位年份（如 表2-42019年…）或非数字字符（如 表2-4××年…）。
    pattern = re.compile(rf"{re.escape(t)}(?!\d{{1,2}}(?!\d))")
    return bool(pattern.search(n))


def _table_token_caption_line_re(token: str) -> re.Pattern[str]:
    token_plain = re.sub(r"\s+", "", str(token or ""))
    token_re = re.escape(token_plain).replace(r"\-", r"[-－—–]")
    return re.compile(
        r"(?:^|\n)([^\n]*?" + token_re + r"[^\n]*)\n",
        flags=re.IGNORECASE,
    )


def _segment_after_table_caption(content: str, token: str) -> str:
    """Return the text between this table's caption line and the next table caption.

    Args:
        content: Section text to search.
        token: Table number whose caption line is located with
            ``_table_token_caption_line_re``.

    Returns:
        The text after the caption line up to (not including) the next line
        that looks like a "表 x-x <CJK title>" caption, or the remainder of the
        text when no such line follows.  An empty string when the caption line
        is not found.
    """
    text = str(content or "")
    caption = _table_token_caption_line_re(token).search(text)
    if not caption:
        return ""
    rest = text[caption.end():]
    # The caption match consumes its trailing newline, so ``rest`` begins at a
    # line start.  Anchoring on ``(?:^|\n)`` rather than a bare ``\n`` lets a
    # caption on the very next line end the segment; otherwise the segment
    # would run on into the following table's body.
    next_caption = re.search(
        r"(?:^|\n)[^\n]*?表\s*\d+(?:\s*[-－．]\s*\d+)*\s+[\u4e00-\u9fff]",
        rest,
        flags=re.IGNORECASE,
    )
    return rest[: next_caption.start()] if next_caption else rest


def _segment_has_markdown_table_body(segment: str) -> bool:
    seg = str(segment or "")
    if not seg.strip():
        return False
    return bool(
        re.search(
            r"(?:<table>|(?:\n[ \t]*\|[^\n]+\|[ \t]*\n[ \t]*\|[-:\s|]+\|))",
            seg,
            flags=re.IGNORECASE,
        )
    )


def _replace_caption_stub_with_authoritative_table(
    content: str, token: str, authoritative_block: str
) -> str:
    """将「仅有表题/注释、无表体」的占位段替换为要素直出整块（用于 3.3.4 表3-4 等）。"""
    text = str(content or "")
    block = str(authoritative_block or "").strip()
    if not block:
        return text
    cap = _table_token_caption_line_re(token).search(text)
    if not cap:
        return text.rstrip() + "\n\n" + block
    region_start = cap.start()
    if region_start > 0 and text[region_start] == "\n":
        region_start += 1
    rest = text[cap.end() :]
    next_cap = re.search(
        r"\n[^\n]*?表\s*\d+(?:\s*[-－．]\s*\d+)*\s+[\u4e00-\u9fff]",
        rest,
        flags=re.IGNORECASE,
    )
    region_end = cap.end() + (next_cap.start() if next_cap else len(rest))
    head = text[:region_start].rstrip("\n")
    tail = text[region_end:].lstrip("\n")
    if head:
        return f"{head}\n\n{block}\n\n{tail}".strip() if tail else f"{head}\n\n{block}".strip()
    return f"{block}\n\n{tail}".strip() if tail else block


def _table_token_exists(content: str, token: str) -> bool:
    """Tell whether ``content`` holds the table ``token`` together with its body.

    Returns False when the content or the normalised token is empty, when no
    caption line is found, or when the segment after the caption has no table
    body.
    """
    body = str(content or "")
    normalized_token = _norm_table_token(token)
    if not (body and normalized_token):
        return False
    # A caption line located by ``_table_token_caption_line_re`` is required.
    if _table_token_caption_line_re(token).search(body) is None:
        return False
    # The table body has to sit between this caption and the next one; the
    # Markdown block of a later table must not be counted for this table.
    own_segment = _segment_after_table_caption(body, token)
    return _segment_has_markdown_table_body(own_segment)


def _basic_warnings(section_title: str, content: str) -> list[str]:
    warnings: list[str] = []
    if len(content.strip()) < 80:
        warnings.append("章节内容过短，建议补充证据后重试")
    title_norm = re.sub(r"\s+", "", str(section_title or ""))
    if "1.2项目决策要点" in title_norm:
        if "1.2.1项目背景" not in content or "1.2.2预期目标" not in content:
            warnings.append("1.2 未按固定结构输出（缺少“1.2.1项目背景/1.2.2预期目标”小节）")
    if "2.1.1资源与原料评价" in title_norm:
        if "原料数量及组成对比表" not in content:
            warnings.append("2.1.1 缺少模版规定的「原料数量及组成对比表」标题")
        if "原料性质对比表（醚后碳四）" not in content and "原料性质对比表" not in content:
            warnings.append("2.1.1 缺少模版规定的「原料性质对比表（醚后碳四）」标题")
        if "原料选择加氢工艺技术对比" in content or (
            "表2.6-1" in content
            and "原料数量及组成对比" not in content
            and "原料选择加氢" in content
        ):
            warnings.append("2.1.1 不应出现安评类「表2.6-1 原料选择加氢工艺技术对比」等内容，本节仅允许模版主表")
        if "附录：原料预处理工艺方案比选" in content or "（非模版主表）" in content:
            warnings.append("2.1.1 不应出现附录或“非模版主表”字样，请仅保留模版两张主表")
    if "表" in section_title and "|" not in content:
        warnings.append("章节标题疑似要求表格，但输出未包含 Markdown 表格")
    if "待补充" in content and len(content.strip()) < 140:
        warnings.append("缺失信息较多，建议补充材料后重跑")
    return warnings


def _check_consistency(report: str, project_name: str) -> list[str]:
    """Collect whole-report consistency hints.

    Checks, in order: the project name appears in the text; monetary amounts
    are not too varied; only one measurement-unit definition is present; the
    span of mentioned years is below 12; "待补充" occurs fewer than 10 times;
    and ``_has_conflict_terms`` reports no opposing wording.
    """
    findings: list[str] = []

    if project_name and project_name not in report:
        findings.append("正文未显式出现项目名称，请检查第一章基本信息。")

    # Each amount becomes "<value>-<unit>"; only the first ten are compared.
    amount_keys = [
        f"{value}-{unit}"
        for value, unit in re.findall(r"(\d+(?:\.\d+)?)\s*(亿元|万元|万)", report)
    ]
    if len(amount_keys) >= 4 and len(set(amount_keys[:10])) >= 6:
        findings.append("金额口径较分散，建议统一投资/决算/效益统计口径。")

    unit_definitions = set(re.findall(r"(?:单位|计量单位)\s*[:：]\s*([^\n]{1,40})", report))
    if len(unit_definitions) > 1:
        findings.append("检测到多个计量单位定义，建议统一单位说明（如万元、吨/年）。")

    years = [int(y) for y in re.findall(r"(20\d{2})年", report)]
    if years and max(years) - min(years) >= 12:
        findings.append("年份跨度较大，建议复核建设期与运营期时间线是否混写。")

    missing_count = report.count("待补充")
    if missing_count >= 10:
        findings.append(f"全篇“待补充”出现 {missing_count} 次，建议补充关键材料后重跑。")

    if _has_conflict_terms(report):
        findings.append("发现同一指标存在“增加/下降”等相反表述，建议人工复核结论口径。")
    return findings


def _append_report_appendices(db: Session, project_uuid: str, report_text: str) -> str:
    """
    Append the figure and table appendices to the final report (figures first,
    tables second).

    Notes:
    - Figures come from ``_build_appendix_figures_markdown``; nothing is added
      when it returns an empty result.
    - Tables come from ``_build_appendix_tables_markdown``; nothing is added
      when it returns an empty result.
    """
    body = (report_text or "").strip()

    # Tables are built before figures even though figures are emitted first.
    tables_md = _build_appendix_tables_markdown(db, project_uuid)
    figures_md = _build_appendix_figures_markdown(db, project_uuid)

    ordered = (body, figures_md, tables_md)
    kept = [piece for piece in ordered if piece and str(piece).strip()]
    return "\n\n".join(kept).strip()


def _build_appendix_tables_markdown(db: Session, project_uuid: str) -> str:
    """Render the "## 附表" appendix from the project's structured tables.

    Only tables whose name contains "附表" are used, ordered by name ascending
    and then by ``updated_at`` descending.  At most 30 tables are rendered.
    Returns an empty string when no table yields any Markdown.
    """
    query = (
        db.query(ElementTable)
        .filter(ElementTable.project_id == project_uuid)
        .order_by(ElementTable.table_name.asc(), ElementTable.updated_at.desc())
    )
    appendix_rows = [row for row in query.all() if "附表" in (row.table_name or "")]
    if not appendix_rows:
        return ""

    sections: list[str] = []
    for t in appendix_rows:
        table_md, common_unit = _build_structured_table_markdown(db, t.id, t.table_name)
        heading = _merge_table_title_with_common_unit(
            str(t.table_name or "").strip() or f"附表（{t.id}）", common_unit
        )
        # Fall back to a fixed placeholder when the structured table is empty.
        table_md = table_md or _build_appendix_table_fallback_markdown(heading)
        if not table_md:
            continue
        sections.append(f"### {heading}\n\n{table_md}")
        if len(sections) >= 30:
            break

    if not sections:
        return ""
    return "\n\n".join(["## 附表", *sections]).strip()


def _build_appendix_table_fallback_markdown(table_name: str) -> str:
    """
    当 element_cells 暂无有效数据时，按固定模板输出占位附表，避免附表缺失。
    当前优先支持：附表8 可研报告和后评价参数对比表。
    """
    name = str(table_name or "").replace(" ", "")
    if ("附表8" in name) and ("可研报告和后评价参数对比表" in name):
        return APPENDIX8_PARAMETER_COMPARISON_TABLE
    return ""


def _resolve_appendix_figure_blobs_from_kb(db: Session, project_uuid: str) -> dict[int, tuple[bytes, str, str]]:
    """Extract appendix figures embedded in the project's knowledge-base docx files.

    Returns the result of ``merge_best_appendix_figures``: slot ->
    (blob, content_type, source_filename).
    """
    doc_root = Path(settings.DOC_PAT).resolve()
    kb_docs = (
        db.query(KbDocument)
        .filter(KbDocument.project_id == project_uuid)
        .order_by(KbDocument.uploaded_at.desc())
        .all()
    )

    candidates_by_doc: list[tuple[str, dict[int, list[tuple[int, bytes, str]]]]] = []
    for doc in kb_docs:
        doc_name = str(doc.name or "")
        if not doc_name.lower().endswith(".docx"):
            continue
        docx_path = _kb_doc_absolute_file_path_for_model(doc_root, doc)
        if not docx_path.is_file():
            continue
        try:
            found = extract_appendix_figure_candidates_from_docx(docx_path)
        except Exception as exc:
            # Best effort: a document that fails extraction is logged and skipped.
            logger.warning("appendix figure extraction failed %s: %s", docx_path, exc)
        else:
            candidates_by_doc.append((doc_name, found))
    return merge_best_appendix_figures(candidates_by_doc)


def _build_appendix_figures_markdown(db: Session, project_uuid: str) -> str:
    """
    Render the "## 附图" appendix from figures resolved out of the knowledge base.

    One slot is checked per entry of ``APPENDIX_FIGURE_TARGETS``, numbered from
    1.  A slot with no Markdown is left out, with no placeholder text.  Returns
    an empty string when no slot produced any Markdown.
    """
    targets = APPENDIX_FIGURE_TARGETS
    resolved = _resolve_appendix_figure_blobs_from_kb(db, project_uuid)
    md_by_slot = appendix_figure_markdown_images(resolved, label_title=list(targets))

    slot_markdown = (md_by_slot.get(slot) for slot in range(1, len(targets) + 1))
    figure_parts = [str(md).strip() for md in slot_markdown if md and str(md).strip()]
    if figure_parts:
        return "## 附图\n\n" + "\n\n".join(figure_parts)
    return ""


def _update_chapter_status(
    db: Session,
    job: ReportGenerationJob,
    chapter: ReportGenerationChapter,
    status: str,
    error_message: Optional[str],
) -> None:
    """Store a chapter's new status, point the job at that chapter, and commit.

    Both records get the same ``updated_at`` timestamp.
    """
    stamp = datetime.now()
    chapter.status, chapter.error_message, chapter.updated_at = status, error_message, stamp
    job.current_section_key, job.updated_at = chapter.section_key, stamp
    db.commit()


def _resolve_template(db: Session, template_id: Optional[str]) -> ReportTemplate:
    """Return the requested template, falling back to the active default one.

    Raises:
        HTTPException: 404 when neither the requested template nor an active
            default template exists.
    """
    if template_id:
        requested = db.query(ReportTemplate).filter(ReportTemplate.id == template_id).first()
        if requested:
            return requested

    fallback = (
        db.query(ReportTemplate)
        .filter(ReportTemplate.is_default == True, ReportTemplate.is_active == True)  # noqa: E712
        .first()
    )
    if fallback:
        return fallback
    raise HTTPException(status_code=404, detail="未找到可用模板")


def _list_template_sections(db: Session, template_id: str) -> list[ReportTemplateSection]:
    """Return every section of a template, ordered by ascending ``section_order``."""
    of_template = db.query(ReportTemplateSection).filter(
        ReportTemplateSection.template_id == template_id
    )
    in_order = of_template.order_by(ReportTemplateSection.section_order.asc())
    return in_order.all()


def _sections_for_generation(sections: list[ReportTemplateSection]) -> list[ReportTemplateSection]:
    """Keep only the sections that should be generated, in their original order.

    A section whose number cannot be parsed is always kept.  A numbered section
    is kept only when it is a leaf, i.e. no other known section number starts
    with its number followed by a dot.
    """
    candidates = list(sections or [])
    if not candidates:
        return []

    numbers = [_extract_section_number(sec.section_title or "") for sec in candidates]
    known_numbers = {number for number in numbers if number}

    def _is_leaf(number: str) -> bool:
        # A child looks like "<number>.<more>", e.g. "x.y" has the child "x.y.z".
        prefix = f"{number}."
        return not any(other.startswith(prefix) for other in known_numbers)

    return [
        sec
        for sec, number in zip(candidates, numbers)
        # An unparseable number stays in for compatibility.
        if not number or _is_leaf(number)
    ]


def _extract_tokens(text: str) -> list[str]:
    src = str(text or "")
    zh = re.findall(r"[\u4e00-\u9fa5]{2,8}", src)
    en = re.findall(r"[A-Za-z]{3,20}", src.lower())
    raw = zh + en
    out: list[str] = []
    seen = set()
    for t in raw:
        if t in seen:
            continue
        seen.add(t)
        out.append(t)
    return out


def _fmt_dt(dt: Optional[datetime]) -> Optional[str]:
    if not dt:
        return None
    return dt.strftime("%Y-%m-%d %H:%M:%S")


def _select_chapter_example(section_title: str, raw_examples: Optional[str], evidence: dict) -> str:
    """Pick the best-scoring chapter example for a section, or "" when none applies.

    Some sections never receive an example.  For the rest, each parsed example
    is scored on table content, topic keywords shared with the title, and
    length; the first example with the highest score is returned.
    """
    # Sections that deliberately get no chapter example:
    # - 1.2:   sensitive to "narrative + evidence anchors"; samples from other
    #          projects tend to skew the narrative focus.
    # - 2.1.1: must follow the template's two raw-material tables exactly;
    #          table layouts from other projects interfere.
    # - 5.1:   表5-1 is fixed by the output contract and element tables; the
    #          chapter-5 sample is a simplified "指标|可研值|后评价值" table
    #          that tends to be output again alongside the element table.
    # - 5.2.1: 表5-2/表5-3 are fixed by the output contract and element tables;
    #          the shared chapter-5 sample once mislabelled "表5-2 同类对标".
    # - 5.3.1: only the element-table 表5-4 is allowed; the simplified sample
    #          table tends to induce a duplicate table.
    # - 5.4:   only 表5-7 is allowed; the chapter-5 sample (表5-1 comparison)
    #          tends to induce a duplicate simplified table.
    no_example_sections = ("1.2", "2.1.1", "5.1", "5.2.1", "5.3.1", "5.4")
    if _extract_section_number(section_title or "") in no_example_sections:
        return ""

    samples = _parse_examples(raw_examples)
    if not samples:
        return ""

    title = str(section_title or "")
    wants_table = ("表" in title) or _evidence_has_table_signal(evidence)
    # (keywords looked up in the sample, keywords looked up in the title)
    topic_rules = (
        (("万元", "亿元", "投资", "收益"), ("投资", "财务")),
        (("环保", "安全", "排放"), ("影响", "持续")),
        (("结论", "建议", "经验"), ("结论", "综合")),
    )

    def _score(sample: str) -> int:
        lowered = sample.lower()
        points = 0
        if wants_table and ("|" in sample or "表" in sample):
            points += 4
        for sample_words, title_words in topic_rules:
            if any(w in lowered for w in sample_words) and any(w in title for w in title_words):
                points += 3
        # Longer samples earn up to two extra points, one per 300 characters.
        return points + min(len(sample) // 300, 2)

    # ``max`` returns the first of several equally scored samples.
    return max(samples, key=_score)


def _parse_examples(raw_examples: Optional[str]) -> list[str]:
    text = str(raw_examples or "").strip()
    if not text:
        return []
    # 支持 JSON 数组格式：["示例1","示例2","示例3"]
    if text.startswith("[") and text.endswith("]"):
        try:
            arr = json.loads(text)
            if isinstance(arr, list):
                out = [str(x).strip() for x in arr if str(x).strip()]
                return out[:3]
        except Exception:
            pass
    # 支持分隔符：---EXAMPLE--- 或 \n\n====\n\n
    for sep in ("\n---EXAMPLE---\n", "\n====\n"):
        if sep in text:
            return [x.strip() for x in text.split(sep) if x.strip()][:3]
    # 兼容“示例1/示例2/示例3”文本段
    blocks = re.split(r"\n\s*示例\s*[1-3][:：]\s*", "\n" + text)
    blocks = [b.strip() for b in blocks if b.strip()]
    if len(blocks) >= 2:
        return blocks[:3]
    return [text]


def _evidence_has_table_signal(evidence: dict) -> bool:
    docs = evidence.get("chapterDocs") if isinstance(evidence, dict) else []
    if not isinstance(docs, list):
        return False
    for d in docs[:8]:
        if not isinstance(d, dict):
            continue
        content = str(d.get("content") or "")
        if "|" in content or "表" in content[:200]:
            return True
    return False


def _has_conflict_terms(report: str) -> bool:
    pairs = [
        ("增加", "下降"),
        ("达标", "未达标"),
        ("盈利", "亏损"),
        ("改善", "恶化"),
    ]
    for a, b in pairs:
        if a in report and b in report:
            return True
    return False


def _resolve_project(db: Session, project_id: str) -> Optional[Project]:
    """Look a project up by UUID first, then by integer primary key.

    Returns ``None`` for an empty id, and for an id that matches no UUID and
    cannot be converted to an integer.
    """
    if not project_id:
        return None

    match = db.query(Project).filter(Project.uuid == project_id).first()
    if not match:
        # No UUID hit: treat the value as a numeric primary key.
        try:
            numeric_id = int(project_id)
        except Exception:
            return None
        match = db.query(Project).filter(Project.id == numeric_id).first()
    return match