"""Resolve section prompts / output contracts for an uploaded report template.

Each uploaded section is mapped onto a section of the default template catalog
(by exact title order, by number + title, or by an LLM), and the matched
default prompt/contract is rewritten to the uploaded numbering and structure.

NOTE(review): this module was reconstructed from a whitespace-collapsed copy.
Statement nesting, the line breaks inside multi-line string literals and the
``_SECTION_NUM_IN_TEXT_RE`` pattern were re-inferred and should be confirmed
against the original file.
"""
from __future__ import annotations

import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any

from config import settings
from prompts.report_generation.template_prompt_rules import DEFAULT_SECTION_PROMPT
from services.template_service import (
    _clean_section_title,
    _core_title,
    _extract_number_prefix,
    _guideline_prompt_for,
    _normalize_section_identity,
    _section_key_to_number,
    _title_match_score,
    build_default_template_catalog,
    default_section_output_contract,
)

logger = logging.getLogger(__name__)

_FUZZY_MATCH_THRESHOLD = 40
_LLM_MATCH_CONFIDENCE = 0.55


def resolve_uploaded_template_prompts(
    uploaded_sections: list[dict[str, str]],
    *,
    use_llm: bool | None = None,
) -> list[dict[str, str]]:
    """Resolve ``sectionPrompt`` / ``sectionOutputContract`` for uploaded sections.

    Resolution order:
      1. Titles identical to the default catalog, in the same order: apply the
         default prompts position by position.
      2. Same number sequence but different titles: match each section to the
         default section by title (constrained by number).
      3. Local fuzzy title matching (currently disabled, see note in the body).
      4. Anything still unresolved: ask the LLM to match or generate.
      5. Fallback: guideline prompt by title, else a generic prompt.

    Args:
        uploaded_sections: Dicts read via ``sectionTitle`` / ``sectionKey``.
        use_llm: Force the LLM step on/off; ``None`` defers to
            ``settings.TEMPLATE_UPLOAD_LLM_PROMPT_MAPPING`` (default ``True``).

    Returns:
        One ``{"sectionPrompt", "sectionOutputContract"}`` dict per uploaded
        section, in input order. Empty list for empty input.
    """
    if not uploaded_sections:
        return []

    catalog = build_default_template_catalog()
    default_by_key = {row["sectionKey"]: row for row in catalog}

    uploaded_meta = [_section_meta(s, i) for i, s in enumerate(uploaded_sections)]
    default_meta = [_catalog_meta(row, i) for i, row in enumerate(catalog)]

    results: list[dict[str, str] | None] = [None] * len(uploaded_meta)
    match_sources: list[str] = [""] * len(uploaded_meta)
    matched_default_numbers: list[str | None] = [None] * len(uploaded_meta)
    used_default_keys: set[str] = set()

    # 1) Identical titles in identical order -> apply defaults by index.
    #    (_titles_equal_in_order guarantees both lists have the same length.)
    if _titles_equal_in_order(uploaded_meta, default_meta):
        for i, dm in enumerate(default_meta):
            results[i] = _prompt_bundle(dm)
            match_sources[i] = "exact_title_order"
        return _finalize_results(
            uploaded_meta, results, match_sources, matched_default_numbers
        )

    # 2) Same number sequence, only titles differ -> match by title under the
    #    same-number constraint instead of blindly applying by list position.
    if _numbers_equal_in_order(uploaded_meta, default_meta):
        for um in uploaded_meta:
            matched = _match_default_by_title_and_number(
                um,
                default_meta,
                used_default_keys=used_default_keys,
                same_number_is_enough=True,
            )
            if matched:
                bundle, src_num = _bundle_from_match_with_inline(
                    um, matched, uploaded_meta, default_meta
                )
                results[um["index"]] = bundle
                match_sources[um["index"]] = "same_number_title_match"
                matched_default_numbers[um["index"]] = src_num
                _mark_default_used(um, matched, used_default_keys)
        if all(r is not None for r in results):
            return _finalize_results(
                uploaded_meta, results, match_sources, matched_default_numbers
            )

    # 3) NOTE: the local fuzzy-matching pass (``_local_match_default`` with
    #    source tag "fuzzy_title") was disabled in the original file; unresolved
    #    sections go straight to the LLM step.

    # 4) LLM matching / generation.
    unresolved = [um for um in uploaded_meta if results[um["index"]] is None]
    llm_enabled = (
        use_llm
        if use_llm is not None
        else bool(getattr(settings, "TEMPLATE_UPLOAD_LLM_PROMPT_MAPPING", True))
    )
    if unresolved and llm_enabled and _llm_configured():
        _apply_llm_mapping(
            unresolved,
            default_meta,
            default_by_key,
            results,
            match_sources,
            matched_default_numbers,
            uploaded_meta,
        )

    # 5) Fallback: use a guideline prompt only when the full title hits one,
    #    otherwise a generic prompt built from the uploaded title (avoids
    #    wrongly applying a default prompt just because chapter numbers agree).
    for um in uploaded_meta:
        if results[um["index"]] is not None:
            continue
        title = um["title"]
        key = um["key"]
        results[um["index"]] = {
            "sectionPrompt": _fallback_prompt_for_unmatched(title, key),
            "sectionOutputContract": _fallback_contract_for_unmatched(title, key),
        }
        match_sources[um["index"]] = "fallback_generic"

    return _finalize_results(
        uploaded_meta, results, match_sources, matched_default_numbers
    )


def _finalize_results(
    uploaded_meta: list[dict[str, Any]],
    results: list[dict[str, str] | None],
    match_sources: list[str],
    matched_default_numbers: list[str | None] | None = None,
) -> list[dict[str, str]]:
    """Rewrite numbers/structure of every resolved bundle and log match stats.

    For each uploaded section: fill missing bundles with defaults, rewrite the
    matched default number to the uploaded number when they differ, then adapt
    child listings and preceding-chapter references to the uploaded outline.
    """
    defaults = matched_default_numbers or [None] * len(uploaded_meta)
    out: list[dict[str, str]] = []
    for um, src in zip(uploaded_meta, defaults):
        idx = um["index"]
        bundle = results[idx] or {
            "sectionPrompt": DEFAULT_SECTION_PROMPT,
            "sectionOutputContract": _fallback_contract_for_unmatched(
                um["title"], um["key"]
            ),
        }
        prompt = bundle.get("sectionPrompt") or DEFAULT_SECTION_PROMPT
        contract = bundle.get("sectionOutputContract") or ""
        dst = um.get("number") or ""
        if src and dst and src != dst:
            # Decide once (from the contract) whether this is a leaf slice, so
            # prompt and contract are rewritten with the same strategy.
            leaf = _use_leaf_number_rewrite(contract, src, um, uploaded_meta)
            prompt = _rewrite_numbers_and_tables(prompt, src, dst, leaf_slice=leaf)
            contract = _rewrite_numbers_and_tables(contract, src, dst, leaf_slice=leaf)
        prompt = _adapt_prompt_to_uploaded_structure(prompt, um, uploaded_meta)
        contract = _adapt_prompt_to_uploaded_structure(contract, um, uploaded_meta)
        out.append(
            {
                "sectionPrompt": prompt,
                "sectionOutputContract": contract,
            }
        )

    matched = sum(1 for s in match_sources if s and not s.startswith("fallback"))
    logger.info(
        "template_prompt_mapper: sections=%s matched=%s sources=%s",
        len(uploaded_meta),
        matched,
        {s: match_sources.count(s) for s in set(match_sources) if s},
    )
    return out


def _section_meta(section: dict[str, str], index: int) -> dict[str, Any]:
    """Build the matching metadata for one uploaded section."""
    title = str(section.get("sectionTitle") or "").strip()
    key = str(section.get("sectionKey") or "").strip()
    number = _extract_number_prefix(title) or _section_key_to_number(key)
    return {
        "index": index,
        "key": key,
        "title": title,
        "number": number,
        "norm_title": _normalize_section_identity(title),
        "core_title": _core_title(_clean_section_title(title) or title),
    }


def _catalog_meta(row: dict[str, str], index: int) -> dict[str, Any]:
    """Build the matching metadata (plus prompt/contract) for a catalog row."""
    title = row["sectionTitle"]
    key = row["sectionKey"]
    return {
        "index": index,
        "key": key,
        "title": title,
        "number": row.get("sectionNumber")
        or _extract_number_prefix(title)
        or _section_key_to_number(key),
        "norm_title": _normalize_section_identity(title),
        "core_title": _core_title(_clean_section_title(title) or title),
        "sectionPrompt": row["sectionPrompt"],
        "sectionOutputContract": row["sectionOutputContract"],
    }


def _prompt_bundle(dm: dict[str, Any]) -> dict[str, str]:
    """Extract the prompt/contract pair from a default-section meta dict."""
    return {
        "sectionPrompt": str(dm.get("sectionPrompt") or ""),
        "sectionOutputContract": str(dm.get("sectionOutputContract") or ""),
    }


def _build_section_remap(src: str, dst: str) -> dict[str, str]:
    """Return a single-entry ``{src: dst}`` remap, or ``{}`` when nothing changes.

    Sub-number suffixes (e.g. 6.1.1 -> 4.1 implies 6.1.1.1 -> 4.1.1) are handled
    by ``_remap_single_number`` via prefix lookup.
    """
    if not src or not dst or src == dst:
        return {}
    return {src: dst}


def _build_chapter_remap(src: str, dst: str) -> dict[str, str]:
    """Return a chapter-level remap, used for table numbers such as 表5-1 -> 表3-1."""
    if not src or not dst:
        return {}
    src_ch = src.split(".", 1)[0]
    dst_ch = dst.split(".", 1)[0]
    if not src_ch.isdigit() or not dst_ch.isdigit() or src_ch == dst_ch:
        return {}
    return {src_ch: dst_ch}


_TABLE_NUM_RE = re.compile(r"表(\d+)-(\d+)")


def _rewrite_table_numbers_in_text(text: str, chapter_remap: dict[str, str]) -> str:
    """Rewrite the chapter part of every ``表<chapter>-<seq>`` per ``chapter_remap``."""
    if not text or not chapter_remap:
        return text

    def _sub(match: re.Match[str]) -> str:
        ch, seq = match.group(1), match.group(2)
        new_ch = chapter_remap.get(ch)
        if new_ch:
            return f"表{new_ch}-{seq}"
        return match.group(0)

    return _TABLE_NUM_RE.sub(_sub, text)


def _rewrite_numbers_and_tables(
    text: str, src: str, dst: str, *, leaf_slice: bool = False
) -> str:
    """Rewrite section numbers and table numbers from ``src`` to ``dst``.

    With ``leaf_slice`` every number under ``src`` collapses to ``dst``;
    otherwise ``src`` is remapped as a prefix and sub-numbers are preserved.
    """
    if not text or not src or not dst or src == dst:
        return text
    if leaf_slice:
        text = _rewrite_leaf_subsection_numbers(text, src, dst)
    else:
        text = _rewrite_section_numbers_in_text(text, _build_section_remap(src, dst))
    chapter_remap = _build_chapter_remap(src, dst)
    return _rewrite_table_numbers_in_text(text, chapter_remap)


def _top_chapter_number(section_number: str | None) -> int | None:
    """Return the leading integer of a section number, or ``None``."""
    m = re.match(r"^(\d+)", str(section_number or "").strip())
    return int(m.group(1)) if m else None


def _section_number_tuple(section_number: str) -> tuple[int, ...]:
    """Parse ``"4.1.2"`` into ``(4, 1, 2)``; any non-digit part yields ``()``."""
    parts = []
    for p in str(section_number or "").strip().split("."):
        if p.isdigit():
            parts.append(int(p))
        else:
            return tuple()
    return tuple(parts)


def _direct_child_sections(
    all_uploaded: list[dict[str, Any]], parent_number: str
) -> list[dict[str, Any]]:
    """Return uploaded sections exactly one level below ``parent_number``, sorted."""
    parent = str(parent_number or "").strip()
    if not parent:
        return []
    prefix = parent + "."
    out: list[dict[str, Any]] = []
    for um in all_uploaded:
        num = str(um.get("number") or "").strip()
        if not num.startswith(prefix) or num == parent:
            continue
        suffix = num[len(prefix):]
        # A dot in the remainder means a grandchild, not a direct child.
        if suffix and "." not in suffix:
            out.append(um)
    out.sort(key=lambda u: _section_number_tuple(str(u.get("number") or "")))
    return out


def _preceding_chapters_label(
    all_uploaded: list[dict[str, Any]], current_number: str | None
) -> tuple[str, int]:
    """Return ``(label, count)`` for the top-level chapters before the current one.

    The label is ``第a~b章`` for a contiguous run of two or more chapters,
    otherwise an enumeration such as ``第1章、第3章``; ``("前序章节", 0)`` when there
    is no usable preceding chapter. Used to replace 「第1~6章」/「前六章」 in the
    default contracts.
    """
    cur_top = _top_chapter_number(current_number)
    if cur_top is None:
        return "前序章节", 0
    tops = sorted(
        {
            t
            for um in all_uploaded
            if (t := _top_chapter_number(um.get("number"))) is not None
        }
    )
    preced = [t for t in tops if t < cur_top]
    if not preced:
        return "前序章节", 0
    if len(preced) >= 2 and preced[-1] - preced[0] + 1 == len(preced):
        return f"第{preced[0]}~{preced[-1]}章", len(preced)
    return "、".join(f"第{t}章" for t in preced), len(preced)


_CN_COUNT = ("", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十")


def _cn_count(n: int) -> str:
    """Return the Chinese numeral for 1..10, else the decimal string."""
    if 0 < n < len(_CN_COUNT):
        return _CN_COUNT[n]
    return str(n)


_CHILDREN_COVER_RE = re.compile(r"(并按顺序完整覆盖下级小节[::])\s*[^。\n;]+")
_PRECEDING_RANGE_RE = re.compile(r"第\d+~\d+章")
_PRECEDING_COUNT_RE = re.compile(r"前[一二三四五六七八九十]+章")
_PRECEDING_SUMMARY_PHRASES = ("是对前六章内容的总结", "是对前6章内容的总结")


def _rewrite_preceding_chapter_refs(
    text: str, range_label: str, chapter_count: int
) -> str:
    """Rewrite references to "the preceding chapters" to the uploaded outline.

    Args:
        text: Prompt or contract text.
        range_label: Label from ``_preceding_chapters_label`` (e.g. ``第1~3章``).
        chapter_count: Number of preceding chapters; count-based phrases are
            rewritten only when it is positive.
    """
    if not text or not range_label:
        return text
    text = text.replace(
        "【前序章节正文(第1~6章)】", f"【前序章节正文({range_label})】"
    )
    # Callable replacement: the label is inserted literally, never parsed as a
    # regex replacement template.
    text = _PRECEDING_RANGE_RE.sub(lambda _m: range_label, text)
    if chapter_count > 0:
        # The specific summary phrases must be handled BEFORE the generic
        # "前N章" rewrite; previously the generic rewrite ran first, so the
        # range-label form was only ever produced when chapter_count == 6.
        for phrase in _PRECEDING_SUMMARY_PHRASES:
            text = text.replace(phrase, f"是对{range_label}内容的总结")
        text = _PRECEDING_COUNT_RE.sub(f"前{chapter_count}章", text)
    return text


def _rewrite_children_cover_clause(text: str, child_numbers: list[str]) -> str:
    """Replace the list after 「并按顺序完整覆盖下级小节:」 with ``child_numbers``."""
    if not text or not child_numbers:
        return text
    listing = "、".join(child_numbers)

    def _repl(m: re.Match[str]) -> str:
        return f"{m.group(1)}{listing}"

    return _CHILDREN_COVER_RE.sub(_repl, text, count=1)


def _rewrite_children_outline_block(
    text: str, parent_number: str, children: list[dict[str, Any]]
) -> str:
    """Replace the enumerated child-section lines with the uploaded children.

    All lines of the form ``<n>)<parent>.<k> …`` are removed; the uploaded
    children are emitted once, at the position of the first such run.
    No-op when there are fewer than two children.
    """
    if not text or not parent_number or len(children) < 2:
        return text
    # One pattern, compiled once. The original also tested a stricter
    # "…\s+.+$" variant, but on a stripped line that variant can only match
    # when this one does, so a single check is equivalent.
    outline_re = re.compile(rf"^\d+)\s*{re.escape(parent_number)}\.\d+")

    lines = text.splitlines()
    out: list[str] = []
    replaced = False
    i = 0
    while i < len(lines):
        if not outline_re.match(lines[i].strip()):
            out.append(lines[i])
            i += 1
            continue
        if not replaced:
            for j, um in enumerate(children, 1):
                num = str(um.get("number") or "").strip()
                label = _clean_section_title(um.get("title") or "") or um.get(
                    "core_title", ""
                )
                # NOTE(review): leading indent width reproduced as seen in the
                # collapsed source — confirm.
                out.append(f" {j}){num} {label}")
            replaced = True
        # Skip the whole run of original outline lines.
        while i < len(lines) and outline_re.match(lines[i].strip()):
            i += 1
    return "\n".join(out)


def _contract_has_inline_child_list(contract: str, parent_num: str) -> bool:
    """Tell whether the default contract lists children inline in the parent.

    True when the contract carries one of the "fixed sub-heading" marker
    phrases and mentions ``<parent_num>.<digits>`` (e.g. a 4.1.1 / 4.1.2 list)
    rather than having the children as separate sections.
    """
    if not contract or not parent_num:
        return False
    if "按顺序固定输出以下" not in contract and "小标题并分别展开" not in contract:
        return False
    return bool(re.search(rf"{re.escape(parent_num)}\.\d+", contract))


def _parse_inline_child_entries(contract: str, parent_num: str) -> list[dict[str, str]]:
    """Parse lines ``<parent_num>.<k> <label>`` into suffix/default_num/label dicts."""
    entry_re = re.compile(rf"^{re.escape(parent_num)}\.(\d+)\s*(.+)$")
    entries: list[dict[str, str]] = []
    for line in str(contract or "").splitlines():
        m = entry_re.match(line.strip())
        if m:
            entries.append(
                {
                    "suffix": m.group(1),
                    "default_num": f"{parent_num}.{m.group(1)}",
                    "label": m.group(2).strip(),
                }
            )
    return entries


def _extract_inline_child_guidance(contract: str, child_default_num: str) -> str:
    """Return the guidance item that mentions 「在<child_default_num>中」.

    The item starts at the first line containing the needle (spaces ignored,
    leading ``<n>)`` stripped) and absorbs following non-empty lines until the
    next numbered item. Returns ``""`` when not found.
    """
    needle = f"在{child_default_num}中"
    lines = str(contract or "").splitlines()
    for i, line in enumerate(lines):
        compact = line.replace(" ", "")
        if needle not in compact:
            continue
        chunk = re.sub(r"^\d+)\s*", "", line.strip()).strip()
        for j in range(i + 1, len(lines)):
            nxt = lines[j].strip()
            if re.match(r"^\d+)", nxt):
                break
            if nxt:
                chunk += nxt
        return chunk
    return ""


def _uploaded_parent_number(
    uploaded_num: str | None, all_uploaded: list[dict[str, Any]]
) -> str | None:
    """Return the parent number of ``uploaded_num`` if that parent was uploaded."""
    parts = str(uploaded_num or "").strip().split(".")
    if len(parts) < 2:
        return None
    parent = ".".join(parts[:-1])
    if any(str(o.get("number") or "").strip() == parent for o in all_uploaded):
        return parent
    return None


def _should_skip_whole_parent_match(
    uploaded: dict[str, Any], dm: dict[str, Any], all_uploaded: list[dict[str, Any]]
) -> bool:
    """Avoid applying a whole parent contract to a leaf the upload split out.

    True when the uploaded section is deeper than the default one, its parent
    exists in the upload, and the default contract lists its children inline.
    """
    u_num = str(uploaded.get("number") or "").strip()
    d_num = str(dm.get("number") or "").strip()
    if not u_num or not d_num or u_num.count(".") <= d_num.count("."):
        return False
    if not _uploaded_parent_number(u_num, all_uploaded):
        return False
    return _contract_has_inline_child_list(
        str(dm.get("sectionOutputContract") or ""), d_num
    )


def _try_inline_child_match(
    uploaded: dict[str, Any],
    default_meta: list[dict[str, Any]],
    all_uploaded: list[dict[str, Any]],
) -> tuple[dict[str, Any], dict[str, str]] | None:
    """Match an uploaded leaf to one inline child entry of a default parent.

    Returns ``(default_parent_meta, entry)`` or ``None``. Example: uploaded
    2.3.1.1 matching entry 4.1.1 listed inside the default section 4.1.
    """
    parent_num = _uploaded_parent_number(uploaded.get("number"), all_uploaded)
    if not parent_num:
        return None
    u_num = str(uploaded.get("number") or "").strip()
    child_suffix = u_num[len(parent_num) + 1:]
    if not child_suffix or "." in child_suffix or not child_suffix.isdigit():
        return None

    parent_um = next(
        (o for o in all_uploaded if str(o.get("number") or "").strip() == parent_num),
        None,
    )
    u_core = uploaded["core_title"]

    # 1) Align by uploaded parent title + child ordinal (e.g. 2.3.1.1 <-> 4.1.1).
    if parent_um:
        for dm in default_meta:
            pnum = str(dm.get("number") or "").strip()
            contract = str(dm.get("sectionOutputContract") or "")
            if not _contract_has_inline_child_list(contract, pnum):
                continue
            parent_score = _title_match_score(parent_um["core_title"], dm["core_title"])
            if parent_score < 35:
                continue
            for entry in _parse_inline_child_entries(contract, pnum):
                if entry["suffix"] != child_suffix:
                    continue
                child_score = _title_match_score(u_core, _core_title(entry["label"]))
                if child_score >= 12 or parent_score >= 50:
                    return dm, entry

    # 2) Fuzzy match on the child label (looser threshold for reworded titles);
    #    the same ordinal earns a bonus.
    best_dm: dict[str, Any] | None = None
    best_entry: dict[str, str] | None = None
    best_score = -1
    for dm in default_meta:
        pnum = str(dm.get("number") or "").strip()
        contract = str(dm.get("sectionOutputContract") or "")
        if not _contract_has_inline_child_list(contract, pnum):
            continue
        for entry in _parse_inline_child_entries(contract, pnum):
            score = _title_match_score(u_core, _core_title(entry["label"]))
            if entry["suffix"] == child_suffix:
                score += 15
            if score > best_score:
                best_score = score
                best_dm = dm
                best_entry = entry
    if best_score < 20 or not best_dm or not best_entry:
        return None
    return best_dm, best_entry


def _build_inline_child_contract(
    uploaded: dict[str, Any],
    parent_dm: dict[str, Any],
    entry: dict[str, str],
) -> str:
    """Build a leaf contract for an uploaded child from the parent's inline entry."""
    uploaded_num = str(uploaded.get("number") or "").strip()
    label = _clean_section_title(uploaded.get("title") or "") or uploaded["core_title"]
    guidance = _extract_inline_child_guidance(
        str(parent_dm.get("sectionOutputContract") or ""),
        entry["default_num"],
    )
    body = guidance or f"围绕「{label}」撰写本段内容,依据证据材料,缺失写「待补充」,禁止编造。"
    return (
        "必须严格按以下格式与顺序输出,不得缺项、不得改名:\n"
        f'1)首行固定输出标题:"{uploaded_num} {label}"。\n'
        f"2){body}\n"
        "【写作约束】\n"
        "不得新增无关小标题;不得写入同级其他小条目的内容;证据不足处写「待补充」,禁止编造。"
    )


def _adapt_prompt_to_uploaded_structure(
    text: str,
    section: dict[str, Any],
    all_uploaded: list[dict[str, Any]],
) -> str:
    """Adapt child listings and preceding-chapter references to the upload."""
    if not text:
        return text
    num = str(section.get("number") or "").strip()
    # A leaf contract must not be expanded with the parent's inline child list.
    if _is_leaf_subsection_contract(text):
        return text
    children = _direct_child_sections(all_uploaded, num)
    if children and "按顺序固定输出以下" in text:
        child_nums = [str(c.get("number") or "") for c in children]
        text = _rewrite_children_cover_clause(text, child_nums)
        text = _rewrite_children_outline_block(text, num, children)
    range_label, count = _preceding_chapters_label(all_uploaded, num)
    if count > 0 and _top_chapter_number(num) is not None:
        text = _rewrite_preceding_chapter_refs(text, range_label, count)
    return text


def _bundle_from_match_with_inline(
    uploaded: dict[str, Any],
    matched: dict[str, Any],
    all_uploaded: list[dict[str, Any]],
    default_meta: list[dict[str, Any]],
) -> tuple[dict[str, str], str]:
    """Build the bundle for a match, preferring an inline-child hit if any."""
    inline_hit = _try_inline_child_match(uploaded, default_meta, all_uploaded)
    inline_entry = inline_hit[1] if inline_hit else None
    parent_dm = inline_hit[0] if inline_hit else matched
    return _bundle_from_default_match(
        uploaded, parent_dm, all_uploaded, inline_entry=inline_entry
    )


def _mark_default_used(
    uploaded: dict[str, Any],
    default: dict[str, Any],
    used_default_keys: set[str],
) -> None:
    """Mark a default section as consumed only for a whole-section match.

    The same default parent may be sliced by several uploaded children, so the
    key is added only when the core titles are equal or score >= 58.
    """
    u_core = uploaded.get("core_title") or ""
    d_core = default.get("core_title") or ""
    if u_core == d_core or _title_match_score(u_core, d_core) >= 58:
        used_default_keys.add(default["key"])


def _bundle_from_default_match(
    uploaded: dict[str, Any],
    default: dict[str, Any],
    all_uploaded: list[dict[str, Any]],
    *,
    inline_entry: dict[str, str] | None = None,
) -> tuple[dict[str, str], str]:
    """Derive the prompt bundle for ``uploaded`` from a matched default section.

    Children are sliced out of the parent contract; a parent whose children are
    uploaded as separate sections has the per-child content trimmed.

    Returns:
        ``(bundle, source_number)`` where ``source_number`` is the default
        number the texts still refer to (the inline child's number if sliced).
    """
    src_num = str(default.get("number") or "")
    if inline_entry:
        contract = _build_inline_child_contract(uploaded, default, inline_entry)
        child_src = inline_entry["default_num"]
        return (
            {
                "sectionPrompt": contract,
                "sectionOutputContract": contract,
            },
            child_src,
        )

    default_prompt = str(default.get("sectionPrompt") or "")
    prompt = default_prompt
    contract = str(default.get("sectionOutputContract") or "")
    u_core = uploaded["core_title"]
    d_core = default["core_title"]
    title_score = _title_match_score(u_core, d_core) if u_core and d_core else 0

    subsection = _extract_subsection_from_contract(contract, u_core)
    if subsection and u_core != d_core and title_score < 58:
        label = _clean_section_title(uploaded["title"]) or u_core
        num = uploaded.get("number") or ""
        heading = f"{num} {label}".strip() if num else label
        body = _strip_redundant_subsection_heading(subsection, u_core)
        contract = (
            "必须严格按以下格式与顺序输出,不得缺项、不得改名:\n"
            f'1)首行固定输出标题:"{heading}"。\n'
            f"{body}"
        )
        # NOTE(review): nesting of this check under the slice branch was
        # inferred from the collapsed source — confirm.
        if not prompt.strip() or len(prompt) < 80:
            prompt = contract

    if _has_uploaded_children(uploaded, all_uploaded):
        contract = _trim_parent_contract_for_children(contract, src_num)
        if prompt == default_prompt:
            prompt = contract

    return (
        {
            "sectionPrompt": prompt or DEFAULT_SECTION_PROMPT,
            "sectionOutputContract": contract,
        },
        src_num,
    )


def _has_uploaded_children(
    section: dict[str, Any], all_uploaded: list[dict[str, Any]]
) -> bool:
    """Tell whether any uploaded section number starts with ``<number>.``."""
    prefix = str(section.get("number") or "").strip()
    if not prefix:
        return False
    child_prefix = prefix + "."
    for other in all_uploaded:
        num = str(other.get("number") or "")
        if num.startswith(child_prefix) and num != prefix:
            return True
    return False


def _extract_subsection_from_contract(contract: str, core_title: str) -> str | None:
    """Return the 「固定输出小节标题」 block of ``contract`` that mentions ``core_title``."""
    if not contract or not core_title:
        return None
    core = str(core_title).strip()
    if core not in contract:
        return None
    # Zero-width split: each block starts at an "<n>)固定输出小节标题" item.
    blocks = re.split(r"(?=\d+)固定输出小节标题)", contract)
    matched = [
        block.strip()
        for block in blocks
        if core in block and "固定输出小节标题" in block
    ]
    if not matched:
        return None
    if len(matched) == 1:
        return matched[0]
    # Several same-named subsections (e.g. 「效果及影响」): prefer the block
    # related to environment / monitoring.
    for block in matched:
        if any(k in block for k in ("废气", "废水", "噪声监测", "环保措施")):
            return block
    return matched[0]


def _strip_redundant_subsection_heading(subsection: str, core_title: str) -> str:
    """Drop the heading line that duplicates the section title from a slice.

    The remaining body is numbered from 2). Example:
    ``3)固定输出小节标题:"x.x.x 环保措施",并在该小节下…`` -> ``2)并在该小节下…``
    """
    if not subsection:
        return ""
    core = str(core_title).strip()
    out_lines: list[str] = []
    for line in subsection.splitlines():
        stripped = line.strip()
        if not stripped:
            # Keep inner blank lines, drop leading ones.
            if out_lines:
                out_lines.append(line)
            continue
        if "固定输出小节标题" in stripped and (not core or core in stripped):
            m = re.search(
                r'固定输出小节标题\s*[::]\s*["\u201c][^"\u201d]+["\u201d]\s*[,,]?\s*(.*)$',
                stripped,
            )
            tail = (m.group(1) if m else "").strip()
            if tail:
                out_lines.append(f"2){tail}")
            continue
        if re.match(r"^\d+)固定输出小节标题", stripped):
            continue
        out_lines.append(line)
    body = "\n".join(out_lines).strip()
    body = _trim_parent_tail_from_subsection(body)
    if body and not re.match(r"^\d+)", body):
        body = f"2){body}"
    return body


def _trim_parent_tail_from_subsection(body: str) -> str:
    """Cut parent-level closing clauses mistakenly carried into a slice.

    Truncates at the first 「5)末尾必须以…」 line or 「【写作约束】」 line.
    """
    if not body:
        return ""
    kept: list[str] = []
    for line in body.splitlines():
        stripped = line.strip()
        if re.match(r"^5)末尾必须以", stripped):
            break
        if stripped.startswith("【写作约束】"):
            break
        kept.append(line)
    return "\n".join(kept).strip()


def _is_leaf_subsection_contract(contract: str) -> bool:
    """Tell whether ``contract`` is a leaf contract.

    A leaf contract starts with 「必须严格」, fixes the heading in its first 200
    characters and has no 「2)固定输出小节标题」 item nor an inline child list.
    """
    text = (contract or "").lstrip()
    if not text.startswith("必须严格"):
        return False
    if "按顺序固定输出以下" in text and "小标题并分别展开" in text:
        return False
    return "首行固定输出标题" in text[:200] and not re.search(
        r"^2)固定输出小节标题", text, re.MULTILINE
    )


def _use_leaf_number_rewrite(
    contract: str,
    src: str,
    uploaded: dict[str, Any],
    all_uploaded: list[dict[str, Any]],
) -> bool:
    """Decide whether to collapse all numbers to the leaf number.

    Only true leaf slices qualify; parents with an inline child list or with
    uploaded children do not.
    """
    if _contract_has_inline_child_list(contract, src):
        return False
    if _has_uploaded_children(uploaded, all_uploaded):
        return False
    return _is_leaf_subsection_contract(contract)


def _rewrite_leaf_subsection_numbers(text: str, src_root: str, dst_leaf: str) -> str:
    """Replace ``src_root`` and every number below it with ``dst_leaf``.

    Example: every 6.1.1.x of the default parent becomes the uploaded 4.1.1.
    """
    if not text or not src_root or not dst_leaf:
        return text

    def _sub(match: re.Match[str]) -> str:
        num = match.group(1)
        if num == src_root or num.startswith(src_root + "."):
            return dst_leaf
        return match.group(0)

    return _SECTION_NUM_IN_TEXT_RE.sub(_sub, text)


def _trim_parent_contract_for_children(
    contract: str, parent_num: str | None = None
) -> str:
    """Remove per-child content from a parent contract whose children are split out."""
    m = re.search(r"\d+)固定输出小节标题", contract)
    if m:
        trimmed = contract[: m.start()].rstrip()
        return trimmed if trimmed else contract
    if parent_num and _contract_has_inline_child_list(contract, parent_num):
        return _trim_inline_parent_contract_for_children(contract, parent_num)
    return contract


def _trim_inline_parent_contract_for_children(contract: str, parent_num: str) -> str:
    """Drop the per-child writing rules (items 3-7) from an inline parent contract.

    Keeps the overview and summary; falls back to the original contract when
    trimming would leave nothing.
    """
    needles = (f"在{parent_num}.", f"在{parent_num}中")
    kept: list[str] = []
    for line in str(contract or "").splitlines():
        stripped = line.strip()
        if re.match(r"^[3-7])", stripped):
            compact = stripped.replace(" ", "")
            if any(needle in compact for needle in needles):
                continue
        kept.append(line)
    return "\n".join(kept).strip() or contract


def _remap_single_number(num: str, remap: dict[str, str]) -> str:
    """Remap ``num`` exactly, or by its longest remapped prefix keeping the suffix."""
    if not num or not remap:
        return num
    if num in remap:
        return remap[num]
    parts = num.split(".")
    for end in range(len(parts) - 1, 0, -1):
        prefix = ".".join(parts[:end])
        if prefix in remap:
            return remap[prefix] + num[len(prefix):]
    return num


# NOTE(review): the original pattern was truncated in the source this file was
# reconstructed from (only the leading "(?" survived, suggesting a lookbehind).
# This version matches a dotted section number (at least one dot) that is not
# embedded in a longer number; group(1) is the number. Confirm against the
# original before relying on edge cases.
_SECTION_NUM_IN_TEXT_RE = re.compile(r"(?<![\d.])(\d+(?:\.\d+)+)(?!\.?\d)")


def _rewrite_section_numbers_in_text(text: str, remap: dict[str, str]) -> str:
    """Rewrite every section number found in ``text`` according to ``remap``."""
    if not text or not remap:
        return text

    def _sub(match: re.Match[str]) -> str:
        num = match.group(1)
        new_num = _remap_single_number(num, remap)
        return new_num if new_num != num else match.group(0)

    return _SECTION_NUM_IN_TEXT_RE.sub(_sub, text)


def _titles_equal_in_order(
    uploaded: list[dict[str, Any]],
    default: list[dict[str, Any]],
) -> bool:
    """True when both lists have equal length and pairwise equal ``norm_title``."""
    if len(uploaded) != len(default):
        return False
    return all(u["norm_title"] == d["norm_title"] for u, d in zip(uploaded, default))


def _numbers_equal_in_order(
    uploaded: list[dict[str, Any]],
    default: list[dict[str, Any]],
) -> bool:
    """True when both lists have equal length and pairwise equal ``number``."""
    if len(uploaded) != len(default):
        return False
    return all(
        (u["number"] or "") == (d["number"] or "") for u, d in zip(uploaded, default)
    )


def _match_default_by_title_and_number(
    uploaded: dict[str, Any],
    default_meta: list[dict[str, Any]],
    *,
    used_default_keys: set[str] | None = None,
    same_number_is_enough: bool = False,
) -> dict[str, Any] | None:
    """Locate the default section by number, then pick the best by title.

    Args:
        uploaded: Uploaded section meta.
        default_meta: Default catalog metas.
        used_default_keys: Keys already consumed by a whole-section match.
        same_number_is_enough: When the number sequences already agree, a
            unique same-number default section is accepted as long as the
            titles are topic-compatible (wording differs only).
    """
    used = used_default_keys or set()
    u_num = uploaded["number"]
    u_core = uploaded["core_title"]
    candidates = [
        dm
        for dm in default_meta
        if dm["key"] not in used and (not u_num or dm["number"] == u_num)
    ]
    if not candidates:
        return None

    if len(candidates) == 1 and u_num and candidates[0]["number"] == u_num:
        only = candidates[0]
        d_core = only["core_title"]
        if same_number_is_enough:
            return only if _titles_topic_compatible(u_core, d_core) else None
        return only if _title_match_score(u_core, d_core) >= 20 else None

    best: dict[str, Any] | None = None
    best_score = -1
    for dm in candidates:
        title_score = _title_match_score(u_core, dm["core_title"])
        if title_score < _FUZZY_MATCH_THRESHOLD:
            continue
        score = title_score + (20 if u_num and dm["number"] == u_num else 0)
        if score > best_score:
            best_score = score
            best = dm
    return best


def _local_match_default(
    uploaded: dict[str, Any],
    default_meta: list[dict[str, Any]],
    default_by_title: dict[str, dict[str, str]],
    all_uploaded: list[dict[str, Any]],
    *,
    used_default_keys: set[str] | None = None,
) -> dict[str, Any] | None:
    """Local (non-LLM) match: exact title, inline child, subsection, semantic."""
    if uploaded["norm_title"] in default_by_title:
        row = default_by_title[uploaded["norm_title"]]
        dm = _catalog_meta(row, -1)
        if not used_default_keys or dm["key"] not in used_default_keys:
            return dm

    inline_hit = _try_inline_child_match(uploaded, default_meta, all_uploaded)
    if inline_hit:
        parent_dm, _entry = inline_hit
        return parent_dm

    subsection_parent = _find_subsection_parent(uploaded, default_meta, used_default_keys)
    if subsection_parent:
        return subsection_parent

    return _match_default_by_title_semantic(
        uploaded, default_meta, all_uploaded, used_default_keys=used_default_keys
    )


def _find_subsection_parent(
    uploaded: dict[str, Any],
    default_meta: list[dict[str, Any]],
    used_default_keys: set[str] | None,
) -> dict[str, Any] | None:
    """Find the default parent whose contract contains the uploaded sub-title.

    ``used_default_keys`` is accepted for signature parity but not consulted:
    the same parent may be reused by several uploaded children.
    """
    u_core = uploaded["core_title"]
    if not u_core:
        return None
    u_is_env = "环境" in u_core or "环保" in u_core

    best: dict[str, Any] | None = None
    best_score = -1
    for dm in default_meta:
        contract = str(dm.get("sectionOutputContract") or "")
        subsection = _extract_subsection_from_contract(contract, u_core)
        if not subsection:
            continue
        d_core = dm["core_title"]
        score = _title_match_score(u_core, d_core)
        # A near-identical title is a whole-section match, not a slice.
        if score >= 58:
            continue
        # NOTE(review): nesting of the domain-specific adjustments below was
        # inferred from the collapsed source — confirm.
        if u_is_env:
            if "环境" in d_core or "环保" in d_core:
                score += 30
            if "安全" in d_core:
                score -= 25
        if ("监测" in subsection or "废气" in subsection) and "环境" in d_core:
            score += 15
        if score > best_score:
            best_score = score
            best = dm
    return best


def _match_default_by_title_semantic(
    uploaded: dict[str, Any],
    default_meta: list[dict[str, Any]],
    all_uploaded: list[dict[str, Any]],
    *,
    used_default_keys: set[str] | None = None,
) -> dict[str, Any] | None:
    """Pick the unused default section with the best title score.

    Exact core-title equality wins immediately. Otherwise a candidate needs a
    score of at least 45, and the winner must reach 55 for top-level chapters
    or ``_FUZZY_MATCH_THRESHOLD`` for sub-sections.
    """
    used = used_default_keys or set()
    u_core = uploaded["core_title"]
    if not u_core:
        return None
    u_num = str(uploaded.get("number") or "")
    min_score = 55 if u_num and "." not in u_num else _FUZZY_MATCH_THRESHOLD

    best: dict[str, Any] | None = None
    best_score = -1
    for dm in default_meta:
        if dm["key"] in used:
            continue
        d_core = dm["core_title"]
        title_score = _title_match_score(u_core, d_core)
        if u_core == d_core:
            return dm
        if _should_skip_whole_parent_match(uploaded, dm, all_uploaded):
            continue
        # The original additionally tested "score < 45 and not topic-compatible"
        # right before this check; that test was fully subsumed by it.
        if title_score < 45:
            continue
        if title_score > best_score:
            best_score = title_score
            best = dm
    return best if best_score >= min_score else None


_GENERIC_TITLE_FRAGS = frozenset(
    {
        "评价",
        "分析",
        "结论",
        "建议",
        "概况",
        "情况",
        "说明",
        "管理",
        "工作",
    }
)


def _titles_topic_compatible(uploaded_core: str, default_core: str) -> bool:
    """Tell whether two titles are about the same topic.

    True for mere rewording, False for a different subject: passes on a title
    score >= 12, a shared non-generic CJK token, or a shared non-generic
    2-4 character fragment.
    """
    if not uploaded_core or not default_core:
        return False
    if _title_match_score(uploaded_core, default_core) >= 12:
        return True
    tks_u = set(re.findall(r"[\u4e00-\u9fa5]{2,8}", uploaded_core)) - _GENERIC_TITLE_FRAGS
    tks_d = set(re.findall(r"[\u4e00-\u9fa5]{2,8}", default_core)) - _GENERIC_TITLE_FRAGS
    if tks_u & tks_d:
        return True
    for n in (4, 3, 2):
        for i in range(len(uploaded_core) - n + 1):
            frag = uploaded_core[i : i + n]
            if frag in _GENERIC_TITLE_FRAGS:
                continue
            if frag in default_core:
                return True
    return False


def _fallback_prompt_for_unmatched(title: str, section_key: str | None) -> str:
    """Return the guideline prompt for the title, else the generic contract."""
    guideline = _guideline_prompt_for(title, section_key)
    if guideline:
        return guideline
    return _fallback_contract_for_unmatched(title, section_key)


def _fallback_contract_for_unmatched(title: str, section_key: str | None) -> str:
    """Build a generic output contract from the uploaded title and number."""
    label = _clean_section_title(title) or str(title or "").strip() or "本章节"
    num = _extract_number_prefix(title) or _section_key_to_number(section_key)
    heading = f"{num} {label}".strip() if num else label
    return (
        f"必须严格按以下要求输出:\n"
        f'1)首行固定输出标题:"{heading}"。\n'
        f"2)正文围绕「{label}」撰写,结构须与本节标题一致,先事实后评价。\n"
        f"3)依据证据材料,缺失写「待补充」,禁止编造。"
    )


def _llm_configured() -> bool:
    """True when API base, API key and model name are all non-blank in settings."""
    return bool(
        (settings.LLM_API_BASE or "").strip()
        and (settings.LLM_API_KEY or "").strip()
        and (settings.LLM_MODEL_NAME or "").strip()
    )


_LLM_MAPPING_SYSTEM_PROMPT = (
    "你是炼油化工建设项目后评价报告模版专家。"
    "任务:判断上传模版章节能否复用系统默认章节的撰写提示词,并为无法复用的章节生成简短提示词。"
    "只输出 JSON object,不要解释。"
)


def _build_llm_mapping_user_prompt(
    default_meta: list[dict[str, Any]],
    default_lines: list[str],
    batch: list[dict[str, Any]],
) -> str:
    """Render the user prompt for one batch of unresolved uploaded sections.

    Only the first 120 ``default_lines`` are included.
    """
    upload_lines = [
        f'- index={um["index"]} number={um["number"]} title={um["title"]} '
        f'core={um["core_title"]}'
        for um in batch
    ]
    # NOTE(review): line breaks inside this literal were lost in the collapsed
    # source and have been re-inferred — confirm against the original prompt.
    return f"""系统默认模版共 {len(default_meta)} 节(节选提示词预览):
{chr(10).join(default_lines[:120])}

待处理的上传章节(index 为上传列表下标):
{chr(10).join(upload_lines)}

请返回 JSON:
{{
  "structure_compatible": true/false,
  "matches": [
    {{"upload_index": 0, "default_key": "3-1", "confidence": 0.0-1.0}}
  ],
  "generated": [
    {{
      "upload_index": 5,
      "section_prompt": "200字以内的章节撰写要求,面向后评价报告,缺失写待补充,禁止编造",
      "section_output_contract": "可选,100字以内的输出结构约束;不需要可空字符串"
    }}
  ]
}}

规则:
1. structure_compatible:上传模版与默认模版目录层级、编号体系一致且仅标题措辞略异时为 true。
2. matches:语义与默认某节相同或高度相近时,填写 default_key(必须来自默认列表的 key);confidence>=0.55 才有效。
3. generated:无法对应默认章节时,根据上传标题写 section_prompt;contract 可简述需含表格/小节等。
4. 同一 upload_index 只出现在 matches 或 generated 之一;不要重复。
5. 禁止编造与标题无关的细则内容。"""


def _apply_llm_mapping(
    unresolved: list[dict[str, Any]],
    default_meta: list[dict[str, Any]],
    default_by_key: dict[str, dict[str, str]],
    results: list[dict[str, str] | None],
    match_sources: list[str],
    matched_default_numbers: list[str | None],
    all_uploaded_meta: list[dict[str, Any]],
) -> None:
    """Call the LLM in parallel batches for unresolved sections, then merge.

    One large request grows with the number of items to generate; batching
    keeps each response small and lets the (network-bound) calls overlap on a
    thread pool. ``results``, ``match_sources`` and ``matched_default_numbers``
    are updated in place. A failed import or batch call is logged and skipped.
    """
    try:
        from services.llm_client import chat_completions_json
    except Exception as e:  # noqa: BLE001 - best effort: fall through to fallback
        logger.warning("template_prompt_mapper: llm import failed: %s", e)
        return

    default_lines = []
    for dm in default_meta:
        prompt_preview = re.sub(r"\s+", " ", str(dm.get("sectionPrompt") or ""))[:240]
        default_lines.append(
            f'- key={dm["key"]} number={dm["number"]} title={dm["title"]} '
            f'prompt_preview="{prompt_preview}"'
        )

    batch_size = max(int(getattr(settings, "TEMPLATE_UPLOAD_LLM_BATCH_SIZE", 8) or 8), 1)
    max_workers = max(int(getattr(settings, "TEMPLATE_UPLOAD_LLM_MAX_WORKERS", 4) or 4), 1)
    max_tokens = int(getattr(settings, "TEMPLATE_UPLOAD_LLM_MAX_TOKENS", 4096) or 4096)
    timeout_sec = int(getattr(settings, "LLM_HTTP_TIMEOUT_SEC", 120) or 120)

    batches = [
        unresolved[i : i + batch_size] for i in range(0, len(unresolved), batch_size)
    ]
    # Bound up front so the summary log below is valid on every path
    # (previously ``workers`` only existed in the multi-batch branch).
    workers = max(min(max_workers, len(batches)), 1)

    def _run_batch(batch: list[dict[str, Any]]) -> dict:
        user_prompt = _build_llm_mapping_user_prompt(default_meta, default_lines, batch)
        try:
            return chat_completions_json(
                system_prompt=_LLM_MAPPING_SYSTEM_PROMPT,
                user_prompt=user_prompt,
                temperature=0.1,
                max_tokens=max_tokens,
                timeout_sec=timeout_sec,
            )
        except Exception as e:  # noqa: BLE001
            logger.warning("template_prompt_mapper: llm batch call failed: %s", e)
            return {}

    collected: list[Any]
    if len(batches) <= 1:
        collected = [_run_batch(b) for b in batches]
    else:
        # Store each result at its batch position: merging is first-writer-wins
        # per index, so completion order must not influence the outcome.
        collected = [{} for _ in batches]
        with ThreadPoolExecutor(max_workers=workers) as executor:
            future_pos = {
                executor.submit(_run_batch, b): pos for pos, b in enumerate(batches)
            }
            for fut in as_completed(future_pos):
                collected[future_pos[fut]] = fut.result()

    logger.info(
        "template_prompt_mapper: llm 并行匹配 | 待处理=%s | 批数=%s | 线程=%s",
        len(unresolved),
        len(batches),
        workers,
    )

    for data in collected:
        if isinstance(data, dict):
            _merge_llm_mapping_response(
                data,
                unresolved,
                default_by_key,
                results,
                match_sources,
                matched_default_numbers,
                all_uploaded_meta,
                default_meta,
            )


def _llm_items(value: Any) -> list[Any]:
    """Return ``value`` when it is a list, else ``[]`` (malformed LLM payload)."""
    return value if isinstance(value, list) else []


def _claimable_upload_index(
    item: Any, results: list[dict[str, str] | None]
) -> int | None:
    """Return the item's ``upload_index`` if valid and still unresolved, else ``None``."""
    if not isinstance(item, dict):
        return None
    try:
        idx = int(item.get("upload_index"))
    except (TypeError, ValueError, OverflowError):
        return None
    if idx < 0 or idx >= len(results) or results[idx] is not None:
        return None
    return idx


def _merge_llm_mapping_response(
    data: dict,
    unresolved: list[dict[str, Any]],
    default_by_key: dict[str, dict[str, str]],
    results: list[dict[str, str] | None],
    match_sources: list[str],
    matched_default_numbers: list[str | None],
    all_uploaded_meta: list[dict[str, Any]],
    default_meta: list[dict[str, Any]],
) -> None:
    """Merge one LLM response into ``results`` (in place, first writer wins).

    ``matches`` entries need a known ``default_key`` and a confidence of at
    least ``_LLM_MATCH_CONFIDENCE``; ``generated`` entries need a non-empty
    ``section_prompt``. Malformed entries are skipped silently.
    """
    unresolved_by_index = {u["index"]: u for u in unresolved}

    for item in _llm_items(data.get("matches")):
        idx = _claimable_upload_index(item, results)
        if idx is None:
            continue
        try:
            conf = float(item.get("confidence") or 0)
        except (TypeError, ValueError):
            conf = 0.0
        # "not >=" (rather than "<") also rejects NaN.
        if not conf >= _LLM_MATCH_CONFIDENCE:
            continue
        default_key = str(item.get("default_key") or "").strip()
        row = default_by_key.get(default_key)
        if not row:
            continue
        dm = _catalog_meta(row, -1)
        um = unresolved_by_index.get(idx)
        if um:
            bundle, src_num = _bundle_from_match_with_inline(
                um, dm, all_uploaded_meta, default_meta
            )
            results[idx] = bundle
            matched_default_numbers[idx] = src_num
        else:
            results[idx] = _prompt_bundle(dm)
            matched_default_numbers[idx] = dm.get("number") or ""
        match_sources[idx] = "llm_match"

    for item in _llm_items(data.get("generated")):
        idx = _claimable_upload_index(item, results)
        if idx is None:
            continue
        prompt = str(item.get("section_prompt") or "").strip()
        contract = str(item.get("section_output_contract") or "").strip()
        if not prompt:
            continue
        um = unresolved_by_index.get(idx)
        title = um["title"] if um else ""
        key = um["key"] if um else ""
        results[idx] = {
            "sectionPrompt": prompt,
            "sectionOutputContract": contract
            or default_section_output_contract(title, key),
        }
        match_sources[idx] = "llm_generated"