report_generation/services/desensitize_service.py
xxy 43f3e0b746 Initial commit
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 18:41:06 +08:00

81 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
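services/desensitize_service.py

Chapter-content desensitization: filter the "exact figures" out of the body
text of a sample document so that the result can be reused as template content.

Rules (defaults):
- Runs of Arabic digits (decimals and thousands separators included; full-width
  digits are intended to be covered as well) -> placeholder (default "X"), e.g.
  "总投资10.5亿元" -> "总投资X亿元", "2020年3月" -> "X年X月", "85.3%" -> "X%".
- Heading lines (starting with #) are kept whole; chapter numbers and titles
  are not touched.
- A list/enumeration marker at the start of a line (e.g. "(1)", "1.", "2、") is
  kept; only digits in the rest of the line are masked.
- Units and symbols (万元 / 亿元 / % / 吨 / 年, etc.) are kept; only the exact
  numeric value attached to them is removed.

The placeholder, whether numbers inside tables are masked, and whether the
feature is enabled at all can be adjusted through config.
Chinese numerals (一二三…) are normally used for ordinals/levels and are kept
by default.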
"""
from __future__ import annotations
import logging
import re
from config import settings
logger = logging.getLogger(__name__)
# Character-class fragment for a single decimal digit: ASCII 0-9 plus the
# full-width forms U+FF10..U+FF19. Written with escapes so the pattern cannot be
# silently altered by tools that strip or normalise look-alike Unicode.
_DIGITS = r"0-9\uFF10-\uFF19"

# A run of digits, optionally continued by "." / "," groups (decimal point or
# thousands separator), e.g. "10.5" or "1,234.56". The hyphen is deliberately
# NOT in the class, so ranges ("2019-2020"), list bullets and table rules keep
# their "-" characters.
_NUMBER_RE = re.compile(rf"[{_DIGITS}]+(?:[.,][{_DIGITS}]+)*")

# Enumeration marker at the start of a line; it is structure and is preserved:
# "(1)", "1)", "1.", "1、" plus the full-width bracket / full-stop variants
# (U+FF08, U+FF09, U+FF0E; U+3001 is the ideographic comma).
# A period immediately followed by another digit is a decimal point
# ("10.5亿元"), not a marker, so it is excluded; otherwise the integer part of
# a line-initial decimal would be kept verbatim.
_LEADING_ENUM_RE = re.compile(
    rf"^(\s*(?:[(\uFF08]\s*[{_DIGITS}]+\s*[)\uFF09]"
    rf"|[{_DIGITS}]+\s*(?:[)\uFF09\u3001]|[.\uFF0E](?![{_DIGITS}]))))"
)

# Markdown heading: one to six "#" followed by whitespace.
_HEADING_RE = re.compile(r"^\s*#{1,6}\s")

# Markdown table row: the line starts and ends with "|".
_TABLE_ROW_RE = re.compile(r"^\s*\|.*\|\s*$")

# Markdown table separator row such as "|---|:--:|": only whitespace, ":",
# "-" and "|" characters.
_TABLE_SEP_RE = re.compile(r"^\s*\|?[\s:\-|]+\|?\s*$")
def _mask_numbers(segment: str, placeholder: str) -> str:
    """Replace every number run in *segment* with *placeholder*.

    The placeholder is inserted literally. It is supplied through a
    replacement function rather than as a replacement template, so a
    configured placeholder containing a backslash or a group reference is not
    interpreted by the regex engine.

    Args:
        segment: Text to mask.
        placeholder: Text substituted for each match of ``_NUMBER_RE``.

    Returns:
        *segment* with each matched number run replaced.
    """
    return _NUMBER_RE.sub(lambda _match: placeholder, segment)
def _desensitize_line(line: str, placeholder: str, mask_table_numbers: bool) -> str:
    """Mask the numbers of a single line while keeping its structural markers.

    Args:
        line: One line of text, without its line terminator.
        placeholder: Text substituted for each number run.
        mask_table_numbers: When false, table rows are returned unchanged.

    Returns:
        The line with numbers masked according to the rules below.
    """
    # Headings are returned untouched so chapter numbering and titles survive.
    if _HEADING_RE.match(line) is not None:
        return line

    if _TABLE_ROW_RE.match(line) is not None:
        # A separator row (|---|---|) carries no data; other rows are masked
        # only when table masking is switched on.
        is_separator = _TABLE_SEP_RE.match(line) is not None
        if is_separator or not mask_table_numbers:
            return line
        return _mask_numbers(line, placeholder)

    # Ordinary body text: a leading enumeration marker is kept verbatim and
    # only the remainder of the line is masked.
    enum_match = _LEADING_ENUM_RE.match(line)
    if enum_match is None:
        return _mask_numbers(line, placeholder)
    marker = enum_match.group(1)
    remainder = line[len(marker):]
    return marker + _mask_numbers(remainder, placeholder)
def desensitize_content(text: str) -> str:
    """Desensitize the body text of a single chapter.

    Behaviour is driven by three optional attributes read from ``settings``:
    ``DESENSITIZE_ENABLED`` (default ``True``), ``DESENSITIZE_PLACEHOLDER``
    (default ``"X"``; an empty value also falls back to ``"X"``) and
    ``DESENSITIZE_MASK_TABLE_NUMBERS`` (default ``True``).

    Args:
        text: Chapter body; may span multiple lines.

    Returns:
        *text* unchanged when it is empty or desensitization is disabled;
        otherwise the text with every line processed by
        ``_desensitize_line``. Line terminators, including a trailing one and
        ``\\r\\n`` pairs, are preserved exactly as they appear in the input, so
        the only differences from the input are the masked numbers.
    """
    if not text:
        return text
    if not bool(getattr(settings, "DESENSITIZE_ENABLED", True)):
        return text

    placeholder = str(getattr(settings, "DESENSITIZE_PLACEHOLDER", "X") or "X")
    mask_table = bool(getattr(settings, "DESENSITIZE_MASK_TABLE_NUMBERS", True))

    pieces = []
    for raw in text.splitlines(keepends=True):
        # Each piece ends with at most one terminator; splitting it again
        # yields exactly the line body, and whatever follows is the terminator.
        body = raw.splitlines()[0]
        terminator = raw[len(body):]
        pieces.append(_desensitize_line(body, placeholder, mask_table) + terminator)
    return "".join(pieces)
def count_masked_numbers(original: str, filtered: str) -> int:
    """Roughly count how many number runs were masked (intended for logging).

    Args:
        original: Text before desensitization; a falsy value counts as empty.
        filtered: Text after desensitization; a falsy value counts as empty.

    Returns:
        Number of ``_NUMBER_RE`` matches in *original* minus the number of
        matches in *filtered*.
    """
    matches_before = sum(1 for _ in _NUMBER_RE.finditer(original or ""))
    matches_after = sum(1 for _ in _NUMBER_RE.finditer(filtered or ""))
    return matches_before - matches_after