81 lines
2.9 KiB
Python
81 lines
2.9 KiB
Python
"""
|
||
services/desensitize_service.py
|
||
章节内容脱敏:把范文正文中的"精确数据"过滤掉,得到可复用的模板化内容。
|
||
|
||
规则(默认):
|
||
- 阿拉伯数字串(含小数、千分位、全角数字)→ 占位符(默认 "X"),
|
||
如 "总投资10.5亿元" → "总投资X亿元"、"2020年3月" → "X年X月"、"85.3%" → "X%"。
|
||
- 标题行(以 # 开头)整行保留,不动章节编号/标题。
|
||
- 行首的列表/枚举序号(如 "1)" "1." "(2)")保留,仅脱敏正文中的数字。
|
||
- 单位与符号(万元/亿元/%/吨/年 等)保留,仅去掉其中的精确数值。
|
||
|
||
可通过 config 调整占位符、是否脱敏表格数字、是否启用。
|
||
中文数字(一二三…)通常用于序数/层级,默认保留。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import re
|
||
|
||
from config import settings
|
||
|
||
logger = logging.getLogger(__name__)

# One digit, ASCII or full-width (U+FF10-U+FF19). Full-width characters are
# written as escapes so the patterns survive tools that normalize full-width
# text to ASCII (which silently collapses a literal class into ASCII only).
_DIGIT = r"[0-9\uFF10-\uFF19]"

# A run of Arabic digits (ASCII or full-width), optionally continued by
# decimal-point / thousands-separator groups: "." "," and their full-width
# forms U+FF0E / U+FF0C.
_NUMBER_RE = re.compile(rf"{_DIGIT}+(?:[.,\uFF0E\uFF0C]{_DIGIT}+)*")

# Enumeration marker at the start of a line: 1) / 1. / (2) / 2、 and the
# full-width bracket / dot variants. These are structural markers and are kept.
# A dot only counts as a marker when it is NOT followed by another digit;
# otherwise a line starting with a decimal such as "10.5亿元" would keep "10."
# and leak the integer part of the value.
_LEADING_ENUM_RE = re.compile(
    rf"^(\s*(?:[(\uFF08]\s*{_DIGIT}+\s*[)\uFF09]"
    rf"|{_DIGIT}+\s*(?:[)\uFF09、]|[.\uFF0E](?!{_DIGIT}))))"
)

# Markdown heading: one to six "#" followed by whitespace.
_HEADING_RE = re.compile(r"^\s*#{1,6}\s")

# Table row: starts and ends with "|" (surrounding whitespace allowed).
_TABLE_ROW_RE = re.compile(r"^\s*\|.*\|\s*$")

# Table separator row such as |---|:--:| (only whitespace, ":", "-" and "|").
_TABLE_SEP_RE = re.compile(r"^\s*\|?[\s:\-|]+\|?\s*$")
|
||
|
||
|
||
def _mask_numbers(segment: str, placeholder: str) -> str:
    """Replace every numeric run in ``segment`` with ``placeholder``.

    Args:
        segment: Text to scan; anything not matched by ``_NUMBER_RE`` is
            returned unchanged.
        placeholder: Literal text substituted for each match.

    Returns:
        ``segment`` with each match of ``_NUMBER_RE`` replaced.
    """
    # Use a callable so the placeholder is inserted verbatim. A string
    # replacement is parsed by ``re.sub`` as a template, so a configured
    # placeholder containing a backslash would be escape-processed or rejected
    # with ``re.error``.
    return _NUMBER_RE.sub(lambda _match: placeholder, segment)
|
||
|
||
|
||
def _desensitize_line(line: str, placeholder: str, mask_table_numbers: bool) -> str:
    """Desensitize a single line of chapter text.

    Args:
        line: The line to process.
        placeholder: Text substituted for each numeric run.
        mask_table_numbers: Whether digits inside table rows are masked.

    Returns:
        The line unchanged when it is a heading, a table separator row, or a
        table row while ``mask_table_numbers`` is false; otherwise the line
        with its digits masked, except for an enumeration marker at the start.
    """
    # Headings are kept whole so section numbers and titles stay intact.
    if _HEADING_RE.match(line):
        return line

    if _TABLE_ROW_RE.match(line):
        # Separator rows (|---|---|) are structure; data rows are masked only
        # when table masking is switched on.
        keep_row = bool(_TABLE_SEP_RE.match(line)) or not mask_table_numbers
        return line if keep_row else _mask_numbers(line, placeholder)

    # Body text: keep the leading enumeration marker (empty when there is
    # none) and mask only what follows it.
    enum_match = _LEADING_ENUM_RE.match(line)
    marker = enum_match.group(1) if enum_match else ""
    return marker + _mask_numbers(line[len(marker):], placeholder)
|
||
|
||
|
||
def desensitize_content(text: str) -> str:
    """Desensitize the body text of a single chapter.

    Reads three optional attributes from ``settings``:
    ``DESENSITIZE_ENABLED`` (default ``True``), ``DESENSITIZE_PLACEHOLDER``
    (default ``"X"``; an empty value also falls back to ``"X"``) and
    ``DESENSITIZE_MASK_TABLE_NUMBERS`` (default ``True``).

    Args:
        text: Chapter body, possibly spanning several lines.

    Returns:
        ``text`` itself when it is empty or when desensitization is disabled;
        otherwise the text with every line passed through
        ``_desensitize_line``. Line terminators, including a trailing one,
        are preserved exactly as they appear in the input.
    """
    if not text:
        return text
    if not bool(getattr(settings, "DESENSITIZE_ENABLED", True)):
        return text

    placeholder = str(getattr(settings, "DESENSITIZE_PLACEHOLDER", "X") or "X")
    mask_table = bool(getattr(settings, "DESENSITIZE_MASK_TABLE_NUMBERS", True))

    # keepends=True keeps each line's own terminator attached. Splitting
    # without it and re-joining with "\n" would drop a trailing newline and
    # rewrite CRLF (and other separators) to LF, altering non-numeric content.
    return "".join(
        _desensitize_line(ln, placeholder, mask_table)
        for ln in text.splitlines(keepends=True)
    )
|
||
|
||
|
||
def count_masked_numbers(original: str, filtered: str) -> int:
    """Roughly count how many numeric runs were masked (for logging).

    Args:
        original: Text before desensitization; a falsy value counts as empty.
        filtered: Text after desensitization; a falsy value counts as empty.

    Returns:
        The number of ``_NUMBER_RE`` matches in ``original`` minus the number
        of matches in ``filtered``.
    """
    before = sum(1 for _ in _NUMBER_RE.finditer(original or ""))
    after = sum(1 for _ in _NUMBER_RE.finditer(filtered or ""))
    return before - after
|