# Listing metadata captured with this file: 283 lines, 8.2 KiB, Python.
"""
Segment Humanizer for Mid Platform.

[AC-MARH-10] Segmentation strategy component (semantic / length-based splitting).
[AC-MARH-11] Per-tenant configuration of the delay strategy.

Splits text into segments by semantics / length and generates a human-like
delay for each segment.
"""

import logging
import re
import uuid
from dataclasses import asdict, dataclass, field, fields
from typing import Any

from app.models.mid.schemas import Segment, SegmentStats

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default bounds for the delay emitted after a segment (milliseconds, per the
# ``_MS`` suffix).
DEFAULT_MIN_DELAY_MS = 50
DEFAULT_MAX_DELAY_MS = 500
# Default segment length bounds; lengths are compared against ``len(text)``.
DEFAULT_SEGMENT_MIN_LENGTH = 10
DEFAULT_SEGMENT_MAX_LENGTH = 200


@dataclass
class HumanizeConfig:
    """Humanization settings.

    Attributes:
        enabled: When False, ``SegmentHumanizer.humanize`` returns the input
            as a single segment with no delay.
        min_delay_ms: Lower clamp applied to every computed delay.
        max_delay_ms: Upper clamp applied to every computed delay.
        length_bucket_strategy: ``"semantic"`` selects sentence-boundary
            splitting; any other value selects paragraph splitting.
        segment_min_length: Minimum segment length.
            NOTE(review): no code in this module reads this value yet.
        segment_max_length: Length above which a paragraph / sentence group is
            broken into several segments.
    """

    enabled: bool = True
    min_delay_ms: int = DEFAULT_MIN_DELAY_MS
    max_delay_ms: int = DEFAULT_MAX_DELAY_MS
    length_bucket_strategy: str = "simple"
    segment_min_length: int = DEFAULT_SEGMENT_MIN_LENGTH
    segment_max_length: int = DEFAULT_SEGMENT_MAX_LENGTH

    def to_dict(self) -> dict[str, Any]:
        """Return the settings as a plain dict keyed by field name."""
        # asdict() walks the declared fields in order, so the key list can no
        # longer drift from the dataclass definition.
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HumanizeConfig":
        """Build a config from a mapping.

        Args:
            data: Mapping of field name to value. Unknown keys are ignored.
                A missing key, or a key whose value is ``None``, falls back to
                the field default. An empty or ``None`` mapping yields the
                all-default config.

        Returns:
            A new ``HumanizeConfig``.
        """
        if not data:
            return cls()
        # ``is not None`` (rather than truthiness) keeps legitimate falsy
        # values such as ``enabled=False`` or a delay of 0.
        overrides = {
            spec.name: data[spec.name]
            for spec in fields(cls)
            if data.get(spec.name) is not None
        }
        return cls(**overrides)


@dataclass
class LengthBucket:
    """A text-length range and the delay that goes with it.

    ``SegmentHumanizer._calculate_delay`` treats the range as half-open:
    a text matches when ``min_length <= len(text) < max_length``.
    """

    min_length: int  # inclusive lower bound
    max_length: int  # exclusive upper bound
    delay_ms: int  # delay for texts in this range, before min/max clamping


# Default length -> delay table: contiguous ranges, delay growing with length.
DEFAULT_LENGTH_BUCKETS = [
    LengthBucket(min_length=0, max_length=20, delay_ms=100),
    LengthBucket(min_length=20, max_length=50, delay_ms=200),
    LengthBucket(min_length=50, max_length=100, delay_ms=300),
    LengthBucket(min_length=100, max_length=200, delay_ms=400),
    LengthBucket(min_length=200, max_length=10000, delay_ms=500),
]


class SegmentHumanizer:
    """
    [AC-MARH-10, AC-MARH-11] Segment humanization component.

    Features:
    - Splits text by semantics / length
    - Generates a human-like delay per segment
    - Supports per-tenant config overrides
    - Emits segment_stats
    """

    # Runs of sentence terminators, full-width CJK and ASCII. The capture
    # group makes ``split`` keep the terminators in its result.
    _SENTENCE_END_RE = re.compile(r"([。！？.!?]+)")
    # Characters after which an over-long text may be broken (ASCII and
    # full-width comma / semicolon / colon, plus the space).
    _SOFT_BREAK_CHARS = ",，;；:： "
    # How far back from the length limit a soft break is searched for.
    _SOFT_BREAK_WINDOW = 20

    def __init__(
        self,
        config: "HumanizeConfig | None" = None,
        length_buckets: "list[LengthBucket] | None" = None,
    ):
        """
        Args:
            config: Default settings; a default ``HumanizeConfig`` when omitted.
            length_buckets: Length -> delay table; ``DEFAULT_LENGTH_BUCKETS``
                when omitted or empty.
        """
        self._config = config or HumanizeConfig()
        self._length_buckets = length_buckets or DEFAULT_LENGTH_BUCKETS

    def humanize(
        self,
        text: str,
        override_config: "HumanizeConfig | None" = None,
    ) -> "tuple[list[Segment], SegmentStats]":
        """
        [AC-MARH-10] Convert text into humanized segments.

        Args:
            text: Input text.
            override_config: Tenant override; replaces the instance config
                for this call only.

        Returns:
            Tuple of (segments, segment_stats). The last segment always has
            ``delay_after=0``. When the config is disabled the whole text is
            returned as one zero-delay segment.
        """
        config = override_config or self._config

        if not config.enabled:
            passthrough = Segment(
                segment_id=str(uuid.uuid4()),
                text=text,
                delay_after=0,
            )
            stats = SegmentStats(
                segment_count=1,
                avg_segment_length=len(text),
                humanize_strategy="disabled",
            )
            return [passthrough], stats

        raw_segments = self._split_text(text, config)
        last_index = len(raw_segments) - 1
        segments = [
            Segment(
                segment_id=str(uuid.uuid4()),
                text=seg_text,
                # Nothing follows the final segment, so it needs no pause.
                delay_after=(
                    0 if index == last_index
                    else self._calculate_delay(seg_text, config)
                ),
            )
            for index, seg_text in enumerate(raw_segments)
        ]

        total_length = sum(len(seg_text) for seg_text in raw_segments)
        avg_length = total_length / len(segments) if segments else 0.0

        stats = SegmentStats(
            segment_count=len(segments),
            avg_segment_length=avg_length,
            humanize_strategy=config.length_bucket_strategy,
        )

        logger.info(
            "[AC-MARH-10] Humanized text: segments=%d, avg_length=%.1f, strategy=%s",
            len(segments),
            avg_length,
            config.length_bucket_strategy,
        )

        return segments, stats

    def _split_text(
        self,
        text: str,
        config: "HumanizeConfig",
    ) -> list[str]:
        """Split text with the strategy named by ``length_bucket_strategy``.

        ``"semantic"`` uses sentence boundaries; every other value uses
        paragraph splitting.
        """
        if config.length_bucket_strategy == "semantic":
            return self._split_semantic(text, config)
        return self._split_simple(text, config)

    def _split_simple(
        self,
        text: str,
        config: "HumanizeConfig",
    ) -> list[str]:
        """Simple split: one segment per blank-line-separated paragraph.

        Paragraphs longer than ``segment_max_length`` are broken further by
        ``_split_by_length``. Empty paragraphs are dropped, so empty or
        whitespace-only text yields an empty list.
        """
        segments: list[str] = []
        for paragraph in re.split(r"\n\s*\n", text.strip()):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            if len(paragraph) <= config.segment_max_length:
                segments.append(paragraph)
            else:
                segments.extend(
                    self._split_by_length(paragraph, config.segment_max_length)
                )
        return segments

    def _split_semantic(
        self,
        text: str,
        config: "HumanizeConfig",
    ) -> list[str]:
        """Semantic split: break on sentence boundaries, then pack greedily.

        Consecutive sentences are packed into one segment while the segment
        stays within ``segment_max_length``. A single sentence that exceeds
        the limit on its own is broken by ``_split_by_length``.
        """
        max_length = config.segment_max_length
        pieces = self._SENTENCE_END_RE.split(text.strip())

        # Re-attach each terminator run to the sentence it closes. Whitespace
        # between sentences is kept here and only trimmed at segment edges, so
        # "Hello. World." is not glued into "Hello.World.".
        sentences: list[str] = []
        pending = ""
        for piece in pieces:
            pending += piece
            if self._SENTENCE_END_RE.fullmatch(piece):
                sentences.append(pending)
                pending = ""
        if pending.strip():
            sentences.append(pending)

        segments: list[str] = []
        buffer = ""
        for sentence in sentences:
            if len((buffer + sentence).strip()) <= max_length:
                buffer += sentence
                continue

            # The sentence does not fit: flush what has been packed so far.
            if buffer.strip():
                segments.append(buffer.strip())

            trimmed = sentence.strip()
            if len(trimmed) > max_length:
                # One sentence over the limit by itself: fall back to
                # length-based breaking, as the simple strategy does.
                segments.extend(self._split_by_length(trimmed, max_length))
                buffer = ""
            else:
                buffer = sentence

        if buffer.strip():
            segments.append(buffer.strip())

        return segments

    def _split_by_length(
        self,
        text: str,
        max_length: int,
    ) -> list[str]:
        """Break text into pieces of at most ``max_length`` characters.

        For each piece the code looks backwards from the limit, within about
        ``_SOFT_BREAK_WINDOW`` characters, for a soft-break character and cuts
        right after it; with none found it cuts exactly at the limit.

        Args:
            text: Text to break.
            max_length: Maximum piece length. A value of 0 or less cannot
                make progress, so the text is returned unsplit.

        Returns:
            The stripped, non-empty pieces in order.
        """
        if max_length <= 0:
            # A zero/negative limit would never shorten ``remaining``.
            whole = text.strip()
            return [whole] if whole else []

        window_floor = max(0, max_length - self._SOFT_BREAK_WINDOW)
        pieces: list[str] = []
        remaining = text

        while len(remaining) > max_length:
            split_pos = max_length
            for index in range(max_length - 1, window_floor, -1):
                if remaining[index] in self._SOFT_BREAK_CHARS:
                    split_pos = index + 1
                    break
            pieces.append(remaining[:split_pos].strip())
            remaining = remaining[split_pos:]

        pieces.append(remaining.strip())
        return [piece for piece in pieces if piece]

    def _calculate_delay(
        self,
        text: str,
        config: "HumanizeConfig",
    ) -> int:
        """[AC-MARH-11] Compute the humanized delay for a segment.

        The delay comes from the first bucket whose half-open range contains
        ``len(text)``. If none does, the bucket with the greatest
        ``max_length`` not exceeding the text length is used, so text longer
        than every bucket gets the longest range's delay instead of the
        minimum. The result is clamped to
        ``[config.min_delay_ms, config.max_delay_ms]``.

        Returns:
            The delay, or ``config.min_delay_ms`` when no bucket applies.
        """
        text_length = len(text)

        chosen = None
        for bucket in self._length_buckets:
            if bucket.min_length <= text_length < bucket.max_length:
                chosen = bucket
                break

        if chosen is None:
            shorter = [
                bucket for bucket in self._length_buckets
                if bucket.max_length <= text_length
            ]
            if not shorter:
                return config.min_delay_ms
            chosen = max(shorter, key=lambda bucket: bucket.max_length)

        return max(config.min_delay_ms, min(chosen.delay_ms, config.max_delay_ms))

    def get_config(self) -> "HumanizeConfig":
        """Return the instance's current config."""
        return self._config


# Module-wide SegmentHumanizer, created on first use by get_segment_humanizer().
_segment_humanizer: SegmentHumanizer | None = None


def get_segment_humanizer(
    config: HumanizeConfig | None = None,
) -> SegmentHumanizer:
    """Return the shared SegmentHumanizer, creating it on first call.

    Args:
        config: Settings for the instance. Only the call that creates the
            instance uses it; once an instance exists the argument is
            ignored and the existing instance is returned unchanged.

    Returns:
        The module-wide ``SegmentHumanizer``.
    """
    global _segment_humanizer
    if _segment_humanizer is None:
        _segment_humanizer = SegmentHumanizer(config=config)
    return _segment_humanizer