ai-robot-core/ai-service/app/services/mid/segment_humanizer.py

283 lines
8.2 KiB
Python
Raw Normal View History

"""
Segment Humanizer for Mid Platform.
[AC-MARH-10] 分段策略组件语义/长度切分
[AC-MARH-11] delay 策略租户化配置
将文本按语义/长度切分为 segments并生成拟人化 delay
"""
import logging
import re
import uuid
from dataclasses import dataclass, field
from typing import Any
from app.models.mid.schemas import Segment, SegmentStats
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default lower/upper clamp, in milliseconds, applied to every computed delay.
DEFAULT_MIN_DELAY_MS = 50
DEFAULT_MAX_DELAY_MS = 500
# Default segment length bounds, in characters (compared against len(str)).
DEFAULT_SEGMENT_MIN_LENGTH = 10
DEFAULT_SEGMENT_MAX_LENGTH = 200
@dataclass
class HumanizeConfig:
    """Humanization settings: on/off switch, delay clamp and segment length bounds."""

    enabled: bool = True
    min_delay_ms: int = DEFAULT_MIN_DELAY_MS
    max_delay_ms: int = DEFAULT_MAX_DELAY_MS
    length_bucket_strategy: str = "simple"
    segment_min_length: int = DEFAULT_SEGMENT_MIN_LENGTH
    segment_max_length: int = DEFAULT_SEGMENT_MAX_LENGTH

    def to_dict(self) -> dict[str, Any]:
        """Return the settings as a plain dict keyed by field name."""
        exported = (
            "enabled",
            "min_delay_ms",
            "max_delay_ms",
            "length_bucket_strategy",
            "segment_min_length",
            "segment_max_length",
        )
        return {key: getattr(self, key) for key in exported}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HumanizeConfig":
        """Build a config from *data*; keys that are absent take the module defaults."""
        fallbacks: dict[str, Any] = {
            "enabled": True,
            "min_delay_ms": DEFAULT_MIN_DELAY_MS,
            "max_delay_ms": DEFAULT_MAX_DELAY_MS,
            "length_bucket_strategy": "simple",
            "segment_min_length": DEFAULT_SEGMENT_MIN_LENGTH,
            "segment_max_length": DEFAULT_SEGMENT_MAX_LENGTH,
        }
        resolved = {key: data.get(key, fallback) for key, fallback in fallbacks.items()}
        return cls(**resolved)
@dataclass
class LengthBucket:
    """A text-length range and the delay assigned to text falling inside it.

    The range is treated as half-open, ``min_length <= len(text) < max_length``,
    by the delay lookup in this module.
    """

    # Inclusive lower bound of the range, in characters.
    min_length: int
    # Exclusive upper bound of the range, in characters.
    max_length: int
    # Delay for this range, in milliseconds.
    delay_ms: int
# Default length -> delay table: contiguous ranges with the delay growing by
# 100 ms per step. Each row is (min_length, max_length, delay_ms).
DEFAULT_LENGTH_BUCKETS = [
    LengthBucket(min_length=lower, max_length=upper, delay_ms=delay)
    for lower, upper, delay in (
        (0, 20, 100),
        (20, 50, 200),
        (50, 100, 300),
        (100, 200, 400),
        (200, 10000, 500),
    )
]
class SegmentHumanizer:
    """
    [AC-MARH-10, AC-MARH-11] Segment humanization component.

    Features:
    - Split text on semantic (sentence) or length boundaries
    - Attach a human-like delay to every segment except the last
    - Accept a per-call configuration override (tenant settings)
    - Report segment_stats for the produced segments
    """

    # Sentence terminators, ASCII and fullwidth/CJK. The capturing group makes
    # re.split() keep the terminators so they can be re-attached to the sentence.
    _SENTENCE_END_RE = re.compile(r"([。!?!?.]+)")
    # Characters after which a hard length split may be moved to look natural.
    _SOFT_BREAK_CHARS = ",;: ,;:、"
    # How far back from the hard limit a soft break is searched for.
    _SOFT_BREAK_WINDOW = 20

    def __init__(
        self,
        config: HumanizeConfig | None = None,
        length_buckets: list[LengthBucket] | None = None,
    ):
        """
        Args:
            config: Default configuration; a fresh HumanizeConfig when omitted.
            length_buckets: Length -> delay table; DEFAULT_LENGTH_BUCKETS when
                omitted or empty.
        """
        self._config = config or HumanizeConfig()
        self._length_buckets = length_buckets or DEFAULT_LENGTH_BUCKETS

    def humanize(
        self,
        text: str,
        override_config: HumanizeConfig | None = None,
    ) -> tuple[list[Segment], SegmentStats]:
        """
        [AC-MARH-10] Convert text into humanized segments.

        Args:
            text: Input text.
            override_config: Per-call (tenant) configuration; replaces the
                instance configuration for this call when given.

        Returns:
            Tuple of (segments, segment_stats). When humanization is disabled
            the text is returned untouched as a single segment with no delay.
            Text that is empty or whitespace-only yields an empty segment list.
        """
        config = override_config or self._config

        if not config.enabled:
            passthrough = Segment(
                segment_id=str(uuid.uuid4()),
                text=text,
                delay_after=0,
            )
            stats = SegmentStats(
                segment_count=1,
                avg_segment_length=len(text),
                humanize_strategy="disabled",
            )
            return [passthrough], stats

        raw_segments = self._split_text(text, config)
        last_index = len(raw_segments) - 1
        segments = [
            Segment(
                segment_id=str(uuid.uuid4()),
                text=seg_text,
                # Nothing follows the final segment, so it carries no delay.
                delay_after=(
                    0 if index == last_index
                    else self._calculate_delay(seg_text, config)
                ),
            )
            for index, seg_text in enumerate(raw_segments)
        ]

        total_length = sum(len(s.text) for s in segments)
        avg_length = total_length / len(segments) if segments else 0.0
        stats = SegmentStats(
            segment_count=len(segments),
            avg_segment_length=avg_length,
            humanize_strategy=config.length_bucket_strategy,
        )
        logger.info(
            "[AC-MARH-10] Humanized text: segments=%d, avg_length=%.1f, strategy=%s",
            len(segments),
            avg_length,
            config.length_bucket_strategy,
        )
        return segments, stats

    def _split_text(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Dispatch to the splitter selected by ``config.length_bucket_strategy``.

        ``"semantic"`` selects sentence-based splitting; any other value uses
        the paragraph-based splitter.
        """
        if config.length_bucket_strategy == "semantic":
            return self._split_semantic(text, config)
        return self._split_simple(text, config)

    @staticmethod
    def _resolve_max_length(config: HumanizeConfig) -> int:
        """Return a usable ``segment_max_length``.

        A non-positive or non-integer value would make length splitting loop
        forever, so it is replaced by DEFAULT_SEGMENT_MAX_LENGTH (with a warning).
        """
        max_length = config.segment_max_length
        if isinstance(max_length, int) and max_length > 0:
            return max_length
        logger.warning(
            "[AC-MARH-10] Invalid segment_max_length=%r, falling back to %d",
            max_length,
            DEFAULT_SEGMENT_MAX_LENGTH,
        )
        return DEFAULT_SEGMENT_MAX_LENGTH

    def _split_simple(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Simple split: one segment per paragraph (blank-line separated).

        Paragraphs longer than the configured maximum are further cut by
        ``_split_by_length``. Empty paragraphs are dropped.
        """
        max_length = self._resolve_max_length(config)
        segments: list[str] = []
        for paragraph in re.split(r"\n\s*\n", text.strip()):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            if len(paragraph) <= max_length:
                segments.append(paragraph)
            else:
                segments.extend(self._split_by_length(paragraph, max_length))
        return segments

    def _split_semantic(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Semantic split: pack whole sentences into segments.

        Sentences are packed greedily up to the configured maximum length.
        Whitespace between sentences that share a segment is preserved, and a
        single sentence longer than the maximum is cut by ``_split_by_length``
        so no segment exceeds the limit.
        """
        max_length = self._resolve_max_length(config)

        # With a capturing group, re.split() alternates body, terminator, body, ...
        parts = self._SENTENCE_END_RE.split(text.strip())
        sentences: list[str] = []
        for index in range(0, len(parts), 2):
            terminator = parts[index + 1] if index + 1 < len(parts) else ""
            sentence = parts[index] + terminator
            if sentence.strip():
                sentences.append(sentence)

        segments: list[str] = []
        current = ""
        for sentence in sentences:
            # A segment never starts with the whitespace that separated it
            # from the previous sentence.
            candidate = current + sentence if current else sentence.lstrip()
            if len(candidate) <= max_length:
                current = candidate
                continue

            if current.strip():
                segments.append(current.strip())
            piece = sentence.strip()
            if len(piece) > max_length:
                # One sentence alone is over the limit: fall back to length cuts.
                segments.extend(self._split_by_length(piece, max_length))
                current = ""
            else:
                current = piece

        if current.strip():
            segments.append(current.strip())
        return segments

    def _split_by_length(
        self,
        text: str,
        max_length: int,
    ) -> list[str]:
        """Cut *text* into pieces of at most *max_length* characters.

        Each cut is moved back (by up to ``_SOFT_BREAK_WINDOW`` characters) to
        just after the nearest punctuation/space when one is available.

        Raises:
            ValueError: If *max_length* is not positive (the cut position
                could never advance).
        """
        if max_length < 1:
            raise ValueError("max_length must be a positive integer")

        window_floor = max(0, max_length - self._SOFT_BREAK_WINDOW)
        segments: list[str] = []
        remaining = text
        while len(remaining) > max_length:
            split_pos = max_length
            for i in range(max_length - 1, window_floor, -1):
                if remaining[i] in self._SOFT_BREAK_CHARS:
                    split_pos = i + 1
                    break
            segments.append(remaining[:split_pos].strip())
            remaining = remaining[split_pos:]
        segments.append(remaining.strip())
        # Whitespace-only pieces carry no content; drop them.
        return [s for s in segments if s]

    def _calculate_delay(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> int:
        """[AC-MARH-11] Compute the human-like delay for a segment.

        The delay comes from the first bucket whose half-open range contains
        ``len(text)`` and is clamped to ``[min_delay_ms, max_delay_ms]``.
        Text longer than every bucket uses the delay of the bucket with the
        largest upper bound; a length that falls in a gap between buckets
        gets ``min_delay_ms``.
        """
        text_length = len(text)
        delay = None
        for bucket in self._length_buckets:
            if bucket.min_length <= text_length < bucket.max_length:
                delay = bucket.delay_ms
                break

        if delay is None:
            widest = max(
                self._length_buckets,
                key=lambda b: b.max_length,
                default=None,
            )
            if widest is None or text_length < widest.max_length:
                return config.min_delay_ms
            # Longer than the table covers: treat as the longest known range
            # rather than dropping to the shortest delay.
            delay = widest.delay_ms

        return max(config.min_delay_ms, min(delay, config.max_delay_ms))

    def get_config(self) -> HumanizeConfig:
        """Return the instance's default configuration."""
        return self._config
# Process-wide shared instance, created on first request.
_segment_humanizer: SegmentHumanizer | None = None


def get_segment_humanizer(
    config: HumanizeConfig | None = None,
) -> SegmentHumanizer:
    """Return the shared SegmentHumanizer, creating it on first use.

    *config* is only consulted when the shared instance does not exist yet;
    later calls return the existing instance regardless of the argument.
    """
    global _segment_humanizer
    instance = _segment_humanizer
    if instance is None:
        instance = SegmentHumanizer(config=config)
        _segment_humanizer = instance
    return instance