ai-robot-core/ai-service/app/services/mid/segment_humanizer.py

283 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Segment Humanizer for Mid Platform.
[AC-MARH-10] 分段策略组件(语义/长度切分)。
[AC-MARH-11] delay 策略租户化配置。
将文本按语义/长度切分为 segments并生成拟人化 delay。
"""
import logging
import re
import uuid
from dataclasses import dataclass, field
from typing import Any
from app.models.mid.schemas import Segment, SegmentStats
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default lower/upper bounds, in milliseconds, for the delay attached to a
# segment; used as the HumanizeConfig defaults for min_delay_ms / max_delay_ms.
DEFAULT_MIN_DELAY_MS = 50
DEFAULT_MAX_DELAY_MS = 500
# Default segment length bounds, in characters; used as the HumanizeConfig
# defaults for segment_min_length / segment_max_length.
DEFAULT_SEGMENT_MIN_LENGTH = 10
DEFAULT_SEGMENT_MAX_LENGTH = 200
@dataclass
class HumanizeConfig:
    """Settings that control how text is segmented and delayed.

    Instances can be round-tripped through plain dictionaries with
    ``to_dict`` / ``from_dict``.
    """

    # When False, humanize() returns the whole text as one segment with no delay.
    enabled: bool = True
    # Lower and upper clamp (milliseconds) applied to every computed delay.
    min_delay_ms: int = DEFAULT_MIN_DELAY_MS
    max_delay_ms: int = DEFAULT_MAX_DELAY_MS
    # "semantic" selects sentence-boundary splitting; any other value selects
    # the simple paragraph/length splitter.
    length_bucket_strategy: str = "simple"
    # NOTE(review): segment_min_length is stored and serialised, but no code
    # visible in this file reads it - confirm whether it is used elsewhere.
    segment_min_length: int = DEFAULT_SEGMENT_MIN_LENGTH
    # Maximum number of characters a single segment should contain.
    segment_max_length: int = DEFAULT_SEGMENT_MAX_LENGTH

    def to_dict(self) -> dict[str, Any]:
        """Return the six configuration values as a plain dictionary."""
        exported = (
            "enabled",
            "min_delay_ms",
            "max_delay_ms",
            "length_bucket_strategy",
            "segment_min_length",
            "segment_max_length",
        )
        return {key: getattr(self, key) for key in exported}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "HumanizeConfig":
        """Build a config from *data*, using the default for any missing key."""
        fallbacks: dict[str, Any] = {
            "enabled": True,
            "min_delay_ms": DEFAULT_MIN_DELAY_MS,
            "max_delay_ms": DEFAULT_MAX_DELAY_MS,
            "length_bucket_strategy": "simple",
            "segment_min_length": DEFAULT_SEGMENT_MIN_LENGTH,
            "segment_max_length": DEFAULT_SEGMENT_MAX_LENGTH,
        }
        values = {key: data.get(key, default) for key, default in fallbacks.items()}
        return cls(**values)
@dataclass
class LengthBucket:
    """A text-length interval and the delay assigned to text falling inside it."""

    # Inclusive lower bound of the interval, in characters.
    min_length: int
    # Upper bound of the interval, in characters; SegmentHumanizer treats it
    # as exclusive (min_length <= length < max_length).
    max_length: int
    # Delay, in milliseconds, for text whose length falls in the interval.
    delay_ms: int
# Default length -> delay table: contiguous intervals covering 0..10000
# characters, with the delay growing by 100 ms per bucket from 100 to 500 ms.
DEFAULT_LENGTH_BUCKETS = [
    LengthBucket(min_length=lower, max_length=upper, delay_ms=delay)
    for lower, upper, delay in (
        (0, 20, 100),
        (20, 50, 200),
        (50, 100, 300),
        (100, 200, 400),
        (200, 10000, 500),
    )
]
class SegmentHumanizer:
    """
    [AC-MARH-10, AC-MARH-11] Split text into segments with human-like delays.

    Features:
    - splits text by paragraph/length ("simple") or by sentence ("semantic")
    - assigns every non-final segment a delay looked up from length buckets
    - accepts a per-call configuration override (tenant configuration)
    - returns segment statistics alongside the segments
    """

    # Sentence terminators: ideographic full stop, ASCII ". ! ?" and the
    # fullwidth "!" / "?". Non-ASCII characters are spelled as escapes so that
    # Unicode normalisation of this file cannot fold them into ASCII look-alikes.
    _SENTENCE_ENDINGS = re.compile(r"([\u3002.!?\uff01\uff1f]+)")

    # Characters after which a length-based split may be placed: ASCII
    # ", ; :" and space, their fullwidth forms, and the ideographic comma.
    _BREAK_CHARS = frozenset(",;: \uff0c\uff1b\uff1a\u3001")

    # How many characters before the hard limit are searched for a break char.
    _BREAK_LOOKBACK = 20

    def __init__(
        self,
        config: HumanizeConfig | None = None,
        length_buckets: list[LengthBucket] | None = None,
    ):
        """
        Args:
            config: Default configuration; a fresh HumanizeConfig when omitted.
            length_buckets: Length -> delay table; DEFAULT_LENGTH_BUCKETS when
                omitted or empty.
        """
        self._config = config or HumanizeConfig()
        self._length_buckets = length_buckets or DEFAULT_LENGTH_BUCKETS

    def humanize(
        self,
        text: str,
        override_config: HumanizeConfig | None = None,
    ) -> tuple[list[Segment], SegmentStats]:
        """
        [AC-MARH-10] Convert text into delayed segments.

        Args:
            text: Input text.
            override_config: Tenant configuration used instead of the
                instance configuration for this call.

        Returns:
            Tuple of (segments, segment_stats). When humanizing is disabled
            the whole text is returned as a single segment with no delay.

        Raises:
            ValueError: With the simple strategy, if the configured
                segment_max_length is not positive and there is text to split.
        """
        config = override_config or self._config

        if not config.enabled:
            segments = [Segment(
                segment_id=str(uuid.uuid4()),
                text=text,
                delay_after=0,
            )]
            stats = SegmentStats(
                segment_count=1,
                avg_segment_length=len(text),
                humanize_strategy="disabled",
            )
            return segments, stats

        raw_segments = self._split_text(text, config)
        last_index = len(raw_segments) - 1
        # The final segment has nothing after it, so it carries no delay.
        segments = [
            Segment(
                segment_id=str(uuid.uuid4()),
                text=seg_text,
                delay_after=(
                    0 if index == last_index
                    else self._calculate_delay(seg_text, config)
                ),
            )
            for index, seg_text in enumerate(raw_segments)
        ]

        total_length = sum(len(s.text) for s in segments)
        avg_length = total_length / len(segments) if segments else 0.0
        stats = SegmentStats(
            segment_count=len(segments),
            avg_segment_length=avg_length,
            humanize_strategy=config.length_bucket_strategy,
        )
        logger.info(
            "[AC-MARH-10] Humanized text: segments=%d, "
            "avg_length=%.1f, strategy=%s",
            len(segments),
            avg_length,
            config.length_bucket_strategy,
        )
        return segments, stats

    def _split_text(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Split text with the strategy named by config.length_bucket_strategy.

        "semantic" selects sentence splitting; any other value selects the
        simple paragraph splitter.
        """
        if config.length_bucket_strategy == "semantic":
            return self._split_semantic(text, config)
        return self._split_simple(text, config)

    def _split_simple(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Split on blank lines, then by length where a paragraph is too long.

        Returns:
            Non-empty, stripped segments; an empty list for blank text.

        Raises:
            ValueError: If config.segment_max_length is not positive and
                there is text to split.
        """
        max_length = config.segment_max_length
        segments: list[str] = []
        for para in re.split(r"\n\s*\n", text.strip()):
            para = para.strip()
            if not para:
                continue
            if len(para) <= max_length:
                segments.append(para)
            else:
                segments.extend(self._split_by_length(para, max_length))
        return segments

    def _split_semantic(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> list[str]:
        """Split at sentence boundaries, packing sentences up to the max length.

        Consecutive sentences are merged while the result stays within
        config.segment_max_length. Whitespace between merged sentences is
        kept as written. A single sentence longer than the limit is broken
        up by length.

        Returns:
            Non-empty segments; an empty list for blank text.
        """
        stripped = text.strip()
        if not stripped:
            return []
        max_length = config.segment_max_length

        # split() with a capturing group yields text, terminators, text, ...,
        # text (odd length); pair each text run with the terminators after it.
        parts = self._SENTENCE_ENDINGS.split(stripped)
        sentences = [
            body + ending
            for body, ending in zip(parts[0::2], parts[1::2] + [""])
            if (body + ending).strip()
        ]

        segments: list[str] = []
        current = ""
        for sentence in sentences:
            # A sentence keeps its leading whitespace only when it is appended
            # to earlier text; at the start of a segment it is dropped.
            candidate = current + sentence if current else sentence.lstrip()
            if len(candidate) <= max_length:
                current = candidate
                continue
            if current:
                segments.append(current)
            current = sentence.strip()
            # A non-positive limit keeps one sentence per segment rather than
            # attempting a length split that cannot make progress.
            if max_length > 0 and len(current) > max_length:
                *complete, current = self._split_by_length(current, max_length)
                segments.extend(complete)
        if current:
            segments.append(current)
        return segments

    def _split_by_length(
        self,
        text: str,
        max_length: int,
    ) -> list[str]:
        """Cut text into chunks of at most max_length characters.

        Each cut is moved back to just after a separator character when one
        is found shortly before the limit; otherwise the cut is made at the
        limit exactly.

        Raises:
            ValueError: If max_length is less than 1 (no cut could ever make
                progress).
        """
        if max_length < 1:
            raise ValueError(
                f"max_length must be a positive integer, got {max_length!r}"
            )

        lookback_floor = max(0, max_length - self._BREAK_LOOKBACK)
        segments: list[str] = []
        remaining = text
        while len(remaining) > max_length:
            split_pos = max_length
            for i in range(max_length - 1, lookback_floor, -1):
                if remaining[i] in self._BREAK_CHARS:
                    split_pos = i + 1
                    break
            segments.append(remaining[:split_pos].strip())
            remaining = remaining[split_pos:]
        segments.append(remaining.strip())
        return [s for s in segments if s]

    def _calculate_delay(
        self,
        text: str,
        config: HumanizeConfig,
    ) -> int:
        """[AC-MARH-11] Return the delay, in milliseconds, for a segment.

        The delay comes from the first bucket whose half-open interval
        [min_length, max_length) contains the text length. Text longer than
        every bucket uses the delay of the bucket with the largest upper
        bound. The result is clamped to the configured min/max delay. If no
        bucket applies at all, config.min_delay_ms is returned.
        """
        text_length = len(text)
        delay = None
        for bucket in self._length_buckets:
            if bucket.min_length <= text_length < bucket.max_length:
                delay = bucket.delay_ms
                break

        if delay is None:
            longest = max(
                self._length_buckets,
                key=lambda b: b.max_length,
                default=None,
            )
            if longest is None or text_length < longest.max_length:
                return config.min_delay_ms
            delay = longest.delay_ms

        return max(config.min_delay_ms, min(delay, config.max_delay_ms))

    def get_config(self) -> HumanizeConfig:
        """Return the instance's default configuration."""
        return self._config
# Module-wide humanizer instance, created on first use by get_segment_humanizer().
_segment_humanizer: SegmentHumanizer | None = None


def get_segment_humanizer(
    config: HumanizeConfig | None = None,
) -> SegmentHumanizer:
    """Return the shared SegmentHumanizer, creating it on the first call.

    Args:
        config: Configuration for the instance. Only used when the shared
            instance does not exist yet; later calls return the existing
            instance and ignore this argument.
    """
    global _segment_humanizer
    if _segment_humanizer is not None:
        return _segment_humanizer
    _segment_humanizer = SegmentHumanizer(config=config)
    return _segment_humanizer