""" Segment Humanizer for Mid Platform. [AC-MARH-10] 分段策略组件(语义/长度切分)。 [AC-MARH-11] delay 策略租户化配置。 将文本按语义/长度切分为 segments,并生成拟人化 delay。 """ import logging import re import uuid from dataclasses import dataclass, field from typing import Any from app.models.mid.schemas import Segment, SegmentStats logger = logging.getLogger(__name__) DEFAULT_MIN_DELAY_MS = 50 DEFAULT_MAX_DELAY_MS = 500 DEFAULT_SEGMENT_MIN_LENGTH = 10 DEFAULT_SEGMENT_MAX_LENGTH = 200 @dataclass class HumanizeConfig: """拟人化配置。""" enabled: bool = True min_delay_ms: int = DEFAULT_MIN_DELAY_MS max_delay_ms: int = DEFAULT_MAX_DELAY_MS length_bucket_strategy: str = "simple" segment_min_length: int = DEFAULT_SEGMENT_MIN_LENGTH segment_max_length: int = DEFAULT_SEGMENT_MAX_LENGTH def to_dict(self) -> dict[str, Any]: return { "enabled": self.enabled, "min_delay_ms": self.min_delay_ms, "max_delay_ms": self.max_delay_ms, "length_bucket_strategy": self.length_bucket_strategy, "segment_min_length": self.segment_min_length, "segment_max_length": self.segment_max_length, } @classmethod def from_dict(cls, data: dict[str, Any]) -> "HumanizeConfig": return cls( enabled=data.get("enabled", True), min_delay_ms=data.get("min_delay_ms", DEFAULT_MIN_DELAY_MS), max_delay_ms=data.get("max_delay_ms", DEFAULT_MAX_DELAY_MS), length_bucket_strategy=data.get("length_bucket_strategy", "simple"), segment_min_length=data.get("segment_min_length", DEFAULT_SEGMENT_MIN_LENGTH), segment_max_length=data.get("segment_max_length", DEFAULT_SEGMENT_MAX_LENGTH), ) @dataclass class LengthBucket: """长度区间与对应 delay。""" min_length: int max_length: int delay_ms: int DEFAULT_LENGTH_BUCKETS = [ LengthBucket(min_length=0, max_length=20, delay_ms=100), LengthBucket(min_length=20, max_length=50, delay_ms=200), LengthBucket(min_length=50, max_length=100, delay_ms=300), LengthBucket(min_length=100, max_length=200, delay_ms=400), LengthBucket(min_length=200, max_length=10000, delay_ms=500), ] class SegmentHumanizer: """ [AC-MARH-10, AC-MARH-11] 分段拟人化组件。 Features: - 按语义/长度切分文本 - 生成拟人化 delay - 支持租户配置覆盖 - 输出 segment_stats 统计 """ def __init__( self, config: HumanizeConfig | None = None, length_buckets: list[LengthBucket] | None = None, ): self._config = config or HumanizeConfig() self._length_buckets = length_buckets or DEFAULT_LENGTH_BUCKETS def humanize( self, text: str, override_config: HumanizeConfig | None = None, ) -> tuple[list[Segment], SegmentStats]: """ [AC-MARH-10] 将文本转换为拟人化分段。 Args: text: 输入文本 override_config: 租户覆盖配置 Returns: Tuple of (segments, segment_stats) """ config = override_config or self._config if not config.enabled: segments = [Segment( segment_id=str(uuid.uuid4()), text=text, delay_after=0, )] stats = SegmentStats( segment_count=1, avg_segment_length=len(text), humanize_strategy="disabled", ) return segments, stats raw_segments = self._split_text(text, config) segments = [] for i, seg_text in enumerate(raw_segments): is_last = i == len(raw_segments) - 1 delay_after = 0 if is_last else self._calculate_delay(seg_text, config) segments.append(Segment( segment_id=str(uuid.uuid4()), text=seg_text, delay_after=delay_after, )) total_length = sum(len(s.text) for s in segments) avg_length = total_length / len(segments) if segments else 0.0 stats = SegmentStats( segment_count=len(segments), avg_segment_length=avg_length, humanize_strategy=config.length_bucket_strategy, ) logger.info( f"[AC-MARH-10] Humanized text: segments={len(segments)}, " f"avg_length={avg_length:.1f}, strategy={config.length_bucket_strategy}" ) return segments, stats def _split_text( self, text: str, config: HumanizeConfig, ) -> list[str]: """切分文本。""" if config.length_bucket_strategy == "semantic": return self._split_semantic(text, config) else: return self._split_simple(text, config) def _split_simple( self, text: str, config: HumanizeConfig, ) -> list[str]: """简单切分:按段落。""" paragraphs = re.split(r'\n\s*\n', text.strip()) segments = [] for para in paragraphs: para = para.strip() if not para: continue if len(para) <= config.segment_max_length: segments.append(para) else: sub_segments = self._split_by_length(para, config.segment_max_length) segments.extend(sub_segments) if not segments: segments = [text.strip()] return [s for s in segments if s.strip()] def _split_semantic( self, text: str, config: HumanizeConfig, ) -> list[str]: """语义切分:按句子边界。""" sentence_endings = re.compile(r'([。!?.!?]+)') parts = sentence_endings.split(text.strip()) sentences = [] current = "" for i, part in enumerate(parts): current += part if sentence_endings.match(part): sentences.append(current.strip()) current = "" if current.strip(): sentences.append(current.strip()) if not sentences: sentences = [text.strip()] segments = [] current_segment = "" for sentence in sentences: if len(current_segment) + len(sentence) <= config.segment_max_length: current_segment += sentence else: if current_segment: segments.append(current_segment) current_segment = sentence if current_segment: segments.append(current_segment) return [s for s in segments if s.strip()] def _split_by_length( self, text: str, max_length: int, ) -> list[str]: """按长度切分。""" segments = [] remaining = text while remaining: if len(remaining) <= max_length: segments.append(remaining.strip()) break split_pos = max_length for i in range(max_length - 1, max(0, max_length - 20), -1): if remaining[i] in ',,;;:: ': split_pos = i + 1 break segments.append(remaining[:split_pos].strip()) remaining = remaining[split_pos:] return [s for s in segments if s.strip()] def _calculate_delay( self, text: str, config: HumanizeConfig, ) -> int: """[AC-MARH-11] 计算拟人化 delay。""" text_length = len(text) for bucket in self._length_buckets: if bucket.min_length <= text_length < bucket.max_length: delay = bucket.delay_ms return max(config.min_delay_ms, min(delay, config.max_delay_ms)) return config.min_delay_ms def get_config(self) -> HumanizeConfig: """获取当前配置。""" return self._config _segment_humanizer: SegmentHumanizer | None = None def get_segment_humanizer( config: HumanizeConfig | None = None, ) -> SegmentHumanizer: """获取或创建 SegmentHumanizer 实例。""" global _segment_humanizer if _segment_humanizer is None: _segment_humanizer = SegmentHumanizer(config=config) return _segment_humanizer