From 774744d5346049d4a050220139a604c9273f1fa8 Mon Sep 17 00:00:00 2001 From: MerCry Date: Wed, 25 Feb 2026 23:09:24 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=96=87=E6=A1=A3=E7=B4=A2=E5=BC=95?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=8C=E6=94=AF=E6=8C=81=E5=A4=9A=E7=BC=96?= =?UTF-8?q?=E7=A0=81=E8=A7=A3=E7=A0=81=E5=92=8C=E6=8C=89=E8=A1=8C=E5=88=86?= =?UTF-8?q?=E5=9D=97=20[AC-AISVC-21,=20AC-AISVC-22]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ai-service/app/api/admin/kb.py | 68 ++++++++++++++++--- .../app/services/document/text_parser.py | 58 ++++++++-------- 2 files changed, 85 insertions(+), 41 deletions(-) diff --git a/ai-service/app/api/admin/kb.py b/ai-service/app/api/admin/kb.py index 6dbdb25..e150d72 100644 --- a/ai-service/app/api/admin/kb.py +++ b/ai-service/app/api/admin/kb.py @@ -37,6 +37,42 @@ class TextChunk: source: str | None = None +def chunk_text_by_lines( + text: str, + min_line_length: int = 10, + source: str | None = None, +) -> list[TextChunk]: + """ + 按行分块,每行作为一个独立的检索单元。 + + Args: + text: 要分块的文本 + min_line_length: 最小行长度,低于此长度的行会被跳过 + source: 来源文件路径(可选) + + Returns: + 分块列表,每个块对应一行文本 + """ + lines = text.split('\n') + chunks: list[TextChunk] = [] + + for i, line in enumerate(lines): + line = line.strip() + + if len(line) < min_line_length: + continue + + chunks.append(TextChunk( + text=line, + start_token=i, + end_token=i + 1, + page=None, + source=source, + )) + + return chunks + + def chunk_text_with_tiktoken( text: str, chunk_size: int = 512, @@ -318,8 +354,19 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"} if file_ext in text_extensions or not file_ext: - logger.info(f"[INDEX] Treating as text file, decoding with UTF-8") - text = content.decode("utf-8", errors="ignore") + logger.info(f"[INDEX] Treating as text file, trying multiple encodings") + text = None + for encoding in ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]: + try: + text = content.decode(encoding) + logger.info(f"[INDEX] Successfully decoded with encoding: {encoding}") + break + except (UnicodeDecodeError, LookupError): + continue + + if text is None: + text = content.decode("utf-8", errors="replace") + logger.warning(f"[INDEX] Failed to decode with known encodings, using utf-8 with replacement") else: logger.info(f"[INDEX] Binary file detected, will parse with document parser") await kb_service.update_job_status( @@ -374,23 +421,22 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt all_chunks: list[TextChunk] = [] if parse_result and parse_result.pages: - logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using tiktoken chunking with page metadata") + logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata") for page in parse_result.pages: - page_chunks = chunk_text_with_tiktoken( + page_chunks = chunk_text_by_lines( page.text, - chunk_size=512, - overlap=100, - page=page.page, + min_line_length=10, source=filename, ) + for pc in page_chunks: + pc.page = page.page all_chunks.extend(page_chunks) logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}") else: - logger.info(f"[INDEX] Using tiktoken chunking without page metadata") - all_chunks = chunk_text_with_tiktoken( + logger.info(f"[INDEX] Using line-based chunking") + all_chunks = chunk_text_by_lines( text, - chunk_size=512, - overlap=100, + min_line_length=10, source=filename, ) logger.info(f"[INDEX] Total chunks: {len(all_chunks)}") diff --git a/ai-service/app/services/document/text_parser.py b/ai-service/app/services/document/text_parser.py index 3af0c00..551b712 100644 --- a/ai-service/app/services/document/text_parser.py +++ b/ai-service/app/services/document/text_parser.py @@ -15,17 +15,39 @@ from app.services.document.base import ( logger = logging.getLogger(__name__) +ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"] + class TextParser(DocumentParser): """ Parser for plain text files. - [AC-AISVC-33] Direct text extraction. + [AC-AISVC-33] Direct text extraction with multiple encoding support. """ def __init__(self, encoding: str = "utf-8", **kwargs: Any): self._encoding = encoding self._extra_config = kwargs + def _try_encodings(self, path: Path) -> tuple[str, str]: + """ + Try multiple encodings to read the file. + Returns: (text, encoding_used) + """ + for enc in ENCODINGS_TO_TRY: + try: + with open(path, "r", encoding=enc) as f: + text = f.read() + logger.info(f"Successfully parsed with encoding: {enc}") + return text, enc + except (UnicodeDecodeError, LookupError): + continue + + raise DocumentParseException( + f"Failed to decode file with any known encoding", + file_path=str(path), + parser="text" + ) + def parse(self, file_path: str | Path) -> ParseResult: """ Parse a text file and extract content. @@ -41,15 +63,14 @@ class TextParser(DocumentParser): ) try: - with open(path, "r", encoding=self._encoding) as f: - text = f.read() + text, encoding_used = self._try_encodings(path) file_size = path.stat().st_size line_count = text.count("\n") + 1 logger.info( f"Parsed text: {path.name}, lines={line_count}, " - f"chars={len(text)}, size={file_size}" + f"chars={len(text)}, size={file_size}, encoding={encoding_used}" ) return ParseResult( @@ -59,35 +80,12 @@ class TextParser(DocumentParser): metadata={ "format": "text", "line_count": line_count, - "encoding": self._encoding, + "encoding": encoding_used, } ) - except UnicodeDecodeError: - try: - with open(path, "r", encoding="gbk") as f: - text = f.read() - - file_size = path.stat().st_size - line_count = text.count("\n") + 1 - - return ParseResult( - text=text, - source_path=str(path), - file_size=file_size, - metadata={ - "format": "text", - "line_count": line_count, - "encoding": "gbk", - } - ) - except Exception as e: - raise DocumentParseException( - f"Failed to parse text file with encoding fallback: {e}", - file_path=str(path), - parser="text", - details={"error": str(e)} - ) + except DocumentParseException: + raise except Exception as e: raise DocumentParseException( f"Failed to parse text file: {e}",