diff --git a/ai-service/app/api/admin/kb.py b/ai-service/app/api/admin/kb.py
index 6dbdb25..e150d72 100644
--- a/ai-service/app/api/admin/kb.py
+++ b/ai-service/app/api/admin/kb.py
@@ -37,6 +37,42 @@ class TextChunk:
     source: str | None = None
 
 
+def chunk_text_by_lines(
+    text: str,
+    min_line_length: int = 10,
+    source: str | None = None,
+) -> list[TextChunk]:
+    """
+    按行分块,每行作为一个独立的检索单元。
+
+    Args:
+        text: 要分块的文本
+        min_line_length: 最小行长度,低于此长度的行会被跳过
+        source: 来源文件路径(可选)
+
+    Returns:
+        分块列表,每个块对应一行文本
+    """
+    lines = text.split('\n')
+    chunks: list[TextChunk] = []
+
+    for i, line in enumerate(lines):
+        line = line.strip()
+
+        if len(line) < min_line_length:
+            continue
+
+        chunks.append(TextChunk(
+            text=line,
+            start_token=i,
+            end_token=i + 1,
+            page=None,
+            source=source,
+        ))
+
+    return chunks
+
+
 def chunk_text_with_tiktoken(
     text: str,
     chunk_size: int = 512,
@@ -318,8 +354,19 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
     text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
 
     if file_ext in text_extensions or not file_ext:
-        logger.info(f"[INDEX] Treating as text file, decoding with UTF-8")
-        text = content.decode("utf-8", errors="ignore")
+        logger.info(f"[INDEX] Treating as text file, trying multiple encodings")
+        text = None
+        for encoding in ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]:
+            try:
+                text = content.decode(encoding)
+                logger.info(f"[INDEX] Successfully decoded with encoding: {encoding}")
+                break
+            except (UnicodeDecodeError, LookupError):
+                continue
+
+        if text is None:
+            text = content.decode("utf-8", errors="replace")
+            logger.warning(f"[INDEX] Failed to decode with known encodings, using utf-8 with replacement")
     else:
         logger.info(f"[INDEX] Binary file detected, will parse with document parser")
         await kb_service.update_job_status(
@@ -374,23 +421,22 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
     all_chunks: list[TextChunk] = []
 
     if parse_result and parse_result.pages:
-        logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using tiktoken chunking with page metadata")
+        logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata")
         for page in parse_result.pages:
-            page_chunks = chunk_text_with_tiktoken(
+            page_chunks = chunk_text_by_lines(
                 page.text,
-                chunk_size=512,
-                overlap=100,
-                page=page.page,
+                min_line_length=10,
                 source=filename,
             )
+            for pc in page_chunks:
+                pc.page = page.page
             all_chunks.extend(page_chunks)
         logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
     else:
-        logger.info(f"[INDEX] Using tiktoken chunking without page metadata")
-        all_chunks = chunk_text_with_tiktoken(
+        logger.info(f"[INDEX] Using line-based chunking")
+        all_chunks = chunk_text_by_lines(
             text,
-            chunk_size=512,
-            overlap=100,
+            min_line_length=10,
             source=filename,
         )
     logger.info(f"[INDEX] Total chunks: {len(all_chunks)}")
diff --git a/ai-service/app/services/document/text_parser.py b/ai-service/app/services/document/text_parser.py
index 3af0c00..551b712 100644
--- a/ai-service/app/services/document/text_parser.py
+++ b/ai-service/app/services/document/text_parser.py
@@ -15,17 +15,39 @@ from app.services.document.base import (
 logger = logging.getLogger(__name__)
 
+ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
+
 
 class TextParser(DocumentParser):
     """
     Parser for plain text files.
 
-    [AC-AISVC-33] Direct text extraction.
+    [AC-AISVC-33] Direct text extraction with multiple encoding support.
     """
 
     def __init__(self, encoding: str = "utf-8", **kwargs: Any):
        self._encoding = encoding
         self._extra_config = kwargs
 
+    def _try_encodings(self, path: Path) -> tuple[str, str]:
+        """
+        Try multiple encodings to read the file.
+        Returns: (text, encoding_used)
+        """
+        for enc in ENCODINGS_TO_TRY:
+            try:
+                with open(path, "r", encoding=enc) as f:
+                    text = f.read()
+                logger.info(f"Successfully parsed with encoding: {enc}")
+                return text, enc
+            except (UnicodeDecodeError, LookupError):
+                continue
+
+        raise DocumentParseException(
+            f"Failed to decode file with any known encoding",
+            file_path=str(path),
+            parser="text"
+        )
+
     def parse(self, file_path: str | Path) -> ParseResult:
         """
         Parse a text file and extract content.
@@ -41,15 +63,14 @@ class TextParser(DocumentParser):
             )
 
         try:
-            with open(path, "r", encoding=self._encoding) as f:
-                text = f.read()
+            text, encoding_used = self._try_encodings(path)
 
             file_size = path.stat().st_size
             line_count = text.count("\n") + 1
 
             logger.info(
                 f"Parsed text: {path.name}, lines={line_count}, "
-                f"chars={len(text)}, size={file_size}"
+                f"chars={len(text)}, size={file_size}, encoding={encoding_used}"
             )
 
             return ParseResult(
@@ -59,35 +80,12 @@ class TextParser(DocumentParser):
                 metadata={
                     "format": "text",
                     "line_count": line_count,
-                    "encoding": self._encoding,
+                    "encoding": encoding_used,
                 }
             )
-        except UnicodeDecodeError:
-            try:
-                with open(path, "r", encoding="gbk") as f:
-                    text = f.read()
-
-                file_size = path.stat().st_size
-                line_count = text.count("\n") + 1
-
-                return ParseResult(
-                    text=text,
-                    source_path=str(path),
-                    file_size=file_size,
-                    metadata={
-                        "format": "text",
-                        "line_count": line_count,
-                        "encoding": "gbk",
-                    }
-                )
-            except Exception as e:
-                raise DocumentParseException(
-                    f"Failed to parse text file with encoding fallback: {e}",
-                    file_path=str(path),
-                    parser="text",
-                    details={"error": str(e)}
-                )
+        except DocumentParseException:
+            raise
         except Exception as e:
             raise DocumentParseException(
                 f"Failed to parse text file: {e}",