feat: 文档索引优化，支持多编码解码和按行分块 [AC-AISVC-21, AC-AISVC-22]

This commit is contained in:
MerCry 2026-02-25 23:09:24 +08:00
parent ac8c33cf94
commit 774744d534
2 changed files with 85 additions and 41 deletions

View File

@ -37,6 +37,42 @@ class TextChunk:
source: str | None = None
def chunk_text_by_lines(
    text: str,
    min_line_length: int = 10,
    source: str | None = None,
) -> list[TextChunk]:
    """
    Split text into line-based chunks; each line becomes an independent
    retrieval unit.

    Args:
        text: The text to split into chunks.
        min_line_length: Minimum line length; lines shorter than this
            (after stripping surrounding whitespace) are skipped.
        source: Optional source file path recorded on every chunk.

    Returns:
        A list of chunks, one per qualifying line. ``start_token`` /
        ``end_token`` carry the line's index in the original text rather
        than real token offsets.
    """
    return [
        TextChunk(
            text=stripped,
            start_token=idx,
            end_token=idx + 1,
            page=None,
            source=source,
        )
        for idx, raw_line in enumerate(text.split('\n'))
        # Walrus keeps the stripped line available for both the length
        # guard and the chunk body in a single pass.
        if len(stripped := raw_line.strip()) >= min_line_length
    ]
def chunk_text_with_tiktoken(
text: str,
chunk_size: int = 512,
@ -318,8 +354,19 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
if file_ext in text_extensions or not file_ext:
logger.info(f"[INDEX] Treating as text file, decoding with UTF-8")
text = content.decode("utf-8", errors="ignore")
logger.info(f"[INDEX] Treating as text file, trying multiple encodings")
text = None
for encoding in ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]:
try:
text = content.decode(encoding)
logger.info(f"[INDEX] Successfully decoded with encoding: {encoding}")
break
except (UnicodeDecodeError, LookupError):
continue
if text is None:
text = content.decode("utf-8", errors="replace")
logger.warning(f"[INDEX] Failed to decode with known encodings, using utf-8 with replacement")
else:
logger.info(f"[INDEX] Binary file detected, will parse with document parser")
await kb_service.update_job_status(
@ -374,23 +421,22 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
all_chunks: list[TextChunk] = []
if parse_result and parse_result.pages:
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using tiktoken chunking with page metadata")
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata")
for page in parse_result.pages:
page_chunks = chunk_text_with_tiktoken(
page_chunks = chunk_text_by_lines(
page.text,
chunk_size=512,
overlap=100,
page=page.page,
min_line_length=10,
source=filename,
)
for pc in page_chunks:
pc.page = page.page
all_chunks.extend(page_chunks)
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
else:
logger.info(f"[INDEX] Using tiktoken chunking without page metadata")
all_chunks = chunk_text_with_tiktoken(
logger.info(f"[INDEX] Using line-based chunking")
all_chunks = chunk_text_by_lines(
text,
chunk_size=512,
overlap=100,
min_line_length=10,
source=filename,
)
logger.info(f"[INDEX] Total chunks: {len(all_chunks)}")

View File

@ -15,17 +15,39 @@ from app.services.document.base import (
logger = logging.getLogger(__name__)
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
class TextParser(DocumentParser):
"""
Parser for plain text files.
[AC-AISVC-33] Direct text extraction.
[AC-AISVC-33] Direct text extraction with multiple encoding support.
"""
def __init__(self, encoding: str = "utf-8", **kwargs: Any):
self._encoding = encoding
self._extra_config = kwargs
def _try_encodings(self, path: Path) -> tuple[str, str]:
    """
    Read the file, attempting each candidate encoding in order.

    Args:
        path: Path of the text file to read.

    Returns:
        ``(text, encoding_used)`` for the first encoding in
        ``ENCODINGS_TO_TRY`` that decodes the file without error.

    Raises:
        DocumentParseException: If no candidate encoding can decode the
            file. NOTE(review): ``latin-1`` maps every byte, so with the
            default candidate list this branch looks unreachable —
            confirm before relying on it.
    """
    for candidate in ENCODINGS_TO_TRY:
        try:
            with open(path, "r", encoding=candidate) as handle:
                decoded = handle.read()
        except (UnicodeDecodeError, LookupError):
            # Wrong encoding (or unknown codec name) — move on to the
            # next candidate.
            continue
        logger.info(f"Successfully parsed with encoding: {candidate}")
        return decoded, candidate
    raise DocumentParseException(
        f"Failed to decode file with any known encoding",
        file_path=str(path),
        parser="text"
    )
def parse(self, file_path: str | Path) -> ParseResult:
"""
Parse a text file and extract content.
@ -41,15 +63,14 @@ class TextParser(DocumentParser):
)
try:
with open(path, "r", encoding=self._encoding) as f:
text = f.read()
text, encoding_used = self._try_encodings(path)
file_size = path.stat().st_size
line_count = text.count("\n") + 1
logger.info(
f"Parsed text: {path.name}, lines={line_count}, "
f"chars={len(text)}, size={file_size}"
f"chars={len(text)}, size={file_size}, encoding={encoding_used}"
)
return ParseResult(
@ -59,35 +80,12 @@ class TextParser(DocumentParser):
metadata={
"format": "text",
"line_count": line_count,
"encoding": self._encoding,
"encoding": encoding_used,
}
)
except UnicodeDecodeError:
try:
with open(path, "r", encoding="gbk") as f:
text = f.read()
file_size = path.stat().st_size
line_count = text.count("\n") + 1
return ParseResult(
text=text,
source_path=str(path),
file_size=file_size,
metadata={
"format": "text",
"line_count": line_count,
"encoding": "gbk",
}
)
except Exception as e:
raise DocumentParseException(
f"Failed to parse text file with encoding fallback: {e}",
file_path=str(path),
parser="text",
details={"error": str(e)}
)
except DocumentParseException:
raise
except Exception as e:
raise DocumentParseException(
f"Failed to parse text file: {e}",