feat: 文档索引优化,支持多编码解码和按行分块 [AC-AISVC-21, AC-AISVC-22]
This commit is contained in:
parent
ac8c33cf94
commit
774744d534
|
|
@ -37,6 +37,42 @@ class TextChunk:
|
||||||
source: str | None = None
|
source: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text_by_lines(
    text: str,
    min_line_length: int = 10,
    source: str | None = None,
) -> list[TextChunk]:
    """
    Split text line by line, each line becoming an independent retrieval unit.

    Args:
        text: The text to split into chunks.
        min_line_length: Minimum line length; lines shorter than this
            (after stripping whitespace) are skipped.
        source: Source file path (optional), recorded on every chunk.

    Returns:
        List of chunks, one per qualifying line of text.  The original
        line index is stored in ``start_token`` / ``end_token``.
    """
    chunks: list[TextChunk] = []

    for line_no, raw_line in enumerate(text.split('\n')):
        stripped = raw_line.strip()

        # Skip blank / too-short lines — they carry no retrievable content.
        if len(stripped) < min_line_length:
            continue

        chunk = TextChunk(
            text=stripped,
            start_token=line_no,
            end_token=line_no + 1,
            page=None,
            source=source,
        )
        chunks.append(chunk)

    return chunks
|
||||||
|
|
||||||
|
|
||||||
def chunk_text_with_tiktoken(
|
def chunk_text_with_tiktoken(
|
||||||
text: str,
|
text: str,
|
||||||
chunk_size: int = 512,
|
chunk_size: int = 512,
|
||||||
|
|
@ -318,8 +354,19 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
|
||||||
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
|
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
|
||||||
|
|
||||||
if file_ext in text_extensions or not file_ext:
|
if file_ext in text_extensions or not file_ext:
|
||||||
logger.info(f"[INDEX] Treating as text file, decoding with UTF-8")
|
logger.info(f"[INDEX] Treating as text file, trying multiple encodings")
|
||||||
text = content.decode("utf-8", errors="ignore")
|
text = None
|
||||||
|
for encoding in ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]:
|
||||||
|
try:
|
||||||
|
text = content.decode(encoding)
|
||||||
|
logger.info(f"[INDEX] Successfully decoded with encoding: {encoding}")
|
||||||
|
break
|
||||||
|
except (UnicodeDecodeError, LookupError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if text is None:
|
||||||
|
text = content.decode("utf-8", errors="replace")
|
||||||
|
logger.warning(f"[INDEX] Failed to decode with known encodings, using utf-8 with replacement")
|
||||||
else:
|
else:
|
||||||
logger.info(f"[INDEX] Binary file detected, will parse with document parser")
|
logger.info(f"[INDEX] Binary file detected, will parse with document parser")
|
||||||
await kb_service.update_job_status(
|
await kb_service.update_job_status(
|
||||||
|
|
@ -374,23 +421,22 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
|
||||||
all_chunks: list[TextChunk] = []
|
all_chunks: list[TextChunk] = []
|
||||||
|
|
||||||
if parse_result and parse_result.pages:
|
if parse_result and parse_result.pages:
|
||||||
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using tiktoken chunking with page metadata")
|
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata")
|
||||||
for page in parse_result.pages:
|
for page in parse_result.pages:
|
||||||
page_chunks = chunk_text_with_tiktoken(
|
page_chunks = chunk_text_by_lines(
|
||||||
page.text,
|
page.text,
|
||||||
chunk_size=512,
|
min_line_length=10,
|
||||||
overlap=100,
|
|
||||||
page=page.page,
|
|
||||||
source=filename,
|
source=filename,
|
||||||
)
|
)
|
||||||
|
for pc in page_chunks:
|
||||||
|
pc.page = page.page
|
||||||
all_chunks.extend(page_chunks)
|
all_chunks.extend(page_chunks)
|
||||||
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
|
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
|
||||||
else:
|
else:
|
||||||
logger.info(f"[INDEX] Using tiktoken chunking without page metadata")
|
logger.info(f"[INDEX] Using line-based chunking")
|
||||||
all_chunks = chunk_text_with_tiktoken(
|
all_chunks = chunk_text_by_lines(
|
||||||
text,
|
text,
|
||||||
chunk_size=512,
|
min_line_length=10,
|
||||||
overlap=100,
|
|
||||||
source=filename,
|
source=filename,
|
||||||
)
|
)
|
||||||
logger.info(f"[INDEX] Total chunks: {len(all_chunks)}")
|
logger.info(f"[INDEX] Total chunks: {len(all_chunks)}")
|
||||||
|
|
|
||||||
|
|
@ -15,17 +15,39 @@ from app.services.document.base import (
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
|
||||||
|
|
||||||
|
|
||||||
class TextParser(DocumentParser):
|
class TextParser(DocumentParser):
|
||||||
"""
|
"""
|
||||||
Parser for plain text files.
|
Parser for plain text files.
|
||||||
[AC-AISVC-33] Direct text extraction.
|
[AC-AISVC-33] Direct text extraction with multiple encoding support.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, encoding: str = "utf-8", **kwargs: Any):
    """
    Create a text parser.

    Args:
        encoding: Preferred text encoding hint for reading files.
        **kwargs: Extra configuration options, stored as-is.
    """
    self._extra_config = kwargs
    self._encoding = encoding
|
||||||
|
|
||||||
|
def _try_encodings(self, path: Path) -> tuple[str, str]:
    """
    Read *path*, trying multiple encodings until one decodes cleanly.

    Fix: the encoding passed to ``__init__`` (``self._encoding``) was
    previously ignored here; it is now tried first, before the generic
    fallback list.  With the default ``"utf-8"`` this is behaviorally
    identical to before, since "utf-8" already heads ENCODINGS_TO_TRY.

    Args:
        path: File to read.

    Returns:
        Tuple of (decoded text, name of the encoding that succeeded).

    Raises:
        DocumentParseException: if no candidate encoding decodes the file.
            NOTE(review): "latin-1" in ENCODINGS_TO_TRY maps every byte
            value, so as long as it stays in the list this branch is
            effectively unreachable for decode failures.
    """
    # Honour the caller-configured encoding first, then the fallbacks
    # (de-duplicated so the same codec is not attempted twice).
    candidates = [self._encoding]
    candidates += [enc for enc in ENCODINGS_TO_TRY if enc != self._encoding]

    for enc in candidates:
        try:
            with open(path, "r", encoding=enc) as f:
                text = f.read()
            logger.info(f"Successfully parsed with encoding: {enc}")
            return text, enc
        except (UnicodeDecodeError, LookupError):
            # LookupError covers an unknown codec name in the candidate list.
            continue

    raise DocumentParseException(
        f"Failed to decode file with any known encoding",
        file_path=str(path),
        parser="text"
    )
|
||||||
|
|
||||||
def parse(self, file_path: str | Path) -> ParseResult:
|
def parse(self, file_path: str | Path) -> ParseResult:
|
||||||
"""
|
"""
|
||||||
Parse a text file and extract content.
|
Parse a text file and extract content.
|
||||||
|
|
@ -41,15 +63,14 @@ class TextParser(DocumentParser):
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(path, "r", encoding=self._encoding) as f:
|
text, encoding_used = self._try_encodings(path)
|
||||||
text = f.read()
|
|
||||||
|
|
||||||
file_size = path.stat().st_size
|
file_size = path.stat().st_size
|
||||||
line_count = text.count("\n") + 1
|
line_count = text.count("\n") + 1
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Parsed text: {path.name}, lines={line_count}, "
|
f"Parsed text: {path.name}, lines={line_count}, "
|
||||||
f"chars={len(text)}, size={file_size}"
|
f"chars={len(text)}, size={file_size}, encoding={encoding_used}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return ParseResult(
|
return ParseResult(
|
||||||
|
|
@ -59,35 +80,12 @@ class TextParser(DocumentParser):
|
||||||
metadata={
|
metadata={
|
||||||
"format": "text",
|
"format": "text",
|
||||||
"line_count": line_count,
|
"line_count": line_count,
|
||||||
"encoding": self._encoding,
|
"encoding": encoding_used,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
except UnicodeDecodeError:
|
except DocumentParseException:
|
||||||
try:
|
raise
|
||||||
with open(path, "r", encoding="gbk") as f:
|
|
||||||
text = f.read()
|
|
||||||
|
|
||||||
file_size = path.stat().st_size
|
|
||||||
line_count = text.count("\n") + 1
|
|
||||||
|
|
||||||
return ParseResult(
|
|
||||||
text=text,
|
|
||||||
source_path=str(path),
|
|
||||||
file_size=file_size,
|
|
||||||
metadata={
|
|
||||||
"format": "text",
|
|
||||||
"line_count": line_count,
|
|
||||||
"encoding": "gbk",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
raise DocumentParseException(
|
|
||||||
f"Failed to parse text file with encoding fallback: {e}",
|
|
||||||
file_path=str(path),
|
|
||||||
parser="text",
|
|
||||||
details={"error": str(e)}
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise DocumentParseException(
|
raise DocumentParseException(
|
||||||
f"Failed to parse text file: {e}",
|
f"Failed to parse text file: {e}",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue