[AC-AISVC-02, AC-AISVC-16] 多个需求合并 #1
|
|
@ -37,6 +37,42 @@ class TextChunk:
|
|||
source: str | None = None
|
||||
|
||||
|
||||
def chunk_text_by_lines(
    text: str,
    min_line_length: int = 10,
    source: str | None = None,
) -> list[TextChunk]:
    """
    Split text into per-line chunks, one retrieval unit per line.

    Each surviving line becomes its own TextChunk; the line's index in the
    original text is recorded as the chunk's token span (start=i, end=i+1).

    Args:
        text: The text to chunk.
        min_line_length: Lines shorter than this (after stripping) are skipped.
        source: Optional source file path to attach to every chunk.

    Returns:
        A list of TextChunk objects, one per kept line.
    """
    # Walrus binds the stripped line in the filter so it can be reused
    # as the chunk text without stripping twice.
    return [
        TextChunk(
            text=stripped,
            start_token=idx,
            end_token=idx + 1,
            page=None,
            source=source,
        )
        for idx, raw_line in enumerate(text.split('\n'))
        if len(stripped := raw_line.strip()) >= min_line_length
    ]
|
||||
|
||||
|
||||
def chunk_text_with_tiktoken(
|
||||
text: str,
|
||||
chunk_size: int = 512,
|
||||
|
|
@ -318,8 +354,19 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
|
|||
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
|
||||
|
||||
if file_ext in text_extensions or not file_ext:
|
||||
logger.info(f"[INDEX] Treating as text file, decoding with UTF-8")
|
||||
text = content.decode("utf-8", errors="ignore")
|
||||
logger.info(f"[INDEX] Treating as text file, trying multiple encodings")
|
||||
text = None
|
||||
for encoding in ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]:
|
||||
try:
|
||||
text = content.decode(encoding)
|
||||
logger.info(f"[INDEX] Successfully decoded with encoding: {encoding}")
|
||||
break
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
continue
|
||||
|
||||
if text is None:
|
||||
text = content.decode("utf-8", errors="replace")
|
||||
logger.warning(f"[INDEX] Failed to decode with known encodings, using utf-8 with replacement")
|
||||
else:
|
||||
logger.info(f"[INDEX] Binary file detected, will parse with document parser")
|
||||
await kb_service.update_job_status(
|
||||
|
|
@ -374,23 +421,22 @@ async def _index_document(tenant_id: str, job_id: str, doc_id: str, content: byt
|
|||
all_chunks: list[TextChunk] = []
|
||||
|
||||
if parse_result and parse_result.pages:
|
||||
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using tiktoken chunking with page metadata")
|
||||
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata")
|
||||
for page in parse_result.pages:
|
||||
page_chunks = chunk_text_with_tiktoken(
|
||||
page_chunks = chunk_text_by_lines(
|
||||
page.text,
|
||||
chunk_size=512,
|
||||
overlap=100,
|
||||
page=page.page,
|
||||
min_line_length=10,
|
||||
source=filename,
|
||||
)
|
||||
for pc in page_chunks:
|
||||
pc.page = page.page
|
||||
all_chunks.extend(page_chunks)
|
||||
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
|
||||
else:
|
||||
logger.info(f"[INDEX] Using tiktoken chunking without page metadata")
|
||||
all_chunks = chunk_text_with_tiktoken(
|
||||
logger.info(f"[INDEX] Using line-based chunking")
|
||||
all_chunks = chunk_text_by_lines(
|
||||
text,
|
||||
chunk_size=512,
|
||||
overlap=100,
|
||||
min_line_length=10,
|
||||
source=filename,
|
||||
)
|
||||
logger.info(f"[INDEX] Total chunks: {len(all_chunks)}")
|
||||
|
|
|
|||
|
|
@ -15,17 +15,39 @@ from app.services.document.base import (
|
|||
|
||||
# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# Candidate encodings, tried in order when decoding a text file.
# NOTE(review): "latin-1" maps every possible byte, so decoding with it
# never raises — it acts as an always-succeeding last-resort fallback.
ENCODINGS_TO_TRY = ["utf-8", "gbk", "gb2312", "gb18030", "big5", "utf-16", "latin-1"]
|
||||
|
||||
|
||||
class TextParser(DocumentParser):
|
||||
"""
|
||||
Parser for plain text files.
|
||||
[AC-AISVC-33] Direct text extraction.
|
||||
[AC-AISVC-33] Direct text extraction with multiple encoding support.
|
||||
"""
|
||||
|
||||
def __init__(self, encoding: str = "utf-8", **kwargs: Any):
|
||||
self._encoding = encoding
|
||||
self._extra_config = kwargs
|
||||
|
||||
def _try_encodings(self, path: Path) -> tuple[str, str]:
|
||||
"""
|
||||
Try multiple encodings to read the file.
|
||||
Returns: (text, encoding_used)
|
||||
"""
|
||||
for enc in ENCODINGS_TO_TRY:
|
||||
try:
|
||||
with open(path, "r", encoding=enc) as f:
|
||||
text = f.read()
|
||||
logger.info(f"Successfully parsed with encoding: {enc}")
|
||||
return text, enc
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
continue
|
||||
|
||||
raise DocumentParseException(
|
||||
f"Failed to decode file with any known encoding",
|
||||
file_path=str(path),
|
||||
parser="text"
|
||||
)
|
||||
|
||||
def parse(self, file_path: str | Path) -> ParseResult:
|
||||
"""
|
||||
Parse a text file and extract content.
|
||||
|
|
@ -41,15 +63,14 @@ class TextParser(DocumentParser):
|
|||
)
|
||||
|
||||
try:
|
||||
with open(path, "r", encoding=self._encoding) as f:
|
||||
text = f.read()
|
||||
text, encoding_used = self._try_encodings(path)
|
||||
|
||||
file_size = path.stat().st_size
|
||||
line_count = text.count("\n") + 1
|
||||
|
||||
logger.info(
|
||||
f"Parsed text: {path.name}, lines={line_count}, "
|
||||
f"chars={len(text)}, size={file_size}"
|
||||
f"chars={len(text)}, size={file_size}, encoding={encoding_used}"
|
||||
)
|
||||
|
||||
return ParseResult(
|
||||
|
|
@ -59,35 +80,12 @@ class TextParser(DocumentParser):
|
|||
metadata={
|
||||
"format": "text",
|
||||
"line_count": line_count,
|
||||
"encoding": self._encoding,
|
||||
"encoding": encoding_used,
|
||||
}
|
||||
)
|
||||
|
||||
except UnicodeDecodeError:
|
||||
try:
|
||||
with open(path, "r", encoding="gbk") as f:
|
||||
text = f.read()
|
||||
|
||||
file_size = path.stat().st_size
|
||||
line_count = text.count("\n") + 1
|
||||
|
||||
return ParseResult(
|
||||
text=text,
|
||||
source_path=str(path),
|
||||
file_size=file_size,
|
||||
metadata={
|
||||
"format": "text",
|
||||
"line_count": line_count,
|
||||
"encoding": "gbk",
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
raise DocumentParseException(
|
||||
f"Failed to parse text file with encoding fallback: {e}",
|
||||
file_path=str(path),
|
||||
parser="text",
|
||||
details={"error": str(e)}
|
||||
)
|
||||
except DocumentParseException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise DocumentParseException(
|
||||
f"Failed to parse text file: {e}",
|
||||
|
|
|
|||
Loading…
Reference in New Issue