feat(AISVC): Excel/CSV parser converts to JSON format for better RAG retrieval [AC-AISVC-35]

- ExcelParser: convert spreadsheet data to JSON records with header as keys
- CSVParser: convert CSV data to JSON records preserving structure
- Add _sheet field to identify worksheet source in Excel output
- Preserve numeric types (int/float/bool) in JSON output
- Support UTF-8 and GBK encoding fallback for CSV files
This commit is contained in:
MerCry 2026-02-25 01:12:07 +08:00
parent f2116b95f2
commit e9fee2f80e
1 changed file with 119 additions and 85 deletions

View File

@ -2,9 +2,11 @@
Excel document parser implementation. Excel document parser implementation.
[AC-AISVC-35] Excel (.xlsx) parsing using openpyxl. [AC-AISVC-35] Excel (.xlsx) parsing using openpyxl.
Extracts text content from Excel spreadsheets. Extracts text content from Excel spreadsheets and converts to JSON format
to preserve structural relationships for better RAG retrieval.
""" """
import json
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -22,6 +24,7 @@ class ExcelParser(DocumentParser):
""" """
Parser for Excel documents. Parser for Excel documents.
[AC-AISVC-35] Uses openpyxl for text extraction. [AC-AISVC-35] Uses openpyxl for text extraction.
Converts spreadsheet data to JSON format to preserve structure.
""" """
def __init__( def __init__(
@ -48,10 +51,48 @@ class ExcelParser(DocumentParser):
) )
return self._openpyxl return self._openpyxl
def _sheet_to_records(self, sheet, sheet_name: str) -> list[dict[str, Any]]:
    """
    Convert a worksheet to a list of record dictionaries.

    The first row is treated as the header (column names); every
    following row becomes one dict keyed by those names, plus a
    ``_sheet`` field identifying the source worksheet.

    Args:
        sheet: An openpyxl worksheet (read via ``iter_rows``).
        sheet_name: Worksheet name, stored in each record's ``_sheet``.

    Returns:
        List of record dicts; empty when the sheet has no rows.
    """
    records: list[dict[str, Any]] = []
    rows = list(sheet.iter_rows(max_row=self._max_rows_per_sheet, values_only=True))
    if not rows:
        return records
    # Header cells that are None or empty fall back to positional
    # names, consistent with CSVParser's header handling.
    header_list = [
        str(h) if h is not None and str(h) != "" else f"column_{i}"
        for i, h in enumerate(rows[0])
    ]
    for row in rows[1:]:
        # NOTE(review): a column literally named "_sheet" would overwrite
        # this marker — assumed not to occur in real data; confirm.
        record: dict[str, Any] = {"_sheet": sheet_name}
        has_content = False
        for i, value in enumerate(row):
            # Rows may be wider than the header; synthesize keys for extras.
            key = header_list[i] if i < len(header_list) else f"column_{i}"
            if value is not None:
                has_content = True
                if isinstance(value, (int, float, bool)):
                    # Preserve JSON-native scalar types.
                    record[key] = value
                else:
                    # Dates, formulas, etc. are stringified.
                    record[key] = str(value)
            elif self._include_empty_cells:
                record[key] = None
        if has_content or self._include_empty_cells:
            records.append(record)
    return records
def parse(self, file_path: str | Path) -> ParseResult: def parse(self, file_path: str | Path) -> ParseResult:
""" """
Parse an Excel document and extract text content. Parse an Excel document and extract text content as JSON.
[AC-AISVC-35] Converts spreadsheet data to structured text. [AC-AISVC-35] Converts spreadsheet data to JSON format.
""" """
path = Path(file_path) path = Path(file_path)
@ -74,53 +115,33 @@ class ExcelParser(DocumentParser):
try: try:
workbook = openpyxl.load_workbook(path, read_only=True, data_only=True) workbook = openpyxl.load_workbook(path, read_only=True, data_only=True)
text_parts = [] all_records: list[dict[str, Any]] = []
sheet_count = len(workbook.sheetnames) sheet_count = len(workbook.sheetnames)
total_rows = 0 total_rows = 0
for sheet_name in workbook.sheetnames: for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name] sheet = workbook[sheet_name]
sheet_text_parts = [] records = self._sheet_to_records(sheet, sheet_name)
row_count = 0 all_records.extend(records)
total_rows += len(records)
for row in sheet.iter_rows(max_row=self._max_rows_per_sheet):
row_values = []
has_content = False
for cell in row:
value = cell.value
if value is not None:
has_content = True
row_values.append(str(value))
elif self._include_empty_cells:
row_values.append("")
else:
row_values.append("")
if has_content or self._include_empty_cells:
sheet_text_parts.append(" | ".join(row_values))
row_count += 1
if sheet_text_parts:
text_parts.append(f"[Sheet: {sheet_name}]\n" + "\n".join(sheet_text_parts))
total_rows += row_count
workbook.close() workbook.close()
full_text = "\n\n".join(text_parts) json_str = json.dumps(all_records, ensure_ascii=False, indent=2)
file_size = path.stat().st_size file_size = path.stat().st_size
logger.info( logger.info(
f"Parsed Excel: {path.name}, sheets={sheet_count}, " f"Parsed Excel (JSON): {path.name}, sheets={sheet_count}, "
f"rows={total_rows}, chars={len(full_text)}, size={file_size}" f"rows={total_rows}, chars={len(json_str)}, size={file_size}"
) )
return ParseResult( return ParseResult(
text=full_text, text=json_str,
source_path=str(path), source_path=str(path),
file_size=file_size, file_size=file_size,
metadata={ metadata={
"format": "xlsx", "format": "xlsx",
"output_format": "json",
"sheet_count": sheet_count, "sheet_count": sheet_count,
"total_rows": total_rows, "total_rows": total_rows,
} }
@ -145,6 +166,7 @@ class CSVParser(DocumentParser):
""" """
Parser for CSV files. Parser for CSV files.
[AC-AISVC-35] Uses Python's built-in csv module. [AC-AISVC-35] Uses Python's built-in csv module.
Converts CSV data to JSON format to preserve structure.
""" """
def __init__(self, delimiter: str = ",", encoding: str = "utf-8", **kwargs: Any): def __init__(self, delimiter: str = ",", encoding: str = "utf-8", **kwargs: Any):
@ -152,13 +174,46 @@ class CSVParser(DocumentParser):
self._encoding = encoding self._encoding = encoding
self._extra_config = kwargs self._extra_config = kwargs
def _parse_csv_to_records(self, path: Path, encoding: str) -> list[dict[str, Any]]:
    """Parse a CSV file into a list of record dictionaries.

    The first row supplies the keys (empty header cells fall back to
    positional ``column_<i>`` names); every later row becomes one dict.
    Empty-string cells are omitted from their record, and rows with no
    non-empty cell at all are dropped entirely.
    """
    import csv

    with open(path, "r", encoding=encoding, newline="") as handle:
        all_rows = list(csv.reader(handle, delimiter=self._delimiter))
    if not all_rows:
        return []

    first, *data_rows = all_rows
    keys = [str(name) if name else f"column_{idx}" for idx, name in enumerate(first)]
    key_count = len(keys)

    result: list[dict[str, Any]] = []
    for data_row in data_rows:
        # Rows wider than the header get synthetic positional keys.
        entry = {
            (keys[idx] if idx < key_count else f"column_{idx}"): cell
            for idx, cell in enumerate(data_row)
            if cell
        }
        if entry:
            result.append(entry)
    return result
def parse(self, file_path: str | Path) -> ParseResult:
"""
Parse a CSV file and extract text content as JSON.
[AC-AISVC-35] Converts CSV data to JSON format.
"""
path = Path(file_path) path = Path(file_path)
if not path.exists(): if not path.exists():
@ -169,56 +224,14 @@ class CSVParser(DocumentParser):
) )
try: try:
text_parts = [] records = self._parse_csv_to_records(path, self._encoding)
row_count = 0 row_count = len(records)
used_encoding = self._encoding
with open(path, "r", encoding=self._encoding, newline="") as f:
reader = csv.reader(f, delimiter=self._delimiter)
for row in reader:
text_parts.append(" | ".join(row))
row_count += 1
full_text = "\n".join(text_parts)
file_size = path.stat().st_size
logger.info(
f"Parsed CSV: {path.name}, rows={row_count}, "
f"chars={len(full_text)}, size={file_size}"
)
return ParseResult(
text=full_text,
source_path=str(path),
file_size=file_size,
metadata={
"format": "csv",
"row_count": row_count,
"delimiter": self._delimiter,
}
)
except UnicodeDecodeError: except UnicodeDecodeError:
try: try:
with open(path, "r", encoding="gbk", newline="") as f: records = self._parse_csv_to_records(path, "gbk")
reader = csv.reader(f, delimiter=self._delimiter) row_count = len(records)
for row in reader: used_encoding = "gbk"
text_parts.append(" | ".join(row))
row_count += 1
full_text = "\n".join(text_parts)
file_size = path.stat().st_size
return ParseResult(
text=full_text,
source_path=str(path),
file_size=file_size,
metadata={
"format": "csv",
"row_count": row_count,
"delimiter": self._delimiter,
"encoding": "gbk",
}
)
except Exception as e: except Exception as e:
raise DocumentParseException( raise DocumentParseException(
f"Failed to parse CSV with encoding fallback: {e}", f"Failed to parse CSV with encoding fallback: {e}",
@ -234,6 +247,27 @@ class CSVParser(DocumentParser):
details={"error": str(e)} details={"error": str(e)}
) )
json_str = json.dumps(records, ensure_ascii=False, indent=2)
file_size = path.stat().st_size
logger.info(
f"Parsed CSV (JSON): {path.name}, rows={row_count}, "
f"chars={len(json_str)}, size={file_size}"
)
return ParseResult(
text=json_str,
source_path=str(path),
file_size=file_size,
metadata={
"format": "csv",
"output_format": "json",
"row_count": row_count,
"delimiter": self._delimiter,
"encoding": used_encoding,
}
)
def get_supported_extensions(self) -> list[str]:
    """Return the file extensions (lowercase, dot-prefixed) this parser accepts."""
    return [".csv"]