[AC-KB-ENHANCE] feat(kb): 增强 KB 向量日志和元数据过滤功能

- 新增 KB 向量日志配置项 kb_vector_log_enabled 和 kb_vector_log_path
- 新增 KB 向量日志记录器支持滚动日志文件
- 增强 Qdrant 元数据过滤支持操作符格式 ($eq, $in)
- 支持 MatchAny 实现多值匹配
- 新增图片文件索引支持
This commit is contained in:
MerCry 2026-03-11 18:57:27 +08:00
parent 4de2a2aece
commit e9de808969
3 changed files with 174 additions and 9 deletions

View File

@ -10,6 +10,7 @@ import json
import hashlib import hashlib
from dataclasses import dataclass from dataclasses import dataclass
from typing import Annotated, Any, Optional from typing import Annotated, Any, Optional
from logging.handlers import RotatingFileHandler
import tiktoken import tiktoken
from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile
@ -38,6 +39,20 @@ from app.services.metadata_field_definition_service import MetadataFieldDefiniti
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Dedicated logger that records the vector payloads built during indexing.
# It is only wired to a file when `kb_vector_log_enabled` is set, and only if
# the logger has no handler yet, so a handler is attached at most once.
settings = get_settings()
kb_vector_logger = logging.getLogger("kb_vector_payload")
if settings.kb_vector_log_enabled and not kb_vector_logger.handlers:
    from pathlib import Path

    # RotatingFileHandler opens the target file immediately; without this the
    # import fails with FileNotFoundError when the log directory is missing.
    Path(settings.kb_vector_log_path).parent.mkdir(parents=True, exist_ok=True)

    handler = RotatingFileHandler(
        filename=settings.kb_vector_log_path,
        maxBytes=10 * 1024 * 1024,  # rotate once a file reaches 10 MiB
        backupCount=5,
        encoding="utf-8",
    )
    handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    kb_vector_logger.addHandler(handler)
    kb_vector_logger.setLevel(logging.INFO)
    # Keep the payload dumps out of the root logger's handlers.
    kb_vector_logger.propagate = False
router = APIRouter(prefix="/admin/kb", tags=["KB Management"]) router = APIRouter(prefix="/admin/kb", tags=["KB Management"])
@ -661,6 +676,7 @@ async def _index_document(
logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes") logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes")
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"} text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"}
if file_ext in text_extensions or not file_ext: if file_ext in text_extensions or not file_ext:
logger.info("[INDEX] Treating as text file, trying multiple encodings") logger.info("[INDEX] Treating as text file, trying multiple encodings")
@ -676,6 +692,44 @@ async def _index_document(
if text is None: if text is None:
text = content.decode("utf-8", errors="replace") text = content.decode("utf-8", errors="replace")
logger.warning("[INDEX] Failed to decode with known encodings, using utf-8 with replacement") logger.warning("[INDEX] Failed to decode with known encodings, using utf-8 with replacement")
elif file_ext in image_extensions:
logger.info("[INDEX] Image file detected, will parse with multimodal LLM")
await kb_service.update_job_status(
tenant_id, job_id, IndexJobStatus.PROCESSING.value, progress=15
)
await session.commit()
with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
tmp_file.write(content)
tmp_path = tmp_file.name
logger.info(f"[INDEX] Temp file created: {tmp_path}")
try:
from app.services.document.image_parser import ImageParser
logger.info(f"[INDEX] Starting image parsing for {file_ext}...")
image_parser = ImageParser()
image_result = await image_parser.parse_with_chunks(tmp_path)
text = image_result.raw_text
parse_result = type('ParseResult', (), {
'text': text,
'metadata': image_result.metadata,
'pages': None,
'image_chunks': image_result.chunks,
'image_summary': image_result.image_summary,
})()
logger.info(
f"[INDEX] Parsed image SUCCESS: {filename}, "
f"chars={len(text)}, chunks={len(image_result.chunks)}, "
f"summary={image_result.image_summary[:50] if image_result.image_summary else 'N/A'}..."
)
except Exception as e:
logger.error(f"[INDEX] Image parsing error: {type(e).__name__}: {e}")
text = ""
parse_result = None
finally:
Path(tmp_path).unlink(missing_ok=True)
logger.info("[INDEX] Temp file cleaned up")
else: else:
logger.info("[INDEX] Binary file detected, will parse with document parser") logger.info("[INDEX] Binary file detected, will parse with document parser")
await kb_service.update_job_status( await kb_service.update_job_status(
@ -723,13 +777,64 @@ async def _index_document(
) )
await session.commit() await session.commit()
from app.services.metadata_auto_inference_service import MetadataAutoInferenceService
inference_service = MetadataAutoInferenceService(session)
image_base64_for_inference = None
mime_type_for_inference = None
if file_ext in image_extensions:
import base64
image_base64_for_inference = base64.b64encode(content).decode("utf-8")
mime_type_map = {
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
".png": "image/png", ".gif": "image/gif",
".webp": "image/webp", ".bmp": "image/bmp",
".tiff": "image/tiff", ".tif": "image/tiff",
}
mime_type_for_inference = mime_type_map.get(file_ext, "image/jpeg")
logger.info("[INDEX] Starting metadata auto-inference...")
inference_result = await inference_service.infer_metadata(
tenant_id=tenant_id,
content=text or "",
scope="kb_document",
existing_metadata=metadata,
image_base64=image_base64_for_inference,
mime_type=mime_type_for_inference,
)
if inference_result.success:
metadata = inference_result.inferred_metadata
logger.info(
f"[INDEX] Metadata inference SUCCESS: "
f"inferred_fields={list(inference_result.inferred_metadata.keys())}, "
f"confidence_scores={inference_result.confidence_scores}"
)
else:
logger.warning(
f"[INDEX] Metadata inference FAILED: {inference_result.error_message}, "
f"using existing metadata"
)
logger.info("[INDEX] Getting embedding provider...") logger.info("[INDEX] Getting embedding provider...")
embedding_provider = await get_embedding_provider() embedding_provider = await get_embedding_provider()
logger.info(f"[INDEX] Embedding provider: {type(embedding_provider).__name__}") logger.info(f"[INDEX] Embedding provider: {type(embedding_provider).__name__}")
all_chunks: list[TextChunk] = [] all_chunks: list[TextChunk] = []
if parse_result and parse_result.pages: if parse_result and hasattr(parse_result, 'image_chunks') and parse_result.image_chunks:
logger.info(f"[INDEX] Image with {len(parse_result.image_chunks)} intelligent chunks from LLM")
for img_chunk in parse_result.image_chunks:
all_chunks.append(TextChunk(
text=img_chunk.content,
start_token=img_chunk.chunk_index,
end_token=img_chunk.chunk_index + 1,
page=None,
source=filename,
))
logger.info(f"[INDEX] Total chunks from image: {len(all_chunks)}")
elif parse_result and parse_result.pages:
logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata") logger.info(f"[INDEX] PDF with {len(parse_result.pages)} pages, using line-based chunking with page metadata")
for page in parse_result.pages: for page in parse_result.pages:
page_chunks = chunk_text_by_lines( page_chunks = chunk_text_by_lines(
@ -807,6 +912,35 @@ async def _index_document(
await session.commit() await session.commit()
if points: if points:
if settings.kb_vector_log_enabled:
vector_payloads = []
for point in points:
if use_multi_vector:
payload = {
"id": point.get("id"),
"vector": point.get("vector"),
"payload": point.get("payload"),
}
else:
payload = {
"id": point.id,
"vector": point.vector,
"payload": point.payload,
}
vector_payloads.append(payload)
kb_vector_logger.info(json.dumps({
"tenant_id": tenant_id,
"kb_id": kb_id,
"doc_id": doc_id,
"job_id": job_id,
"filename": filename,
"file_ext": file_ext,
"is_image": file_ext in image_extensions,
"metadata": doc_metadata,
"vectors": vector_payloads,
}, ensure_ascii=False))
logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...") logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...")
if use_multi_vector: if use_multi_vector:
await qdrant.upsert_multi_vector(tenant_id, points, kb_id=kb_id) await qdrant.upsert_multi_vector(tenant_id, points, kb_id=kb_id)

View File

@ -23,6 +23,9 @@ class Settings(BaseSettings):
log_level: str = "INFO" log_level: str = "INFO"
kb_vector_log_enabled: bool = False
kb_vector_log_path: str = "logs/kb_vector_payload.log"
llm_provider: str = "openai" llm_provider: str = "openai"
llm_api_key: str = "" llm_api_key: str = ""
llm_base_url: str = "https://api.openai.com/v1" llm_base_url: str = "https://api.openai.com/v1"

View File

@ -492,23 +492,51 @@ class QdrantClient:
构建 Qdrant 过滤条件 构建 Qdrant 过滤条件
Args: Args:
metadata_filter: 元数据过滤条件 {"grade": "三年级", "subject": "语文"} metadata_filter: 元数据过滤条件支持两种格式
- 简单值格式: {"grade": "三年级", "subject": "语文"}
- 操作符格式: {"grade": {"$eq": "三年级"}, "kb_scene": {"$eq": "open_consult"}}
Returns: Returns:
Qdrant Filter 对象 Qdrant Filter 对象
""" """
from qdrant_client.models import FieldCondition, Filter, MatchValue from qdrant_client.models import FieldCondition, Filter, MatchValue, MatchAny
must_conditions = [] must_conditions = []
for key, value in metadata_filter.items(): for key, value in metadata_filter.items():
# 支持嵌套 metadata 字段,如 metadata.grade
field_path = f"metadata.{key}" field_path = f"metadata.{key}"
condition = FieldCondition(
key=field_path, if isinstance(value, dict):
match=MatchValue(value=value), op = list(value.keys())[0] if value else None
) actual_value = value.get(op) if op else None
must_conditions.append(condition)
if op == "$eq" and actual_value is not None:
condition = FieldCondition(
key=field_path,
match=MatchValue(value=actual_value),
)
must_conditions.append(condition)
elif op == "$in" and isinstance(actual_value, list):
condition = FieldCondition(
key=field_path,
match=MatchAny(any=actual_value),
)
must_conditions.append(condition)
else:
logger.warning(
f"[AC-AISVC-16] Unsupported filter operator: {op}, using as direct value"
)
condition = FieldCondition(
key=field_path,
match=MatchValue(value=value),
)
must_conditions.append(condition)
else:
condition = FieldCondition(
key=field_path,
match=MatchValue(value=value),
)
must_conditions.append(condition)
return Filter(must=must_conditions) if must_conditions else None return Filter(must=must_conditions) if must_conditions else None