feat: enhance metadata handling and document processing

- Add metadata field to Document type for frontend
- Add type field to MetadataFieldUpdateRequest
- Update KB API with URL decode support for Chinese filenames
- Enhance metadata auto inference service
- Improve metadata field definition service
- Update .gitignore to exclude logs and snapshots
This commit is contained in:
MerCry 2026-03-12 12:45:54 +08:00
parent 51d8de0621
commit dd1c6aba14
10 changed files with 259 additions and 111 deletions

3
.gitignore vendored
View File

@ -163,7 +163,10 @@ cython_debug/
# Project specific
ai-service/uploads/
ai-service/config/
ai-service/logs/
*.local
qdrant_snapshots/
*.snapshot
/.trae/
/.claude/

View File

@ -34,6 +34,7 @@ export interface Document {
kbId: string
fileName: string
status: string
metadata?: Record<string, any>
jobId?: string
createdAt: string
updatedAt: string

View File

@ -39,6 +39,7 @@ export interface MetadataFieldCreateRequest {
export interface MetadataFieldUpdateRequest {
label?: string
type?: MetadataFieldType
required?: boolean
options?: string[]
default?: string | number | boolean

View File

@ -16,7 +16,7 @@
<input
ref="fileInputRef"
type="file"
accept=".txt,.md,.pdf,.doc,.docx,.xls,.xlsx"
accept=".txt,.md,.markdown,.pdf,.doc,.docx,.xls,.xlsx,.jpg,.jpeg,.png,.gif,.webp,.bmp,.tiff,.tif"
style="display: none"
@change="handleFileSelect"
/>
@ -298,7 +298,10 @@ const handleFileSelect = (event: Event) => {
const file = target.files?.[0]
if (!file) return
const allowedExtensions = ['.txt', '.md', '.pdf', '.doc', '.docx', '.xls', '.xlsx']
const allowedExtensions = [
'.txt', '.md', '.markdown', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.tif'
]
const ext = '.' + file.name.split('.').pop()?.toLowerCase()
if (!allowedExtensions.includes(ext)) {

View File

@ -11,6 +11,7 @@ import hashlib
from dataclasses import dataclass
from typing import Annotated, Any, Optional
from logging.handlers import RotatingFileHandler
from urllib.parse import unquote
import tiktoken
from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile
@ -64,6 +65,7 @@ class TextChunk:
end_token: int
page: int | None = None
source: str | None = None
metadata: dict | None = None
def chunk_text_by_lines(
@ -602,10 +604,14 @@ async def upload_document(
doc_kb_service = KBService(session)
file_content = await file.read()
# URL decode filename to handle Chinese characters
decoded_filename = unquote(file.filename or "unknown")
document, job = await doc_kb_service.upload_document(
tenant_id=tenant_id,
kb_id=kb_id,
file_name=file.filename or "unknown",
file_name=decoded_filename,
file_content=file_content,
file_type=file.content_type,
metadata=metadata_dict,
@ -615,7 +621,7 @@ async def upload_document(
await session.commit()
background_tasks.add_task(
_index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, file.filename, metadata_dict
_index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, decoded_filename, metadata_dict
)
return JSONResponse(
@ -676,6 +682,7 @@ async def _index_document(
logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes")
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
markdown_extensions = {".md", ".markdown"}
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"}
if file_ext in text_extensions or not file_ext:
@ -783,16 +790,6 @@ async def _index_document(
image_base64_for_inference = None
mime_type_for_inference = None
if file_ext in image_extensions:
import base64
image_base64_for_inference = base64.b64encode(content).decode("utf-8")
mime_type_map = {
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
".png": "image/png", ".gif": "image/gif",
".webp": "image/webp", ".bmp": "image/bmp",
".tiff": "image/tiff", ".tif": "image/tiff",
}
mime_type_for_inference = mime_type_map.get(file_ext, "image/jpeg")
logger.info("[INDEX] Starting metadata auto-inference...")
inference_result = await inference_service.infer_metadata(
@ -811,6 +808,11 @@ async def _index_document(
f"inferred_fields={list(inference_result.inferred_metadata.keys())}, "
f"confidence_scores={inference_result.confidence_scores}"
)
document = await kb_service.get_document(tenant_id, doc_id)
if document:
document.doc_metadata = metadata
logger.info(f"[INDEX] Updated document metadata in database: {metadata}")
else:
logger.warning(
f"[INDEX] Metadata inference FAILED: {inference_result.error_message}, "
@ -846,6 +848,31 @@ async def _index_document(
pc.page = page.page
all_chunks.extend(page_chunks)
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
elif file_ext in markdown_extensions:
logger.info("[INDEX] Markdown file detected, using intelligent chunking")
from app.services.document.markdown_chunker import MarkdownChunker, MarkdownElementType
chunker = MarkdownChunker(max_chunk_size=1000, min_chunk_size=50)
md_chunks = chunker.chunk(text, doc_id=doc_id)
for i, md_chunk in enumerate(md_chunks):
chunk_metadata = {
"element_type": md_chunk.element_type.value,
"header_context": md_chunk.header_context,
"language": md_chunk.language,
}
chunk_metadata.update(md_chunk.metadata)
all_chunks.append(TextChunk(
text=md_chunk.content,
start_token=i,
end_token=i + 1,
page=None,
source=filename,
))
if all_chunks:
all_chunks[-1].metadata = chunk_metadata
logger.info(f"[INDEX] Total chunks from Markdown: {len(all_chunks)}, element types: {[c.element_type.value for c in md_chunks[:5]]}...")
else:
logger.info("[INDEX] Using line-based chunking")
all_chunks = chunk_text_by_lines(
@ -913,21 +940,13 @@ async def _index_document(
if points:
if settings.kb_vector_log_enabled:
vector_payloads = []
payloads_only = []
for point in points:
if use_multi_vector:
payload = {
"id": point.get("id"),
"vector": point.get("vector"),
"payload": point.get("payload"),
}
payload = point.get("payload")
else:
payload = {
"id": point.id,
"vector": point.vector,
"payload": point.payload,
}
vector_payloads.append(payload)
payload = point.payload
payloads_only.append(payload)
kb_vector_logger.info(json.dumps({
"tenant_id": tenant_id,
@ -938,7 +957,8 @@ async def _index_document(
"file_ext": file_ext,
"is_image": file_ext in image_extensions,
"metadata": doc_metadata,
"vectors": vector_payloads,
"chunk_count": len(payloads_only),
"payloads": payloads_only,
}, ensure_ascii=False))
logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...")

View File

@ -7,10 +7,18 @@ import uuid
from datetime import datetime
from enum import Enum
from typing import Any
from zoneinfo import ZoneInfo
from sqlalchemy import JSON, Column
from sqlmodel import Field, Index, SQLModel
SHANGHAI_TZ = ZoneInfo("Asia/Shanghai")


def beijing_now() -> datetime:
    """Return the current wall-clock time in Asia/Shanghai as a naive datetime.

    The timezone info is deliberately stripped so the value is directly
    comparable with the naive datetimes already stored in the database.
    """
    aware_now = datetime.now(tz=SHANGHAI_TZ)
    # Drop tzinfo: callers persist naive local (Beijing) timestamps.
    return aware_now.replace(tzinfo=None)
class ChatSession(SQLModel, table=True):
"""
@ -32,8 +40,8 @@ class ChatSession(SQLModel, table=True):
sa_column=Column("metadata", JSON, nullable=True),
description="Session metadata"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="Session creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Session creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class ChatMessage(SQLModel, table=True):
@ -65,7 +73,7 @@ class ChatMessage(SQLModel, table=True):
first_token_ms: int | None = Field(default=None, description="Time to first token in milliseconds (for streaming)")
is_error: bool = Field(default=False, description="Whether this message is an error response")
error_message: str | None = Field(default=None, description="Error message if any")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Message creation time")
created_at: datetime = Field(default_factory=beijing_now, description="Message creation time")
prompt_template_id: uuid.UUID | None = Field(
default=None,
@ -150,8 +158,8 @@ class UserMemory(SQLModel, table=True):
summary_version: int = Field(default=1, description="Summary version / update round")
last_turn_id: str | None = Field(default=None, description="Last turn identifier (optional)")
expires_at: datetime | None = Field(default=None, description="Expiration time (optional)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class SharedSession(SQLModel, table=True):
@ -177,8 +185,8 @@ class SharedSession(SQLModel, table=True):
is_active: bool = Field(default=True, description="Whether share is active")
max_concurrent_users: int = Field(default=10, description="Maximum concurrent users allowed")
current_users: int = Field(default=0, description="Current number of online users")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class DocumentStatus(str, Enum):
@ -213,8 +221,8 @@ class Tenant(SQLModel, table=True):
tenant_id: str = Field(..., description="Full tenant ID (format: name@ash@year)", unique=True, index=True)
name: str = Field(..., description="Tenant display name (first part of tenant_id)")
year: str = Field(..., description="Year part from tenant_id")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class KBType(str, Enum):
@ -247,8 +255,8 @@ class KnowledgeBase(SQLModel, table=True):
priority: int = Field(default=0, ge=0, description="Priority weight, higher value means higher priority")
is_enabled: bool = Field(default=True, description="Whether the knowledge base is enabled")
doc_count: int = Field(default=0, ge=0, description="Document count (cached statistic)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class Document(SQLModel, table=True):
@ -272,8 +280,8 @@ class Document(SQLModel, table=True):
status: str = Field(default=DocumentStatus.PENDING.value, description="Document status")
error_msg: str | None = Field(default=None, description="Error message if failed")
doc_metadata: dict | None = Field(default=None, sa_type=JSON, description="Document metadata as JSON")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Upload time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Upload time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class IndexJob(SQLModel, table=True):
@ -293,8 +301,8 @@ class IndexJob(SQLModel, table=True):
status: str = Field(default=IndexJobStatus.PENDING.value, description="Job status")
progress: int = Field(default=0, ge=0, le=100, description="Progress percentage")
error_msg: str | None = Field(default=None, description="Error message if failed")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Job creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Job creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class KnowledgeBaseCreate(SQLModel):
@ -346,8 +354,8 @@ class ApiKey(SQLModel, table=True):
description="Optional IP allowlist for this key",
)
rate_limit_qpm: int | None = Field(default=60, description="Per-minute quota for this key")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class ApiKeyCreate(SQLModel):
@ -390,8 +398,8 @@ class PromptTemplate(SQLModel, table=True):
sa_column=Column("metadata", JSON, nullable=True),
description="[AC-IDSMETA-16] Structured metadata for the prompt template"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class PromptTemplateVersion(SQLModel, table=True):
@ -426,7 +434,7 @@ class PromptTemplateVersion(SQLModel, table=True):
sa_column=Column("variables", JSON, nullable=True),
description="Variable definitions, e.g., [{'name': 'persona_name', 'default': '小N'}]"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
class PromptTemplateCreate(SQLModel):
@ -514,8 +522,8 @@ class IntentRule(SQLModel, table=True):
sa_column=Column("semantic_examples", JSON, nullable=True),
description="[v0.8.0] Semantic example sentences for dynamic vector computation"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class IntentRuleCreate(SQLModel):
@ -614,8 +622,8 @@ class ForbiddenWord(SQLModel, table=True):
fallback_reply: str | None = Field(default=None, description="Fallback reply for 'block' strategy")
is_enabled: bool = Field(default=True, description="Whether the word is enabled")
hit_count: int = Field(default=0, ge=0, description="Hit count for statistics")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class ForbiddenWordCreate(SQLModel):
@ -666,8 +674,8 @@ class BehaviorRule(SQLModel, table=True):
)
category: str = Field(..., description="Category: compliance/tone/boundary/custom")
is_enabled: bool = Field(default=True, description="Whether the rule is enabled")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class BehaviorRuleCreate(SQLModel):
@ -779,8 +787,8 @@ class ScriptFlow(SQLModel, table=True):
sa_column=Column("metadata", JSON, nullable=True),
description="[AC-IDSMETA-16] Structured metadata for the script flow"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class FlowInstance(SQLModel, table=True):
@ -814,8 +822,8 @@ class FlowInstance(SQLModel, table=True):
sa_column=Column("context", JSON, nullable=True),
description="Flow execution context, stores user inputs"
)
started_at: datetime = Field(default_factory=datetime.utcnow, description="Instance start time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
started_at: datetime = Field(default_factory=beijing_now, description="Instance start time")
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
completed_at: datetime | None = Field(default=None, description="Completion time (nullable)")
@ -981,7 +989,7 @@ class FlowTestRecord(SQLModel, table=True):
description="Final ChatResponse with reply, confidence, should_transfer"
)
total_duration_ms: int | None = Field(default=None, description="Total execution time in milliseconds")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Record creation time", index=True)
created_at: datetime = Field(default_factory=beijing_now, description="Record creation time", index=True)
class FlowTestStepResult(SQLModel):
@ -1034,7 +1042,7 @@ class ExportTask(SQLModel, table=True):
)
error_message: str | None = Field(default=None, description="Error message if failed")
expires_at: datetime | None = Field(default=None, description="File expiration time (for cleanup)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Task creation time")
created_at: datetime = Field(default_factory=beijing_now, description="Task creation time")
completed_at: datetime | None = Field(default=None, description="Completion time")
@ -1167,8 +1175,8 @@ class MetadataFieldDefinition(SQLModel, table=True):
description="字段状态: draft/active/deprecated"
)
version: int = Field(default=1, description="版本号")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class MetadataFieldDefinitionCreate(SQLModel):
@ -1299,8 +1307,8 @@ class SlotDefinition(SQLModel, table=True):
description="关联的元数据字段 ID",
foreign_key="metadata_field_definitions.id",
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
def get_effective_strategies(self) -> list[str]:
"""
@ -1423,7 +1431,7 @@ class SlotValue(SQLModel):
description="置信度 0.0~1.0"
)
updated_at: datetime = Field(
default_factory=datetime.utcnow,
default_factory=beijing_now,
description="最后更新时间"
)
@ -1450,8 +1458,8 @@ class MetadataSchema(SQLModel, table=True):
)
is_default: bool = Field(default=False, description="是否为租户默认模式")
is_enabled: bool = Field(default=True, description="是否启用")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class MetadataSchemaCreate(SQLModel):
@ -1530,8 +1538,8 @@ class DecompositionTemplate(SQLModel, table=True):
sa_column=Column("example_output", JSON, nullable=True),
description="示例输出 JSON"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class DecompositionTemplateCreate(SQLModel):
@ -1625,8 +1633,8 @@ class HighRiskPolicy(SQLModel, table=True):
)
priority: int = Field(default=0, description="优先级 (值越高优先级越高)")
is_enabled: bool = Field(default=True, description="是否启用")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class HighRiskPolicyCreate(SQLModel):
@ -1673,8 +1681,8 @@ class SessionModeRecord(SQLModel, table=True):
)
reason: str | None = Field(default=None, description="模式切换原因")
switched_at: datetime | None = Field(default=None, description="模式切换时间")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class MidAuditLog(SQLModel, table=True):
@ -1708,7 +1716,7 @@ class MidAuditLog(SQLModel, table=True):
react_iterations: int | None = Field(default=None, description="ReAct循环次数")
high_risk_scenario: str | None = Field(default=None, description="触发的高风险场景")
latency_ms: int | None = Field(default=None, description="总耗时(ms)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间", index=True)
created_at: datetime = Field(default_factory=beijing_now, description="创建时间", index=True)
class SceneSlotBundleStatus(str, Enum):
@ -1784,8 +1792,8 @@ class SceneSlotBundle(SQLModel, table=True):
description="状态: draft/active/deprecated"
)
version: int = Field(default=1, description="版本号")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class SceneSlotBundleCreate(SQLModel):

View File

@ -107,12 +107,14 @@ class MetadataFieldDefinitionUpdateRequest(BaseModel):
"""[AC-MRS-01] 更新元数据字段定义请求"""
label: str | None = Field(default=None, min_length=1, max_length=64)
type: str | None = Field(default=None, description="字段类型")
required: bool | None = None
options: list[str] | None = None
default_value: Any | None = None
scope: list[str] | None = None
is_filterable: bool | None = None
is_rank_feature: bool | None = None
usage_description: str | None = Field(default=None, description="用途说明")
field_roles: list[str] | None = Field(
default=None,
description="[AC-MRS-01] 字段角色列表"

View File

@ -295,7 +295,7 @@ class OpenAIClient(LLMClient):
role = msg.get("role", "unknown")
content = msg.get("content", "")
logger.info(f"[AC-AISVC-06] [{i}] role={role}, content_length={len(content)}")
logger.info(f"[AC-AISVC-06] [{i}] content:\n{content}")
logger.info("[AC-AISVC-06] ======================================")
try:

View File

@ -42,10 +42,10 @@ class AutoInferenceResult:
error_message: str | None = None
METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段
METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段,并以 JSON 格式输出结果
## 输出要求
严格按照以下 JSON 格式输出不要添加任何其他内容
你必须严格按照以下 JSON 格式输出不要添加任何其他内容
```json
{
@ -60,6 +60,29 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析
}
```
## JSON 输出样例
假设有以下字段定义
- grade (枚举类型选项小学初中高中)
- subject (枚举类型选项语文数学英语)
- difficulty (数字类型1-5)
如果文档内容是关于"高一数学函数知识点"则输出
```json
{
"inferred_metadata": {
"grade": "高中",
"subject": "数学",
"difficulty": 3
},
"confidence_scores": {
"grade": 0.95,
"subject": 0.95,
"difficulty": 0.7
}
}
```
## 推断规则
1. **仔细分析文档内容**根据文档的主题关键词上下文来推断元数据
2. **遵循字段定义**
@ -78,7 +101,10 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析
## 注意事项
- 必须严格按照字段定义的类型和选项填写
- 不要编造不存在的选项值
- 保持客观基于文档内容推断"""
- 保持客观基于文档内容推断
- **只输出合法的 JSON 格式不要输出思考过程解释或任何额外文本**
- 确保 JSON 格式完整闭合所有字符串用双引号包裹
- 如果无法推断某字段可省略该字段但整体 JSON 必须完整闭合"""
class MetadataAutoInferenceService:
@ -98,9 +124,9 @@ class MetadataAutoInferenceService:
def __init__(
self,
session: AsyncSession,
model: str | None = None,
model: str | None = "glm-4.7",
max_tokens: int = 1024,
timeout_seconds: int = 60,
timeout_seconds: int = 180,
):
self._session = session
self._model = model
@ -148,9 +174,12 @@ class MetadataAutoInferenceService:
error_message="No field definitions configured",
)
logger.info(f"[MetadataAutoInference] Found {len(field_definitions)} field definitions: {[f.field_key for f in field_definitions]}")
field_contexts = self._build_field_contexts(field_definitions)
if not field_contexts:
logger.warning(f"[MetadataAutoInference] No field contexts built from definitions")
return AutoInferenceResult(
inferred_metadata=existing_metadata or {},
confidence_scores={},
@ -159,15 +188,26 @@ class MetadataAutoInferenceService:
)
user_prompt = self._build_user_prompt(content, field_contexts, existing_metadata)
logger.info(f"[MetadataAutoInference] === SYSTEM PROMPT ===\n{METADATA_INFERENCE_SYSTEM_PROMPT}\n=== END SYSTEM PROMPT ===")
logger.info(f"[MetadataAutoInference] === USER PROMPT ===\n{user_prompt}\n=== END USER PROMPT ===")
try:
if image_base64 and mime_type:
raw_response = await self._call_multimodal_llm(
user_prompt, image_base64, mime_type
)
logger.info(f"[MetadataAutoInference] Using multimodal LLM for image recognition, mime_type={mime_type}")
image_content = await self._recognize_image(image_base64, mime_type)
logger.info(f"[MetadataAutoInference] Image recognition result (first 200 chars): {image_content[:200] if image_content else 'EMPTY'}")
combined_content = f"{content}\n\n[图片识别内容]\n{image_content}" if content else f"[图片识别内容]\n{image_content}"
user_prompt = self._build_user_prompt(combined_content, field_contexts, existing_metadata)
logger.info(f"[MetadataAutoInference] === USER PROMPT (with image content) ===\n{user_prompt}\n=== END USER PROMPT ===")
raw_response = await self._call_text_llm(user_prompt)
else:
raw_response = await self._call_text_llm(user_prompt)
logger.info(f"[MetadataAutoInference] LLM response length: {len(raw_response) if raw_response else 0}")
logger.info(f"[MetadataAutoInference] LLM response (first 300 chars): {raw_response[:300] if raw_response else 'EMPTY'}")
result = self._parse_llm_response(raw_response, field_contexts)
if existing_metadata:
@ -331,12 +371,15 @@ class MetadataAutoInferenceService:
return prompt
async def _call_text_llm(self, prompt: str) -> str:
"""调用文本 LLM"""
"""调用文本 LLM 进行元数据推断(使用对话模型配置,支持 JSON 格式化输出)"""
manager = get_llm_config_manager()
client = manager.get_kb_processing_client()
client = manager.get_chat_client()
config = manager.kb_processing_config
model = self._model or config.get("model", "gpt-4o-mini")
config = manager.chat_config
model = config.get("model")
if not model:
logger.warning("[MetadataAutoInference] No model configured in chat config, using default")
model = self._model
from app.services.llm.base import LLMConfig
@ -345,8 +388,13 @@ class MetadataAutoInferenceService:
max_tokens=self._max_tokens,
temperature=0.3,
timeout_seconds=self._timeout_seconds,
extra_params={
"response_format": {"type": "json_object"},
},
)
logger.info(f"[MetadataAutoInference] Using model: {model} for text LLM")
messages = [
{"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT},
{"role": "user", "content": prompt},
@ -355,34 +403,45 @@ class MetadataAutoInferenceService:
response = await client.generate(messages=messages, config=llm_config)
return response.content or ""
async def _call_multimodal_llm(
self,
prompt: str,
image_base64: str,
mime_type: str,
) -> str:
"""调用多模态 LLM"""
async def _recognize_image(self, image_base64: str, mime_type: str) -> str:
"""
使用多模态模型识别图片内容
这个方法只负责图片内容识别不负责 JSON 格式化输出
使用对话模型配置
"""
manager = get_llm_config_manager()
client = manager.get_kb_processing_client()
client = manager.get_chat_client()
config = manager.kb_processing_config
model = self._model or config.get("model", "gpt-4o-mini")
config = manager.chat_config
model = config.get("model")
if not model:
logger.warning("[MetadataAutoInference] No model configured in chat config for image recognition")
model = self._model
from app.services.llm.base import LLMConfig
llm_config = LLMConfig(
model=model,
max_tokens=self._max_tokens,
max_tokens=1024,
temperature=0.3,
timeout_seconds=self._timeout_seconds,
)
logger.info(f"[MetadataAutoInference] Using model: {model} for image recognition")
recognition_prompt = """请仔细分析这张图片,提取其中的关键信息,包括:
1. 图片类型文档截图图表照片示意图等
2. 图片中的文字内容如有
3. 图片的主要内容和主题
4. 任何可见的数据数字或关键信息
请用简洁的中文描述图片内容"""
messages = [
{"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT},
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "text", "text": recognition_prompt},
{
"type": "image_url",
"image_url": {
@ -396,12 +455,29 @@ class MetadataAutoInferenceService:
response = await client.generate(messages=messages, config=llm_config)
return response.content or ""
async def _call_multimodal_llm(
    self,
    prompt: str,
    image_base64: str,
    mime_type: str,
) -> str:
    """Deprecated multimodal entry point, kept for backward compatibility.

    Prefer the two-step _recognize_image + _call_text_llm pipeline: the
    image is first described as plain text, then that description is
    appended to the caller's prompt and sent to the text LLM.
    """
    logger.warning("[MetadataAutoInference] _call_multimodal_llm is deprecated, use _recognize_image + _call_text_llm instead")
    recognized_text = await self._recognize_image(image_base64, mime_type)
    merged_prompt = f"{prompt}\n\n[图片识别内容]\n{recognized_text}"
    return await self._call_text_llm(merged_prompt)
def _parse_llm_response(
self,
response: str,
field_contexts: list[InferenceFieldContext],
) -> AutoInferenceResult:
"""解析 LLM 响应"""
json_str = ""
try:
json_str = self._extract_json(response)
data = json.loads(json_str)
@ -410,19 +486,30 @@ class MetadataAutoInferenceService:
confidence_scores = data.get("confidence_scores", {})
field_map = {ctx.field_key: ctx for ctx in field_contexts}
label_to_key = {ctx.label: ctx.field_key for ctx in field_contexts}
validated_metadata = {}
validated_scores = {}
for field_key, value in inferred_metadata.items():
if field_key not in field_map:
continue
for field_key_or_label, value in inferred_metadata.items():
actual_field_key = field_key_or_label
ctx = field_map[field_key]
if field_key_or_label not in field_map:
if field_key_or_label in label_to_key:
actual_field_key = label_to_key[field_key_or_label]
else:
logger.warning(f"[MetadataAutoInference] Unknown field: {field_key_or_label}")
continue
ctx = field_map[actual_field_key]
validated_value = self._validate_field_value(ctx, value)
if validated_value is not None:
validated_metadata[field_key] = validated_value
validated_scores[field_key] = confidence_scores.get(field_key, 0.5)
validated_metadata[actual_field_key] = validated_value
validated_scores[actual_field_key] = confidence_scores.get(field_key_or_label, 0.5)
logger.info(f"[MetadataAutoInference] Validated metadata: {validated_metadata}")
logger.info(f"[MetadataAutoInference] Confidence scores: {validated_scores}")
return AutoInferenceResult(
inferred_metadata=validated_metadata,
@ -433,6 +520,8 @@ class MetadataAutoInferenceService:
except json.JSONDecodeError as e:
logger.warning(f"[MetadataAutoInference] Failed to parse JSON: {e}")
logger.warning(f"[MetadataAutoInference] Raw LLM response (first 500 chars): {response[:500] if response else 'EMPTY'}")
logger.warning(f"[MetadataAutoInference] Extracted JSON string: {json_str[:500] if json_str else 'EMPTY'}")
return AutoInferenceResult(
inferred_metadata={},
confidence_scores={},
@ -482,15 +571,32 @@ class MetadataAutoInferenceService:
def _extract_json(self, content: str) -> str:
"""从响应中提取 JSON"""
import re
content = content.strip()
if content.startswith("```"):
content = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", content)
content = re.sub(r"\s*```$", "", content).strip()
if content.startswith("{") and content.endswith("}"):
return content
json_start = content.find("{")
json_end = content.rfind("}")
if json_start == -1:
return content
if json_start != -1 and json_end != -1 and json_end > json_start:
return content[json_start:json_end + 1]
json_str = content[json_start:]
return content
open_braces = json_str.count("{")
close_braces = json_str.count("}")
if close_braces < open_braces:
json_str += "}" * (open_braces - close_braces)
json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
json_end = json_str.rfind("}")
if json_end != -1:
json_str = json_str[:json_end + 1]
return json_str

View File

@ -226,6 +226,8 @@ class MetadataFieldDefinitionService:
if field_update.label is not None:
field.label = field_update.label
if field_update.type is not None:
field.type = field_update.type
if field_update.required is not None:
field.required = field_update.required
if field_update.options is not None:
@ -239,6 +241,8 @@ class MetadataFieldDefinitionService:
field.is_filterable = field_update.is_filterable
if field_update.is_rank_feature is not None:
field.is_rank_feature = field_update.is_rank_feature
if field_update.usage_description is not None:
field.usage_description = field_update.usage_description
# [AC-MRS-01] 修复:添加 field_roles 更新逻辑
if field_update.field_roles is not None:
self._validate_field_roles(field_update.field_roles)