feat: enhance metadata handling and document processing

- Add metadata field to Document type for frontend
- Add type field to MetadataFieldUpdateRequest
- Update KB API with URL decode support for Chinese filenames
- Enhance metadata auto-inference service
- Improve metadata field definition service
- Update .gitignore to exclude logs and snapshots
This commit is contained in:
MerCry 2026-03-12 12:45:54 +08:00
parent 51d8de0621
commit dd1c6aba14
10 changed files with 259 additions and 111 deletions

3
.gitignore vendored
View File

@ -163,7 +163,10 @@ cython_debug/
# Project specific # Project specific
ai-service/uploads/ ai-service/uploads/
ai-service/config/ ai-service/config/
ai-service/logs/
*.local *.local
qdrant_snapshots/
*.snapshot
/.trae/ /.trae/
/.claude/ /.claude/

View File

@ -34,6 +34,7 @@ export interface Document {
kbId: string kbId: string
fileName: string fileName: string
status: string status: string
metadata?: Record<string, any>
jobId?: string jobId?: string
createdAt: string createdAt: string
updatedAt: string updatedAt: string

View File

@ -39,6 +39,7 @@ export interface MetadataFieldCreateRequest {
export interface MetadataFieldUpdateRequest { export interface MetadataFieldUpdateRequest {
label?: string label?: string
type?: MetadataFieldType
required?: boolean required?: boolean
options?: string[] options?: string[]
default?: string | number | boolean default?: string | number | boolean

View File

@ -16,7 +16,7 @@
<input <input
ref="fileInputRef" ref="fileInputRef"
type="file" type="file"
accept=".txt,.md,.pdf,.doc,.docx,.xls,.xlsx" accept=".txt,.md,.markdown,.pdf,.doc,.docx,.xls,.xlsx,.jpg,.jpeg,.png,.gif,.webp,.bmp,.tiff,.tif"
style="display: none" style="display: none"
@change="handleFileSelect" @change="handleFileSelect"
/> />
@ -298,7 +298,10 @@ const handleFileSelect = (event: Event) => {
const file = target.files?.[0] const file = target.files?.[0]
if (!file) return if (!file) return
const allowedExtensions = ['.txt', '.md', '.pdf', '.doc', '.docx', '.xls', '.xlsx'] const allowedExtensions = [
'.txt', '.md', '.markdown', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.tif'
]
const ext = '.' + file.name.split('.').pop()?.toLowerCase() const ext = '.' + file.name.split('.').pop()?.toLowerCase()
if (!allowedExtensions.includes(ext)) { if (!allowedExtensions.includes(ext)) {

View File

@ -11,6 +11,7 @@ import hashlib
from dataclasses import dataclass from dataclasses import dataclass
from typing import Annotated, Any, Optional from typing import Annotated, Any, Optional
from logging.handlers import RotatingFileHandler from logging.handlers import RotatingFileHandler
from urllib.parse import unquote
import tiktoken import tiktoken
from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile
@ -64,6 +65,7 @@ class TextChunk:
end_token: int end_token: int
page: int | None = None page: int | None = None
source: str | None = None source: str | None = None
metadata: dict | None = None
def chunk_text_by_lines( def chunk_text_by_lines(
@ -602,10 +604,14 @@ async def upload_document(
doc_kb_service = KBService(session) doc_kb_service = KBService(session)
file_content = await file.read() file_content = await file.read()
# URL decode filename to handle Chinese characters
decoded_filename = unquote(file.filename or "unknown")
document, job = await doc_kb_service.upload_document( document, job = await doc_kb_service.upload_document(
tenant_id=tenant_id, tenant_id=tenant_id,
kb_id=kb_id, kb_id=kb_id,
file_name=file.filename or "unknown", file_name=decoded_filename,
file_content=file_content, file_content=file_content,
file_type=file.content_type, file_type=file.content_type,
metadata=metadata_dict, metadata=metadata_dict,
@ -615,7 +621,7 @@ async def upload_document(
await session.commit() await session.commit()
background_tasks.add_task( background_tasks.add_task(
_index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, file.filename, metadata_dict _index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, decoded_filename, metadata_dict
) )
return JSONResponse( return JSONResponse(
@ -676,6 +682,7 @@ async def _index_document(
logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes") logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes")
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"} text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
markdown_extensions = {".md", ".markdown"}
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"} image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"}
if file_ext in text_extensions or not file_ext: if file_ext in text_extensions or not file_ext:
@ -783,16 +790,6 @@ async def _index_document(
image_base64_for_inference = None image_base64_for_inference = None
mime_type_for_inference = None mime_type_for_inference = None
if file_ext in image_extensions:
import base64
image_base64_for_inference = base64.b64encode(content).decode("utf-8")
mime_type_map = {
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
".png": "image/png", ".gif": "image/gif",
".webp": "image/webp", ".bmp": "image/bmp",
".tiff": "image/tiff", ".tif": "image/tiff",
}
mime_type_for_inference = mime_type_map.get(file_ext, "image/jpeg")
logger.info("[INDEX] Starting metadata auto-inference...") logger.info("[INDEX] Starting metadata auto-inference...")
inference_result = await inference_service.infer_metadata( inference_result = await inference_service.infer_metadata(
@ -811,6 +808,11 @@ async def _index_document(
f"inferred_fields={list(inference_result.inferred_metadata.keys())}, " f"inferred_fields={list(inference_result.inferred_metadata.keys())}, "
f"confidence_scores={inference_result.confidence_scores}" f"confidence_scores={inference_result.confidence_scores}"
) )
document = await kb_service.get_document(tenant_id, doc_id)
if document:
document.doc_metadata = metadata
logger.info(f"[INDEX] Updated document metadata in database: {metadata}")
else: else:
logger.warning( logger.warning(
f"[INDEX] Metadata inference FAILED: {inference_result.error_message}, " f"[INDEX] Metadata inference FAILED: {inference_result.error_message}, "
@ -846,6 +848,31 @@ async def _index_document(
pc.page = page.page pc.page = page.page
all_chunks.extend(page_chunks) all_chunks.extend(page_chunks)
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}") logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
elif file_ext in markdown_extensions:
logger.info("[INDEX] Markdown file detected, using intelligent chunking")
from app.services.document.markdown_chunker import MarkdownChunker, MarkdownElementType
chunker = MarkdownChunker(max_chunk_size=1000, min_chunk_size=50)
md_chunks = chunker.chunk(text, doc_id=doc_id)
for i, md_chunk in enumerate(md_chunks):
chunk_metadata = {
"element_type": md_chunk.element_type.value,
"header_context": md_chunk.header_context,
"language": md_chunk.language,
}
chunk_metadata.update(md_chunk.metadata)
all_chunks.append(TextChunk(
text=md_chunk.content,
start_token=i,
end_token=i + 1,
page=None,
source=filename,
))
if all_chunks:
all_chunks[-1].metadata = chunk_metadata
logger.info(f"[INDEX] Total chunks from Markdown: {len(all_chunks)}, element types: {[c.element_type.value for c in md_chunks[:5]]}...")
else: else:
logger.info("[INDEX] Using line-based chunking") logger.info("[INDEX] Using line-based chunking")
all_chunks = chunk_text_by_lines( all_chunks = chunk_text_by_lines(
@ -913,21 +940,13 @@ async def _index_document(
if points: if points:
if settings.kb_vector_log_enabled: if settings.kb_vector_log_enabled:
vector_payloads = [] payloads_only = []
for point in points: for point in points:
if use_multi_vector: if use_multi_vector:
payload = { payload = point.get("payload")
"id": point.get("id"),
"vector": point.get("vector"),
"payload": point.get("payload"),
}
else: else:
payload = { payload = point.payload
"id": point.id, payloads_only.append(payload)
"vector": point.vector,
"payload": point.payload,
}
vector_payloads.append(payload)
kb_vector_logger.info(json.dumps({ kb_vector_logger.info(json.dumps({
"tenant_id": tenant_id, "tenant_id": tenant_id,
@ -938,7 +957,8 @@ async def _index_document(
"file_ext": file_ext, "file_ext": file_ext,
"is_image": file_ext in image_extensions, "is_image": file_ext in image_extensions,
"metadata": doc_metadata, "metadata": doc_metadata,
"vectors": vector_payloads, "chunk_count": len(payloads_only),
"payloads": payloads_only,
}, ensure_ascii=False)) }, ensure_ascii=False))
logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...") logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...")

View File

@ -7,10 +7,18 @@ import uuid
from datetime import datetime from datetime import datetime
from enum import Enum from enum import Enum
from typing import Any from typing import Any
from zoneinfo import ZoneInfo
from sqlalchemy import JSON, Column from sqlalchemy import JSON, Column
from sqlmodel import Field, Index, SQLModel from sqlmodel import Field, Index, SQLModel
# Canonical timezone for every timestamp default in this module.
SHANGHAI_TZ = ZoneInfo("Asia/Shanghai")


def beijing_now() -> datetime:
    """Return the current Beijing/Shanghai wall-clock time as a naive datetime.

    The value is taken in Asia/Shanghai and then stripped of its tzinfo so it
    can be stored in naive DATETIME columns while still reflecting local time.
    """
    local_now = datetime.now(tz=SHANGHAI_TZ)
    return local_now.replace(tzinfo=None)
class ChatSession(SQLModel, table=True): class ChatSession(SQLModel, table=True):
""" """
@ -32,8 +40,8 @@ class ChatSession(SQLModel, table=True):
sa_column=Column("metadata", JSON, nullable=True), sa_column=Column("metadata", JSON, nullable=True),
description="Session metadata" description="Session metadata"
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="Session creation time") created_at: datetime = Field(default_factory=beijing_now, description="Session creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class ChatMessage(SQLModel, table=True): class ChatMessage(SQLModel, table=True):
@ -65,7 +73,7 @@ class ChatMessage(SQLModel, table=True):
first_token_ms: int | None = Field(default=None, description="Time to first token in milliseconds (for streaming)") first_token_ms: int | None = Field(default=None, description="Time to first token in milliseconds (for streaming)")
is_error: bool = Field(default=False, description="Whether this message is an error response") is_error: bool = Field(default=False, description="Whether this message is an error response")
error_message: str | None = Field(default=None, description="Error message if any") error_message: str | None = Field(default=None, description="Error message if any")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Message creation time") created_at: datetime = Field(default_factory=beijing_now, description="Message creation time")
prompt_template_id: uuid.UUID | None = Field( prompt_template_id: uuid.UUID | None = Field(
default=None, default=None,
@ -150,8 +158,8 @@ class UserMemory(SQLModel, table=True):
summary_version: int = Field(default=1, description="Summary version / update round") summary_version: int = Field(default=1, description="Summary version / update round")
last_turn_id: str | None = Field(default=None, description="Last turn identifier (optional)") last_turn_id: str | None = Field(default=None, description="Last turn identifier (optional)")
expires_at: datetime | None = Field(default=None, description="Expiration time (optional)") expires_at: datetime | None = Field(default=None, description="Expiration time (optional)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class SharedSession(SQLModel, table=True): class SharedSession(SQLModel, table=True):
@ -177,8 +185,8 @@ class SharedSession(SQLModel, table=True):
is_active: bool = Field(default=True, description="Whether share is active") is_active: bool = Field(default=True, description="Whether share is active")
max_concurrent_users: int = Field(default=10, description="Maximum concurrent users allowed") max_concurrent_users: int = Field(default=10, description="Maximum concurrent users allowed")
current_users: int = Field(default=0, description="Current number of online users") current_users: int = Field(default=0, description="Current number of online users")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class DocumentStatus(str, Enum): class DocumentStatus(str, Enum):
@ -213,8 +221,8 @@ class Tenant(SQLModel, table=True):
tenant_id: str = Field(..., description="Full tenant ID (format: name@ash@year)", unique=True, index=True) tenant_id: str = Field(..., description="Full tenant ID (format: name@ash@year)", unique=True, index=True)
name: str = Field(..., description="Tenant display name (first part of tenant_id)") name: str = Field(..., description="Tenant display name (first part of tenant_id)")
year: str = Field(..., description="Year part from tenant_id") year: str = Field(..., description="Year part from tenant_id")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class KBType(str, Enum): class KBType(str, Enum):
@ -247,8 +255,8 @@ class KnowledgeBase(SQLModel, table=True):
priority: int = Field(default=0, ge=0, description="Priority weight, higher value means higher priority") priority: int = Field(default=0, ge=0, description="Priority weight, higher value means higher priority")
is_enabled: bool = Field(default=True, description="Whether the knowledge base is enabled") is_enabled: bool = Field(default=True, description="Whether the knowledge base is enabled")
doc_count: int = Field(default=0, ge=0, description="Document count (cached statistic)") doc_count: int = Field(default=0, ge=0, description="Document count (cached statistic)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class Document(SQLModel, table=True): class Document(SQLModel, table=True):
@ -272,8 +280,8 @@ class Document(SQLModel, table=True):
status: str = Field(default=DocumentStatus.PENDING.value, description="Document status") status: str = Field(default=DocumentStatus.PENDING.value, description="Document status")
error_msg: str | None = Field(default=None, description="Error message if failed") error_msg: str | None = Field(default=None, description="Error message if failed")
doc_metadata: dict | None = Field(default=None, sa_type=JSON, description="Document metadata as JSON") doc_metadata: dict | None = Field(default=None, sa_type=JSON, description="Document metadata as JSON")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Upload time") created_at: datetime = Field(default_factory=beijing_now, description="Upload time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class IndexJob(SQLModel, table=True): class IndexJob(SQLModel, table=True):
@ -293,8 +301,8 @@ class IndexJob(SQLModel, table=True):
status: str = Field(default=IndexJobStatus.PENDING.value, description="Job status") status: str = Field(default=IndexJobStatus.PENDING.value, description="Job status")
progress: int = Field(default=0, ge=0, le=100, description="Progress percentage") progress: int = Field(default=0, ge=0, le=100, description="Progress percentage")
error_msg: str | None = Field(default=None, description="Error message if failed") error_msg: str | None = Field(default=None, description="Error message if failed")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Job creation time") created_at: datetime = Field(default_factory=beijing_now, description="Job creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class KnowledgeBaseCreate(SQLModel): class KnowledgeBaseCreate(SQLModel):
@ -346,8 +354,8 @@ class ApiKey(SQLModel, table=True):
description="Optional IP allowlist for this key", description="Optional IP allowlist for this key",
) )
rate_limit_qpm: int | None = Field(default=60, description="Per-minute quota for this key") rate_limit_qpm: int | None = Field(default=60, description="Per-minute quota for this key")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class ApiKeyCreate(SQLModel): class ApiKeyCreate(SQLModel):
@ -390,8 +398,8 @@ class PromptTemplate(SQLModel, table=True):
sa_column=Column("metadata", JSON, nullable=True), sa_column=Column("metadata", JSON, nullable=True),
description="[AC-IDSMETA-16] Structured metadata for the prompt template" description="[AC-IDSMETA-16] Structured metadata for the prompt template"
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class PromptTemplateVersion(SQLModel, table=True): class PromptTemplateVersion(SQLModel, table=True):
@ -426,7 +434,7 @@ class PromptTemplateVersion(SQLModel, table=True):
sa_column=Column("variables", JSON, nullable=True), sa_column=Column("variables", JSON, nullable=True),
description="Variable definitions, e.g., [{'name': 'persona_name', 'default': '小N'}]" description="Variable definitions, e.g., [{'name': 'persona_name', 'default': '小N'}]"
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
class PromptTemplateCreate(SQLModel): class PromptTemplateCreate(SQLModel):
@ -514,8 +522,8 @@ class IntentRule(SQLModel, table=True):
sa_column=Column("semantic_examples", JSON, nullable=True), sa_column=Column("semantic_examples", JSON, nullable=True),
description="[v0.8.0] Semantic example sentences for dynamic vector computation" description="[v0.8.0] Semantic example sentences for dynamic vector computation"
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class IntentRuleCreate(SQLModel): class IntentRuleCreate(SQLModel):
@ -614,8 +622,8 @@ class ForbiddenWord(SQLModel, table=True):
fallback_reply: str | None = Field(default=None, description="Fallback reply for 'block' strategy") fallback_reply: str | None = Field(default=None, description="Fallback reply for 'block' strategy")
is_enabled: bool = Field(default=True, description="Whether the word is enabled") is_enabled: bool = Field(default=True, description="Whether the word is enabled")
hit_count: int = Field(default=0, ge=0, description="Hit count for statistics") hit_count: int = Field(default=0, ge=0, description="Hit count for statistics")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class ForbiddenWordCreate(SQLModel): class ForbiddenWordCreate(SQLModel):
@ -666,8 +674,8 @@ class BehaviorRule(SQLModel, table=True):
) )
category: str = Field(..., description="Category: compliance/tone/boundary/custom") category: str = Field(..., description="Category: compliance/tone/boundary/custom")
is_enabled: bool = Field(default=True, description="Whether the rule is enabled") is_enabled: bool = Field(default=True, description="Whether the rule is enabled")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class BehaviorRuleCreate(SQLModel): class BehaviorRuleCreate(SQLModel):
@ -779,8 +787,8 @@ class ScriptFlow(SQLModel, table=True):
sa_column=Column("metadata", JSON, nullable=True), sa_column=Column("metadata", JSON, nullable=True),
description="[AC-IDSMETA-16] Structured metadata for the script flow" description="[AC-IDSMETA-16] Structured metadata for the script flow"
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
class FlowInstance(SQLModel, table=True): class FlowInstance(SQLModel, table=True):
@ -814,8 +822,8 @@ class FlowInstance(SQLModel, table=True):
sa_column=Column("context", JSON, nullable=True), sa_column=Column("context", JSON, nullable=True),
description="Flow execution context, stores user inputs" description="Flow execution context, stores user inputs"
) )
started_at: datetime = Field(default_factory=datetime.utcnow, description="Instance start time") started_at: datetime = Field(default_factory=beijing_now, description="Instance start time")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
completed_at: datetime | None = Field(default=None, description="Completion time (nullable)") completed_at: datetime | None = Field(default=None, description="Completion time (nullable)")
@ -981,7 +989,7 @@ class FlowTestRecord(SQLModel, table=True):
description="Final ChatResponse with reply, confidence, should_transfer" description="Final ChatResponse with reply, confidence, should_transfer"
) )
total_duration_ms: int | None = Field(default=None, description="Total execution time in milliseconds") total_duration_ms: int | None = Field(default=None, description="Total execution time in milliseconds")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Record creation time", index=True) created_at: datetime = Field(default_factory=beijing_now, description="Record creation time", index=True)
class FlowTestStepResult(SQLModel): class FlowTestStepResult(SQLModel):
@ -1034,7 +1042,7 @@ class ExportTask(SQLModel, table=True):
) )
error_message: str | None = Field(default=None, description="Error message if failed") error_message: str | None = Field(default=None, description="Error message if failed")
expires_at: datetime | None = Field(default=None, description="File expiration time (for cleanup)") expires_at: datetime | None = Field(default=None, description="File expiration time (for cleanup)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="Task creation time") created_at: datetime = Field(default_factory=beijing_now, description="Task creation time")
completed_at: datetime | None = Field(default=None, description="Completion time") completed_at: datetime | None = Field(default=None, description="Completion time")
@ -1167,8 +1175,8 @@ class MetadataFieldDefinition(SQLModel, table=True):
description="字段状态: draft/active/deprecated" description="字段状态: draft/active/deprecated"
) )
version: int = Field(default=1, description="版本号") version: int = Field(default=1, description="版本号")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class MetadataFieldDefinitionCreate(SQLModel): class MetadataFieldDefinitionCreate(SQLModel):
@ -1299,8 +1307,8 @@ class SlotDefinition(SQLModel, table=True):
description="关联的元数据字段 ID", description="关联的元数据字段 ID",
foreign_key="metadata_field_definitions.id", foreign_key="metadata_field_definitions.id",
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
def get_effective_strategies(self) -> list[str]: def get_effective_strategies(self) -> list[str]:
""" """
@ -1423,7 +1431,7 @@ class SlotValue(SQLModel):
description="置信度 0.0~1.0" description="置信度 0.0~1.0"
) )
updated_at: datetime = Field( updated_at: datetime = Field(
default_factory=datetime.utcnow, default_factory=beijing_now,
description="最后更新时间" description="最后更新时间"
) )
@ -1450,8 +1458,8 @@ class MetadataSchema(SQLModel, table=True):
) )
is_default: bool = Field(default=False, description="是否为租户默认模式") is_default: bool = Field(default=False, description="是否为租户默认模式")
is_enabled: bool = Field(default=True, description="是否启用") is_enabled: bool = Field(default=True, description="是否启用")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class MetadataSchemaCreate(SQLModel): class MetadataSchemaCreate(SQLModel):
@ -1530,8 +1538,8 @@ class DecompositionTemplate(SQLModel, table=True):
sa_column=Column("example_output", JSON, nullable=True), sa_column=Column("example_output", JSON, nullable=True),
description="示例输出 JSON" description="示例输出 JSON"
) )
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class DecompositionTemplateCreate(SQLModel): class DecompositionTemplateCreate(SQLModel):
@ -1625,8 +1633,8 @@ class HighRiskPolicy(SQLModel, table=True):
) )
priority: int = Field(default=0, description="优先级 (值越高优先级越高)") priority: int = Field(default=0, description="优先级 (值越高优先级越高)")
is_enabled: bool = Field(default=True, description="是否启用") is_enabled: bool = Field(default=True, description="是否启用")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class HighRiskPolicyCreate(SQLModel): class HighRiskPolicyCreate(SQLModel):
@ -1673,8 +1681,8 @@ class SessionModeRecord(SQLModel, table=True):
) )
reason: str | None = Field(default=None, description="模式切换原因") reason: str | None = Field(default=None, description="模式切换原因")
switched_at: datetime | None = Field(default=None, description="模式切换时间") switched_at: datetime | None = Field(default=None, description="模式切换时间")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class MidAuditLog(SQLModel, table=True): class MidAuditLog(SQLModel, table=True):
@ -1708,7 +1716,7 @@ class MidAuditLog(SQLModel, table=True):
react_iterations: int | None = Field(default=None, description="ReAct循环次数") react_iterations: int | None = Field(default=None, description="ReAct循环次数")
high_risk_scenario: str | None = Field(default=None, description="触发的高风险场景") high_risk_scenario: str | None = Field(default=None, description="触发的高风险场景")
latency_ms: int | None = Field(default=None, description="总耗时(ms)") latency_ms: int | None = Field(default=None, description="总耗时(ms)")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间", index=True) created_at: datetime = Field(default_factory=beijing_now, description="创建时间", index=True)
class SceneSlotBundleStatus(str, Enum): class SceneSlotBundleStatus(str, Enum):
@ -1784,8 +1792,8 @@ class SceneSlotBundle(SQLModel, table=True):
description="状态: draft/active/deprecated" description="状态: draft/active/deprecated"
) )
version: int = Field(default=1, description="版本号") version: int = Field(default=1, description="版本号")
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
class SceneSlotBundleCreate(SQLModel): class SceneSlotBundleCreate(SQLModel):

View File

@ -107,12 +107,14 @@ class MetadataFieldDefinitionUpdateRequest(BaseModel):
"""[AC-MRS-01] 更新元数据字段定义请求""" """[AC-MRS-01] 更新元数据字段定义请求"""
label: str | None = Field(default=None, min_length=1, max_length=64) label: str | None = Field(default=None, min_length=1, max_length=64)
type: str | None = Field(default=None, description="字段类型")
required: bool | None = None required: bool | None = None
options: list[str] | None = None options: list[str] | None = None
default_value: Any | None = None default_value: Any | None = None
scope: list[str] | None = None scope: list[str] | None = None
is_filterable: bool | None = None is_filterable: bool | None = None
is_rank_feature: bool | None = None is_rank_feature: bool | None = None
usage_description: str | None = Field(default=None, description="用途说明")
field_roles: list[str] | None = Field( field_roles: list[str] | None = Field(
default=None, default=None,
description="[AC-MRS-01] 字段角色列表" description="[AC-MRS-01] 字段角色列表"

View File

@ -295,7 +295,7 @@ class OpenAIClient(LLMClient):
role = msg.get("role", "unknown") role = msg.get("role", "unknown")
content = msg.get("content", "") content = msg.get("content", "")
logger.info(f"[AC-AISVC-06] [{i}] role={role}, content_length={len(content)}") logger.info(f"[AC-AISVC-06] [{i}] role={role}, content_length={len(content)}")
logger.info(f"[AC-AISVC-06] [{i}] content:\n{content}")
logger.info("[AC-AISVC-06] ======================================") logger.info("[AC-AISVC-06] ======================================")
try: try:

View File

@ -42,10 +42,10 @@ class AutoInferenceResult:
error_message: str | None = None error_message: str | None = None
METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段 METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段,并以 JSON 格式输出结果
## 输出要求 ## 输出要求
严格按照以下 JSON 格式输出不要添加任何其他内容 你必须严格按照以下 JSON 格式输出不要添加任何其他内容
```json ```json
{ {
@ -60,6 +60,29 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析
} }
``` ```
## JSON 输出样例
假设有以下字段定义
- grade (枚举类型选项小学初中高中)
- subject (枚举类型选项语文数学英语)
- difficulty (数字类型1-5)
如果文档内容是关于"高一数学函数知识点"则输出
```json
{
"inferred_metadata": {
"grade": "高中",
"subject": "数学",
"difficulty": 3
},
"confidence_scores": {
"grade": 0.95,
"subject": 0.95,
"difficulty": 0.7
}
}
```
## 推断规则 ## 推断规则
1. **仔细分析文档内容**根据文档的主题关键词上下文来推断元数据 1. **仔细分析文档内容**根据文档的主题关键词上下文来推断元数据
2. **遵循字段定义** 2. **遵循字段定义**
@ -78,7 +101,10 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析
## 注意事项 ## 注意事项
- 必须严格按照字段定义的类型和选项填写 - 必须严格按照字段定义的类型和选项填写
- 不要编造不存在的选项值 - 不要编造不存在的选项值
- 保持客观基于文档内容推断""" - 保持客观基于文档内容推断
- **只输出合法的 JSON 格式不要输出思考过程解释或任何额外文本**
- 确保 JSON 格式完整闭合所有字符串用双引号包裹
- 如果无法推断某字段可省略该字段但整体 JSON 必须完整闭合"""
class MetadataAutoInferenceService: class MetadataAutoInferenceService:
@ -98,9 +124,9 @@ class MetadataAutoInferenceService:
def __init__( def __init__(
self, self,
session: AsyncSession, session: AsyncSession,
model: str | None = None, model: str | None = "glm-4.7",
max_tokens: int = 1024, max_tokens: int = 1024,
timeout_seconds: int = 60, timeout_seconds: int = 180,
): ):
self._session = session self._session = session
self._model = model self._model = model
@ -148,9 +174,12 @@ class MetadataAutoInferenceService:
error_message="No field definitions configured", error_message="No field definitions configured",
) )
logger.info(f"[MetadataAutoInference] Found {len(field_definitions)} field definitions: {[f.field_key for f in field_definitions]}")
field_contexts = self._build_field_contexts(field_definitions) field_contexts = self._build_field_contexts(field_definitions)
if not field_contexts: if not field_contexts:
logger.warning(f"[MetadataAutoInference] No field contexts built from definitions")
return AutoInferenceResult( return AutoInferenceResult(
inferred_metadata=existing_metadata or {}, inferred_metadata=existing_metadata or {},
confidence_scores={}, confidence_scores={},
@ -159,15 +188,26 @@ class MetadataAutoInferenceService:
) )
user_prompt = self._build_user_prompt(content, field_contexts, existing_metadata) user_prompt = self._build_user_prompt(content, field_contexts, existing_metadata)
logger.info(f"[MetadataAutoInference] === SYSTEM PROMPT ===\n{METADATA_INFERENCE_SYSTEM_PROMPT}\n=== END SYSTEM PROMPT ===")
logger.info(f"[MetadataAutoInference] === USER PROMPT ===\n{user_prompt}\n=== END USER PROMPT ===")
try: try:
if image_base64 and mime_type: if image_base64 and mime_type:
raw_response = await self._call_multimodal_llm( logger.info(f"[MetadataAutoInference] Using multimodal LLM for image recognition, mime_type={mime_type}")
user_prompt, image_base64, mime_type image_content = await self._recognize_image(image_base64, mime_type)
) logger.info(f"[MetadataAutoInference] Image recognition result (first 200 chars): {image_content[:200] if image_content else 'EMPTY'}")
combined_content = f"{content}\n\n[图片识别内容]\n{image_content}" if content else f"[图片识别内容]\n{image_content}"
user_prompt = self._build_user_prompt(combined_content, field_contexts, existing_metadata)
logger.info(f"[MetadataAutoInference] === USER PROMPT (with image content) ===\n{user_prompt}\n=== END USER PROMPT ===")
raw_response = await self._call_text_llm(user_prompt)
else: else:
raw_response = await self._call_text_llm(user_prompt) raw_response = await self._call_text_llm(user_prompt)
logger.info(f"[MetadataAutoInference] LLM response length: {len(raw_response) if raw_response else 0}")
logger.info(f"[MetadataAutoInference] LLM response (first 300 chars): {raw_response[:300] if raw_response else 'EMPTY'}")
result = self._parse_llm_response(raw_response, field_contexts) result = self._parse_llm_response(raw_response, field_contexts)
if existing_metadata: if existing_metadata:
@ -331,12 +371,15 @@ class MetadataAutoInferenceService:
return prompt return prompt
async def _call_text_llm(self, prompt: str) -> str: async def _call_text_llm(self, prompt: str) -> str:
"""调用文本 LLM""" """调用文本 LLM 进行元数据推断(使用对话模型配置,支持 JSON 格式化输出)"""
manager = get_llm_config_manager() manager = get_llm_config_manager()
client = manager.get_kb_processing_client() client = manager.get_chat_client()
config = manager.kb_processing_config config = manager.chat_config
model = self._model or config.get("model", "gpt-4o-mini") model = config.get("model")
if not model:
logger.warning("[MetadataAutoInference] No model configured in chat config, using default")
model = self._model
from app.services.llm.base import LLMConfig from app.services.llm.base import LLMConfig
@ -345,8 +388,13 @@ class MetadataAutoInferenceService:
max_tokens=self._max_tokens, max_tokens=self._max_tokens,
temperature=0.3, temperature=0.3,
timeout_seconds=self._timeout_seconds, timeout_seconds=self._timeout_seconds,
extra_params={
"response_format": {"type": "json_object"},
},
) )
logger.info(f"[MetadataAutoInference] Using model: {model} for text LLM")
messages = [ messages = [
{"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT}, {"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT},
{"role": "user", "content": prompt}, {"role": "user", "content": prompt},
@ -355,34 +403,45 @@ class MetadataAutoInferenceService:
response = await client.generate(messages=messages, config=llm_config) response = await client.generate(messages=messages, config=llm_config)
return response.content or "" return response.content or ""
async def _call_multimodal_llm( async def _recognize_image(self, image_base64: str, mime_type: str) -> str:
self, """
prompt: str, 使用多模态模型识别图片内容
image_base64: str, 这个方法只负责图片内容识别不负责 JSON 格式化输出
mime_type: str, 使用对话模型配置
) -> str: """
"""调用多模态 LLM"""
manager = get_llm_config_manager() manager = get_llm_config_manager()
client = manager.get_kb_processing_client() client = manager.get_chat_client()
config = manager.kb_processing_config config = manager.chat_config
model = self._model or config.get("model", "gpt-4o-mini") model = config.get("model")
if not model:
logger.warning("[MetadataAutoInference] No model configured in chat config for image recognition")
model = self._model
from app.services.llm.base import LLMConfig from app.services.llm.base import LLMConfig
llm_config = LLMConfig( llm_config = LLMConfig(
model=model, model=model,
max_tokens=self._max_tokens, max_tokens=1024,
temperature=0.3, temperature=0.3,
timeout_seconds=self._timeout_seconds, timeout_seconds=self._timeout_seconds,
) )
logger.info(f"[MetadataAutoInference] Using model: {model} for image recognition")
recognition_prompt = """请仔细分析这张图片,提取其中的关键信息,包括:
1. 图片类型文档截图图表照片示意图等
2. 图片中的文字内容如有
3. 图片的主要内容和主题
4. 任何可见的数据数字或关键信息
请用简洁的中文描述图片内容"""
messages = [ messages = [
{"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT},
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": prompt}, {"type": "text", "text": recognition_prompt},
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
@ -396,12 +455,29 @@ class MetadataAutoInferenceService:
response = await client.generate(messages=messages, config=llm_config) response = await client.generate(messages=messages, config=llm_config)
return response.content or "" return response.content or ""
async def _call_multimodal_llm(
self,
prompt: str,
image_base64: str,
mime_type: str,
) -> str:
"""
调用多模态 LLM已弃用保留向后兼容
推荐使用 _recognize_image + _call_text_llm 组合
"""
logger.warning("[MetadataAutoInference] _call_multimodal_llm is deprecated, use _recognize_image + _call_text_llm instead")
image_content = await self._recognize_image(image_base64, mime_type)
combined_prompt = f"{prompt}\n\n[图片识别内容]\n{image_content}"
return await self._call_text_llm(combined_prompt)
def _parse_llm_response( def _parse_llm_response(
self, self,
response: str, response: str,
field_contexts: list[InferenceFieldContext], field_contexts: list[InferenceFieldContext],
) -> AutoInferenceResult: ) -> AutoInferenceResult:
"""解析 LLM 响应""" """解析 LLM 响应"""
json_str = ""
try: try:
json_str = self._extract_json(response) json_str = self._extract_json(response)
data = json.loads(json_str) data = json.loads(json_str)
@ -410,19 +486,30 @@ class MetadataAutoInferenceService:
confidence_scores = data.get("confidence_scores", {}) confidence_scores = data.get("confidence_scores", {})
field_map = {ctx.field_key: ctx for ctx in field_contexts} field_map = {ctx.field_key: ctx for ctx in field_contexts}
label_to_key = {ctx.label: ctx.field_key for ctx in field_contexts}
validated_metadata = {} validated_metadata = {}
validated_scores = {} validated_scores = {}
for field_key, value in inferred_metadata.items(): for field_key_or_label, value in inferred_metadata.items():
if field_key not in field_map: actual_field_key = field_key_or_label
continue
if field_key_or_label not in field_map:
if field_key_or_label in label_to_key:
actual_field_key = label_to_key[field_key_or_label]
else:
logger.warning(f"[MetadataAutoInference] Unknown field: {field_key_or_label}")
continue
ctx = field_map[field_key] ctx = field_map[actual_field_key]
validated_value = self._validate_field_value(ctx, value) validated_value = self._validate_field_value(ctx, value)
if validated_value is not None: if validated_value is not None:
validated_metadata[field_key] = validated_value validated_metadata[actual_field_key] = validated_value
validated_scores[field_key] = confidence_scores.get(field_key, 0.5) validated_scores[actual_field_key] = confidence_scores.get(field_key_or_label, 0.5)
logger.info(f"[MetadataAutoInference] Validated metadata: {validated_metadata}")
logger.info(f"[MetadataAutoInference] Confidence scores: {validated_scores}")
return AutoInferenceResult( return AutoInferenceResult(
inferred_metadata=validated_metadata, inferred_metadata=validated_metadata,
@ -433,6 +520,8 @@ class MetadataAutoInferenceService:
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
logger.warning(f"[MetadataAutoInference] Failed to parse JSON: {e}") logger.warning(f"[MetadataAutoInference] Failed to parse JSON: {e}")
logger.warning(f"[MetadataAutoInference] Raw LLM response (first 500 chars): {response[:500] if response else 'EMPTY'}")
logger.warning(f"[MetadataAutoInference] Extracted JSON string: {json_str[:500] if json_str else 'EMPTY'}")
return AutoInferenceResult( return AutoInferenceResult(
inferred_metadata={}, inferred_metadata={},
confidence_scores={}, confidence_scores={},
@ -482,15 +571,32 @@ class MetadataAutoInferenceService:
def _extract_json(self, content: str) -> str: def _extract_json(self, content: str) -> str:
"""从响应中提取 JSON""" """从响应中提取 JSON"""
import re
content = content.strip() content = content.strip()
if content.startswith("```"):
content = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", content)
content = re.sub(r"\s*```$", "", content).strip()
if content.startswith("{") and content.endswith("}"): if content.startswith("{") and content.endswith("}"):
return content return content
json_start = content.find("{") json_start = content.find("{")
json_end = content.rfind("}") if json_start == -1:
return content
if json_start != -1 and json_end != -1 and json_end > json_start: json_str = content[json_start:]
return content[json_start:json_end + 1]
return content open_braces = json_str.count("{")
close_braces = json_str.count("}")
if close_braces < open_braces:
json_str += "}" * (open_braces - close_braces)
json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
json_end = json_str.rfind("}")
if json_end != -1:
json_str = json_str[:json_end + 1]
return json_str

View File

@ -226,6 +226,8 @@ class MetadataFieldDefinitionService:
if field_update.label is not None: if field_update.label is not None:
field.label = field_update.label field.label = field_update.label
if field_update.type is not None:
field.type = field_update.type
if field_update.required is not None: if field_update.required is not None:
field.required = field_update.required field.required = field_update.required
if field_update.options is not None: if field_update.options is not None:
@ -239,6 +241,8 @@ class MetadataFieldDefinitionService:
field.is_filterable = field_update.is_filterable field.is_filterable = field_update.is_filterable
if field_update.is_rank_feature is not None: if field_update.is_rank_feature is not None:
field.is_rank_feature = field_update.is_rank_feature field.is_rank_feature = field_update.is_rank_feature
if field_update.usage_description is not None:
field.usage_description = field_update.usage_description
# [AC-MRS-01] 修复:添加 field_roles 更新逻辑 # [AC-MRS-01] 修复:添加 field_roles 更新逻辑
if field_update.field_roles is not None: if field_update.field_roles is not None:
self._validate_field_roles(field_update.field_roles) self._validate_field_roles(field_update.field_roles)