feat: enhance metadata handling and document processing
- Add metadata field to Document type for frontend
- Add type field to MetadataFieldUpdateRequest
- Update KB API with URL decode support for Chinese filenames
- Enhance metadata auto inference service
- Improve metadata field definition service
- Update .gitignore to exclude logs and snapshots
This commit is contained in:
parent
51d8de0621
commit
dd1c6aba14
|
|
@ -163,7 +163,10 @@ cython_debug/
|
|||
# Project specific
|
||||
ai-service/uploads/
|
||||
ai-service/config/
|
||||
ai-service/logs/
|
||||
*.local
|
||||
qdrant_snapshots/
|
||||
*.snapshot
|
||||
|
||||
/.trae/
|
||||
/.claude/
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ export interface Document {
|
|||
kbId: string
|
||||
fileName: string
|
||||
status: string
|
||||
metadata?: Record<string, any>
|
||||
jobId?: string
|
||||
createdAt: string
|
||||
updatedAt: string
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ export interface MetadataFieldCreateRequest {
|
|||
|
||||
export interface MetadataFieldUpdateRequest {
|
||||
label?: string
|
||||
type?: MetadataFieldType
|
||||
required?: boolean
|
||||
options?: string[]
|
||||
default?: string | number | boolean
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
<input
|
||||
ref="fileInputRef"
|
||||
type="file"
|
||||
accept=".txt,.md,.pdf,.doc,.docx,.xls,.xlsx"
|
||||
accept=".txt,.md,.markdown,.pdf,.doc,.docx,.xls,.xlsx,.jpg,.jpeg,.png,.gif,.webp,.bmp,.tiff,.tif"
|
||||
style="display: none"
|
||||
@change="handleFileSelect"
|
||||
/>
|
||||
|
|
@ -298,7 +298,10 @@ const handleFileSelect = (event: Event) => {
|
|||
const file = target.files?.[0]
|
||||
if (!file) return
|
||||
|
||||
const allowedExtensions = ['.txt', '.md', '.pdf', '.doc', '.docx', '.xls', '.xlsx']
|
||||
const allowedExtensions = [
|
||||
'.txt', '.md', '.markdown', '.pdf', '.doc', '.docx', '.xls', '.xlsx',
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.tif'
|
||||
]
|
||||
const ext = '.' + file.name.split('.').pop()?.toLowerCase()
|
||||
|
||||
if (!allowedExtensions.includes(ext)) {
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import hashlib
|
|||
from dataclasses import dataclass
|
||||
from typing import Annotated, Any, Optional
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from urllib.parse import unquote
|
||||
|
||||
import tiktoken
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile
|
||||
|
|
@ -64,6 +65,7 @@ class TextChunk:
|
|||
end_token: int
|
||||
page: int | None = None
|
||||
source: str | None = None
|
||||
metadata: dict | None = None
|
||||
|
||||
|
||||
def chunk_text_by_lines(
|
||||
|
|
@ -602,10 +604,14 @@ async def upload_document(
|
|||
|
||||
doc_kb_service = KBService(session)
|
||||
file_content = await file.read()
|
||||
|
||||
# URL decode filename to handle Chinese characters
|
||||
decoded_filename = unquote(file.filename or "unknown")
|
||||
|
||||
document, job = await doc_kb_service.upload_document(
|
||||
tenant_id=tenant_id,
|
||||
kb_id=kb_id,
|
||||
file_name=file.filename or "unknown",
|
||||
file_name=decoded_filename,
|
||||
file_content=file_content,
|
||||
file_type=file.content_type,
|
||||
metadata=metadata_dict,
|
||||
|
|
@ -615,7 +621,7 @@ async def upload_document(
|
|||
await session.commit()
|
||||
|
||||
background_tasks.add_task(
|
||||
_index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, file.filename, metadata_dict
|
||||
_index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, decoded_filename, metadata_dict
|
||||
)
|
||||
|
||||
return JSONResponse(
|
||||
|
|
@ -676,6 +682,7 @@ async def _index_document(
|
|||
logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes")
|
||||
|
||||
text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"}
|
||||
markdown_extensions = {".md", ".markdown"}
|
||||
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"}
|
||||
|
||||
if file_ext in text_extensions or not file_ext:
|
||||
|
|
@ -783,16 +790,6 @@ async def _index_document(
|
|||
|
||||
image_base64_for_inference = None
|
||||
mime_type_for_inference = None
|
||||
if file_ext in image_extensions:
|
||||
import base64
|
||||
image_base64_for_inference = base64.b64encode(content).decode("utf-8")
|
||||
mime_type_map = {
|
||||
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
|
||||
".png": "image/png", ".gif": "image/gif",
|
||||
".webp": "image/webp", ".bmp": "image/bmp",
|
||||
".tiff": "image/tiff", ".tif": "image/tiff",
|
||||
}
|
||||
mime_type_for_inference = mime_type_map.get(file_ext, "image/jpeg")
|
||||
|
||||
logger.info("[INDEX] Starting metadata auto-inference...")
|
||||
inference_result = await inference_service.infer_metadata(
|
||||
|
|
@ -811,6 +808,11 @@ async def _index_document(
|
|||
f"inferred_fields={list(inference_result.inferred_metadata.keys())}, "
|
||||
f"confidence_scores={inference_result.confidence_scores}"
|
||||
)
|
||||
|
||||
document = await kb_service.get_document(tenant_id, doc_id)
|
||||
if document:
|
||||
document.doc_metadata = metadata
|
||||
logger.info(f"[INDEX] Updated document metadata in database: {metadata}")
|
||||
else:
|
||||
logger.warning(
|
||||
f"[INDEX] Metadata inference FAILED: {inference_result.error_message}, "
|
||||
|
|
@ -846,6 +848,31 @@ async def _index_document(
|
|||
pc.page = page.page
|
||||
all_chunks.extend(page_chunks)
|
||||
logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}")
|
||||
elif file_ext in markdown_extensions:
|
||||
logger.info("[INDEX] Markdown file detected, using intelligent chunking")
|
||||
from app.services.document.markdown_chunker import MarkdownChunker, MarkdownElementType
|
||||
chunker = MarkdownChunker(max_chunk_size=1000, min_chunk_size=50)
|
||||
md_chunks = chunker.chunk(text, doc_id=doc_id)
|
||||
|
||||
for i, md_chunk in enumerate(md_chunks):
|
||||
chunk_metadata = {
|
||||
"element_type": md_chunk.element_type.value,
|
||||
"header_context": md_chunk.header_context,
|
||||
"language": md_chunk.language,
|
||||
}
|
||||
chunk_metadata.update(md_chunk.metadata)
|
||||
|
||||
all_chunks.append(TextChunk(
|
||||
text=md_chunk.content,
|
||||
start_token=i,
|
||||
end_token=i + 1,
|
||||
page=None,
|
||||
source=filename,
|
||||
))
|
||||
if all_chunks:
|
||||
all_chunks[-1].metadata = chunk_metadata
|
||||
|
||||
logger.info(f"[INDEX] Total chunks from Markdown: {len(all_chunks)}, element types: {[c.element_type.value for c in md_chunks[:5]]}...")
|
||||
else:
|
||||
logger.info("[INDEX] Using line-based chunking")
|
||||
all_chunks = chunk_text_by_lines(
|
||||
|
|
@ -913,21 +940,13 @@ async def _index_document(
|
|||
|
||||
if points:
|
||||
if settings.kb_vector_log_enabled:
|
||||
vector_payloads = []
|
||||
payloads_only = []
|
||||
for point in points:
|
||||
if use_multi_vector:
|
||||
payload = {
|
||||
"id": point.get("id"),
|
||||
"vector": point.get("vector"),
|
||||
"payload": point.get("payload"),
|
||||
}
|
||||
payload = point.get("payload")
|
||||
else:
|
||||
payload = {
|
||||
"id": point.id,
|
||||
"vector": point.vector,
|
||||
"payload": point.payload,
|
||||
}
|
||||
vector_payloads.append(payload)
|
||||
payload = point.payload
|
||||
payloads_only.append(payload)
|
||||
|
||||
kb_vector_logger.info(json.dumps({
|
||||
"tenant_id": tenant_id,
|
||||
|
|
@ -938,7 +957,8 @@ async def _index_document(
|
|||
"file_ext": file_ext,
|
||||
"is_image": file_ext in image_extensions,
|
||||
"metadata": doc_metadata,
|
||||
"vectors": vector_payloads,
|
||||
"chunk_count": len(payloads_only),
|
||||
"payloads": payloads_only,
|
||||
}, ensure_ascii=False))
|
||||
|
||||
logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...")
|
||||
|
|
|
|||
|
|
@ -7,10 +7,18 @@ import uuid
|
|||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from sqlalchemy import JSON, Column
|
||||
from sqlmodel import Field, Index, SQLModel
|
||||
|
||||
SHANGHAI_TZ = ZoneInfo("Asia/Shanghai")
|
||||
|
||||
|
||||
def beijing_now() -> datetime:
    """Return the current Asia/Shanghai wall-clock time as a naive datetime.

    The current instant is taken in the ``SHANGHAI_TZ`` zone and the
    timezone info is then stripped, so the result carries Shanghai local
    date/time values but has ``tzinfo`` set to ``None``.

    Returns:
        A naive ``datetime`` holding the current Shanghai local time.
    """
    aware_now = datetime.now(tz=SHANGHAI_TZ)
    # Drop the zone so the value is naive, as the original helper returned.
    return aware_now.replace(tzinfo=None)
|
||||
|
||||
|
||||
class ChatSession(SQLModel, table=True):
|
||||
"""
|
||||
|
|
@ -32,8 +40,8 @@ class ChatSession(SQLModel, table=True):
|
|||
sa_column=Column("metadata", JSON, nullable=True),
|
||||
description="Session metadata"
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Session creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Session creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class ChatMessage(SQLModel, table=True):
|
||||
|
|
@ -65,7 +73,7 @@ class ChatMessage(SQLModel, table=True):
|
|||
first_token_ms: int | None = Field(default=None, description="Time to first token in milliseconds (for streaming)")
|
||||
is_error: bool = Field(default=False, description="Whether this message is an error response")
|
||||
error_message: str | None = Field(default=None, description="Error message if any")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Message creation time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Message creation time")
|
||||
|
||||
prompt_template_id: uuid.UUID | None = Field(
|
||||
default=None,
|
||||
|
|
@ -150,8 +158,8 @@ class UserMemory(SQLModel, table=True):
|
|||
summary_version: int = Field(default=1, description="Summary version / update round")
|
||||
last_turn_id: str | None = Field(default=None, description="Last turn identifier (optional)")
|
||||
expires_at: datetime | None = Field(default=None, description="Expiration time (optional)")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class SharedSession(SQLModel, table=True):
|
||||
|
|
@ -177,8 +185,8 @@ class SharedSession(SQLModel, table=True):
|
|||
is_active: bool = Field(default=True, description="Whether share is active")
|
||||
max_concurrent_users: int = Field(default=10, description="Maximum concurrent users allowed")
|
||||
current_users: int = Field(default=0, description="Current number of online users")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class DocumentStatus(str, Enum):
|
||||
|
|
@ -213,8 +221,8 @@ class Tenant(SQLModel, table=True):
|
|||
tenant_id: str = Field(..., description="Full tenant ID (format: name@ash@year)", unique=True, index=True)
|
||||
name: str = Field(..., description="Tenant display name (first part of tenant_id)")
|
||||
year: str = Field(..., description="Year part from tenant_id")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class KBType(str, Enum):
|
||||
|
|
@ -247,8 +255,8 @@ class KnowledgeBase(SQLModel, table=True):
|
|||
priority: int = Field(default=0, ge=0, description="Priority weight, higher value means higher priority")
|
||||
is_enabled: bool = Field(default=True, description="Whether the knowledge base is enabled")
|
||||
doc_count: int = Field(default=0, ge=0, description="Document count (cached statistic)")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class Document(SQLModel, table=True):
|
||||
|
|
@ -272,8 +280,8 @@ class Document(SQLModel, table=True):
|
|||
status: str = Field(default=DocumentStatus.PENDING.value, description="Document status")
|
||||
error_msg: str | None = Field(default=None, description="Error message if failed")
|
||||
doc_metadata: dict | None = Field(default=None, sa_type=JSON, description="Document metadata as JSON")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Upload time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Upload time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class IndexJob(SQLModel, table=True):
|
||||
|
|
@ -293,8 +301,8 @@ class IndexJob(SQLModel, table=True):
|
|||
status: str = Field(default=IndexJobStatus.PENDING.value, description="Job status")
|
||||
progress: int = Field(default=0, ge=0, le=100, description="Progress percentage")
|
||||
error_msg: str | None = Field(default=None, description="Error message if failed")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Job creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Job creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class KnowledgeBaseCreate(SQLModel):
|
||||
|
|
@ -346,8 +354,8 @@ class ApiKey(SQLModel, table=True):
|
|||
description="Optional IP allowlist for this key",
|
||||
)
|
||||
rate_limit_qpm: int | None = Field(default=60, description="Per-minute quota for this key")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class ApiKeyCreate(SQLModel):
|
||||
|
|
@ -390,8 +398,8 @@ class PromptTemplate(SQLModel, table=True):
|
|||
sa_column=Column("metadata", JSON, nullable=True),
|
||||
description="[AC-IDSMETA-16] Structured metadata for the prompt template"
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class PromptTemplateVersion(SQLModel, table=True):
|
||||
|
|
@ -426,7 +434,7 @@ class PromptTemplateVersion(SQLModel, table=True):
|
|||
sa_column=Column("variables", JSON, nullable=True),
|
||||
description="Variable definitions, e.g., [{'name': 'persona_name', 'default': '小N'}]"
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
|
||||
|
||||
class PromptTemplateCreate(SQLModel):
|
||||
|
|
@ -514,8 +522,8 @@ class IntentRule(SQLModel, table=True):
|
|||
sa_column=Column("semantic_examples", JSON, nullable=True),
|
||||
description="[v0.8.0] Semantic example sentences for dynamic vector computation"
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class IntentRuleCreate(SQLModel):
|
||||
|
|
@ -614,8 +622,8 @@ class ForbiddenWord(SQLModel, table=True):
|
|||
fallback_reply: str | None = Field(default=None, description="Fallback reply for 'block' strategy")
|
||||
is_enabled: bool = Field(default=True, description="Whether the word is enabled")
|
||||
hit_count: int = Field(default=0, ge=0, description="Hit count for statistics")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class ForbiddenWordCreate(SQLModel):
|
||||
|
|
@ -666,8 +674,8 @@ class BehaviorRule(SQLModel, table=True):
|
|||
)
|
||||
category: str = Field(..., description="Category: compliance/tone/boundary/custom")
|
||||
is_enabled: bool = Field(default=True, description="Whether the rule is enabled")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class BehaviorRuleCreate(SQLModel):
|
||||
|
|
@ -779,8 +787,8 @@ class ScriptFlow(SQLModel, table=True):
|
|||
sa_column=Column("metadata", JSON, nullable=True),
|
||||
description="[AC-IDSMETA-16] Structured metadata for the script flow"
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Creation time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
|
||||
|
||||
class FlowInstance(SQLModel, table=True):
|
||||
|
|
@ -814,8 +822,8 @@ class FlowInstance(SQLModel, table=True):
|
|||
sa_column=Column("context", JSON, nullable=True),
|
||||
description="Flow execution context, stores user inputs"
|
||||
)
|
||||
started_at: datetime = Field(default_factory=datetime.utcnow, description="Instance start time")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time")
|
||||
started_at: datetime = Field(default_factory=beijing_now, description="Instance start time")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="Last update time")
|
||||
completed_at: datetime | None = Field(default=None, description="Completion time (nullable)")
|
||||
|
||||
|
||||
|
|
@ -981,7 +989,7 @@ class FlowTestRecord(SQLModel, table=True):
|
|||
description="Final ChatResponse with reply, confidence, should_transfer"
|
||||
)
|
||||
total_duration_ms: int | None = Field(default=None, description="Total execution time in milliseconds")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Record creation time", index=True)
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Record creation time", index=True)
|
||||
|
||||
|
||||
class FlowTestStepResult(SQLModel):
|
||||
|
|
@ -1034,7 +1042,7 @@ class ExportTask(SQLModel, table=True):
|
|||
)
|
||||
error_message: str | None = Field(default=None, description="Error message if failed")
|
||||
expires_at: datetime | None = Field(default=None, description="File expiration time (for cleanup)")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="Task creation time")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="Task creation time")
|
||||
completed_at: datetime | None = Field(default=None, description="Completion time")
|
||||
|
||||
|
||||
|
|
@ -1167,8 +1175,8 @@ class MetadataFieldDefinition(SQLModel, table=True):
|
|||
description="字段状态: draft/active/deprecated"
|
||||
)
|
||||
version: int = Field(default=1, description="版本号")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
|
||||
class MetadataFieldDefinitionCreate(SQLModel):
|
||||
|
|
@ -1299,8 +1307,8 @@ class SlotDefinition(SQLModel, table=True):
|
|||
description="关联的元数据字段 ID",
|
||||
foreign_key="metadata_field_definitions.id",
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
def get_effective_strategies(self) -> list[str]:
|
||||
"""
|
||||
|
|
@ -1423,7 +1431,7 @@ class SlotValue(SQLModel):
|
|||
description="置信度 0.0~1.0"
|
||||
)
|
||||
updated_at: datetime = Field(
|
||||
default_factory=datetime.utcnow,
|
||||
default_factory=beijing_now,
|
||||
description="最后更新时间"
|
||||
)
|
||||
|
||||
|
|
@ -1450,8 +1458,8 @@ class MetadataSchema(SQLModel, table=True):
|
|||
)
|
||||
is_default: bool = Field(default=False, description="是否为租户默认模式")
|
||||
is_enabled: bool = Field(default=True, description="是否启用")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
|
||||
class MetadataSchemaCreate(SQLModel):
|
||||
|
|
@ -1530,8 +1538,8 @@ class DecompositionTemplate(SQLModel, table=True):
|
|||
sa_column=Column("example_output", JSON, nullable=True),
|
||||
description="示例输出 JSON"
|
||||
)
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
|
||||
class DecompositionTemplateCreate(SQLModel):
|
||||
|
|
@ -1625,8 +1633,8 @@ class HighRiskPolicy(SQLModel, table=True):
|
|||
)
|
||||
priority: int = Field(default=0, description="优先级 (值越高优先级越高)")
|
||||
is_enabled: bool = Field(default=True, description="是否启用")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
|
||||
class HighRiskPolicyCreate(SQLModel):
|
||||
|
|
@ -1673,8 +1681,8 @@ class SessionModeRecord(SQLModel, table=True):
|
|||
)
|
||||
reason: str | None = Field(default=None, description="模式切换原因")
|
||||
switched_at: datetime | None = Field(default=None, description="模式切换时间")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
|
||||
class MidAuditLog(SQLModel, table=True):
|
||||
|
|
@ -1708,7 +1716,7 @@ class MidAuditLog(SQLModel, table=True):
|
|||
react_iterations: int | None = Field(default=None, description="ReAct循环次数")
|
||||
high_risk_scenario: str | None = Field(default=None, description="触发的高风险场景")
|
||||
latency_ms: int | None = Field(default=None, description="总耗时(ms)")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间", index=True)
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间", index=True)
|
||||
|
||||
|
||||
class SceneSlotBundleStatus(str, Enum):
|
||||
|
|
@ -1784,8 +1792,8 @@ class SceneSlotBundle(SQLModel, table=True):
|
|||
description="状态: draft/active/deprecated"
|
||||
)
|
||||
version: int = Field(default=1, description="版本号")
|
||||
created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间")
|
||||
created_at: datetime = Field(default_factory=beijing_now, description="创建时间")
|
||||
updated_at: datetime = Field(default_factory=beijing_now, description="更新时间")
|
||||
|
||||
|
||||
class SceneSlotBundleCreate(SQLModel):
|
||||
|
|
|
|||
|
|
@ -107,12 +107,14 @@ class MetadataFieldDefinitionUpdateRequest(BaseModel):
|
|||
"""[AC-MRS-01] 更新元数据字段定义请求"""
|
||||
|
||||
label: str | None = Field(default=None, min_length=1, max_length=64)
|
||||
type: str | None = Field(default=None, description="字段类型")
|
||||
required: bool | None = None
|
||||
options: list[str] | None = None
|
||||
default_value: Any | None = None
|
||||
scope: list[str] | None = None
|
||||
is_filterable: bool | None = None
|
||||
is_rank_feature: bool | None = None
|
||||
usage_description: str | None = Field(default=None, description="用途说明")
|
||||
field_roles: list[str] | None = Field(
|
||||
default=None,
|
||||
description="[AC-MRS-01] 字段角色列表"
|
||||
|
|
|
|||
|
|
@ -295,7 +295,7 @@ class OpenAIClient(LLMClient):
|
|||
role = msg.get("role", "unknown")
|
||||
content = msg.get("content", "")
|
||||
logger.info(f"[AC-AISVC-06] [{i}] role={role}, content_length={len(content)}")
|
||||
logger.info(f"[AC-AISVC-06] [{i}] content:\n{content}")
|
||||
|
||||
logger.info("[AC-AISVC-06] ======================================")
|
||||
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -42,10 +42,10 @@ class AutoInferenceResult:
|
|||
error_message: str | None = None
|
||||
|
||||
|
||||
METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段。
|
||||
METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段,并以 JSON 格式输出结果。
|
||||
|
||||
## 输出要求
|
||||
请严格按照以下 JSON 格式输出,不要添加任何其他内容:
|
||||
你必须严格按照以下 JSON 格式输出,不要添加任何其他内容:
|
||||
|
||||
```json
|
||||
{
|
||||
|
|
@ -60,6 +60,29 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析
|
|||
}
|
||||
```
|
||||
|
||||
## JSON 输出样例
|
||||
假设有以下字段定义:
|
||||
- grade (枚举类型,选项:小学、初中、高中)
|
||||
- subject (枚举类型,选项:语文、数学、英语)
|
||||
- difficulty (数字类型,1-5)
|
||||
|
||||
如果文档内容是关于"高一数学函数知识点",则输出:
|
||||
|
||||
```json
|
||||
{
|
||||
"inferred_metadata": {
|
||||
"grade": "高中",
|
||||
"subject": "数学",
|
||||
"difficulty": 3
|
||||
},
|
||||
"confidence_scores": {
|
||||
"grade": 0.95,
|
||||
"subject": 0.95,
|
||||
"difficulty": 0.7
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 推断规则
|
||||
1. **仔细分析文档内容**:根据文档的主题、关键词、上下文来推断元数据
|
||||
2. **遵循字段定义**:
|
||||
|
|
@ -78,7 +101,10 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析
|
|||
## 注意事项
|
||||
- 必须严格按照字段定义的类型和选项填写
|
||||
- 不要编造不存在的选项值
|
||||
- 保持客观,基于文档内容推断"""
|
||||
- 保持客观,基于文档内容推断
|
||||
- **只输出合法的 JSON 格式,不要输出思考过程、解释或任何额外文本**
|
||||
- 确保 JSON 格式完整闭合,所有字符串用双引号包裹
|
||||
- 如果无法推断某字段,可省略该字段,但整体 JSON 必须完整闭合"""
|
||||
|
||||
|
||||
class MetadataAutoInferenceService:
|
||||
|
|
@ -98,9 +124,9 @@ class MetadataAutoInferenceService:
|
|||
def __init__(
|
||||
self,
|
||||
session: AsyncSession,
|
||||
model: str | None = None,
|
||||
model: str | None = "glm-4.7",
|
||||
max_tokens: int = 1024,
|
||||
timeout_seconds: int = 60,
|
||||
timeout_seconds: int = 180,
|
||||
):
|
||||
self._session = session
|
||||
self._model = model
|
||||
|
|
@ -148,9 +174,12 @@ class MetadataAutoInferenceService:
|
|||
error_message="No field definitions configured",
|
||||
)
|
||||
|
||||
logger.info(f"[MetadataAutoInference] Found {len(field_definitions)} field definitions: {[f.field_key for f in field_definitions]}")
|
||||
|
||||
field_contexts = self._build_field_contexts(field_definitions)
|
||||
|
||||
if not field_contexts:
|
||||
logger.warning(f"[MetadataAutoInference] No field contexts built from definitions")
|
||||
return AutoInferenceResult(
|
||||
inferred_metadata=existing_metadata or {},
|
||||
confidence_scores={},
|
||||
|
|
@ -159,15 +188,26 @@ class MetadataAutoInferenceService:
|
|||
)
|
||||
|
||||
user_prompt = self._build_user_prompt(content, field_contexts, existing_metadata)
|
||||
logger.info(f"[MetadataAutoInference] === SYSTEM PROMPT ===\n{METADATA_INFERENCE_SYSTEM_PROMPT}\n=== END SYSTEM PROMPT ===")
|
||||
logger.info(f"[MetadataAutoInference] === USER PROMPT ===\n{user_prompt}\n=== END USER PROMPT ===")
|
||||
|
||||
try:
|
||||
if image_base64 and mime_type:
|
||||
raw_response = await self._call_multimodal_llm(
|
||||
user_prompt, image_base64, mime_type
|
||||
)
|
||||
logger.info(f"[MetadataAutoInference] Using multimodal LLM for image recognition, mime_type={mime_type}")
|
||||
image_content = await self._recognize_image(image_base64, mime_type)
|
||||
logger.info(f"[MetadataAutoInference] Image recognition result (first 200 chars): {image_content[:200] if image_content else 'EMPTY'}")
|
||||
|
||||
combined_content = f"{content}\n\n[图片识别内容]\n{image_content}" if content else f"[图片识别内容]\n{image_content}"
|
||||
user_prompt = self._build_user_prompt(combined_content, field_contexts, existing_metadata)
|
||||
logger.info(f"[MetadataAutoInference] === USER PROMPT (with image content) ===\n{user_prompt}\n=== END USER PROMPT ===")
|
||||
|
||||
raw_response = await self._call_text_llm(user_prompt)
|
||||
else:
|
||||
raw_response = await self._call_text_llm(user_prompt)
|
||||
|
||||
logger.info(f"[MetadataAutoInference] LLM response length: {len(raw_response) if raw_response else 0}")
|
||||
logger.info(f"[MetadataAutoInference] LLM response (first 300 chars): {raw_response[:300] if raw_response else 'EMPTY'}")
|
||||
|
||||
result = self._parse_llm_response(raw_response, field_contexts)
|
||||
|
||||
if existing_metadata:
|
||||
|
|
@ -331,12 +371,15 @@ class MetadataAutoInferenceService:
|
|||
return prompt
|
||||
|
||||
async def _call_text_llm(self, prompt: str) -> str:
|
||||
"""调用文本 LLM"""
|
||||
"""调用文本 LLM 进行元数据推断(使用对话模型配置,支持 JSON 格式化输出)"""
|
||||
manager = get_llm_config_manager()
|
||||
client = manager.get_kb_processing_client()
|
||||
client = manager.get_chat_client()
|
||||
|
||||
config = manager.kb_processing_config
|
||||
model = self._model or config.get("model", "gpt-4o-mini")
|
||||
config = manager.chat_config
|
||||
model = config.get("model")
|
||||
if not model:
|
||||
logger.warning("[MetadataAutoInference] No model configured in chat config, using default")
|
||||
model = self._model
|
||||
|
||||
from app.services.llm.base import LLMConfig
|
||||
|
||||
|
|
@ -345,8 +388,13 @@ class MetadataAutoInferenceService:
|
|||
max_tokens=self._max_tokens,
|
||||
temperature=0.3,
|
||||
timeout_seconds=self._timeout_seconds,
|
||||
extra_params={
|
||||
"response_format": {"type": "json_object"},
|
||||
},
|
||||
)
|
||||
|
||||
logger.info(f"[MetadataAutoInference] Using model: {model} for text LLM")
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": prompt},
|
||||
|
|
@ -355,34 +403,45 @@ class MetadataAutoInferenceService:
|
|||
response = await client.generate(messages=messages, config=llm_config)
|
||||
return response.content or ""
|
||||
|
||||
async def _call_multimodal_llm(
|
||||
self,
|
||||
prompt: str,
|
||||
image_base64: str,
|
||||
mime_type: str,
|
||||
) -> str:
|
||||
"""调用多模态 LLM"""
|
||||
async def _recognize_image(self, image_base64: str, mime_type: str) -> str:
|
||||
"""
|
||||
使用多模态模型识别图片内容。
|
||||
这个方法只负责图片内容识别,不负责 JSON 格式化输出。
|
||||
使用对话模型配置。
|
||||
"""
|
||||
manager = get_llm_config_manager()
|
||||
client = manager.get_kb_processing_client()
|
||||
client = manager.get_chat_client()
|
||||
|
||||
config = manager.kb_processing_config
|
||||
model = self._model or config.get("model", "gpt-4o-mini")
|
||||
config = manager.chat_config
|
||||
model = config.get("model")
|
||||
if not model:
|
||||
logger.warning("[MetadataAutoInference] No model configured in chat config for image recognition")
|
||||
model = self._model
|
||||
|
||||
from app.services.llm.base import LLMConfig
|
||||
|
||||
llm_config = LLMConfig(
|
||||
model=model,
|
||||
max_tokens=self._max_tokens,
|
||||
max_tokens=1024,
|
||||
temperature=0.3,
|
||||
timeout_seconds=self._timeout_seconds,
|
||||
)
|
||||
|
||||
logger.info(f"[MetadataAutoInference] Using model: {model} for image recognition")
|
||||
|
||||
recognition_prompt = """请仔细分析这张图片,提取其中的关键信息,包括:
|
||||
1. 图片类型(如:文档截图、图表、照片、示意图等)
|
||||
2. 图片中的文字内容(如有)
|
||||
3. 图片的主要内容和主题
|
||||
4. 任何可见的数据、数字或关键信息
|
||||
|
||||
请用简洁的中文描述图片内容:"""
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "text", "text": recognition_prompt},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
|
|
@ -396,12 +455,29 @@ class MetadataAutoInferenceService:
|
|||
response = await client.generate(messages=messages, config=llm_config)
|
||||
return response.content or ""
|
||||
|
||||
async def _call_multimodal_llm(
|
||||
self,
|
||||
prompt: str,
|
||||
image_base64: str,
|
||||
mime_type: str,
|
||||
) -> str:
|
||||
"""
|
||||
调用多模态 LLM(已弃用,保留向后兼容)。
|
||||
推荐使用 _recognize_image + _call_text_llm 组合。
|
||||
"""
|
||||
logger.warning("[MetadataAutoInference] _call_multimodal_llm is deprecated, use _recognize_image + _call_text_llm instead")
|
||||
|
||||
image_content = await self._recognize_image(image_base64, mime_type)
|
||||
combined_prompt = f"{prompt}\n\n[图片识别内容]\n{image_content}"
|
||||
return await self._call_text_llm(combined_prompt)
|
||||
|
||||
def _parse_llm_response(
|
||||
self,
|
||||
response: str,
|
||||
field_contexts: list[InferenceFieldContext],
|
||||
) -> AutoInferenceResult:
|
||||
"""解析 LLM 响应"""
|
||||
json_str = ""
|
||||
try:
|
||||
json_str = self._extract_json(response)
|
||||
data = json.loads(json_str)
|
||||
|
|
@ -410,19 +486,30 @@ class MetadataAutoInferenceService:
|
|||
confidence_scores = data.get("confidence_scores", {})
|
||||
|
||||
field_map = {ctx.field_key: ctx for ctx in field_contexts}
|
||||
label_to_key = {ctx.label: ctx.field_key for ctx in field_contexts}
|
||||
|
||||
validated_metadata = {}
|
||||
validated_scores = {}
|
||||
|
||||
for field_key, value in inferred_metadata.items():
|
||||
if field_key not in field_map:
|
||||
for field_key_or_label, value in inferred_metadata.items():
|
||||
actual_field_key = field_key_or_label
|
||||
|
||||
if field_key_or_label not in field_map:
|
||||
if field_key_or_label in label_to_key:
|
||||
actual_field_key = label_to_key[field_key_or_label]
|
||||
else:
|
||||
logger.warning(f"[MetadataAutoInference] Unknown field: {field_key_or_label}")
|
||||
continue
|
||||
|
||||
ctx = field_map[field_key]
|
||||
ctx = field_map[actual_field_key]
|
||||
validated_value = self._validate_field_value(ctx, value)
|
||||
|
||||
if validated_value is not None:
|
||||
validated_metadata[field_key] = validated_value
|
||||
validated_scores[field_key] = confidence_scores.get(field_key, 0.5)
|
||||
validated_metadata[actual_field_key] = validated_value
|
||||
validated_scores[actual_field_key] = confidence_scores.get(field_key_or_label, 0.5)
|
||||
|
||||
logger.info(f"[MetadataAutoInference] Validated metadata: {validated_metadata}")
|
||||
logger.info(f"[MetadataAutoInference] Confidence scores: {validated_scores}")
|
||||
|
||||
return AutoInferenceResult(
|
||||
inferred_metadata=validated_metadata,
|
||||
|
|
@ -433,6 +520,8 @@ class MetadataAutoInferenceService:
|
|||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"[MetadataAutoInference] Failed to parse JSON: {e}")
|
||||
logger.warning(f"[MetadataAutoInference] Raw LLM response (first 500 chars): {response[:500] if response else 'EMPTY'}")
|
||||
logger.warning(f"[MetadataAutoInference] Extracted JSON string: {json_str[:500] if json_str else 'EMPTY'}")
|
||||
return AutoInferenceResult(
|
||||
inferred_metadata={},
|
||||
confidence_scores={},
|
||||
|
|
@ -482,15 +571,32 @@ class MetadataAutoInferenceService:
|
|||
|
||||
def _extract_json(self, content: str) -> str:
|
||||
"""从响应中提取 JSON"""
|
||||
import re
|
||||
|
||||
content = content.strip()
|
||||
|
||||
if content.startswith("```"):
|
||||
content = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", content)
|
||||
content = re.sub(r"\s*```$", "", content).strip()
|
||||
|
||||
if content.startswith("{") and content.endswith("}"):
|
||||
return content
|
||||
|
||||
json_start = content.find("{")
|
||||
json_end = content.rfind("}")
|
||||
|
||||
if json_start != -1 and json_end != -1 and json_end > json_start:
|
||||
return content[json_start:json_end + 1]
|
||||
|
||||
if json_start == -1:
|
||||
return content
|
||||
|
||||
json_str = content[json_start:]
|
||||
|
||||
open_braces = json_str.count("{")
|
||||
close_braces = json_str.count("}")
|
||||
if close_braces < open_braces:
|
||||
json_str += "}" * (open_braces - close_braces)
|
||||
|
||||
json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
|
||||
|
||||
json_end = json_str.rfind("}")
|
||||
if json_end != -1:
|
||||
json_str = json_str[:json_end + 1]
|
||||
|
||||
return json_str
|
||||
|
|
|
|||
|
|
@ -226,6 +226,8 @@ class MetadataFieldDefinitionService:
|
|||
|
||||
if field_update.label is not None:
|
||||
field.label = field_update.label
|
||||
if field_update.type is not None:
|
||||
field.type = field_update.type
|
||||
if field_update.required is not None:
|
||||
field.required = field_update.required
|
||||
if field_update.options is not None:
|
||||
|
|
@ -239,6 +241,8 @@ class MetadataFieldDefinitionService:
|
|||
field.is_filterable = field_update.is_filterable
|
||||
if field_update.is_rank_feature is not None:
|
||||
field.is_rank_feature = field_update.is_rank_feature
|
||||
if field_update.usage_description is not None:
|
||||
field.usage_description = field_update.usage_description
|
||||
# [AC-MRS-01] 修复:添加 field_roles 更新逻辑
|
||||
if field_update.field_roles is not None:
|
||||
self._validate_field_roles(field_update.field_roles)
|
||||
|
|
|
|||
Loading…
Reference in New Issue