From dd1c6aba1492c31e8a36be0c26b99b2678aa045d Mon Sep 17 00:00:00 2001 From: MerCry Date: Thu, 12 Mar 2026 12:45:54 +0800 Subject: [PATCH] feat: enhance metadata handling and document processing - Add metadata field to Document type for frontend - Add type field to MetadataFieldUpdateRequest - Update KB API with URL decode support for Chinese filenames - Enhance metadata auto inference service - Improve metadata field definition service - Update .gitignore to exclude logs and snapshots --- .gitignore | 3 + ai-service-admin/src/types/knowledge-base.ts | 1 + ai-service-admin/src/types/metadata.ts | 1 + .../components/DocumentList.vue | 7 +- ai-service/app/api/admin/kb.py | 70 ++++--- ai-service/app/models/entities.py | 104 ++++++----- ai-service/app/schemas/metadata.py | 2 + ai-service/app/services/llm/openai_client.py | 2 +- .../metadata_auto_inference_service.py | 176 ++++++++++++++---- .../metadata_field_definition_service.py | 4 + 10 files changed, 259 insertions(+), 111 deletions(-) diff --git a/.gitignore b/.gitignore index 49497a1..e31bd6e 100644 --- a/.gitignore +++ b/.gitignore @@ -163,7 +163,10 @@ cython_debug/ # Project specific ai-service/uploads/ ai-service/config/ +ai-service/logs/ *.local +qdrant_snapshots/ +*.snapshot /.trae/ /.claude/ diff --git a/ai-service-admin/src/types/knowledge-base.ts b/ai-service-admin/src/types/knowledge-base.ts index 4996373..f73c6f4 100644 --- a/ai-service-admin/src/types/knowledge-base.ts +++ b/ai-service-admin/src/types/knowledge-base.ts @@ -34,6 +34,7 @@ export interface Document { kbId: string fileName: string status: string + metadata?: Record jobId?: string createdAt: string updatedAt: string diff --git a/ai-service-admin/src/types/metadata.ts b/ai-service-admin/src/types/metadata.ts index 901d74e..130c004 100644 --- a/ai-service-admin/src/types/metadata.ts +++ b/ai-service-admin/src/types/metadata.ts @@ -39,6 +39,7 @@ export interface MetadataFieldCreateRequest { export interface MetadataFieldUpdateRequest { label?: string + type?: MetadataFieldType required?: boolean options?: string[] default?: string | number | boolean diff --git a/ai-service-admin/src/views/admin/knowledge-base/components/DocumentList.vue b/ai-service-admin/src/views/admin/knowledge-base/components/DocumentList.vue index 6f87d6a..f70dc68 100644 --- a/ai-service-admin/src/views/admin/knowledge-base/components/DocumentList.vue +++ b/ai-service-admin/src/views/admin/knowledge-base/components/DocumentList.vue @@ -16,7 +16,7 @@ @@ -298,7 +298,10 @@ const handleFileSelect = (event: Event) => { const file = target.files?.[0] if (!file) return - const allowedExtensions = ['.txt', '.md', '.pdf', '.doc', '.docx', '.xls', '.xlsx'] + const allowedExtensions = [ + '.txt', '.md', '.markdown', '.pdf', '.doc', '.docx', '.xls', '.xlsx', + '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.tif' + ] const ext = '.' + file.name.split('.').pop()?.toLowerCase() if (!allowedExtensions.includes(ext)) { diff --git a/ai-service/app/api/admin/kb.py b/ai-service/app/api/admin/kb.py index ec1a00e..9588132 100644 --- a/ai-service/app/api/admin/kb.py +++ b/ai-service/app/api/admin/kb.py @@ -11,6 +11,7 @@ import hashlib from dataclasses import dataclass from typing import Annotated, Any, Optional from logging.handlers import RotatingFileHandler +from urllib.parse import unquote import tiktoken from fastapi import APIRouter, BackgroundTasks, Depends, File, Form, HTTPException, Query, UploadFile @@ -64,6 +65,7 @@ class TextChunk: end_token: int page: int | None = None source: str | None = None + metadata: dict | None = None def chunk_text_by_lines( @@ -602,10 +604,14 @@ async def upload_document( doc_kb_service = KBService(session) file_content = await file.read() + + # URL decode filename to handle Chinese characters + decoded_filename = unquote(file.filename or "unknown") + document, job = await doc_kb_service.upload_document( tenant_id=tenant_id, kb_id=kb_id, - file_name=file.filename or "unknown", + file_name=decoded_filename, file_content=file_content, file_type=file.content_type, metadata=metadata_dict, @@ -615,7 +621,7 @@ async def upload_document( await session.commit() background_tasks.add_task( - _index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, file.filename, metadata_dict + _index_document, tenant_id, kb_id, str(job.id), str(document.id), file_content, decoded_filename, metadata_dict ) return JSONResponse( @@ -676,6 +682,7 @@ async def _index_document( logger.info(f"[INDEX] File extension: {file_ext}, content size: {len(content)} bytes") text_extensions = {".txt", ".md", ".markdown", ".rst", ".log", ".json", ".xml", ".yaml", ".yml"} + markdown_extensions = {".md", ".markdown"} image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif"} if file_ext in text_extensions or not file_ext: @@ -783,16 +790,6 @@ async def _index_document( image_base64_for_inference = None mime_type_for_inference = None - if file_ext in image_extensions: - import base64 - image_base64_for_inference = base64.b64encode(content).decode("utf-8") - mime_type_map = { - ".jpg": "image/jpeg", ".jpeg": "image/jpeg", - ".png": "image/png", ".gif": "image/gif", - ".webp": "image/webp", ".bmp": "image/bmp", - ".tiff": "image/tiff", ".tif": "image/tiff", - } - mime_type_for_inference = mime_type_map.get(file_ext, "image/jpeg") logger.info("[INDEX] Starting metadata auto-inference...") inference_result = await inference_service.infer_metadata( @@ -811,6 +808,11 @@ async def _index_document( f"inferred_fields={list(inference_result.inferred_metadata.keys())}, " f"confidence_scores={inference_result.confidence_scores}" ) + + document = await kb_service.get_document(tenant_id, doc_id) + if document: + document.doc_metadata = metadata + logger.info(f"[INDEX] Updated document metadata in database: {metadata}") else: logger.warning( f"[INDEX] Metadata inference FAILED: {inference_result.error_message}, " @@ -846,6 +848,31 @@ async def _index_document( pc.page = page.page all_chunks.extend(page_chunks) logger.info(f"[INDEX] Total chunks from PDF: {len(all_chunks)}") + elif file_ext in markdown_extensions: + logger.info("[INDEX] Markdown file detected, using intelligent chunking") + from app.services.document.markdown_chunker import MarkdownChunker, MarkdownElementType + chunker = MarkdownChunker(max_chunk_size=1000, min_chunk_size=50) + md_chunks = chunker.chunk(text, doc_id=doc_id) + + for i, md_chunk in enumerate(md_chunks): + chunk_metadata = { + "element_type": md_chunk.element_type.value, + "header_context": md_chunk.header_context, + "language": md_chunk.language, + } + chunk_metadata.update(md_chunk.metadata) + + all_chunks.append(TextChunk( + text=md_chunk.content, + start_token=i, + end_token=i + 1, + page=None, + source=filename, + )) + if all_chunks: + all_chunks[-1].metadata = chunk_metadata + + logger.info(f"[INDEX] Total chunks from Markdown: {len(all_chunks)}, element types: {[c.element_type.value for c in md_chunks[:5]]}...") else: logger.info("[INDEX] Using line-based chunking") all_chunks = chunk_text_by_lines( @@ -913,21 +940,13 @@ async def _index_document( if points: if settings.kb_vector_log_enabled: - vector_payloads = [] + payloads_only = [] for point in points: if use_multi_vector: - payload = { - "id": point.get("id"), - "vector": point.get("vector"), - "payload": point.get("payload"), - } + payload = point.get("payload") else: - payload = { - "id": point.id, - "vector": point.vector, - "payload": point.payload, - } - vector_payloads.append(payload) + payload = point.payload + payloads_only.append(payload) kb_vector_logger.info(json.dumps({ "tenant_id": tenant_id, @@ -938,7 +957,8 @@ async def _index_document( "file_ext": file_ext, "is_image": file_ext in image_extensions, "metadata": doc_metadata, - "vectors": vector_payloads, + "chunk_count": len(payloads_only), + "payloads": payloads_only, }, ensure_ascii=False)) logger.info(f"[INDEX] Upserting {len(points)} vectors to Qdrant for kb_id={kb_id}...") diff --git a/ai-service/app/models/entities.py b/ai-service/app/models/entities.py index 77f9a7d..6dad0f1 100644 --- a/ai-service/app/models/entities.py +++ b/ai-service/app/models/entities.py @@ -7,10 +7,18 @@ import uuid from datetime import datetime from enum import Enum from typing import Any +from zoneinfo import ZoneInfo from sqlalchemy import JSON, Column from sqlmodel import Field, Index, SQLModel +SHANGHAI_TZ = ZoneInfo("Asia/Shanghai") + + +def beijing_now() -> datetime: + """Get current time in Shanghai/Beijing timezone (Asia/Shanghai)""" + return datetime.now(SHANGHAI_TZ).replace(tzinfo=None) + class ChatSession(SQLModel, table=True): """ @@ -32,8 +40,8 @@ class ChatSession(SQLModel, table=True): sa_column=Column("metadata", JSON, nullable=True), description="Session metadata" ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="Session creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Session creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class ChatMessage(SQLModel, table=True): @@ -65,7 +73,7 @@ class ChatMessage(SQLModel, table=True): first_token_ms: int | None = Field(default=None, description="Time to first token in milliseconds (for streaming)") is_error: bool = Field(default=False, description="Whether this message is an error response") error_message: str | None = Field(default=None, description="Error message if any") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Message creation time") + created_at: datetime = Field(default_factory=beijing_now, description="Message creation time") prompt_template_id: uuid.UUID | None = Field( default=None, @@ -150,8 +158,8 @@ class UserMemory(SQLModel, table=True): summary_version: int = Field(default=1, description="Summary version / update round") last_turn_id: str | None = Field(default=None, description="Last turn identifier (optional)") expires_at: datetime | None = Field(default=None, description="Expiration time (optional)") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class SharedSession(SQLModel, table=True): @@ -177,8 +185,8 @@ class SharedSession(SQLModel, table=True): is_active: bool = Field(default=True, description="Whether share is active") max_concurrent_users: int = Field(default=10, description="Maximum concurrent users allowed") current_users: int = Field(default=0, description="Current number of online users") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class DocumentStatus(str, Enum): @@ -213,8 +221,8 @@ class Tenant(SQLModel, table=True): tenant_id: str = Field(..., description="Full tenant ID (format: name@ash@year)", unique=True, index=True) name: str = Field(..., description="Tenant display name (first part of tenant_id)") year: str = Field(..., description="Year part from tenant_id") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class KBType(str, Enum): @@ -247,8 +255,8 @@ class KnowledgeBase(SQLModel, table=True): priority: int = Field(default=0, ge=0, description="Priority weight, higher value means higher priority") is_enabled: bool = Field(default=True, description="Whether the knowledge base is enabled") doc_count: int = Field(default=0, ge=0, description="Document count (cached statistic)") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class Document(SQLModel, table=True): @@ -272,8 +280,8 @@ class Document(SQLModel, table=True): status: str = Field(default=DocumentStatus.PENDING.value, description="Document status") error_msg: str | None = Field(default=None, description="Error message if failed") doc_metadata: dict | None = Field(default=None, sa_type=JSON, description="Document metadata as JSON") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Upload time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Upload time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class IndexJob(SQLModel, table=True): @@ -293,8 +301,8 @@ class IndexJob(SQLModel, table=True): status: str = Field(default=IndexJobStatus.PENDING.value, description="Job status") progress: int = Field(default=0, ge=0, le=100, description="Progress percentage") error_msg: str | None = Field(default=None, description="Error message if failed") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Job creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Job creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class KnowledgeBaseCreate(SQLModel): @@ -346,8 +354,8 @@ class ApiKey(SQLModel, table=True): description="Optional IP allowlist for this key", ) rate_limit_qpm: int | None = Field(default=60, description="Per-minute quota for this key") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class ApiKeyCreate(SQLModel): @@ -390,8 +398,8 @@ class PromptTemplate(SQLModel, table=True): sa_column=Column("metadata", JSON, nullable=True), description="[AC-IDSMETA-16] Structured metadata for the prompt template" ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class PromptTemplateVersion(SQLModel, table=True): @@ -426,7 +434,7 @@ class PromptTemplateVersion(SQLModel, table=True): sa_column=Column("variables", JSON, nullable=True), description="Variable definitions, e.g., [{'name': 'persona_name', 'default': '小N'}]" ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") class PromptTemplateCreate(SQLModel): @@ -514,8 +522,8 @@ class IntentRule(SQLModel, table=True): sa_column=Column("semantic_examples", JSON, nullable=True), description="[v0.8.0] Semantic example sentences for dynamic vector computation" ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class IntentRuleCreate(SQLModel): @@ -614,8 +622,8 @@ class ForbiddenWord(SQLModel, table=True): fallback_reply: str | None = Field(default=None, description="Fallback reply for 'block' strategy") is_enabled: bool = Field(default=True, description="Whether the word is enabled") hit_count: int = Field(default=0, ge=0, description="Hit count for statistics") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class ForbiddenWordCreate(SQLModel): @@ -666,8 +674,8 @@ class BehaviorRule(SQLModel, table=True): ) category: str = Field(..., description="Category: compliance/tone/boundary/custom") is_enabled: bool = Field(default=True, description="Whether the rule is enabled") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class BehaviorRuleCreate(SQLModel): @@ -779,8 +787,8 @@ class ScriptFlow(SQLModel, table=True): sa_column=Column("metadata", JSON, nullable=True), description="[AC-IDSMETA-16] Structured metadata for the script flow" ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + created_at: datetime = Field(default_factory=beijing_now, description="Creation time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") class FlowInstance(SQLModel, table=True): @@ -814,8 +822,8 @@ class FlowInstance(SQLModel, table=True): sa_column=Column("context", JSON, nullable=True), description="Flow execution context, stores user inputs" ) - started_at: datetime = Field(default_factory=datetime.utcnow, description="Instance start time") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="Last update time") + started_at: datetime = Field(default_factory=beijing_now, description="Instance start time") + updated_at: datetime = Field(default_factory=beijing_now, description="Last update time") completed_at: datetime | None = Field(default=None, description="Completion time (nullable)") @@ -981,7 +989,7 @@ class FlowTestRecord(SQLModel, table=True): description="Final ChatResponse with reply, confidence, should_transfer" ) total_duration_ms: int | None = Field(default=None, description="Total execution time in milliseconds") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Record creation time", index=True) + created_at: datetime = Field(default_factory=beijing_now, description="Record creation time", index=True) class FlowTestStepResult(SQLModel): @@ -1034,7 +1042,7 @@ class ExportTask(SQLModel, table=True): ) error_message: str | None = Field(default=None, description="Error message if failed") expires_at: datetime | None = Field(default=None, description="File expiration time (for cleanup)") - created_at: datetime = Field(default_factory=datetime.utcnow, description="Task creation time") + created_at: datetime = Field(default_factory=beijing_now, description="Task creation time") completed_at: datetime | None = Field(default=None, description="Completion time") @@ -1167,8 +1175,8 @@ class MetadataFieldDefinition(SQLModel, table=True): description="字段状态: draft/active/deprecated" ) version: int = Field(default=1, description="版本号") - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") class MetadataFieldDefinitionCreate(SQLModel): @@ -1299,8 +1307,8 @@ class SlotDefinition(SQLModel, table=True): description="关联的元数据字段 ID", foreign_key="metadata_field_definitions.id", ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") def get_effective_strategies(self) -> list[str]: """ @@ -1423,7 +1431,7 @@ class SlotValue(SQLModel): description="置信度 0.0~1.0" ) updated_at: datetime = Field( - default_factory=datetime.utcnow, + default_factory=beijing_now, description="最后更新时间" ) @@ -1450,8 +1458,8 @@ class MetadataSchema(SQLModel, table=True): ) is_default: bool = Field(default=False, description="是否为租户默认模式") is_enabled: bool = Field(default=True, description="是否启用") - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") class MetadataSchemaCreate(SQLModel): @@ -1530,8 +1538,8 @@ class DecompositionTemplate(SQLModel, table=True): sa_column=Column("example_output", JSON, nullable=True), description="示例输出 JSON" ) - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") class DecompositionTemplateCreate(SQLModel): @@ -1625,8 +1633,8 @@ class HighRiskPolicy(SQLModel, table=True): ) priority: int = Field(default=0, description="优先级 (值越高优先级越高)") is_enabled: bool = Field(default=True, description="是否启用") - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") class HighRiskPolicyCreate(SQLModel): @@ -1673,8 +1681,8 @@ class SessionModeRecord(SQLModel, table=True): ) reason: str | None = Field(default=None, description="模式切换原因") switched_at: datetime | None = Field(default=None, description="模式切换时间") - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") class MidAuditLog(SQLModel, table=True): @@ -1708,7 +1716,7 @@ class MidAuditLog(SQLModel, table=True): react_iterations: int | None = Field(default=None, description="ReAct循环次数") high_risk_scenario: str | None = Field(default=None, description="触发的高风险场景") latency_ms: int | None = Field(default=None, description="总耗时(ms)") - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间", index=True) + created_at: datetime = Field(default_factory=beijing_now, description="创建时间", index=True) class SceneSlotBundleStatus(str, Enum): @@ -1784,8 +1792,8 @@ class SceneSlotBundle(SQLModel, table=True): description="状态: draft/active/deprecated" ) version: int = Field(default=1, description="版本号") - created_at: datetime = Field(default_factory=datetime.utcnow, description="创建时间") - updated_at: datetime = Field(default_factory=datetime.utcnow, description="更新时间") + created_at: datetime = Field(default_factory=beijing_now, description="创建时间") + updated_at: datetime = Field(default_factory=beijing_now, description="更新时间") class SceneSlotBundleCreate(SQLModel): diff --git a/ai-service/app/schemas/metadata.py b/ai-service/app/schemas/metadata.py index 31fd2e1..f649a26 100644 --- a/ai-service/app/schemas/metadata.py +++ b/ai-service/app/schemas/metadata.py @@ -107,12 +107,14 @@ class MetadataFieldDefinitionUpdateRequest(BaseModel): """[AC-MRS-01] 更新元数据字段定义请求""" label: str | None = Field(default=None, min_length=1, max_length=64) + type: str | None = Field(default=None, description="字段类型") required: bool | None = None options: list[str] | None = None default_value: Any | None = None scope: list[str] | None = None is_filterable: bool | None = None is_rank_feature: bool | None = None + usage_description: str | None = Field(default=None, description="用途说明") field_roles: list[str] | None = Field( default=None, description="[AC-MRS-01] 字段角色列表" diff --git a/ai-service/app/services/llm/openai_client.py b/ai-service/app/services/llm/openai_client.py index 9e2246d..c8c6e95 100644 --- a/ai-service/app/services/llm/openai_client.py +++ b/ai-service/app/services/llm/openai_client.py @@ -295,7 +295,7 @@ class OpenAIClient(LLMClient): role = msg.get("role", "unknown") content = msg.get("content", "") logger.info(f"[AC-AISVC-06] [{i}] role={role}, content_length={len(content)}") - logger.info(f"[AC-AISVC-06] [{i}] content:\n{content}") + logger.info("[AC-AISVC-06] ======================================") try: diff --git a/ai-service/app/services/metadata_auto_inference_service.py b/ai-service/app/services/metadata_auto_inference_service.py index cbd0778..7c87c00 100644 --- a/ai-service/app/services/metadata_auto_inference_service.py +++ b/ai-service/app/services/metadata_auto_inference_service.py @@ -42,10 +42,10 @@ class AutoInferenceResult: error_message: str | None = None -METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段。 +METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析助手。你的任务是根据文档内容,自动推断并填写元数据字段,并以 JSON 格式输出结果。 ## 输出要求 -请严格按照以下 JSON 格式输出,不要添加任何其他内容: +你必须严格按照以下 JSON 格式输出,不要添加任何其他内容: ```json { @@ -60,6 +60,29 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析 } ``` +## JSON 输出样例 +假设有以下字段定义: +- grade (枚举类型,选项:小学、初中、高中) +- subject (枚举类型,选项:语文、数学、英语) +- difficulty (数字类型,1-5) + +如果文档内容是关于"高一数学函数知识点",则输出: + +```json +{ + "inferred_metadata": { + "grade": "高中", + "subject": "数学", + "difficulty": 3 + }, + "confidence_scores": { + "grade": 0.95, + "subject": 0.95, + "difficulty": 0.7 + } +} +``` + ## 推断规则 1. **仔细分析文档内容**:根据文档的主题、关键词、上下文来推断元数据 2. **遵循字段定义**: @@ -78,7 +101,10 @@ METADATA_INFERENCE_SYSTEM_PROMPT = """你是一个专业的文档元数据分析 ## 注意事项 - 必须严格按照字段定义的类型和选项填写 - 不要编造不存在的选项值 -- 保持客观,基于文档内容推断""" +- 保持客观,基于文档内容推断 +- **只输出合法的 JSON 格式,不要输出思考过程、解释或任何额外文本** +- 确保 JSON 格式完整闭合,所有字符串用双引号包裹 +- 如果无法推断某字段,可省略该字段,但整体 JSON 必须完整闭合""" class MetadataAutoInferenceService: @@ -98,9 +124,9 @@ class MetadataAutoInferenceService: def __init__( self, session: AsyncSession, - model: str | None = None, + model: str | None = "glm-4.7", max_tokens: int = 1024, - timeout_seconds: int = 60, + timeout_seconds: int = 180, ): self._session = session self._model = model @@ -148,9 +174,12 @@ class MetadataAutoInferenceService: error_message="No field definitions configured", ) + logger.info(f"[MetadataAutoInference] Found {len(field_definitions)} field definitions: {[f.field_key for f in field_definitions]}") + field_contexts = self._build_field_contexts(field_definitions) if not field_contexts: + logger.warning(f"[MetadataAutoInference] No field contexts built from definitions") return AutoInferenceResult( inferred_metadata=existing_metadata or {}, confidence_scores={}, @@ -159,15 +188,26 @@ class MetadataAutoInferenceService: ) user_prompt = self._build_user_prompt(content, field_contexts, existing_metadata) + logger.info(f"[MetadataAutoInference] === SYSTEM PROMPT ===\n{METADATA_INFERENCE_SYSTEM_PROMPT}\n=== END SYSTEM PROMPT ===") + logger.info(f"[MetadataAutoInference] === USER PROMPT ===\n{user_prompt}\n=== END USER PROMPT ===") try: if image_base64 and mime_type: - raw_response = await self._call_multimodal_llm( - user_prompt, image_base64, mime_type - ) + logger.info(f"[MetadataAutoInference] Using multimodal LLM for image recognition, mime_type={mime_type}") + image_content = await self._recognize_image(image_base64, mime_type) + logger.info(f"[MetadataAutoInference] Image recognition result (first 200 chars): {image_content[:200] if image_content else 'EMPTY'}") + + combined_content = f"{content}\n\n[图片识别内容]\n{image_content}" if content else f"[图片识别内容]\n{image_content}" + user_prompt = self._build_user_prompt(combined_content, field_contexts, existing_metadata) + logger.info(f"[MetadataAutoInference] === USER PROMPT (with image content) ===\n{user_prompt}\n=== END USER PROMPT ===") + + raw_response = await self._call_text_llm(user_prompt) else: raw_response = await self._call_text_llm(user_prompt) + logger.info(f"[MetadataAutoInference] LLM response length: {len(raw_response) if raw_response else 0}") + logger.info(f"[MetadataAutoInference] LLM response (first 300 chars): {raw_response[:300] if raw_response else 'EMPTY'}") + result = self._parse_llm_response(raw_response, field_contexts) if existing_metadata: @@ -331,12 +371,15 @@ class MetadataAutoInferenceService: return prompt async def _call_text_llm(self, prompt: str) -> str: - """调用文本 LLM""" + """调用文本 LLM 进行元数据推断(使用对话模型配置,支持 JSON 格式化输出)""" manager = get_llm_config_manager() - client = manager.get_kb_processing_client() + client = manager.get_chat_client() - config = manager.kb_processing_config - model = self._model or config.get("model", "gpt-4o-mini") + config = manager.chat_config + model = config.get("model") + if not model: + logger.warning("[MetadataAutoInference] No model configured in chat config, using default") + model = self._model from app.services.llm.base import LLMConfig @@ -345,8 +388,13 @@ class MetadataAutoInferenceService: max_tokens=self._max_tokens, temperature=0.3, timeout_seconds=self._timeout_seconds, + extra_params={ + "response_format": {"type": "json_object"}, + }, ) + logger.info(f"[MetadataAutoInference] Using model: {model} for text LLM") + messages = [ {"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT}, {"role": "user", "content": prompt}, @@ -355,34 +403,45 @@ class MetadataAutoInferenceService: response = await client.generate(messages=messages, config=llm_config) return response.content or "" - async def _call_multimodal_llm( - self, - prompt: str, - image_base64: str, - mime_type: str, - ) -> str: - """调用多模态 LLM""" + async def _recognize_image(self, image_base64: str, mime_type: str) -> str: + """ + 使用多模态模型识别图片内容。 + 这个方法只负责图片内容识别,不负责 JSON 格式化输出。 + 使用对话模型配置。 + """ manager = get_llm_config_manager() - client = manager.get_kb_processing_client() + client = manager.get_chat_client() - config = manager.kb_processing_config - model = self._model or config.get("model", "gpt-4o-mini") + config = manager.chat_config + model = config.get("model") + if not model: + logger.warning("[MetadataAutoInference] No model configured in chat config for image recognition") + model = self._model from app.services.llm.base import LLMConfig llm_config = LLMConfig( model=model, - max_tokens=self._max_tokens, + max_tokens=1024, temperature=0.3, timeout_seconds=self._timeout_seconds, ) + logger.info(f"[MetadataAutoInference] Using model: {model} for image recognition") + + recognition_prompt = """请仔细分析这张图片,提取其中的关键信息,包括: +1. 图片类型(如:文档截图、图表、照片、示意图等) +2. 图片中的文字内容(如有) +3. 图片的主要内容和主题 +4. 任何可见的数据、数字或关键信息 + +请用简洁的中文描述图片内容:""" + messages = [ - {"role": "system", "content": METADATA_INFERENCE_SYSTEM_PROMPT}, { "role": "user", "content": [ - {"type": "text", "text": prompt}, + {"type": "text", "text": recognition_prompt}, { "type": "image_url", "image_url": { @@ -396,12 +455,29 @@ class MetadataAutoInferenceService: response = await client.generate(messages=messages, config=llm_config) return response.content or "" + async def _call_multimodal_llm( + self, + prompt: str, + image_base64: str, + mime_type: str, + ) -> str: + """ + 调用多模态 LLM(已弃用,保留向后兼容)。 + 推荐使用 _recognize_image + _call_text_llm 组合。 + """ + logger.warning("[MetadataAutoInference] _call_multimodal_llm is deprecated, use _recognize_image + _call_text_llm instead") + + image_content = await self._recognize_image(image_base64, mime_type) + combined_prompt = f"{prompt}\n\n[图片识别内容]\n{image_content}" + return await self._call_text_llm(combined_prompt) + def _parse_llm_response( self, response: str, field_contexts: list[InferenceFieldContext], ) -> AutoInferenceResult: """解析 LLM 响应""" + json_str = "" try: json_str = self._extract_json(response) data = json.loads(json_str) @@ -410,19 +486,30 @@ class MetadataAutoInferenceService: confidence_scores = data.get("confidence_scores", {}) field_map = {ctx.field_key: ctx for ctx in field_contexts} + label_to_key = {ctx.label: ctx.field_key for ctx in field_contexts} + validated_metadata = {} validated_scores = {} - for field_key, value in inferred_metadata.items(): - if field_key not in field_map: - continue + for field_key_or_label, value in inferred_metadata.items(): + actual_field_key = field_key_or_label + + if field_key_or_label not in field_map: + if field_key_or_label in label_to_key: + actual_field_key = label_to_key[field_key_or_label] + else: + logger.warning(f"[MetadataAutoInference] Unknown field: {field_key_or_label}") + continue - ctx = field_map[field_key] + ctx = field_map[actual_field_key] validated_value = self._validate_field_value(ctx, value) if validated_value is not None: - validated_metadata[field_key] = validated_value - validated_scores[field_key] = confidence_scores.get(field_key, 0.5) + validated_metadata[actual_field_key] = validated_value + validated_scores[actual_field_key] = confidence_scores.get(field_key_or_label, 0.5) + + logger.info(f"[MetadataAutoInference] Validated metadata: {validated_metadata}") + logger.info(f"[MetadataAutoInference] Confidence scores: {validated_scores}") return AutoInferenceResult( inferred_metadata=validated_metadata, @@ -433,6 +520,8 @@ class MetadataAutoInferenceService: except json.JSONDecodeError as e: logger.warning(f"[MetadataAutoInference] Failed to parse JSON: {e}") + logger.warning(f"[MetadataAutoInference] Raw LLM response (first 500 chars): {response[:500] if response else 'EMPTY'}") + logger.warning(f"[MetadataAutoInference] Extracted JSON string: {json_str[:500] if json_str else 'EMPTY'}") return AutoInferenceResult( inferred_metadata={}, confidence_scores={}, @@ -482,15 +571,32 @@ class MetadataAutoInferenceService: def _extract_json(self, content: str) -> str: """从响应中提取 JSON""" + import re + content = content.strip() + if content.startswith("```"): + content = re.sub(r"^```[a-zA-Z0-9_-]*\s*", "", content) + content = re.sub(r"\s*```$", "", content).strip() + if content.startswith("{") and content.endswith("}"): return content json_start = content.find("{") - json_end = content.rfind("}") + if json_start == -1: + return content - if json_start != -1 and json_end != -1 and json_end > json_start: - return content[json_start:json_end + 1] + json_str = content[json_start:] - return content + open_braces = json_str.count("{") + close_braces = json_str.count("}") + if close_braces < open_braces: + json_str += "}" * (open_braces - close_braces) + + json_str = re.sub(r",\s*([}\]])", r"\1", json_str) + + json_end = json_str.rfind("}") + if json_end != -1: + json_str = json_str[:json_end + 1] + + return json_str diff --git a/ai-service/app/services/metadata_field_definition_service.py b/ai-service/app/services/metadata_field_definition_service.py index b092f2f..0943273 100644 --- a/ai-service/app/services/metadata_field_definition_service.py +++ b/ai-service/app/services/metadata_field_definition_service.py @@ -226,6 +226,8 @@ class MetadataFieldDefinitionService: if field_update.label is not None: field.label = field_update.label + if field_update.type is not None: + field.type = field_update.type if field_update.required is not None: field.required = field_update.required if field_update.options is not None: @@ -239,6 +241,8 @@ class MetadataFieldDefinitionService: field.is_filterable = field_update.is_filterable if field_update.is_rank_feature is not None: field.is_rank_feature = field_update.is_rank_feature + if field_update.usage_description is not None: + field.usage_description = field_update.usage_description # [AC-MRS-01] 修复:添加 field_roles 更新逻辑 if field_update.field_roles is not None: self._validate_field_roles(field_update.field_roles)