331 lines
10 KiB
Python
331 lines
10 KiB
Python
|
|
"""
|
||
|
|
Knowledge base management API with RAG optimization features.
|
||
|
|
Reference: rag-optimization/spec.md Section 4.2
|
||
|
|
"""
|
||
|
|
|
||
|
|
import logging
|
||
|
|
from datetime import date
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from fastapi import APIRouter, Depends, HTTPException, status
|
||
|
|
from pydantic import BaseModel, Field
|
||
|
|
from sqlalchemy import select
|
||
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
|
|
||
|
|
from app.core.database import get_session
|
||
|
|
from app.services.retrieval import (
|
||
|
|
ChunkMetadata,
|
||
|
|
ChunkMetadataModel,
|
||
|
|
IndexingProgress,
|
||
|
|
IndexingResult,
|
||
|
|
KnowledgeIndexer,
|
||
|
|
MetadataFilter,
|
||
|
|
RetrievalStrategy,
|
||
|
|
get_knowledge_indexer,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router that every endpoint in this module registers on; all routes are
# served under the /api/kb prefix and tagged "Knowledge Base".
router = APIRouter(prefix="/api/kb", tags=["Knowledge Base"])
|
||
|
|
|
||
|
|
|
||
|
|
class IndexDocumentRequest(BaseModel):
    """Body accepted by the ``POST /index`` endpoint.

    ``tenant_id``, ``document_id`` and ``text`` are required. ``metadata`` is
    optional and defaults to ``None``, in which case the handler passes no
    chunk metadata to the indexer.
    """

    # A Field without a default is required, exactly like Field(...).
    tenant_id: str = Field(description="Tenant ID")
    document_id: str = Field(description="Document ID")
    text: str = Field(description="Document text content")
    metadata: ChunkMetadataModel | None = Field(None, description="Document metadata")
|
||
|
|
|
||
|
|
|
||
|
|
class IndexDocumentResponse(BaseModel):
    """Response from document indexing.

    The ``index_document`` endpoint fills every field by copying the
    same-named attribute from the value returned by the indexer's
    ``index_document`` call.
    """

    success: bool
    total_chunks: int
    indexed_chunks: int
    failed_chunks: int
    elapsed_seconds: float
    # Optional; defaults to None when the caller supplies no message.
    error_message: str | None = None
|
||
|
|
|
||
|
|
|
||
|
|
class IndexingProgressResponse(BaseModel):
    """Response with current indexing progress.

    The ``get_indexing_progress`` endpoint fills every field by copying the
    same-named attribute from the object returned by the indexer's
    ``get_progress()`` call.
    """

    total_chunks: int
    processed_chunks: int
    failed_chunks: int
    # Declared as an integer; the scale is presumably 0-100 -- TODO confirm
    # against the indexer's progress object.
    progress_percent: int
    elapsed_seconds: float
    current_document: str
|
||
|
|
|
||
|
|
|
||
|
|
class MetadataFilterRequest(BaseModel):
    """Request for metadata filtering.

    In ``retrieve_knowledge`` this model is dumped with ``exclude_none=True``
    and the remaining fields are passed as keyword arguments to
    ``MetadataFilter``, so any field left as ``None`` is simply not forwarded.
    """

    categories: list[str] | None = None
    target_audiences: list[str] | None = None
    departments: list[str] | None = None
    # Defaults to True, so it is never None and is always forwarded.
    valid_only: bool = True
    min_priority: int | None = None
    keywords: list[str] | None = None
|
||
|
|
|
||
|
|
|
||
|
|
class RetrieveRequest(BaseModel):
    """Body accepted by the ``POST /retrieve`` endpoint.

    ``tenant_id`` and ``query`` are required. ``top_k`` defaults to 10 and is
    validated to lie in 1..50. ``filters`` is optional. ``strategy`` defaults
    to ``RetrievalStrategy.HYBRID``.

    NOTE(review): the ``retrieve_knowledge`` handler visible in this file does
    not read ``top_k`` or ``strategy`` -- confirm whether they should be
    forwarded to the retriever.
    """

    # A Field without a default is required, exactly like Field(...).
    tenant_id: str = Field(description="Tenant ID")
    query: str = Field(description="Search query")
    top_k: int = Field(10, ge=1, le=50, description="Number of results")
    filters: MetadataFilterRequest | None = Field(None, description="Metadata filters")
    strategy: RetrievalStrategy = Field(RetrievalStrategy.HYBRID, description="Retrieval strategy")
|
||
|
|
|
||
|
|
|
||
|
|
class RetrieveResponse(BaseModel):
    """Response from knowledge retrieval.

    Built by the ``retrieve_knowledge`` endpoint from the retriever's result.
    """

    # One dict per hit; retrieve_knowledge emits the keys "text", "score",
    # "source" and "metadata".
    hits: list[dict[str, Any]]
    # Taken from the retrieval result's hit_count attribute.
    total_hits: int
    max_score: float
    # Read from the "is_insufficient" entry of the result diagnostics,
    # falling back to False when that entry is absent.
    is_insufficient: bool
    diagnostics: dict[str, Any]
|
||
|
|
|
||
|
|
|
||
|
|
class MetadataOptionsResponse(BaseModel):
    """Response with available metadata options.

    Returned by the ``get_metadata_options`` endpoint, which currently fills
    every list from values hard-coded in the handler.
    """

    categories: list[str]
    departments: list[str]
    target_audiences: list[str]
    # get_metadata_options returns the integers 1 through 10 here.
    priorities: list[int]
|
||
|
|
|
||
|
|
|
||
|
|
@router.post("/index", response_model=IndexDocumentResponse)
async def index_document(
    request: IndexDocumentRequest,
    session: AsyncSession = Depends(get_session),
):
    """
    Index a single document for a tenant and report chunk-level results.

    The optional request metadata is converted into a ``ChunkMetadata`` and
    handed, together with the tenant ID, document ID and text, to the
    knowledge indexer.

    Features listed by the original author (implemented by the indexer, not
    visible in this handler):
    - Task prefixes (search_document:) for document embedding
    - Multi-dimensional vectors (256/512/768)
    - Metadata support

    Args:
        request: Document payload to index.
        session: Injected database session. The handler body does not use it;
            it is kept so the endpoint's dependency wiring is unchanged.

    Returns:
        An ``IndexDocumentResponse`` mirroring the indexer's result.

    Raises:
        HTTPException: 500 for any failure; the original exception is chained
            as the cause.
    """
    try:
        indexer = get_knowledge_indexer()

        chunk_metadata = None
        if request.metadata:
            meta = request.metadata
            chunk_metadata = ChunkMetadata(
                category=meta.category,
                subcategory=meta.subcategory,
                target_audience=meta.target_audience,
                source_doc=meta.source_doc,
                source_url=meta.source_url,
                department=meta.department,
                priority=meta.priority,
                keywords=meta.keywords,
            )

        result = await indexer.index_document(
            tenant_id=request.tenant_id,
            document_id=request.document_id,
            text=request.text,
            metadata=chunk_metadata,
        )

        return IndexDocumentResponse(
            success=result.success,
            total_chunks=result.total_chunks,
            indexed_chunks=result.indexed_chunks,
            failed_chunks=result.failed_chunks,
            elapsed_seconds=result.elapsed_seconds,
            error_message=result.error_message,
        )

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") dropped.
        logger.exception("[KB-API] Failed to index document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"索引失败: {str(e)}",
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/index/progress", response_model=IndexingProgressResponse | None)
async def get_indexing_progress():
    """
    Get the current indexing progress.

    Returns:
        ``None`` when the indexer's ``get_progress()`` returns ``None``,
        otherwise an ``IndexingProgressResponse`` copied field-by-field from
        the progress object.

    Raises:
        HTTPException: 500 for any failure; the original exception is chained
            as the cause.
    """
    try:
        progress = get_knowledge_indexer().get_progress()

        if progress is None:
            return None

        return IndexingProgressResponse(
            total_chunks=progress.total_chunks,
            processed_chunks=progress.processed_chunks,
            failed_chunks=progress.failed_chunks,
            progress_percent=progress.progress_percent,
            elapsed_seconds=progress.elapsed_seconds,
            current_document=progress.current_document,
        )

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") dropped.
        logger.exception("[KB-API] Failed to get progress: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"获取进度失败: {str(e)}",
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.post("/retrieve", response_model=RetrieveResponse)
async def retrieve_knowledge(request: RetrieveRequest):
    """
    Retrieve knowledge for a tenant query using the optimized retriever.

    Strategies accepted on the request model:
    - vector: Simple vector search
    - bm25: BM25 keyword search
    - hybrid: RRF combination of vector + BM25 (default)
    - two_stage: Two-stage retrieval with Matryoshka dimensions

    Args:
        request: Query, tenant and optional metadata filters.

    Returns:
        A ``RetrieveResponse`` holding one dict per hit (``text``, ``score``,
        ``source``, ``metadata``) plus hit count, max score and diagnostics.

    Raises:
        HTTPException: 500 for any failure; the original exception is chained
            as the cause.
    """
    try:
        # Imported inside the handler, as in the original code.
        from app.services.retrieval.optimized_retriever import get_optimized_retriever
        from app.services.retrieval.base import RetrievalContext

        retriever = await get_optimized_retriever()

        metadata_filter = None
        if request.filters:
            # Only explicitly set filter fields are forwarded to MetadataFilter.
            filter_kwargs = request.filters.model_dump(exclude_none=True)
            metadata_filter = MetadataFilter(**filter_kwargs)

        # NOTE(review): request.top_k and request.strategy are validated on the
        # request model but never forwarded here -- confirm how the retriever
        # or RetrievalContext is meant to receive them.
        ctx = RetrievalContext(
            tenant_id=request.tenant_id,
            query=request.query,
        )

        if metadata_filter:
            ctx.metadata = {"filter": metadata_filter.to_qdrant_filter()}

        result = await retriever.retrieve(ctx)

        # Normalise diagnostics once. The original called .get() on it before
        # applying the `or {}` fallback, so a None value raised AttributeError.
        diagnostics = result.diagnostics or {}

        hits = [
            {
                "text": hit.text,
                "score": hit.score,
                "source": hit.source,
                "metadata": hit.metadata,
            }
            for hit in result.hits
        ]

        return RetrieveResponse(
            hits=hits,
            total_hits=result.hit_count,
            max_score=result.max_score,
            is_insufficient=diagnostics.get("is_insufficient", False),
            diagnostics=diagnostics,
        )

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") dropped.
        logger.exception("[KB-API] Failed to retrieve: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"检索失败: {str(e)}",
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/metadata/options", response_model=MetadataOptionsResponse)
async def get_metadata_options():
    """
    Return the metadata values that clients may use when filtering.

    The values are hard-coded in this handler; the original author noted they
    would typically be loaded from a database.
    """
    try:
        category_choices = [
            "课程咨询",
            "考试政策",
            "学籍管理",
            "奖助学金",
            "宿舍管理",
            "校园服务",
            "就业指导",
            "其他",
        ]
        department_choices = [
            "教务处",
            "学生处",
            "财务处",
            "后勤处",
            "就业指导中心",
            "图书馆",
            "信息中心",
        ]
        audience_choices = [
            "本科生",
            "研究生",
            "留学生",
            "新生",
            "毕业生",
            "教职工",
        ]
        # Priority levels 1 through 10 inclusive.
        priority_choices = [*range(1, 11)]

        return MetadataOptionsResponse(
            categories=category_choices,
            departments=department_choices,
            target_audiences=audience_choices,
            priorities=priority_choices,
        )

    except Exception as e:
        logger.error(f"[KB-API] Failed to get metadata options: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"获取选项失败: {str(e)}"
        )
|
||
|
|
|
||
|
|
|
||
|
|
def _read_document_text(file_path: str) -> str | None:
    """Return the UTF-8 text stored at *file_path*, or None if the path does not exist."""
    import os

    if not os.path.exists(file_path):
        return None
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


@router.post("/reindex")
async def reindex_all(
    tenant_id: str,
    session: AsyncSession = Depends(get_session),
):
    """
    Reindex every completed document of a tenant.

    Selects the tenant's documents whose status equals
    ``DocumentStatus.COMPLETED.value``, reads each document's file as UTF-8
    text and passes it to the knowledge indexer. Documents with no
    ``file_path``, or whose file does not exist, are skipped; each skip is
    logged and counted.

    Args:
        tenant_id: Tenant whose documents are reindexed.
        session: Injected database session used to load the documents.

    Returns:
        A dict with ``success`` (always ``True`` when no exception occurred),
        ``total_documents`` (documents selected), ``total_indexed`` and
        ``total_failed`` (chunk counts summed over the indexer results) and
        ``skipped_documents`` (documents without a readable file).

    Raises:
        HTTPException: 500 for any failure; the original exception is chained
            as the cause.
    """
    try:
        import asyncio

        from app.models.entities import Document, DocumentStatus

        stmt = select(Document).where(
            Document.tenant_id == tenant_id,
            Document.status == DocumentStatus.COMPLETED.value,
        )
        query_result = await session.execute(stmt)
        documents = query_result.scalars().all()

        indexer = get_knowledge_indexer()

        total_indexed = 0
        total_failed = 0
        skipped_documents = 0

        for doc in documents:
            if not doc.file_path:
                skipped_documents += 1
                logger.warning("[KB-API] Skipping document %s: no file path", doc.id)
                continue

            # Read in a worker thread so blocking file I/O does not stall the
            # event loop.
            text = await asyncio.to_thread(_read_document_text, doc.file_path)
            if text is None:
                skipped_documents += 1
                logger.warning(
                    "[KB-API] Skipping document %s: file not found: %s",
                    doc.id,
                    doc.file_path,
                )
                continue

            index_result = await indexer.index_document(
                tenant_id=tenant_id,
                document_id=str(doc.id),
                text=text,
            )

            total_indexed += index_result.indexed_chunks
            total_failed += index_result.failed_chunks

        return {
            "success": True,
            "total_documents": len(documents),
            "total_indexed": total_indexed,
            "total_failed": total_failed,
            "skipped_documents": skipped_documents,
        }

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error(f"...") dropped.
        logger.exception("[KB-API] Failed to reindex: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"重新索引失败: {str(e)}",
        ) from e
|