# Source path: ai-robot-core/ai-service/scripts/check_course_kb_status.py
# (Listing metadata: 99 lines, 3.4 KiB, Python)

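"""
Check the ingestion status of the course knowledge base.

Compares the Document rows stored in the database with the contents of the
matching Qdrant collection and prints a summary to stdout.
"""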
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
from app.core.config import get_settings
from app.core.qdrant_client import QdrantClient
from app.models.entities import Document
def _preview_text(text, limit=50):
    """Return *text* cut to *limit* characters, with '...' appended when cut."""
    text = str(text)
    return text[:limit] + "..." if len(text) > limit else text


async def check_course_kb_status(
    tenant_id: str = "szmp@ash@2026",
    kb_id: str = "75c465fe-277d-455d-a30b-4b168adcc03b",
) -> None:
    """Report whether a knowledge base has been ingested.

    Looks up the ``Document`` rows for the given tenant / knowledge base,
    inspects the corresponding Qdrant collection, and prints a human-readable
    summary plus a conclusion to stdout.

    Args:
        tenant_id: Tenant whose knowledge base is checked. Defaults to the
            tenant this script was originally written for.
        kb_id: Knowledge base identifier. Defaults to the course knowledge
            base this script was originally written for.

    Returns:
        None. All results are printed.

    Errors raised by the database or Qdrant calls are not caught here and
    propagate to the caller; the database engine is disposed either way.
    """
    settings = get_settings()
    engine = create_async_engine(settings.database_url)
    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    banner = "=" * 80
    print(f"\n{banner}")
    print("检查课程知识库的录入情况")
    print(banner)
    print(f"租户 ID: {tenant_id}")
    print(f"知识库 ID: {kb_id}")

    try:
        # --- Relational side: document records for this knowledge base ---
        async with async_session() as session:
            stmt = select(Document).where(
                Document.tenant_id == tenant_id,
                Document.kb_id == kb_id,
            )
            result = await session.execute(stmt)
            documents = result.scalars().all()

            print(f"\n数据库中的文档记录: {len(documents)}")
            # Only the first few records are listed to keep the output short.
            for doc in documents[:5]:
                print(f" - {doc.file_name} (status: {doc.status})")
            if len(documents) > 5:
                print(f" ... 还有 {len(documents) - 5} 个文档")

        # --- Vector side: the Qdrant collection for this knowledge base ---
        client = QdrantClient()
        qdrant = await client.get_client()
        collection_name = client.get_kb_collection_name(tenant_id, kb_id)
        print(f"\nQdrant Collection 名称: {collection_name}")

        exists = await qdrant.collection_exists(collection_name)
        if exists:
            # Ask Qdrant for the real total; the scroll below is capped at 5
            # points, so its length is only a sample size, not the data volume.
            count_result = await qdrant.count(
                collection_name=collection_name,
                exact=True,
            )
            # NOTE(review): assumes the usual CountResult with a `.count`
            # attribute; falls back to the raw value if the project wrapper
            # returns a plain integer — confirm against app.core.qdrant_client.
            total = getattr(count_result, "count", count_result)

            points_result = await qdrant.scroll(
                collection_name=collection_name,
                limit=5,
                with_vectors=False,
            )
            # scroll() normally returns (points, next_offset); tolerate a bare list.
            points = points_result[0] if isinstance(points_result, tuple) else points_result

            print(f"Qdrant Collection 存在,有 {total} 条数据")
            for i, point in enumerate(points, 1):
                if hasattr(point, 'payload'):
                    payload = point.payload
                    point_id = point.id
                else:
                    payload = point.get('payload', {})
                    point_id = point.get('id', 'unknown')
                # A point may be stored without a payload; treat that as empty.
                payload = payload or {}
                print(f" [{i}] id: {point_id}")
                if 'text' in payload:
                    print(f" text: {_preview_text(payload['text'])}")
            if total > len(points):
                print(f" ... 还有 {total - len(points)} 条数据")
        else:
            print("Qdrant Collection 不存在!")
    finally:
        # Release the connection pool; the engine is not reused after this call.
        await engine.dispose()

    # --- Conclusion: cross-check the two sides ---
    has_documents = len(documents) > 0
    print(f"\n{banner}")
    print("结论:")
    if has_documents and not exists:
        print(" 数据库有文档记录,但 Qdrant Collection 不存在")
        print(" 需要等待文档向量化任务完成")
    elif not has_documents and exists:
        print(" 数据库没有文档记录,但 Qdrant Collection 存在")
        print(" 可能是旧数据")
    elif has_documents and exists:
        print(f" 数据库有 {len(documents)} 个文档记录")
        print(" Qdrant Collection 存在")
        print(" ✅ 知识库已录入完成")
    else:
        print(" 数据库没有文档记录")
        print(" Qdrant Collection 不存在")
        print(" ❌ 知识库未录入")
if __name__ == "__main__":
    # Executed as a script: build the coroutine and run it to completion.
    entry_coroutine = check_course_kb_status()
    asyncio.run(entry_coroutine)