99 lines
3.4 KiB
Python
99 lines
3.4 KiB
Python
"""
Check the ingestion status of the course knowledge base.
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
|
|
from sqlalchemy.orm import sessionmaker
|
|
from app.core.config import get_settings
|
|
from app.core.qdrant_client import QdrantClient
|
|
from app.models.entities import Document
|
|
|
|
|
|
async def check_course_kb_status(
    tenant_id: str = "szmp@ash@2026",
    kb_id: str = "75c465fe-277d-455d-a30b-4b168adcc03b",
) -> None:
    """Print a diagnostic report on a knowledge base's ingestion state.

    The report compares two sources:

    * ``Document`` rows in the SQL database matching ``tenant_id`` / ``kb_id``
    * the Qdrant collection whose name is derived from the same pair

    and finishes with a short conclusion about whether the two agree.

    Args:
        tenant_id: Tenant identifier used to filter documents and to derive
            the Qdrant collection name. Defaults to the previously
            hard-coded tenant.
        kb_id: Knowledge-base identifier used the same way. Defaults to the
            previously hard-coded knowledge base.

    Returns:
        None. All output goes to stdout.
    """
    sample_limit = 5  # how many documents / points are listed in the report
    preview_length = 50  # max characters of payload text shown per point
    separator = "=" * 80

    settings = get_settings()
    engine = create_async_engine(settings.database_url)
    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    print(f"\n{separator}")
    print("检查课程知识库的录入情况")
    print(separator)
    print(f"租户 ID: {tenant_id}")
    print(f"知识库 ID: {kb_id}")

    stmt = select(Document).where(
        Document.tenant_id == tenant_id,
        Document.kb_id == kb_id,
    )
    try:
        async with async_session() as session:
            result = await session.execute(stmt)
            documents = result.scalars().all()
    finally:
        # The engine is only needed for this one query; dispose of it so the
        # connection pool is released even when the query fails.
        await engine.dispose()

    print(f"\n数据库中的文档记录: {len(documents)} 个")
    for doc in documents[:sample_limit]:
        print(f" - {doc.file_name} (status: {doc.status})")
    if len(documents) > sample_limit:
        print(f" ... 还有 {len(documents) - sample_limit} 个文档")

    client = QdrantClient()
    qdrant = await client.get_client()

    collection_name = client.get_kb_collection_name(tenant_id, kb_id)
    print(f"\nQdrant Collection 名称: {collection_name}")

    exists = await qdrant.collection_exists(collection_name)
    points = []
    if exists:
        scroll_result = await qdrant.scroll(
            collection_name=collection_name,
            limit=sample_limit,
            with_vectors=False,
        )
        # scroll() may hand back a (points, next_offset) tuple; keep only the points.
        points = scroll_result[0] if isinstance(scroll_result, tuple) else scroll_result
        # Only a sample is fetched, so report it as a sample instead of
        # presenting it as the size of the whole collection.
        print(
            f"Qdrant Collection 存在,抽样读取 {len(points)} 条数据"
            f"(上限 {sample_limit} 条)"
        )
        for index, point in enumerate(points, 1):
            # Points are handled both as objects and as plain dicts.
            if hasattr(point, "payload"):
                payload = point.payload
                point_id = point.id
            else:
                payload = point.get("payload", {})
                point_id = point.get("id", "unknown")
            payload = payload or {}  # a stored point may carry no payload at all
            print(f" [{index}] id: {point_id}")
            if "text" in payload:
                text = payload["text"]
                if len(text) > preview_length:
                    text = text[:preview_length] + "..."
                print(f" text: {text}")
    else:
        print("Qdrant Collection 不存在!")

    has_documents = bool(documents)

    print(f"\n{separator}")
    print("结论:")
    if has_documents and not exists:
        print(" 数据库有文档记录,但 Qdrant Collection 不存在")
        print(" 需要等待文档向量化任务完成")
    elif not has_documents and exists:
        print(" 数据库没有文档记录,但 Qdrant Collection 存在")
        print(" 可能是旧数据")
    elif has_documents and exists:
        print(f" 数据库有 {len(documents)} 个文档记录")
        print(" Qdrant Collection 存在")
        if points:
            print(" ✅ 知识库已录入完成")
        else:
            # An existing but empty collection means nothing has been
            # vectorized yet, so do not report the ingestion as complete.
            print(" ⚠️ Qdrant Collection 为空,向量化可能尚未完成")
    else:
        print(" 数据库没有文档记录")
        print(" Qdrant Collection 不存在")
        print(" ❌ 知识库未录入")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the check coroutine and run it to completion
    # on a fresh event loop.
    status_check = check_course_kb_status()
    asyncio.run(status_check)
|