# Source: ai-robot-core/ai-service/scripts/query_collection_points.py
# (original listing: 121 lines, 3.6 KiB, Python)

"""
查询 Qdrant Collection 中的所有内容
"""
import asyncio
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import ScrollRequest
from app.core.config import get_settings
def _print_point(index, total, point):
    """Print one point's ID, main payload fields, and text content."""
    payload = point.payload or {}
    print(f"\n📄 记录 {index}/{total} (ID: {point.id})")
    print("-" * 80)

    # Main payload fields; 'N/A' is shown when a key is absent.
    kb_id = payload.get('kb_id', 'N/A')
    source = payload.get('source', 'N/A')
    chunk_index = payload.get('chunk_index', 'N/A')
    metadata = payload.get('metadata', {})
    print(f" KB ID: {kb_id}")
    print(f" Source: {source}")
    print(f" Chunk Index: {chunk_index}")
    if metadata:
        print(f" Metadata: {json.dumps(metadata, ensure_ascii=False)}")

    # Text content, printed line by line with blank lines skipped.
    print("\n 文本内容:")
    text = payload.get('text', '')
    if not text:
        print(" (无文本内容)")
        return
    # str() keeps a non-string payload value from aborting the whole dump
    # with an AttributeError on .split().
    for line in str(text).split('\n'):
        if line.strip():
            print(f" {line}")


def _print_kb_distribution(points):
    """Print how many of the given points carry each ``kb_id`` payload value."""
    from collections import Counter

    print("\n📈 统计信息:")
    counts = Counter((point.payload or {}).get('kb_id', 'N/A') for point in points)
    print(" KB ID 分布:")
    try:
        ordered = sorted(counts.items())
    except TypeError:
        # Mixed key types (e.g. None next to str) cannot be compared with
        # each other; fall back to ordering by their string form.
        ordered = sorted(counts.items(), key=lambda item: str(item[0]))
    for kb_id, count in ordered:
        print(f" - {kb_id}: {count}")


async def query_all_points(collection_name: str):
    """Fetch every point of a Qdrant collection and print it to stdout.

    The collection is read in pages of 100 points via ``client.scroll``
    (payloads included, vectors excluded). All points are collected first,
    then each one is printed, followed by a per-``kb_id`` count summary.

    Args:
        collection_name: Name of the Qdrant collection to dump.

    Returns:
        None. Any exception raised while querying or printing is caught,
        reported together with its traceback, and not re-raised. The client
        is closed in all cases.
    """
    settings = get_settings()
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)
    print(f"🔍 查询 Collection: {collection_name}")
    print("=" * 80)
    try:
        # Collection info provides the point count shown in progress output.
        info = await client.get_collection(collection_name)
        total_points = info.points_count
        print(f"📊 总向量数: {total_points}\n")

        # Page through the collection; scroll() yields (points, next_offset)
        # and a next_offset of None marks the last page.
        all_points = []
        offset = None
        batch_size = 100
        while True:
            points, next_offset = await client.scroll(
                collection_name=collection_name,
                offset=offset,
                limit=batch_size,
                with_payload=True,
                with_vectors=False,
            )
            all_points.extend(points)
            if next_offset is None:
                break
            offset = next_offset
            # Progress line each time the running total is a multiple of 500.
            if len(all_points) % 500 == 0:
                print(f" 已获取 {len(all_points)} / {total_points} 条记录...")

        print(f"✅ 成功获取全部 {len(all_points)} 条记录\n")
        print("=" * 80)

        # Dump every record.
        total = len(all_points)
        for index, point in enumerate(all_points, 1):
            _print_point(index, total, point)

        print("\n" + "=" * 80)
        print(f"✅ 查询完成,共 {len(all_points)} 条记录")

        # Summary statistics.
        _print_kb_distribution(all_points)
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on to
        # the cleanup below instead of propagating.
        print(f"❌ 查询失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        await client.close()
async def main():
    """Dump the collection named on the command line, or the default one.

    The first command-line argument, when present, is used as the collection
    name; otherwise the built-in default collection is queried.
    """
    default_collection = "kb_szmp_ash_2026_30c19c84"
    collection_name = sys.argv[1] if len(sys.argv) > 1 else default_collection
    await query_all_points(collection_name)
if __name__ == "__main__":
    # Script entry point: build the main() coroutine and run it to
    # completion on a fresh event loop.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)