""" 查询 Qdrant Collection 中的所有内容 """ import asyncio import json import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from qdrant_client import AsyncQdrantClient from qdrant_client.models import ScrollRequest from app.core.config import get_settings async def query_all_points(collection_name: str): """查询 collection 中的所有 points""" settings = get_settings() client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False) print(f"🔍 查询 Collection: {collection_name}") print("=" * 80) try: # 获取 collection 信息 info = await client.get_collection(collection_name) total_points = info.points_count print(f"📊 总向量数: {total_points}\n") # 分页获取所有 points all_points = [] offset = None batch_size = 100 while True: scroll_result = await client.scroll( collection_name=collection_name, offset=offset, limit=batch_size, with_payload=True, with_vectors=False ) points, next_offset = scroll_result all_points.extend(points) if next_offset is None: break offset = next_offset # 显示进度 if len(all_points) % 500 == 0: print(f" 已获取 {len(all_points)} / {total_points} 条记录...") print(f"✅ 成功获取全部 {len(all_points)} 条记录\n") print("=" * 80) # 显示所有内容 for i, point in enumerate(all_points, 1): payload = point.payload or {} print(f"\n📄 记录 {i}/{len(all_points)} (ID: {point.id})") print("-" * 80) # 显示主要字段 text = payload.get('text', '') kb_id = payload.get('kb_id', 'N/A') source = payload.get('source', 'N/A') chunk_index = payload.get('chunk_index', 'N/A') metadata = payload.get('metadata', {}) print(f" KB ID: {kb_id}") print(f" Source: {source}") print(f" Chunk Index: {chunk_index}") if metadata: print(f" Metadata: {json.dumps(metadata, ensure_ascii=False)}") # 显示文本内容(格式化) print(f"\n 文本内容:") if text: # 按行显示,保持格式 lines = text.split('\n') for line in lines: if line.strip(): print(f" {line}") else: print(" (无文本内容)") print("\n" + "=" * 80) print(f"✅ 查询完成,共 {len(all_points)} 条记录") # 统计信息 print("\n📈 统计信息:") kb_ids = {} for point in all_points: payload = point.payload or {} kb_id = payload.get('kb_id', 'N/A') kb_ids[kb_id] = kb_ids.get(kb_id, 0) + 1 print(f" KB ID 分布:") for kb_id, count in sorted(kb_ids.items()): print(f" - {kb_id}: {count} 条") except Exception as e: print(f"❌ 查询失败: {e}") import traceback traceback.print_exc() finally: await client.close() async def main(): collection_name = "kb_szmp_ash_2026_30c19c84" await query_all_points(collection_name) if __name__ == "__main__": asyncio.run(main())