121 lines
3.6 KiB
Python
121 lines
3.6 KiB
Python
"""
|
|
查询 Qdrant Collection 中的所有内容
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Prepend the parent of this file's directory to the module search path.
# NOTE(review): presumably required so the `app` package imported below
# resolves when this file is executed directly — confirm against repo layout.
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from qdrant_client import AsyncQdrantClient
|
|
from qdrant_client.models import ScrollRequest
|
|
from app.core.config import get_settings
|
|
|
|
|
|
async def query_all_points(collection_name: str) -> None:
    """Fetch every point of a Qdrant collection and print it to stdout.

    Connects to the Qdrant instance at ``settings.qdrant_url``, reads the
    collection's point count, pages through all points (payload only, no
    vectors), prints each record, and finally prints a per-``kb_id`` count.

    Args:
        collection_name: Name of the collection to dump.

    Returns:
        None. All results are written to stdout.

    Raises:
        Nothing from the query itself: any ``Exception`` raised while
        querying or printing is reported (message plus traceback) instead of
        being propagated. The client is closed in every case.
    """
    settings = get_settings()
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)

    print(f"🔍 查询 Collection: {collection_name}")
    print("=" * 80)

    try:
        # Collection info is fetched only for the total count shown in the
        # header and in progress lines.
        info = await client.get_collection(collection_name)
        total_points = info.points_count
        print(f"📊 总向量数: {total_points}\n")

        all_points = await _scroll_all_points(client, collection_name, total_points)

        print(f"✅ 成功获取全部 {len(all_points)} 条记录\n")
        print("=" * 80)

        for index, point in enumerate(all_points, 1):
            _print_point(index, len(all_points), point)

        print("\n" + "=" * 80)
        print(f"✅ 查询完成,共 {len(all_points)} 条记录")

        _print_kb_distribution(all_points)

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on so
        # the client is still closed below.
        print(f"❌ 查询失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        await client.close()


async def _scroll_all_points(client, collection_name, total_points, batch_size=100):
    """Page through the collection with ``client.scroll`` and return all points."""
    collected = []
    offset = None

    while True:
        points, next_offset = await client.scroll(
            collection_name=collection_name,
            offset=offset,
            limit=batch_size,
            with_payload=True,
            with_vectors=False,
        )
        collected.extend(points)

        # A ``None`` offset marks the last page.
        if next_offset is None:
            break
        offset = next_offset

        # Progress line every 500 records.
        if len(collected) % 500 == 0:
            print(f" 已获取 {len(collected)} / {total_points} 条记录...")

    return collected


def _print_point(index, total, point):
    """Print the main payload fields and the text of a single point."""
    payload = point.payload or {}

    print(f"\n📄 记录 {index}/{total} (ID: {point.id})")
    print("-" * 80)

    text = payload.get('text', '')
    metadata = payload.get('metadata', {})

    print(f" KB ID: {payload.get('kb_id', 'N/A')}")
    print(f" Source: {payload.get('source', 'N/A')}")
    print(f" Chunk Index: {payload.get('chunk_index', 'N/A')}")

    if metadata:
        print(f" Metadata: {json.dumps(metadata, ensure_ascii=False)}")

    print("\n 文本内容:")
    if not text:
        print(" (无文本内容)")
        return

    # Print line by line, skipping blank lines.
    for line in text.split('\n'):
        if line.strip():
            print(f" {line}")


def _print_kb_distribution(points):
    """Print how many points belong to each ``kb_id`` payload value."""
    from collections import Counter

    print("\n📈 统计信息:")
    counts = Counter((point.payload or {}).get('kb_id', 'N/A') for point in points)

    print(" KB ID 分布:")
    # kb_id values may be of different types (e.g. int ids next to the 'N/A'
    # placeholder, or None); a plain sorted() would raise TypeError on such a
    # mix. Grouping by type name first keeps the original order whenever all
    # keys share one type.
    ordered = sorted(counts.items(), key=lambda item: (type(item[0]).__name__, item[0]))
    for kb_id, count in ordered:
        print(f" - {kb_id}: {count} 条")
|
|
|
|
|
|
async def main(collection_name: str = "kb_szmp_ash_2026_30c19c84") -> None:
    """Dump one Qdrant collection to stdout.

    Args:
        collection_name: Collection to query. Defaults to the collection
            name this script was originally hard-coded to.
    """
    await query_all_points(collection_name)


if __name__ == "__main__":
    # An optional first command-line argument overrides the default
    # collection name; with no argument the slice is empty and the default
    # is used.
    asyncio.run(main(*sys.argv[1:2]))
|