ai-robot-core/ai-service/scripts/check_qdrant.py

81 lines
2.6 KiB
Python
Raw Normal View History

"""
Check Qdrant vector database contents - detailed view.
"""
import asyncio
import sys
sys.path.insert(0, ".")
from qdrant_client import AsyncQdrantClient
from app.core.config import get_settings
from collections import defaultdict
# Application settings are resolved once, at import time; check_qdrant()
# reads settings.qdrant_url from this object to locate the Qdrant server.
settings = get_settings()
def _looks_like_binary_garbage(text, sample_size=200):
    """Return True if the first *sample_size* characters look like binary data.

    A character counts as suspicious when its code point is above 0xFFFF or
    when it is a control character (below 32) other than newline, carriage
    return or tab.
    """
    # NOTE(review): code points above 0xFFFF also include emoji and other
    # astral-plane characters, so such text is reported as garbage too —
    # confirm this is the intended heuristic.
    return any(
        ord(ch) > 0xFFFF or (ord(ch) < 32 and ch not in "\n\r\t")
        for ch in text[:sample_size]
    )


async def _report_collection(client, name):
    """Print the per-source chunk summary for the collection called *name*."""
    print(f"\n--- Collection: {name} ---")

    info = await client.get_collection(name)
    print(f" Total vectors: {info.points_count}")

    # Only the chunk count and the first payload of each source are needed
    # for the report, so points are grouped page by page instead of keeping
    # the whole collection in memory.
    chunk_counts = defaultdict(int)
    first_payloads = {}
    offset = None
    while True:
        points, offset = await client.scroll(
            collection_name=name,
            limit=100,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            payload = point.payload or {}
            source = payload.get("source", "unknown")
            chunk_counts[source] += 1
            first_payloads.setdefault(source, payload)
        # scroll() hands back None as the next offset once the last page
        # has been returned.
        if offset is None:
            break

    print("\n Documents by source:")
    for source, count in chunk_counts.items():
        print(f"\n Source: {source}")
        print(f" Chunks: {count}")

        # Inspect the first chunk of the source. A missing or null "text"
        # becomes an empty string and any other non-string value is
        # stringified, so the slicing below cannot raise.
        text = first_payloads[source].get("text")
        if text is None:
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        if _looks_like_binary_garbage(text):
            print(" Status: ❌ BINARY GARBAGE (parsing failed)")
        else:
            print(" Status: ✅ PROPER TEXT (parsed correctly)")
        print(f" Preview: {text[:150]}...")


async def check_qdrant():
    """Check Qdrant collections and vectors.

    Connects to the server at ``settings.qdrant_url``, lists its collections
    and, for the collection named ``kb_default``, prints the total point
    count followed by one entry per payload ``source``: the number of
    chunks, a garbage/proper-text verdict for the first chunk and a preview
    of its text.

    Returns:
        None. All results are written to standard output.

    Raises:
        Any exception raised by the Qdrant client calls propagates to the
        caller; the client is closed first in every case.
    """
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)
    try:
        print(f"\n{'='*60}")
        print(f"Qdrant URL: {settings.qdrant_url}")
        print(f"{'='*60}\n")

        # List all collections, but only report on kb_default.
        collections = await client.get_collections()
        for collection in collections.collections:
            if collection.name != "kb_default":
                continue
            await _report_collection(client, collection.name)
    finally:
        # Release the connection even when one of the calls above fails.
        await client.close()
# Script entry point: run the asynchronous check to completion only when the
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(check_qdrant())