81 lines
2.6 KiB
Python
81 lines
2.6 KiB
Python
"""
Check Qdrant vector database contents - detailed view.
"""
|
|
import asyncio
|
|
import sys
|
|
# Prepend the current working directory to the module search path so the
# `app.core.config` import further down can resolve (presumably the script is
# meant to be launched from the project root -- confirm).
sys.path.insert(0, ".")
|
|
|
|
from qdrant_client import AsyncQdrantClient
|
|
from app.core.config import get_settings
|
|
from collections import defaultdict
|
|
|
|
# Settings object fetched once at import time; check_qdrant() reads its
# `qdrant_url` attribute to know where to connect.
settings = get_settings()
|
|
|
|
|
|
def _looks_like_binary(text, sample_size=200):
    """Return True when the start of *text* looks like mis-decoded binary data.

    Only the first *sample_size* characters are inspected. A character is
    treated as suspicious when its code point is above U+FFFF or when it is a
    control character (< 32) other than newline, carriage return or tab.
    """
    return any(
        ord(ch) > 0xFFFF or (ord(ch) < 32 and ch not in "\n\r\t")
        for ch in text[:sample_size]
    )


async def _group_points_by_source(client, collection_name, page_size=100):
    """Scroll through a whole collection and group its points by source.

    Args:
        client: Connected Qdrant client exposing an awaitable ``scroll``.
        collection_name: Name of the collection to read.
        page_size: Number of points requested per scroll call.

    Returns:
        A mapping from the payload's ``"source"`` value (``"unknown"`` when the
        point has no payload or no such key) to the list of points carrying
        it, in the order the points were returned by the server.
    """
    by_source = defaultdict(list)
    offset = None
    while True:
        page, offset = await client.scroll(
            collection_name=collection_name,
            limit=page_size,
            offset=offset,
            with_payload=True,
            with_vectors=False,  # vectors are not needed for this report
        )
        for point in page:
            source = point.payload.get("source", "unknown") if point.payload else "unknown"
            by_source[source].append(point)
        # scroll() hands back None as the next offset once the last page is read.
        if offset is None:
            return by_source


async def check_qdrant(collection_name="kb_default"):
    """Print a per-source summary of one Qdrant collection.

    Connects to ``settings.qdrant_url``, looks for *collection_name* among the
    existing collections and, when found, prints its point count followed by,
    for every distinct payload ``"source"``, the number of chunks, a
    text/garbage verdict for the first chunk and a 150-character preview.
    Nothing is printed for the collection when it does not exist.

    Args:
        collection_name: Collection to inspect. Defaults to ``"kb_default"``,
            the name the script originally hard-coded.

    Returns:
        None. All results are written to standard output.
    """
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)
    try:
        print(f"\n{'='*60}")
        print(f"Qdrant URL: {settings.qdrant_url}")
        print(f"{'='*60}\n")

        listing = await client.get_collections()

        for collection in listing.collections:
            if collection.name != collection_name:
                continue

            print(f"\n--- Collection: {collection.name} ---")

            info = await client.get_collection(collection.name)
            print(f" Total vectors: {info.points_count}")

            by_source = await _group_points_by_source(client, collection.name)

            print("\n Documents by source:")
            for source, chunks in by_source.items():
                print(f"\n Source: {source}")
                print(f" Chunks: {len(chunks)}")

                # Only the first chunk of each source is sampled.
                first_chunk = chunks[0]
                # `or ""` also covers a payload whose "text" key holds None,
                # which would otherwise fail on slicing below.
                text = (first_chunk.payload.get("text", "") if first_chunk.payload else "") or ""

                if _looks_like_binary(text):
                    print(f" Status: ❌ BINARY GARBAGE (parsing failed)")
                else:
                    print(f" Status: ✅ PROPER TEXT (parsed correctly)")

                print(f" Preview: {text[:150]}...")
    finally:
        # Release the connection even when one of the requests above raises.
        await client.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: run the async check on a fresh event loop.
    asyncio.run(check_qdrant())
|