"""Clean up garbage data from the Qdrant vector database.

Removes vectors that contain binary garbage (failed parsing results).
"""

import asyncio
import sys
from collections import defaultdict

# Put the current working directory first on the import path; this has to
# happen before the ``app`` import below.
sys.path.insert(0, ".")

from qdrant_client import AsyncQdrantClient  # noqa: E402
from qdrant_client.models import PointIdsList  # noqa: E402

from app.core.config import get_settings  # noqa: E402

settings = get_settings()

# Heuristic parameters used by is_garbage_text().
_SAMPLE_CHARS = 500
_GARBAGE_RATIO = 0.1
_ALLOWED_CONTROL_CHARS = "\n\r\t"

# Number of points requested per scroll page.
_SCROLL_PAGE_SIZE = 100

# Number of characters of chunk text shown in the deletion preview.
_PREVIEW_CHARS = 80


def is_garbage_text(text: str) -> bool:
    """Check if text contains binary garbage.

    Only the first 500 characters are inspected. A character counts as
    garbage when its code point is above U+FFFF, or when it is a control
    character (code point below 32) other than newline, carriage return
    or tab.

    Args:
        text: The chunk text to inspect.

    Returns:
        True when ``text`` is empty/falsy or when more than 10% of the
        inspected sample consists of garbage characters; False otherwise.
    """
    if not text:
        return True

    sample = text[:_SAMPLE_CHARS]
    garbage_chars = sum(
        1
        for c in sample
        if ord(c) > 0xFFFF or (ord(c) < 32 and c not in _ALLOWED_CONTROL_CHARS)
    )
    return garbage_chars > len(sample) * _GARBAGE_RATIO


def _payload_text(point) -> str:
    """Return the point's ``text`` payload value, or "" when it is absent or falsy."""
    if not point.payload:
        return ""
    # ``or ""`` also covers an explicit ``"text": None``; such a point is
    # classified as garbage, so the preview slice must not see None.
    return point.payload.get("text") or ""


async def _group_points_by_source(client, collection_name: str) -> dict:
    """Scroll through a whole collection and group its points by ``source``.

    Points without a payload, or without a ``source`` key, are grouped
    under "unknown". Vectors are not fetched, only payloads.
    """
    by_source = defaultdict(list)
    offset = None
    while True:
        points, offset = await client.scroll(
            collection_name=collection_name,
            limit=_SCROLL_PAGE_SIZE,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        # Group page by page instead of buffering every point in a second list.
        for p in points:
            source = p.payload.get("source", "unknown") if p.payload else "unknown"
            by_source[source].append(p)
        # The scroll call returns None as the next offset after the last page.
        if offset is None:
            break
    return by_source


async def _cleanup_collection(client, collection_name: str) -> None:
    """Report garbage documents in one collection and delete them on confirmation."""
    print(f"\n--- Collection: {collection_name} ---")

    info = await client.get_collection(collection_name)
    print(f" Total vectors: {info.points_count}")

    by_source = await _group_points_by_source(client, collection_name)

    # A source is judged by its first chunk only.
    garbage_sources = []
    good_count = 0
    for source, points in by_source.items():
        if is_garbage_text(_payload_text(points[0])):
            garbage_sources.append((source, points))
        else:
            good_count += 1

    print(f"\n Good documents: {good_count}")
    print(f" Garbage documents: {len(garbage_sources)}")

    if not garbage_sources:
        print("\n No garbage data found.")
        return

    print("\n Garbage documents to delete:")
    for source, points in garbage_sources:
        print(f" - {source} ({len(points)} chunks)")
        preview = _payload_text(points[0])[:_PREVIEW_CHARS]
        print(f" Preview: {repr(preview)}...")

    # NOTE: input() blocks the event loop; nothing else runs concurrently
    # in this script.
    confirm = input("\n Delete these garbage documents? (y/n): ")
    if confirm.strip().lower() != "y":
        print("\n Cancelled.")
        return

    for source, points in garbage_sources:
        point_ids = [p.id for p in points]
        await client.delete(
            collection_name=collection_name,
            points_selector=PointIdsList(points=point_ids),
        )
        print(f" Deleted {len(point_ids)} vectors for source {source}")
    print("\n Cleanup complete!")


async def cleanup_garbage() -> None:
    """Clean up garbage data from Qdrant.

    Visits every collection whose name starts with
    ``settings.qdrant_collection_prefix``, lists the sources whose first
    chunk looks like binary garbage, and deletes all points of those
    sources after interactive confirmation. The client is closed even
    when a step fails.
    """
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)
    try:
        print(f"\n{'=' * 60}")
        print("Cleaning up garbage data from Qdrant")
        print(f"URL: {settings.qdrant_url}")
        print(f"{'=' * 60}\n")

        collections = await client.get_collections()
        for c in collections.collections:
            if not c.name.startswith(settings.qdrant_collection_prefix):
                continue
            await _cleanup_collection(client, c.name)
    finally:
        await client.close()


if __name__ == "__main__":
    asyncio.run(cleanup_garbage())