# ai-robot-core/ai-service/scripts/cleanup_garbage.py
#
# 116 lines
# 3.7 KiB
# Python

"""
Clean up garbage data from Qdrant vector database.
Removes vectors that contain binary garbage (failed parsing results).
"""
import asyncio
import sys
sys.path.insert(0, ".")
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import PointIdsList
from app.core.config import get_settings
from collections import defaultdict
# Application settings, loaded once when the module is imported. This script
# reads `qdrant_url` and `qdrant_collection_prefix` from it.
settings = get_settings()
def is_garbage_text(
    text: str,
    *,
    sample_size: int = 500,
    threshold: float = 0.1,
) -> bool:
    """Return True when *text* looks like binary garbage instead of real text.

    Only the first ``sample_size`` characters are inspected. A character is
    counted as garbage when its code point is above U+FFFF, or when it is a
    control character (code point below 32) other than newline, carriage
    return or tab. The text is garbage when the share of such characters in
    the sample is strictly greater than ``threshold``.

    Args:
        text: The text to inspect. Empty (falsy) text is treated as garbage.
        sample_size: Number of leading characters to inspect; must be >= 1.
        threshold: Maximum tolerated fraction of garbage characters in the
            sample (0.1 means more than 10% garbage flags the text).

    Returns:
        True if the text is empty or exceeds the garbage threshold,
        False otherwise.

    Raises:
        ValueError: If ``sample_size`` is smaller than 1.
    """
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1")

    # Nothing to inspect: an empty chunk is considered a failed parse.
    if not text:
        return True

    allowed_controls = "\n\r\t"
    sample = text[:sample_size]
    garbage_chars = sum(
        1
        for ch in sample
        if ord(ch) > 0xFFFF or (ord(ch) < 32 and ch not in allowed_controls)
    )
    return garbage_chars > len(sample) * threshold
async def _scroll_all_points(client: "AsyncQdrantClient", collection_name: str) -> list:
    """Fetch every point of *collection_name* with payloads but without vectors."""
    collected = []
    offset = None
    while True:
        batch, offset = await client.scroll(
            collection_name=collection_name,
            limit=100,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        collected.extend(batch)
        # A next-page offset of None marks the last page.
        if offset is None:
            return collected


async def cleanup_garbage() -> None:
    """Interactively delete garbage documents from Qdrant.

    For every collection whose name starts with
    ``settings.qdrant_collection_prefix`` this:

    1. scrolls through all points (payload only),
    2. groups them by the ``source`` payload field ("unknown" when missing),
    3. classifies each source as garbage or good by running
       ``is_garbage_text`` on the ``text`` payload of its *first* point only,
    4. lists the garbage sources with a short preview and, after the user
       answers ``y`` at the prompt, deletes all points of those sources.

    The Qdrant client is always closed, even when a request fails or the
    prompt is interrupted.
    """
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)
    try:
        print(f"\n{'='*60}")
        print("Cleaning up garbage data from Qdrant")
        print(f"URL: {settings.qdrant_url}")
        print(f"{'='*60}\n")

        collections = await client.get_collections()

        for c in collections.collections:
            # Only touch collections that belong to this application.
            if not c.name.startswith(settings.qdrant_collection_prefix):
                continue

            print(f"\n--- Collection: {c.name} ---")

            info = await client.get_collection(c.name)
            print(f" Total vectors: {info.points_count}")

            all_points = await _scroll_all_points(client, c.name)

            # Group chunks by the document they were produced from.
            by_source = defaultdict(list)
            for p in all_points:
                source = p.payload.get("source", "unknown") if p.payload else "unknown"
                by_source[source].append(p)

            garbage_sources = []
            good_sources = []
            for source, chunks in by_source.items():
                # The first chunk is used as the representative sample of the
                # whole document.
                first_point = chunks[0]
                text = first_point.payload.get("text", "") if first_point.payload else ""
                if is_garbage_text(text):
                    garbage_sources.append((source, chunks))
                else:
                    good_sources.append((source, chunks))

            print(f"\n Good documents: {len(good_sources)}")
            print(f" Garbage documents: {len(garbage_sources)}")

            if not garbage_sources:
                print("\n No garbage data found.")
                continue

            print("\n Garbage documents to delete:")
            for source, chunks in garbage_sources:
                print(f" - {source} ({len(chunks)} chunks)")
                payload = chunks[0].payload
                # "text" may be missing or None (such points are classified as
                # garbage), so never slice the raw payload value directly.
                preview = str(payload.get("text") or "")[:80] if payload else ""
                print(f" Preview: {repr(preview)}...")

            confirm = input("\n Delete these garbage documents? (y/n): ")

            if confirm.lower() == 'y':
                for source, chunks in garbage_sources:
                    point_ids = [p.id for p in chunks]
                    await client.delete(
                        collection_name=c.name,
                        points_selector=PointIdsList(points=point_ids),
                    )
                    print(f" Deleted {len(point_ids)} vectors for source {source}")
                print("\n Cleanup complete!")
            else:
                print("\n Cancelled.")
    finally:
        # Release the connection even if a request or the prompt raised.
        await client.close()
if __name__ == "__main__":
    # Script entry point: run the async cleanup to completion when this file
    # is executed directly (not when it is imported).
    cleanup_job = cleanup_garbage()
    asyncio.run(cleanup_job)