116 lines
3.7 KiB
Python
116 lines
3.7 KiB
Python
|
|
"""
|
||
|
|
Clean up garbage data from Qdrant vector database.
|
||
|
|
Removes vectors that contain binary garbage (failed parsing results).
|
||
|
|
"""
|
||
|
|
import asyncio
import sys

# Allow running this script from the repository root without installing the
# package — makes the `app.core.config` import below resolvable.
sys.path.insert(0, ".")

from qdrant_client import AsyncQdrantClient
from qdrant_client.models import PointIdsList
from app.core.config import get_settings
from collections import defaultdict

# Application settings (Qdrant URL, collection prefix) resolved once at
# import time and shared by every function in this script.
settings = get_settings()
|
||
|
|
|
||
|
|
|
||
|
|
def is_garbage_text(text: str) -> bool:
    """Heuristically decide whether *text* looks like binary garbage.

    Empty text counts as garbage. Otherwise only the first 500 characters
    are examined: a character is suspicious when its code point lies above
    the Basic Multilingual Plane (> 0xFFFF) or it is a control character
    other than newline, carriage return, or tab. The text is flagged when
    more than 10% of the sampled characters are suspicious.
    """
    if not text:
        return True

    sample = text[:500]

    def _suspicious(ch: str) -> bool:
        # Non-BMP code points and non-whitespace control characters are
        # the telltale residue of decoding binary data as text.
        cp = ord(ch)
        return cp > 0xFFFF or (cp < 32 and ch not in '\n\r\t')

    suspicious_count = sum(map(_suspicious, sample))
    return suspicious_count > len(sample) * 0.1
|
||
|
|
|
||
|
|
|
||
|
|
async def cleanup_garbage():
    """Clean up garbage data from Qdrant.

    Scans every collection whose name carries the configured prefix,
    groups points by their "source" payload field, and flags a source as
    garbage when its first chunk fails ``is_garbage_text``. After an
    interactive confirmation, all chunks of each garbage source are
    deleted.

    Fix over the original: the client is now closed in a ``finally``
    block, so the connection no longer leaks when scanning or deletion
    raises.
    """
    client = AsyncQdrantClient(url=settings.qdrant_url, check_compatibility=False)

    try:
        print(f"\n{'='*60}")
        print(f"Cleaning up garbage data from Qdrant")
        print(f"URL: {settings.qdrant_url}")
        print(f"{'='*60}\n")

        collections = await client.get_collections()

        for c in collections.collections:
            # Only touch collections owned by this application.
            if not c.name.startswith(settings.qdrant_collection_prefix):
                continue

            print(f"\n--- Collection: {c.name} ---")

            info = await client.get_collection(c.name)
            print(f" Total vectors: {info.points_count}")

            # Page through every point; scroll returns (points, next_offset)
            # and signals the end with next_offset == None.
            all_points = []
            offset = None

            while True:
                points, offset = await client.scroll(
                    collection_name=c.name,
                    limit=100,
                    offset=offset,
                    with_payload=True,
                    with_vectors=False,
                )
                all_points.extend(points)
                if offset is None:
                    break

            # Group chunks by their originating document.
            by_source = defaultdict(list)
            for p in all_points:
                source = p.payload.get("source", "unknown") if p.payload else "unknown"
                by_source[source].append(p)

            garbage_sources = []
            good_sources = []

            for source, points in by_source.items():
                # Sampling only the first chunk assumes a document's chunks
                # are uniformly garbage or uniformly clean.
                first_point = points[0]
                text = first_point.payload.get("text", "") if first_point.payload else ""

                if is_garbage_text(text):
                    garbage_sources.append((source, points))
                else:
                    good_sources.append((source, points))

            print(f"\n Good documents: {len(good_sources)}")
            print(f" Garbage documents: {len(garbage_sources)}")

            if garbage_sources:
                print(f"\n Garbage documents to delete:")
                for source, points in garbage_sources:
                    print(f" - {source} ({len(points)} chunks)")
                    preview = ""
                    if points[0].payload:
                        preview = points[0].payload.get("text", "")[:80]
                    print(f" Preview: {repr(preview)}...")

                # Interactive safety gate — nothing is deleted without consent.
                confirm = input("\n Delete these garbage documents? (y/n): ")

                if confirm.lower() == 'y':
                    for source, points in garbage_sources:
                        point_ids = [p.id for p in points]

                        await client.delete(
                            collection_name=c.name,
                            points_selector=PointIdsList(points=point_ids)
                        )
                        print(f" Deleted {len(point_ids)} vectors for source {source}")

                    print(f"\n Cleanup complete!")
                else:
                    print(f"\n Cancelled.")
            else:
                print(f"\n No garbage data found.")
    finally:
        # Always release the client's connection pool, even when the scan
        # or deletion above raised.
        await client.close()
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: drive the async cleanup on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(cleanup_garbage())
|