ai-robot-core/ai-service/scripts/clear_kb_vectors.py

179 lines
5.9 KiB
Python

"""
Script to clean up vector data for a specific knowledge base.
Clears the Qdrant collection for the given KB ID, allowing re-indexing.
"""
import asyncio
import logging
import sys
from pathlib import Path

from sqlalchemy import select

# Make the ``app`` package importable when this file is executed directly.
# The service root is derived from this file's own location rather than a
# hard-coded, machine-specific absolute path.
# NOTE(review): assumes this script lives in ``<service root>/scripts/`` --
# confirm if the file is ever moved.
_SERVICE_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_SERVICE_ROOT))

# These imports must come after the sys.path adjustment above.
from app.core.config import get_settings  # noqa: E402
from app.core.qdrant_client import get_qdrant_client  # noqa: E402
from app.core.database import get_session  # noqa: E402
from app.models.entities import KnowledgeBase, Document  # noqa: E402

# Timestamped, INFO-level logging for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
async def get_knowledge_base_info(kb_id: str) -> dict | None:
    """Look up a knowledge base and summarize it together with its documents.

    Args:
        kb_id: Identifier matched against ``KnowledgeBase.id`` and
            ``Document.kb_id``.

    Returns:
        A dict with the keys ``id``, ``tenant_id``, ``name``, ``doc_count``
        and ``document_ids`` when a matching knowledge base exists,
        otherwise ``None``.
    """
    async for db in get_session():
        kb_query = select(KnowledgeBase).where(KnowledgeBase.id == kb_id)
        knowledge_base = (await db.execute(kb_query)).scalar_one_or_none()
        if not knowledge_base:
            # Nothing found through this session; fall through to the
            # final ``return None`` once the session generator is done.
            continue

        docs_query = select(Document).where(Document.kb_id == kb_id)
        docs = (await db.execute(docs_query)).scalars().all()
        return {
            "id": str(knowledge_base.id),
            "tenant_id": knowledge_base.tenant_id,
            "name": knowledge_base.name,
            "doc_count": len(docs),
            "document_ids": [str(entry.id) for entry in docs],
        }

    return None
async def list_kb_collections(tenant_id: str, kb_id: str) -> list[str]:
    """List Qdrant collections whose names appear to belong to a knowledge base.

    Matching is a heuristic on the first eight characters of the KB ID: a
    collection matches when its name contains that prefix (with dashes mapped
    to underscores), or when its name with underscores removed contains the
    prefix with dashes removed.

    Args:
        tenant_id: Tenant identifier. Kept for interface compatibility; the
            match is based on ``kb_id`` only.
        kb_id: Knowledge base ID.

    Returns:
        Names of the matching collections, in the order they were listed.
    """
    client = await get_qdrant_client()
    qdrant = await client.get_client()
    response = await qdrant.get_collections()

    # Both needles depend only on kb_id, so build them once instead of
    # recomputing them for every collection name.
    underscored_prefix = kb_id.replace('-', '_')[:8]
    compact_prefix = kb_id.replace('-', '')[:8]

    return [
        collection.name
        for collection in response.collections
        if underscored_prefix in collection.name
        or compact_prefix in collection.name.replace('_', '')
    ]
async def clear_kb_vector_data(tenant_id: str, kb_id: str, delete_docs: bool = False) -> bool:
    """Clear vector data for a specific knowledge base.

    Deletes the knowledge base's Qdrant collection if it exists. When
    ``delete_docs`` is set, also deletes the ``Document`` rows of the
    knowledge base and resets its ``doc_count`` to zero.

    Args:
        tenant_id: Tenant identifier.
        kb_id: Knowledge base ID.
        delete_docs: Whether to also delete document records from database.

    Returns:
        True if successful, False if any step inside the guarded section
        raised (the error is logged, not re-raised).
    """
    # Imported locally: at module level ``datetime`` is only bound inside the
    # ``__main__`` guard, so relying on the global raised NameError whenever
    # this function was called from an importing module.
    from datetime import datetime, timezone

    client = await get_qdrant_client()
    qdrant = await client.get_client()
    collection_name = client.get_kb_collection_name(tenant_id, kb_id)

    try:
        if await qdrant.collection_exists(collection_name):
            await qdrant.delete_collection(collection_name=collection_name)
            logger.info(f"Deleted Qdrant collection: {collection_name}")
        else:
            logger.info(f"Collection {collection_name} does not exist")

        if delete_docs:
            async for session in get_session():
                doc_stmt = select(Document).where(Document.kb_id == kb_id)
                doc_result = await session.execute(doc_stmt)
                documents = doc_result.scalars().all()
                for doc in documents:
                    await session.delete(doc)

                stmt = select(KnowledgeBase).where(KnowledgeBase.id == kb_id)
                result = await session.execute(stmt)
                kb = result.scalar_one_or_none()
                if kb:
                    kb.doc_count = 0
                    # Same naive-UTC value ``datetime.utcnow()`` produced,
                    # without the deprecated call.
                    kb.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)

                await session.commit()
                logger.info(f"Deleted {len(documents)} document records from database")
                # Only the first yielded session is used.
                break

        return True
    except Exception as e:
        # Boundary handler: report failure to the caller as False, but keep
        # the traceback in the log so the root cause is not lost.
        logger.exception(f"Failed to clear KB vector data: {e}")
        return False
async def main(kb_id: str, delete_docs: bool = False) -> bool:
    """Interactively clear the vector data of one knowledge base.

    Looks up the knowledge base, logs what was found, prints a warning that
    names the collection about to be deleted, and asks for confirmation on
    stdin before delegating to ``clear_kb_vector_data``.

    Args:
        kb_id: Knowledge base ID to clear.
        delete_docs: Whether document records in the database are deleted too.

    Returns:
        True when the cleanup succeeded; False when the knowledge base was
        not found, the user did not answer "yes", or the cleanup failed.
    """
    logger.info(f"Starting cleanup for knowledge base: {kb_id}")

    kb_info = await get_knowledge_base_info(kb_id)
    if not kb_info:
        logger.error(f"Knowledge base not found: {kb_id}")
        return False

    tenant_id = kb_info['tenant_id']
    logger.info("Found knowledge base:")
    logger.info(f" - ID: {kb_info['id']}")
    logger.info(f" - Name: {kb_info['name']}")
    logger.info(f" - Tenant: {tenant_id}")
    logger.info(f" - Document count: {kb_info['doc_count']}")

    matching_collections = await list_kb_collections(tenant_id, kb_id)
    if matching_collections:
        logger.info(f" - Related collections: {matching_collections}")

    # Ask the Qdrant client wrapper for the collection name instead of
    # rebuilding it by hand, so the warning shows exactly the collection
    # that clear_kb_vector_data() will delete.
    client = await get_qdrant_client()
    collection_name = client.get_kb_collection_name(tenant_id, kb_id)

    print()
    print("=" * 60)
    print("WARNING: This will delete all vector data for this knowledge base!")
    print(f"Collection to delete: {collection_name}")
    if delete_docs:
        print("Document records in database will also be deleted!")
    print("=" * 60)
    print()

    # Only an explicit "yes" proceeds; surrounding whitespace is ignored.
    confirm = input("Continue? (yes/no): ")
    if confirm.strip().lower() != "yes":
        print("Cancelled")
        return False

    success = await clear_kb_vector_data(
        tenant_id=tenant_id,
        kb_id=kb_id,
        delete_docs=delete_docs,
    )

    if success:
        logger.info(f"Successfully cleared vector data for KB: {kb_id}")
        logger.info("You can now re-index the knowledge base documents.")
    else:
        logger.error(f"Failed to clear vector data for KB: {kb_id}")
    return success
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run the cleanup, and report
    # the outcome through the process exit status.
    import argparse
    # Left in place so the module-level name ``datetime`` keeps resolving for
    # any code in this file that relies on it when run as a script.
    from datetime import datetime

    parser = argparse.ArgumentParser(description="Clear vector data for a knowledge base")
    parser.add_argument("kb_id", help="Knowledge base ID to clear")
    parser.add_argument("--delete-docs", action="store_true",
                        help="Also delete document records from database")
    args = parser.parse_args()

    # main() returns False when the KB is missing, the user cancels, or the
    # cleanup fails; surface that as a non-zero exit code instead of always
    # exiting 0.
    succeeded = asyncio.run(main(args.kb_id, args.delete_docs))
    sys.exit(0 if succeeded else 1)