# ai-robot-core/ai-service/app/services/retrieval/metadata.py
#
# 211 lines
# 6.4 KiB
# Python

"""
Metadata models for RAG optimization.
Implements structured metadata for knowledge chunks.
Reference: rag-optimization/spec.md Section 3.2
"""
from dataclasses import dataclass, field
from datetime import date, datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel
class RetrievalStrategy(str, Enum):
    """Closed set of retrieval strategies a request can select.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # Single-source strategies.
    VECTOR_ONLY = "vector"
    BM25_ONLY = "bm25"

    # Combined strategies.
    HYBRID = "hybrid"
    TWO_STAGE = "two_stage"
class ChunkMetadataModel(BaseModel):
    """Pydantic representation of chunk metadata, used for API serialization.

    Every field has a default, so the model can be built with no arguments.
    The validity bounds are carried as optional strings here.
    """

    # Classification.
    category: str = ""
    subcategory: str = ""
    target_audience: list[str] = []

    # Provenance.
    source_doc: str = ""
    source_url: str = ""
    department: str = ""

    # Validity window; ``None`` means the bound is not set.
    valid_from: str | None = None
    valid_until: str | None = None

    # Ranking hints.
    priority: int = 5
    keywords: list[str] = []
@dataclass
class ChunkMetadata:
"""
Metadata for knowledge chunks.
Reference: rag-optimization/spec.md Section 3.2.2
"""
category: str = ""
subcategory: str = ""
target_audience: list[str] = field(default_factory=list)
source_doc: str = ""
source_url: str = ""
department: str = ""
valid_from: date | None = None
valid_until: date | None = None
priority: int = 5
keywords: list[str] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for storage."""
return {
"category": self.category,
"subcategory": self.subcategory,
"target_audience": self.target_audience,
"source_doc": self.source_doc,
"source_url": self.source_url,
"department": self.department,
"valid_from": self.valid_from.isoformat() if self.valid_from else None,
"valid_until": self.valid_until.isoformat() if self.valid_until else None,
"priority": self.priority,
"keywords": self.keywords,
}
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "ChunkMetadata":
"""Create from dictionary."""
return cls(
category=data.get("category", ""),
subcategory=data.get("subcategory", ""),
target_audience=data.get("target_audience", []),
source_doc=data.get("source_doc", ""),
source_url=data.get("source_url", ""),
department=data.get("department", ""),
valid_from=date.fromisoformat(data["valid_from"]) if data.get("valid_from") else None,
valid_until=date.fromisoformat(data["valid_until"]) if data.get("valid_until") else None,
priority=data.get("priority", 5),
keywords=data.get("keywords", []),
)
@dataclass
class MetadataFilter:
"""
Filter conditions for metadata-based retrieval.
Reference: rag-optimization/spec.md Section 4.1
"""
categories: list[str] | None = None
target_audiences: list[str] | None = None
departments: list[str] | None = None
valid_only: bool = True
min_priority: int | None = None
keywords: list[str] | None = None
def to_qdrant_filter(self) -> dict[str, Any] | None:
"""Convert to Qdrant filter format."""
conditions = []
if self.categories:
conditions.append({
"key": "metadata.category",
"match": {"any": self.categories}
})
if self.departments:
conditions.append({
"key": "metadata.department",
"match": {"any": self.departments}
})
if self.target_audiences:
conditions.append({
"key": "metadata.target_audience",
"match": {"any": self.target_audiences}
})
if self.valid_only:
today = date.today().isoformat()
conditions.append({
"should": [
{"key": "metadata.valid_until", "match": {"value": None}},
{"key": "metadata.valid_until", "range": {"gte": today}}
]
})
if self.min_priority is not None:
conditions.append({
"key": "metadata.priority",
"range": {"lte": self.min_priority}
})
if not conditions:
return None
if len(conditions) == 1:
return {"must": conditions}
return {"must": conditions}
@dataclass
class KnowledgeChunk:
    """
    A chunk of knowledge text with three embedding vectors and its metadata.

    Reference: rag-optimization/spec.md Section 3.2.1
    """

    # Identity and text.
    chunk_id: str
    document_id: str
    content: str

    # Embeddings; each defaults to an empty list.
    embedding_full: list[float] = field(default_factory=list)
    embedding_256: list[float] = field(default_factory=list)
    embedding_512: list[float] = field(default_factory=list)

    metadata: ChunkMetadata = field(default_factory=ChunkMetadata)

    # Timestamps default to the time of construction, taken from
    # datetime.utcnow (a naive datetime expressed in UTC).
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)

    def to_qdrant_point(self, point_id: int | str) -> dict[str, Any]:
        """Build the Qdrant point dictionary for this chunk.

        Args:
            point_id: Identifier stored under ``"id"`` in the point.

        Returns:
            A dict with ``"id"``, the named vectors under ``"vector"``, and
            the chunk fields under ``"payload"``. Timestamps are rendered
            with ``isoformat()`` and metadata with its ``to_dict()``.
        """
        named_vectors = {
            "full": self.embedding_full,
            "dim_256": self.embedding_256,
            "dim_512": self.embedding_512,
        }
        payload = {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "text": self.content,
            "metadata": self.metadata.to_dict(),
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
        }
        return {"id": point_id, "vector": named_vectors, "payload": payload}
@dataclass
class RetrieveRequest:
    """
    Parameters of a single knowledge-retrieval call.

    Reference: rag-optimization/spec.md Section 4.1
    """

    query: str
    # Filled in by __post_init__ when left empty.
    query_with_prefix: str = ""
    top_k: int = 10
    filters: MetadataFilter | None = None
    strategy: RetrievalStrategy = RetrievalStrategy.HYBRID

    def __post_init__(self) -> None:
        """Default ``query_with_prefix`` to ``search_query:<query>``."""
        self.query_with_prefix = (
            self.query_with_prefix or f"search_query:{self.query}"
        )
@dataclass
class RetrieveResult:
    """
    One retrieved chunk together with its scores.

    Reference: rag-optimization/spec.md Section 4.1
    """

    # Required: which chunk, its text, and its score.
    chunk_id: str
    content: str
    score: float

    # Per-source scores; both default to 0.0.
    vector_score: float = 0.0
    bm25_score: float = 0.0

    metadata: ChunkMetadata = field(default_factory=ChunkMetadata)

    # Defaults to 0.
    rank: int = 0