211 lines
6.4 KiB
Python
211 lines
6.4 KiB
Python
|
|
"""
|
||
|
|
Metadata models for RAG optimization.
Implements structured metadata for knowledge chunks.
Reference: rag-optimization/spec.md Section 3.2
|
||
|
|
"""
|
||
|
|
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from datetime import date, datetime
|
||
|
|
from enum import Enum
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
from pydantic import BaseModel
|
||
|
|
|
||
|
|
|
||
|
|
class RetrievalStrategy(str, Enum):
    """Enumerates the retrieval modes a request can select.

    Every member is also a ``str`` instance, so a member compares equal to
    its plain string value and can be looked up from that value with
    ``RetrievalStrategy("hybrid")``.
    """

    VECTOR_ONLY = "vector"
    BM25_ONLY = "bm25"
    HYBRID = "hybrid"
    TWO_STAGE = "two_stage"
|
||
|
|
|
||
|
|
|
||
|
|
class ChunkMetadataModel(BaseModel):
    """Pydantic representation of chunk metadata, used for API serialization.

    Declares the same field names as the ``ChunkMetadata`` dataclass. The
    difference visible here is that the validity dates are typed as
    optional strings instead of ``date`` objects.
    """

    # Classification of the chunk.
    category: str = ""
    subcategory: str = ""
    # A literal list default is acceptable on a pydantic model: pydantic
    # copies field defaults for each instance.
    target_audience: list[str] = []

    # Provenance of the chunk.
    source_doc: str = ""
    source_url: str = ""
    department: str = ""

    # Validity window, carried as strings in this model.
    valid_from: str | None = None
    valid_until: str | None = None

    priority: int = 5
    keywords: list[str] = []
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class ChunkMetadata:
|
||
|
|
"""
|
||
|
|
Metadata for knowledge chunks.
|
||
|
|
Reference: rag-optimization/spec.md Section 3.2.2
|
||
|
|
"""
|
||
|
|
category: str = ""
|
||
|
|
subcategory: str = ""
|
||
|
|
target_audience: list[str] = field(default_factory=list)
|
||
|
|
source_doc: str = ""
|
||
|
|
source_url: str = ""
|
||
|
|
department: str = ""
|
||
|
|
valid_from: date | None = None
|
||
|
|
valid_until: date | None = None
|
||
|
|
priority: int = 5
|
||
|
|
keywords: list[str] = field(default_factory=list)
|
||
|
|
|
||
|
|
def to_dict(self) -> dict[str, Any]:
|
||
|
|
"""Convert to dictionary for storage."""
|
||
|
|
return {
|
||
|
|
"category": self.category,
|
||
|
|
"subcategory": self.subcategory,
|
||
|
|
"target_audience": self.target_audience,
|
||
|
|
"source_doc": self.source_doc,
|
||
|
|
"source_url": self.source_url,
|
||
|
|
"department": self.department,
|
||
|
|
"valid_from": self.valid_from.isoformat() if self.valid_from else None,
|
||
|
|
"valid_until": self.valid_until.isoformat() if self.valid_until else None,
|
||
|
|
"priority": self.priority,
|
||
|
|
"keywords": self.keywords,
|
||
|
|
}
|
||
|
|
|
||
|
|
@classmethod
|
||
|
|
def from_dict(cls, data: dict[str, Any]) -> "ChunkMetadata":
|
||
|
|
"""Create from dictionary."""
|
||
|
|
return cls(
|
||
|
|
category=data.get("category", ""),
|
||
|
|
subcategory=data.get("subcategory", ""),
|
||
|
|
target_audience=data.get("target_audience", []),
|
||
|
|
source_doc=data.get("source_doc", ""),
|
||
|
|
source_url=data.get("source_url", ""),
|
||
|
|
department=data.get("department", ""),
|
||
|
|
valid_from=date.fromisoformat(data["valid_from"]) if data.get("valid_from") else None,
|
||
|
|
valid_until=date.fromisoformat(data["valid_until"]) if data.get("valid_until") else None,
|
||
|
|
priority=data.get("priority", 5),
|
||
|
|
keywords=data.get("keywords", []),
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
|
||
|
|
class MetadataFilter:
|
||
|
|
"""
|
||
|
|
Filter conditions for metadata-based retrieval.
|
||
|
|
Reference: rag-optimization/spec.md Section 4.1
|
||
|
|
"""
|
||
|
|
categories: list[str] | None = None
|
||
|
|
target_audiences: list[str] | None = None
|
||
|
|
departments: list[str] | None = None
|
||
|
|
valid_only: bool = True
|
||
|
|
min_priority: int | None = None
|
||
|
|
keywords: list[str] | None = None
|
||
|
|
|
||
|
|
def to_qdrant_filter(self) -> dict[str, Any] | None:
|
||
|
|
"""Convert to Qdrant filter format."""
|
||
|
|
conditions = []
|
||
|
|
|
||
|
|
if self.categories:
|
||
|
|
conditions.append({
|
||
|
|
"key": "metadata.category",
|
||
|
|
"match": {"any": self.categories}
|
||
|
|
})
|
||
|
|
|
||
|
|
if self.departments:
|
||
|
|
conditions.append({
|
||
|
|
"key": "metadata.department",
|
||
|
|
"match": {"any": self.departments}
|
||
|
|
})
|
||
|
|
|
||
|
|
if self.target_audiences:
|
||
|
|
conditions.append({
|
||
|
|
"key": "metadata.target_audience",
|
||
|
|
"match": {"any": self.target_audiences}
|
||
|
|
})
|
||
|
|
|
||
|
|
if self.valid_only:
|
||
|
|
today = date.today().isoformat()
|
||
|
|
conditions.append({
|
||
|
|
"should": [
|
||
|
|
{"key": "metadata.valid_until", "match": {"value": None}},
|
||
|
|
{"key": "metadata.valid_until", "range": {"gte": today}}
|
||
|
|
]
|
||
|
|
})
|
||
|
|
|
||
|
|
if self.min_priority is not None:
|
||
|
|
conditions.append({
|
||
|
|
"key": "metadata.priority",
|
||
|
|
"range": {"lte": self.min_priority}
|
||
|
|
})
|
||
|
|
|
||
|
|
if not conditions:
|
||
|
|
return None
|
||
|
|
|
||
|
|
if len(conditions) == 1:
|
||
|
|
return {"must": conditions}
|
||
|
|
|
||
|
|
return {"must": conditions}
|
||
|
|
|
||
|
|
|
||
|
|
def _utc_now() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Produces the same kind of value as ``datetime.utcnow()`` (no tzinfo)
    without calling that function, which is deprecated since Python 3.12.
    """
    from datetime import timezone  # local import: only this helper needs it

    return datetime.now(timezone.utc).replace(tzinfo=None)


@dataclass
class KnowledgeChunk:
    """
    Knowledge chunk with multi-dimensional embeddings.

    Reference: rag-optimization/spec.md Section 3.2.1

    ``chunk_id``, ``document_id`` and ``content`` are required; the three
    embedding lists default to empty and ``metadata`` to an empty
    ``ChunkMetadata``. ``created_at`` and ``updated_at`` default to the
    naive UTC time at construction.
    """

    chunk_id: str
    document_id: str
    content: str
    embedding_full: list[float] = field(default_factory=list)
    embedding_256: list[float] = field(default_factory=list)
    embedding_512: list[float] = field(default_factory=list)
    metadata: ChunkMetadata = field(default_factory=ChunkMetadata)
    created_at: datetime = field(default_factory=_utc_now)
    updated_at: datetime = field(default_factory=_utc_now)

    def to_qdrant_point(self, point_id: int | str) -> dict[str, Any]:
        """Convert to Qdrant point format.

        Args:
            point_id: Identifier to store under the point's ``"id"`` key.

        Returns:
            A dict with ``"id"``, a ``"vector"`` mapping of the three
            embeddings under the names ``full``, ``dim_256`` and
            ``dim_512``, and a ``"payload"`` carrying the ids, the text,
            the metadata dict and ISO-formatted timestamps.
        """
        vectors = {
            "full": self.embedding_full,
            "dim_256": self.embedding_256,
            "dim_512": self.embedding_512,
        }
        payload = {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            # The chunk content is stored under the payload key "text".
            "text": self.content,
            "metadata": self.metadata.to_dict(),
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
        }
        return {"id": point_id, "vector": vectors, "payload": payload}
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class RetrieveRequest:
    """
    Parameters of a knowledge retrieval call.

    Reference: rag-optimization/spec.md Section 4.1

    Only ``query`` is required. When ``query_with_prefix`` is left empty it
    is derived from ``query`` after construction.
    """

    query: str
    query_with_prefix: str = ""
    top_k: int = 10
    filters: MetadataFilter | None = None
    strategy: RetrievalStrategy = RetrievalStrategy.HYBRID

    def __post_init__(self) -> None:
        """Fill in ``query_with_prefix`` when the caller did not supply one."""
        if self.query_with_prefix:
            # An explicit prefixed query is kept untouched.
            return
        self.query_with_prefix = f"search_query:{self.query}"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class RetrieveResult:
    """
    A single item returned by knowledge retrieval.

    Reference: rag-optimization/spec.md Section 4.1

    ``chunk_id``, ``content`` and ``score`` are required. The per-source
    scores default to ``0.0``, ``metadata`` to an empty ``ChunkMetadata``
    and ``rank`` to ``0``.
    """

    # Required fields.
    chunk_id: str
    content: str
    score: float

    # Optional fields.
    vector_score: float = 0.0
    bm25_score: float = 0.0
    metadata: ChunkMetadata = field(default_factory=ChunkMetadata)
    rank: int = 0
|