ai-robot-core/ai-service/app/services/embedding/nomic_provider.py

292 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Nomic embedding provider with task prefixes and Matryoshka support.
Implements RAG optimization spec:
- Task prefixes: search_document: / search_query:
- Matryoshka dimension truncation: 256/512/768 dimensions
"""
import logging
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
import httpx
import numpy as np
from app.services.embedding.base import (
EmbeddingConfig,
EmbeddingException,
EmbeddingProvider,
)
logger = logging.getLogger(__name__)
class EmbeddingTask(str, Enum):
    """Task prefix selector for the nomic-embed-text v1.5 model."""

    DOCUMENT = "search_document"  # indexing side
    QUERY = "search_query"        # retrieval side


@dataclass
class NomicEmbeddingResult:
    """Embedding output carrying the full vector plus Matryoshka variants."""

    embedding_full: list[float]  # untruncated vector as returned by the model
    embedding_256: list[float]   # 256-d truncated variant (re-normalized by the provider)
    embedding_512: list[float]   # 512-d truncated variant (re-normalized by the provider)
    dimension: int               # length of embedding_full
    model: str                   # model identifier used for the request
    task: EmbeddingTask          # which prefix was applied to the input text
    latency_ms: float = 0.0      # wall-clock generation time in milliseconds
    metadata: dict[str, Any] = field(default_factory=dict)  # free-form extras
class NomicEmbeddingProvider(EmbeddingProvider):
    """
    Nomic-embed-text v1.5 embedding provider with task prefixes.

    Key features:
    - Task prefixes: search_document: for documents, search_query: for queries
    - Matryoshka dimension truncation: 256/512/768 dimensions
    - Automatic L2 re-normalization after truncation

    Reference: rag-optimization/spec.md Section 2.1, 2.3
    """

    PROVIDER_NAME = "nomic"
    # NOTE(review): official Nomic examples show a space after the colon
    # ("search_document: <text>"). Changing the prefix changes the embeddings,
    # so the space-less form is kept; confirm against the serving model.
    DOCUMENT_PREFIX = "search_document:"
    QUERY_PREFIX = "search_query:"
    FULL_DIMENSION = 768

    def __init__(
        self,
        base_url: str = "http://localhost:11434",
        model: str = "nomic-embed-text",
        dimension: int = 768,
        timeout_seconds: int = 60,
        enable_matryoshka: bool = True,
        **kwargs: Any,
    ):
        """
        Args:
            base_url: Ollama server base URL (a trailing slash is stripped).
            model: Embedding model name served by Ollama.
            dimension: Dimension reported by get_dimension() and selected by
                embed() (256/512 pick the Matryoshka variants, else full).
            timeout_seconds: HTTP request timeout in seconds.
            enable_matryoshka: When True, compute the 256/512 truncated
                variants alongside the full vector.
            **kwargs: Extra provider-specific configuration, stored as-is.
        """
        self._base_url = base_url.rstrip("/")
        self._model = model
        self._dimension = dimension
        self._timeout = timeout_seconds
        self._enable_matryoshka = enable_matryoshka
        self._client: httpx.AsyncClient | None = None
        self._extra_config = kwargs

    async def _get_client(self) -> httpx.AsyncClient:
        """Lazily create and cache the async HTTP client."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self._timeout)
        return self._client

    def _add_prefix(self, text: str, task: EmbeddingTask) -> str:
        """Prepend the task prefix unless the text already carries it."""
        if task == EmbeddingTask.DOCUMENT:
            prefix = self.DOCUMENT_PREFIX
        else:
            prefix = self.QUERY_PREFIX
        if text.startswith(prefix):
            return text
        return f"{prefix}{text}"

    def _truncate_and_normalize(self, embedding: list[float], target_dim: int) -> list[float]:
        """
        Truncate an embedding to target_dim and L2-normalize the result.

        Matryoshka representation learning allows prefix truncation, but the
        truncated vector must be re-normalized so cosine/dot-product
        similarity remains meaningful.
        """
        arr = np.array(embedding[:target_dim], dtype=np.float32)
        norm = np.linalg.norm(arr)
        if norm > 0:  # guard against the all-zero vector (division by zero)
            arr = arr / norm
        return arr.tolist()

    async def embed_with_task(
        self,
        text: str,
        task: EmbeddingTask,
    ) -> NomicEmbeddingResult:
        """
        Generate an embedding with the specified task prefix.

        Args:
            text: Input text to embed.
            task: DOCUMENT for indexing, QUERY for retrieval.

        Returns:
            NomicEmbeddingResult with the full vector and, when
            enable_matryoshka is on, the 256/512 truncated variants
            (empty lists when the flag is off).

        Raises:
            EmbeddingException: On HTTP errors, connection failures, an
                empty embedding in the response, or any unexpected failure.
        """
        start_time = time.perf_counter()
        prefixed_text = self._add_prefix(text, task)
        try:
            client = await self._get_client()
            response = await client.post(
                f"{self._base_url}/api/embeddings",
                json={
                    "model": self._model,
                    "prompt": prefixed_text,
                },
            )
            response.raise_for_status()
            data = response.json()
            embedding = data.get("embedding", [])
            if not embedding:
                raise EmbeddingException(
                    "Empty embedding returned",
                    provider=self.PROVIDER_NAME,
                    details={"text_length": len(text), "task": task.value},
                )
            latency_ms = (time.perf_counter() - start_time) * 1000
            # BUGFIX: honor enable_matryoshka — previously the flag was stored
            # but ignored and the truncated variants were always computed.
            if self._enable_matryoshka:
                embedding_256 = self._truncate_and_normalize(embedding, 256)
                embedding_512 = self._truncate_and_normalize(embedding, 512)
            else:
                embedding_256 = []
                embedding_512 = []
            logger.debug(
                "Generated Nomic embedding: task=%s, dim=%d, latency=%.2fms",
                task.value,
                len(embedding),
                latency_ms,
            )
            return NomicEmbeddingResult(
                embedding_full=embedding,
                embedding_256=embedding_256,
                embedding_512=embedding_512,
                dimension=len(embedding),
                model=self._model,
                task=task,
                latency_ms=latency_ms,
            )
        except httpx.HTTPStatusError as e:
            raise EmbeddingException(
                f"Ollama API error: {e.response.status_code}",
                provider=self.PROVIDER_NAME,
                details={"status_code": e.response.status_code, "response": e.response.text},
            ) from e
        except httpx.RequestError as e:
            raise EmbeddingException(
                f"Ollama connection error: {e}",
                provider=self.PROVIDER_NAME,
                details={"base_url": self._base_url},
            ) from e
        except EmbeddingException:
            # Already our domain exception (e.g. empty embedding) — re-raise untouched.
            raise
        except Exception as e:
            raise EmbeddingException(
                f"Embedding generation failed: {e}",
                provider=self.PROVIDER_NAME,
            ) from e

    async def embed_document(self, text: str) -> NomicEmbeddingResult:
        """
        Generate embedding for a document (with search_document: prefix).

        Use this when indexing documents into the vector store.
        """
        return await self.embed_with_task(text, EmbeddingTask.DOCUMENT)

    async def embed_query(self, text: str) -> NomicEmbeddingResult:
        """
        Generate embedding for a query (with search_query: prefix).

        Use this when searching/retrieving documents.
        """
        return await self.embed_with_task(text, EmbeddingTask.QUERY)

    async def embed(self, text: str) -> list[float]:
        """
        Generate an embedding vector for a single text.

        Uses the QUERY task for backward compatibility.

        BUGFIX: previously the configured dimension was ignored and the full
        vector was always returned, which mismatched get_dimension() for
        256/512 configurations. Now the matching Matryoshka variant is
        returned when available, falling back to the full vector.
        """
        result = await self.embed_query(text)
        if self._dimension == 256 and result.embedding_256:
            return result.embedding_256
        if self._dimension == 512 and result.embedding_512:
            return result.embedding_512
        return result.embedding_full

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embedding vectors for multiple texts (sequentially).

        Uses the QUERY task by default.
        """
        return [await self.embed(text) for text in texts]

    async def embed_documents_batch(
        self,
        texts: list[str],
    ) -> list[NomicEmbeddingResult]:
        """
        Generate embeddings for multiple documents (DOCUMENT task).

        Use this when batch indexing documents.
        """
        return [await self.embed_document(text) for text in texts]

    async def embed_queries_batch(
        self,
        texts: list[str],
    ) -> list[NomicEmbeddingResult]:
        """
        Generate embeddings for multiple queries (QUERY task).

        Use this when batch processing queries.
        """
        return [await self.embed_query(text) for text in texts]

    def get_dimension(self) -> int:
        """Get the configured dimension of embedding vectors."""
        return self._dimension

    def get_provider_name(self) -> str:
        """Get the name of this embedding provider."""
        return self.PROVIDER_NAME

    def get_config_schema(self) -> dict[str, Any]:
        """Get the configuration schema for the Nomic provider."""
        return {
            "base_url": {
                "type": "string",
                "description": "Ollama API 地址",
                "default": "http://localhost:11434",
            },
            "model": {
                "type": "string",
                # BUGFIX: closed the previously unbalanced fullwidth parenthesis.
                "description": "嵌入模型名称(推荐 nomic-embed-text v1.5)",
                "default": "nomic-embed-text",
            },
            "dimension": {
                "type": "integer",
                # BUGFIX: closed the previously unbalanced fullwidth parenthesis.
                "description": "向量维度(支持 256/512/768)",
                "default": 768,
            },
            "timeout_seconds": {
                "type": "integer",
                "description": "请求超时时间(秒)",
                "default": 60,
            },
            "enable_matryoshka": {
                "type": "boolean",
                "description": "启用 Matryoshka 维度截断",
                "default": True,
            },
        }

    async def close(self) -> None:
        """Close and discard the cached HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None