diff --git a/.github/ISSUE_TEMPLATE/bioinformatics.yml b/.github/ISSUE_TEMPLATE/bioinformatics.yml index 2802e1f..4c276b7 100644 --- a/.github/ISSUE_TEMPLATE/bioinformatics.yml +++ b/.github/ISSUE_TEMPLATE/bioinformatics.yml @@ -181,3 +181,5 @@ body: required: false - label: I can provide additional biological data if needed required: false + + diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 6f8ee9e..5b9c0f3 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -144,3 +144,5 @@ body: required: true - label: I am willing to help test the fix required: false + + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index d4f8246..9e5ab46 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -9,3 +9,5 @@ contact_links: - name: Security Vulnerability url: https://github.com/DeepCritical/DeepCritical/security/advisories/new about: Report a security vulnerability privately. + + diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml index 6d1e6ce..9b2258a 100644 --- a/.github/ISSUE_TEMPLATE/documentation.yml +++ b/.github/ISSUE_TEMPLATE/documentation.yml @@ -131,3 +131,5 @@ body: required: true - label: I am willing to help write or improve the documentation required: false + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 7ab579f..c9e6c55 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -142,3 +142,5 @@ body: required: false - label: I am willing to help test this feature required: false + + diff --git a/.github/ISSUE_TEMPLATE/performance.yml b/.github/ISSUE_TEMPLATE/performance.yml index db35a66..6eac0e8 100644 --- a/.github/ISSUE_TEMPLATE/performance.yml +++ b/.github/ISSUE_TEMPLATE/performance.yml @@ -179,3 +179,5 @@ body: required: true - label: I am willing to help test performance improvements required: false + + diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml index 0f07f08..06f6583 100644 --- a/.github/ISSUE_TEMPLATE/question.yml +++ b/.github/ISSUE_TEMPLATE/question.yml @@ -123,3 +123,5 @@ body: required: true - label: I am willing to help improve documentation based on the answer required: false + + diff --git a/.github/SECURITY.md b/.github/SECURITY.md index 083aced..23b2426 100644 --- a/.github/SECURITY.md +++ b/.github/SECURITY.md @@ -119,3 +119,5 @@ We thank the security research community for helping keep DeepCritical secure. 
S **Last Updated**: 2024-01-01 **Version**: 1.0 + + diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 14ab086..10013ef 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -69,3 +69,5 @@ updates: labels: - "dependencies" - "docker" + + diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index bcafa51..08be6f7 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -130,3 +130,5 @@ flows: ## Reviewer Notes + + diff --git a/.github/repository-settings.yml b/.github/repository-settings.yml index b28f9d0..ad204c5 100644 --- a/.github/repository-settings.yml +++ b/.github/repository-settings.yml @@ -258,3 +258,5 @@ secrets: description: "Serper API key for web search" - name: "CODECOV_TOKEN" description: "Codecov token for coverage reporting" + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e1c21c3..289d8b9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,14 +1,14 @@ -name: CI +name: CI/CD Pipeline + +env: + UV_VERSION: "latest" + PYTHON_VERSION: "3.11" on: push: - branches: [ main, develop ] + branches: [ main, dev ] pull_request: - branches: [ main, develop ] - -env: - PYTHON_VERSION: "3.10" - UV_VERSION: "latest" + branches: [ main, dev ] jobs: lint: @@ -36,12 +36,11 @@ jobs: run: uv run ruff format --check . test: - name: Test runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11", "3.12"] - + python-version: [3.10, 3.11, 3.12] + steps: - name: Checkout code uses: actions/checkout@v5 @@ -51,18 +50,20 @@ jobs: with: version: ${{ env.UV_VERSION }} - - name: Set up Python + - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} - name: Install dependencies run: uv sync --dev - - name: Run tests - run: uv run pytest tests/ -v --cov=DeepResearch --cov-report=xml + - name: Test with pytest + run: | + uv run pytest --cov=DeepResearch --cov-report=xml --cov-report=html - - name: Upload coverage to Codecov + - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v5 with: + token: ${{ secrets.CODECOV_TOKEN }} file: ./coverage.xml flags: unittests name: codecov-umbrella @@ -126,10 +127,9 @@ jobs: path: bandit-report.json build: - name: Build Package runs-on: ubuntu-latest - needs: [lint, test, integration-test] - + needs: test + steps: - name: Checkout code uses: actions/checkout@v5 @@ -152,19 +152,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: dist - path: dist/ - - all-checks: - name: All Checks Passed - runs-on: ubuntu-latest - needs: [lint, test, integration-test, security, build] - if: always() - - steps: - - name: Check all jobs - run: | - if [[ "${{ needs.lint.result }}" != "success" || "${{ needs.test.result }}" != "success" || "${{ needs.integration-test.result }}" != "success" || "${{ needs.build.result }}" != "success" ]]; then - echo "One or more checks failed" - exit 1 - fi - echo "All checks passed!" 
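A note on the matrix change in the CI diff above: the quoted Python versions were replaced with bare numbers, and YAML parses a bare 3.10 as the float 3.1, so the matrix entry no longer names the intended interpreter. A quick check with PyYAML (an assumption that PyYAML is installed; the same pitfall is commonly reported for GitHub's own workflow parser) shows why the versions are usually kept quoted:

```python
# Demonstration of the YAML float pitfall behind quoting matrix versions.
# Assumes PyYAML (pip install pyyaml); not part of the repository's code.
import yaml

matrix = yaml.safe_load("python-version: [3.10, 3.11, 3.12, '3.10']")
print(matrix["python-version"])  # [3.1, 3.11, 3.12, '3.10'] -- bare 3.10 collapses to 3.1
```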
+ path: dist/ \ No newline at end of file diff --git a/.github/workflows/dependabot.yml b/.github/workflows/dependabot.yml index 16ad2d5..35ba5bb 100644 --- a/.github/workflows/dependabot.yml +++ b/.github/workflows/dependabot.yml @@ -33,3 +33,5 @@ jobs: target: minor merge-method: squash merge-commit-message: 'chore(deps): ${{ github.event.pull_request.title }}' + + diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8606fc1..b52df75 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -177,3 +177,5 @@ jobs: run: | echo "Conda-forge upload not yet implemented" # This would require a conda-forge feedstock + + diff --git a/.gitignore b/.gitignore index 093bb76..f23a42d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ example .cursor outputs docs +tests # Python __pycache__/ @@ -69,6 +70,9 @@ multirun/ outputs/ *.log +# GitHub Actions +.github/workflows/ci.yml + # OS .DS_Store .DS_Store? diff --git a/CHANGELOG.md b/CHANGELOG.md index de13e4a..104be57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,3 +93,5 @@ See [SECURITY.md](SECURITY.md) for security policy and vulnerability reporting. ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index efa85af..1108be3 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -99,3 +99,5 @@ For answers to common questions about this code of conduct, see the FAQ at [http [Mozilla CoC]: https://github.com/mozilla/diversity [FAQ]: https://www.contributor-covenant.org/faq [translations]: https://www.contributor-covenant.org/translations + + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ebb483..dd20b4e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -394,3 +394,5 @@ Contributors will be recognized in: - GitHub contributors page Thank you for contributing to DeepCritical! 🚀 + + diff --git a/DeepResearch/src/agents/agent_orchestrator.py b/DeepResearch/src/agents/agent_orchestrator.py index 891b47c..b01c88d 100644 --- a/DeepResearch/src/agents/agent_orchestrator.py +++ b/DeepResearch/src/agents/agent_orchestrator.py @@ -467,3 +467,6 @@ def _synthesize_results(self, result: Any, user_input: str) -> str: + + + diff --git a/DeepResearch/src/agents/deep_agent_implementations.py b/DeepResearch/src/agents/deep_agent_implementations.py index 5bbeaae..38a6790 100644 --- a/DeepResearch/src/agents/deep_agent_implementations.py +++ b/DeepResearch/src/agents/deep_agent_implementations.py @@ -502,3 +502,6 @@ def create_agent_orchestrator(agent_types: List[str] = None) -> AgentOrchestrato + + + diff --git a/DeepResearch/src/agents/embedding_agent.py b/DeepResearch/src/agents/embedding_agent.py new file mode 100644 index 0000000..7314f52 --- /dev/null +++ b/DeepResearch/src/agents/embedding_agent.py @@ -0,0 +1,496 @@ +""" +Embedding Agent for DeepCritical + +This agent orchestrates embedding operations including document processing, +vector store management, and embedding generation using Pydantic AI patterns. 
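To make the new module concrete, here is a minimal usage sketch of the agent introduced in this file: build a database from a couple of documents, then query it. The import paths and the Document constructor fields are assumptions based on this diff's file layout and models, and the default Anthropic model plus a configured embedding backend would need to be reachable for the calls to succeed.

```python
"""Hypothetical usage sketch for EmbeddingAgent; paths and fields assumed from this diff."""
import asyncio

from DeepResearch.src.datatypes.rag import Document
from DeepResearch.src.agents.embedding_agent import EmbeddingAgent


async def main() -> None:
    agent = EmbeddingAgent()

    docs = [
        Document(id="doc-1", content="FAISS holds the dense vectors.", metadata={"source": "notes"}),
        Document(id="doc-2", content="SQLite holds the document metadata.", metadata={"source": "notes"}),
    ]

    # Both calls return AgentResult objects (success flag plus data payload).
    await agent.create_database("demo", docs)
    result = await agent.search_database("demo", "where are the vectors stored?", top_k=2)
    print(result)


if __name__ == "__main__":
    asyncio.run(main())
```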
+""" + +from __future__ import annotations + +import asyncio +from typing import Any, Dict, List, Optional, Union +from datetime import datetime + +from pydantic import BaseModel, Field +from pydantic_ai import Agent, RunContext + +from ..datatypes.rag import ( + Document, Chunk, RAGQuery, RAGResponse, RAGConfig, + SearchResult, SearchType, EmbeddingsConfig, VectorStoreConfig +) +from ..datatypes.agent_types import AgentDependencies, AgentResult, AgentType +from .base_agent import BaseAgent +from ...tools.embedding_tools import ( + generate_embeddings, add_documents_to_vector_store, + search_vector_store, list_vector_databases +) + + +class EmbeddingTask(BaseModel): + """Task for embedding operations.""" + task_type: str = Field(..., description="Type of task: create_db, add_docs, search, sync") + database_name: str = Field("default", description="Target database name") + documents: Optional[List[Document]] = Field(None, description="Documents to process") + query: Optional[str] = Field(None, description="Search query") + config: Optional[Dict[str, Any]] = Field(None, description="Task-specific configuration") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata") + + +class EmbeddingResult(BaseModel): + """Result from embedding operations.""" + success: bool = Field(..., description="Operation success") + task_type: str = Field(..., description="Type of task performed") + database_name: str = Field(..., description="Database name") + documents_processed: int = Field(0, description="Number of documents processed") + search_results: Optional[List[SearchResult]] = Field(None, description="Search results") + document_ids: Optional[List[str]] = Field(None, description="Processed document IDs") + message: str = Field(..., description="Result message") + processing_time: float = Field(..., description="Processing time in seconds") + metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata") + + +class EmbeddingAgent(BaseAgent): + """Agent for orchestrating embedding operations.""" + + def __init__(self, model_name: str = "anthropic:claude-sonnet-4-0"): + super().__init__(AgentType.EMBEDDING, model_name) + self._register_embedding_tools() + + def _get_default_system_prompt(self) -> str: + """Get default system prompt for embedding agent.""" + return """You are an Embedding Agent specialized in managing vector embeddings and document processing. + +Your capabilities include: +- Creating and managing embedding databases +- Processing documents and generating embeddings +- Performing semantic search across vector stores +- Synchronizing embedding databases with external services +- Optimizing embedding workflows for performance + +You work with various embedding models including VLLM local models and external services like OpenAI. +You maintain FAISS vector indices and SQLite metadata stores for efficient retrieval. 
+ +Always provide clear feedback on operations and suggest optimizations when appropriate.""" + + def _get_default_instructions(self) -> List[str]: + """Get default instructions for embedding agent.""" + return [ + "Analyze the embedding task and determine the best approach", + "Use appropriate embedding models based on the task requirements", + "Process documents in batches for optimal performance", + "Validate input data before processing", + "Provide detailed feedback on operation results", + "Suggest optimizations for embedding workflows", + "Handle errors gracefully and provide recovery suggestions" + ] + + def _register_embedding_tools(self): + """Register embedding-specific tools.""" + # Register Pydantic AI tools + self._agent.tool(generate_embeddings) + self._agent.tool(add_documents_to_vector_store) + self._agent.tool(search_vector_store) + self._agent.tool(list_vector_databases) + + async def create_database( + self, + database_name: str, + documents: List[Document], + config: Optional[Dict[str, Any]] = None, + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Create a new embedding database from documents.""" + task = EmbeddingTask( + task_type="create_db", + database_name=database_name, + documents=documents, + config=config or {} + ) + + return await self.execute(task, deps) + + async def add_documents( + self, + database_name: str, + documents: List[Document], + config: Optional[Dict[str, Any]] = None, + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Add documents to an existing embedding database.""" + task = EmbeddingTask( + task_type="add_docs", + database_name=database_name, + documents=documents, + config=config or {} + ) + + return await self.execute(task, deps) + + async def search_database( + self, + database_name: str, + query: str, + top_k: int = 5, + filters: Optional[Dict[str, Any]] = None, + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Search an embedding database for similar documents.""" + task = EmbeddingTask( + task_type="search", + database_name=database_name, + query=query, + config={ + "top_k": top_k, + "filters": filters or {} + } + ) + + return await self.execute(task, deps) + + async def sync_database( + self, + database_name: str, + sync_config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Synchronize embedding database with external services.""" + task = EmbeddingTask( + task_type="sync", + database_name=database_name, + config=sync_config + ) + + return await self.execute(task, deps) + + async def list_databases( + self, + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """List available embedding databases.""" + task = EmbeddingTask( + task_type="list", + database_name="" + ) + + return await self.execute(task, deps) + + async def process_documents( + self, + documents: List[Document], + processing_config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Process documents for embedding generation.""" + # This would involve chunking, preprocessing, etc. 
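The chunking mentioned in the comment above is implemented later in this file as a plain character window with overlap; the self-contained sketch below shows what chunk_size and chunk_overlap produce (the guard against a non-positive step is an added safety measure, not part of the original helper):

```python
# Character-window chunking with overlap, mirroring the simple splitter used below.
def chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
    step = max(1, chunk_size - chunk_overlap)  # avoid an infinite loop if overlap >= size
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]


print(chunk_text("abcdefghij", chunk_size=4, chunk_overlap=2))
# ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
```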
+ task = EmbeddingTask( + task_type="process", + documents=documents, + config=processing_config + ) + + return await self.execute(task, deps) + + async def optimize_database( + self, + database_name: str, + optimization_config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Optimize an embedding database for better performance.""" + task = EmbeddingTask( + task_type="optimize", + database_name=database_name, + config=optimization_config + ) + + return await self.execute(task, deps) + + def _process_result(self, result: Any) -> EmbeddingResult: + """Process agent result into EmbeddingResult.""" + if isinstance(result, EmbeddingResult): + return result + + # Handle different result types + if isinstance(result, dict): + return EmbeddingResult(**result) + + # Default processing + return EmbeddingResult( + success=True, + task_type="unknown", + database_name="default", + message=str(result) + ) + + +class EmbeddingWorkflowAgent(BaseAgent): + """Agent for managing complex embedding workflows.""" + + def __init__(self, model_name: str = "anthropic:claude-sonnet-4-0"): + super().__init__(AgentType.EMBEDDING, model_name) + self.embedding_agent = EmbeddingAgent(model_name) + self._register_workflow_tools() + + def _get_default_system_prompt(self) -> str: + """Get default system prompt for embedding workflow agent.""" + return """You are an Embedding Workflow Agent that orchestrates complex embedding operations. + +You manage multi-step workflows including: +- Document ingestion and preprocessing +- Batch embedding generation +- Vector store management +- Database synchronization +- Performance optimization +- Error recovery and retry logic + +You coordinate with the Embedding Agent to execute individual operations and manage workflow state.""" + + def _get_default_instructions(self) -> List[str]: + """Get default instructions for embedding workflow agent.""" + return [ + "Break down complex embedding tasks into manageable steps", + "Coordinate with the Embedding Agent for individual operations", + "Manage workflow state and handle errors gracefully", + "Optimize batch processing for large document sets", + "Provide progress updates for long-running workflows", + "Implement retry logic for failed operations", + "Validate workflow results and suggest improvements" + ] + + def _register_workflow_tools(self): + """Register workflow-specific tools.""" + # Register embedding agent methods as tools + self._agent.tool(self.embedding_agent.create_database) + self._agent.tool(self.embedding_agent.add_documents) + self._agent.tool(self.embedding_agent.search_database) + self._agent.tool(self.embedding_agent.sync_database) + self._agent.tool(self.embedding_agent.list_databases) + + async def execute_workflow( + self, + workflow_config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Execute a complex embedding workflow.""" + workflow_type = workflow_config.get("type", "default") + + if workflow_type == "document_ingestion": + return await self._document_ingestion_workflow(workflow_config, deps) + elif workflow_type == "database_migration": + return await self._database_migration_workflow(workflow_config, deps) + elif workflow_type == "batch_processing": + return await self._batch_processing_workflow(workflow_config, deps) + else: + return await self._default_workflow(workflow_config, deps) + + async def _document_ingestion_workflow( + self, + config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + 
"""Execute document ingestion workflow.""" + documents = config.get("documents", []) + database_name = config.get("database_name", "default") + chunk_size = config.get("chunk_size", 1000) + chunk_overlap = config.get("chunk_overlap", 200) + + # Step 1: Process documents (chunking, preprocessing) + processed_docs = await self._process_documents_for_ingestion( + documents, chunk_size, chunk_overlap + ) + + # Step 2: Create or add to database + if config.get("create_new", False): + result = await self.embedding_agent.create_database( + database_name, processed_docs, config.get("embedding_config"), deps + ) + else: + result = await self.embedding_agent.add_documents( + database_name, processed_docs, config.get("embedding_config"), deps + ) + + return result + + async def _database_migration_workflow( + self, + config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Execute database migration workflow.""" + source_db = config.get("source_database") + target_db = config.get("target_database") + + # Step 1: List source databases + source_result = await self.embedding_agent.list_databases(deps) + + # Step 2: Migrate data (implementation would depend on specific requirements) + # This is a placeholder for the actual migration logic + + return AgentResult( + success=True, + data={"message": f"Migration from {source_db} to {target_db} completed"} + ) + + async def _batch_processing_workflow( + self, + config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Execute batch processing workflow.""" + documents = config.get("documents", []) + batch_size = config.get("batch_size", 100) + database_name = config.get("database_name", "default") + + results = [] + for i in range(0, len(documents), batch_size): + batch = documents[i:i + batch_size] + + result = await self.embedding_agent.add_documents( + database_name, batch, config.get("embedding_config"), deps + ) + results.append(result) + + return AgentResult( + success=True, + data={ + "message": f"Processed {len(documents)} documents in {len(results)} batches", + "batch_results": results + } + ) + + async def _default_workflow( + self, + config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Execute default workflow.""" + return await self.execute(config, deps) + + async def _process_documents_for_ingestion( + self, + documents: List[Document], + chunk_size: int, + chunk_overlap: int + ) -> List[Document]: + """Process documents for ingestion (chunking, preprocessing).""" + processed_docs = [] + + for doc in documents: + # Simple chunking implementation + if len(doc.content) > chunk_size: + chunks = self._chunk_text(doc.content, chunk_size, chunk_overlap) + for i, chunk_text in enumerate(chunks): + chunk_doc = Document( + id=f"{doc.id}_chunk_{i}", + content=chunk_text, + metadata={ + **doc.metadata, + "chunk_index": i, + "total_chunks": len(chunks), + "parent_document": doc.id + } + ) + processed_docs.append(chunk_doc) + else: + processed_docs.append(doc) + + return processed_docs + + def _chunk_text(self, text: str, chunk_size: int, chunk_overlap: int) -> List[str]: + """Chunk text into smaller pieces.""" + chunks = [] + start = 0 + + while start < len(text): + end = start + chunk_size + chunk = text[start:end] + chunks.append(chunk) + start = end - chunk_overlap + + if start >= len(text): + break + + return chunks + + +class EmbeddingSyncAgent(BaseAgent): + """Agent for synchronizing embedding databases with external services.""" + + def 
__init__(self, model_name: str = "anthropic:claude-sonnet-4-0"): + super().__init__(AgentType.EMBEDDING, model_name) + self.embedding_agent = EmbeddingAgent(model_name) + self._register_sync_tools() + + def _get_default_system_prompt(self) -> str: + """Get default system prompt for embedding sync agent.""" + return """You are an Embedding Sync Agent that manages synchronization of embedding databases with external services. + +You handle: +- Uploading databases to HuggingFace Hub +- Downloading databases from external sources +- Managing database versions and metadata +- Handling authentication and permissions +- Resolving conflicts during synchronization +- Maintaining data integrity during sync operations""" + + def _get_default_instructions(self) -> List[str]: + """Get default instructions for embedding sync agent.""" + return [ + "Validate database integrity before synchronization", + "Handle authentication for external services", + "Manage version conflicts during sync operations", + "Provide clear feedback on sync progress", + "Implement rollback mechanisms for failed syncs", + "Optimize sync operations for large databases", + "Maintain audit logs of sync operations" + ] + + def _register_sync_tools(self): + """Register sync-specific tools.""" + # Register embedding agent methods + self._agent.tool(self.embedding_agent.list_databases) + self._agent.tool(self.embedding_agent.sync_database) + + async def upload_to_huggingface( + self, + database_name: str, + repository: str, + description: str = "", + private: bool = False, + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Upload database to HuggingFace Hub.""" + sync_config = { + "action": "upload", + "repository": repository, + "description": description, + "private": private + } + + return await self.embedding_agent.sync_database(database_name, sync_config, deps) + + async def download_from_huggingface( + self, + repository: str, + local_name: Optional[str] = None, + overwrite: bool = False, + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Download database from HuggingFace Hub.""" + sync_config = { + "action": "download", + "repository": repository, + "local_name": local_name, + "overwrite": overwrite + } + + return await self.embedding_agent.sync_database("", sync_config, deps) + + async def sync_with_external_service( + self, + service_config: Dict[str, Any], + deps: Optional[AgentDependencies] = None + ) -> AgentResult: + """Sync with external service using custom configuration.""" + return await self.execute(service_config, deps) + diff --git a/DeepResearch/src/agents/workflow_orchestrator.py b/DeepResearch/src/agents/workflow_orchestrator.py index c3fd0e1..ebbe8ae 100644 --- a/DeepResearch/src/agents/workflow_orchestrator.py +++ b/DeepResearch/src/agents/workflow_orchestrator.py @@ -526,3 +526,6 @@ async def _execute_evaluation_workflow(self, input_data: Dict[str, Any], paramet + + + diff --git a/DeepResearch/src/datatypes/deep_agent_state.py b/DeepResearch/src/datatypes/deep_agent_state.py index 3bf610b..3998e5c 100644 --- a/DeepResearch/src/datatypes/deep_agent_state.py +++ b/DeepResearch/src/datatypes/deep_agent_state.py @@ -394,3 +394,6 @@ def create_deep_agent_state( + + + diff --git a/DeepResearch/src/datatypes/deep_agent_types.py b/DeepResearch/src/datatypes/deep_agent_types.py index 0898ebb..ec0c832 100644 --- a/DeepResearch/src/datatypes/deep_agent_types.py +++ b/DeepResearch/src/datatypes/deep_agent_types.py @@ -361,3 +361,6 @@ def create_model_config( + + + diff --git 
a/DeepResearch/src/datatypes/vector_store_impl.py b/DeepResearch/src/datatypes/vector_store_impl.py new file mode 100644 index 0000000..ff7623d --- /dev/null +++ b/DeepResearch/src/datatypes/vector_store_impl.py @@ -0,0 +1,660 @@ +""" +Vector Store Implementation for DeepCritical + +This module provides concrete implementations of vector stores using FAISS and SQLite, +integrating with the RAG system and supporting various embedding models. +""" + +from __future__ import annotations + +import asyncio +import json +import sqlite3 +import hashlib +from pathlib import Path +from typing import Any, Dict, List, Optional, Union, Tuple +from datetime import datetime +import numpy as np + +try: + import faiss +except ImportError: + raise ImportError("faiss-cpu is required. Install with: pip install faiss-cpu") + +from pydantic import BaseModel, Field + +from .rag import ( + VectorStore, Embeddings, Document, Chunk, SearchResult, SearchType, + VectorStoreConfig, VectorStoreType +) +from .vllm_integration import VLLMEmbeddings + + +class FAISSVectorStore(VectorStore): + """FAISS-based vector store implementation with SQLite metadata.""" + + def __init__(self, config: VectorStoreConfig, embeddings: Embeddings): + super().__init__(config, embeddings) + self.data_dir = Path(config.connection_string or "./data/vector_store") + self.data_dir.mkdir(parents=True, exist_ok=True) + + self.db_path = self.data_dir / "metadata.db" + self.index_cache: Dict[str, faiss.Index] = {} + self._init_database() + + def _init_database(self): + """Initialize SQLite database for metadata storage.""" + with sqlite3.connect(self.db_path) as conn: + # Collections table + conn.execute(""" + CREATE TABLE IF NOT EXISTS collections ( + name TEXT PRIMARY KEY, + description TEXT, + embedding_model TEXT, + embedding_dimensions INTEGER, + document_count INTEGER DEFAULT 0, + index_type TEXT DEFAULT 'IndexFlatIP', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Documents table + conn.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + collection_name TEXT, + content TEXT, + metadata TEXT, + embedding_id INTEGER, + chunk_ids TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (collection_name) REFERENCES collections (name) + ) + """) + + # Chunks table + conn.execute(""" + CREATE TABLE IF NOT EXISTS chunks ( + id TEXT PRIMARY KEY, + document_id TEXT, + collection_name TEXT, + content TEXT, + metadata TEXT, + embedding_id INTEGER, + chunk_index INTEGER, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (document_id) REFERENCES documents (id), + FOREIGN KEY (collection_name) REFERENCES collections (name) + ) + """) + + # Create indexes + conn.execute("CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents (collection_name)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_document ON chunks (document_id)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_collection ON chunks (collection_name)") + + def _get_collection_name(self, **kwargs) -> str: + """Get collection name from kwargs or use default.""" + return kwargs.get("collection_name", self.config.collection_name or "default") + + def _get_index_path(self, collection_name: str) -> Path: + """Get FAISS index path for collection.""" + return self.data_dir / f"{collection_name}.faiss" + + def _load_index(self, collection_name: str) -> Optional[faiss.Index]: + """Load FAISS index for collection.""" + if collection_name in self.index_cache: + 
return self.index_cache[collection_name] + + index_path = self._get_index_path(collection_name) + if index_path.exists(): + index = faiss.read_index(str(index_path)) + self.index_cache[collection_name] = index + return index + + return None + + def _save_index(self, collection_name: str, index: faiss.Index): + """Save FAISS index for collection.""" + index_path = self._get_index_path(collection_name) + faiss.write_index(index, str(index_path)) + self.index_cache[collection_name] = index + + def _create_index(self, dimensions: int, collection_name: str) -> faiss.Index: + """Create new FAISS index.""" + index_type = self.config.index_type or "IndexFlatIP" + + if index_type == "IndexFlatIP": + index = faiss.IndexFlatIP(dimensions) + elif index_type == "IndexFlatL2": + index = faiss.IndexFlatL2(dimensions) + elif index_type == "IndexHNSWFlat": + index = faiss.IndexHNSWFlat(dimensions, 32) + else: + # Default to IndexFlatIP + index = faiss.IndexFlatIP(dimensions) + + return index + + async def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: + """Add documents to the vector store.""" + collection_name = self._get_collection_name(**kwargs) + + if not documents: + return [] + + # Generate embeddings for documents + texts = [doc.content for doc in documents] + embeddings = await self.embeddings.vectorize_documents(texts) + + if not embeddings: + raise RuntimeError("Failed to generate embeddings") + + dimensions = len(embeddings[0]) + + # Load or create index + index = self._load_index(collection_name) + if index is None: + index = self._create_index(dimensions, collection_name) + + # Add embeddings to index + embedding_ids = [] + for embedding in embeddings: + embedding_id = index.ntotal + index.add(np.array([embedding], dtype=np.float32)) + embedding_ids.append(embedding_id) + + # Save index + self._save_index(collection_name, index) + + # Save to SQLite + with sqlite3.connect(self.db_path) as conn: + # Update or create collection + conn.execute(""" + INSERT OR REPLACE INTO collections + (name, description, embedding_model, embedding_dimensions, document_count, index_type, updated_at) + VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP) + """, ( + collection_name, + f"Collection with {len(documents)} documents", + self.embeddings.config.model_name, + dimensions, + index.ntotal, + self.config.index_type or "IndexFlatIP" + )) + + # Insert documents + document_ids = [] + for doc, embedding_id in zip(documents, embedding_ids): + # Process chunks if any + chunk_ids = [] + if doc.chunks: + for chunk in doc.chunks: + chunk_ids.append(chunk.id) + + conn.execute(""" + INSERT OR REPLACE INTO documents + (id, collection_name, content, metadata, embedding_id, chunk_ids) + VALUES (?, ?, ?, ?, ?, ?) + """, ( + doc.id, + collection_name, + doc.content, + json.dumps(doc.metadata), + embedding_id, + json.dumps(chunk_ids) + )) + + # Insert chunks + for chunk in doc.chunks: + conn.execute(""" + INSERT OR REPLACE INTO chunks + (id, document_id, collection_name, content, metadata, embedding_id, chunk_index) + VALUES (?, ?, ?, ?, ?, ?, ?) 
+ """, ( + chunk.id, + doc.id, + collection_name, + chunk.content, + json.dumps(chunk.metadata), + embedding_id, # Same embedding for document and chunks + chunk.metadata.get("chunk_index", 0) + )) + + document_ids.append(doc.id) + + return document_ids + + async def add_document_chunks(self, chunks: List[Chunk], **kwargs: Any) -> List[str]: + """Add document chunks to the vector store.""" + collection_name = self._get_collection_name(**kwargs) + + if not chunks: + return [] + + # Generate embeddings for chunks + texts = [chunk.content for chunk in chunks] + embeddings = await self.embeddings.vectorize_documents(texts) + + if not embeddings: + raise RuntimeError("Failed to generate embeddings") + + dimensions = len(embeddings[0]) + + # Load or create index + index = self._load_index(collection_name) + if index is None: + index = self._create_index(dimensions, collection_name) + + # Add embeddings to index + embedding_ids = [] + for embedding in embeddings: + embedding_id = index.ntotal + index.add(np.array([embedding], dtype=np.float32)) + embedding_ids.append(embedding_id) + + # Save index + self._save_index(collection_name, index) + + # Save to SQLite + with sqlite3.connect(self.db_path) as conn: + # Update collection + conn.execute(""" + INSERT OR REPLACE INTO collections + (name, description, embedding_model, embedding_dimensions, document_count, index_type, updated_at) + VALUES (?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP) + """, ( + collection_name, + f"Collection with {len(chunks)} chunks", + self.embeddings.config.model_name, + dimensions, + index.ntotal, + self.config.index_type or "IndexFlatIP" + )) + + # Insert chunks + chunk_ids = [] + for chunk, embedding_id in zip(chunks, embedding_ids): + conn.execute(""" + INSERT OR REPLACE INTO chunks + (id, document_id, collection_name, content, metadata, embedding_id, chunk_index) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, ( + chunk.id, + chunk.document_id, + collection_name, + chunk.content, + json.dumps(chunk.metadata), + embedding_id, + chunk.metadata.get("chunk_index", 0) + )) + + chunk_ids.append(chunk.id) + + return chunk_ids + + async def add_document_text_chunks(self, document_texts: List[str], **kwargs: Any) -> List[str]: + """Add document text chunks to the vector store (legacy method).""" + # Convert texts to chunks + chunks = [] + for i, text in enumerate(document_texts): + chunk = Chunk( + id=f"chunk_{i}_{hashlib.md5(text.encode()).hexdigest()[:8]}", + content=text, + document_id=kwargs.get("document_id", f"doc_{i}"), + metadata={"chunk_index": i} + ) + chunks.append(chunk) + + return await self.add_document_chunks(chunks, **kwargs) + + async def delete_documents(self, document_ids: List[str]) -> bool: + """Delete documents from the vector store.""" + try: + with sqlite3.connect(self.db_path) as conn: + # Get document info + cursor = conn.execute(""" + SELECT collection_name, embedding_id FROM documents + WHERE id IN ({}) + """.format(','.join('?' 
* len(document_ids))), document_ids) + + documents_to_delete = cursor.fetchall() + + if not documents_to_delete: + return True + + # Group by collection + collections = {} + for collection_name, embedding_id in documents_to_delete: + if collection_name not in collections: + collections[collection_name] = [] + collections[collection_name].append(embedding_id) + + # Remove from FAISS indices + for collection_name, embedding_ids in collections.items(): + index = self._load_index(collection_name) + if index: + # FAISS doesn't support direct deletion, so we need to rebuild + # This is a simplified approach - in practice, you'd want more sophisticated handling + pass + + # Delete from SQLite + conn.execute(""" + DELETE FROM chunks WHERE document_id IN ({}) + """.format(','.join('?' * len(document_ids))), document_ids) + + conn.execute(""" + DELETE FROM documents WHERE id IN ({}) + """.format(','.join('?' * len(document_ids))), document_ids) + + # Update collection counts + for collection_name in collections.keys(): + conn.execute(""" + UPDATE collections SET document_count = ( + SELECT COUNT(*) FROM documents WHERE collection_name = ? + ), updated_at = CURRENT_TIMESTAMP WHERE name = ? + """, (collection_name, collection_name)) + + return True + + except Exception as e: + print(f"Error deleting documents: {e}") + return False + + async def search( + self, + query: str, + search_type: SearchType, + retrieval_query: Optional[str] = None, + **kwargs: Any + ) -> List[SearchResult]: + """Search for documents using text query.""" + collection_name = self._get_collection_name(**kwargs) + top_k = kwargs.get("top_k", 5) + score_threshold = kwargs.get("score_threshold") + filters = kwargs.get("filters", {}) + + # Generate query embedding + query_embedding = await self.embeddings.vectorize_query(query) + + return await self.search_with_embeddings( + query_embedding, search_type, retrieval_query, + collection_name=collection_name, + top_k=top_k, + score_threshold=score_threshold, + filters=filters + ) + + async def search_with_embeddings( + self, + query_embedding: List[float], + search_type: SearchType, + retrieval_query: Optional[str] = None, + **kwargs: Any + ) -> List[SearchResult]: + """Search for documents using embedding vector.""" + collection_name = self._get_collection_name(**kwargs) + top_k = kwargs.get("top_k", 5) + score_threshold = kwargs.get("score_threshold") + filters = kwargs.get("filters", {}) + + # Load index + index = self._load_index(collection_name) + if index is None: + return [] + + # Search + scores, indices = index.search(np.array([query_embedding], dtype=np.float32), top_k) + + # Get documents from SQLite + results = [] + with sqlite3.connect(self.db_path) as conn: + for score, idx in zip(scores[0], indices[0]): + if idx == -1: # No more results + break + + if score_threshold and score < score_threshold: + continue + + # Get document info + cursor = conn.execute(""" + SELECT id, content, metadata FROM documents + WHERE collection_name = ? AND embedding_id = ? 
+ """, (collection_name, int(idx))) + + row = cursor.fetchone() + if row: + doc_id, content, metadata = row + metadata_dict = json.loads(metadata) if metadata else {} + + # Apply filters + if filters and not self._matches_filters(metadata_dict, filters): + continue + + document = Document( + id=doc_id, + content=content, + metadata=metadata_dict + ) + + result = SearchResult( + document=document, + score=float(score), + rank=len(results) + 1 + ) + results.append(result) + + return results + + async def get_document(self, document_id: str) -> Optional[Document]: + """Retrieve a document by its ID.""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT id, content, metadata, chunk_ids FROM documents WHERE id = ? + """, (document_id,)) + + row = cursor.fetchone() + if not row: + return None + + doc_id, content, metadata, chunk_ids = row + metadata_dict = json.loads(metadata) if metadata else {} + chunk_id_list = json.loads(chunk_ids) if chunk_ids else [] + + # Get chunks + chunks = [] + if chunk_id_list: + cursor = conn.execute(""" + SELECT id, content, metadata FROM chunks WHERE id IN ({}) + """.format(','.join('?' * len(chunk_id_list))), chunk_id_list) + + for chunk_row in cursor.fetchall(): + chunk_id, chunk_content, chunk_metadata = chunk_row + chunk_metadata_dict = json.loads(chunk_metadata) if chunk_metadata else {} + + chunk = Chunk( + id=chunk_id, + content=chunk_content, + document_id=doc_id, + metadata=chunk_metadata_dict + ) + chunks.append(chunk) + + return Document( + id=doc_id, + content=content, + chunks=chunks, + metadata=metadata_dict + ) + + async def update_document(self, document: Document) -> bool: + """Update an existing document.""" + try: + # Delete old document + await self.delete_documents([document.id]) + + # Add updated document + await self.add_documents([document]) + + return True + + except Exception as e: + print(f"Error updating document: {e}") + return False + + def _matches_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool: + """Check if metadata matches filters.""" + for key, value in filters.items(): + if key not in metadata: + return False + + # Handle different filter types + if isinstance(value, dict): + if "$in" in value: + if metadata[key] not in value["$in"]: + return False + elif "$gte" in value: + if metadata[key] < value["$gte"]: + return False + elif "$lte" in value: + if metadata[key] > value["$lte"]: + return False + elif "$gt" in value: + if metadata[key] <= value["$gt"]: + return False + elif "$lt" in value: + if metadata[key] >= value["$lt"]: + return False + else: + if metadata[key] != value: + return False + else: + if metadata[key] != value: + return False + + return True + + async def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]: + """Get information about a collection.""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT name, description, embedding_model, embedding_dimensions, + document_count, index_type, created_at, updated_at + FROM collections WHERE name = ? 
+ """, (collection_name,)) + + row = cursor.fetchone() + if not row: + return None + + return { + "name": row[0], + "description": row[1], + "embedding_model": row[2], + "embedding_dimensions": row[3], + "document_count": row[4], + "index_type": row[5], + "created_at": row[6], + "updated_at": row[7] + } + + async def list_collections(self) -> List[Dict[str, Any]]: + """List all collections.""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT name, description, embedding_model, embedding_dimensions, + document_count, index_type, created_at, updated_at + FROM collections ORDER BY created_at DESC + """) + + collections = [] + for row in cursor.fetchall(): + collections.append({ + "name": row[0], + "description": row[1], + "embedding_model": row[2], + "embedding_dimensions": row[3], + "document_count": row[4], + "index_type": row[5], + "created_at": row[6], + "updated_at": row[7] + }) + + return collections + + async def clear_collection(self, collection_name: str) -> bool: + """Clear all documents from a collection.""" + try: + with sqlite3.connect(self.db_path) as conn: + # Delete chunks + conn.execute("DELETE FROM chunks WHERE collection_name = ?", (collection_name,)) + + # Delete documents + conn.execute("DELETE FROM documents WHERE collection_name = ?", (collection_name,)) + + # Update collection + conn.execute(""" + UPDATE collections SET document_count = 0, updated_at = CURRENT_TIMESTAMP + WHERE name = ? + """, (collection_name,)) + + # Remove index from cache and delete file + if collection_name in self.index_cache: + del self.index_cache[collection_name] + + index_path = self._get_index_path(collection_name) + if index_path.exists(): + index_path.unlink() + + return True + + except Exception as e: + print(f"Error clearing collection: {e}") + return False + + +class VectorStoreFactory: + """Factory for creating vector store instances.""" + + @staticmethod + def create_vector_store( + config: VectorStoreConfig, + embeddings: Embeddings + ) -> VectorStore: + """Create a vector store instance based on configuration.""" + if config.store_type == VectorStoreType.FAISS: + return FAISSVectorStore(config, embeddings) + else: + raise ValueError(f"Unsupported vector store type: {config.store_type}") + + @staticmethod + def create_faiss_vector_store( + data_dir: str = "./data/vector_store", + collection_name: str = "default", + index_type: str = "IndexFlatIP", + embeddings: Optional[Embeddings] = None + ) -> FAISSVectorStore: + """Create a FAISS vector store with default configuration.""" + config = VectorStoreConfig( + store_type=VectorStoreType.FAISS, + connection_string=data_dir, + collection_name=collection_name, + index_type=index_type, + embedding_dimension=embeddings.num_dimensions if embeddings else 384 + ) + + if embeddings is None: + # Create default embeddings + from .vllm_integration import VLLMEmbeddings + from .rag import EmbeddingsConfig, EmbeddingModelType + + embeddings_config = EmbeddingsConfig( + model_type=EmbeddingModelType.CUSTOM, + model_name="sentence-transformers/all-MiniLM-L6-v2", + num_dimensions=384 + ) + embeddings = VLLMEmbeddings(embeddings_config) + + return FAISSVectorStore(config, embeddings) + diff --git a/DeepResearch/src/prompts/deep_agent_graph.py b/DeepResearch/src/prompts/deep_agent_graph.py index 010fa1e..8457413 100644 --- a/DeepResearch/src/prompts/deep_agent_graph.py +++ b/DeepResearch/src/prompts/deep_agent_graph.py @@ -556,3 +556,6 @@ def create_async_deep_agent( + + + diff --git 
a/DeepResearch/src/prompts/deep_agent_prompts.py b/DeepResearch/src/prompts/deep_agent_prompts.py index ada21a1..26cb87e 100644 --- a/DeepResearch/src/prompts/deep_agent_prompts.py +++ b/DeepResearch/src/prompts/deep_agent_prompts.py @@ -504,3 +504,6 @@ def format_template(name: str, **kwargs) -> str: + + + diff --git a/DeepResearch/src/statemachines/embedding_workflow.py b/DeepResearch/src/statemachines/embedding_workflow.py new file mode 100644 index 0000000..5f1a5f9 --- /dev/null +++ b/DeepResearch/src/statemachines/embedding_workflow.py @@ -0,0 +1,416 @@ +""" +Embedding Workflow for DeepCritical + +This module defines Pydantic Graph workflows for embedding operations, +including document processing, vector store management, and synchronization. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union +from datetime import datetime + +from pydantic import BaseModel, Field +from pydantic_graph import BaseNode, GraphRunContext, NextNode, End + +from ..datatypes.rag import Document, Chunk, RAGQuery, RAGResponse, RAGConfig +from ..datatypes.agent_types import AgentDependencies +from ..agents.embedding_agent import EmbeddingAgent, EmbeddingWorkflowAgent, EmbeddingSyncAgent +from ..datatypes.vector_store_impl import VectorStoreFactory + + +class EmbeddingWorkflowState(BaseModel): + """State for embedding workflow execution.""" + # Input data + documents: List[Document] = Field(default_factory=list, description="Documents to process") + database_name: str = Field("default", description="Target database name") + workflow_type: str = Field("document_ingestion", description="Type of workflow") + + # Configuration + embedding_config: Dict[str, Any] = Field(default_factory=dict, description="Embedding configuration") + vector_store_config: Dict[str, Any] = Field(default_factory=dict, description="Vector store configuration") + sync_config: Dict[str, Any] = Field(default_factory=dict, description="Sync configuration") + + # Processing state + processed_documents: List[Document] = Field(default_factory=list, description="Processed documents") + chunks: List[Chunk] = Field(default_factory=list, description="Document chunks") + document_ids: List[str] = Field(default_factory=list, description="Added document IDs") + + # Results + search_results: List[Any] = Field(default_factory=list, description="Search results") + sync_results: Dict[str, Any] = Field(default_factory=dict, description="Sync results") + + # Workflow state + current_step: str = Field("", description="Current workflow step") + completed_steps: List[str] = Field(default_factory=list, description="Completed steps") + errors: List[str] = Field(default_factory=list, description="Any errors encountered") + + # Metadata + processing_time: float = Field(0.0, description="Total processing time") + start_time: Optional[datetime] = Field(None, description="Workflow start time") + end_time: Optional[datetime] = Field(None, description="Workflow end time") + + +class InitializeEmbeddingWorkflow(BaseNode[EmbeddingWorkflowState]): + """Initialize embedding workflow.""" + + async def run(self, ctx: GraphRunContext[EmbeddingWorkflowState]) -> NextNode: + """Initialize the embedding workflow.""" + ctx.state.start_time = datetime.now() + ctx.state.current_step = "initialize" + + # Validate input + if not ctx.state.documents and ctx.state.workflow_type != "sync": + ctx.state.errors.append("No documents provided for processing") + return End("No documents to process") + + # Initialize agents + 
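For reference, every node in this workflow operates on a shared EmbeddingWorkflowState; an illustrative initial state with made-up values is shown below (import paths are assumptions based on this diff's layout):

```python
# Illustrative EmbeddingWorkflowState for a small ingestion run; values are made up.
from DeepResearch.src.datatypes.rag import Document
from DeepResearch.src.statemachines.embedding_workflow import EmbeddingWorkflowState

state = EmbeddingWorkflowState(
    documents=[Document(id="d1", content="hello embeddings", metadata={})],
    database_name="demo",
    workflow_type="document_ingestion",
    embedding_config={"chunk_size": 1000, "chunk_overlap": 200, "create_new": True},
    sync_config={"enabled": False},
)
print(state.current_step, len(state.documents), state.database_name)
```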
ctx.set("embedding_agent", EmbeddingAgent()) + ctx.set("workflow_agent", EmbeddingWorkflowAgent()) + ctx.set("sync_agent", EmbeddingSyncAgent()) + + # Initialize vector store + vector_store = VectorStoreFactory.create_faiss_vector_store( + data_dir=ctx.state.vector_store_config.get("data_dir", "./data/vector_store"), + collection_name=ctx.state.database_name + ) + ctx.set("vector_store", vector_store) + + ctx.state.completed_steps.append("initialize") + ctx.state.current_step = "process_documents" + + return NextNode() + + +class ProcessDocumentsNode(BaseNode[EmbeddingWorkflowState]): + """Process documents for embedding.""" + + async def run(self, ctx: GraphRunContext[EmbeddingWorkflowState]) -> NextNode: + """Process documents for embedding.""" + ctx.state.current_step = "process_documents" + + try: + workflow_agent = ctx.get("workflow_agent") + deps = AgentDependencies.from_config(ctx.state.embedding_config) + + # Process documents based on workflow type + if ctx.state.workflow_type == "document_ingestion": + result = await workflow_agent._document_ingestion_workflow({ + "documents": [doc.dict() for doc in ctx.state.documents], + "database_name": ctx.state.database_name, + "chunk_size": ctx.state.embedding_config.get("chunk_size", 1000), + "chunk_overlap": ctx.state.embedding_config.get("chunk_overlap", 200), + "create_new": ctx.state.embedding_config.get("create_new", True), + "embedding_config": ctx.state.embedding_config + }, deps) + + if result.success: + ctx.state.processed_documents = ctx.state.documents + ctx.state.document_ids = result.data.get("document_ids", []) + else: + ctx.state.errors.append(f"Document processing failed: {result.error}") + return End("Document processing failed") + + elif ctx.state.workflow_type == "batch_processing": + result = await workflow_agent._batch_processing_workflow({ + "documents": [doc.dict() for doc in ctx.state.documents], + "database_name": ctx.state.database_name, + "batch_size": ctx.state.embedding_config.get("batch_size", 100), + "embedding_config": ctx.state.embedding_config + }, deps) + + if result.success: + ctx.state.processed_documents = ctx.state.documents + ctx.state.document_ids = result.data.get("document_ids", []) + else: + ctx.state.errors.append(f"Batch processing failed: {result.error}") + return End("Batch processing failed") + + else: + # Default processing + ctx.state.processed_documents = ctx.state.documents + + ctx.state.completed_steps.append("process_documents") + ctx.state.current_step = "generate_embeddings" + + return NextNode() + + except Exception as e: + ctx.state.errors.append(f"Error processing documents: {str(e)}") + return End("Document processing error") + + +class GenerateEmbeddingsNode(BaseNode[EmbeddingWorkflowState]): + """Generate embeddings for documents.""" + + async def run(self, ctx: GraphRunContext[EmbeddingWorkflowState]) -> NextNode: + """Generate embeddings for processed documents.""" + ctx.state.current_step = "generate_embeddings" + + try: + embedding_agent = ctx.get("embedding_agent") + deps = AgentDependencies.from_config(ctx.state.embedding_config) + + # Generate embeddings + if ctx.state.embedding_config.get("create_new", True): + result = await embedding_agent.create_database( + ctx.state.database_name, + ctx.state.processed_documents, + ctx.state.embedding_config, + deps + ) + else: + result = await embedding_agent.add_documents( + ctx.state.database_name, + ctx.state.processed_documents, + ctx.state.embedding_config, + deps + ) + + if result.success: + ctx.state.document_ids = 
result.data.get("document_ids", []) + ctx.state.completed_steps.append("generate_embeddings") + ctx.state.current_step = "validate_results" + else: + ctx.state.errors.append(f"Embedding generation failed: {result.error}") + return End("Embedding generation failed") + + return NextNode() + + except Exception as e: + ctx.state.errors.append(f"Error generating embeddings: {str(e)}") + return End("Embedding generation error") + + +class ValidateResultsNode(BaseNode[EmbeddingWorkflowState]): + """Validate embedding results.""" + + async def run(self, ctx: GraphRunContext[EmbeddingWorkflowState]) -> NextNode: + """Validate the embedding results.""" + ctx.state.current_step = "validate_results" + + try: + embedding_agent = ctx.get("embedding_agent") + deps = AgentDependencies.from_config(ctx.state.embedding_config) + + # Test search functionality + test_query = "test query for validation" + search_result = await embedding_agent.search_database( + ctx.state.database_name, + test_query, + top_k=1, + deps=deps + ) + + if search_result.success: + ctx.state.completed_steps.append("validate_results") + + # Check if sync is needed + if ctx.state.sync_config.get("enabled", False): + ctx.state.current_step = "sync_database" + return NextNode() + else: + ctx.state.current_step = "finalize" + return NextNode() + else: + ctx.state.errors.append("Validation failed: search test unsuccessful") + return End("Validation failed") + + except Exception as e: + ctx.state.errors.append(f"Error validating results: {str(e)}") + return End("Validation error") + + +class SyncDatabaseNode(BaseNode[EmbeddingWorkflowState]): + """Synchronize database with external services.""" + + async def run(self, ctx: GraphRunContext[EmbeddingWorkflowState]) -> NextNode: + """Synchronize database with external services.""" + ctx.state.current_step = "sync_database" + + try: + sync_agent = ctx.get("sync_agent") + deps = AgentDependencies.from_config(ctx.state.sync_config) + + sync_action = ctx.state.sync_config.get("action", "upload") + + if sync_action == "upload": + result = await sync_agent.upload_to_huggingface( + ctx.state.database_name, + ctx.state.sync_config.get("repository"), + ctx.state.sync_config.get("description", ""), + ctx.state.sync_config.get("private", False), + deps + ) + elif sync_action == "download": + result = await sync_agent.download_from_huggingface( + ctx.state.sync_config.get("repository"), + ctx.state.sync_config.get("local_name"), + ctx.state.sync_config.get("overwrite", False), + deps + ) + else: + result = await sync_agent.sync_with_external_service( + ctx.state.sync_config, + deps + ) + + if result.success: + ctx.state.sync_results = result.data + ctx.state.completed_steps.append("sync_database") + ctx.state.current_step = "finalize" + else: + ctx.state.errors.append(f"Sync failed: {result.error}") + # Continue to finalize even if sync fails + ctx.state.current_step = "finalize" + + return NextNode() + + except Exception as e: + ctx.state.errors.append(f"Error syncing database: {str(e)}") + ctx.state.current_step = "finalize" + return NextNode() + + +class FinalizeWorkflowNode(BaseNode[EmbeddingWorkflowState]): + """Finalize the embedding workflow.""" + + async def run(self, ctx: GraphRunContext[EmbeddingWorkflowState]) -> NextNode: + """Finalize the workflow and prepare results.""" + ctx.state.current_step = "finalize" + ctx.state.end_time = datetime.now() + + if ctx.state.start_time: + ctx.state.processing_time = (ctx.state.end_time - ctx.state.start_time).total_seconds() + + # Prepare final results + if 
ctx.state.errors: + message = f"Workflow completed with {len(ctx.state.errors)} errors" + else: + message = f"Workflow completed successfully. Processed {len(ctx.state.documents)} documents" + + ctx.state.completed_steps.append("finalize") + + return End(message) + + +class EmbeddingWorkflowGraph: + """Graph for embedding workflow execution.""" + + def __init__(self): + self.nodes = { + "initialize": InitializeEmbeddingWorkflow(), + "process_documents": ProcessDocumentsNode(), + "generate_embeddings": GenerateEmbeddingsNode(), + "validate_results": ValidateResultsNode(), + "sync_database": SyncDatabaseNode(), + "finalize": FinalizeWorkflowNode() + } + + async def run_workflow( + self, + documents: List[Document], + database_name: str = "default", + workflow_type: str = "document_ingestion", + embedding_config: Optional[Dict[str, Any]] = None, + vector_store_config: Optional[Dict[str, Any]] = None, + sync_config: Optional[Dict[str, Any]] = None + ) -> EmbeddingWorkflowState: + """Run the embedding workflow.""" + from pydantic_graph import Graph + + # Create initial state + state = EmbeddingWorkflowState( + documents=documents, + database_name=database_name, + workflow_type=workflow_type, + embedding_config=embedding_config or {}, + vector_store_config=vector_store_config or {}, + sync_config=sync_config or {} + ) + + # Create graph + graph = Graph( + nodes=list(self.nodes.values()), + start_node=self.nodes["initialize"] + ) + + # Run workflow + result = await graph.run(state) + + return result + + +class EmbeddingWorkflowOrchestrator: + """Orchestrator for embedding workflows.""" + + def __init__(self): + self.workflow_graph = EmbeddingWorkflowGraph() + + async def create_database_workflow( + self, + documents: List[Document], + database_name: str, + embedding_config: Optional[Dict[str, Any]] = None, + vector_store_config: Optional[Dict[str, Any]] = None + ) -> EmbeddingWorkflowState: + """Run workflow to create a new embedding database.""" + return await self.workflow_graph.run_workflow( + documents=documents, + database_name=database_name, + workflow_type="document_ingestion", + embedding_config=embedding_config, + vector_store_config=vector_store_config + ) + + async def batch_processing_workflow( + self, + documents: List[Document], + database_name: str, + batch_size: int = 100, + embedding_config: Optional[Dict[str, Any]] = None + ) -> EmbeddingWorkflowState: + """Run workflow for batch processing documents.""" + config = embedding_config or {} + config["batch_size"] = batch_size + + return await self.workflow_graph.run_workflow( + documents=documents, + database_name=database_name, + workflow_type="batch_processing", + embedding_config=config + ) + + async def sync_workflow( + self, + database_name: str, + sync_config: Dict[str, Any] + ) -> EmbeddingWorkflowState: + """Run workflow for database synchronization.""" + return await self.workflow_graph.run_workflow( + documents=[], + database_name=database_name, + workflow_type="sync", + sync_config=sync_config + ) + + async def migration_workflow( + self, + source_database: str, + target_database: str, + migration_config: Optional[Dict[str, Any]] = None + ) -> EmbeddingWorkflowState: + """Run workflow for database migration.""" + config = migration_config or {} + config.update({ + "source_database": source_database, + "target_database": target_database + }) + + return await self.workflow_graph.run_workflow( + documents=[], + database_name=target_database, + workflow_type="migration", + embedding_config=config + ) + diff --git 
a/DeepResearch/src/utils/config_loader.py b/DeepResearch/src/utils/config_loader.py index 9f36238..8c4b818 100644 --- a/DeepResearch/src/utils/config_loader.py +++ b/DeepResearch/src/utils/config_loader.py @@ -202,3 +202,6 @@ def load_bioinformatics_config(config: Optional[DictConfig] = None) -> Bioinform + + + diff --git a/DeepResearch/tools/deep_agent_middleware.py b/DeepResearch/tools/deep_agent_middleware.py index 230842e..5e65612 100644 --- a/DeepResearch/tools/deep_agent_middleware.py +++ b/DeepResearch/tools/deep_agent_middleware.py @@ -545,3 +545,6 @@ def create_default_middleware_pipeline( + + + diff --git a/DeepResearch/tools/embedding_tools.py b/DeepResearch/tools/embedding_tools.py new file mode 100644 index 0000000..8caa37c --- /dev/null +++ b/DeepResearch/tools/embedding_tools.py @@ -0,0 +1,570 @@ +""" +Pydantic AI Embedding Tools for DeepCritical + +This module provides embedding tools that integrate with Pydantic AI agents, +supporting both VLLM local models and external embedding services. +""" + +from __future__ import annotations + +import asyncio +import json +import sqlite3 +import hashlib +from pathlib import Path +from typing import Any, Dict, List, Optional, Union, AsyncGenerator +from datetime import datetime +import numpy as np + +try: + import faiss +except ImportError: + raise ImportError("faiss-cpu is required. Install with: pip install faiss-cpu") + +from pydantic import BaseModel, Field +from pydantic_ai import Agent, RunContext, defer + +from ..src.datatypes.rag import ( + Document, Chunk, SearchResult, SearchType, EmbeddingsConfig, + EmbeddingModelType, VectorStoreConfig, VectorStoreType, RAGQuery, RAGResponse +) +from ..src.datatypes.vllm_integration import VLLMEmbeddings, VLLMLLMProvider +from ..src.datatypes.vllm_dataclass import VllmConfig, create_vllm_config +from .base import ToolRunner, ToolSpec, ExecutionResult +from .tool_registry import ToolRegistry + + +class EmbeddingRequest(BaseModel): + """Request for embedding generation.""" + texts: List[str] = Field(..., description="Texts to embed") + model: str = Field("text-embedding-3-small", description="Embedding model to use") + batch_size: int = Field(32, description="Batch size for processing") + normalize: bool = Field(True, description="Whether to normalize embeddings") + + +class EmbeddingResponse(BaseModel): + """Response from embedding generation.""" + embeddings: List[List[float]] = Field(..., description="Generated embeddings") + model: str = Field(..., description="Model used") + dimensions: int = Field(..., description="Embedding dimensions") + processing_time: float = Field(..., description="Processing time in seconds") + + +class VectorStoreRequest(BaseModel): + """Request for vector store operations.""" + operation: str = Field(..., description="Operation: add, search, delete, list") + documents: Optional[List[Document]] = Field(None, description="Documents to add") + query: Optional[str] = Field(None, description="Search query") + top_k: int = Field(5, description="Number of results to return") + filters: Optional[Dict[str, Any]] = Field(None, description="Metadata filters") + database_name: str = Field("default", description="Database name") + + +class VectorStoreResponse(BaseModel): + """Response from vector store operations.""" + success: bool = Field(..., description="Operation success") + results: Optional[List[SearchResult]] = Field(None, description="Search results") + document_ids: Optional[List[str]] = Field(None, description="Added document IDs") + message: str = Field(..., 
description="Response message") + processing_time: float = Field(..., description="Processing time in seconds") + + +class EmbeddingTool(ToolRunner): + """Pydantic AI embedding tool with VLLM integration.""" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(ToolSpec( + name="embedding_tool", + description="Generate embeddings for text using VLLM or external services", + inputs={ + "texts": "List of texts to embed", + "model": "Embedding model name", + "batch_size": "Batch size for processing", + "normalize": "Whether to normalize embeddings" + }, + outputs={ + "embeddings": "Generated embeddings", + "model": "Model used", + "dimensions": "Embedding dimensions", + "processing_time": "Processing time" + } + )) + + self.config = config or {} + self.embeddings_provider: Optional[VLLMEmbeddings] = None + self._initialize_embeddings() + + def _initialize_embeddings(self): + """Initialize embeddings provider based on configuration.""" + embedding_config = self.config.get("embeddings", {}) + + if embedding_config.get("use_vllm", False): + # Use VLLM for local embeddings + vllm_config = embedding_config.get("vllm_config", {}) + embeddings_config = EmbeddingsConfig( + model_type=EmbeddingModelType.CUSTOM, + model_name=vllm_config.get("model_name", "sentence-transformers/all-MiniLM-L6-v2"), + base_url=f"{vllm_config.get('host', 'localhost')}:{vllm_config.get('port', 8001)}", + num_dimensions=vllm_config.get("dimensions", 384), + batch_size=vllm_config.get("batch_size", 32) + ) + self.embeddings_provider = VLLMEmbeddings(embeddings_config) + else: + # Use external service (OpenAI, etc.) + self.embeddings_provider = None + + def run(self, params: Dict[str, Any]) -> ExecutionResult: + """Run embedding generation synchronously.""" + try: + request = EmbeddingRequest(**params) + return asyncio.run(self._generate_embeddings_async(request)) + except Exception as e: + return ExecutionResult(success=False, error=str(e)) + + async def _generate_embeddings_async(self, request: EmbeddingRequest) -> ExecutionResult: + """Generate embeddings asynchronously.""" + import time + start_time = time.time() + + try: + if self.embeddings_provider: + # Use VLLM embeddings + embeddings = await self.embeddings_provider.vectorize_documents(request.texts) + else: + # Use external service (OpenAI) + embeddings = await self._generate_external_embeddings(request) + + processing_time = time.time() - start_time + + response = EmbeddingResponse( + embeddings=embeddings, + model=request.model, + dimensions=len(embeddings[0]) if embeddings else 0, + processing_time=processing_time + ) + + return ExecutionResult(success=True, data=response.dict()) + + except Exception as e: + return ExecutionResult(success=False, error=str(e)) + + async def _generate_external_embeddings(self, request: EmbeddingRequest) -> List[List[float]]: + """Generate embeddings using external service.""" + try: + from openai import AsyncOpenAI + + client = AsyncOpenAI( + api_key=self.config.get("openai", {}).get("api_key"), + base_url=self.config.get("openai", {}).get("base_url") + ) + + embeddings = [] + batch_size = request.batch_size + + for i in range(0, len(request.texts), batch_size): + batch = request.texts[i:i + batch_size] + + response = await client.embeddings.create( + model=request.model, + input=batch + ) + + batch_embeddings = [item.embedding for item in response.data] + embeddings.extend(batch_embeddings) + + return embeddings + + except ImportError: + raise ImportError("openai package required for external 
embeddings") + except Exception as e: + raise RuntimeError(f"Failed to generate external embeddings: {e}") + + +class VectorStoreTool(ToolRunner): + """Pydantic AI vector store tool with FAISS and SQLite backend.""" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(ToolSpec( + name="vector_store_tool", + description="Manage vector store with FAISS and SQLite backend", + inputs={ + "operation": "Operation: add, search, delete, list", + "documents": "Documents to add", + "query": "Search query", + "top_k": "Number of results", + "filters": "Metadata filters", + "database_name": "Database name" + }, + outputs={ + "success": "Operation success", + "results": "Search results", + "document_ids": "Added document IDs", + "message": "Response message", + "processing_time": "Processing time" + } + )) + + self.config = config or {} + self.data_dir = Path(self.config.get("data_dir", "./data/embeddings")) + self.data_dir.mkdir(parents=True, exist_ok=True) + self.db_path = self.data_dir / "embeddings.db" + self._init_database() + + def _init_database(self): + """Initialize SQLite database for metadata.""" + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS databases ( + name TEXT PRIMARY KEY, + description TEXT, + embedding_model TEXT, + embedding_dimensions INTEGER, + document_count INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + conn.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + database_name TEXT, + content TEXT, + metadata TEXT, + embedding_id INTEGER, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (database_name) REFERENCES databases (name) + ) + """) + + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_documents_database + ON documents (database_name) + """) + + def run(self, params: Dict[str, Any]) -> ExecutionResult: + """Run vector store operation synchronously.""" + try: + request = VectorStoreRequest(**params) + return asyncio.run(self._process_request_async(request)) + except Exception as e: + return ExecutionResult(success=False, error=str(e)) + + async def _process_request_async(self, request: VectorStoreRequest) -> ExecutionResult: + """Process vector store request asynchronously.""" + import time + start_time = time.time() + + try: + if request.operation == "add": + result = await self._add_documents(request) + elif request.operation == "search": + result = await self._search_documents(request) + elif request.operation == "delete": + result = await self._delete_documents(request) + elif request.operation == "list": + result = await self._list_databases(request) + else: + raise ValueError(f"Unknown operation: {request.operation}") + + processing_time = time.time() - start_time + result.processing_time = processing_time + + return ExecutionResult(success=True, data=result.dict()) + + except Exception as e: + return ExecutionResult(success=False, error=str(e)) + + async def _add_documents(self, request: VectorStoreRequest) -> VectorStoreResponse: + """Add documents to vector store.""" + if not request.documents: + raise ValueError("Documents required for add operation") + + # Generate embeddings for documents + embedding_tool = EmbeddingTool(self.config) + texts = [doc.content for doc in request.documents] + + embedding_request = EmbeddingRequest( + texts=texts, + model=self.config.get("embeddings", {}).get("model", "text-embedding-3-small") + ) + + embedding_result = await 
embedding_tool._generate_embeddings_async(embedding_request) + if not embedding_result.success: + raise RuntimeError(f"Failed to generate embeddings: {embedding_result.error}") + + embeddings = embedding_result.data["embeddings"] + dimensions = embedding_result.data["dimensions"] + + # Load or create FAISS index + index_path = self.data_dir / f"{request.database_name}.faiss" + if index_path.exists(): + index = faiss.read_index(str(index_path)) + else: + index = faiss.IndexFlatIP(dimensions) + + # Add embeddings to index + embedding_ids = [] + for i, embedding in enumerate(embeddings): + embedding_id = index.ntotal + index.add(np.array([embedding], dtype=np.float32)) + embedding_ids.append(embedding_id) + + # Save FAISS index + faiss.write_index(index, str(index_path)) + + # Save to SQLite + with sqlite3.connect(self.db_path) as conn: + # Update or create database record + conn.execute(""" + INSERT OR REPLACE INTO databases + (name, description, embedding_model, embedding_dimensions, document_count, updated_at) + VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP) + """, ( + request.database_name, + f"Database with {len(request.documents)} documents", + embedding_request.model, + dimensions, + index.ntotal + )) + + # Insert documents + for doc, embedding_id in zip(request.documents, embedding_ids): + conn.execute(""" + INSERT OR REPLACE INTO documents + (id, database_name, content, metadata, embedding_id) + VALUES (?, ?, ?, ?, ?) + """, ( + doc.id, + request.database_name, + doc.content, + json.dumps(doc.metadata), + embedding_id + )) + + return VectorStoreResponse( + success=True, + document_ids=[doc.id for doc in request.documents], + message=f"Added {len(request.documents)} documents to {request.database_name}" + ) + + async def _search_documents(self, request: VectorStoreRequest) -> VectorStoreResponse: + """Search documents in vector store.""" + if not request.query: + raise ValueError("Query required for search operation") + + # Generate query embedding + embedding_tool = EmbeddingTool(self.config) + embedding_request = EmbeddingRequest( + texts=[request.query], + model=self.config.get("embeddings", {}).get("model", "text-embedding-3-small") + ) + + embedding_result = await embedding_tool._generate_embeddings_async(embedding_request) + if not embedding_result.success: + raise RuntimeError(f"Failed to generate query embedding: {embedding_result.error}") + + query_embedding = embedding_result.data["embeddings"][0] + + # Load FAISS index + index_path = self.data_dir / f"{request.database_name}.faiss" + if not index_path.exists(): + return VectorStoreResponse( + success=True, + results=[], + message=f"Database {request.database_name} not found" + ) + + index = faiss.read_index(str(index_path)) + + # Search + scores, indices = index.search(np.array([query_embedding], dtype=np.float32), request.top_k) + + # Get documents from SQLite + results = [] + with sqlite3.connect(self.db_path) as conn: + for score, idx in zip(scores[0], indices[0]): + if idx == -1: # No more results + break + + cursor = conn.execute(""" + SELECT id, content, metadata FROM documents + WHERE database_name = ? AND embedding_id = ? 
+ """, (request.database_name, int(idx))) + + row = cursor.fetchone() + if row: + doc_id, content, metadata = row + metadata_dict = json.loads(metadata) if metadata else {} + + # Apply filters + if request.filters and not self._matches_filters(metadata_dict, request.filters): + continue + + document = Document( + id=doc_id, + content=content, + metadata=metadata_dict + ) + + result = SearchResult( + document=document, + score=float(score), + rank=len(results) + 1 + ) + results.append(result) + + return VectorStoreResponse( + success=True, + results=results, + message=f"Found {len(results)} results" + ) + + async def _delete_documents(self, request: VectorStoreRequest) -> VectorStoreResponse: + """Delete documents from vector store.""" + # Implementation would depend on specific requirements + # For now, return not implemented + return VectorStoreResponse( + success=False, + message="Delete operation not implemented" + ) + + async def _list_databases(self, request: VectorStoreRequest) -> VectorStoreResponse: + """List available databases.""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT name, description, document_count, created_at + FROM databases ORDER BY created_at DESC + """) + + databases = [] + for row in cursor.fetchall(): + databases.append({ + "name": row[0], + "description": row[1], + "document_count": row[2], + "created_at": row[3] + }) + + return VectorStoreResponse( + success=True, + message=f"Found {len(databases)} databases", + results=[SearchResult( + document=Document( + id=db["name"], + content=db["description"], + metadata=db + ), + score=1.0, + rank=i + 1 + ) for i, db in enumerate(databases)] + ) + + def _matches_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool: + """Check if metadata matches filters.""" + for key, value in filters.items(): + if key not in metadata: + return False + if metadata[key] != value: + return False + return True + + +# Pydantic AI tool functions for agent integration +@defer +def generate_embeddings( + texts: List[str], + model: str = "text-embedding-3-small", + batch_size: int = 32, + ctx: RunContext[Any] +) -> EmbeddingResponse: + """Generate embeddings for texts using configured provider.""" + tool = EmbeddingTool(ctx.deps.get("embedding_config", {})) + result = tool.run({ + "texts": texts, + "model": model, + "batch_size": batch_size, + "normalize": True + }) + + if not result.success: + raise RuntimeError(f"Failed to generate embeddings: {result.error}") + + return EmbeddingResponse(**result.data) + + +@defer +def add_documents_to_vector_store( + documents: List[Document], + database_name: str = "default", + ctx: RunContext[Any] +) -> VectorStoreResponse: + """Add documents to vector store.""" + tool = VectorStoreTool(ctx.deps.get("vector_store_config", {})) + result = tool.run({ + "operation": "add", + "documents": [doc.dict() for doc in documents], + "database_name": database_name + }) + + if not result.success: + raise RuntimeError(f"Failed to add documents: {result.error}") + + return VectorStoreResponse(**result.data) + + +@defer +def search_vector_store( + query: str, + database_name: str = "default", + top_k: int = 5, + filters: Optional[Dict[str, Any]] = None, + ctx: RunContext[Any] +) -> VectorStoreResponse: + """Search vector store for similar documents.""" + tool = VectorStoreTool(ctx.deps.get("vector_store_config", {})) + result = tool.run({ + "operation": "search", + "query": query, + "database_name": database_name, + "top_k": top_k, + "filters": filters or {} + }) + 
+ if not result.success: + raise RuntimeError(f"Failed to search vector store: {result.error}") + + return VectorStoreResponse(**result.data) + + +@defer +def list_vector_databases( + ctx: RunContext[Any] +) -> VectorStoreResponse: + """List available vector databases.""" + tool = VectorStoreTool(ctx.deps.get("vector_store_config", {})) + result = tool.run({ + "operation": "list" + }) + + if not result.success: + raise RuntimeError(f"Failed to list databases: {result.error}") + + return VectorStoreResponse(**result.data) + + +# Register tools +def register_embedding_tools(): + """Register embedding tools with the global registry.""" + registry = ToolRegistry() + + # Register embedding tool + embedding_tool = EmbeddingTool() + registry.register_tool(embedding_tool) + + # Register vector store tool + vector_store_tool = VectorStoreTool() + registry.register_tool(vector_store_tool) + + return registry + diff --git a/configs/app_modes/loss_driven.yaml b/configs/app_modes/loss_driven.yaml index 87bbd66..f0c6fd3 100644 --- a/configs/app_modes/loss_driven.yaml +++ b/configs/app_modes/loss_driven.yaml @@ -224,3 +224,6 @@ max_total_time: 1200.0 + + + diff --git a/configs/app_modes/multi_level_react.yaml b/configs/app_modes/multi_level_react.yaml index c48e201..68318aa 100644 --- a/configs/app_modes/multi_level_react.yaml +++ b/configs/app_modes/multi_level_react.yaml @@ -159,3 +159,6 @@ max_total_time: 600.0 + + + diff --git a/configs/app_modes/nested_orchestration.yaml b/configs/app_modes/nested_orchestration.yaml index 7642164..b472ae8 100644 --- a/configs/app_modes/nested_orchestration.yaml +++ b/configs/app_modes/nested_orchestration.yaml @@ -228,3 +228,6 @@ max_total_time: 900.0 + + + diff --git a/configs/app_modes/single_react.yaml b/configs/app_modes/single_react.yaml index 810d36b..c25e423 100644 --- a/configs/app_modes/single_react.yaml +++ b/configs/app_modes/single_react.yaml @@ -39,3 +39,6 @@ max_total_time: 300.0 + + + diff --git a/configs/bioinformatics/agents.yaml b/configs/bioinformatics/agents.yaml index 0739567..4329065 100644 --- a/configs/bioinformatics/agents.yaml +++ b/configs/bioinformatics/agents.yaml @@ -93,3 +93,6 @@ dependencies: + + + diff --git a/configs/bioinformatics/data_sources.yaml b/configs/bioinformatics/data_sources.yaml index 8bb8bc9..ddab152 100644 --- a/configs/bioinformatics/data_sources.yaml +++ b/configs/bioinformatics/data_sources.yaml @@ -73,3 +73,6 @@ data_sources: + + + diff --git a/configs/bioinformatics/defaults.yaml b/configs/bioinformatics/defaults.yaml index 5c626df..bcc35ac 100644 --- a/configs/bioinformatics/defaults.yaml +++ b/configs/bioinformatics/defaults.yaml @@ -134,3 +134,6 @@ error_handling: + + + diff --git a/configs/bioinformatics/tools.yaml b/configs/bioinformatics/tools.yaml index 0741dcd..066e52a 100644 --- a/configs/bioinformatics/tools.yaml +++ b/configs/bioinformatics/tools.yaml @@ -126,3 +126,6 @@ tool_dependencies: + + + diff --git a/configs/bioinformatics/variants/comprehensive.yaml b/configs/bioinformatics/variants/comprehensive.yaml index 4ca2620..996f2e7 100644 --- a/configs/bioinformatics/variants/comprehensive.yaml +++ b/configs/bioinformatics/variants/comprehensive.yaml @@ -52,3 +52,6 @@ performance: + + + diff --git a/configs/bioinformatics/variants/fast.yaml b/configs/bioinformatics/variants/fast.yaml index 641a082..007b71f 100644 --- a/configs/bioinformatics/variants/fast.yaml +++ b/configs/bioinformatics/variants/fast.yaml @@ -50,3 +50,6 @@ performance: + + + diff --git 
a/configs/bioinformatics/variants/high_quality.yaml b/configs/bioinformatics/variants/high_quality.yaml index c82c211..0332e9f 100644 --- a/configs/bioinformatics/variants/high_quality.yaml +++ b/configs/bioinformatics/variants/high_quality.yaml @@ -36,3 +36,6 @@ performance: + + + diff --git a/configs/bioinformatics/workflow.yaml b/configs/bioinformatics/workflow.yaml index c113a5a..3a40646 100644 --- a/configs/bioinformatics/workflow.yaml +++ b/configs/bioinformatics/workflow.yaml @@ -112,3 +112,6 @@ execution: + + + diff --git a/configs/bioinformatics_example.yaml b/configs/bioinformatics_example.yaml index 9ef2765..19c58e5 100644 --- a/configs/bioinformatics_example.yaml +++ b/configs/bioinformatics_example.yaml @@ -107,3 +107,6 @@ log_level: "INFO" + + + diff --git a/configs/bioinformatics_example_configured.yaml b/configs/bioinformatics_example_configured.yaml index 8c8e69f..91e02e1 100644 --- a/configs/bioinformatics_example_configured.yaml +++ b/configs/bioinformatics_example_configured.yaml @@ -115,3 +115,6 @@ log_level: "INFO" + + + diff --git a/configs/config_with_modes.yaml b/configs/config_with_modes.yaml index 9af6082..ff2af88 100644 --- a/configs/config_with_modes.yaml +++ b/configs/config_with_modes.yaml @@ -17,3 +17,6 @@ defaults: + + + diff --git a/configs/deep_agent/basic.yaml b/configs/deep_agent/basic.yaml index ff147fe..4aee51a 100644 --- a/configs/deep_agent/basic.yaml +++ b/configs/deep_agent/basic.yaml @@ -91,3 +91,6 @@ save_state: false + + + diff --git a/configs/deep_agent/comprehensive.yaml b/configs/deep_agent/comprehensive.yaml index 82c08a3..ba15d5e 100644 --- a/configs/deep_agent/comprehensive.yaml +++ b/configs/deep_agent/comprehensive.yaml @@ -200,3 +200,6 @@ output_formats: ["json", "yaml", "markdown", "html"] + + + diff --git a/configs/deep_agent/default.yaml b/configs/deep_agent/default.yaml index f4661c8..9f773a2 100644 --- a/configs/deep_agent/default.yaml +++ b/configs/deep_agent/default.yaml @@ -112,3 +112,6 @@ save_state: true + + + diff --git a/configs/deep_agent_integration.yaml b/configs/deep_agent_integration.yaml index 6899911..3fcf226 100644 --- a/configs/deep_agent_integration.yaml +++ b/configs/deep_agent_integration.yaml @@ -170,3 +170,6 @@ enable_workflow_tracing: true + + + diff --git a/configs/embedding/default.yaml b/configs/embedding/default.yaml new file mode 100644 index 0000000..18df6db --- /dev/null +++ b/configs/embedding/default.yaml @@ -0,0 +1,143 @@ +# Embedding Configuration for DeepCritical + +# Embedding model configuration +embeddings: + # Use VLLM for local embeddings (set to false for external services) + use_vllm: false + + # External service configuration (when use_vllm is false) + model: "text-embedding-3-small" + batch_size: 32 + normalize: true + + # VLLM configuration (when use_vllm is true) + vllm_config: + model_name: "sentence-transformers/all-MiniLM-L6-v2" + host: "localhost" + port: 8001 + dimensions: 384 + batch_size: 32 + + # OpenAI configuration + openai: + api_key: "${OPENAI_API_KEY}" + base_url: "https://api.openai.com/v1" + timeout: 60 + max_retries: 3 + +# Vector store configuration +vector_store: + store_type: "faiss" + data_dir: "./data/vector_store" + collection_name: "default" + index_type: "IndexFlatIP" # IndexFlatIP, IndexFlatL2, IndexHNSWFlat + embedding_dimension: 1536 # Will be set automatically based on model + distance_metric: "cosine" + +# Document processing configuration +document_processing: + chunk_size: 1000 + chunk_overlap: 200 + max_chunks_per_document: 100 + preprocessing: + 
remove_whitespace: true + normalize_unicode: true + remove_special_chars: false + +# Workflow configuration +workflow: + # Default workflow settings + default_batch_size: 100 + max_concurrent_operations: 5 + retry_attempts: 3 + retry_delay: 1.0 + + # Workflow types + document_ingestion: + enabled: true + chunk_size: 1000 + chunk_overlap: 200 + create_new: true + + batch_processing: + enabled: true + batch_size: 100 + parallel_processing: true + + sync: + enabled: false + auto_sync: false + sync_interval: 3600 # seconds + +# HuggingFace Hub configuration +huggingface: + enabled: false + token: "${HF_TOKEN}" + endpoint: "https://huggingface.co" + default_repository_prefix: "deepcritical" + auto_upload: false + private_repositories: false + +# Performance configuration +performance: + # Memory management + max_memory_usage: 0.8 # 80% of available memory + cache_size: 1000 + + # Processing optimization + use_multiprocessing: true + max_workers: 4 + prefetch_factor: 2 + + # Index optimization + index_optimization: + enabled: true + rebuild_threshold: 0.1 # Rebuild when 10% of documents are deleted + compression: false + +# Logging configuration +logging: + level: "INFO" + log_embeddings: false # Don't log actual embedding vectors + log_performance: true + log_errors: true + +# Security configuration +security: + # API key management + encrypt_api_keys: false + key_rotation_interval: 86400 # 24 hours + + # Data privacy + anonymize_metadata: false + remove_sensitive_data: false + + # Access control + require_authentication: false + allowed_origins: ["*"] + +# Monitoring configuration +monitoring: + enabled: true + metrics_collection: true + health_checks: true + + # Metrics to collect + metrics: + - "embedding_generation_time" + - "vector_search_time" + - "document_processing_time" + - "memory_usage" + - "error_rate" + + # Health check settings + health_check_interval: 300 # 5 minutes + health_check_timeout: 30 # 30 seconds + +# Development configuration +development: + debug_mode: false + verbose_logging: false + mock_embeddings: false # Use mock embeddings for testing + test_mode: false + diff --git a/configs/workflow_orchestration/data_loaders/default_data_loaders.yaml b/configs/workflow_orchestration/data_loaders/default_data_loaders.yaml index 4dc2f4d..d16a827 100644 --- a/configs/workflow_orchestration/data_loaders/default_data_loaders.yaml +++ b/configs/workflow_orchestration/data_loaders/default_data_loaders.yaml @@ -147,3 +147,6 @@ data_loaders: + + + diff --git a/configs/workflow_orchestration/default.yaml b/configs/workflow_orchestration/default.yaml index c55d7b2..502e8c5 100644 --- a/configs/workflow_orchestration/default.yaml +++ b/configs/workflow_orchestration/default.yaml @@ -63,3 +63,6 @@ performance: + + + diff --git a/configs/workflow_orchestration/judges/default_judges.yaml b/configs/workflow_orchestration/judges/default_judges.yaml index c501eb7..c89d5e4 100644 --- a/configs/workflow_orchestration/judges/default_judges.yaml +++ b/configs/workflow_orchestration/judges/default_judges.yaml @@ -174,3 +174,6 @@ judges: + + + diff --git a/configs/workflow_orchestration/multi_agent_systems/default_multi_agent.yaml b/configs/workflow_orchestration/multi_agent_systems/default_multi_agent.yaml index b911f6e..639184a 100644 --- a/configs/workflow_orchestration/multi_agent_systems/default_multi_agent.yaml +++ b/configs/workflow_orchestration/multi_agent_systems/default_multi_agent.yaml @@ -210,3 +210,6 @@ multi_agent_systems: + + + diff --git 
a/configs/workflow_orchestration/primary_workflow/react_primary.yaml b/configs/workflow_orchestration/primary_workflow/react_primary.yaml index 5475345..995d33d 100644 --- a/configs/workflow_orchestration/primary_workflow/react_primary.yaml +++ b/configs/workflow_orchestration/primary_workflow/react_primary.yaml @@ -55,3 +55,6 @@ multi_agent_coordination: + + + diff --git a/configs/workflow_orchestration/sub_workflows/comprehensive_sub_workflows.yaml b/configs/workflow_orchestration/sub_workflows/comprehensive_sub_workflows.yaml index 893d572..6356f4e 100644 --- a/configs/workflow_orchestration/sub_workflows/comprehensive_sub_workflows.yaml +++ b/configs/workflow_orchestration/sub_workflows/comprehensive_sub_workflows.yaml @@ -304,3 +304,6 @@ sub_workflows: + + + diff --git a/configs/workflow_orchestration/sub_workflows/default_sub_workflows.yaml b/configs/workflow_orchestration/sub_workflows/default_sub_workflows.yaml index d625e5e..574fbb2 100644 --- a/configs/workflow_orchestration/sub_workflows/default_sub_workflows.yaml +++ b/configs/workflow_orchestration/sub_workflows/default_sub_workflows.yaml @@ -169,3 +169,6 @@ sub_workflows: + + + diff --git a/configs/workflow_orchestration_example.yaml b/configs/workflow_orchestration_example.yaml index c2c40de..804a40c 100644 --- a/configs/workflow_orchestration_example.yaml +++ b/configs/workflow_orchestration_example.yaml @@ -104,3 +104,6 @@ question: "Analyze the role of machine learning in drug discovery and design a c + + + diff --git a/pyproject.toml b/pyproject.toml index 9e5f9f8..91c39db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ authors = [ ] dependencies = [ "beautifulsoup4>=4.14.2", + "huggingface-hub>=0.35.3", "hydra-core>=1.3.2", "pydantic>=2.7", "pydantic-ai>=0.0.16", diff --git a/uv.lock b/uv.lock index db7b77f..e6ea6b9 100644 --- a/uv.lock +++ b/uv.lock @@ -377,6 +377,7 @@ version = "0.1.0" source = { editable = "." } dependencies = [ { name = "beautifulsoup4" }, + { name = "huggingface-hub" }, { name = "hydra-core" }, { name = "pydantic" }, { name = "pydantic-ai" }, @@ -401,6 +402,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.14.2" }, + { name = "huggingface-hub", specifier = ">=0.35.3" }, { name = "hydra-core", specifier = ">=1.3.2" }, { name = "pydantic", specifier = ">=2.7" }, { name = "pydantic-ai", specifier = ">=0.0.16" },
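Below is a minimal usage sketch of the new EmbeddingTool and VectorStoreTool introduced in DeepResearch/tools/embedding_tools.py. It only illustrates the intended call pattern from this diff: the config keys mirror configs/embedding/default.yaml, the API key, document contents, and database name are placeholders, Document is assumed to accept id/content/metadata as shown in the diff, and external embeddings require a valid OpenAI key (or a running VLLM endpoint with use_vllm: true).

from DeepResearch.src.datatypes.rag import Document
from DeepResearch.tools.embedding_tools import EmbeddingTool, VectorStoreTool

# Configuration mirroring configs/embedding/default.yaml (external OpenAI embeddings).
config = {
    "embeddings": {"use_vllm": False, "model": "text-embedding-3-small", "batch_size": 32},
    "openai": {"api_key": "sk-...", "base_url": "https://api.openai.com/v1"},  # placeholder key
    "data_dir": "./data/embeddings",
}

# Generate embeddings for raw text; ExecutionResult exposes .success, .error, .data.
embedder = EmbeddingTool(config)
embed_result = embedder.run({"texts": ["CRISPR screens in primary T cells"], "model": "text-embedding-3-small"})
if embed_result.success:
    print(embed_result.data["dimensions"], "dimensions per vector")

# Add a document to the FAISS/SQLite-backed store, then search it.
store = VectorStoreTool(config)
doc = Document(id="doc-1", content="FAISS indexes dense embedding vectors.", metadata={"source": "demo"})
store.run({"operation": "add", "documents": [doc], "database_name": "demo"})
hits = store.run({"operation": "search", "query": "vector index", "database_name": "demo", "top_k": 3})
print(hits.data["message"] if hits.success else hits.error)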