diff --git a/README.md b/README.md index d239927..dc92873 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ OpenTryOn is an open-source AI toolkit designed for fashion technology and virtu - **Model Swap**: Swap garments on different models - **Interactive Demos**: Gradio-based web interfaces for all features - **Preprocessing Pipeline**: Complete preprocessing pipeline for training and inference +- **AI Agents**: LangChain-based agents for intelligent virtual try-on operations ## šŸ“‹ Table of Contents @@ -54,6 +55,7 @@ OpenTryOn is an open-source AI toolkit designed for fashion technology and virtu - [Virtual Try-On with Amazon Nova Canvas](#virtual-try-on-with-amazon-nova-canvas) - [Virtual Try-On with Kling AI](#virtual-try-on-with-kling-ai) - [Virtual Try-On with Segmind](#virtual-try-on-with-segmind) + - [Virtual Try-On Agent](#virtual-try-on-agent) - [Image Generation with Nano Banana](#image-generation-with-nano-banana) - [Image Generation with FLUX.2](#image-generation-with-flux2) - [Image Generation with Luma AI](#luma-ai-image-generation) @@ -147,6 +149,13 @@ LUMA_AI_API_KEY=your_luma_ai_api_key # OpenAI Credentials (required for OpenAI GPT-Image-1 image generation) OPENAI_API_KEY=your_openai_api_key + +# LLM Provider Credentials (required for Virtual Try-On Agent) +OPENAI_API_KEY=your_openai_api_key # For OpenAI (default) +# OR +ANTHROPIC_API_KEY=your_anthropic_api_key # For Anthropic Claude +# OR +GOOGLE_API_KEY=your_google_api_key # For Google Gemini ``` **Notes**: @@ -160,7 +169,11 @@ OPENAI_API_KEY=your_openai_api_key - For FLUX.2 models, obtain your API key from [BFL AI](https://docs.bfl.ai/) - For Luma AI, obtain your API key from [Luma Labs AI](https://lumalabs.ai/api) -- For OpenAI, obtain your API key from [OpenAI Platform](https://platform.openai.com/settings/organization/api-keys) +- For OpenAI, obtain your API key from [OpenAI Platform](https://platform.openai.com/settings/organization/api-keys) +- For Virtual Try-On Agent, obtain LLM 
API keys from: + - OpenAI: [OpenAI API Keys](https://platform.openai.com/api-keys) + - Anthropic: [Anthropic API Keys](https://console.anthropic.com/) + - Google: [Google AI Studio](https://aistudio.google.com/app/apikey) ## šŸŽ® Quick Start @@ -636,6 +649,160 @@ for idx, image in enumerate(images): **Reference**: [Segmind Try-On Diffusion API Documentation](https://www.segmind.com/models/try-on-diffusion/api) +### Virtual Try-On Agent + +A LangChain-based agent that intelligently selects and uses the appropriate virtual try-on adapter based on user prompts. The agent analyzes natural language requests and automatically chooses between Kling AI, Amazon Nova Canvas, or Segmind. + +#### Prerequisites + +1. **LangChain Installation**: + ```bash + pip install langchain langchain-openai langchain-anthropic langchain-google-genai + ``` + +2. **LLM Provider Setup**: + - Choose an LLM provider: OpenAI, Anthropic Claude, or Google Gemini + - Set the appropriate API key in your `.env` file: + ```env + OPENAI_API_KEY=your_openai_api_key + # OR + ANTHROPIC_API_KEY=your_anthropic_api_key + # OR + GOOGLE_API_KEY=your_google_api_key + ``` + +3. 
**Virtual Try-On API Credentials**: + - Ensure you have credentials for at least one VTON provider (Kling AI, Nova Canvas, or Segmind) + - See the individual provider sections above for setup instructions + +#### Command Line Usage + +```bash +# Basic usage with default OpenAI provider +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Create a virtual try-on using Kling AI" + +# Specify LLM provider +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Nova Canvas for virtual try-on" --llm-provider anthropic + +# Use Google Gemini as LLM +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Generate try-on with Segmind" --llm-provider google + +# Specify LLM model +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Kling AI" --llm-model gpt-4-turbo-preview + +# Save output to specific directory +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Create virtual try-on" --output-dir results/ + +# Use URLs instead of file paths +python vton_agent.py --person https://example.com/person.jpg --garment https://example.com/shirt.jpg --prompt "Use Kling AI" + +# Verbose output to see agent reasoning +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Kling AI" --verbose +``` + +#### Python API Usage + +```python +from tryon.agents.vton import VTOnAgent + +# Initialize the agent with your preferred LLM provider +agent = VTOnAgent(llm_provider="openai") + +# Generate virtual try-on using natural language prompt +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Use Kling AI to create a virtual try-on of this shirt" +) + +if result["status"] == "success": + print(f"Generated {len(result['images'])} images using {result['provider']}") +``` + +#### Provider Selection + +The agent automatically selects the provider based on keywords in your prompt: + +- **Kling AI**: "kling ai", "kling", "kolors" +- **Nova 
Canvas**: "nova canvas", "amazon nova", "aws", "bedrock" +- **Segmind**: "segmind" + +Examples: + +```python +# Uses Kling AI +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Use Kling AI to generate the try-on" +) + +# Uses Nova Canvas +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Generate with Amazon Nova Canvas" +) + +# Uses Segmind +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Try Segmind for this virtual try-on" +) +``` + +#### Using Different LLM Providers + +```python +# OpenAI +agent = VTOnAgent(llm_provider="openai", llm_model="gpt-4-turbo-preview") + +# Anthropic Claude +agent = VTOnAgent(llm_provider="anthropic", llm_model="claude-3-opus-20240229") + +# Google Gemini +agent = VTOnAgent(llm_provider="google", llm_model="gemini-pro") +``` + +#### Complete Example + +```python +from tryon.agents.vton import VTOnAgent + +# Initialize agent +agent = VTOnAgent(llm_provider="openai") + +# Generate virtual try-on +result = agent.generate( + person_image="https://example.com/person.jpg", + garment_image="https://example.com/shirt.jpg", + prompt="Create a virtual try-on using Kling AI for best quality" +) + +# Handle results +if result["status"] == "success": + images = result["images"] # List of image URLs or base64 strings + provider = result["provider"] # "kling_ai", "nova_canvas", or "segmind" + print(f"Successfully generated {len(images)} images using {provider}") +else: + print(f"Error: {result.get('error')}") +``` + +#### Supported Providers + +- **Kling AI**: High-quality virtual try-on with asynchronous processing +- **Amazon Nova Canvas**: AWS Bedrock-based virtual try-on with automatic garment detection +- **Segmind**: Fast and efficient virtual try-on generation + +#### Documentation + +For complete documentation, API reference, architecture details, and advanced usage examples, see: + +šŸ“š **[Virtual Try-On 
Agent Documentation →](https://tryonlabs.github.io/opentryon/docs/agents/vton-agent)** + +**Reference**: [Virtual Try-On Agent Documentation](https://tryonlabs.github.io/opentryon/docs/agents/vton-agent) + ### Image Generation with Nano Banana Generate high-quality images using Google's Gemini image generation models (Nano Banana and Nano Banana Pro). These models support text-to-image generation, image editing, multi-image composition, and batch generation. @@ -1632,6 +1799,7 @@ opentryon/ ā”œā”€ā”€ main.py # Main CLI entry point ā”œā”€ā”€ run_demo.py # Demo launcher (Gradio demos) ā”œā”€ā”€ vton.py # Virtual try-on CLI (Amazon Nova Canvas, Kling AI, Segmind) +ā”œā”€ā”€ vton_agent.py # Virtual try-on agent CLI (LangChain-based intelligent provider selection) ā”œā”€ā”€ image_gen.py # Image generation CLI (Nano Banana, FLUX.2) ā”œā”€ā”€ requirements.txt # Python dependencies ā”œā”€ā”€ environment.yml # Conda environment @@ -1704,6 +1872,10 @@ Key dependencies include: - uvicorn[standard] (== 0.38.0) - python-multipart (== 0.0.20) - lumaai (== 1.18.1) +- langchain (>= 1.0.0) - Latest LangChain 1.x API +- langchain-openai (>= 0.2.0) +- langchain-anthropic (>= 0.2.0) +- langchain-google-genai (>= 2.0.0) See `requirements.txt` or `environment.yml` for the complete list of dependencies. diff --git a/docs/docs/agents/vton-agent.md b/docs/docs/agents/vton-agent.md new file mode 100644 index 0000000..604931e --- /dev/null +++ b/docs/docs/agents/vton-agent.md @@ -0,0 +1,307 @@ +--- +title: Virtual Try-On Agent +description: LangChain-based agent that intelligently selects and uses the appropriate virtual try-on adapter based on user prompts. +keywords: + - virtual try-on agent + - langchain agent + - kling ai + - nova canvas + - segmind + - AI agent + - vton agent +--- + +# Virtual Try-On Agent + +A LangChain-based agent that intelligently selects and uses the appropriate virtual try-on adapter based on user prompts. 
+ +## Overview + +The Virtual Try-On Agent uses LangChain to analyze user requests and automatically select the best virtual try-on adapter. It supports multiple providers: + +- **Kling AI**: High-quality virtual try-on with asynchronous processing +- **Amazon Nova Canvas**: AWS Bedrock-based virtual try-on with automatic garment detection +- **Segmind**: Fast and efficient virtual try-on generation + +## Features + +- **Intelligent Provider Selection**: Automatically selects the adapter based on user prompts +- **Natural Language Interface**: Accepts natural language prompts describing the desired operation +- **Multiple LLM Support**: Works with OpenAI, Anthropic Claude, and Google Gemini +- **Flexible Input**: Supports file paths, URLs, and base64-encoded images +- **Error Handling**: Comprehensive error handling and reporting + +## Installation + +```bash +pip install langchain langchain-openai langchain-anthropic langchain-google-genai +``` + +**Note**: This agent uses LangChain 1.x API (`create_agent`). See [LangChain 1.x documentation](https://docs.langchain.com/oss/python/langchain/agents) for details. 
+ +## Quick Start + +```python +from tryon.agents.vton import VTOnAgent + +# Initialize the agent +agent = VTOnAgent(llm_provider="openai") + +# Generate virtual try-on +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Use Kling AI to create a virtual try-on of this shirt" +) + +print(result) +``` + +## Usage + +### Command Line Interface + +The Virtual Try-On Agent includes a command-line interface for easy usage: + +```bash +# Basic usage with default OpenAI provider +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Create a virtual try-on using Kling AI" + +# Specify LLM provider +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Nova Canvas for virtual try-on" --llm-provider anthropic + +# Use Google Gemini as LLM +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Generate try-on with Segmind" --llm-provider google + +# Specify LLM model +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Kling AI" --llm-model gpt-4-turbo-preview + +# Save output to specific directory +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Create virtual try-on" --output-dir results/ + +# Use URLs instead of file paths +python vton_agent.py --person https://example.com/person.jpg --garment https://example.com/shirt.jpg --prompt "Use Kling AI" + +# Verbose output to see agent reasoning +python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Kling AI" --verbose +``` + +#### CLI Arguments + +- `--person`, `-p`: Path or URL to person/model image (required) +- `--garment`, `-g`: Path or URL to garment/cloth image (required) +- `--prompt`: Natural language prompt describing the virtual try-on request (required) +- `--llm-provider`: LLM provider to use (default: `openai`, options: `openai`, `anthropic`, `google`) +- `--llm-model`: Specific LLM model name (optional, uses default for provider) +- `--llm-temperature`: 
Temperature for LLM (default: `0.0`) +- `--llm-api-key`: API key for LLM provider (optional, can use environment variables) +- `--output-dir`, `-o`: Directory to save generated images (default: `outputs/`) +- `--save-base64`: Also save Base64 encoded strings to .txt files +- `--verbose`: Print verbose output including agent reasoning steps + +### Python API Usage + +#### Basic Usage + +```python +from tryon.agents.vton import VTOnAgent + +agent = VTOnAgent(llm_provider="openai") + +result = agent.generate( + person_image="path/to/person.jpg", + garment_image="path/to/garment.jpg", + prompt="Generate a virtual try-on using Nova Canvas" +) +``` + +### Provider Selection + +The agent automatically selects the provider based on keywords in your prompt: + +- **Kling AI**: "kling ai", "kling", "kolors" +- **Nova Canvas**: "nova canvas", "amazon nova", "aws", "bedrock" +- **Segmind**: "segmind" + +Examples: + +```python +# Uses Kling AI +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Use Kling AI to generate the try-on" +) + +# Uses Nova Canvas +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Generate with Amazon Nova Canvas" +) + +# Uses Segmind +result = agent.generate( + person_image="person.jpg", + garment_image="shirt.jpg", + prompt="Try Segmind for this virtual try-on" +) +``` + +### Using Different LLM Providers + +```python +# OpenAI +agent = VTOnAgent(llm_provider="openai", llm_model="gpt-4-turbo-preview") + +# Anthropic Claude +agent = VTOnAgent(llm_provider="anthropic", llm_model="claude-3-opus-20240229") + +# Google Gemini +agent = VTOnAgent(llm_provider="google", llm_model="gemini-pro") +``` + +### Environment Variables + +Set the following environment variables for API keys: + +```bash +# For OpenAI +export OPENAI_API_KEY="your-openai-api-key" + +# For Anthropic +export ANTHROPIC_API_KEY="your-anthropic-api-key" + +# For Google +export 
GOOGLE_API_KEY="your-google-api-key" + +# For Virtual Try-On APIs +export KLING_AI_API_KEY="your-kling-api-key" +export KLING_AI_SECRET_KEY="your-kling-secret-key" +export SEGMIND_API_KEY="your-segmind-api-key" +export AMAZON_NOVA_REGION="us-east-1" # For Nova Canvas +``` + +## API Reference + +### VTOnAgent + +#### `__init__(llm_provider, llm_model=None, temperature=0.0, api_key=None, **llm_kwargs)` + +Initialize the Virtual Try-On Agent. + +**Parameters:** +- `llm_provider` (str): LLM provider to use. Options: "openai", "anthropic", "google" +- `llm_model` (str, optional): Specific model name. If None, uses default for provider +- `temperature` (float): Temperature for LLM (default: 0.0) +- `api_key` (str, optional): API key for LLM provider +- `**llm_kwargs`: Additional keyword arguments for LLM initialization + +#### `generate(person_image, garment_image, prompt, **kwargs)` + +Generate virtual try-on images using the agent. + +**Parameters:** +- `person_image` (str): Path or URL to the person/model image +- `garment_image` (str): Path or URL to the garment/cloth image +- `prompt` (str): Natural language prompt describing the request +- `**kwargs`: Additional parameters to pass to the agent + +**Returns:** +- Dictionary containing: + - `status`: "success" or "error" + - `provider`: Name of the provider used + - `images`: List of generated images (URLs or base64 strings) + - `result`: Full agent response + - `error`: Error message (if status is "error") + +## Architecture + +The agent uses LangChain's ReAct agent framework: + +1. **Tools**: Each virtual try-on adapter is wrapped as a LangChain tool +2. **Agent**: A ReAct agent that selects and uses tools based on user prompts +3. 
**LLM**: Language model (OpenAI, Anthropic, or Google) that powers the agent + +### Tool Structure + +Each tool follows this pattern: + +```python +@tool("provider_name_virtual_tryon", args_schema=InputSchema) +def provider_virtual_tryon(person_image, garment_image, **kwargs): + """Tool description""" + adapter = ProviderAdapter() + result = adapter.generate(...) + return result +``` + +## Examples + +### Example 1: Basic Virtual Try-On + +```python +from tryon.agents.vton import VTOnAgent + +agent = VTOnAgent(llm_provider="openai") + +result = agent.generate( + person_image="https://example.com/person.jpg", + garment_image="https://example.com/shirt.jpg", + prompt="Create a virtual try-on using Kling AI" +) + +if result["status"] == "success": + print(f"Generated {len(result['images'])} images using {result['provider']}") +else: + print(f"Error: {result.get('error')}") +``` + +### Example 2: Provider Selection + +```python +agent = VTOnAgent(llm_provider="anthropic") + +# The agent will select Kling AI based on the prompt +result = agent.generate( + person_image="person.jpg", + garment_image="dress.jpg", + prompt="I want to see how this dress looks. Use Kling AI for best quality." 
+) +``` + +### Example 3: Custom Parameters + +```python +agent = VTOnAgent(llm_provider="google") + +# The agent can extract parameters from the prompt +result = agent.generate( + person_image="person.jpg", + garment_image="pants.jpg", + prompt="Generate virtual try-on with Nova Canvas for lower body garment" +) +``` + +## Limitations + +- Currently supports only dedicated virtual try-on APIs (Kling AI, Nova Canvas, Segmind) +- Image generation APIs (Nano Banana Pro, FLUX 2 Pro, FLUX 2 Flex) are not yet integrated +- No vector store support +- Agent output parsing may need refinement for complex scenarios + +## Future Enhancements + +- Add support for image generation APIs (Nano Banana Pro, FLUX 2 Pro, FLUX 2 Flex) +- Improve prompt understanding for better parameter extraction +- Add support for batch processing +- Implement image decoding utilities +- Add result caching + +## Related Documentation + +- [Agent Ideas](./agent-ideas.md) - Overview of Fashion AI Agents ecosystem +- [API Reference - Kling AI](../api-reference/kling-ai.md) - Kling AI adapter documentation +- [API Reference - Nova Canvas](../api-reference/nova-canvas.md) - Nova Canvas adapter documentation +- [API Reference - Segmind](../api-reference/segmind.md) - Segmind adapter documentation + diff --git a/docs/sidebars.ts b/docs/sidebars.ts index ec7ab6e..bb30f62 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -101,6 +101,7 @@ const sidebars: SidebarsConfig = { items: [ 'agents/agent-ideas-summary', 'agents/agent-ideas', + 'agents/vton-agent', ], }, { diff --git a/environment.yml b/environment.yml index c63db79..3b05277 100644 --- a/environment.yml +++ b/environment.yml @@ -89,7 +89,6 @@ dependencies: - jedi==0.19.0 - jupyter-client==8.3.1 - jupyter-core==5.3.1 - - keras==2.14.0 - kiwisolver==1.4.5 - lazy-loader==0.3 - libclang==16.0.6 @@ -145,9 +144,6 @@ dependencies: - sympy==1.12 - tensorboard==2.14.1 - tensorboard-data-server==0.7.1 - - tensorflow==2.14.0 - - 
tensorflow-estimator==2.14.0 - - tensorflow-io-gcs-filesystem==0.34.0 - termcolor==2.3.0 - tifffile==2024.2.12 - tokenizers==0.19.1 @@ -172,4 +168,8 @@ dependencies: - boto3==1.40.64 - requests>=2.31.0 - PyJWT>=2.10.1 + - langchain>=1.0.0 + - langchain-openai>=0.2.0 + - langchain-anthropic>=0.2.0 + - langchain-google-genai>=2.0.0 prefix: /Users/apple/miniconda3/envs/opentryon diff --git a/requirements.txt b/requirements.txt index 05e5215..a2bc85e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,9 @@ fastapi==0.124.0 uvicorn[standard]==0.38.0 python-multipart==0.0.20 lumaai>=1.18.1 -openai>=2.9.0 \ No newline at end of file +openai>=2.9.0 +langchain>=1.0.0 +langchain-openai>=0.2.0 +langchain-anthropic>=0.2.0 +langchain-google-genai>=2.0.0 +pydantic>=2.0.0 diff --git a/setup.py b/setup.py index 16611e1..bf1d2bb 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,11 @@ "uvicorn[standard]==0.38.0", "python-multipart==0.0.20", "lumaai>=1.18.1", + "langchain>=1.0.0", + "langchain-openai>=0.2.0", + "langchain-anthropic>=0.2.0", + "langchain-google-genai>=2.0.0", + "pydantic>=2.0.0", ], keywords=[ "virtual try-on", diff --git a/tryon/agents/__init__.py b/tryon/agents/__init__.py new file mode 100644 index 0000000..3970f7e --- /dev/null +++ b/tryon/agents/__init__.py @@ -0,0 +1,12 @@ +""" +Agents Module + +This module provides AI agents for various fashion and virtual try-on tasks. +""" + +from .vton.agent import VTOnAgent + +__all__ = [ + "VTOnAgent", +] + diff --git a/tryon/agents/vton/__init__.py b/tryon/agents/vton/__init__.py new file mode 100644 index 0000000..9b90cb1 --- /dev/null +++ b/tryon/agents/vton/__init__.py @@ -0,0 +1,34 @@ +""" +Virtual Try-On Agent Module + +This module provides a LangChain-based agent for virtual try-on operations. +The agent intelligently selects and uses the appropriate adapter based on +user prompts. 
+ +Example: + >>> from tryon.agents.vton import VTOnAgent + >>> + >>> agent = VTOnAgent(llm_provider="openai") + >>> result = agent.generate( + ... person_image="person.jpg", + ... garment_image="shirt.jpg", + ... prompt="Use Kling AI to generate a virtual try-on" + ... ) +""" + +from .agent import VTOnAgent +from .tools import ( + get_vton_tools, + kling_ai_virtual_tryon, + nova_canvas_virtual_tryon, + segmind_virtual_tryon, +) + +__all__ = [ + "VTOnAgent", + "get_vton_tools", + "kling_ai_virtual_tryon", + "nova_canvas_virtual_tryon", + "segmind_virtual_tryon", +] + diff --git a/tryon/agents/vton/agent.py b/tryon/agents/vton/agent.py new file mode 100644 index 0000000..e29f729 --- /dev/null +++ b/tryon/agents/vton/agent.py @@ -0,0 +1,520 @@ +""" +Virtual Try-On Agent using LangChain + +This agent uses LangChain 1.x to intelligently select and use the appropriate +virtual try-on adapter based on user prompts. The agent receives a person image, +garment image, and a natural language prompt, then decides which adapter to use. + +Based on LangChain 1.x API: https://docs.langchain.com/oss/python/langchain/agents +""" + +import json +import asyncio +from typing import Optional, Dict, Any, List, Union + +# LangChain 1.x imports +from langchain.agents import create_agent +from langchain_openai import ChatOpenAI +from langchain_anthropic import ChatAnthropic +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_core.language_models import BaseChatModel + +from .tools import get_vton_tools, get_tool_output_from_cache + + +class VTOnAgent: + """ + LangChain-based Virtual Try-On Agent. + + This agent intelligently selects and uses the appropriate virtual try-on + adapter based on user prompts. It supports multiple adapters: + - Kling AI + - Amazon Nova Canvas + - Segmind + + The agent analyzes the user's prompt to determine which adapter to use, + then performs the virtual try-on operation. 
+ + Example: + >>> from tryon.agents.vton import VTOnAgent + >>> + >>> agent = VTOnAgent(llm_provider="openai") + >>> result = agent.generate( + ... person_image="person.jpg", + ... garment_image="shirt.jpg", + ... prompt="Use Kling AI to generate a virtual try-on" + ... ) + >>> print(result) + """ + + # Provider name mappings for prompt matching + PROVIDER_KEYWORDS = { + "kling_ai": ["kling ai", "kling", "kolors"], + "nova_canvas": ["nova canvas", "amazon nova", "aws", "bedrock", "amazon"], + "segmind": ["segmind", "segmind try-on"], + } + + def __init__( + self, + llm_provider: str = "openai", + llm_model: Optional[str] = None, + temperature: float = 0.0, + api_key: Optional[str] = None, + **llm_kwargs + ): + """ + Initialize the Virtual Try-On Agent. + + Args: + llm_provider: LLM provider to use. Options: "openai", "anthropic", "google" + llm_model: Specific model name (e.g., "gpt-4", "claude-3-opus-20240229") + If None, uses default model for the provider + temperature: Temperature for LLM (default: 0.0 for deterministic behavior) + api_key: API key for the LLM provider (if not set via environment variable) + **llm_kwargs: Additional keyword arguments for LLM initialization + + Raises: + ValueError: If llm_provider is not supported + """ + self.llm_provider = llm_provider.lower() + self.tools = get_vton_tools() + self.llm = self._initialize_llm( + llm_provider=self.llm_provider, + llm_model=llm_model, + temperature=temperature, + api_key=api_key, + **llm_kwargs + ) + self.agent = self._create_agent() + + def _initialize_llm( + self, + llm_provider: str, + llm_model: Optional[str], + temperature: float, + api_key: Optional[str], + **kwargs + ) -> BaseChatModel: + """Initialize the LLM based on provider.""" + # Ensure API key is a string, not a callable (for sync client compatibility) + if api_key and callable(api_key): + raise ValueError( + "API key must be a string, not a callable. " + "For async operations, use async methods instead." 
+ ) + + if llm_provider == "openai": + model_name = llm_model or "gpt-5.1" + llm_kwargs = { + "model": model_name, + "temperature": temperature, + **kwargs + } + # Only add api_key if provided (let it use env var if not) + if api_key: + llm_kwargs["api_key"] = api_key + return ChatOpenAI(**llm_kwargs) + elif llm_provider == "anthropic": + model_name = llm_model or "claude-sonnet-4-5-20250929" + llm_kwargs = { + "model": model_name, + "temperature": temperature, + **kwargs + } + # Only add api_key if provided (let it use env var if not) + if api_key: + llm_kwargs["api_key"] = api_key + return ChatAnthropic(**llm_kwargs) + elif llm_provider == "google": + model_name = llm_model or "gemini-2.5-pro" + llm_kwargs = { + "model": model_name, + "temperature": temperature, + **kwargs + } + # Only add google_api_key if provided (let it use env var if not) + if api_key: + llm_kwargs["google_api_key"] = api_key + return ChatGoogleGenerativeAI(**llm_kwargs) + else: + raise ValueError( + f"Unsupported LLM provider: {llm_provider}. " + f"Supported providers: 'openai', 'anthropic', 'google'" + ) + + def _create_agent(self): + """ + Create the LangChain 1.x agent using create_agent. + + Reference: https://docs.langchain.com/oss/python/langchain/agents + """ + system_prompt = """You are a helpful virtual try-on assistant. Your task is to analyze user requests and select +the appropriate virtual try-on tool based on the user's prompt. + +Available tools: +- kling_ai_virtual_tryon: Use when user mentions "kling ai", "kling", or "kolors". Best for high-quality results. +- nova_canvas_virtual_tryon: Use when user mentions "nova canvas", "amazon nova", "aws", or "bedrock". Good for AWS integration. +- segmind_virtual_tryon: Use when user mentions "segmind". Fast and efficient for quick iterations. + +User will provide: +1. Person image (path or URL) +2. Garment image (path or URL) +3. A prompt describing what they want + +Your task: +1. 
Analyze the user's prompt to identify which provider they want to use +2. Extract any additional parameters from the prompt (e.g., garment class, category) +3. Call the appropriate tool with the person_image and garment_image +4. Return the result as a JSON string with status, provider, and images fields + +If the user doesn't specify a provider, default to kling_ai_virtual_tryon for best quality. +""" + + # Create agent using LangChain 1.x API + agent = create_agent( + model=self.llm, + tools=self.tools, + system_prompt=system_prompt + ) + + return agent + + def generate( + self, + person_image: str, + garment_image: str, + prompt: str, + verbose: bool = False, + **kwargs + ) -> Dict[str, Any]: + """ + Generate virtual try-on images using the agent. + + The agent analyzes the prompt to determine which adapter to use, + then performs the virtual try-on operation. + + Args: + person_image: Path or URL to the person/model image + garment_image: Path or URL to the garment/cloth image + prompt: Natural language prompt describing the request. + Should mention the desired provider (e.g., "Use Kling AI", + "Generate with Nova Canvas", "Try Segmind") + verbose: If True, print debug information about message parsing + **kwargs: Additional parameters to pass to the agent + + Returns: + Dictionary containing: + - 'status': 'success' or 'error' + - 'provider': Name of the provider used + - 'images': List of generated images (URLs or base64 strings) + - 'result': Full agent response + - 'error': Error message (if status is 'error') + + Example: + >>> agent = VTOnAgent() + >>> result = agent.generate( + ... person_image="person.jpg", + ... garment_image="shirt.jpg", + ... prompt="Use Kling AI to create a virtual try-on of this shirt" + ... 
) + >>> print(result['images']) + """ + # Construct the input message for the agent (LangChain 1.x format) + user_message = f"""Person Image: {person_image} +Garment Image: {garment_image} +User Request: {prompt} + +Please perform virtual try-on using the appropriate tool based on the user's request.""" + + try: + # Execute the agent with streaming to show intermediate steps + if verbose: + print("\nšŸ¤” Analyzing request and selecting provider...") + + # Use streaming to capture intermediate steps + result = None + last_message_count = 0 + + async def stream_agent(): + nonlocal result, last_message_count + try: + # Use astream with stream_mode="values" to get full state at each step + async for chunk in self.agent.astream( + {"messages": [{"role": "user", "content": user_message}]}, + stream_mode="values", + **kwargs + ): + # Process each chunk - chunk is the full state at each step + if isinstance(chunk, dict): + messages = chunk.get("messages", []) + if len(messages) > last_message_count: + # New messages added + for msg in messages[last_message_count:]: + msg_type = getattr(msg, 'type', None) or (msg.get("type") if isinstance(msg, dict) else None) + + if msg_type == "ai": + # Agent is thinking/responding + content = getattr(msg, 'content', None) or (msg.get("content") if isinstance(msg, dict) else "") + if content and verbose: + # Check if agent is calling a tool + tool_calls = getattr(msg, 'tool_calls', None) or (msg.get("tool_calls") if isinstance(msg, dict) else []) + if tool_calls: + tool_names = [] + for tc in tool_calls: + if isinstance(tc, dict): + tool_names.append(tc.get("name", "unknown")) + else: + tool_names.append(getattr(tc, 'name', 'unknown')) + if tool_names: + print(f"šŸ”§ Calling tool: {', '.join(tool_names)}") + elif content.strip() and len(content.strip()) > 10: + if verbose: + print(f"šŸ’­ Agent: {content[:200]}") + + elif msg_type == "tool": + # Tool execution started/completed + if verbose: + tool_name = getattr(msg, 'name', None) or 
(msg.get("name") if isinstance(msg, dict) else "unknown") + print(f"āš™ļø Tool '{tool_name}' executing...") + + last_message_count = len(messages) + + # Always update result with latest chunk (final chunk has complete state) + result = chunk.copy() if hasattr(chunk, 'copy') else chunk + except Exception as e: + if verbose: + print(f"āš ļø Streaming error: {e}, falling back to standard execution...") + # Fallback to non-streaming + result = await self.agent.ainvoke( + {"messages": [{"role": "user", "content": user_message}]}, + **kwargs + ) + + # Run the streaming agent + asyncio.run(stream_agent()) + + # If result is still None or empty, fallback to non-streaming + if not result or not result.get("messages"): + if verbose: + print("āš ļø No result from streaming, using standard execution...") + result = asyncio.run( + self.agent.ainvoke( + {"messages": [{"role": "user", "content": user_message}]}, + **kwargs + ) + ) + + # Extract the output from messages (LangChain 1.x format) + # Result contains messages list with the conversation history + if not result: + raise ValueError("Agent execution returned no result") + + messages = result.get("messages", []) + if not messages: + # If no messages, result might be in a different format + if verbose: + print(f"āš ļø No messages found in result. 
Result keys: {list(result.keys()) if isinstance(result, dict) else 'not a dict'}") + # Try to get messages from different possible locations + if isinstance(result, dict): + # Check if messages are nested differently + for key in ["messages", "output", "state"]: + if key in result: + potential_messages = result[key] + if isinstance(potential_messages, list): + messages = potential_messages + break + output = "" + tool_output = None + + if verbose: + print(f"\nšŸ“Š Processing {len(messages)} messages...") + if messages: + for i, msg in enumerate(messages): + msg_type = getattr(msg, 'type', None) or (msg.get("type") if isinstance(msg, dict) else None) + msg_content = getattr(msg, 'content', None) or (msg.get("content") if isinstance(msg, dict) else str(msg)) + print(f" [{i}] {msg_type}: {str(msg_content)[:100]}") + else: + print(" āš ļø No messages to process") + if isinstance(result, dict): + print(f" Result keys: {list(result.keys())}") + print(f" Result preview: {str(result)[:500]}") + + # Look for tool outputs in messages (LangChain 1.x stores tool results in messages) + for message in reversed(messages): + # Check if this is a tool message with output + message_type = None + if hasattr(message, 'type'): + message_type = message.type + elif isinstance(message, dict): + message_type = message.get("type") or message.get("message_type") + + # Tool messages contain the actual tool output + # In LangChain 1.x, tool outputs are in messages with type "tool" + if message_type == "tool" or (isinstance(message, dict) and message.get("type") == "tool"): + # Extract tool output + if hasattr(message, 'content'): + tool_output = message.content + elif isinstance(message, dict): + tool_output = message.get("content", "") + + if tool_output: + if verbose: + print(f"āœ… Tool output received") + # Try to parse tool output to show provider + try: + tool_result = json.loads(tool_output) + provider = tool_result.get("provider", "unknown") + if provider != "unknown": + print(f"šŸ“ø 
Provider selected: {provider}") + except: + pass + break + + # Get assistant message content as fallback + if not output and not tool_output: + if hasattr(message, 'content'): + content = message.content + elif isinstance(message, dict): + content = message.get("content", "") + else: + content = str(message) + + if content and content.strip(): + output = content + + # Prefer tool output over assistant message + if tool_output: + output = tool_output + + # Fallback: convert last message to string if no content found + if not output and messages: + output = str(messages[-1]) + + # Try to extract structured data from the output + # The tool returns JSON strings, so parse them + parsed_result = None + try: + # First, try to parse the entire output as JSON + parsed_result = json.loads(output) + except (json.JSONDecodeError, TypeError): + # If that fails, look for JSON in the output text + try: + if "{" in output and "}" in output: + json_start = output.find("{") + json_end = output.rfind("}") + 1 + json_str = output[json_start:json_end] + parsed_result = json.loads(json_str) + except json.JSONDecodeError: + pass + + # If we successfully parsed JSON, extract images + if parsed_result: + # Check if it's an error result + if parsed_result.get("status") == "error": + return { + "status": "error", + "provider": parsed_result.get("provider", "unknown"), + "images": [], + "error": parsed_result.get("error", "Unknown error from tool"), + "result": output, + "raw_output": result + } + + # Check if we have a cache_key (new format to avoid token limits) + cache_key = parsed_result.get("cache_key") + if cache_key: + if verbose: + print(f"šŸ” Retrieving images from cache (key: {cache_key[:8]}...)") + # Retrieve full images from cache + cached_data = get_tool_output_from_cache(cache_key) + if cached_data: + images = cached_data.get("images", []) + provider = cached_data.get("provider", parsed_result.get("provider", "unknown")) + if verbose: + print(f"āœ… Retrieved {len(images)} 
image(s) from cache") + else: + if verbose: + print("āš ļø Cache miss, trying alternative extraction...") + # Cache miss, try to get from parsed result + images = parsed_result.get("images", []) + provider = parsed_result.get("provider", "unknown") + else: + # Old format - extract directly from parsed result + if verbose: + print("šŸ“„ Extracting images from tool output...") + images = parsed_result.get("images", []) + provider = parsed_result.get("provider", "unknown") + + if verbose: + print(f"āœ… Successfully extracted {len(images)} image(s) from {provider}") + + return { + "status": "success", + "provider": provider, + "images": images if isinstance(images, list) else [images] if images else [], + "result": output, + "raw_output": result + } + + # Return text output if JSON parsing fails + debug_info = f"Could not parse JSON from output. Output type: {type(output)}, Output preview: {str(output)[:200]}" + if verbose: + print(f"[DEBUG] {debug_info}") + print(f"[DEBUG] Full output: {output}") + + return { + "status": "success", + "provider": "unknown", + "images": [], + "result": output, + "raw_output": result, + "debug_info": debug_info + } + + except Exception as e: + return { + "status": "error", + "provider": "unknown", + "images": [], + "error": str(e), + "raw_output": None + } + + def generate_and_decode( + self, + person_image: str, + garment_image: str, + prompt: str, + **kwargs + ) -> List: + """ + Generate virtual try-on images and decode them to PIL Images. + + This is a convenience method that calls generate() and then decodes + the resulting images to PIL Image objects. + + Args: + person_image: Path or URL to the person/model image + garment_image: Path or URL to the garment/cloth image + prompt: Natural language prompt describing the request + **kwargs: Additional parameters + + Returns: + List of PIL Image objects + + Note: + This method requires the adapters to support generate_and_decode(). 
"""
Tools for Virtual Try-On Agent

This module provides LangChain tools for each virtual try-on adapter,
allowing the agent to select and use the appropriate adapter based on user input.

Note: Tools store full image data in a global cache to avoid token limit issues
when returning results to the LLM. The agent extracts images from this cache.
"""

import hashlib
import json
from typing import Optional

from pydantic import BaseModel, Field
from langchain.tools import tool

from tryon.api import (
    KlingAIVTONAdapter,
    AmazonNovaCanvasVTONAdapter,
    SegmindVTONAdapter,
)

# Global cache to store full tool outputs (to avoid token limits).
# NOTE(review): entries are never evicted, so memory grows with every
# generation for the lifetime of the process. Fine for a CLI run; revisit
# (e.g. bounded LRU) before embedding in a long-running service.
_tool_output_cache = {}


def _cache_images(provider: str, key_parts: list, images) -> tuple:
    """Normalize images to a list and store them in the global output cache.

    The key is an MD5 of the provider name plus a JSON encoding of the
    inputs. Prefixing the provider and JSON-encoding the parts (instead of
    "_"-joining raw strings) prevents collisions both across tools and
    across inputs that would concatenate to the same string.

    Args:
        provider: Provider identifier stored alongside the images.
        key_parts: Input values that uniquely identify this generation.
        images: Single image or list of images returned by the adapter.

    Returns:
        Tuple of (cache_key, normalized image list).
    """
    if not isinstance(images, list):
        images = [images]
    key_material = json.dumps([provider] + [str(part) for part in key_parts])
    cache_key = hashlib.md5(key_material.encode()).hexdigest()
    _tool_output_cache[cache_key] = {
        "provider": provider,
        "images": images,
    }
    return cache_key, images


def _success_json(provider: str, cache_key: str, image_count: int) -> str:
    """Build the small success payload returned to the LLM (metadata only, to avoid token limits)."""
    return json.dumps({
        "status": "success",
        "provider": provider,
        "image_count": image_count,
        "cache_key": cache_key,  # Reference to full data
        "message": "Images generated successfully. Use cache_key to retrieve full image data."
    })


def _error_json(provider: str, error: Exception) -> str:
    """Build the error payload returned to the LLM."""
    return json.dumps({
        "status": "error",
        "provider": provider,
        "error": str(error)
    })


class KlingAIVTONToolInput(BaseModel):
    """Input schema for Kling AI virtual try-on tool."""
    person_image: str = Field(description="Path or URL to the person/model image")
    garment_image: str = Field(description="Path or URL to the garment/cloth image")
    model: Optional[str] = Field(
        default=None,
        description="Optional model version (e.g., 'kolors-virtual-try-on-v1-5')"
    )


class NovaCanvasVTONToolInput(BaseModel):
    """Input schema for Amazon Nova Canvas virtual try-on tool."""
    person_image: str = Field(description="Path or URL to the person/model image")
    garment_image: str = Field(description="Path or URL to the garment/cloth image")
    mask_type: str = Field(
        default="GARMENT",
        description="Mask type: 'GARMENT' (automatic) or 'IMAGE' (custom mask)"
    )
    garment_class: Optional[str] = Field(
        default="UPPER_BODY",
        description="Garment class: 'UPPER_BODY', 'LOWER_BODY', 'FULL_BODY', or 'FOOTWEAR'"
    )


class SegmindVTONToolInput(BaseModel):
    """Input schema for Segmind virtual try-on tool."""
    person_image: str = Field(description="Path or URL to the person/model image")
    garment_image: str = Field(description="Path or URL to the garment/cloth image")
    category: str = Field(
        default="Upper body",
        description="Garment category: 'Upper body', 'Lower body', or 'Dresses'"
    )


@tool("kling_ai_virtual_tryon", args_schema=KlingAIVTONToolInput)
def kling_ai_virtual_tryon(
    person_image: str,
    garment_image: str,
    model: Optional[str] = None
) -> str:
    """
    Generate virtual try-on images using Kling AI's Kolors Virtual Try-On API.

    Kling AI provides high-quality virtual try-on with asynchronous processing
    and automatic polling. Supports high-resolution images (up to 16M pixels).

    Use this tool when the user mentions "kling ai" or "kling" in their request.

    Args:
        person_image: Path or URL to the person/model image
        garment_image: Path or URL to the garment/cloth image
        model: Optional model version (e.g., 'kolors-virtual-try-on-v1-5')

    Returns:
        JSON string containing image URLs or base64-encoded images
    """
    try:
        print(" šŸ”„ Initializing Kling AI adapter...")
        adapter = KlingAIVTONAdapter()
        print(" āš™ļø Generating virtual try-on images (this may take a moment)...")
        images = adapter.generate(
            source_image=person_image,
            reference_image=garment_image,
            model=model
        )
        print(" āœ… Kling AI generation completed")

        # Store full images in cache; return only metadata to the LLM.
        cache_key, images = _cache_images(
            "kling_ai", [person_image, garment_image, model], images
        )
        return _success_json("kling_ai", cache_key, len(images))
    except Exception as e:
        return _error_json("kling_ai", e)


@tool("nova_canvas_virtual_tryon", args_schema=NovaCanvasVTONToolInput)
def nova_canvas_virtual_tryon(
    person_image: str,
    garment_image: str,
    mask_type: str = "GARMENT",
    garment_class: Optional[str] = "UPPER_BODY"
) -> str:
    """
    Generate virtual try-on images using Amazon Nova Canvas (AWS Bedrock).

    Amazon Nova Canvas provides virtual try-on through AWS Bedrock with automatic
    garment detection and masking. Supports multiple garment classes and custom masks.
    Maximum image size: 4.1M pixels (2048x2048).

    Use this tool when the user mentions "nova canvas", "amazon nova", "aws", or "bedrock".

    Args:
        person_image: Path or URL to the person/model image
        garment_image: Path or URL to the garment/cloth image
        mask_type: 'GARMENT' for automatic detection or 'IMAGE' for custom mask
        garment_class: 'UPPER_BODY', 'LOWER_BODY', 'FULL_BODY', or 'FOOTWEAR'

    Returns:
        JSON string containing base64-encoded images
    """
    try:
        print(" šŸ”„ Initializing Amazon Nova Canvas adapter...")
        adapter = AmazonNovaCanvasVTONAdapter()
        print(" āš™ļø Generating virtual try-on images...")
        images = adapter.generate(
            source_image=person_image,
            reference_image=garment_image,
            mask_type=mask_type,
            garment_class=garment_class
        )
        print(" āœ… Nova Canvas generation completed")

        # Store full images in cache; return only metadata to the LLM.
        cache_key, images = _cache_images(
            "nova_canvas",
            [person_image, garment_image, mask_type, garment_class],
            images
        )
        return _success_json("nova_canvas", cache_key, len(images))
    except Exception as e:
        return _error_json("nova_canvas", e)


@tool("segmind_virtual_tryon", args_schema=SegmindVTONToolInput)
def segmind_virtual_tryon(
    person_image: str,
    garment_image: str,
    category: str = "Upper body"
) -> str:
    """
    Generate virtual try-on images using Segmind Try-On Diffusion API.

    Segmind provides fast and efficient virtual try-on generation with support for
    multiple garment categories. Good for quick iterations and testing.

    Use this tool when the user mentions "segmind" in their request.

    Args:
        person_image: Path or URL to the person/model image
        garment_image: Path or URL to the garment/cloth image
        category: 'Upper body', 'Lower body', or 'Dresses'

    Returns:
        JSON string containing base64-encoded images or URLs
    """
    try:
        print(" šŸ”„ Initializing Segmind adapter...")
        adapter = SegmindVTONAdapter()
        print(" āš™ļø Generating virtual try-on images...")
        images = adapter.generate(
            model_image=person_image,
            cloth_image=garment_image,
            category=category
        )
        print(" āœ… Segmind generation completed")

        # Store full images in cache (single-image responses are normalized
        # to a list by the helper); return only metadata to the LLM.
        cache_key, images = _cache_images(
            "segmind", [person_image, garment_image, category], images
        )
        return _success_json("segmind", cache_key, len(images))
    except Exception as e:
        return _error_json("segmind", e)


def get_vton_tools():
    """
    Get all available virtual try-on tools.

    Returns:
        List of LangChain tools for virtual try-on
    """
    return [
        kling_ai_virtual_tryon,
        nova_canvas_virtual_tryon,
        segmind_virtual_tryon,
    ]


def get_tool_output_from_cache(cache_key: str) -> Optional[dict]:
    """
    Retrieve full tool output from cache using cache_key.

    Args:
        cache_key: Cache key returned in tool output

    Returns:
        Dictionary with provider and images, or None if not found
    """
    return _tool_output_cache.get(cache_key)
def _decode_image(image_data):
    """Convert a tool image result to a PIL Image.

    Accepts an HTTP(S) URL (downloaded with a timeout), a base64-encoded
    string, raw bytes, or an already-constructed PIL Image.

    Raises:
        requests.HTTPError: if a URL download returns a failure status.
        Exception: propagated from base64/PIL decoding of malformed data.
    """
    if isinstance(image_data, str):
        if image_data.startswith(('http://', 'https://')):
            # Download image from URL; without a timeout a dead host would
            # hang the CLI forever.
            img_response = requests.get(image_data, timeout=120)
            img_response.raise_for_status()
            return Image.open(io.BytesIO(img_response.content))
        # Assume it's base64
        return Image.open(io.BytesIO(base64.b64decode(image_data)))
    if isinstance(image_data, bytes):
        return Image.open(io.BytesIO(image_data))
    # Already a PIL Image
    return image_data


def main():
    """CLI entry point for the Virtual Try-On agent.

    Parses arguments, runs the agent against the given person/garment
    images, and saves the resulting images (PNG, optionally base64 text).

    Returns:
        Process exit code: 0 on success, 1 on any error.
    """
    parser = argparse.ArgumentParser(
        description="Generate virtual try-on images using AI agent that intelligently selects the best provider",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage with default OpenAI provider
  python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Create a virtual try-on using Kling AI"

  # Specify LLM provider
  python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Nova Canvas for virtual try-on" --llm-provider anthropic

  # Use Google Gemini as LLM
  python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Generate try-on with Segmind" --llm-provider google

  # Specify LLM model
  python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Use Kling AI" --llm-model gpt-4-turbo-preview

  # Save output to specific directory
  python vton_agent.py --person person.jpg --garment shirt.jpg --prompt "Create virtual try-on" --output-dir results/

  # Use URLs instead of file paths
  python vton_agent.py --person https://example.com/person.jpg --garment https://example.com/shirt.jpg --prompt "Use Kling AI"
        """
    )

    # Required arguments
    parser.add_argument(
        '-p', '--person',
        type=str,
        required=True,
        help='Path or URL to person/model image'
    )

    parser.add_argument(
        '-g', '--garment',
        type=str,
        required=True,
        help='Path or URL to garment/cloth image'
    )

    parser.add_argument(
        '--prompt',
        type=str,
        required=True,
        help='Natural language prompt describing the virtual try-on request. The agent will select the provider based on keywords in the prompt (e.g., "Use Kling AI", "Generate with Nova Canvas", "Try Segmind")'
    )

    # LLM configuration
    parser.add_argument(
        '--llm-provider',
        type=str,
        default='openai',
        choices=['openai', 'anthropic', 'google'],
        help='LLM provider to use for the agent. Options: openai (default), anthropic, google'
    )

    parser.add_argument(
        '--llm-model',
        type=str,
        default=None,
        help='Specific LLM model to use (e.g., "gpt-4-turbo-preview", "claude-3-opus-20240229", "gemini-pro"). If not specified, uses default model for the provider'
    )

    parser.add_argument(
        '--llm-temperature',
        type=float,
        default=0.0,
        help='Temperature for LLM (default: 0.0 for deterministic behavior). Range: 0.0-2.0'
    )

    parser.add_argument(
        '--llm-api-key',
        type=str,
        default=None,
        help='API key for LLM provider (if not set via environment variable). For OpenAI: OPENAI_API_KEY, Anthropic: ANTHROPIC_API_KEY, Google: GOOGLE_API_KEY'
    )

    # Output configuration
    parser.add_argument(
        '-o', '--output-dir',
        type=str,
        default='outputs',
        help='Directory to save generated images. Default: outputs/'
    )

    parser.add_argument(
        '--save-base64',
        action='store_true',
        help='Also save Base64 encoded strings to .txt files (in addition to PNG images)'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Print verbose output including agent reasoning steps'
    )

    args = parser.parse_args()

    # Validate file paths (if they're local files, not URLs)
    if not args.person.startswith(('http://', 'https://')):
        if not os.path.exists(args.person):
            raise FileNotFoundError(f"Person image not found: {args.person}")

    if not args.garment.startswith(('http://', 'https://')):
        if not os.path.exists(args.garment):
            raise FileNotFoundError(f"Garment image not found: {args.garment}")

    # Create output directory if it doesn't exist
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize agent
    print("Initializing Virtual Try-On Agent...")
    print(f"  LLM Provider: {args.llm_provider}")
    if args.llm_model:
        print(f"  LLM Model: {args.llm_model}")
    print(f"  Temperature: {args.llm_temperature}")

    try:
        agent = VTOnAgent(
            llm_provider=args.llm_provider,
            llm_model=args.llm_model,
            temperature=args.llm_temperature,
            api_key=args.llm_api_key
        )
    except ValueError as e:
        print(f"\nāœ— Error initializing agent: {e}")
        print("\nPlease ensure:")
        print("  1. Required LLM API key is set in environment variables or --llm-api-key")
        print("  2. LangChain dependencies are installed: pip install langchain langchain-openai langchain-anthropic langchain-google-genai")
        return 1
    except Exception as e:
        print(f"\nāœ— Unexpected error initializing agent: {e}")
        return 1

    # Generate virtual try-on
    print(f"\n{'='*60}")
    print(f"šŸš€ Starting Virtual Try-On Generation")
    print(f"{'='*60}")
    print(f"  šŸ‘¤ Person image: {args.person}")
    print(f"  šŸ‘• Garment image: {args.garment}")
    print(f"  šŸ’¬ Prompt: {args.prompt}")
    print(f"{'='*60}\n")

    try:
        # Always show progress (verbose controls detail level)
        result = agent.generate(
            person_image=args.person,
            garment_image=args.garment,
            prompt=args.prompt,
            verbose=True  # Always show intermediate steps
        )

        if args.verbose:
            print(f"\nšŸ“‹ Full agent result:")
            print(json.dumps(result, indent=2, default=str))

        # Use .get(): an unexpected result shape should report an error,
        # not raise KeyError here.
        if result.get("status") == "error":
            print(f"\n{'='*60}")
            print(f"āŒ Error: {result.get('error', 'Unknown error')}")
            print(f"{'='*60}")
            return 1

        # Extract images
        images = result.get("images", [])
        provider = result.get("provider", "unknown")

        if not images:
            print(f"\n{'='*60}")
            print(f"āŒ Error: No images generated")
            print(f"{'='*60}")
            return 1

        print(f"\n{'='*60}")
        print(f"āœ… Successfully generated {len(images)} image(s) using {provider}")
        print(f"{'='*60}")

        # Process and save images
        saved_images = []
        for idx, image_data in enumerate(images):
            try:
                image = _decode_image(image_data)

                # Save PNG
                output_path = output_dir / f"vton_agent_result_{idx}.png"
                image.save(output_path)
                saved_images.append(output_path)
                print(f"šŸ’¾ Saved image {idx + 1}/{len(images)}: {output_path}")

                # Optionally save Base64
                if args.save_base64:
                    buffer = io.BytesIO()
                    image.save(buffer, format='PNG')
                    image_bytes = buffer.getvalue()
                    image_base64 = base64.b64encode(image_bytes).decode('utf-8')

                    output_path_txt = output_dir / f"vton_agent_result_{idx}.txt"
                    with open(output_path_txt, 'w') as f:
                        f.write(image_base64)
                    print(f"šŸ’¾ Saved Base64 string {idx + 1}: {output_path_txt}")

            except Exception as e:
                # Best-effort: keep saving the remaining images.
                print(f"āœ— Error processing image {idx + 1}: {e}")
                continue

        print(f"\n{'='*60}")
        print(f"šŸŽ‰ Complete! Saved {len(saved_images)} image(s) to {output_dir}")
        print(f"{'='*60}")
        return 0

    except ValueError as e:
        print(f"\nāœ— Error: {e}")
        return 1
    except Exception as e:
        print(f"\nāœ— Unexpected error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1


if __name__ == "__main__":
    # raise SystemExit instead of the site-provided exit() builtin, which is
    # not guaranteed to be available (e.g. when run with python -S).
    raise SystemExit(main())