minitap-ai · plfavreau · Jan 7, 2026 · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026
diff --git a/llm-config.defaults.jsonc b/llm-config.defaults.jsonc
@@ -58,6 +58,18 @@
           "provider": "openai",
           "model": "gpt-5-mini"
         }
+      },
+      // video_analyzer is optional - only needed when using video recording tools
+      // Requires a video-capable model. Examples of supported models:
+      // gemini-3-pro-preview, gemini-3-flash-preview, gemini-2.5-flash,
+      // gemini-2.5-flash-lite, gemini-2.5-pro, gemini-2.0-flash, gemini-2.0-flash-lite
+      "video_analyzer": {
+        "provider": "google",
+        "model": "gemini-3-flash-preview",
+        "fallback": {
+          "provider": "google",
+          "model": "gemini-2.5-flash"
+        }
       }
     }
   },
@@ -121,6 +133,18 @@
           "provider": "minitap",
           "model": "openai/gpt-5-mini"
         }
+      },
+      // video_analyzer is optional - only needed when using video recording tools
+      // Requires a video-capable model. Examples of supported models:
+      // gemini-3-pro-preview, gemini-3-flash-preview, gemini-2.5-flash,
+      // gemini-2.5-flash-lite, gemini-2.5-pro, gemini-2.0-flash, gemini-2.0-flash-lite
+      "video_analyzer": {
+        "provider": "minitap",
+        "model": "google/gemini-3-flash-preview",
+        "fallback": {
+          "provider": "minitap",
+          "model": "google/gemini-2.5-flash"
+        }
       }
     }
   }

diff --git a/llm-config.override.template.jsonc b/llm-config.override.template.jsonc
@@ -38,6 +38,14 @@
     "outputter": {
       "provider": "",
       "model": ""
+    },
+    "video_analyzer": {
+      "provider": "",
+      "model": "",
+      "fallback": {
+        "provider": "",
+        "model": ""
+      }
     }
   }
 }
diff --git a/minitap/all-projects.code-workspace b/minitap/all-projects.code-workspace
@@ -0,0 +1,10 @@
+{
+  "folders": [
+    {
+      "path": ".."
+    }
+  ],
+  "settings": {
+    "python.languageServer": "None"
+  }
+}
diff --git a/minitap/mobile_use/agents/cortex/cortex.py b/minitap/mobile_use/agents/cortex/cortex.py
@@ -19,7 +19,11 @@
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
 from minitap.mobile_use.services.telemetry import telemetry
-from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
+from minitap.mobile_use.tools.index import (
+    EXECUTOR_WRAPPERS_TOOLS,
+    VIDEO_RECORDING_WRAPPERS,
+    format_tools_list,
+)
 from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
 from minitap.mobile_use.utils.decorators import wrap_with_callbacks
 from minitap.mobile_use.utils.logger import get_logger
@@ -43,6 +47,10 @@ async def __call__(self, state: State):
             self.ctx.execution_setup.get_locked_app_package() if self.ctx.execution_setup else None
         )
 
+        executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
+        if self.ctx.video_recording_enabled:
+            executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)
+
         system_message = Template(
             Path(__file__).parent.joinpath("cortex.md").read_text(encoding="utf-8")
         ).render(
@@ -51,7 +59,7 @@ async def __call__(self, state: State):
             subgoal_plan=state.subgoal_plan,
             current_subgoal=get_current_subgoal(state.subgoal_plan),
             executor_feedback=executor_feedback,
-            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
+            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=executor_wrappers),
             locked_app_package=current_locked_app_package,
         )
         messages = [

diff --git a/minitap/mobile_use/agents/executor/executor.py b/minitap/mobile_use/agents/executor/executor.py
@@ -9,7 +9,11 @@
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
-from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
+from minitap.mobile_use.tools.index import (
+    EXECUTOR_WRAPPERS_TOOLS,
+    VIDEO_RECORDING_WRAPPERS,
+    get_tools_from_wrappers,
+)
 from minitap.mobile_use.utils.decorators import wrap_with_callbacks
 from minitap.mobile_use.utils.logger import get_logger
 
@@ -54,8 +58,13 @@ async def __call__(self, state: State):
 
         llm = get_llm(ctx=self.ctx, name="executor")
         llm_fallback = get_llm(ctx=self.ctx, name="executor", use_fallback=True)
+
+        executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
+        if self.ctx.video_recording_enabled:
+            executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)
+
         llm_bind_tools_kwargs: dict = {
-            "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
+            "tools": get_tools_from_wrappers(self.ctx, executor_wrappers),
         }
 
         # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword

diff --git a/minitap/mobile_use/agents/planner/planner.py b/minitap/mobile_use/agents/planner/planner.py
@@ -11,7 +11,11 @@
 )
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
-from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
+from minitap.mobile_use.tools.index import (
+    EXECUTOR_WRAPPERS_TOOLS,
+    VIDEO_RECORDING_WRAPPERS,
+    format_tools_list,
+)
 from minitap.mobile_use.utils.decorators import wrap_with_callbacks
 from minitap.mobile_use.utils.logger import get_logger
 
@@ -35,11 +39,15 @@ async def __call__(self, state: State):
         )
         current_foreground_app = get_current_foreground_package(self.ctx)
 
+        executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
+        if self.ctx.video_recording_enabled:
+            executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)
+
         system_message = Template(
             Path(__file__).parent.joinpath("planner.md").read_text(encoding="utf-8")
         ).render(
             platform=self.ctx.device.mobile_platform.value,
-            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
+            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=executor_wrappers),
             locked_app_package=current_locked_app_package,
             current_foreground_app=current_foreground_app,
         )

diff --git a/minitap/mobile_use/agents/video_analyzer/__init__.py b/minitap/mobile_use/agents/video_analyzer/__init__.py
@@ -0,0 +1,5 @@
+"""Video analyzer utility for analyzing video content with Gemini models."""
+
+from minitap.mobile_use.agents.video_analyzer.video_analyzer import analyze_video
+
+__all__ = ["analyze_video"]
diff --git a/minitap/mobile_use/agents/video_analyzer/human.md b/minitap/mobile_use/agents/video_analyzer/human.md
@@ -0,0 +1,5 @@
+Please analyze the following video recording and respond to my request.
+
+---
+
+**My Request**: {{ prompt }}
diff --git a/minitap/mobile_use/agents/video_analyzer/video_analyzer.md b/minitap/mobile_use/agents/video_analyzer/video_analyzer.md
@@ -0,0 +1,37 @@
+## You are a **Video Analysis Assistant**
+
+You analyze video recordings of mobile device screens and provide accurate, detailed responses based on what you observe.
+
+---
+
+## Your Focus Areas
+
+When analyzing videos, pay attention to:
+
+- **UI elements** and their states (buttons, text fields, toggles, etc.)
+- **Text content** displayed on screen
+- **Actions that occur** (taps, scrolls, transitions, animations)
+- **Notifications or dialogs** that appear
+- **Changes in the interface** over time
+- **Audio content** if present (transcribe speech, describe sounds)
+
+---
+
+## Guidelines
+
+- **Be precise and factual** - Only describe what you can actually see or hear
+- **Note uncertainty** - If you cannot clearly see or determine something, say so
+- **Be thorough** - Capture all relevant details that relate to the user's question
+- **Use timestamps** when describing sequences of events (e.g., "At 0:05, the user taps...")
+- **Structure your response** clearly when there's a lot of information
+
+---
+
+## Response Format
+
+Adapt your response format to the user's request:
+
+- For **transcription requests**: Provide clean, readable text of what was spoken or displayed
+- For **description requests**: Give a chronological narrative of events
+- For **specific questions**: Answer directly and concisely
+- For **extraction requests**: List items clearly (e.g., notifications, text content)
diff --git a/minitap/mobile_use/agents/video_analyzer/video_analyzer.py b/minitap/mobile_use/agents/video_analyzer/video_analyzer.py
@@ -0,0 +1,99 @@
+"""
+Video Analyzer utility for analyzing video content using Gemini models.
+
+This utility sends video files to video-capable Gemini models for analysis
+and returns text descriptions based on the provided prompt.
+"""
+
+import base64
+from pathlib import Path
+
+from jinja2 import Template
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from minitap.mobile_use.context import MobileUseContext
+from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
+from minitap.mobile_use.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+# Extensions whose registered MIME subtype is not simply the extension itself.
+# Every other extension keeps the "video/<extension>" form used so far.
+# NOTE(review): Gemini's docs list types such as video/mov, video/avi and video/wmv,
+# which is why those are not rewritten to their IANA names - confirm for other providers.
+_VIDEO_MIME_OVERRIDES = {
+    ".mp4": "video/mp4",
+    ".m4v": "video/mp4",
+    ".3gp": "video/3gpp",
+    ".flv": "video/x-flv",
+}
+
+
+def _video_mime_type(video_path: Path) -> str:
+    """Return the MIME type to declare for ``video_path``, derived from its extension."""
+    suffix = video_path.suffix.lower()
+    if not suffix:
+        # Without an extension the old code produced the invalid type "video/".
+        # MP4 is the documented input format, so assume it instead.
+        return "video/mp4"
+    return _VIDEO_MIME_OVERRIDES.get(suffix, f"video/{suffix[1:]}")
+
+
+async def analyze_video(
+    ctx: MobileUseContext,
+    video_path: Path,
+    prompt: str,
+) -> str:
+    """
+    Analyze a video file using the configured ``video_analyzer`` LLM.
+
+    The file is read from disk, base64-encoded and sent inline next to the
+    rendered ``human.md`` prompt, with ``video_analyzer.md`` as system message.
+    The primary and fallback models are handed to ``with_fallback`` as the main
+    and fallback calls, each wrapped with the same timeout.
+
+    Args:
+        ctx: The MobileUseContext containing LLM configuration
+        video_path: Path to the video file (MP4 expected; other extensions are
+            declared as ``video/<extension>`` unless a known override applies)
+        prompt: The analysis prompt/question about the video
+
+    Returns:
+        Text analysis result from the model
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist
+        Exception: Errors raised while building or invoking the LLMs are not
+            caught here and propagate to the caller
+    """
+    logger.info(f"Starting video analysis for {video_path}")
+
+    if not video_path.exists():
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    # The whole file is loaded in memory because it is sent inline as base64.
+    video_base64 = base64.b64encode(video_path.read_bytes()).decode("utf-8")
+    mime_type = _video_mime_type(video_path)
+
+    prompts_dir = Path(__file__).parent
+    system_message_content = Template(
+        prompts_dir.joinpath("video_analyzer.md").read_text(encoding="utf-8")
+    ).render()
+    human_message_content = Template(
+        prompts_dir.joinpath("human.md").read_text(encoding="utf-8")
+    ).render(prompt=prompt)
+
+    messages = [
+        SystemMessage(content=system_message_content),
+        HumanMessage(
+            content=[
+                {
+                    "type": "text",
+                    "text": human_message_content,
+                },
+                {
+                    "type": "file",
+                    "source_type": "base64",
+                    "mime_type": mime_type,
+                    "data": video_base64,
+                },
+            ]
+        ),
+    ]
+
+    llm = get_llm(ctx=ctx, name="video_analyzer", is_utils=True, temperature=0.2)
+    llm_fallback = get_llm(
+        ctx=ctx, name="video_analyzer", is_utils=True, use_fallback=True, temperature=0.2
+    )
+
+    logger.info("Sending video to LLM for analysis...")
+
+    # Same budget for the primary and the fallback attempt.
+    timeout_seconds = 120
+    response = await with_fallback(
+        main_call=lambda: invoke_llm_with_timeout_message(
+            llm.ainvoke(messages), timeout_seconds=timeout_seconds
+        ),
+        fallback_call=lambda: invoke_llm_with_timeout_message(
+            llm_fallback.ainvoke(messages), timeout_seconds=timeout_seconds
+        ),
+    )
+
+    # Message content may be a plain string or a list of content parts; always
+    # hand back a string to the caller.
+    content = response.content if hasattr(response, "content") else str(response)
+    result = content if isinstance(content, str) else str(content)
+    logger.info("Video analysis completed")
+
+    return result
diff --git a/minitap/mobile_use/config.py b/minitap/mobile_use/config.py
@@ -93,14 +93,15 @@ def record_events(output_path: Path | None, events: list[str] | BaseModel | Any)
 ### LLM Configuration
 
 LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai", "minitap"]
-LLMUtilsNode = Literal["outputter", "hopper"]
+LLMUtilsNode = Literal["outputter", "hopper", "video_analyzer"]
 LLMUtilsNodeWithFallback = LLMUtilsNode
 AgentNode = Literal[
     "planner",
     "orchestrator",
     "contextor",
     "cortex",
     "executor",
+    "video_analyzer",
 ]
 AgentNodeWithFallback = AgentNode
 
@@ -158,6 +159,7 @@ def __str__(self):
 class LLMConfigUtils(BaseModel):
     outputter: LLMWithFallback
     hopper: LLMWithFallback
+    video_analyzer: LLMWithFallback | None = None
 
 
 class LLMConfig(BaseModel):
@@ -176,6 +178,8 @@ def validate_providers(self):
         self.executor.validate_provider("Executor")
         self.utils.outputter.validate_provider("Outputter")
         self.utils.hopper.validate_provider("Hopper")
+        if self.utils.video_analyzer:
+            self.utils.video_analyzer.validate_provider("VideoAnalyzer")
 
     def __str__(self):
         return f"""
@@ -187,13 +191,20 @@ def __str__(self):
 🧩 Utils:
     🔽 Hopper: {self.utils.hopper}
     📝 Outputter: {self.utils.outputter}
+    🎬 Video Analyzer: {self.utils.video_analyzer or "Not configured"}
 """
 
     def get_agent(self, item: AgentNode) -> LLMWithFallback:
         return getattr(self, item)
 
     def get_utils(self, item: LLMUtilsNode) -> LLMWithFallback:
-        return getattr(self.utils, item)
+        value = getattr(self.utils, item)
+        if value is None:
+            raise ValueError(
+                f"Utils '{item}' is not configured. "
+                f"Please add it to your LLM config or enable it via AgentConfigBuilder."
+            )
+        return value
 
 
 def get_default_llm_config() -> LLMConfig:

diff --git a/minitap/mobile_use/context.py b/minitap/mobile_use/context.py
@@ -87,6 +87,7 @@ class MobileUseContext(BaseModel):
     on_agent_thought: Callable[[AgentNode, str], Coroutine] | None = None
     on_plan_changes: Callable[[list[Subgoal], IsReplan], Coroutine] | None = None
     minitap_api_key: str | None = None
+    video_recording_enabled: bool = False
 
     def get_adb_client(self) -> AdbClient:
         if self.adb_client is None: