Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions llm-config.defaults.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@
"provider": "openai",
"model": "gpt-5-mini"
}
},
// video_analyzer is optional - only needed when using video recording tools
// Requires a video-capable model. Examples of supported models:
// gemini-3-pro-preview, gemini-3-flash-preview, gemini-2.5-flash,
// gemini-2.5-flash-lite, gemini-2.5-pro, gemini-2.0-flash, gemini-2.0-flash-lite
"video_analyzer": {
"provider": "google",
"model": "gemini-3-flash-preview",
"fallback": {
"provider": "google",
"model": "gemini-2.5-flash"
}
}
}
Comment thread
plfavreau marked this conversation as resolved.
},
Expand Down Expand Up @@ -121,6 +133,18 @@
"provider": "minitap",
"model": "openai/gpt-5-mini"
}
},
// video_analyzer is optional - only needed when using video recording tools
// Requires a video-capable model. Examples of supported models:
// gemini-3-pro-preview, gemini-3-flash-preview, gemini-2.5-flash,
// gemini-2.5-flash-lite, gemini-2.5-pro, gemini-2.0-flash, gemini-2.0-flash-lite
"video_analyzer": {
"provider": "minitap",
"model": "google/gemini-3-flash-preview",
"fallback": {
"provider": "minitap",
"model": "google/gemini-2.5-flash"
}
}
}
}
Expand Down
8 changes: 8 additions & 0 deletions llm-config.override.template.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@
"outputter": {
"provider": "",
"model": ""
},
"video_analyzer": {
"provider": "",
"model": "",
"fallback": {
"provider": "",
"model": ""
}
}
}
}
10 changes: 10 additions & 0 deletions minitap/all-projects.code-workspace
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"folders": [
{
"path": ".."
}
],
"settings": {
"python.languageServer": "None"
}
}
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file has been pushed by mistake and is removed in a next commit x)

12 changes: 10 additions & 2 deletions minitap/mobile_use/agents/cortex/cortex.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
from minitap.mobile_use.graph.state import State
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
from minitap.mobile_use.services.telemetry import telemetry
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
from minitap.mobile_use.tools.index import (
EXECUTOR_WRAPPERS_TOOLS,
VIDEO_RECORDING_WRAPPERS,
format_tools_list,
)
from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
from minitap.mobile_use.utils.logger import get_logger
Expand All @@ -43,6 +47,10 @@ async def __call__(self, state: State):
self.ctx.execution_setup.get_locked_app_package() if self.ctx.execution_setup else None
)

executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
if self.ctx.video_recording_enabled:
executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)

system_message = Template(
Path(__file__).parent.joinpath("cortex.md").read_text(encoding="utf-8")
).render(
Expand All @@ -51,7 +59,7 @@ async def __call__(self, state: State):
subgoal_plan=state.subgoal_plan,
current_subgoal=get_current_subgoal(state.subgoal_plan),
executor_feedback=executor_feedback,
executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=executor_wrappers),
locked_app_package=current_locked_app_package,
)
messages = [
Expand Down
13 changes: 11 additions & 2 deletions minitap/mobile_use/agents/executor/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from minitap.mobile_use.context import MobileUseContext
from minitap.mobile_use.graph.state import State
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
from minitap.mobile_use.tools.index import (
EXECUTOR_WRAPPERS_TOOLS,
VIDEO_RECORDING_WRAPPERS,
get_tools_from_wrappers,
)
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
from minitap.mobile_use.utils.logger import get_logger

Expand Down Expand Up @@ -54,8 +58,13 @@ async def __call__(self, state: State):

llm = get_llm(ctx=self.ctx, name="executor")
llm_fallback = get_llm(ctx=self.ctx, name="executor", use_fallback=True)

executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
if self.ctx.video_recording_enabled:
executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)

llm_bind_tools_kwargs: dict = {
"tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
"tools": get_tools_from_wrappers(self.ctx, executor_wrappers),
}

# ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
Expand Down
12 changes: 10 additions & 2 deletions minitap/mobile_use/agents/planner/planner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
)
from minitap.mobile_use.graph.state import State
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
from minitap.mobile_use.tools.index import (
EXECUTOR_WRAPPERS_TOOLS,
VIDEO_RECORDING_WRAPPERS,
format_tools_list,
)
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
from minitap.mobile_use.utils.logger import get_logger

Expand All @@ -35,11 +39,15 @@ async def __call__(self, state: State):
)
current_foreground_app = get_current_foreground_package(self.ctx)

executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
if self.ctx.video_recording_enabled:
executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)

system_message = Template(
Path(__file__).parent.joinpath("planner.md").read_text(encoding="utf-8")
).render(
platform=self.ctx.device.mobile_platform.value,
executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=executor_wrappers),
locked_app_package=current_locked_app_package,
current_foreground_app=current_foreground_app,
)
Expand Down
5 changes: 5 additions & 0 deletions minitap/mobile_use/agents/video_analyzer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Video analyzer utility for analyzing video content with Gemini models."""

from minitap.mobile_use.agents.video_analyzer.video_analyzer import analyze_video

__all__ = ["analyze_video"]
5 changes: 5 additions & 0 deletions minitap/mobile_use/agents/video_analyzer/human.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Please analyze the following video recording and respond to my request.

---

**My Request**: {{ prompt }}
37 changes: 37 additions & 0 deletions minitap/mobile_use/agents/video_analyzer/video_analyzer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## You are a **Video Analysis Assistant**

You analyze video recordings of mobile device screens and provide accurate, detailed responses based on what you observe.

---

## Your Focus Areas

When analyzing videos, pay attention to:

- **UI elements** and their states (buttons, text fields, toggles, etc.)
- **Text content** displayed on screen
- **Actions that occur** (taps, scrolls, transitions, animations)
- **Notifications or dialogs** that appear
- **Changes in the interface** over time
- **Audio content** if present (transcribe speech, describe sounds)

---

## Guidelines

- **Be precise and factual** - Only describe what you can actually see or hear
- **Note uncertainty** - If you cannot clearly see or determine something, say so
- **Be thorough** - Capture all relevant details that relate to the user's question
- **Use timestamps** when describing sequences of events (e.g., "At 0:05, the user taps...")
- **Structure your response** clearly when there's a lot of information

---

## Response Format

Adapt your response format to the user's request:

- For **transcription requests**: Provide clean, readable text of what was spoken or displayed
- For **description requests**: Give a chronological narrative of events
- For **specific questions**: Answer directly and concisely
- For **extraction requests**: List items clearly (e.g., notifications, text content)
99 changes: 99 additions & 0 deletions minitap/mobile_use/agents/video_analyzer/video_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
Video Analyzer utility for analyzing video content using Gemini models.

This utility sends video files to video-capable Gemini models for analysis
and returns text descriptions based on the provided prompt.
"""

import base64
from pathlib import Path

from jinja2 import Template
from langchain_core.messages import HumanMessage, SystemMessage

from minitap.mobile_use.context import MobileUseContext
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
from minitap.mobile_use.utils.logger import get_logger

logger = get_logger(__name__)


async def analyze_video(
    ctx: MobileUseContext,
    video_path: Path,
    prompt: str,
) -> str:
    """
    Analyze a video file using a video-capable Gemini model.

    The video is read from disk, base64-encoded and sent inline together with
    the rendered system and human prompts. The main "video_analyzer" LLM is
    tried first and the configured fallback is used if it fails.

    Args:
        ctx: The MobileUseContext containing LLM configuration
        video_path: Path to the video file (MP4)
        prompt: The analysis prompt/question about the video

    Returns:
        Text analysis result from the model

    Raises:
        FileNotFoundError: If video_path does not point to an existing file
        Exception: If video analysis fails
    """
    logger.info(f"Starting video analysis for {video_path}")

    # is_file() also rejects directories, which exists() would let through
    # only to fail later with a less helpful error when opening the path.
    if not video_path.is_file():
        raise FileNotFoundError(f"Video file not found: {video_path}")

    video_base64 = base64.b64encode(video_path.read_bytes()).decode("utf-8")
    mime_type = _video_mime_type(video_path)

    prompts_dir = Path(__file__).parent
    system_message_content = Template(
        prompts_dir.joinpath("video_analyzer.md").read_text(encoding="utf-8")
    ).render()
    human_message_content = Template(
        prompts_dir.joinpath("human.md").read_text(encoding="utf-8")
    ).render(prompt=prompt)

    messages = [
        SystemMessage(content=system_message_content),
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": human_message_content,
                },
                {
                    "type": "file",
                    "source_type": "base64",
                    "mime_type": mime_type,
                    "data": video_base64,
                },
            ]
        ),
    ]

    llm = get_llm(ctx=ctx, name="video_analyzer", is_utils=True, temperature=0.2)
    llm_fallback = get_llm(
        ctx=ctx, name="video_analyzer", is_utils=True, use_fallback=True, temperature=0.2
    )

    logger.info("Sending video to LLM for analysis...")

    response = await with_fallback(
        main_call=lambda: invoke_llm_with_timeout_message(
            llm.ainvoke(messages), timeout_seconds=120
        ),
        fallback_call=lambda: invoke_llm_with_timeout_message(
            llm_fallback.ainvoke(messages), timeout_seconds=120
        ),
    )

    content = response.content if hasattr(response, "content") else str(response)
    result = _content_to_text(content)
    logger.info("Video analysis completed")

    return result


def _video_mime_type(video_path: Path) -> str:
    """Return the MIME type to declare for the video, based on its extension."""
    extension = video_path.suffix.lower().lstrip(".")
    # A missing extension would otherwise yield the invalid type "video/";
    # default to MP4, which is the documented input format.
    if extension in ("", "mp4", "m4v"):
        return "video/mp4"
    return f"video/{extension}"


def _content_to_text(content: object) -> str:
    """Flatten a message content payload (string or list of blocks) to text."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Content may arrive as a list of plain strings and/or dict blocks
        # carrying a "text" entry; keep only the textual parts.
        text_parts = []
        for block in content:
            if isinstance(block, str):
                text_parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                text_parts.append(block["text"])
        if text_parts:
            return "".join(text_parts)
    # Unrecognized shapes keep the previous behavior of being stringified.
    return str(content)
15 changes: 13 additions & 2 deletions minitap/mobile_use/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,15 @@ def record_events(output_path: Path | None, events: list[str] | BaseModel | Any)
### LLM Configuration

LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai", "minitap"]
LLMUtilsNode = Literal["outputter", "hopper"]
LLMUtilsNode = Literal["outputter", "hopper", "video_analyzer"]
LLMUtilsNodeWithFallback = LLMUtilsNode
AgentNode = Literal[
"planner",
"orchestrator",
"contextor",
"cortex",
"executor",
"video_analyzer",
]
Comment thread
coderabbitai[bot] marked this conversation as resolved.
AgentNodeWithFallback = AgentNode

Expand Down Expand Up @@ -158,6 +159,7 @@ def __str__(self):
# LLM settings for the utility nodes; every entry is an LLMWithFallback.
class LLMConfigUtils(BaseModel):
    # Required utilities: both must be present in the config.
    outputter: LLMWithFallback
    hopper: LLMWithFallback
    # Optional utility: stays None when no video analyzer is configured, in
    # which case LLMConfig.get_utils("video_analyzer") raises ValueError.
    video_analyzer: LLMWithFallback | None = None


class LLMConfig(BaseModel):
Expand All @@ -176,6 +178,8 @@ def validate_providers(self):
self.executor.validate_provider("Executor")
self.utils.outputter.validate_provider("Outputter")
self.utils.hopper.validate_provider("Hopper")
if self.utils.video_analyzer:
self.utils.video_analyzer.validate_provider("VideoAnalyzer")

def __str__(self):
return f"""
Expand All @@ -187,13 +191,20 @@ def __str__(self):
🧩 Utils:
πŸ”½ Hopper: {self.utils.hopper}
πŸ“ Outputter: {self.utils.outputter}
🎬 Video Analyzer: {self.utils.video_analyzer or "Not configured"}
"""

def get_agent(self, item: AgentNode) -> LLMWithFallback:
return getattr(self, item)

def get_utils(self, item: LLMUtilsNode) -> LLMWithFallback:
return getattr(self.utils, item)
value = getattr(self.utils, item)
if value is None:
raise ValueError(
f"Utils '{item}' is not configured. "
f"Please add it to your LLM config or enable it via AgentConfigBuilder."
)
return value


def get_default_llm_config() -> LLMConfig:
Expand Down
1 change: 1 addition & 0 deletions minitap/mobile_use/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class MobileUseContext(BaseModel):
on_agent_thought: Callable[[AgentNode, str], Coroutine] | None = None
on_plan_changes: Callable[[list[Subgoal], IsReplan], Coroutine] | None = None
minitap_api_key: str | None = None
video_recording_enabled: bool = False

def get_adb_client(self) -> AdbClient:
if self.adb_client is None:
Expand Down
Loading