diff --git a/.gitignore b/.gitignore index 78cbee6..889a72b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ output/ running_log gelab-zero-4b-preview/ -model_config.yaml \ No newline at end of file +model_config.yaml +venv/ \ No newline at end of file diff --git a/README.md b/README.md index 94eba64..3ed27f1 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -![GELab-Zero Main Image](./images/main_en.png) + +Stepfun-ai-gelab-zero-12-21-2025_09_05_PM > 👋 Hi, everyone! We are proud to present the first fully open-source GUI Agent with both model and infrastructure. Our solution features plug-and-play engineering with no cloud dependencies, giving you complete privacy control. @@ -16,18 +17,6 @@ 简体中文

-## 📰 News - -* 🎁 **[2025-12-18]** We release **Step-GUI Technical Report** on [**arXiv**](https://arxiv.org/abs/2512.15431)! -* 🎁 **[2025-12-18]** We release a more powerful **API** for GUI automation tasks. [Apply for API access here](https://wvixbzgc0u7.feishu.cn/share/base/form/shrcnNStxEmuE7aY6jTW07CZHMf)! -* 🎁 **[2025-12-12]** We release **MCP-Server** support for multi-device management and task distribution. See [Installation & Quick Start](#-installation-quick-start) and [MCP-Server Setup](#optional-mcp-server-setup) for setup instructions. -* 🎁 **[2025-12-1]** We thank the following projects and authors for providing quantization tools & tutorials: [GGUF_v1](https://huggingface.co/bartowski/stepfun-ai_GELab-Zero-4B-preview-GGUF), [GGUF_v2](https://huggingface.co/noctrex/GELab-Zero-4B-preview-GGUF), [EXL3](https://huggingface.co/ArtusDev/stepfun-ai_GELab-Zero-4B-preview-EXL3), [Tutorials_CN](http://xhslink.com/o/1WrmgHGWFYh), [Tutorials_EN](https://www.youtube.com/watch?v=4BMiDyQOpos) -* 🎁 **[2025-11-31]** We release a lightweight **4B** model GELab-Zero-4B-preview on [**Hugging Face**](https://huggingface.co/stepfun-ai/GELab-Zero-4B-preview) and [**Model Scope**](https://modelscope.cn/models/stepfun-ai/GELab-Zero-4B-preview). -* 🎁 **[2025-11-31]** We release the tasks from the [**AndroidDaily**](https://huggingface.co/datasets/stepfun-ai/AndroidDaily) benchmark. -* 🎁 **[2025-11-30]** We release the current **GELab-Zero** engineering infrastructure. -* 🎁 **[2025-10]** Our [**research**](https://github.com/summoneryhl/gelab-engine) paper on GELab-Engine is accepted by **NeurIPS 2025**. 
- - ## 📑 Table of Contents @@ -38,15 +27,6 @@ - [📝 Citation](#-citation) -## 📧 Contact - -You can contact us and communicate with us by joining our WeChat group: - -| WeChat Group | -|:-------------------------:| -| | - - ## 📖 Background @@ -377,6 +357,47 @@ AN2CVB4C28000731 device If you do not see any devices, please check if the USB cable and the USB debugging settings on your phone are correctly enabled. When connecting the phone for the first time, an authorization prompt may pop up on the phone; simply select "Allow." As shown in the image below: +### Wireless Debugging in Web UI (Recommended) + +1. **Prepare Device** + - Ensure phone and computer are on the same WiFi network + - On phone: Settings → Developer Options → Wireless Debugging (Enable) + +2. **Connect Wireless Device** + - Open Web UI (http://localhost:8866) + - Find "📶 Wireless Debugging" section in the left panel (expanded by default) + - Enter the phone's IP address (visible in phone's wireless debugging settings) + - Port defaults to 5555; change it to match your phone's settings + - Click "🔗 Connect Wireless Device" button + +3. **USB to Wireless** + - If your device is USB connected: + - Click "📡 Enable TCP/IP Mode (USB to Wireless)" + - System will automatically get device IP and enable wireless mode + - Disconnect USB cable and use wireless connection + +4. **Manage Devices** + - Click "🔄 Check Device Status" to view all connected devices + - Click "📋 ADB Device List" to get detailed device connection information + - Click "🔄 Restart ADB Service" to resolve ADB connection issues + - System will show device type: 🔌 USB or 📶 Wireless + - Click "✂️ Disconnect Wireless Device" to disconnect wireless connection + +### Command Line Method + +```bash +# Connect via WiFi +adb connect 192.168.1.100:5555 + +# Verify connection +adb devices + +# View device list +adb devices + +# Restart ADB service +adb kill-server +adb start-server
Authorization Prompt on Xiaomi @@ -396,10 +417,43 @@ cd gelab-zero # Install dependencies pip install -r requirements.txt -# To inference a single task +# To inference a single task (Command Line) python examples/run_single_task.py + +# Or use the Web UI (Recommended) +python start_web_ui.py ``` +#### Web UI Features + +The Web UI provides a more user-friendly way to interact with GELab-Zero, featuring a two-column layout: + +**Left Panel - Control** + +| Module | Features | +|--------|----------| +| **📱 Device Management** | Check device status, view device list, restart ADB service | +| **📶 Wireless Debugging** | Connect device via IP address, enable TCP/IP mode, disconnect | +| **📊 Task Monitoring** | View task status (Ready/Running/Waiting for Input), select historical Sessions | +| **💬 Command/Reply** | Enter task instructions or reply to Agent queries, supports `Ctrl+Enter` shortcut | +| **⚙️ Model Configuration** | Select model provider (auto-loaded from `model_config.yaml`), set Base URL and API Key | +| **🛠 Utilities** | Launch scrcpy screen mirroring, get installed app list | + +**Right Panel - Display** + +| Module | Features | +|--------|----------| +| **📱 Task Trajectory** | Visual replay of each execution step, including screenshots, thought process, and action details | +| **📋 Real-time Logs** | Real-time display of task execution terminal output, with clear and copy buttons | + +**Interaction Enhancements** + +- **🔄 Smart Auto-scroll**: Auto-scrolls during task execution; stops when task completes, allowing free navigation through history +- **🖼️ Image Lightbox**: Click screenshots in trajectory to view full-size, with download support +- **⌨️ Keyboard Shortcut**: `Ctrl+Enter` to quickly submit commands/replies + +After starting the Web UI, open your browser and go to `http://localhost:8866` to access the interface. 
+ ### (Optional) Step 4: Trajectory Visualization Environment Setup The trajectory will be defult saved in the `running_log/server_log/os-copilot-local-eval-logs/` directory. You can visualize the trajectory using streamlit: diff --git a/README_CN.md b/README_CN.md index d466678..6afc2bb 100644 --- a/README_CN.md +++ b/README_CN.md @@ -1,5 +1,6 @@ -![GELab-Zero 主图](./images/main_cn.png) +Stepfun-ai-gelab-zero-12-21-2025_09_05_PM + > 👋 hi大家好!我们很荣幸推出首个同时包含模型和基础设施的全开源 GUI Agent。我们的解决方案主打即插即用的工程化体验,无需依赖云端,赋予您完全的隐私控制权。 @@ -18,17 +19,6 @@ 简体中文

-## 📰 新闻 - -* 🎁 **[2025-12-18]** 我们在 **[arXiv](https://arxiv.org/abs/2512.15431)** 上发布了 **Step-GUI 技术报告**! -* 🎁 **[2025-12-18]** 我们发布了更强大的 GUI 自动化任务 **API**。[点击此处申请 API 访问权限](https://wvixbzgc0u7.feishu.cn/share/base/form/shrcnNStxEmuE7aY6jTW07CZHMf)! -* 🎁 **[2025-12-12]** 我们发布了支持多设备管理和任务分发的 **MCP-Server**。请参阅 [安装-快速开始](#-安装-快速开始) 和 [MCP-Server 配置](#可选-mcp-server-配置) 了解配置说明。 -* 🎁 **[2025-12-01]** 感谢以下项目和作者提供量化工具及教程:[GGUF_v1](https://huggingface.co/bartowski/stepfun-ai_GELab-Zero-4B-preview-GGUF)、[GGUF_v2](https://huggingface.co/noctrex/GELab-Zero-4B-preview-GGUF)、[EXL3](https://huggingface.co/ArtusDev/stepfun-ai_GELab-Zero-4B-preview-EXL3)、[中文教程](http://xhslink.com/o/1WrmgHGWFYh)、[英文教程](https://www.youtube.com/watch?v=4BMiDyQOpos)。 -* 🎁 **[2025-11-31]** 我们在 **[Hugging Face](https://huggingface.co/stepfun-ai/GELab-Zero-4B-preview)** 和 **[Model Scope](https://modelscope.cn/models/stepfun-ai/GELab-Zero-4B-preview)** 上发布了轻量级 **4B** 模型 GELab-Zero-4B-preview。 -* 🎁 **[2025-11-31]** 我们发布了 **[AndroidDaily](https://huggingface.co/datasets/stepfun-ai/AndroidDaily)** 基准测试中的任务数据。 -* 🎁 **[2025-11-30]** 我们发布了当前的 **GELab-Zero** 工程基础设施。 -* 🎁 **[2025-10]** 我们关于 GELab-Engine 的 **[研究论文](https://github.com/summoneryhl/gelab-engine)** 被 **NeurIPS 2025** 录用。 - ## 📑 目录 @@ -39,15 +29,6 @@ - [📝 引用](#-引用) -## 📧 联系我们 - -欢迎加入我们的微信群与我们联系和交流: - -| WeChat Group | -|:-------------------------:| -| | - - ## 📖 背景 随着 AI 体验日益深入消费级终端设备,移动 Agent 研究正处于从 **“可行性验证”** 向 **“大规模应用”** 转型的关键节点。虽然基于 GUI 的方案具有通用兼容性,但移动生态的碎片化带来了沉重的工程负担,阻碍了创新。GELab-Zero 旨在打破这些壁垒。 @@ -349,6 +330,48 @@ AN2CVB4C28000731 device 如果仍然无法成功安装或连接,可以参考第三方文档进行进一步排查:https://github.com/quickappcn/issues/issues/120 +### Web UI 中的无线调试(推荐) + +1. **准备设备** + - 确保手机和电脑在同一 WiFi 网络 + - 手机上:设置 → 开发者选项 → 无线调试(启用) + +2. **连接无线设备** + - 打开 Web UI (http://localhost:8866) + - 在左侧面板找到"📶 无线调试"部分(默认展开) + - 输入手机的 IP 地址(可以在手机的无线调试设置中查看) + - 端口默认为 5555,根据你的实际手机情况修改 + - 点击"🔗 连接无线设备"按钮 + +3. 
**USB 转 无线** + - 如果您的设备是 USB 连接: + - 点击"📡 启用TCP/IP模式(USB转无线)" + - 系统会自动获取设备 IP 并启用无线模式 + - 断开 USB 线后即可使用无线连接 + +4. **管理设备** + - 点击"🔄 检查设备状态"查看所有已连接的设备 + - 点击"📋 ADB设备列表"获取详细的设备连接信息 + - 点击"🔄 重启ADB服务"解决ADB连接问题 + - 系统会显示设备类型:🔌 USB 或 📶 无线 + - 点击"✂️ 断开无线设备"可以断开无线连接 + +### 命令行方式 + +```bash +# 通过 WiFi 连接 +adb connect 192.168.1.100:5555 + +# 验证连接 +adb devices + +# 查看设备列表 +adb devices + +# 重启ADB服务 +adb kill-server +adb start-server + ### Step 3: GELab-Zero Agent 运行环境搭建 完成以上步骤后,可以通过以下命令部署 GELab-Zero 的运行环境: @@ -361,10 +384,43 @@ cd gelab-zero # 安装依赖 pip install -r requirements.txt -# 运行单个任务推理示例 +# 运行单个任务推理示例(命令行方式) python examples/run_single_task.py + +# 或使用 Web UI(推荐) +python start_web_ui.py ``` +#### Web UI 功能特性 + +Web UI 提供了更友好的交互方式,界面分为左右两栏布局: + +**左栏 - 控制面板** + +| 模块 | 功能 | +|------|------| +| **📱 设备管理** | 检查设备状态、查看设备列表、重启 ADB 服务 | +| **📶 无线调试** | 通过 IP 地址无线连接设备、启用 TCP/IP 模式、断开连接 | +| **📊 任务监控** | 查看任务状态(就绪/运行中/等待输入)、选择历史 Session | +| **💬 命令/回复** | 输入任务指令或回复 Agent 询问,支持 `Ctrl+Enter` 快捷提交 | +| **⚙️ 参数配置** | 选择模型提供商(从 `model_config.yaml` 自动加载)、设置 Base URL 和 API Key | +| **🛠 实用工具** | 启动 scrcpy 屏幕镜像、获取手机应用列表 | + +**右栏 - 任务展示** + +| 模块 | 功能 | +|------|------| +| **📱 任务轨迹** | 可视化回放每个执行步骤,包含截图、思考过程、动作详情 | +| **📋 实时日志** | 实时显示任务执行的终端输出,支持清空和复制 | + +**交互优化** + +- **🔄 智能滚动**:任务运行时自动滚动到最新内容;任务结束后停止滚动,可自由翻阅历史日志 +- **🖼️ 图片放大**:点击轨迹中的截图可放大查看,支持下载 +- **⌨️ 快捷键**:`Ctrl+Enter` 快速提交命令/回复 + +启动 Web UI 后,在浏览器中访问 `http://localhost:8866` 即可使用。 + ### (可选)Step 4: 轨迹可视化环境搭建 任务轨迹会默认保存在 `running_log/server_log/os-copilot-local-eval-logs/` 目录下。你可以使用 streamlit 对轨迹进行可视化: diff --git a/copilot_agent_client/__pycache__/mcp_agent_loop.cpython-312.pyc b/copilot_agent_client/__pycache__/mcp_agent_loop.cpython-312.pyc new file mode 100644 index 0000000..a6e5dce Binary files /dev/null and b/copilot_agent_client/__pycache__/mcp_agent_loop.cpython-312.pyc differ diff --git a/copilot_agent_client/__pycache__/pu_client.cpython-312.pyc b/copilot_agent_client/__pycache__/pu_client.cpython-312.pyc new file mode 
def get_pause_signal_file():
    """Return the absolute path of the pause-signal file.

    The project root is taken to be the parent of the directory that holds
    this module; the signal file is
    ``<project_root>/tmp_screenshot/pause_signal.txt``.
    """
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(project_root, "tmp_screenshot", "pause_signal.txt")


def check_pause_signal():
    """Consume a pending external pause request, if there is one.

    The request is signalled purely by the existence of the pause-signal
    file; its content is never read. The file is deleted as part of the
    check so a single request pauses the loop only once.

    Returns:
        bool: ``True`` when the signal file existed and was removed,
        ``False`` when there was no file or it could not be removed.
    """
    pause_file = get_pause_signal_file()
    try:
        # Remove first and interpret the outcome (EAFP): this avoids the
        # window between an existence check and the delete in which another
        # process could remove or recreate the file.
        os.remove(pause_file)
    except (FileNotFoundError, NotADirectoryError):
        # No signal pending.
        return False
    except OSError as e:
        # The file is there but cannot be deleted (permissions, locked, ...).
        print(f"[WARNING] 读取/删除暂停信号文件失败: {e}")
        return False
    print(f"[DEBUG] 检测到暂停信号: {pause_file}")
    return True


def clear_pause_signal():
    """Delete the pause-signal file if it exists (best effort).

    File-system errors are ignored on purpose: a stale signal that cannot be
    removed must not prevent a task from starting. Only ``OSError`` is
    suppressed, so ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
    """
    pause_file = get_pause_signal_file()
    try:
        os.remove(pause_file)
    except OSError:
        return
    print(f"[DEBUG] 已清除暂停信号: {pause_file}")
@@ -266,6 +302,14 @@ def gui_agent_loop( # restart the steps from 0, even continuing an existing session for step_idx in range(max_steps): + # >>> 检查外部暂停信号 <<< + # >>> 检查外部暂停信号 <<< + should_pause = check_pause_signal() + if should_pause: + print(f"[PAUSED] 检测到暂停信号,暂停任务...") + stop_reason = "USER_PAUSED" + break + if not dectect_screen_on(device_id): print("Screen is off, turn on the screen first") stop_reason = "MANUAL_STOP_SCREEN_OFF" @@ -313,7 +357,15 @@ def gui_agent_loop( # assume when reply info is provided, it must be used for current step if reply_info is not None: print(f"Using reply from client: {reply_info}") - payload['observation']['query'] = reply_info + # 增强提示,确保LLM优先执行用户干预指令,不要继续完成原任务 + payload['observation']['query'] = f"""【紧急用户干预 - 最高优先级】 +用户要求:{reply_info} + +重要提示: +1. 立即停止当前正在执行的任务 +2. 优先执行用户的新指令 +3. 不要输出 COMPLETE,除非新指令已完成 +4. 根据当前屏幕状态,执行用户的新要求""" reply_info = None # reset after use server_return = agent_server.automate_step(payload) @@ -423,9 +475,11 @@ def gui_agent_loop( return_log['intermediate_logs'] = [] pass - if stop_reason in ['MANUAL_STOP_SCREEN_OFF', 'INFO_ACTION_NEEDS_REPLY', "NOT_STARTED"]: + if stop_reason in ['MANUAL_STOP_SCREEN_OFF', 'INFO_ACTION_NEEDS_REPLY', "NOT_STARTED", "USER_PAUSED"]: + pass + elif action is None: pass - elif action['action_type'].upper() == 'COMPLETE': + elif action['action_type'].upper() == 'COMPLETE': stop_reason = "TASK_COMPLETED_SUCCESSFULLY" elif action['action_type'].upper() == 'ABORT': stop_reason = "TASK_ABORTED_BY_AGENT" diff --git a/copilot_agent_server/__pycache__/__init__.cpython-312.pyc b/copilot_agent_server/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..edfdb05 Binary files /dev/null and b/copilot_agent_server/__pycache__/__init__.cpython-312.pyc differ diff --git a/copilot_agent_server/__pycache__/base_logger.cpython-312.pyc b/copilot_agent_server/__pycache__/base_logger.cpython-312.pyc new file mode 100644 index 0000000..850e3ec Binary files /dev/null and 
b/copilot_agent_server/__pycache__/base_logger.cpython-312.pyc differ diff --git a/copilot_agent_server/__pycache__/base_server.cpython-312.pyc b/copilot_agent_server/__pycache__/base_server.cpython-312.pyc new file mode 100644 index 0000000..4d741ab Binary files /dev/null and b/copilot_agent_server/__pycache__/base_server.cpython-312.pyc differ diff --git a/copilot_agent_server/__pycache__/local_server.cpython-312.pyc b/copilot_agent_server/__pycache__/local_server.cpython-312.pyc new file mode 100644 index 0000000..aa6c186 Binary files /dev/null and b/copilot_agent_server/__pycache__/local_server.cpython-312.pyc differ diff --git a/copilot_agent_server/__pycache__/local_server_logger.cpython-312.pyc b/copilot_agent_server/__pycache__/local_server_logger.cpython-312.pyc new file mode 100644 index 0000000..4aafa6f Binary files /dev/null and b/copilot_agent_server/__pycache__/local_server_logger.cpython-312.pyc differ diff --git a/copilot_agent_server/__pycache__/parser_factory.cpython-312.pyc b/copilot_agent_server/__pycache__/parser_factory.cpython-312.pyc new file mode 100644 index 0000000..297dfd5 Binary files /dev/null and b/copilot_agent_server/__pycache__/parser_factory.cpython-312.pyc differ diff --git a/copilot_agent_server/local_server.py b/copilot_agent_server/local_server.py index cec1da6..89fac18 100644 --- a/copilot_agent_server/local_server.py +++ b/copilot_agent_server/local_server.py @@ -145,6 +145,12 @@ def get_envs_acts_from_logs(logs): "frequency_penalty": 0.0, "max_tokens": 512, }) + + # 将 base_url 和 api_key 传入 args,供 ask_llm_anything 使用 + if 'base_url' in model_config: + args['base_url'] = model_config['base_url'] + if 'api_key' in model_config: + args['api_key'] = model_config['api_key'] image_preprocess = model_config.get('image_preprocess', None) diff --git a/copilot_front_end/__pycache__/mobile_action_helper.cpython-312.pyc b/copilot_front_end/__pycache__/mobile_action_helper.cpython-312.pyc new file mode 100644 index 0000000..a059dba 
Binary files /dev/null and b/copilot_front_end/__pycache__/mobile_action_helper.cpython-312.pyc differ diff --git a/copilot_front_end/__pycache__/package_map.cpython-312.pyc b/copilot_front_end/__pycache__/package_map.cpython-312.pyc new file mode 100644 index 0000000..ededb94 Binary files /dev/null and b/copilot_front_end/__pycache__/package_map.cpython-312.pyc differ diff --git a/copilot_front_end/__pycache__/pu_frontend_executor.cpython-312.pyc b/copilot_front_end/__pycache__/pu_frontend_executor.cpython-312.pyc new file mode 100644 index 0000000..aead34e Binary files /dev/null and b/copilot_front_end/__pycache__/pu_frontend_executor.cpython-312.pyc differ diff --git a/copilot_front_end/mobile_action_helper.py b/copilot_front_end/mobile_action_helper.py index a8efae6..06e8cf2 100644 --- a/copilot_front_end/mobile_action_helper.py +++ b/copilot_front_end/mobile_action_helper.py @@ -134,9 +134,27 @@ def dectect_screen_on(device_id, print_command = False): command = f"{adb_command} shell dumpsys display" if print_command: print(f"Executing command: {command}") - result = subprocess.run(command, shell=True, capture_output=True, text=True) - result.stdout = result.stdout.encode('utf-8').decode('utf-8') - screen_state = local_str_grep(result.stdout, "mScreenState").strip() + + # Use text=False (or capture_output=True default) to get bytes, avoiding implicit decoding errors + result = subprocess.run(command, shell=True, capture_output=True, text=False) + + if result.stdout: + # Decode carefully, ignoring errors if necessary + # Try utf-8 first, then gbk, or just replace errors + try: + # ADB output is usually UTF-8, but on Windows shell it might get mixed. 
+ # using errors='ignore' or 'replace' is safest for logging/grepping + output_str = result.stdout.decode('utf-8', errors='replace') + except Exception: + output_str = result.stdout.decode('gbk', errors='replace') + else: + output_str = "" + + screen_state = local_str_grep(output_str, "mScreenState") + if screen_state: + screen_state = screen_state.strip() + else: + screen_state = "" else: command = f"{adb_command} shell dumpsys display | grep mScreenState" if print_command: diff --git a/copilot_front_end/package_map.py b/copilot_front_end/package_map.py index 99401aa..f99fb4f 100644 --- a/copilot_front_end/package_map.py +++ b/copilot_front_end/package_map.py @@ -14,7 +14,7 @@ "网易有道词典": "com.youdao.dict", "百度贴吧": "com.baidu.tieba", "腾讯新闻": "com.tencent.news", - "饿了么": "me.ele", + "淘宝闪购": "me.ele", "百度输入法": "com.baidu.input", "优酷视频": "com.youku.phone", "抖音": "com.ss.android.ugc.aweme", diff --git a/copilot_tools/__pycache__/parser_0920_summary.cpython-312.pyc b/copilot_tools/__pycache__/parser_0920_summary.cpython-312.pyc new file mode 100644 index 0000000..4710ad5 Binary files /dev/null and b/copilot_tools/__pycache__/parser_0920_summary.cpython-312.pyc differ diff --git a/examples/run_single_task.py b/examples/run_single_task.py index c3720c2..ca6c716 100644 --- a/examples/run_single_task.py +++ b/examples/run_single_task.py @@ -63,49 +63,211 @@ def timed_automate_step(payload): server_instance.automate_step = timed_automate_step if __name__ == "__main__": + import argparse - # task = "打开微信,给柏茗,发helloworld" - # task = "打开 给到 app,在主页,下滑寻找,员工权益-奋斗食代,帮我领劵。如果不能领取就退出。" - # task = "open wechat to send a message 'helloworld' to 'TKJ'" - #task = "去淘宝帮我买本书" - if len(sys.argv) < 2: + parser = argparse.ArgumentParser(description="Run a single task solely.") + parser.add_argument("task", type=str, nargs='?', help="The task description.") + parser.add_argument("--device-id", type=str, help="The device ID to use.") + parser.add_argument("--model", type=str, 
default="gelab-zero-4b-preview", help="Model name.") + parser.add_argument("--base-url", type=str, help="Base URL for the model API.") + parser.add_argument("--api-key", type=str, help="API Key for the model.") + parser.add_argument("--continue-session", type=str, help="Continue an existing session by session ID.") + parser.add_argument("--injection", type=str, help="User injection command to modify task direction.") + + args = parser.parse_args() + + # 检查是否是继续模式 + is_continue_mode = args.continue_session is not None + + if not args.task and not is_continue_mode: print("❌ 错误:未传入任务参数!") print("📝 使用方法:") - print(f" python {sys.argv[0]} \"你的任务描述\"") + print(f" python {sys.argv[0]} \"你的任务描述\" [options]") print(" 示例1:python script.py \"去淘宝帮我买本书\"") - print(" 示例2:python script.py \"打开微信,给柏茗发helloworld\"") - sys.exit(1) - - task = ' '.join(sys.argv[1:]) + print(" 示例2:python script.py \"打开微信,给柏茗发helloworld\" --device-id 123456") + print(f" 示例3:python script.py --continue-session --injection \"修正指令\"") + sys.exit(1) + + task = args.task # May be None in continue mode + + # Use provided device_id or find the first available one + if args.device_id: + device_id = args.device_id + # Verify device is connected + available_devices = list_devices() + if device_id not in available_devices: + print(f"Warning: Device {device_id} not found in connected devices: {available_devices}") + else: + devices = list_devices() + if not devices: + print("❌ Error: No devices connected.") + sys.exit(1) + device_id = devices[0] + print(f"Auto-selected device: {device_id}") - # The device ID you want to use - device_id = list_devices()[0] device_wm_size = get_device_wm_size(device_id) device_info = { "device_id": device_id, "device_wm_size": device_wm_size } - + # Update model configuration based on arguments + tmp_rollout_config = local_model_config.copy() + if args.model: + tmp_rollout_config["model_config"]["model_name"] = args.model + + if args.base_url or args.api_key: + # Switch provider to 
openai if URL/Key provided, or keep local if just overriding local params? + # Assuming if URL is provided, we might want to treat it as an OpenAI-compatible endpoint + # BUT for now, let's just inject these into args or model_config if the backend supports it. + # Looking at local_server.py might be needed to see how it handles base_url/api_key. + # For 'local' provider, it might not use them. Let's assume user knows what they are doing. + # If it is 'custom' or 'openai', provider might need to change. + # FOR NOW: We just update the 'args' or specific keys if the server class supports it. + + # NOTE: The current LocalServer implementation details are not fully visible here. + # But commonly these are passed in model_config. + if args.base_url: + tmp_rollout_config["model_config"]["base_url"] = args.base_url + if args.api_key: + tmp_rollout_config["model_config"]["api_key"] = args.api_key + + # If external URL is used, we might need to change provider from 'local' to 'openai' or similar if logic dictates + if args.base_url and "local" in tmp_rollout_config["model_config"]["model_provider"]: + # Heuristic: if base_url is set, it's likely not just 'local' weights but an invalidference server + pass + + # Ensure log directories exist + if "log_dir" in tmp_server_config and not os.path.exists(tmp_server_config["log_dir"]): + os.makedirs(tmp_server_config["log_dir"], exist_ok=True) + if "image_dir" in tmp_server_config and not os.path.exists(tmp_server_config["image_dir"]): + os.makedirs(tmp_server_config["image_dir"], exist_ok=True) - tmp_rollout_config = local_model_config + # Use tmp_server_config for LocalServer initialization as it expects log_dir etc. 
l2_server = LocalServer(tmp_server_config) # 注入计时逻辑 wrap_automate_step_with_timing(l2_server) + # 执行任务并计总时间 total_start = time.time() - # Disable auto reply - evaluate_task_on_device(l2_server, device_info, task, tmp_rollout_config, reflush_app=True) + + # 使用 gui_agent_loop 支持暂停/继续 + from copilot_agent_client.mcp_agent_loop import gui_agent_loop, clear_pause_signal + + # 清除可能存在的旧暂停信号 + clear_pause_signal() + + if is_continue_mode: + # 继续已有 session + continue_session_id = args.continue_session + injection_text = args.injection or "" + + print(f"[CONTINUE] 继续 Session: {continue_session_id}") + if injection_text: + print(f"[INJECTION] 用户注入指令: {injection_text}") + print(f"Device: {device_id}") + print(f"Model: {tmp_rollout_config['model_config']['model_name']}") + + result = gui_agent_loop( + agent_server=l2_server, + agent_loop_config=tmp_rollout_config, + device_id=device_id, + max_steps=tmp_rollout_config.get('max_steps', 400), + reply_mode="pass_to_client", + session_id=continue_session_id, + reply_from_client=injection_text if injection_text else None, + ) + else: + # 新任务 + print(f"Starting task: {task}") + print(f"Device: {device_id}") + print(f"Model: {tmp_rollout_config['model_config']['model_name']}") + + result = gui_agent_loop( + agent_server=l2_server, + agent_loop_config=tmp_rollout_config, + device_id=device_id, + max_steps=tmp_rollout_config.get('max_steps', 400), + reply_mode="pass_to_client", + task=task, + ) + + # 暂停/继续循环 + total_steps = result.get('global_step_idx', 0) + while True: + stop_reason = result.get('stop_reason') + + # 情况1: 用户手动暂停 + if stop_reason == 'USER_PAUSED': + print("\n[PAUSED] 任务已暂停。请在 Web UI 输入补充信息并点击 [执行/回复] 继续...") + # 关键:这里阻塞等待 Web UI 发送输入 + # 输入格式约定: "__PAUSE_INPUT__:用户实际输入的文本" + # Web UI 需要发送这个前缀,或者我们直接接受任何输入 + user_input = input("WAITING_FOR_INPUT") + + print(f"[RESUME] 收到补充信息: {user_input}") + + remaining_steps = tmp_rollout_config.get('max_steps', 400) - total_steps + if remaining_steps <= 0: + print("[WARNING] 已达到最大步数限制") 
+ break + + result = gui_agent_loop( + agent_server=l2_server, + agent_loop_config=tmp_rollout_config, + device_id=device_id, + max_steps=remaining_steps, + reply_mode="pass_to_client", + session_id=result['session_id'], # 继续会话 + reply_from_client=user_input, # 注入补充信息 + ) + total_steps = result.get('global_step_idx', total_steps) + continue + + # 情况2: 之前的逻辑 (USER_PAUSED_WITH_NEW_PROMPT 已弃用) + elif stop_reason == 'USER_PAUSED_WITH_NEW_PROMPT': + # 兼容旧逻辑,但不应该再走到这里 + pass + + # 其他情况:INFO需要回复,或者任务结束 + break + + # original loop for INFO action handling + while result.get('stop_reason') == 'INFO_ACTION_NEEDS_REPLY': + info_action = result.get('final_action', {}).get('agent_action', {}) + print(f"\n[INFO] Agent 询问: {info_action.get('value', '未知问题')}") + print("请在 Web UI 中回复或输入回复内容:") + + # 确保 WAITING_FOR_INPUT 被刷新输出,让 Web UI 能检测到 + print("WAITING_FOR_INPUT", flush=True) + import sys + sys.stdout.flush() + reply_info = input("") + + remaining_steps = tmp_rollout_config.get('max_steps', 400) - total_steps + if remaining_steps <= 0: + print("[WARNING] 已达到最大步数限制") + break + + result = gui_agent_loop( + agent_server=l2_server, + agent_loop_config=tmp_rollout_config, + device_id=device_id, + max_steps=remaining_steps, + reply_mode="pass_to_client", + session_id=result['session_id'], + reply_from_client=reply_info, + ) + total_steps = result.get('global_step_idx', total_steps) + + # 检查是否又被暂停了 + if result.get('stop_reason') == 'USER_PAUSED_WITH_NEW_PROMPT': + continue # 继续外层的暂停/继续循环 + total_time = time.time() - total_start # 在最后加一行总时间 print(f"总计执行时间为 {total_time} 秒") - - pass - # Enable auto reply - # evaluate_task_on_device(l2_server, device_info, task, tmp_rollout_config, reflush_app=True, auto_reply=True) - - + print(f"最终状态: {result.get('stop_reason', 'UNKNOWN')}") - pass diff --git a/model_config.yaml b/model_config.yaml index 97f066f..77ea2ec 100644 --- a/model_config.yaml +++ b/model_config.yaml @@ -4,4 +4,4 @@ local: stepfun: api_base: "https://api.stepfun.com/v1" - 
api_key: "EMPTY" \ No newline at end of file + api_key: "" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index bdb38e0..ae9505c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,4 @@ tqdm requests fastmcp +gradio \ No newline at end of file diff --git a/scrcpy-win64-v3.3.3/AdbWinApi.dll b/scrcpy-win64-v3.3.3/AdbWinApi.dll new file mode 100644 index 0000000..1da794e Binary files /dev/null and b/scrcpy-win64-v3.3.3/AdbWinApi.dll differ diff --git a/scrcpy-win64-v3.3.3/AdbWinUsbApi.dll b/scrcpy-win64-v3.3.3/AdbWinUsbApi.dll new file mode 100644 index 0000000..7f75aec Binary files /dev/null and b/scrcpy-win64-v3.3.3/AdbWinUsbApi.dll differ diff --git a/scrcpy-win64-v3.3.3/SDL2.dll b/scrcpy-win64-v3.3.3/SDL2.dll new file mode 100644 index 0000000..82d2b1a Binary files /dev/null and b/scrcpy-win64-v3.3.3/SDL2.dll differ diff --git a/scrcpy-win64-v3.3.3/adb.exe b/scrcpy-win64-v3.3.3/adb.exe new file mode 100644 index 0000000..34a0fd2 Binary files /dev/null and b/scrcpy-win64-v3.3.3/adb.exe differ diff --git a/scrcpy-win64-v3.3.3/avcodec-61.dll b/scrcpy-win64-v3.3.3/avcodec-61.dll new file mode 100644 index 0000000..a44b6c6 Binary files /dev/null and b/scrcpy-win64-v3.3.3/avcodec-61.dll differ diff --git a/scrcpy-win64-v3.3.3/avformat-61.dll b/scrcpy-win64-v3.3.3/avformat-61.dll new file mode 100644 index 0000000..347e657 Binary files /dev/null and b/scrcpy-win64-v3.3.3/avformat-61.dll differ diff --git a/scrcpy-win64-v3.3.3/avutil-59.dll b/scrcpy-win64-v3.3.3/avutil-59.dll new file mode 100644 index 0000000..9b847e3 Binary files /dev/null and b/scrcpy-win64-v3.3.3/avutil-59.dll differ diff --git a/scrcpy-win64-v3.3.3/icon.png b/scrcpy-win64-v3.3.3/icon.png new file mode 100644 index 0000000..b96a1af Binary files /dev/null and b/scrcpy-win64-v3.3.3/icon.png differ diff --git a/scrcpy-win64-v3.3.3/libusb-1.0.dll b/scrcpy-win64-v3.3.3/libusb-1.0.dll new file mode 100644 index 0000000..b36c945 Binary files /dev/null and 
def check_dependencies():
    """Check that the packages the Web UI needs can be imported.

    Returns:
        bool: ``True`` when both Gradio and Pillow import successfully;
        ``False`` as soon as one of them is missing (a hint is printed).
    """
    try:
        import gradio  # noqa: F401 - imported only to probe availability
        print("OK - Gradio已安装")
    except ImportError:
        print("ERROR - Gradio未安装,请运行: pip install -r requirements.txt")
        return False

    try:
        import PIL  # noqa: F401 - imported only to probe availability
        print("OK - Pillow已安装")
    except ImportError:
        print("ERROR - Pillow未安装,请运行: pip install -r requirements.txt")
        return False

    return True


def _find_port_owner_pids(netstat_output, port, exclude=()):
    """Extract the PIDs that own local TCP port *port* from ``netstat -ano`` text.

    Only rows whose *local address* column ends in ``:<port>`` are considered.
    Rows that merely mention the port as the foreign address (i.e. clients
    connected to the server, such as a browser) are ignored, as are rows for
    longer port numbers that contain the digits (``:88660``), rows with fewer
    than five columns (UDP rows have no state column), PID 0 (TIME_WAIT rows)
    and anything listed in *exclude*.

    Args:
        netstat_output: Raw text printed by ``netstat -ano``.
        port: TCP port number to look for.
        exclude: PIDs that must never be reported (e.g. the current process).

    Returns:
        list[int]: Unique PIDs in order of first appearance.
    """
    suffix = f":{port}"
    pids = []
    for line in netstat_output.splitlines():
        parts = line.split()
        if len(parts) < 5 or not parts[1].endswith(suffix):
            continue
        try:
            pid = int(parts[-1])
        except ValueError:
            continue
        if pid > 0 and pid not in exclude and pid not in pids:
            pids.append(pid)
    return pids


def _free_port(port):
    """Force-terminate the processes bound to *port* (Windows only).

    The lookup relies on ``netstat -ano`` and ``taskkill``, which exist only
    on Windows, so the function does nothing on other platforms.
    """
    if os.name != "nt":
        return

    result = subprocess.run(
        ["netstat", "-ano"], capture_output=True, text=True, errors="replace"
    )
    owners = _find_port_owner_pids(result.stdout or "", port, exclude=(os.getpid(),))
    for pid in owners:
        print(f"发现占用端口 {port} 的进程 PID: {pid},正在尝试终止...")
        killed = subprocess.run(
            ["taskkill", "/F", "/PID", str(pid)], capture_output=True
        )
        # Only claim success when taskkill actually reported it.
        if killed.returncode == 0:
            print(f"进程 {pid} 已终止")


def main():
    """Run the pre-flight checks and launch the Gradio Web UI on port 8866.

    Steps: verify Python dependencies, probe ``adb``, make sure the ``web_ui``
    directory exists and is importable, free the target port, then build the
    UI via ``app.create_ui()`` and launch it. Exits with status 1 when a
    dependency or the ``web_ui`` directory is missing, or when the launch
    fails.
    """
    print("启动AutoGLM Web UI...")

    if not check_dependencies():
        sys.exit(1)

    # ADB is only probed; a missing/broken ADB is a warning, not a fatal error.
    print("检查ADB连接...")
    try:
        result = subprocess.run(
            ["adb", "version"], capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: adb not on PATH; SubprocessError: includes the timeout.
        print("WARNING - ADB未找到,请确保ADB已安装并添加到系统PATH")
    else:
        if result.returncode == 0:
            print("OK - ADB已安装")
        else:
            print("WARNING - ADB未正确安装,请确保ADB已添加到系统PATH")

    web_ui_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web_ui")
    if not os.path.exists(web_ui_dir):
        print("ERROR - web_ui目录不存在")
        sys.exit(1)

    # Make ``from app import create_ui`` below resolve to web_ui/app.py.
    sys.path.insert(0, web_ui_dir)

    print("\n正在启动Web界面...")
    print("正在尝试可用端口...")

    target_port = 8866
    print(f"\n检查端口 {target_port} 占用情况...")
    try:
        _free_port(target_port)
    except Exception as e:
        # Best effort: a failed cleanup must not stop the launch attempt.
        print(f"清理端口时出错: {e}")

    try:
        from app import create_ui

        demo, css, head = create_ui()

        print(f"访问地址: http://localhost:{target_port}")
        demo.launch(
            server_name="0.0.0.0",
            server_port=target_port,
            share=False,
            inbrowser=True,
            show_error=True,
            quiet=False,
            css=css,
            head=head,
        )
    except Exception as e:
        print(f"ERROR - 启动失败: {e}")
        print("请确保已安装所有依赖:pip install -r requirements.txt")
        sys.exit(1)


if __name__ == "__main__":
    main()
界面,提供友好的用户体验来使用 AutoGLM 进行 Android 设备自动化操作。 + +## 快速开始 + +### 1. 安装依赖 + +```bash +# 在项目根目录下 +pip install -r requirements.txt +``` + +### 2. 启动 Web UI + +```bash +# 方法1:使用启动脚本(推荐) +python start_web_ui.py + +# 方法2:直接运行 +cd web_ui +python app.py +``` + +### 3. 访问界面 + +打开浏览器访问:http://localhost:8866(使用 `start_web_ui.py` 启动时的端口;直接运行 `app.py` 时请以终端输出的地址为准) + +## 界面功能 + +### 左侧面板 + +- **设备状态**:检查 ADB 连接和 Android 设备状态 +- **模型配置**: + - 选项A:选择预设的模型服务配置 + - 选项B:自定义本地模型配置 +- **应用列表**:查看所有支持的应用 + +### 右侧面板 + +- **命令输入**:输入自然语言命令 +- **命令示例**:查看常用命令示例 +- **执行结果**:显示 AI 的执行过程和结果 + +## 预设配置 + +Web UI 提供了以下预设配置: + +1. **智谱AI (推荐)**:官方提供的 AutoGLM 服务 +2. **本地Ollama**:本地部署的 Ollama 服务 +3. **本地vLLM**:本地 vLLM 部署的模型服务 + +## 常用命令示例 + +- "打开美团搜索附近的火锅店" +- "发送微信消息给张三" +- "打开抖音并搜索美食视频" +- "设置明天早上8点的闹钟" +- "拍照并发送给联系人" + +## 注意事项 + +1. 确保 ADB 已正确安装并配置环境变量 +2. 确保 Android 设备已开启 USB 调试并连接电脑 +3. 确保已安装并启用 ADB Keyboard +4. 确保模型服务正在运行(如果使用本地部署) + +## 故障排除 + +### 设备未连接 +- 检查 USB 数据线是否支持数据传输 +- 确认手机已开启 USB 调试 +- 点击"检查状态"按钮查看详细错误信息 + +### 模型连接失败 +- 确认模型服务正在运行 +- 检查网络连接和 URL 配置 +- 验证 API Key 是否正确(如果需要) + +### 命令执行失败 +- 查看输出区域的错误信息 +- 确认目标应用已安装在手机上 +- 检查手机屏幕是否处于可操作状态 + +## 高级配置 + +### 修改端口 + +编辑 `web_ui/app.py` 文件,修改 `demo.launch()` 的 `server_port` 参数(通过 `start_web_ui.py` 启动时,请改为修改该脚本中的 `target_port`): + +```python +demo.launch( + server_name="0.0.0.0", + server_port=8080, # 修改为您想要的端口 + share=False, + inbrowser=True, + show_error=True +) +``` + +### 启用公网访问 + +如果您想让其他设备也能访问 Web UI,可以启用 `share` 参数: + +```python +demo.launch(share=True) # 会生成公网链接 +``` + +### 自定义主题 + +编辑 `web_ui/app.py` 中的 `theme` 参数来更改界面主题: + +```python +from gradio.themes import Soft, Base, Default + +theme = Soft() # 可选:Soft, Base, Default, Monochrome +``` \ No newline at end of file diff --git a/web_ui/__pycache__/app.cpython-312.pyc b/web_ui/__pycache__/app.cpython-312.pyc new file mode 100644 index 0000000..e1fdc41 Binary files /dev/null and b/web_ui/__pycache__/app.cpython-312.pyc differ diff --git a/web_ui/__pycache__/app_with_scrcpy.cpython-312.pyc b/web_ui/__pycache__/app_with_scrcpy.cpython-312.pyc new file mode 100644 index 
0000000..224903a Binary files /dev/null and b/web_ui/__pycache__/app_with_scrcpy.cpython-312.pyc differ diff --git a/web_ui/__pycache__/scrcpy_integration.cpython-312.pyc b/web_ui/__pycache__/scrcpy_integration.cpython-312.pyc new file mode 100644 index 0000000..5d8b784 Binary files /dev/null and b/web_ui/__pycache__/scrcpy_integration.cpython-312.pyc differ diff --git a/web_ui/app.py b/web_ui/app.py new file mode 100644 index 0000000..e6720d6 --- /dev/null +++ b/web_ui/app.py @@ -0,0 +1,1475 @@ +""" +Gradio Web UI for AutoGLM +提供用户友好的Web界面来使用AutoGLM进行Android设备自动化操作 +集成轨迹可视化功能 +""" + +import gradio as gr +import subprocess +import threading +import queue +import time +import os +import sys +import datetime +import json +import re +import glob +import yaml + +from PIL import Image +from io import BytesIO +import base64 + +# 确保能找到项目模块 +if "." not in sys.path: + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +try: + import jsonlines + from megfile import smart_open, smart_exists + HAS_MEGFILE = True +except ImportError: + HAS_MEGFILE = False + print("[WARNING] megfile/jsonlines not installed, visualization may be limited") + +# --- 轨迹可视化工具函数 --- + +def long_side_resize(image, long_side=600): + """将图片长边限制到指定尺寸""" + image = image.convert("RGB") + width, height = image.size + if max(width, height) > long_side: + if width >= height: + new_width = long_side + new_height = int(height * long_side / width) + else: + new_height = long_side + new_width = int(width * long_side / height) + image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) + return image + +def image_to_base64(image): + """将PIL图片转换为base64 URL""" + buffered = BytesIO() + image.save(buffered, format="JPEG", quality=85) + img_str = base64.b64encode(buffered.getvalue()).decode() + return f"data:image/jpeg;base64,{img_str}" + +def load_session_logs(session_id): + """加载指定session的日志""" + if not HAS_MEGFILE or not session_id: + return [] + + log_file = 
def logs_to_chatbot_messages(logs):
    """Turn a session trace into a list of Gradio Chatbot "messages" dicts.

    The first log entry is read as the task configuration and becomes an
    assistant message with the task and model name. Every later entry yields
    a user message (step number plus the screenshot, when one can be loaded)
    followed by an assistant message with the thought, the action type and
    the action as JSON (without the ``cot`` key). An entry that raises while
    being processed is reported on stdout and skipped.

    Args:
        logs: list of log entries (dicts with a ``message`` key); may be empty.

    Returns:
        List of ``{"role": ..., "content": ...}`` dicts; empty for empty input.
    """
    if not logs:
        return []

    # Entry 0 carries the task configuration.
    header = logs[0].get('message', {})
    task = header.get('task', '未知任务')
    model_name = header.get('model_config', {}).get('model_name', '未知模型')

    # Gradio messages format: {"role": "user"|"assistant", "content": "..."}
    chat = [{"role": "assistant", "content": f"### 📋 任务: {task}\n\n**模型**: {model_name}"}]

    # Remaining entries are environment/action pairs.
    for step_no, entry in enumerate(logs[1:], start=1):
        try:
            payload = entry.get('message', {})
            env = payload.get('environment', {})
            act = payload.get('action', {})

            image_url = env.get('image', '')
            thought = act.get('cot', '')
            action_type = act.get('action_type', '')

            # Try to load the screenshot, preferring the processed variant.
            screenshot = None
            if image_url and HAS_MEGFILE:
                try:
                    processed_url = image_url.replace(".jpeg", "_processed.jpeg")
                    if smart_exists(processed_url):
                        target_url = processed_url
                    else:
                        target_url = image_url

                    with smart_open(target_url, "rb") as f:
                        picture = Image.open(f)
                        # Keep it fairly large so it can be zoomed in the UI.
                        picture = long_side_resize(picture, long_side=800)
                        screenshot = image_to_base64(picture)
                except Exception as e:
                    print(f"[WARNING] 加载图片失败: {e}")

            # User side: step number, plus the screenshot as a markdown image.
            user_text = f"📱 Step {step_no}"
            if screenshot:
                user_text += f"\n\n![screenshot]({screenshot})"
            chat.append({"role": "user", "content": user_text})

            # Assistant side: thought, action type and action details.
            pieces = [f"**Step {step_no}**\n\n"]
            if thought:
                pieces.append(f"💭 **思考**: {thought}\n\n")
            pieces.append(f"🎯 **动作**: `{action_type}`\n\n")

            details = {key: value for key, value in act.items() if key != 'cot'}
            pieces.append(f"```json\n{json.dumps(details, indent=2, ensure_ascii=False)}\n```")

            chat.append({"role": "assistant", "content": "".join(pieces)})

        except Exception as e:
            print(f"[WARNING] 处理日志条目失败: {e}")
            continue

    return chat
"C:/Windows/Fonts/simhei.ttf", + ] + font_registered = False + for fp in font_paths: + if os.path.exists(fp): + pdfmetrics.registerFont(TTFont('ChineseFont', fp)) + font_registered = True + break + except Exception as e: + print(f"[WARNING] 注册中文字体失败: {e}") + font_registered = False + + # 创建 PDF + doc = SimpleDocTemplate( + output_path, + pagesize=A4, + rightMargin=20*mm, + leftMargin=20*mm, + topMargin=20*mm, + bottomMargin=20*mm + ) + + # 样式 + styles = getSampleStyleSheet() + if font_registered: + title_style = ParagraphStyle( + 'ChineseTitle', + parent=styles['Heading1'], + fontName='ChineseFont', + fontSize=18, + spaceAfter=12 + ) + body_style = ParagraphStyle( + 'ChineseBody', + parent=styles['Normal'], + fontName='ChineseFont', + fontSize=10, + leading=14 + ) + else: + title_style = styles['Heading1'] + body_style = styles['Normal'] + + # 构建内容 + story = [] + + # 标题 + config_log = logs[0] + task = config_log.get('message', {}).get('task', session_id) + story.append(Paragraph(f"任务轨迹: {task[:50]}", title_style)) + story.append(Spacer(1, 10*mm)) + + # 后续是环境-动作对 + env_act_logs = logs[1:] + for idx, log in enumerate(env_act_logs): + try: + env = log.get('message', {}).get('environment', {}) + act = log.get('message', {}).get('action', {}) + + thought = act.get('cot', '') + action_type = act.get('action_type', '') + image_url = env.get('image', '') + + # 步骤标题 + story.append(Paragraph(f"Step {idx + 1}", body_style)) + story.append(Spacer(1, 2*mm)) + + # 思考 + if thought: + thought_short = thought[:200] + "..." 
if len(thought) > 200 else thought + story.append(Paragraph(f"思考: {thought_short}", body_style)) + + # 动作 + story.append(Paragraph(f"动作: {action_type}", body_style)) + + # 截图 + if image_url and HAS_MEGFILE: + try: + processed_url = image_url.replace(".jpeg", "_processed.jpeg") + target_url = processed_url if smart_exists(processed_url) else image_url + + with smart_open(target_url, "rb") as f: + pil_img = Image.open(f) + img_w, img_h = pil_img.size + + # 保存临时文件 + temp_img_path = os.path.join(traces_dir, f"temp_{session_id}_{idx}.jpg") + pil_img = long_side_resize(pil_img, 600) + pil_img.save(temp_img_path, "JPEG", quality=80) + + # 添加到 PDF + img_w, img_h = pil_img.size + max_width = 160 * mm + max_height = 200 * mm + scale = min(max_width / img_w, max_height / img_h, 1.0) + + rl_img = RLImage(temp_img_path, width=img_w * scale, height=img_h * scale) + story.append(Spacer(1, 3*mm)) + story.append(rl_img) + except Exception as e: + print(f"[WARNING] 加载图片失败: {e}") + + story.append(Spacer(1, 8*mm)) + story.append(Paragraph("
class CommandRunner:
    """Run one external command at a time and expose its output to the web UI.

    The child process is started on a daemon thread that copies its merged
    stdout/stderr into an in-memory log, extracts the ``Session ID`` printed
    by the task script and notices when the child asks for user input.
    """

    def __init__(self):
        self.process = None             # Popen handle of the live child, else None
        self.logs = ""                  # accumulated output, guarded by log_lock
        self.is_running = False
        self.log_lock = threading.Lock()
        self.current_session_id = None  # session id parsed from the child's output
        self.waiting_for_input = False  # child printed one of the input markers
        self.paused_session_id = None   # session id saved by stop(is_pause=True)
        self.is_paused = False
        self._active_token = None       # identifies the worker that owns the state

    def start(self, cmd_args, cwd=None, env=None):
        """Launch *cmd_args* in the background.

        Returns:
            ``(ok, message)``; ``ok`` is False when a task is already running.
        """
        if self.is_running:
            return False, "当前已有任务在运行,请先停止"

        self.stop()

        with self.log_lock:
            self.logs = f"--- 任务开始: {' '.join(cmd_args)} ---\n"
            self.current_session_id = None  # reset for the new run
        print(f"\n[WebUI] 启动任务: {' '.join(cmd_args)}")

        # A fresh token per run lets a late-finishing old worker detect that
        # it no longer owns the runner's state.
        token = object()
        self._active_token = token
        self.is_running = True

        thread = threading.Thread(
            target=self._run_thread, args=(cmd_args, cwd, env, token), daemon=True
        )
        thread.start()
        return True, "任务已启动"

    def stop(self, is_pause=False):
        """Terminate the child process, killing it if it ignores terminate().

        Args:
            is_pause: when True, remember the current session id and mark the
                runner as paused so the session can be continued later.

        Returns:
            ``(True, message)``.
        """
        # Work on a snapshot: the worker thread sets self.process to None as
        # soon as the child exits, which can happen right after terminate().
        proc = self.process
        if proc is not None and proc.poll() is None:
            try:
                proc.terminate()
                try:
                    proc.wait(timeout=0.5)
                except subprocess.TimeoutExpired:
                    proc.kill()
            except Exception as e:
                self._append_log(f"\n[系统] 停止进程失败: {e}\n")

        if is_pause:
            with self.log_lock:
                self.paused_session_id = self.current_session_id
                self.is_paused = True

        self.is_running = False
        return True, "任务停止指令已发送"

    def _run_thread(self, cmd_args, cwd, env, token=None):
        """Worker body: run the child, stream its output, then reset state.

        *token* is the value ``start()`` stored in ``_active_token``; when it
        no longer matches, a newer run owns the state and this worker leaves
        it alone. ``None`` (direct call) always resets.
        """
        try:
            proc = subprocess.Popen(
                cmd_args,
                cwd=cwd,
                env=env,
                stdin=subprocess.PIPE,  # lets send_input() answer prompts
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='replace',
                bufsize=1
            )
            self.process = proc

            for line in iter(proc.stdout.readline, ''):
                if line:
                    self._append_log(line)
                    print(line, end="", flush=True)

                    # Pick up the session id announced by the task script.
                    match = re.search(r'Session ID:\s*([a-f0-9\-]+)', line)
                    if match:
                        with self.log_lock:
                            self.current_session_id = match.group(1)
                        print(f"[WebUI] 捕获到 Session ID: {self.current_session_id}")

                    # Markers that mean the child is waiting for a reply.
                    if any(k in line for k in ['Please Reply:', '回复一下', '[PAUSED]', 'WAITING_FOR_INPUT']):
                        with self.log_lock:
                            self.waiting_for_input = True

            proc.wait()
            end_msg = f"\n--- 任务结束 (代码: {proc.returncode}) ---\n"
            self._append_log(end_msg)
            print(end_msg)

        except Exception as e:
            err_msg = f"\n[系统错误] 执行异常: {str(e)}\n"
            self._append_log(err_msg)
            print(err_msg)
        finally:
            if token is None or self._active_token is token:
                self.is_running = False
                self.waiting_for_input = False
                self.process = None

    def send_input(self, text):
        """Write *text* plus a newline to the child's stdin.

        Returns:
            ``(ok, message)``; ``ok`` is False when no child is running or the
            write fails.
        """
        proc = self.process
        if proc and proc.poll() is None and proc.stdin:
            try:
                proc.stdin.write(text + "\n")
                proc.stdin.flush()
                self._append_log(f"\n[用户回复] {text}\n")
                with self.log_lock:
                    self.waiting_for_input = False
                return True, "已发送回复"
            except Exception as e:
                return False, f"发送失败: {e}"
        return False, "没有正在运行的任务"

    def _append_log(self, text):
        """Append *text* to the log, first trimming it to its last 400000 chars once it exceeds 500000."""
        with self.log_lock:
            if len(self.logs) > 500000:
                self.logs = self.logs[-400000:]
            self.logs += text

    def get_logs(self):
        """Return the accumulated log text."""
        with self.log_lock:
            return self.logs

    def get_status(self):
        """Return the status label shown in the UI (paused > waiting > running > idle)."""
        if self.is_paused:
            return "⏸ 已暂停 - 输入修正指令后点击 [执行/回复] 继续"
        if self.waiting_for_input:
            return "🟡 等待输入"
        return "🟢 运行中" if self.is_running else "⚪ 就绪"

    def get_current_session_id(self):
        """Return the session id parsed from the current run, or None."""
        with self.log_lock:
            return self.current_session_id

    def get_paused_session_id(self):
        """Return the session id saved by the last pause, or None."""
        with self.log_lock:
            return self.paused_session_id

    def is_waiting_for_input(self):
        """Return True while the child is believed to be waiting for a reply."""
        with self.log_lock:
            return self.waiting_for_input

    def clear_pause_state(self):
        """Forget the paused session and leave the paused state."""
        with self.log_lock:
            self.is_paused = False
            self.paused_session_id = None
def enable_tcpip(device_id, port="5555"):
    """Switch a device's adb daemon to TCP/IP mode and report the device IP.

    Runs ``adb -s <device_id> tcpip <port>`` and, on success, asks the device
    for its source address via ``ip route get 8.8.8.8``. The address lookup is
    best-effort: if it fails or times out the mode switch is still reported
    as successful, with the IP shown as unknown.

    Args:
        device_id: adb serial of the target device.
        port: TCP port the adb daemon should listen on.

    Returns:
        ``(ok, message)`` where ``ok`` tells whether TCP/IP mode was enabled.
    """
    if not device_id:
        return False, "启用失败: 未指定设备"

    try:
        result = subprocess.run(
            ["adb", "-s", device_id, "tcpip", str(port)],
            capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=10
        )
    except Exception as e:
        return False, f"启用TCP/IP出错: {str(e)}"

    if result.returncode != 0:
        return False, f"启用失败: {result.stderr}"

    device_ip = "未知"
    try:
        # The lookup gets its own timeout so a stalled `adb shell` cannot
        # block the UI handler indefinitely.
        ip_result = subprocess.run(
            ["adb", "-s", device_id, "shell", "ip", "route", "get", "8.8.8.8"],
            capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=10
        )
        if ip_result.returncode == 0:
            tokens = ip_result.stdout.split()
            # The address is the token right after the first "src" that has a successor.
            for i, token in enumerate(tokens[:-1]):
                if token == "src":
                    device_ip = tokens[i + 1]
                    break
    except Exception as e:
        print(f"[WARNING] 获取设备IP失败: {e}")

    return True, f"TCP/IP已启用\n设备IP: {device_ip}"
def check_adb_connection():
    """Report whether adb responds and which devices it lists.

    Starts the adb server, runs ``adb devices`` and formats every
    tab-separated row after the header as ``serial - state``.

    Returns:
        ``(ok, message)``; ``ok`` is True only when at least one device row
        was listed. A missing adb binary, a timeout or any other error yields
        ``(False, <description>)``.
    """
    try:
        subprocess.run(["adb", "start-server"], capture_output=True, text=True, timeout=5)
        listing = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=5)

        if listing.returncode != 0:
            return False, f"❌ ADB命令执行失败"

        # Skip the header row; keep only non-blank rows with serial and state.
        rows = listing.stdout.strip().split('\n')[1:]
        columns = [row.split('\t') for row in rows if row.strip()]
        found = [f"📱 {cols[0]} - {cols[1]}" for cols in columns if len(cols) >= 2]

        if not found:
            return False, "⚠️ ADB服务正常但无设备连接"
        return True, f"✅ ADB服务正常\n已连接设备:\n" + "\n".join(found)

    except FileNotFoundError:
        return False, "❌ ADB未安装或未添加到PATH"
    except subprocess.TimeoutExpired:
        return False, "❌ ADB命令超时"
    except Exception as e:
        return False, f"❌ 检查ADB连接时出错: {str(e)}"
auto !important; + height: auto !important; + object-fit: contain !important; + cursor: pointer; + transition: opacity 0.2s; + border-radius: 8px; + } + .trajectory-chatbot img:hover { + opacity: 0.85; + } + .trajectory-chatbot .message { + max-width: 100% !important; + } + + /* 命令/回复文本框滚动条样式 */ + #user-input-box textarea { + overflow-y: auto !important; + max-height: 120px !important; + } + """ + + # 灯箱脚本 - 使用head参数注入 (使用MutationObserver确保动态内容可点击) + lightbox_head = """ + + + """ + + with gr.Blocks(title="Stepfun-ai/gelab-zero") as demo: + + gr.Markdown("## 🤖 Stepfun-ai/gelab-zero 控制台") + + with gr.Row(): + # --- 左列:设备管理、配置、任务监控 --- + with gr.Column(scale=1, min_width=350): + + # 1. 设备管理 + with gr.Group(): + gr.Markdown("### 📱 设备管理") + + device_status = gr.Textbox( + label="设备状态", + value="❓ 未检查", + interactive=False, + lines=3 + ) + with gr.Row(): + check_status_btn = gr.Button("检查", size="sm", min_width=1, scale=1) + adb_devices_btn = gr.Button("列表", size="sm", min_width=1, scale=1) + restart_adb_btn = gr.Button("重启ADB", size="sm", min_width=1, scale=1) + + with gr.Accordion("📶 无线调试", open=False): + with gr.Row(): + wireless_ip = gr.Textbox(label="IP", placeholder="192.168.1.x", scale=3) + wireless_port = gr.Textbox(label="端口", value="5555", scale=1) + + with gr.Row(): + connect_wireless_btn = gr.Button("🔗 连接", variant="primary", size="sm") + disconnect_wireless_btn = gr.Button("✂️ 断开", size="sm") + + enable_tcpip_btn = gr.Button("📡 启用TCP/IP模式", size="sm") + wireless_status = gr.Textbox(label="状态", interactive=False, lines=1) + + # 2. 任务监控(放在设备管理下面) + with gr.Group(): + gr.Markdown("### 📊 任务监控") + with gr.Row(): + session_dropdown = gr.Dropdown( + label="Session", + choices=[], + value=None, + scale=20, + allow_custom_value=True, + min_width=200 + ) + with gr.Column(scale=1, min_width=60): + gr.HTML("
") # 占位符对其下拉框 + refresh_sessions_btn = gr.Button("🔄", size="sm") + + task_status = gr.Textbox( + label="任务状态", + value="⚪ 就绪", + interactive=False, + lines=1 + ) + user_input = gr.Textbox( + label="命令/回复", + placeholder="输入任务指令 或 回复Agent询问... (Ctrl+Enter 提交)", + lines=3, + max_lines=5, + elem_id="user-input-box" + ) + with gr.Row(): + submit_btn = gr.Button("▶ 执行/回复", variant="primary", scale=2, elem_id="submit-btn") + pause_btn = gr.Button("⏸ 暂停", variant="secondary", scale=1) + stop_btn = gr.Button("⏹ 停止", variant="stop", scale=1) + + # 3. 参数配置 + with gr.Accordion("⚙️ 参数配置", open=False): + try: + config_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "model_config.yaml") + with open(config_path, "r", encoding="utf-8") as f: + full_config = yaml.safe_load(f) + except Exception as e: + print(f"Error loading config: {e}") + full_config = {} + + # 准备 Provider 选项: (Display Name, Key) + provider_choices = [] + for key, val in full_config.items(): + display = val.get("display_name", key) + provider_choices.append((display, key)) + provider_choices.append(("自定义", "custom")) + + # default selection + default_prov = provider_choices[0][1] if provider_choices else "custom" + default_cfg = full_config.get(default_prov, {}) + + # 模型提供商选择 + provider_dd = gr.Dropdown( + label="模型提供商", + choices=provider_choices, + value=default_prov + ) + + # Base URL 单独一行 + base_url_input = gr.Textbox( + label="Base URL", + value=default_cfg.get("api_base", ""), + interactive=True, + placeholder="例如: http://localhost:11434/v1" + ) + + # API Key 单独一行 + api_key_input = gr.Textbox( + label="API Key", + type="password", + value=default_cfg.get("api_key", ""), + interactive=True, + placeholder="留空使用配置文件中的默认值" + ) + + # 模型名称 单独一行 + model_name_input = gr.Textbox( + label="模型名称", + value=default_cfg.get("default_model", ""), + interactive=True, + placeholder="例如: gelab-zero-4b-preview" + ) + + # 检查连接按钮和状态 + with gr.Row(): + check_model_btn = gr.Button("🔍 检查模型连接", 
size="sm") + model_status = gr.Textbox( + label="连接状态", + value="❓ 未检查", + interactive=False, + lines=2 + ) + + # Event: Provider Change + def on_provider_change(provider): + if provider == "custom": + return ( + gr.update(value="", interactive=True), + gr.update(value="", interactive=True), + gr.update(value="", interactive=True) + ) + + cfg = full_config.get(provider, {}) + new_base = cfg.get("api_base", "") + new_key = cfg.get("api_key", "") + new_model = cfg.get("default_model", "") + + return ( + gr.update(value=new_base, interactive=True), + gr.update(value=new_key, interactive=True), + gr.update(value=new_model, interactive=True) + ) + + provider_dd.change( + fn=on_provider_change, + inputs=[provider_dd], + outputs=[base_url_input, api_key_input, model_name_input] + ) + + # 检查模型连接 + def check_model_connection(base_url, model_name, api_key): + """检查模型连接状态""" + if not base_url: + return "⚠️ 请先填写 Base URL" + if not model_name: + return "⚠️ 请先填写模型名称" + + import requests + base = base_url.rstrip('/') + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + # 判断是本地还是在线 + is_local = "localhost" in base or "127.0.0.1" in base or "0.0.0.0" in base + api_type = "本地" if is_local else "在线" + + # 直接测试 /chat/completions 接口 + try: + url = base + '/chat/completions' + test_payload = { + "model": model_name, + "messages": [{"role": "user", "content": "test"}], + "max_tokens": 1 + } + response = requests.post(url, json=test_payload, headers=headers, timeout=15) + + if response.status_code == 200: + return f"✅ 连接成功 ({api_type})\n📍 {base}\n🤖 {model_name}" + elif response.status_code == 404: + return f"❌ 模型 {model_name} 不存在" + else: + try: + err_msg = response.json().get('error', {}).get('message', response.text[:80]) + except: + err_msg = response.text[:80] + return f"❌ 请求失败 ({response.status_code})\n{err_msg}" + except requests.exceptions.ConnectionError: + return f"❌ 无法连接 {base}" + except requests.exceptions.Timeout: 
+ return f"❌ 连接超时" + except Exception as e: + return f"❌ {str(e)[:60]}" + + check_model_btn.click( + fn=check_model_connection, + inputs=[base_url_input, model_name_input, api_key_input], + outputs=[model_status] + ) + + with gr.Row(): + device_dd = gr.Dropdown(label="当前设备", choices=[], value=None, scale=3) + refresh_dev_btn = gr.Button("🔄", scale=1) + + # 4. 实用工具 + with gr.Accordion("🛠 实用工具", open=False): + scrcpy_btn = gr.Button("🖥️ 启动屏幕镜像", variant="secondary") + scrcpy_status = gr.Textbox(label="状态", interactive=False, lines=1) + + list_apps_btn = gr.Button("📲 获取应用列表", size="sm") + app_list_output = gr.Textbox(label="应用列表", lines=3, interactive=False) + + # --- 右列:日志与轨迹并排(更大空间) --- + with gr.Column(scale=3, min_width=700): + with gr.Row(): + # 左边:任务轨迹 + with gr.Column(scale=1): + gr.Markdown("### 📱 任务轨迹") + trajectory_output = gr.Chatbot( + label="轨迹回放", + height=660, + show_label=False, + elem_classes=["trajectory-chatbot"] + ) + with gr.Row(): + export_pdf_btn = gr.Button("📄 导出 PDF", size="sm") + export_file = gr.File(label="下载", visible=False) + + # 右边:实时日志 + with gr.Column(scale=1): + gr.Markdown("### 📋 实时日志") + log_output = gr.Textbox( + label="终端输出", + value="", + lines=25, + max_lines=30, + interactive=False, + elem_id="log-window" + ) + with gr.Row(): + clear_log_btn = gr.Button("🗑 清空", size="sm") + copy_log_btn = gr.Button("📋 复制", size="sm") + + # --- 逻辑绑定 --- + + # 刷新设备 + def refresh_devices(): + devices, _ = get_adb_devices() + valid_devices = [d for d in devices if not d.startswith("错误") and d != "未找到设备"] + return gr.Dropdown(choices=valid_devices, value=valid_devices[0] if valid_devices else None) + + refresh_dev_btn.click(refresh_devices, outputs=device_dd) + demo.load(refresh_devices, outputs=device_dd) + + # 刷新session列表 + def refresh_sessions(): + sessions = get_available_sessions() + current = runner.get_current_session_id() + # 如果有当前session且不在列表中,添加到最前面 + if current and current not in sessions: + sessions = [current] + sessions + return 
def _task_script_path():
    """Return the absolute path of examples/run_single_task.py, one level above this file's directory."""
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(repo_root, "examples", "run_single_task.py")


def _model_and_device_args(base_url, model_name, api_key, device):
    """Build the optional CLI flags shared by the 'new task' and 'continue' commands."""
    args = []
    if base_url:
        args.extend(["--base-url", base_url])
    if model_name:
        args.extend(["--model", model_name])
    if api_key:
        # NOTE(review): the key is passed on the command line, so it may be
        # visible in the process list -- consider an environment variable.
        args.extend(["--api-key", api_key])
    if device and device != "未找到设备":
        args.extend(["--device-id", device])
    return args


def _task_env():
    """Return a copy of the environment forcing UTF-8, unbuffered child output."""
    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"
    env["PYTHONUNBUFFERED"] = "1"
    return env


def smart_submit(prompt, provider, base_url, api_key, model_name, device):
    """Dispatch the input box content according to the runner's state.

    The same button either resumes a paused session, answers a running task
    that is waiting for input, or launches a new task.

    Args:
        prompt: Text from the input box; ``None`` is treated as empty.
        provider: Selected provider; not used by this handler.
        base_url: Optional model endpoint, forwarded as ``--base-url``.
        api_key: Optional API key, forwarded as ``--api-key``.
        model_name: Optional model name, forwarded as ``--model``.
        device: Optional device id, forwarded as ``--device-id``.

    Returns:
        ``(status_text, new_input_value)``. The input value is ``""`` when the
        text was consumed; the original prompt is returned when the action
        was rejected so the user does not lose what they typed.
    """
    text = (prompt or "").strip()

    # Case 1: paused -> continue the saved session, optionally with an injection.
    if runner.is_paused:
        paused_session = runner.get_paused_session_id()
        if not paused_session:
            runner.clear_pause_state()
            return "⚠️ 没有可继续的会话", prompt

        cmd_list = [sys.executable, _task_script_path(), "--continue-session", paused_session]
        if text:
            cmd_list.extend(["--injection", text])
        cmd_list.extend(_model_and_device_args(base_url, model_name, api_key, device))

        runner.clear_pause_state()
        success, msg = runner.start(cmd_list, cwd=os.getcwd(), env=_task_env())
        return ("🟢 继续运行中" if success else f"🔴 {msg}"), ""

    # Case 2: nothing typed -> just report the current status.
    if not text:
        return runner.get_status(), ""

    # Case 3: task is running and waiting for input -> send the text as a reply.
    if runner.is_running and runner.is_waiting_for_input():
        success, msg = runner.send_input(text)
        if not success:
            # Surface the failure and keep the text instead of silently dropping it.
            return f"🔴 {msg}", prompt
        return runner.get_status(), ""

    # Case 4: task is running but not waiting -> ask the user to stop it first.
    if runner.is_running:
        return "⚠️ 任务运行中,请先暂停或停止", prompt

    # Case 5: idle -> start a new task with the prompt as positional argument.
    cmd_list = [sys.executable, _task_script_path(), prompt]
    cmd_list.extend(_model_and_device_args(base_url, model_name, api_key, device))

    success, msg = runner.start(cmd_list, cwd=os.getcwd(), env=_task_env())
    return ("🟢 运行中" if success else f"🔴 {msg}"), ""
def handle_enable_tcpip():
    """Enable TCP/IP mode on the first listed device that is not wireless.

    Device ids containing ':' are skipped (the neighbouring disconnect handler
    treats those as wireless), as are the "未找到设备" entry and entries
    starting with "错误".

    Returns:
        ``(device_status_text, wireless_status_text)``.
    """
    listed, _ = get_adb_devices()

    usb_only = []
    for name in listed:
        if ':' in name:
            continue
        if name == "未找到设备" or name.startswith("错误"):
            continue
        usb_only.append(name)

    if not usb_only:
        return "", "❌ 没有USB设备"

    ok, detail = enable_tcpip(usb_only[0])
    if ok:
        return f"✅ {detail}", "✅ TCP/IP已启用"
    return "", f"❌ {detail}"
def poll_updates(dropdown_value, was_running, last_session, user_switched):
    """Timer callback: refresh logs, status, session dropdown and trajectory.

    Args:
        dropdown_value: Current dropdown selection; not used by this handler.
        was_running: Running flag from the previous tick; not used here, only
            replaced by the fresh value in the return tuple.
        last_session: Session id seen on the previous tick (or ``None``).
        user_switched: Whether the user manually selected another session.

    Returns:
        A 7-tuple: ``(logs, status, dropdown_update, trajectory_update,
        is_running, last_session, user_switched)``.
    """
    # Function-scope import: the file's import block is outside this view.
    import logging

    log = logging.getLogger(__name__)

    logs = runner.get_logs()

    # Absolute path of the pause-signal file under <repo root>/tmp_screenshot.
    pause_file = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "tmp_screenshot",
        "pause_signal.txt",
    )
    if runner.is_waiting_for_input():
        status = "⏸ 已暂停 (等待输入...)"
    elif os.path.exists(pause_file) and runner.is_running:
        status = "⏸ 暂停信号已发送,等待任务暂停..."
    else:
        status = runner.get_status()

    is_running = runner.is_running
    current_running_session = runner.get_current_session_id()

    # Make sure the active session is selectable even before it is listed.
    sessions = get_available_sessions()
    if current_running_session and current_running_session not in sessions:
        sessions = [current_running_session] + sessions

    # A "new" session is any non-empty session id that differs from the last one.
    has_new_session = bool(current_running_session) and current_running_session != last_session

    new_last_session = current_running_session if current_running_session else last_session
    new_user_switched = user_switched

    # Follow the active session when it just appeared, or while it is running
    # and the user has not manually switched to another session.
    follow_active = has_new_session or (
        is_running and current_running_session and not user_switched
    )

    if follow_active:
        if has_new_session:
            # A new task started: forget any earlier manual switch.
            new_user_switched = False
            log.debug("[DEBUG] 新session检测到: %s", current_running_session)
        else:
            log.debug(
                "[DEBUG] 刷新运行中任务: %s, is_running=%s, user_switched=%s",
                current_running_session, is_running, user_switched,
            )
        dropdown_update = gr.update(choices=sessions, value=current_running_session)
        traj_logs = load_session_logs(current_running_session)
        log.debug("[DEBUG] 加载轨迹日志: %s 条", len(traj_logs))
        trajectory_update = logs_to_chatbot_messages(traj_logs)
    else:
        # Manual switch or no running task: refresh choices only and leave
        # the displayed trajectory untouched.
        log.debug(
            "[DEBUG] 保持轨迹不变: is_running=%s, current_session=%s, user_switched=%s",
            is_running, current_running_session, user_switched,
        )
        dropdown_update = gr.update(choices=sessions)
        trajectory_update = gr.update()

    return (
        logs,
        status,
        dropdown_update,
        trajectory_update,
        is_running,
        new_last_session,
        new_user_switched,
    )
#!/usr/bin/env python3
"""
Script for launching the Web UI directly.
"""

if __name__ == "__main__":
    import sys
    import os
    import traceback

    # Put this script's directory on the import path so `app` resolves.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, current_dir)

    try:
        from app import create_ui
    except ImportError as e:
        # Only an import failure warrants the "install dependencies" hint.
        print(f"启动失败: {e}")
        print("请确保已安装所有依赖:pip install -r requirements.txt")
        sys.exit(1)

    try:
        built = create_ui()

        launch_kwargs = dict(
            server_name="0.0.0.0",
            server_port=7861,  # use a different port
            share=False,
            inbrowser=True,
            show_error=True,
        )

        # The create_ui visible in this patch returns (demo, css, head);
        # calling .launch() on that tuple raises AttributeError. Accept
        # both the tuple and a bare Blocks object.
        if isinstance(built, tuple):
            demo, css, head = built
            # Mirrors the launch call in the UI module's own __main__ block.
            # NOTE(review): assumes the installed Gradio accepts css/head in
            # launch() -- confirm against the pinned version.
            launch_kwargs.update(css=css, head=head)
        else:
            demo = built

        print("启动AutoGLM Web UI...")
        demo.launch(**launch_kwargs)
    except Exception as e:
        # Show the real cause rather than a misleading dependency hint,
        # and signal failure to the caller through the exit status.
        print(f"启动失败: {e}")
        traceback.print_exc()
        sys.exit(1)