
Commit ebb9366

feat: integrate omniparserv2 api
1 parent de256f5 commit ebb9366

4 files changed: +253 -1


operate/config.py

+2 -1

@@ -147,7 +147,8 @@ def validation(self, model, voice_mode):
         self.require_api_key(
             "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3"
         )
-        self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl")
+        self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl"
+                             or model == "qwen-vl-with-omniparser")

     def require_api_key(self, key_name, key_description, is_required):
         key_exists = bool(os.environ.get(key_name))
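With this change, either Qwen model name triggers the Qwen key check. A minimal sketch of exercising the updated validation, assuming `Config` is importable from `operate.config` as in this repo's layout:

import os
from operate.config import Config

config = Config()

# With QWEN_API_KEY unset, validation should flag the missing key for the
# new model name as well, not only for plain "qwen-vl".
os.environ.pop("QWEN_API_KEY", None)
config.validation(model="qwen-vl-with-omniparser", voice_mode=False)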

operate/models/apis.py

+114

@@ -24,6 +24,7 @@
     get_label_coordinates,
 )
 from operate.utils.ocr import get_text_coordinates, get_text_element
+from operate.utils.omniparser import OmniParserClient
 from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot
 from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET
@@ -40,6 +41,9 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "qwen-vl":
         operation = await call_qwen_vl_with_ocr(messages, objective, model)
         return operation, None
+    if model == "qwen-vl-with-omniparser":
+        operation = await call_qwen_vl_with_omniparser(messages, objective, model)
+        return operation, None
     if model == "gpt-4-with-som":
         operation = await call_gpt_4o_labeled(messages, objective, model)
         return operation, None
@@ -678,6 +682,116 @@ async def call_gpt_4o_labeled(messages, objective, model):
     return call_gpt_4o(messages)


+async def call_qwen_vl_with_omniparser(messages, objective, model):
+    if config.verbose:
+        print("[call_qwen_vl_with_omniparser]")
+
+    try:
+        time.sleep(1)
+        client = config.initialize_qwen()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        # Capture the screen with the cursor
+        raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png")
+        capture_screen_with_cursor(raw_screenshot_filename)
+
+        # Use OmniParser to parse the screenshot into labeled elements
+        som_screenshot_filename = os.path.join(screenshots_dir, "som_screenshot.jpeg")
+        omni_parser_client = OmniParserClient(
+            os.getenv("OMNIPARSER_BASE_URL", "http://localhost:8000")
+        )
+        parsed_res = omni_parser_client.parse_screenshot(
+            raw_screenshot_filename, som_screenshot_filename
+        )
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if config.verbose:
+            print("[call_qwen_vl_with_omniparser] user_prompt", user_prompt)
+
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{parsed_res['som_image_base64']}"
+                    },
+                },
+                {"type": "text", "text": parsed_res["screen_info"]},
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="qwen2.5-vl-72b-instruct",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+        content = clean_json(content)
+        if config.verbose:
+            print("[call_qwen_vl_with_omniparser] content", content)
+
+        assistant_message = {"role": "assistant", "content": content}
+        messages.append(assistant_message)
+        content = json.loads(content)
+        processed_content = []
+
+        for operation in content:
+            print("[call_qwen_vl_with_omniparser] for operation in content", operation)
+            if operation.get("operation") == "click":
+                box_id = operation.get("id")
+                if config.verbose:
+                    print("[Self Operating Computer][call_qwen_vl_with_omniparser] box_id", box_id)
+
+                # Resolve the OmniParser box ID into normalized click coordinates
+                x_percent, y_percent = OmniParserClient.get_click_position(
+                    int(box_id), parsed_res["parsed_content_list"]
+                )
+                operation["x"] = x_percent
+                operation["y"] = y_percent
+                if config.verbose:
+                    print("[Self Operating Computer][call_qwen_vl_with_omniparser] new click operation", operation)
+                processed_content.append(operation)
+            else:
+                if config.verbose:
+                    print("[Self Operating Computer][call_qwen_vl_with_omniparser] append non-click operation", operation)
+                processed_content.append(operation)
+
+        return processed_content
+
+    except Exception as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return call_gpt_4o(messages)
+
+
 def call_ollama_llava(messages):
     if config.verbose:
         print("[call_ollama_llava]")

operate/models/prompts.py

+69

@@ -128,6 +128,68 @@
 """


+SYSTEM_PROMPT_OMNIPARSER = """
+You are operating a {operating_system} computer, using the same operating system as a human.
+
+From looking at the screen, the objective, and your previous actions, take the next best series of actions.
+
+You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be parsed with `json.loads`.
+
+1. click - Move mouse and click - We labeled the clickable elements with colored bounding boxes and numeric IDs. The elements are also listed after the screenshot as `ID: x, Text: ...` or `ID: x, Icon: ...` lines, with `x` being a number.
+```
+[{{ "thought": "write a thought here", "operation": "click", "id": "the ID of the element to click (e.g. 10)" }}]  # 'id' refers to the ID of the colored box
+```
+2. write - Write with your keyboard
+```
+[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
+```
+3. press - Use a hotkey or press key to operate the computer
+```
+[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
+```
+4. done - The objective is completed
+```
+[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
+```
+
+Return the actions in array format `[]`. You can take just one action or multiple actions.
+
+Here are a few helpful examples:
+
+Example 1: Searches for Google Chrome on the OS and opens it
+```
+[
+    {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }},
+    {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}
+]
+```
+
+Example 2: Focuses on the address bar in a browser before typing a website
+```
+[
+    {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }},
+    {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
+    {{ "thought": "I'll need to press enter to go to the URL now", "operation": "press", "keys": ["enter"] }}
+]
+```
+
+Example 3: Send a "Hello World" message in the chat
+```
+[
+    {{ "thought": "I see a message field on this page near the button. It looks like it has a label", "operation": "click", "id": "34" }},
+    {{ "thought": "Now that I am focused on the message field, I'll go ahead and write", "operation": "write", "content": "Hello World" }}
+]
+```
+
+A few important notes:
+
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+Objective: {objective}
+"""
+
+
 # TODO: Add an example or instruction about `Action: press ['pagedown']` to scroll
 SYSTEM_PROMPT_OCR = """
 You are operating a {operating_system} computer, using the same operating system as a human.

@@ -232,6 +294,13 @@ def get_system_prompt(model, objective):
             os_search_str=os_search_str,
             operating_system=operating_system,
         )
+    elif model == "qwen-vl-with-omniparser":
+        prompt = SYSTEM_PROMPT_OMNIPARSER.format(
+            objective=objective,
+            cmd_string=cmd_string,
+            os_search_str=os_search_str,
+            operating_system=operating_system,
+        )
     elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":

         prompt = SYSTEM_PROMPT_OCR.format(
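To sanity-check the new branch, the prompt selector can be called directly. A minimal sketch (the signature comes from the hunk header above; the objective string is made up):

from operate.models.prompts import get_system_prompt

prompt = get_system_prompt("qwen-vl-with-omniparser", "Open Hacker News in Chrome")
print(prompt)  # SYSTEM_PROMPT_OMNIPARSER with the OS-specific values filled in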

operate/utils/omniparser.py

+68

@@ -0,0 +1,68 @@
+import base64
+
+import requests
+
+
+def reformat_messages(response_json: dict):
+    """
+    Build a readable screen_info string from the parsed content list.
+
+    Example of a screen_info:
+    ID: 1, Text: xlt
+    ID: 2, Text: 4t8
+    ID: 3, Text: Rt
+    ID: 4, Text: BA
+    ID: 5, Text: #B
+    ID: 6, Text: 16.04
+    ID: 7, Text: YouTube
+    ID: 8, Text: youtube.com
+    """
+    screen_info = ""
+    for idx, element in enumerate(response_json["parsed_content_list"]):
+        element["idx"] = idx
+        if element["type"] == "text":
+            screen_info += f'ID: {idx}, Text: {element["content"]}\n'
+        elif element["type"] == "icon":
+            screen_info += f'ID: {idx}, Icon: {element["content"]}\n'
+    response_json["screen_info"] = screen_info
+    return response_json
+
+
+class OmniParserClient:
+    def __init__(self, url: str) -> None:
+        self.url = url
+
+    def parse_screenshot(self, raw_screenshot_filename: str, som_screenshot_filename: str):
+        with open(raw_screenshot_filename, "rb") as image_file:
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+        response = requests.post(f"{self.url}/parse/", json={"base64_image": image_base64})
+        response.raise_for_status()
+        response_json = response.json()
+        print("omniparser latency:", response_json["latency"])
+
+        # Save the set-of-mark (SOM) annotated screenshot returned by the server
+        som_image_data = base64.b64decode(response_json["som_image_base64"])
+        with open(som_screenshot_filename, "wb") as f:
+            f.write(som_image_data)
+
+        response_json["raw_screenshot_base64"] = image_base64
+        response_json = reformat_messages(response_json)
+        return response_json
+
+    @staticmethod
+    def get_click_position(box_id, parsed_contents: list[dict]) -> tuple[str, str]:
+        """
+        Example of a parsed content entry; bbox is [x_min, y_min, x_max, y_max],
+        normalized to the screenshot size:
+        {
+            "type": "text",
+            "bbox": [
+                0.01778179593384266,   // x_min
+                0.024020226672291756,  // y_min
+                0.3725135624408722,    // x_max
+                0.06510745733976364    // y_max
+            ],
+            "interactivity": false,
+            "content": "OmniParser for Pure Vision Based General GUI Agent",
+            "source": "box_ocr_content_ocr"
+        }
+        """
+        bbox = parsed_contents[box_id]["bbox"]
+        # Click the center of the bounding box
+        x = (bbox[0] + bbox[2]) / 2
+        y = (bbox[1] + bbox[3]) / 2
+        return f"{x:.2f}", f"{y:.2f}"
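A usage sketch for the client: `parse_screenshot` assumes an OmniParser v2 server is reachable at the given URL (the default URL and the `/parse/` route come from this file), while `get_click_position` is pure and can be checked offline with a made-up element:

from operate.utils.omniparser import OmniParserClient

# Server round trip (requires a running OmniParser v2 server):
client = OmniParserClient("http://localhost:8000")
parsed = client.parse_screenshot("screenshots/raw_screenshot.png",
                                 "screenshots/som_screenshot.jpeg")
print(parsed["screen_info"])  # "ID: 0, Text: ..." / "ID: 1, Icon: ..." lines

# Offline check of the coordinate math: the click point is the box center.
element = {"type": "icon", "bbox": [0.40, 0.10, 0.44, 0.16],
           "interactivity": True, "content": "Chrome"}
print(OmniParserClient.get_click_position(0, [element]))  # ('0.42', '0.13')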
