34
34
35
35
# Load configuration
36
36
config = Config ()
37
- VERBOSE = config .verbose
38
-
39
37
40
38
async def get_next_action (model , messages , objective , session_id ):
41
- if VERBOSE :
39
+ if config . verbose :
42
40
print ("[Self-Operating Computer][get_next_action]" )
43
41
print ("[Self-Operating Computer][get_next_action] model" , model )
44
42
if model == "gpt-4" :
@@ -61,7 +59,7 @@ async def get_next_action(model, messages, objective, session_id):
61
59
62
60
63
61
def call_gpt_4_vision_preview (messages ):
64
- if VERBOSE :
62
+ if config . verbose :
65
63
print ("[call_gpt_4_v]" )
66
64
time .sleep (1 )
67
65
client = config .initialize_openai ()
@@ -82,7 +80,7 @@ def call_gpt_4_vision_preview(messages):
82
80
else :
83
81
user_prompt = get_user_prompt ()
84
82
85
- if VERBOSE :
83
+ if config . verbose :
86
84
print (
87
85
"[call_gpt_4_v] user_prompt" ,
88
86
user_prompt ,
@@ -117,7 +115,7 @@ def call_gpt_4_vision_preview(messages):
117
115
content = content [: - len ("```" )] # Remove ending
118
116
119
117
assistant_message = {"role" : "assistant" , "content" : content }
120
- if VERBOSE :
118
+ if config . verbose :
121
119
print (
122
120
"[call_gpt_4_v] content" ,
123
121
content ,
@@ -137,7 +135,7 @@ def call_gpt_4_vision_preview(messages):
137
135
f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_RED } [Error] AI response was { ANSI_RESET } " ,
138
136
content ,
139
137
)
140
- if VERBOSE :
138
+ if config . verbose :
141
139
traceback .print_exc ()
142
140
return call_gpt_4_vision_preview (messages )
143
141
@@ -146,7 +144,7 @@ def call_gemini_pro_vision(messages, objective):
146
144
"""
147
145
Get the next action for Self-Operating Computer using Gemini Pro Vision
148
146
"""
149
- if VERBOSE :
147
+ if config . verbose :
150
148
print (
151
149
"[Self Operating Computer][call_gemini_pro_vision]" ,
152
150
)
@@ -165,18 +163,18 @@ def call_gemini_pro_vision(messages, objective):
165
163
prompt = get_system_prompt ("gemini-pro-vision" , objective )
166
164
167
165
model = config .initialize_google ()
168
- if VERBOSE :
166
+ if config . verbose :
169
167
print ("[call_gemini_pro_vision] model" , model )
170
168
171
169
response = model .generate_content ([prompt , Image .open (screenshot_filename )])
172
170
173
171
content = response .text [1 :]
174
- if VERBOSE :
172
+ if config . verbose :
175
173
print ("[call_gemini_pro_vision] response" , response )
176
174
print ("[call_gemini_pro_vision] content" , content )
177
175
178
176
content = json .loads (content )
179
- if VERBOSE :
177
+ if config . verbose :
180
178
print (
181
179
"[get_next_action][call_gemini_pro_vision] content" ,
182
180
content ,
@@ -188,14 +186,14 @@ def call_gemini_pro_vision(messages, objective):
188
186
print (
189
187
f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_BRIGHT_MAGENTA } [Operate] That did not work. Trying another method { ANSI_RESET } "
190
188
)
191
- if VERBOSE :
189
+ if config . verbose :
192
190
print ("[Self-Operating Computer][Operate] error" , e )
193
191
traceback .print_exc ()
194
192
return call_gpt_4_vision_preview (messages )
195
193
196
194
197
195
async def call_gpt_4_vision_preview_ocr (messages , objective , model ):
198
- if VERBOSE :
196
+ if config . verbose :
199
197
print ("[call_gpt_4_vision_preview_ocr]" )
200
198
201
199
# Construct the path to the file within the package
@@ -260,7 +258,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
260
258
# Normalize line breaks and remove any unwanted characters
261
259
content = "\n " .join (line .strip () for line in content .splitlines ())
262
260
263
- if VERBOSE :
261
+ if config . verbose :
264
262
print (
265
263
"\n \n \n [call_gpt_4_vision_preview_ocr] content after cleaning" , content
266
264
)
@@ -274,7 +272,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
274
272
for operation in content :
275
273
if operation .get ("operation" ) == "click" :
276
274
text_to_click = operation .get ("text" )
277
- if VERBOSE :
275
+ if config . verbose :
278
276
print (
279
277
"[call_gpt_4_vision_preview_ocr][click] text_to_click" ,
280
278
text_to_click ,
@@ -296,7 +294,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
296
294
operation ["x" ] = coordinates ["x" ]
297
295
operation ["y" ] = coordinates ["y" ]
298
296
299
- if VERBOSE :
297
+ if config . verbose :
300
298
print (
301
299
"[call_gpt_4_vision_preview_ocr][click] text_element_index" ,
302
300
text_element_index ,
@@ -324,7 +322,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
324
322
print (
325
323
f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_BRIGHT_MAGENTA } [Operate] That did not work. Trying another method { ANSI_RESET } "
326
324
)
327
- if VERBOSE :
325
+ if config . verbose :
328
326
print ("[Self-Operating Computer][Operate] error" , e )
329
327
traceback .print_exc ()
330
328
return gpt_4_fallback (messages , objective , model )
@@ -356,7 +354,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
356
354
else :
357
355
user_prompt = get_user_prompt ()
358
356
359
- if VERBOSE :
357
+ if config . verbose :
360
358
print (
361
359
"[call_gpt_4_vision_preview_labeled] user_prompt" ,
362
360
user_prompt ,
@@ -393,7 +391,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
393
391
content = content [: - len ("```" )] # Remove ending
394
392
395
393
assistant_message = {"role" : "assistant" , "content" : content }
396
- if VERBOSE :
394
+ if config . verbose :
397
395
print (
398
396
"[call_gpt_4_vision_preview_labeled] content" ,
399
397
content ,
@@ -407,14 +405,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
407
405
for operation in content :
408
406
if operation .get ("operation" ) == "click" :
409
407
label = operation .get ("label" )
410
- if VERBOSE :
408
+ if config . verbose :
411
409
print (
412
410
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] label" ,
413
411
label ,
414
412
)
415
413
416
414
coordinates = get_label_coordinates (label , label_coordinates )
417
- if VERBOSE :
415
+ if config . verbose :
418
416
print (
419
417
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] coordinates" ,
420
418
coordinates ,
@@ -426,7 +424,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
426
424
click_position_percent = get_click_position_in_percent (
427
425
coordinates , image_size
428
426
)
429
- if VERBOSE :
427
+ if config . verbose :
430
428
print (
431
429
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] click_position_percent" ,
432
430
click_position_percent ,
@@ -441,7 +439,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
441
439
y_percent = f"{ click_position_percent [1 ]:.2f} "
442
440
operation ["x" ] = x_percent
443
441
operation ["y" ] = y_percent
444
- if VERBOSE :
442
+ if config . verbose :
445
443
print (
446
444
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new click operation" ,
447
445
operation ,
@@ -450,7 +448,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
450
448
else :
451
449
processed_content .append (operation )
452
450
453
- if VERBOSE :
451
+ if config . verbose :
454
452
print (
455
453
"[Self Operating Computer][call_gpt_4_vision_preview_labeled] new processed_content" ,
456
454
processed_content ,
@@ -461,14 +459,14 @@ async def call_gpt_4_vision_preview_labeled(messages, objective):
461
459
print (
462
460
f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_BRIGHT_MAGENTA } [Operate] That did not work. Trying another method { ANSI_RESET } "
463
461
)
464
- if VERBOSE :
462
+ if config . verbose :
465
463
print ("[Self-Operating Computer][Operate] error" , e )
466
464
traceback .print_exc ()
467
465
return call_gpt_4_vision_preview (messages )
468
466
469
467
470
468
def call_ollama_llava (messages ):
471
- if VERBOSE :
469
+ if config . verbose :
472
470
print ("[call_ollama_llava]" )
473
471
time .sleep (1 )
474
472
try :
@@ -485,7 +483,7 @@ def call_ollama_llava(messages):
485
483
else :
486
484
user_prompt = get_user_prompt ()
487
485
488
- if VERBOSE :
486
+ if config . verbose :
489
487
print (
490
488
"[call_ollama_llava] user_prompt" ,
491
489
user_prompt ,
@@ -516,7 +514,7 @@ def call_ollama_llava(messages):
516
514
content = content [: - len ("```" )] # Remove ending
517
515
518
516
assistant_message = {"role" : "assistant" , "content" : content }
519
- if VERBOSE :
517
+ if config . verbose :
520
518
print (
521
519
"[call_ollama_llava] content" ,
522
520
content ,
@@ -542,7 +540,7 @@ def call_ollama_llava(messages):
542
540
f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_RED } [Error] AI response was { ANSI_RESET } " ,
543
541
content ,
544
542
)
545
- if VERBOSE :
543
+ if config . verbose :
546
544
traceback .print_exc ()
547
545
return call_ollama_llava (messages )
548
546
@@ -562,15 +560,15 @@ def get_last_assistant_message(messages):
562
560
563
561
564
562
def gpt_4_fallback (messages , objective , model ):
565
- if VERBOSE :
563
+ if config . verbose :
566
564
print ("[gpt_4_fallback]" )
567
565
system_prompt = get_system_prompt ("gpt-4-vision-preview" , objective )
568
566
new_system_message = {"role" : "system" , "content" : system_prompt }
569
567
# remove and replace the first message in `messages` with `new_system_message`
570
568
571
569
messages [0 ] = new_system_message
572
570
573
- if VERBOSE :
571
+ if config . verbose :
574
572
print ("[gpt_4_fallback][updated]" )
575
573
print ("[gpt_4_fallback][updated] len(messages)" , len (messages ))
576
574
@@ -581,7 +579,7 @@ def confirm_system_prompt(messages, objective, model):
581
579
"""
582
580
On `Exception` we default to `call_gpt_4_vision_preview` so we have this function to reassign system prompt in case of a previous failure
583
581
"""
584
- if VERBOSE :
582
+ if config . verbose :
585
583
print ("[confirm_system_prompt] model" , model )
586
584
587
585
system_prompt = get_system_prompt (model , objective )
@@ -590,7 +588,7 @@ def confirm_system_prompt(messages, objective, model):
590
588
591
589
messages [0 ] = new_system_message
592
590
593
- if VERBOSE :
591
+ if config . verbose :
594
592
print ("[confirm_system_prompt]" )
595
593
print ("[confirm_system_prompt] len(messages)" , len (messages ))
596
594
for m in messages :
0 commit comments