Skip to content

swap out for gpt-4o #233

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: gpt-4-turbo
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
</div>

<!--
:rotating_light: **OUTAGE NOTIFICATION: gpt-4-vision-preview**
:rotating_light: **OUTAGE NOTIFICATION: gpt-4o**
**This model is currently experiencing an outage so the self-operating computer may not work as expected.**
-->

Expand Down Expand Up @@ -176,5 +176,5 @@ Stay updated with the latest developments:
- This project is compatible with Mac OS, Windows, and Linux (with X server installed).

## OpenAI Rate Limiting Note
The ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
The ```gpt-4o``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**
73 changes: 42 additions & 31 deletions evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
Guideline: {guideline}
"""

SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')
SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")


# Check if on a windows terminal that supports ANSI escape codes
def supports_ansi():
Expand All @@ -37,6 +38,7 @@ def supports_ansi():
is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
return supported_platform and is_a_tty


if supports_ansi():
# Standard green text
ANSI_GREEN = "\033[32m"
Expand All @@ -62,8 +64,8 @@ def supports_ansi():
ANSI_YELLOW = ""
ANSI_RED = ""
ANSI_BRIGHT_MAGENTA = ""


def format_evaluation_prompt(guideline):
    """Return the evaluation prompt with *guideline* substituted into the template."""
    return EVALUATION_PROMPT.format(guideline=guideline)
Expand All @@ -72,33 +74,37 @@ def format_evaluation_prompt(guideline):
def parse_eval_content(content):
    """Parse the model's JSON evaluation response.

    Expects *content* to be a JSON object with a "reason" string and a
    "guideline_met" boolean.  Prints the reason and returns the boolean.
    Exits the process if the response cannot be parsed.
    """
    try:
        res = json.loads(content)
        print(res["reason"])
        return res["guideline_met"]
    # Narrowed from a bare `except:`: only malformed JSON or a missing key
    # means a bad evaluation response; anything else should propagate.
    except (json.JSONDecodeError, KeyError):
        print(
            "The model gave a bad evaluation response and it couldn't be parsed. Exiting..."
        )
        exit(1)


def evaluate_final_screenshot(guideline):
'''Load the final screenshot and return True or False if it meets the given guideline.'''
"""Load the final screenshot and return True or False if it meets the given guideline."""
with open(SCREENSHOT_PATH, "rb") as img_file:
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

eval_message = [{
"role": "user",
"content": [
{"type": "text", "text": format_evaluation_prompt(guideline)},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
},
],
}]

eval_message = [
{
"role": "user",
"content": [
{"type": "text", "text": format_evaluation_prompt(guideline)},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
},
],
}
]

response = openai.chat.completions.create(
model="gpt-4-vision-preview",
model="gpt-4o",
messages=eval_message,
presence_penalty=1,
frequency_penalty=1,
Expand All @@ -107,53 +113,57 @@ def evaluate_final_screenshot(guideline):
)

eval_content = response.choices[0].message.content

return parse_eval_content(eval_content)


def run_test_case(objective, guideline, model):
    """Returns True if the result of the test with the given prompt meets the given guideline for the given model."""
    # Run `operate` with the model to evaluate and the test case prompt.
    # List-form argv (shell=False) avoids shell-injection via the objective.
    subprocess.run(
        ["operate", "-m", model, "--prompt", f'"{objective}"'],
        stdout=subprocess.DEVNULL,
    )

    try:
        result = evaluate_final_screenshot(guideline)
    except OSError:
        # The screenshot file was missing or unreadable; count as a failure.
        print("[Error] Couldn't open the screenshot for evaluation")
        return False

    return result


def get_test_model():
    """Parse the command line and return the name of the model to evaluate.

    Defaults to "gpt-4-with-ocr" when no -m/--model flag is given.
    """
    arg_parser = argparse.ArgumentParser(
        description="Run the self-operating-computer with a specified model."
    )
    arg_parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to evaluate.",
        required=False,
        default="gpt-4-with-ocr",
    )
    args = arg_parser.parse_args()
    return args.model


def main():
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

model = get_test_model()

print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

passed = 0; failed = 0
passed = 0
failed = 0
for objective, guideline in TEST_CASES.items():
print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

result = run_test_case(objective, guideline, model)
if result:
print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
Expand All @@ -166,5 +176,6 @@ def main():
f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
)


# Entry point: run the evaluation harness when this file is executed directly.
if __name__ == "__main__":
    main()
40 changes: 20 additions & 20 deletions operate/models/apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ async def get_next_action(model, messages, objective, session_id):
print("[Self-Operating Computer][get_next_action]")
print("[Self-Operating Computer][get_next_action] model", model)
if model == "gpt-4":
return call_gpt_4_vision_preview(messages), None
return call_gpt_4o(messages), None
if model == "gpt-4-with-som":
operation = await call_gpt_4_vision_preview_labeled(messages, objective, model)
operation = await call_gpt_4o_labeled(messages, objective, model)
return operation, None
if model == "gpt-4-with-ocr":
operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
operation = await call_gpt_4o_with_ocr(messages, objective, model)
return operation, None
if model == "agent-1":
return "coming soon"
Expand All @@ -61,7 +61,7 @@ async def get_next_action(model, messages, objective, session_id):
raise ModelNotRecognizedException(model)


def call_gpt_4_vision_preview(messages):
def call_gpt_4o(messages):
if config.verbose:
print("[call_gpt_4_v]")
time.sleep(1)
Expand Down Expand Up @@ -102,7 +102,7 @@ def call_gpt_4_vision_preview(messages):
messages.append(vision_message)

response = client.chat.completions.create(
model="gpt-4-vision-preview",
model="gpt-4o",
messages=messages,
presence_penalty=1,
frequency_penalty=1,
Expand Down Expand Up @@ -137,7 +137,7 @@ def call_gpt_4_vision_preview(messages):
)
if config.verbose:
traceback.print_exc()
return call_gpt_4_vision_preview(messages)
return call_gpt_4o(messages)


def call_gemini_pro_vision(messages, objective):
Expand Down Expand Up @@ -189,12 +189,12 @@ def call_gemini_pro_vision(messages, objective):
if config.verbose:
print("[Self-Operating Computer][Operate] error", e)
traceback.print_exc()
return call_gpt_4_vision_preview(messages)
return call_gpt_4o(messages)


async def call_gpt_4_vision_preview_ocr(messages, objective, model):
async def call_gpt_4o_with_ocr(messages, objective, model):
if config.verbose:
print("[call_gpt_4_vision_preview_ocr]")
print("[call_gpt_4o_with_ocr]")

# Construct the path to the file within the package
try:
Expand Down Expand Up @@ -231,7 +231,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
messages.append(vision_message)

response = client.chat.completions.create(
model="gpt-4-vision-preview",
model="gpt-4o",
messages=messages,
temperature=0.7,
max_tokens=3000,
Expand All @@ -253,7 +253,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
text_to_click = operation.get("text")
if config.verbose:
print(
"[call_gpt_4_vision_preview_ocr][click] text_to_click",
"[call_gpt_4o_with_ocr][click] text_to_click",
text_to_click,
)
# Initialize EasyOCR Reader
Expand All @@ -275,15 +275,15 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):

if config.verbose:
print(
"[call_gpt_4_vision_preview_ocr][click] text_element_index",
"[call_gpt_4o_with_ocr][click] text_element_index",
text_element_index,
)
print(
"[call_gpt_4_vision_preview_ocr][click] coordinates",
"[call_gpt_4o_with_ocr][click] coordinates",
coordinates,
)
print(
"[call_gpt_4_vision_preview_ocr][click] final operation",
"[call_gpt_4o_with_ocr][click] final operation",
operation,
)
processed_content.append(operation)
Expand All @@ -307,7 +307,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
return gpt_4_fallback(messages, objective, model)


async def call_gpt_4_vision_preview_labeled(messages, objective, model):
async def call_gpt_4o_labeled(messages, objective, model):
time.sleep(1)

try:
Expand Down Expand Up @@ -355,7 +355,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
messages.append(vision_message)

response = client.chat.completions.create(
model="gpt-4-vision-preview",
model="gpt-4o",
messages=messages,
presence_penalty=1,
frequency_penalty=1,
Expand Down Expand Up @@ -415,7 +415,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
print(
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}"
)
return call_gpt_4_vision_preview(messages)
return call_gpt_4o(messages)

x_percent = f"{click_position_percent[0]:.2f}"
y_percent = f"{click_position_percent[1]:.2f}"
Expand Down Expand Up @@ -450,7 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
if config.verbose:
print("[Self-Operating Computer][Operate] error", e)
traceback.print_exc()
return call_gpt_4_vision_preview(messages)
return call_gpt_4o(messages)


def call_ollama_llava(messages):
Expand Down Expand Up @@ -742,7 +742,7 @@ def get_last_assistant_message(messages):
def gpt_4_fallback(messages, objective, model):
if config.verbose:
print("[gpt_4_fallback]")
system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
system_prompt = get_system_prompt("gpt-4o", objective)
new_system_message = {"role": "system", "content": system_prompt}
# remove and replace the first message in `messages` with `new_system_message`

Expand All @@ -752,7 +752,7 @@ def gpt_4_fallback(messages, objective, model):
print("[gpt_4_fallback][updated]")
print("[gpt_4_fallback][updated] len(messages)", len(messages))

return call_gpt_4_vision_preview(messages)
return call_gpt_4o(messages)


def confirm_system_prompt(messages, objective, model):
Expand Down