diff --git a/README.md b/README.md
index 1c9c958f..ae40d082 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
@@ -176,5 +176,5 @@ Stay updated with the latest developments:
 - This project is compatible with Mac OS, Windows, and Linux (with X server installed).
 
 ## OpenAI Rate Limiting Note
-The ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
+The ```gpt-4o``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
 Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**
diff --git a/evaluate.py b/evaluate.py
index 124e9ac0..4c1e3676 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -25,7 +25,8 @@
 Guideline: {guideline}
 """
 
-SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')
+SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")
+
 
 # Check if on a windows terminal that supports ANSI escape codes
 def supports_ansi():
@@ -37,6 +38,7 @@ def supports_ansi():
     is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
     return supported_platform and is_a_tty
 
+
 if supports_ansi():
     # Standard green text
     ANSI_GREEN = "\033[32m"
@@ -62,8 +64,8 @@ def supports_ansi():
     ANSI_YELLOW = ""
     ANSI_RED = ""
     ANSI_BRIGHT_MAGENTA = ""
-
-
+
+
 def format_evaluation_prompt(guideline):
     prompt = EVALUATION_PROMPT.format(guideline=guideline)
     return prompt
@@ -72,33 +74,37 @@ def format_evaluation_prompt(guideline):
 def parse_eval_content(content):
     try:
         res = json.loads(content)
-
+
         print(res["reason"])
-
+
         return res["guideline_met"]
     except:
-        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
+        print(
+            "The model gave a bad evaluation response and it couldn't be parsed. Exiting..."
+        )
         exit(1)
 
 
 def evaluate_final_screenshot(guideline):
-    '''Load the final screenshot and return True or False if it meets the given guideline.'''
+    """Load the final screenshot and return True or False if it meets the given guideline."""
     with open(SCREENSHOT_PATH, "rb") as img_file:
         img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
 
-    eval_message = [{
-        "role": "user",
-        "content": [
-            {"type": "text", "text": format_evaluation_prompt(guideline)},
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
-            },
-        ],
-    }]
-
+    eval_message = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": format_evaluation_prompt(guideline)},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+    ]
+
     response = openai.chat.completions.create(
-        model="gpt-4-vision-preview",
+        model="gpt-4o",
         messages=eval_message,
        presence_penalty=1,
         frequency_penalty=1,
@@ -107,21 +113,24 @@ def evaluate_final_screenshot(guideline):
     )
 
     eval_content = response.choices[0].message.content
-
+
     return parse_eval_content(eval_content)
 
 
 def run_test_case(objective, guideline, model):
-    '''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
+    """Returns True if the result of the test with the given prompt meets the given guideline for the given model."""
     # Run `operate` with the model to evaluate and the test case prompt
-    subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
-
+    subprocess.run(
+        ["operate", "-m", model, "--prompt", f'"{objective}"'],
+        stdout=subprocess.DEVNULL,
+    )
+
     try:
         result = evaluate_final_screenshot(guideline)
-    except(OSError):
+    except OSError:
         print("[Error] Couldn't open the screenshot for evaluation")
         return False
-
+
     return result
 
 
@@ -129,7 +138,7 @@ def get_test_model():
     parser = argparse.ArgumentParser(
         description="Run the self-operating-computer with a specified model."
     )
-
+
     parser.add_argument(
         "-m",
         "--model",
@@ -137,23 +146,24 @@ def get_test_model():
         required=False,
         default="gpt-4-with-ocr",
     )
-
+
     return parser.parse_args().model
 
 
 def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
-
+
     model = get_test_model()
-
+
     print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")
-    passed = 0; failed = 0
+    passed = 0
+    failed = 0
 
     for objective, guideline in TEST_CASES.items():
         print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
-
+
         result = run_test_case(objective, guideline, model)
         if result:
             print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
@@ -166,5 +176,6 @@ def main():
         f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
     )
 
+
 if __name__ == "__main__":
     main()
diff --git a/operate/models/apis.py b/operate/models/apis.py
index 0dff9723..b239ec86 100644
--- a/operate/models/apis.py
+++ b/operate/models/apis.py
@@ -41,12 +41,12 @@ async def get_next_action(model, messages, objective, session_id):
     print("[Self-Operating Computer][get_next_action]")
     print("[Self-Operating Computer][get_next_action] model", model)
     if model == "gpt-4":
-        return call_gpt_4_vision_preview(messages), None
+        return call_gpt_4o(messages), None
     if model == "gpt-4-with-som":
-        operation = await call_gpt_4_vision_preview_labeled(messages, objective, model)
+        operation = await call_gpt_4o_labeled(messages, objective, model)
         return operation, None
     if model == "gpt-4-with-ocr":
-        operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
+        operation = await call_gpt_4o_with_ocr(messages, objective, model)
         return operation, None
     if model == "agent-1":
         return "coming soon"
@@ -61,7 +61,7 @@ async def get_next_action(model, messages, objective, session_id):
     raise ModelNotRecognizedException(model)
 
 
-def call_gpt_4_vision_preview(messages):
+def call_gpt_4o(messages):
     if config.verbose:
         print("[call_gpt_4_v]")
     time.sleep(1)
@@ -102,7 +102,7 @@ def call_gpt_4_vision_preview(messages):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4-vision-preview",
+            model="gpt-4o",
             messages=messages,
             presence_penalty=1,
             frequency_penalty=1,
@@ -137,7 +137,7 @@ def call_gpt_4_vision_preview(messages):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_gpt_4_vision_preview(messages)
+        return call_gpt_4o(messages)
 
 
 def call_gemini_pro_vision(messages, objective):
@@ -189,12 +189,12 @@ def call_gemini_pro_vision(messages, objective):
         if config.verbose:
             print("[Self-Operating Computer][Operate] error", e)
             traceback.print_exc()
-        return call_gpt_4_vision_preview(messages)
+        return call_gpt_4o(messages)
 
 
-async def call_gpt_4_vision_preview_ocr(messages, objective, model):
+async def call_gpt_4o_with_ocr(messages, objective, model):
     if config.verbose:
-        print("[call_gpt_4_vision_preview_ocr]")
+        print("[call_gpt_4o_with_ocr]")
 
     # Construct the path to the file within the package
     try:
@@ -231,7 +231,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4-vision-preview",
+            model="gpt-4o",
             messages=messages,
             temperature=0.7,
             max_tokens=3000,
@@ -253,7 +253,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
                 text_to_click = operation.get("text")
                 if config.verbose:
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] text_to_click",
+                        "[call_gpt_4o_with_ocr][click] text_to_click",
                         text_to_click,
                     )
                 # Initialize EasyOCR Reader
@@ -275,15 +275,15 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
 
                 if config.verbose:
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] text_element_index",
+                        "[call_gpt_4o_with_ocr][click] text_element_index",
                         text_element_index,
                     )
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] coordinates",
+                        "[call_gpt_4o_with_ocr][click] coordinates",
                         coordinates,
                     )
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] final operation",
+                        "[call_gpt_4o_with_ocr][click] final operation",
                         operation,
                     )
                 processed_content.append(operation)
@@ -307,7 +307,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
         return gpt_4_fallback(messages, objective, model)
 
 
-async def call_gpt_4_vision_preview_labeled(messages, objective, model):
+async def call_gpt_4o_labeled(messages, objective, model):
     time.sleep(1)
 
     try:
@@ -355,7 +355,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4-vision-preview",
+            model="gpt-4o",
             messages=messages,
             presence_penalty=1,
             frequency_penalty=1,
@@ -415,7 +415,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
                     print(
                         f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}"
                     )
-                    return call_gpt_4_vision_preview(messages)
+                    return call_gpt_4o(messages)
 
                 x_percent = f"{click_position_percent[0]:.2f}"
                 y_percent = f"{click_position_percent[1]:.2f}"
@@ -450,7 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
         if config.verbose:
             print("[Self-Operating Computer][Operate] error", e)
             traceback.print_exc()
-        return call_gpt_4_vision_preview(messages)
+        return call_gpt_4o(messages)
 
 
 def call_ollama_llava(messages):
@@ -742,7 +742,7 @@ def get_last_assistant_message(messages):
 def gpt_4_fallback(messages, objective, model):
     if config.verbose:
         print("[gpt_4_fallback]")
-    system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
+    system_prompt = get_system_prompt("gpt-4o", objective)
     new_system_message = {"role": "system", "content": system_prompt}
     # remove and replace the first message in `messages` with `new_system_message`
 
@@ -752,7 +752,7 @@ def gpt_4_fallback(messages, objective, model):
         print("[gpt_4_fallback][updated]")
         print("[gpt_4_fallback][updated] len(messages)", len(messages))
 
-    return call_gpt_4_vision_preview(messages)
+    return call_gpt_4o(messages)
 
 
 def confirm_system_prompt(messages, objective, model):