diff --git a/README.md b/README.md
index e338e12d..4bf3122b 100644
--- a/README.md
+++ b/README.md
@@ -83,28 +83,37 @@ Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer
 operate -m qwen-vl
 ```
 
-#### Try LLaVa Hosted Through Ollama `-m llava`
+#### Try Multimodal Models Hosted Through Ollama `-m <model>`
 If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
 *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview*
 
-First, install Ollama on your machine from https://ollama.ai/download.
+First, install Ollama on your machine from https://ollama.com/download.
 
-Once Ollama is installed, pull the LLaVA model:
+Once Ollama is installed, pull the model you want to use:
 ```
-ollama pull llava
+ollama pull <model>
 ```
 
-This will download the model on your machine which takes approximately 5 GB of storage.
+This will download the model to your machine; llava:7b, for example, takes approximately 5 GB of storage.
 
-When Ollama has finished pulling LLaVA, start the server:
+When Ollama has finished pulling the model, start the server:
 ```
 ollama serve
 ```
 
-That's it! Now start `operate` and select the LLaVA model:
+That's it! Now start `operate` and specify the model you want to use directly:
+```
+operate -m llama3.2-vision
+```
+
+For better text recognition when clicking on elements, you can enable OCR with the `--ocr` flag:
+```
+operate -m llama3.2-vision --ocr
 ```
-operate -m llava
-```
-**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
+
+**Important:**
+- The OCR flag is only available for Ollama models
+- The system will attempt to run any model you specify, regardless of whether it's detected as multimodal
+- Error rates when using Ollama are very high, even with large models like llama3.2-vision:90b. This is simply intended to be a base to build off of as local multimodal models improve over time.
 
 Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama)
 
@@ -136,6 +145,19 @@ Run with voice mode
 operate --voice
 ```
 
+### Browser Preference `-b` or `--browser`
+By default, Self-Operating Computer uses Google Chrome as the browser in the instructions it gives the model. If you prefer a different browser, you can specify it with the `-b` or `--browser` flag:
+
+```
+operate -b "Firefox"
+```
+
+```
+operate --browser "Microsoft Edge"
+```
+
+The specified browser will be used in the system prompts to guide the model.
+
 ### Optical Character Recognition Mode `-m gpt-4-with-ocr`
 The Self-Operating Computer Framework now integrates Optical Character Recognition (OCR) capabilities with the `gpt-4-with-ocr` mode. This mode gives GPT-4 a hash map of clickable elements by coordinates. GPT-4 can decide to `click` elements by text and then the code references the hash map to get the coordinates for that element GPT-4 wanted to click.
 
diff --git a/operate/config.py b/operate/config.py
index 92f800ff..e93a0a66 100644
--- a/operate/config.py
+++ b/operate/config.py
@@ -18,6 +18,8 @@ class Config:
         openai_api_key (str): API key for OpenAI.
         google_api_key (str): API key for Google.
         ollama_host (str): url to ollama running remotely.
+        ocr_enabled (bool): Flag indicating whether OCR is enabled for Ollama models.
+        browser (str): Preferred browser to use in system prompts.
""" _instance = None @@ -31,6 +33,8 @@ def __new__(cls): def __init__(self): load_dotenv() self.verbose = False + self.ocr_enabled = False + self.browser = "Google Chrome" self.openai_api_key = ( None # instance variables are backups in case saving to a `.env` fails ) diff --git a/operate/main.py b/operate/main.py index 86832e4e..f5aef588 100644 --- a/operate/main.py +++ b/operate/main.py @@ -39,14 +39,36 @@ def main_entry(): type=str, required=False, ) + + # Add OCR flag for Ollama models + parser.add_argument( + "--ocr", + help="Enable OCR for Ollama models", + action="store_true", + ) + + # Add browser preference flag + parser.add_argument( + "-b", + "--browser", + help="Specify preferred browser (default: Google Chrome)", + type=str, + default="Google Chrome", + ) try: args = parser.parse_args() + + # No need to prompt for model name if it's directly specified + # The Ollama model name can now be passed directly + main( args.model, terminal_prompt=args.prompt, voice_mode=args.voice, - verbose_mode=args.verbose + verbose_mode=args.verbose, + ocr_mode=args.ocr, + browser=args.browser ) except KeyboardInterrupt: print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") diff --git a/operate/models/apis.py b/operate/models/apis.py index 14e55310..64207a91 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -25,7 +25,7 @@ ) from operate.utils.ocr import get_text_coordinates, get_text_element from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot -from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET +from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET, ANSI_YELLOW # Load configuration config = Config() @@ -53,15 +53,53 @@ async def get_next_action(model, messages, objective, session_id): return "coming soon" if model == "gemini-pro-vision": return call_gemini_pro_vision(messages, objective), None - if model == "llava": - operation = call_ollama_llava(messages) - return operation, None if model == "claude-3": operation = await call_claude_3_with_ocr(messages, objective, model) return operation, None + if ollama_model_installed(model): + is_multimodal = ollama_model_multimodal(model) + if is_multimodal or not is_multimodal: # Run regardless of multimodality check + if config.ocr_enabled: + operation = await call_ollama_with_ocr(messages, model) + else: + operation = call_ollama(messages, model) + return operation, None raise ModelNotRecognizedException(model) +def ollama_model_installed(model_name): + import ollama + installed_models = ollama.list() + + for model in installed_models.get('models', []): + if model_name == model['name']: + return True + + return False + + +def ollama_model_multimodal(model_name): + """ + Check if an Ollama model appears to support multimodal inputs. + Note: This check is not definitive and the model will run regardless. 
+ """ + model_info = ollama.show(model_name) + if 'details' in model_info: + if 'families' in model_info['details']: + families = model_info['details']['families'] + multimodal_indicators = [ + 'clip', 'vision', 'llava', 'bakllava', 'multimodal'] + for indicator in multimodal_indicators: + if any(indicator.lower() in family.lower() for family in families): + return True + if 'vision' in model_info.get('details', {}).get('capabilities', []): + return True + + # Print a warning but continue anyway + print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW}[Warning] Model {model_name} doesn't appear to be multimodal but will be used anyway{ANSI_RESET}") + return False + + def call_gpt_4o(messages): if config.verbose: print("[call_gpt_4_v]") @@ -154,7 +192,8 @@ async def call_qwen_vl_with_ocr(messages, objective, model): os.makedirs(screenshots_dir) # Call the function to capture the screen with the cursor - raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png") + raw_screenshot_filename = os.path.join( + screenshots_dir, "raw_screenshot.png") capture_screen_with_cursor(raw_screenshot_filename) # Compress screenshot image to make size be smaller @@ -256,6 +295,7 @@ async def call_qwen_vl_with_ocr(messages, objective, model): traceback.print_exc() return gpt_4_fallback(messages, objective, model) + def call_gemini_pro_vision(messages, objective): """ Get the next action for Self-Operating Computer using Gemini Pro Vision @@ -282,7 +322,8 @@ def call_gemini_pro_vision(messages, objective): if config.verbose: print("[call_gemini_pro_vision] model", model) - response = model.generate_content([prompt, Image.open(screenshot_filename)]) + response = model.generate_content( + [prompt, Image.open(screenshot_filename)]) content = response.text[1:] if config.verbose: @@ -541,7 +582,8 @@ async def call_gpt_4o_labeled(messages, objective, model): client = config.initialize_openai() confirm_system_prompt(messages, objective, model) - file_path = pkg_resources.resource_filename("operate.models.weights", "best.pt") + file_path = pkg_resources.resource_filename( + "operate.models.weights", "best.pt") yolo_model = YOLO(file_path) # Load your trained model screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): @@ -554,7 +596,8 @@ async def call_gpt_4o_labeled(messages, objective, model): with open(screenshot_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - img_base64_labeled, label_coordinates = add_labels(img_base64, yolo_model) + img_base64_labeled, label_coordinates = add_labels( + img_base64, yolo_model) if len(messages) == 1: user_prompt = get_user_first_message_prompt() @@ -627,7 +670,8 @@ async def call_gpt_4o_labeled(messages, objective, model): image = Image.open( io.BytesIO(base64.b64decode(img_base64)) ) # Load the image to get its size - image_size = image.size # Get the size of the image (width, height) + # Get the size of the image (width, height) + image_size = image.size click_position_percent = get_click_position_in_percent( coordinates, image_size ) @@ -678,12 +722,12 @@ async def call_gpt_4o_labeled(messages, objective, model): return call_gpt_4o(messages) -def call_ollama_llava(messages): +def call_ollama(messages, model_name): if config.verbose: - print("[call_ollama_llava]") + print("[call_ollama]") time.sleep(1) try: - model = config.initialize_ollama() + ollama_client = config.initialize_ollama() screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): os.makedirs(screenshots_dir) @@ 
 
         if config.verbose:
             print(
-                "[call_ollama_llava] user_prompt",
+                "[call_ollama] user_prompt",
                 user_prompt,
             )
@@ -710,8 +754,8 @@ def call_ollama_llava(messages):
         }
         messages.append(vision_message)
 
-        response = model.chat(
-            model="llava",
+        response = ollama_client.chat(
+            model=model_name,
             messages=messages,
         )
@@ -727,10 +771,18 @@ def call_ollama_llava(messages):
         assistant_message = {"role": "assistant", "content": content}
         if config.verbose:
             print(
-                "[call_ollama_llava] content",
+                "[call_ollama] content",
                 content,
             )
-        content = json.loads(content)
+
+        try:
+            content = json.loads(content)
+            if config.verbose:
+                print("[call_ollama] Successfully parsed JSON from model response")
+        except json.JSONDecodeError as e:
+            if config.verbose:
+                print(f"[call_ollama] Failed to parse JSON: {e}")
+            raise  # Re-raise to be caught by outer exception handler
 
         messages.append(assistant_message)
 
@@ -738,22 +790,22 @@ def call_ollama_llava(messages):
 
     except ollama.ResponseError as e:
         print(
-            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull {model_name}` then `ollama serve`{ANSI_RESET}",
             e,
         )
 
     except Exception as e:
         print(
-            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[llava] That did not work. Trying again {ANSI_RESET}",
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model_name}] That did not work. Trying again {ANSI_RESET}",
             e,
         )
         print(
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}",
-            content,
+            content if 'content' in locals() else "Not available",
         )
         if config.verbose:
             traceback.print_exc()
-        return call_ollama_llava(messages)
+        return call_ollama(messages, model_name)
 
 
 async def call_claude_3_with_ocr(messages, objective, model):
@@ -789,7 +841,8 @@ async def call_claude_3_with_ocr(messages, objective, model):
             print("[call_claude_3_with_ocr] resizing claude")
 
         # Resize the image
-        img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+        img_resized = img.resize(
+            (new_width, new_height), Image.Resampling.LANCZOS)
 
         # Save the resized and converted image to a BytesIO object for JPEG format
         img_buffer = io.BytesIO()
@@ -942,7 +995,8 @@ async def call_claude_3_with_ocr(messages, objective, model):
                     else:
                         updated_content.append(item)
 
-                gpt4_messages.append({"role": "user", "content": updated_content})
+                gpt4_messages.append(
+                    {"role": "user", "content": updated_content})
             elif message["role"] == "assistant":
                 gpt4_messages.append(
                     {"role": "assistant", "content": message["content"]}
                 )
@@ -1010,11 +1064,11 @@ def clean_json(content):
         print("\n\n[clean_json] content before cleaning", content)
     if content.startswith("```json"):
         content = content[
-            len("```json") :
+            len("```json"):
         ].strip()  # Remove starting ```json and trim whitespace
     elif content.startswith("```"):
         content = content[
-            len("```") :
+            len("```"):
         ].strip()  # Remove starting ``` and trim whitespace
     if content.endswith("```"):
         content = content[
@@ -1026,5 +1080,144 @@ def clean_json(content):
 
     if config.verbose:
         print("\n\n[clean_json] content after cleaning", content)
 
+    # Check if the JSON is valid
+    try:
+        json.loads(content)
+        print("[clean_json] ✅ JSON is valid")
+    except json.JSONDecodeError as e:
+        print(f"[clean_json] ❌ JSON is invalid: {e}")
+
     return content
+
+
+async def call_ollama_with_ocr(messages, model_name):
+    """
+    Call an Ollama model with OCR capabilities, similar to the other OCR-enabled models.
+    """
+    if config.verbose:
+        print("[call_ollama_with_ocr]")
+    time.sleep(1)
+    try:
+        ollama_client = config.initialize_ollama()
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        # Call the function to capture the screen with the cursor
+        capture_screen_with_cursor(screenshot_filename)
+
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        if config.verbose:
+            print(
+                "[call_ollama_with_ocr] user_prompt",
+                user_prompt,
+            )
+
+        vision_message = {
+            "role": "user",
+            "content": user_prompt,
+            "images": [screenshot_filename],
+        }
+        messages.append(vision_message)
+
+        response = ollama_client.chat(
+            model=model_name,
+            messages=messages,
+        )
+
+        # Important: Remove the image path from the message history.
+        # Ollama will attempt to load each image reference and will
+        # eventually timeout.
+        messages[-1]["images"] = None
+
+        content = response["message"]["content"].strip()
+
+        content = clean_json(content)
+
+        # used later for the messages
+        content_str = content
+
+        try:
+            content = json.loads(content)
+            if config.verbose:
+                print("[call_ollama_with_ocr] Successfully parsed JSON from model response")
+        except json.JSONDecodeError as e:
+            if config.verbose:
+                print(f"[call_ollama_with_ocr] Failed to parse JSON: {e}")
+                print(f"[call_ollama_with_ocr] Raw content that failed to parse: {content_str}")
+            raise  # Re-raise to be caught by outer exception handler
+
+        processed_content = []
+
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_ollama_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # Initialize EasyOCR Reader
+                reader = easyocr.Reader(["en"])
+
+                # Read the screenshot
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # add `coordinates` to `content`
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_ollama_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_ollama_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_ollama_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                processed_content.append(operation)
+
+        # Wait to append the assistant message so that if the `processed_content` step
+        # fails we don't append a message and mess up message history
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except ollama.ResponseError as e:
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull {model_name}` then `ollama serve`{ANSI_RESET}",
+            e,
+        )
+
+    except Exception as e:
+        print(
Trying again {ANSI_RESET}", + e, + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", + content_str if 'content_str' in locals() else "Not available", + ) + if config.verbose: + traceback.print_exc() + return call_ollama(messages, model_name) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 1acbe14f..bd30d79c 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -39,12 +39,12 @@ Here a helpful example: -Example 1: Searches for Google Chrome on the OS and opens it +Example 1: Searches for {browser} on the OS and opens it ``` [ - {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, - {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, - {{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }} + {{ "thought": "Searching the operating system to find {browser} because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, + {{ "thought": "Now I need to write '{browser}' as a next step", "operation": "write", "content": "{browser}" }}, + {{ "thought": "Finally I'll press enter to open {browser} assuming it is available", "operation": "press", "keys": ["enter"] }} ] ``` @@ -59,7 +59,8 @@ A few important notes: -- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- Default to {browser} as your browser +- Go to Google Docs and Google Sheets by typing in the address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. Objective: {objective} @@ -94,11 +95,11 @@ Here a helpful example: -Example 1: Searches for Google Chrome on the OS and opens it +Example 1: Searches for {browser} on the OS and opens it ``` [ - {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, - {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, + {{ "thought": "Searching the operating system to find {browser} because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, + {{ "thought": "Now I need to write '{browser}' as a next step", "operation": "write", "content": "{browser}" }}, ] ``` @@ -121,7 +122,8 @@ A few important notes: -- Go to Google Docs and Google Sheets by typing in the Chrome Address bar +- Default to {browser} as your browser +- Go to Google Docs and Google Sheets by typing in the address bar - Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. Objective: {objective} @@ -186,7 +188,7 @@ A few important notes: -- Default to Google Chrome as the browser +- Default to {browser} as the browser - Go to websites by opening a new tab with `press` and then `write` the URL - Reflect on previous actions and the screenshot to ensure they align and that your previous actions worked. - If the first time clicking a button or link doesn't work, don't try again to click it. Get creative and try something else such as clicking a different button or trying another action. 
@@ -231,22 +233,34 @@ def get_system_prompt(model, objective):
             cmd_string=cmd_string,
             os_search_str=os_search_str,
             operating_system=operating_system,
+            browser=config.browser
         )
     elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
-
+        # Use OCR prompt for built-in OCR models
         prompt = SYSTEM_PROMPT_OCR.format(
             objective=objective,
             cmd_string=cmd_string,
             os_search_str=os_search_str,
             operating_system=operating_system,
+            browser=config.browser
+        )
+    # Check if OCR is enabled for other models (like Ollama)
+    elif config.ocr_enabled:
+        # Use OCR prompt for models with the OCR flag enabled
+        prompt = SYSTEM_PROMPT_OCR.format(
+            objective=objective,
+            cmd_string=cmd_string,
+            os_search_str=os_search_str,
+            operating_system=operating_system,
+            browser=config.browser
         )
-
     else:
         prompt = SYSTEM_PROMPT_STANDARD.format(
             objective=objective,
             cmd_string=cmd_string,
             os_search_str=os_search_str,
             operating_system=operating_system,
+            browser=config.browser
         )
 
     # Optional verbose output
diff --git a/operate/operate.py b/operate/operate.py
index c63d9851..32a93064 100644
--- a/operate/operate.py
+++ b/operate/operate.py
@@ -30,7 +30,7 @@ operating_system = OperatingSystem()
 
 
-def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
+def main(model, terminal_prompt, voice_mode=False, verbose_mode=False, ocr_mode=False, browser="Google Chrome"):
     """
     Main function for the Self-Operating Computer.
 
@@ -38,6 +38,8 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
     - model: The model used for generating responses.
     - terminal_prompt: A string representing the prompt provided in the terminal.
     - voice_mode: A boolean indicating whether to enable voice mode.
+    - verbose_mode: A boolean indicating whether to enable verbose logging.
+    - ocr_mode: A boolean indicating whether to enable OCR for Ollama models.
+    - browser: The preferred browser to reference in system prompts.
 
     Returns:
     None
@@ -47,6 +49,7 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False):
     # Initialize `WhisperMic`, if `voice_mode` is True
     config.verbose = verbose_mode
+    config.ocr_enabled = ocr_mode
+    config.browser = browser
     config.validation(model, voice_mode)
 
     if voice_mode:
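
Taken together, the changes thread two new CLI options through to the model layer: argparse fills `config.ocr_enabled` and `config.browser`, `get_system_prompt` then picks the OCR-capable prompt and substitutes the browser name, and the Ollama call resolves click-by-text operations with EasyOCR when OCR is on. The sketch below is a simplified, self-contained illustration of that intended flow, not the project's actual module wiring; the option names (`--ocr`, `-b`/`--browser`) and config attributes (`ocr_enabled`, `browser`) mirror the diff, while the template text and the `main` function here are illustrative placeholders.

```
# Standalone sketch of the option flow: CLI flags -> shared config -> system prompt.
# Assumes nothing beyond the standard library; the real code also calls the Ollama
# chat API and, when OCR is enabled, maps "click by text" operations to coordinates.
import argparse


class Config:
    def __init__(self):
        self.ocr_enabled = False          # set from --ocr
        self.browser = "Google Chrome"    # set from -b / --browser


config = Config()


def get_system_prompt(model, objective):
    # OCR-capable prompt when the flag is on, otherwise the standard prompt;
    # either way the preferred browser is substituted into the template.
    template = (
        "Click elements by text. Default to {browser} as your browser. Objective: {objective}"
        if config.ocr_enabled
        else "Click elements by x/y percentages. Default to {browser} as your browser. Objective: {objective}"
    )
    return template.format(browser=config.browser, objective=objective)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", default="llama3.2-vision")
    parser.add_argument("--ocr", action="store_true")
    parser.add_argument("-b", "--browser", default="Google Chrome")
    args = parser.parse_args()

    # Mirror of the diff: flags land in the shared config singleton.
    config.ocr_enabled = args.ocr
    config.browser = args.browser

    print(get_system_prompt(args.model, "open a spreadsheet"))


if __name__ == "__main__":
    main()
```

Keeping the browser preference and OCR flag in the shared `Config` singleton means `get_system_prompt` and the API layer can read them without changing every call site.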