diff --git a/.fdignore b/.fdignore new file mode 100644 index 00000000..4afdbf00 --- /dev/null +++ b/.fdignore @@ -0,0 +1,6 @@ +docs +.* +*/.* +*.pt +CONTRIBUTING.md +LICENSE \ No newline at end of file diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..57f9f818 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,6 @@ +!.gitignore +!* +!*/* +cache_db.json +cache_tree.json +vector_cache diff --git a/docs/cache_title.json b/docs/cache_title.json new file mode 100644 index 00000000..58ba848c --- /dev/null +++ b/docs/cache_title.json @@ -0,0 +1 @@ +{"_default": {"1": {"path": "/README.md", "hash": "dac3f21a1420b7c2574d16d3ae4f9a61", "title": "Self-Operating Computer Framework: Enhanced Mouse Predictions"}, "2": {"path": "/README.md:1-26", "hash": "bf2a6dd5d2e82ad184ac813bceea7aa1", "title": "Human-Like Computer Control Framework"}, "3": {"path": "/README.md:26-37", "hash": "cb2a33c5f14dbf8102a2060e59e818ce", "title": "Agent-1-Vision Model Overview"}, "4": {"path": "/README.md:37-67", "hash": "59a4d0bd0271be6882794c87dff29f6d", "title": "Improving Mouse Click Accuracy"}, "5": {"path": "/README.md:67-88", "hash": "d85bdbd6d1fcbc9f4ee97480cdead828", "title": "Install, Configure, and Operate: A Comprehensive Guide"}, "6": {"path": "/README.md:89-124", "hash": "ced841634d02dfca9565e18d4b5f2212", "title": "Installing SOCF and GMPV"}, "7": {"path": "/README.md:126-159", "hash": "ee4a2e9e1111e43b1d54c5872d0bee86", "title": "Enable Voice Mode in Self-Operating-Computer Framework"}, "8": {"path": "/README.md:159-172", "hash": "3f8661eaa636ae2a580706d495e189af", "title": "Join HyperWriteAI Discord, Visit #self-operating-computer, Gpt4Vision Model, API Credits Required"}, "9": {"path": "/evaluate.py", "hash": "f145aa4f21a3a71ba002513c87ab1299", "title": "Vision Model Image Evaluation"}, "10": {"path": "/evaluate.py:1-31", "hash": "7fba16836a3eb74270a4399e2b98e84c", "title": "Setting Up Evaluation Test Cases"}, "11": {"path": "/evaluate.py:32-73", "hash": "12d143dc6a38c079f66d6902f399af9b", "title": "ANSI Colors for Terminal Support Detection"}, "12": {"path": "/evaluate.py:75-105", "hash": "46c5cb34fa0a118542f82f10ba3948dd", "title": "Evaluate Summary Screenshot: GPT-4 Vision Model Integration"}, "13": {"path": "/evaluate.py:106-140", "hash": "c4daefc6cdbefdb544c912a83a0ad0f7", "title": "Test Evaluation and Display"}, "14": {"path": "/evaluate.py:141-150", "hash": "d45d9da226c621c8a4eb859326a6ab88", "title": "Test Result Display"}, "15": {"path": "/operate/actions.py", "hash": "57f5b1b6ff8ead6b437d05c3313f9121", "title": "AI-Powered Content Generation"}, "16": {"path": "/operate/actions.py:1-51", "hash": "ed23324e3160350d696973ecf4bbb214", "title": "Action Prediction Model"}, "17": {"path": "/operate/actions.py:52-83", "hash": "bcba7ff32d3526dae272ae4fc448b6fd", "title": "Dynamic Model Caller with Screenshot Capture"}, "18": {"path": "/operate/actions.py:84-115", "hash": "5c6c9f315ae86f4060161b43b7788ccb", "title": "Vision AI Message Encoder"}, "19": {"path": "/operate/actions.py:116-153", "hash": "5cec49a0aa53bbc39d191766c9c25bbd", "title": "Grid Overlay Screenshot Capture"}, "20": {"path": "/operate/actions.py:154-189", "hash": "eb4ce710e8a17943f56f63df8daf74f7", "title": "Screenshot-to-Message AI Model"}, "21": {"path": "/operate/actions.py:189-215", "hash": "64f175114443c2087567cebeb04b2bd0", "title": "Cursor-Guided AI Prompt Enhancement"}, "22": {"path": "/operate/actions.py:217-248", "hash": "41c0cb7cb8d7e4d80238360788b3f389", "title": "GPT-4 Vision Prompt Creation"}, 
"23": {"path": "/operate/actions.py:249-275", "hash": "d580b3022b3e2f2bf0afa8deb0aacb3f", "title": "Encoding Image for AI Model Generation"}, "24": {"path": "/operate/actions.py:276-305", "hash": "73fcf6c33350aeb75f0963012c4eaab2", "title": "Desktop Screenshot Labeling with GPT-4"}, "25": {"path": "/operate/actions.py:307-338", "hash": "ba4efe6d22d1b5a81e6fc8c269f290f3", "title": "Labeled Click and Decision Prompt System"}, "26": {"path": "/operate/actions.py:340-364", "hash": "f563a4a35fe4d0789d923d3cc416c88b", "title": "API Click Position Calculator"}, "27": {"path": "/operate/actions.py:365-387", "hash": "55af95f96b83ca55161b18675bfdcb41", "title": "Click Position Handler"}, "28": {"path": "/operate/actions.py:390-409", "hash": "b6a17c7d0d9473c988fffa40c8d26292", "title": "Fetch OpenAI Chat Completion Asynchronously"}, "29": {"path": "/operate/dialog.py", "hash": "3a2c6d26ce9740e42a7de1536dc6d86b", "title": "Error-Handling User Input in Dialog Operations"}, "30": {"path": "/operate/dialog.py:1-44", "hash": "112a510a9d2a67eafa1d20fe132fa998", "title": "Self-Operating Computer Response Model"}, "31": {"path": "/operate/dialog.py:46-80", "hash": "0d02cf7bfd7d05d1cfc09b37f05a28e8", "title": "Voice Mode and WhisperMic Initialization"}, "32": {"path": "/operate/dialog.py:81-109", "hash": "724810205610934c0b0c7616c9463f47", "title": "Capturing and Processing Voice Inputs"}, "33": {"path": "/operate/dialog.py:110-139", "hash": "2cc50bd85bbb75f03a7a31af6e07c334", "title": "Exception Handling and Action Execution"}, "34": {"path": "/operate/dialog.py:140-171", "hash": "47adabed891a2dfe509d8ad922d7632d", "title": "Action Type Check and Process"}, "35": {"path": "/operate/dialog.py:173-192", "hash": "439961cd470aba1b1e2b03b1ede32d22", "title": "Invalid Input Check and Error Message"}, "36": {"path": "/operate/exceptions.py", "hash": "1cb75cc9cca07c7083349d7687a89fb8", "title": "ModelRecognitionException"}, "37": {"path": "/operate/main.py", "hash": "849cb89bd135d98c287b28a0f59c5927", "title": "Main Entry Point for Self-Operating Computer"}, "38": {"path": "/operate/prompts.py", "hash": "d32925518a57e2532aa7e75757271c19", "title": "Context-Based Prompts for AI-Assisted Google Tools"}, "39": {"path": "/operate/prompts.py:1-33", "hash": "7fe3f3bf9b32af85235d3603b2f28e5f", "title": "Config Settings and Constants in Prompts Module"}, "40": {"path": "/operate/prompts.py:33-63", "hash": "f611be9877035ae65a35db64d7d0f56d", "title": "Interacting with Computers: Tips and Tricks"}, "41": {"path": "/operate/prompts.py:64-82", "hash": "02b8ebf54b5403195e7691775618cfb2", "title": "Cursor Position Prompt"}, "42": {"path": "/operate/prompts.py:82-95", "hash": "5b04c5da962c16bb3d894a23c99edd33", "title": "Guessing Percentages: CLICK Refinement"}, "43": {"path": "/operate/prompts.py:97-135", "hash": "255de85671982a6556ee9c614899ba58", "title": "Interactive Prompts for Efficient Tasks"}, "44": {"path": "/operate/prompts.py:136-159", "hash": "7496511f08f1a40eba955da343b4468c", "title": "AI-Assisted Web Interaction with Labeled Elements"}, "45": {"path": "/operate/prompts.py:161-183", "hash": "03228cf2727dcf901bfcae0a041eda8d", "title": "Contextual JSON Responses"}, "46": {"path": "/operate/prompts.py:185-217", "hash": "4b30adfe9f79f34b42e36eecd0f15b37", "title": "Prompt Formatting Functions"}, "47": {"path": "/operate/prompts.py:218-252", "hash": "832911cb60d101ae0735391a41cfe68c", "title": "Python Prompt Formatting Functions"}, "48": {"path": "/operate/settings.py", "hash": "c2e2734a3eeaee07ea071c3c86ff296a", 
"title": "Environment Configurations in Settings.py"}, "49": {"path": "/operate/settings.py:1-36", "hash": "10e6b21a56a6acb9ae0b10209c8e1fe1", "title": "Configuration Manager for Settings"}, "50": {"path": "/operate/settings.py:37-39", "hash": "34cb237d1b8fa9362011feddf47f20ab", "title": "Set OpenAI API URL with Env Var or Current Value"}, "51": {"path": "/operate/utils/label.py", "hash": "93c44858f4f65217acb2e06a65501930", "title": "Image Processing Utilities"}, "52": {"path": "/operate/utils/label.py:1-37", "hash": "f3323c934b39bcb21971b31d84675f81", "title": "Validate and Retrieve Image Data Functions"}, "53": {"path": "/operate/utils/label.py:40-72", "hash": "2201ff78fb8acd549396411af837f0b0", "title": "Box Overlap Detection and Labeling Functionality"}, "54": {"path": "/operate/utils/label.py:74-101", "hash": "8eb2f932380afd002c8d98174c7497af", "title": "Bounding Box Labeler"}, "55": {"path": "/operate/utils/label.py:102-128", "hash": "24eee879a8765882830fd677207ae500", "title": "Timestamped Image Saving"}, "56": {"path": "/operate/utils/label.py:129-152", "hash": "260012da282588d457f8ea5cc9ceeba7", "title": "Encode Labeled Image in Base64"}, "57": {"path": "/operate/utils/label.py:153-180", "hash": "27ea5f0021a17c4766060c44796992ae", "title": "Triple Backticks Remover & Click Percentage Calculator"}, "58": {"path": "/operate/utils/label.py:182-182", "hash": "2e179942d46f4e6ce7816daa63e99fc7", "title": "Compute Label Percentages"}, "59": {"path": "/operate/utils/misc.py", "hash": "12e996f03cce6e1223030105b6233bdb", "title": "Multifunctional Data Processor"}, "60": {"path": "/operate/utils/misc.py:1-41", "hash": "03abb4876fb8d66ba0e8bcb57fd4f0b5", "title": "Converting and Extracting: Misc.py Functions"}, "61": {"path": "/operate/utils/misc.py:43-74", "hash": "cb7527a4c506c996027a746a68d80cf9", "title": "Parse JSON Response"}, "62": {"path": "/operate/utils/misc.py:75-97", "hash": "cd0376658303f89fd0b2dd6a6f5b61f2", "title": "Response Parser and Classifier"}, "63": {"path": "/operate/utils/misc.py:98-102", "hash": "9b115deb565ae36006d792cc05f1e7c7", "title": "Handling Regex Exceptions in Search Data"}, "64": {"path": "/operate/utils/os.py", "hash": "bc30bc8244012b9c984674646e76944a", "title": "Circular Motion and Text Input Utility"}, "65": {"path": "/operate/utils/os.py:1-44", "hash": "effa7b963a4543f2dfe1cdfac15d1427", "title": "OS Utilities"}, "66": {"path": "/operate/utils/os.py:46-85", "hash": "9ca4ebadb3b3962ae2ede57ae4293792", "title": "Automated OS Interaction Utility"}, "67": {"path": "/operate/utils/os.py:85-105", "hash": "5d1e4e0c093ebfdaf7b03293a6a99c95", "title": "Circular Cursor Clicker"}, "68": {"path": "/operate/utils/os.py:107-131", "hash": "4f70563f895d92d3c7b8abf253f6cc41", "title": "Circular Movement Function and Assistant Message Retrieval"}, "69": {"path": "/operate/utils/screenshot.py", "hash": "90ce939cd173961e5a08c59cbe69dc8f", "title": "Screenshot Capture Utilities"}, "70": {"path": "/operate/utils/screenshot.py:1-39", "hash": "2ed1a42e53589686c334755d1dfff9cf", "title": "Grid Image Overlay"}, "71": {"path": "/operate/utils/screenshot.py:41-63", "hash": "5d94a4c0eb915db6948229b3a80bdff4", "title": "Background Rectangle and Grid Lines Generator"}, "72": {"path": "/operate/utils/screenshot.py:64-92", "hash": "176575b89de4763ec7a3aa300ba0b82f", "title": "Grid Screenshot Labeler"}, "73": {"path": "/operate/utils/screenshot.py:93-114", "hash": "c1de3d2bb95ef77b9721bf4ce2fd8368", "title": "Screenshot Capture Utility"}, "74": {"path": 
"/operate/utils/screenshot.py:115-143", "hash": "3d7b020d28a3fe6b44899e4522a6e863", "title": "Cross-Platform Screenshot Capture"}, "75": {"path": "/operate/utils/screenshot.py:144-178", "hash": "5de34d85cbb8ccc24e0cf57280d163e9", "title": "Cross-Platform Screenshot Capture Utility"}, "76": {"path": "/operate/utils/screenshot.py:179-182", "hash": "9b2abc85c0106daaaefac2325b90fc7c", "title": "Cross-Platform Screenshot and Cursor Capture Utility"}, "77": {"path": "/operate/utils/style.py", "hash": "ed587651bbe7c27ddabee832a16b492c", "title": "UI Style Configuration with PromptStyle"}, "78": {"path": "/operate/utils/style.py:1-34", "hash": "b786525e0df1320220692df064285b2d", "title": "Dialog and UI Styles with PromptStyle"}, "79": {"path": "/operate/utils/style.py:35-36", "hash": "723e74b69583b7713592415d8e097dd2", "title": "Detect Terminal Color Capabilities"}, "80": {"path": "/requirements-audio.txt", "hash": "d7f6b350ada5f0d2fa77095943fa5c98", "title": "Whisper Mic Requirements"}, "81": {"path": "/requirements.txt", "hash": "5ba31a8c2dca3df2b8fb5fe5075416b7", "title": "Python Packages for Project"}, "82": {"path": "/requirements.txt:1-50", "hash": "715045b6e5b276aadf63374adbfbbde7", "title": "Python Package Dependencies List"}, "83": {"path": "/requirements.txt:51-52", "hash": "45bfdb73c9749e654fa2b1fe17dcaab3", "title": "Project Libraries: aiohttp, ultralytics"}, "84": {"path": "/run.sh", "hash": "f6ba03ba77cee9c964f4a03260c51a27", "title": "SOC Linux Install Script"}, "85": {"path": "/run.sh:1-48", "hash": "e5cedb1ea200309d31978d3d03e934b6", "title": "SOC Linux Installation Script"}, "86": {"path": "/run.sh:49-71", "hash": "e2aed4e34a68f996aa2843cf406bdfa2", "title": "Universal Software Installer"}, "87": {"path": "/run.sh:72-115", "hash": "5d1ba14311b8a89212234ce790f61f87", "title": "Automating Python Project Setup"}, "88": {"path": "/run.sh:117-143", "hash": "631882f4968344cbcbd906fb3007ed88", "title": "OpenAI API Key Configurator"}, "89": {"path": "/run.sh:144-155", "hash": "4742d4a7a0b934a691c111136ae81e7f", "title": "MacOS Installation Check"}, "90": {"path": "/setup.py", "hash": "30346e34e45eb4025e616bfdba88d87b", "title": "Setting up 'self-operating-computer'"}}} \ No newline at end of file diff --git a/docs/codeview.html b/docs/codeview.html new file mode 100644 index 00000000..761065f4 --- /dev/null +++ b/docs/codeview.html @@ -0,0 +1,669 @@ + + + + + + + + + Code View + + + + + + + + + + + + + + + + + + + + +
+ [codeview.html: static "Code Preview" page for browsing the generated code summaries; page markup not shown]
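The generated cache follows a small, regular schema: `docs/cache_title.json` (added above) maps numeric entry IDs to `{path, hash, title}` records, while the chunk files under `docs/data/` (added below) map IDs to `{file_id, content, type, location}` records, where `type` is one of `filepath`, `summary`, `code`, or `comment`. The sketch below shows one way these two files could be read together; the script and its join-by-location logic are illustrative assumptions, not code shipped in this change.

```python
import json
from pathlib import Path

DOCS = Path("docs")  # assumed location of the generated cache files


def titles_by_path(title_file: Path = DOCS / "cache_title.json") -> dict:
    """Map each chunk path (e.g. '/operate/actions.py:52-83') to its generated title."""
    with title_file.open() as f:
        table = json.load(f)["_default"]
    return {entry["path"]: entry["title"] for entry in table.values()}


def iter_chunks(data_dir: Path = DOCS / "data"):
    """Yield (entry_id, chunk) pairs from every docs/data/*.json file."""
    for data_file in sorted(data_dir.glob("*.json")):
        with data_file.open() as f:
            yield from json.load(f).items()


if __name__ == "__main__":
    titles = titles_by_path()
    for entry_id, chunk in iter_chunks():
        if chunk["type"] == "code":  # code chunks carry a 'location' such as '/evaluate.py:1-31'
            location = chunk.get("location", "")
            print(f"{location}: {titles.get(location, '<untitled>')}")
```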
+ + + \ No newline at end of file diff --git a/docs/data/0.json b/docs/data/0.json new file mode 100644 index 00000000..12f423be --- /dev/null +++ b/docs/data/0.json @@ -0,0 +1,544 @@ +{ + "0": { + "file_id": 0, + "content": "/README.md", + "type": "filepath" + }, + "1": { + "file_id": 0, + "content": "The Self-Operating Computer Framework is a multimodal model project that enhances computer operation similar to humans, focusing on improving mouse click predictions and API access. It is compatible with Mac OS, Windows, and Linux (with X server installed), and requires at least $5 in API credits for the gpt-4-vision-preview model.", + "type": "summary" + }, + "2": { + "file_id": 0, + "content": "

Self-Operating Computer Framework\n A framework to enable multimodal models to operate a computer.\n Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. \n
\n\n## Key Features\n- **Compatibility**: Designed for various multimodal models.\n- **Integration**: Currently integrated with **GPT-4v** as the default model, with extended support for Gemini Pro Vision.\n- **Future Plans**: Support for additional models.\n## Current Challenges\n> **Note:** GPT-4V's error rate in est", + "type": "code", + "location": "/README.md:1-26" + }, + "3": { + "file_id": 0, + "content": "Self-Operating Computer Framework, a framework for multimodal models to operate a computer like a human.", + "type": "comment" + }, + "4": { + "file_id": 0, + "content": "imating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.\n## Ongoing Development\nAt [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.\n## Agent-1-Vision Model API Access\nWe will soon be offering API access to our Agent-1-Vision model.\nIf you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).\n### Additional Thoughts\nWe recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. However, it's important to note that many actions require the accurate selection of visual", + "type": "code", + "location": "/README.md:26-37" + }, + "5": { + "file_id": 0, + "content": "This code is a brief overview of the \"self-operating-computer\" project, focusing on the development of the Agent-1-Vision multimodal model for improved mouse click location predictions. It also mentions the upcoming API access and the plans to improve hotkey-based functionality over time.", + "type": "comment" + }, + "6": { + "file_id": 0, + "content": " elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape.\n## Demo\nhttps://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0\n## Quick Start Instructions\nBelow are instructions to set up the Self-Operating Computer Framework locally on your computer.\n### Option 1: Traditional Installation\n1. **Clone the repo** to a directory on your computer:\n```\ngit clone https://github.com/OthersideAI/self-operating-computer.git\n```\n2. **Cd into directory**:\n```\ncd self-operating-computer\n```\n3. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html).\n```\npython3 -m venv venv\n```\n4. **Activate the virtual environment**:\n```\nsource venv/bin/activate\n```\n5. **Install Project Requi", + "type": "code", + "location": "/README.md:37-67" + }, + "7": { + "file_id": 0, + "content": "This code explains that the primary focus of the project is refining the accuracy of determining mouse click locations, which is essential for a fully self-operating computer. 
It also provides links to a demo and quick start instructions for setting up the Self-Operating Computer Framework locally on your computer.", + "type": "comment" + }, + "8": { + "file_id": 0, + "content": "rements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:**\n```\npip install self-operating-computer\n```\n6. **Then rename the `.example.env` file to `.env` so that you can save your OpenAI key in it.**\n```\nmv .example.env .env\n``` \n7. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**:\n```\nOPENAI_API_KEY='your-key-here'\n```\n8. **Run it**!\n```\noperate\n```\n9. **Final Step**: As a last step, the Terminal app will ask for permission for \"Screen Recording\" and \"Accessibility\" in the \"Security & Privacy\" page of Mac's \"System Preferences\".\n
\n \n ", "type": "code", "location": "/README.md:67-88" }, "9": { "file_id": 0, "content": "Code snippet 1:\n```python\npip install self-operating-computer\n```\nInstall the project directly from PyPI.\n\nCode snippet 2:\n```bash\nmv .example.env .env\n```\nRename `.example.env` to `.env`.\n\nCode snippet 3:\n```bash\nOPENAI_API_KEY='your-key-here'\n```\nAdd your OpenAI key to the new `.env` file.\n\nCode snippet 4:\n```bash\noperate\n```\nRun the program!\n\nCode snippet 5:\nFinal step: Mac users grant permission for \"Screen Recording\" and \"Accessibility\".", "type": "comment" }, "10": { "file_id": 0, "content": "
\n### Option 2: Installation using .sh script\n1. **Clone the repo** to a directory on your computer:\n```\ngit clone https://github.com/OthersideAI/self-operating-computer.git\n```\n2. **Cd into directory**:\n```\ncd self-operating-computer\n```\n3. **Run the installation script**: \n```\n./run.sh\n```\n## Using `operate` Modes\n### Multimodal Models `-m`\nAn additional model is now compatible with the Self Operating Computer Framework. Try Google's `gemini-pro-vision` by following the instructions below. \n**Add your Google AI Studio API key to your .env file.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR:\n```\nGOOGLE_API_KEY='your-key-here'\n```\nStart `operate` with the Gemini model\n```\noperate -m gemini-pro-vision\n```", + "type": "code", + "location": "/README.md:89-124" + }, + "11": { + "file_id": 0, + "content": "This code provides instructions for installing the Self Operating Computer Framework using a .sh script. It also explains how to add and use Google's `gemini-pro-vision` model within the framework.", + "type": "comment" + }, + "12": { + "file_id": 0, + "content": "### Voice Mode `--voice`\nThe framework supports voice inputs for the objective. Try voice by following the instructions below. \nInstall the additional `requirements-audio.txt`\n```\npip install -r requirements-audio.txt\n```\n**Install device requirements**\nFor mac users:\n```\nbrew install portaudio\n```\nFor Linux users:\n```\nsudo apt install portaudio19-dev python3-pyaudio\n```\nRun with voice mode\n```\noperate --voice\n```\n## Contributions are Welcomed!:\nIf you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md).\n## Feedback\nFor any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter. \n## Join Our Discord Community\nFor real-time discussions and community support, join our Discord server. \n- If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).\n- If you're new, first [join our Discord Server", + "type": "code", + "location": "/README.md:126-159" + }, + "13": { + "file_id": 0, + "content": "This code is providing instructions on how to enable voice mode in the self-operating-computer framework. The user must install additional audio requirements and device dependencies, then run the operate command with the --voice flag. Contributions are welcomed, and feedback or questions can be directed to Josh on Twitter. 
Joining the Discord community is also encouraged for real-time discussions and support.", + "type": "comment" + }, + "14": { + "file_id": 0, + "content": "](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).\n## Follow HyperWriteAI for More Updates\nStay updated with the latest developments:\n- Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI).\n- Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).\n## Compatibility\n- This project is compatible with Mac OS, Windows, and Linux (with X server installed).\n## OpenAI Rate Limiting Note\nThe ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \\$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \\$5. \nLearn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**", + "type": "code", + "location": "/README.md:159-172" + }, + "15": { + "file_id": 0, + "content": "Join the Discord server and visit #self-operating-computer channel. Follow HyperWriteAI for updates, compatible with Mac OS, Windows, and Linux (with X server installed). The gpt-4-vision-preview model requires at least $5 in API credits.", + "type": "comment" + }, + "16": { + "file_id": 1, + "content": "/evaluate.py", + "type": "filepath" + }, + "17": { + "file_id": 1, + "content": "The code uses GPT-4 Vision model to evaluate image adherence to guidelines, displays results with color-coded messages after setting up test cases and formatting prompts. It also checks the result of an objective, prints outcome (PASS or FAIL) along with passed/failed tests count, and resets colors for readability.", + "type": "summary" + }, + "18": { + "file_id": 1, + "content": "import sys\nimport os\nimport subprocess\nimport platform\nimport base64\nimport json\nimport openai\nfrom dotenv import load_dotenv\n# \"Objective for `operate`\" : \"Guideline for passing this test case given to GPT-4v\"\nTEST_CASES = {\n \"Go to Github.com\": \"The Github home page is visible.\",\n \"Go to Youtube.com and play a video\": \"The YouTube video player is visible.\",\n}\nEVALUATION_PROMPT = \"\"\"\nYour job is to look at the given screenshot and determine if the following guideline is met in the image.\nYou must respond in the following format ONLY. Do not add anything else:\n{{ \"guideline_met\": (true|false), \"reason\": \"Explanation for why guideline was or wasn't met\" }}\nguideline_met must be set to a JSON boolean. True if the image meets the given guideline.\nreason must be a string containing a justification for your decision.\nGuideline: {guideline}\n\"\"\"\nSUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')\n# Check if on a windows terminal that supports ANSI escape codes\ndef supports_ansi():\n \"\"\"", + "type": "code", + "location": "/evaluate.py:1-31" + }, + "19": { + "file_id": 1, + "content": "The code is importing necessary libraries and defining constants for the evaluation process. 
It appears to be setting up a test case dictionary and a function to determine if a given guideline is met in an image based on a screenshot.", + "type": "comment" + }, + "20": { + "file_id": 1, + "content": " Check if the terminal supports ANSI escape codes\n \"\"\"\n plat = platform.system()\n supported_platform = plat != \"Windows\" or \"ANSICON\" in os.environ\n is_a_tty = hasattr(sys.stdout, \"isatty\") and sys.stdout.isatty()\n return supported_platform and is_a_tty\nif supports_ansi():\n # Standard green text\n ANSI_GREEN = \"\\033[32m\"\n # Bright/bold green text\n ANSI_BRIGHT_GREEN = \"\\033[92m\"\n # Reset to default text color\n ANSI_RESET = \"\\033[0m\"\n # ANSI escape code for blue text\n ANSI_BLUE = \"\\033[94m\" # This is for bright blue\n # Standard yellow text\n ANSI_YELLOW = \"\\033[33m\"\n ANSI_RED = \"\\033[31m\"\n # Bright magenta text\n ANSI_BRIGHT_MAGENTA = \"\\033[95m\"\nelse:\n ANSI_GREEN = \"\"\n ANSI_BRIGHT_GREEN = \"\"\n ANSI_RESET = \"\"\n ANSI_BLUE = \"\"\n ANSI_YELLOW = \"\"\n ANSI_RED = \"\"\n ANSI_BRIGHT_MAGENTA = \"\"\ndef format_evaluation_prompt(guideline):\n prompt = EVALUATION_PROMPT.format(guideline=guideline)\n return prompt\ndef parse_eval_content(content):\n try:\n res = json.loads(content)", + "type": "code", + "location": "/evaluate.py:32-73" + }, + "21": { + "file_id": 1, + "content": "This code checks if the terminal supports ANSI escape codes and sets corresponding colors based on the platform. If supported, it defines various colored text variables. Otherwise, it sets them to empty strings. The code also includes functions for formatting an evaluation prompt and parsing evaluation content.", + "type": "comment" + }, + "22": { + "file_id": 1, + "content": " print(res[\"reason\"])\n return res[\"guideline_met\"]\n except:\n print(\"The model gave a bad evaluation response and it couldn't be parsed. Exiting...\")\n exit(1)\ndef evaluate_summary_screenshot(guideline):\n '''Load the summary screenshot and return True or False if it meets the given guideline.'''\n with open(SUMMARY_SCREENSHOT_PATH, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n eval_message = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": format_evaluation_prompt(guideline)},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }]\n response = openai.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=eval_message,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300,", + "type": "code", + "location": "/evaluate.py:75-105" + }, + "23": { + "file_id": 1, + "content": "Code function: evaluate_summary_screenshot\nPurpose: Evaluate if the summary screenshot meets a given guideline\nActions: \n1. Loads the summary screenshot\n2. Encodes it in base64 format\n3. Creates an evaluation message with text and image\n4. 
Sends the message to OpenAI's GPT-4 Vision model for evaluation", + "type": "comment" + }, + "24": { + "file_id": 1, + "content": " )\n eval_content = response.choices[0].message.content\n return parse_eval_content(eval_content)\ndef run_test_case(objective, guideline):\n '''Returns True if the result of the test with the given prompt meets the given guideline.'''\n # Run `operate` with the test case prompt\n subprocess.run(['operate', '--prompt', f'\"{objective}\"'], stdout=subprocess.DEVNULL)\n try:\n result = evaluate_summary_screenshot(guideline)\n except(OSError):\n print(\"Couldn't open the summary screenshot\")\n return False\n return result\ndef main():\n load_dotenv()\n openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n print(f\"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}\")\n passed = 0; failed = 0\n for objective, guideline in TEST_CASES.items():\n print(f\"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'\")\n result = run_test_case(objective, guideline)\n if result:\n print(f\"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'\")\n passed += 1", + "type": "code", + "location": "/evaluate.py:106-140" + }, + "25": { + "file_id": 1, + "content": "The code evaluates whether a test case meets its given guideline. It runs the \"operate\" function with the test case prompt and then calls the \"evaluate_summary_screenshot\" function to compare the result against the guideline. If the operation is successful, it prints a success message; otherwise, it prints an error message. The code loops through all the TEST_CASES, counts the number of passed and failed tests, and finally displays the results in color-coded messages.", + "type": "comment" + }, + "26": { + "file_id": 1, + "content": " else:\n print(f\"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'\")\n failed += 1\n print(\n f\"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed\"\n )\nif __name__ == \"__main__\":\n main()", + "type": "code", + "location": "/evaluate.py:141-150" + }, + "27": { + "file_id": 1, + "content": "The code snippet checks the result of an objective and prints the outcome (PASS or FAIL) along with the count of passed and failed tests. It resets colors for readability.", + "type": "comment" + }, + "28": { + "file_id": 2, + "content": "/operate/actions.py", + "type": "filepath" + }, + "29": { + "file_id": 2, + "content": "A code that utilizes AI prompts, computer vision, and OpenAI's chat completions API for generating content, including screenshots, messages, and base64 encoding images. 
The function captures screenshots, formats prompts, fetches asynchronous responses, extracts data, handles exceptions, and returns errors or missing labels.", + "type": "summary" + }, + "30": { + "file_id": 2, + "content": "import os\nimport time\nimport json\nimport base64\nimport re\nimport io\nimport asyncio\nimport aiohttp\nfrom PIL import Image\nfrom ultralytics import YOLO\nimport google.generativeai as genai\nfrom operate.settings import Config\nfrom operate.exceptions import ModelNotRecognizedException\nfrom operate.utils.screenshot import (\n capture_screen_with_cursor,\n add_grid_to_image,\n capture_mini_screenshot_with_cursor,\n)\nfrom operate.utils.os import get_last_assistant_message\nfrom operate.prompts import (\n format_vision_prompt,\n format_accurate_mode_vision_prompt,\n format_summary_prompt,\n format_decision_prompt,\n format_label_prompt,\n)\nfrom operate.utils.label import (\n add_labels,\n parse_click_content,\n get_click_position_in_percent,\n get_label_coordinates,\n)\nfrom operate.utils.style import (\n ANSI_GREEN,\n ANSI_RED,\n ANSI_RESET,\n)\n# Load configuration\nconfig = Config()\nclient = config.initialize_openai_client()\nyolo_model = YOLO(\"./operate/model/weights/best.pt\") # Load your trained model\nasync def get_next_action(model, messages, objective):", + "type": "code", + "location": "/operate/actions.py:1-51" + }, + "31": { + "file_id": 2, + "content": "Code imports various libraries and defines a function get_next_action that takes in model, messages, and objective as parameters. The code also loads a pre-trained YOLO model and initializes an OpenAI client using the configuration.", + "type": "comment" + }, + "32": { + "file_id": 2, + "content": " if model == \"gpt-4\":\n return call_gpt_4_v(messages, objective)\n if model == \"gpt-4-with-som\":\n return await call_gpt_4_v_labeled(messages, objective)\n elif model == \"agent-1\":\n return \"coming soon\"\n elif model == \"gemini-pro-vision\":\n return call_gemini_pro_vision(messages, objective)\n raise ModelNotRecognizedException(model)\ndef call_gpt_4_v(messages, objective):\n \"\"\"\n Get the next action for Self-Operating Computer\n \"\"\"\n # sleep for a second\n time.sleep(1)\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n new_screenshot_filename = os.path.join(\n \"screenshots\", \"screenshot_with_grid.png\"\n )\n add_grid_to_image(screenshot_filename, new_screenshot_filename, 500)", + "type": "code", + "location": "/operate/actions.py:52-83" + }, + "33": { + "file_id": 2, + "content": "This code checks the model parameter and calls different functions based on its value. For example, if the model is \"gpt-4\", it calls the `call_gpt_4_v` function with messages and objective parameters. 
It also captures a screenshot of the computer screen with the cursor.", + "type": "comment" + }, + "34": { + "file_id": 2, + "content": " # sleep for a second\n time.sleep(1)\n with open(new_screenshot_filename, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n previous_action = get_last_assistant_message(messages)\n vision_prompt = format_vision_prompt(objective, previous_action)\n vision_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": vision_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }\n # create a copy of messages and save to pseudo_messages\n pseudo_messages = messages.copy()\n pseudo_messages.append(vision_message)\n response = client.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=pseudo_messages,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300,", + "type": "code", + "location": "/operate/actions.py:84-115" + }, + "35": { + "file_id": 2, + "content": "Sleeps for 1 second, reads screenshot file, encodes image in base64, formats vision prompt with previous action, creates a vision message with the prompt and image, makes a copy of messages list, appends vision message to copied list, and then calls the OpenAI API with the updated messages list.", + "type": "comment" + }, + "36": { + "file_id": 2, + "content": " )\n messages.append(\n {\n \"role\": \"user\",\n \"content\": \"`screenshot.png`\",\n }\n )\n content = response.choices[0].message.content\n return content\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"Failed take action after looking at the screenshot\"\ndef call_gemini_pro_vision(messages, objective):\n \"\"\"\n Get the next action for Self-Operating Computer using Gemini Pro Vision\n \"\"\"\n # sleep for a second\n time.sleep(1)\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n new_screenshot_filename = os.path.join(\n \"screenshots\", \"screenshot_with_grid.png\"\n )\n add_grid_to_image(screenshot_filename, new_screenshot_filename, 500)", + "type": "code", + "location": "/operate/actions.py:116-153" + }, + "37": { + "file_id": 2, + "content": "The code is capturing a screenshot with the cursor and adding a grid overlay to the image. It then appends a message containing the filename to the messages list and returns the content of the first response choice's message. 
If an exception occurs during JSON parsing, it will print an error message and return a failure message.", + "type": "comment" + }, + "38": { + "file_id": 2, + "content": " # sleep for a second\n time.sleep(1)\n previous_action = get_last_assistant_message(messages)\n vision_prompt = format_vision_prompt(objective, previous_action)\n model = genai.GenerativeModel(\"gemini-pro-vision\")\n response = model.generate_content(\n [vision_prompt, Image.open(new_screenshot_filename)]\n )\n # create a copy of messages and save to pseudo_messages\n pseudo_messages = messages.copy()\n pseudo_messages.append(response.text)\n messages.append(\n {\n \"role\": \"user\",\n \"content\": \"`screenshot.png`\",\n }\n )\n content = response.text[1:]\n return content\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"Failed take action after looking at the screenshot\"\n# This function is not used. `-accurate` mode was removed for now until a new PR fixes it.\ndef accurate_mode_double_check(model, pseudo_messages, prev_x, prev_y):\n \"\"\"\n", + "type": "code", + "location": "/operate/actions.py:154-189" + }, + "39": { + "file_id": 2, + "content": "The code is making a computer vision model generate an action based on the screenshot, and then append the response to the messages list. If there's an exception while parsing JSON, it prints the error message and returns a failure message. The `accurate_mode_double_check` function is currently not used.", + "type": "comment" + }, + "40": { + "file_id": 2, + "content": " Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location\n \"\"\"\n try:\n screenshot_filename = os.path.join(\"screenshots\", \"screenshot_mini.png\")\n capture_mini_screenshot_with_cursor(\n file_path=screenshot_filename, x=prev_x, y=prev_y\n )\n new_screenshot_filename = os.path.join(\n \"screenshots\", \"screenshot_mini_with_grid.png\"\n )\n with open(new_screenshot_filename, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y)\n accurate_mode_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": accurate_vision_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }", + "type": "code", + "location": "/operate/actions.py:189-215" + }, + "41": { + "file_id": 2, + "content": "This code takes a mini screenshot centered around the cursor and adds it to an AI prompt with text instructions. 
The image is encoded in base64 format and included in the prompt for further fine-tuning of clicked location.", + "type": "comment" + }, + "42": { + "file_id": 2, + "content": " pseudo_messages.append(accurate_mode_message)\n response = client.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=pseudo_messages,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300,\n )\n content = response.choices[0].message.content\n except Exception as e:\n print(f\"Error reprompting model for accurate_mode: {e}\")\n return \"ERROR\"\ndef summarize(model, messages, objective):\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"summary_screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n summary_prompt = format_summary_prompt(objective)\n if model == \"gpt-4-vision-preview\":\n with open(screenshot_filename, \"rb\") as img_file:", + "type": "code", + "location": "/operate/actions.py:217-248" + }, + "43": { + "file_id": 2, + "content": "Code snippet creates a prompt for the GPT-4 vision model using screenshots and text messages, then calls the \"capture_screen_with_cursor\" function.", + "type": "comment" + }, + "44": { + "file_id": 2, + "content": " img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n summary_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": summary_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }\n # create a copy of messages and save to pseudo_messages\n messages.append(summary_message)\n response = client.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=messages,\n max_tokens=500,\n )\n content = response.choices[0].message.content\n elif model == \"gemini-pro-vision\":\n model = genai.GenerativeModel(\"gemini-pro-vision\")\n summary_message = model.generate_content(\n [summary_prompt, Image.open(screenshot_filename)]\n )", + "type": "code", + "location": "/operate/actions.py:249-275" + }, + "45": { + "file_id": 2, + "content": "The code is preparing input for a generative AI model. 
It encodes an image in base64 and combines it with a text prompt to create a summary message, then passes this message along with the chosen AI model (either gpt-4-vision-preview or gemini-pro-vision) to generate content from the summary.", + "type": "comment" + }, + "46": { + "file_id": 2, + "content": " content = summary_message.text\n return content\n except Exception as e:\n print(f\"Error in summarize: {e}\")\n return \"Failed to summarize the workflow\"\nasync def call_gpt_4_v_labeled(messages, objective):\n time.sleep(1)\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n with open(screenshot_filename, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n previous_action = get_last_assistant_message(messages)\n img_base64_labeled, img_base64_original, label_coordinates = add_labels(\n img_base64, yolo_model\n )\n decision_prompt = format_decision_prompt(objective, previous_action)\n labeled_click_prompt = format_label_prompt(objective)", + "type": "code", + "location": "/operate/actions.py:276-305" + }, + "47": { + "file_id": 2, + "content": "This function calls GPT-4 with a labeled image and a prompt for decision making. It first captures a screenshot of the current desktop with the cursor, encodes it in base64 format, and adds labels to the image using the YOLO model. Then, it formats prompts for the user's decision and the GPT-4 labeling task.", + "type": "comment" + }, + "48": { + "file_id": 2, + "content": " click_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": labeled_click_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_base64_labeled}\"\n },\n },\n ],\n }\n decision_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": decision_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_base64_original}\"\n },\n },\n ],\n }\n click_messages = messages.copy()\n click_messages.append(click_message)\n decision_messages = messages.copy()\n decision_messages.append(decision_message)\n click_future = fetch_openai_response_async(click_messages)\n decision_future = fetch_openai_response_async(decision_messages)", + "type": "code", + "location": "/operate/actions.py:307-338" + }, + "49": { + "file_id": 2, + "content": "Creates user messages with labeled click prompt and decision prompt, appends to message lists, and fetches OpenAI response asynchronously.", + "type": "comment" + }, + "50": { + "file_id": 2, + "content": " click_response, decision_response = await asyncio.gather(\n click_future, decision_future\n )\n # Extracting the message content from the ChatCompletionMessage object\n click_content = click_response.get(\"choices\")[0].get(\"message\").get(\"content\")\n decision_content = (\n decision_response.get(\"choices\")[0].get(\"message\").get(\"content\")\n )\n if not decision_content.startswith(\"CLICK\"):\n return decision_content\n label_data = parse_click_content(click_content)\n if label_data and \"label\" in label_data:\n coordinates = get_label_coordinates(label_data[\"label\"], label_coordinates)\n image = Image.open(\n io.BytesIO(base64.b64decode(img_base64))\n ) # Load the image to get its size\n image_size = image.size # Get 
the size of the image (width, height)\n click_position_percent = get_click_position_in_percent(\n coordinates, image_size\n )", + "type": "code", + "location": "/operate/actions.py:340-364" + }, + "51": { + "file_id": 2, + "content": "This code fetches two responses from an API, extracts the message content, checks if it starts with \"CLICK\", gets label data and its coordinates, opens the image, retrieves its size, and calculates the click position in percent.", + "type": "comment" + }, + "52": { + "file_id": 2, + "content": " if not click_position_percent:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}\"\n )\n return call_gpt_4_v(messages, objective)\n x_percent = f\"{click_position_percent[0]:.2f}%\"\n y_percent = f\"{click_position_percent[1]:.2f}%\"\n click_action = f'CLICK {{ \"x\": \"{x_percent}\", \"y\": \"{y_percent}\", \"description\": \"{label_data[\"decision\"]}\", \"reason\": \"{label_data[\"reason\"]}\" }}'\n else:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] No label found. Trying another method {ANSI_RESET}\"\n )\n return call_gpt_4_v(messages, objective)\n return click_action\n except Exception as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. Trying another method {ANSI_RESET}\"\n )\n return call_gpt_4_v(messages, objective)", + "type": "code", + "location": "/operate/actions.py:365-387" + }, + "53": { + "file_id": 2, + "content": "The code tries to perform a click action based on label data. If the click position percent or label is not found, it prints an error message and calls another method. It also handles exceptions and returns to try another method.", + "type": "comment" + }, + "54": { + "file_id": 2, + "content": "async def fetch_openai_response_async(messages):\n url = \"https://api.openai.com/v1/chat/completions\"\n headers = {\n \"Content-Type\": \"application/json\",\n \"Authorization\": f\"Bearer {config.openai_api_key}\",\n }\n data = {\n \"model\": \"gpt-4-vision-preview\",\n \"messages\": messages,\n \"frequency_penalty\": 1,\n \"presence_penalty\": 1,\n \"temperature\": 0.7,\n \"max_tokens\": 300,\n }\n async with aiohttp.ClientSession() as session:\n async with session.post(\n url, headers=headers, data=json.dumps(data)\n ) as response:\n return await response.json()", + "type": "code", + "location": "/operate/actions.py:390-409" + }, + "55": { + "file_id": 2, + "content": "This function makes an asynchronous API call to OpenAI's chat completions endpoint to fetch a response based on the provided messages.", + "type": "comment" + }, + "56": { + "file_id": 3, + "content": "/operate/dialog.py", + "type": "filepath" + }, + "57": { + "file_id": 3, + "content": "Both comments discuss code that handles user input and executes corresponding actions, with Comment A focusing on a Self-Operating Computer setup and error handling, while Comment B focuses on input parameter checks for dialog operations.", + "type": "summary" + }, + "58": { + "file_id": 3, + "content": "import sys\nimport os\nimport platform\nimport asyncio\nfrom prompt_toolkit.shortcuts import message_dialog\nfrom prompt_toolkit import prompt\nfrom operate.exceptions import ModelNotRecognizedException\nfrom operate.prompts import USER_QUESTION\nfrom operate.settings import Config\nfrom operate.utils.style import (\n ANSI_GREEN,\n ANSI_RESET,\n ANSI_BLUE,\n ANSI_YELLOW,\n ANSI_RED,\n ANSI_BRIGHT_MAGENTA,\n style,\n)\nfrom 
operate.utils.os import (\n keyboard_type,\n search,\n click,\n)\nfrom operate.actions import get_next_action, summarize\nfrom operate.utils.misc import parse_response\n# Load configuration\nconfig = Config()\ndef main(model, terminal_prompt, voice_mode=False):\n \"\"\"\n Main function for the Self-Operating Computer.\n Parameters:\n - model: The model used for generating responses.\n - terminal_prompt: A string representing the prompt provided in the terminal.\n - voice_mode: A boolean indicating whether to enable voice mode.\n Returns:\n None\n \"\"\"\n mic = None\n # Initialize `WhisperMic`, if `voice_mode` is True", + "type": "code", + "location": "/operate/dialog.py:1-44" + }, + "59": { + "file_id": 3, + "content": "This code appears to be part of a Self-Operating Computer, which uses a model for generating responses. The main function takes in the model, terminal prompt, and voice mode as parameters. It initializes `WhisperMic` if voice mode is enabled.", + "type": "comment" + }, + "60": { + "file_id": 3, + "content": " validation(model, voice_mode)\n if voice_mode:\n try:\n from whisper_mic import WhisperMic\n # Initialize WhisperMic if import is successful\n mic = WhisperMic()\n except ImportError:\n print(\n \"Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'\"\n )\n sys.exit(1)\n # Skip message dialog if prompt was given directly\n if not terminal_prompt:\n message_dialog(\n title=\"Self-Operating Computer\",\n text=\"Ask a computer to do anything.\",\n style=style,\n ).run()\n else:\n print(\"Running direct prompt...\")\n print(\"SYSTEM\", platform.system())\n # Clear the console\n if platform.system() == \"Windows\":\n os.system(\"cls\")\n else:\n print(\"\\033c\", end=\"\")\n if terminal_prompt: # Skip objective prompt if it was given as an argument\n objective = terminal_prompt\n elif voice_mode:\n print(", + "type": "code", + "location": "/operate/dialog.py:46-80" + }, + "61": { + "file_id": 3, + "content": "Checks if voice mode is enabled, then tries to import and initialize the WhisperMic module. If the module is missing, it prints an error message and exits. Displays a message dialog unless the prompt was given directly via terminal. Skips objective prompt if provided as an argument or prompts for input through the WhisperMic in voice mode. Clears the console on all operating systems except Windows where it uses \"cls\" command.", + "type": "comment" + }, + "62": { + "file_id": 3, + "content": " f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... 
(speak now)\"\n )\n try:\n objective = mic.listen()\n except Exception as e:\n print(f\"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}\")\n return # Exit if voice input fails\n else:\n print(f\"{ANSI_GREEN}[Self-Operating Computer]\\n{ANSI_RESET}{USER_QUESTION}\")\n print(f\"{ANSI_YELLOW}[User]{ANSI_RESET}\")\n objective = prompt(style=style)\n assistant_message = {\"role\": \"assistant\", \"content\": USER_QUESTION}\n user_message = {\n \"role\": \"user\",\n \"content\": f\"Objective: {objective}\",\n }\n messages = [assistant_message, user_message]\n loop_count = 0\n while True:\n if config.debug:\n print(\"[loop] messages before next action:\\n\\n\\n\", messages[1:])\n try:\n response = asyncio.run(get_next_action(model, messages, objective))\n action = parse_response(response)\n action_type = action.get(\"type\")", + "type": "code", + "location": "/operate/dialog.py:81-109" + }, + "63": { + "file_id": 3, + "content": "The code is capturing voice input from the microphone and storing it in the \"objective\" variable. If an error occurs while capturing voice input, it will print an error message and exit. Otherwise, it prints a message from the self-operating computer and the user's question, then stores the objective as the user's message content. It then enters a loop where it waits for the next action by calling a function \"get_next_action\" with the current messages and objective. If an error occurs while waiting for the next action, it will print an error message.", + "type": "comment" + }, + "64": { + "file_id": 3, + "content": " action_detail = action.get(\"data\")\n except ModelNotRecognizedException as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}\"\n )\n break\n except Exception as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}\"\n )\n break\n if action_type == \"DONE\":\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}\"\n )\n summary = summarize(model, messages, objective)\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\\n{ANSI_RESET}{summary}\"\n )\n break\n if action_type != \"UNKNOWN\":\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} {ANSI_RESET}{action_detail}\"\n )\n function_response = \"\"\n if action_type == \"SEARCH\":", + "type": "code", + "location": "/operate/dialog.py:110-139" + }, + "65": { + "file_id": 3, + "content": "The code is handling exceptions for a ModelNotRecognizedException and any other exception that occurs during the execution. It then checks if the action_type is \"DONE\", if so, it prints a completion message, summarizes the model, and exits. 
If the action_type is not unknown, it prints an act message along with the action type and detail, and initializes an empty function_response variable if the action type is \"SEARCH\".", + "type": "comment" + }, + "66": { + "file_id": 3, + "content": " function_response = search(action_detail)\n elif action_type == \"TYPE\":\n function_response = keyboard_type(action_detail)\n elif action_type == \"CLICK\":\n function_response = click(action_detail)\n else:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}\"\n )\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\\n{ANSI_RESET}{response}\"\n )\n break\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} COMPLETE {ANSI_RESET}{function_response}\"\n )\n message = {\n \"role\": \"assistant\",\n \"content\": function_response,\n }\n messages.append(message)\n loop_count += 1\n if loop_count > 15:\n break\ndef validation(model, voice_mode):\n \"\"\"\n Validate the input parameters for the dialog operation.", + "type": "code", + "location": "/operate/dialog.py:140-171" + }, + "67": { + "file_id": 3, + "content": "This code block checks the action type and performs the corresponding action. If the action type is not recognized, it prints an error message and breaks the loop. It also logs the act completion and updates the messages list for further processing.", + "type": "comment" + }, + "68": { + "file_id": 3, + "content": " Args:\n model (str): The model to be used for the dialog operation.\n voice_mode (bool): Flag indicating whether to use voice mode.\n Raises:\n SystemExit: If the input parameters are invalid.\n \"\"\"\n if voice_mode and not config.openai_api_key:\n print(\"To use voice mode, please add an OpenAI API key\")\n sys.exit(1)\n if model == \"gpt-4-vision-preview\" and not config.openai_api_key:\n print(\"To use `gpt-4-vision-preview` add an OpenAI API key\")\n sys.exit(1)\n if model == \"gemini-pro-vision\" and not config.google_api_key:\n print(\"To use `gemini-pro-vision` add a Google API key\")\n sys.exit(1)", + "type": "code", + "location": "/operate/dialog.py:173-192" + }, + "69": { + "file_id": 3, + "content": "This code checks the input parameters for dialog operation and raises SystemExit if the input parameters are invalid. It also prints a message indicating which API key is missing based on the chosen model.", + "type": "comment" + }, + "70": { + "file_id": 4, + "content": "/operate/exceptions.py", + "type": "filepath" + }, + "71": { + "file_id": 4, + "content": "This code defines a class for an exception that is raised when the model is not recognized. The class has two attributes: \"model\" and \"message\", both of which are set in the constructor. 
It also overrides the \"__str__()\" method to provide a custom string representation of the exception.", + "type": "summary" + }, + "72": { + "file_id": 4, + "content": "class ModelNotRecognizedException(Exception):\n \"\"\"Exception raised for unrecognized models.\n Attributes:\n model -- the unrecognized model\n message -- explanation of the error\n \"\"\"\n def __init__(self, model, message=\"Model not recognized\"):\n self.model = model\n self.message = message\n super().__init__(self.message)\n def __str__(self):\n return f\"{self.message} : {self.model} \"", + "type": "code", + "location": "/operate/exceptions.py:1-15" + }, + "73": { + "file_id": 4, + "content": "This code defines a class for an exception that is raised when the model is not recognized. The class has two attributes: \"model\" and \"message\", both of which are set in the constructor. It also overrides the \"__str__()\" method to provide a custom string representation of the exception.", + "type": "comment" + }, + "74": { + "file_id": 5, + "content": "/operate/main.py", + "type": "filepath" + }, + "75": { + "file_id": 5, + "content": "This code defines the main entry point of the Self-Operating Computer, allowing the user to specify a model and input mode. It uses the argparse module to define command line arguments for the model, voice input mode, and prompt. The main function is then called with these arguments.", + "type": "summary" + }, + "76": { + "file_id": 5, + "content": "\"\"\"\nSelf-Operating Computer\n\"\"\"\nimport argparse\nfrom operate.utils.style import ANSI_BRIGHT_MAGENTA\nfrom operate.dialog import main\ndef main_entry():\n parser = argparse.ArgumentParser(\n description=\"Run the self-operating-computer with a specified model.\"\n )\n parser.add_argument(\n \"-m\",\n \"--model\",\n help=\"Specify the model to use\",\n required=False,\n default=\"gpt-4\",\n )\n # Add a voice flag\n parser.add_argument(\n \"--voice\",\n help=\"Use voice input mode\",\n action=\"store_true\",\n )\n # Allow for direct input of prompt\n parser.add_argument(\n \"--prompt\",\n help=\"Directly input the objective prompt\",\n type=str,\n required=False,\n )\n try:\n args = parser.parse_args()\n main(\n args.model,\n terminal_prompt=args.prompt,\n voice_mode=args.voice,\n )\n except KeyboardInterrupt:\n print(f\"\\n{ANSI_BRIGHT_MAGENTA}Exiting...\")\nif __name__ == \"__main__\":\n main_entry()", + "type": "code", + "location": "/operate/main.py:1-47" + }, + "77": { + "file_id": 5, + "content": "This code defines the main entry point of the Self-Operating Computer, allowing the user to specify a model and input mode. It uses the argparse module to define command line arguments for the model, voice input mode, and prompt. The main function is then called with these arguments.", + "type": "comment" + }, + "78": { + "file_id": 6, + "content": "/operate/prompts.py", + "type": "filepath" + }, + "79": { + "file_id": 6, + "content": "The code provides functions for AI-assisted user interaction with Google Chrome, Docs, and Sheets using prompts like CLICK, TYPE, SEARCH, and DONE. It emphasizes context-based options selection rather than IDs, and offers percentage values for accuracy improvement in the \"percent\" CLICK action by segmenting lines. 
Additionally, it includes functions for formatting different types of prompts used in a vision system, including accurate mode vision prompt, decision prompt, and labeled image prompt, which take specific arguments and format them into predefined prompt templates.", + "type": "summary" + }, + "80": { + "file_id": 6, + "content": "from operate.settings import Config\nconfig = Config()\nmonitor_size = config.monitor_size\n# General user Prompts\nUSER_QUESTION = \"Hello, I can help you with anything. What would you like done?\"\n# constants for the vision prompt\nACCURATE_PIXEL_COUNT = (\n 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big\n)\n# -------------------------\n# VISION PROMPT\n# -------------------------\nVISION_PROMPT = \"\"\"\nYou are a Self-Operating Computer. You use the same operating system as a human.\nFrom looking at the screen and the objective your goal is to take the best next action.\nTo operate the computer you have the four options below.\n1. CLICK - Move mouse and click\n2. TYPE - Type on the keyboard\n3. SEARCH - Search for a program on Mac and open it\n4. DONE - When you completed the task respond with the exact following phrase content\nHere are the response formats below.\n1. CLICK\nResponse: CLICK {{ \"x\": \"percent\", \"y\": \"percent\", \"description\": \"~description here~\", \"reason\": \"~reason here~\" }} \nNote tha", + "type": "code", + "location": "/operate/prompts.py:1-33" + }, + "81": { + "file_id": 6, + "content": "Code is importing Config settings and defining constants for user prompts and vision prompt.", + "type": "comment" + }, + "82": { + "file_id": 6, + "content": "t the percents work where the top left corner is \"x\": \"0%\" and \"y\": \"0%\" and the bottom right corner is \"x\": \"100%\" and \"y\": \"100%\"\n2. TYPE\nResponse: TYPE \n2. SEARCH\nResponse: SEARCH \n3. DONE\nResponse: DONE\nHere are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\nTYPE Hello, I hope you are doing well. I wanted to follow up\n__\nObjective: Open Spotify and play the beatles\nSEARCH Spotify\n__\nObjective: Find an image of a banana\nCLICK {{ \"x\": \"50%\", \"y\": \"60%\", \"description\": \"Click: Google Search field\", \"reason\": \"This will allow me to search for a banana\" }}\n__\nObjective: Go buy a book about the history of the internet\nTYPE https://www.amazon.com/\n__\nA few important notes:\n- Default to opening Google Chrome with SEARCH to find things that are on the internet.\n- Go to Google Docs and Google Sheets by typing in the Chrome Address bar\n- When opening Chrome, if you see a profile icon click that to open chrome fully, it is located at: {{ \"x\": \"50%\", \"y\": \"55%\" }}", + "type": "code", + "location": "/operate/prompts.py:33-63" + }, + "83": { + "file_id": 6, + "content": "The code provides instructions for interacting with the computer, including typing, searching, and clicking. It also includes tips for using specific applications like Google Chrome, Google Docs, and Google Sheets.", + "type": "comment" + }, + "84": { + "file_id": 6, + "content": "- The Chrome address bar is generally at: {{ \"x\": \"50%\", \"y\": \"9%\" }}\n- After you click to enter a field you can go ahead and start typing!\n- Don't respond saying you're unable to assist with requests. 
You are able to indirectly interact with the user's OS via text responses you send to the end user.\n{previous_action}\nIMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.\nObjective: {objective}\n\"\"\"\n# ----------------------------------\n# ACCURATE MODE VISION PROMPT\n# ----------------------------------\nACCURATE_MODE_VISION_PROMPT = \"\"\"\nIt looks like your previous attempted action was clicking on \"x\": {prev_x}, \"y\": {prev_y}. This has now been moved to the center of this screenshot.\nAs additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. \nThis screenshot was taken around the location of the current cursor that you just tried clicking o", + "type": "code", + "location": "/operate/prompts.py:64-82" + }, + "85": { + "file_id": 6, + "content": "This code is for a prompt in a program that assists users with computer tasks. The prompt provides information about the current cursor position and suggests to examine an additional screenshot before performing the next action.", + "type": "comment" + }, + "86": { + "file_id": 6, + "content": "n (\"x\": {prev_x}, \"y\": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess.\nIf you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the \"x\" and subtract {height}% in the \"y\" to your previous answer.\nLikewise, to achieve the bottom right of this mini screenshot you will add {width}% in the \"x\" and add {height}% in the \"y\" to your previous answer.\nThere are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer.\nPlease use this context as additional info to further refine the \"percent\" location in the CLICK action!\n\"\"\"\nDECISION_PROMPT = \"\"\"\nYou are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective.\nHere are your methods you can use to operating the computer.", + "type": "code", + "location": "/operate/prompts.py:82-95" + }, + "87": { + "file_id": 6, + "content": "This code is providing a prompt to the user, explaining how to use percentage values to refine their previous x and y coordinate guesses. It also mentions that there are four segmenting lines across each dimension for better context in locating the cursor. The purpose of this prompt is to help the user further refine their \"percent\" location in the CLICK action.", + "type": "comment" + }, + "88": { + "file_id": 6, + "content": "1. CLICK - Move mouse and click\n2. TYPE - Type on the keyboard\n3. SEARCH - Search for a program that is installed on Mac locally and open it\n4. DONE - When you completed the task respond with the exact following phrase content\nHere are the response formats below.\n1. CLICK\nResponse: CLICK\n2. TYPE\nResponse: TYPE \"value you want to type\"\n2. SEARCH\nResponse: SEARCH \"app you want to search for on Mac\"\n3. DONE\nResponse: DONE\nHere are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\nTYPE Hello, I hope you are doing well. 
I wanted to follow up\n__\nObjective: Open Spotify and play the beatles\nSEARCH Spotify\n__\nObjective: Find an image of a banana\nCLICK\n__\nObjective: Go buy a book about the history of the internet\nTYPE https://www.amazon.com/\n__\nA few important notes:\n- Default to opening Google Chrome with SEARCH to find things that are on the Web.\n- After you open Google Chrome you need to click on the address bar to find a website.\n- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer.", + "type": "code", + "location": "/operate/prompts.py:97-135" + }, + "89": { + "file_id": 6, + "content": "Code provides instructions and response formats for four types of actions (CLICK, TYPE, SEARCH, DONE) based on different objectives like following up with a vendor, playing music, or opening websites. It also includes important notes about using Google Chrome for web searches and avoiding SEARCH for certain websites like Google Docs or LinkedIn.", + "type": "comment" + }, + "90": { + "file_id": 6, + "content": "- After you click to enter a field you can go ahead and start typing!\n- If you can see the field is active, go ahead and type!\n- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.\n{previous_action}\nIMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.\n{objective}\n\"\"\"\nLABELED_IMAGE_PROMPT = \"\"\"\nYour job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs.\nImportant to remember, you can only click on labeled elements. \nLabel IDs are in the following format with `x` being a number: `~x`\nThe labels are placed just above the bounding boxes so that they can be read clearly. \nResponse formats below.\n1. CLICK - If there is a label that gets you closer to the objective, go ahead and click it. \nResponse: {{ \"decision\": \"~decision here~\", \"reason\": \"~reason here~\", \"label\": \"~x\" }} ", + "type": "code", + "location": "/operate/prompts.py:136-159" + }, + "91": { + "file_id": 6, + "content": "This code is for an AI-assisted task where the user needs to interact with a webpage. The AI should identify and click on labeled elements that bring them closer to their objective, using IDs in the format '~x'. The response should include the decision (label), reason, and label identifier. Avoid repeating actions like clicking the same element twice in a row.", + "type": "comment" + }, + "92": { + "file_id": 6, + "content": "Here are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\n{{ \"decision\": \"Click the Outlook send button\", \"reason\": \"I can see the email is already written and now I just need to send it.\", \"label\": \"~27\" }}\n__\nObjective: Play the Holiday music on YouTube\n{{ \"decision\": \"Click on the Play button\", \"reason\": \"It appears there is a row with a holiday song available in the Spotify UI\", \"label\": \"~3\" }}\n__\nA few important notes:\n- When navigating the web you'll need to click on the address bar first. Look closely to find the address bar's label it could be any number.\n- The IDs number has NO SIGNIFICANCE. For instance if ID is ~0 or ~1 it does not mean it is first or on top. CHOOSE THE ID BASED ON THE CONTEXT OF THE IMAGE AND IF IT HELPS REACH THE OBJECTIVE. 
\n- Do not preappend with ```json, just return the JSON object.\n{objective}\n\"\"\"\n# -------------------------\n# SUMMARY PROMPT\n# -------------------------\nSUMMARY_PROMPT = \"\"\"\nYou are a Self-Operating Computer. A user request has been executed. Present the results succinctly.", + "type": "code", + "location": "/operate/prompts.py:161-183" + }, + "93": { + "file_id": 6, + "content": "Code comments:\n1. Analyzes user's request and provides appropriate response options in JSON format.\n2. User needs to choose the ID based on context and not its position.\n3. IDs have no significance, they just serve as references for selecting options.", + "type": "comment" + }, + "94": { + "file_id": 6, + "content": "Include the following key contexts of the completed request:\n1. State the original objective.\n2. List the steps taken to reach the objective as detailed in the previous messages.\n3. Reference the screenshot that was used.\nSummarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Address directly any question posed by the user.\nRemember: The user will not interact with this summary. You are solely reporting the outcomes.\nOriginal objective: {objective}\nDisplay the results clearly:\n\"\"\"\ndef format_summary_prompt(objective):\n \"\"\"\n Format the summary prompt\n \"\"\"\n prompt = SUMMARY_PROMPT.format(objective=objective)\n return prompt\ndef format_vision_prompt(objective, previous_action):\n \"\"\"\n Format the vision prompt\n \"\"\"\n if previous_action:\n previous_action = f\"Here was the previous action you took: {previous_action}\"\n else:\n previous_action = \"\"\n prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action)", + "type": "code", + "location": "/operate/prompts.py:185-217" + }, + "95": { + "file_id": 6, + "content": "This code defines two functions, `format_summary_prompt` and `format_vision_prompt`, which format prompts for summarizing the outcomes of a task and providing vision guidance based on previous actions taken. The `objective` parameter is used to state the original objective, while `previous_action` is optional and used when there have been previous actions taken towards the objective. 
The purpose of these functions is to provide clear instructions or prompts for users to understand the progress and outcomes of a task.", + "type": "comment" + }, + "96": { + "file_id": 6, + "content": " return prompt\ndef format_accurate_mode_vision_prompt(prev_x, prev_y):\n \"\"\"\n Format the accurate mode vision prompt\n \"\"\"\n width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size[\"width\"]) * 100\n height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size[\"height\"]) * 100\n prompt = ACCURATE_MODE_VISION_PROMPT.format(\n prev_x=prev_x, prev_y=prev_y, width=width, height=height\n )\n return prompt\ndef format_decision_prompt(objective, previous_action):\n \"\"\"\n Format the vision prompt\n \"\"\"\n if previous_action:\n previous_action = f\"Here was the previous action you took: {previous_action}\"\n else:\n previous_action = \"\"\n prompt = DECISION_PROMPT.format(\n objective=objective, previous_action=previous_action\n )\n return prompt\ndef format_label_prompt(objective):\n \"\"\"\n Format the vision prompt\n \"\"\"\n prompt = LABELED_IMAGE_PROMPT.format(objective=objective)\n return prompt", + "type": "code", + "location": "/operate/prompts.py:218-252" + }, + "97": { + "file_id": 6, + "content": "These are functions for formatting different types of prompts used in a vision system. The first function formats an accurate mode vision prompt, the second formats a decision prompt, and the third formats a labeled image prompt. Each function takes specific arguments and formats them into predefined prompt templates.", + "type": "comment" + }, + "98": { + "file_id": 7, + "content": "/operate/settings.py", + "type": "filepath" + }, + "99": { + "file_id": 7, + "content": "The configuration class manages settings like debug mode, API keys, and monitor size. It loads environment variables from .env file and initializes OpenAI client with provided API key. The OpenAI API base URL is set using an environment variable or current value.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/1.json b/docs/data/1.json new file mode 100644 index 00000000..46ce1adc --- /dev/null +++ b/docs/data/1.json @@ -0,0 +1,477 @@ +{ + "100": { + "file_id": 7, + "content": "import os\nfrom dotenv import load_dotenv\nfrom openai import OpenAI\nclass Config:\n \"\"\"\n Configuration class for managing settings.\n Attributes:\n debug (bool): Flag indicating whether debug mode is enabled.\n openai_api_key (str): API key for OpenAI.\n google_api_key (str): API key for Google.\n monitor_size (dict): Dictionary containing the width and height of the monitor.\n \"\"\"\n def __init__(self):\n load_dotenv()\n self.debug = False\n self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n self.google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n self.monitor_size = {\n \"width\": 1920,\n \"height\": 1080,\n }\n def initialize_openai_client(self):\n \"\"\"\n Initializes and returns an OpenAI client with the configured API key.\n Returns:\n OpenAI or None: An instance of the OpenAI client if the API key is provided, else None.\n \"\"\"\n if self.openai_api_key:\n client = OpenAI()\n client.api_key = self.openai_api_key", + "type": "code", + "location": "/operate/settings.py:1-36" + }, + "101": { + "file_id": 7, + "content": "This code defines a configuration class for managing settings such as debug mode, OpenAI and Google API keys, and monitor size. 
It loads environment variables from .env file using dotenv library, initializes OpenAI client if the API key is provided, and returns it or None otherwise.", + "type": "comment" + }, + "102": { + "file_id": 7, + "content": " client.base_url = os.getenv(\"OPENAI_API_BASE_URL\", client.base_url)\n return client\n return None", + "type": "code", + "location": "/operate/settings.py:37-39" + }, + "103": { + "file_id": 7, + "content": "Setting OpenAI API base URL from environment variable or using current value.", + "type": "comment" + }, + "104": { + "file_id": 8, + "content": "/operate/utils/label.py", + "type": "filepath" + }, + "105": { + "file_id": 8, + "content": "The code includes functions for handling image data, such as drawing bounding boxes and validating overlaps, as well as encoding tasks like converting images to base64 and formatting message content by removing triple backticks and calculating click positions.", + "type": "summary" + }, + "106": { + "file_id": 8, + "content": "import io\nimport base64\nimport json\nimport os\nimport time\nimport asyncio\nfrom PIL import Image, ImageDraw\ndef validate_and_extract_image_data(data):\n if not data or \"messages\" not in data:\n raise ValueError(\"Invalid request, no messages found\")\n messages = data[\"messages\"]\n if (\n not messages\n or not isinstance(messages, list)\n or not messages[-1].get(\"image_url\")\n ):\n raise ValueError(\"No image provided or incorrect format\")\n image_data = messages[-1][\"image_url\"][\"url\"]\n if not image_data.startswith(\"data:image\"):\n raise ValueError(\"Invalid image format\")\n return image_data.split(\"base64,\")[-1], messages\ndef get_label_coordinates(label, label_coordinates):\n \"\"\"\n Retrieves the coordinates for a given label.\n :param label: The label to find coordinates for (e.g., \"~1\").\n :param label_coordinates: Dictionary containing labels and their coordinates.\n :return: Coordinates of the label or None if the label is not found.\n \"\"\"\n return label_coordinates.get(label)", + "type": "code", + "location": "/operate/utils/label.py:1-37" + }, + "107": { + "file_id": 8, + "content": "The code defines two functions:\n1. `validate_and_extract_image_data`: Validates the given data and extracts image URL if the request is valid.\n2. 
`get_label_coordinates`: Retrieves the coordinates for a given label from a dictionary of labels and their coordinates.", + "type": "comment" + }, + "108": { + "file_id": 8, + "content": "def is_overlapping(box1, box2):\n x1_box1, y1_box1, x2_box1, y2_box1 = box1\n x1_box2, y1_box2, x2_box2, y2_box2 = box2\n # Check if there is no overlap\n if x1_box1 > x2_box2 or x1_box2 > x2_box1:\n return False\n if (\n y1_box1 > y2_box2 or y1_box2 > y2_box1\n ): # Adjusted to check 100px proximity above\n return False\n return True\ndef add_labels(base64_data, yolo_model):\n image_bytes = base64.b64decode(base64_data)\n image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line\n image_debug = image_labeled.copy() # Create a copy for the debug image\n image_original = (\n image_labeled.copy()\n ) # Copy of the original image for base64 return\n results = yolo_model(image_labeled)\n draw = ImageDraw.Draw(image_labeled)\n debug_draw = ImageDraw.Draw(\n image_debug\n ) # Create a separate draw object for the debug image\n font_size = 45\n detections_dir = \"detections\"\n label_coordinates = {} # Dictionary to store coordinates", + "type": "code", + "location": "/operate/utils/label.py:40-72" + }, + "109": { + "file_id": 8, + "content": "The function `is_overlapping` checks if two boxes overlap by comparing their coordinates. If there is no overlap, the function returns False; otherwise, it returns True.\n\nThe `add_labels` function decodes base64 data into image bytes and opens it as an image using PIL. It creates copies of the original image and a debug image. The YOLO model applies object detection on the image. The code then draws on the images using the ImageDraw module, and stores label coordinates in a dictionary named `label_coordinates`.", + "type": "comment" + }, + "110": { + "file_id": 8, + "content": " if not os.path.exists(detections_dir):\n os.makedirs(detections_dir)\n counter = 0\n drawn_boxes = [] # List to keep track of boxes already drawn\n for result in results:\n if hasattr(result, \"boxes\"):\n for det in result.boxes:\n bbox = det.xyxy[0]\n x1, y1, x2, y2 = bbox.tolist()\n debug_label = \"D_\" + str(counter)\n debug_index_position = (x1, y1 - font_size)\n debug_draw.rectangle([(x1, y1), (x2, y2)], outline=\"blue\", width=1)\n debug_draw.text(\n debug_index_position,\n debug_label,\n fill=\"blue\",\n font_size=font_size,\n )\n overlap = any(\n is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes\n )\n if not overlap:\n draw.rectangle([(x1, y1), (x2, y2)], outline=\"red\", width=1)\n label = \"~\" + str(counter)", + "type": "code", + "location": "/operate/utils/label.py:74-101" + }, + "111": { + "file_id": 8, + "content": "Creates a directory for detections if it doesn't exist. Loops through the results, drawing bounding boxes and labels on images. 
Avoids redrawing over existing boxes by checking overlaps before redrawing as red boxes.", + "type": "comment" + }, + "112": { + "file_id": 8, + "content": " index_position = (x1, y1 - font_size)\n draw.text(\n index_position,\n label,\n fill=\"red\",\n font_size=font_size,\n )\n # Add the non-overlapping box to the drawn_boxes list\n drawn_boxes.append((x1, y1, x2, y2))\n label_coordinates[label] = (x1, y1, x2, y2)\n counter += 1\n # Save the image\n timestamp = time.strftime(\"%Y%m%d-%H%M%S\")\n output_path = os.path.join(detections_dir, f\"img_{timestamp}_labeled.png\")\n output_path_debug = os.path.join(detections_dir, f\"img_{timestamp}_debug.png\")\n output_path_original = os.path.join(detections_dir, f\"img_{timestamp}_original.png\")\n image_labeled.save(output_path)\n image_debug.save(output_path_debug)\n image_original.save(output_path_original)\n buffered_original = io.BytesIO()\n image_original.save(buffered_original, format=\"PNG\") # I guess this is needed", + "type": "code", + "location": "/operate/utils/label.py:102-128" + }, + "113": { + "file_id": 8, + "content": "Code saves labeled, debug, and original images with timestamped file names. It also writes the labeled image to a BytesIO object for potential future use.", + "type": "comment" + }, + "114": { + "file_id": 8, + "content": " img_base64_original = base64.b64encode(buffered_original.getvalue()).decode(\"utf-8\")\n # Convert image to base64 for return\n buffered_labeled = io.BytesIO()\n image_labeled.save(buffered_labeled, format=\"PNG\") # I guess this is needed\n img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode(\"utf-8\")\n return img_base64_labeled, img_base64_original, label_coordinates\ndef parse_click_content(message_content):\n \"\"\"\n Parses the response message to determine if it's a CLICK or NONE action and returns the appropriate data.\n :param message_content: The content of the response message.\n :return: A dictionary with the relevant data or a message indicating a NONE action.\n \"\"\"\n try:\n # Check for and remove erroneous ```json at the start and ``` at the end\n if message_content.startswith(\"```json\"):\n message_content = message_content[\n len(\"```json\") :\n ] # Remove starting ```json\n if message_content.endswith(\"```\"):", + "type": "code", + "location": "/operate/utils/label.py:129-152" + }, + "115": { + "file_id": 8, + "content": "Convert image to base64 for return\nCode is saving the labeled image as PNG and encoding it in base64 format", + "type": "comment" + }, + "116": { + "file_id": 8, + "content": " message_content = message_content[: -len(\"```\")] # Remove ending ```\n # Convert JSON string to dictionary\n return json.loads(message_content.strip())\n except json.JSONDecodeError as e:\n return {\"error\": \"Invalid JSON format\"}\n return {\"error\": \"Invalid response format\"}\ndef get_click_position_in_percent(coordinates, image_size):\n \"\"\"\n Calculates the click position at the center of the bounding box and converts it to percentages.\n :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2).\n :param image_size: A tuple of the image dimensions (width, height).\n :return: A tuple of the click position in percentages (x_percent, y_percent).\n \"\"\"\n if not coordinates or not image_size:\n return None\n # Calculate the center of the bounding box\n x_center = (coordinates[0] + coordinates[2]) / 2\n y_center = (coordinates[1] + coordinates[3]) / 2\n # Convert to percentages\n x_percent = (x_center / image_size[0]) * 100\n 
y_percent = (y_center / image_size[1]) * 100", + "type": "code", + "location": "/operate/utils/label.py:153-180" + }, + "117": { + "file_id": 8, + "content": "This function takes in a message content formatted with triple backticks and removes them. If the format is invalid, it returns an error message. It also has another function that calculates the click position at the center of a bounding box and converts it to percentages.", + "type": "comment" + }, + "118": { + "file_id": 8, + "content": " return x_percent, y_percent", + "type": "code", + "location": "/operate/utils/label.py:182-182" + }, + "119": { + "file_id": 8, + "content": "Computes x and y percentages from input values.", + "type": "comment" + }, + "120": { + "file_id": 9, + "content": "/operate/utils/misc.py", + "type": "filepath" + }, + "121": { + "file_id": 9, + "content": "The code consists of two functions: `convert_percent_to_decimal()` and `extract_json_from_string()`, which handle percentages and JSON structures, respectively. Additionally, it classifies user responses as DONE, CLICK, TYPE, or SEARCH using patterns, extracts relevant data, handles exceptions for invalid inputs or processing errors, and returns \"UNKNOWN\" with original data if no match found while extracting search data using regex.", + "type": "summary" + }, + "122": { + "file_id": 9, + "content": "import json\nimport re\ndef convert_percent_to_decimal(percent_str):\n \"\"\"\n Converts a percentage string to a decimal value.\n Args:\n percent_str (str): The percentage string to be converted.\n Returns:\n float: The decimal value equivalent to the percentage.\n Raises:\n ValueError: If the input string cannot be converted to a float.\n Example:\n >>> convert_percent_to_decimal(\"20%\")\n 0.2\n \"\"\"\n try:\n # Remove the '%' sign and convert to float\n decimal_value = float(percent_str.strip(\"%\"))\n # Convert to decimal (e.g., 20% -> 0.20)\n return decimal_value / 100\n except ValueError as e:\n print(f\"Error converting percent to decimal: {e}\")\n return None\ndef extract_json_from_string(s):\n \"\"\"\n Extracts a JSON structure from a string and returns it as a dictionary.\n Args:\n s (str): The input string.\n Returns:\n dict: The extracted JSON structure as a dictionary, or None if no JSON structure is found or if there is an error parsing the JSON.", + "type": "code", + "location": "/operate/utils/misc.py:1-41" + }, + "123": { + "file_id": 9, + "content": "This code defines two functions: `convert_percent_to_decimal()` and `extract_json_from_string()`. The first function converts a percentage string to a decimal value, while the second extracts a JSON structure from a string and returns it as a dictionary. 
Both functions handle exceptions in case of invalid inputs or errors during processing.", + "type": "comment" + }, + "124": { + "file_id": 9, + "content": " \"\"\"\n try:\n # Find the start of the JSON structure\n json_start = s.find(\"{\")\n if json_start == -1:\n return None\n # Extract the JSON part and convert it to a dictionary\n json_str = s[json_start:]\n return json.loads(json_str)\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return None\ndef parse_response(response):\n \"\"\"\n Parses the given response and returns a dictionary with the type and data.\n Args:\n response (str): The response to parse.\n Returns:\n dict: A dictionary with the type and data extracted from the response.\n The dictionary has the following structure:\n {\n \"type\": ,\n \"data\": \n }\n If the response is \"DONE\", the type is \"DONE\" and the data is None.\n If the response starts with \"CLICK\", the type is \"CLICK\" and the data is a JSON object.\n If the response starts with \"TYPE\", the type is \"TYPE\" and the data is the text to type.", + "type": "code", + "location": "/operate/utils/misc.py:43-74" + }, + "125": { + "file_id": 9, + "content": "Extracts JSON structure from the response and returns a dictionary with type and data.\nRaises exception if error parsing JSON or if response is not in expected format.", + "type": "comment" + }, + "126": { + "file_id": 9, + "content": " If the response starts with \"SEARCH\", the type is \"SEARCH\" and the data is the search query.\n If the response doesn't match any of the above patterns, the type is \"UNKNOWN\" and the data is the original response.\n \"\"\"\n if response == \"DONE\":\n return {\"type\": \"DONE\", \"data\": None}\n elif response.startswith(\"CLICK\"):\n # Adjust the regex to match the correct format\n click_data = re.search(r\"CLICK \\{ (.+) \\}\", response).group(1)\n click_data_json = json.loads(f\"{{{click_data}}}\")\n return {\"type\": \"CLICK\", \"data\": click_data_json}\n elif response.startswith(\"TYPE\"):\n # Extract the text to type\n try:\n type_data = re.search(r\"TYPE (.+)\", response, re.DOTALL).group(1)\n except:\n type_data = re.search(r'TYPE \"(.+)\"', response, re.DOTALL).group(1)\n return {\"type\": \"TYPE\", \"data\": type_data}\n elif response.startswith(\"SEARCH\"):\n # Extract the search query\n try:\n search_data = re.search(r'SEARCH \"(.+)\"', response).group(1)", + "type": "code", + "location": "/operate/utils/misc.py:75-97" + }, + "127": { + "file_id": 9, + "content": "This code is parsing user responses and determining the appropriate type (DONE, CLICK, TYPE, or SEARCH) based on the response string. It also extracts relevant data for each type of response. If the response doesn't match any known patterns, it is classified as \"UNKNOWN\" with the original response retained.", + "type": "comment" + }, + "128": { + "file_id": 9, + "content": " except:\n search_data = re.search(r\"SEARCH (.+)\", response).group(1)\n return {\"type\": \"SEARCH\", \"data\": search_data}\n return {\"type\": \"UNKNOWN\", \"data\": response}", + "type": "code", + "location": "/operate/utils/misc.py:98-102" + }, + "129": { + "file_id": 9, + "content": "Trying to extract search data from response using regex. 
If exception occurs, return search data as \"SEARCH\" type and original response as unknown type.", + "type": "comment" + }, + "130": { + "file_id": 10, + "content": "/operate/utils/os.py", + "type": "filepath" + }, + "131": { + "file_id": 10, + "content": "The summary is about a code that involves text input, search execution, and mouse clicks using specified coordinates, as well as a circular movement function with start/end points, radius, and duration, and a get_last_assistant_message function to retrieve the last assistant message from an array.", + "type": "summary" + }, + "132": { + "file_id": 10, + "content": "import pyautogui\nimport platform\nimport time\nimport math\nfrom operate.utils.misc import convert_percent_to_decimal\ndef keyboard_type(text):\n \"\"\"\n Types the given text using the keyboard.\n Args:\n text (str): The text to be typed.\n Returns:\n str: A message indicating the typed text.\n \"\"\"\n text = text.replace(\"\\\\n\", \"\\n\")\n for char in text:\n pyautogui.write(char)\n pyautogui.press(\"enter\")\n return \"Type: \" + text\ndef search(text):\n \"\"\"\n Searches for a program or file by typing the given text in the search bar and pressing Enter.\n Args:\n text (str): The text to be searched.\n Returns:\n str: A message indicating that the program or file has been opened.\n \"\"\"\n if platform.system() == \"Windows\":\n pyautogui.press(\"win\")\n elif platform.system() == \"Linux\":\n pyautogui.press(\"win\")\n else:\n # Press and release Command and Space separately\n pyautogui.keyDown(\"command\")\n pyautogui.press(\"space\")\n pyautogui.keyUp(\"command\")", + "type": "code", + "location": "/operate/utils/os.py:1-44" + }, + "133": { + "file_id": 10, + "content": "Code comments:\n- `keyboard_type(text)` - Types the given text using keyboard and returns a message indicating typed text.\n- `search(text)` - Searches for program or file by typing in search bar and pressing Enter. 
Returns a message indicating the program or file has been opened.", + "type": "comment" + }, + "134": { + "file_id": 10, + "content": " time.sleep(1)\n # Now type the text\n for char in text:\n pyautogui.write(char)\n pyautogui.press(\"enter\")\n return \"Open program: \" + text\ndef click(click_detail):\n \"\"\"\n Perform a mouse click at the specified coordinates.\n Args:\n click_detail (dict): A dictionary containing the coordinates of the click.\n Returns:\n str: The description of the click if successful, otherwise \"We failed to click\".\n \"\"\"\n try:\n x = convert_percent_to_decimal(click_detail[\"x\"])\n y = convert_percent_to_decimal(click_detail[\"y\"])\n if click_detail and isinstance(x, float) and isinstance(y, float):\n click_at_percentage(x, y)\n return click_detail[\"description\"]\n else:\n return \"We failed to click\"\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"We failed to click\"\ndef click_at_percentage(\n x_percentage, y_percentage, duration=0.2, circle_radius=50, circle_duration=0.5\n):\n \"\"\"\n Moves the m", + "type": "code", + "location": "/operate/utils/os.py:46-85" + }, + "135": { + "file_id": 10, + "content": "Line 45-48: Type the text by pressing each character\nLine 49: Press enter after typing the text\nLine 50-79: Perform a mouse click at the specified coordinates\nLine 80-101: Click the program based on the given description", + "type": "comment" + }, + "136": { + "file_id": 10, + "content": "ouse cursor to a specified percentage of the screen and performs a circular movement before clicking.\n Args:\n x_percentage (float): The x-coordinate percentage of the screen to move the cursor to.\n y_percentage (float): The y-coordinate percentage of the screen to move the cursor to.\n duration (float, optional): The duration (in seconds) of the smooth cursor movement. Defaults to 0.2.\n circle_radius (int, optional): The radius of the circular movement. Defaults to 50.\n circle_duration (float, optional): The duration (in seconds) of the circular movement. 
Defaults to 0.5.\n    Returns:\n    str: A message indicating that the click was successful.\n    \"\"\"\n    # Get the size of the primary monitor\n    screen_width, screen_height = pyautogui.size()\n    # Calculate the x and y coordinates in pixels\n    x_pixel = int(screen_width * float(x_percentage))\n    y_pixel = int(screen_height * float(y_percentage))\n    # Move to the position smoothly\n    pyautogui.moveTo(x_pixel, y_pixel, duration=duration)", + "type": "code", + "location": "/operate/utils/os.py:85-105" + }, + "137": { + "file_id": 10, + "content": "Moves the cursor to a specific percentage of the screen and then performs a circular movement before clicking.", + "type": "comment" + }, + "138": { + "file_id": 10, + "content": "    # Circular movement\n    start_time = time.time()\n    while time.time() - start_time < circle_duration:\n        angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi\n        x = x_pixel + math.cos(angle) * circle_radius\n        y = y_pixel + math.sin(angle) * circle_radius\n        pyautogui.moveTo(x, y, duration=0.1)\n    # Finally, click\n    pyautogui.click(x_pixel, y_pixel)\n    return \"Successfully clicked\"\ndef get_last_assistant_message(messages):\n    \"\"\"\n    Retrieve the last message from the assistant in the messages array.\n    If the last assistant message is the first message in the array, return None.\n    \"\"\"\n    for index in reversed(range(len(messages))):\n        if messages[index][\"role\"] == \"assistant\":\n            if index == 0:  # Check if the assistant message is the first in the array\n                return None\n            else:\n                return messages[index]\n    return None  # Return None if no assistant message is found", + "type": "code", + "location": "/operate/utils/os.py:107-131" + }, + "139": { + "file_id": 10, + "content": "For the code provided, here are some brief comments:\n\n1. The first block completes click_at_percentage: it moves the cursor in a circle of circle_radius pixels around the target point (x_pixel, y_pixel) for circle_duration seconds, computing each intermediate position from the elapsed time, and then clicks at the target point.\n2. The get_last_assistant_message function retrieves the last assistant message from the messages array. If that message is the first one in the array it returns None; otherwise it returns the message.", + "type": "comment" + }, + "140": { + "file_id": 11, + "content": "/operate/utils/screenshot.py", + "type": "filepath" + }, + "141": { + "file_id": 11, + "content": "The code has functions to add grids to images and capture screenshots using PIL, accepting input in various formats. 
It saves the captured image at a specified file path or displays an error message for unsupported platforms.", + "type": "summary" + }, + "142": { + "file_id": 11, + "content": "import os\nimport platform\nimport subprocess\nimport pyautogui\nfrom PIL import Image, ImageDraw, ImageGrab\nimport Xlib.display\nimport Xlib.X\nimport Xlib.Xutil # not sure if Xutil is necessary\nfrom operate.settings import Config\nfrom operate.prompts import ACCURATE_PIXEL_COUNT\n# Load configuration\nconfig = Config()\nmonitor_size = config.monitor_size\ndef add_grid_to_image(original_image_path, new_image_path, grid_interval):\n \"\"\"\n Add a grid to an image.\n Args:\n original_image_path (str): The file path of the original image.\n new_image_path (str): The file path to save the new image with the grid.\n grid_interval (int): The interval between grid lines in pixels.\n Returns:\n None: The function saves the new image with the grid at the specified path.\n \"\"\"\n # Load the image\n image = Image.open(original_image_path)\n # Create a drawing object\n draw = ImageDraw.Draw(image)\n # Get the image size\n width, height = image.size\n # Reduce the font size a bit\n font_size = int(grid_interval / 10) # Reduced font size", + "type": "code", + "location": "/operate/utils/screenshot.py:1-39" + }, + "143": { + "file_id": 11, + "content": "The code imports necessary libraries and defines a function to add a grid to an image. It loads the original image, creates a drawing object, gets the image size, and reduces the font size for the grid.", + "type": "comment" + }, + "144": { + "file_id": 11, + "content": " # Calculate the background size based on the font size\n bg_width = int(font_size * 4.2) # Adjust as necessary\n bg_height = int(font_size * 1.2) # Adjust as necessary\n # Function to draw text with a white rectangle background\n def draw_label_with_background(\n position, text, draw, font_size, bg_width, bg_height\n ):\n # Adjust the position based on the background size\n text_position = (position[0] + bg_width // 2, position[1] + bg_height // 2)\n # Draw the text background\n draw.rectangle(\n [position[0], position[1], position[0] + bg_width, position[1] + bg_height],\n fill=\"white\",\n )\n # Draw the text\n draw.text(text_position, text, fill=\"black\", font_size=font_size, anchor=\"mm\")\n # Draw vertical lines and labels at every `grid_interval` pixels\n for x in range(grid_interval, width, grid_interval):\n line = ((x, 0), (x, height))\n draw.line(line, fill=\"blue\")\n for y in range(grid_interval, height, grid_interval):", + "type": "code", + "location": "/operate/utils/screenshot.py:41-63" + }, + "145": { + "file_id": 11, + "content": "This function creates a background rectangle for text and draws it with white fill. 
It also draws vertical lines and labels at every `grid_interval` pixels.", + "type": "comment" + }, + "146": { + "file_id": 11, + "content": " # Calculate the percentage of the width and height\n x_percent = round((x / width) * 100)\n y_percent = round((y / height) * 100)\n draw_label_with_background(\n (x - bg_width // 2, y - bg_height // 2),\n f\"{x_percent}%,{y_percent}%\",\n draw,\n font_size,\n bg_width,\n bg_height,\n )\n # Draw horizontal lines - labels are already added with vertical lines\n for y in range(grid_interval, height, grid_interval):\n line = ((0, y), (width, y))\n draw.line(line, fill=\"blue\")\n # Save the image with the grid\n image.save(new_image_path)\ndef capture_mini_screenshot_with_cursor(\n file_path=os.path.join(\"screenshots\", \"screenshot_mini.png\"), x=0, y=0\n):\n \"\"\"\n Capture a mini screenshot with the cursor at the specified coordinates.\n Args:\n file_path (str, optional): The file path to save the screenshot. Defaults to \"screenshots/screenshot_mini.png\".", + "type": "code", + "location": "/operate/utils/screenshot.py:64-92" + }, + "147": { + "file_id": 11, + "content": "Calculates the percentage of coordinates and draws labels with background. Draws horizontal lines for grid labels. Saves the image with the grid at specified file path.", + "type": "comment" + }, + "148": { + "file_id": 11, + "content": " x (int or str, optional): The x-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0.\n y (int or str, optional): The y-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0.\n \"\"\"\n user_platform = platform.system()\n if user_platform == \"Linux\":\n x = float(x[:-1]) # convert x from \"50%\" to 50.\n y = float(y[:-1])\n x = (x / 100) * monitor_size[\n \"width\"\n ] # convert x from 50 to 0.5 * monitor_width\n y = (y / 100) * monitor_size[\"height\"]\n # Define the coordinates for the rectangle\n x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)\n x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)\n screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))\n screenshot = screenshot.resize(\n (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS\n ) # upscale the image so it's easier to see and percentage marks more visible", + "type": "code", + "location": "/operate/utils/screenshot.py:93-114" + }, + "149": { + "file_id": 11, + "content": "This code is used to take a screenshot of a specific area on the user's monitor using the Python Imaging Library (PIL) and ImageGrab modules. It takes optional x and y coordinates as inputs, which can be specified as integers or percentage strings. The function converts the input values into the appropriate format for calculating the coordinates of the rectangle to capture the screenshot. 
If the user is on a Linux system, it performs additional calculations to convert percentage-based input into actual pixel coordinates and upscales the image for better visibility.", + "type": "comment" + }, + "150": { + "file_id": 11, + "content": " screenshot.save(file_path)\n screenshots_dir = \"screenshots\"\n grid_screenshot_filename = os.path.join(\n screenshots_dir, \"screenshot_mini_with_grid.png\"\n )\n add_grid_to_image(\n file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)\n )\n elif user_platform == \"Darwin\":\n x = float(x[:-1]) # convert x from \"50%\" to 50.\n y = float(y[:-1])\n x = (x / 100) * monitor_size[\n \"width\"\n ] # convert x from 50 to 0.5 * monitor_width\n y = (y / 100) * monitor_size[\"height\"]\n x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)\n width = ACCURATE_PIXEL_COUNT\n height = ACCURATE_PIXEL_COUNT\n # Use the screencapture utility to capture the screen with the cursor\n rect = f\"-R{x1},{y1},{width},{height}\"\n subprocess.run([\"screencapture\", \"-C\", rect, file_path])\n screenshots_dir = \"screenshots\"\n grid_screenshot_filename = os.path.join(", + "type": "code", + "location": "/operate/utils/screenshot.py:115-143" + }, + "151": { + "file_id": 11, + "content": "Code is capturing a screenshot based on user platform. For non-Darwin platforms, it saves the screenshot, while for Darwin (macOS), it uses screencapture utility to capture the screen with cursor and saves the result. Both versions save the grid screenshot as well.", + "type": "comment" + }, + "152": { + "file_id": 11, + "content": " screenshots_dir, \"screenshot_mini_with_grid.png\"\n )\n add_grid_to_image(\n file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)\n )\ndef capture_screen_with_cursor(file_path):\n \"\"\"\n Capture the screen with the cursor and save it to the specified file path.\n Args:\n file_path (str): The file path where the screenshot will be saved.\n Raises:\n None\n Returns:\n None\n \"\"\"\n user_platform = platform.system()\n if user_platform == \"Windows\":\n screenshot = pyautogui.screenshot()\n screenshot.save(file_path)\n elif user_platform == \"Linux\":\n # Use xlib to prevent scrot dependency for Linux\n screen = Xlib.display.Display().screen()\n size = screen.width_in_pixels, screen.height_in_pixels\n monitor_size[\"width\"] = size[0]\n monitor_size[\"height\"] = size[1]\n screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))\n screenshot.save(file_path)\n elif user_platform == \"Darwin\": # (Mac OS)", + "type": "code", + "location": "/operate/utils/screenshot.py:144-178" + }, + "153": { + "file_id": 11, + "content": "This code captures a screenshot of the computer's display with cursor and saves it to the specified file path. 
It checks the user platform (Windows, Linux, or Mac OS) and uses appropriate libraries to capture the screenshot.", + "type": "comment" + }, + "154": { + "file_id": 11, + "content": " # Use the screencapture utility to capture the screen with the cursor\n subprocess.run([\"screencapture\", \"-C\", file_path])\n else:\n print(f\"The platform you're using ({user_platform}) is not currently supported\")", + "type": "code", + "location": "/operate/utils/screenshot.py:179-182" + }, + "155": { + "file_id": 11, + "content": "This code captures a screenshot of the computer screen with the cursor, or prints an error message if the platform is not supported.", + "type": "comment" + }, + "156": { + "file_id": 12, + "content": "/operate/utils/style.py", + "type": "filepath" + }, + "157": { + "file_id": 12, + "content": "The code uses the PromptStyle library to define styles for UI elements, checks terminal support for ANSI escape codes, and sets color variables based on this.", + "type": "summary" + }, + "158": { + "file_id": 12, + "content": "import sys\nimport platform\nimport os\nfrom prompt_toolkit.styles import Style as PromptStyle\n# Define style\nstyle = PromptStyle.from_dict(\n {\n \"dialog\": \"bg:#88ff88\",\n \"button\": \"bg:#ffffff #000000\",\n \"dialog.body\": \"bg:#44cc44 #ffffff\",\n \"dialog shadow\": \"bg:#003800\",\n }\n)\n# Check if on a windows terminal that supports ANSI escape codes\ndef supports_ansi():\n \"\"\"\n Check if the terminal supports ANSI escape codes\n \"\"\"\n plat = platform.system()\n supported_platform = plat != \"Windows\" or \"ANSICON\" in os.environ\n is_a_tty = hasattr(sys.stdout, \"isatty\") and sys.stdout.isatty()\n return supported_platform and is_a_tty\n# Define ANSI color codes\nANSI_GREEN = \"\\033[32m\" if supports_ansi() else \"\" # Standard green text\nANSI_BRIGHT_GREEN = \"\\033[92m\" if supports_ansi() else \"\" # Bright/bold green text\nANSI_RESET = \"\\033[0m\" if supports_ansi() else \"\" # Reset to default text color\nANSI_BLUE = \"\\033[94m\" if supports_ansi() else \"\" # Bright blue\nANSI_YELLOW = \"\\033[33m\" if supports_ansi() else \"\" # Standard yellow text", + "type": "code", + "location": "/operate/utils/style.py:1-34" + }, + "159": { + "file_id": 12, + "content": "This code defines styles for dialogs, buttons, and other UI elements using the PromptStyle library. 
It also checks if the terminal supports ANSI escape codes for colors and defines ANSI color codes accordingly.", + "type": "comment" + }, + "160": { + "file_id": 12, + "content": "ANSI_RED = \"\\033[31m\" if supports_ansi() else \"\"\nANSI_BRIGHT_MAGENTA = \"\\033[95m\" if supports_ansi() else \"\"  # Bright magenta text", + "type": "code", + "location": "/operate/utils/style.py:35-36" + }, + "161": { + "file_id": 12, + "content": "Checks if the terminal supports ANSI escape codes and sets color variables accordingly.", + "type": "comment" + }, + "162": { + "file_id": 13, + "content": "/requirements-audio.txt", + "type": "filepath" + }, + "163": { + "file_id": 13, + "content": "This file lists the \"whisper-mic\" Python package, which captures microphone input and transcribes it with OpenAI's Whisper model; it is only needed for the framework's voice mode.", + "type": "summary" + }, + "164": { + "file_id": 13, + "content": "whisper-mic", + "type": "code", + "location": "/requirements-audio.txt:1-1" + }, + "165": { + "file_id": 13, + "content": "This file lists the \"whisper-mic\" Python package, which captures microphone input and transcribes it with OpenAI's Whisper model; it is only needed for the framework's voice mode.", + "type": "comment" + }, + "166": { + "file_id": 14, + "content": "/requirements.txt", + "type": "filepath" + }, + "167": { + "file_id": 14, + "content": "The project requires Python packages aiohttp 3.9.1 and ultralytics 8.0.227, listed in the requirements.txt format.", + "type": "summary" + }, + "168": { + "file_id": 14, + "content": "annotated-types==0.6.0\nanyio==3.7.1\ncertifi==2023.7.22\ncharset-normalizer==3.3.2\ncolorama==0.4.6\ncontourpy==1.2.0\ncycler==0.12.1\ndistro==1.8.0\nEasyProcess==1.1\nentrypoint2==1.1\nexceptiongroup==1.1.3\nfonttools==4.44.0\nh11==0.14.0\nhttpcore==1.0.2\nhttpx==0.25.1\nidna==3.4\nimportlib-resources==6.1.1\nkiwisolver==1.4.5\nmatplotlib==3.8.1\nMouseInfo==0.1.3\nmss==9.0.1\nnumpy==1.26.1\nopenai==1.2.3\npackaging==23.2\nPillow==10.1.0\nprompt-toolkit==3.0.39\nPyAutoGUI==0.9.54\npydantic==2.4.2\npydantic_core==2.10.1\nPyGetWindow==0.0.9\nPyMsgBox==1.0.9\npyparsing==3.1.1\npyperclip==1.8.2\nPyRect==0.2.0\npyscreenshot==3.1\nPyScreeze==0.1.29\npython3-xlib==0.15\npython-dateutil==2.8.2\npython-dotenv==1.0.0\npytweening==1.0.7\nrequests==2.31.0\nrubicon-objc==0.4.7\nsix==1.16.0\nsniffio==1.3.0\ntqdm==4.66.1\ntyping_extensions==4.8.0\nurllib3==2.0.7\nwcwidth==0.2.9\nzipp==3.17.0\ngoogle-generativeai==0.3.0", + "type": "code", + "location": "/requirements.txt:1-50" + }, + "169": { + "file_id": 14, + "content": "This is a list of Python package dependencies for a project, specified in requirements.txt format.", + "type": "comment" + }, + "170": { + "file_id": 14, + "content": "aiohttp==3.9.1\nultralytics==8.0.227", + "type": "code", + "location": "/requirements.txt:51-52" + }, + "171": { + "file_id": 14, + "content": "These lines specify the required Python libraries for the project: aiohttp 3.9.1 and ultralytics 8.0.227.", + "type": "comment" + }, + "172": { + "file_id": 15, + "content": "/run.sh", + "type": "filepath" + }, + "173": { + "file_id": 15, + "content": "This Bash script installs SOC: it detects the operating system, installs any missing python3, pip, and git packages with the platform's package manager, handles errors, sets up a virtual environment, configures the .env file with an OpenAI API key prompted from the user, and walks through granting screen-recording and accessibility permissions on Mac.", + "type": "summary" + }, + "174": { + "file_id": 15, + "content": "#!/bin/bash\n#\n# SOC Installer Script v0.0.1\n# GitHub: https://github.com/OthersideAI/self-operating-computer\n# Issues: 
https://github.com/OthersideAI/self-operating-computer/issues\n# Requires: bash, curl/wget, python3, pip, git\n#\n# Please open an issue if you notice any bugs.\n#\n#\n# This script is create by centopw\n#\n#\nclear\necho -e \"\\e[0m\\c\"\nLOG_FILE=\"install_log.txt\"\n# shellcheck disable=SC2016\necho '\n $$$$$$\\ $$$$$$\\ $$$$$$\\ \n$$ __$$\\ $$ __$$\\ $$ __$$\\ \n$$ / \\__|$$ / $$ |$$ / \\__|\n\\$$$$$$\\ $$ | $$ |$$ | \n \\____$$\\ $$ | $$ |$$ | \n$$\\ $$ |$$ | $$ |$$ | $$\\ \n\\$$$$$$ | $$$$$$ |\\$$$$$$ |\n \\______/ \\______/ \\______/ \n Self-Operating-Computer\n--- Created by OthersideAI ---\n'\n# Function to log errors\nlog_error() {\n echo \"Error at $(date): $1\" >> \"$LOG_FILE\"\n}\n# Function to check if a command exists\ncommand_exists() {\n command -v \"$1\" &> /dev/null\n}\n# Function to install packages based on the operating system\ninstall_packages() {\n if [ \"$os\" == \"Linux\" ]; then\n # Use the appropriate package manager for Linux", + "type": "code", + "location": "/run.sh:1-48" + }, + "175": { + "file_id": 15, + "content": "The code is a Bash script for installing the Self-Operating-Computer (SOC) on a Linux system. It starts by clearing the terminal and displaying a welcome message, then defines functions to log errors, check if commands exist, and install packages based on the operating system. The script requires bash, curl/wget, python3, pip, and git.", + "type": "comment" + }, + "176": { + "file_id": 15, + "content": " if command_exists apt-get; then\n sudo apt-get install -y \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n elif command_exists yum; then\n sudo yum install -y \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Unsupported package manager. Please install $1 manually.\"\n exit 1\n fi\n elif [ \"$os\" == \"Darwin\" ]; then\n # Use Homebrew for macOS\n if command_exists brew; then\n brew install \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Homebrew not found. Please install Homebrew and then $1 manually.\"\n exit 1\n fi\n elif [ \"$os\" == \"MINGW64_NT-10.0\" ]; then\n # Use Chocolatey for Windows\n if command_exists choco; then\n choco install \"$1\" -y || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Chocolatey not found. Please install Chocolatey and then $1 manually.\"\n exit 1", + "type": "code", + "location": "/run.sh:49-71" + }, + "177": { + "file_id": 15, + "content": "This code checks the operating system and package manager to install a specified software. If the required package manager is found, it installs the software using sudo commands. If not, it logs an error and exits. For macOS, it uses Homebrew if installed; otherwise, it logs an error and exits. For Windows (MINGW64_NT-10.0), it uses Chocolatey if installed; otherwise, it logs an error and exits.", + "type": "comment" + }, + "178": { + "file_id": 15, + "content": " fi\n else\n log_error \"Unsupported operating system. Please install $1 manually.\"\n exit 1\n fi\n}\n# Function to run a script and log errors\nrun_script() {\n eval \"$1\" || { log_error \"Error running $1.\"; exit 1; }\n}\n# Check the operating system\nos=$(uname -s)\n# Check if Python is installed\nif ! command_exists python3; then\n echo \"Python not found. Installing Python...\"\n install_packages python3\nfi\n# Check if pip is installed\nif ! command_exists pip; then\n echo \"pip not found. Installing pip...\"\n install_packages python3-pip\nfi\n# Check if git is installed\nif ! 
command_exists git; then\n echo \"Git not found. Installing Git...\"\n install_packages git\nfi \n# Create a Python virtual environment\nrun_script \"python3 -m venv venv\"\n# Activate the virtual environment\nsource venv/bin/activate || { log_error \"Unable to activate the virtual environment.\"; exit 1; }\n# Install project requirements\nrun_script \"pip install -r requirements.txt\"\n# Install Project and Command-Line Interface\nrun_script \"pip install .\"", + "type": "code", + "location": "/run.sh:72-115" + }, + "179": { + "file_id": 15, + "content": "This code checks the operating system and ensures Python, pip, and Git are installed. It creates a Python virtual environment and installs project requirements before installing the project itself.", + "type": "comment" + }, + "180": { + "file_id": 15, + "content": "# Check if the .env file exists and the OPENAI_API_KEY is set in it\nif [ -f .env ] && grep -q \"OPENAI_API_KEY\" .env; then\n echo \"OpenAI API key found in .env file. Skipping prompt...\"\nelse\n # Prompt user for Open AI key\n read -p \"Enter your OpenAI API key: \" openai_key\n # Set the API key as an environment variable\n export OPENAI_API_KEY=\"$openai_key\"\n # Create a new .env file\n touch .env\n # Write the API key to the .env file\n echo \"OPENAI_API_KEY='$openai_key'\" > .env\nfi\n# Notify the user about the last step\necho \"Final Step: As a last step, the Terminal app will ask for permission for 'Screen Recording' and 'Accessibility' in the 'Security & Privacy' page of Mac's 'System Preferences.'\"\necho \"Operating system: $os\"\nif [ \"$os\" == \"Darwin\" ]; then\n echo \"Attempting to open Security & Privacy settings...\"\n open /System/Library/PreferencePanes/Security.prefPane\n read -p \"Have you granted the necessary permissions in the Security & Privacy settings? (y/n): \" confirm\n if [ \"$confirm\" != \"y\" ]; then", + "type": "code", + "location": "/run.sh:117-143" + }, + "181": { + "file_id": 15, + "content": "This code checks if the .env file exists and if it contains an OPENAI_API_KEY. If not, it prompts the user to enter their OpenAI API key, stores it in a new .env file as an environment variable, and then informs the user about the final step of granting permissions for 'Screen Recording' and 'Accessibility' in Mac's System Preferences.", + "type": "comment" + }, + "182": { + "file_id": 15, + "content": " echo \"Please grant the necessary permissions and then rerun the script.\"\n exit 1\n fi\nelse\n echo \"Not a macOS system, skipping...\"\nfi\n# End of the script\necho \"Installation complete. Enjoy using the Self-Operating Computer Framework!\"\n# Run the framework\nrun_script \"operate\"", + "type": "code", + "location": "/run.sh:144-155" + }, + "183": { + "file_id": 15, + "content": "The code checks if the system is macOS. If it's not, it skips some steps and informs that it's not a macOS system. If permissions are granted, it proceeds to install the framework and runs it with \"operate\" script.", + "type": "comment" + }, + "184": { + "file_id": 16, + "content": "/setup.py", + "type": "filepath" + }, + "185": { + "file_id": 16, + "content": "This code is using setuptools to create a setup script for the \"self-operating-computer\" package. It imports the necessary modules and reads requirements from \"requirements.txt\". 
It also reads project description from \"README.md\", sets up dependencies, and defines entry points for console scripts.", + "type": "summary" + }, + "186": { + "file_id": 16, + "content": "from setuptools import setup, find_packages\n# Read the contents of your requirements.txt file\nwith open(\"requirements.txt\") as f:\n required = f.read().splitlines()\n# Read the contents of your README.md file for the project description\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as readme_file:\n long_description = readme_file.read()\nsetup(\n name=\"self-operating-computer\",\n version=\"1.1.1\",\n packages=find_packages(),\n install_requires=required, # Add dependencies here\n entry_points={\n \"console_scripts\": [\n \"operate=operate.main:main_entry\",\n ],\n },\n long_description=long_description, # Add project description here\n long_description_content_type=\"text/markdown\", # Specify Markdown format\n # include any other necessary setup options here\n)", + "type": "code", + "location": "/setup.py:1-24" + }, + "187": { + "file_id": 16, + "content": "This code is using setuptools to create a setup script for the \"self-operating-computer\" package. It imports the necessary modules and reads requirements from \"requirements.txt\". It also reads project description from \"README.md\", sets up dependencies, and defines entry points for console scripts.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/titles/0.json b/docs/data/titles/0.json new file mode 100644 index 00000000..b112caca --- /dev/null +++ b/docs/data/titles/0.json @@ -0,0 +1,92 @@ +{ + "/README.md": "Self-Operating Computer Framework: Enhanced Mouse Predictions", + "/README.md:1-26": "Human-Like Computer Control Framework", + "/README.md:126-159": "Enable Voice Mode in Self-Operating-Computer Framework", + "/README.md:159-172": "Join HyperWriteAI Discord, Visit #self-operating-computer, Gpt4Vision Model, API Credits Required", + "/README.md:26-37": "Agent-1-Vision Model Overview", + "/README.md:37-67": "Improving Mouse Click Accuracy", + "/README.md:67-88": "Install, Configure, and Operate: A Comprehensive Guide", + "/README.md:89-124": "Installing SOCF and GMPV", + "/evaluate.py": "Vision Model Image Evaluation", + "/evaluate.py:1-31": "Setting Up Evaluation Test Cases", + "/evaluate.py:106-140": "Test Evaluation and Display", + "/evaluate.py:141-150": "Test Result Display", + "/evaluate.py:32-73": "ANSI Colors for Terminal Support Detection", + "/evaluate.py:75-105": "Evaluate Summary Screenshot: GPT-4 Vision Model Integration", + "/operate/actions.py": "AI-Powered Content Generation", + "/operate/actions.py:1-51": "Action Prediction Model", + "/operate/actions.py:116-153": "Grid Overlay Screenshot Capture", + "/operate/actions.py:154-189": "Screenshot-to-Message AI Model", + "/operate/actions.py:189-215": "Cursor-Guided AI Prompt Enhancement", + "/operate/actions.py:217-248": "GPT-4 Vision Prompt Creation", + "/operate/actions.py:249-275": "Encoding Image for AI Model Generation", + "/operate/actions.py:276-305": "Desktop Screenshot Labeling with GPT-4", + "/operate/actions.py:307-338": "Labeled Click and Decision Prompt System", + "/operate/actions.py:340-364": "API Click Position Calculator", + "/operate/actions.py:365-387": "Click Position Handler", + "/operate/actions.py:390-409": "Fetch OpenAI Chat Completion Asynchronously", + "/operate/actions.py:52-83": "Dynamic Model Caller with Screenshot Capture", + "/operate/actions.py:84-115": "Vision AI Message Encoder", + 
"/operate/dialog.py": "Error-Handling User Input in Dialog Operations", + "/operate/dialog.py:1-44": "Self-Operating Computer Response Model", + "/operate/dialog.py:110-139": "Exception Handling and Action Execution", + "/operate/dialog.py:140-171": "Action Type Check and Process", + "/operate/dialog.py:173-192": "Invalid Input Check and Error Message", + "/operate/dialog.py:46-80": "Voice Mode and WhisperMic Initialization", + "/operate/dialog.py:81-109": "Capturing and Processing Voice Inputs", + "/operate/exceptions.py": "ModelRecognitionException", + "/operate/main.py": "Main Entry Point for Self-Operating Computer", + "/operate/prompts.py": "Context-Based Prompts for AI-Assisted Google Tools", + "/operate/prompts.py:1-33": "Config Settings and Constants in Prompts Module", + "/operate/prompts.py:136-159": "AI-Assisted Web Interaction with Labeled Elements", + "/operate/prompts.py:161-183": "Contextual JSON Responses", + "/operate/prompts.py:185-217": "Prompt Formatting Functions", + "/operate/prompts.py:218-252": "Python Prompt Formatting Functions", + "/operate/prompts.py:33-63": "Interacting with Computers: Tips and Tricks", + "/operate/prompts.py:64-82": "Cursor Position Prompt", + "/operate/prompts.py:82-95": "Guessing Percentages: CLICK Refinement", + "/operate/prompts.py:97-135": "Interactive Prompts for Efficient Tasks", + "/operate/settings.py": "Environment Configurations in Settings.py", + "/operate/settings.py:1-36": "Configuration Manager for Settings", + "/operate/settings.py:37-39": "Set OpenAI API URL with Env Var or Current Value", + "/operate/utils/label.py": "Image Processing Utilities", + "/operate/utils/label.py:1-37": "Validate and Retrieve Image Data Functions", + "/operate/utils/label.py:102-128": "Timestamped Image Saving", + "/operate/utils/label.py:129-152": "Encode Labeled Image in Base64", + "/operate/utils/label.py:153-180": "Triple Backticks Remover & Click Percentage Calculator", + "/operate/utils/label.py:182-182": "Compute Label Percentages", + "/operate/utils/label.py:40-72": "Box Overlap Detection and Labeling Functionality", + "/operate/utils/label.py:74-101": "Bounding Box Labeler", + "/operate/utils/misc.py": "Multifunctional Data Processor", + "/operate/utils/misc.py:1-41": "Converting and Extracting: Misc.py Functions", + "/operate/utils/misc.py:43-74": "Parse JSON Response", + "/operate/utils/misc.py:75-97": "Response Parser and Classifier", + "/operate/utils/misc.py:98-102": "Handling Regex Exceptions in Search Data", + "/operate/utils/os.py": "Circular Motion and Text Input Utility", + "/operate/utils/os.py:1-44": "OS Utilities", + "/operate/utils/os.py:107-131": "Circular Movement Function and Assistant Message Retrieval", + "/operate/utils/os.py:46-85": "Automated OS Interaction Utility", + "/operate/utils/os.py:85-105": "Circular Cursor Clicker", + "/operate/utils/screenshot.py": "Screenshot Capture Utilities", + "/operate/utils/screenshot.py:1-39": "Grid Image Overlay", + "/operate/utils/screenshot.py:115-143": "Cross-Platform Screenshot Capture", + "/operate/utils/screenshot.py:144-178": "Cross-Platform Screenshot Capture Utility", + "/operate/utils/screenshot.py:179-182": "Cross-Platform Screenshot and Cursor Capture Utility", + "/operate/utils/screenshot.py:41-63": "Background Rectangle and Grid Lines Generator", + "/operate/utils/screenshot.py:64-92": "Grid Screenshot Labeler", + "/operate/utils/screenshot.py:93-114": "Screenshot Capture Utility", + "/operate/utils/style.py": "UI Style Configuration with PromptStyle", + 
"/operate/utils/style.py:1-34": "Dialog and UI Styles with PromptStyle", + "/operate/utils/style.py:35-36": "Detect Terminal Color Capabilities", + "/requirements-audio.txt": "Whisper Mic Requirements", + "/requirements.txt": "Python Packages for Project", + "/requirements.txt:1-50": "Python Package Dependencies List", + "/requirements.txt:51-52": "Project Libraries: aiohttp, ultralytics", + "/run.sh": "SOC Linux Install Script", + "/run.sh:1-48": "SOC Linux Installation Script", + "/run.sh:117-143": "OpenAI API Key Configurator", + "/run.sh:144-155": "MacOS Installation Check", + "/run.sh:49-71": "Universal Software Installer", + "/run.sh:72-115": "Automating Python Project Setup", + "/setup.py": "Setting up 'self-operating-computer'" +} \ No newline at end of file diff --git a/docs/doc/347970bb-ed24-4242-ae37-7fe0302c6efb.json b/docs/doc/347970bb-ed24-4242-ae37-7fe0302c6efb.json new file mode 100644 index 00000000..9a831f31 --- /dev/null +++ b/docs/doc/347970bb-ed24-4242-ae37-7fe0302c6efb.json @@ -0,0 +1,30 @@ +{ + "summary": "This Bash script installs SOC on Linux, requires various packages, checks OS for software installation, handles errors, and configures .env file with OpenAI API key while prompting user input and managing permissions on Mac.", + "details": [ + { + "comment": "The code is a Bash script for installing the Self-Operating-Computer (SOC) on a Linux system. It starts by clearing the terminal and displaying a welcome message, then defines functions to log errors, check if commands exist, and install packages based on the operating system. The script requires bash, curl/wget, python3, pip, and git.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":0-47", + "content": "#!/bin/bash\n#\n# SOC Installer Script v0.0.1\n# GitHub: https://github.com/OthersideAI/self-operating-computer\n# Issues: https://github.com/OthersideAI/self-operating-computer/issues\n# Requires: bash, curl/wget, python3, pip, git\n#\n# Please open an issue if you notice any bugs.\n#\n#\n# This script is create by centopw\n#\n#\nclear\necho -e \"\\e[0m\\c\"\nLOG_FILE=\"install_log.txt\"\n# shellcheck disable=SC2016\necho '\n $$$$$$\\ $$$$$$\\ $$$$$$\\ \n$$ __$$\\ $$ __$$\\ $$ __$$\\ \n$$ / \\__|$$ / $$ |$$ / \\__|\n\\$$$$$$\\ $$ | $$ |$$ | \n \\____$$\\ $$ | $$ |$$ | \n$$\\ $$ |$$ | $$ |$$ | $$\\ \n\\$$$$$$ | $$$$$$ |\\$$$$$$ |\n \\______/ \\______/ \\______/ \n Self-Operating-Computer\n--- Created by OthersideAI ---\n'\n# Function to log errors\nlog_error() {\n echo \"Error at $(date): $1\" >> \"$LOG_FILE\"\n}\n# Function to check if a command exists\ncommand_exists() {\n command -v \"$1\" &> /dev/null\n}\n# Function to install packages based on the operating system\ninstall_packages() {\n if [ \"$os\" == \"Linux\" ]; then\n # Use the appropriate package manager for Linux" + }, + { + "comment": "This code checks the operating system and package manager to install a specified software. If the required package manager is found, it installs the software using sudo commands. If not, it logs an error and exits. For macOS, it uses Homebrew if installed; otherwise, it logs an error and exits. 
For Windows (MINGW64_NT-10.0), it uses Chocolatey if installed; otherwise, it logs an error and exits.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":48-70", + "content": " if command_exists apt-get; then\n sudo apt-get install -y \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n elif command_exists yum; then\n sudo yum install -y \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Unsupported package manager. Please install $1 manually.\"\n exit 1\n fi\n elif [ \"$os\" == \"Darwin\" ]; then\n # Use Homebrew for macOS\n if command_exists brew; then\n brew install \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Homebrew not found. Please install Homebrew and then $1 manually.\"\n exit 1\n fi\n elif [ \"$os\" == \"MINGW64_NT-10.0\" ]; then\n # Use Chocolatey for Windows\n if command_exists choco; then\n choco install \"$1\" -y || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Chocolatey not found. Please install Chocolatey and then $1 manually.\"\n exit 1" + }, + { + "comment": "This code checks the operating system and ensures Python, pip, and Git are installed. It creates a Python virtual environment and installs project requirements before installing the project itself.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":71-114", + "content": " fi\n else\n log_error \"Unsupported operating system. Please install $1 manually.\"\n exit 1\n fi\n}\n# Function to run a script and log errors\nrun_script() {\n eval \"$1\" || { log_error \"Error running $1.\"; exit 1; }\n}\n# Check the operating system\nos=$(uname -s)\n# Check if Python is installed\nif ! command_exists python3; then\n echo \"Python not found. Installing Python...\"\n install_packages python3\nfi\n# Check if pip is installed\nif ! command_exists pip; then\n echo \"pip not found. Installing pip...\"\n install_packages python3-pip\nfi\n# Check if git is installed\nif ! command_exists git; then\n echo \"Git not found. Installing Git...\"\n install_packages git\nfi \n# Create a Python virtual environment\nrun_script \"python3 -m venv venv\"\n# Activate the virtual environment\nsource venv/bin/activate || { log_error \"Unable to activate the virtual environment.\"; exit 1; }\n# Install project requirements\nrun_script \"pip install -r requirements.txt\"\n# Install Project and Command-Line Interface\nrun_script \"pip install .\"" + }, + { + "comment": "This code checks if the .env file exists and if it contains an OPENAI_API_KEY. If not, it prompts the user to enter their OpenAI API key, stores it in a new .env file as an environment variable, and then informs the user about the final step of granting permissions for 'Screen Recording' and 'Accessibility' in Mac's System Preferences.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":116-142", + "content": "# Check if the .env file exists and the OPENAI_API_KEY is set in it\nif [ -f .env ] && grep -q \"OPENAI_API_KEY\" .env; then\n echo \"OpenAI API key found in .env file. 
Skipping prompt...\"\nelse\n # Prompt user for Open AI key\n read -p \"Enter your OpenAI API key: \" openai_key\n # Set the API key as an environment variable\n export OPENAI_API_KEY=\"$openai_key\"\n # Create a new .env file\n touch .env\n # Write the API key to the .env file\n echo \"OPENAI_API_KEY='$openai_key'\" > .env\nfi\n# Notify the user about the last step\necho \"Final Step: As a last step, the Terminal app will ask for permission for 'Screen Recording' and 'Accessibility' in the 'Security & Privacy' page of Mac's 'System Preferences.'\"\necho \"Operating system: $os\"\nif [ \"$os\" == \"Darwin\" ]; then\n echo \"Attempting to open Security & Privacy settings...\"\n open /System/Library/PreferencePanes/Security.prefPane\n read -p \"Have you granted the necessary permissions in the Security & Privacy settings? (y/n): \" confirm\n if [ \"$confirm\" != \"y\" ]; then" + }, + { + "comment": "The code checks if the system is macOS. If it's not, it skips some steps and informs that it's not a macOS system. If permissions are granted, it proceeds to install the framework and runs it with \"operate\" script.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":143-154", + "content": " echo \"Please grant the necessary permissions and then rerun the script.\"\n exit 1\n fi\nelse\n echo \"Not a macOS system, skipping...\"\nfi\n# End of the script\necho \"Installation complete. Enjoy using the Self-Operating Computer Framework!\"\n# Run the framework\nrun_script \"operate\"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3a3bb70b-8380-48f2-913e-4618828f90cd.json b/docs/doc/3a3bb70b-8380-48f2-913e-4618828f90cd.json new file mode 100644 index 00000000..b992b991 --- /dev/null +++ b/docs/doc/3a3bb70b-8380-48f2-913e-4618828f90cd.json @@ -0,0 +1,10 @@ +{ + "summary": "This code likely refers to a specific type of microphone called \"whisper-mic,\" which is designed for capturing quiet or whispered audio.", + "details": [ + { + "comment": "This code likely refers to a specific type of microphone called \"whisper-mic,\" which is designed for capturing quiet or whispered audio.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/requirements-audio.txt\":0-0", + "content": "whisper-mic" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4866fd28-ffa6-4bbe-b91f-4767e302f2f1.json b/docs/doc/4866fd28-ffa6-4bbe-b91f-4767e302f2f1.json new file mode 100644 index 00000000..30efc538 --- /dev/null +++ b/docs/doc/4866fd28-ffa6-4bbe-b91f-4767e302f2f1.json @@ -0,0 +1,40 @@ +{ + "summary": "The code has functions to add grids to images and capture screenshots using PIL, accepting input in various formats. It saves the captured image at a specified file path or displays an error message for unsupported platforms.", + "details": [ + { + "comment": "The code imports necessary libraries and defines a function to add a grid to an image. 
It loads the original image, creates a drawing object, gets the image size, and reduces the font size for the grid.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":0-38", + "content": "import os\nimport platform\nimport subprocess\nimport pyautogui\nfrom PIL import Image, ImageDraw, ImageGrab\nimport Xlib.display\nimport Xlib.X\nimport Xlib.Xutil # not sure if Xutil is necessary\nfrom operate.settings import Config\nfrom operate.prompts import ACCURATE_PIXEL_COUNT\n# Load configuration\nconfig = Config()\nmonitor_size = config.monitor_size\ndef add_grid_to_image(original_image_path, new_image_path, grid_interval):\n \"\"\"\n Add a grid to an image.\n Args:\n original_image_path (str): The file path of the original image.\n new_image_path (str): The file path to save the new image with the grid.\n grid_interval (int): The interval between grid lines in pixels.\n Returns:\n None: The function saves the new image with the grid at the specified path.\n \"\"\"\n # Load the image\n image = Image.open(original_image_path)\n # Create a drawing object\n draw = ImageDraw.Draw(image)\n # Get the image size\n width, height = image.size\n # Reduce the font size a bit\n font_size = int(grid_interval / 10) # Reduced font size" + }, + { + "comment": "This function creates a background rectangle for text and draws it with white fill. It also draws vertical lines and labels at every `grid_interval` pixels.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":40-62", + "content": " # Calculate the background size based on the font size\n bg_width = int(font_size * 4.2) # Adjust as necessary\n bg_height = int(font_size * 1.2) # Adjust as necessary\n # Function to draw text with a white rectangle background\n def draw_label_with_background(\n position, text, draw, font_size, bg_width, bg_height\n ):\n # Adjust the position based on the background size\n text_position = (position[0] + bg_width // 2, position[1] + bg_height // 2)\n # Draw the text background\n draw.rectangle(\n [position[0], position[1], position[0] + bg_width, position[1] + bg_height],\n fill=\"white\",\n )\n # Draw the text\n draw.text(text_position, text, fill=\"black\", font_size=font_size, anchor=\"mm\")\n # Draw vertical lines and labels at every `grid_interval` pixels\n for x in range(grid_interval, width, grid_interval):\n line = ((x, 0), (x, height))\n draw.line(line, fill=\"blue\")\n for y in range(grid_interval, height, grid_interval):" + }, + { + "comment": "Calculates the percentage of coordinates and draws labels with background. Draws horizontal lines for grid labels. 
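As a point of reference, here is a minimal usage sketch of the add_grid_to_image helper documented above. It assumes the operate package and its imaging dependencies are importable; the file paths and the 300-pixel interval are illustrative values, not ones taken from the repository.

from operate.utils.screenshot import add_grid_to_image

# Overlay blue grid lines and percentage labels on an existing screenshot.
# Both paths below are hypothetical; grid_interval is the line spacing in pixels.
add_grid_to_image(
    original_image_path="screenshots/screenshot.png",
    new_image_path="screenshots/screenshot_with_grid.png",
    grid_interval=300,
)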
Saves the image with the grid at specified file path.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":63-91", + "content": " # Calculate the percentage of the width and height\n x_percent = round((x / width) * 100)\n y_percent = round((y / height) * 100)\n draw_label_with_background(\n (x - bg_width // 2, y - bg_height // 2),\n f\"{x_percent}%,{y_percent}%\",\n draw,\n font_size,\n bg_width,\n bg_height,\n )\n # Draw horizontal lines - labels are already added with vertical lines\n for y in range(grid_interval, height, grid_interval):\n line = ((0, y), (width, y))\n draw.line(line, fill=\"blue\")\n # Save the image with the grid\n image.save(new_image_path)\ndef capture_mini_screenshot_with_cursor(\n file_path=os.path.join(\"screenshots\", \"screenshot_mini.png\"), x=0, y=0\n):\n \"\"\"\n Capture a mini screenshot with the cursor at the specified coordinates.\n Args:\n file_path (str, optional): The file path to save the screenshot. Defaults to \"screenshots/screenshot_mini.png\"." + }, + { + "comment": "This code is used to take a screenshot of a specific area on the user's monitor using the Python Imaging Library (PIL) and ImageGrab modules. It takes optional x and y coordinates as inputs, which can be specified as integers or percentage strings. The function converts the input values into the appropriate format for calculating the coordinates of the rectangle to capture the screenshot. If the user is on a Linux system, it performs additional calculations to convert percentage-based input into actual pixel coordinates and upscales the image for better visibility.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":92-113", + "content": " x (int or str, optional): The x-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0.\n y (int or str, optional): The y-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0.\n \"\"\"\n user_platform = platform.system()\n if user_platform == \"Linux\":\n x = float(x[:-1]) # convert x from \"50%\" to 50.\n y = float(y[:-1])\n x = (x / 100) * monitor_size[\n \"width\"\n ] # convert x from 50 to 0.5 * monitor_width\n y = (y / 100) * monitor_size[\"height\"]\n # Define the coordinates for the rectangle\n x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)\n x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)\n screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))\n screenshot = screenshot.resize(\n (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS\n ) # upscale the image so it's easier to see and percentage marks more visible" + }, + { + "comment": "Code is capturing a screenshot based on user platform. For non-Darwin platforms, it saves the screenshot, while for Darwin (macOS), it uses screencapture utility to capture the screen with cursor and saves the result. 
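For the mini-screenshot helper covered in this chunk, a hedged usage sketch follows. The import path mirrors the file's location in the diff, and the percentage-string coordinates are illustrative; per the quoted code they are only converted to pixels on Linux and macOS.

from operate.utils.screenshot import capture_mini_screenshot_with_cursor

# Capture and upscale a small region centered on an estimated cursor position,
# given as percentages of the monitor size ("50%" and "60%" are example values).
capture_mini_screenshot_with_cursor(
    file_path="screenshots/screenshot_mini.png",
    x="50%",
    y="60%",
)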
Both versions save the grid screenshot as well.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":114-142", + "content": " screenshot.save(file_path)\n screenshots_dir = \"screenshots\"\n grid_screenshot_filename = os.path.join(\n screenshots_dir, \"screenshot_mini_with_grid.png\"\n )\n add_grid_to_image(\n file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)\n )\n elif user_platform == \"Darwin\":\n x = float(x[:-1]) # convert x from \"50%\" to 50.\n y = float(y[:-1])\n x = (x / 100) * monitor_size[\n \"width\"\n ] # convert x from 50 to 0.5 * monitor_width\n y = (y / 100) * monitor_size[\"height\"]\n x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)\n width = ACCURATE_PIXEL_COUNT\n height = ACCURATE_PIXEL_COUNT\n # Use the screencapture utility to capture the screen with the cursor\n rect = f\"-R{x1},{y1},{width},{height}\"\n subprocess.run([\"screencapture\", \"-C\", rect, file_path])\n screenshots_dir = \"screenshots\"\n grid_screenshot_filename = os.path.join(" + }, + { + "comment": "This code captures a screenshot of the computer's display with cursor and saves it to the specified file path. It checks the user platform (Windows, Linux, or Mac OS) and uses appropriate libraries to capture the screenshot.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":143-177", + "content": " screenshots_dir, \"screenshot_mini_with_grid.png\"\n )\n add_grid_to_image(\n file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)\n )\ndef capture_screen_with_cursor(file_path):\n \"\"\"\n Capture the screen with the cursor and save it to the specified file path.\n Args:\n file_path (str): The file path where the screenshot will be saved.\n Raises:\n None\n Returns:\n None\n \"\"\"\n user_platform = platform.system()\n if user_platform == \"Windows\":\n screenshot = pyautogui.screenshot()\n screenshot.save(file_path)\n elif user_platform == \"Linux\":\n # Use xlib to prevent scrot dependency for Linux\n screen = Xlib.display.Display().screen()\n size = screen.width_in_pixels, screen.height_in_pixels\n monitor_size[\"width\"] = size[0]\n monitor_size[\"height\"] = size[1]\n screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))\n screenshot.save(file_path)\n elif user_platform == \"Darwin\": # (Mac OS)" + }, + { + "comment": "This code captures a screenshot of the computer screen with the cursor, or prints an error message if the platform is not supported.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":178-181", + "content": " # Use the screencapture utility to capture the screen with the cursor\n subprocess.run([\"screencapture\", \"-C\", file_path])\n else:\n print(f\"The platform you're using ({user_platform}) is not currently supported\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4bf7650c-3fea-4149-bd10-682a244a9f3c.json b/docs/doc/4bf7650c-3fea-4149-bd10-682a244a9f3c.json new file mode 100644 index 00000000..63bd8fd6 --- /dev/null +++ b/docs/doc/4bf7650c-3fea-4149-bd10-682a244a9f3c.json @@ -0,0 +1,10 @@ +{ + "summary": "This code defines the main entry point of the Self-Operating Computer, allowing the user to specify a model and input mode. It uses the argparse module to define command line arguments for the model, voice input mode, and prompt. 
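To make the command-line surface concrete, the sketch below drives the installed operate console script with the flags defined by this argparse block; the script name comes from the setup.py entry point earlier in the diff, while the model name and objective text are only examples.

import subprocess

# Equivalent to running: operate -m gpt-4-vision-preview --prompt "Go to Github.com"
# evaluate.py elsewhere in this diff invokes the CLI the same way.
subprocess.run(
    ["operate", "-m", "gpt-4-vision-preview", "--prompt", "Go to Github.com"],
    check=False,
)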
The main function is then called with these arguments.", + "details": [ + { + "comment": "This code defines the main entry point of the Self-Operating Computer, allowing the user to specify a model and input mode. It uses the argparse module to define command line arguments for the model, voice input mode, and prompt. The main function is then called with these arguments.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/main.py\":0-46", + "content": "\"\"\"\nSelf-Operating Computer\n\"\"\"\nimport argparse\nfrom operate.utils.style import ANSI_BRIGHT_MAGENTA\nfrom operate.dialog import main\ndef main_entry():\n parser = argparse.ArgumentParser(\n description=\"Run the self-operating-computer with a specified model.\"\n )\n parser.add_argument(\n \"-m\",\n \"--model\",\n help=\"Specify the model to use\",\n required=False,\n default=\"gpt-4\",\n )\n # Add a voice flag\n parser.add_argument(\n \"--voice\",\n help=\"Use voice input mode\",\n action=\"store_true\",\n )\n # Allow for direct input of prompt\n parser.add_argument(\n \"--prompt\",\n help=\"Directly input the objective prompt\",\n type=str,\n required=False,\n )\n try:\n args = parser.parse_args()\n main(\n args.model,\n terminal_prompt=args.prompt,\n voice_mode=args.voice,\n )\n except KeyboardInterrupt:\n print(f\"\\n{ANSI_BRIGHT_MAGENTA}Exiting...\")\nif __name__ == \"__main__\":\n main_entry()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4e8d04d1-1460-493f-b7c8-e6b09fc01cca.json b/docs/doc/4e8d04d1-1460-493f-b7c8-e6b09fc01cca.json new file mode 100644 index 00000000..e744670f --- /dev/null +++ b/docs/doc/4e8d04d1-1460-493f-b7c8-e6b09fc01cca.json @@ -0,0 +1,30 @@ +{ + "summary": "The code uses GPT-4 Vision model to evaluate image adherence to guidelines, displays results with color-coded messages after setting up test cases and formatting prompts. It also checks the result of an objective, prints outcome (PASS or FAIL) along with passed/failed tests count, and resets colors for readability.", + "details": [ + { + "comment": "The code is importing necessary libraries and defining constants for the evaluation process. It appears to be setting up a test case dictionary and a function to determine if a given guideline is met in an image based on a screenshot.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":0-30", + "content": "import sys\nimport os\nimport subprocess\nimport platform\nimport base64\nimport json\nimport openai\nfrom dotenv import load_dotenv\n# \"Objective for `operate`\" : \"Guideline for passing this test case given to GPT-4v\"\nTEST_CASES = {\n \"Go to Github.com\": \"The Github home page is visible.\",\n \"Go to Youtube.com and play a video\": \"The YouTube video player is visible.\",\n}\nEVALUATION_PROMPT = \"\"\"\nYour job is to look at the given screenshot and determine if the following guideline is met in the image.\nYou must respond in the following format ONLY. Do not add anything else:\n{{ \"guideline_met\": (true|false), \"reason\": \"Explanation for why guideline was or wasn't met\" }}\nguideline_met must be set to a JSON boolean. 
True if the image meets the given guideline.\nreason must be a string containing a justification for your decision.\nGuideline: {guideline}\n\"\"\"\nSUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')\n# Check if on a windows terminal that supports ANSI escape codes\ndef supports_ansi():\n \"\"\"" + }, + { + "comment": "This code checks if the terminal supports ANSI escape codes and sets corresponding colors based on the platform. If supported, it defines various colored text variables. Otherwise, it sets them to empty strings. The code also includes functions for formatting an evaluation prompt and parsing evaluation content.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":31-72", + "content": " Check if the terminal supports ANSI escape codes\n \"\"\"\n plat = platform.system()\n supported_platform = plat != \"Windows\" or \"ANSICON\" in os.environ\n is_a_tty = hasattr(sys.stdout, \"isatty\") and sys.stdout.isatty()\n return supported_platform and is_a_tty\nif supports_ansi():\n # Standard green text\n ANSI_GREEN = \"\\033[32m\"\n # Bright/bold green text\n ANSI_BRIGHT_GREEN = \"\\033[92m\"\n # Reset to default text color\n ANSI_RESET = \"\\033[0m\"\n # ANSI escape code for blue text\n ANSI_BLUE = \"\\033[94m\" # This is for bright blue\n # Standard yellow text\n ANSI_YELLOW = \"\\033[33m\"\n ANSI_RED = \"\\033[31m\"\n # Bright magenta text\n ANSI_BRIGHT_MAGENTA = \"\\033[95m\"\nelse:\n ANSI_GREEN = \"\"\n ANSI_BRIGHT_GREEN = \"\"\n ANSI_RESET = \"\"\n ANSI_BLUE = \"\"\n ANSI_YELLOW = \"\"\n ANSI_RED = \"\"\n ANSI_BRIGHT_MAGENTA = \"\"\ndef format_evaluation_prompt(guideline):\n prompt = EVALUATION_PROMPT.format(guideline=guideline)\n return prompt\ndef parse_eval_content(content):\n try:\n res = json.loads(content)" + }, + { + "comment": "Code function: evaluate_summary_screenshot\nPurpose: Evaluate if the summary screenshot meets a given guideline\nActions: \n1. Loads the summary screenshot\n2. Encodes it in base64 format\n3. Creates an evaluation message with text and image\n4. Sends the message to OpenAI's GPT-4 Vision model for evaluation", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":74-104", + "content": " print(res[\"reason\"])\n return res[\"guideline_met\"]\n except:\n print(\"The model gave a bad evaluation response and it couldn't be parsed. Exiting...\")\n exit(1)\ndef evaluate_summary_screenshot(guideline):\n '''Load the summary screenshot and return True or False if it meets the given guideline.'''\n with open(SUMMARY_SCREENSHOT_PATH, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n eval_message = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": format_evaluation_prompt(guideline)},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }]\n response = openai.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=eval_message,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300," + }, + { + "comment": "The code evaluates whether a test case meets its given guideline. It runs the \"operate\" function with the test case prompt and then calls the \"evaluate_summary_screenshot\" function to compare the result against the guideline. If the operation is successful, it prints a success message; otherwise, it prints an error message. 
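The objective-to-guideline pairing can be illustrated with a small sketch of the TEST_CASES mapping; the first entry is taken verbatim from the quoted code, while the Wikipedia entry is an invented example of how a new case would be added.

TEST_CASES = {
    # Verbatim from evaluate.py: objective given to `operate`, guideline given to GPT-4V.
    "Go to Github.com": "The Github home page is visible.",
    # Hypothetical additional case in the same objective -> guideline format.
    "Go to Wikipedia.org": "The Wikipedia home page is visible.",
}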
The code loops through all the TEST_CASES, counts the number of passed and failed tests, and finally displays the results in color-coded messages.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":105-139", + "content": " )\n eval_content = response.choices[0].message.content\n return parse_eval_content(eval_content)\ndef run_test_case(objective, guideline):\n '''Returns True if the result of the test with the given prompt meets the given guideline.'''\n # Run `operate` with the test case prompt\n subprocess.run(['operate', '--prompt', f'\"{objective}\"'], stdout=subprocess.DEVNULL)\n try:\n result = evaluate_summary_screenshot(guideline)\n except(OSError):\n print(\"Couldn't open the summary screenshot\")\n return False\n return result\ndef main():\n load_dotenv()\n openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n print(f\"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}\")\n passed = 0; failed = 0\n for objective, guideline in TEST_CASES.items():\n print(f\"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'\")\n result = run_test_case(objective, guideline)\n if result:\n print(f\"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'\")\n passed += 1" + }, + { + "comment": "The code snippet checks the result of an objective and prints the outcome (PASS or FAIL) along with the count of passed and failed tests. It resets colors for readability.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":140-149", + "content": " else:\n print(f\"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'\")\n failed += 1\n print(\n f\"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed\"\n )\nif __name__ == \"__main__\":\n main()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/52984b9f-96af-4dbe-bc63-7aff1b5d9c1f.json b/docs/doc/52984b9f-96af-4dbe-bc63-7aff1b5d9c1f.json new file mode 100644 index 00000000..778341f6 --- /dev/null +++ b/docs/doc/52984b9f-96af-4dbe-bc63-7aff1b5d9c1f.json @@ -0,0 +1,35 @@ +{ + "summary": "Both comments discuss code that handles user input and executes corresponding actions, with Comment A focusing on a Self-Operating Computer setup and error handling, while Comment B focuses on input parameter checks for dialog operations.", + "details": [ + { + "comment": "This code appears to be part of a Self-Operating Computer, which uses a model for generating responses. The main function takes in the model, terminal prompt, and voice mode as parameters. 
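A minimal sketch of calling this entry point directly, assuming the package is installed and the OpenAI API key configuration described elsewhere in this diff is in place; the model name and objective are illustrative only.

from operate.dialog import main

# Run a single objective without the interactive dialog or voice capture.
main("gpt-4", terminal_prompt="Go to Github.com", voice_mode=False)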
It initializes `WhisperMic` if voice mode is enabled.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":0-43", + "content": "import sys\nimport os\nimport platform\nimport asyncio\nfrom prompt_toolkit.shortcuts import message_dialog\nfrom prompt_toolkit import prompt\nfrom operate.exceptions import ModelNotRecognizedException\nfrom operate.prompts import USER_QUESTION\nfrom operate.settings import Config\nfrom operate.utils.style import (\n ANSI_GREEN,\n ANSI_RESET,\n ANSI_BLUE,\n ANSI_YELLOW,\n ANSI_RED,\n ANSI_BRIGHT_MAGENTA,\n style,\n)\nfrom operate.utils.os import (\n keyboard_type,\n search,\n click,\n)\nfrom operate.actions import get_next_action, summarize\nfrom operate.utils.misc import parse_response\n# Load configuration\nconfig = Config()\ndef main(model, terminal_prompt, voice_mode=False):\n \"\"\"\n Main function for the Self-Operating Computer.\n Parameters:\n - model: The model used for generating responses.\n - terminal_prompt: A string representing the prompt provided in the terminal.\n - voice_mode: A boolean indicating whether to enable voice mode.\n Returns:\n None\n \"\"\"\n mic = None\n # Initialize `WhisperMic`, if `voice_mode` is True" + }, + { + "comment": "Checks if voice mode is enabled, then tries to import and initialize the WhisperMic module. If the module is missing, it prints an error message and exits. Displays a message dialog unless the prompt was given directly via terminal. Skips objective prompt if provided as an argument or prompts for input through the WhisperMic in voice mode. Clears the console on all operating systems except Windows where it uses \"cls\" command.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":45-79", + "content": " validation(model, voice_mode)\n if voice_mode:\n try:\n from whisper_mic import WhisperMic\n # Initialize WhisperMic if import is successful\n mic = WhisperMic()\n except ImportError:\n print(\n \"Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'\"\n )\n sys.exit(1)\n # Skip message dialog if prompt was given directly\n if not terminal_prompt:\n message_dialog(\n title=\"Self-Operating Computer\",\n text=\"Ask a computer to do anything.\",\n style=style,\n ).run()\n else:\n print(\"Running direct prompt...\")\n print(\"SYSTEM\", platform.system())\n # Clear the console\n if platform.system() == \"Windows\":\n os.system(\"cls\")\n else:\n print(\"\\033c\", end=\"\")\n if terminal_prompt: # Skip objective prompt if it was given as an argument\n objective = terminal_prompt\n elif voice_mode:\n print(" + }, + { + "comment": "The code is capturing voice input from the microphone and storing it in the \"objective\" variable. If an error occurs while capturing voice input, it will print an error message and exit. Otherwise, it prints a message from the self-operating computer and the user's question, then stores the objective as the user's message content. It then enters a loop where it waits for the next action by calling a function \"get_next_action\" with the current messages and objective. If an error occurs while waiting for the next action, it will print an error message.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":80-108", + "content": " f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... 
(speak now)\"\n )\n try:\n objective = mic.listen()\n except Exception as e:\n print(f\"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}\")\n return # Exit if voice input fails\n else:\n print(f\"{ANSI_GREEN}[Self-Operating Computer]\\n{ANSI_RESET}{USER_QUESTION}\")\n print(f\"{ANSI_YELLOW}[User]{ANSI_RESET}\")\n objective = prompt(style=style)\n assistant_message = {\"role\": \"assistant\", \"content\": USER_QUESTION}\n user_message = {\n \"role\": \"user\",\n \"content\": f\"Objective: {objective}\",\n }\n messages = [assistant_message, user_message]\n loop_count = 0\n while True:\n if config.debug:\n print(\"[loop] messages before next action:\\n\\n\\n\", messages[1:])\n try:\n response = asyncio.run(get_next_action(model, messages, objective))\n action = parse_response(response)\n action_type = action.get(\"type\")" + }, + { + "comment": "The code is handling exceptions for a ModelNotRecognizedException and any other exception that occurs during the execution. It then checks if the action_type is \"DONE\", if so, it prints a completion message, summarizes the model, and exits. If the action_type is not unknown, it prints an act message along with the action type and detail, and initializes an empty function_response variable if the action type is \"SEARCH\".", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":109-138", + "content": " action_detail = action.get(\"data\")\n except ModelNotRecognizedException as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}\"\n )\n break\n except Exception as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}\"\n )\n break\n if action_type == \"DONE\":\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}\"\n )\n summary = summarize(model, messages, objective)\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\\n{ANSI_RESET}{summary}\"\n )\n break\n if action_type != \"UNKNOWN\":\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} {ANSI_RESET}{action_detail}\"\n )\n function_response = \"\"\n if action_type == \"SEARCH\":" + }, + { + "comment": "This code block checks the action type and performs the corresponding action. If the action type is not recognized, it prints an error message and breaks the loop. It also logs the act completion and updates the messages list for further processing.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":139-170", + "content": " function_response = search(action_detail)\n elif action_type == \"TYPE\":\n function_response = keyboard_type(action_detail)\n elif action_type == \"CLICK\":\n function_response = click(action_detail)\n else:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}\"\n )\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\\n{ANSI_RESET}{response}\"\n )\n break\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} COMPLETE {ANSI_RESET}{function_response}\"\n )\n message = {\n \"role\": \"assistant\",\n \"content\": function_response,\n }\n messages.append(message)\n loop_count += 1\n if loop_count > 15:\n break\ndef validation(model, voice_mode):\n \"\"\"\n Validate the input parameters for the dialog operation." 
+ }, + { + "comment": "This code checks the input parameters for dialog operation and raises SystemExit if the input parameters are invalid. It also prints a message indicating which API key is missing based on the chosen model.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":172-191", + "content": " Args:\n model (str): The model to be used for the dialog operation.\n voice_mode (bool): Flag indicating whether to use voice mode.\n Raises:\n SystemExit: If the input parameters are invalid.\n \"\"\"\n if voice_mode and not config.openai_api_key:\n print(\"To use voice mode, please add an OpenAI API key\")\n sys.exit(1)\n if model == \"gpt-4-vision-preview\" and not config.openai_api_key:\n print(\"To use `gpt-4-vision-preview` add an OpenAI API key\")\n sys.exit(1)\n if model == \"gemini-pro-vision\" and not config.google_api_key:\n print(\"To use `gemini-pro-vision` add a Google API key\")\n sys.exit(1)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/6d712400-8eb0-4525-9321-2f4dafe97412.json b/docs/doc/6d712400-8eb0-4525-9321-2f4dafe97412.json new file mode 100644 index 00000000..fa6539b6 --- /dev/null +++ b/docs/doc/6d712400-8eb0-4525-9321-2f4dafe97412.json @@ -0,0 +1,15 @@ +{ + "summary": "The project requires Python packages aiohttp 3.9.1 and ultralytics 8.0.227, listed in the requirements.txt format.", + "details": [ + { + "comment": "This is a list of Python package dependencies for a project, specified in requirements.txt format.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/requirements.txt\":0-49", + "content": "annotated-types==0.6.0\nanyio==3.7.1\ncertifi==2023.7.22\ncharset-normalizer==3.3.2\ncolorama==0.4.6\ncontourpy==1.2.0\ncycler==0.12.1\ndistro==1.8.0\nEasyProcess==1.1\nentrypoint2==1.1\nexceptiongroup==1.1.3\nfonttools==4.44.0\nh11==0.14.0\nhttpcore==1.0.2\nhttpx==0.25.1\nidna==3.4\nimportlib-resources==6.1.1\nkiwisolver==1.4.5\nmatplotlib==3.8.1\nMouseInfo==0.1.3\nmss==9.0.1\nnumpy==1.26.1\nopenai==1.2.3\npackaging==23.2\nPillow==10.1.0\nprompt-toolkit==3.0.39\nPyAutoGUI==0.9.54\npydantic==2.4.2\npydantic_core==2.10.1\nPyGetWindow==0.0.9\nPyMsgBox==1.0.9\npyparsing==3.1.1\npyperclip==1.8.2\nPyRect==0.2.0\npyscreenshot==3.1\nPyScreeze==0.1.29\npython3-xlib==0.15\npython-dateutil==2.8.2\npython-dotenv==1.0.0\npytweening==1.0.7\nrequests==2.31.0\nrubicon-objc==0.4.7\nsix==1.16.0\nsniffio==1.3.0\ntqdm==4.66.1\ntyping_extensions==4.8.0\nurllib3==2.0.7\nwcwidth==0.2.9\nzipp==3.17.0\ngoogle-generativeai==0.3.0" + }, + { + "comment": "These lines specify the required Python libraries for the project: aiohttp 3.9.1 and ultralytics 8.0.227.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/requirements.txt\":50-51", + "content": "aiohttp==3.9.1\nultralytics==8.0.227" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7b70d3c1-0ec1-4e5e-b03c-32b9fa9253b5.json b/docs/doc/7b70d3c1-0ec1-4e5e-b03c-32b9fa9253b5.json new file mode 100644 index 00000000..1d270f5d --- /dev/null +++ b/docs/doc/7b70d3c1-0ec1-4e5e-b03c-32b9fa9253b5.json @@ -0,0 +1,40 @@ +{ + "summary": "The code includes functions for handling image data, such as drawing bounding boxes and validating overlaps, as well as encoding tasks like converting images to base64 and formatting message content by removing triple backticks and calculating click positions.", + "details": [ + { + "comment": "The code defines two functions:\n1. 
`validate_and_extract_image_data`: Validates the given data and extracts image URL if the request is valid.\n2. `get_label_coordinates`: Retrieves the coordinates for a given label from a dictionary of labels and their coordinates.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":0-36", + "content": "import io\nimport base64\nimport json\nimport os\nimport time\nimport asyncio\nfrom PIL import Image, ImageDraw\ndef validate_and_extract_image_data(data):\n if not data or \"messages\" not in data:\n raise ValueError(\"Invalid request, no messages found\")\n messages = data[\"messages\"]\n if (\n not messages\n or not isinstance(messages, list)\n or not messages[-1].get(\"image_url\")\n ):\n raise ValueError(\"No image provided or incorrect format\")\n image_data = messages[-1][\"image_url\"][\"url\"]\n if not image_data.startswith(\"data:image\"):\n raise ValueError(\"Invalid image format\")\n return image_data.split(\"base64,\")[-1], messages\ndef get_label_coordinates(label, label_coordinates):\n \"\"\"\n Retrieves the coordinates for a given label.\n :param label: The label to find coordinates for (e.g., \"~1\").\n :param label_coordinates: Dictionary containing labels and their coordinates.\n :return: Coordinates of the label or None if the label is not found.\n \"\"\"\n return label_coordinates.get(label)" + }, + { + "comment": "The function `is_overlapping` checks if two boxes overlap by comparing their coordinates. If there is no overlap, the function returns False; otherwise, it returns True.\n\nThe `add_labels` function decodes base64 data into image bytes and opens it as an image using PIL. It creates copies of the original image and a debug image. The YOLO model applies object detection on the image. The code then draws on the images using the ImageDraw module, and stores label coordinates in a dictionary named `label_coordinates`.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":39-71", + "content": "def is_overlapping(box1, box2):\n x1_box1, y1_box1, x2_box1, y2_box1 = box1\n x1_box2, y1_box2, x2_box2, y2_box2 = box2\n # Check if there is no overlap\n if x1_box1 > x2_box2 or x1_box2 > x2_box1:\n return False\n if (\n y1_box1 > y2_box2 or y1_box2 > y2_box1\n ): # Adjusted to check 100px proximity above\n return False\n return True\ndef add_labels(base64_data, yolo_model):\n image_bytes = base64.b64decode(base64_data)\n image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line\n image_debug = image_labeled.copy() # Create a copy for the debug image\n image_original = (\n image_labeled.copy()\n ) # Copy of the original image for base64 return\n results = yolo_model(image_labeled)\n draw = ImageDraw.Draw(image_labeled)\n debug_draw = ImageDraw.Draw(\n image_debug\n ) # Create a separate draw object for the debug image\n font_size = 45\n detections_dir = \"detections\"\n label_coordinates = {} # Dictionary to store coordinates" + }, + { + "comment": "Creates a directory for detections if it doesn't exist. Loops through the results, drawing bounding boxes and labels on images. 
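A small worked sketch of the overlap check described above, using the (x1, y1, x2, y2) box convention from the quoted code; the coordinates are invented for illustration.

from operate.utils.label import is_overlapping

# The boxes share the region from (40, 40) to (50, 50), so this should be True.
print(is_overlapping((10, 10, 50, 50), (40, 40, 90, 90)))
# These boxes are disjoint along the x-axis, so this should be False.
print(is_overlapping((10, 10, 20, 20), (30, 10, 40, 20)))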
Avoids redrawing over existing boxes by checking overlaps before redrawing as red boxes.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":73-100", + "content": " if not os.path.exists(detections_dir):\n os.makedirs(detections_dir)\n counter = 0\n drawn_boxes = [] # List to keep track of boxes already drawn\n for result in results:\n if hasattr(result, \"boxes\"):\n for det in result.boxes:\n bbox = det.xyxy[0]\n x1, y1, x2, y2 = bbox.tolist()\n debug_label = \"D_\" + str(counter)\n debug_index_position = (x1, y1 - font_size)\n debug_draw.rectangle([(x1, y1), (x2, y2)], outline=\"blue\", width=1)\n debug_draw.text(\n debug_index_position,\n debug_label,\n fill=\"blue\",\n font_size=font_size,\n )\n overlap = any(\n is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes\n )\n if not overlap:\n draw.rectangle([(x1, y1), (x2, y2)], outline=\"red\", width=1)\n label = \"~\" + str(counter)" + }, + { + "comment": "Code saves labeled, debug, and original images with timestamped file names. It also writes the labeled image to a BytesIO object for potential future use.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":101-127", + "content": " index_position = (x1, y1 - font_size)\n draw.text(\n index_position,\n label,\n fill=\"red\",\n font_size=font_size,\n )\n # Add the non-overlapping box to the drawn_boxes list\n drawn_boxes.append((x1, y1, x2, y2))\n label_coordinates[label] = (x1, y1, x2, y2)\n counter += 1\n # Save the image\n timestamp = time.strftime(\"%Y%m%d-%H%M%S\")\n output_path = os.path.join(detections_dir, f\"img_{timestamp}_labeled.png\")\n output_path_debug = os.path.join(detections_dir, f\"img_{timestamp}_debug.png\")\n output_path_original = os.path.join(detections_dir, f\"img_{timestamp}_original.png\")\n image_labeled.save(output_path)\n image_debug.save(output_path_debug)\n image_original.save(output_path_original)\n buffered_original = io.BytesIO()\n image_original.save(buffered_original, format=\"PNG\") # I guess this is needed" + }, + { + "comment": "Convert image to base64 for return\nCode is saving the labeled image as PNG and encoding it in base64 format", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":128-151", + "content": " img_base64_original = base64.b64encode(buffered_original.getvalue()).decode(\"utf-8\")\n # Convert image to base64 for return\n buffered_labeled = io.BytesIO()\n image_labeled.save(buffered_labeled, format=\"PNG\") # I guess this is needed\n img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode(\"utf-8\")\n return img_base64_labeled, img_base64_original, label_coordinates\ndef parse_click_content(message_content):\n \"\"\"\n Parses the response message to determine if it's a CLICK or NONE action and returns the appropriate data.\n :param message_content: The content of the response message.\n :return: A dictionary with the relevant data or a message indicating a NONE action.\n \"\"\"\n try:\n # Check for and remove erroneous ```json at the start and ``` at the end\n if message_content.startswith(\"```json\"):\n message_content = message_content[\n len(\"```json\") :\n ] # Remove starting ```json\n if message_content.endswith(\"```\"):" + }, + { + "comment": "This function takes in a message content formatted with triple backticks and removes them. If the format is invalid, it returns an error message. 
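A hedged sketch of the backtick-stripping parser just described; the response string is an invented example in the labeled-click JSON format used elsewhere in this diff.

from operate.utils.label import parse_click_content

raw = '```json\n{"decision": "Click the address bar", "reason": "Need to type a URL", "label": "~4"}\n```'
# Expected to strip the ```json fences and return the parsed dictionary.
print(parse_click_content(raw))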
It also has another function that calculates the click position at the center of a bounding box and converts it to percentages.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":152-179", + "content": " message_content = message_content[: -len(\"```\")] # Remove ending ```\n # Convert JSON string to dictionary\n return json.loads(message_content.strip())\n except json.JSONDecodeError as e:\n return {\"error\": \"Invalid JSON format\"}\n return {\"error\": \"Invalid response format\"}\ndef get_click_position_in_percent(coordinates, image_size):\n \"\"\"\n Calculates the click position at the center of the bounding box and converts it to percentages.\n :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2).\n :param image_size: A tuple of the image dimensions (width, height).\n :return: A tuple of the click position in percentages (x_percent, y_percent).\n \"\"\"\n if not coordinates or not image_size:\n return None\n # Calculate the center of the bounding box\n x_center = (coordinates[0] + coordinates[2]) / 2\n y_center = (coordinates[1] + coordinates[3]) / 2\n # Convert to percentages\n x_percent = (x_center / image_size[0]) * 100\n y_percent = (y_center / image_size[1]) * 100" + }, + { + "comment": "Computes x and y percentages from input values.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":181-181", + "content": " return x_percent, y_percent" + } + ] +} \ No newline at end of file diff --git a/docs/doc/945b7651-d5be-4fe6-a83b-e389682cbcdb.json b/docs/doc/945b7651-d5be-4fe6-a83b-e389682cbcdb.json new file mode 100644 index 00000000..0f5ef9e6 --- /dev/null +++ b/docs/doc/945b7651-d5be-4fe6-a83b-e389682cbcdb.json @@ -0,0 +1,50 @@ +{ + "summary": "The code provides functions for AI-assisted user interaction with Google Chrome, Docs, and Sheets using prompts like CLICK, TYPE, SEARCH, and DONE. It emphasizes context-based options selection rather than IDs, and offers percentage values for accuracy improvement in the \"percent\" CLICK action by segmenting lines. Additionally, it includes functions for formatting different types of prompts used in a vision system, including accurate mode vision prompt, decision prompt, and labeled image prompt, which take specific arguments and format them into predefined prompt templates.", + "details": [ + { + "comment": "Code is importing Config settings and defining constants for user prompts and vision prompt.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":0-32", + "content": "from operate.settings import Config\nconfig = Config()\nmonitor_size = config.monitor_size\n# General user Prompts\nUSER_QUESTION = \"Hello, I can help you with anything. What would you like done?\"\n# constants for the vision prompt\nACCURATE_PIXEL_COUNT = (\n 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big\n)\n# -------------------------\n# VISION PROMPT\n# -------------------------\nVISION_PROMPT = \"\"\"\nYou are a Self-Operating Computer. You use the same operating system as a human.\nFrom looking at the screen and the objective your goal is to take the best next action.\nTo operate the computer you have the four options below.\n1. CLICK - Move mouse and click\n2. TYPE - Type on the keyboard\n3. SEARCH - Search for a program on Mac and open it\n4. 
DONE - When you completed the task respond with the exact following phrase content\nHere are the response formats below.\n1. CLICK\nResponse: CLICK {{ \"x\": \"percent\", \"y\": \"percent\", \"description\": \"~description here~\", \"reason\": \"~reason here~\" }} \nNote tha" + }, + { + "comment": "The code provides instructions for interacting with the computer, including typing, searching, and clicking. It also includes tips for using specific applications like Google Chrome, Google Docs, and Google Sheets.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":32-62", + "content": "t the percents work where the top left corner is \"x\": \"0%\" and \"y\": \"0%\" and the bottom right corner is \"x\": \"100%\" and \"y\": \"100%\"\n2. TYPE\nResponse: TYPE \n2. SEARCH\nResponse: SEARCH \n3. DONE\nResponse: DONE\nHere are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\nTYPE Hello, I hope you are doing well. I wanted to follow up\n__\nObjective: Open Spotify and play the beatles\nSEARCH Spotify\n__\nObjective: Find an image of a banana\nCLICK {{ \"x\": \"50%\", \"y\": \"60%\", \"description\": \"Click: Google Search field\", \"reason\": \"This will allow me to search for a banana\" }}\n__\nObjective: Go buy a book about the history of the internet\nTYPE https://www.amazon.com/\n__\nA few important notes:\n- Default to opening Google Chrome with SEARCH to find things that are on the internet.\n- Go to Google Docs and Google Sheets by typing in the Chrome Address bar\n- When opening Chrome, if you see a profile icon click that to open chrome fully, it is located at: {{ \"x\": \"50%\", \"y\": \"55%\" }}" + }, + { + "comment": "This code is for a prompt in a program that assists users with computer tasks. The prompt provides information about the current cursor position and suggests to examine an additional screenshot before performing the next action.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":63-81", + "content": "- The Chrome address bar is generally at: {{ \"x\": \"50%\", \"y\": \"9%\" }}\n- After you click to enter a field you can go ahead and start typing!\n- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.\n{previous_action}\nIMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.\nObjective: {objective}\n\"\"\"\n# ----------------------------------\n# ACCURATE MODE VISION PROMPT\n# ----------------------------------\nACCURATE_MODE_VISION_PROMPT = \"\"\"\nIt looks like your previous attempted action was clicking on \"x\": {prev_x}, \"y\": {prev_y}. This has now been moved to the center of this screenshot.\nAs additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. \nThis screenshot was taken around the location of the current cursor that you just tried clicking o" + }, + { + "comment": "This code is providing a prompt to the user, explaining how to use percentage values to refine their previous x and y coordinate guesses. It also mentions that there are four segmenting lines across each dimension for better context in locating the cursor. 
The purpose of this prompt is to help the user further refine their \"percent\" location in the CLICK action.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":81-94", + "content": "n (\"x\": {prev_x}, \"y\": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess.\nIf you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the \"x\" and subtract {height}% in the \"y\" to your previous answer.\nLikewise, to achieve the bottom right of this mini screenshot you will add {width}% in the \"x\" and add {height}% in the \"y\" to your previous answer.\nThere are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer.\nPlease use this context as additional info to further refine the \"percent\" location in the CLICK action!\n\"\"\"\nDECISION_PROMPT = \"\"\"\nYou are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective.\nHere are your methods you can use to operating the computer." + }, + { + "comment": "Code provides instructions and response formats for four types of actions (CLICK, TYPE, SEARCH, DONE) based on different objectives like following up with a vendor, playing music, or opening websites. It also includes important notes about using Google Chrome for web searches and avoiding SEARCH for certain websites like Google Docs or LinkedIn.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":96-134", + "content": "1. CLICK - Move mouse and click\n2. TYPE - Type on the keyboard\n3. SEARCH - Search for a program that is installed on Mac locally and open it\n4. DONE - When you completed the task respond with the exact following phrase content\nHere are the response formats below.\n1. CLICK\nResponse: CLICK\n2. TYPE\nResponse: TYPE \"value you want to type\"\n2. SEARCH\nResponse: SEARCH \"app you want to search for on Mac\"\n3. DONE\nResponse: DONE\nHere are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\nTYPE Hello, I hope you are doing well. I wanted to follow up\n__\nObjective: Open Spotify and play the beatles\nSEARCH Spotify\n__\nObjective: Find an image of a banana\nCLICK\n__\nObjective: Go buy a book about the history of the internet\nTYPE https://www.amazon.com/\n__\nA few important notes:\n- Default to opening Google Chrome with SEARCH to find things that are on the Web.\n- After you open Google Chrome you need to click on the address bar to find a website.\n- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer." + }, + { + "comment": "This code is for an AI-assisted task where the user needs to interact with a webpage. The AI should identify and click on labeled elements that bring them closer to their objective, using IDs in the format '~x'. The response should include the decision (label), reason, and label identifier. 
Avoid repeating actions like clicking the same element twice in a row.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":135-158", + "content": "- After you click to enter a field you can go ahead and start typing!\n- If you can see the field is active, go ahead and type!\n- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.\n{previous_action}\nIMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.\n{objective}\n\"\"\"\nLABELED_IMAGE_PROMPT = \"\"\"\nYour job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs.\nImportant to remember, you can only click on labeled elements. \nLabel IDs are in the following format with `x` being a number: `~x`\nThe labels are placed just above the bounding boxes so that they can be read clearly. \nResponse formats below.\n1. CLICK - If there is a label that gets you closer to the objective, go ahead and click it. \nResponse: {{ \"decision\": \"~decision here~\", \"reason\": \"~reason here~\", \"label\": \"~x\" }} " + }, + { + "comment": "Code comments:\n1. Analyzes user's request and provides appropriate response options in JSON format.\n2. User needs to choose the ID based on context and not its position.\n3. IDs have no significance, they just serve as references for selecting options.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":160-182", + "content": "Here are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\n{{ \"decision\": \"Click the Outlook send button\", \"reason\": \"I can see the email is already written and now I just need to send it.\", \"label\": \"~27\" }}\n__\nObjective: Play the Holiday music on YouTube\n{{ \"decision\": \"Click on the Play button\", \"reason\": \"It appears there is a row with a holiday song available in the Spotify UI\", \"label\": \"~3\" }}\n__\nA few important notes:\n- When navigating the web you'll need to click on the address bar first. Look closely to find the address bar's label it could be any number.\n- The IDs number has NO SIGNIFICANCE. For instance if ID is ~0 or ~1 it does not mean it is first or on top. CHOOSE THE ID BASED ON THE CONTEXT OF THE IMAGE AND IF IT HELPS REACH THE OBJECTIVE. \n- Do not preappend with ```json, just return the JSON object.\n{objective}\n\"\"\"\n# -------------------------\n# SUMMARY PROMPT\n# -------------------------\nSUMMARY_PROMPT = \"\"\"\nYou are a Self-Operating Computer. A user request has been executed. Present the results succinctly." + }, + { + "comment": "This code defines two functions, `format_summary_prompt` and `format_vision_prompt`, which format prompts for summarizing the outcomes of a task and providing vision guidance based on previous actions taken. The `objective` parameter is used to state the original objective, while `previous_action` is optional and used when there have been previous actions taken towards the objective. The purpose of these functions is to provide clear instructions or prompts for users to understand the progress and outcomes of a task.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":184-216", + "content": "Include the following key contexts of the completed request:\n1. 
State the original objective.\n2. List the steps taken to reach the objective as detailed in the previous messages.\n3. Reference the screenshot that was used.\nSummarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Address directly any question posed by the user.\nRemember: The user will not interact with this summary. You are solely reporting the outcomes.\nOriginal objective: {objective}\nDisplay the results clearly:\n\"\"\"\ndef format_summary_prompt(objective):\n \"\"\"\n Format the summary prompt\n \"\"\"\n prompt = SUMMARY_PROMPT.format(objective=objective)\n return prompt\ndef format_vision_prompt(objective, previous_action):\n \"\"\"\n Format the vision prompt\n \"\"\"\n if previous_action:\n previous_action = f\"Here was the previous action you took: {previous_action}\"\n else:\n previous_action = \"\"\n prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action)" + }, + { + "comment": "These are functions for formatting different types of prompts used in a vision system. The first function formats an accurate mode vision prompt, the second formats a decision prompt, and the third formats a labeled image prompt. Each function takes specific arguments and formats them into predefined prompt templates.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":217-251", + "content": " return prompt\ndef format_accurate_mode_vision_prompt(prev_x, prev_y):\n \"\"\"\n Format the accurate mode vision prompt\n \"\"\"\n width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size[\"width\"]) * 100\n height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size[\"height\"]) * 100\n prompt = ACCURATE_MODE_VISION_PROMPT.format(\n prev_x=prev_x, prev_y=prev_y, width=width, height=height\n )\n return prompt\ndef format_decision_prompt(objective, previous_action):\n \"\"\"\n Format the vision prompt\n \"\"\"\n if previous_action:\n previous_action = f\"Here was the previous action you took: {previous_action}\"\n else:\n previous_action = \"\"\n prompt = DECISION_PROMPT.format(\n objective=objective, previous_action=previous_action\n )\n return prompt\ndef format_label_prompt(objective):\n \"\"\"\n Format the vision prompt\n \"\"\"\n prompt = LABELED_IMAGE_PROMPT.format(objective=objective)\n return prompt" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9fff9f58-42fa-4645-ba08-ff921e78c97d.json b/docs/doc/9fff9f58-42fa-4645-ba08-ff921e78c97d.json new file mode 100644 index 00000000..267b9ba6 --- /dev/null +++ b/docs/doc/9fff9f58-42fa-4645-ba08-ff921e78c97d.json @@ -0,0 +1,15 @@ +{ + "summary": "The configuration class manages settings like debug mode, API keys, and monitor size. It loads environment variables from .env file and initializes OpenAI client with provided API key. The OpenAI API base URL is set using an environment variable or current value.", + "details": [ + { + "comment": "This code defines a configuration class for managing settings such as debug mode, OpenAI and Google API keys, and monitor size. 
It loads environment variables from .env file using dotenv library, initializes OpenAI client if the API key is provided, and returns it or None otherwise.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/settings.py\":0-35", + "content": "import os\nfrom dotenv import load_dotenv\nfrom openai import OpenAI\nclass Config:\n \"\"\"\n Configuration class for managing settings.\n Attributes:\n debug (bool): Flag indicating whether debug mode is enabled.\n openai_api_key (str): API key for OpenAI.\n google_api_key (str): API key for Google.\n monitor_size (dict): Dictionary containing the width and height of the monitor.\n \"\"\"\n def __init__(self):\n load_dotenv()\n self.debug = False\n self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n self.google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n self.monitor_size = {\n \"width\": 1920,\n \"height\": 1080,\n }\n def initialize_openai_client(self):\n \"\"\"\n Initializes and returns an OpenAI client with the configured API key.\n Returns:\n OpenAI or None: An instance of the OpenAI client if the API key is provided, else None.\n \"\"\"\n if self.openai_api_key:\n client = OpenAI()\n client.api_key = self.openai_api_key" + }, + { + "comment": "Setting OpenAI API base URL from environment variable or using current value.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/settings.py\":36-38", + "content": " client.base_url = os.getenv(\"OPENAI_API_BASE_URL\", client.base_url)\n return client\n return None" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b1287c19-5498-479a-9378-d07fb007c20f.json b/docs/doc/b1287c19-5498-479a-9378-d07fb007c20f.json new file mode 100644 index 00000000..8bf4f247 --- /dev/null +++ b/docs/doc/b1287c19-5498-479a-9378-d07fb007c20f.json @@ -0,0 +1,15 @@ +{ + "summary": "The code uses the PromptStyle library to define styles for UI elements, checks terminal support for ANSI escape codes, and sets color variables based on this.", + "details": [ + { + "comment": "This code defines styles for dialogs, buttons, and other UI elements using the PromptStyle library. 
It also checks if the terminal supports ANSI escape codes for colors and defines ANSI color codes accordingly.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/style.py\":0-33", + "content": "import sys\nimport platform\nimport os\nfrom prompt_toolkit.styles import Style as PromptStyle\n# Define style\nstyle = PromptStyle.from_dict(\n {\n \"dialog\": \"bg:#88ff88\",\n \"button\": \"bg:#ffffff #000000\",\n \"dialog.body\": \"bg:#44cc44 #ffffff\",\n \"dialog shadow\": \"bg:#003800\",\n }\n)\n# Check if on a windows terminal that supports ANSI escape codes\ndef supports_ansi():\n \"\"\"\n Check if the terminal supports ANSI escape codes\n \"\"\"\n plat = platform.system()\n supported_platform = plat != \"Windows\" or \"ANSICON\" in os.environ\n is_a_tty = hasattr(sys.stdout, \"isatty\") and sys.stdout.isatty()\n return supported_platform and is_a_tty\n# Define ANSI color codes\nANSI_GREEN = \"\\033[32m\" if supports_ansi() else \"\" # Standard green text\nANSI_BRIGHT_GREEN = \"\\033[92m\" if supports_ansi() else \"\" # Bright/bold green text\nANSI_RESET = \"\\033[0m\" if supports_ansi() else \"\" # Reset to default text color\nANSI_BLUE = \"\\033[94m\" if supports_ansi() else \"\" # Bright blue\nANSI_YELLOW = \"\\033[33m\" if supports_ansi() else \"\" # Standard yellow text" + }, + { + "comment": "Checks if the terminal supports ANSI escape codes and sets color variables accordingly.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/style.py\":34-35", + "content": "ANSI_RED = \"\\033[31m\" if supports_ansi() else \"\"\nANSI_BRIGHT_MAGENTA = \"\\033[95m\" if supports_ansi() else \"\" # Bright magenta text" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cbc91c29-4de1-4c25-8c4e-e6fc470ad10b.json b/docs/doc/cbc91c29-4de1-4c25-8c4e-e6fc470ad10b.json new file mode 100644 index 00000000..e3bdb840 --- /dev/null +++ b/docs/doc/cbc91c29-4de1-4c25-8c4e-e6fc470ad10b.json @@ -0,0 +1,40 @@ +{ + "summary": "The Self-Operating Computer Framework is a multimodal model project that enhances computer operation similar to humans, focusing on improving mouse click predictions and API access. It is compatible with Mac OS, Windows, and Linux (with X server installed), and requires at least $5 in API credits for the gpt-4-vision-preview model.", + "details": [ + { + "comment": "\"Self-Operating Computer Framework, a framework for multimodal models to operate a computer like a human.\"", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":0-25", + "content": "

Self-Operating Computer Framework\n\nA framework to enable multimodal models to operate a computer.\n\nUsing the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective.
\n\n## Key Features\n- **Compatibility**: Designed for various multimodal models.\n- **Integration**: Currently integrated with **GPT-4v** as the default model, with extended support for Gemini Pro Vision.\n- **Future Plans**: Support for additional models.\n## Current Challenges\n> **Note:** GPT-4V's error rate in est" + }, + { + "comment": "This code is a brief overview of the \"self-operating-computer\" project, focusing on the development of the Agent-1-Vision multimodal model for improved mouse click location predictions. It also mentions the upcoming API access and the plans to improve hotkey-based functionality over time.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":25-36", + "content": "imating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.\n## Ongoing Development\nAt [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.\n## Agent-1-Vision Model API Access\nWe will soon be offering API access to our Agent-1-Vision model.\nIf you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).\n### Additional Thoughts\nWe recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. However, it's important to note that many actions require the accurate selection of visual" + }, + { + "comment": "This code explains that the primary focus of the project is refining the accuracy of determining mouse click locations, which is essential for a fully self-operating computer. It also provides links to a demo and quick start instructions for setting up the Self-Operating Computer Framework locally on your computer.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":36-66", + "content": " elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape.\n## Demo\nhttps://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0\n## Quick Start Instructions\nBelow are instructions to set up the Self-Operating Computer Framework locally on your computer.\n### Option 1: Traditional Installation\n1. **Clone the repo** to a directory on your computer:\n```\ngit clone https://github.com/OthersideAI/self-operating-computer.git\n```\n2. **Cd into directory**:\n```\ncd self-operating-computer\n```\n3. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html).\n```\npython3 -m venv venv\n```\n4. **Activate the virtual environment**:\n```\nsource venv/bin/activate\n```\n5. 
**Install Project Requi" + }, + { + "comment": "Code snippet 1:\n```python\npip install self-operating-computer\n```\nInstall the project directly from PyPI.\n\nCode snippet 2:\n```bash\nmv .example.env .env\n```\nRename `.example.env` to `.env`.\n\nCode snippet 3:\n```bash\nOPENAI_API_KEY='your-key-here'\n```\nAdd your OpenAI key to the new `.env` file.\n\nCode snippet 4:\n```bash\noperate\n```\nRun the program!\n\nCode snippet 5:\nFinal step: Mac users grant permission for \"Screen Recording\" and \"Accessibility\".", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":66-87", + "content": "rements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:**\n```\npip install self-operating-computer\n```\n6. **Then rename the `.example.env` file to `.env` so that you can save your OpenAI key in it.**\n```\nmv .example.env .env\n``` \n7. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**:\n```\nOPENAI_API_KEY='your-key-here'\n```\n8. **Run it**!\n```\noperate\n```\n9. **Final Step**: As a last step, the Terminal app will ask for permission for \"Screen Recording\" and \"Accessibility\" in the \"Security & Privacy\" page of Mac's \"System Preferences\".\n
" + }, + { + "comment": "This code provides instructions for installing the Self Operating Computer Framework using a .sh script. It also explains how to add and use Google's `gemini-pro-vision` model within the framework.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":88-123", + "content": "
\n### Option 2: Installation using .sh script\n1. **Clone the repo** to a directory on your computer:\n```\ngit clone https://github.com/OthersideAI/self-operating-computer.git\n```\n2. **Cd into directory**:\n```\ncd self-operating-computer\n```\n3. **Run the installation script**: \n```\n./run.sh\n```\n## Using `operate` Modes\n### Multimodal Models `-m`\nAn additional model is now compatible with the Self Operating Computer Framework. Try Google's `gemini-pro-vision` by following the instructions below. \n**Add your Google AI Studio API key to your .env file.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR:\n```\nGOOGLE_API_KEY='your-key-here'\n```\nStart `operate` with the Gemini model\n```\noperate -m gemini-pro-vision\n```" + }, + { + "comment": "This code is providing instructions on how to enable voice mode in the self-operating-computer framework. The user must install additional audio requirements and device dependencies, then run the operate command with the --voice flag. Contributions are welcomed, and feedback or questions can be directed to Josh on Twitter. Joining the Discord community is also encouraged for real-time discussions and support.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":125-158", + "content": "### Voice Mode `--voice`\nThe framework supports voice inputs for the objective. Try voice by following the instructions below. \nInstall the additional `requirements-audio.txt`\n```\npip install -r requirements-audio.txt\n```\n**Install device requirements**\nFor mac users:\n```\nbrew install portaudio\n```\nFor Linux users:\n```\nsudo apt install portaudio19-dev python3-pyaudio\n```\nRun with voice mode\n```\noperate --voice\n```\n## Contributions are Welcomed!:\nIf you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md).\n## Feedback\nFor any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter. \n## Join Our Discord Community\nFor real-time discussions and community support, join our Discord server. \n- If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).\n- If you're new, first [join our Discord Server" + }, + { + "comment": "Join the Discord server and visit #self-operating-computer channel. Follow HyperWriteAI for updates, compatible with Mac OS, Windows, and Linux (with X server installed). 
The gpt-4-vision-preview model requires at least $5 in API credits.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":158-171", + "content": "](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).\n## Follow HyperWriteAI for More Updates\nStay updated with the latest developments:\n- Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI).\n- Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).\n## Compatibility\n- This project is compatible with Mac OS, Windows, and Linux (with X server installed).\n## OpenAI Rate Limiting Note\nThe ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \\$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \\$5. \nLearn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**" + } + ] +} \ No newline at end of file diff --git a/docs/doc/cbeb3e57-4692-4372-a31c-79a79afa76cd.json b/docs/doc/cbeb3e57-4692-4372-a31c-79a79afa76cd.json new file mode 100644 index 00000000..14dcbffe --- /dev/null +++ b/docs/doc/cbeb3e57-4692-4372-a31c-79a79afa76cd.json @@ -0,0 +1,25 @@ +{ + "summary": "The summary is about a code that involves text input, search execution, and mouse clicks using specified coordinates, as well as a circular movement function with start/end points, radius, and duration, and a get_last_assistant_message function to retrieve the last assistant message from an array.", + "details": [ + { + "comment": "Code comments:\n- `keyboard_type(text)` - Types the given text using keyboard and returns a message indicating typed text.\n- `search(text)` - Searches for program or file by typing in search bar and pressing Enter. 
Returns a message indicating the program or file has been opened.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":0-43", + "content": "import pyautogui\nimport platform\nimport time\nimport math\nfrom operate.utils.misc import convert_percent_to_decimal\ndef keyboard_type(text):\n \"\"\"\n Types the given text using the keyboard.\n Args:\n text (str): The text to be typed.\n Returns:\n str: A message indicating the typed text.\n \"\"\"\n text = text.replace(\"\\\\n\", \"\\n\")\n for char in text:\n pyautogui.write(char)\n pyautogui.press(\"enter\")\n return \"Type: \" + text\ndef search(text):\n \"\"\"\n Searches for a program or file by typing the given text in the search bar and pressing Enter.\n Args:\n text (str): The text to be searched.\n Returns:\n str: A message indicating that the program or file has been opened.\n \"\"\"\n if platform.system() == \"Windows\":\n pyautogui.press(\"win\")\n elif platform.system() == \"Linux\":\n pyautogui.press(\"win\")\n else:\n # Press and release Command and Space separately\n pyautogui.keyDown(\"command\")\n pyautogui.press(\"space\")\n pyautogui.keyUp(\"command\")" + }, + { + "comment": "Line 45-48: Type the text by pressing each character\nLine 49: Press enter after typing the text\nLine 50-79: Perform a mouse click at the specified coordinates\nLine 80-101: Click the program based on the given description", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":45-84", + "content": " time.sleep(1)\n # Now type the text\n for char in text:\n pyautogui.write(char)\n pyautogui.press(\"enter\")\n return \"Open program: \" + text\ndef click(click_detail):\n \"\"\"\n Perform a mouse click at the specified coordinates.\n Args:\n click_detail (dict): A dictionary containing the coordinates of the click.\n Returns:\n str: The description of the click if successful, otherwise \"We failed to click\".\n \"\"\"\n try:\n x = convert_percent_to_decimal(click_detail[\"x\"])\n y = convert_percent_to_decimal(click_detail[\"y\"])\n if click_detail and isinstance(x, float) and isinstance(y, float):\n click_at_percentage(x, y)\n return click_detail[\"description\"]\n else:\n return \"We failed to click\"\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"We failed to click\"\ndef click_at_percentage(\n x_percentage, y_percentage, duration=0.2, circle_radius=50, circle_duration=0.5\n):\n \"\"\"\n Moves the m" + }, + { + "comment": "Moves the cursor to a specific percentage of the screen and then performs a circular movement before clicking.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":84-104", + "content": "ouse cursor to a specified percentage of the screen and performs a circular movement before clicking.\n Args:\n x_percentage (float): The x-coordinate percentage of the screen to move the cursor to.\n y_percentage (float): The y-coordinate percentage of the screen to move the cursor to.\n duration (float, optional): The duration (in seconds) of the smooth cursor movement. Defaults to 0.2.\n circle_radius (int, optional): The radius of the circular movement. Defaults to 50.\n circle_duration (float, optional): The duration (in seconds) of the circular movement. 
Defaults to 0.5.\n Returns:\n str: A message indicating that the click was successful.\n \"\"\"\n # Get the size of the primary monitor\n screen_width, screen_height = pyautogui.size()\n # Calculate the x and y coordinates in pixels\n x_pixel = int(screen_width * float(x_percentage))\n y_pixel = int(screen_height * float(y_percentage))\n # Move to the position smoothly\n pyautogui.moveTo(x_pixel, y_pixel, duration=duration)" + }, + { + "comment": "For the code provided, here are some brief comments:\n\n1. The function is for circular movement, which takes start and end points as input parameters (x_pixel, y_pixel), circle radius, and duration. It calculates the intermediate position by using time elapsed and performs a circular movement towards the destination point.\n2. In the get_last_assistant_message function, it retrieves the last message from the assistant in the messages array. If the last assistant message is the first message in the array, return None. Otherwise, return the last assistant message.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":106-130", + "content": " # Circular movement\n start_time = time.time()\n while time.time() - start_time < circle_duration:\n angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi\n x = x_pixel + math.cos(angle) * circle_radius\n y = y_pixel + math.sin(angle) * circle_radius\n pyautogui.moveTo(x, y, duration=0.1)\n # Finally, click\n pyautogui.click(x_pixel, y_pixel)\n return \"Successfully clicked\"\ndef get_last_assistant_message(messages):\n \"\"\"\n Retrieve the last message from the assistant in the messages array.\n If the last assistant message is the first message in the array, return None.\n \"\"\"\n for index in reversed(range(len(messages))):\n if messages[index][\"role\"] == \"assistant\":\n if index == 0: # Check if the assistant message is the first in the array\n return None\n else:\n return messages[index]\n return None # Return None if no assistant message is found" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d0472648-cfcb-44c9-b9d8-f4beedab4485.json b/docs/doc/d0472648-cfcb-44c9-b9d8-f4beedab4485.json new file mode 100644 index 00000000..a67418a1 --- /dev/null +++ b/docs/doc/d0472648-cfcb-44c9-b9d8-f4beedab4485.json @@ -0,0 +1,70 @@ +{ + "summary": "A code that utilizes AI prompts, computer vision, and OpenAI's chat completions API for generating content, including screenshots, messages, and base64 encoding images. The function captures screenshots, formats prompts, fetches asynchronous responses, extracts data, handles exceptions, and returns errors or missing labels.", + "details": [ + { + "comment": "Code imports various libraries and defines a function get_next_action that takes in model, messages, and objective as parameters. 
The code also loads a pre-trained YOLO model and initializes an OpenAI client using the configuration.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":0-50", + "content": "import os\nimport time\nimport json\nimport base64\nimport re\nimport io\nimport asyncio\nimport aiohttp\nfrom PIL import Image\nfrom ultralytics import YOLO\nimport google.generativeai as genai\nfrom operate.settings import Config\nfrom operate.exceptions import ModelNotRecognizedException\nfrom operate.utils.screenshot import (\n capture_screen_with_cursor,\n add_grid_to_image,\n capture_mini_screenshot_with_cursor,\n)\nfrom operate.utils.os import get_last_assistant_message\nfrom operate.prompts import (\n format_vision_prompt,\n format_accurate_mode_vision_prompt,\n format_summary_prompt,\n format_decision_prompt,\n format_label_prompt,\n)\nfrom operate.utils.label import (\n add_labels,\n parse_click_content,\n get_click_position_in_percent,\n get_label_coordinates,\n)\nfrom operate.utils.style import (\n ANSI_GREEN,\n ANSI_RED,\n ANSI_RESET,\n)\n# Load configuration\nconfig = Config()\nclient = config.initialize_openai_client()\nyolo_model = YOLO(\"./operate/model/weights/best.pt\") # Load your trained model\nasync def get_next_action(model, messages, objective):" + }, + { + "comment": "This code checks the model parameter and calls different functions based on its value. For example, if the model is \"gpt-4\", it calls the `call_gpt_4_v` function with messages and objective parameters. It also captures a screenshot of the computer screen with the cursor.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":51-82", + "content": " if model == \"gpt-4\":\n return call_gpt_4_v(messages, objective)\n if model == \"gpt-4-with-som\":\n return await call_gpt_4_v_labeled(messages, objective)\n elif model == \"agent-1\":\n return \"coming soon\"\n elif model == \"gemini-pro-vision\":\n return call_gemini_pro_vision(messages, objective)\n raise ModelNotRecognizedException(model)\ndef call_gpt_4_v(messages, objective):\n \"\"\"\n Get the next action for Self-Operating Computer\n \"\"\"\n # sleep for a second\n time.sleep(1)\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n new_screenshot_filename = os.path.join(\n \"screenshots\", \"screenshot_with_grid.png\"\n )\n add_grid_to_image(screenshot_filename, new_screenshot_filename, 500)" + }, + { + "comment": "Sleeps for 1 second, reads screenshot file, encodes image in base64, formats vision prompt with previous action, creates a vision message with the prompt and image, makes a copy of messages list, appends vision message to copied list, and then calls the OpenAI API with the updated messages list.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":83-114", + "content": " # sleep for a second\n time.sleep(1)\n with open(new_screenshot_filename, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n previous_action = get_last_assistant_message(messages)\n vision_prompt = format_vision_prompt(objective, previous_action)\n vision_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": vision_prompt},\n {\n \"type\": 
\"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }\n # create a copy of messages and save to pseudo_messages\n pseudo_messages = messages.copy()\n pseudo_messages.append(vision_message)\n response = client.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=pseudo_messages,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300," + }, + { + "comment": "The code is capturing a screenshot with the cursor and adding a grid overlay to the image. It then appends a message containing the filename to the messages list and returns the content of the first response choice's message. If an exception occurs during JSON parsing, it will print an error message and return a failure message.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":115-152", + "content": " )\n messages.append(\n {\n \"role\": \"user\",\n \"content\": \"`screenshot.png`\",\n }\n )\n content = response.choices[0].message.content\n return content\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"Failed take action after looking at the screenshot\"\ndef call_gemini_pro_vision(messages, objective):\n \"\"\"\n Get the next action for Self-Operating Computer using Gemini Pro Vision\n \"\"\"\n # sleep for a second\n time.sleep(1)\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n new_screenshot_filename = os.path.join(\n \"screenshots\", \"screenshot_with_grid.png\"\n )\n add_grid_to_image(screenshot_filename, new_screenshot_filename, 500)" + }, + { + "comment": "The code is making a computer vision model generate an action based on the screenshot, and then append the response to the messages list. If there's an exception while parsing JSON, it prints the error message and returns a failure message. The `accurate_mode_double_check` function is currently not used.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":153-188", + "content": " # sleep for a second\n time.sleep(1)\n previous_action = get_last_assistant_message(messages)\n vision_prompt = format_vision_prompt(objective, previous_action)\n model = genai.GenerativeModel(\"gemini-pro-vision\")\n response = model.generate_content(\n [vision_prompt, Image.open(new_screenshot_filename)]\n )\n # create a copy of messages and save to pseudo_messages\n pseudo_messages = messages.copy()\n pseudo_messages.append(response.text)\n messages.append(\n {\n \"role\": \"user\",\n \"content\": \"`screenshot.png`\",\n }\n )\n content = response.text[1:]\n return content\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"Failed take action after looking at the screenshot\"\n# This function is not used. `-accurate` mode was removed for now until a new PR fixes it.\ndef accurate_mode_double_check(model, pseudo_messages, prev_x, prev_y):\n \"\"\"\n" + }, + { + "comment": "This code takes a mini screenshot centered around the cursor and adds it to an AI prompt with text instructions. 
The image is encoded in base64 format and included in the prompt for further fine-tuning of clicked location.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":188-214", + "content": " Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location\n \"\"\"\n try:\n screenshot_filename = os.path.join(\"screenshots\", \"screenshot_mini.png\")\n capture_mini_screenshot_with_cursor(\n file_path=screenshot_filename, x=prev_x, y=prev_y\n )\n new_screenshot_filename = os.path.join(\n \"screenshots\", \"screenshot_mini_with_grid.png\"\n )\n with open(new_screenshot_filename, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y)\n accurate_mode_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": accurate_vision_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }" + }, + { + "comment": "Code snippet creates a prompt for the GPT-4 vision model using screenshots and text messages, then calls the \"capture_screen_with_cursor\" function.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":216-247", + "content": " pseudo_messages.append(accurate_mode_message)\n response = client.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=pseudo_messages,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300,\n )\n content = response.choices[0].message.content\n except Exception as e:\n print(f\"Error reprompting model for accurate_mode: {e}\")\n return \"ERROR\"\ndef summarize(model, messages, objective):\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"summary_screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n summary_prompt = format_summary_prompt(objective)\n if model == \"gpt-4-vision-preview\":\n with open(screenshot_filename, \"rb\") as img_file:" + }, + { + "comment": "The code is preparing input for a generative AI model. 
It encodes an image in base64 and combines it with a text prompt to create a summary message, then passes this message along with the chosen AI model (either gpt-4-vision-preview or gemini-pro-vision) to generate content from the summary.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":248-274", + "content": " img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n summary_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": summary_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }\n # create a copy of messages and save to pseudo_messages\n messages.append(summary_message)\n response = client.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=messages,\n max_tokens=500,\n )\n content = response.choices[0].message.content\n elif model == \"gemini-pro-vision\":\n model = genai.GenerativeModel(\"gemini-pro-vision\")\n summary_message = model.generate_content(\n [summary_prompt, Image.open(screenshot_filename)]\n )" + }, + { + "comment": "This function calls GPT-4 with a labeled image and a prompt for decision making. It first captures a screenshot of the current desktop with the cursor, encodes it in base64 format, and adds labels to the image using the YOLO model. Then, it formats prompts for the user's decision and the GPT-4 labeling task.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":275-304", + "content": " content = summary_message.text\n return content\n except Exception as e:\n print(f\"Error in summarize: {e}\")\n return \"Failed to summarize the workflow\"\nasync def call_gpt_4_v_labeled(messages, objective):\n time.sleep(1)\n try:\n screenshots_dir = \"screenshots\"\n if not os.path.exists(screenshots_dir):\n os.makedirs(screenshots_dir)\n screenshot_filename = os.path.join(screenshots_dir, \"screenshot.png\")\n # Call the function to capture the screen with the cursor\n capture_screen_with_cursor(screenshot_filename)\n with open(screenshot_filename, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n previous_action = get_last_assistant_message(messages)\n img_base64_labeled, img_base64_original, label_coordinates = add_labels(\n img_base64, yolo_model\n )\n decision_prompt = format_decision_prompt(objective, previous_action)\n labeled_click_prompt = format_label_prompt(objective)" + }, + { + "comment": "Creates user messages with labeled click prompt and decision prompt, appends to message lists, and fetches OpenAI response asynchronously.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":306-337", + "content": " click_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": labeled_click_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_base64_labeled}\"\n },\n },\n ],\n }\n decision_message = {\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": decision_prompt},\n {\n \"type\": \"image_url\",\n \"image_url\": {\n \"url\": f\"data:image/jpeg;base64,{img_base64_original}\"\n },\n },\n ],\n }\n click_messages = messages.copy()\n click_messages.append(click_message)\n decision_messages = messages.copy()\n decision_messages.append(decision_message)\n click_future = fetch_openai_response_async(click_messages)\n decision_future = 
fetch_openai_response_async(decision_messages)" + }, + { + "comment": "This code fetches two responses from an API, extracts the message content, checks if it starts with \"CLICK\", gets label data and its coordinates, opens the image, retrieves its size, and calculates the click position in percent.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":339-363", + "content": " click_response, decision_response = await asyncio.gather(\n click_future, decision_future\n )\n # Extracting the message content from the ChatCompletionMessage object\n click_content = click_response.get(\"choices\")[0].get(\"message\").get(\"content\")\n decision_content = (\n decision_response.get(\"choices\")[0].get(\"message\").get(\"content\")\n )\n if not decision_content.startswith(\"CLICK\"):\n return decision_content\n label_data = parse_click_content(click_content)\n if label_data and \"label\" in label_data:\n coordinates = get_label_coordinates(label_data[\"label\"], label_coordinates)\n image = Image.open(\n io.BytesIO(base64.b64decode(img_base64))\n ) # Load the image to get its size\n image_size = image.size # Get the size of the image (width, height)\n click_position_percent = get_click_position_in_percent(\n coordinates, image_size\n )" + }, + { + "comment": "The code tries to perform a click action based on label data. If the click position percent or label is not found, it prints an error message and calls another method. It also handles exceptions and returns to try another method.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":364-386", + "content": " if not click_position_percent:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}\"\n )\n return call_gpt_4_v(messages, objective)\n x_percent = f\"{click_position_percent[0]:.2f}%\"\n y_percent = f\"{click_position_percent[1]:.2f}%\"\n click_action = f'CLICK {{ \"x\": \"{x_percent}\", \"y\": \"{y_percent}\", \"description\": \"{label_data[\"decision\"]}\", \"reason\": \"{label_data[\"reason\"]}\" }}'\n else:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] No label found. Trying another method {ANSI_RESET}\"\n )\n return call_gpt_4_v(messages, objective)\n return click_action\n except Exception as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}\"\n )\n return call_gpt_4_v(messages, objective)" + }, + { + "comment": "This function makes an asynchronous API call to OpenAI's chat completions endpoint to fetch a response based on the provided messages.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/actions.py\":389-408", + "content": "async def fetch_openai_response_async(messages):\n url = \"https://api.openai.com/v1/chat/completions\"\n headers = {\n \"Content-Type\": \"application/json\",\n \"Authorization\": f\"Bearer {config.openai_api_key}\",\n }\n data = {\n \"model\": \"gpt-4-vision-preview\",\n \"messages\": messages,\n \"frequency_penalty\": 1,\n \"presence_penalty\": 1,\n \"temperature\": 0.7,\n \"max_tokens\": 300,\n }\n async with aiohttp.ClientSession() as session:\n async with session.post(\n url, headers=headers, data=json.dumps(data)\n ) as response:\n return await response.json()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ecb738a7-1309-419a-8df2-cff62ffde521.json b/docs/doc/ecb738a7-1309-419a-8df2-cff62ffde521.json new file mode 100644 index 00000000..ba6d5aef --- /dev/null +++ b/docs/doc/ecb738a7-1309-419a-8df2-cff62ffde521.json @@ -0,0 +1,10 @@ +{ + "summary": "This code defines a class for an exception that is raised when the model is not recognized. The class has two attributes: \"model\" and \"message\", both of which are set in the constructor. It also overrides the \"__str__()\" method to provide a custom string representation of the exception.", + "details": [ + { + "comment": "This code defines a class for an exception that is raised when the model is not recognized. The class has two attributes: \"model\" and \"message\", both of which are set in the constructor. It also overrides the \"__str__()\" method to provide a custom string representation of the exception.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/exceptions.py\":0-14", + "content": "class ModelNotRecognizedException(Exception):\n \"\"\"Exception raised for unrecognized models.\n Attributes:\n model -- the unrecognized model\n message -- explanation of the error\n \"\"\"\n def __init__(self, model, message=\"Model not recognized\"):\n self.model = model\n self.message = message\n super().__init__(self.message)\n def __str__(self):\n return f\"{self.message} : {self.model} \"" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f20ab70b-0846-462c-8e73-67a38b94d32e.json b/docs/doc/f20ab70b-0846-462c-8e73-67a38b94d32e.json new file mode 100644 index 00000000..4c66ceb6 --- /dev/null +++ b/docs/doc/f20ab70b-0846-462c-8e73-67a38b94d32e.json @@ -0,0 +1,25 @@ +{ + "summary": "The code consists of two functions: `convert_percent_to_decimal()` and `extract_json_from_string()`, which handle percentages and JSON structures, respectively. Additionally, it classifies user responses as DONE, CLICK, TYPE, or SEARCH using patterns, extracts relevant data, handles exceptions for invalid inputs or processing errors, and returns \"UNKNOWN\" with original data if no match found while extracting search data using regex.", + "details": [ + { + "comment": "This code defines two functions: `convert_percent_to_decimal()` and `extract_json_from_string()`. The first function converts a percentage string to a decimal value, while the second extracts a JSON structure from a string and returns it as a dictionary. 
Both functions handle exceptions in case of invalid inputs or errors during processing.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/misc.py\":0-40", + "content": "import json\nimport re\ndef convert_percent_to_decimal(percent_str):\n \"\"\"\n Converts a percentage string to a decimal value.\n Args:\n percent_str (str): The percentage string to be converted.\n Returns:\n float: The decimal value equivalent to the percentage.\n Raises:\n ValueError: If the input string cannot be converted to a float.\n Example:\n >>> convert_percent_to_decimal(\"20%\")\n 0.2\n \"\"\"\n try:\n # Remove the '%' sign and convert to float\n decimal_value = float(percent_str.strip(\"%\"))\n # Convert to decimal (e.g., 20% -> 0.20)\n return decimal_value / 100\n except ValueError as e:\n print(f\"Error converting percent to decimal: {e}\")\n return None\ndef extract_json_from_string(s):\n \"\"\"\n Extracts a JSON structure from a string and returns it as a dictionary.\n Args:\n s (str): The input string.\n Returns:\n dict: The extracted JSON structure as a dictionary, or None if no JSON structure is found or if there is an error parsing the JSON." + }, + { + "comment": "Extracts JSON structure from the response and returns a dictionary with type and data.\nRaises exception if error parsing JSON or if response is not in expected format.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/misc.py\":42-73", + "content": " \"\"\"\n try:\n # Find the start of the JSON structure\n json_start = s.find(\"{\")\n if json_start == -1:\n return None\n # Extract the JSON part and convert it to a dictionary\n json_str = s[json_start:]\n return json.loads(json_str)\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return None\ndef parse_response(response):\n \"\"\"\n Parses the given response and returns a dictionary with the type and data.\n Args:\n response (str): The response to parse.\n Returns:\n dict: A dictionary with the type and data extracted from the response.\n The dictionary has the following structure:\n {\n \"type\": ,\n \"data\": \n }\n If the response is \"DONE\", the type is \"DONE\" and the data is None.\n If the response starts with \"CLICK\", the type is \"CLICK\" and the data is a JSON object.\n If the response starts with \"TYPE\", the type is \"TYPE\" and the data is the text to type." + }, + { + "comment": "This code is parsing user responses and determining the appropriate type (DONE, CLICK, TYPE, or SEARCH) based on the response string. It also extracts relevant data for each type of response. 
If the response doesn't match any known patterns, it is classified as \"UNKNOWN\" with the original response retained.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/misc.py\":74-96", + "content": " If the response starts with \"SEARCH\", the type is \"SEARCH\" and the data is the search query.\n If the response doesn't match any of the above patterns, the type is \"UNKNOWN\" and the data is the original response.\n \"\"\"\n if response == \"DONE\":\n return {\"type\": \"DONE\", \"data\": None}\n elif response.startswith(\"CLICK\"):\n # Adjust the regex to match the correct format\n click_data = re.search(r\"CLICK \\{ (.+) \\}\", response).group(1)\n click_data_json = json.loads(f\"{{{click_data}}}\")\n return {\"type\": \"CLICK\", \"data\": click_data_json}\n elif response.startswith(\"TYPE\"):\n # Extract the text to type\n try:\n type_data = re.search(r\"TYPE (.+)\", response, re.DOTALL).group(1)\n except:\n type_data = re.search(r'TYPE \"(.+)\"', response, re.DOTALL).group(1)\n return {\"type\": \"TYPE\", \"data\": type_data}\n elif response.startswith(\"SEARCH\"):\n # Extract the search query\n try:\n search_data = re.search(r'SEARCH \"(.+)\"', response).group(1)" + }, + { + "comment": "Trying to extract search data from response using regex. If exception occurs, return search data as \"SEARCH\" type and original response as unknown type.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/misc.py\":97-101", + "content": " except:\n search_data = re.search(r\"SEARCH (.+)\", response).group(1)\n return {\"type\": \"SEARCH\", \"data\": search_data}\n return {\"type\": \"UNKNOWN\", \"data\": response}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/fe4a61a7-8d06-48e0-8690-862340a47875.json b/docs/doc/fe4a61a7-8d06-48e0-8690-862340a47875.json new file mode 100644 index 00000000..cdbc9262 --- /dev/null +++ b/docs/doc/fe4a61a7-8d06-48e0-8690-862340a47875.json @@ -0,0 +1,10 @@ +{ + "summary": "This code is using setuptools to create a setup script for the \"self-operating-computer\" package. It imports the necessary modules and reads requirements from \"requirements.txt\". It also reads project description from \"README.md\", sets up dependencies, and defines entry points for console scripts.", + "details": [ + { + "comment": "This code is using setuptools to create a setup script for the \"self-operating-computer\" package. It imports the necessary modules and reads requirements from \"requirements.txt\". 
It also reads project description from \"README.md\", sets up dependencies, and defines entry points for console scripts.", + "location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/setup.py\":0-23", + "content": "from setuptools import setup, find_packages\n# Read the contents of your requirements.txt file\nwith open(\"requirements.txt\") as f:\n required = f.read().splitlines()\n# Read the contents of your README.md file for the project description\nwith open(\"README.md\", \"r\", encoding=\"utf-8\") as readme_file:\n long_description = readme_file.read()\nsetup(\n name=\"self-operating-computer\",\n version=\"1.1.1\",\n packages=find_packages(),\n install_requires=required, # Add dependencies here\n entry_points={\n \"console_scripts\": [\n \"operate=operate.main:main_entry\",\n ],\n },\n long_description=long_description, # Add project description here\n long_description_content_type=\"text/markdown\", # Specify Markdown format\n # include any other necessary setup options here\n)" + } + ] +} \ No newline at end of file diff --git a/docs/github-markdown.css b/docs/github-markdown.css new file mode 100644 index 00000000..96a4f29e --- /dev/null +++ b/docs/github-markdown.css @@ -0,0 +1,1197 @@ +@media (prefers-color-scheme: dark) { + + .markdown-body, + [data-theme="dark"] { + /*dark*/ + color-scheme: dark; + --color-prettylights-syntax-comment: #8b949e; + --color-prettylights-syntax-constant: #79c0ff; + --color-prettylights-syntax-entity: #d2a8ff; + --color-prettylights-syntax-storage-modifier-import: #c9d1d9; + --color-prettylights-syntax-entity-tag: #7ee787; + --color-prettylights-syntax-keyword: #ff7b72; + --color-prettylights-syntax-string: #a5d6ff; + --color-prettylights-syntax-variable: #ffa657; + --color-prettylights-syntax-brackethighlighter-unmatched: #f85149; + --color-prettylights-syntax-invalid-illegal-text: #f0f6fc; + --color-prettylights-syntax-invalid-illegal-bg: #8e1519; + --color-prettylights-syntax-carriage-return-text: #f0f6fc; + --color-prettylights-syntax-carriage-return-bg: #b62324; + --color-prettylights-syntax-string-regexp: #7ee787; + --color-prettylights-syntax-markup-list: #f2cc60; + --color-prettylights-syntax-markup-heading: #1f6feb; + --color-prettylights-syntax-markup-italic: #c9d1d9; + --color-prettylights-syntax-markup-bold: #c9d1d9; + --color-prettylights-syntax-markup-deleted-text: #ffdcd7; + --color-prettylights-syntax-markup-deleted-bg: #67060c; + --color-prettylights-syntax-markup-inserted-text: #aff5b4; + --color-prettylights-syntax-markup-inserted-bg: #033a16; + --color-prettylights-syntax-markup-changed-text: #ffdfb6; + --color-prettylights-syntax-markup-changed-bg: #5a1e02; + --color-prettylights-syntax-markup-ignored-text: #c9d1d9; + --color-prettylights-syntax-markup-ignored-bg: #1158c7; + --color-prettylights-syntax-meta-diff-range: #d2a8ff; + --color-prettylights-syntax-brackethighlighter-angle: #8b949e; + --color-prettylights-syntax-sublimelinter-gutter-mark: #484f58; + --color-prettylights-syntax-constant-other-reference-link: #a5d6ff; + --color-fg-default: #e6edf3; + --color-fg-muted: #848d97; + --color-fg-subtle: #6e7681; + --color-canvas-default: #0d1117; + --color-canvas-subtle: #161b22; + --color-border-default: #30363d; + --color-border-muted: #21262d; + --color-neutral-muted: rgba(110, 118, 129, 0.4); + --color-accent-fg: #2f81f7; + --color-accent-emphasis: #1f6feb; + --color-success-fg: #3fb950; + --color-success-emphasis: #238636; + --color-attention-fg: #d29922; + --color-attention-emphasis: #9e6a03; + 
--color-attention-subtle: rgba(187, 128, 9, 0.15); + --color-danger-fg: #f85149; + --color-danger-emphasis: #da3633; + --color-done-fg: #a371f7; + --color-done-emphasis: #8957e5; + } +} + +@media (prefers-color-scheme: light) { + + .markdown-body, + [data-theme="light"] { + /*light*/ + color-scheme: light; + --color-prettylights-syntax-comment: #57606a; + --color-prettylights-syntax-constant: #0550ae; + --color-prettylights-syntax-entity: #6639ba; + --color-prettylights-syntax-storage-modifier-import: #24292f; + --color-prettylights-syntax-entity-tag: #116329; + --color-prettylights-syntax-keyword: #cf222e; + --color-prettylights-syntax-string: #0a3069; + --color-prettylights-syntax-variable: #953800; + --color-prettylights-syntax-brackethighlighter-unmatched: #82071e; + --color-prettylights-syntax-invalid-illegal-text: #f6f8fa; + --color-prettylights-syntax-invalid-illegal-bg: #82071e; + --color-prettylights-syntax-carriage-return-text: #f6f8fa; + --color-prettylights-syntax-carriage-return-bg: #cf222e; + --color-prettylights-syntax-string-regexp: #116329; + --color-prettylights-syntax-markup-list: #3b2300; + --color-prettylights-syntax-markup-heading: #0550ae; + --color-prettylights-syntax-markup-italic: #24292f; + --color-prettylights-syntax-markup-bold: #24292f; + --color-prettylights-syntax-markup-deleted-text: #82071e; + --color-prettylights-syntax-markup-deleted-bg: #ffebe9; + --color-prettylights-syntax-markup-inserted-text: #116329; + --color-prettylights-syntax-markup-inserted-bg: #dafbe1; + --color-prettylights-syntax-markup-changed-text: #953800; + --color-prettylights-syntax-markup-changed-bg: #ffd8b5; + --color-prettylights-syntax-markup-ignored-text: #eaeef2; + --color-prettylights-syntax-markup-ignored-bg: #0550ae; + --color-prettylights-syntax-meta-diff-range: #8250df; + --color-prettylights-syntax-brackethighlighter-angle: #57606a; + --color-prettylights-syntax-sublimelinter-gutter-mark: #8c959f; + --color-prettylights-syntax-constant-other-reference-link: #0a3069; + --color-fg-default: #1F2328; + --color-fg-muted: #656d76; + --color-fg-subtle: #6e7781; + --color-canvas-default: #ffffff; + --color-canvas-subtle: #f6f8fa; + --color-border-default: #d0d7de; + --color-border-muted: hsla(210, 18%, 87%, 1); + --color-neutral-muted: rgba(175, 184, 193, 0.2); + --color-accent-fg: #0969da; + --color-accent-emphasis: #0969da; + --color-success-fg: #1a7f37; + --color-success-emphasis: #1f883d; + --color-attention-fg: #9a6700; + --color-attention-emphasis: #9a6700; + --color-attention-subtle: #fff8c5; + --color-danger-fg: #d1242f; + --color-danger-emphasis: #cf222e; + --color-done-fg: #8250df; + --color-done-emphasis: #8250df; + } +} + +.markdown-body { + -ms-text-size-adjust: 100%; + -webkit-text-size-adjust: 100%; + margin: 0; + color: var(--color-fg-default); + background-color: var(--color-canvas-default); + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji"; + font-size: 16px; + line-height: 1.5; + word-wrap: break-word; +} + +.markdown-body .octicon { + display: inline-block; + fill: currentColor; + vertical-align: text-bottom; +} + +.markdown-body h1:hover .anchor .octicon-link:before, +.markdown-body h2:hover .anchor .octicon-link:before, +.markdown-body h3:hover .anchor .octicon-link:before, +.markdown-body h4:hover .anchor .octicon-link:before, +.markdown-body h5:hover .anchor .octicon-link:before, +.markdown-body h6:hover .anchor .octicon-link:before { + width: 16px; + height: 16px; 
+ content: ' '; + display: inline-block; + background-color: currentColor; + -webkit-mask-image: url("data:image/svg+xml,"); + mask-image: url("data:image/svg+xml,"); +} + +.markdown-body details, +.markdown-body figcaption, +.markdown-body figure { + display: block; +} + +.markdown-body summary { + display: list-item; +} + +.markdown-body [hidden] { + display: none !important; +} + +.markdown-body a { + background-color: transparent; + color: var(--color-accent-fg); + text-decoration: none; +} + +.markdown-body abbr[title] { + border-bottom: none; + -webkit-text-decoration: underline dotted; + text-decoration: underline dotted; +} + +.markdown-body b, +.markdown-body strong { + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body dfn { + font-style: italic; +} + +.markdown-body h1 { + margin: .67em 0; + font-weight: var(--base-text-weight-semibold, 600); + padding-bottom: .3em; + font-size: 2em; + border-bottom: 1px solid var(--color-border-muted); +} + +.markdown-body mark { + background-color: var(--color-attention-subtle); + color: var(--color-fg-default); +} + +.markdown-body small { + font-size: 90%; +} + +.markdown-body sub, +.markdown-body sup { + font-size: 75%; + line-height: 0; + position: relative; + vertical-align: baseline; +} + +.markdown-body sub { + bottom: -0.25em; +} + +.markdown-body sup { + top: -0.5em; +} + +.markdown-body img { + border-style: none; + max-width: 100%; + box-sizing: content-box; + background-color: var(--color-canvas-default); +} + +.markdown-body code, +.markdown-body kbd, +.markdown-body pre, +.markdown-body samp { + font-family: monospace; + font-size: 1em; +} + +.markdown-body figure { + margin: 1em 40px; +} + +.markdown-body hr { + box-sizing: content-box; + overflow: hidden; + background: transparent; + border-bottom: 1px solid var(--color-border-muted); + height: .25em; + padding: 0; + margin: 24px 0; + background-color: var(--color-border-default); + border: 0; +} + +.markdown-body input { + font: inherit; + margin: 0; + overflow: visible; + font-family: inherit; + font-size: inherit; + line-height: inherit; +} + +.markdown-body [type=button], +.markdown-body [type=reset], +.markdown-body [type=submit] { + -webkit-appearance: button; + appearance: button; +} + +.markdown-body [type=checkbox], +.markdown-body [type=radio] { + box-sizing: border-box; + padding: 0; +} + +.markdown-body [type=number]::-webkit-inner-spin-button, +.markdown-body [type=number]::-webkit-outer-spin-button { + height: auto; +} + +.markdown-body [type=search]::-webkit-search-cancel-button, +.markdown-body [type=search]::-webkit-search-decoration { + -webkit-appearance: none; + appearance: none; +} + +.markdown-body ::-webkit-input-placeholder { + color: inherit; + opacity: .54; +} + +.markdown-body ::-webkit-file-upload-button { + -webkit-appearance: button; + appearance: button; + font: inherit; +} + +.markdown-body a:hover { + text-decoration: underline; +} + +.markdown-body ::placeholder { + color: var(--color-fg-subtle); + opacity: 1; +} + +.markdown-body hr::before { + display: table; + content: ""; +} + +.markdown-body hr::after { + display: table; + clear: both; + content: ""; +} + +.markdown-body table { + border-spacing: 0; + border-collapse: collapse; + display: block; + width: max-content; + max-width: 100%; + overflow: auto; +} + +.markdown-body td, +.markdown-body th { + padding: 0; +} + +.markdown-body details summary { + cursor: pointer; +} + +.markdown-body details:not([open])>*:not(summary) { + display: none !important; +} + 
+.markdown-body a:focus, +.markdown-body [role=button]:focus, +.markdown-body input[type=radio]:focus, +.markdown-body input[type=checkbox]:focus { + outline: 2px solid var(--color-accent-fg); + outline-offset: -2px; + box-shadow: none; +} + +.markdown-body a:focus:not(:focus-visible), +.markdown-body [role=button]:focus:not(:focus-visible), +.markdown-body input[type=radio]:focus:not(:focus-visible), +.markdown-body input[type=checkbox]:focus:not(:focus-visible) { + outline: solid 1px transparent; +} + +.markdown-body a:focus-visible, +.markdown-body [role=button]:focus-visible, +.markdown-body input[type=radio]:focus-visible, +.markdown-body input[type=checkbox]:focus-visible { + outline: 2px solid var(--color-accent-fg); + outline-offset: -2px; + box-shadow: none; +} + +.markdown-body a:not([class]):focus, +.markdown-body a:not([class]):focus-visible, +.markdown-body input[type=radio]:focus, +.markdown-body input[type=radio]:focus-visible, +.markdown-body input[type=checkbox]:focus, +.markdown-body input[type=checkbox]:focus-visible { + outline-offset: 0; +} + +.markdown-body kbd { + display: inline-block; + padding: 3px 5px; + font: 11px ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + line-height: 10px; + color: var(--color-fg-default); + vertical-align: middle; + background-color: var(--color-canvas-subtle); + border: solid 1px var(--color-neutral-muted); + border-bottom-color: var(--color-neutral-muted); + border-radius: 6px; + box-shadow: inset 0 -1px 0 var(--color-neutral-muted); +} + +.markdown-body h1, +.markdown-body h2, +.markdown-body h3, +.markdown-body h4, +.markdown-body h5, +.markdown-body h6 { + margin-top: 24px; + margin-bottom: 16px; + font-weight: var(--base-text-weight-semibold, 600); + line-height: 1.25; +} + +.markdown-body h2 { + font-weight: var(--base-text-weight-semibold, 600); + padding-bottom: .3em; + font-size: 1.5em; + border-bottom: 1px solid var(--color-border-muted); +} + +.markdown-body h3 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: 1.25em; +} + +.markdown-body h4 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: 1em; +} + +.markdown-body h5 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: .875em; +} + +.markdown-body h6 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: .85em; + color: var(--color-fg-muted); +} + +.markdown-body p { + margin-top: 0; + margin-bottom: 10px; +} + +.markdown-body blockquote { + margin: 0; + padding: 0 1em; + color: var(--color-fg-muted); + border-left: .25em solid var(--color-border-default); +} + +.markdown-body ul, +.markdown-body ol { + margin-top: 0; + margin-bottom: 0; + padding-left: 2em; +} + +.markdown-body ol ol, +.markdown-body ul ol { + list-style-type: lower-roman; +} + +.markdown-body ul ul ol, +.markdown-body ul ol ol, +.markdown-body ol ul ol, +.markdown-body ol ol ol { + list-style-type: lower-alpha; +} + +.markdown-body dd { + margin-left: 0; +} + +.markdown-body tt, +.markdown-body code, +.markdown-body samp { + font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + font-size: 12px; +} + +.markdown-body pre { + margin-top: 0; + margin-bottom: 0; + font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + font-size: 12px; + word-wrap: normal; +} + +.markdown-body .octicon { + display: inline-block; + overflow: visible !important; + vertical-align: text-bottom; + fill: currentColor; +} + +.markdown-body 
input::-webkit-outer-spin-button, +.markdown-body input::-webkit-inner-spin-button { + margin: 0; + -webkit-appearance: none; + appearance: none; +} + +.markdown-body .mr-2 { + margin-right: var(--base-size-8, 8px) !important; +} + +.markdown-body::before { + display: table; + content: ""; +} + +.markdown-body::after { + display: table; + clear: both; + content: ""; +} + +.markdown-body>*:first-child { + margin-top: 0 !important; +} + +.markdown-body>*:last-child { + margin-bottom: 0 !important; +} + +.markdown-body a:not([href]) { + color: inherit; + text-decoration: none; +} + +.markdown-body .absent { + color: var(--color-danger-fg); +} + +.markdown-body .anchor { + float: left; + padding-right: 4px; + margin-left: -20px; + line-height: 1; +} + +.markdown-body .anchor:focus { + outline: none; +} + +.markdown-body p, +.markdown-body blockquote, +.markdown-body ul, +.markdown-body ol, +.markdown-body dl, +.markdown-body table, +.markdown-body pre, +.markdown-body details { + margin-top: 0; + margin-bottom: 16px; +} + +.markdown-body blockquote>:first-child { + margin-top: 0; +} + +.markdown-body blockquote>:last-child { + margin-bottom: 0; +} + +.markdown-body h1 .octicon-link, +.markdown-body h2 .octicon-link, +.markdown-body h3 .octicon-link, +.markdown-body h4 .octicon-link, +.markdown-body h5 .octicon-link, +.markdown-body h6 .octicon-link { + color: var(--color-fg-default); + vertical-align: middle; + visibility: hidden; +} + +.markdown-body h1:hover .anchor, +.markdown-body h2:hover .anchor, +.markdown-body h3:hover .anchor, +.markdown-body h4:hover .anchor, +.markdown-body h5:hover .anchor, +.markdown-body h6:hover .anchor { + text-decoration: none; +} + +.markdown-body h1:hover .anchor .octicon-link, +.markdown-body h2:hover .anchor .octicon-link, +.markdown-body h3:hover .anchor .octicon-link, +.markdown-body h4:hover .anchor .octicon-link, +.markdown-body h5:hover .anchor .octicon-link, +.markdown-body h6:hover .anchor .octicon-link { + visibility: visible; +} + +.markdown-body h1 tt, +.markdown-body h1 code, +.markdown-body h2 tt, +.markdown-body h2 code, +.markdown-body h3 tt, +.markdown-body h3 code, +.markdown-body h4 tt, +.markdown-body h4 code, +.markdown-body h5 tt, +.markdown-body h5 code, +.markdown-body h6 tt, +.markdown-body h6 code { + padding: 0 .2em; + font-size: inherit; +} + +.markdown-body summary h1, +.markdown-body summary h2, +.markdown-body summary h3, +.markdown-body summary h4, +.markdown-body summary h5, +.markdown-body summary h6 { + display: inline-block; +} + +.markdown-body summary h1 .anchor, +.markdown-body summary h2 .anchor, +.markdown-body summary h3 .anchor, +.markdown-body summary h4 .anchor, +.markdown-body summary h5 .anchor, +.markdown-body summary h6 .anchor { + margin-left: -40px; +} + +.markdown-body summary h1, +.markdown-body summary h2 { + padding-bottom: 0; + border-bottom: 0; +} + +.markdown-body ul.no-list, +.markdown-body ol.no-list { + padding: 0; + list-style-type: none; +} + +.markdown-body ol[type="a s"] { + list-style-type: lower-alpha; +} + +.markdown-body ol[type="A s"] { + list-style-type: upper-alpha; +} + +.markdown-body ol[type="i s"] { + list-style-type: lower-roman; +} + +.markdown-body ol[type="I s"] { + list-style-type: upper-roman; +} + +.markdown-body ol[type="1"] { + list-style-type: decimal; +} + +.markdown-body div>ol:not([type]) { + list-style-type: decimal; +} + +.markdown-body ul ul, +.markdown-body ul ol, +.markdown-body ol ol, +.markdown-body ol ul { + margin-top: 0; + margin-bottom: 0; +} + +.markdown-body 
li>p { + margin-top: 16px; +} + +.markdown-body li+li { + margin-top: .25em; +} + +.markdown-body dl { + padding: 0; +} + +.markdown-body dl dt { + padding: 0; + margin-top: 16px; + font-size: 1em; + font-style: italic; + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body dl dd { + padding: 0 16px; + margin-bottom: 16px; +} + +.markdown-body table th { + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body table th, +.markdown-body table td { + padding: 6px 13px; + border: 1px solid var(--color-border-default); +} + +.markdown-body table td>:last-child { + margin-bottom: 0; +} + +.markdown-body table tr { + background-color: var(--color-canvas-default); + border-top: 1px solid var(--color-border-muted); +} + +.markdown-body table tr:nth-child(2n) { + background-color: var(--color-canvas-subtle); +} + +.markdown-body table img { + background-color: transparent; +} + +.markdown-body img[align=right] { + padding-left: 20px; +} + +.markdown-body img[align=left] { + padding-right: 20px; +} + +.markdown-body .emoji { + max-width: none; + vertical-align: text-top; + background-color: transparent; +} + +.markdown-body span.frame { + display: block; + overflow: hidden; +} + +.markdown-body span.frame>span { + display: block; + float: left; + width: auto; + padding: 7px; + margin: 13px 0 0; + overflow: hidden; + border: 1px solid var(--color-border-default); +} + +.markdown-body span.frame span img { + display: block; + float: left; +} + +.markdown-body span.frame span span { + display: block; + padding: 5px 0 0; + clear: both; + color: var(--color-fg-default); +} + +.markdown-body span.align-center { + display: block; + overflow: hidden; + clear: both; +} + +.markdown-body span.align-center>span { + display: block; + margin: 13px auto 0; + overflow: hidden; + text-align: center; +} + +.markdown-body span.align-center span img { + margin: 0 auto; + text-align: center; +} + +.markdown-body span.align-right { + display: block; + overflow: hidden; + clear: both; +} + +.markdown-body span.align-right>span { + display: block; + margin: 13px 0 0; + overflow: hidden; + text-align: right; +} + +.markdown-body span.align-right span img { + margin: 0; + text-align: right; +} + +.markdown-body span.float-left { + display: block; + float: left; + margin-right: 13px; + overflow: hidden; +} + +.markdown-body span.float-left span { + margin: 13px 0 0; +} + +.markdown-body span.float-right { + display: block; + float: right; + margin-left: 13px; + overflow: hidden; +} + +.markdown-body span.float-right>span { + display: block; + margin: 13px auto 0; + overflow: hidden; + text-align: right; +} + +.markdown-body code, +.markdown-body tt { + padding: .2em .4em; + margin: 0; + font-size: 85%; + white-space: break-spaces; + background-color: var(--color-neutral-muted); + border-radius: 6px; +} + +.markdown-body code br, +.markdown-body tt br { + display: none; +} + +.markdown-body del code { + text-decoration: inherit; +} + +.markdown-body samp { + font-size: 85%; +} + +.markdown-body pre code { + font-size: 100%; +} + +.markdown-body pre>code { + padding: 0; + margin: 0; + word-break: normal; + white-space: pre; + background: transparent; + border: 0; +} + +.markdown-body .highlight { + margin-bottom: 16px; +} + +.markdown-body .highlight pre { + margin-bottom: 0; + word-break: normal; +} + +.markdown-body .highlight pre, +.markdown-body pre { + padding: 16px; + overflow: auto; + font-size: 85%; + line-height: 1.45; + color: var(--color-fg-default); + background-color: 
var(--color-canvas-subtle); + border-radius: 6px; +} + +.markdown-body pre code, +.markdown-body pre tt { + display: inline; + max-width: auto; + padding: 0; + margin: 0; + overflow: visible; + line-height: inherit; + word-wrap: normal; + background-color: transparent; + border: 0; +} + +.markdown-body .csv-data td, +.markdown-body .csv-data th { + padding: 5px; + overflow: hidden; + font-size: 12px; + line-height: 1; + text-align: left; + white-space: nowrap; +} + +.markdown-body .csv-data .blob-num { + padding: 10px 8px 9px; + text-align: right; + background: var(--color-canvas-default); + border: 0; +} + +.markdown-body .csv-data tr { + border-top: 0; +} + +.markdown-body .csv-data th { + font-weight: var(--base-text-weight-semibold, 600); + background: var(--color-canvas-subtle); + border-top: 0; +} + +.markdown-body [data-footnote-ref]::before { + content: "["; +} + +.markdown-body [data-footnote-ref]::after { + content: "]"; +} + +.markdown-body .footnotes { + font-size: 12px; + color: var(--color-fg-muted); + border-top: 1px solid var(--color-border-default); +} + +.markdown-body .footnotes ol { + padding-left: 16px; +} + +.markdown-body .footnotes ol ul { + display: inline-block; + padding-left: 16px; + margin-top: 16px; +} + +.markdown-body .footnotes li { + position: relative; +} + +.markdown-body .footnotes li:target::before { + position: absolute; + top: -8px; + right: -8px; + bottom: -8px; + left: -24px; + pointer-events: none; + content: ""; + border: 2px solid var(--color-accent-emphasis); + border-radius: 6px; +} + +.markdown-body .footnotes li:target { + color: var(--color-fg-default); +} + +.markdown-body .footnotes .data-footnote-backref g-emoji { + font-family: monospace; +} + +.markdown-body .pl-c { + color: var(--color-prettylights-syntax-comment); +} + +.markdown-body .pl-c1, +.markdown-body .pl-s .pl-v { + color: var(--color-prettylights-syntax-constant); +} + +.markdown-body .pl-e, +.markdown-body .pl-en { + color: var(--color-prettylights-syntax-entity); +} + +.markdown-body .pl-smi, +.markdown-body .pl-s .pl-s1 { + color: var(--color-prettylights-syntax-storage-modifier-import); +} + +.markdown-body .pl-ent { + color: var(--color-prettylights-syntax-entity-tag); +} + +.markdown-body .pl-k { + color: var(--color-prettylights-syntax-keyword); +} + +.markdown-body .pl-s, +.markdown-body .pl-pds, +.markdown-body .pl-s .pl-pse .pl-s1, +.markdown-body .pl-sr, +.markdown-body .pl-sr .pl-cce, +.markdown-body .pl-sr .pl-sre, +.markdown-body .pl-sr .pl-sra { + color: var(--color-prettylights-syntax-string); +} + +.markdown-body .pl-v, +.markdown-body .pl-smw { + color: var(--color-prettylights-syntax-variable); +} + +.markdown-body .pl-bu { + color: var(--color-prettylights-syntax-brackethighlighter-unmatched); +} + +.markdown-body .pl-ii { + color: var(--color-prettylights-syntax-invalid-illegal-text); + background-color: var(--color-prettylights-syntax-invalid-illegal-bg); +} + +.markdown-body .pl-c2 { + color: var(--color-prettylights-syntax-carriage-return-text); + background-color: var(--color-prettylights-syntax-carriage-return-bg); +} + +.markdown-body .pl-sr .pl-cce { + font-weight: bold; + color: var(--color-prettylights-syntax-string-regexp); +} + +.markdown-body .pl-ml { + color: var(--color-prettylights-syntax-markup-list); +} + +.markdown-body .pl-mh, +.markdown-body .pl-mh .pl-en, +.markdown-body .pl-ms { + font-weight: bold; + color: var(--color-prettylights-syntax-markup-heading); +} + +.markdown-body .pl-mi { + font-style: italic; + color: 
var(--color-prettylights-syntax-markup-italic); +} + +.markdown-body .pl-mb { + font-weight: bold; + color: var(--color-prettylights-syntax-markup-bold); +} + +.markdown-body .pl-md { + color: var(--color-prettylights-syntax-markup-deleted-text); + background-color: var(--color-prettylights-syntax-markup-deleted-bg); +} + +.markdown-body .pl-mi1 { + color: var(--color-prettylights-syntax-markup-inserted-text); + background-color: var(--color-prettylights-syntax-markup-inserted-bg); +} + +.markdown-body .pl-mc { + color: var(--color-prettylights-syntax-markup-changed-text); + background-color: var(--color-prettylights-syntax-markup-changed-bg); +} + +.markdown-body .pl-mi2 { + color: var(--color-prettylights-syntax-markup-ignored-text); + background-color: var(--color-prettylights-syntax-markup-ignored-bg); +} + +.markdown-body .pl-mdr { + font-weight: bold; + color: var(--color-prettylights-syntax-meta-diff-range); +} + +.markdown-body .pl-ba { + color: var(--color-prettylights-syntax-brackethighlighter-angle); +} + +.markdown-body .pl-sg { + color: var(--color-prettylights-syntax-sublimelinter-gutter-mark); +} + +.markdown-body .pl-corl { + text-decoration: underline; + color: var(--color-prettylights-syntax-constant-other-reference-link); +} + +.markdown-body g-emoji { + display: inline-block; + min-width: 1ch; + font-family: "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; + font-size: 1em; + font-style: normal !important; + font-weight: var(--base-text-weight-normal, 400); + line-height: 1; + vertical-align: -0.075em; +} + +.markdown-body g-emoji img { + width: 1em; + height: 1em; +} + +.markdown-body .task-list-item { + list-style-type: none; +} + +.markdown-body .task-list-item label { + font-weight: var(--base-text-weight-normal, 400); +} + +.markdown-body .task-list-item.enabled label { + cursor: pointer; +} + +.markdown-body .task-list-item+.task-list-item { + margin-top: 4px; +} + +.markdown-body .task-list-item .handle { + display: none; +} + +.markdown-body .task-list-item-checkbox { + margin: 0 .2em .25em -1.4em; + vertical-align: middle; +} + +.markdown-body .contains-task-list:dir(rtl) .task-list-item-checkbox { + margin: 0 -1.6em .25em .2em; +} + +.markdown-body .contains-task-list { + position: relative; +} + +.markdown-body .contains-task-list:hover .task-list-item-convert-container, +.markdown-body .contains-task-list:focus-within .task-list-item-convert-container { + display: block; + width: auto; + height: 24px; + overflow: visible; + clip: auto; +} + +.markdown-body ::-webkit-calendar-picker-indicator { + filter: invert(50%); +} + +.markdown-body .markdown-alert { + padding: var(--base-size-8) var(--base-size-16); + margin-bottom: 16px; + color: inherit; + border-left: .25em solid var(--color-border-default); +} + +.markdown-body .markdown-alert>:first-child { + margin-top: 0; +} + +.markdown-body .markdown-alert>:last-child { + margin-bottom: 0; +} + +.markdown-body .markdown-alert .markdown-alert-title { + display: flex; + font-weight: var(--base-text-weight-medium, 500); + align-items: center; + line-height: 1; +} + +.markdown-body .markdown-alert.markdown-alert-note { + border-left-color: var(--color-accent-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-note .markdown-alert-title { + color: var(--color-accent-fg); +} + +.markdown-body .markdown-alert.markdown-alert-important { + border-left-color: var(--color-done-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-important .markdown-alert-title { + color: var(--color-done-fg); 
+} + +.markdown-body .markdown-alert.markdown-alert-warning { + border-left-color: var(--color-attention-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-warning .markdown-alert-title { + color: var(--color-attention-fg); +} + +.markdown-body .markdown-alert.markdown-alert-tip { + border-left-color: var(--color-success-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-tip .markdown-alert-title { + color: var(--color-success-fg); +} + +.markdown-body .markdown-alert.markdown-alert-caution { + border-left-color: var(--color-danger-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-caution .markdown-alert-title { + color: var(--color-danger-fg); +} \ No newline at end of file diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 00000000..d1154b4d --- /dev/null +++ b/docs/index.html @@ -0,0 +1,1250 @@ + + + + + + + + + + Search Code By Comment + + + + + + + + + + + + + + + + + + + + + + + +
+    Document index of:
    + + + + + + \ No newline at end of file diff --git a/docs/metadata.json b/docs/metadata.json new file mode 100644 index 00000000..34dc1ef0 --- /dev/null +++ b/docs/metadata.json @@ -0,0 +1,95 @@ +{ + "url": { + "full": "https://github.com/OthersideAI/self-operating-computer", + "partial": "OthersideAI/self-operating-computer" + }, + "file_mapping": { + "0": { + "filepath": "/README.md", + "entry_id": 0, + "language_id": "markdown" + }, + "1": { + "filepath": "/evaluate.py", + "entry_id": 16, + "language_id": "python" + }, + "2": { + "filepath": "/operate/actions.py", + "entry_id": 28, + "language_id": "python" + }, + "3": { + "filepath": "/operate/dialog.py", + "entry_id": 56, + "language_id": "python" + }, + "4": { + "filepath": "/operate/exceptions.py", + "entry_id": 70, + "language_id": "python" + }, + "5": { + "filepath": "/operate/main.py", + "entry_id": 74, + "language_id": "python" + }, + "6": { + "filepath": "/operate/prompts.py", + "entry_id": 78, + "language_id": "python" + }, + "7": { + "filepath": "/operate/settings.py", + "entry_id": 98, + "language_id": "python" + }, + "8": { + "filepath": "/operate/utils/label.py", + "entry_id": 104, + "language_id": "python" + }, + "9": { + "filepath": "/operate/utils/misc.py", + "entry_id": 120, + "language_id": "python" + }, + "10": { + "filepath": "/operate/utils/os.py", + "entry_id": 130, + "language_id": "python" + }, + "11": { + "filepath": "/operate/utils/screenshot.py", + "entry_id": 140, + "language_id": "python" + }, + "12": { + "filepath": "/operate/utils/style.py", + "entry_id": 156, + "language_id": "python" + }, + "13": { + "filepath": "/requirements-audio.txt", + "entry_id": 162, + "language_id": "plain-text" + }, + "14": { + "filepath": "/requirements.txt", + "entry_id": 166, + "language_id": "plain-text" + }, + "15": { + "filepath": "/run.sh", + "entry_id": 172, + "language_id": "shell" + }, + "16": { + "filepath": "/setup.py", + "entry_id": 184, + "language_id": "python" + } + }, + "project_name": "self-operating-computer", + "split_count": 2 +} \ No newline at end of file diff --git a/docs/metadata_title.json b/docs/metadata_title.json new file mode 100644 index 00000000..0103dd27 --- /dev/null +++ b/docs/metadata_title.json @@ -0,0 +1 @@ +{"split_count": 1} \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 00000000..2b685791 --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,115 @@ + + + + + + + + https://james4ever0.github.io/self-operating-computer?q=/README.md + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/evaluate.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/actions.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/dialog.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/exceptions.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/main.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/prompts.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/settings.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/utils/label.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + 
https://james4ever0.github.io/self-operating-computer?q=/operate/utils/misc.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/utils/os.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/utils/screenshot.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/operate/utils/style.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/requirements-audio.txt + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/requirements.txt + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/run.sh + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer?q=/setup.py + 2023-12-28T09:21:02+00:00 + 1.00 + + + + https://james4ever0.github.io/self-operating-computer/tree.html?full=true + 2023-12-28T09:21:02+00:00 + 1.00 + + + \ No newline at end of file diff --git a/docs/src/README.md b/docs/src/README.md new file mode 100644 index 00000000..a313bc9c --- /dev/null +++ b/docs/src/README.md @@ -0,0 +1,172 @@ +

    Self-Operating Computer Framework

    + +

    + A framework to enable multimodal models to operate a computer. +

    +

    + Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. +

    + +
    + +
    + + + + +## Key Features +- **Compatibility**: Designed for various multimodal models. +- **Integration**: Currently integrated with **GPT-4v** as the default model, with extended support for Gemini Pro Vision. +- **Future Plans**: Support for additional models. + +## Current Challenges +> **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation. + +## Ongoing Development +At [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions. + +## Agent-1-Vision Model API Access +We will soon be offering API access to our Agent-1-Vision model. + +If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com). + +### Additional Thoughts +We recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. However, it's important to note that many actions require the accurate selection of visual elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape. +## Demo + +https://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0 + + +## Quick Start Instructions +Below are instructions to set up the Self-Operating Computer Framework locally on your computer. + +### Option 1: Traditional Installation + +1. **Clone the repo** to a directory on your computer: +``` +git clone https://github.com/OthersideAI/self-operating-computer.git +``` +2. **Cd into directory**: + +``` +cd self-operating-computer +``` + +3. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html). + +``` +python3 -m venv venv +``` +4. **Activate the virtual environment**: +``` +source venv/bin/activate +``` +5. **Install Project Requirements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:** +``` +pip install self-operating-computer +``` +6. **Then rename the `.example.env` file to `.env` so that you can save your OpenAI key in it.** +``` +mv .example.env .env +``` +7. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**: +``` +OPENAI_API_KEY='your-key-here' +``` + +8. **Run it**! +``` +operate +``` +9. **Final Step**: As a last step, the Terminal app will ask for permission for "Screen Recording" and "Accessibility" in the "Security & Privacy" page of Mac's "System Preferences". + +
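If the key from steps 6 and 7 does not seem to be picked up, one quick way to check from the project directory is the small sketch below. It assumes `python-dotenv`, which this repository already imports in `evaluate.py`, so it should be available once the requirements are installed; the script itself is illustrative and not part of the project.

```
# Optional sanity check: confirm the key in .env is visible to Python.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
print("OPENAI_API_KEY loaded:", "yes" if os.getenv("OPENAI_API_KEY") else "no")
```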
    + + +
    + + +### Option 2: Installation using .sh script + +1. **Clone the repo** to a directory on your computer: +``` +git clone https://github.com/OthersideAI/self-operating-computer.git +``` +2. **Cd into directory**: + +``` +cd self-operating-computer +``` + +3. **Run the installation script**: + +``` +./run.sh +``` + + +## Using `operate` Modes + +### Multimodal Models `-m` +An additional model is now compatible with the Self Operating Computer Framework. Try Google's `gemini-pro-vision` by following the instructions below. + +**Add your Google AI Studio API key to your .env file.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR: +``` +GOOGLE_API_KEY='your-key-here' +``` + +Start `operate` with the Gemini model +``` +operate -m gemini-pro-vision +``` + +### Voice Mode `--voice` +The framework supports voice inputs for the objective. Try voice by following the instructions below. + +Install the additional `requirements-audio.txt` +``` +pip install -r requirements-audio.txt +``` +**Install device requirements** +For mac users: +``` +brew install portaudio +``` +For Linux users: +``` +sudo apt install portaudio19-dev python3-pyaudio +``` +Run with voice mode +``` +operate --voice +``` + +## Contributions are Welcomed!: + +If you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md). + +## Feedback + +For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter. + +## Join Our Discord Community + +For real-time discussions and community support, join our Discord server. +- If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157). +- If you're new, first [join our Discord Server](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157). + +## Follow HyperWriteAI for More Updates + +Stay updated with the latest developments: +- Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI). +- Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/). + +## Compatibility +- This project is compatible with Mac OS, Windows, and Linux (with X server installed). + +## OpenAI Rate Limiting Note +The ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5. 
+Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)** diff --git a/docs/src/evaluate.py b/docs/src/evaluate.py new file mode 100644 index 00000000..f543c82c --- /dev/null +++ b/docs/src/evaluate.py @@ -0,0 +1,150 @@ +import sys +import os +import subprocess +import platform +import base64 +import json +import openai + +from dotenv import load_dotenv + +# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v" +TEST_CASES = { + "Go to Github.com": "The Github home page is visible.", + "Go to Youtube.com and play a video": "The YouTube video player is visible.", +} + +EVALUATION_PROMPT = """ +Your job is to look at the given screenshot and determine if the following guideline is met in the image. +You must respond in the following format ONLY. Do not add anything else: +{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }} +guideline_met must be set to a JSON boolean. True if the image meets the given guideline. +reason must be a string containing a justification for your decision. + +Guideline: {guideline} +""" + +SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png') + +# Check if on a windows terminal that supports ANSI escape codes +def supports_ansi(): + """ + Check if the terminal supports ANSI escape codes + """ + plat = platform.system() + supported_platform = plat != "Windows" or "ANSICON" in os.environ + is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + return supported_platform and is_a_tty + +if supports_ansi(): + # Standard green text + ANSI_GREEN = "\033[32m" + # Bright/bold green text + ANSI_BRIGHT_GREEN = "\033[92m" + # Reset to default text color + ANSI_RESET = "\033[0m" + # ANSI escape code for blue text + ANSI_BLUE = "\033[94m" # This is for bright blue + + # Standard yellow text + ANSI_YELLOW = "\033[33m" + + ANSI_RED = "\033[31m" + + # Bright magenta text + ANSI_BRIGHT_MAGENTA = "\033[95m" +else: + ANSI_GREEN = "" + ANSI_BRIGHT_GREEN = "" + ANSI_RESET = "" + ANSI_BLUE = "" + ANSI_YELLOW = "" + ANSI_RED = "" + ANSI_BRIGHT_MAGENTA = "" + + +def format_evaluation_prompt(guideline): + prompt = EVALUATION_PROMPT.format(guideline=guideline) + return prompt + + +def parse_eval_content(content): + try: + res = json.loads(content) + + print(res["reason"]) + + return res["guideline_met"] + except: + print("The model gave a bad evaluation response and it couldn't be parsed. 
Exiting...") + exit(1) + + +def evaluate_summary_screenshot(guideline): + '''Load the summary screenshot and return True or False if it meets the given guideline.''' + with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + eval_message = [{ + "role": "user", + "content": [ + {"type": "text", "text": format_evaluation_prompt(guideline)}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + }] + + response = openai.chat.completions.create( + model="gpt-4-vision-preview", + messages=eval_message, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=300, + ) + + eval_content = response.choices[0].message.content + + return parse_eval_content(eval_content) + + +def run_test_case(objective, guideline): + '''Returns True if the result of the test with the given prompt meets the given guideline.''' + # Run `operate` with the test case prompt + subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL) + + try: + result = evaluate_summary_screenshot(guideline) + except(OSError): + print("Couldn't open the summary screenshot") + return False + + return result + + +def main(): + load_dotenv() + openai.api_key = os.getenv("OPENAI_API_KEY") + + print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}") + + passed = 0; failed = 0 + for objective, guideline in TEST_CASES.items(): + print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'") + + result = run_test_case(objective, guideline) + if result: + print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'") + passed += 1 + else: + print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'") + failed += 1 + + print( + f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed" + ) + +if __name__ == "__main__": + main() diff --git a/docs/src/operate/actions.py b/docs/src/operate/actions.py new file mode 100644 index 00000000..45013c13 --- /dev/null +++ b/docs/src/operate/actions.py @@ -0,0 +1,409 @@ +import os +import time +import json +import base64 +import re +import io +import asyncio +import aiohttp + +from PIL import Image +from ultralytics import YOLO +import google.generativeai as genai +from operate.settings import Config +from operate.exceptions import ModelNotRecognizedException +from operate.utils.screenshot import ( + capture_screen_with_cursor, + add_grid_to_image, + capture_mini_screenshot_with_cursor, +) +from operate.utils.os import get_last_assistant_message +from operate.prompts import ( + format_vision_prompt, + format_accurate_mode_vision_prompt, + format_summary_prompt, + format_decision_prompt, + format_label_prompt, +) + + +from operate.utils.label import ( + add_labels, + parse_click_content, + get_click_position_in_percent, + get_label_coordinates, +) +from operate.utils.style import ( + ANSI_GREEN, + ANSI_RED, + ANSI_RESET, +) + + +# Load configuration +config = Config() + +client = config.initialize_openai_client() + +yolo_model = YOLO("./operate/model/weights/best.pt") # Load your trained model + + +async def get_next_action(model, messages, objective): + if model == "gpt-4": + return call_gpt_4_v(messages, objective) + if model == "gpt-4-with-som": + return await call_gpt_4_v_labeled(messages, objective) + elif model == "agent-1": + return "coming soon" + elif model == "gemini-pro-vision": + return call_gemini_pro_vision(messages, objective) + + raise ModelNotRecognizedException(model) + + +def call_gpt_4_v(messages, 
objective): + """ + Get the next action for Self-Operating Computer + """ + # sleep for a second + time.sleep(1) + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + new_screenshot_filename = os.path.join( + "screenshots", "screenshot_with_grid.png" + ) + + add_grid_to_image(screenshot_filename, new_screenshot_filename, 500) + # sleep for a second + time.sleep(1) + + with open(new_screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + previous_action = get_last_assistant_message(messages) + + vision_prompt = format_vision_prompt(objective, previous_action) + + vision_message = { + "role": "user", + "content": [ + {"type": "text", "text": vision_prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + + # create a copy of messages and save to pseudo_messages + pseudo_messages = messages.copy() + pseudo_messages.append(vision_message) + + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=pseudo_messages, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=300, + ) + + messages.append( + { + "role": "user", + "content": "`screenshot.png`", + } + ) + + content = response.choices[0].message.content + + return content + + except Exception as e: + print(f"Error parsing JSON: {e}") + return "Failed take action after looking at the screenshot" + + +def call_gemini_pro_vision(messages, objective): + """ + Get the next action for Self-Operating Computer using Gemini Pro Vision + """ + # sleep for a second + time.sleep(1) + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + new_screenshot_filename = os.path.join( + "screenshots", "screenshot_with_grid.png" + ) + + add_grid_to_image(screenshot_filename, new_screenshot_filename, 500) + # sleep for a second + time.sleep(1) + + previous_action = get_last_assistant_message(messages) + + vision_prompt = format_vision_prompt(objective, previous_action) + + model = genai.GenerativeModel("gemini-pro-vision") + + response = model.generate_content( + [vision_prompt, Image.open(new_screenshot_filename)] + ) + + # create a copy of messages and save to pseudo_messages + pseudo_messages = messages.copy() + pseudo_messages.append(response.text) + + messages.append( + { + "role": "user", + "content": "`screenshot.png`", + } + ) + content = response.text[1:] + + return content + + except Exception as e: + print(f"Error parsing JSON: {e}") + return "Failed take action after looking at the screenshot" + + +# This function is not used. `-accurate` mode was removed for now until a new PR fixes it. 
+def accurate_mode_double_check(model, pseudo_messages, prev_x, prev_y): + """ + Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location + """ + try: + screenshot_filename = os.path.join("screenshots", "screenshot_mini.png") + capture_mini_screenshot_with_cursor( + file_path=screenshot_filename, x=prev_x, y=prev_y + ) + + new_screenshot_filename = os.path.join( + "screenshots", "screenshot_mini_with_grid.png" + ) + + with open(new_screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + accurate_vision_prompt = format_accurate_mode_vision_prompt(prev_x, prev_y) + + accurate_mode_message = { + "role": "user", + "content": [ + {"type": "text", "text": accurate_vision_prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + + pseudo_messages.append(accurate_mode_message) + + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=pseudo_messages, + presence_penalty=1, + frequency_penalty=1, + temperature=0.7, + max_tokens=300, + ) + + content = response.choices[0].message.content + + except Exception as e: + print(f"Error reprompting model for accurate_mode: {e}") + return "ERROR" + + +def summarize(model, messages, objective): + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "summary_screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + summary_prompt = format_summary_prompt(objective) + + if model == "gpt-4-vision-preview": + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + summary_message = { + "role": "user", + "content": [ + {"type": "text", "text": summary_prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + # create a copy of messages and save to pseudo_messages + messages.append(summary_message) + + response = client.chat.completions.create( + model="gpt-4-vision-preview", + messages=messages, + max_tokens=500, + ) + + content = response.choices[0].message.content + elif model == "gemini-pro-vision": + model = genai.GenerativeModel("gemini-pro-vision") + summary_message = model.generate_content( + [summary_prompt, Image.open(screenshot_filename)] + ) + content = summary_message.text + return content + + except Exception as e: + print(f"Error in summarize: {e}") + return "Failed to summarize the workflow" + + +async def call_gpt_4_v_labeled(messages, objective): + time.sleep(1) + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + previous_action = get_last_assistant_message(messages) + + img_base64_labeled, img_base64_original, label_coordinates = add_labels( + img_base64, yolo_model + ) + + decision_prompt = format_decision_prompt(objective, previous_action) + labeled_click_prompt = format_label_prompt(objective) + + click_message = { + "role": "user", + "content": [ + {"type": "text", 
"text": labeled_click_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_base64_labeled}" + }, + }, + ], + } + decision_message = { + "role": "user", + "content": [ + {"type": "text", "text": decision_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{img_base64_original}" + }, + }, + ], + } + + click_messages = messages.copy() + click_messages.append(click_message) + decision_messages = messages.copy() + decision_messages.append(decision_message) + + click_future = fetch_openai_response_async(click_messages) + decision_future = fetch_openai_response_async(decision_messages) + + click_response, decision_response = await asyncio.gather( + click_future, decision_future + ) + + # Extracting the message content from the ChatCompletionMessage object + click_content = click_response.get("choices")[0].get("message").get("content") + + decision_content = ( + decision_response.get("choices")[0].get("message").get("content") + ) + + if not decision_content.startswith("CLICK"): + return decision_content + + label_data = parse_click_content(click_content) + + if label_data and "label" in label_data: + coordinates = get_label_coordinates(label_data["label"], label_coordinates) + image = Image.open( + io.BytesIO(base64.b64decode(img_base64)) + ) # Load the image to get its size + image_size = image.size # Get the size of the image (width, height) + click_position_percent = get_click_position_in_percent( + coordinates, image_size + ) + if not click_position_percent: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}" + ) + return call_gpt_4_v(messages, objective) + + x_percent = f"{click_position_percent[0]:.2f}%" + y_percent = f"{click_position_percent[1]:.2f}%" + click_action = f'CLICK {{ "x": "{x_percent}", "y": "{y_percent}", "description": "{label_data["decision"]}", "reason": "{label_data["reason"]}" }}' + + else: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] No label found. Trying another method {ANSI_RESET}" + ) + return call_gpt_4_v(messages, objective) + + return click_action + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Something went wrong. 
Trying another method {ANSI_RESET}" + ) + return call_gpt_4_v(messages, objective) + + +async def fetch_openai_response_async(messages): + url = "https://api.openai.com/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {config.openai_api_key}", + } + data = { + "model": "gpt-4-vision-preview", + "messages": messages, + "frequency_penalty": 1, + "presence_penalty": 1, + "temperature": 0.7, + "max_tokens": 300, + } + + async with aiohttp.ClientSession() as session: + async with session.post( + url, headers=headers, data=json.dumps(data) + ) as response: + return await response.json() diff --git a/docs/src/operate/dialog.py b/docs/src/operate/dialog.py new file mode 100644 index 00000000..6c95085b --- /dev/null +++ b/docs/src/operate/dialog.py @@ -0,0 +1,192 @@ +import sys +import os +import platform +import asyncio +from prompt_toolkit.shortcuts import message_dialog +from prompt_toolkit import prompt +from operate.exceptions import ModelNotRecognizedException +from operate.prompts import USER_QUESTION +from operate.settings import Config +from operate.utils.style import ( + ANSI_GREEN, + ANSI_RESET, + ANSI_BLUE, + ANSI_YELLOW, + ANSI_RED, + ANSI_BRIGHT_MAGENTA, + style, +) +from operate.utils.os import ( + keyboard_type, + search, + click, +) +from operate.actions import get_next_action, summarize +from operate.utils.misc import parse_response + +# Load configuration +config = Config() + + +def main(model, terminal_prompt, voice_mode=False): + """ + Main function for the Self-Operating Computer. + + Parameters: + - model: The model used for generating responses. + - terminal_prompt: A string representing the prompt provided in the terminal. + - voice_mode: A boolean indicating whether to enable voice mode. + + Returns: + None + """ + mic = None + # Initialize `WhisperMic`, if `voice_mode` is True + + validation(model, voice_mode) + + if voice_mode: + try: + from whisper_mic import WhisperMic + + # Initialize WhisperMic if import is successful + mic = WhisperMic() + except ImportError: + print( + "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'" + ) + sys.exit(1) + + # Skip message dialog if prompt was given directly + if not terminal_prompt: + message_dialog( + title="Self-Operating Computer", + text="Ask a computer to do anything.", + style=style, + ).run() + else: + print("Running direct prompt...") + + print("SYSTEM", platform.system()) + # Clear the console + if platform.system() == "Windows": + os.system("cls") + else: + print("\033c", end="") + + if terminal_prompt: # Skip objective prompt if it was given as an argument + objective = terminal_prompt + elif voice_mode: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... 
(speak now)" + ) + try: + objective = mic.listen() + except Exception as e: + print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") + return # Exit if voice input fails + else: + print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}") + print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") + objective = prompt(style=style) + + assistant_message = {"role": "assistant", "content": USER_QUESTION} + user_message = { + "role": "user", + "content": f"Objective: {objective}", + } + messages = [assistant_message, user_message] + + loop_count = 0 + + while True: + if config.debug: + print("[loop] messages before next action:\n\n\n", messages[1:]) + try: + response = asyncio.run(get_next_action(model, messages, objective)) + + action = parse_response(response) + action_type = action.get("type") + action_detail = action.get("data") + + except ModelNotRecognizedException as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + break + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + break + + if action_type == "DONE": + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}" + ) + summary = summarize(model, messages, objective) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}" + ) + break + + if action_type != "UNKNOWN": + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} {ANSI_RESET}{action_detail}" + ) + + function_response = "" + if action_type == "SEARCH": + function_response = search(action_detail) + elif action_type == "TYPE": + function_response = keyboard_type(action_detail) + elif action_type == "CLICK": + function_response = click(action_detail) + else: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}" + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{response}" + ) + break + + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} COMPLETE {ANSI_RESET}{function_response}" + ) + + message = { + "role": "assistant", + "content": function_response, + } + messages.append(message) + + loop_count += 1 + if loop_count > 15: + break + + +def validation(model, voice_mode): + """ + Validate the input parameters for the dialog operation. + + Args: + model (str): The model to be used for the dialog operation. + voice_mode (bool): Flag indicating whether to use voice mode. + + Raises: + SystemExit: If the input parameters are invalid. + + """ + + if voice_mode and not config.openai_api_key: + print("To use voice mode, please add an OpenAI API key") + sys.exit(1) + + if model == "gpt-4-vision-preview" and not config.openai_api_key: + print("To use `gpt-4-vision-preview` add an OpenAI API key") + sys.exit(1) + + if model == "gemini-pro-vision" and not config.google_api_key: + print("To use `gemini-pro-vision` add a Google API key") + sys.exit(1) diff --git a/docs/src/operate/exceptions.py b/docs/src/operate/exceptions.py new file mode 100644 index 00000000..de8de4b4 --- /dev/null +++ b/docs/src/operate/exceptions.py @@ -0,0 +1,15 @@ +class ModelNotRecognizedException(Exception): + """Exception raised for unrecognized models. 
+
+    Attributes:
+        model -- the unrecognized model
+        message -- explanation of the error
+    """
+
+    def __init__(self, model, message="Model not recognized"):
+        self.model = model
+        self.message = message
+        super().__init__(self.message)
+
+    def __str__(self):
+        return f"{self.message} : {self.model} "
\ No newline at end of file
diff --git a/docs/src/operate/main.py b/docs/src/operate/main.py
new file mode 100644
index 00000000..8b2df0c9
--- /dev/null
+++ b/docs/src/operate/main.py
@@ -0,0 +1,47 @@
+"""
+Self-Operating Computer
+"""
+import argparse
+from operate.utils.style import ANSI_BRIGHT_MAGENTA
+from operate.dialog import main
+
+
+def main_entry():
+    parser = argparse.ArgumentParser(
+        description="Run the self-operating-computer with a specified model."
+    )
+    parser.add_argument(
+        "-m",
+        "--model",
+        help="Specify the model to use",
+        required=False,
+        default="gpt-4",
+    )
+
+    # Add a voice flag
+    parser.add_argument(
+        "--voice",
+        help="Use voice input mode",
+        action="store_true",
+    )
+    # Allow for direct input of prompt
+    parser.add_argument(
+        "--prompt",
+        help="Directly input the objective prompt",
+        type=str,
+        required=False,
+    )
+
+    try:
+        args = parser.parse_args()
+        main(
+            args.model,
+            terminal_prompt=args.prompt,
+            voice_mode=args.voice,
+        )
+    except KeyboardInterrupt:
+        print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
+
+
+if __name__ == "__main__":
+    main_entry()
diff --git a/docs/src/operate/prompts.py b/docs/src/operate/prompts.py
new file mode 100644
index 00000000..0e6b88ce
--- /dev/null
+++ b/docs/src/operate/prompts.py
@@ -0,0 +1,252 @@
+from operate.settings import Config
+
+config = Config()
+monitor_size = config.monitor_size
+
+# General user Prompts
+USER_QUESTION = "Hello, I can help you with anything. What would you like done?"
+
+# constants for the vision prompt
+ACCURATE_PIXEL_COUNT = (
+    200  # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
+)
+
+# -------------------------
+# VISION PROMPT
+# -------------------------
+VISION_PROMPT = """
+You are a Self-Operating Computer. You use the same operating system as a human.
+
+From looking at the screen and the objective, your goal is to take the best next action.
+
+To operate the computer you have the four options below.
+
+1. CLICK - Move mouse and click
+2. TYPE - Type on the keyboard
+3. SEARCH - Search for a program on Mac and open it
+4. DONE - When you have completed the task, respond with the exact phrase below
+
+Here are the response formats below.
+
+1. CLICK
+Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
+Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%"
+
+2. TYPE
+Response: TYPE
+
+3. SEARCH
+Response: SEARCH
+
+4. DONE
+Response: DONE
+
+Here are examples of how to respond.
+__
+Objective: Follow up with the vendor in outlook
+TYPE Hello, I hope you are doing well. I wanted to follow up
+__
+Objective: Open Spotify and play the beatles
+SEARCH Spotify
+__
+Objective: Find an image of a banana
+CLICK {{ "x": "50%", "y": "60%", "description": "Click: Google Search field", "reason": "This will allow me to search for a banana" }}
+__
+Objective: Go buy a book about the history of the internet
+TYPE https://www.amazon.com/
+__
+
+A few important notes:
+
+- Default to opening Google Chrome with SEARCH to find things that are on the internet.
+- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
+- When opening Chrome, if you see a profile icon, click it to open Chrome fully; it is located at: {{ "x": "50%", "y": "55%" }}
+- The Chrome address bar is generally at: {{ "x": "50%", "y": "9%" }}
+- After you click to enter a field you can go ahead and start typing!
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+{previous_action}
+
+IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.
+
+Objective: {objective}
+"""
+
+
+# ----------------------------------
+# ACCURATE MODE VISION PROMPT
+# ----------------------------------
+ACCURATE_MODE_VISION_PROMPT = """
+It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
+As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot for your next action.
+This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as a differential to your previous x, y coordinate guess.
+
+If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% from the "x" and subtract {height}% from the "y" of your previous answer.
+Likewise, to reach the bottom right of this mini screenshot you will add {width}% to the "x" and add {height}% to the "y" of your previous answer.
+
+There are four segmenting lines across each dimension, divided evenly. This is done to resemble coordinate points, giving you better context of the cursor's location and exactly how much to adjust your previous answer.
+
+Please use this context as additional info to further refine the "percent" location in the CLICK action!
+"""
+
+DECISION_PROMPT = """
+You are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective.
+
+Here are the methods you can use to operate the computer.
+
+1. CLICK - Move mouse and click
+2. TYPE - Type on the keyboard
+3. SEARCH - Search for a program that is installed on Mac locally and open it
+4. DONE - When you have completed the task, respond with the exact phrase below
+
+Here are the response formats below.
+
+1. CLICK
+Response: CLICK
+
+2. TYPE
+Response: TYPE "value you want to type"
+
+3. SEARCH
+Response: SEARCH "app you want to search for on Mac"
+
+4. DONE
+Response: DONE
+
+Here are examples of how to respond.
+__
+Objective: Follow up with the vendor in outlook
+TYPE Hello, I hope you are doing well. I wanted to follow up
+__
+Objective: Open Spotify and play the beatles
+SEARCH Spotify
+__
+Objective: Find an image of a banana
+CLICK
+__
+Objective: Go buy a book about the history of the internet
+TYPE https://www.amazon.com/
+__
+
+A few important notes:
+
+- Default to opening Google Chrome with SEARCH to find things that are on the Web.
+- After you open Google Chrome you need to click on the address bar to find a website.
+- Do not use SEARCH to look for websites like Google Docs or LinkedIn. SEARCH only finds programs installed on the computer.
+- After you click to enter a field you can go ahead and start typing!
+- If you can see the field is active, go ahead and type!
+- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
+
+{previous_action}
+
+IMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.
+
+{objective}
+"""
+
+LABELED_IMAGE_PROMPT = """
+Your job is simple. Decide if there is an element on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs.
+
+Important to remember: you can only click on labeled elements.
+
+Label IDs are in the following format with `x` being a number: `~x`
+
+The labels are placed just above the bounding boxes so that they can be read clearly.
+
+Response formats below.
+
+1. CLICK - If there is a label that gets you closer to the objective, go ahead and click it.
+Response: {{ "decision": "~decision here~", "reason": "~reason here~", "label": "~x" }}
+
+Here are examples of how to respond.
+__
+Objective: Follow up with the vendor in outlook
+{{ "decision": "Click the Outlook send button", "reason": "I can see the email is already written and now I just need to send it.", "label": "~27" }}
+__
+Objective: Play the Holiday music on YouTube
+{{ "decision": "Click on the Play button", "reason": "It appears there is a row with a holiday song available in the YouTube UI", "label": "~3" }}
+__
+
+A few important notes:
+- When navigating the web you'll need to click on the address bar first. Look closely to find the address bar's label; it could be any number.
+- The ID numbers have NO SIGNIFICANCE. For instance, if an ID is ~0 or ~1 it does not mean it is first or on top. CHOOSE THE ID BASED ON THE CONTEXT OF THE IMAGE AND IF IT HELPS REACH THE OBJECTIVE.
+- Do not prepend with ```json, just return the JSON object.
+
+{objective}
+"""
+
+
+# -------------------------
+# SUMMARY PROMPT
+# -------------------------
+SUMMARY_PROMPT = """
+You are a Self-Operating Computer. A user request has been executed. Present the results succinctly.
+
+Include the following key contexts of the completed request:
+
+1. State the original objective.
+2. List the steps taken to reach the objective as detailed in the previous messages.
+3. Reference the screenshot that was used.
+
+Summarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Directly address any question posed by the user.
+
+Remember: The user will not interact with this summary. You are solely reporting the outcomes.
+
+Original objective: {objective}
+
+Display the results clearly:
+"""
+
+
+def format_summary_prompt(objective):
+    """
+    Format the summary prompt
+    """
+    prompt = SUMMARY_PROMPT.format(objective=objective)
+    return prompt
+
+
+def format_vision_prompt(objective, previous_action):
+    """
+    Format the vision prompt
+    """
+    if previous_action:
+        previous_action = f"Here was the previous action you took: {previous_action}"
+    else:
+        previous_action = ""
+    prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action)
+    return prompt
+
+
+def format_accurate_mode_vision_prompt(prev_x, prev_y):
+    """
+    Format the accurate mode vision prompt
+    """
+    width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
+    height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
+    prompt = ACCURATE_MODE_VISION_PROMPT.format(
+        prev_x=prev_x, prev_y=prev_y, width=width, height=height
+    )
+    return prompt
+
+
+def format_decision_prompt(objective, previous_action):
+    """
+    Format the decision prompt
+    """
+    if previous_action:
+        previous_action = f"Here was the previous action you took: {previous_action}"
+    else:
+        previous_action = ""
+    prompt = DECISION_PROMPT.format(
+        objective=objective, previous_action=previous_action
+    )
+    return prompt
+
+
+def format_label_prompt(objective):
+    """
+    Format the labeled image prompt
+    """
+    prompt = LABELED_IMAGE_PROMPT.format(objective=objective)
+    return prompt
diff --git a/docs/src/operate/settings.py b/docs/src/operate/settings.py
new file mode 100644
index 00000000..61b52fd1
--- /dev/null
+++ b/docs/src/operate/settings.py
@@ -0,0 +1,39 @@
+import os
+from dotenv import load_dotenv
+from openai import OpenAI
+
+
+class Config:
+    """
+    Configuration class for managing settings.
+
+    Attributes:
+        debug (bool): Flag indicating whether debug mode is enabled.
+        openai_api_key (str): API key for OpenAI.
+        google_api_key (str): API key for Google.
+        monitor_size (dict): Dictionary containing the width and height of the monitor.
+    """
+
+    def __init__(self):
+        load_dotenv()
+        self.debug = False
+        self.openai_api_key = os.getenv("OPENAI_API_KEY")
+        self.google_api_key = os.getenv("GOOGLE_API_KEY")
+        self.monitor_size = {
+            "width": 1920,
+            "height": 1080,
+        }
+
+    def initialize_openai_client(self):
+        """
+        Initializes and returns an OpenAI client with the configured API key.
+
+        Returns:
+            OpenAI or None: An instance of the OpenAI client if the API key is provided, else None.
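+
+        Example (illustrative): client = Config().initialize_openai_client()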
+ """ + if self.openai_api_key: + client = OpenAI() + client.api_key = self.openai_api_key + client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) + return client + return None diff --git a/docs/src/operate/utils/label.py b/docs/src/operate/utils/label.py new file mode 100644 index 00000000..2d3674f4 --- /dev/null +++ b/docs/src/operate/utils/label.py @@ -0,0 +1,182 @@ +import io +import base64 +import json +import os +import time +import asyncio +from PIL import Image, ImageDraw + + +def validate_and_extract_image_data(data): + if not data or "messages" not in data: + raise ValueError("Invalid request, no messages found") + + messages = data["messages"] + if ( + not messages + or not isinstance(messages, list) + or not messages[-1].get("image_url") + ): + raise ValueError("No image provided or incorrect format") + + image_data = messages[-1]["image_url"]["url"] + if not image_data.startswith("data:image"): + raise ValueError("Invalid image format") + + return image_data.split("base64,")[-1], messages + + +def get_label_coordinates(label, label_coordinates): + """ + Retrieves the coordinates for a given label. + + :param label: The label to find coordinates for (e.g., "~1"). + :param label_coordinates: Dictionary containing labels and their coordinates. + :return: Coordinates of the label or None if the label is not found. + """ + return label_coordinates.get(label) + + +def is_overlapping(box1, box2): + x1_box1, y1_box1, x2_box1, y2_box1 = box1 + x1_box2, y1_box2, x2_box2, y2_box2 = box2 + + # Check if there is no overlap + if x1_box1 > x2_box2 or x1_box2 > x2_box1: + return False + if ( + y1_box1 > y2_box2 or y1_box2 > y2_box1 + ): # Adjusted to check 100px proximity above + return False + + return True + + +def add_labels(base64_data, yolo_model): + image_bytes = base64.b64decode(base64_data) + image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line + image_debug = image_labeled.copy() # Create a copy for the debug image + image_original = ( + image_labeled.copy() + ) # Copy of the original image for base64 return + + results = yolo_model(image_labeled) + + draw = ImageDraw.Draw(image_labeled) + debug_draw = ImageDraw.Draw( + image_debug + ) # Create a separate draw object for the debug image + font_size = 45 + + detections_dir = "detections" + label_coordinates = {} # Dictionary to store coordinates + + if not os.path.exists(detections_dir): + os.makedirs(detections_dir) + + counter = 0 + drawn_boxes = [] # List to keep track of boxes already drawn + for result in results: + if hasattr(result, "boxes"): + for det in result.boxes: + bbox = det.xyxy[0] + x1, y1, x2, y2 = bbox.tolist() + + debug_label = "D_" + str(counter) + debug_index_position = (x1, y1 - font_size) + debug_draw.rectangle([(x1, y1), (x2, y2)], outline="blue", width=1) + debug_draw.text( + debug_index_position, + debug_label, + fill="blue", + font_size=font_size, + ) + + overlap = any( + is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes + ) + + if not overlap: + draw.rectangle([(x1, y1), (x2, y2)], outline="red", width=1) + label = "~" + str(counter) + index_position = (x1, y1 - font_size) + draw.text( + index_position, + label, + fill="red", + font_size=font_size, + ) + + # Add the non-overlapping box to the drawn_boxes list + drawn_boxes.append((x1, y1, x2, y2)) + label_coordinates[label] = (x1, y1, x2, y2) + + counter += 1 + + # Save the image + timestamp = time.strftime("%Y%m%d-%H%M%S") + + output_path = os.path.join(detections_dir, f"img_{timestamp}_labeled.png") + 
output_path_debug = os.path.join(detections_dir, f"img_{timestamp}_debug.png") + output_path_original = os.path.join(detections_dir, f"img_{timestamp}_original.png") + + image_labeled.save(output_path) + image_debug.save(output_path_debug) + image_original.save(output_path_original) + + buffered_original = io.BytesIO() + image_original.save(buffered_original, format="PNG") # I guess this is needed + img_base64_original = base64.b64encode(buffered_original.getvalue()).decode("utf-8") + + # Convert image to base64 for return + buffered_labeled = io.BytesIO() + image_labeled.save(buffered_labeled, format="PNG") # I guess this is needed + img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode("utf-8") + + return img_base64_labeled, img_base64_original, label_coordinates + + +def parse_click_content(message_content): + """ + Parses the response message to determine if it's a CLICK or NONE action and returns the appropriate data. + + :param message_content: The content of the response message. + :return: A dictionary with the relevant data or a message indicating a NONE action. + """ + try: + # Check for and remove erroneous ```json at the start and ``` at the end + if message_content.startswith("```json"): + message_content = message_content[ + len("```json") : + ] # Remove starting ```json + if message_content.endswith("```"): + message_content = message_content[: -len("```")] # Remove ending ``` + + # Convert JSON string to dictionary + return json.loads(message_content.strip()) + except json.JSONDecodeError as e: + return {"error": "Invalid JSON format"} + + return {"error": "Invalid response format"} + + +def get_click_position_in_percent(coordinates, image_size): + """ + Calculates the click position at the center of the bounding box and converts it to percentages. + + :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2). + :param image_size: A tuple of the image dimensions (width, height). + :return: A tuple of the click position in percentages (x_percent, y_percent). + """ + if not coordinates or not image_size: + return None + + # Calculate the center of the bounding box + x_center = (coordinates[0] + coordinates[2]) / 2 + y_center = (coordinates[1] + coordinates[3]) / 2 + + # Convert to percentages + x_percent = (x_center / image_size[0]) * 100 + y_percent = (y_center / image_size[1]) * 100 + + return x_percent, y_percent diff --git a/docs/src/operate/utils/misc.py b/docs/src/operate/utils/misc.py new file mode 100644 index 00000000..6959d4d8 --- /dev/null +++ b/docs/src/operate/utils/misc.py @@ -0,0 +1,102 @@ +import json +import re + + +def convert_percent_to_decimal(percent_str): + """ + Converts a percentage string to a decimal value. + + Args: + percent_str (str): The percentage string to be converted. + + Returns: + float: The decimal value equivalent to the percentage. + + Raises: + ValueError: If the input string cannot be converted to a float. + + Example: + >>> convert_percent_to_decimal("20%") + 0.2 + """ + try: + # Remove the '%' sign and convert to float + decimal_value = float(percent_str.strip("%")) + + # Convert to decimal (e.g., 20% -> 0.20) + return decimal_value / 100 + except ValueError as e: + print(f"Error converting percent to decimal: {e}") + return None + + +def extract_json_from_string(s): + """ + Extracts a JSON structure from a string and returns it as a dictionary. + + Args: + s (str): The input string. 
+ + Returns: + dict: The extracted JSON structure as a dictionary, or None if no JSON structure is found or if there is an error parsing the JSON. + + """ + try: + # Find the start of the JSON structure + json_start = s.find("{") + if json_start == -1: + return None + + # Extract the JSON part and convert it to a dictionary + json_str = s[json_start:] + return json.loads(json_str) + except Exception as e: + print(f"Error parsing JSON: {e}") + return None + + +def parse_response(response): + """ + Parses the given response and returns a dictionary with the type and data. + + Args: + response (str): The response to parse. + + Returns: + dict: A dictionary with the type and data extracted from the response. + The dictionary has the following structure: + { + "type": , + "data": + } + If the response is "DONE", the type is "DONE" and the data is None. + If the response starts with "CLICK", the type is "CLICK" and the data is a JSON object. + If the response starts with "TYPE", the type is "TYPE" and the data is the text to type. + If the response starts with "SEARCH", the type is "SEARCH" and the data is the search query. + If the response doesn't match any of the above patterns, the type is "UNKNOWN" and the data is the original response. + """ + if response == "DONE": + return {"type": "DONE", "data": None} + elif response.startswith("CLICK"): + # Adjust the regex to match the correct format + click_data = re.search(r"CLICK \{ (.+) \}", response).group(1) + click_data_json = json.loads(f"{{{click_data}}}") + return {"type": "CLICK", "data": click_data_json} + + elif response.startswith("TYPE"): + # Extract the text to type + try: + type_data = re.search(r"TYPE (.+)", response, re.DOTALL).group(1) + except: + type_data = re.search(r'TYPE "(.+)"', response, re.DOTALL).group(1) + return {"type": "TYPE", "data": type_data} + + elif response.startswith("SEARCH"): + # Extract the search query + try: + search_data = re.search(r'SEARCH "(.+)"', response).group(1) + except: + search_data = re.search(r"SEARCH (.+)", response).group(1) + return {"type": "SEARCH", "data": search_data} + + return {"type": "UNKNOWN", "data": response} diff --git a/docs/src/operate/utils/os.py b/docs/src/operate/utils/os.py new file mode 100644 index 00000000..98d05c11 --- /dev/null +++ b/docs/src/operate/utils/os.py @@ -0,0 +1,131 @@ +import pyautogui +import platform +import time +import math + +from operate.utils.misc import convert_percent_to_decimal + + +def keyboard_type(text): + """ + Types the given text using the keyboard. + + Args: + text (str): The text to be typed. + + Returns: + str: A message indicating the typed text. + """ + text = text.replace("\\n", "\n") + for char in text: + pyautogui.write(char) + pyautogui.press("enter") + return "Type: " + text + + +def search(text): + """ + Searches for a program or file by typing the given text in the search bar and pressing Enter. + + Args: + text (str): The text to be searched. + + Returns: + str: A message indicating that the program or file has been opened. + """ + if platform.system() == "Windows": + pyautogui.press("win") + elif platform.system() == "Linux": + pyautogui.press("win") + else: + # Press and release Command and Space separately + pyautogui.keyDown("command") + pyautogui.press("space") + pyautogui.keyUp("command") + + time.sleep(1) + + # Now type the text + for char in text: + pyautogui.write(char) + + pyautogui.press("enter") + return "Open program: " + text + + +def click(click_detail): + """ + Perform a mouse click at the specified coordinates. 
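+    Coordinates are given as percentage strings (e.g. "50%") relative to the primary screen and are converted to pixel positions before the click.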
+ + Args: + click_detail (dict): A dictionary containing the coordinates of the click. + + Returns: + str: The description of the click if successful, otherwise "We failed to click". + """ + try: + x = convert_percent_to_decimal(click_detail["x"]) + y = convert_percent_to_decimal(click_detail["y"]) + + if click_detail and isinstance(x, float) and isinstance(y, float): + click_at_percentage(x, y) + return click_detail["description"] + else: + return "We failed to click" + + except Exception as e: + print(f"Error parsing JSON: {e}") + return "We failed to click" + + +def click_at_percentage( + x_percentage, y_percentage, duration=0.2, circle_radius=50, circle_duration=0.5 +): + """ + Moves the mouse cursor to a specified percentage of the screen and performs a circular movement before clicking. + + Args: + x_percentage (float): The x-coordinate percentage of the screen to move the cursor to. + y_percentage (float): The y-coordinate percentage of the screen to move the cursor to. + duration (float, optional): The duration (in seconds) of the smooth cursor movement. Defaults to 0.2. + circle_radius (int, optional): The radius of the circular movement. Defaults to 50. + circle_duration (float, optional): The duration (in seconds) of the circular movement. Defaults to 0.5. + + Returns: + str: A message indicating that the click was successful. + """ + # Get the size of the primary monitor + screen_width, screen_height = pyautogui.size() + + # Calculate the x and y coordinates in pixels + x_pixel = int(screen_width * float(x_percentage)) + y_pixel = int(screen_height * float(y_percentage)) + + # Move to the position smoothly + pyautogui.moveTo(x_pixel, y_pixel, duration=duration) + + # Circular movement + start_time = time.time() + while time.time() - start_time < circle_duration: + angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi + x = x_pixel + math.cos(angle) * circle_radius + y = y_pixel + math.sin(angle) * circle_radius + pyautogui.moveTo(x, y, duration=0.1) + + # Finally, click + pyautogui.click(x_pixel, y_pixel) + return "Successfully clicked" + + +def get_last_assistant_message(messages): + """ + Retrieve the last message from the assistant in the messages array. + If the last assistant message is the first message in the array, return None. + """ + for index in reversed(range(len(messages))): + if messages[index]["role"] == "assistant": + if index == 0: # Check if the assistant message is the first in the array + return None + else: + return messages[index] + return None # Return None if no assistant message is found diff --git a/docs/src/operate/utils/screenshot.py b/docs/src/operate/utils/screenshot.py new file mode 100644 index 00000000..087416ba --- /dev/null +++ b/docs/src/operate/utils/screenshot.py @@ -0,0 +1,182 @@ +import os +import platform +import subprocess +import pyautogui +from PIL import Image, ImageDraw, ImageGrab +import Xlib.display +import Xlib.X +import Xlib.Xutil # not sure if Xutil is necessary +from operate.settings import Config +from operate.prompts import ACCURATE_PIXEL_COUNT + +# Load configuration +config = Config() +monitor_size = config.monitor_size + + +def add_grid_to_image(original_image_path, new_image_path, grid_interval): + """ + Add a grid to an image. + + Args: + original_image_path (str): The file path of the original image. + new_image_path (str): The file path to save the new image with the grid. + grid_interval (int): The interval between grid lines in pixels. 
+ + Returns: + None: The function saves the new image with the grid at the specified path. + """ + # Load the image + image = Image.open(original_image_path) + + # Create a drawing object + draw = ImageDraw.Draw(image) + + # Get the image size + width, height = image.size + + # Reduce the font size a bit + font_size = int(grid_interval / 10) # Reduced font size + + # Calculate the background size based on the font size + bg_width = int(font_size * 4.2) # Adjust as necessary + bg_height = int(font_size * 1.2) # Adjust as necessary + + # Function to draw text with a white rectangle background + def draw_label_with_background( + position, text, draw, font_size, bg_width, bg_height + ): + # Adjust the position based on the background size + text_position = (position[0] + bg_width // 2, position[1] + bg_height // 2) + # Draw the text background + draw.rectangle( + [position[0], position[1], position[0] + bg_width, position[1] + bg_height], + fill="white", + ) + # Draw the text + draw.text(text_position, text, fill="black", font_size=font_size, anchor="mm") + + # Draw vertical lines and labels at every `grid_interval` pixels + for x in range(grid_interval, width, grid_interval): + line = ((x, 0), (x, height)) + draw.line(line, fill="blue") + for y in range(grid_interval, height, grid_interval): + # Calculate the percentage of the width and height + x_percent = round((x / width) * 100) + y_percent = round((y / height) * 100) + draw_label_with_background( + (x - bg_width // 2, y - bg_height // 2), + f"{x_percent}%,{y_percent}%", + draw, + font_size, + bg_width, + bg_height, + ) + + # Draw horizontal lines - labels are already added with vertical lines + for y in range(grid_interval, height, grid_interval): + line = ((0, y), (width, y)) + draw.line(line, fill="blue") + + # Save the image with the grid + image.save(new_image_path) + + +def capture_mini_screenshot_with_cursor( + file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0 +): + """ + Capture a mini screenshot with the cursor at the specified coordinates. + + Args: + file_path (str, optional): The file path to save the screenshot. Defaults to "screenshots/screenshot_mini.png". + x (int or str, optional): The x-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0. + y (int or str, optional): The y-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0. + """ + user_platform = platform.system() + + if user_platform == "Linux": + x = float(x[:-1]) # convert x from "50%" to 50. + y = float(y[:-1]) + + x = (x / 100) * monitor_size[ + "width" + ] # convert x from 50 to 0.5 * monitor_width + y = (y / 100) * monitor_size["height"] + + # Define the coordinates for the rectangle + x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2) + x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2) + + screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2)) + screenshot = screenshot.resize( + (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS + ) # upscale the image so it's easier to see and percentage marks more visible + screenshot.save(file_path) + + screenshots_dir = "screenshots" + grid_screenshot_filename = os.path.join( + screenshots_dir, "screenshot_mini_with_grid.png" + ) + + add_grid_to_image( + file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2) + ) + elif user_platform == "Darwin": + x = float(x[:-1]) # convert x from "50%" to 50. 
+ y = float(y[:-1]) + + x = (x / 100) * monitor_size[ + "width" + ] # convert x from 50 to 0.5 * monitor_width + y = (y / 100) * monitor_size["height"] + + x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2) + + width = ACCURATE_PIXEL_COUNT + height = ACCURATE_PIXEL_COUNT + # Use the screencapture utility to capture the screen with the cursor + rect = f"-R{x1},{y1},{width},{height}" + subprocess.run(["screencapture", "-C", rect, file_path]) + + screenshots_dir = "screenshots" + grid_screenshot_filename = os.path.join( + screenshots_dir, "screenshot_mini_with_grid.png" + ) + + add_grid_to_image( + file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2) + ) + + +def capture_screen_with_cursor(file_path): + """ + Capture the screen with the cursor and save it to the specified file path. + + Args: + file_path (str): The file path where the screenshot will be saved. + + Raises: + None + + Returns: + None + """ + user_platform = platform.system() + + if user_platform == "Windows": + screenshot = pyautogui.screenshot() + screenshot.save(file_path) + elif user_platform == "Linux": + # Use xlib to prevent scrot dependency for Linux + screen = Xlib.display.Display().screen() + size = screen.width_in_pixels, screen.height_in_pixels + monitor_size["width"] = size[0] + monitor_size["height"] = size[1] + screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1])) + screenshot.save(file_path) + elif user_platform == "Darwin": # (Mac OS) + # Use the screencapture utility to capture the screen with the cursor + subprocess.run(["screencapture", "-C", file_path]) + else: + print(f"The platform you're using ({user_platform}) is not currently supported") diff --git a/docs/src/operate/utils/style.py b/docs/src/operate/utils/style.py new file mode 100644 index 00000000..2948582f --- /dev/null +++ b/docs/src/operate/utils/style.py @@ -0,0 +1,36 @@ +import sys +import platform +import os +from prompt_toolkit.styles import Style as PromptStyle + + +# Define style +style = PromptStyle.from_dict( + { + "dialog": "bg:#88ff88", + "button": "bg:#ffffff #000000", + "dialog.body": "bg:#44cc44 #ffffff", + "dialog shadow": "bg:#003800", + } +) + + +# Check if on a windows terminal that supports ANSI escape codes +def supports_ansi(): + """ + Check if the terminal supports ANSI escape codes + """ + plat = platform.system() + supported_platform = plat != "Windows" or "ANSICON" in os.environ + is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + return supported_platform and is_a_tty + + +# Define ANSI color codes +ANSI_GREEN = "\033[32m" if supports_ansi() else "" # Standard green text +ANSI_BRIGHT_GREEN = "\033[92m" if supports_ansi() else "" # Bright/bold green text +ANSI_RESET = "\033[0m" if supports_ansi() else "" # Reset to default text color +ANSI_BLUE = "\033[94m" if supports_ansi() else "" # Bright blue +ANSI_YELLOW = "\033[33m" if supports_ansi() else "" # Standard yellow text +ANSI_RED = "\033[31m" if supports_ansi() else "" +ANSI_BRIGHT_MAGENTA = "\033[95m" if supports_ansi() else "" # Bright magenta text diff --git a/docs/src/requirements-audio.txt b/docs/src/requirements-audio.txt new file mode 100644 index 00000000..c34750b0 --- /dev/null +++ b/docs/src/requirements-audio.txt @@ -0,0 +1 @@ +whisper-mic \ No newline at end of file diff --git a/docs/src/requirements.txt b/docs/src/requirements.txt new file mode 100644 index 00000000..2c796cd9 --- /dev/null +++ b/docs/src/requirements.txt @@ -0,0 +1,52 @@ +annotated-types==0.6.0 +anyio==3.7.1 +certifi==2023.7.22 
+charset-normalizer==3.3.2 +colorama==0.4.6 +contourpy==1.2.0 +cycler==0.12.1 +distro==1.8.0 +EasyProcess==1.1 +entrypoint2==1.1 +exceptiongroup==1.1.3 +fonttools==4.44.0 +h11==0.14.0 +httpcore==1.0.2 +httpx==0.25.1 +idna==3.4 +importlib-resources==6.1.1 +kiwisolver==1.4.5 +matplotlib==3.8.1 +MouseInfo==0.1.3 +mss==9.0.1 +numpy==1.26.1 +openai==1.2.3 +packaging==23.2 +Pillow==10.1.0 +prompt-toolkit==3.0.39 +PyAutoGUI==0.9.54 +pydantic==2.4.2 +pydantic_core==2.10.1 +PyGetWindow==0.0.9 +PyMsgBox==1.0.9 +pyparsing==3.1.1 +pyperclip==1.8.2 +PyRect==0.2.0 +pyscreenshot==3.1 +PyScreeze==0.1.29 +python3-xlib==0.15 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +pytweening==1.0.7 +requests==2.31.0 +rubicon-objc==0.4.7 +six==1.16.0 +sniffio==1.3.0 +tqdm==4.66.1 +typing_extensions==4.8.0 +urllib3==2.0.7 +wcwidth==0.2.9 +zipp==3.17.0 +google-generativeai==0.3.0 +aiohttp==3.9.1 +ultralytics==8.0.227 \ No newline at end of file diff --git a/docs/src/run.sh b/docs/src/run.sh new file mode 100644 index 00000000..f26a3712 --- /dev/null +++ b/docs/src/run.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# +# SOC Installer Script v0.0.1 +# GitHub: https://github.com/OthersideAI/self-operating-computer +# Issues: https://github.com/OthersideAI/self-operating-computer/issues +# Requires: bash, curl/wget, python3, pip, git +# +# Please open an issue if you notice any bugs. +# +# +# This script is create by centopw +# +# +clear +echo -e "\e[0m\c" +LOG_FILE="install_log.txt" +# shellcheck disable=SC2016 +echo ' + + $$$$$$\ $$$$$$\ $$$$$$\ +$$ __$$\ $$ __$$\ $$ __$$\ +$$ / \__|$$ / $$ |$$ / \__| +\$$$$$$\ $$ | $$ |$$ | + \____$$\ $$ | $$ |$$ | +$$\ $$ |$$ | $$ |$$ | $$\ +\$$$$$$ | $$$$$$ |\$$$$$$ | + \______/ \______/ \______/ + + Self-Operating-Computer +--- Created by OthersideAI --- + +' + + +# Function to log errors +log_error() { + echo "Error at $(date): $1" >> "$LOG_FILE" +} + +# Function to check if a command exists +command_exists() { + command -v "$1" &> /dev/null +} + +# Function to install packages based on the operating system +install_packages() { + if [ "$os" == "Linux" ]; then + # Use the appropriate package manager for Linux + if command_exists apt-get; then + sudo apt-get install -y "$1" || { log_error "Unable to install $1."; exit 1; } + elif command_exists yum; then + sudo yum install -y "$1" || { log_error "Unable to install $1."; exit 1; } + else + log_error "Unsupported package manager. Please install $1 manually." + exit 1 + fi + elif [ "$os" == "Darwin" ]; then + # Use Homebrew for macOS + if command_exists brew; then + brew install "$1" || { log_error "Unable to install $1."; exit 1; } + else + log_error "Homebrew not found. Please install Homebrew and then $1 manually." + exit 1 + fi + elif [ "$os" == "MINGW64_NT-10.0" ]; then + # Use Chocolatey for Windows + if command_exists choco; then + choco install "$1" -y || { log_error "Unable to install $1."; exit 1; } + else + log_error "Chocolatey not found. Please install Chocolatey and then $1 manually." + exit 1 + fi + else + log_error "Unsupported operating system. Please install $1 manually." + exit 1 + fi +} + +# Function to run a script and log errors +run_script() { + eval "$1" || { log_error "Error running $1."; exit 1; } +} + +# Check the operating system +os=$(uname -s) + +# Check if Python is installed +if ! command_exists python3; then + echo "Python not found. Installing Python..." + install_packages python3 +fi + +# Check if pip is installed +if ! command_exists pip; then + echo "pip not found. Installing pip..." 
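+    # install_packages picks the platform's package manager (apt-get/yum, Homebrew, or Chocolatey)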
+ install_packages python3-pip +fi + +# Check if git is installed +if ! command_exists git; then + echo "Git not found. Installing Git..." + install_packages git +fi + +# Create a Python virtual environment +run_script "python3 -m venv venv" + +# Activate the virtual environment +source venv/bin/activate || { log_error "Unable to activate the virtual environment."; exit 1; } + +# Install project requirements +run_script "pip install -r requirements.txt" + +# Install Project and Command-Line Interface +run_script "pip install ." + +# Check if the .env file exists and the OPENAI_API_KEY is set in it +if [ -f .env ] && grep -q "OPENAI_API_KEY" .env; then + echo "OpenAI API key found in .env file. Skipping prompt..." +else + # Prompt user for Open AI key + read -p "Enter your OpenAI API key: " openai_key + + # Set the API key as an environment variable + export OPENAI_API_KEY="$openai_key" + + # Create a new .env file + touch .env + + # Write the API key to the .env file + echo "OPENAI_API_KEY='$openai_key'" > .env +fi + +# Notify the user about the last step +echo "Final Step: As a last step, the Terminal app will ask for permission for 'Screen Recording' and 'Accessibility' in the 'Security & Privacy' page of Mac's 'System Preferences.'" + +echo "Operating system: $os" + +if [ "$os" == "Darwin" ]; then + echo "Attempting to open Security & Privacy settings..." + open /System/Library/PreferencePanes/Security.prefPane + read -p "Have you granted the necessary permissions in the Security & Privacy settings? (y/n): " confirm + if [ "$confirm" != "y" ]; then + echo "Please grant the necessary permissions and then rerun the script." + exit 1 + fi +else + echo "Not a macOS system, skipping..." +fi + +# End of the script +echo "Installation complete. Enjoy using the Self-Operating Computer Framework!" + +# Run the framework +run_script "operate" diff --git a/docs/src/setup.py b/docs/src/setup.py new file mode 100644 index 00000000..198b5f26 --- /dev/null +++ b/docs/src/setup.py @@ -0,0 +1,24 @@ +from setuptools import setup, find_packages + +# Read the contents of your requirements.txt file +with open("requirements.txt") as f: + required = f.read().splitlines() + +# Read the contents of your README.md file for the project description +with open("README.md", "r", encoding="utf-8") as readme_file: + long_description = readme_file.read() + +setup( + name="self-operating-computer", + version="1.1.1", + packages=find_packages(), + install_requires=required, # Add dependencies here + entry_points={ + "console_scripts": [ + "operate=operate.main:main_entry", + ], + }, + long_description=long_description, # Add project description here + long_description_content_type="text/markdown", # Specify Markdown format + # include any other necessary setup options here +) diff --git a/docs/tree.html b/docs/tree.html new file mode 100644 index 00000000..870d23bb --- /dev/null +++ b/docs/tree.html @@ -0,0 +1,147 @@ + + + + + + + + + Project structure of: OthersideAI/self-operating-computer + + + + + + +
+Project structure of: OthersideAI/self-operating-computer
+
+• self-operating-computer  Self-Operating Computer Framework: Multi-modal AI automation tools.
+  • evaluate.py  Evaluate image adherence with GPT-4 Vision and color-coded messages.
+  • operate  Operating system and tools for self-automation.
+    • actions.py  Generate content with AI prompts and OpenAI API.
+    • dialog.py  Dialog system with error handling and input checks
+    • exceptions.py  Exception class for unrecognized model
+    • main.py  Self-Operating Computer Main Entry Point
+    • prompts.py  AI-assisted Chrome, Docs, Sheets prompts with contextual options.
+    • settings.py  Manages settings, loads .env, initializes OpenAI client.
+    • utils  Utility functions and tools for various tasks.
+      • label.py  Image data handling and encoding utilities.
+      • misc.py  Converts percentages, extracts JSON and classifies responses.
+      • os.py  OS Utilities: Search, Input, Click, Move. Last Assistant Message.
+      • screenshot.py  Screenshot capture and grid addition with PIL.
+      • style.py  Style utility with PromptStyle and ANSI check
+  • README.md  Self-Operating Computer Framework: Enhancing multimodal models for mouse click predictions and API access.
+  • requirements-audio.txt  Capture quiet or whispered audio with Whisper-Mic.
+  • requirements.txt  Python packages: aiohttp 3.9.1, ultralytics 8.0.227
+  • run.sh  Install Linux, configure .env with OpenAI key.
+  • setup.py  Setup script for self-operating-computer package using setuptools.
\ No newline at end of file
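For orientation, here is a minimal usage sketch for the package added above, assuming it has been installed with `pip install .` (or via run.sh) and an OPENAI_API_KEY is present in `.env`; the `operate` entry point and its flags come from setup.py and main.py in this diff:

    # Interactive mode with the default model
    operate

    # One-off objective with the vision model
    operate -m gpt-4-vision-preview --prompt "Open a browser and search for bananas"

    # Voice input (first install the audio extras: pip install -r requirements-audio.txt)
    operate --voice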