Joshuaclymer · grace-sodunke · Jan 21, 2024 · Feb 7, 2024 · Feb 15, 2024 · Feb 15, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
 credentials.json
 
+**/__pycache__/
+
 # Created by https://www.toptal.com/developers/gitignore/api/python
 # Edit at https://www.toptal.com/developers/gitignore?templates=python
 

diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Anthony Costarelli
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,12 +1,42 @@
-# Setup
-In the repo root:
+# [GameBench: Evaluating Strategic Reasoning Abilities of LLM Agents](https://gamebench-website.vercel.app/)
+
+This repository contains both the code for the benchmark and the data we have collected so far.
+
+The code is available under the MIT license, and the data are available under the CC-BY license.
+
+The match data is located in [`matches.json`](https://github.com/Joshuaclymer/GameBench/tree/main/matches.json).
+
+### Setup
+In the repository root:
 
 ```
 conda create -n gameenv python=3.10
 conda activate gameenv
 pip install -e .
 ```
-Ask Josh for the credentials file.
+You must provide your own OpenAI API key in a file named `credentials.json` in the top-level directory. It should have the format:
+```json
+{
+    "openai_api_key": "your_openai_api_key_here"
+}
+```
+
+### Replicating figures
+
+The Python script [`generate_all_results.py`](https://github.com/Joshuaclymer/GameBench/tree/main/generate_all_results.py) generates all the figures from the paper into [`figures/`](https://github.com/Joshuaclymer/GameBench/tree/main/figures/). Use the command:
+
+```sh
+python3 generate_all_results.py
+```
+
+### Collecting data
+
+The scripts provided in [`scripts/`](https://github.com/Joshuaclymer/GameBench/tree/main/scripts/) run some individual games with preconfigured settings. You can run/modify these scripts or create another. To run a script, execute:
+```sh
+sh ./scripts/<script_name>.sh
+```
+
+Alternatively, you can run `api.play_game.play_game` directly from a Python script created in the top-level directory.
 
 ### `llm-reasoners` dependency
 
@@ -19,4 +49,4 @@ Ask Josh for the credentials file.
   journal={arXiv preprint arXiv:2305.14992},
   year={2023}
 }
-```
+```
diff --git a/agents/gpt.py b/agents/gpt.py
@@ -1,10 +1,15 @@
+from collections import defaultdict
 from dataclasses import dataclass, field
 from api.classes import Agent, AvailableActions, Action, Observation, Rules
 import random
 import openai
 import api.util as util
 import ast
 import json
+from PIL import Image
+import base64
+from io import BytesIO
+import re
 
 
 action_format_instructions_no_openended = """\
@@ -27,6 +32,15 @@
     api_key=util.load_json("credentials.json")["openai_api_key"]
 )
 
+# Running token totals, keyed by "<model>_input" and "<model>_output".
+tokens = defaultdict(int)
+def completions(*args, **kwargs):
+    """Forward a request to the OpenAI chat-completions endpoint and tally tokens.
+
+    All positional and keyword arguments are passed through unchanged to
+    ``openai_client.chat.completions.create``. ``model`` has to be given as a
+    keyword argument because it is read from ``kwargs`` to key the counters.
+
+    The response's prompt and completion token counts are added to the
+    module-level ``tokens`` mapping, the running totals are printed, and the
+    response object is returned as-is.
+    """
+    response = openai_client.chat.completions.create(*args, **kwargs)
+
+    model_name = kwargs["model"]
+    usage = response.usage
+    tokens[f"{model_name}_input"] += usage.prompt_tokens
+    tokens[f"{model_name}_output"] += usage.completion_tokens
+    print("*******************", tokens)
+    return response
 
 @dataclass
 class OpenAITextAgent(Agent):
@@ -48,15 +62,66 @@ def take_action(
         available_actions: AvailableActions,
         show_state: bool,
     ):
+        messages = [{"role": "system", "content": self.system_message}]
         valid_actions = []
         prompt = f"You are playing a game called {rules.title}. The rules are as follows:\n{rules.summary}\n"
         if rules.additional_details != None:
             prompt += "The following are headings with additional information about the rules that you can expand by taking the action Explain(<heading key>).\n"
-            details_dict = {f"H{i+1}": topic for i, topic in enumerate(rules.additional_details)}
+            details_dict = {
+                f"H{i+1}": topic for i, topic in enumerate(rules.additional_details)
+            }
             prompt += json.dumps(details_dict, indent=4)
-            valid_actions.extend(f"Explain({h})" for h in list(details_dict.keys()))
+            #valid_actions.extend(f"Explain({h})" for h in list(details_dict.keys()))
 
         prompt += f"\n# Observation\nThe following describes the current state of the game:\n{observation.text}\n"
+        if observation.image is not None:
+            # Encode the image observation as a base64 JPEG data URL.
+            # b64encode returns bytes, so decode to str: interpolating bytes
+            # into an f-string would embed the literal "b'...'" repr and
+            # corrupt the URL.
+            buffered = BytesIO()
+            observation.image.save(buffered, format="JPEG")
+            base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            # The chat API expects image content parts to be typed "image_url".
+            image_part = {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+            }
+            if self.openai_model == "gpt-4-1106-preview":
+                # Attach the image to the conversation together with the
+                # prompt built so far, then start a fresh prompt for the
+                # action instructions that follow.
+                self.print("Image observation recieved.")
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": [{"type": "text", "text": prompt}, image_part],
+                    }
+                )
+                prompt = ""
+            else:
+                # Other models get a text description of the image, produced
+                # by the vision model, appended to the prompt.
+                self.print("Image observation recieved. Using GPT4 to generate text description.")
+                imagedesc = completions(
+                    model="gpt-4-vision-preview",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [{"type": "text", "text": prompt}, image_part],
+                        }
+                    ],
+                ).choices[0].message.content
+                prompt += imagedesc
+                # Clear the image so the later completions calls keep using
+                # self.openai_model instead of switching to the vision model.
+                observation.image = None
+
         assert available_actions.predefined != {} or available_actions.openended != {}
         prompt += f"\n# Actions\n"
         prompt += f"{available_actions.instructions}\n"
@@ -83,16 +148,17 @@ def take_action(
         ):
             prompt += "Return the action Explain(<action>) to receive additional info about what any of the above actions do.\n"
 
-        messages = [{"role": "system", "content": self.system_message}]
-
         # Chain of Thought
         if self.mode == 1:
             prompt += "First, let's reason out loud about which action you should take to maximize your probability of winning."
             messages.append({"role": "user", "content": prompt})
 
             response = (
-                openai_client.chat.completions.create(
-                    model=self.openai_model, messages=messages
+                completions(
+                    model=self.openai_model
+                    if observation.image is None
+                    else "gpt-4-vision-preview",
+                    messages=messages,
                 )
                 .choices[0]
                 .message.content
@@ -109,18 +175,19 @@ def take_action(
 
             messages.append({"role": "user", "content": prompt})
             response = (
-                openai_client.chat.completions.create(
-                    model=self.openai_model, messages=messages
+                completions(
+                    model=self.openai_model
+                    if observation.image is None
+                    else "gpt-4-vision-preview",
+                    messages=messages,
                 )
                 .choices[0]
                 .message.content
             )
             messages.append({"role": "assistant", "content": response})
             prompt = ""
 
-            self.print(
-                f"GPT listed the following actions as possibilities: {response}"
-            )
+            self.print(f"GPT listed the following actions as possibilities: {response}")
 
         prompt += "\nTo summarize, if you choose a predefined action, you must return json with an 'action' key which contains one of the following valid actions:\n"
         prompt += str(list(available_actions.predefined))
@@ -131,8 +198,10 @@ def take_action(
         result = None
         for _ in range(self.max_retries):
             response = (
-                openai_client.chat.completions.create(
-                    model=self.openai_model,
+                completions(
+                    model=self.openai_model
+                    if observation.image is None
+                    else "gpt-4-vision-preview",
                     response_format={"type": "json_object"},
                     messages=messages,
                 )
@@ -143,17 +212,44 @@ def take_action(
             self.print("GPT responded with", response)
 
             try:
-                action = ast.literal_eval(response)
+                action = ast.literal_eval(response.strip())
+                action["action"]
             except:
                 self.print("GPT returned invalid JSON")
                 continue
 
-            if action["action"] in available_actions.openended and "openended_response" not in action:
-                self.print("GPT chose openended action but didn't include response", action)
+            if (
+                action["action"] in available_actions.openended
+                and "openended_response" not in action
+            ):
+                self.print(
+                    "GPT chose openended action but didn't include response", action
+                )
                 error_message = "You chose an openended action, and so your json must have an 'openended_response' key."
                 messages.append({"role": "user", "content": error_message})
                 continue
 
+            try:
+                explain = re.findall(r"Explain\((H\d+)\)", action["action"])
+                if len(explain):
+                    self.print("GPT is asking for rules explanation.")
+                    rule = details_dict[explain[0]]
+                    desc = rules.additional_details[rule]
+                    messages.append({"role": "user", "content": desc})
+                    continue
+
+                explain = re.findall(r"Explain\((.+)\)", action["action"])
+                if len(explain):
+                    self.print("GPT is asking for action explanation.")
+                    desc = available_actions.predefined.get(explain[0], "") + available_actions.openended.get(explain[0], "")
+                    messages.append({"role": "user", "content": desc})
+                    continue
+            except:
+                self.print("GPT tried asking for an expalanation but failed.")
+                error_message = "This is an invalid Explain action."
+                messages.append({"role": "user", "content": error_message})
+                continue
+
             if action["action"] in valid_actions:
                 self.print("GPT chose valid action", action)
                 result = action
@@ -167,35 +263,48 @@ def take_action(
             messages.append({"role": "user", "content": error_message})
         if result == None:
             self.print(
-                f"WARNING: GPT returned an a random action after {self.max_retries} tries"
+                f"WARNING: GPT returned too many invalid actions after {self.max_retries} tries"
             )
             return Action(action_id=None)
+
         return Action(
             action_id=result["action"],
             openended_response=result.get("openended_response"),
         )
 
 
 @dataclass
-class ChatGPTText(OpenAITextAgent):
+class GPT3(OpenAITextAgent):
+    # gpt-3.5-turbo-1106 preset with mode 0 (presumably plain prompting with no
+    # extra reasoning pass - confirm against OpenAITextAgent.take_action).
+    openai_model: str = "gpt-3.5-turbo-1106"
+    agent_type_id: str = "gpt-3"
+    mode: int = 0
+
+@dataclass
+class GPT3CoT(OpenAITextAgent):
+    # gpt-3.5-turbo-1106 preset; mode 1 selects the chain-of-thought path in
+    # take_action.
     openai_model: str = "gpt-3.5-turbo-1106"
-    agent_type_id: str = "gpt-3.5"
+    agent_type_id: str = "gpt-3-cot"
+    mode: int = 1
 
+@dataclass
+class GPT3BaP(OpenAITextAgent):
+    # gpt-3.5-turbo-1106 preset with mode 2 (babble-and-prune, going by the
+    # BabbleAndPrune class that previously used this mode).
+    openai_model: str = "gpt-3.5-turbo-1106"
+    agent_type_id: str = "gpt-3-bap"
+    mode: int = 2
 
 @dataclass
-class GPT4Text(OpenAITextAgent):
+class GPT4(OpenAITextAgent):
+    # gpt-4-1106-preview preset with mode 0 (presumably plain prompting with no
+    # extra reasoning pass - confirm against OpenAITextAgent.take_action).
     openai_model: str = "gpt-4-1106-preview"
     agent_type_id: str = "gpt-4"
-
+    mode: int = 0
 
 @dataclass
-class ChainOfThought(OpenAITextAgent):
+class GPT4CoT(OpenAITextAgent):
+    # gpt-4-1106-preview preset; mode 1 selects the chain-of-thought path in
+    # take_action. Replaces ChainOfThought, with a model-specific type id.
     openai_model: str = "gpt-4-1106-preview"
-    agent_type_id: str = "cot"
+    agent_type_id: str = "gpt-4-cot"
     mode: int = 1
 
 @dataclass
-class BabbleAndPrune(OpenAITextAgent):
+class GPT4BaP(OpenAITextAgent):
+    # gpt-4-1106-preview preset with mode 2. Replaces BabbleAndPrune, with a
+    # model-specific type id.
     openai_model: str = "gpt-4-1106-preview"
-    agent_type_id: str = "b&p"
-    mode: int = 2
+    agent_type_id: str = "gpt-4-bap"
+    mode: int = 2
diff --git a/agents/random_agent.py b/agents/random_agent.py
@@ -7,5 +7,5 @@ class RandomAgent(Agent):
     agent_type_id : str = "random"
 
     def take_action(self, rules : Rules, observation: Observation, available_actions: AvailableActions, show_state : bool):
-        actions = list(available_actions.predefined.keys())
-        return Action(action_id=random.choice(actions))
+        # Pool the predefined and open-ended action ids and pick one at random;
+        # the action is always submitted with an empty open-ended response.
+        candidates = [
+            *available_actions.predefined.keys(),
+            *available_actions.openended.keys(),
+        ]
+        chosen_id = random.choice(candidates)
+        return Action(action_id=chosen_id, openended_response="")
diff --git a/agents/rap/agent.py b/agents/rap/agent.py
@@ -14,9 +14,9 @@ class ReasoningViaPlanning(Agent, WorldModel, SearchConfig):
     """Inherents Agent from api.classes, and WorldModel and SearchConfig
     from the llm-agents library."""
 
-    agent_type_id: str = "rap"
+    agent_type_id: str = "gpt4-rap"
     transparent_reasoning: bool = False
-    agent_type: int = 0  # 0 = random replies, 1 = human interaction, 2 = openai
+    agent_type: int = 2  # 0 = random replies, 1 = human interaction, 2 = openai
 
     context_builder: Callable[[str, str], ContextType] = None
     completions: CompletionsFunction = None

diff --git a/agents/rap/chat.py b/agents/rap/chat.py
@@ -71,7 +71,7 @@ def probabilities(
 
         top_logprobs = (
             openai_client.chat.completions.create(
-                model="gpt-3.5-turbo-1106",
+                model=model,
                 messages=context,
                 logprobs=True,
                 top_logprobs=n,
@@ -156,7 +156,7 @@ def image_description(image: Image, rules: Rules) -> str:
                 "content": [
                     {
                         "type": "text",
-                        "text": "You are playing a game called {rules.title}. The rules are as follows: {rules.summary}.\nThis image is your observation of the game. Describe what's going on in the image.",
+                        "text": f"You are playing a game called {rules.title}. The rules are as follows: {rules.summary}.\nThis image is your observation of the game. Describe what's going on in the image.",
                     },
                     {
                         "type": "image",