diff --git a/evals/completion_fns/retrieval.py b/evals/completion_fns/retrieval.py
index 6ef998600c..0909f0430c 100644
--- a/evals/completion_fns/retrieval.py
+++ b/evals/completion_fns/retrieval.py
@@ -14,7 +14,6 @@
 from evals.record import record_sampling
 from evals.registry import Registry
 
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
 
 def load_embeddings(embeddings_and_text_path: str):
@@ -76,6 +75,7 @@ def __init__(
             registry_path: The path to a registry file to add to default registry.
            _kwargs: Additional arguments to pass to the completion function instantiation.
         """
+        self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
         registry = Registry() if not registry else registry
         if registry_path:
             registry.add_registry_paths(registry_path)
@@ -96,7 +96,7 @@ def __call__(self, prompt: Union[str, list[dict]], **kwargs: Any) -> RetrievalCo
         """
         # Embed the prompt
         embedded_prompt = (
-            client.embeddings.create(
+            self.client.embeddings.create(
                 model=self.embedding_model, input=CompletionPrompt(prompt).to_formatted_prompt()
             )
             .data[0]
diff --git a/evals/elsuite/hr_ml_agent_bench/utils.py b/evals/elsuite/hr_ml_agent_bench/utils.py
index c37d8b1c4f..ddbb951bd7 100644
--- a/evals/elsuite/hr_ml_agent_bench/utils.py
+++ b/evals/elsuite/hr_ml_agent_bench/utils.py
@@ -13,7 +13,6 @@
 from evals.solvers.solver import Solver
 from evals.task_state import TaskState
 
-client = OpenAI()
 
 logger = logging.getLogger(__name__)
 
diff --git a/evals/elsuite/make_me_pay/utils.py b/evals/elsuite/make_me_pay/utils.py
index deb8f2c9ce..4e9b5472ed 100644
--- a/evals/elsuite/make_me_pay/utils.py
+++ b/evals/elsuite/make_me_pay/utils.py
@@ -4,9 +4,6 @@
 
 from openai import OpenAI
 
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-
-
 def is_system_msg(m: dict) -> bool:
     assert isinstance(m, dict), "Message must be a dict."
     assert "role" in m, "Message must have a role."
@@ -72,4 +69,4 @@ def model_output_empty_tags(message: str) -> bool:
 
 
 def openai_chatcompletion_create(*args, **kwargs):
-    return client.chat.completions.create(*args, **kwargs)
+    return OpenAI(api_key=os.environ.get("OPENAI_API_KEY")).chat.completions.create(*args, **kwargs)
diff --git a/evals/elsuite/self_prompting/eval.py b/evals/elsuite/self_prompting/eval.py
index 7db858f5d4..caffba1356 100644
--- a/evals/elsuite/self_prompting/eval.py
+++ b/evals/elsuite/self_prompting/eval.py
@@ -11,7 +11,7 @@
 from evals.api import CompletionFn
 from evals.elsuite.self_prompting.task_description import sample_in_token, task_description_template
 from evals.eval import SolverEval
-from evals.registry import registry
+from evals.registry import Registry
 from evals.solvers.solver import Solver
 from evals.task_state import TaskState
 from evals.utils.log_utils import extract_final_results, extract_spec
@@ -54,7 +54,7 @@ def __init__(
 
         self.tasker_completion_fns = {}
         for tasker_model in self.tasker_models:
-            self.tasker_completion_fns[tasker_model] = registry.make_completion_fn(tasker_model)
+            self.tasker_completion_fns[tasker_model] = Registry().make_completion_fn(tasker_model)
 
     def eval_sample(self, solver: Solver, sample: Any, rng: random.Random):
         if sample["stage"] == "prompting":
diff --git a/evals/elsuite/self_prompting/scripts/dataset/compile_data.py b/evals/elsuite/self_prompting/scripts/dataset/compile_data.py
index 6a5698c4e2..f2b04beb70 100644
--- a/evals/elsuite/self_prompting/scripts/dataset/compile_data.py
+++ b/evals/elsuite/self_prompting/scripts/dataset/compile_data.py
@@ -4,7 +4,7 @@
 from eval_list import eval_list
 
 import evals.data
-from evals.registry import registry
+from evals.registry import Registry
 
 np.random.seed(42)
 min_samples_per_dataset = 50
@@ -12,7 +12,7 @@
 seen = set()
 datarows = []
 
-for eval in registry.get_evals("*"):
+for eval in Registry().get_evals("*"):
     if eval.key not in eval_list or eval.key in seen:
         continue
     seen.add(eval.key)
diff --git a/evals/elsuite/skill_acquisition/test_skill_acquisition.py b/evals/elsuite/skill_acquisition/test_skill_acquisition.py
index 079fdcb435..3ec2f7c389 100644
--- a/evals/elsuite/skill_acquisition/test_skill_acquisition.py
+++ b/evals/elsuite/skill_acquisition/test_skill_acquisition.py
@@ -11,108 +11,112 @@
 )
 from evals.registry import Registry
 
-registry = Registry()
-
-dummy_eval_spec = {
-    "eval_registry_path": Path("evals/registry"),
-    "completion_fns": [registry.make_completion_fn("gpt-4")],
-    "samples_jsonl": "skill_acquisition/miskito/miskito_test.jsonl",
-    "target_language": "miskito",
-    "n_samples": 5,
-    "knowledge_base_directory": "skill_acquisition/miskito/knowledge_base/",
-    "max_replies": 50,
-}
-
-
-def test_answer_detected():
-    assert answer_detected("[ANSWER foo]") is True
-    assert answer_detected("[ANSWER: foo]") is True
-    assert answer_detected("ANSWER foo") is False
-    assert answer_detected("[ANSWER foo") is False
-    assert answer_detected("ANSWER foo]") is False
-    assert answer_detected("[ANSWER foo][ANSWER bar]") is True
-
-
-def test_view_instruction_detected():
-    SkillAcquisition(**dummy_eval_spec)
-    assert view_instruction_detected("[VIEW file1]") is True
-    assert view_instruction_detected("[VIEW: file1]") is True
-    assert view_instruction_detected("[VIEW file1 section1]") is True
-    assert view_instruction_detected("[VIEW: file1 section1]") is True
-    assert view_instruction_detected("VIEW file1") is False
-    assert view_instruction_detected("[VIEW file1") is False
-    assert view_instruction_detected("VIEW file1]") is False
-    assert view_instruction_detected("[VIEW file1][VIEW file2]") is True
-    assert view_instruction_detected("[VIEW: file1][VIEW: file2]") is True
-
-
-def test_process_answer():
-    SkillAcquisition(**dummy_eval_spec)
-    assert process_answer("[ANSWER foo]") == "foo"
-    assert process_answer("[ANSWER: foo]") == "foo"
-    assert process_answer("[ANSWER foo bar baz]") == "foo bar baz"
-    assert process_answer("[ANSWER: foo bar baz]") == "foo bar baz"
-    assert process_answer("[ANSWER foo][ANSWER bar]") == "bar"
-    assert process_answer("[ANSWER foo][ANSWER bar") == "foo"
-
-
-def test_process_view_instruction():
-    SkillAcquisition(**dummy_eval_spec)
-    assert process_view_instruction("[VIEW file1]") == ("file1", None)
-    assert process_view_instruction("[VIEW: file1]") == ("file1", None)
-    assert process_view_instruction("[VIEW file1 section1]") == (
-        "file1",
-        "section1",
-    )
-    assert process_view_instruction("[VIEW: file1 section1]") == (
-        "file1",
-        "section1",
-    )
-    assert process_view_instruction("[VIEW file1][VIEW file2]") == (
-        "file2",
-        None,
-    )
-    assert process_view_instruction("[VIEW: file1][VIEW: file2]") == (
-        "file2",
-        None,
-    )
-    assert process_view_instruction("[VIEW file1 section1][VIEW file2 section2]") == (
-        "file2",
-        "section2",
-    )
-
-
-def test_process_view_instruction_spaces_and_quotes():
-    assert process_view_instruction("[VIEW file1 sectionpart1 sectionpart2]") == (
-        "file1",
-        "sectionpart1 sectionpart2",
-    )
-    assert process_view_instruction("[VIEW file1 sectionpart1 'sectionpart2']") == (
-        "file1",
-        "sectionpart1 'sectionpart2'",
-    )
-
-
-def test_view_content():
-    skill_acquisition_eval = SkillAcquisition(**dummy_eval_spec)
-
-    # Create a file to view first.
-    filepath = skill_acquisition_eval.knowledge_base_directory / "test_file.jsonl"
-    with open(filepath, "w") as f:
-        f.write(json.dumps({"title": "foo", "content": "Test file contents."}) + "\n")
-
-    content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
-        "test_file.jsonl"
-    )
-    assert content == "Table of contents for test_file.jsonl: {'foo'}."
-    assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
-    assert sections_viewed == {"test_file.jsonl": {"Table of Contents"}}
-
-    content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
-        "test_file.jsonl", "foo"
-    )
-    assert content == "Test file contents."
-    assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
-    assert sections_viewed == {"test_file.jsonl": {"Table of Contents", "foo"}}
-
-    os.remove(filepath)
+
+
+
+class TestSkillAcquisition():
+    def setup_class(self):
+        os.environ["OPENAI_API_KEY"] = "test"
+        self.registry = Registry()
+        self.dummy_eval_spec = {
+            "eval_registry_path": Path("evals/registry"),
+            "completion_fns": [self.registry.make_completion_fn("gpt-4")],
+            "samples_jsonl": "skill_acquisition/miskito/miskito_test.jsonl",
+            "target_language": "miskito",
+            "n_samples": 5,
+            "knowledge_base_directory": "skill_acquisition/miskito/knowledge_base/",
+            "max_replies": 50,
+        }
+
+    def test_answer_detected(self):
+        assert answer_detected("[ANSWER foo]") is True
+        assert answer_detected("[ANSWER: foo]") is True
+        assert answer_detected("ANSWER foo") is False
+        assert answer_detected("[ANSWER foo") is False
+        assert answer_detected("ANSWER foo]") is False
+        assert answer_detected("[ANSWER foo][ANSWER bar]") is True
+
+
+    def test_view_instruction_detected(self):
+        SkillAcquisition(**self.dummy_eval_spec)
+        assert view_instruction_detected("[VIEW file1]") is True
+        assert view_instruction_detected("[VIEW: file1]") is True
+        assert view_instruction_detected("[VIEW file1 section1]") is True
+        assert view_instruction_detected("[VIEW: file1 section1]") is True
+        assert view_instruction_detected("VIEW file1") is False
+        assert view_instruction_detected("[VIEW file1") is False
+        assert view_instruction_detected("VIEW file1]") is False
+        assert view_instruction_detected("[VIEW file1][VIEW file2]") is True
+        assert view_instruction_detected("[VIEW: file1][VIEW: file2]") is True
+
+
+    def test_process_answer(self):
+        SkillAcquisition(**self.dummy_eval_spec)
+        assert process_answer("[ANSWER foo]") == "foo"
+        assert process_answer("[ANSWER: foo]") == "foo"
+        assert process_answer("[ANSWER foo bar baz]") == "foo bar baz"
+        assert process_answer("[ANSWER: foo bar baz]") == "foo bar baz"
+        assert process_answer("[ANSWER foo][ANSWER bar]") == "bar"
+        assert process_answer("[ANSWER foo][ANSWER bar") == "foo"
+
+
+    def test_process_view_instruction(self):
+        SkillAcquisition(**self.dummy_eval_spec)
+        assert process_view_instruction("[VIEW file1]") == ("file1", None)
+        assert process_view_instruction("[VIEW: file1]") == ("file1", None)
+        assert process_view_instruction("[VIEW file1 section1]") == (
+            "file1",
+            "section1",
+        )
+        assert process_view_instruction("[VIEW: file1 section1]") == (
+            "file1",
+            "section1",
+        )
+        assert process_view_instruction("[VIEW file1][VIEW file2]") == (
+            "file2",
+            None,
+        )
+        assert process_view_instruction("[VIEW: file1][VIEW: file2]") == (
+            "file2",
+            None,
+        )
+        assert process_view_instruction("[VIEW file1 section1][VIEW file2 section2]") == (
+            "file2",
+            "section2",
+        )
+
+
+    def test_process_view_instruction_spaces_and_quotes(self):
+        assert process_view_instruction("[VIEW file1 sectionpart1 sectionpart2]") == (
+            "file1",
+            "sectionpart1 sectionpart2",
+        )
+        assert process_view_instruction("[VIEW file1 sectionpart1 'sectionpart2']") == (
+            "file1",
+            "sectionpart1 'sectionpart2'",
+        )
+
+
+    def test_view_content(self):
+        skill_acquisition_eval = SkillAcquisition(**self.dummy_eval_spec)
+
+        # Create a file to view first.
+        filepath = skill_acquisition_eval.knowledge_base_directory / "test_file.jsonl"
+        with open(filepath, "w") as f:
+            f.write(json.dumps({"title": "foo", "content": "Test file contents."}) + "\n")
+
+        content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
+            "test_file.jsonl"
+        )
+        assert content == "Table of contents for test_file.jsonl: {'foo'}."
+        assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
+        assert sections_viewed == {"test_file.jsonl": {"Table of Contents"}}
+
+        content, sections_visible_to_model, sections_viewed = skill_acquisition_eval._view_content(
+            "test_file.jsonl", "foo"
+        )
+        assert content == "Test file contents."
+        assert sections_visible_to_model == {"test_file.jsonl": {"foo"}}
+        assert sections_viewed == {"test_file.jsonl": {"Table of Contents", "foo"}}
+
+        os.remove(filepath)
diff --git a/evals/record.py b/evals/record.py
index 8e8ebe9ae6..7135c4bfa9 100644
--- a/evals/record.py
+++ b/evals/record.py
@@ -263,14 +263,6 @@ def record_final_report(self, final_report: Any):
         logging.info(f"Final report: {final_report}. Not writing anywhere.")
 
 
-def _green(str):
-    return f"\033[1;32m{str}\033[0m"
-
-
-def _red(str):
-    return f"\033[1;31m{str}\033[0m"
-
-
 class DummyRecorder(RecorderBase):
     """
     A "recorder" which only logs certain events to the console.
@@ -282,33 +274,15 @@ def __init__(self, run_spec: RunSpec, log: bool = True):
         self.log = log
 
     def record_event(self, type, data, sample_id=None):
-        from evals.registry import registry
-
         if self.run_spec is None:
             return
 
-        base_eval_spec = registry.get_base_eval(self.run_spec.base_eval)
-        if base_eval_spec and len(base_eval_spec.metrics) >= 1:
-            primary_metric = base_eval_spec.metrics[0]
-        else:
-            primary_metric = "accuracy"
-
         with self._event_lock:
             event = self._create_event(type, data)
             self._events.append(event)
 
             msg = f"Not recording event: {event}"
 
-            if type == "match":
-                accuracy_good = (
-                    primary_metric == "accuracy" or primary_metric.startswith("pass@")
-                ) and (data.get("correct", False) or data.get("accuracy", 0) > 0.5)
-                f1_score_good = primary_metric == "f1_score" and data.get("f1_score", 0) > 0.5
-                if accuracy_good or f1_score_good:
-                    msg = _green(msg)
-                else:
-                    msg = _red(msg)
-
             if self.log:
                 logging.info(msg)
diff --git a/evals/registry.py b/evals/registry.py
index 2d1c0fee1d..d76aeb546f 100644
--- a/evals/registry.py
+++ b/evals/registry.py
@@ -9,12 +9,13 @@
 import logging
 import os
 import re
+import yaml
+
 from functools import cached_property
 from pathlib import Path
 from typing import Any, Generator, Iterator, Optional, Sequence, Tuple, Type, TypeVar, Union
 
 import openai
-import yaml
 from openai import OpenAI
 
 from evals import OpenAIChatCompletionFn, OpenAICompletionFn
@@ -23,7 +24,6 @@
 from evals.elsuite.modelgraded.base import ModelGradedSpec
 from evals.utils.misc import make_object
 
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
 logger = logging.getLogger(__name__)
 
@@ -103,6 +103,10 @@ def is_chat_model(model_name: str) -> bool:
 class Registry:
     def __init__(self, registry_paths: Sequence[Union[str, Path]] = DEFAULT_PATHS):
         self._registry_paths = [Path(p) if isinstance(p, str) else p for p in registry_paths]
+        try:
+            self.client = OpenAI()
+        except openai.OpenAIError as err:
+            logger.warning(f"Could not initialize OpenAI client: {err}")
 
     def add_registry_paths(self, paths: Sequence[Union[str, Path]]) -> None:
         self._registry_paths.extend([Path(p) if isinstance(p, str) else p for p in paths])
@@ -110,7 +114,7 @@ def add_registry_paths(self, paths: Sequence[Union[str, Path]]) -> None:
     @cached_property
     def api_model_ids(self) -> list[str]:
         try:
-            return [m.id for m in client.models.list().data]
+            return [m.id for m in self.client.models.list().data]
         except openai.OpenAIError as err:
             # Errors can happen when running eval with completion function that uses custom
             # API endpoints and authentication mechanisms.
@@ -329,5 +333,3 @@ def _evals(self) -> RawRegistry:
 
     def _modelgraded_specs(self) -> RawRegistry:
         return self._load_registry(self._registry_paths, "modelgraded")
-
-registry = Registry()
diff --git a/evals/registry/data/word_association/corpus_tools/validators.py b/evals/registry/data/word_association/corpus_tools/validators.py
index e8a73c7081..08a2aaa042 100644
--- a/evals/registry/data/word_association/corpus_tools/validators.py
+++ b/evals/registry/data/word_association/corpus_tools/validators.py
@@ -8,8 +8,6 @@
 from logger_config import logger
 from openai import OpenAI
 
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-
 CORRELATION_PROMPT_TEMPLATE = """Task: Estimate the degree of correlation between
 two provided strings. In your evaluation, consider not just direct links, but also indirect and subtle correlations.
 As an illustration, if 'watch' appears in the first string and 'tower' in the second,
@@ -172,6 +170,7 @@ def get_embeddings(
             A list of Embedding namedtuples where each Embedding represents the input string
             and its corresponding vector.
         """
+        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
         response = client.embeddings.create(model="text-embedding-ada-002", input=emb_input)
         logger.debug(f"embeddings response: {response}")
         response_data = response["data"]
@@ -199,6 +198,7 @@ def __init__(
         self._model = model
         self.criteria = criteria
         super().__init__(target_score)
+        self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
     def validate(self, related_words_pairs: List[RelatedWordsPair]) -> List[SimilarityTuple]:
         """
@@ -249,7 +249,7 @@ def get_chat_completion(
         logger.debug(
             f"Getting chat_completion using {self._model}.\nPrompting messages: {messages}"
         )
-        response = client.chat.completions.create(
+        response = self.client.chat.completions.create(
             model=self._model, messages=messages, temperature=0.0
         )
         logger.debug(f"response_message: {response}")
diff --git a/evals/solvers/providers/anthropic/anthropic_solver.py b/evals/solvers/providers/anthropic/anthropic_solver.py
index bb7fe50e24..eeeb8dbd7a 100644
--- a/evals/solvers/providers/anthropic/anthropic_solver.py
+++ b/evals/solvers/providers/anthropic/anthropic_solver.py
@@ -2,7 +2,7 @@
 
 import anthropic
 from anthropic import Anthropic
-from anthropic.types import ContentBlock, MessageParam, Usage
+from anthropic.types import TextBlock, MessageParam, Usage
 
 from evals.record import record_sampling
 from evals.solvers.solver import Solver, SolverResult
@@ -99,7 +99,7 @@ def _convert_msgs_to_anthropic_format(msgs: list[Message]) -> list[MessageParam]
         anth_msgs = [
             MessageParam(
                 role=oai_to_anthropic_role[msg.role],
-                content=[ContentBlock(text=msg.content, type="text")],
+                content=[TextBlock(text=msg.content, type="text")],
             )
             for msg in msgs
         ]
diff --git a/evals/solvers/providers/anthropic/anthropic_solver_test.py b/evals/solvers/providers/anthropic/anthropic_solver_test.py
index 9ba8fb1470..0f7fda8ef5 100644
--- a/evals/solvers/providers/anthropic/anthropic_solver_test.py
+++ b/evals/solvers/providers/anthropic/anthropic_solver_test.py
@@ -8,9 +8,10 @@
     anth_to_openai_usage,
 )
 
-from anthropic.types import ContentBlock, MessageParam, Usage
+from anthropic.types import TextBlock, MessageParam, Usage
 
 IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+MISSING_ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") in {"", None}
 
 MODEL_NAME = "claude-instant-1.2"
 
@@ -33,7 +34,7 @@ def dummy_recorder():
 
 
 @pytest.mark.skipif(
-    IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit."
+    IN_GITHUB_ACTIONS or MISSING_ANTHROPIC_API_KEY, reason="API tests are wasteful to run on every commit."
 )
 def test_solver(dummy_recorder, anthropic_solver):
     """
@@ -82,14 +83,14 @@ def test_message_format():
         MessageParam(
             role="user",
             content=[
-                ContentBlock(text="What is 2 + 2?", type="text"),
-                ContentBlock(text="reason step by step", type="text"),
+                TextBlock(text="What is 2 + 2?", type="text"),
+                TextBlock(text="reason step by step", type="text"),
             ],
         ),
         MessageParam(
             role="assistant",
             content=[
-                ContentBlock(
+                TextBlock(
                     text="I don't need to reason for this, 2+2 is just 4", type="text"
                 ),
             ],
@@ -97,7 +98,7 @@ def test_message_format():
         MessageParam(
             role="user",
             content=[
-                ContentBlock(
+                TextBlock(
                     text="now, given your reasoning, provide the answer", type="text"
                 ),
             ],
diff --git a/evals/solvers/providers/google/gemini_solver_test.py b/evals/solvers/providers/google/gemini_solver_test.py
index 9586c5f8f2..466297cd4a 100644
--- a/evals/solvers/providers/google/gemini_solver_test.py
+++ b/evals/solvers/providers/google/gemini_solver_test.py
@@ -7,6 +7,7 @@
 from evals.task_state import Message, TaskState
 
 IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+MISSING_GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") in {None, ""}
 
 MODEL_NAME = "gemini-pro"
 
@@ -26,7 +27,7 @@ def gemini_solver():
     return solver
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_GOOGLE_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_solver(dummy_recorder, gemini_solver):
     """
     Test that the solver generates a response coherent with the message history
diff --git a/evals/solvers/providers/openai/openai_assistants_solver.py b/evals/solvers/providers/openai/openai_assistants_solver.py
index eddfb952eb..509170e351 100644
--- a/evals/solvers/providers/openai/openai_assistants_solver.py
+++ b/evals/solvers/providers/openai/openai_assistants_solver.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import time
 from threading import Lock
 from typing import Any, Dict, Optional, Union
@@ -10,7 +11,6 @@
 from openai.types.beta.threads.run import Run
 
 from evals.record import record_sampling
-from evals.registry import client
 from evals.solvers.providers.openai.openai_solver import OpenAISolver
 from evals.solvers.solver import Solver, SolverResult
 from evals.task_state import Message, TaskState
@@ -65,11 +65,12 @@ def __init__(
     ):
         super().__init__(postprocessors=postprocessors)
         self.model = model
-        self.thread = thread if thread else client.beta.threads.create()
+        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+        self.thread = thread if thread else self.client.beta.threads.create()
         self.tools = tools
         if not assistant:
             file_ids = self._create_files(file_paths)
-            self.assistant = client.beta.assistants.create(
+            self.assistant = self.client.beta.assistants.create(
                 model=model,
                 name=name,
                 description=description,
@@ -94,7 +95,7 @@ def __init__(
            )
 
    def _run_assistant_retrying(self, task_state: TaskState):
        # Run Assistant on the Thread
-        run = client.beta.threads.runs.create(
+        run = self.client.beta.threads.runs.create(
            assistant_id=self.assistant.id,
            thread_id=self.thread.id,
            instructions=task_state.task_description,  # Apply task description as `instructions`
@@ -136,7 +137,7 @@ def _solve(
         last_msg_sent = None
         for idx, message in enumerate(task_state.messages[new_msgs_start_idx:]):
             user_message = self._convert_to_user_message(message)  # API only allows "user" messages
-            last_msg_sent = client.beta.threads.messages.create(
+            last_msg_sent = self.client.beta.threads.messages.create(
                 thread_id=self.thread.id,
                 role=user_message.role,
                 content=user_message.content,
@@ -147,7 +148,7 @@ def _solve(
         run = self._run_assistant_retrying(task_state)
 
         # Get Assistant response(s)
-        messages = client.beta.threads.messages.list(
+        messages = self.client.beta.threads.messages.list(
             thread_id=self.thread.id,
             order="asc",
             after=last_msg_sent.id if last_msg_sent else None,
@@ -197,7 +198,7 @@ def copy(self):
         solver_copy = self.__class__(
             model=self.model,
             assistant=self.assistant,
-            thread=client.beta.threads.create(),
+            thread=self.client.beta.threads.create(),
         )
         return solver_copy
 
@@ -207,7 +208,7 @@ def _create_file(self, file_path: str) -> str:
         if file_path in FILE_CACHE:
             return FILE_CACHE[file_path]
         try:
-            file = client.files.create(
+            file = self.client.files.create(
                 file=open(file_path, "rb"),
                 purpose="assistants",
             )
@@ -251,7 +252,7 @@ def _wait_on_run(self, run: Run, thread: Thread) -> Run:
         Function borrowed from: https://cookbook.openai.com/examples/assistants_api_overview_python
         """
         while run.status == "queued" or run.status == "in_progress":
-            run = client.beta.threads.runs.retrieve(
+            run = self.client.beta.threads.runs.retrieve(
                 thread_id=thread.id,
                 run_id=run.id,
             )
diff --git a/evals/solvers/providers/openai/openai_assistants_solver_test.py b/evals/solvers/providers/openai/openai_assistants_solver_test.py
index 7a0d6b5761..000bb365ec 100644
--- a/evals/solvers/providers/openai/openai_assistants_solver_test.py
+++ b/evals/solvers/providers/openai/openai_assistants_solver_test.py
@@ -14,6 +14,7 @@
 from evals.task_state import Message, TaskState
 
 IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"
+MISSING_OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") in {"", None}
 
 MODEL = "gpt-4-1106-preview"
 
@@ -64,7 +65,7 @@ def retrieval_solver():
     return solver
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_solver_copying(dummy_recorder, vanilla_solver):
     """
     When OpenAIAssistantsSolver is copied, the Assistant should be the same
@@ -80,7 +81,7 @@ def test_solver_copying(dummy_recorder, vanilla_solver):
         test_multiturn_conversation(dummy_recorder, solver_copy)
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_multiturn_conversation(dummy_recorder, vanilla_solver):
     """
     Test that message history of the conversation is preserved across multiple turns.
@@ -103,7 +104,7 @@ def test_multiturn_conversation(dummy_recorder, vanilla_solver):
         assert int(solver_result.output.strip()) == sum(numbers[: idx + 1])
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_code_interpreter(dummy_recorder, code_interpreter_solver):
     solver = code_interpreter_solver
 
@@ -122,7 +123,7 @@ def test_code_interpreter(dummy_recorder, code_interpreter_solver):
     assert str(round(math.sqrt(145145), 3)) in solver_result.output
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_task_description(dummy_recorder, vanilla_solver):
     solver = vanilla_solver
 
@@ -141,7 +142,7 @@ def test_task_description(dummy_recorder, vanilla_solver):
     assert solver_result.output == target_string
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_code_interpreter_file(dummy_recorder, dummy_data_file, code_interpreter_solver):
     dummy_data, tmpfile_path = dummy_data_file
     solver = code_interpreter_solver
@@ -168,7 +169,7 @@ def test_code_interpreter_file(dummy_recorder, dummy_data_file, code_interpreter
     ), f"Expected password '{dummy_data['password']}' to be in output, but got: {solver_result.output}"
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_retrieval_file(dummy_recorder, dummy_data_file, retrieval_solver):
     dummy_data, tmpfile_path = dummy_data_file
     solver = retrieval_solver
@@ -202,7 +203,7 @@ def test_retrieval_file(dummy_recorder, dummy_data_file, retrieval_solver):
     ), f"Expected password '{dummy_data['password']}' to be in output, but got: {solver_result.output}"
 
 
-@pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="API tests are wasteful to run on every commit.")
+@pytest.mark.skipif(IN_GITHUB_ACTIONS or MISSING_OPENAI_API_KEY, reason="API tests are wasteful to run on every commit.")
 def test_file_cache(dummy_recorder, dummy_data_file, retrieval_solver):
     dummy_data, tmpfile_path = dummy_data_file
     solver = retrieval_solver