Zilinghan/hf #32

Merged
merged 5 commits into from
Feb 17, 2025
23 changes: 14 additions & 9 deletions README.md
@@ -7,6 +7,8 @@ This repo contains the evaluation code for the paper "[SciCode: A Research Codin

## 🔔News

**[2025-02-17]: SciCode benchmark is available at [HuggingFace Datasets](https://huggingface.co/datasets/SciCode1/SciCode)!**
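
For a quick check that the HuggingFace release loads, a minimal sketch (assuming the `datasets` package is installed; the dataset ID and split names below are the ones used by this repository's evaluation code):

```bash
# Load the SciCode test split from HuggingFace and print the number of problems
python -c "from datasets import load_dataset; print(len(load_dataset('SciCode1/SciCode', split='test')))"
```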

**[2025-02-01]: Results for DeepSeek-R1, DeepSeek-V3, and OpenAI o3-mini are added.**

**[2025-01-24]: SciCode has been integrated with [`inspect_ai`](https://inspect.ai-safety-institute.org.uk/) for easier and faster model evaluations.**
@@ -54,27 +56,30 @@ SciCode sources challenging and realistic research-level coding problems across
| Mixtral-8x22B-Instruct | <div align="center">**0.0**</div> | <div align="center" style="color:grey">16.3</div> |
| Llama-3-70B-Chat | <div align="center">**0.0**</div> | <div align="center" style="color:grey">14.6</div> |

## Instructions to evaluate a new model using `inspect_ai` (recommended)


## Instructions to evaluate a new model
SciCode has been integrated with `inspect_ai` for easier and faster model evaluation. Run the following steps:

1. Clone this repository `git clone [email protected]:scicode-bench/SciCode.git`
2. Install the `scicode` package with `pip install -e .`
3. Download the [numeric test results](https://drive.google.com/drive/folders/1W5GZW6_bdiDAiipuFMqdUhvUaHIj6-pR?usp=drive_link) and save them as `./eval/data/test_data.h5`
4. Run `eval/scripts/gencode_json.py` to generate new model outputs (see the [`eval/scripts` readme](eval/scripts/) for more information)
5. Run `eval/scripts/test_generated_code.py` to evaluate the generated code against the unit tests


## Instructions to evaluate a new model using `inspect_ai` (recommended)

SciCode has been integrated with `inspect_ai` for easier and faster model evaluation than the methods above. You need to run the first three steps in the [above section](#instructions-to-evaluate-a-new-model), then go to the `eval/inspect_ai` directory, set up the corresponding API key, and run the following command:
4. Go to the `eval/inspect_ai` directory, set up the corresponding API key, and run the following command:

```bash
cd eval/inspect_ai
export OPENAI_API_KEY=your-openai-api-key
inspect eval scicode.py --model openai/gpt-4o --temperature 0
```

For more detailed information on using `inspect_ai`, see the [`eval/inspect_ai` readme](eval/inspect_ai/)
💡 For more detailed information on using `inspect_ai`, see the [`eval/inspect_ai` readme](eval/inspect_ai/)

## Instructions to evaluate a new model in two steps (deprecated)

Note that this is a deprecated way of evaluating models; using `inspect_ai` is the recommended approach. Please use this method only if `inspect_ai` does not work for your needs. You need to run the first three steps in the above section, then run the following two commands (a combined example follows the list):

4. Run `eval/scripts/gencode.py` to generate new model outputs (see the [`eval/scripts` readme](eval/scripts/) for more information)
5. Run `eval/scripts/test_generated_code.py` to evaluate the generated code against the unit tests
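
Putting the two steps together, a minimal sketch of the deprecated flow (the model name and default paths are illustrative):

```bash
# Generate model outputs, then run the unit tests on the generated code
python eval/scripts/gencode.py --model gpt-4o
python eval/scripts/test_generated_code.py --model gpt-4o
```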

## More information and FAQ

65 changes: 0 additions & 65 deletions eval/data/problems_all.jsonl

This file was deleted.

15 changes: 0 additions & 15 deletions eval/data/problems_dev.jsonl

This file was deleted.

12 changes: 6 additions & 6 deletions eval/inspect_ai/README.md
@@ -16,26 +16,26 @@ However, there are some additional command line arguments that could be useful a

- `--max-connections`: Maximum amount of API connections to the evaluated model.
- `--limit`: Limit of the number of samples to evaluate in the SciCode dataset.
- `-T input_path=<another_input_json_file>`: This is useful when the user wants to switch to another JSON dataset (e.g., the dev set).
- `-T split=validation/test`: Whether the user wants to run on the small `validation` set (15 samples) or the large `test` set (65 samples).
- `-T output_dir=<your_output_dir>`: This changes the default output directory (`./tmp`).
- `-T h5py_file=<your_h5py_file>`: This is used if your h5py file is not downloaded in the recommended directory.
- `-T with_background=True/False`: Whether to include problem background.
- `-T mode=normal/gold/dummy`: This provides two additional modes for sanity checks.
- `normal` mode is the standard mode to evaluate a model
- `gold` mode can only be used on the dev set, which loads the gold answer
- `gold` mode can only be used on the validation set, which loads the gold answer
- `dummy` mode does not call any real LLMs and generates some dummy outputs

For example, the user can run five samples on the dev set with background as follows:
For example, the user can run five samples on the validation set with background as follows:

```bash
inspect eval scicode.py \
--model openai/gpt-4o \
--temperature 0 \
--limit 5 \
-T input_path=../data/problems_dev.jsonl \
-T output_dir=./tmp/dev \
-T split=validation \
-T output_dir=./tmp/val \
-T with_background=True \
-T mode=gold
-T mode=normal
```
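
Similarly, a quick sanity check using the `dummy` mode described above might look like the following sketch (flag values are illustrative):

```bash
# Dummy mode generates placeholder outputs without calling a real LLM;
# the --model flag is still required by inspect_ai
inspect eval scicode.py --model openai/gpt-4o --limit 2 -T mode=dummy
```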

The user can run the evaluation on `DeepSeek-V3` using Together AI via the following command:
14 changes: 7 additions & 7 deletions eval/inspect_ai/scicode.py
@@ -5,7 +5,7 @@
from typing import Any
from pathlib import Path
from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset, Sample
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.solver import solver, TaskState, Generate
from inspect_ai.scorer import scorer, mean, metric, Metric, Score, Target
from scicode.parse.parse import extract_function_name, get_function_from_code
@@ -392,26 +392,26 @@ async def score(state: TaskState, target: Target):

@task
def scicode(
    input_path: str = '../data/problems_all.jsonl',
    split: str = 'test',
    output_dir: str = './tmp',
    with_background: bool = False,
    h5py_file: str = '../data/test_data.h5',
    mode: str = 'normal',
):
    dataset = json_dataset(
        input_path,
        record_to_sample

    dataset = hf_dataset(
        'SciCode1/SciCode',
        split=split,
        sample_fields=record_to_sample,
    )
    return Task(
        dataset=dataset,
        solver=scicode_solver(
            input_path=input_path,
            output_dir=output_dir,
            with_background=with_background,
            mode=mode,
        ),
        scorer=scicode_scorer(
            input_path=input_path,
            output_dir=output_dir,
            with_background=with_background,
            h5py_file=h5py_file,
12 changes: 6 additions & 6 deletions eval/scripts/README.md
@@ -23,19 +23,19 @@ TOGETHERAI_API_KEY = 'your_api_key'
To generate code using the **Together AI** model (e.g., `Meta-Llama-3.1-70B-Instruct-Turbo`), go to the root of this repo and run:

```bash
python eval/scripts/gencode_json.py --model litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
python eval/scripts/gencode.py --model litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
```

To generate code using **GPT-4o** (with default settings), go to the root of this repo and run:

```bash
python eval/scripts/gencode_json.py --model gpt-4o
python eval/scripts/gencode.py --model gpt-4o
```

If you want to include **scientist-annotated background** in the prompts, use the `--with-background` flag:

```bash
python eval/scripts/gencode_json.py --model gpt-4o --with-background
python eval/scripts/gencode.py --model gpt-4o --with-background
```

Please note that we do not plan to release the ground truth code for each problem to the public. However, we have made a dev set available that includes the ground truth code in `eval/data/problems_dev.jsonl`.
@@ -44,11 +44,11 @@ In this repository, **we only support evaluating with previously generated code

### Command-Line Arguments

When running the `gencode_json.py` script, you can use the following options:
When running the `gencode.py` script, you can use the following options (an example invocation follows the list):

- `--model`: Specifies the model name to be used for generating code (e.g., `gpt-4o` or `litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo`).
- `--split`: Specifies which problem split (either `validation` or `test`) to run on.
- `--output-dir`: Directory where the generated code outputs will be saved. Default is `eval_results/generated_code`.
- `--input-path`: Path to the JSONL file describing the problems. Default is `eval/data/problems_all.jsonl`.
- `--prompt-dir`: Directory where prompt files are saved. Default is `eval_results/prompt`.
- `--with-background`: If enabled, includes the problem background in the generated code.
- `--temperature`: Controls the randomness of the output. Default is 0.
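
For example, an illustrative invocation combining the options above (the model name and values are examples only):

```bash
# Generate code for the validation split, including scientist-annotated background
python eval/scripts/gencode.py \
    --model gpt-4o \
    --split validation \
    --with-background \
    --temperature 0
```
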
@@ -66,7 +66,7 @@ Download the [numeric test results](https://drive.google.com/drive/folders/1W5GZ
To evaluate the generated code using a specific model, go to the root of this repo and use the following command:

```bash
python eval/scripts/test_generated_code.py --model "model_name"
python eval/scripts/test_generated_code.py --model "model_name"
```

Replace `"model_name"` with the appropriate model name, and include `--with-background` if the code is generated with **scientist-annotated background**.
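
For instance, an illustrative evaluation of code generated by GPT-4o with background might look like:

```bash
# Evaluate previously generated GPT-4o outputs that were produced with --with-background
python eval/scripts/test_generated_code.py --model gpt-4o --with-background
```
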
19 changes: 10 additions & 9 deletions eval/scripts/gencode_json.py → eval/scripts/gencode.py
@@ -4,7 +4,7 @@
from scicode.parse.parse import (
    extract_function_name,
    get_function_from_code,
    read_from_jsonl
    read_from_hf_dataset,
)
from scicode.gen.models import extract_python_script, get_model_function

@@ -150,18 +150,19 @@ def get_cli() -> argparse.ArgumentParser:
    parser.add_argument(
        "--model", type=str, default="gpt-4o", help="Model name"
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        choices=["validation", "test"],
        help="Dataset split manner",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("eval_results", "generated_code"),
        help="Output directory",
    )
    parser.add_argument(
        "--input-path",
        type=Path,
        default=Path("eval", "data", "problems_all.jsonl"),
        help="Input directory",
    )
    parser.add_argument(
        "--prompt-dir",
        type=Path,
@@ -183,8 +184,8 @@ def get_cli() -> argparse.ArgumentParser:


def main(model: str,
         split: str,
         output_dir: Path,
         input_path: Path,
         prompt_dir: Path,
         with_background: bool,
         temperature: float
@@ -194,7 +195,7 @@ def main(model: str,
        prompt_dir=prompt_dir, with_background=with_background, temperature=temperature
    )
    prompt_template = BACKGOUND_PROMPT_TEMPLATE if with_background else DEFAULT_PROMPT_TEMPLATE
    data = read_from_jsonl(input_path)
    data = read_from_hf_dataset(split)
    for problem in data:
        prob_id = problem['problem_id']
        steps = len(problem['sub_steps'])
52 changes: 27 additions & 25 deletions eval/scripts/test_generated_code.py
@@ -7,7 +7,7 @@
import argparse

from scicode.parse.parse import H5PY_FILE
from scicode.parse.parse import read_from_jsonl
from scicode.parse.parse import read_from_hf_dataset


PROB_NUM = 80
@@ -20,16 +20,23 @@ def _get_background_dir(with_background):
    return "with_background" if with_background else "without_background"


def test_code(model_name, code_dir, log_dir, output_dir,
              jsonl_path, dev_set=False, with_background=False):
def test_code(
    model_name,
    split,
    code_dir,
    log_dir,
    output_dir,
    with_background=False
):

    jsonl_data = read_from_jsonl(jsonl_path)
    scicode_data = read_from_hf_dataset(split)
    scicode_data = [data for data in scicode_data]
    json_dct = {}
    json_idx = {}

    for prob_data in jsonl_data:
    for prob_data in scicode_data:
        json_dct[prob_data['problem_id']] = len(prob_data['sub_steps'])
        json_idx[prob_data['problem_id']] = jsonl_data.index(prob_data)
        json_idx[prob_data['problem_id']] = scicode_data.index(prob_data)
    start_time = time.time()

    code_dir_ = Path(code_dir, model_name, _get_background_dir(with_background))
@@ -44,7 +51,7 @@ def test_code(model_name, code_dir, log_dir, output_dir,
        file_step = file_name.split(".")[1]

        code_content = file_path.read_text(encoding='utf-8')
        json_content = jsonl_data[json_idx[file_id]]
        json_content = scicode_data[json_idx[file_id]]
        step_id = json_content["sub_steps"][int(file_step) - 1]["step_number"]
        test_lst = json_content["sub_steps"][int(file_step) - 1]["test_cases"]
        assert_file = Path(tmp_dir, f'{step_id}.py')
@@ -119,14 +126,14 @@ def run_script(script_path):
        correct_prob[i] == tot_prob[i]
        and tot_prob[i] != 0)

    print(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if dev_set else PROB_NUM - DEV_PROB_NUM}')
    print(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if dev_set else STEP_NUM}')
    print(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if (split == "validation") else PROB_NUM - DEV_PROB_NUM}')
    print(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if (split == "validation") else STEP_NUM}')

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    with open(f'{output_dir}/{model_name}_{_get_background_dir(with_background)}.txt', 'w') as f:
        f.write(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if dev_set else PROB_NUM - DEV_PROB_NUM}\n')
        f.write(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if dev_set else STEP_NUM}\n\n')
        f.write(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if (split == "validation") else PROB_NUM - DEV_PROB_NUM}\n')
        f.write(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if (split == "validation") else STEP_NUM}\n\n')
        f.write(f'duration: {test_time} seconds\n')
        f.write('\ncorrect problems: ')
        f.write(f'\n\n{[i + 1 for i in range(PROB_NUM) if correct_prob[i] == tot_prob[i] and tot_prob[i] != 0]}\n')
@@ -144,6 +151,13 @@ def get_cli() -> argparse.ArgumentParser:
    parser.add_argument(
        "--model", type=str, default="gpt-4o", help="Model name"
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        choices=["validation", "test"],
        help="Data split"
    )
    parser.add_argument(
        "--code-dir",
        type=Path,
@@ -162,17 +176,6 @@ def get_cli() -> argparse.ArgumentParser:
        default=Path("eval_results"),
        help="Eval results directory",
    )
    parser.add_argument(
        "--jsonl-path",
        type=Path,
        default=Path("eval", "data", "problems_all.jsonl"),
        help="Path to jsonl file",
    )
    parser.add_argument(
        "--dev-set",
        action='store_true',
        help="Test dev set if enabled",
    ),
    parser.add_argument(
        "--with-background",
        action="store_true",
@@ -182,17 +185,16 @@ def main(model: str,


def main(model: str,
         split: str,
         code_dir: Path,
         log_dir: Path,
         output_dir: Path,
         jsonl_path: Path,
         dev_set: bool,
         with_background: bool
) -> None:
    if not Path(H5PY_FILE).exists():
        raise FileNotFoundError("Please download the numeric test results before testing generated code.")
    model = Path(model).parts[-1]
    test_code(model, code_dir, log_dir, output_dir, jsonl_path, dev_set, with_background)
    test_code(model, split, code_dir, log_dir, output_dir, with_background)


if __name__ == "__main__":
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -32,12 +32,13 @@ dependencies = [
    "pytest",
    "pytest-cov",
    "litellm",
    "inspect-ai",
    "datasets",
    # requirements for execution
    "numpy",
    "scipy",
    "matplotlib",
    "sympy",
    "inspect-ai",
]

# Classifiers help users find your project by categorizing it.
5 changes: 5 additions & 0 deletions src/scicode/parse/parse.py
@@ -8,6 +8,7 @@
import scipy
import numpy as np
from sympy import Symbol
from datasets import load_dataset

OrderedContent = list[tuple[str, str]]

@@ -56,6 +57,10 @@ def read_from_jsonl(file_path):
            data.append(json.loads(line.strip()))
    return data

def read_from_hf_dataset(split='validation'):
    dataset = load_dataset('SciCode1/SciCode', split=split)
    return dataset

def rm_comments(string: str) -> str:
    ret_lines = []
    lines = string.split('\n')