diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..52e2f81 --- /dev/null +++ b/.gitignore @@ -0,0 +1,606 @@ +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### VirtualEnv template +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json + +### VisualStudio template +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +### JupyterNotebooks template +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +.ipynb_checkpoints +*/.ipynb_checkpoints/* + +# IPython +profile_default/ +ipython_config.py + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +.idea \ No newline at end of file diff --git a/README.md b/README.md index 3cfd22f..705ee5c 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,11 @@ $ git clone https://github.com/openai/human-eval $ pip install -e human-eval ``` +Or use pip directly to install from repository: +``` +$ pip install git+https://github.com/openai/human-eval.git@master#egg=human-eval +``` + ## Usage **This program exists to run untrusted model-generated code. Users are strongly diff --git a/human_eval/evaluation.py b/human_eval/evaluation.py index 9ce96df..788a7b6 100644 --- a/human_eval/evaluation.py +++ b/human_eval/evaluation.py @@ -42,6 +42,7 @@ def evaluate_functional_correctness( n_workers: int = 4, timeout: float = 3.0, problem_file: str = HUMAN_EVAL, + ignore_incomplete: bool = False ): """ Evaluates the functional correctness of generated samples, and writes @@ -68,7 +69,8 @@ def evaluate_functional_correctness( completion_id[task_id] += 1 n_samples += 1 - assert len(completion_id) == len(problems), "Some problems are not attempted." + if not ignore_incomplete: + assert len(completion_id) == len(problems), "Some problems are not attempted." print("Running test suites...") for future in tqdm.tqdm(as_completed(futures), total=len(futures)): diff --git a/human_eval/execution.py b/human_eval/execution.py index bc509f5..2ea9f74 100644 --- a/human_eval/execution.py +++ b/human_eval/execution.py @@ -1,13 +1,49 @@ -from typing import Optional, Callable, Dict -import ast +from typing import Optional, Dict, Any, List import contextlib import faulthandler import io import os -import multiprocessing import platform import signal import tempfile +from multiprocessing import Manager, Process + + +def unsafe_execute(problem: Dict[str, Any], completion: str, timeout: float, result: List): + with create_tempdir(): + + # These system calls are needed when cleaning up tempdir. + import os + import shutil + rmtree = shutil.rmtree + rmdir = os.rmdir + chdir = os.chdir + + # Disable functionalities that can make destructive changes to the test. + reliability_guard() + + # Construct the check program and run it. + check_program = ( + problem["prompt"] + completion + "\n" + + problem["test"] + "\n" + + f"check({problem['entry_point']})" + ) + + try: + exec_globals = {} + with swallow_io(): + with time_limit(timeout): + exec(check_program, exec_globals) + result.append("passed") + except TimeoutException: + result.append("timed out") + except BaseException as e: + result.append(f"failed: {e}") + + # Needed for cleaning up. + shutil.rmtree = rmtree + os.rmdir = rmdir + os.chdir = chdir def check_correctness(problem: Dict, completion: str, timeout: float, @@ -19,78 +55,31 @@ def check_correctness(problem: Dict, completion: str, timeout: float, :param completion_id: an optional completion ID so we can match the results later even if execution finishes asynchronously. """ + with Manager() as manager: + result = manager.list() - def unsafe_execute(): - - with create_tempdir(): - - # These system calls are needed when cleaning up tempdir. - import os - import shutil - rmtree = shutil.rmtree - rmdir = os.rmdir - chdir = os.chdir + p = Process(target=unsafe_execute, args=(problem, completion, timeout, result)) + p.start() + p.join(timeout=timeout + 1) + if p.is_alive(): + p.kill() - # Disable functionalities that can make destructive changes to the test. - reliability_guard() + if not result: + result.append("timed out") - # Construct the check program and run it. - check_program = ( - problem["prompt"] + completion + "\n" + - problem["test"] + "\n" + - f"check({problem['entry_point']})" - ) - - try: - exec_globals = {} - with swallow_io(): - with time_limit(timeout): -# WARNING -# This program exists to execute untrusted model-generated code. Although -# it is highly unlikely that model-generated code will do something overtly -# malicious in response to this test suite, model-generated code may act -# destructively due to a lack of model capability or alignment. -# Users are strongly encouraged to sandbox this evaluation suite so that it -# does not perform destructive actions on their host or network. For more -# information on how OpenAI sandboxes its code, see the accompanying paper. -# Once you have read this disclaimer and taken appropriate precautions, -# uncomment the following line and proceed at your own risk: -# exec(check_program, exec_globals) - result.append("passed") - except TimeoutException: - result.append("timed out") - except BaseException as e: - result.append(f"failed: {e}") - - # Needed for cleaning up. - shutil.rmtree = rmtree - os.rmdir = rmdir - os.chdir = chdir - - manager = multiprocessing.Manager() - result = manager.list() - - p = multiprocessing.Process(target=unsafe_execute) - p.start() - p.join(timeout=timeout + 1) - if p.is_alive(): - p.kill() - - if not result: - result.append("timed out") - - return dict( - task_id=problem["task_id"], - passed=result[0] == "passed", - result=result[0], - completion_id=completion_id, - ) + return dict( + task_id=problem["task_id"], + passed=result[0] == "passed", + result=result[0], + completion_id=completion_id, + ) @contextlib.contextmanager def time_limit(seconds: float): def signal_handler(signum, frame): raise TimeoutException("Timed out!") + signal.setitimer(signal.ITIMER_REAL, seconds) signal.signal(signal.SIGALRM, signal_handler) try: diff --git a/setup.py b/setup.py index 600c47d..72e4c69 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="human-eval", py_modules=["human-eval"], - version="1.0", + version="1.0.2", description="", author="OpenAI", packages=find_packages(), @@ -19,7 +19,7 @@ ], entry_points={ "console_scripts": [ - "evaluate_functional_correctness = human_eval.evaluate_functional_correctness", + "evaluate_functional_correctness = human_eval.evaluate_functional_correctness:main", ] } )