# AGENTS.md - AI Assistant Guide for kaggle-benchmarks

## Project Context

`kaggle-benchmarks` is a Python library for rigorously evaluating LLMs on custom tasks using decorators, assertions, and tool-augmented interactions. **Tech Stack:** Python 3.11+, uv (package manager), pytest, ruff, mypy, Protocol Buffers.

## High-Level Architecture

This is a **library-first codebase** organized by functional concerns:

- **`src/kaggle_benchmarks/`** - Core library implementing the benchmark framework
  - Top-level modules define primitives: tasks, assertions, clients, messages, results, runs
  - Subdirectories provide specialized subsystems: `actors/` (LLM interaction), `tools/` (Python interpreter, web search), `envs/` (execution environments), `kaggle/` (platform integration), `ui/` (Panel-based interfaces)
- **`tests/`** - Pytest test suite mirroring `src/` structure
- **`research_benchmarks/`** - Reference implementations of academic benchmarks (MathVista, SimpleQA)
- **`documentation/`** - Quarto-based docs with executable examples in `examples/`
- **`protos/`** - Protocol Buffer schemas for serialization
- **`cicd/`** - Docker and CI/CD scripts

**Mental Model:** Users write decorated functions (`@kbench.task`) that prompt LLMs and assert outputs. The library handles orchestration, caching, serialization, and UI rendering.

## Sources of Truth (The Map)

### Configuration & Environment
- **Environment variables** → Read `.env` file format in `README.md`
- **Execution modes** → `src/kaggle_benchmarks/_config.py` defines `ExecutionMode` enum and `Config` dataclass
- **Package metadata** → `pyproject.toml` (dependencies, version, tool configs)
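
A hedged sketch of the `ExecutionMode`/`Config` pattern named above. The member names mirror the execution environments listed under `envs/` (local, docker), but the exact members and fields are assumptions; check `src/kaggle_benchmarks/_config.py` for the real definitions:

```python
import enum
from dataclasses import dataclass

class ExecutionMode(enum.Enum):
    # Member names assumed from the envs/ subsystem (local, docker)
    LOCAL = "local"
    DOCKER = "docker"

@dataclass
class Config:
    # Field names are hypothetical, for illustration only
    mode: ExecutionMode = ExecutionMode.LOCAL
    cache_dir: str = ".kbench_cache"

cfg = Config(mode=ExecutionMode.DOCKER)
```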

### Core API Surface
- **Public exports** → `src/kaggle_benchmarks/__init__.py` defines what users import
- **Task/benchmark decorators** → `src/kaggle_benchmarks/tasks.py` (`@task`, `@benchmark`)
- **Assertions** → `src/kaggle_benchmarks/assertions.py` (all `assert_*` functions)
- **LLM clients** → `src/kaggle_benchmarks/clients.py` (client abstraction and resolution)
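
The `assert_*` convention can be illustrated with a self-contained helper. `assert_contains` is a hypothetical name chosen for this sketch and is not necessarily present in `assertions.py`; the pattern shown (raise `AssertionError` with a descriptive message) is the convention being named:

```python
def assert_contains(haystack: str, needle: str) -> None:
    """Fail with a descriptive message when `needle` is absent from model output."""
    if needle not in haystack:
        raise AssertionError(f"expected {needle!r} in model output {haystack!r}")

assert_contains("The answer is 42.", "42")  # passes silently
```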

### Subsystems
- **Actor system** → `src/kaggle_benchmarks/actors/` (LLMChat, Actor base classes)
- **Tools** → `src/kaggle_benchmarks/tools/` (Python interpreter, web search)
- **Execution environments** → `src/kaggle_benchmarks/envs/` (local, docker)
- **Kaggle integration** → `src/kaggle_benchmarks/kaggle/` (model loading, serialization)
- **UI components** → `src/kaggle_benchmarks/ui/` (Panel-based rendering)

### Testing & Quality
- **Test fixtures** → `tests/conftest.py`
- **Pre-commit hooks** → `.pre-commit-config.yaml` (ruff, addlicense)
- **Type checking config** → `pyproject.toml` `[tool.mypy]` section

### Documentation
- **User-facing guides** → `documentation/quick_start.qmd`, `documentation/user_guide.qmd`
- **Example code** → `documentation/examples/*.py`

## Critical Implementation Rules

1. **Never manually edit generated files** - Files matching `**/*_pb2.py` are auto-generated from `protos/`. Run `cd protos && ./build.sh` to regenerate.
2. **Use `uv` for all dependency operations** - Never use `pip` directly. Commands: `uv pip install ...`, `uv run --group <group> <command>`. Dependency groups are defined in `pyproject.toml`.

## Operational Commands

### Setup
```bash
# Create virtual environment and install
uv venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate
uv pip install -e .

# Install dev dependencies
uv pip install -e ".[dev]"
```

### Testing
```bash
# Run all tests
uv run --group test pytest tests

# Run specific test file
uv run --group test pytest tests/test_assertions.py

# Run with verbose output
uv run --group test pytest tests -v
```

### Code Quality
```bash
# Format code
ruff format .

# Lint and auto-fix
ruff check --fix .

# Type check
mypy src/

# Run all pre-commit hooks
pre-commit run --all-files
```

### Protocol Buffers
```bash
# Rebuild protobuf definitions (required after editing protos/)
cd protos && ./build.sh
```

### Docker
```bash
# Build, run, or start Jupyter
cd cicd
./build.sh    # Build image
./run.sh      # Run container
./jupyter.sh  # Start Jupyter server
```