Skip to content

Commit 0d75928

Browse files
kaggleteamdolaameng
authored and committed
Initial Release
0 parents  commit 0d75928

File tree

143 files changed

+23607
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+23607
-0
lines changed

.devcontainer/Dockerfile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright 2025 Kaggle Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bullseye
16+
17+
RUN curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v32.1/protoc-32.1-linux-x86_64.zip && \
18+
unzip protoc-32.1-linux-x86_64.zip -d /usr && \
19+
rm protoc-32.1-linux-x86_64.zip
20+
21+
22+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
23+
24+
25+
RUN pip install playwright && \
26+
playwright install chromium --only-shell --with-deps

.devcontainer/devcontainer.json

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
"name": "Python 3",
3+
"build": {
4+
"dockerfile": "Dockerfile",
5+
"options": [
6+
"--network=host"
7+
],
8+
},
9+
"runArgs": [
10+
"--network=host"
11+
],
12+
"features": {
13+
"ghcr.io/va-h/devcontainers-features/uv:1": {
14+
"shellautocompletion": true,
15+
"version": "latest"
16+
},
17+
"ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": {
18+
"installChromium": true
19+
},
20+
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
21+
},
22+
"customizations": {
23+
"vscode": {
24+
"extensions": [
25+
"ms-python.python",
26+
"charliermarsh.ruff",
27+
"tamasfe.even-better-toml",
28+
"ms-toolsai.jupyter",
29+
"ms-python.vscode-pylance",
30+
"zxh404.vscode-proto3",
31+
"google.geminicodeassist",
32+
"GitHub.vscode-pull-request-github",
33+
"ms-vscode.live-server",
34+
"quarto.quarto",
35+
]
36+
}
37+
},
38+
"postCreateCommand": "uv sync && playwright install --with-deps",
39+
"postStartCommand": "uv run pre-commit install",
40+
"mounts": [
41+
"source=${localEnv:HOME}/.ssh,target=/home/vscode/.ssh,type=bind,consistency=cached",
42+
],
43+
}

.dockerignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
.git
2+
cicd
3+
.Trash-0
4+
.env
5+
notebooks/
6+
**/*.pyc
7+
.venv/
8+
.cache

.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
__pycache__
2+
.ipynb_checkpoints
3+
.Trash-0
4+
.env
5+
.venv
6+
.*cache
7+
**/*task.json
8+
**/*run.json
9+
dev/
10+
_site/
11+
site_libs/
12+
docs/

.pre-commit-config.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Copyright 2025 Kaggle Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
repos:
16+
- repo: https://github.com/astral-sh/ruff-pre-commit
17+
rev: v0.11.4
18+
hooks:
19+
- id: ruff
20+
args: [ --fix ]
21+
- id: ruff-format
22+
- repo: https://github.com/google/addlicense
23+
rev: v1.2.0
24+
hooks:
25+
- id: addlicense
26+
args: [ "-c", "Kaggle Inc.", "-l", "apache", "*.py" ]
27+
exclude: ^src/kaggle_benchmarks/kaggle/benchmark_types_pb2\.py$

.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.11

.vscode/settings.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"[jsonc]": {
3+
"editor.defaultFormatter": "vscode.json-language-features",
4+
"editor.tabSize": 2,
5+
},
6+
"[python]": {
7+
"editor.codeActionsOnSave": {
8+
"source.organizeImports": "explicit"
9+
},
10+
"editor.defaultFormatter": "charliermarsh.ruff"
11+
},
12+
"[toml]": {
13+
"editor.defaultFormatter": "tamasfe.even-better-toml",
14+
"editor.tabSize": 2
15+
},
16+
"editor.detectIndentation": false,
17+
"editor.formatOnPaste": true,
18+
"editor.formatOnSave": true,
19+
"files.autoSave": "onFocusChange",
20+
"files.insertFinalNewline": true,
21+
"files.trimFinalNewlines": true,
22+
"files.trimTrailingWhitespace": true,
23+
"git.autofetch": true,
24+
"git.pruneOnFetch": true,
25+
"notebook.formatOnSave.enabled": true,
26+
"python.defaultInterpreterPath": ".venv/bin/python",
27+
"python.terminal.activateEnvironment": true,
28+
"python.testing.pytestArgs": [
29+
"tests"
30+
],
31+
"python.testing.pytestEnabled": true,
32+
"python.testing.unittestEnabled": false,
33+
"python.analysis.typeCheckingMode": "standard",
34+
}

AGENTS.md

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# AGENTS.md - AI Assistant Guide for kaggle-benchmarks
2+
3+
## Project Context
4+
5+
`kaggle-benchmarks` is a Python library for rigorously evaluating LLMs on custom tasks using decorators, assertions, and tool-augmented interactions. **Tech Stack:** Python 3.11+, uv (package manager), pytest, ruff, mypy, Protocol Buffers.
6+
7+
## High-Level Architecture
8+
9+
This is a **library-first codebase** organized by functional concerns:
10+
11+
- **`src/kaggle_benchmarks/`** - Core library implementing the benchmark framework
12+
- Top-level modules define primitives: tasks, assertions, clients, messages, results, runs
13+
- Subdirectories provide specialized subsystems: `actors/` (LLM interaction), `tools/` (Python interpreter, web search), `envs/` (execution environments), `kaggle/` (platform integration), `ui/` (Panel-based interfaces)
14+
- **`tests/`** - Pytest test suite mirroring `src/` structure
15+
- **`research_benchmarks/`** - Reference implementations of academic benchmarks (MathVista, SimpleQA)
16+
- **`documentation/`** - Quarto-based docs with executable examples in `examples/`
17+
- **`protos/`** - Protocol Buffer schemas for serialization
18+
- **`cicd/`** - Docker and CI/CD scripts
19+
20+
**Mental Model:** Users write decorated functions (`@kbench.task`) that prompt LLMs and assert outputs. The library handles orchestration, caching, serialization, and UI rendering.
21+
22+
## Sources of Truth (The Map)
23+
24+
### Configuration & Environment
25+
- **Environment variables** → Read `.env` file format in `README.md`
26+
- **Execution modes** → `src/kaggle_benchmarks/_config.py` defines `ExecutionMode` enum and `Config` dataclass
27+
- **Package metadata** → `pyproject.toml` (dependencies, version, tool configs)
28+
29+
### Core API Surface
30+
- **Public exports** → `src/kaggle_benchmarks/__init__.py` defines what users import
31+
- **Task/benchmark decorators** → `src/kaggle_benchmarks/tasks.py` (`@task`, `@benchmark`)
32+
- **Assertions** → `src/kaggle_benchmarks/assertions.py` (all `assert_*` functions)
33+
- **LLM clients** → `src/kaggle_benchmarks/clients.py` (client abstraction and resolution)
34+
35+
### Subsystems
36+
- **Actor system** → `src/kaggle_benchmarks/actors/` (LLMChat, Actor base classes)
37+
- **Tools** → `src/kaggle_benchmarks/tools/` (Python interpreter, web search)
38+
- **Execution environments** → `src/kaggle_benchmarks/envs/` (local, docker)
39+
- **Kaggle integration** → `src/kaggle_benchmarks/kaggle/` (model loading, serialization)
40+
- **UI components** → `src/kaggle_benchmarks/ui/` (Panel-based rendering)
41+
42+
### Testing & Quality
43+
- **Test fixtures** → `tests/conftest.py`
44+
- **Pre-commit hooks** → `.pre-commit-config.yaml` (ruff, addlicense)
45+
- **Type checking config** → `pyproject.toml` `[tool.mypy]` section
46+
47+
### Documentation
48+
- **User-facing guides** → `documentation/quick_start.qmd`, `documentation/user_guide.qmd`
49+
- **Example code** → `documentation/examples/*.py`
50+
51+
## Critical Implementation Rules
52+
53+
1. **Never manually edit generated files** - Files matching `**/*_pb2.py` are auto-generated from `protos/`. Run `cd protos && ./build.sh` to regenerate.
54+
55+
2. **Use `uv` for all dependency operations** - Not `pip`. Commands: `uv pip install`, `uv run --group <group> <command>`. Dependency groups defined in `pyproject.toml` lines 29-72.
56+
57+
## Operational Commands
58+
59+
### Setup
60+
```bash
61+
# Create virtual environment and install
62+
uv venv
63+
source .venv/bin/activate # Windows: .venv\Scripts\activate
64+
uv pip install -e .
65+
66+
# Install dev dependencies
67+
uv pip install -e ".[dev]"
68+
```
69+
70+
### Testing
71+
```bash
72+
# Run all tests
73+
uv run --group test pytest tests
74+
75+
# Run specific test file
76+
uv run --group test pytest tests/test_assertions.py
77+
78+
# Run with verbose output
79+
uv run --group test pytest tests -v
80+
```
81+
82+
### Code Quality
83+
```bash
84+
# Format code
85+
ruff format .
86+
87+
# Lint and auto-fix
88+
ruff check --fix .
89+
90+
# Type check
91+
mypy src/
92+
93+
# Run all pre-commit hooks
94+
pre-commit run --all-files
95+
```
96+
97+
### Protocol Buffers
98+
```bash
99+
# Rebuild protobuf definitions (required after editing protos/)
100+
cd protos && ./build.sh
101+
```
102+
103+
### Docker
104+
```bash
105+
# Build, run, or start Jupyter
106+
cd cicd
107+
./build.sh # Build image
108+
./run.sh # Run container
109+
./jupyter.sh # Start Jupyter server
110+
```

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Changelog
2+
3+
## Next Release
4+
5+
## v0.2.0 (Nov 19th, 2025)
6+
7+
* Initial public release.
8+
* Core implementation.
9+
* Basic documentation and usage examples.
10+
* CI/CD pipeline for publishing to PyPI.

CONTRIBUTING.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# How to contribute
2+
3+
We'd love to accept your patches and contributions to this project.
4+
5+
## Before you begin
6+
7+
### Sign our Contributor License Agreement
8+
9+
Contributions to this project must be accompanied by a
10+
[Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
11+
You (or your employer) retain the copyright to your contribution; this simply
12+
gives us permission to use and redistribute your contributions as part of the
13+
project.
14+
15+
If you or your current employer have already signed the Google CLA (even if it
16+
was for a different project), you probably don't need to do it again.
17+
18+
Visit <https://cla.developers.google.com/> to see your current agreements or to
19+
sign a new one.
20+
21+
### Review our community guidelines
22+
23+
This project follows
24+
[Google's Open Source Community Guidelines](https://opensource.google/conduct/).
25+
26+
## Contribution process
27+
28+
### Code reviews
29+
30+
All submissions, including submissions by project members, require review. We
31+
use GitHub pull requests for this purpose. Consult
32+
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
33+
information on using pull requests.

0 commit comments

Comments
 (0)