Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
9a9400b
[just testing] GPU CI
ebsmothers Oct 7, 2025
3f02302
change repo owner
ebsmothers Oct 7, 2025
636afc4
+extra
ebsmothers Oct 7, 2025
14a3eac
install forge duh
ebsmothers Oct 7, 2025
a472be0
Merge branch 'main' into gpu-ci
ebsmothers Oct 7, 2025
0a4a258
[wip] install script updates
ebsmothers Oct 7, 2025
d47c067
some changes
ebsmothers Oct 8, 2025
6aaee8f
Merge branch 'main' into gpu-ci
ebsmothers Oct 10, 2025
a5c23b0
finally got something (mostly) working locally
ebsmothers Oct 10, 2025
cdddb77
torchstore, shared lib paths
ebsmothers Oct 10, 2025
441ab6e
install titan main
ebsmothers Oct 10, 2025
89beed0
no ssh
ebsmothers Oct 10, 2025
17bed82
fix typo
ebsmothers Oct 10, 2025
438d96c
debug changes
ebsmothers Oct 10, 2025
5642841
more debug
ebsmothers Oct 10, 2025
7816b39
more debug
ebsmothers Oct 10, 2025
b21490b
typo
ebsmothers Oct 10, 2025
d21e639
could it be
ebsmothers Oct 10, 2025
6d7deb3
more debug
ebsmothers Oct 10, 2025
4c9f92e
more debug
ebsmothers Oct 10, 2025
2d5e679
more debug
ebsmothers Oct 10, 2025
f69da06
are we actually not even activating conda env
ebsmothers Oct 10, 2025
9a9d96e
fix path
ebsmothers Oct 10, 2025
4c0f1a2
hardcode something but idk why
ebsmothers Oct 10, 2025
dc58179
idk one more try
ebsmothers Oct 10, 2025
7eb9225
back to old path
ebsmothers Oct 10, 2025
ef09ac4
add print statement
ebsmothers Oct 10, 2025
87df3fd
oh boy
ebsmothers Oct 10, 2025
0bbddb0
my god it works. cleanup
ebsmothers Oct 10, 2025
a908e32
delete file, explain this abomination
ebsmothers Oct 10, 2025
a701472
leave install.sh as is for this PR
ebsmothers Oct 10, 2025
4ea74e8
Revert "leave install.sh as is for this PR"
ebsmothers Oct 11, 2025
a1016dd
Revert "delete file, explain this abomination"
ebsmothers Oct 11, 2025
b527d13
Revert "my god it works. cleanup"
ebsmothers Oct 11, 2025
b7ce6c2
Reapply "my god it works. cleanup"
ebsmothers Oct 11, 2025
ca6b768
Reapply "delete file, explain this abomination"
ebsmothers Oct 11, 2025
6ac48b3
Reapply "leave install.sh as is for this PR"
ebsmothers Oct 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 147 additions & 0 deletions .github/packaging/vllm_reqs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# These requirements were generated by running steps 1-3 of scripts/build_wheels.sh
# then running pip freeze and manually removing the vllm dependency.
# The intention of this file is to use these known requirements for a fixed
# vLLM build to supplement a vLLM install from download.pytorch.org without
# resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find
# vLLM dependencies (as this results in a ResolutionTooDeep error from pip).
# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach.
# TODO: this should be done way less hackily
aiohappyeyeballs==2.6.1
aiohttp==3.13.0
aiosignal==1.4.0
annotated-types==0.7.0
anyio==4.11.0
astor==0.8.1
async-timeout==5.0.1
attrs==25.4.0
blake3==1.0.7
cachetools==6.2.0
cbor2==5.7.0
certifi==2025.10.5
cffi==2.0.0
charset-normalizer==3.4.3
click==8.3.0
cloudpickle==3.1.1
cmake==4.1.0
compressed-tensors==0.10.2
cupy-cuda12x==13.6.0
depyf==0.19.0
dill==0.4.0
diskcache==5.6.3
distro==1.9.0
dnspython==2.8.0
einops==0.8.1
email-validator==2.3.0
exceptiongroup==1.3.0
fastapi==0.118.3
fastapi-cli==0.0.13
fastapi-cloud-cli==0.3.1
fastrlock==0.8.3
filelock==3.19.1
frozenlist==1.8.0
fsspec==2025.9.0
gguf==0.17.1
h11==0.16.0
hf-xet==1.1.10
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
huggingface-hub==0.35.3
idna==3.10
interegular==0.3.3
Jinja2==3.1.6
jiter==0.11.0
jsonschema==4.25.1
jsonschema-specifications==2025.9.1
lark==1.2.2
llguidance==0.7.30
llvmlite==0.44.0
lm-format-enforcer==0.10.12
markdown-it-py==4.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mistral_common==1.8.5
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.19.0
multidict==6.7.0
networkx==3.4.2
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.9.1.4
nvidia-cuda-cupti-cu12==12.9.79
nvidia-cuda-nvrtc-cu12==12.9.86
nvidia-cuda-runtime-cu12==12.9.79
nvidia-cudnn-cu12==9.10.2.21
nvidia-cufft-cu12==11.4.1.4
nvidia-cufile-cu12==1.14.1.1
nvidia-curand-cu12==10.3.10.19
nvidia-cusolver-cu12==11.7.5.82
nvidia-cusparse-cu12==12.5.10.65
nvidia-cusparselt-cu12==0.7.1
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.9.86
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.9.79
openai==1.90.0
opencv-python-headless==4.12.0.88
outlines_core==0.2.10
packaging==25.0
partial-json-parser==0.2.1.1.post6
pillow==11.3.0
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.23.1
propcache==0.4.1
protobuf==6.32.1
psutil==7.1.0
py-cpuinfo==9.0.0
pybase64==1.4.2
pycountry==24.6.1
pycparser==2.23
pydantic==2.12.0
pydantic-extra-types==2.10.6
pydantic_core==2.41.1
Pygments==2.19.2
python-dotenv==1.1.1
python-json-logger==4.0.0
python-multipart==0.0.20
pytorch-triton==3.4.0+gitf7888497
PyYAML==6.0.3
pyzmq==27.1.0
ray==2.49.2
referencing==0.36.2
regex==2025.9.18
requests==2.32.5
rich==14.2.0
rich-toolkit==0.15.1
rignore==0.7.0
rpds-py==0.27.1
safetensors==0.6.2
scipy==1.15.3
sentencepiece==0.2.1
sentry-sdk==2.41.0
setuptools-scm==9.2.0
shellingham==1.5.4
sniffio==1.3.1
soundfile==0.13.1
soxr==1.0.0
starlette==0.48.0
sympy==1.14.0
tiktoken==0.12.0
tokenizers==0.22.1
tomli==2.3.0
torch==2.9.0.dev20250905+cu129
tqdm==4.67.1
transformers==4.57.0
triton==3.4.0
typer==0.19.2
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.5.0
uvicorn==0.37.0
uvloop==0.21.0
watchfiles==1.1.0
websockets==15.0.1
xgrammar==0.1.21
yarl==1.22.0
67 changes: 67 additions & 0 deletions .github/workflows/gpu_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Display name of this workflow.
name: GPU tests

# Triggers: a nightly schedule, pushes to main, every pull request, and
# manual dispatch.
on:
  schedule:
    # Runs at midnight every day (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * *'
  push:
    branches:
      - main
  pull_request:
  workflow_dispatch:

# On refs/heads/main the group key ends in the run number, so each run gets
# its own group and never cancels another. On any other ref the key ends in
# the ref itself, so a newer run cancels the in-progress run for that ref.
concurrency:
  group: gpu-test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

# Token scopes granted to every job in this workflow.
permissions:
  id-token: write
  contents: read

# Default shell for every `run:` step.
defaults:
  run:
    # -l: login shell; -e: abort on the first failing command;
    # -o pipefail: a pipeline fails if any of its stages fails.
    shell: bash -l -eo pipefail {0}

jobs:
  # Builds the forge environment in a conda env named "test" and runs the unit
  # tests with coverage on a GPU runner.
  gpu_test:
    # Only run when the repository owner is meta-pytorch.
    if: github.repository_owner == 'meta-pytorch'
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so YAML keeps the version as the string "3.10", not the float 3.1.
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v4
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      - name: Update pip
        run: python -m pip install --upgrade pip
      - name: Install pinned torch nightly
        run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129
      - name: Download and install vLLM and its dependencies
        # TODO: this honestly could not be hackier if I tried
        # NOTE(review): a reviewer remarked "this seems stable" on this step.
        # The pinned requirements are installed first so that the vLLM wheel can
        # then be installed from the forge preview index alone.
        run: |
          python -m pip install -r .github/packaging/vllm_reqs.txt
          python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge
      - name: Download and install monarch and its dependencies
        run: |
          python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt
          python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge
      - name: Install torchtitan and torchstore
        run: |
          python -m pip install git+https://github.com/pytorch/torchtitan.git
          python -m pip install git+https://github.com/meta-pytorch/torchstore.git
      - name: Install dependencies
        run: python -m pip install --no-build-isolation -e ".[dev]"
      - name: Run unit tests with coverage
        # TODO add all tests
        env:
          # Taken from the matrix so the libpython file name below follows the
          # interpreter version instead of being hard-coded.
          PYTHON_VERSION: ${{ matrix.python-version }}
        run: |
          # lib directory of the conda env created by "Setup conda env"
          # (activate-environment: test).
          ENV_LIB="$CONDA/envs/test/lib"
          # Preload the env's libpython shared object into the test process.
          export LD_PRELOAD="$ENV_LIB/libpython${PYTHON_VERSION}.so.1.0"
          # LD_LIBRARY_PATH is a colon-separated list of directories, so it must
          # name the lib directory rather than the .so file. Any value already
          # set is kept after it.
          export LD_LIBRARY_PATH="$ENV_LIB${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
          pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv
      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3
Loading