From 9a9400bc1950933366842e7737c72930594940e3 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Tue, 7 Oct 2025 11:36:20 -0700 Subject: [PATCH 01/35] [just testing] GPU CI --- .github/workflows/gpu_test.yaml | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/gpu_test.yaml diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml new file mode 100644 index 00000000..a55e380c --- /dev/null +++ b/.github/workflows/gpu_test.yaml @@ -0,0 +1,55 @@ +name: GPU tests + +on: + schedule: + # Runs at midnight every day + - cron: '0 0 * * *' + push: + branches: [ main ] + pull_request: + workflow_dispatch: + +concurrency: + group: gpu-test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + gpu_test: + if: github.repository_owner == 'pytorch' + runs-on: linux.g5.12xlarge.nvidia.gpu + strategy: + matrix: + python-version: ['3.10'] + steps: + - name: Check out repo + uses: actions/checkout@v4 + - name: Setup conda env + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniconda-version: "latest" + activate-environment: test + python-version: ${{ matrix.python-version }} + - name: Update pip + run: python -m pip install --upgrade pip + - name: Install torch nightly + run: python -m pip install torch --extra-index-url https://download.pytorch.org/cu128 + - name: Install torchtitan and torchstore + run: python -m pip install torchtitan torchstore + - name: Download and install monarch + run: pip install monarch --index-url https://download.pytorch.org/whl/preview/forge + - name: Download and install vLLM + run: pip install vllm --index-url https://download.pytorch.org/whl/preview/forge + - name: Run unit tests with coverage + # TODO add all tests + run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 From 3f023022424c998376304c37edc128d2ef2f0225 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Tue, 7 Oct 2025 11:40:03 -0700 Subject: [PATCH 02/35] change repo owner --- .github/workflows/gpu_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index a55e380c..1c663c7c 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -23,7 +23,7 @@ defaults: jobs: gpu_test: - if: github.repository_owner == 'pytorch' + if: github.repository_owner == 'meta-pytorch' runs-on: linux.g5.12xlarge.nvidia.gpu strategy: matrix: From 636afc436e501019b7857d7cf074959a479e0b62 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Tue, 7 Oct 2025 13:16:20 -0700 Subject: [PATCH 03/35] +extra --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 1c663c7c..ace22e7a 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -45,9 +45,9 @@ jobs: - name: Install torchtitan and torchstore run: python -m pip install torchtitan torchstore - name: Download and install monarch - run: pip install monarch --index-url https://download.pytorch.org/whl/preview/forge + run: pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Download and install vLLM - run: pip install vllm --index-url https://download.pytorch.org/whl/preview/forge + run: pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From 14a3eac5325c1bb8f31084c786d488435624f236 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Tue, 7 Oct 2025 13:23:09 -0700 Subject: [PATCH 04/35] install forge duh --- .github/workflows/gpu_test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index ace22e7a..604a11dc 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -48,6 +48,8 @@ jobs: run: pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Download and install vLLM run: pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge + - name: Install dependencies + run: python -m pip install --no-build-isolation -e ".[dev]" - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From 0a4a25889de60f24c916e0b89e55873cdba774b2 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Tue, 7 Oct 2025 16:17:09 -0700 Subject: [PATCH 05/35] [wip] install script updates --- scripts/install.sh | 98 ++++++---------------------------------------- 1 file changed, 12 insertions(+), 86 deletions(-) diff --git a/scripts/install.sh b/scripts/install.sh index eb4776cf..9959fdf4 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -124,85 +124,6 @@ install_system_packages() { fi } -# Check to see if gh is installed, if not, it will be installed via conda-forge channel -check_gh_install() { - if ! command -v gh &> /dev/null; then - log_warning "GitHub CLI (gh) not found. Installing via Conda..." - conda install gh --channel conda-forge -y - log_info "GitHub CLI (gh) installed successfully." - log_info "Please run 'gh auth login' to authenticate with GitHub." - else - log_info "GitHub CLI (gh) already installed." - fi -} - -# Check wheels exist -check_wheels() { - if [ ! -d "$WHEEL_DIR" ]; then - log_error "Wheels directory not found: $WHEEL_DIR" - exit 1 - fi - - local wheel_count=$(ls -1 "$WHEEL_DIR"/*.whl 2>/dev/null | wc -l) - log_info "Found $wheel_count local wheels" -} - -# Download vLLM wheel from GitHub releases -download_vllm_wheel() { - log_info "Downloading vLLM wheel from GitHub releases..." - - # Check if gh is installed - if ! command -v gh &> /dev/null; then - log_error "GitHub CLI (gh) is required to download vLLM wheel" - log_info "Install it with: sudo dnf install gh" - log_info "Then run: gh auth login" - exit 1 - fi - - # Get the vLLM wheel filename from the release - local vllm_wheel_name - vllm_wheel_name=$(gh release view "$RELEASE_TAG" --repo "$GITHUB_REPO" --json assets --jq '.assets[] | select(.name | contains("vllm")) | .name' | head -1) - - if [ -z "$vllm_wheel_name" ]; then - log_error "Could not find vLLM wheel in release $RELEASE_TAG" - log_info "Make sure the vLLM wheel has been uploaded to the GitHub release" - exit 1 - fi - for f in assets/wheels/vllm-*; do - [ -e "$f" ] || continue # skip if glob didn't match - if [ "$(basename "$f")" != "$vllm_wheel_name" ]; then - log_info "Removing stale vLLM wheel: $(basename "$f")" - rm -f "$f" - fi - done - - local local_path="$WHEEL_DIR/$vllm_wheel_name" - - if [ -f "$local_path" ]; then - log_info "vLLM wheel already downloaded: $vllm_wheel_name" - return 0 - fi - - log_info "Downloading: $vllm_wheel_name" - - # Save current directory and change to wheel directory - local original_dir=$(pwd) - cd "$WHEEL_DIR" - gh release download "$RELEASE_TAG" --repo "$GITHUB_REPO" --pattern "*vllm*" - local download_result=$? - - # Always return to original directory - cd "$original_dir" - - if [ $download_result -eq 0 ]; then - log_info "Successfully downloaded vLLM wheel" - else - log_error "Failed to download vLLM wheel" - exit 1 - fi -} - - # Parse command line arguments parse_args() { USE_SUDO=false @@ -251,20 +172,25 @@ main() { echo "" check_conda_env - check_wheels # Install openssl as we overwrite the default version when we update LD_LIBRARY_PATH conda install -y openssl install_system_packages "$USE_SUDO" - check_gh_install - download_vllm_wheel log_info "Installing PyTorch nightly..." - pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129 + pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu128 + + log_info "Installing torchtitan and torchstore..." + pip install torchtitan torchstore + + # Install Monarch from wheel at download.pytorch.org + log_info "Downloading and installing Monarch wheel..." + pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge - log_info "Installing all wheels (local + downloaded)..." - pip install "$WHEEL_DIR"/*.whl + # Install vLLM from wheel at download.pytorch.org + log_info "Downloading and installing vLLM wheel..." + pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge log_info "Installing Forge from source..." pip install -e . @@ -287,7 +213,7 @@ main() { local cuda_activation_script="${conda_env_dir}/etc/conda/activate.d/cuda_env.sh" cat > "$cuda_activation_script" << 'EOF' # CUDA environment for Forge -export CUDA_VERSION=12.9 +export CUDA_VERSION=12.8 export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} From d47c0677b917036358b461c8e68cca0ad20e82cd Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Wed, 8 Oct 2025 09:41:56 -0700 Subject: [PATCH 06/35] some changes --- .github/workflows/gpu_test.yaml | 10 ++++++---- scripts/install.sh | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 604a11dc..83f81d39 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -41,15 +41,17 @@ jobs: - name: Update pip run: python -m pip install --upgrade pip - name: Install torch nightly - run: python -m pip install torch --extra-index-url https://download.pytorch.org/cu128 + run: python -m pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - name: Install torchtitan and torchstore run: python -m pip install torchtitan torchstore - - name: Download and install monarch - run: pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge + - name: Download and install monarch and its dependencies + run: | + pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt + pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Download and install vLLM run: pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Install dependencies - run: python -m pip install --no-build-isolation -e ".[dev]" + run: ./scripts/install.sh - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv diff --git a/scripts/install.sh b/scripts/install.sh index 9959fdf4..4e8f4655 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -162,7 +162,6 @@ main() { echo "======================" echo "" echo "Note: Run this from the root of the forge repository" - echo "This script requires GitHub CLI (gh) to download large wheels" if [ "$USE_SUDO" = "true" ]; then echo "System packages will be installed via system package manager (requires sudo)" check_sudo @@ -186,6 +185,7 @@ main() { # Install Monarch from wheel at download.pytorch.org log_info "Downloading and installing Monarch wheel..." + pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge # Install vLLM from wheel at download.pytorch.org @@ -193,7 +193,7 @@ main() { pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge log_info "Installing Forge from source..." - pip install -e . + pip install -e ".[dev]" # Set up environment log_info "Setting up environment..." From a5c23b07952fc00d6f81001d2d0616eacc8c603e Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 12:03:45 -0700 Subject: [PATCH 07/35] finally got something (mostly) working locally --- .github/packaging/vllm_reqs.txt | 139 ++++++++++++++++++++++++++++++++ .github/workflows/gpu_test.yaml | 20 ++--- 2 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 .github/packaging/vllm_reqs.txt diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt new file mode 100644 index 00000000..996dbbce --- /dev/null +++ b/.github/packaging/vllm_reqs.txt @@ -0,0 +1,139 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.13.0 +aiosignal==1.4.0 +annotated-types==0.7.0 +anyio==4.11.0 +astor==0.8.1 +async-timeout==5.0.1 +attrs==25.4.0 +blake3==1.0.7 +cachetools==6.2.0 +cbor2==5.7.0 +certifi==2025.10.5 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.3.0 +cloudpickle==3.1.1 +cmake==4.1.0 +compressed-tensors==0.10.2 +cupy-cuda12x==13.6.0 +depyf==0.19.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +einops==0.8.1 +email-validator==2.3.0 +exceptiongroup==1.3.0 +fastapi==0.118.3 +fastapi-cli==0.0.13 +fastapi-cloud-cli==0.3.1 +fastrlock==0.8.3 +filelock==3.19.1 +frozenlist==1.8.0 +fsspec==2025.9.0 +gguf==0.17.1 +h11==0.16.0 +hf-xet==1.1.10 +httpcore==1.0.9 +httptools==0.7.1 +httpx==0.28.1 +huggingface-hub==0.35.3 +idna==3.10 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.11.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lark==1.2.2 +llguidance==0.7.30 +llvmlite==0.44.0 +lm-format-enforcer==0.10.12 +markdown-it-py==4.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mistral_common==1.8.5 +mpmath==1.3.0 +msgpack==1.1.2 +msgspec==0.19.0 +multidict==6.7.0 +networkx==3.4.2 +ninja==1.13.0 +numba==0.61.2 +numpy==2.2.6 +nvidia-cublas-cu12==12.9.1.4 +nvidia-cuda-cupti-cu12==12.9.79 +nvidia-cuda-nvrtc-cu12==12.9.86 +nvidia-cuda-runtime-cu12==12.9.79 +nvidia-cudnn-cu12==9.10.2.21 +nvidia-cufft-cu12==11.4.1.4 +nvidia-cufile-cu12==1.14.1.1 +nvidia-curand-cu12==10.3.10.19 +nvidia-cusolver-cu12==11.7.5.82 +nvidia-cusparse-cu12==12.5.10.65 +nvidia-cusparselt-cu12==0.7.1 +nvidia-nccl-cu12==2.27.5 +nvidia-nvjitlink-cu12==12.9.86 +nvidia-nvshmem-cu12==3.3.20 +nvidia-nvtx-cu12==12.9.79 +openai==1.90.0 +opencv-python-headless==4.12.0.88 +outlines_core==0.2.10 +packaging==25.0 +partial-json-parser==0.2.1.1.post6 +pillow==11.3.0 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.23.1 +propcache==0.4.1 +protobuf==6.32.1 +psutil==7.1.0 +py-cpuinfo==9.0.0 +pybase64==1.4.2 +pycountry==24.6.1 +pycparser==2.23 +pydantic==2.12.0 +pydantic-extra-types==2.10.6 +pydantic_core==2.41.1 +Pygments==2.19.2 +python-dotenv==1.1.1 +python-json-logger==4.0.0 +python-multipart==0.0.20 +pytorch-triton==3.4.0+gitf7888497 +PyYAML==6.0.3 +pyzmq==27.1.0 +ray==2.49.2 +referencing==0.36.2 +regex==2025.9.18 +requests==2.32.5 +rich==14.2.0 +rich-toolkit==0.15.1 +rignore==0.7.0 +rpds-py==0.27.1 +safetensors==0.6.2 +scipy==1.15.3 +sentencepiece==0.2.1 +sentry-sdk==2.41.0 +setuptools-scm==9.2.0 +shellingham==1.5.4 +sniffio==1.3.1 +soundfile==0.13.1 +soxr==1.0.0 +starlette==0.48.0 +sympy==1.14.0 +tiktoken==0.12.0 +tokenizers==0.22.1 +tomli==2.3.0 +torch==2.9.0.dev20250905+cu129 +tqdm==4.67.1 +transformers==4.57.0 +triton==3.4.0 +typer==0.19.2 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.5.0 +uvicorn==0.37.0 +uvloop==0.21.0 +watchfiles==1.1.0 +websockets==15.0.1 +xgrammar==0.1.21 +yarl==1.22.0 diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 83f81d39..da55f32d 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -40,18 +40,20 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - - name: Install torch nightly - run: python -m pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 - - name: Install torchtitan and torchstore - run: python -m pip install torchtitan torchstore + - name: Install pinned torch nightly + run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 + - name: Install vLLM + run: | + python -m pip install -r .github/packaging/vllm_reqs.txt + python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge - name: Download and install monarch and its dependencies run: | - pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt - pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge - - name: Download and install vLLM - run: pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge + python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt + python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge + - name: Install torchtitan and torchstore + run: python -m pip install torchtitan torchstore - name: Install dependencies - run: ./scripts/install.sh + run: python -m pip install --no-build-isolation -e ".[dev]" - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From cdddb77e221d5d858dae7c58b4d0d30845297dbe Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 12:20:36 -0700 Subject: [PATCH 08/35] torchstore, shared lib paths --- .github/workflows/gpu_test.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index da55f32d..275e111f 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -42,7 +42,8 @@ jobs: run: python -m pip install --upgrade pip - name: Install pinned torch nightly run: python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 - - name: Install vLLM + - name: Download and install vLLM and its dependencies + # TODO: this honestly could not be hackier if I tried run: | python -m pip install -r .github/packaging/vllm_reqs.txt python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge @@ -51,9 +52,15 @@ jobs: python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Install torchtitan and torchstore - run: python -m pip install torchtitan torchstore + run: | + python -m pip install torchtitan + python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git - name: Install dependencies run: python -m pip install --no-build-isolation -e ".[dev]" + - name: Patch shared paths + run: | + export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From 441ab6efcd0da6f0a107251017865cc2c16875dc Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 12:29:51 -0700 Subject: [PATCH 09/35] install titan main --- .github/workflows/gpu_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 275e111f..df6a805f 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -53,7 +53,7 @@ jobs: python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Install torchtitan and torchstore run: | - python -m pip install torchtitan + python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git - name: Install dependencies run: python -m pip install --no-build-isolation -e ".[dev]" From 89beed0a864104f02eceece1ddb56bb3cb652e00 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 12:40:58 -0700 Subject: [PATCH 10/35] no ssh --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index df6a805f..b5c410b8 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -53,8 +53,8 @@ jobs: python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge - name: Install torchtitan and torchstore run: | - python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git - python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git + python -m pip install git+https://github.com/pytorch/torchtitan.git + python -m pip install git+https://github.com/meta-pytorch/torchstore.git - name: Install dependencies run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths From 17bed82839a4e2cc6d5680418355663b204f3bd9 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 13:54:44 -0700 Subject: [PATCH 11/35] fix typo --- .github/workflows/gpu_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index b5c410b8..8d6612c4 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -60,7 +60,7 @@ jobs: - name: Patch shared paths run: | export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From 438d96c7c25bb244dc57c541054d8369a436ce6a Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 14:23:52 -0700 Subject: [PATCH 12/35] debug changes --- .github/workflows/unit_test.yaml | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 9a839f32..1c12813a 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -23,20 +23,17 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - - name: Install pytorch - run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu - - name: Install monarch - run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci - - name: Install torchstore - run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl - - name: Install torchtitan + # TODO: these are just debug changes + - name: print some stuff run: | - pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl - pip install tyro - - name: Install dependencies - run: python -m pip install --no-build-isolation -e ".[dev]" - - name: Run unit tests with coverage - # TODO add all tests - run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v3 + echo "Conda prefix: ${CONDA_PREFIX}" + echo "before export" + echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + echo "LD_PRELOAD: ${LD_PRELOAD}" + ls -ahl ${CONDA_PREFIX} + ls -ahl ${CONDA_PREFIX}/lib + export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + echo "after export" + echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + echo "LD_PRELOAD: ${LD_PRELOAD}" From 5642841aeb74e922176a427e3e3f82c277d01719 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 14:29:01 -0700 Subject: [PATCH 13/35] more debug --- .github/workflows/unit_test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 1c12813a..59c41461 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -27,6 +27,8 @@ jobs: - name: print some stuff run: | echo "Conda prefix: ${CONDA_PREFIX}" + echo "Current path: ${{ env.PATH }} + conda env list echo "before export" echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" echo "LD_PRELOAD: ${LD_PRELOAD}" From 7816b39d966afb86ec9703d6e0ab06d1b56ae47a Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 14:33:31 -0700 Subject: [PATCH 14/35] more debug --- .github/workflows/unit_test.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 59c41461..305429bc 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -27,7 +27,8 @@ jobs: - name: print some stuff run: | echo "Conda prefix: ${CONDA_PREFIX}" - echo "Current path: ${{ env.PATH }} + echo "Conda: $CONDA" + echo "Current path: ${{ env.PATH }}"" conda env list echo "before export" echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" From b21490b8f76108ab247652b6999068b8f81be7d5 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 14:36:46 -0700 Subject: [PATCH 15/35] typo --- .github/workflows/unit_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 305429bc..49c16e6b 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -28,7 +28,7 @@ jobs: run: | echo "Conda prefix: ${CONDA_PREFIX}" echo "Conda: $CONDA" - echo "Current path: ${{ env.PATH }}"" + echo "Current path: ${{ env.PATH }}" conda env list echo "before export" echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" From d21e6394a62f9c8837a055ac0e80e6ff507c9844 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 14:49:26 -0700 Subject: [PATCH 16/35] could it be --- .github/workflows/gpu_test.yaml | 4 ++-- .github/workflows/unit_test.yaml | 32 ++++++++++++++++---------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 8d6612c4..6ee23d0d 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -59,8 +59,8 @@ jobs: run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths run: | - export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 49c16e6b..9a839f32 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -23,20 +23,20 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - # TODO: these are just debug changes - - name: print some stuff + - name: Install pytorch + run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + - name: Install monarch + run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci + - name: Install torchstore + run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl + - name: Install torchtitan run: | - echo "Conda prefix: ${CONDA_PREFIX}" - echo "Conda: $CONDA" - echo "Current path: ${{ env.PATH }}" - conda env list - echo "before export" - echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" - echo "LD_PRELOAD: ${LD_PRELOAD}" - ls -ahl ${CONDA_PREFIX} - ls -ahl ${CONDA_PREFIX}/lib - export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - echo "after export" - echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" - echo "LD_PRELOAD: ${LD_PRELOAD}" + pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl + pip install tyro + - name: Install dependencies + run: python -m pip install --no-build-isolation -e ".[dev]" + - name: Run unit tests with coverage + # TODO add all tests + run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 From 6d7deb3e855169b51fa387aa897a2e605f6a6034 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 14:57:16 -0700 Subject: [PATCH 17/35] more debug --- .github/workflows/gpu_test.yaml | 2 ++ .github/workflows/unit_test.yaml | 36 ++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 6ee23d0d..21ed08fc 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -61,6 +61,8 @@ jobs: run: | export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + echo "LD_PRELOAD=${LD_PRELOAD}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage # TODO add all tests run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 9a839f32..94270dd7 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -23,20 +23,24 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - - name: Install pytorch - run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu - - name: Install monarch - run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci - - name: Install torchstore - run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl - - name: Install torchtitan + # TODO: these are just debug changes + - name: print some stuff run: | - pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl - pip install tyro - - name: Install dependencies - run: python -m pip install --no-build-isolation -e ".[dev]" - - name: Run unit tests with coverage - # TODO add all tests - run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v3 + export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + echo "LD_PRELOAD=${LD_PRELOAD}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + echo "Conda prefix: ${CONDA_PREFIX}" + echo "Conda: $CONDA" + echo "Current path: ${{ env.PATH }}" + conda env list + echo "before export" + echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + echo "LD_PRELOAD: ${LD_PRELOAD}" + ls -ahl ${CONDA_PREFIX} + ls -ahl ${CONDA_PREFIX}/lib + export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + echo "after export" + echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + echo "LD_PRELOAD: ${LD_PRELOAD}" From 4c9f92e364dc237a701c2b2026de0b8eae3f8efc Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:00:58 -0700 Subject: [PATCH 18/35] more debug --- .github/workflows/unit_test.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 94270dd7..f114dbae 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -39,6 +39,7 @@ jobs: echo "LD_PRELOAD: ${LD_PRELOAD}" ls -ahl ${CONDA_PREFIX} ls -ahl ${CONDA_PREFIX}/lib + ls -ahl usr/lib export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} echo "after export" From 2d5e67973a6e4d7e0887805d19e93aa080337a92 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:02:48 -0700 Subject: [PATCH 19/35] more debug --- .github/workflows/unit_test.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index f114dbae..ba438d44 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -37,9 +37,7 @@ jobs: echo "before export" echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" echo "LD_PRELOAD: ${LD_PRELOAD}" - ls -ahl ${CONDA_PREFIX} - ls -ahl ${CONDA_PREFIX}/lib - ls -ahl usr/lib + ls -ahl $CONDA/envs/test/lib export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} echo "after export" From f69da06627ed2f943c8360fd501d688b708d86ce Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:11:35 -0700 Subject: [PATCH 20/35] are we actually not even activating conda env --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 21ed08fc..f0cb144e 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -59,8 +59,8 @@ jobs: run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths run: | - export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + export LD_PRELOAD=${CONDA}/envs/base/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA}/envs/base/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} echo "LD_PRELOAD=${LD_PRELOAD}" echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage From 9a9d96ed3191f09174d68e7095c247d902d5b543 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:12:06 -0700 Subject: [PATCH 21/35] fix path --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index f0cb144e..3b3004ec 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -59,8 +59,8 @@ jobs: run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths run: | - export LD_PRELOAD=${CONDA}/envs/base/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA}/envs/base/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + export LD_PRELOAD=${CONDA}/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} echo "LD_PRELOAD=${LD_PRELOAD}" echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage From 4c0f1a2fb304b7ae0f628eb27e41b942c117d3f3 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:17:50 -0700 Subject: [PATCH 22/35] hardcode something but idk why --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 3b3004ec..52db7d17 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -59,8 +59,8 @@ jobs: run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths run: | - export LD_PRELOAD=${CONDA}/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + export LD_PRELOAD=3/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=3/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} echo "LD_PRELOAD=${LD_PRELOAD}" echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage From dc58179037853c658882ae6b3e64a8f0b515bcaa Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:27:00 -0700 Subject: [PATCH 23/35] idk one more try --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 52db7d17..0f9ab326 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -59,8 +59,8 @@ jobs: run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths run: | - export LD_PRELOAD=3/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=3/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + export LD_PRELOAD=/home/ec2-user/actions-runner/_work/forge/forge/3/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=/home/ec2-user/actions-runner/_work/forge/forge/3/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} echo "LD_PRELOAD=${LD_PRELOAD}" echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage From 7eb92259e7644950f4017b5c54a29179577d069f Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:36:03 -0700 Subject: [PATCH 24/35] back to old path --- .github/workflows/gpu_test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 0f9ab326..debc7486 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -59,8 +59,8 @@ jobs: run: python -m pip install --no-build-isolation -e ".[dev]" - name: Patch shared paths run: | - export LD_PRELOAD=/home/ec2-user/actions-runner/_work/forge/forge/3/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=/home/ec2-user/actions-runner/_work/forge/forge/3/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 + export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 echo "LD_PRELOAD=${LD_PRELOAD}" echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage From ef09ac455f9c4cae9e2759c5301f9b279580490e Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:36:36 -0700 Subject: [PATCH 25/35] add print statement --- .github/workflows/gpu_test.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index debc7486..9713c1e4 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -65,6 +65,9 @@ jobs: echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage # TODO add all tests - run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv + run: | + echo "LD_PRELOAD=${LD_PRELOAD}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 From 87df3fd0a25dd56c0795787bab01c491a604963e Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:46:50 -0700 Subject: [PATCH 26/35] oh boy --- .github/workflows/gpu_test.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 9713c1e4..5577ecc5 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -57,15 +57,11 @@ jobs: python -m pip install git+https://github.com/meta-pytorch/torchstore.git - name: Install dependencies run: python -m pip install --no-build-isolation -e ".[dev]" - - name: Patch shared paths - run: | - export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 - export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 - echo "LD_PRELOAD=${LD_PRELOAD}" - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - name: Run unit tests with coverage # TODO add all tests run: | + export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 + export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 echo "LD_PRELOAD=${LD_PRELOAD}" echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv From 0bbddb04e54aba7b0b0067ecdad735e843576426 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 15:54:52 -0700 Subject: [PATCH 27/35] my god it works. cleanup --- .github/packaging/build_for_ci.sh | 11 ++++++++++ .github/workflows/gpu_test.yaml | 2 -- .github/workflows/unit_test.yaml | 35 ++++++++++++++----------------- 3 files changed, 27 insertions(+), 21 deletions(-) create mode 100755 .github/packaging/build_for_ci.sh diff --git a/.github/packaging/build_for_ci.sh b/.github/packaging/build_for_ci.sh new file mode 100755 index 00000000..a4f8ff45 --- /dev/null +++ b/.github/packaging/build_for_ci.sh @@ -0,0 +1,11 @@ +python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 +python -m pip install -r .github/packaging/vllm_reqs.txt +python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge +python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt +python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge +python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git +python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} +export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} +pip install -e ".[dev]" +pytest tests/unit_tests diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 5577ecc5..fb6cf507 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -62,8 +62,6 @@ jobs: run: | export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 - echo "LD_PRELOAD=${LD_PRELOAD}" - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index ba438d44..9a839f32 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -23,23 +23,20 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - # TODO: these are just debug changes - - name: print some stuff + - name: Install pytorch + run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + - name: Install monarch + run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci + - name: Install torchstore + run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl + - name: Install torchtitan run: | - export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - echo "LD_PRELOAD=${LD_PRELOAD}" - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - echo "Conda prefix: ${CONDA_PREFIX}" - echo "Conda: $CONDA" - echo "Current path: ${{ env.PATH }}" - conda env list - echo "before export" - echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" - echo "LD_PRELOAD: ${LD_PRELOAD}" - ls -ahl $CONDA/envs/test/lib - export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - echo "after export" - echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" - echo "LD_PRELOAD: ${LD_PRELOAD}" + pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl + pip install tyro + - name: Install dependencies + run: python -m pip install --no-build-isolation -e ".[dev]" + - name: Run unit tests with coverage + # TODO add all tests + run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 From a908e3299184f75b43a2d65c8119e31cef27ffa3 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 16:01:40 -0700 Subject: [PATCH 28/35] delete file, explain this abomination --- .github/packaging/build_for_ci.sh | 11 ----------- .github/packaging/vllm_reqs.txt | 7 +++++++ 2 files changed, 7 insertions(+), 11 deletions(-) delete mode 100755 .github/packaging/build_for_ci.sh diff --git a/.github/packaging/build_for_ci.sh b/.github/packaging/build_for_ci.sh deleted file mode 100755 index a4f8ff45..00000000 --- a/.github/packaging/build_for_ci.sh +++ /dev/null @@ -1,11 +0,0 @@ -python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 -python -m pip install -r .github/packaging/vllm_reqs.txt -python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge -python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt -python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge -python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git -python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git -export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} -export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} -pip install -e ".[dev]" -pytest tests/unit_tests diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt index 996dbbce..082050e6 100644 --- a/.github/packaging/vllm_reqs.txt +++ b/.github/packaging/vllm_reqs.txt @@ -1,3 +1,10 @@ +# These requirements were generated by running steps 1-3 of scripts/build_wheels.shell +# then running pip freeze and manually removing the vllm dependency. +# The intention of this file is to use these known requirements for a fixed +# vLLM build to supplement a vLLM install from download.pytorch.org without +# resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find +# vLLM dependencies (as this results in a ResolutionTooDeep error from pip). +# TODO: this should be done way less hackily aiohappyeyeballs==2.6.1 aiohttp==3.13.0 aiosignal==1.4.0 From a70147261fddc0d6eaf8ad4b91f1a99dd34f9a5d Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 16:03:37 -0700 Subject: [PATCH 29/35] leave install.sh as is for this PR --- .github/packaging/vllm_reqs.txt | 1 + scripts/install.sh | 102 +++++++++++++++++++++++++++----- 2 files changed, 89 insertions(+), 14 deletions(-) diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt index 082050e6..929e5a3f 100644 --- a/.github/packaging/vllm_reqs.txt +++ b/.github/packaging/vllm_reqs.txt @@ -4,6 +4,7 @@ # vLLM build to supplement a vLLM install from download.pytorch.org without # resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find # vLLM dependencies (as this results in a ResolutionTooDeep error from pip). +# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach. # TODO: this should be done way less hackily aiohappyeyeballs==2.6.1 aiohttp==3.13.0 diff --git a/scripts/install.sh b/scripts/install.sh index 4e8f4655..eb4776cf 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -124,6 +124,85 @@ install_system_packages() { fi } +# Check to see if gh is installed, if not, it will be installed via conda-forge channel +check_gh_install() { + if ! command -v gh &> /dev/null; then + log_warning "GitHub CLI (gh) not found. Installing via Conda..." + conda install gh --channel conda-forge -y + log_info "GitHub CLI (gh) installed successfully." + log_info "Please run 'gh auth login' to authenticate with GitHub." + else + log_info "GitHub CLI (gh) already installed." + fi +} + +# Check wheels exist +check_wheels() { + if [ ! -d "$WHEEL_DIR" ]; then + log_error "Wheels directory not found: $WHEEL_DIR" + exit 1 + fi + + local wheel_count=$(ls -1 "$WHEEL_DIR"/*.whl 2>/dev/null | wc -l) + log_info "Found $wheel_count local wheels" +} + +# Download vLLM wheel from GitHub releases +download_vllm_wheel() { + log_info "Downloading vLLM wheel from GitHub releases..." + + # Check if gh is installed + if ! command -v gh &> /dev/null; then + log_error "GitHub CLI (gh) is required to download vLLM wheel" + log_info "Install it with: sudo dnf install gh" + log_info "Then run: gh auth login" + exit 1 + fi + + # Get the vLLM wheel filename from the release + local vllm_wheel_name + vllm_wheel_name=$(gh release view "$RELEASE_TAG" --repo "$GITHUB_REPO" --json assets --jq '.assets[] | select(.name | contains("vllm")) | .name' | head -1) + + if [ -z "$vllm_wheel_name" ]; then + log_error "Could not find vLLM wheel in release $RELEASE_TAG" + log_info "Make sure the vLLM wheel has been uploaded to the GitHub release" + exit 1 + fi + for f in assets/wheels/vllm-*; do + [ -e "$f" ] || continue # skip if glob didn't match + if [ "$(basename "$f")" != "$vllm_wheel_name" ]; then + log_info "Removing stale vLLM wheel: $(basename "$f")" + rm -f "$f" + fi + done + + local local_path="$WHEEL_DIR/$vllm_wheel_name" + + if [ -f "$local_path" ]; then + log_info "vLLM wheel already downloaded: $vllm_wheel_name" + return 0 + fi + + log_info "Downloading: $vllm_wheel_name" + + # Save current directory and change to wheel directory + local original_dir=$(pwd) + cd "$WHEEL_DIR" + gh release download "$RELEASE_TAG" --repo "$GITHUB_REPO" --pattern "*vllm*" + local download_result=$? + + # Always return to original directory + cd "$original_dir" + + if [ $download_result -eq 0 ]; then + log_info "Successfully downloaded vLLM wheel" + else + log_error "Failed to download vLLM wheel" + exit 1 + fi +} + + # Parse command line arguments parse_args() { USE_SUDO=false @@ -162,6 +241,7 @@ main() { echo "======================" echo "" echo "Note: Run this from the root of the forge repository" + echo "This script requires GitHub CLI (gh) to download large wheels" if [ "$USE_SUDO" = "true" ]; then echo "System packages will be installed via system package manager (requires sudo)" check_sudo @@ -171,29 +251,23 @@ main() { echo "" check_conda_env + check_wheels # Install openssl as we overwrite the default version when we update LD_LIBRARY_PATH conda install -y openssl install_system_packages "$USE_SUDO" + check_gh_install + download_vllm_wheel log_info "Installing PyTorch nightly..." - pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu128 - - log_info "Installing torchtitan and torchstore..." - pip install torchtitan torchstore - - # Install Monarch from wheel at download.pytorch.org - log_info "Downloading and installing Monarch wheel..." - pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt - pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge + pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129 - # Install vLLM from wheel at download.pytorch.org - log_info "Downloading and installing vLLM wheel..." - pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge + log_info "Installing all wheels (local + downloaded)..." + pip install "$WHEEL_DIR"/*.whl log_info "Installing Forge from source..." - pip install -e ".[dev]" + pip install -e . # Set up environment log_info "Setting up environment..." @@ -213,7 +287,7 @@ main() { local cuda_activation_script="${conda_env_dir}/etc/conda/activate.d/cuda_env.sh" cat > "$cuda_activation_script" << 'EOF' # CUDA environment for Forge -export CUDA_VERSION=12.8 +export CUDA_VERSION=12.9 export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} From 4ea74e8f793931ff9c2dda036dd403d6b43e1ccd Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 17:16:13 -0700 Subject: [PATCH 30/35] Revert "leave install.sh as is for this PR" This reverts commit a70147261fddc0d6eaf8ad4b91f1a99dd34f9a5d. --- .github/packaging/vllm_reqs.txt | 1 - scripts/install.sh | 102 +++++--------------------------- 2 files changed, 14 insertions(+), 89 deletions(-) diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt index 929e5a3f..082050e6 100644 --- a/.github/packaging/vllm_reqs.txt +++ b/.github/packaging/vllm_reqs.txt @@ -4,7 +4,6 @@ # vLLM build to supplement a vLLM install from download.pytorch.org without # resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find # vLLM dependencies (as this results in a ResolutionTooDeep error from pip). -# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach. # TODO: this should be done way less hackily aiohappyeyeballs==2.6.1 aiohttp==3.13.0 diff --git a/scripts/install.sh b/scripts/install.sh index eb4776cf..4e8f4655 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -124,85 +124,6 @@ install_system_packages() { fi } -# Check to see if gh is installed, if not, it will be installed via conda-forge channel -check_gh_install() { - if ! command -v gh &> /dev/null; then - log_warning "GitHub CLI (gh) not found. Installing via Conda..." - conda install gh --channel conda-forge -y - log_info "GitHub CLI (gh) installed successfully." - log_info "Please run 'gh auth login' to authenticate with GitHub." - else - log_info "GitHub CLI (gh) already installed." - fi -} - -# Check wheels exist -check_wheels() { - if [ ! -d "$WHEEL_DIR" ]; then - log_error "Wheels directory not found: $WHEEL_DIR" - exit 1 - fi - - local wheel_count=$(ls -1 "$WHEEL_DIR"/*.whl 2>/dev/null | wc -l) - log_info "Found $wheel_count local wheels" -} - -# Download vLLM wheel from GitHub releases -download_vllm_wheel() { - log_info "Downloading vLLM wheel from GitHub releases..." - - # Check if gh is installed - if ! command -v gh &> /dev/null; then - log_error "GitHub CLI (gh) is required to download vLLM wheel" - log_info "Install it with: sudo dnf install gh" - log_info "Then run: gh auth login" - exit 1 - fi - - # Get the vLLM wheel filename from the release - local vllm_wheel_name - vllm_wheel_name=$(gh release view "$RELEASE_TAG" --repo "$GITHUB_REPO" --json assets --jq '.assets[] | select(.name | contains("vllm")) | .name' | head -1) - - if [ -z "$vllm_wheel_name" ]; then - log_error "Could not find vLLM wheel in release $RELEASE_TAG" - log_info "Make sure the vLLM wheel has been uploaded to the GitHub release" - exit 1 - fi - for f in assets/wheels/vllm-*; do - [ -e "$f" ] || continue # skip if glob didn't match - if [ "$(basename "$f")" != "$vllm_wheel_name" ]; then - log_info "Removing stale vLLM wheel: $(basename "$f")" - rm -f "$f" - fi - done - - local local_path="$WHEEL_DIR/$vllm_wheel_name" - - if [ -f "$local_path" ]; then - log_info "vLLM wheel already downloaded: $vllm_wheel_name" - return 0 - fi - - log_info "Downloading: $vllm_wheel_name" - - # Save current directory and change to wheel directory - local original_dir=$(pwd) - cd "$WHEEL_DIR" - gh release download "$RELEASE_TAG" --repo "$GITHUB_REPO" --pattern "*vllm*" - local download_result=$? - - # Always return to original directory - cd "$original_dir" - - if [ $download_result -eq 0 ]; then - log_info "Successfully downloaded vLLM wheel" - else - log_error "Failed to download vLLM wheel" - exit 1 - fi -} - - # Parse command line arguments parse_args() { USE_SUDO=false @@ -241,7 +162,6 @@ main() { echo "======================" echo "" echo "Note: Run this from the root of the forge repository" - echo "This script requires GitHub CLI (gh) to download large wheels" if [ "$USE_SUDO" = "true" ]; then echo "System packages will be installed via system package manager (requires sudo)" check_sudo @@ -251,23 +171,29 @@ main() { echo "" check_conda_env - check_wheels # Install openssl as we overwrite the default version when we update LD_LIBRARY_PATH conda install -y openssl install_system_packages "$USE_SUDO" - check_gh_install - download_vllm_wheel log_info "Installing PyTorch nightly..." - pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129 + pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu128 + + log_info "Installing torchtitan and torchstore..." + pip install torchtitan torchstore + + # Install Monarch from wheel at download.pytorch.org + log_info "Downloading and installing Monarch wheel..." + pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt + pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge - log_info "Installing all wheels (local + downloaded)..." - pip install "$WHEEL_DIR"/*.whl + # Install vLLM from wheel at download.pytorch.org + log_info "Downloading and installing vLLM wheel..." + pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge log_info "Installing Forge from source..." - pip install -e . + pip install -e ".[dev]" # Set up environment log_info "Setting up environment..." @@ -287,7 +213,7 @@ main() { local cuda_activation_script="${conda_env_dir}/etc/conda/activate.d/cuda_env.sh" cat > "$cuda_activation_script" << 'EOF' # CUDA environment for Forge -export CUDA_VERSION=12.9 +export CUDA_VERSION=12.8 export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} From a1016ddb7b27bd250f7b98d35c9afb9920877d4b Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 17:16:15 -0700 Subject: [PATCH 31/35] Revert "delete file, explain this abomination" This reverts commit a908e3299184f75b43a2d65c8119e31cef27ffa3. --- .github/packaging/build_for_ci.sh | 11 +++++++++++ .github/packaging/vllm_reqs.txt | 7 ------- 2 files changed, 11 insertions(+), 7 deletions(-) create mode 100755 .github/packaging/build_for_ci.sh diff --git a/.github/packaging/build_for_ci.sh b/.github/packaging/build_for_ci.sh new file mode 100755 index 00000000..a4f8ff45 --- /dev/null +++ b/.github/packaging/build_for_ci.sh @@ -0,0 +1,11 @@ +python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 +python -m pip install -r .github/packaging/vllm_reqs.txt +python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge +python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt +python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge +python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git +python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} +export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} +pip install -e ".[dev]" +pytest tests/unit_tests diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt index 082050e6..996dbbce 100644 --- a/.github/packaging/vllm_reqs.txt +++ b/.github/packaging/vllm_reqs.txt @@ -1,10 +1,3 @@ -# These requirements were generated by running steps 1-3 of scripts/build_wheels.shell -# then running pip freeze and manually removing the vllm dependency. -# The intention of this file is to use these known requirements for a fixed -# vLLM build to supplement a vLLM install from download.pytorch.org without -# resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find -# vLLM dependencies (as this results in a ResolutionTooDeep error from pip). -# TODO: this should be done way less hackily aiohappyeyeballs==2.6.1 aiohttp==3.13.0 aiosignal==1.4.0 From b527d134ddd6acbf8fdd51c6eea20926c5bd30b3 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 17:16:16 -0700 Subject: [PATCH 32/35] Revert "my god it works. cleanup" This reverts commit 0bbddb04e54aba7b0b0067ecdad735e843576426. --- .github/packaging/build_for_ci.sh | 11 ---------- .github/workflows/gpu_test.yaml | 2 ++ .github/workflows/unit_test.yaml | 35 +++++++++++++++++-------------- 3 files changed, 21 insertions(+), 27 deletions(-) delete mode 100755 .github/packaging/build_for_ci.sh diff --git a/.github/packaging/build_for_ci.sh b/.github/packaging/build_for_ci.sh deleted file mode 100755 index a4f8ff45..00000000 --- a/.github/packaging/build_for_ci.sh +++ /dev/null @@ -1,11 +0,0 @@ -python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 -python -m pip install -r .github/packaging/vllm_reqs.txt -python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge -python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt -python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge -python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git -python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git -export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} -export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} -pip install -e ".[dev]" -pytest tests/unit_tests diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index fb6cf507..5577ecc5 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -62,6 +62,8 @@ jobs: run: | export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 + echo "LD_PRELOAD=${LD_PRELOAD}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index 9a839f32..ba438d44 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -23,20 +23,23 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - - name: Install pytorch - run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu - - name: Install monarch - run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci - - name: Install torchstore - run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl - - name: Install torchtitan + # TODO: these are just debug changes + - name: print some stuff run: | - pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl - pip install tyro - - name: Install dependencies - run: python -m pip install --no-build-isolation -e ".[dev]" - - name: Run unit tests with coverage - # TODO add all tests - run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v3 + export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + echo "LD_PRELOAD=${LD_PRELOAD}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" + echo "Conda prefix: ${CONDA_PREFIX}" + echo "Conda: $CONDA" + echo "Current path: ${{ env.PATH }}" + conda env list + echo "before export" + echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + echo "LD_PRELOAD: ${LD_PRELOAD}" + ls -ahl $CONDA/envs/test/lib + export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} + echo "after export" + echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + echo "LD_PRELOAD: ${LD_PRELOAD}" From b7ce6c240921b4865ec56277bc55da8873292c06 Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 17:24:15 -0700 Subject: [PATCH 33/35] Reapply "my god it works. cleanup" This reverts commit b527d134ddd6acbf8fdd51c6eea20926c5bd30b3. --- .github/packaging/build_for_ci.sh | 11 ++++++++++ .github/workflows/gpu_test.yaml | 2 -- .github/workflows/unit_test.yaml | 35 ++++++++++++++----------------- 3 files changed, 27 insertions(+), 21 deletions(-) create mode 100755 .github/packaging/build_for_ci.sh diff --git a/.github/packaging/build_for_ci.sh b/.github/packaging/build_for_ci.sh new file mode 100755 index 00000000..a4f8ff45 --- /dev/null +++ b/.github/packaging/build_for_ci.sh @@ -0,0 +1,11 @@ +python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 +python -m pip install -r .github/packaging/vllm_reqs.txt +python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge +python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt +python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge +python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git +python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} +export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} +pip install -e ".[dev]" +pytest tests/unit_tests diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 5577ecc5..fb6cf507 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -62,8 +62,6 @@ jobs: run: | export LD_PRELOAD=$CONDA/envs/test/lib/libpython3.10.so.1.0 export LD_LIBRARY_PATH=$CONDA/envs/test/lib/libpython3.10.so.1.0 - echo "LD_PRELOAD=${LD_PRELOAD}" - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index ba438d44..9a839f32 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -23,23 +23,20 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip run: python -m pip install --upgrade pip - # TODO: these are just debug changes - - name: print some stuff + - name: Install pytorch + run: python -m pip install torch==2.9.0.dev20250826 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + - name: Install monarch + run: python -m pip install monarch-no-torch==0.1.0.dev20250826 --find-links assets/ci + - name: Install torchstore + run: pip install assets/wheels/torchstore-0.1.0-py3-none-any.whl + - name: Install torchtitan run: | - export LD_PRELOAD=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA}/envs/test/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - echo "LD_PRELOAD=${LD_PRELOAD}" - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" - echo "Conda prefix: ${CONDA_PREFIX}" - echo "Conda: $CONDA" - echo "Current path: ${{ env.PATH }}" - conda env list - echo "before export" - echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" - echo "LD_PRELOAD: ${LD_PRELOAD}" - ls -ahl $CONDA/envs/test/lib - export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} - export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} - echo "after export" - echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" - echo "LD_PRELOAD: ${LD_PRELOAD}" + pip install assets/wheels/torchtitan-0.1.0-py3-none-any.whl + pip install tyro + - name: Install dependencies + run: python -m pip install --no-build-isolation -e ".[dev]" + - name: Run unit tests with coverage + # TODO add all tests + run: pytest tests/unit_tests --cov=. --cov-report=xml --durations=20 -vv + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 From ca6b768507066ab3898b9ed3c9ff51c522b6977a Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 17:24:17 -0700 Subject: [PATCH 34/35] Reapply "delete file, explain this abomination" This reverts commit a1016ddb7b27bd250f7b98d35c9afb9920877d4b. --- .github/packaging/build_for_ci.sh | 11 ----------- .github/packaging/vllm_reqs.txt | 7 +++++++ 2 files changed, 7 insertions(+), 11 deletions(-) delete mode 100755 .github/packaging/build_for_ci.sh diff --git a/.github/packaging/build_for_ci.sh b/.github/packaging/build_for_ci.sh deleted file mode 100755 index a4f8ff45..00000000 --- a/.github/packaging/build_for_ci.sh +++ /dev/null @@ -1,11 +0,0 @@ -python -m pip install --pre torch==2.9.0.dev20250905 --no-cache-dir --index-url https://download.pytorch.org/whl/nightly/cu129 -python -m pip install -r .github/packaging/vllm_reqs.txt -python -m pip install vllm==0.10.1.dev0+g6d8d0a24c.d20251009.cu129 --no-cache-dir --index-url https://download.pytorch.org/whl/preview/forge -python -m pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt -python -m pip install torchmonarch --extra-index-url https://download.pytorch.org/whl/preview/forge -python -m pip install git+ssh://git@github.com/pytorch/torchtitan.git -python -m pip install git+ssh://git@github.com/meta-pytorch/torchstore.git -export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_LIBRARY_PATH} -export LD_PRELOAD=${CONDA_PREFIX}/lib/libpython3.10.so.1.0:${LD_PRELOAD} -pip install -e ".[dev]" -pytest tests/unit_tests diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt index 996dbbce..082050e6 100644 --- a/.github/packaging/vllm_reqs.txt +++ b/.github/packaging/vllm_reqs.txt @@ -1,3 +1,10 @@ +# These requirements were generated by running steps 1-3 of scripts/build_wheels.shell +# then running pip freeze and manually removing the vllm dependency. +# The intention of this file is to use these known requirements for a fixed +# vLLM build to supplement a vLLM install from download.pytorch.org without +# resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find +# vLLM dependencies (as this results in a ResolutionTooDeep error from pip). +# TODO: this should be done way less hackily aiohappyeyeballs==2.6.1 aiohttp==3.13.0 aiosignal==1.4.0 From 6ac48b3be1635b8bceda2d63a1bf580be2290cba Mon Sep 17 00:00:00 2001 From: Evan Smothers Date: Fri, 10 Oct 2025 17:24:18 -0700 Subject: [PATCH 35/35] Reapply "leave install.sh as is for this PR" This reverts commit 4ea74e8f793931ff9c2dda036dd403d6b43e1ccd. --- .github/packaging/vllm_reqs.txt | 1 + scripts/install.sh | 102 +++++++++++++++++++++++++++----- 2 files changed, 89 insertions(+), 14 deletions(-) diff --git a/.github/packaging/vllm_reqs.txt b/.github/packaging/vllm_reqs.txt index 082050e6..929e5a3f 100644 --- a/.github/packaging/vllm_reqs.txt +++ b/.github/packaging/vllm_reqs.txt @@ -4,6 +4,7 @@ # vLLM build to supplement a vLLM install from download.pytorch.org without # resorting to --extra-index-url https://download.pytorch.org/whl/nightly to find # vLLM dependencies (as this results in a ResolutionTooDeep error from pip). +# See the file .github/workflows/gpu_test.yaml for an E2E forge installation using this approach. # TODO: this should be done way less hackily aiohappyeyeballs==2.6.1 aiohttp==3.13.0 diff --git a/scripts/install.sh b/scripts/install.sh index 4e8f4655..eb4776cf 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -124,6 +124,85 @@ install_system_packages() { fi } +# Check to see if gh is installed, if not, it will be installed via conda-forge channel +check_gh_install() { + if ! command -v gh &> /dev/null; then + log_warning "GitHub CLI (gh) not found. Installing via Conda..." + conda install gh --channel conda-forge -y + log_info "GitHub CLI (gh) installed successfully." + log_info "Please run 'gh auth login' to authenticate with GitHub." + else + log_info "GitHub CLI (gh) already installed." + fi +} + +# Check wheels exist +check_wheels() { + if [ ! -d "$WHEEL_DIR" ]; then + log_error "Wheels directory not found: $WHEEL_DIR" + exit 1 + fi + + local wheel_count=$(ls -1 "$WHEEL_DIR"/*.whl 2>/dev/null | wc -l) + log_info "Found $wheel_count local wheels" +} + +# Download vLLM wheel from GitHub releases +download_vllm_wheel() { + log_info "Downloading vLLM wheel from GitHub releases..." + + # Check if gh is installed + if ! command -v gh &> /dev/null; then + log_error "GitHub CLI (gh) is required to download vLLM wheel" + log_info "Install it with: sudo dnf install gh" + log_info "Then run: gh auth login" + exit 1 + fi + + # Get the vLLM wheel filename from the release + local vllm_wheel_name + vllm_wheel_name=$(gh release view "$RELEASE_TAG" --repo "$GITHUB_REPO" --json assets --jq '.assets[] | select(.name | contains("vllm")) | .name' | head -1) + + if [ -z "$vllm_wheel_name" ]; then + log_error "Could not find vLLM wheel in release $RELEASE_TAG" + log_info "Make sure the vLLM wheel has been uploaded to the GitHub release" + exit 1 + fi + for f in assets/wheels/vllm-*; do + [ -e "$f" ] || continue # skip if glob didn't match + if [ "$(basename "$f")" != "$vllm_wheel_name" ]; then + log_info "Removing stale vLLM wheel: $(basename "$f")" + rm -f "$f" + fi + done + + local local_path="$WHEEL_DIR/$vllm_wheel_name" + + if [ -f "$local_path" ]; then + log_info "vLLM wheel already downloaded: $vllm_wheel_name" + return 0 + fi + + log_info "Downloading: $vllm_wheel_name" + + # Save current directory and change to wheel directory + local original_dir=$(pwd) + cd "$WHEEL_DIR" + gh release download "$RELEASE_TAG" --repo "$GITHUB_REPO" --pattern "*vllm*" + local download_result=$? + + # Always return to original directory + cd "$original_dir" + + if [ $download_result -eq 0 ]; then + log_info "Successfully downloaded vLLM wheel" + else + log_error "Failed to download vLLM wheel" + exit 1 + fi +} + + # Parse command line arguments parse_args() { USE_SUDO=false @@ -162,6 +241,7 @@ main() { echo "======================" echo "" echo "Note: Run this from the root of the forge repository" + echo "This script requires GitHub CLI (gh) to download large wheels" if [ "$USE_SUDO" = "true" ]; then echo "System packages will be installed via system package manager (requires sudo)" check_sudo @@ -171,29 +251,23 @@ main() { echo "" check_conda_env + check_wheels # Install openssl as we overwrite the default version when we update LD_LIBRARY_PATH conda install -y openssl install_system_packages "$USE_SUDO" + check_gh_install + download_vllm_wheel log_info "Installing PyTorch nightly..." - pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu128 - - log_info "Installing torchtitan and torchstore..." - pip install torchtitan torchstore - - # Install Monarch from wheel at download.pytorch.org - log_info "Downloading and installing Monarch wheel..." - pip install -r https://raw.githubusercontent.com/meta-pytorch/monarch/main/requirements.txt - pip install monarch --extra-index-url https://download.pytorch.org/whl/preview/forge + pip install torch==$PYTORCH_VERSION --index-url https://download.pytorch.org/whl/nightly/cu129 - # Install vLLM from wheel at download.pytorch.org - log_info "Downloading and installing vLLM wheel..." - pip install vllm --extra-index-url https://download.pytorch.org/whl/preview/forge + log_info "Installing all wheels (local + downloaded)..." + pip install "$WHEEL_DIR"/*.whl log_info "Installing Forge from source..." - pip install -e ".[dev]" + pip install -e . # Set up environment log_info "Setting up environment..." @@ -213,7 +287,7 @@ main() { local cuda_activation_script="${conda_env_dir}/etc/conda/activate.d/cuda_env.sh" cat > "$cuda_activation_script" << 'EOF' # CUDA environment for Forge -export CUDA_VERSION=12.8 +export CUDA_VERSION=12.9 export NVCC=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_NVCC_EXECUTABLE=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc export CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}