name: Build llama.cpp with CUDA

on:
  schedule:
    # Run daily at 00:00 UTC
    - cron: '0 0 * * *'
  workflow_dispatch:
    inputs:
      force_build:
        description: 'Force build even if no new release'
        required: false
        type: boolean
        default: false
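
# A manual run with force_build can be triggered from the command line, e.g.
# (assuming the GitHub CLI is installed and authenticated):
#   gh workflow run 'Build llama.cpp with CUDA' -f force_build=true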

jobs:
  check-release:
    runs-on: ubuntu-latest
    outputs:
      should_build: ${{ steps.check.outputs.should_build }}
      release_tag: ${{ steps.check.outputs.release_tag }}
      release_hash: ${{ steps.check.outputs.release_hash }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
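          # fetch-depth: 0 also fetches tags, which the check below greps to
          # see whether a given upstream release has already been built here.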
      - name: Check for new llama.cpp release
        id: check
        run: |
          # Query the latest llama.cpp release once so tag and hash come from
          # the same response
          RELEASE_JSON=$(curl -s https://api.github.com/repos/ggml-org/llama.cpp/releases/latest)
          LATEST_RELEASE=$(echo "$RELEASE_JSON" | jq -r '.tag_name')
          RELEASE_HASH=$(echo "$RELEASE_JSON" | jq -r '.target_commitish')
          echo "Latest llama.cpp release: $LATEST_RELEASE"
          echo "Release hash: $RELEASE_HASH"
          # Check if we've already built this release
          if git tag | grep -q "^${LATEST_RELEASE}$"; then
            echo "Release $LATEST_RELEASE already built"
            if [ "${{ github.event.inputs.force_build }}" = "true" ]; then
              echo "Force build enabled, building anyway"
              echo "should_build=true" >> $GITHUB_OUTPUT
            else
              echo "should_build=false" >> $GITHUB_OUTPUT
            fi
          else
            echo "New release detected: $LATEST_RELEASE"
            echo "should_build=true" >> $GITHUB_OUTPUT
          fi
          echo "release_tag=$LATEST_RELEASE" >> $GITHUB_OUTPUT
          echo "release_hash=$RELEASE_HASH" >> $GITHUB_OUTPUT

  build:
    needs: check-release
    if: needs.check-release.outputs.should_build == 'true'
    runs-on: ubuntu-latest
    strategy:
      matrix:
        cuda_version: ['12.4.1', '12.6.3', '12.8.1', '12.9.1', '13.0.1']
        include:
          - cuda_version: '12.4.1'
            cuda_version_short: '12.4'
            cuda_tag: '12.4.1-cudnn-devel-ubuntu22.04'
            architectures: '61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual'
          - cuda_version: '12.6.3'
            cuda_version_short: '12.6'
            cuda_tag: '12.6.3-cudnn-devel-ubuntu22.04'
            architectures: '61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual'
          - cuda_version: '12.8.1'
            cuda_version_short: '12.8'
            cuda_tag: '12.8.1-cudnn-devel-ubuntu22.04'
            architectures: '61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;100-virtual;120-virtual'
          - cuda_version: '12.9.1'
            cuda_version_short: '12.9'
            cuda_tag: '12.9.1-cudnn-devel-ubuntu22.04'
            architectures: '61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;100-virtual;120-virtual'
          - cuda_version: '13.0.1'
            cuda_version_short: '13.0'
            cuda_tag: '13.0.1-cudnn-devel-ubuntu22.04'
            architectures: '75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;100-virtual;120-virtual'
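    # The NN-virtual entries ask for PTX-only (virtual) architectures, so the
    # driver JIT-compiles kernels for the actual GPU at load time; NN-real
    # would embed precompiled SASS instead. CUDA 13.0 drops pre-Turing targets,
    # hence the shorter list for that entry.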
    steps:
      - name: Checkout
        uses: actions/checkout@v4
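      # Hosted runners ship with limited free disk (roughly 14 GB); the CUDA
      # devel image plus the build tree need more headroom than that, so prune
      # preinstalled toolchains first.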
      - name: Maximize build space
        run: |
          echo "=== Initial Disk Space ==="
          df -h
          echo "=== Removing unnecessary packages ==="
          sudo apt-get remove -y '^aspnetcore-.*' || true
          sudo apt-get remove -y '^dotnet-.*' --fix-missing || true
          sudo apt-get remove -y '^llvm-.*' --fix-missing || true
          sudo apt-get remove -y 'php.*' --fix-missing || true
          sudo apt-get remove -y '^mongodb-.*' --fix-missing || true
          sudo apt-get remove -y '^mysql-.*' --fix-missing || true
          sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel libgl1-mesa-dri --fix-missing || true
          sudo apt-get autoremove -y || true
          sudo apt-get clean || true
          echo "=== Removing large directories ==="
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /usr/local/share/boost
          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          # Clean docker system
          docker system prune -af || true
          echo "=== After Cleanup ==="
          df -h
      - name: Build llama.cpp with CUDA
        run: |
          # Pull Docker image
          docker pull nvidia/cuda:${{ matrix.cuda_tag }}
          # Run build in container (note: the script below is inside a
          # double-quoted string, so container-side expansions are escaped
          # with a backslash)
          docker run --rm \
            -v $PWD:/workspace \
            nvidia/cuda:${{ matrix.cuda_tag }} \
            bash -c "
            set -e
            echo '=== Installing minimal dependencies ==='
            apt-get update -qq
            apt-get install -y --no-install-recommends git cmake ninja-build build-essential libcurl4-openssl-dev ca-certificates
            apt-get clean
            rm -rf /var/lib/apt/lists/*
            echo '=== Cloning llama.cpp ==='
            cd /workspace
            git clone --depth 1 --branch ${{ needs.check-release.outputs.release_tag }} https://github.com/ggml-org/llama.cpp.git || \
              (git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp && git checkout ${{ needs.check-release.outputs.release_hash }})
            cd llama.cpp
            echo '=== Configuring build with Ninja ==='
            export LIBRARY_PATH=\"/usr/local/cuda/lib64/stubs\${LIBRARY_PATH:+:\$LIBRARY_PATH}\"
            ln -sf /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
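            # The devel image has no GPU driver; it ships only a libcuda stub,
            # which the build links against here. The real libcuda.so.1 comes
            # from the host driver at runtime.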
            cmake -B build -S . \
              -G Ninja \
              -DGGML_CUDA=ON \
              -DCMAKE_CUDA_ARCHITECTURES='${{ matrix.architectures }}' \
              -DCMAKE_BUILD_TYPE=Release \
              -DBUILD_SHARED_LIBS=ON \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_TESTS=OFF \
              -DLLAMA_BUILD_EXAMPLES=OFF \
              -DCMAKE_EXE_LINKER_FLAGS='-Wl,-rpath-link,/usr/local/cuda/lib64/stubs' \
              -DCMAKE_SHARED_LINKER_FLAGS='-Wl,-rpath-link,/usr/local/cuda/lib64/stubs'
            echo '=== Building with Ninja (parallel: all cores) ==='
            cmake --build build --config Release -j\$(nproc)
            echo '=== Copying binaries ==='
            cd /workspace
            mkdir -p binaries/cuda-${{ matrix.cuda_version_short }}
            # Copy everything from build/bin
            cp -r llama.cpp/build/bin/* binaries/cuda-${{ matrix.cuda_version_short }}/
            # Strip binaries to reduce size (executables only, not .so files)
            find binaries/cuda-${{ matrix.cuda_version_short }}/ -type f -executable ! -name '*.so*' -exec strip {} \; 2>/dev/null || true
            echo '=== Creating version info ==='
            echo 'llama.cpp version: ${{ needs.check-release.outputs.release_tag }}' > binaries/cuda-${{ matrix.cuda_version_short }}/VERSION.txt
            echo 'CUDA version: ${{ matrix.cuda_version }}' >> binaries/cuda-${{ matrix.cuda_version_short }}/VERSION.txt
            echo 'Architectures: ${{ matrix.architectures }}' >> binaries/cuda-${{ matrix.cuda_version_short }}/VERSION.txt
            echo 'Build date: '\$(date -u +%Y-%m-%d) >> binaries/cuda-${{ matrix.cuda_version_short }}/VERSION.txt
            echo 'Build hash: ${{ needs.check-release.outputs.release_hash }}' >> binaries/cuda-${{ matrix.cuda_version_short }}/VERSION.txt
            echo '=== Build complete ==='
            ls -lh binaries/cuda-${{ matrix.cuda_version_short }}/
            echo '=== Cleaning up build directory ==='
            rm -rf llama.cpp
            "
          # Remove Docker image to free space
          docker rmi nvidia/cuda:${{ matrix.cuda_tag }} || true
          # Fix permissions for files created by root in the container
          sudo chown -R $(id -u):$(id -g) $PWD
          echo "=== Final disk usage ==="
          df -h
      - name: Create tarball
        run: |
          cd binaries
          tar -czf llama.cpp-${{ needs.check-release.outputs.release_tag }}-cuda-${{ matrix.cuda_version_short }}.tar.gz cuda-${{ matrix.cuda_version_short }}
          ls -lh *.tar.gz
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: llama.cpp-cuda-${{ matrix.cuda_version_short }}
          path: binaries/*.tar.gz
          retention-days: 1
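          # One-day retention is enough: the tarballs are consumed immediately
          # by the release job below.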

  release:
    needs: [check-release, build]
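    # No explicit if: needed here; when check-release decides there is nothing
    # to build, the build job is skipped and this job is skipped along with it.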
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts
      - name: Prepare release assets
        run: |
          mkdir -p release-assets
          find artifacts -name "*.tar.gz" -exec cp {} release-assets/ \;
          ls -lh release-assets/
      - name: Create Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: ${{ needs.check-release.outputs.release_tag }}
          name: llama.cpp ${{ needs.check-release.outputs.release_tag }} with CUDA
          body: |
            # llama.cpp ${{ needs.check-release.outputs.release_tag }} with CUDA Support

            Pre-built llama.cpp binaries with CUDA support, built against multiple CUDA versions.

            **Source:** https://github.com/ggml-org/llama.cpp/releases/tag/${{ needs.check-release.outputs.release_tag }}

            **Commit:** ${{ needs.check-release.outputs.release_hash }}

            ## CUDA Versions

            - CUDA 12.4 - Architectures: 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, 9.0
            - CUDA 12.6 - Architectures: 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, 9.0
            - CUDA 12.8 - Architectures: 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, 9.0, 10.0, 12.0
            - CUDA 12.9 - Architectures: 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, 9.0, 10.0, 12.0
            - CUDA 13.0 - Architectures: 7.5, 8.0, 8.6, 8.9, 9.0, 10.0, 12.0

            ## Architecture Reference

            - 6.1: Titan Xp, Tesla P40, GTX 10xx series
            - 7.0: Tesla V100
            - 7.5: Tesla T4, RTX 20xx series, Quadro RTX
            - 8.0: A100
            - 8.6: RTX 30xx series
            - 8.9: RTX 40xx series, L4, L40
            - 9.0: H100, H200
            - 10.0: B200
            - 12.0: RTX 50xx series, RTX PRO series
            ## Usage

            Download the tarball matching your CUDA version and extract it; the
            binaries land in a cuda-<version> directory:

            ```bash
            tar -xzf llama.cpp-${{ needs.check-release.outputs.release_tag }}-cuda-12.8.tar.gz
            cd cuda-12.8
            ./llama-cli --help
            ```
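
            These builds use BUILD_SHARED_LIBS=ON, so the .so files ship next
            to the executables. If the dynamic loader cannot find them, point
            LD_LIBRARY_PATH at the extracted directory:

            ```bash
            export LD_LIBRARY_PATH="$PWD:$LD_LIBRARY_PATH"
            ./llama-cli --help
            ```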
          files: release-assets/*
          draft: false
          prerelease: false