feat: build llama cpp externally #5790

Draft: wants to merge 22 commits into master

.dockerignore (4 changes: 3 additions & 1 deletion)
@@ -3,7 +3,9 @@
 .vscode
 .devcontainer
 models
+backends
 examples/chatbot-ui/models
+backend/go/image/stablediffusion-ggml/build/
 examples/rwkv/models
 examples/**/models
 Dockerfile*
@@ -14,4 +16,4 @@ __pycache__
 
 # backend virtual environments
 **/venv
-backend/python/**/source
\ No newline at end of file
+backend/python/**/source

.github/bump_deps.sh (9 changes: 7 additions & 2 deletions)
@@ -3,15 +3,20 @@ set -xe
 REPO=$1
 BRANCH=$2
 VAR=$3
+FILE=$4
+
+if [ -z "$FILE" ]; then
+  FILE="Makefile"
+fi
 
 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")
 
 # Read $VAR from Makefile (only first match)
 set +e
-CURRENT_COMMIT="$(grep -m1 "^$VAR?=" Makefile | cut -d'=' -f2)"
+CURRENT_COMMIT="$(grep -m1 "^$VAR?=" $FILE | cut -d'=' -f2)"
 set -e
 
-sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
+sed -i $FILE -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
 
 if [ -z "$CURRENT_COMMIT" ]; then
   echo "Could not find $VAR in Makefile."
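
With the new optional fourth argument, the bump script can target per-backend Makefiles instead of only the root Makefile. A usage sketch, with values taken from the bump_deps.yaml matrix later in this diff:

```bash
# Bump LLAMA_VERSION inside the external backend's own Makefile
bash .github/bump_deps.sh ggml-org/llama.cpp master LLAMA_VERSION backend/cpp/llama-cpp/Makefile

# Without the fourth argument the script falls back to the root Makefile, as before
bash .github/bump_deps.sh ggml-org/whisper.cpp master WHISPER_CPP_VERSION
```
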

.github/workflows/backend.yml (66 changes: 66 additions & 0 deletions)
@@ -50,6 +50,17 @@ jobs:
   backend: "rerankers"
   dockerfile: "./backend/Dockerfile.python"
   context: "./backend"
+- build-type: 'cublas'
+  cuda-major-version: "11"
+  cuda-minor-version: "7"
+  platforms: 'linux/amd64,linux/arm64'
+  tag-latest: 'true'
+  tag-suffix: '-gpu-nvidia-cuda-11-llama-cpp'
+  runs-on: 'ubuntu-latest'
+  base-image: "ubuntu:22.04"
+  backend: "llama-cpp"
+  dockerfile: "./backend/Dockerfile.llama-cpp"
+  context: "./"
 - build-type: 'cublas'
   cuda-major-version: "11"
   cuda-minor-version: "7"
@@ -151,6 +162,17 @@ jobs:
   backend: "rerankers"
   dockerfile: "./backend/Dockerfile.python"
   context: "./backend"
+- build-type: 'cublas'
+  cuda-major-version: "12"
+  cuda-minor-version: "0"
+  platforms: 'linux/amd64,linux/arm64'
+  tag-latest: 'true'
+  tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp'
+  runs-on: 'ubuntu-latest'
+  base-image: "ubuntu:22.04"
+  backend: "llama-cpp"
+  dockerfile: "./backend/Dockerfile.llama-cpp"
+  context: "./"
 - build-type: 'cublas'
   cuda-major-version: "12"
   cuda-minor-version: "0"
@@ -252,6 +274,17 @@ jobs:
   backend: "rerankers"
   dockerfile: "./backend/Dockerfile.python"
   context: "./backend"
+- build-type: 'hipblas'
+  cuda-major-version: ""
+  cuda-minor-version: ""
+  platforms: 'linux/amd64'
+  tag-latest: 'true'
+  tag-suffix: '-gpu-rocm-hipblas-llama-cpp'
+  runs-on: 'ubuntu-latest'
+  base-image: "rocm/dev-ubuntu-22.04:6.1"
+  backend: "llama-cpp"
+  dockerfile: "./backend/Dockerfile.llama-cpp"
+  context: "./"
 - build-type: 'hipblas'
   cuda-major-version: ""
   cuda-minor-version: ""
@@ -353,6 +386,28 @@ jobs:
   backend: "rerankers"
   dockerfile: "./backend/Dockerfile.python"
   context: "./backend"
+- build-type: 'sycl_f32'
+  cuda-major-version: ""
+  cuda-minor-version: ""
+  platforms: 'linux/amd64'
+  tag-latest: 'true'
+  tag-suffix: '-gpu-intel-sycl-f32-llama-cpp'
+  runs-on: 'ubuntu-latest'
+  base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+  backend: "llama-cpp"
+  dockerfile: "./backend/Dockerfile.llama-cpp"
+  context: "./"
+- build-type: 'sycl_f16'
+  cuda-major-version: ""
+  cuda-minor-version: ""
+  platforms: 'linux/amd64'
+  tag-latest: 'true'
+  tag-suffix: '-gpu-intel-sycl-f16-llama-cpp'
+  runs-on: 'ubuntu-latest'
+  base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+  backend: "llama-cpp"
+  dockerfile: "./backend/Dockerfile.llama-cpp"
+  context: "./"
 - build-type: 'sycl_f32'
   cuda-major-version: ""
   cuda-minor-version: ""
@@ -508,4 +563,15 @@ jobs:
   base-image: "ubuntu:22.04"
   backend: "bark"
   dockerfile: "./backend/Dockerfile.go"
   context: "./"
+- build-type: ''
+  cuda-major-version: ""
+  cuda-minor-version: ""
+  platforms: 'linux/amd64,linux/arm64'
+  tag-latest: 'true'
+  tag-suffix: '-cpu-llama-cpp'
+  runs-on: 'ubuntu-latest'
+  base-image: "ubuntu:22.04"
+  backend: "llama-cpp"
+  dockerfile: "./backend/Dockerfile.llama-cpp"
+  context: "./"
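
Each of the new matrix entries above builds a standalone llama.cpp backend image from ./backend/Dockerfile.llama-cpp with the repository root as the build context. As a rough local approximation of the CPU entry (the workflow's actual build step and build-arg names are not part of this diff, so the flags below are assumptions):

```bash
# Hypothetical local equivalent of the '-cpu-llama-cpp' matrix entry.
# BUILD_TYPE/BASE_IMAGE as build args are assumed, not taken from the workflow.
docker build \
  --build-arg BUILD_TYPE="" \
  --build-arg BASE_IMAGE="ubuntu:22.04" \
  -f backend/Dockerfile.llama-cpp \
  -t local-ai-backend:cpu-llama-cpp \
  .
```
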

.github/workflows/bump_deps.yaml (10 changes: 8 additions & 2 deletions)
@@ -10,30 +10,36 @@ jobs:
   matrix:
     include:
       - repository: "ggml-org/llama.cpp"
-        variable: "CPPLLAMA_VERSION"
+        variable: "LLAMA_VERSION"
         branch: "master"
+        file: "backend/cpp/llama-cpp/Makefile"
       - repository: "ggml-org/whisper.cpp"
         variable: "WHISPER_CPP_VERSION"
         branch: "master"
+        file: "Makefile"
       - repository: "PABannier/bark.cpp"
         variable: "BARKCPP_VERSION"
         branch: "main"
+        file: "Makefile"
       - repository: "leejet/stable-diffusion.cpp"
         variable: "STABLEDIFFUSION_GGML_VERSION"
         branch: "master"
+        file: "Makefile"
       - repository: "mudler/go-stable-diffusion"
         variable: "STABLEDIFFUSION_VERSION"
         branch: "master"
+        file: "Makefile"
       - repository: "mudler/go-piper"
         variable: "PIPER_VERSION"
         branch: "master"
+        file: "Makefile"
 runs-on: ubuntu-latest
 steps:
   - uses: actions/checkout@v4
   - name: Bump dependencies 🔧
     id: bump
     run: |
-      bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
+      bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }} ${{ matrix.file }}
       {
         echo 'message<<EOF'
         cat "${{ matrix.variable }}_message.txt"

.github/workflows/test.yml (57 changes: 23 additions & 34 deletions)
@@ -67,18 +67,21 @@ jobs:
 # You can test your matrix by printing the current Go version
 - name: Display Go version
   run: go version
+- name: Proto Dependencies
+  run: |
+    # Install protoc
+    curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+    unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+    rm protoc.zip
+    go install google.golang.org/protobuf/cmd/[email protected]
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+    go install github.com/GeertJohan/go.rice/rice@latest
+    PATH="$PATH:$HOME/go/bin" make protogen-go
 - name: Dependencies
   run: |
     sudo apt-get update
     sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
     sudo apt-get install -y libgmock-dev clang
     curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
     sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
     gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-    sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+    sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
     sudo apt-get update && \
     sudo apt-get install -y conda
     # Install UV
     curl -LsSf https://astral.sh/uv/install.sh | sh
     sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
@@ -94,9 +97,6 @@
     sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
     export CUDACXX=/usr/local/cuda/bin/nvcc
-    go install google.golang.org/protobuf/cmd/[email protected]
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-    go install github.com/GeertJohan/go.rice/rice@latest
     # The python3-grpc-tools package in 22.04 is too old
     pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
@@ -107,25 +107,10 @@
     make sources/go-piper && \
     GO_TAGS="tts" make -C sources/go-piper piper.o && \
     sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
+    make backends/llama-cpp
   env:
     CUDA_VERSION: 12-4
-- name: Cache grpc
-  id: cache-grpc
-  uses: actions/cache@v4
-  with:
-    path: grpc
-    key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
-- name: Build grpc
-  if: steps.cache-grpc.outputs.cache-hit != 'true'
-  run: |
-    git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
-    cd grpc && sed -i "216i\  TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && cd cmake/build && \
-    cmake -DgRPC_INSTALL=ON \
-      -DgRPC_BUILD_TESTS=OFF \
-      ../.. && sudo make --jobs 5
-- name: Install gRPC
-  run: |
-    cd grpc && cd cmake/build && sudo make --jobs 5 install
 - name: Test
   run: |
     PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
@@ -186,14 +171,9 @@ jobs:
     go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
     go install github.com/GeertJohan/go.rice/rice@latest
     PATH="$PATH:$HOME/go/bin" make protogen-go
-- name: Build images
-  run: |
-    docker build --build-arg FFMPEG=true --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
-    BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
 - name: Test
   run: |
-    PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
-    make run-e2e-aio
+    PATH="$PATH:$HOME/go/bin" make backends/llama-cpp docker-build-aio e2e-aio
 - name: Setup tmate session if tests fail
   if: ${{ failure() }}
   uses: mxschmitt/[email protected]
@@ -225,14 +205,23 @@ jobs:
     brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
     pip install --user --no-cache-dir grpcio-tools==1.71.0 grpcio==1.71.0
     go install github.com/GeertJohan/go.rice/rice@latest
+- name: Build llama-cpp-darwin
+  run: |
+    make protogen-go
+    make build-api
+    bash scripts/build-llama-cpp-darwin.sh
+    ls -la build/darwin.tar
+    mv build/darwin.tar build/llama-cpp.tar
+    ./local-ai backends install "ocifile://$PWD/build/llama-cpp.tar"
 - name: Test
   run: |
     export C_INCLUDE_PATH=/usr/local/include
     export CPLUS_INCLUDE_PATH=/usr/local/include
     export CC=/opt/homebrew/opt/llvm/bin/clang
     # Used to run the newer GNUMake version from brew that supports --output-sync
     export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-    BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
+    PATH="$PATH:$HOME/go/bin" make protogen-go
+    PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
 - name: Setup tmate session if tests fail
   if: ${{ failure() }}
   uses: mxschmitt/[email protected]
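
Taken together, the updated test jobs exercise the new out-of-tree backend flow end to end. A condensed sketch of the same steps for a local run (targets and script names are as they appear in this diff; anything beyond that is an assumption):

```bash
# Linux: build the llama.cpp backend via the new Makefile target
make backends/llama-cpp

# macOS: package the backend as an OCI-style tarball and install it
make protogen-go build-api
bash scripts/build-llama-cpp-darwin.sh
mv build/darwin.tar build/llama-cpp.tar
./local-ai backends install "ocifile://$PWD/build/llama-cpp.tar"
```
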

.gitignore (8 changes: 5 additions & 3 deletions)
@@ -5,9 +5,11 @@ __pycache__/
 *.o
 get-sources
 prepare-sources
-/backend/cpp/llama/grpc-server
-/backend/cpp/llama/llama.cpp
+/backend/cpp/llama-cpp/grpc-server
+/backend/cpp/llama-cpp/llama.cpp
 /backend/cpp/llama-*
+!backend/cpp/llama-cpp
+/backends
 
 *.log
 
@@ -56,4 +58,4 @@ docs/static/gallery.html
 **/venv
 
 # per-developer customization files for the development container
-.devcontainer/customization/*
\ No newline at end of file
+.devcontainer/customization/*

Dockerfile (13 changes: 3 additions & 10 deletions)
@@ -25,6 +25,7 @@ ARG TARGETVARIANT
 ENV BUILD_TYPE=${BUILD_TYPE}
 
 RUN mkdir -p /run/localai
+RUN echo "default" > /run/localai/capability
 
 # Vulkan requirements
 RUN <<EOT bash
@@ -299,11 +300,7 @@ COPY ./pkg/langchain ./pkg/langchain
 RUN ls -l ./
 RUN make backend-assets
 RUN make prepare
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-    SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make grpcs; \
-    else \
-    make grpcs; \
-    fi
+RUN make grpcs
 
 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
@@ -316,11 +313,7 @@ COPY . .
 ## Build the binary
 ## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
 ## Otherwise just run the normal build
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-    SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
-    else \
-    make build; \
-    fi
+RUN make build
 
 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
     mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \