49 changes: 39 additions & 10 deletions .github/workflows/build-and-release.yaml
@@ -46,6 +46,13 @@ jobs:
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
# Skip cibuildwheel's default i686 sidecar and keep Linux release
# wheels on a portable x86_64 CPU baseline.
CIBW_ARCHS_LINUX: "auto64"
CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off"
# Keep macOS release wheels on a portable CPU baseline instead of
# inheriting the hosted runner's native flags.
CIBW_ENVIRONMENT_MACOS: CMAKE_ARGS="-DGGML_NATIVE=off"
with:
package-dir: .
output-dir: wheelhouse
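Note on the `auto64`/`GGML_NATIVE=off` changes above: a minimal local sanity check, sketched below under the assumption that cibuildwheel was run with the same `wheelhouse` output directory, to confirm no i686 sidecar wheels slip into a release build. Illustration only, not part of this diff.

```python
# Sketch: confirm no 32-bit sidecar wheels landed next to the x86_64 builds.
from pathlib import Path

wheels = sorted(Path("wheelhouse").glob("*.whl"))
if not wheels:
    raise SystemExit("no wheels found; run cibuildwheel first")
for whl in wheels:
    # The platform tag is encoded in the wheel filename, so a simple
    # substring check is enough to catch an accidental i686 build.
    if "i686" in whl.name:
        raise SystemExit(f"unexpected 32-bit wheel: {whl.name}")
    print("ok:", whl.name)
```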
@@ -57,24 +64,21 @@ jobs:

build_wheels_arm64:
name: Build arm64 wheels
runs-on: ubuntu-latest
runs-on: ubuntu-24.04-arm
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
with:
platforms: linux/arm64

- name: Build wheels
uses: pypa/cibuildwheel@v2.22.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "aarch64"
CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON"
# Keep native arm64 builds on a portable CPU baseline instead of
# tuning wheels to the hosted runner.
CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
with:
output-dir: wheelhouse
@@ -86,8 +90,27 @@ jobs:
path: ./wheelhouse/*.whl

build_wheels_riscv64:
name: Build riscv64 wheels
name: Build riscv64 wheels (${{ matrix.shard.name }})
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
shard:
- name: cp310
build: "cp310-*"
artifact: wheels_riscv64_cp310
- name: cp311
build: "cp311-*"
artifact: wheels_riscv64_cp311
- name: cp312
build: "cp312-*"
artifact: wheels_riscv64_cp312
- name: cp313
build: "cp313-*"
artifact: wheels_riscv64_cp313
- name: cp314
build: "cp314-*"
artifact: wheels_riscv64_cp314
steps:
- uses: actions/checkout@v4
with:
@@ -104,14 +127,19 @@
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "riscv64"
CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*"
# Build riscv64 wheels against a conservative baseline instead of
# enabling RVV-related extensions from the build container.
CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
# Split the emulated riscv64 build into one Python version per job
# to minimize wall-clock time without changing the release artifacts.
CIBW_BUILD: ${{ matrix.shard.build }}
with:
output-dir: wheelhouse

- name: Upload wheels as artifacts
uses: actions/upload-artifact@v4
with:
name: wheels_riscv64
name: ${{ matrix.shard.artifact }}
path: ./wheelhouse/*.whl
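Since the riscv64 wheels now arrive as five separate artifacts, the release job has to gather all of them. A hedged completeness check, assuming the default per-artifact directory layout of `actions/download-artifact@v4` (the download step itself is not shown in this diff):

```python
# Sketch: verify every riscv64 shard produced wheels before publishing.
# Assumes each artifact was downloaded into a directory named after it,
# which is download-artifact@v4's default layout for multiple artifacts.
from pathlib import Path

shards = ["cp310", "cp311", "cp312", "cp313", "cp314"]
missing = [s for s in shards
           if not list(Path(f"wheels_riscv64_{s}").glob("*.whl"))]
if missing:
    raise SystemExit(f"missing riscv64 shard wheels: {', '.join(missing)}")
print("all riscv64 shards present")
```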

build_sdist:
@@ -159,6 +187,7 @@ jobs:
release:
name: Release
needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
if: startsWith(github.ref, 'refs/tags/')
runs-on: ubuntu-latest

steps:
11 changes: 10 additions & 1 deletion .github/workflows/build-docker.yaml
@@ -16,6 +16,15 @@ jobs:
with:
submodules: "recursive"

- name: Set image tag
run: |
if [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
image_tag="${GITHUB_REF_NAME}"
else
image_tag="${GITHUB_REF_NAME//\//-}"
fi
echo "IMAGE_TAG=$image_tag" >> "$GITHUB_ENV"

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

@@ -40,7 +49,7 @@
platforms: linux/amd64,linux/arm64
tags: |
ghcr.io/abetlen/llama-cpp-python:latest
ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }}
ghcr.io/abetlen/llama-cpp-python:${{ env.IMAGE_TAG }}
build-args: |
BUILDKIT_INLINE_CACHE=1
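The `Set image tag` step above maps branch refs like `feature/fix-ci` to `feature-fix-ci`, since Docker tags may not contain slashes, while git tags pass through verbatim. A Python restatement of the same mapping, for illustration:

```python
# Sketch of the bash tag-sanitization logic in the "Set image tag" step.
def image_tag(ref_name: str, ref_type: str) -> str:
    # Tags are used as-is; branch names have every "/" replaced with "-",
    # mirroring the ${GITHUB_REF_NAME//\//-} expansion.
    return ref_name if ref_type == "tag" else ref_name.replace("/", "-")

assert image_tag("v0.3.18", "tag") == "v0.3.18"
assert image_tag("feature/fix-ci", "branch") == "feature-fix-ci"
```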

57 changes: 50 additions & 7 deletions .github/workflows/build-wheels-cuda.yaml
@@ -99,21 +99,63 @@ jobs:
MAMBA_NO_LOW_SPEED_LIMIT: "1"
run: |
$cudaVersion = $env:CUDAVER
mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
$cudaChannel = "nvidia/label/cuda-$cudaVersion"
if ($IsLinux) {
# Keep nvcc, cudart, and headers on the same NVIDIA label so the
# detected toolkit version matches the published wheel tag.
mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev"
} else {
mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion"
}
if ($LASTEXITCODE -ne 0) {
exit $LASTEXITCODE
}
python -m pip install build wheel

- name: Build Wheel
run: |
$cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
$env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
$cudaHostCompilerArg = ''
$env:CMAKE_ARGS = ''
if ($IsLinux) {
$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
if (Test-Path '/usr/bin/g++-12') {
$env:CC = '/usr/bin/gcc-12'
$env:CXX = '/usr/bin/g++-12'
$env:CUDAHOSTCXX = '/usr/bin/g++-12'
$cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX"
}
if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) {
$env:CUDAToolkit_ROOT = $env:CONDA_PREFIX
$env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
$env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg"
$env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH"
$env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH"
$env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH"
$env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH"
} else {
$env:CMAKE_ARGS = $cudaHostCompilerArg.Trim()
}
}
$nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc'
if (-not (Test-Path $nvccPath)) {
$nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc'
}
if (-not (Test-Path $nvccPath)) {
throw 'Failed to find nvcc in the conda environment'
}
$env:CUDACXX = $nvccPath
$env:PATH = "$(Split-Path $nvccPath):$env:PATH"
$nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value
if (-not $nvccVersion) {
throw 'Failed to detect the installed CUDA toolkit version'
}
$cudaTagVersion = $nvccVersion.Replace('.','')
$env:VERBOSE = '1'
$env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all'
$env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
# Keep a portable SM set, including sm_70, instead of CMake's `all`,
# which now pulls in future targets the hosted-runner toolchains cannot assemble.
$env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
# if ($env:AVXVER -eq 'AVX') {
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
# }
@@ -124,10 +166,11 @@ jobs:
# $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
# }
python -m build --wheel
# write the build tag to the output
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
# Publish tags that reflect the actual installed toolkit version.
Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV

- uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: dist/*
# Set tag_name to <tag>-cu<cuda_version>
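The wheel tag now comes from the toolkit that actually got installed (`nvcc --version`) rather than the requested `$env:CUDAVER`, so a partially resolved conda environment cannot mislabel a release. A sketch of the same parse in Python, assuming `nvcc` is on `PATH`:

```python
# Sketch: derive the "cuXYZ" wheel-tag suffix from nvcc's banner, as the
# PowerShell step does with Select-String.
import re
import subprocess

banner = subprocess.run(
    ["nvcc", "--version"], capture_output=True, text=True, check=True
).stdout
match = re.search(r"release (\d+\.\d+)", banner)
if match is None:
    raise SystemExit("failed to detect the installed CUDA toolkit version")
cuda_tag = match.group(1).replace(".", "")  # e.g. "12.4" -> "124"
print(f"cu{cuda_tag}")
```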
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.18]

- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143
- fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156
- fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155
- fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154

## [0.3.17]

- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
3 changes: 2 additions & 1 deletion docker/simple/Dockerfile
@@ -6,6 +6,7 @@ FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE
ARG CMAKE_ARGS="-DGGML_NATIVE=off"

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
@@ -26,7 +27,7 @@ RUN python3 -m pip install --upgrade pip

RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context

RUN pip install llama-cpp-python --verbose;
RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose

# Set environment variable for the host
ENV HOST=0.0.0.0
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.17"
__version__ = "0.3.18"
4 changes: 4 additions & 0 deletions llama_cpp/llama.py
@@ -81,6 +81,7 @@ def __init__(
int
] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
rope_freq_base: float = 0.0,
rope_freq_scale: float = 0.0,
yarn_ext_factor: float = -1.0,
@@ -163,6 +164,7 @@ def __init__(
n_threads_batch: Number of threads to use for batch processing
rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
pooling_type: Pooling type, from `enum llama_pooling_type`.
attention_type: Attention type, from `enum llama_attention_type`.
rope_freq_base: RoPE base frequency, 0 = from model
rope_freq_scale: RoPE frequency scaling factor, 0 = from model
yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
@@ -319,6 +321,7 @@ def __init__(
else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
)
self.context_params.pooling_type = pooling_type
self.context_params.attention_type = attention_type
self.context_params.rope_freq_base = (
rope_freq_base if rope_freq_base != 0.0 else 0
)
@@ -2100,6 +2103,7 @@ def __getstate__(self):
n_threads_batch=self.context_params.n_threads_batch,
rope_scaling_type=self.context_params.rope_scaling_type,
pooling_type=self.context_params.pooling_type,
attention_type=self.context_params.attention_type,
rope_freq_base=self.context_params.rope_freq_base,
rope_freq_scale=self.context_params.rope_freq_scale,
yarn_ext_factor=self.context_params.yarn_ext_factor,
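A hedged usage sketch for the new `attention_type` parameter: requesting non-causal attention for an embedding model. The enum constants mirror llama.cpp's `llama_attention_type`; the model path is a placeholder.

```python
# Sketch: enable non-causal attention for a non-causal embedding model.
import llama_cpp

llm = llama_cpp.Llama(
    model_path="models/embedding-model.gguf",  # placeholder path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_MEAN,
    attention_type=llama_cpp.LLAMA_ATTENTION_TYPE_NON_CAUSAL,
)
vector = llm.embed("llamas are great")
```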