From f0391c5ea7159b4c4916d9f4aced2f982adbd1f4 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 00:59:19 -0700 Subject: [PATCH 1/5] fix(ci): release wheel workflow (#2154) * fix(ci): harden release wheel workflow * fix(ci): document and pin release wheel baselines * fix(ci): speed up release arch builds * fix(ci): split riscv64 by python version * fix(ci): sanitize riscv64 artifact names --- .github/workflows/build-and-release.yaml | 49 +++++++++++++++++++----- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 0121febe8..3a9e6f369 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -46,6 +46,13 @@ jobs: env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" + # Skip cibuildwheel's default i686 sidecar and keep Linux release + # wheels on a portable x86_64 CPU baseline. + CIBW_ARCHS_LINUX: "auto64" + CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off" + # Keep macOS release wheels on a portable CPU baseline instead of + # inheriting the hosted runner's native flags. + CIBW_ENVIRONMENT_MACOS: CMAKE_ARGS="-DGGML_NATIVE=off" with: package-dir: . output-dir: wheelhouse @@ -57,24 +64,21 @@ jobs: build_wheels_arm64: name: Build arm64 wheels - runs-on: ubuntu-latest + runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v4 with: submodules: "recursive" - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - - name: Build wheels uses: pypa/cibuildwheel@v2.22.0 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "aarch64" - CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON" + # Keep native arm64 builds on a portable CPU baseline instead of + # tuning wheels to the hosted runner. + CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off" CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*" with: output-dir: wheelhouse @@ -86,8 +90,27 @@ jobs: path: ./wheelhouse/*.whl build_wheels_riscv64: - name: Build riscv64 wheels + name: Build riscv64 wheels (${{ matrix.shard.name }}) runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + shard: + - name: cp310 + build: "cp310-*" + artifact: wheels_riscv64_cp310 + - name: cp311 + build: "cp311-*" + artifact: wheels_riscv64_cp311 + - name: cp312 + build: "cp312-*" + artifact: wheels_riscv64_cp312 + - name: cp313 + build: "cp313-*" + artifact: wheels_riscv64_cp313 + - name: cp314 + build: "cp314-*" + artifact: wheels_riscv64_cp314 steps: - uses: actions/checkout@v4 with: @@ -104,14 +127,19 @@ jobs: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" CIBW_ARCHS: "riscv64" - CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*" + # Build riscv64 wheels against a conservative baseline instead of + # enabling RVV-related extensions from the build container. + CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off" + # Split the emulated riscv64 build into one Python version per job + # to minimize wall-clock time without changing the release artifacts. + CIBW_BUILD: ${{ matrix.shard.build }} with: output-dir: wheelhouse - name: Upload wheels as artifacts uses: actions/upload-artifact@v4 with: - name: wheels_riscv64 + name: ${{ matrix.shard.artifact }} path: ./wheelhouse/*.whl build_sdist: @@ -159,6 +187,7 @@ jobs: release: name: Release needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist] + if: startsWith(github.ref, 'refs/tags/') runs-on: ubuntu-latest steps: From 909ebf1246a52c15ebc95460c7e5957e3b64711e Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 01:00:50 -0700 Subject: [PATCH 2/5] fix(ci): cuda wheel workflow (#2155) * fix(ci): harden cuda wheel workflow * fix(ci): pin cuda toolkit versions accurately * fix(ci): resolve exact cuda toolkit installs * fix(ci): align cuda toolkit roots and tags * fix(ci): pin cuda packages to nvidia label * fix(ci): allow cuda solver to mix non-cuda deps --- .github/workflows/build-wheels-cuda.yaml | 57 +++++++++++++++++++++--- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 07b30cfc0..b8d6c9dce 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -99,21 +99,63 @@ jobs: MAMBA_NO_LOW_SPEED_LIMIT: "1" run: | $cudaVersion = $env:CUDAVER - mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion + $cudaChannel = "nvidia/label/cuda-$cudaVersion" + if ($IsLinux) { + # Keep nvcc, cudart, and headers on the same NVIDIA label so the + # detected toolkit version matches the published wheel tag. + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev" + } else { + mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" + } + if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE + } python -m pip install build wheel - name: Build Wheel run: | - $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','') $env:CUDA_PATH = $env:CONDA_PREFIX $env:CUDA_HOME = $env:CONDA_PREFIX $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + $cudaHostCompilerArg = '' + $env:CMAKE_ARGS = '' if ($IsLinux) { - $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH + if (Test-Path '/usr/bin/g++-12') { + $env:CC = '/usr/bin/gcc-12' + $env:CXX = '/usr/bin/g++-12' + $env:CUDAHOSTCXX = '/usr/bin/g++-12' + $cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX" + } + if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) { + $env:CUDAToolkit_ROOT = $env:CONDA_PREFIX + $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX + $env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg" + $env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH" + $env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH" + $env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH" + $env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH" + } else { + $env:CMAKE_ARGS = $cudaHostCompilerArg.Trim() + } + } + $nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc' + if (-not (Test-Path $nvccPath)) { + $nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc' + } + if (-not (Test-Path $nvccPath)) { + throw 'Failed to find nvcc in the conda environment' + } + $env:CUDACXX = $nvccPath + $env:PATH = "$(Split-Path $nvccPath):$env:PATH" + $nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value + if (-not $nvccVersion) { + throw 'Failed to detect the installed CUDA toolkit version' } + $cudaTagVersion = $nvccVersion.Replace('.','') $env:VERBOSE = '1' - $env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all' - $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS" + # Keep a portable SM set, including sm_70, instead of CMake's `all`, + # which now pulls in future targets the hosted-runner toolchains cannot assemble. + $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS" # if ($env:AVXVER -eq 'AVX') { $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' # } @@ -124,10 +166,11 @@ jobs: # $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off' # } python -m build --wheel - # write the build tag to the output - Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV + # Publish tags that reflect the actual installed toolkit version. + Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV - uses: softprops/action-gh-release@v2 + if: startsWith(github.ref, 'refs/tags/') with: files: dist/* # Set tag_name to -cu From ccc6bc0454b2d73431a419620aad92fda1aba162 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 01:02:14 -0700 Subject: [PATCH 3/5] fix(ci): docker build workflow (#2156) * fix(ci): harden docker build workflow * docs: update changelog for ci workflows --- .github/workflows/build-docker.yaml | 11 ++++++++++- CHANGELOG.md | 4 ++++ docker/simple/Dockerfile | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index b290f6273..c65695847 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -16,6 +16,15 @@ jobs: with: submodules: "recursive" + - name: Set image tag + run: | + if [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then + image_tag="${GITHUB_REF_NAME}" + else + image_tag="${GITHUB_REF_NAME//\//-}" + fi + echo "IMAGE_TAG=$image_tag" >> "$GITHUB_ENV" + - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -40,7 +49,7 @@ jobs: platforms: linux/amd64,linux/arm64 tags: | ghcr.io/abetlen/llama-cpp-python:latest - ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }} + ghcr.io/abetlen/llama-cpp-python:${{ env.IMAGE_TAG }} build-args: | BUILDKIT_INLINE_CACHE=1 diff --git a/CHANGELOG.md b/CHANGELOG.md index d1195cc2a..b47613109 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 +- fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 +- fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154 + ## [0.3.17] - feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151 diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index 06483d44e..bad4f456f 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -6,6 +6,7 @@ FROM ${IMAGE} # Re-declare the ARG after FROM ARG IMAGE +ARG CMAKE_ARGS="-DGGML_NATIVE=off" # Update and upgrade the existing packages RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ @@ -26,7 +27,7 @@ RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context -RUN pip install llama-cpp-python --verbose; +RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose # Set environment variable for the host ENV HOST=0.0.0.0 From 7b38c3122d2ff3ad23e1502de045807836ced4a7 Mon Sep 17 00:00:00 2001 From: Victor Biederbeck Date: Tue, 24 Mar 2026 02:50:15 -0700 Subject: [PATCH 4/5] feat: expose attention_type parameter in Llama.__init__ (#2143) * feat: expose attention_type parameter in Llama.__init__ * docs: preserve attention_type in pickled state * docs: update changelog for attention_type --------- Co-authored-by: Victor Biederbeck Co-authored-by: abetlen --- CHANGELOG.md | 1 + llama_cpp/llama.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b47613109..de4f070ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 - fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 88bc2e5bb..ad484c4d5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -81,6 +81,7 @@ def __init__( int ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, yarn_ext_factor: float = -1.0, @@ -163,6 +164,7 @@ def __init__( n_threads_batch: Number of threads to use for batch processing rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054 pooling_type: Pooling type, from `enum llama_pooling_type`. + attention_type: Attention type, from `enum llama_attention_type`. rope_freq_base: RoPE base frequency, 0 = from model rope_freq_scale: RoPE frequency scaling factor, 0 = from model yarn_ext_factor: YaRN extrapolation mix factor, negative = from model @@ -319,6 +321,7 @@ def __init__( else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) self.context_params.pooling_type = pooling_type + self.context_params.attention_type = attention_type self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 ) @@ -2100,6 +2103,7 @@ def __getstate__(self): n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, pooling_type=self.context_params.pooling_type, + attention_type=self.context_params.attention_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, yarn_ext_factor=self.context_params.yarn_ext_factor, From d6f46a50d6b4cda10460c05e2acdbaec74428c1b Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 24 Mar 2026 02:56:01 -0700 Subject: [PATCH 5/5] chore: bump version (#2157) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de4f070ff..4118f4848 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.18] + - feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index a7c40478b..bdaefb9e0 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.17" +__version__ = "0.3.18"