49 changes: 39 additions & 10 deletions .github/workflows/build-and-release.yaml
@@ -46,6 +46,13 @@ jobs:
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
# Skip cibuildwheel's default i686 sidecar and keep Linux release
# wheels on a portable x86_64 CPU baseline.
CIBW_ARCHS_LINUX: "auto64"
CIBW_ENVIRONMENT_LINUX: CMAKE_ARGS="-DGGML_NATIVE=off"
# Keep macOS release wheels on a portable CPU baseline instead of
# inheriting the hosted runner's native flags.
CIBW_ENVIRONMENT_MACOS: CMAKE_ARGS="-DGGML_NATIVE=off"
with:
package-dir: .
output-dir: wheelhouse
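Note on the `auto64`/`GGML_NATIVE=off` changes above: a minimal local sanity check, sketched below under the assumption that cibuildwheel was run with the same `wheelhouse` output directory, to confirm no i686 sidecar wheels slip into a release build. Illustration only, not part of this diff.

```python
# Sketch: confirm no 32-bit sidecar wheels landed next to the x86_64 builds.
from pathlib import Path

wheels = sorted(Path("wheelhouse").glob("*.whl"))
if not wheels:
    raise SystemExit("no wheels found; run cibuildwheel first")
for whl in wheels:
    # The platform tag is encoded in the wheel filename, so a simple
    # substring check is enough to catch an accidental i686 build.
    if "i686" in whl.name:
        raise SystemExit(f"unexpected 32-bit wheel: {whl.name}")
    print("ok:", whl.name)
```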
@@ -57,24 +64,21 @@ jobs:

build_wheels_arm64:
name: Build arm64 wheels
runs-on: ubuntu-latest
runs-on: ubuntu-24.04-arm
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
with:
platforms: linux/arm64

- name: Build wheels
uses: pypa/cibuildwheel@v2.22.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "aarch64"
CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DCMAKE_CROSSCOMPILING=ON"
# Keep native arm64 builds on a portable CPU baseline instead of
# tuning wheels to the hosted runner.
CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
with:
output-dir: wheelhouse
@@ -86,8 +90,27 @@ jobs:
path: ./wheelhouse/*.whl

build_wheels_riscv64:
name: Build riscv64 wheels
name: Build riscv64 wheels (${{ matrix.shard.name }})
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
shard:
- name: cp310
build: "cp310-*"
artifact: wheels_riscv64_cp310
- name: cp311
build: "cp311-*"
artifact: wheels_riscv64_cp311
- name: cp312
build: "cp312-*"
artifact: wheels_riscv64_cp312
- name: cp313
build: "cp313-*"
artifact: wheels_riscv64_cp313
- name: cp314
build: "cp314-*"
artifact: wheels_riscv64_cp314
steps:
- uses: actions/checkout@v4
with:
@@ -104,14 +127,19 @@
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "riscv64"
CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*"
# Build riscv64 wheels against a conservative baseline instead of
# enabling RVV-related extensions from the build container.
CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
# Split the emulated riscv64 build into one Python version per job
# to minimize wall-clock time without changing the release artifacts.
CIBW_BUILD: ${{ matrix.shard.build }}
with:
output-dir: wheelhouse

- name: Upload wheels as artifacts
uses: actions/upload-artifact@v4
with:
name: wheels_riscv64
name: ${{ matrix.shard.artifact }}
path: ./wheelhouse/*.whl
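Since the riscv64 wheels now arrive as five separate artifacts, the release job has to gather all of them. A hedged completeness check, assuming the default per-artifact directory layout of `actions/download-artifact@v4` (the download step itself is not shown in this diff):

```python
# Sketch: verify every riscv64 shard produced wheels before publishing.
# Assumes each artifact was downloaded into a directory named after it,
# which is download-artifact@v4's default layout for multiple artifacts.
from pathlib import Path

shards = ["cp310", "cp311", "cp312", "cp313", "cp314"]
missing = [s for s in shards
           if not list(Path(f"wheels_riscv64_{s}").glob("*.whl"))]
if missing:
    raise SystemExit(f"missing riscv64 shard wheels: {', '.join(missing)}")
print("all riscv64 shards present")
```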

build_sdist:
@@ -159,6 +187,7 @@ jobs:
release:
name: Release
needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
if: startsWith(github.ref, 'refs/tags/')
runs-on: ubuntu-latest

steps:
11 changes: 10 additions & 1 deletion .github/workflows/build-docker.yaml
@@ -16,6 +16,15 @@ jobs:
with:
submodules: "recursive"

- name: Set image tag
run: |
if [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
image_tag="${GITHUB_REF_NAME}"
else
image_tag="${GITHUB_REF_NAME//\//-}"
fi
echo "IMAGE_TAG=$image_tag" >> "$GITHUB_ENV"

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

@@ -40,7 +49,7 @@
platforms: linux/amd64,linux/arm64
tags: |
ghcr.io/abetlen/llama-cpp-python:latest
ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }}
ghcr.io/abetlen/llama-cpp-python:${{ env.IMAGE_TAG }}
build-args: |
BUILDKIT_INLINE_CACHE=1
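The `Set image tag` step above maps branch refs like `feature/fix-ci` to `feature-fix-ci`, since Docker tags may not contain slashes, while git tags pass through verbatim. A Python restatement of the same mapping, for illustration:

```python
# Sketch of the bash tag-sanitization logic in the "Set image tag" step.
def image_tag(ref_name: str, ref_type: str) -> str:
    # Tags are used as-is; branch names have every "/" replaced with "-",
    # mirroring the ${GITHUB_REF_NAME//\//-} expansion.
    return ref_name if ref_type == "tag" else ref_name.replace("/", "-")

assert image_tag("v0.3.18", "tag") == "v0.3.18"
assert image_tag("feature/fix-ci", "branch") == "feature-fix-ci"
```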

57 changes: 50 additions & 7 deletions .github/workflows/build-wheels-cuda.yaml
@@ -99,21 +99,63 @@ jobs:
MAMBA_NO_LOW_SPEED_LIMIT: "1"
run: |
$cudaVersion = $env:CUDAVER
mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
$cudaChannel = "nvidia/label/cuda-$cudaVersion"
if ($IsLinux) {
# Keep nvcc, cudart, and headers on the same NVIDIA label so the
# detected toolkit version matches the published wheel tag.
mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion" "$cudaChannel::cuda-nvcc_linux-64=$cudaVersion" "$cudaChannel::cuda-cudart" "$cudaChannel::cuda-cudart-dev"
} else {
mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "$cudaChannel::cuda-toolkit=$cudaVersion"
}
if ($LASTEXITCODE -ne 0) {
exit $LASTEXITCODE
}
python -m pip install build wheel

- name: Build Wheel
run: |
$cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
$env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
$cudaHostCompilerArg = ''
$env:CMAKE_ARGS = ''
if ($IsLinux) {
$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
if (Test-Path '/usr/bin/g++-12') {
$env:CC = '/usr/bin/gcc-12'
$env:CXX = '/usr/bin/g++-12'
$env:CUDAHOSTCXX = '/usr/bin/g++-12'
$cudaHostCompilerArg = " -DCMAKE_CUDA_HOST_COMPILER=$env:CUDAHOSTCXX"
}
if (Test-Path (Join-Path $env:CONDA_PREFIX 'include/cuda_runtime.h')) {
$env:CUDAToolkit_ROOT = $env:CONDA_PREFIX
$env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
$env:CMAKE_ARGS = "-DCUDAToolkit_ROOT=$env:CONDA_PREFIX -DCUDA_TOOLKIT_ROOT_DIR=$env:CONDA_PREFIX$cudaHostCompilerArg"
$env:CPATH = "$env:CONDA_PREFIX/include:$env:CPATH"
$env:CPLUS_INCLUDE_PATH = "$env:CONDA_PREFIX/include:$env:CPLUS_INCLUDE_PATH"
$env:LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LIBRARY_PATH"
$env:LD_LIBRARY_PATH = "$env:CONDA_PREFIX/lib:$env:LD_LIBRARY_PATH"
} else {
$env:CMAKE_ARGS = $cudaHostCompilerArg.Trim()
}
}
$nvccPath = Join-Path $env:CONDA_PREFIX 'bin/nvcc'
if (-not (Test-Path $nvccPath)) {
$nvccPath = Join-Path $env:CONDA_PREFIX 'targets/x86_64-linux/bin/nvcc'
}
if (-not (Test-Path $nvccPath)) {
throw 'Failed to find nvcc in the conda environment'
}
$env:CUDACXX = $nvccPath
$env:PATH = "$(Split-Path $nvccPath):$env:PATH"
$nvccVersion = ((& $nvccPath --version) | Select-String 'release ([0-9]+\.[0-9]+)').Matches[0].Groups[1].Value
if (-not $nvccVersion) {
throw 'Failed to detect the installed CUDA toolkit version'
}
$cudaTagVersion = $nvccVersion.Replace('.','')
$env:VERBOSE = '1'
$env:CMAKE_ARGS = '-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=all'
$env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
# Keep a portable SM set, including sm_70, instead of CMake's `all`,
# which now pulls in future targets the hosted-runner toolchains cannot assemble.
$env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=70;75;80;86;89;90 -DCMAKE_CUDA_FLAGS=--allow-unsupported-compiler $env:CMAKE_ARGS"
# if ($env:AVXVER -eq 'AVX') {
$env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
# }
@@ -124,10 +166,11 @@ jobs:
# $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_AVX=off -DGGML_AVX2=off -DGGML_FMA=off -DGGML_F16C=off'
# }
python -m build --wheel
# write the build tag to the output
Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
# Publish tags that reflect the actual installed toolkit version.
Write-Output "CUDA_VERSION=$cudaTagVersion" >> $env:GITHUB_ENV

- uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: dist/*
# Set tag_name to <tag>-cu<cuda_version>
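The wheel tag now comes from the toolkit that actually got installed (`nvcc --version`) rather than the requested `$env:CUDAVER`, so a partially resolved conda environment cannot mislabel a release. A sketch of the same parse in Python, assuming `nvcc` is on `PATH`:

```python
# Sketch: derive the "cuXYZ" wheel-tag suffix from nvcc's banner, as the
# PowerShell step does with Select-String.
import re
import subprocess

banner = subprocess.run(
    ["nvcc", "--version"], capture_output=True, text=True, check=True
).stdout
match = re.search(r"release (\d+\.\d+)", banner)
if match is None:
    raise SystemExit("failed to detect the installed CUDA toolkit version")
cuda_tag = match.group(1).replace(".", "")  # e.g. "12.4" -> "124"
print(f"cu{cuda_tag}")
```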
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.18]

- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143
- fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156
- fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155
- fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154

## [0.3.17]

- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
3 changes: 2 additions & 1 deletion docker/simple/Dockerfile
@@ -6,6 +6,7 @@ FROM ${IMAGE}

# Re-declare the ARG after FROM
ARG IMAGE
ARG CMAKE_ARGS="-DGGML_NATIVE=off"

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
@@ -26,7 +27,7 @@ RUN python3 -m pip install --upgrade pip

RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context

RUN pip install llama-cpp-python --verbose;
RUN CMAKE_ARGS="${CMAKE_ARGS}" pip install . --verbose

# Set environment variable for the host
ENV HOST=0.0.0.0
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.17"
__version__ = "0.3.18"
4 changes: 4 additions & 0 deletions llama_cpp/llama.py
@@ -81,6 +81,7 @@ def __init__(
int
] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED,
rope_freq_base: float = 0.0,
rope_freq_scale: float = 0.0,
yarn_ext_factor: float = -1.0,
@@ -163,6 +164,7 @@ def __init__(
n_threads_batch: Number of threads to use for batch processing
rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
pooling_type: Pooling type, from `enum llama_pooling_type`.
attention_type: Attention type, from `enum llama_attention_type`.
rope_freq_base: RoPE base frequency, 0 = from model
rope_freq_scale: RoPE frequency scaling factor, 0 = from model
yarn_ext_factor: YaRN extrapolation mix factor, negative = from model
@@ -319,6 +321,7 @@ def __init__(
else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
)
self.context_params.pooling_type = pooling_type
self.context_params.attention_type = attention_type
self.context_params.rope_freq_base = (
rope_freq_base if rope_freq_base != 0.0 else 0
)
@@ -2100,6 +2103,7 @@ def __getstate__(self):
n_threads_batch=self.context_params.n_threads_batch,
rope_scaling_type=self.context_params.rope_scaling_type,
pooling_type=self.context_params.pooling_type,
attention_type=self.context_params.attention_type,
rope_freq_base=self.context_params.rope_freq_base,
rope_freq_scale=self.context_params.rope_freq_scale,
yarn_ext_factor=self.context_params.yarn_ext_factor,
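A hedged usage sketch for the new `attention_type` parameter: requesting non-causal attention for an embedding model. The enum constants mirror llama.cpp's `llama_attention_type`; the model path is a placeholder.

```python
# Sketch: enable non-causal attention for a non-causal embedding model.
import llama_cpp

llm = llama_cpp.Llama(
    model_path="models/embedding-model.gguf",  # placeholder path
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_MEAN,
    attention_type=llama_cpp.LLAMA_ATTENTION_TYPE_NON_CAUSAL,
)
vector = llm.embed("llamas are great")
```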