Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion .github/workflows/build-and-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,35 @@ jobs:
name: wheels_arm64
path: ./wheelhouse/*.whl

build_wheels_riscv64:
name: Build riscv64 wheels
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
with:
platforms: linux/riscv64

- name: Build wheels
uses: pypa/cibuildwheel@v3.1.2
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
CIBW_ARCHS: "riscv64"
CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*"
with:
output-dir: wheelhouse

- name: Upload wheels as artifacts
uses: actions/upload-artifact@v4
with:
name: wheels_riscv64
path: ./wheelhouse/*.whl

build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
Expand Down Expand Up @@ -129,7 +158,7 @@ jobs:

release:
name: Release
needs: [build_wheels, build_wheels_arm64, build_sdist]
needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
runs-on: ubuntu-latest

steps:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ on:
- main

env:
REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF
MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf
REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF
MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf

jobs:
download-model:
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.17]

- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
- fix: Handle Qwen 3.5 hybrid prefix reuse by @codavidgarcia and @r-dh in #2152
- chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
- fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
- fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
- ci: Add riscv64 wheel builds to the release workflow by @gounthar in #2139

## [0.3.16]

Expand Down
20 changes: 20 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,26 @@ if (LLAMA_BUILD)
add_compile_definitions(GGML_USE_METAL)
endif()

# Upstream mtmd expects LLAMA_INSTALL_VERSION to be set by llama.cpp's
# top-level CMakeLists.txt. When we include tools/mtmd directly from the
# Python package build, that directory scope is skipped.
if (NOT DEFINED LLAMA_INSTALL_VERSION OR "${LLAMA_INSTALL_VERSION}" STREQUAL "")
set(LLAMA_INSTALL_VERSION 0.0.0)
find_package(Git QUIET)
if (Git_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp
OUTPUT_VARIABLE LLAMA_MTMD_BUILD_NUMBER
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE LLAMA_MTMD_BUILD_NUMBER_RESULT
)
if (LLAMA_MTMD_BUILD_NUMBER_RESULT EQUAL 0)
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_MTMD_BUILD_NUMBER})
endif()
endif()
endif()

# Building mtmd (vendor/llama.cpp/tools/mtmd)
add_subdirectory(vendor/llama.cpp/tools/mtmd)

Expand Down
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,6 @@ run-server:
python3 -m llama_cpp.server --model ${MODEL}

clean:
- cd vendor/llama.cpp && make clean
- cd vendor/llama.cpp && rm libllama.so
- rm -rf _skbuild
- rm llama_cpp/lib/*.so
- rm llama_cpp/lib/*.dylib
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,8 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i

```python
llm = Llama.from_pretrained(
repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
filename="*q8_0.gguf",
repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
filename="*Q8_0.gguf",
verbose=False
)
```
Expand Down Expand Up @@ -685,7 +685,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.

```bash
python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
python3 -m llama_cpp.server --hf_model_repo_id lmstudio-community/Qwen3.5-0.8B-GGUF --model '*Q8_0.gguf'
```

### Web Server Features
Expand Down
6 changes: 3 additions & 3 deletions examples/gradio_chat/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import gradio as gr

llama = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q8_0.gguf",
repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
filename="*Q8_0.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
"Qwen/Qwen1.5-0.5B"
"Qwen/Qwen3.5-0.8B"
),
verbose=False,
)
Expand Down
6 changes: 3 additions & 3 deletions examples/hf_pull/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@


llama = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q8_0.gguf",
repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
filename="*Q8_0.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
"Qwen/Qwen1.5-0.5B"
"Qwen/Qwen3.5-0.8B"
),
verbose=False,
)
Expand Down
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.16"
__version__ = "0.3.17"
12 changes: 8 additions & 4 deletions llama_cpp/_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import ctypes
import warnings

from typing import (
Dict,
Expand Down Expand Up @@ -290,10 +291,10 @@ def kv_cache_clear(self):
assert self.memory is not None, "Memory is not initialized"
llama_cpp.llama_memory_clear(self.memory, True)

def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
assert self.memory is not None, "Memory is not initialized"
seq_id = seq_id if seq_id >= 0 else 0
llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
return llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)

def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
assert self.memory is not None, "Memory is not initialized"
Expand Down Expand Up @@ -699,8 +700,11 @@ def add_dist(self, seed: int):
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

def add_softmax(self):
sampler = llama_cpp.llama_sampler_init_softmax()
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
warnings.warn(
"add_softmax is deprecated; llama_sampler_init_dist now samples directly from logits",
DeprecationWarning,
stacklevel=2,
)

def add_top_k(self, k: int):
sampler = llama_cpp.llama_sampler_init_top_k(k)
Expand Down
41 changes: 27 additions & 14 deletions llama_cpp/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,11 @@ def __init__(
self._logits_all = logits_all if draft_model is None else True
self.context_params.embeddings = embedding # TODO: Rename to embeddings
self.context_params.offload_kqv = offload_kqv
self.context_params.flash_attn = flash_attn
self.context_params.flash_attn_type = (
llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
if flash_attn
else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
)

if op_offload is not None:
self.context_params.op_offload = op_offload
Expand Down Expand Up @@ -431,9 +435,9 @@ def free_lora_adapter():

self._stack.callback(free_lora_adapter)

if llama_cpp.llama_set_adapter_lora(
self._ctx.ctx, self._lora_adapter, self.lora_scale
):
adapters = (llama_cpp.llama_adapter_lora_p_ctypes * 1)(self._lora_adapter)
scales = (ctypes.c_float * 1)(self.lora_scale)
if llama_cpp.llama_set_adapters_lora(self._ctx.ctx, adapters, 1, scales):
raise RuntimeError(
f"Failed to set LoRA adapter from lora path: {self.lora_path}"
)
Expand Down Expand Up @@ -726,7 +730,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
sampler.add_grammar(self._model, grammar)

if temp < 0.0:
sampler.add_softmax()
sampler.add_dist(self._seed)
elif temp == 0.0:
sampler.add_greedy()
Expand Down Expand Up @@ -888,13 +891,20 @@ def generate(
else:
break
if longest_prefix > 0:
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
if self.verbose:
if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
reset = False
tokens = tokens[longest_prefix:]
self.n_tokens = longest_prefix
if self.verbose:
print(
f"Llama.generate: {longest_prefix} prefix-match hit, "
f"remaining {len(tokens)} prompt tokens to eval",
file=sys.stderr,
)
elif self.verbose:
print(
f"Llama.generate: {longest_prefix} prefix-match hit, "
f"remaining {len(tokens)} prompt tokens to eval",
f"Llama.generate: {longest_prefix} prefix-match found "
f"but partial kv removal not supported, re-evaluating full prompt",
file=sys.stderr,
)

Expand Down Expand Up @@ -1042,7 +1052,7 @@ def embed(
data: Union[List[List[float]], List[List[List[float]]]] = []

def decode_batch(seq_sizes: List[int]):
llama_cpp.llama_kv_self_clear(self._ctx.ctx)
self._ctx.kv_cache_clear()
self._ctx.decode(self._batch)
self._batch.reset()

Expand Down Expand Up @@ -1113,7 +1123,7 @@ def decode_batch(seq_sizes: List[int]):

output = data[0] if isinstance(input, str) else data

llama_cpp.llama_kv_self_clear(self._ctx.ctx)
self._ctx.kv_cache_clear()
self.reset()

if return_count:
Expand Down Expand Up @@ -2100,7 +2110,10 @@ def __getstate__(self):
logits_all=self._logits_all,
embedding=self.context_params.embeddings,
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
flash_attn=(
self.context_params.flash_attn_type
== llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
),
op_offload=self.context_params.op_offload,
swa_full=self.context_params.swa_full,
# Sampling Params
Expand Down
9 changes: 8 additions & 1 deletion llama_cpp/llama_chat_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2755,7 +2755,14 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
ctx_params.use_gpu = True # TODO: Make this configurable
ctx_params.print_timings = self.verbose
ctx_params.n_threads = llama_model.n_threads
ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2
ctx_params.flash_attn_type = (
llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
if (
llama_model.context_params.flash_attn_type
== llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
)
else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
)

# Initialize mtmd context
self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
Expand Down
Loading
Loading