MZWNET · pull · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
@@ -85,6 +85,35 @@ jobs:
           name: wheels_arm64
           path: ./wheelhouse/*.whl
 
+  build_wheels_riscv64:  # builds riscv64 wheels; the release job lists this job in its `needs`
+    name: Build riscv64 wheels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"  # also check out submodules (presumably vendor/llama.cpp; confirm)
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3  # installs QEMU emulation for the platform listed below
+        with:
+          platforms: linux/riscv64
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v3.1.2
+        env:
+          CIBW_SKIP: "*musllinux* pp*"  # skip musllinux and PyPy (pp*) build identifiers
+          CIBW_REPAIR_WHEEL_COMMAND: ""  # empty command: no wheel repair step (NOTE(review): confirm this is intended)
+          CIBW_ARCHS: "riscv64"  # build for the riscv64 architecture only
+          CIBW_BUILD: "cp310-* cp311-* cp312-* cp313-* cp314-*"  # CPython 3.10 through 3.14
+        with:
+          output-dir: wheelhouse  # must match the upload path in the next step
+
+      - name: Upload wheels as artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels_riscv64
+          path: ./wheelhouse/*.whl
+
   build_sdist:
     name: Build source distribution
     runs-on: ubuntu-latest
@@ -129,7 +158,7 @@ jobs:
 
   release:
     name: Release
-    needs: [build_wheels, build_wheels_arm64, build_sdist]
+    needs: [build_wheels, build_wheels_arm64, build_wheels_riscv64, build_sdist]
     runs-on: ubuntu-latest
 
     steps:

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -8,8 +8,8 @@ on:
       - main
 
 env:
-  REPO_ID: Qwen/Qwen2-0.5B-Instruct-GGUF
-  MODEL_FILE: qwen2-0_5b-instruct-q8_0.gguf
+  REPO_ID: lmstudio-community/Qwen3.5-0.8B-GGUF
+  MODEL_FILE: Qwen3.5-0.8B-Q8_0.gguf
 
 jobs:
   download-model:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,9 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.17]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
+- fix: Handle Qwen 3.5 hybrid prefix reuse by @codavidgarcia and @r-dh in #2152
 - chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
 - fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
 - fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149
+- ci: Add riscv64 wheel builds to release workflow by @gounthar in #2139
 
 ## [0.3.16]
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -153,6 +153,26 @@ if (LLAMA_BUILD)
             add_compile_definitions(GGML_USE_METAL)
         endif()
 
+        # Upstream mtmd expects LLAMA_INSTALL_VERSION to be set by llama.cpp's
+        # top-level CMakeLists.txt. When we include tools/mtmd directly from the
+        # Python package build, that directory scope is skipped.
+        if (NOT DEFINED LLAMA_INSTALL_VERSION OR "${LLAMA_INSTALL_VERSION}" STREQUAL "")
+            set(LLAMA_INSTALL_VERSION 0.0.0)  # fallback, kept when Git is missing or rev-list fails
+            find_package(Git QUIET)
+            if (Git_FOUND)
+                execute_process(
+                    COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp
+                    OUTPUT_VARIABLE LLAMA_MTMD_BUILD_NUMBER
+                    OUTPUT_STRIP_TRAILING_WHITESPACE
+                    RESULT_VARIABLE LLAMA_MTMD_BUILD_NUMBER_RESULT
+                )
+                if (LLAMA_MTMD_BUILD_NUMBER_RESULT EQUAL 0)  # exit code 0: rev-list succeeded
+                    set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_MTMD_BUILD_NUMBER})  # commit count becomes the patch component
+                endif()
+            endif()
+        endif()
+
         # Building llava
         add_subdirectory(vendor/llama.cpp/tools/mtmd)
 

diff --git a/Makefile b/Makefile
@@ -82,8 +82,6 @@ run-server:
 	python3 -m llama_cpp.server --model ${MODEL}
 
 clean:
-	- cd vendor/llama.cpp && make clean
-	- cd vendor/llama.cpp && rm libllama.so
 	- rm -rf _skbuild
 	- rm llama_cpp/lib/*.so
 	- rm llama_cpp/lib/*.dylib

diff --git a/README.md b/README.md
@@ -322,8 +322,8 @@ You'll need to install the `huggingface-hub` package to use this feature (`pip i
 
 ```python
 llm = Llama.from_pretrained(
-    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
+    filename="*Q8_0.gguf",
     verbose=False
 )
 ```
@@ -685,7 +685,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
 
 ```bash
-python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf'
+python3 -m llama_cpp.server --hf_model_repo_id lmstudio-community/Qwen3.5-0.8B-GGUF --model '*Q8_0.gguf'
 ```
 
 ### Web Server Features

diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py
@@ -4,10 +4,10 @@
 import gradio as gr
 
 llama = llama_cpp.Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
+    filename="*Q8_0.gguf",
     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-        "Qwen/Qwen1.5-0.5B"
+        "Qwen/Qwen3.5-0.8B"
     ),
     verbose=False,
 )

diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py
@@ -3,10 +3,10 @@
 
 
 llama = llama_cpp.Llama.from_pretrained(
-    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
-    filename="*q8_0.gguf",
+    repo_id="lmstudio-community/Qwen3.5-0.8B-GGUF",
+    filename="*Q8_0.gguf",
     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-        "Qwen/Qwen1.5-0.5B"
+        "Qwen/Qwen3.5-0.8B"
     ),
     verbose=False,
 )

diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.16"
+__version__ = "0.3.17"
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
@@ -2,6 +2,7 @@
 
 import os
 import ctypes
+import warnings
 
 from typing import (
     Dict,
@@ -290,10 +291,10 @@ def kv_cache_clear(self):
         assert self.memory is not None, "Memory is not initialized"
         llama_cpp.llama_memory_clear(self.memory, True)
 
-    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool:
         assert self.memory is not None, "Memory is not initialized"
         seq_id = seq_id if seq_id >= 0 else 0
-        llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)
+        return llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)  # surface the result so callers can react to a failed removal
 
     def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
         assert self.memory is not None, "Memory is not initialized"
@@ -699,8 +700,11 @@ def add_dist(self, seed: int):
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
     def add_softmax(self):
-        sampler = llama_cpp.llama_sampler_init_softmax()
-        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
+        warnings.warn(
+            "add_softmax is deprecated; llama_sampler_init_dist now samples directly from logits",
+            DeprecationWarning,
+            stacklevel=2,  # attribute the warning to the caller of add_softmax
+        )  # now a no-op apart from the warning: nothing is added to the sampler chain
 
     def add_top_k(self, k: int):
         sampler = llama_cpp.llama_sampler_init_top_k(k)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -341,7 +341,11 @@ def __init__(
         self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-        self.context_params.flash_attn = flash_attn
+        self.context_params.flash_attn_type = (
+            llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            if flash_attn
+            else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+        )
 
         if op_offload is not None:
             self.context_params.op_offload = op_offload
@@ -431,9 +435,9 @@ def free_lora_adapter():
 
             self._stack.callback(free_lora_adapter)
 
-            if llama_cpp.llama_set_adapter_lora(
-                self._ctx.ctx, self._lora_adapter, self.lora_scale
-            ):
+            adapters = (llama_cpp.llama_adapter_lora_p_ctypes * 1)(self._lora_adapter)
+            scales = (ctypes.c_float * 1)(self.lora_scale)
+            if llama_cpp.llama_set_adapters_lora(self._ctx.ctx, adapters, 1, scales):
                 raise RuntimeError(
                     f"Failed to set LoRA adapter from lora path: {self.lora_path}"
                 )
@@ -726,7 +730,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
             sampler.add_grammar(self._model, grammar)
 
         if temp < 0.0:
-            sampler.add_softmax()
             sampler.add_dist(self._seed)
         elif temp == 0.0:
             sampler.add_greedy()
@@ -888,13 +891,20 @@ def generate(
                 else:
                     break
             if longest_prefix > 0:
-                reset = False
-                tokens = tokens[longest_prefix:]
-                self.n_tokens = longest_prefix
-                if self.verbose:
+                if self._ctx.kv_cache_seq_rm(-1, longest_prefix, -1):
+                    reset = False
+                    tokens = tokens[longest_prefix:]
+                    self.n_tokens = longest_prefix
+                    if self.verbose:
+                        print(
+                            f"Llama.generate: {longest_prefix} prefix-match hit, "
+                            f"remaining {len(tokens)} prompt tokens to eval",
+                            file=sys.stderr,
+                        )
+                elif self.verbose:
                     print(
-                        f"Llama.generate: {longest_prefix} prefix-match hit, "
-                        f"remaining {len(tokens)} prompt tokens to eval",
+                        f"Llama.generate: {longest_prefix} prefix-match found "
+                        f"but partial kv removal not supported, re-evaluating full prompt",
                         file=sys.stderr,
                     )
 
@@ -1042,7 +1052,7 @@ def embed(
         data: Union[List[List[float]], List[List[List[float]]]] = []
 
         def decode_batch(seq_sizes: List[int]):
-            llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+            self._ctx.kv_cache_clear()
             self._ctx.decode(self._batch)
             self._batch.reset()
 
@@ -1113,7 +1123,7 @@ def decode_batch(seq_sizes: List[int]):
 
         output = data[0] if isinstance(input, str) else data
 
-        llama_cpp.llama_kv_self_clear(self._ctx.ctx)
+        self._ctx.kv_cache_clear()
         self.reset()
 
         if return_count:
@@ -2100,7 +2110,10 @@ def __getstate__(self):
             logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
-            flash_attn=self.context_params.flash_attn,
+            flash_attn=(
+                self.context_params.flash_attn_type
+                == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+            ),
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -2755,7 +2755,14 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
             ctx_params.use_gpu = True  # TODO: Make this configurable
             ctx_params.print_timings = self.verbose
             ctx_params.n_threads = llama_model.n_threads
-            ctx_params.verbosity = 2 if self.verbose else 0  # GGML_LOG_LEVEL_INFO = 2
+            ctx_params.flash_attn_type = (
+                llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+                if (
+                    llama_model.context_params.flash_attn_type
+                    == llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
+                )
+                else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
+            )
 
             # Initialize mtmd context
             self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(