From 3e7230c386dd1883eede04a4bfb643a424ae3053 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 19 Mar 2026 19:30:40 +0800 Subject: [PATCH 1/6] fix gguf format fail infer for qwen3.5 series Signed-off-by: n1ck-guo --- auto_round/compressors/base.py | 2 +- auto_round/export/export_to_gguf/convert.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 0a2352e2a..c1623b0a4 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1229,7 +1229,7 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T set_module(self.model, name, m) tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device # Step 1: let gguf merge layers or rename module first and we will handle the RTN is gguf specific logic - if self.is_immediate_packing and self.iters == 0 and self.formats[0].is_gguf() and not self.disable_opt_rtn: + if self.is_immediate_packing and self.iters == 0 and self.formats[0].is_gguf(): m = m.to(tuning_device) m.scale = None m.zp = None diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index bd28d89ef..d6e4e9bf2 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -220,6 +220,7 @@ def _quant_data_with_args( def need_modify_tensor(cls, name): hf_arch = getattr(cls, "hf_arch", "") + # if hf_arch in ("Qwen3NextForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM") and "in_proj_qkvz.weight" in name: if hf_arch == "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name: return True return False @@ -379,9 +380,14 @@ def remove_prefix(name, key_list): def prepare_tensors(cls): - max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,") device = get_packing_device(cls.device) + # Handle empty tensor_map for models with block_count=0 (like MobileNetV5) + if 
cls.tensor_map.mapping: + max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,") + else: + max_name_len = len("vision_encoder.weight,") # Default reasonable length + for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()): if name in getattr(cls.model, "_tied_weights_keys", []) and not is_separate_tensor(cls.model, name): continue @@ -441,6 +447,7 @@ def prepare_tensors(cls): cls.match_model_tensor_name(new_name, key, bid) for key in ( gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP, gguf.MODEL_TENSOR.POS_EMBD, gguf.MODEL_TENSOR.TOKEN_TYPES, gguf.MODEL_TENSOR.SSM_CONV1D, @@ -457,6 +464,10 @@ def prepare_tensors(cls): gguf.MODEL_TENSOR.A_ENC_EMBD_POS, gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, + # Kimi KDA conv weights should be F32 + gguf.MODEL_TENSOR.SSM_CONV1D_Q, + gguf.MODEL_TENSOR.SSM_CONV1D_K, + gguf.MODEL_TENSOR.SSM_CONV1D_V, ) ) or not new_name.endswith(".weight") From bf37f6b3e1aad5d907f5ea4d6b4afc5ba822143d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 19 Mar 2026 19:47:11 +0800 Subject: [PATCH 2/6] codescan Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/convert.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index d6e4e9bf2..7f8de55fe 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -220,8 +220,10 @@ def _quant_data_with_args( def need_modify_tensor(cls, name): hf_arch = getattr(cls, "hf_arch", "") - # if hf_arch in ("Qwen3NextForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM") and "in_proj_qkvz.weight" in name: - if hf_arch == "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name: + if ( + hf_arch in ("Qwen3NextForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM") + and "in_proj_qkvz.weight" in name + ): return True return False 
From b9d760ee2003d89826f06737d97f6d4a1c0659b9 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 20 Mar 2026 19:47:02 +0800 Subject: [PATCH 3/6] fix Signed-off-by: n1ck-guo --- auto_round/alg_ext.py | 10 ++++- auto_round/compressors/base.py | 4 +- auto_round/data_type/gguf.py | 8 +++- auto_round/export/export_to_gguf/convert.py | 48 +++++++++++++++------ auto_round/export/export_to_gguf/packing.py | 2 +- 5 files changed, 53 insertions(+), 19 deletions(-) diff --git a/auto_round/alg_ext.py b/auto_round/alg_ext.py index 8a45b443e..cbb2753df 100644 --- a/auto_round/alg_ext.py +++ b/auto_round/alg_ext.py @@ -561,7 +561,10 @@ def make_qp_quants(nmax, data, quant_weights, v=0): L = torch.round(iscale * data + v).clip(max=nmax) sumlx = torch.sum(quant_weights * data * L, dim=-1) suml2 = torch.sum(quant_weights * L * L, dim=-1) - return sumlx / suml2, L + # When suml2 is zero (all L=0 or all quant_weights=0), fall back to the + # simple max-based scale estimate to avoid NaN propagating into the GGUF file. + fallback_d = group_max.squeeze(-1) / nmax + return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L def iterative_wls_quant_search(data, bits=4, rrmin=-1.0, rdelta=0.1, nstep=20, use_mad=False, weights=None, v=0): @@ -685,7 +688,10 @@ def make_qp_new_quants(data, orig_scale, orig_mins, quant_weights, bits=4, super quant_weights = quant_weights.view(orig_scale.shape) sumlx = torch.sum(quant_weights * orig_scale * L, dim=-1) suml2 = torch.sum(quant_weights * L * L, dim=-1) - return sumlx / suml2, L + # When suml2 is zero, fall back to the simple max-based scale estimate + # to avoid NaN propagating into the GGUF file. 
+ fallback_d = group_max.squeeze(-1) / nmax + return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L def quant_tensor_gguf_asym_dq( diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c1623b0a4..7dedf803f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1229,7 +1229,7 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T set_module(self.model, name, m) tuning_device = m.tuning_device if hasattr(m, "tuning_device") else self.device # Step 1: let gguf merge layers or rename module first and we will handle the RTN is gguf specific logic - if self.is_immediate_packing and self.iters == 0 and self.formats[0].is_gguf(): + if self.is_immediate_packing and self.iters == 0 and self.formats[0].is_gguf() and not self.disable_opt_rtn: m = m.to(tuning_device) m.scale = None m.zp = None @@ -1394,7 +1394,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: tied_weights_layers.append(lm_head_name) if use_blockwise_quantization: # The ram usage is a little higher - all_to_quantized_module_names = list(set(all_to_quantized_module_names)) + all_to_quantized_module_names = list(dict.fromkeys(all_to_quantized_module_names)) all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model) pbar = tqdm(range(sum(len(block) for block in all_blocks))) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index 27a97f010..a63c3df90 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -244,7 +244,11 @@ def make_qp_quants(nmax, data, quant_weights): # if n_changed == 0: # break - return sumlx / suml2, L + # When suml2 is zero (all L=0 due to zero scales, or all quant_weights=0 due to + # unactivated calibration features), fall back to the simple max-based scale + # estimate to avoid NaN propagating into the GGUF file. 
+ fallback_d = group_max.squeeze(-1) / nmax + return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L @@ -329,7 +333,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens "please use more data via setting `nsamples` to improve accuracy as calibration activations contain 0" ) - zero_cnt = torch.sum(imatrix == 0, dim=-1) + zero_cnt = torch.sum(imatrix <= 1e-30, dim=-1) replace_index = zero_cnt > group_size // 2 if torch.sum(replace_index) > 0: ## fallback to no imatrix diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 7f8de55fe..4035fd283 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -220,10 +220,7 @@ def _quant_data_with_args( def need_modify_tensor(cls, name): hf_arch = getattr(cls, "hf_arch", "") - if ( - hf_arch in ("Qwen3NextForCausalLM", "Qwen3_5ForCausalLM", "Qwen3_5MoeForCausalLM") - and "in_proj_qkvz.weight" in name - ): + if hf_arch == "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name: return True return False @@ -257,21 +254,48 @@ def _quant_data(cls, data_torch, data_qtype, name, modify_name, new_name, bid, d "wmin": None, "imatrix": None, } - # support for MOE model with cls eexperts not linear - # if hasattr(module, "scale") or ("exps" in new_name and len(data_torch.shape) == 3): - for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]: - if hasattr(module, attr) and getattr(module, attr) is not None: + # patch for Qwen3_5, Qwen3_5 handles some weights specially, + # but the scale doesn't match; these weights are handled by gguf itself. 
+ # Define model architectures that need special handling + QWEN3_5_MODELS = { + "Qwen3_5ForCausalLM", + "Qwen3_5MoeForCausalLM", + "Qwen3_5MoeForConditionalGeneration", + "Qwen3_5ForConditionalGeneration", + } + + QWEN3_5_SKIP_KEYS = { + ".in_proj_qkv.", + ".in_proj_z", + ".conv1d", + ".in_proj_b.", + ".in_proj_a.", + ".A_log", + ".dt_bias", + ".out_proj.", + } + + hf_arch = getattr(cls, "hf_arch", "") + should_skip = hf_arch in QWEN3_5_MODELS and any(key in name for key in QWEN3_5_SKIP_KEYS) + + if not should_skip: + # support for MOE model with cls experts not linear + for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]: + if not (hasattr(module, attr) and getattr(module, attr) is not None): + continue + attr_tensor = getattr(module, attr) if not isinstance(attr_tensor, torch.Tensor): continue + if hasattr(cls, "permute") or need_modify_tensor(cls, name): bs = module.weight.shape[0] attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid)) attr_tensor = attr_tensors_dict[new_name] - if attr in kwargs: - kwargs[attr] = attr_tensor.to(torch.float32) - else: - kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32) + + # Map attribute names to kwargs keys: w_d_scale -> d_scale, w_d_wmin -> d_wmin, w_wmin -> wmin + kwargs_key = attr.replace("w_", "") if attr.startswith("w_") else attr + kwargs[kwargs_key] = attr_tensor.to(torch.float32) data_torch = data_torch.to(torch.float32) data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index c64066932..218e7ca94 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -654,7 +654,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i else: from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq - blocks.reshape(blocks.shape[0], -1) + blocks = 
blocks.reshape(blocks.shape[0], -1) blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] From 0d81b6007a6abf27b2eb8bfcb63c16e0a6824166 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 23 Mar 2026 09:48:26 +0800 Subject: [PATCH 4/6] fix 1585 Signed-off-by: n1ck-guo --- auto_round/compressors/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index fdba49a88..4fde72a46 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -713,13 +713,16 @@ def _set_config(config, target_config): i_attention_wv = 0 i_ffn_down = 0 layer_config_copy = copy.deepcopy(layer_config) - target_bits = None + base_target_bits = None if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit(): - target_bits = int(inner_gguf_format[6]) + base_target_bits = int(inner_gguf_format[6]) for layer_name, config in layer_config_copy.items(): if not check_to_quantized(config): continue + # Reset target_bits each iteration to prevent lm_head/embedding settings + # from bleeding into subsequent block layers and bypassing their special logic. 
+ target_bits = base_target_bits new_type = GGUF_CONFIG[target_gguf_format]["mostly"] layer = get_module(model, layer_name) if type(layer) == transformers.pytorch_utils.Conv1D: From e4f265f7b719b79406e998f57c22bf1e020d846b Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 23 Mar 2026 14:20:39 +0800 Subject: [PATCH 5/6] update Signed-off-by: n1ck-guo --- auto_round/compressors/utils.py | 5 +++- auto_round/export/export_to_gguf/packing.py | 6 +---- test/test_cuda/export/test_gguf_format.py | 29 +++++++++++++++++++++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 4fde72a46..03d4902bf 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -429,7 +429,10 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str m = get_module(model, name) if len(list(m.children())) == 0 and type(m) not in supported_types: layer_config.pop(name) - logger.debug(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`") + logger.warning( + f"'{name}' exists in the model but is not a supported quantization target " + f"in the current scheme, ignoring its setting in `layer_config`" + ) continue regex = re.compile(to_standard_regex(name)) diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index 55650c628..218e7ca94 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -655,11 +655,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq blocks = blocks.reshape(blocks.shape[0], -1) - # blocks, scales, mins = quant_tensor_gguf_asym_dq( - # blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) - blocks, scales, mins = quant_tensor_gguf_opt_rtn_asym_dq( - blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix - ) + 
blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix) scales, d_scale = scales["scale"], scales["d_scale"] mins, d_wmin = mins["wmin"], mins["d_wmin"] blocks = blocks.reshape((nb, QK_K // 16, 16)) diff --git a/test/test_cuda/export/test_gguf_format.py b/test/test_cuda/export/test_gguf_format.py index a89214616..832233226 100644 --- a/test/test_cuda/export/test_gguf_format.py +++ b/test/test_cuda/export/test_gguf_format.py @@ -204,6 +204,35 @@ def test_q2k_mixed(self): shutil.rmtree(saved_tiny_model_path, ignore_errors=True) + @require_gguf + def test_q2_k_s_ffn_down_q4k(self): + """Verify blk.0.ffn_down.weight is Q4_K in gguf:q2_k_s format. + Blocks where i_layer < n_layer/8 should use Q4_K instead of Q2_K for ffn_down.""" + from gguf.gguf_reader import GGUFReader + + model_path = get_model_path("Qwen/Qwen3-1.7B") + tiny_model_path = "./tmp/tiny_qwen3_1b" + save_tiny_model(model_path, tiny_model_path, num_layers=8) + autoround = AutoRound( + tiny_model_path, + iters=0, + nsamples=1, + seqlen=16, + disable_opt_rtn=True, + ) + quantized_model_path = self.save_dir + autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_s") + gguf_file = os.listdir(quantized_model_path)[0] + gguf_model = GGUFReader(os.path.join(quantized_model_path, gguf_file)) + ffn_down_type = None + for tensor in gguf_model.tensors: + if tensor.name == "blk.0.ffn_down.weight": + ffn_down_type = tensor.tensor_type.name + break + assert ffn_down_type is not None, "blk.0.ffn_down.weight not found in GGUF file" + assert ffn_down_type == "Q4_K", f"Expected Q4_K for blk.0.ffn_down.weight but got {ffn_down_type}" + shutil.rmtree(tiny_model_path, ignore_errors=True) + @pytest.mark.skip_ci(reason="Only tiny model is suggested for CI") def test_gguf_baseline(self): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") From 766f67f0c81411fd482af78dacea5b62ebaf2e11 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 24 Mar 2026 
14:31:20 +0800 Subject: [PATCH 6/6] fix bug Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/export.py | 47 ++++++++++------------ 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 57d510ac4..c3a06c2ad 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -177,32 +177,29 @@ def pack_gguf_layer( ) ) - if not hasattr(model, "last_layer_name_to_block_name"): - block_name_to_last_layer_name = {} - block_names = get_block_names(model, quant_vision=True) - block_names_flatten = flatten_list(block_names) - all_qlayer_name = [] - for n, m in model.named_modules(): - if not check_to_quantized(m): - continue - all_qlayer_name.append(n) - for block_name in block_names_flatten: - block_name_split = block_name.split(".") - name_split = n.split(".") - if ( - len(name_split) < len(block_name_split) - or name_split[: len(block_name_split)] != block_name_split - ): - continue - block_name_to_last_layer_name[block_name] = n - last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} - model.last_layer_name_to_block_name = last_layer_name_to_block_name - names_in_blocks = [] + if not hasattr(model, "last_layer_name_to_block_name"): + block_name_to_last_layer_name = {} + block_names = get_block_names(model, quant_vision=True) + block_names_flatten = flatten_list(block_names) + all_qlayer_name = [] + for n, m in model.named_modules(): + if not check_to_quantized(m): + continue + all_qlayer_name.append(n) for block_name in block_names_flatten: - block = get_module(model, block_name) - for n, m in block.named_modules(): - if check_to_quantized(m): - names_in_blocks.append(m.global_name) + block_name_split = block_name.split(".") + name_split = n.split(".") + if len(name_split) < len(block_name_split) or name_split[: len(block_name_split)] != block_name_split: + continue + 
block_name_to_last_layer_name[block_name] = n + last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()} + model.last_layer_name_to_block_name = last_layer_name_to_block_name + names_in_blocks = [] + for block_name in block_names_flatten: + block = get_module(model, block_name) + for n, m in block.named_modules(): + if check_to_quantized(m): + names_in_blocks.append(m.global_name) if name in model.last_layer_name_to_block_name: # Packing block