Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions auto_round/alg_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,10 @@ def make_qp_quants(nmax, data, quant_weights, v=0):
L = torch.round(iscale * data + v).clip(max=nmax)
sumlx = torch.sum(quant_weights * data * L, dim=-1)
suml2 = torch.sum(quant_weights * L * L, dim=-1)
return sumlx / suml2, L
# When suml2 is zero (all L=0 or all quant_weights=0), fall back to the
# simple max-based scale estimate to avoid NaN propagating into the GGUF file.
fallback_d = group_max.squeeze(-1) / nmax
return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L


# @torch._disable_dynamo()
Expand Down Expand Up @@ -686,7 +689,10 @@ def make_qp_new_quants(data, orig_scale, orig_mins, quant_weights, bits=4, super
quant_weights = quant_weights.view(orig_scale.shape)
sumlx = torch.sum(quant_weights * orig_scale * L, dim=-1)
suml2 = torch.sum(quant_weights * L * L, dim=-1)
return sumlx / suml2, L
# When suml2 is zero, fall back to the simple max-based scale estimate
# to avoid NaN propagating into the GGUF file.
fallback_d = group_max.squeeze(-1) / nmax
return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L


def quant_tensor_gguf_asym_dq(
Expand Down
2 changes: 1 addition & 1 deletion auto_round/compressors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1400,7 +1400,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
tied_weights_layers.append(lm_head_name)

if use_blockwise_quantization: # The ram usage is a little higher
all_to_quantized_module_names = list(set(all_to_quantized_module_names))
all_to_quantized_module_names = list(dict.fromkeys(all_to_quantized_module_names))

all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model)
pbar = tqdm(range(sum(len(block) for block in all_blocks)))
Expand Down
12 changes: 9 additions & 3 deletions auto_round/compressors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,10 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
m = get_module(model, name)
if len(list(m.children())) == 0 and type(m) not in supported_types:
layer_config.pop(name)
logger.debug(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`")
logger.warning(
f"'{name}' exists in the model but is not a supported quantization target "
f"in the current scheme, ignoring its setting in `layer_config`"
)
continue

regex = re.compile(to_standard_regex(name))
Expand Down Expand Up @@ -713,13 +716,16 @@ def _set_config(config, target_config):
i_attention_wv = 0
i_ffn_down = 0
layer_config_copy = copy.deepcopy(layer_config)
target_bits = None
base_target_bits = None
if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit():
target_bits = int(inner_gguf_format[6])
base_target_bits = int(inner_gguf_format[6])

for layer_name, config in layer_config_copy.items():
if not check_to_quantized(config):
continue
# Reset target_bits each iteration to prevent lm_head/embedding settings
# from bleeding into subsequent block layers and bypassing their special logic.
target_bits = base_target_bits
new_type = GGUF_CONFIG[target_gguf_format]["mostly"]
layer = get_module(model, layer_name)
if type(layer) == transformers.pytorch_utils.Conv1D:
Expand Down
8 changes: 6 additions & 2 deletions auto_round/data_type/gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,11 @@ def make_qp_quants(nmax, data, quant_weights):
# if n_changed == 0:
# break

return sumlx / suml2, L
# When suml2 is zero (all L=0 due to zero scales, or all quant_weights=0 due to
# unactivated calibration features), fall back to the simple max-based scale
# estimate to avoid NaN propagating into the GGUF file.
fallback_d = group_max.squeeze(-1) / nmax
return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L


@register_dtype("int_asym_dq")
Expand Down Expand Up @@ -329,7 +333,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens
"please use more data via setting `nsamples` to improve accuracy as calibration activations contain 0"
)

zero_cnt = torch.sum(imatrix == 0, dim=-1)
zero_cnt = torch.sum(imatrix <= 1e-30, dim=-1)
replace_index = zero_cnt > group_size // 2
if torch.sum(replace_index) > 0:
## fallback to no imatrix
Expand Down
57 changes: 47 additions & 10 deletions auto_round/export/export_to_gguf/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def _quant_data_with_args(

def need_modify_tensor(cls, name):
hf_arch = getattr(cls, "hf_arch", "")
if hf_arch == "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name:
if hf_arch in "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name:
return True
return False

Expand Down Expand Up @@ -254,21 +254,48 @@ def _quant_data(cls, data_torch, data_qtype, name, modify_name, new_name, bid, d
"wmin": None,
"imatrix": None,
}
# support for MOE model with cls experts not linear
# if hasattr(module, "scale") or ("exps" in new_name and len(data_torch.shape) == 3):
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if hasattr(module, attr) and getattr(module, attr) is not None:
# patch for Qwen3_5, Qwen3_5 handles some weights specially,
# but the scale doesn't match; these weights are handled by gguf itself.
# Define model architectures that need special handling
QWEN3_5_MODELS = {
"Qwen3_5ForCausalLM",
"Qwen3_5MoeForCausalLM",
"Qwen3_5MoeForConditionalGeneration",
"Qwen3_5ForConditionalGeneration",
}

QWEN3_5_SKIP_KEYS = {
".in_proj_qkv.",
".in_proj_z",
".conv1d",
".in_proj_b.",
".in_proj_a.",
".A_log",
".dt_bias",
".out_proj.",
}

hf_arch = getattr(cls, "hf_arch", "")
should_skip = hf_arch in QWEN3_5_MODELS and any(key in name for key in QWEN3_5_SKIP_KEYS)

if not should_skip:
# support for MOE model with cls experts not linear
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if not (hasattr(module, attr) and getattr(module, attr) is not None):
continue

attr_tensor = getattr(module, attr)
if not isinstance(attr_tensor, torch.Tensor):
continue

if hasattr(cls, "permute") or need_modify_tensor(cls, name):
bs = module.weight.shape[0]
attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid))
attr_tensor = attr_tensors_dict[new_name]
if attr in kwargs:
kwargs[attr] = attr_tensor.to(torch.float32)
else:
kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32)

# Map attribute names to kwargs keys: w_d_scale -> d_scale, w_d_wmin -> d_wmin, w_wmin -> wmin
kwargs_key = attr.replace("w_", "") if attr.startswith("w_") else attr
kwargs[kwargs_key] = attr_tensor.to(torch.float32)
data_torch = data_torch.to(torch.float32)

data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs)
Expand Down Expand Up @@ -379,9 +406,14 @@ def remove_prefix(name, key_list):


def prepare_tensors(cls):
max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,")
device = get_packing_device(cls.device)

# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
if cls.tensor_map.mapping:
max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,")
else:
max_name_len = len("vision_encoder.weight,") # Default reasonable length

for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()):
if name in getattr(cls.model, "_tied_weights_keys", []) and not is_separate_tensor(cls.model, name):
continue
Expand Down Expand Up @@ -441,6 +473,7 @@ def prepare_tensors(cls):
cls.match_model_tensor_name(new_name, key, bid)
for key in (
gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP,
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
Expand All @@ -457,6 +490,10 @@ def prepare_tensors(cls):
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
# Kimi KDA conv weights should be F32
gguf.MODEL_TENSOR.SSM_CONV1D_Q,
gguf.MODEL_TENSOR.SSM_CONV1D_K,
gguf.MODEL_TENSOR.SSM_CONV1D_V,
)
)
or not new_name.endswith(".weight")
Expand Down
47 changes: 22 additions & 25 deletions auto_round/export/export_to_gguf/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,32 +177,29 @@ def pack_gguf_layer(
)
)

if not hasattr(model, "last_layer_name_to_block_name"):
block_name_to_last_layer_name = {}
block_names = get_block_names(model, quant_vision=True)
block_names_flatten = flatten_list(block_names)
all_qlayer_name = []
for n, m in model.named_modules():
if not check_to_quantized(m):
continue
all_qlayer_name.append(n)
for block_name in block_names_flatten:
block_name_split = block_name.split(".")
name_split = n.split(".")
if (
len(name_split) < len(block_name_split)
or name_split[: len(block_name_split)] != block_name_split
):
continue
block_name_to_last_layer_name[block_name] = n
last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()}
model.last_layer_name_to_block_name = last_layer_name_to_block_name
names_in_blocks = []
if not hasattr(model, "last_layer_name_to_block_name"):
block_name_to_last_layer_name = {}
block_names = get_block_names(model, quant_vision=True)
block_names_flatten = flatten_list(block_names)
all_qlayer_name = []
for n, m in model.named_modules():
if not check_to_quantized(m):
continue
all_qlayer_name.append(n)
for block_name in block_names_flatten:
block = get_module(model, block_name)
for n, m in block.named_modules():
if check_to_quantized(m):
names_in_blocks.append(m.global_name)
block_name_split = block_name.split(".")
name_split = n.split(".")
if len(name_split) < len(block_name_split) or name_split[: len(block_name_split)] != block_name_split:
continue
block_name_to_last_layer_name[block_name] = n
last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()}
model.last_layer_name_to_block_name = last_layer_name_to_block_name
names_in_blocks = []
for block_name in block_names_flatten:
block = get_module(model, block_name)
for n, m in block.named_modules():
if check_to_quantized(m):
names_in_blocks.append(m.global_name)

if name in model.last_layer_name_to_block_name:
# Packing block
Expand Down
2 changes: 1 addition & 1 deletion auto_round/export/export_to_gguf/packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i
else:
from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq

blocks.reshape(blocks.shape[0], -1)
blocks = blocks.reshape(blocks.shape[0], -1)
blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix)
scales, d_scale = scales["scale"], scales["d_scale"]
mins, d_wmin = mins["wmin"], mins["d_wmin"]
Expand Down
29 changes: 29 additions & 0 deletions test/test_cuda/export/test_gguf_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,35 @@ def test_q2k_mixed(self):

shutil.rmtree(saved_tiny_model_path, ignore_errors=True)

@require_gguf
def test_q2_k_s_ffn_down_q4k(self):
"""Verify blk.0.ffn_down.weight is Q4_K in gguf:q2_k_s format.
Blocks where i_layer < n_layer/8 should use Q4_K instead of Q2_K for ffn_down."""
from gguf.gguf_reader import GGUFReader

model_path = get_model_path("Qwen/Qwen3-1.7B")
tiny_model_path = "./tmp/tiny_qwen3_1b"
save_tiny_model(model_path, tiny_model_path, num_layers=8)
autoround = AutoRound(
tiny_model_path,
iters=0,
nsamples=1,
seqlen=16,
disable_opt_rtn=True,
)
quantized_model_path = self.save_dir
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_s")
gguf_file = os.listdir(quantized_model_path)[0]
gguf_model = GGUFReader(os.path.join(quantized_model_path, gguf_file))
ffn_down_type = None
for tensor in gguf_model.tensors:
if tensor.name == "blk.0.ffn_down.weight":
ffn_down_type = tensor.tensor_type.name
break
assert ffn_down_type is not None, "blk.0.ffn_down.weight not found in GGUF file"
assert ffn_down_type == "Q4_K", f"Expected Q4_K for blk.0.ffn_down.weight but got {ffn_down_type}"
shutil.rmtree(tiny_model_path, ignore_errors=True)

@pytest.mark.skip_ci(reason="Only tiny model is suggested for CI")
def test_gguf_baseline(self):
model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct")
Expand Down
Loading