Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions auto_round/alg_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,10 @@ def make_qp_quants(nmax, data, quant_weights, v=0):
L = torch.round(iscale * data + v).clip(max=nmax)
sumlx = torch.sum(quant_weights * data * L, dim=-1)
suml2 = torch.sum(quant_weights * L * L, dim=-1)
return sumlx / suml2, L
# When suml2 is zero (all L=0 or all quant_weights=0), fall back to the
# simple max-based scale estimate to avoid NaN propagating into the GGUF file.
fallback_d = group_max.squeeze(-1) / nmax
return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L


# @torch._disable_dynamo()
Expand Down Expand Up @@ -686,7 +689,10 @@ def make_qp_new_quants(data, orig_scale, orig_mins, quant_weights, bits=4, super
quant_weights = quant_weights.view(orig_scale.shape)
sumlx = torch.sum(quant_weights * orig_scale * L, dim=-1)
suml2 = torch.sum(quant_weights * L * L, dim=-1)
return sumlx / suml2, L
# When suml2 is zero, fall back to the simple max-based scale estimate
# to avoid NaN propagating into the GGUF file.
fallback_d = group_max.squeeze(-1) / nmax
return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L


def quant_tensor_gguf_asym_dq(
Expand Down
2 changes: 1 addition & 1 deletion auto_round/compressors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1400,7 +1400,7 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
tied_weights_layers.append(lm_head_name)

if use_blockwise_quantization: # The ram usage is a little higher
all_to_quantized_module_names = list(set(all_to_quantized_module_names))
all_to_quantized_module_names = list(dict.fromkeys(all_to_quantized_module_names))

all_blocks = self.quant_block_list if self.quant_block_list else get_block_names(self.model)
pbar = tqdm(range(sum(len(block) for block in all_blocks)))
Expand Down
12 changes: 9 additions & 3 deletions auto_round/compressors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,10 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
m = get_module(model, name)
if len(list(m.children())) == 0 and type(m) not in supported_types:
layer_config.pop(name)
logger.debug(f"{name} is not supported in current scheme, ignoring its setting in `layer_config`")
logger.warning(
f"'{name}' exists in the model but is not a supported quantization target "
f"in the current scheme, ignoring its setting in `layer_config`"
)
continue

regex = re.compile(to_standard_regex(name))
Expand Down Expand Up @@ -713,13 +716,16 @@ def _set_config(config, target_config):
i_attention_wv = 0
i_ffn_down = 0
layer_config_copy = copy.deepcopy(layer_config)
target_bits = None
base_target_bits = None
if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit():
target_bits = int(inner_gguf_format[6])
base_target_bits = int(inner_gguf_format[6])

for layer_name, config in layer_config_copy.items():
if not check_to_quantized(config):
continue
# Reset target_bits each iteration to prevent lm_head/embedding settings
# from bleeding into subsequent block layers and bypassing their special logic.
target_bits = base_target_bits
new_type = GGUF_CONFIG[target_gguf_format]["mostly"]
layer = get_module(model, layer_name)
if type(layer) == transformers.pytorch_utils.Conv1D:
Expand Down
8 changes: 6 additions & 2 deletions auto_round/data_type/gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,11 @@ def make_qp_quants(nmax, data, quant_weights):
# if n_changed == 0:
# break

return sumlx / suml2, L
# When suml2 is zero (all L=0 due to zero scales, or all quant_weights=0 due to
# unactivated calibration features), fall back to the simple max-based scale
# estimate to avoid NaN propagating into the GGUF file.
fallback_d = group_max.squeeze(-1) / nmax
return torch.where(suml2 > 0, sumlx / suml2, fallback_d), L


@register_dtype("int_asym_dq")
Expand Down Expand Up @@ -329,7 +333,7 @@ def _imatrix_handle_zero(imatrix: Union[torch.Tensor, float], weight: torch.Tens
"please use more data via setting `nsamples` to improve accuracy as calibration activations contain 0"
)

zero_cnt = torch.sum(imatrix == 0, dim=-1)
zero_cnt = torch.sum(imatrix <= 1e-30, dim=-1)
replace_index = zero_cnt > group_size // 2
if torch.sum(replace_index) > 0:
## fallback to no imatrix
Expand Down
57 changes: 47 additions & 10 deletions auto_round/export/export_to_gguf/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def _quant_data_with_args(

def need_modify_tensor(cls, name):
hf_arch = getattr(cls, "hf_arch", "")
if hf_arch == "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name:
if hf_arch in "Qwen3NextForCausalLM" and "in_proj_qkvz.weight" in name:
return True
return False

Expand Down Expand Up @@ -254,21 +254,48 @@ def _quant_data(cls, data_torch, data_qtype, name, modify_name, new_name, bid, d
"wmin": None,
"imatrix": None,
}
# support for MOE model with cls experts not linear
# if hasattr(module, "scale") or ("exps" in new_name and len(data_torch.shape) == 3):
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if hasattr(module, attr) and getattr(module, attr) is not None:
# patch for Qwen3_5, Qwen3_5 handles some weights specially,
# but the scale doesn't match; these weights are handled by gguf itself.
# Define model architectures that need special handling
QWEN3_5_MODELS = {
"Qwen3_5ForCausalLM",
"Qwen3_5MoeForCausalLM",
"Qwen3_5MoeForConditionalGeneration",
"Qwen3_5ForConditionalGeneration",
}

QWEN3_5_SKIP_KEYS = {
".in_proj_qkv.",
".in_proj_z",
".conv1d",
".in_proj_b.",
".in_proj_a.",
".A_log",
".dt_bias",
".out_proj.",
}

hf_arch = getattr(cls, "hf_arch", "")
should_skip = hf_arch in QWEN3_5_MODELS and any(key in name for key in QWEN3_5_SKIP_KEYS)

if not should_skip:
# support for MOE model with cls experts not linear
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if not (hasattr(module, attr) and getattr(module, attr) is not None):
continue

attr_tensor = getattr(module, attr)
if not isinstance(attr_tensor, torch.Tensor):
continue

if hasattr(cls, "permute") or need_modify_tensor(cls, name):
bs = module.weight.shape[0]
attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid))
attr_tensor = attr_tensors_dict[new_name]
if attr in kwargs:
kwargs[attr] = attr_tensor.to(torch.float32)
else:
kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32)

# Map attribute names to kwargs keys: w_d_scale -> d_scale, w_d_wmin -> d_wmin, w_wmin -> wmin
kwargs_key = attr.replace("w_", "") if attr.startswith("w_") else attr
kwargs[kwargs_key] = attr_tensor.to(torch.float32)
data_torch = data_torch.to(torch.float32)

data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs)
Expand Down Expand Up @@ -379,9 +406,14 @@ def remove_prefix(name, key_list):


def prepare_tensors(cls):
max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,")
device = get_packing_device(cls.device)

# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
if cls.tensor_map.mapping:
max_name_len = max(len(s) for _, s in cls.tensor_map.mapping.values()) + len(".weight,")
else:
max_name_len = len("vision_encoder.weight,") # Default reasonable length

for name, data_torch in chain(cls.generate_extra_tensors(), cls.get_tensors()):
if name in getattr(cls.model, "_tied_weights_keys", []) and not is_separate_tensor(cls.model, name):
continue
Expand Down Expand Up @@ -441,6 +473,7 @@ def prepare_tensors(cls):
cls.match_model_tensor_name(new_name, key, bid)
for key in (
gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.FFN_GATE_INP_SHEXP,
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
Expand All @@ -457,6 +490,10 @@ def prepare_tensors(cls):
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
# Kimi KDA conv weights should be F32
gguf.MODEL_TENSOR.SSM_CONV1D_Q,
gguf.MODEL_TENSOR.SSM_CONV1D_K,
gguf.MODEL_TENSOR.SSM_CONV1D_V,
)
)
or not new_name.endswith(".weight")
Expand Down
47 changes: 22 additions & 25 deletions auto_round/export/export_to_gguf/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,32 +177,29 @@ def pack_gguf_layer(
)
)

if not hasattr(model, "last_layer_name_to_block_name"):
block_name_to_last_layer_name = {}
block_names = get_block_names(model, quant_vision=True)
block_names_flatten = flatten_list(block_names)
all_qlayer_name = []
for n, m in model.named_modules():
if not check_to_quantized(m):
continue
all_qlayer_name.append(n)
for block_name in block_names_flatten:
block_name_split = block_name.split(".")
name_split = n.split(".")
if (
len(name_split) < len(block_name_split)
or name_split[: len(block_name_split)] != block_name_split
):
continue
block_name_to_last_layer_name[block_name] = n
last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()}
model.last_layer_name_to_block_name = last_layer_name_to_block_name
names_in_blocks = []
if not hasattr(model, "last_layer_name_to_block_name"):
block_name_to_last_layer_name = {}
block_names = get_block_names(model, quant_vision=True)
block_names_flatten = flatten_list(block_names)
all_qlayer_name = []
for n, m in model.named_modules():
if not check_to_quantized(m):
continue
all_qlayer_name.append(n)
for block_name in block_names_flatten:
block = get_module(model, block_name)
for n, m in block.named_modules():
if check_to_quantized(m):
names_in_blocks.append(m.global_name)
block_name_split = block_name.split(".")
name_split = n.split(".")
if len(name_split) < len(block_name_split) or name_split[: len(block_name_split)] != block_name_split:
continue
block_name_to_last_layer_name[block_name] = n
last_layer_name_to_block_name = {v: k for k, v in block_name_to_last_layer_name.items()}
model.last_layer_name_to_block_name = last_layer_name_to_block_name
names_in_blocks = []
for block_name in block_names_flatten:
block = get_module(model, block_name)
for n, m in block.named_modules():
if check_to_quantized(m):
names_in_blocks.append(m.global_name)

if name in model.last_layer_name_to_block_name:
# Packing block
Expand Down
2 changes: 1 addition & 1 deletion auto_round/export/export_to_gguf/packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,7 @@ def q2_k_quant_block(blocks, scale=None, wmin=None, d_scale=None, d_wmin=None, i
else:
from auto_round.data_type.gguf import quant_tensor_gguf_asym_dq

blocks.reshape(blocks.shape[0], -1)
blocks = blocks.reshape(blocks.shape[0], -1)
blocks, scales, mins = quant_tensor_gguf_asym_dq(blocks, bits=2, scale_dtype=torch.float32, imatrix=imatrix)
scales, d_scale = scales["scale"], scales["d_scale"]
mins, d_wmin = mins["wmin"], mins["d_wmin"]
Expand Down
29 changes: 29 additions & 0 deletions test/test_cuda/export/test_gguf_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,35 @@ def test_q2k_mixed(self):

shutil.rmtree(saved_tiny_model_path, ignore_errors=True)

@require_gguf
def test_q2_k_s_ffn_down_q4k(self):
"""Verify blk.0.ffn_down.weight is Q4_K in gguf:q2_k_s format.
Blocks where i_layer < n_layer/8 should use Q4_K instead of Q2_K for ffn_down."""
from gguf.gguf_reader import GGUFReader

model_path = get_model_path("Qwen/Qwen3-1.7B")
tiny_model_path = "./tmp/tiny_qwen3_1b"
save_tiny_model(model_path, tiny_model_path, num_layers=8)
autoround = AutoRound(
tiny_model_path,
iters=0,
nsamples=1,
seqlen=16,
disable_opt_rtn=True,
)
quantized_model_path = self.save_dir
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q2_k_s")
gguf_file = os.listdir(quantized_model_path)[0]
gguf_model = GGUFReader(os.path.join(quantized_model_path, gguf_file))
ffn_down_type = None
for tensor in gguf_model.tensors:
if tensor.name == "blk.0.ffn_down.weight":
ffn_down_type = tensor.tensor_type.name
break
assert ffn_down_type is not None, "blk.0.ffn_down.weight not found in GGUF file"
assert ffn_down_type == "Q4_K", f"Expected Q4_K for blk.0.ffn_down.weight but got {ffn_down_type}"
shutil.rmtree(tiny_model_path, ignore_errors=True)

@pytest.mark.skip_ci(reason="Only tiny model is suggested for CI")
def test_gguf_baseline(self):
model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct")
Expand Down
Loading