From 74d4f4e57625c29f60ba9037f6b8ac2abe27769a Mon Sep 17 00:00:00 2001 From: lkk12014402 Date: Tue, 7 Apr 2026 09:11:43 +0000 Subject: [PATCH 01/13] fix hadamard transform weight dtype, using float64 as default. Signed-off-by: lkk12014402 --- auto_round/experimental/transform/apply.py | 20 +++++++++++++------ .../experimental/transform/hadamards.py | 16 ++++++++++++++- .../experimental/transform/triton/mxfp4.py | 5 +++++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index 6980d75e4..93b306683 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -100,7 +100,6 @@ def _apply_to_module( location="input", inverse=True, device="cpu", - precision=module.dtype, ) if config.hadamard_type != "random_hadamard": @@ -115,6 +114,7 @@ def input_hook(self, args): input = args[0] # transform(input) orig_shape = input.shape + orig_dtype = input.dtype x_flat = input.contiguous().flatten(end_dim=-2) qdq_input, _ = mxfp4_forward_kernel_wrapper( x_flat, @@ -122,7 +122,7 @@ def input_hook(self, args): hadamard_weight if hadamard_weight is not None else self.hadamard_matrix.T ), # this matrix from w_transform, needs transpose ) - return qdq_input.reshape(orig_shape) + return qdq_input.reshape(orig_shape).to(orig_dtype) # for fused transform + quantization kernel module.pre_dequantized_input = True @@ -135,13 +135,23 @@ def input_hook(self, args): input = args[0] ori_shape = input.shape + orig_dtype = input.dtype if hadamard_weight is not None: input = input.view(-1, hadamard_weight.shape[0]) - return _multihead_matmul(input, hadamard_weight.to(input.device)).view(ori_shape) + return ( + _multihead_matmul( + input.to(hadamard_weight.dtype), + hadamard_weight.to(input.device) + ) + ).view(ori_shape).to(orig_dtype) else: input = input.view(-1, self.hadamard_matrix.shape[0]) - return _multihead_matmul(input, self.hadamard_matrix.T).view(ori_shape) + return ( + _multihead_matmul( + input.to(self.hadamard_matrix.dtype), + self.hadamard_matrix.T) + ).view(ori_shape).to(orig_dtype) # for fused transform + quantization kernel module.pre_dequantized_input = False @@ -156,7 +166,6 @@ def input_hook(self, args): **config.dict(), location="weight", device=module.weight.device, - precision=module.weight.dtype, ) # need save random hadamard matrix needed when inference @@ -180,7 +189,6 @@ def input_hook(self, args): location="input", inverse=True, device=module.weight.device, - precision=module.weight.dtype, ) patch_wrapperlinear_to_apply_transform(weight_hadamard_transform, input_hadamard_transform) diff --git a/auto_round/experimental/transform/hadamards.py b/auto_round/experimental/transform/hadamards.py index 712232a9a..dea423dd6 100644 --- a/auto_round/experimental/transform/hadamards.py +++ b/auto_round/experimental/transform/hadamards.py @@ -34,11 +34,25 @@ def __init__( self, block_size: int = 32, device: torch.device = None, - precision: torch.dtype = None, + precision: torch.dtype = torch.float64, location: str = "weight", module_type: type[torch.nn.Module] = torch.nn.Linear, inverse: bool = False, ): + """Initialize a Hadamard transform module. + + Args: + block_size: Size of each Hadamard block. The input tensor is reshaped + to ``(-1, block_size)`` before applying the transform. + device: Device on which to create the Hadamard matrix. + precision: Data type used for the Hadamard matrix weights, using float64 as default. + location: Target location used by ``apply_transform_weight`` when + applying the transform. + module_type: Module type associated with the transform application, + typically ``torch.nn.Linear``. + inverse: Whether to build the inverse form of the transform. + """ + super().__init__() self.size = block_size self.scale = 1 / math.sqrt(self.size) diff --git a/auto_round/experimental/transform/triton/mxfp4.py b/auto_round/experimental/transform/triton/mxfp4.py index c26413248..8028c167b 100644 --- a/auto_round/experimental/transform/triton/mxfp4.py +++ b/auto_round/experimental/transform/triton/mxfp4.py @@ -161,6 +161,11 @@ def mxfp4_forward_kernel_wrapper( if hadamard_matrix.device != device: hadamard_matrix = hadamard_matrix.to(device) + dtype = hadamard_matrix.dtype + + if x.dtype != dtype: + x = x.to(dtype) + # Make sure inputs are contiguous x = x.contiguous() hadamard_matrix = hadamard_matrix.contiguous() From aa06e4391d1be3bff4bfac0f232a13d755bda0e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 09:07:25 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/experimental/transform/apply.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index 93b306683..2d0d4225d 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -140,18 +140,17 @@ def input_hook(self, args): if hadamard_weight is not None: input = input.view(-1, hadamard_weight.shape[0]) return ( - _multihead_matmul( - input.to(hadamard_weight.dtype), - hadamard_weight.to(input.device) - ) - ).view(ori_shape).to(orig_dtype) + (_multihead_matmul(input.to(hadamard_weight.dtype), hadamard_weight.to(input.device))) + .view(ori_shape) + .to(orig_dtype) + ) else: input = input.view(-1, self.hadamard_matrix.shape[0]) return ( - _multihead_matmul( - input.to(self.hadamard_matrix.dtype), - self.hadamard_matrix.T) - ).view(ori_shape).to(orig_dtype) + (_multihead_matmul(input.to(self.hadamard_matrix.dtype), self.hadamard_matrix.T)) + .view(ori_shape) + .to(orig_dtype) + ) # for fused transform + quantization kernel module.pre_dequantized_input = False From 928b155183a11a2950ccb957c5e616b38fa7cd74 Mon Sep 17 00:00:00 2001 From: lkk12014402 Date: Tue, 7 Apr 2026 10:52:10 +0000 Subject: [PATCH 03/13] float32 maybe enough for hadamard transform. Signed-off-by: lkk12014402 --- auto_round/experimental/transform/hadamards.py | 2 +- auto_round/experimental/transform/utils/hadamard.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/auto_round/experimental/transform/hadamards.py b/auto_round/experimental/transform/hadamards.py index dea423dd6..79223fae5 100644 --- a/auto_round/experimental/transform/hadamards.py +++ b/auto_round/experimental/transform/hadamards.py @@ -34,7 +34,7 @@ def __init__( self, block_size: int = 32, device: torch.device = None, - precision: torch.dtype = torch.float64, + precision: torch.dtype = torch.float32, location: str = "weight", module_type: type[torch.nn.Module] = torch.nn.Linear, inverse: bool = False, diff --git a/auto_round/experimental/transform/utils/hadamard.py b/auto_round/experimental/transform/utils/hadamard.py index 5ec6bccbd..5c7ade385 100644 --- a/auto_round/experimental/transform/utils/hadamard.py +++ b/auto_round/experimental/transform/utils/hadamard.py @@ -70,8 +70,8 @@ def random_hadamard_matrix( :param gen: Optional generator random values :return: randomly generated hadamard matrix """ - Q = torch.randint(low=0, high=2, size=(size,), generator=gen, dtype=dtype) # cpu - Q = Q.to(device=device) + Q = torch.randint(low=0, high=2, size=(size,), generator=gen) # cpu + Q = Q.to(device=device, dtype=dtype) Q = Q * 2 - 1 Q = torch.diag(Q) return _matmul_hadU(Q) From c67b95d70466ee78126129ecafd9a0bdbf3b9dbf Mon Sep 17 00:00:00 2001 From: lkk12014402 Date: Wed, 8 Apr 2026 08:56:33 +0000 Subject: [PATCH 04/13] in-place weight when auto-round tuning. Signed-off-by: lkk12014402 --- auto_round/experimental/transform/apply.py | 52 ++++++------- .../experimental/transform/patch_modules.py | 78 +++++-------------- 2 files changed, 43 insertions(+), 87 deletions(-) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index 2d0d4225d..2a0b88298 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -140,17 +140,18 @@ def input_hook(self, args): if hadamard_weight is not None: input = input.view(-1, hadamard_weight.shape[0]) return ( - (_multihead_matmul(input.to(hadamard_weight.dtype), hadamard_weight.to(input.device))) - .view(ori_shape) - .to(orig_dtype) - ) + _multihead_matmul( + input.to(hadamard_weight.dtype), + hadamard_weight.to(input.device) + ) + ).view(ori_shape).to(orig_dtype) else: input = input.view(-1, self.hadamard_matrix.shape[0]) return ( - (_multihead_matmul(input.to(self.hadamard_matrix.dtype), self.hadamard_matrix.T)) - .view(ori_shape) - .to(orig_dtype) - ) + _multihead_matmul( + input.to(self.hadamard_matrix.dtype), + self.hadamard_matrix.T) + ).view(ori_shape).to(orig_dtype) # for fused transform + quantization kernel module.pre_dequantized_input = False @@ -175,30 +176,23 @@ def input_hook(self, args): patch_quantlinear(config.hadamard_type) - if need_calibration: - # for training, the weight changes with every forward pass - # for autoround tuning: patch wrapper linear qdq_weight func - from auto_round.experimental.transform.patch_modules import ( - patch_wrapperlinear_to_apply_transform, - patch_wrapperwalayer_forward_to_apply_transform, - ) + # for autoround tuning: weight not tuning + # for rtn: weight transformed before saving + from auto_round.experimental.transform.patch_modules import ( + patch_wrapperlinear_to_apply_transform, + patch_wrapperwalayer_forward_to_apply_transform, + ) - input_hadamard_transform = build_hadamard_transform( - **config.dict(), - location="input", - inverse=True, - device=module.weight.device, - ) + input_hadamard_transform = build_hadamard_transform( + **config.dict(), + location="input", + inverse=True, + device=module.weight.device, + ) - patch_wrapperlinear_to_apply_transform(weight_hadamard_transform, input_hadamard_transform) - patch_wrapperwalayer_forward_to_apply_transform(input_hadamard_transform) + patch_wrapperlinear_to_apply_transform(weight_hadamard_transform, input_hadamard_transform) + patch_wrapperwalayer_forward_to_apply_transform(input_hadamard_transform) - else: - # transform is no longer needed (unfusing is not supported) - # delattr(module, transform_name) - # fuse transform into weight - with torch.no_grad(): - getattr(module, "weight").copy_(weight_hadamard_transform(module.weight).to(module.weight.device)) else: # TODO: apply transform to output/q/k diff --git a/auto_round/experimental/transform/patch_modules.py b/auto_round/experimental/transform/patch_modules.py index 934ebea9d..a7f6d1d9e 100644 --- a/auto_round/experimental/transform/patch_modules.py +++ b/auto_round/experimental/transform/patch_modules.py @@ -32,67 +32,29 @@ def _qdq_weight_patched(self, value, min_scale, max_scale): # keep original behavior for >=16bit to avoid changing semantics unexpectedly return orig_qdq_weight(self, value, min_scale, max_scale) - min_scale.data.clamp_(0, 1.0) - max_scale.data.clamp_(0, 1.0) - - weight = self.orig_layer.weight - if weight.device.type == "meta": - weight = self.orig_layer.get_weight().to(self.device) - - is_conv1d = type(self.orig_layer) == transformers.pytorch_utils.Conv1D - if is_conv1d: - weight = weight.t() - - weight = weight.to(self.device) - - weight_t = w_transform(weight) - - quant_kwargs = {} - if hasattr(self.orig_layer, "super_bits"): - quant_kwargs["super_bits"] = self.orig_layer.super_bits - quant_kwargs["super_group_size"] = self.orig_layer.super_group_size - - weight_q, scale, zp = self.weight_quant_func( - weight_t, - bits=self.orig_layer.bits, - group_size=self.orig_layer.group_size, - v=value, - min_scale=min_scale, - max_scale=max_scale, - scale_dtype=self.orig_layer.scale_dtype, - tensor_min=self.weight_min, - tensor_max=self.weight_max, - data_type=self.data_type, - q_scale_thresh=self.q_scale_thresh, - imatrix=self.orig_layer.imatrix.to(self.device) if hasattr(self.orig_layer, "imatrix") else None, - global_scale=getattr(self, "weight_global_scale", None), - **quant_kwargs, - ) - - weight_q = weight_q.to(dtype=weight.dtype) - - if is_conv1d: - weight_q = weight_q.t() - - return weight_q, scale, zp - + if getattr(self, "applied_weight_hadamard", None) is None: + with torch.no_grad(): + weight = self.orig_layer.weight + if weight.device.type == "meta": + weight = self.orig_layer.get_weight().to(self.device) + + is_conv1d = type(self.orig_layer) == transformers.pytorch_utils.Conv1D + if is_conv1d: + weight = weight.t().continuous() + new_weight = w_transform(weight) + if is_conv1d: + new_weight = weight.t().continuous() + self.orig_layer.weight.data.copy_(new_weight) + self.applied_weight_hadamard = True + + return orig_qdq_weight(self, value, min_scale, max_scale) + + orig_qdq_act = WrapperLinear._qdq_act def _qdq_act_patched(self, x, act_max_scale, act_max=None): - # transform = getattr(self.orig_layer, transform_attr) x = inp_transform(x) - act_max_scale.data.clamp_(0, 1.0) - x, scale, zp = self.act_quant_func( - x, - bits=self.orig_layer.act_bits, - group_size=self.orig_layer.act_group_size, - scale_dtype=self.orig_layer.scale_dtype, - q_scale_thresh=self.q_scale_thresh, - data_type=self.act_data_type, - max_scale=act_max_scale, - tensor_max=act_max, - global_scale=getattr(self, "input_global_scale", None), - ) - return x, scale, zp + + return orig_qdq_act(self, x, act_max_scale, act_max) WrapperLinear._qdq_weight = _qdq_weight_patched WrapperLinear._qdq_act = _qdq_act_patched From 4700eb234ebaf2560638c18ce3d536c68babfd7f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:51:08 +0000 Subject: [PATCH 05/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/experimental/transform/apply.py | 18 ++++++++---------- .../experimental/transform/patch_modules.py | 1 + 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index 2a0b88298..d90aa9b2f 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -140,18 +140,17 @@ def input_hook(self, args): if hadamard_weight is not None: input = input.view(-1, hadamard_weight.shape[0]) return ( - _multihead_matmul( - input.to(hadamard_weight.dtype), - hadamard_weight.to(input.device) - ) - ).view(ori_shape).to(orig_dtype) + (_multihead_matmul(input.to(hadamard_weight.dtype), hadamard_weight.to(input.device))) + .view(ori_shape) + .to(orig_dtype) + ) else: input = input.view(-1, self.hadamard_matrix.shape[0]) return ( - _multihead_matmul( - input.to(self.hadamard_matrix.dtype), - self.hadamard_matrix.T) - ).view(ori_shape).to(orig_dtype) + (_multihead_matmul(input.to(self.hadamard_matrix.dtype), self.hadamard_matrix.T)) + .view(ori_shape) + .to(orig_dtype) + ) # for fused transform + quantization kernel module.pre_dequantized_input = False @@ -193,7 +192,6 @@ def input_hook(self, args): patch_wrapperlinear_to_apply_transform(weight_hadamard_transform, input_hadamard_transform) patch_wrapperwalayer_forward_to_apply_transform(input_hadamard_transform) - else: # TODO: apply transform to output/q/k raise NotImplementedError() diff --git a/auto_round/experimental/transform/patch_modules.py b/auto_round/experimental/transform/patch_modules.py index a7f6d1d9e..e099a518d 100644 --- a/auto_round/experimental/transform/patch_modules.py +++ b/auto_round/experimental/transform/patch_modules.py @@ -50,6 +50,7 @@ def _qdq_weight_patched(self, value, min_scale, max_scale): return orig_qdq_weight(self, value, min_scale, max_scale) orig_qdq_act = WrapperLinear._qdq_act + def _qdq_act_patched(self, x, act_max_scale, act_max=None): x = inp_transform(x) From 43ff2c6703a14cc196d36c1ebd35e78ef5a92591 Mon Sep 17 00:00:00 2001 From: lkk12014402 Date: Thu, 9 Apr 2026 11:00:32 +0000 Subject: [PATCH 06/13] support nvfp4. Signed-off-by: lkk12014402 --- auto_round/compressors/base.py | 10 +-- auto_round/experimental/qmodules/__init__.py | 2 +- auto_round/experimental/qmodules/nvfp4.py | 18 ++++ auto_round/experimental/transform/apply.py | 12 +-- .../experimental/transform/hadamards.py | 18 +++- auto_round/experimental/utils.py | 89 ++++++++++++++----- auto_round/inference/backend.py | 4 + auto_round/inference/convert_model.py | 6 +- 8 files changed, 120 insertions(+), 39 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a8735407e..ff6886c97 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -560,15 +560,11 @@ def __init__( # apply hadamard transform if hadamard_config: from auto_round.experimental.transform.apply import apply_hadamard_transform - from auto_round.experimental.utils import check_supported_schemes, normalize_hadamard_config + from auto_round.experimental.utils import normalize_hadamard_config - check_supported_schemes(self.scheme) + self.hadamard_config = normalize_hadamard_config(hadamard_config, self.scheme) + self.model = apply_hadamard_transform(self.model, self.hadamard_config) - self.model = apply_hadamard_transform( - self.model, hadamard_config, need_calibration=True if self.iters > 0 else False - ) - - self.hadamard_config = normalize_hadamard_config(hadamard_config) def _gen_auto_scheme(self) -> dict[str, dict]: if self.mllm: diff --git a/auto_round/experimental/qmodules/__init__.py b/auto_round/experimental/qmodules/__init__.py index 3862e0293..377784055 100644 --- a/auto_round/experimental/qmodules/__init__.py +++ b/auto_round/experimental/qmodules/__init__.py @@ -13,5 +13,5 @@ # limitations under the License. from auto_round.experimental.qmodules.mx import MXFP4QuantLinear, MXFP8QuantLinear, HadamardMXFP4QuantLinear -from auto_round.experimental.qmodules.nvfp4 import NVFP4QuantLinear +from auto_round.experimental.qmodules.nvfp4 import NVFP4QuantLinear, HadamardNVFP4QuantLinear from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear diff --git a/auto_round/experimental/qmodules/nvfp4.py b/auto_round/experimental/qmodules/nvfp4.py index 81aea8b54..c82846f44 100644 --- a/auto_round/experimental/qmodules/nvfp4.py +++ b/auto_round/experimental/qmodules/nvfp4.py @@ -204,3 +204,21 @@ def unpack_data(self, packed_data: torch.Tensor) -> torch.Tensor: m, half_n = packed_data.shape unpacked_data = unpack_fp4_from_uint8(packed_data, m, half_n * 2, dtype=self.dtype) return unpacked_data + + +class HadamardNVFP4QuantLinear(NVFP4QuantLinear): + """ + Quantized linear layer using the NVFP4 quantization scheme. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.enable_transform = True + self.register_buffer( + "hadamard_matrix", + torch.empty( + self.group_size, + self.group_size, + dtype=self.dtype, + ), + ) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index d90aa9b2f..4625d4361 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -4,7 +4,7 @@ import torch import tqdm -from auto_round.experimental.qmodules.mx import MXQuantLinearBase +from auto_round.experimental.qmodules.base import QModuleBase from auto_round.experimental.transform.hadamard_config import HadamardConfig from auto_round.experimental.transform.hadamards import build_hadamard_transform from auto_round.experimental.utils import is_triton_kernel_available, normalize_hadamard_config @@ -15,10 +15,10 @@ def apply_hadamard_transform( model: torch.nn.Module, config: str | dict | HadamardConfig | None, - need_calibration: bool = False, location: str = "weight", use_tqdm=True, desc=None, + data_type="mx_fp" ): """ Apply a transform configuration to a model. @@ -60,14 +60,14 @@ def apply_hadamard_transform( modules_config = [ (name, module, config) for name, module in model.named_modules() - if isinstance(module, torch.nn.Linear) or isinstance(module, MXQuantLinearBase) + if isinstance(module, torch.nn.Linear) or isinstance(module, QModuleBase) ] desc = f"Applying {config.hadamard_type} transforms" if desc is None else desc for name, module, config in tqdm.tqdm(modules_config, desc=desc, disable=(not use_tqdm)): if "lm_head" in name: continue - _apply_to_module(model, module, config, need_calibration, location) + _apply_to_module(model, module, config, location, data_type) # attach config to model for compression/serialization setattr(model, "hadamard_config", config) @@ -79,8 +79,8 @@ def _apply_to_module( model: torch.nn.Module, module: torch.nn.Module, config: HadamardConfig, - need_calibration: bool = False, location: str = "weight", + data_type: str = "mx_fp" ): """ Create transforms and apply them to the module @@ -107,7 +107,7 @@ def _apply_to_module( else: hadamard_weight = None - if is_triton_kernel_available(): + if is_triton_kernel_available(data_type): from auto_round.experimental.transform.triton.mxfp4 import mxfp4_forward_kernel_wrapper def input_hook(self, args): diff --git a/auto_round/experimental/transform/hadamards.py b/auto_round/experimental/transform/hadamards.py index 79223fae5..0d29e5cb0 100644 --- a/auto_round/experimental/transform/hadamards.py +++ b/auto_round/experimental/transform/hadamards.py @@ -92,10 +92,14 @@ def forward(self, x: torch.Tensor): class RandomHadamardTransform(HadamardTransform): def __init__( self, - *args, + block_size: int = 32, + device: torch.device = None, + precision: torch.dtype = None, + location: str = "weight", + module_type: type[torch.nn.Module] = torch.nn.Linear, + inverse: bool = False, seed: int | None = None, generator: torch.Generator | None = None, - **kwargs, ): if generator is not None: self.generator = generator @@ -103,7 +107,15 @@ def __init__( self.generator = torch.Generator() if seed is not None: self.generator.manual_seed(seed) - super().__init__(*args, **kwargs) + + super().__init__( + block_size=block_size, + device=device, + precision=precision, + location=location, + module_type=module_type, + inverse=inverse, + ) def _create_weight( self, diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index 39a7ff135..1e9b8edb7 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -20,7 +20,7 @@ from auto_round.experimental.transform.hadamards import HADAMARDS from auto_round.utils import logger -SUPPORTED_QUANTIZATION_SCHEMES = ["MXFP4"] +SUPPORTED_QUANTIZATION_SCHEMES = ["MXFP4", "NVFP4"] def per_tensor_fp8_qdq( @@ -114,10 +114,12 @@ def clean_model_parameters_and_buffers_(model: torch.nn.Module, name_tuple: tupl _clean_param_or_buff_if_exists(module, name_tuple) -def is_triton_kernel_available() -> bool: +def is_triton_kernel_available(data_type: str) -> bool: """ Best-effort check for whether Triton kernel path can be used. """ + if is_nv_fp(data_type): + return False try: import triton # pylint: disable=E0401 except Exception: @@ -134,62 +136,107 @@ def is_triton_kernel_available() -> bool: return True -def normalize_hadamard_config(hadamard_config: str | dict | HadamardConfig | None) -> dict[str, Any]: +def normalize_hadamard_config( + hadamard_config: str | dict | HadamardConfig | None, scheme: str +) -> dict[str, Any]: """ Normalize and validate `hadamard_config`. Supported input types: - - None -> {} - - dict -> validated via HadamardConfig + - None -> {} + - dict -> validated via HadamardConfig - HadamardConfig -> validated & converted to dict - - str -> shorthand for `transform_type` in TRANSFORMS keys - - On any validation failure, raises ValueError/TypeError. + - str -> shorthand for `hadamard_type` in HADAMARDS keys + + Additional behavior: + - If block_size is not set: + - MXFP4 -> default block_size to 32 + - NVFP4 -> default block_size to 16 + - other schemes -> emit a warning + - If block_size is set but does not match the recommended value: + - MXFP4 expects 32 + - NVFP4 expects 16 + - emit a warning """ + + check_supported_schemes(scheme) + + def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: + block_size = cfg_dict.get("block_size") + + if block_size is None: + if scheme == "MXFP4": + cfg_dict["block_size"] = 32 + logger.warning("block_size is not set for scheme 'MXFP4'; defaulting to 32.") + elif scheme == "NVFP4": + cfg_dict["block_size"] = 16 + logger.warning("block_size is not set for scheme 'NVFP4'; defaulting to 16.") + else: + logger.warning( + f"block_size is not set and cannot be inferred for scheme {scheme!r}; " + "please set block_size explicitly in hadamard_config if needed." + ) + else: + if scheme == "MXFP4" and block_size != 32: + logger.warning(f"scheme is 'MXFP4' but block_size={block_size}; recommended value is 32.") + elif scheme == "NVFP4" and block_size != 16: + logger.warning(f"scheme is 'NVFP4' but block_size={block_size}; recommended value is 16.") + + return cfg_dict + + # 1) None -> {} if hadamard_config is None: return {} # 2) Already a HadamardConfig instance if isinstance(hadamard_config, HadamardConfig): - # Ensure it passes its own validation and convert to dict - cfg = HadamardConfig.model_validate(hadamard_config).model_dump() - return cfg + try: + cfg_dict = HadamardConfig.model_validate(hadamard_config).model_dump() + cfg_dict = _apply_scheme_block_size(cfg_dict) + return HadamardConfig.model_validate(cfg_dict).model_dump() + except Exception as e: + raise ValueError(f"Invalid HadamardConfig: {e}") from e # 3) dict -> validate via HadamardConfig if isinstance(hadamard_config, dict): try: - cfg = HadamardConfig.model_validate(hadamard_config).model_dump() + cfg_dict = HadamardConfig.model_validate(hadamard_config).model_dump() + cfg_dict = _apply_scheme_block_size(cfg_dict) + return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as e: raise ValueError(f"Invalid hadamard_config dict: {e}") from e - return cfg - # 4) str -> shorthand for transform_type + # 4) str -> shorthand for hadamard_type if isinstance(hadamard_config, str): key = hadamard_config.strip() if not key: return {} if key == "default": - cfg = HadamardConfig() - return cfg.model_dump() + cfg_dict = HadamardConfig().model_dump() + cfg_dict = _apply_scheme_block_size(cfg_dict) + try: + return HadamardConfig.model_validate(cfg_dict).model_dump() + except Exception as e: + raise ValueError(f"Invalid default hadamard_config after scheme adjustment: {e}") from e if key not in HADAMARDS: raise ValueError( - f"Invalid hadamard_config string: {key!r}. " f"Expected one of {sorted(HADAMARDS.keys())}." + f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}." ) cfg_dict = {"hadamard_type": key} + cfg_dict = _apply_scheme_block_size(cfg_dict) try: - cfg = HadamardConfig.model_validate(cfg_dict).model_dump() + return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as e: raise ValueError(f"hadamard_config built from string {key!r} is invalid for HadamardConfig: {e}") from e - return cfg - raise TypeError( - "hadamard_config must be one of: None, dict, HadamardConfig, or str " f"(got {type(hadamard_config).__name__})" + "hadamard_config must be one of: None, dict, HadamardConfig, or str " + f"(got {type(hadamard_config).__name__})" ) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index d98545679..609091ce3 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -781,6 +781,10 @@ def dynamic_import_inference_linear(backend, config): return ar_qmodules.HadamardMXFP4QuantLinear return ar_qmodules.MXFP4QuantLinear if "torch_nvfp4" in backend: + hadamard_config = getattr(config, "hadamard_config", None) + if hadamard_config is not None and hadamard_config: + if hadamard_config["hadamard_type"] == "random_hadamard": + return ar_qmodules.HadamardNVFP4QuantLinear return ar_qmodules.NVFP4QuantLinear if "auto_round_kernel" in backend or "ark" in backend: diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index a5b9096b3..f544ca58e 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -687,7 +687,11 @@ def convert_hf_model(model: nn.Module, target_device: str = "cpu") -> tuple[nn.M hadamard_type=hadamard_config["hadamard_type"], ) # apply to activation model = apply_hadamard_transform( - model, act_hadamard_config, location="input", desc="Register pre forward hook for hadamard transform" + model, + act_hadamard_config, + location="input", + desc="Register pre forward hook for hadamard transform", + data_type=quantization_config.data_type ) # Suggest a better backend if available From 36d314d5b6c8eb25c0ef7c89dd5942da73728006 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 10:55:11 +0000 Subject: [PATCH 07/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - auto_round/experimental/transform/apply.py | 4 ++-- auto_round/experimental/utils.py | 12 +++--------- auto_round/inference/convert_model.py | 2 +- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index ff6886c97..21a56f51b 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -565,7 +565,6 @@ def __init__( self.hadamard_config = normalize_hadamard_config(hadamard_config, self.scheme) self.model = apply_hadamard_transform(self.model, self.hadamard_config) - def _gen_auto_scheme(self) -> dict[str, dict]: if self.mllm: logger.info("AutoScheme is not yet supported for multimodal LLMs.") diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index 4625d4361..1c8e3deef 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -18,7 +18,7 @@ def apply_hadamard_transform( location: str = "weight", use_tqdm=True, desc=None, - data_type="mx_fp" + data_type="mx_fp", ): """ Apply a transform configuration to a model. @@ -80,7 +80,7 @@ def _apply_to_module( module: torch.nn.Module, config: HadamardConfig, location: str = "weight", - data_type: str = "mx_fp" + data_type: str = "mx_fp", ): """ Create transforms and apply them to the module diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index 1e9b8edb7..ccd3da0cf 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -136,9 +136,7 @@ def is_triton_kernel_available(data_type: str) -> bool: return True -def normalize_hadamard_config( - hadamard_config: str | dict | HadamardConfig | None, scheme: str -) -> dict[str, Any]: +def normalize_hadamard_config(hadamard_config: str | dict | HadamardConfig | None, scheme: str) -> dict[str, Any]: """ Normalize and validate `hadamard_config`. @@ -184,7 +182,6 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: return cfg_dict - # 1) None -> {} if hadamard_config is None: return {} @@ -222,9 +219,7 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: raise ValueError(f"Invalid default hadamard_config after scheme adjustment: {e}") from e if key not in HADAMARDS: - raise ValueError( - f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}." - ) + raise ValueError(f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}.") cfg_dict = {"hadamard_type": key} cfg_dict = _apply_scheme_block_size(cfg_dict) @@ -235,8 +230,7 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: raise ValueError(f"hadamard_config built from string {key!r} is invalid for HadamardConfig: {e}") from e raise TypeError( - "hadamard_config must be one of: None, dict, HadamardConfig, or str " - f"(got {type(hadamard_config).__name__})" + "hadamard_config must be one of: None, dict, HadamardConfig, or str " f"(got {type(hadamard_config).__name__})" ) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index f544ca58e..2e2002af0 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -691,7 +691,7 @@ def convert_hf_model(model: nn.Module, target_device: str = "cpu") -> tuple[nn.M act_hadamard_config, location="input", desc="Register pre forward hook for hadamard transform", - data_type=quantization_config.data_type + data_type=quantization_config.data_type, ) # Suggest a better backend if available From 6d69b0e5ef4e8d0c1c752de523b79cb5f192b3a6 Mon Sep 17 00:00:00 2001 From: lkk12014402 Date: Thu, 9 Apr 2026 11:09:28 +0000 Subject: [PATCH 08/13] fix typo. Signed-off-by: lkk12014402 --- auto_round/compressors/base.py | 2 +- auto_round/experimental/transform/apply.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 21a56f51b..074ffcf1c 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -563,7 +563,7 @@ def __init__( from auto_round.experimental.utils import normalize_hadamard_config self.hadamard_config = normalize_hadamard_config(hadamard_config, self.scheme) - self.model = apply_hadamard_transform(self.model, self.hadamard_config) + self.model = apply_hadamard_transform(self.model, self.hadamard_config, scheme=self.scheme) def _gen_auto_scheme(self) -> dict[str, dict]: if self.mllm: diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index 1c8e3deef..aeef57023 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -19,6 +19,7 @@ def apply_hadamard_transform( use_tqdm=True, desc=None, data_type="mx_fp", + scheme="MXFP4" ): """ Apply a transform configuration to a model. @@ -53,7 +54,7 @@ def apply_hadamard_transform( ``config.transform_type``. """ - config = normalize_hadamard_config(config) + config = normalize_hadamard_config(config, scheme) if not isinstance(config, HadamardConfig): config = HadamardConfig(**config) From 95436ed7ced33f011181b0a0342f6f34acb293aa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:04:00 +0000 Subject: [PATCH 09/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/experimental/transform/apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/experimental/transform/apply.py b/auto_round/experimental/transform/apply.py index aeef57023..d99e0b928 100644 --- a/auto_round/experimental/transform/apply.py +++ b/auto_round/experimental/transform/apply.py @@ -19,7 +19,7 @@ def apply_hadamard_transform( use_tqdm=True, desc=None, data_type="mx_fp", - scheme="MXFP4" + scheme="MXFP4", ): """ Apply a transform configuration to a model. From c558effce566b44420f8d31e693cc8c5b61df612 Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Thu, 9 Apr 2026 19:07:05 +0800 Subject: [PATCH 10/13] fix import issue. --- auto_round/experimental/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index ccd3da0cf..92445e7dd 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -15,7 +15,7 @@ from typing import Any import torch - +from auto_round.compressors.utils import is_nv_fp from auto_round.experimental.transform.hadamard_config import HadamardConfig from auto_round.experimental.transform.hadamards import HADAMARDS from auto_round.utils import logger From 037aad617c7564c75e738fcd9a03a7083dbe336e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:07:38 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/experimental/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index 92445e7dd..7f65a3231 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -15,6 +15,7 @@ from typing import Any import torch + from auto_round.compressors.utils import is_nv_fp from auto_round.experimental.transform.hadamard_config import HadamardConfig from auto_round.experimental.transform.hadamards import HADAMARDS From 65092ff56e2b5c3aa6b5c402b58f3a2bcc19c92e Mon Sep 17 00:00:00 2001 From: lkk <33276950+lkk12014402@users.noreply.github.com> Date: Thu, 9 Apr 2026 19:36:15 +0800 Subject: [PATCH 12/13] enhance the function `normalize_hadamard_config` --- auto_round/experimental/utils.py | 54 ++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index 7f65a3231..11759af22 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -137,7 +137,9 @@ def is_triton_kernel_available(data_type: str) -> bool: return True -def normalize_hadamard_config(hadamard_config: str | dict | HadamardConfig | None, scheme: str) -> dict[str, Any]: +def normalize_hadamard_config( + hadamard_config: str | dict | HadamardConfig | None, scheme: str +) -> dict[str, Any]: """ Normalize and validate `hadamard_config`. @@ -148,7 +150,7 @@ def normalize_hadamard_config(hadamard_config: str | dict | HadamardConfig | Non - str -> shorthand for `hadamard_type` in HADAMARDS keys Additional behavior: - - If block_size is not set: + - If block_size is not set by user: - MXFP4 -> default block_size to 32 - NVFP4 -> default block_size to 16 - other schemes -> emit a warning @@ -158,16 +160,18 @@ def normalize_hadamard_config(hadamard_config: str | dict | HadamardConfig | Non - emit a warning """ - check_supported_schemes(scheme) + def _normalize_scheme(s: str) -> str: + return s.strip().upper() - def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: + def _apply_scheme_block_size(cfg_dict: dict[str, Any], block_size_explicitly_set: bool) -> dict[str, Any]: + normalized_scheme = _normalize_scheme(scheme) block_size = cfg_dict.get("block_size") - if block_size is None: - if scheme == "MXFP4": + if not block_size_explicitly_set or block_size is None: + if normalized_scheme == "MXFP4": cfg_dict["block_size"] = 32 logger.warning("block_size is not set for scheme 'MXFP4'; defaulting to 32.") - elif scheme == "NVFP4": + elif normalized_scheme == "NVFP4": cfg_dict["block_size"] = 16 logger.warning("block_size is not set for scheme 'NVFP4'; defaulting to 16.") else: @@ -176,9 +180,9 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: "please set block_size explicitly in hadamard_config if needed." ) else: - if scheme == "MXFP4" and block_size != 32: + if normalized_scheme == "MXFP4" and block_size != 32: logger.warning(f"scheme is 'MXFP4' but block_size={block_size}; recommended value is 32.") - elif scheme == "NVFP4" and block_size != 16: + elif normalized_scheme == "NVFP4" and block_size != 16: logger.warning(f"scheme is 'NVFP4' but block_size={block_size}; recommended value is 16.") return cfg_dict @@ -187,20 +191,27 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: if hadamard_config is None: return {} - # 2) Already a HadamardConfig instance + # 2) HadamardConfig instance if isinstance(hadamard_config, HadamardConfig): + raw_cfg_dict = hadamard_config.model_dump(exclude_unset=True) + block_size_explicitly_set = "block_size" in raw_cfg_dict + + cfg_dict = dict(raw_cfg_dict) + cfg_dict = _apply_scheme_block_size(cfg_dict, block_size_explicitly_set) + try: - cfg_dict = HadamardConfig.model_validate(hadamard_config).model_dump() - cfg_dict = _apply_scheme_block_size(cfg_dict) return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as e: raise ValueError(f"Invalid HadamardConfig: {e}") from e - # 3) dict -> validate via HadamardConfig + # 3) dict if isinstance(hadamard_config, dict): + block_size_explicitly_set = "block_size" in hadamard_config + + cfg_dict = dict(hadamard_config) + cfg_dict = _apply_scheme_block_size(cfg_dict, block_size_explicitly_set) + try: - cfg_dict = HadamardConfig.model_validate(hadamard_config).model_dump() - cfg_dict = _apply_scheme_block_size(cfg_dict) return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as e: raise ValueError(f"Invalid hadamard_config dict: {e}") from e @@ -212,18 +223,20 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: return {} if key == "default": - cfg_dict = HadamardConfig().model_dump() - cfg_dict = _apply_scheme_block_size(cfg_dict) + cfg_dict = {} + cfg_dict = _apply_scheme_block_size(cfg_dict, block_size_explicitly_set=False) try: return HadamardConfig.model_validate(cfg_dict).model_dump() except Exception as e: raise ValueError(f"Invalid default hadamard_config after scheme adjustment: {e}") from e if key not in HADAMARDS: - raise ValueError(f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}.") + raise ValueError( + f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}." + ) cfg_dict = {"hadamard_type": key} - cfg_dict = _apply_scheme_block_size(cfg_dict) + cfg_dict = _apply_scheme_block_size(cfg_dict, block_size_explicitly_set=False) try: return HadamardConfig.model_validate(cfg_dict).model_dump() @@ -231,7 +244,8 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any]) -> dict[str, Any]: raise ValueError(f"hadamard_config built from string {key!r} is invalid for HadamardConfig: {e}") from e raise TypeError( - "hadamard_config must be one of: None, dict, HadamardConfig, or str " f"(got {type(hadamard_config).__name__})" + "hadamard_config must be one of: None, dict, HadamardConfig, or str " + f"(got {type(hadamard_config).__name__})" ) From 506c595f0468f7e386764e4bb99d3d0244b400bf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:36:41 +0000 Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/experimental/utils.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/auto_round/experimental/utils.py b/auto_round/experimental/utils.py index 11759af22..b32132c7a 100644 --- a/auto_round/experimental/utils.py +++ b/auto_round/experimental/utils.py @@ -137,9 +137,7 @@ def is_triton_kernel_available(data_type: str) -> bool: return True -def normalize_hadamard_config( - hadamard_config: str | dict | HadamardConfig | None, scheme: str -) -> dict[str, Any]: +def normalize_hadamard_config(hadamard_config: str | dict | HadamardConfig | None, scheme: str) -> dict[str, Any]: """ Normalize and validate `hadamard_config`. @@ -231,9 +229,7 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any], block_size_explicitly_set raise ValueError(f"Invalid default hadamard_config after scheme adjustment: {e}") from e if key not in HADAMARDS: - raise ValueError( - f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}." - ) + raise ValueError(f"Invalid hadamard_config string: {key!r}. Expected one of {sorted(HADAMARDS.keys())}.") cfg_dict = {"hadamard_type": key} cfg_dict = _apply_scheme_block_size(cfg_dict, block_size_explicitly_set=False) @@ -244,8 +240,7 @@ def _apply_scheme_block_size(cfg_dict: dict[str, Any], block_size_explicitly_set raise ValueError(f"hadamard_config built from string {key!r} is invalid for HadamardConfig: {e}") from e raise TypeError( - "hadamard_config must be one of: None, dict, HadamardConfig, or str " - f"(got {type(hadamard_config).__name__})" + "hadamard_config must be one of: None, dict, HadamardConfig, or str " f"(got {type(hadamard_config).__name__})" )