From 31b11356ed88e629fe471cf3541800776eaea62d Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 2 Apr 2026 23:09:17 +0800 Subject: [PATCH 1/6] add mxint4 Signed-off-by: Mengni Wang --- auto_round/__main__.py | 6 +++- auto_round/compressors/utils.py | 6 ++++ auto_round/experimental/qmodules/__init__.py | 2 +- auto_round/experimental/qmodules/mx.py | 29 ++++++++++++++++++- .../export_to_nvfp_mxfp.py | 6 ++-- auto_round/formats.py | 17 ++++++++++- auto_round/inference/backend.py | 23 +++++++++++++++ auto_round/inference/convert_model.py | 1 + auto_round/schemes.py | 13 +++++++++ 9 files changed, 97 insertions(+), 6 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index ab4da0b68..264524b75 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -756,7 +756,11 @@ def tune(args): suffix = f"a{autoround.act_bits}" else: suffix = f"g{autoround.group_size}" - prefix = autoround.data_type.lower().replace("_", "") if "int" not in autoround.data_type else "" + prefix = ( + autoround.data_type.lower().replace("_", "") + if "int" not in autoround.data_type or "mx" in autoround.data_type + else "" + ) export_dir = os.path.join( args.output_dir, model_name.split("/")[-1] + (f"-{prefix}" if prefix else "") + f"-w{autoround.bits}{suffix}", diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 58f64f683..0b9bb1599 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -35,6 +35,7 @@ class BackendDataType(str, Enum): STANDARD_FP = "fp" MX_FP = "mx_fp" NV_FP = "nv_fp" + MX_INT = "mx_int" def is_standard_fp(backend): @@ -47,6 +48,11 @@ def is_mx_fp(backend): return BackendDataType.MX_FP in backend +def is_mx_int(backend): + backend = backend.lower() + return BackendDataType.MX_INT in backend + + def is_nv_fp(backend): backend = backend.lower() return BackendDataType.NV_FP in backend diff --git a/auto_round/experimental/qmodules/__init__.py 
b/auto_round/experimental/qmodules/__init__.py index 3862e0293..0d1973770 100644 --- a/auto_round/experimental/qmodules/__init__.py +++ b/auto_round/experimental/qmodules/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from auto_round.experimental.qmodules.mx import MXFP4QuantLinear, MXFP8QuantLinear, HadamardMXFP4QuantLinear +from auto_round.experimental.qmodules.mx import MXFP4QuantLinear, MXFP8QuantLinear, MXINT4QuantLinear, HadamardMXFP4QuantLinear from auto_round.experimental.qmodules.nvfp4 import NVFP4QuantLinear from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear diff --git a/auto_round/experimental/qmodules/mx.py b/auto_round/experimental/qmodules/mx.py index b5bc3e939..d2b4cf26e 100644 --- a/auto_round/experimental/qmodules/mx.py +++ b/auto_round/experimental/qmodules/mx.py @@ -20,10 +20,11 @@ from auto_round.data_type.utils import get_quant_func from auto_round.experimental.qmodules.base import QModuleBase from auto_round.experimental.qmodules.fp4_utils import unpack_fp4_from_uint8 +from auto_round.experimental.qmodules.int4_utils import unpack_int4_from_uint8 from auto_round.logger import logger from auto_round.schemes import QuantizationScheme -__all__ = ["MXFP4QuantLinear", "MXFP8QuantLinear"] +__all__ = ["MXFP4QuantLinear", "MXFP8QuantLinear", "MXINT4QuantLinear"] SUPPORTED_HIGHER_DTYPE = [torch.bfloat16, torch.float16, torch.float32] E8M0_EXPONENT_BIAS = 127 @@ -196,6 +197,32 @@ def unpack_data(self, packed_data: torch.Tensor) -> torch.Tensor: return unpacked_data +class MXINT4QuantLinear(MXQuantLinearBase): + """ + Quantized linear layer using the MXINT4 quantization scheme. 
+ """ + + def __init__(self, *args, **kwargs): + self.weight_name = "weight_packed" + super().__init__(*args, **kwargs) + + def initialize_weights(self, weight: Optional[torch.Tensor]) -> torch.Tensor: + weight_dtype = torch.uint8 + weight_in_features = self.in_features // 2 + return torch.zeros((self.out_features, weight_in_features), dtype=weight_dtype) if weight is None else weight + + def dequant_weight_online(self) -> torch.Tensor: + if self.pre_dequantized: + return self.weight + dq_weight = self.dequant_mx_tensor(self.weight_packed, self.weight_scale) + return dq_weight + + def unpack_data(self, packed_data: torch.Tensor) -> torch.Tensor: + m, half_n = packed_data.shape + unpacked_data = unpack_int4_from_uint8(packed_data, m, half_n * 2, dtype=self.dtype) + return unpacked_data + + class HadamardMXFP4QuantLinear(MXFP4QuantLinear): """ Quantized linear layer using the MXFP4 quantization scheme. diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index 502c49676..81ffb3e51 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -44,7 +44,8 @@ ) from auto_round.wrapper import WrapperWALayer -from .qlinear_fp import QuantLinear +from .qlinear_fp import QuantLinear as FpQuantLinear +from .qlinear_int import QuantLinear as IntQuantLinear __all__ = [ "pack_layer", @@ -94,7 +95,8 @@ def pack_layer(name, model, backend, device=None): bias = layer.bias is not None ##bias = True ## if using the above, llama3 lambada RTN will be NAN , TODO why? 
- qlayer = QuantLinear( ##pylint: disable=E1123 + linear_func = FpQuantLinear if "fp" in data_type else IntQuantLinear + qlayer = linear_func( ##pylint: disable=E1123 bits, group_size, in_features, diff --git a/auto_round/formats.py b/auto_round/formats.py index 03213cef6..a2a290490 100644 --- a/auto_round/formats.py +++ b/auto_round/formats.py @@ -31,6 +31,7 @@ is_dynamic_afp8, is_dynamic_wint8aint8, is_mx_fp, + is_mx_int, is_nv_fp, is_standard_fp, is_static_wfp8afp8, @@ -69,6 +70,8 @@ class AutoRoundExportFormat(str, Enum): NV_FP4_WITH_STATIC_GS = "nv_fp4_with_static_gs" INT8_W8A8 = "int8_w8a8" FP8_BLOCK = "fp8_block" + MXINT4 = "mxint4" + MX_INT = "mx_int" if TYPE_CHECKING: @@ -1077,6 +1080,7 @@ class AutoRoundFormat(OutputFormat): "FP8_STATIC", "BF16", "FP8_BLOCK", + "MXINT4", ] format_name = "auto_round" @@ -1085,7 +1089,7 @@ def __init__(self, format: str, ar: BaseCompressor): self.backend = None if format == "auto_round": - if ar.sym and "int" in ar.data_type: + if ar.sym and "int" in ar.data_type and "mx" not in ar.data_type: self.backend = AutoGPTQFormat("auto_round:auto_gptq", ar) elif ar.bits == 4 and not ar.sym and "int" in ar.data_type: if ar.layer_config is None: @@ -1098,6 +1102,8 @@ def __init__(self, format: str, ar: BaseCompressor): self.backend = AutoAWQFormat("auto_round:auto_awq", ar) elif is_nv_fp(ar.data_type) or is_mx_fp(ar.data_type): self.backend = AutoRoundFormat(ar.data_type, ar) + elif is_mx_int(ar.data_type): + self.backend = AutoRoundFormat(ar.data_type, ar) elif is_static_wfp8afp8(ar): # static wfp8afp8 self.backend = AutoRoundFormat(AutoRoundExportFormat.FP8_STATIC.value, ar) elif ar.data_type.startswith("fp") and ar.bits == 8 and ar.act_bits >= 16: # woq fp8 @@ -1157,6 +1163,10 @@ def pack_layer(self, layer_name, model, device=None, **kwargs): ]: from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer + pack_func = pack_layer + elif self.output_format in [f"auto_round:{AutoRoundExportFormat.MX_INT.value}"]: 
+ from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer + pack_func = pack_layer elif self.output_format in [ f"auto_round:{AutoRoundExportFormat.FP8.value}", @@ -1205,6 +1215,11 @@ def save_quantized( backend = "auto_round:fp8_static" if serialization_dict.get("act_bits", 16) == 8 else None export_func = save_quantized_as_autoround + elif re.search(f"{AutoRoundExportFormat.MX_INT.value}", backend): + from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import save_quantized_as_fp + + backend = "auto_round:mx_int4" + export_func = save_quantized_as_fp else: from auto_round.export.export_to_autoround.export import save_quantized_as_autoround diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index d98545679..3073a122d 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -114,6 +114,7 @@ class BackendInfo: MX_TENSOR_DATA_TYPES = [ "mx_fp", "mx_fp_rceil", + "mx_int", ] @@ -303,6 +304,26 @@ def fp8_static_scheme_checker( requirements=["auto-round>0.7.0"], ) +# MXINT4 +BackendInfos["auto_round:torch_mxint4"] = BackendInfo( + device=["cuda", "cpu"], + packing_format=["auto_round:mx_int4"], + sym=[True], + compute_dtype=["float32", "float16", "bfloat16"], + data_type=MX_TENSOR_DATA_TYPES, + group_size=[32], + bits=[4], + act_bits=[4], + act_group_size=[32], + act_sym=[True], + act_data_type=MX_TENSOR_DATA_TYPES, + act_dynamic=[True], + priority=0, + checkers=[mxfp_nvfp_feature_checker], + alias=["auto_round", "torch"], + requirements=["auto-round>0.11.0"], +) + # NVFP4 BackendInfos["auto_round:torch_nvfp4"] = BackendInfo( @@ -774,6 +795,8 @@ def dynamic_import_inference_linear(backend, config): return ar_qmodules.WeightFP8ActFP8StaticQuantLinear if "torch_mxfp8" in backend: return ar_qmodules.MXFP8QuantLinear + if "torch_mxint4" in backend: + return ar_qmodules.MXINT4QuantLinear if "torch_mxfp4" in backend: hadamard_config = getattr(config, "hadamard_config", None) if 
hadamard_config is not None and hadamard_config: diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index a5b9096b3..9f2d6fca3 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -447,6 +447,7 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) or AutoRoundExportFormat.MXFP8.value in layer_backend or AutoRoundExportFormat.MXFP4.value in layer_backend or AutoRoundExportFormat.NVFP4.value in layer_backend + or AutoRoundExportFormat.MXINT4.value in layer_backend ): return QuantLinear.from_original(config, layer) diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 5318b1fec..2e0641554 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -228,6 +228,18 @@ def is_preset_scheme(name: str) -> bool: } ) +MXINT4 = QuantizationScheme.from_dict( + { + "bits": 4, + "group_size": 32, + "data_type": "mx_int", + "act_bits": 4, + "act_data_type": "mx_int", + "act_group_size": 32, + "act_sym": True, + "act_dynamic": True, + } +) NVFP4 = QuantizationScheme.from_dict( { @@ -330,6 +342,7 @@ def is_preset_scheme(name: str) -> bool: "W4A16_MIXED": W4A16, "INT8_W8A8": INT8_W8A8, "FP8_BLOCK": FP8_BLOCK, + "MXINT4": MXINT4, } from auto_round.export.export_to_gguf.config import GGUF_CONFIG From 14275101f6e39fb41c782ee15e857309833a6474 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 7 Apr 2026 17:29:58 +0800 Subject: [PATCH 2/6] refine code and add ut Signed-off-by: Mengni Wang --- .../experimental/qmodules/int4_utils.py | 109 +++++++ auto_round/experimental/qmodules/mx.py | 15 + .../export_to_nvfp_mxfp.py | 277 ------------------ .../export/export_to_autoround/qlinear_int.py | 204 +++++++++++++ auto_round/formats.py | 10 +- auto_round/inference/backend.py | 2 +- .../quantization/test_mx_quant_linear.py | 86 ++++++ 7 files changed, 420 insertions(+), 283 deletions(-) create mode 100644 auto_round/experimental/qmodules/int4_utils.py delete mode 
100644 auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py create mode 100644 auto_round/export/export_to_autoround/qlinear_int.py diff --git a/auto_round/experimental/qmodules/int4_utils.py b/auto_round/experimental/qmodules/int4_utils.py new file mode 100644 index 000000000..eeed82e5e --- /dev/null +++ b/auto_round/experimental/qmodules/int4_utils.py @@ -0,0 +1,109 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +import torch + +_DEVICE_E0M4_TENSORS = {} + +# Constants for INT4 values +_E0M4_VALUES = [0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + + +def get_e0m4_tensor(device): + """Get device-specific E0M4 lookup tensor, creating it if needed.""" + device_str = str(device) + if device_str not in _DEVICE_E0M4_TENSORS: + _DEVICE_E0M4_TENSORS[device_str] = torch.tensor(_E0M4_VALUES, dtype=torch.float32, device=device) + return _DEVICE_E0M4_TENSORS[device_str] + + +def unpack_int4_from_uint8( + a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16 +) -> torch.Tensor: + """ + Unpacks uint8 values into int4. Each uint8 contains two int4 values + (low nibble first). The 4-bit indices are mapped to int4 values using kE0M4ToFloat. + """ + if a.device.type == "cuda": + return _unpack_int4_from_uint8_cuda(a, m, n, dtype) + else: + return _unpack_int4_from_uint8_cpu(a, m, n, dtype) + + +@torch.compiler.disable() +def _unpack_int4_from_uint8_cpu( + a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16 +) -> torch.Tensor: + return _unpack_int4_from_uint8(a, m, n, dtype) + + +# @torch.compile(fullgraph=True, dynamic=True) +def _unpack_int4_from_uint8_cuda( + a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16 +) -> torch.Tensor: + return _unpack_int4_from_uint8(a, m, n, dtype) + + +# reference: : https://github.com/vllm-project/vllm/pull/16362 +def _unpack_int4_from_uint8( + a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16 +) -> torch.Tensor: + """ + Unpacks uint8 values into int4. Each uint8 consists of two int4 values + (i.e. first four bits correspond to one int4 value, last four correspond to a + consecutive int4 value). The bits represent an index, which are mapped to an int4 + value. 
+ + :param a: tensor to unpack + :param m: original dim 0 size of the unpacked tensor + :param n: original dim 1 size of the unpacked tensor + :param dtype: dense dtype to cast the unpacked tensor to + """ + assert a.dtype == torch.uint8, f"expected uint8, got {a.dtype}" + + # Vectorized nibble processing + a_flat = a.flatten() + high = (a_flat & 0xF0) >> 4 # Upper nibbles + low = a_flat & 0x0F # Lower nibbles + + # Combine nibbles for batch processing + combined = torch.stack((low, high), dim=1).flatten() + + # Vectorized sign and magnitude extraction + signs = (combined & 0x08).to(torch.bool) # Sign bits + abs_vals = (combined & 0x07).to(torch.long) # Magnitude indices + + # Device-aware lookup and sign application + kE0M4 = get_e0m4_tensor(device=a.device) + values = kE0M4[abs_vals] * torch.where(signs, -1.0, 1.0) + + # Reshape to final form + return values.reshape(m, n).to(dtype=dtype) diff --git a/auto_round/experimental/qmodules/mx.py b/auto_round/experimental/qmodules/mx.py index d2b4cf26e..449e5f348 100644 --- a/auto_round/experimental/qmodules/mx.py +++ b/auto_round/experimental/qmodules/mx.py @@ -222,6 +222,21 @@ def unpack_data(self, packed_data: torch.Tensor) -> torch.Tensor: unpacked_data = unpack_int4_from_uint8(packed_data, m, half_n * 2, dtype=self.dtype) return unpacked_data + @classmethod + def from_original(cls, config: Optional[QuantizationScheme], original_layer: torch.nn.Linear): + """ + Create an `MXQuantLinear` layer from an original linear layer. 
+ """ + logger.warning_once("MXINT quantization is still in experimental stage, the inference speed might be slow.") + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + config=config, + bias=original_layer.bias, + dtype=original_layer.weight.dtype, + ) + return qdq_linear + class HadamardMXFP4QuantLinear(MXFP4QuantLinear): """ diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py deleted file mode 100644 index 81ffb3e51..000000000 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import inspect -import json -import os -from concurrent.futures import ThreadPoolExecutor -from dataclasses import fields -from typing import Callable, Union - -import threadpoolctl as tctl -import torch -import torch.nn as nn -import transformers -from tqdm import tqdm - -from auto_round.compressors.utils import is_mx_fp, is_nv_fp -from auto_round.export.export_to_autoround.utils import check_neq_config -from auto_round.export.utils import filter_quantization_config, release_layer_safely, save_model -from auto_round.logger import logger -from auto_round.schemes import QuantizationScheme -from auto_round.utils import ( - SUPPORTED_LAYER_TYPES, - check_start_with_block_name, - check_to_quantized, - copy_python_files_from_model_cache, - get_module, - get_packing_device, - set_amax_for_all_moe_layers, - set_module, - to_standard_regex, -) -from auto_round.wrapper import WrapperWALayer - -from .qlinear_fp import QuantLinear as FpQuantLinear -from .qlinear_int import QuantLinear as IntQuantLinear - -__all__ = [ - "pack_layer", - "save_quantized_as_fp", -] - - -def pack_layer(name, model, backend, device=None): - layer = get_module(model, name) - if type(layer) not in SUPPORTED_LAYER_TYPES and not isinstance(layer, WrapperWALayer): ##already packed - return - - if isinstance(layer, WrapperWALayer): # revert WrapperWALayer for offline usage - wp_layer = layer - layer = wp_layer.orig_layer - set_module(model, name, layer) - - orig_device = layer.weight.device - data_type = layer.data_type - act_bits = layer.act_bits - act_data_type = layer.act_data_type - bits = layer.bits - if bits > 8: - return - group_size = layer.group_size - sym = layer.sym - - if is_nv_fp(act_data_type) and act_bits <= 8: - input_global_scale = getattr(layer, "input_global_scale", None) - if input_global_scale is None: - assert hasattr(layer, "act_max") - from auto_round.data_type.nvfp import calculate_gparam - - input_global_scale = calculate_gparam(layer.act_max, layer.group_size, 
"cpu") - setattr(layer, "input_global_scale", input_global_scale) - delattr(layer, "act_max") - - if type(layer) == nn.Linear: - in_features = layer.in_features - out_features = layer.out_features - elif type(layer) == nn.Conv2d: - in_features = layer.in_channels - out_features = layer.out_channels - elif type(layer) == transformers.pytorch_utils.Conv1D: - in_features = layer.weight.shape[0] - out_features = layer.weight.shape[1] - - bias = layer.bias is not None - ##bias = True ## if using the above, llama3 lambada RTN will be NAN , TODO why? - linear_func = FpQuantLinear if "fp" in data_type else IntQuantLinear - qlayer = linear_func( ##pylint: disable=E1123 - bits, - group_size, - in_features, - out_features, - bias, - weight_dtype=layer.weight.dtype, - sym=sym, - data_type=data_type, - act_bits=act_bits, - act_data_type=act_data_type, - ) - - qlayer.device = orig_device - scale = layer.scale - global_scale = getattr(layer, "weight_global_scale", None) - input_global_scale = getattr(layer, "input_global_scale", None) - ## no zeros to handle, as mxfp/nvfp do not support asym quantization - # zero = layer.zp - qlayer.pack(layer, scale, global_scale=global_scale, input_global_scale=input_global_scale, device=device) - qlayer.to(orig_device) - set_module(model, name, qlayer) - # Note: release weight and bias explicitly, in case they are referenced elsewhere - release_layer_safely(layer) - - -def save_quantized_as_fp( - output_dir: str, - model: torch.nn.Module = None, - tokenizer: Callable = None, - layer_config: dict = None, - inplace: bool = True, - device: Union[str, torch.device] = "cpu", - backend: str = "autoround:exllamav2", - serialization_dict: dict = None, - **kwargs, -) -> torch.nn.Module: - """ - Saves a quantized model of mxfp/nvfp data_type in the auto-round format. - - Args: - output_dir (str): The directory where the quantized model will be saved. - inplace (bool, optional): If True, modifies the model in place. 
Otherwise, creates a deepcopy of the model. - Default is True. - backend (str, optional): The backend to be used for quantization. - Default is "autoround:exllamav2". - **kwargs: Additional keyword arguments including: - - model (nn.Module): The model to be quantized. - - layer_config (dict): The layer configuration for each layer. - - serialization_dict (dict): The serialization configuration. - - tokenizer (Tokenizer, optional): The tokenizer to be saved. - - Returns: - None - - Raises: - ValueError: If the backend is not supported. - """ - bits = serialization_dict.get("bits", None) - data_type = serialization_dict.get("data_type", None) - act_bits = serialization_dict.get("act_bits", None) - act_data_type = serialization_dict.get("act_data_type", None) - safe_serialization = True if "safe_serialization" not in kwargs.keys() else kwargs["safe_serialization"] - if not inplace: - model = copy.deepcopy(model.to("cpu")) - quantization_config = serialization_dict - quantization_config["block_name_to_quantize"] = quantization_config.pop("to_quant_block_names", None) - quantization_config["quant_method"] = "auto-round" - quantization_config["packing_format"] = backend - - processor = kwargs.get("processor", None) - image_processor = kwargs.get("image_processor", None) - extra_config = {} - - if act_bits <= 8: - # revert WrapperWALayer for offline usage - for n, m in model.named_modules(): - if isinstance(m, WrapperWALayer): - orig_layer = m.orig_layer - set_module(model, n, orig_layer) - - if is_nv_fp(act_data_type) and "static_gs" in str(act_data_type).lower(): - # Ensure all MOE layers have act_max set (needed after deep copy or for uncalibrated layers) - from auto_round.utils.model import is_moe_model, set_amax_for_all_moe_layers - - if is_moe_model(model): - set_amax_for_all_moe_layers(model) - - # generate static input_global_scale - for n, m in model.named_modules(): - if type(m) in SUPPORTED_LAYER_TYPES: - layer = m - if hasattr(layer, "act_bits") and 
layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): - assert hasattr(layer, "act_max") - from auto_round.data_type.nvfp import calculate_gparam - - input_global_scale = calculate_gparam(layer.act_max, layer.group_size, model.device) - setattr(layer, "input_global_scale", input_global_scale) - delattr(layer, "act_max") - # update fused input_global_scale - from auto_round.data_type.utils import update_fused_layer_global_scales - - modules = list(model.modules()) - for module in tqdm(modules, desc="Update input global scale for fuse modules"): - update_fused_layer_global_scales(module, base_name="input") - - block_name_to_quantize = quantization_config["block_name_to_quantize"] - if isinstance(block_name_to_quantize, str): - block_name_to_quantize = block_name_to_quantize.split(",") - elif isinstance(block_name_to_quantize, list): - for i in range(len(block_name_to_quantize)): - block_name_to_quantize[i] = os.path.commonprefix(block_name_to_quantize[i]).rstrip(".") - - scheme_keys = [f.name for f in fields(QuantizationScheme)] - for layer_name, cfg in layer_config.items(): - if not cfg["in_blocks"] and cfg["bits"] <= 8: # lm head - extra_config[layer_name] = {key: cfg.get(key) for key in scheme_keys} - elif cfg["in_blocks"] or ( - block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize) - ): - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) - if len(neq_keys) > 0: - extra_config[layer_name] = {} - for key in neq_keys: - if cfg.get(key, None) is not None: - extra_config[layer_name][key] = cfg.get(key, None) - - regex_config = quantization_config.pop("regex_config") - if regex_config is not None: - for name, cfg in regex_config.items(): - regex_name = to_standard_regex(name) - neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) - if len(neq_keys) > 0: - extra_config[regex_name] = {} - for key in neq_keys: - if cfg.get(key) is not None: 
- extra_config[regex_name][key] = cfg[key] - - if len(extra_config) > 0: - quantization_config["extra_config"] = extra_config - names = list(layer_config.keys()) - max_workers = 1 - if not torch.cuda.is_available() and not torch.xpu.is_available(): - max_workers = 2 ## 2 with cuda packing will cause hang occasionally - with ThreadPoolExecutor(max_workers=max_workers) as executor: - with tqdm(total=len(names), leave=True) as pbar: - - def wrapper(name): - pbar.set_description(f"packing {name}") - with tctl.threadpool_limits(limits=1): - pack_layer(name, model, backend, device) - pbar.update(1) - - for _ in executor.map(wrapper, names): - pass - filter_quantization_config(quantization_config) - - if hasattr(model, "config"): - model.config.quantization_config = quantization_config - if output_dir is None: - return model - - if output_dir is None: - model.tokenizer = tokenizer - return model - if os.path.exists(output_dir): - logger.warning(f"{output_dir} already exists, this may cause model conflict") - if tokenizer is not None: - tokenizer.save_pretrained(output_dir) - - if processor is not None: - processor.save_pretrained(output_dir) - if image_processor is not None: - image_processor.save_pretrained(output_dir) - - dtype = None - save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) - - return model diff --git a/auto_round/export/export_to_autoround/qlinear_int.py b/auto_round/export/export_to_autoround/qlinear_int.py new file mode 100644 index 000000000..c5e156c46 --- /dev/null +++ b/auto_round/export/export_to_autoround/qlinear_int.py @@ -0,0 +1,204 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import torch +import torch.nn as nn +import transformers + +import auto_round.envs as envs +from auto_round.compressors.utils import BackendDataType +from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad +from auto_round.utils import get_packing_device, logger + +# from auto_round.utils import get_weight_compress_dtype +E8M0_EXPONENT_BIAS = 127 +E8M0_EXPONENT_NAN_VAL = 255 + +__all__ = ["QuantLinear"] + +FLOAT_TO_E0M4 = [ + 0.0, + 0.25, + 0.5, + 0.75, + 1.0, + 1.25, + 1.5, + 1.75, +] + + +class QuantLinear(nn.Module): + """ + MXFP quantized linear layer. 
+ """ + + QUANT_TYPE = "MXINT" + + def __init__( + self, bits, group_size, infeatures, outfeatures, bias, trainable=False, data_type="mx_int4", **kwargs + ): + super().__init__() + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + if group_size != 32: + raise NotImplementedError(f"Only group_size 32 are supported for {BackendDataType.MX_INT} data type.") + if infeatures % group_size != 0: + raise NotImplementedError( + f"in_feature must be divisible by {group_size} for {BackendDataType.MX_INT} data type." + ) + self.infeatures = infeatures + self.outfeatures = outfeatures + self.bits = bits + self.data_type = data_type + self.sym = kwargs.get("sym", True) + self.group_size = group_size if group_size != -1 else infeatures + self.maxq = 2**self.bits - 1 + self.act_bits = kwargs.get("act_bits", None) + + weight_name = "weight_packed" + weight_infeatures = infeatures if self.bits == 8 else infeatures // 2 + weight_dtype = torch.uint8 + ## TODO check the dtype of weight_packed and weight_scale + self.register_buffer( + weight_name, + torch.zeros((outfeatures, weight_infeatures), dtype=weight_dtype), + ) + self.register_buffer( + "weight_scale", + torch.zeros( + (outfeatures, math.ceil(infeatures / self.group_size)), + dtype=torch.float16, ## TODO update to correct scale dtype for different bits + ), + ) + if bias: + self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) + else: + self.bias = None + + self.trainable = trainable + + def post_init(self): + pass + + def pack(self, linear, scales, zeros=None, g_idx=None, global_scale=None, input_global_scale=None, device=None): + device = get_packing_device(device) + if getattr(linear, "bias", None) is not None: + self.bias = linear.bias.detach().to(torch.float16) + + W = linear.weight.data.detach().to(device) + if type(linear) == nn.Conv2d: + W = W.flatten(1) + if type(linear) == transformers.pytorch_utils.Conv1D: + W = W.t() + + tensor, orig_shape, pad_len = 
reshape_pad_tensor_by_group_size(W, self.group_size) + scales = scales.to(device) + scaled_tensor = tensor / (2 ** scales.reshape(tensor.shape[0], -1)) + scaled_tensor = revert_tensor_by_pad(scaled_tensor, orig_shape=orig_shape, pad_len=pad_len) + final_scale = (scales + E8M0_EXPONENT_BIAS).clamp(0, E8M0_EXPONENT_NAN_VAL).to(torch.uint8) + + self.weight_scale = final_scale + compress_dtype = torch.uint8 + self.weight_packed = pack_int4_to_uint8(scaled_tensor) + + +def pack_int4_to_uint8(scaled_tensor: torch.Tensor): + if scaled_tensor.device.type == "cuda": + return pack_int4_to_uint8_cuda(scaled_tensor) + else: + return pack_int4_to_uint8_cpu(scaled_tensor) + + +# The torch.compile with dynamic=True is incompatible with multiple threads +# https://github.com/pytorch/pytorch/issues/126024 +@torch.compiler.disable() +def pack_int4_to_uint8_cpu(x: torch.Tensor) -> torch.Tensor: + return _pack_int4_to_uint8(x) + + +# Adapted from https://github.com/neuralmagic/compressed-tensors/pull/400 + + +def _get_packing_fn(): + if envs.AR_ENABLE_COMPILE_PACKING: + logger.warning_once( + "Compiled INT4 to UINT8 packing may be incompatible with multi-threading." + " Disable it by setting AR_ENABLE_COMPILE_PACKING=0" + ) + return torch.compile(fullgraph=True, dynamic=True)(_pack_int4_to_uint8) + else: + return torch.compiler.disable()(_pack_int4_to_uint8) + + +def pack_int4_to_uint8_cuda(x: torch.Tensor) -> torch.Tensor: + """ + Packs a tensor with values in the int4 range into uint8. 
+ + :param x: tensor to pack + returns: a packed tensor in uint8 + """ + pack_fn = _get_packing_fn() + return pack_fn(x) + + +def _pack_int4_to_uint8(x: torch.Tensor) -> torch.Tensor: + + m, n = x.shape + device = x.device + + # Create lookup table for INT4 values to indices + # Map the absolute values to 0-7 indices + kE0M4 = torch.tensor(FLOAT_TO_E0M4, device=device, dtype=x.dtype) + + # Find closest valid INT4 value index for each element + abs_x = torch.abs(x) + abs_diff_x = torch.abs(abs_x.unsqueeze(-1) - kE0M4) # [m, n, 8] + abs_indices = torch.argmin(abs_diff_x, dim=-1) # [m, n] + + # Apply sign bit (bit 3) to get final 4-bit representation + indices = abs_indices + (torch.signbit(x).to(torch.long) << 3) + + # Reshape to prepare for packing pairs of values + indices = indices.reshape(-1) + + # Handle odd length by padding if necessary + if indices.numel() % 2 != 0: + indices = torch.cat([indices, torch.zeros(1, dtype=torch.long, device=device)]) + + # Reshape to pair consecutive elements + indices = indices.reshape(-1, 2) + + # Pack pairs of 4-bit values into 8-bit values + packed = (indices[:, 0] | (indices[:, 1] << 4)).to(torch.uint8) + + return packed.reshape(m, n // 2) diff --git a/auto_round/formats.py b/auto_round/formats.py index a2a290490..a3d908fa5 100644 --- a/auto_round/formats.py +++ b/auto_round/formats.py @@ -1102,7 +1102,7 @@ def __init__(self, format: str, ar: BaseCompressor): self.backend = AutoAWQFormat("auto_round:auto_awq", ar) elif is_nv_fp(ar.data_type) or is_mx_fp(ar.data_type): self.backend = AutoRoundFormat(ar.data_type, ar) - elif is_mx_int(ar.data_type): + elif is_mx_int(ar.data_type) and ar.bits == 4: # only add mx_int4 now self.backend = AutoRoundFormat(ar.data_type, ar) elif is_static_wfp8afp8(ar): # static wfp8afp8 self.backend = AutoRoundFormat(AutoRoundExportFormat.FP8_STATIC.value, ar) @@ -1161,11 +1161,11 @@ def pack_layer(self, layer_name, model, device=None, **kwargs): 
f"auto_round:{AutoRoundExportFormat.MX_FP_RCEIL.value}", f"auto_round:{AutoRoundExportFormat.NV_FP4_WITH_STATIC_GS.value}", ]: - from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer + from auto_round.export.export_to_autoround.export_to_nvfp_mx import pack_layer pack_func = pack_layer elif self.output_format in [f"auto_round:{AutoRoundExportFormat.MX_INT.value}"]: - from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer + from auto_round.export.export_to_autoround.export_to_nvfp_mx import pack_layer pack_func = pack_layer elif self.output_format in [ @@ -1206,7 +1206,7 @@ def save_quantized( ) backend = self.get_backend_name() if re.search(f"{AutoRoundExportFormat.MX_FP.value}|{AutoRoundExportFormat.NV_FP.value}", backend): - from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import save_quantized_as_fp + from auto_round.export.export_to_autoround.export_to_nvfp_mx import save_quantized_as_fp backend = "auto_round:llm_compressor" export_func = save_quantized_as_fp @@ -1216,7 +1216,7 @@ def save_quantized( backend = "auto_round:fp8_static" if serialization_dict.get("act_bits", 16) == 8 else None export_func = save_quantized_as_autoround elif re.search(f"{AutoRoundExportFormat.MX_INT.value}", backend): - from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import save_quantized_as_fp + from auto_round.export.export_to_autoround.export_to_nvfp_mx import save_quantized_as_fp backend = "auto_round:mx_int4" export_func = save_quantized_as_fp diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 3073a122d..e25c4da66 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -321,7 +321,7 @@ def fp8_static_scheme_checker( priority=0, checkers=[mxfp_nvfp_feature_checker], alias=["auto_round", "torch"], - requirements=["auto-round>0.11.0"], + requirements=["auto-round>0.12.0"], ) # NVFP4 diff --git 
a/test/test_cpu/quantization/test_mx_quant_linear.py b/test/test_cpu/quantization/test_mx_quant_linear.py index c2e9a3c00..392d7617f 100644 --- a/test/test_cpu/quantization/test_mx_quant_linear.py +++ b/test/test_cpu/quantization/test_mx_quant_linear.py @@ -4,13 +4,16 @@ from auto_round.data_type.utils import get_quant_func from auto_round.experimental import qmodules as ar_qmodules from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear as _MXFPLinear +from auto_round.export.export_to_autoround.qlinear_int import QuantLinear as _MXINTLinear from auto_round.formats import AutoRoundExportFormat from auto_round.schemes import PRESET_SCHEMES mx_schemes = [AutoRoundExportFormat.MXFP8.value, AutoRoundExportFormat.MXFP4.value] +mx_int_schemes = [AutoRoundExportFormat.MXINT4.value] QMODULE_MAPPING = { AutoRoundExportFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, AutoRoundExportFormat.MXFP4.value: ar_qmodules.MXFP4QuantLinear, + AutoRoundExportFormat.MXINT4.value: ar_qmodules.MXINT4QuantLinear, } @@ -107,3 +110,86 @@ def test_mxquantlinear_from_original_and_forward(scheme): # Assert that the outputs are close within a tolerance assert diff_amax < 5e-1, f"Outputs differ too much for scheme {scheme}!" + + +@pytest.mark.parametrize("scheme", mx_int_schemes) +@torch.inference_mode() +def test_mxint_quantlinear_from_original_and_forward(scheme): + """ + Test MXINT4 quantization schemes by creating quantized layers + from an original torch.nn.Linear layer and validating their forward pass. 
+ """ + # Set random seed for reproducibility + torch.manual_seed(42) + + # Define layer dimensions + in_features = 64 + out_features = 512 + + # Create an original torch.nn.Linear layer + original_layer = torch.nn.Linear(in_features, out_features, bias=False) + + # Select the quantization scheme + config = PRESET_SCHEMES[scheme.upper()] + + # Define weight scale shape + weight_scale_shape = (out_features, in_features // config.group_size) + + # Quantize the weights using the quantization function + qdq_func, _ = get_quant_func(dtype=config.data_type, bits=config.bits, sym=config.sym) + qdq_weight, shared_exp, _ = qdq_func( + tensor=original_layer.weight, + bits=config.bits, + group_size=config.group_size, + data_type=config.data_type + str(config.bits) + ) + shared_exp = shared_exp.reshape(weight_scale_shape) + + # Pack the weights using the QuantLinear class + mxint_lin = _MXINTLinear( + bits=config.bits, + group_size=config.group_size, + infeatures=in_features, + outfeatures=out_features, + bias=original_layer.bias is not None, + data_type=config.data_type, + ) + mxint_lin.pack(linear=original_layer, scales=shared_exp) + + # Create an MXQuantLinear layer from the original layer + QuantLinearClass = QMODULE_MAPPING[scheme] + mxint_layer = QuantLinearClass.from_original( + config=config, + original_layer=original_layer, + ) + + # Copy the packed weights and scales to the quantized layer + packed_weight = mxint_lin.weight_packed + if config.bits == 4: + mxint_layer.weight_packed.data.copy_(packed_weight) + else: + raise ValueError("Only 4-bit quantization are supported.") + mxint_layer.weight_scale.data.copy_(mxint_lin.weight_scale) + + # Validate layer attributes + assert mxint_layer.in_features == original_layer.in_features + assert mxint_layer.out_features == original_layer.out_features + + # Generate a random input tensor + input_tensor = torch.randn((4, in_features), dtype=torch.float32) + + # Perform a forward pass with both layers + original_output = 
original_layer(input_tensor) + mx_output = mxint_layer(input_tensor) + + # Compute the difference between the outputs + diff = mx_output - original_output + # Note: Remove NaN values, as we might get NaN when casting scales to FP8 + diff = diff[~torch.isnan(diff)] + diff_amax = diff.abs().max() + + # Print the maximum difference for debugging + print(f"Scheme: {scheme}, Max Difference: {diff_amax}") + + # Assert that the outputs are close within a tolerance + assert diff_amax < 5e-1, f"Outputs differ too much for scheme {scheme}!" From fa398e7be5b9d1a6e36f44b1216373e726d8063d Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 7 Apr 2026 17:33:10 +0800 Subject: [PATCH 3/6] update doc Signed-off-by: Mengni Wang --- docs/step_by_step.md | 2 +- docs/step_by_step_CN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/step_by_step.md b/docs/step_by_step.md index a076e9acb..e0573adb3 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -157,7 +157,7 @@ adopted within the community, **only 4-bits quantization is supported**. 
Please | Format | Supported Schemes | |:---|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **auto_round** | W4A16, W2A16, W3A16, W8A16, W2A16G64, W2A16G32, `MXFP4`, `MXFP8`, `MXFP4_RCEIL`, `MXFP8_RCEIL`, `NVFP4`, `FPW8A16`, `FP8_STATIC`, `FP8_BLOCK`, `BF16` | +| **auto_round** | W4A16, W2A16, W3A16, W8A16, W2A16G64, W2A16G32, `MXFP4`, `MXFP8`, `MXFP4_RCEIL`, `MXFP8_RCEIL`, `NVFP4`, `FPW8A16`, `FP8_STATIC`, `FP8_BLOCK`, `BF16`, `MXINT4` | | **auto_awq** | W4A16, BF16 | | **auto_gptq** | W4A16, W2A16, W3A16, W8A16,W2A16G64, W2A16G32, BF16 | | **llm_compressor** | NVFP4, `MXFP4`, `MXFP8`, `FPW8A16`, `FP8_STATIC`, FP8_BLOCK | diff --git a/docs/step_by_step_CN.md b/docs/step_by_step_CN.md index b7cd57f64..a85cbd5d4 100644 --- a/docs/step_by_step_CN.md +++ b/docs/step_by_step_CN.md @@ -147,7 +147,7 @@ AutoRound 支持多种量化配置: | 格式 | 支持的量化方案 | |:-------------- |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **auto_round** | W4A16、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、`MXFP4`、`MXFP8`、`MXFP4_RCEIL`、`MXFP8_RCEIL`、`NVFP4`、`FPW8A16`、`FP8_STATIC`、`FP8_BLOCK`、`BF16` | +| **auto_round** | W4A16、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、`MXFP4`、`MXFP8`、`MXFP4_RCEIL`、`MXFP8_RCEIL`、`NVFP4`、`FPW8A16`、`FP8_STATIC`、`FP8_BLOCK`、`BF16`, `MXINT4` | | **auto_awq** | W4A16、BF16 | | **auto_gptq** | W4A16、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、BF16 | | **llm_compressor** | NVFP4、`MXFP4`、`MXFP8`、`FPW8A16`、`FP8_STATIC`、FP8_STATIC | From dec438b5c789b21a764c5f383e847579e261ab4e Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 7 Apr 2026 17:33:42 +0800 Subject: [PATCH 4/6] add file Signed-off-by: Mengni Wang --- .../export_to_autoround/export_to_nvfp_mx.py | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create 
mode 100644 auto_round/export/export_to_autoround/export_to_nvfp_mx.py diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mx.py b/auto_round/export/export_to_autoround/export_to_nvfp_mx.py new file mode 100644 index 000000000..81ffb3e51 --- /dev/null +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mx.py @@ -0,0 +1,277 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import inspect +import json +import os +from concurrent.futures import ThreadPoolExecutor +from dataclasses import fields +from typing import Callable, Union + +import threadpoolctl as tctl +import torch +import torch.nn as nn +import transformers +from tqdm import tqdm + +from auto_round.compressors.utils import is_mx_fp, is_nv_fp +from auto_round.export.export_to_autoround.utils import check_neq_config +from auto_round.export.utils import filter_quantization_config, release_layer_safely, save_model +from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_start_with_block_name, + check_to_quantized, + copy_python_files_from_model_cache, + get_module, + get_packing_device, + set_amax_for_all_moe_layers, + set_module, + to_standard_regex, +) +from auto_round.wrapper import WrapperWALayer + +from .qlinear_fp import QuantLinear as FpQuantLinear +from .qlinear_int import QuantLinear as IntQuantLinear + +__all__ = [ + "pack_layer", + 
"save_quantized_as_fp", +] + + +def pack_layer(name, model, backend, device=None): + layer = get_module(model, name) + if type(layer) not in SUPPORTED_LAYER_TYPES and not isinstance(layer, WrapperWALayer): ##already packed + return + + if isinstance(layer, WrapperWALayer): # revert WrapperWALayer for offline usage + wp_layer = layer + layer = wp_layer.orig_layer + set_module(model, name, layer) + + orig_device = layer.weight.device + data_type = layer.data_type + act_bits = layer.act_bits + act_data_type = layer.act_data_type + bits = layer.bits + if bits > 8: + return + group_size = layer.group_size + sym = layer.sym + + if is_nv_fp(act_data_type) and act_bits <= 8: + input_global_scale = getattr(layer, "input_global_scale", None) + if input_global_scale is None: + assert hasattr(layer, "act_max") + from auto_round.data_type.nvfp import calculate_gparam + + input_global_scale = calculate_gparam(layer.act_max, layer.group_size, "cpu") + setattr(layer, "input_global_scale", input_global_scale) + delattr(layer, "act_max") + + if type(layer) == nn.Linear: + in_features = layer.in_features + out_features = layer.out_features + elif type(layer) == nn.Conv2d: + in_features = layer.in_channels + out_features = layer.out_channels + elif type(layer) == transformers.pytorch_utils.Conv1D: + in_features = layer.weight.shape[0] + out_features = layer.weight.shape[1] + + bias = layer.bias is not None + ##bias = True ## if using the above, llama3 lambada RTN will be NAN , TODO why? 
+    linear_func = FpQuantLinear if "fp" in data_type else IntQuantLinear
+    qlayer = linear_func( ##pylint: disable=E1123
+        bits,
+        group_size,
+        in_features,
+        out_features,
+        bias,
+        weight_dtype=layer.weight.dtype,
+        sym=sym,
+        data_type=data_type,
+        act_bits=act_bits,
+        act_data_type=act_data_type,
+    )
+
+    qlayer.device = orig_device
+    scale = layer.scale
+    global_scale = getattr(layer, "weight_global_scale", None)
+    input_global_scale = getattr(layer, "input_global_scale", None)
+    ## no zeros to handle, as mxfp/nvfp do not support asym quantization
+    # zero = layer.zp
+    qlayer.pack(layer, scale, global_scale=global_scale, input_global_scale=input_global_scale, device=device)
+    qlayer.to(orig_device)
+    set_module(model, name, qlayer)
+    # Note: release weight and bias explicitly, in case they are referenced elsewhere
+    release_layer_safely(layer)
+
+
+def save_quantized_as_fp(
+    output_dir: str,
+    model: torch.nn.Module = None,
+    tokenizer: Callable = None,
+    layer_config: dict = None,
+    inplace: bool = True,
+    device: Union[str, torch.device] = "cpu",
+    backend: str = "autoround:exllamav2",
+    serialization_dict: dict = None,
+    **kwargs,
+) -> torch.nn.Module:
+    """
+    Saves a quantized model of mxfp/nvfp/mx_int data_type in the auto-round format.
+
+    Args:
+        output_dir (str): The directory where the quantized model will be saved.
+        model (nn.Module): The model whose layers are packed and saved.
+        tokenizer (Tokenizer, optional): The tokenizer to be saved alongside the model.
+        layer_config (dict): The quantization configuration for each layer.
+        inplace (bool, optional): If True, modifies the model in place; otherwise a deepcopy is used. Default is True.
+        device (str | torch.device, optional): Device used while packing layers. Default is "cpu".
+        backend (str, optional): The backend to be used for quantization.
+            Default is "autoround:exllamav2".
+        serialization_dict (dict): The serialization configuration.
+        **kwargs: Additional options, e.g. processor, image_processor, safe_serialization.
+
+    Returns:
+        torch.nn.Module: The packed model (also written to ``output_dir`` when it is not None).
+
+    Raises:
+        ValueError: If the backend is not supported.
+ """ + bits = serialization_dict.get("bits", None) + data_type = serialization_dict.get("data_type", None) + act_bits = serialization_dict.get("act_bits", None) + act_data_type = serialization_dict.get("act_data_type", None) + safe_serialization = True if "safe_serialization" not in kwargs.keys() else kwargs["safe_serialization"] + if not inplace: + model = copy.deepcopy(model.to("cpu")) + quantization_config = serialization_dict + quantization_config["block_name_to_quantize"] = quantization_config.pop("to_quant_block_names", None) + quantization_config["quant_method"] = "auto-round" + quantization_config["packing_format"] = backend + + processor = kwargs.get("processor", None) + image_processor = kwargs.get("image_processor", None) + extra_config = {} + + if act_bits <= 8: + # revert WrapperWALayer for offline usage + for n, m in model.named_modules(): + if isinstance(m, WrapperWALayer): + orig_layer = m.orig_layer + set_module(model, n, orig_layer) + + if is_nv_fp(act_data_type) and "static_gs" in str(act_data_type).lower(): + # Ensure all MOE layers have act_max set (needed after deep copy or for uncalibrated layers) + from auto_round.utils.model import is_moe_model, set_amax_for_all_moe_layers + + if is_moe_model(model): + set_amax_for_all_moe_layers(model) + + # generate static input_global_scale + for n, m in model.named_modules(): + if type(m) in SUPPORTED_LAYER_TYPES: + layer = m + if hasattr(layer, "act_bits") and layer.act_bits < 8 and not getattr(layer, "input_global_scale", None): + assert hasattr(layer, "act_max") + from auto_round.data_type.nvfp import calculate_gparam + + input_global_scale = calculate_gparam(layer.act_max, layer.group_size, model.device) + setattr(layer, "input_global_scale", input_global_scale) + delattr(layer, "act_max") + # update fused input_global_scale + from auto_round.data_type.utils import update_fused_layer_global_scales + + modules = list(model.modules()) + for module in tqdm(modules, desc="Update input global scale for 
fuse modules"): + update_fused_layer_global_scales(module, base_name="input") + + block_name_to_quantize = quantization_config["block_name_to_quantize"] + if isinstance(block_name_to_quantize, str): + block_name_to_quantize = block_name_to_quantize.split(",") + elif isinstance(block_name_to_quantize, list): + for i in range(len(block_name_to_quantize)): + block_name_to_quantize[i] = os.path.commonprefix(block_name_to_quantize[i]).rstrip(".") + + scheme_keys = [f.name for f in fields(QuantizationScheme)] + for layer_name, cfg in layer_config.items(): + if not cfg["in_blocks"] and cfg["bits"] <= 8: # lm head + extra_config[layer_name] = {key: cfg.get(key) for key in scheme_keys} + elif cfg["in_blocks"] or ( + block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize) + ): + neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + if len(neq_keys) > 0: + extra_config[layer_name] = {} + for key in neq_keys: + if cfg.get(key, None) is not None: + extra_config[layer_name][key] = cfg.get(key, None) + + regex_config = quantization_config.pop("regex_config") + if regex_config is not None: + for name, cfg in regex_config.items(): + regex_name = to_standard_regex(name) + neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) + if len(neq_keys) > 0: + extra_config[regex_name] = {} + for key in neq_keys: + if cfg.get(key) is not None: + extra_config[regex_name][key] = cfg[key] + + if len(extra_config) > 0: + quantization_config["extra_config"] = extra_config + names = list(layer_config.keys()) + max_workers = 1 + if not torch.cuda.is_available() and not torch.xpu.is_available(): + max_workers = 2 ## 2 with cuda packing will cause hang occasionally + with ThreadPoolExecutor(max_workers=max_workers) as executor: + with tqdm(total=len(names), leave=True) as pbar: + + def wrapper(name): + pbar.set_description(f"packing {name}") + with tctl.threadpool_limits(limits=1): + 
pack_layer(name, model, backend, device)
+                pbar.update(1)
+
+            for _ in executor.map(wrapper, names):
+                pass
+    filter_quantization_config(quantization_config)
+
+    if hasattr(model, "config"):
+        model.config.quantization_config = quantization_config
+    if output_dir is None:
+        # No save directory requested: attach the tokenizer so callers of the
+        # returned in-memory model can still reach it.
+        model.tokenizer = tokenizer
+        return model
+
+    if os.path.exists(output_dir):
+        logger.warning(f"{output_dir} already exists, this may cause model conflict")
+    if tokenizer is not None:
+        tokenizer.save_pretrained(output_dir)
+
+    if processor is not None:
+        processor.save_pretrained(output_dir)
+    if image_processor is not None:
+        image_processor.save_pretrained(output_dir)
+
+    dtype = None
+    save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype)
+
+    return model

From b86dc11099fc5112097c4393092f73b53d565fff Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 7 Apr 2026 09:36:47 +0000
Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/experimental/qmodules/__init__.py         | 7 ++++++-
 auto_round/export/export_to_autoround/qlinear_int.py | 4 +---
 auto_round/formats.py                                | 2 +-
 test/test_cpu/quantization/test_mx_quant_linear.py   | 2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/auto_round/experimental/qmodules/__init__.py b/auto_round/experimental/qmodules/__init__.py
index 0d1973770..df20d4afa 100644
--- a/auto_round/experimental/qmodules/__init__.py
+++ b/auto_round/experimental/qmodules/__init__.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from auto_round.experimental.qmodules.mx import MXFP4QuantLinear, MXFP8QuantLinear, MXINT4QuantLinear, HadamardMXFP4QuantLinear +from auto_round.experimental.qmodules.mx import ( + MXFP4QuantLinear, + MXFP8QuantLinear, + MXINT4QuantLinear, + HadamardMXFP4QuantLinear, +) from auto_round.experimental.qmodules.nvfp4 import NVFP4QuantLinear from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear diff --git a/auto_round/export/export_to_autoround/qlinear_int.py b/auto_round/export/export_to_autoround/qlinear_int.py index c5e156c46..62c730410 100644 --- a/auto_round/export/export_to_autoround/qlinear_int.py +++ b/auto_round/export/export_to_autoround/qlinear_int.py @@ -64,9 +64,7 @@ class QuantLinear(nn.Module): QUANT_TYPE = "MXINT" - def __init__( - self, bits, group_size, infeatures, outfeatures, bias, trainable=False, data_type="mx_int4", **kwargs - ): + def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, data_type="mx_int4", **kwargs): super().__init__() if bits not in [4]: raise NotImplementedError("Only 4 bits are supported.") diff --git a/auto_round/formats.py b/auto_round/formats.py index a3d908fa5..60df0131a 100644 --- a/auto_round/formats.py +++ b/auto_round/formats.py @@ -1102,7 +1102,7 @@ def __init__(self, format: str, ar: BaseCompressor): self.backend = AutoAWQFormat("auto_round:auto_awq", ar) elif is_nv_fp(ar.data_type) or is_mx_fp(ar.data_type): self.backend = AutoRoundFormat(ar.data_type, ar) - elif is_mx_int(ar.data_type) and ar.bits == 4: # only add mx_int4 now + elif is_mx_int(ar.data_type) and ar.bits == 4: # only add mx_int4 now self.backend = AutoRoundFormat(ar.data_type, ar) elif is_static_wfp8afp8(ar): # static wfp8afp8 self.backend = AutoRoundFormat(AutoRoundExportFormat.FP8_STATIC.value, ar) diff --git a/test/test_cpu/quantization/test_mx_quant_linear.py b/test/test_cpu/quantization/test_mx_quant_linear.py index 392d7617f..1ec5cb729 100644 --- 
a/test/test_cpu/quantization/test_mx_quant_linear.py +++ b/test/test_cpu/quantization/test_mx_quant_linear.py @@ -141,7 +141,7 @@ def test_mxint_quantlinear_from_original_and_forward(scheme): tensor=original_layer.weight, bits=config.bits, group_size=config.group_size, - data_type=config.data_type + str(config.bits) + data_type=config.data_type + str(config.bits), ) shared_exp = shared_exp.reshape(weight_scale_shape) From 3372e60dfd74ff4268ba12e5be3ff27be89aa130 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Tue, 7 Apr 2026 21:41:31 +0800 Subject: [PATCH 6/6] fix ut Signed-off-by: Mengni Wang --- test/test_cpu/quantization/test_mxfp_save_load.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_cpu/quantization/test_mxfp_save_load.py b/test/test_cpu/quantization/test_mxfp_save_load.py index 5e12edc68..25e5a2428 100644 --- a/test/test_cpu/quantization/test_mxfp_save_load.py +++ b/test/test_cpu/quantization/test_mxfp_save_load.py @@ -28,11 +28,12 @@ AutoRoundExportFormat.MXFP8.value: ar_schemes.MXFP8, AutoRoundExportFormat.MXFP4.value: ar_schemes.MXFP4, } +MX_TENSOR_DATA_TYPES_FP = [i for i in MX_TENSOR_DATA_TYPES if "int" not in i] @pytest.mark.parametrize("scheme_name", testing_scheme_name_lst) -@pytest.mark.parametrize("weight_data_type", MX_TENSOR_DATA_TYPES) -@pytest.mark.parametrize("act_data_type", MX_TENSOR_DATA_TYPES) +@pytest.mark.parametrize("weight_data_type", MX_TENSOR_DATA_TYPES_FP) +@pytest.mark.parametrize("act_data_type", MX_TENSOR_DATA_TYPES_FP) @torch.inference_mode() def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Use a temporary directory for saving the quantized model