From 4a5957019be254a151b61ba83f4a466b73e5c1b2 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 11:33:56 +0800 Subject: [PATCH 1/8] support compressed-tensors refactor Signed-off-by: Xin He --- auto_round/utils/weight_handler.py | 76 +++++++++++++++++-- .../test_low_precision_input_model.py | 28 ++++--- 2 files changed, 90 insertions(+), 14 deletions(-) diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py index 11aaf452d..757499208 100644 --- a/auto_round/utils/weight_handler.py +++ b/auto_round/utils/weight_handler.py @@ -487,12 +487,25 @@ def detect_layer(self, module: torch.nn.Module) -> bool: return "Float" in compressor_name return True + if hasattr(module, "quantization_scheme"): + from compressed_tensors.quantization.utils import is_module_quantized + + if is_module_quantized(module): + q_scheme = module.quantization_scheme + if ( + q_scheme.weights.num_bits == 8 + and "float" in q_scheme.weights.data_type + and q_scheme.input_activations.num_bits == 8 + and "float" in q_scheme.input_activations.data_type + ): + return True + # Check for FP8Linear layer type if module.__class__.__name__ == "FP8Linear": return True # Fallback: Check for FP8 dtype (for torch.nn.Linear with FP8 weights) - if type(module) == torch.nn.Linear and module.weight is not None: + if type(module) == torch.nn.Linear and getattr(module, "weight", None) is not None: if str(module.weight.dtype).startswith("torch.float8"): return True @@ -506,6 +519,12 @@ def convert_layer( to_cpu: bool = False, ) -> torch.nn.Module: """Convert a single FP8/CompressedLinear layer to a standard Linear layer.""" + if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear": + from compressed_tensors.compressors.base import decompress_module + + decompress_module(layer) + return layer + from auto_round.schemes import QuantizationScheme from auto_round.utils.device import is_gaudi2 @@ -563,10 +582,23 @@ class MXFP4Handler(WeightTypeHandler): def detect_layer(self, module: torch.nn.Module) -> bool: """Check if a module is an MXFP4 CompressedLinear layer.""" if module.__class__.__name__ != "CompressedLinear": - return False - if hasattr(module, "compressor") and module.compressor is not None: - compressor_name = module.compressor.__class__.__name__ - return "MXFP4" in compressor_name + if hasattr(module, "compressor") and module.compressor is not None: + compressor_name = module.compressor.__class__.__name__ + return "MXFP4" in compressor_name + if hasattr(module, "quantization_scheme"): + from compressed_tensors.quantization.utils import is_module_quantized + + if is_module_quantized(module): + q_scheme = module.quantization_scheme + if ( + q_scheme.weights.num_bits == 4 + and q_scheme.weights.type == "float" + and q_scheme.weights.group_size == 32 + and q_scheme.input_activations.num_bits == 4 + and q_scheme.input_activations.type == "float" + and q_scheme.input_activations.group_size == 32 + ): + return True return False def convert_layer( @@ -643,6 +675,20 @@ def detect_layer(self, module: torch.nn.Module) -> bool: if hasattr(module, "compressor") and module.compressor is not None: compressor_name = module.compressor.__class__.__name__ return "MXFP8" in compressor_name + if hasattr(module, "quantization_scheme"): + from compressed_tensors.quantization.utils import is_module_quantized + + if is_module_quantized(module): + q_scheme = module.quantization_scheme + if ( + q_scheme.weights.num_bits == 8 + and q_scheme.weights.type == "float" + and q_scheme.weights.group_size == 32 + and q_scheme.input_activations.num_bits == 8 + and q_scheme.input_activations.type == "float" + and q_scheme.input_activations.group_size == 32 + ): + return True return False def convert_layer( @@ -715,6 +761,20 @@ def detect_layer(self, module: torch.nn.Module) -> bool: if hasattr(module, "compressor") and module.compressor is not None: compressor_name = module.compressor.__class__.__name__ return "NVFP4" in compressor_name + if hasattr(module, "quantization_scheme"): + from compressed_tensors.quantization.utils import is_module_quantized + + if is_module_quantized(module): + q_scheme = module.quantization_scheme + if ( + q_scheme.weights.num_bits == 4 + and q_scheme.weights.type == "float" + and q_scheme.weights.group_size == 16 + and q_scheme.input_activations.num_bits == 4 + and q_scheme.input_activations.type == "float" + and q_scheme.input_activations.group_size == 16 + ): + return True return False def convert_layer( @@ -725,6 +785,12 @@ def convert_layer( to_cpu: bool = False, ) -> torch.nn.Module: """Convert an NVFP4 CompressedLinear layer to a standard Linear layer.""" + if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear": + from compressed_tensors.compressors.base import decompress_module + + decompress_module(layer) + return layer + from auto_round.schemes import QuantizationScheme from auto_round.utils.device import is_gaudi2 diff --git a/test/test_cpu/advanced/test_low_precision_input_model.py b/test/test_cpu/advanced/test_low_precision_input_model.py index ccdf0634f..99c146f4a 100644 --- a/test/test_cpu/advanced/test_low_precision_input_model.py +++ b/test/test_cpu/advanced/test_low_precision_input_model.py @@ -15,41 +15,51 @@ class TestCompressedTensor: mxfp4_model_path = "QuixiAI/Llama-3.2-1B-MXFP4" fp8_block_model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK" - @pytest.mark.skip(reason="CompressedLinear removed in compressed_tensors PR #610, see #1578") def test_fp8_block(self): model = get_tiny_model(get_model_path(self.fp8_block_model_path)) assert ( - type(model.model.layers[0].mlp.up_proj).__name__ == "CompressedLinear" + model.model.layers[0].mlp.up_proj.weight.dtype == torch.float8_e4m3fn + ), "Original weight is not in FP8 format" + assert hasattr( + model.model.layers[0].mlp.up_proj, "quantization_scheme" ), "Model does not contain CompressedLinear layers" detected_types = check_and_mark_quantized_module(model) assert ModuleWeightType.FP8 in detected_types model = convert_module_to_hp_if_necessary(model) assert ( - type(model.model.layers[0].mlp.up_proj) is torch.nn.Linear + model.model.layers[0].mlp.up_proj.weight.dtype == torch.bfloat16 ), "CompressedLinear layer was not converted to Linear" - @pytest.mark.skip(reason="CompressedLinear removed in compressed_tensors PR #610, see #1578") + @pytest.mark.skip( + reason="NVFP4 models are currently not supported due to issues with the compressed_tensors library. See https://github.com/vllm-project/compressed-tensors/issues/642" + ) def test_nvfp4(self): model = get_tiny_model(get_model_path(self.nvfp4_model_path)) assert ( - type(model.model.layers[0].mlp.up_proj).__name__ == "CompressedLinear" + model.model.layers[0].mlp.up_proj.weight_packed.dtype == torch.uint8 + ), "Original weight is not in FP8 format" + assert hasattr( + model.model.layers[0].mlp.up_proj, "quantization_scheme" ), "Model does not contain CompressedLinear layers" detected_types = check_and_mark_quantized_module(model) assert ModuleWeightType.NVFP4 in detected_types model = convert_module_to_hp_if_necessary(model) assert ( - type(model.model.layers[0].mlp.up_proj) is torch.nn.Linear + model.model.layers[0].mlp.up_proj.weight.dtype == torch.bfloat16 ), "CompressedLinear layer was not converted to Linear" - @pytest.mark.skip(reason="CompressedLinear removed in compressed_tensors PR #610, see #1578") def test_mxfp4(self): model = get_tiny_model(get_model_path(self.mxfp4_model_path)) assert ( - type(model.model.layers[0].mlp.up_proj).__name__ == "CompressedLinear" + model.model.layers[0].mlp.up_proj.weight_packed.dtype == torch.uint8 + ), "Original weight is not in FP8 format" + print(model.model.layers[0].mlp.up_proj.quantization_scheme) + assert hasattr( + model.model.layers[0].mlp.up_proj, "quantization_scheme" ), "Model does not contain CompressedLinear layers" detected_types = check_and_mark_quantized_module(model) assert ModuleWeightType.MXFP4 in detected_types model = convert_module_to_hp_if_necessary(model) assert ( - type(model.model.layers[0].mlp.up_proj) is torch.nn.Linear + model.model.layers[0].mlp.up_proj.weight.dtype == torch.bfloat16 ), "CompressedLinear layer was not converted to Linear" From e99111f1d4def9a2b3da6574079e75549b6f7967 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 15:56:52 +0800 Subject: [PATCH 2/8] fix pylint Signed-off-by: Xin He --- auto_round/utils/weight_handler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py index 757499208..bd2e56a7b 100644 --- a/auto_round/utils/weight_handler.py +++ b/auto_round/utils/weight_handler.py @@ -488,7 +488,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: return True if hasattr(module, "quantization_scheme"): - from compressed_tensors.quantization.utils import is_module_quantized + from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 if is_module_quantized(module): q_scheme = module.quantization_scheme @@ -520,7 +520,7 @@ def convert_layer( ) -> torch.nn.Module: """Convert a single FP8/CompressedLinear layer to a standard Linear layer.""" if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear": - from compressed_tensors.compressors.base import decompress_module + from compressed_tensors.compressors.base import decompress_module # pylint: disable=E0401 decompress_module(layer) return layer @@ -586,7 +586,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: compressor_name = module.compressor.__class__.__name__ return "MXFP4" in compressor_name if hasattr(module, "quantization_scheme"): - from compressed_tensors.quantization.utils import is_module_quantized + from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 if is_module_quantized(module): q_scheme = module.quantization_scheme @@ -676,7 +676,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: compressor_name = module.compressor.__class__.__name__ return "MXFP8" in compressor_name if hasattr(module, "quantization_scheme"): - from compressed_tensors.quantization.utils import is_module_quantized + from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 if is_module_quantized(module): q_scheme = module.quantization_scheme @@ -762,7 +762,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: compressor_name = module.compressor.__class__.__name__ return "NVFP4" in compressor_name if hasattr(module, "quantization_scheme"): - from compressed_tensors.quantization.utils import is_module_quantized + from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 if is_module_quantized(module): q_scheme = module.quantization_scheme @@ -786,7 +786,7 @@ def convert_layer( ) -> torch.nn.Module: """Convert an NVFP4 CompressedLinear layer to a standard Linear layer.""" if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear": - from compressed_tensors.compressors.base import decompress_module + from compressed_tensors.compressors.base import decompress_module # pylint: disable=E0401 decompress_module(layer) return layer From b23967fd50bb74723160602756f56a53e1102f68 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 23 Mar 2026 19:49:33 +0800 Subject: [PATCH 3/8] fix bug Signed-off-by: Xin He --- auto_round/utils/weight_handler.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py index bd2e56a7b..ef41d8175 100644 --- a/auto_round/utils/weight_handler.py +++ b/auto_round/utils/weight_handler.py @@ -581,7 +581,7 @@ class MXFP4Handler(WeightTypeHandler): def detect_layer(self, module: torch.nn.Module) -> bool: """Check if a module is an MXFP4 CompressedLinear layer.""" - if module.__class__.__name__ != "CompressedLinear": + if module.__class__.__name__ == "CompressedLinear": if hasattr(module, "compressor") and module.compressor is not None: compressor_name = module.compressor.__class__.__name__ return "MXFP4" in compressor_name @@ -670,11 +670,10 @@ class MXFP8Handler(WeightTypeHandler): def detect_layer(self, module: torch.nn.Module) -> bool: """Check if a module is an MXFP8 CompressedLinear layer.""" - if module.__class__.__name__ != "CompressedLinear": - return False - if hasattr(module, "compressor") and module.compressor is not None: - compressor_name = module.compressor.__class__.__name__ - return "MXFP8" in compressor_name + if module.__class__.__name__ == "CompressedLinear": + if hasattr(module, "compressor") and module.compressor is not None: + compressor_name = module.compressor.__class__.__name__ + return "MXFP8" in compressor_name if hasattr(module, "quantization_scheme"): from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 @@ -756,11 +755,10 @@ class NVFP4Handler(WeightTypeHandler): def detect_layer(self, module: torch.nn.Module) -> bool: """Check if a module is an NVFP4 CompressedLinear layer.""" - if module.__class__.__name__ != "CompressedLinear": - return False - if hasattr(module, "compressor") and module.compressor is not None: - compressor_name = module.compressor.__class__.__name__ - return "NVFP4" in compressor_name + if module.__class__.__name__ == "CompressedLinear": + if hasattr(module, "compressor") and module.compressor is not None: + compressor_name = module.compressor.__class__.__name__ + return "NVFP4" in compressor_name if hasattr(module, "quantization_scheme"): from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 From c9affc2c021ad9c40e616fbbc7bdf39f52d5680f Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 24 Mar 2026 10:04:28 +0800 Subject: [PATCH 4/8] fix typo Signed-off-by: Xin He --- auto_round/utils/weight_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py index ef41d8175..1f21a1014 100644 --- a/auto_round/utils/weight_handler.py +++ b/auto_round/utils/weight_handler.py @@ -494,9 +494,9 @@ def detect_layer(self, module: torch.nn.Module) -> bool: q_scheme = module.quantization_scheme if ( q_scheme.weights.num_bits == 8 - and "float" in q_scheme.weights.data_type + and q_scheme.weights.type == "float" and q_scheme.input_activations.num_bits == 8 - and "float" in q_scheme.input_activations.data_type + and q_scheme.input_activations.type == "float" ): return True From 0c66d675ffc9b68616a99c513990fac885c66ff0 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 24 Mar 2026 11:13:45 +0800 Subject: [PATCH 5/8] compatible for llmc integration Signed-off-by: Xin He --- auto_round/utils/weight_handler.py | 8 ++++---- test/test_cpu/advanced/test_low_precision_input_model.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py index 1f21a1014..92cdb504c 100644 --- a/auto_round/utils/weight_handler.py +++ b/auto_round/utils/weight_handler.py @@ -490,7 +490,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: if hasattr(module, "quantization_scheme"): from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 - if is_module_quantized(module): + if is_module_quantized(module) and module.quantization_status.value == "compressed": q_scheme = module.quantization_scheme if ( q_scheme.weights.num_bits == 8 @@ -588,7 +588,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: if hasattr(module, "quantization_scheme"): from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 - if is_module_quantized(module): + if is_module_quantized(module) and module.quantization_status.value == "compressed": q_scheme = module.quantization_scheme if ( q_scheme.weights.num_bits == 4 @@ -677,7 +677,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: if hasattr(module, "quantization_scheme"): from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 - if is_module_quantized(module): + if is_module_quantized(module) and module.quantization_status.value == "compressed": q_scheme = module.quantization_scheme if ( q_scheme.weights.num_bits == 8 @@ -762,7 +762,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool: if hasattr(module, "quantization_scheme"): from compressed_tensors.quantization.utils import is_module_quantized # pylint: disable=E0401 - if is_module_quantized(module): + if is_module_quantized(module) and module.quantization_status.value == "compressed": q_scheme = module.quantization_scheme if ( q_scheme.weights.num_bits == 4 diff --git a/test/test_cpu/advanced/test_low_precision_input_model.py b/test/test_cpu/advanced/test_low_precision_input_model.py index 99c146f4a..defee4c37 100644 --- a/test/test_cpu/advanced/test_low_precision_input_model.py +++ b/test/test_cpu/advanced/test_low_precision_input_model.py @@ -53,7 +53,6 @@ def test_mxfp4(self): assert ( model.model.layers[0].mlp.up_proj.weight_packed.dtype == torch.uint8 ), "Original weight is not in FP8 format" - print(model.model.layers[0].mlp.up_proj.quantization_scheme) assert hasattr( model.model.layers[0].mlp.up_proj, "quantization_scheme" ), "Model does not contain CompressedLinear layers" From eaa62bdfacf1372377dca532f05d14e5de24479c Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 24 Mar 2026 13:32:29 +0800 Subject: [PATCH 6/8] Update requirements.txt --- test/test_cpu/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index cb1723616..76e405e7f 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -10,5 +10,4 @@ llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main lm_eval >= 0.4.10 # for transformers >= 5.0.0 diffusers protobuf -compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor transformers < 5.0.0 From c0137cfe4e2334059fcee4253b43d33ff74ebfd1 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 24 Mar 2026 15:20:42 +0800 Subject: [PATCH 7/8] Update requirements.txt --- test/test_cpu/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index 76e405e7f..61376cd04 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -10,4 +10,5 @@ llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main lm_eval >= 0.4.10 # for transformers >= 5.0.0 diffusers protobuf +compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor transformers < 5.0.0 From b2ceaa89f4c3c0943b4a0f952752bad230d2b598 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 24 Mar 2026 15:21:02 +0800 Subject: [PATCH 8/8] Update requirements.txt --- test/test_cpu/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index 61376cd04..cb1723616 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -10,5 +10,5 @@ llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main lm_eval >= 0.4.10 # for transformers >= 5.0.0 diffusers protobuf -compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor +compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor transformers < 5.0.0