From 4a5957019be254a151b61ba83f4a466b73e5c1b2 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 11:33:56 +0800
Subject: [PATCH 1/8] support compressed-tensors refactor

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/utils/weight_handler.py            | 76 +++++++++++++++++--
 .../test_low_precision_input_model.py         | 28 ++++---
 2 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py
index 11aaf452d..757499208 100644
--- a/auto_round/utils/weight_handler.py
+++ b/auto_round/utils/weight_handler.py
@@ -487,12 +487,25 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
                 return "Float" in compressor_name
             return True
 
+        if hasattr(module, "quantization_scheme"):
+            from compressed_tensors.quantization.utils import is_module_quantized
+
+            if is_module_quantized(module):
+                q_scheme = module.quantization_scheme
+                if (
+                    q_scheme.weights.num_bits == 8
+                    and "float" in q_scheme.weights.data_type
+                    and q_scheme.input_activations.num_bits == 8
+                    and "float" in q_scheme.input_activations.data_type
+                ):
+                    return True
+
         # Check for FP8Linear layer type
         if module.__class__.__name__ == "FP8Linear":
             return True
 
         # Fallback: Check for FP8 dtype (for torch.nn.Linear with FP8 weights)
-        if type(module) == torch.nn.Linear and module.weight is not None:
+        if type(module) == torch.nn.Linear and getattr(module, "weight", None) is not None:
             if str(module.weight.dtype).startswith("torch.float8"):
                 return True
 
@@ -506,6 +519,12 @@ def convert_layer(
         to_cpu: bool = False,
     ) -> torch.nn.Module:
         """Convert a single FP8/CompressedLinear layer to a standard Linear layer."""
+        if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear":
+            from compressed_tensors.compressors.base import decompress_module
+
+            decompress_module(layer)
+            return layer
+
         from auto_round.schemes import QuantizationScheme
         from auto_round.utils.device import is_gaudi2
 
@@ -563,10 +582,23 @@ class MXFP4Handler(WeightTypeHandler):
     def detect_layer(self, module: torch.nn.Module) -> bool:
         """Check if a module is an MXFP4 CompressedLinear layer."""
         if module.__class__.__name__ != "CompressedLinear":
-            return False
-        if hasattr(module, "compressor") and module.compressor is not None:
-            compressor_name = module.compressor.__class__.__name__
-            return "MXFP4" in compressor_name
+            if hasattr(module, "compressor") and module.compressor is not None:
+                compressor_name = module.compressor.__class__.__name__
+                return "MXFP4" in compressor_name
+        if hasattr(module, "quantization_scheme"):
+            from compressed_tensors.quantization.utils import is_module_quantized
+
+            if is_module_quantized(module):
+                q_scheme = module.quantization_scheme
+                if (
+                    q_scheme.weights.num_bits == 4
+                    and q_scheme.weights.type == "float"
+                    and q_scheme.weights.group_size == 32
+                    and q_scheme.input_activations.num_bits == 4
+                    and q_scheme.input_activations.type == "float"
+                    and q_scheme.input_activations.group_size == 32
+                ):
+                    return True
         return False
 
     def convert_layer(
@@ -643,6 +675,20 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
         if hasattr(module, "compressor") and module.compressor is not None:
             compressor_name = module.compressor.__class__.__name__
             return "MXFP8" in compressor_name
+        if hasattr(module, "quantization_scheme"):
+            from compressed_tensors.quantization.utils import is_module_quantized
+
+            if is_module_quantized(module):
+                q_scheme = module.quantization_scheme
+                if (
+                    q_scheme.weights.num_bits == 8
+                    and q_scheme.weights.type == "float"
+                    and q_scheme.weights.group_size == 32
+                    and q_scheme.input_activations.num_bits == 8
+                    and q_scheme.input_activations.type == "float"
+                    and q_scheme.input_activations.group_size == 32
+                ):
+                    return True
         return False
 
     def convert_layer(
@@ -715,6 +761,20 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
         if hasattr(module, "compressor") and module.compressor is not None:
             compressor_name = module.compressor.__class__.__name__
             return "NVFP4" in compressor_name
+        if hasattr(module, "quantization_scheme"):
+            from compressed_tensors.quantization.utils import is_module_quantized
+
+            if is_module_quantized(module):
+                q_scheme = module.quantization_scheme
+                if (
+                    q_scheme.weights.num_bits == 4
+                    and q_scheme.weights.type == "float"
+                    and q_scheme.weights.group_size == 16
+                    and q_scheme.input_activations.num_bits == 4
+                    and q_scheme.input_activations.type == "float"
+                    and q_scheme.input_activations.group_size == 16
+                ):
+                    return True
         return False
 
     def convert_layer(
@@ -725,6 +785,12 @@ def convert_layer(
         to_cpu: bool = False,
     ) -> torch.nn.Module:
         """Convert an NVFP4 CompressedLinear layer to a standard Linear layer."""
+        if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear":
+            from compressed_tensors.compressors.base import decompress_module
+
+            decompress_module(layer)
+            return layer
+
         from auto_round.schemes import QuantizationScheme
         from auto_round.utils.device import is_gaudi2
 
diff --git a/test/test_cpu/advanced/test_low_precision_input_model.py b/test/test_cpu/advanced/test_low_precision_input_model.py
index ccdf0634f..99c146f4a 100644
--- a/test/test_cpu/advanced/test_low_precision_input_model.py
+++ b/test/test_cpu/advanced/test_low_precision_input_model.py
@@ -15,41 +15,51 @@ class TestCompressedTensor:
     mxfp4_model_path = "QuixiAI/Llama-3.2-1B-MXFP4"
     fp8_block_model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
 
-    @pytest.mark.skip(reason="CompressedLinear removed in compressed_tensors PR #610, see #1578")
     def test_fp8_block(self):
         model = get_tiny_model(get_model_path(self.fp8_block_model_path))
         assert (
-            type(model.model.layers[0].mlp.up_proj).__name__ == "CompressedLinear"
+            model.model.layers[0].mlp.up_proj.weight.dtype == torch.float8_e4m3fn
+        ), "Original weight is not in FP8 format"
+        assert hasattr(
+            model.model.layers[0].mlp.up_proj, "quantization_scheme"
         ), "Model does not contain CompressedLinear layers"
         detected_types = check_and_mark_quantized_module(model)
         assert ModuleWeightType.FP8 in detected_types
         model = convert_module_to_hp_if_necessary(model)
         assert (
-            type(model.model.layers[0].mlp.up_proj) is torch.nn.Linear
+            model.model.layers[0].mlp.up_proj.weight.dtype == torch.bfloat16
         ), "CompressedLinear layer was not converted to Linear"
 
-    @pytest.mark.skip(reason="CompressedLinear removed in compressed_tensors PR #610, see #1578")
+    @pytest.mark.skip(
+        reason="NVFP4 models are currently not supported due to issues with the compressed_tensors library. See https://github.com/vllm-project/compressed-tensors/issues/642"
+    )
     def test_nvfp4(self):
         model = get_tiny_model(get_model_path(self.nvfp4_model_path))
         assert (
-            type(model.model.layers[0].mlp.up_proj).__name__ == "CompressedLinear"
+            model.model.layers[0].mlp.up_proj.weight_packed.dtype == torch.uint8
+        ), "Original weight is not in FP8 format"
+        assert hasattr(
+            model.model.layers[0].mlp.up_proj, "quantization_scheme"
         ), "Model does not contain CompressedLinear layers"
         detected_types = check_and_mark_quantized_module(model)
         assert ModuleWeightType.NVFP4 in detected_types
         model = convert_module_to_hp_if_necessary(model)
         assert (
-            type(model.model.layers[0].mlp.up_proj) is torch.nn.Linear
+            model.model.layers[0].mlp.up_proj.weight.dtype == torch.bfloat16
         ), "CompressedLinear layer was not converted to Linear"
 
-    @pytest.mark.skip(reason="CompressedLinear removed in compressed_tensors PR #610, see #1578")
     def test_mxfp4(self):
         model = get_tiny_model(get_model_path(self.mxfp4_model_path))
         assert (
-            type(model.model.layers[0].mlp.up_proj).__name__ == "CompressedLinear"
+            model.model.layers[0].mlp.up_proj.weight_packed.dtype == torch.uint8
+        ), "Original weight is not in FP8 format"
+        print(model.model.layers[0].mlp.up_proj.quantization_scheme)
+        assert hasattr(
+            model.model.layers[0].mlp.up_proj, "quantization_scheme"
         ), "Model does not contain CompressedLinear layers"
         detected_types = check_and_mark_quantized_module(model)
         assert ModuleWeightType.MXFP4 in detected_types
         model = convert_module_to_hp_if_necessary(model)
         assert (
-            type(model.model.layers[0].mlp.up_proj) is torch.nn.Linear
+            model.model.layers[0].mlp.up_proj.weight.dtype == torch.bfloat16
         ), "CompressedLinear layer was not converted to Linear"

From e99111f1d4def9a2b3da6574079e75549b6f7967 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 15:56:52 +0800
Subject: [PATCH 2/8] suppress pylint E0401 on lazy compressed_tensors imports

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/utils/weight_handler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py
index 757499208..bd2e56a7b 100644
--- a/auto_round/utils/weight_handler.py
+++ b/auto_round/utils/weight_handler.py
@@ -488,7 +488,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
             return True
 
         if hasattr(module, "quantization_scheme"):
-            from compressed_tensors.quantization.utils import is_module_quantized
+            from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
             if is_module_quantized(module):
                 q_scheme = module.quantization_scheme
@@ -520,7 +520,7 @@ def convert_layer(
     ) -> torch.nn.Module:
         """Convert a single FP8/CompressedLinear layer to a standard Linear layer."""
         if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear":
-            from compressed_tensors.compressors.base import decompress_module
+            from compressed_tensors.compressors.base import decompress_module  # pylint: disable=E0401
 
             decompress_module(layer)
             return layer
@@ -586,7 +586,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
                 compressor_name = module.compressor.__class__.__name__
                 return "MXFP4" in compressor_name
         if hasattr(module, "quantization_scheme"):
-            from compressed_tensors.quantization.utils import is_module_quantized
+            from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
             if is_module_quantized(module):
                 q_scheme = module.quantization_scheme
@@ -676,7 +676,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
             compressor_name = module.compressor.__class__.__name__
             return "MXFP8" in compressor_name
         if hasattr(module, "quantization_scheme"):
-            from compressed_tensors.quantization.utils import is_module_quantized
+            from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
             if is_module_quantized(module):
                 q_scheme = module.quantization_scheme
@@ -762,7 +762,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
             compressor_name = module.compressor.__class__.__name__
             return "NVFP4" in compressor_name
         if hasattr(module, "quantization_scheme"):
-            from compressed_tensors.quantization.utils import is_module_quantized
+            from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
             if is_module_quantized(module):
                 q_scheme = module.quantization_scheme
@@ -786,7 +786,7 @@ def convert_layer(
     ) -> torch.nn.Module:
         """Convert an NVFP4 CompressedLinear layer to a standard Linear layer."""
         if hasattr(layer, "quantization_scheme") and layer.__class__.__name__ == "Linear":
-            from compressed_tensors.compressors.base import decompress_module
+            from compressed_tensors.compressors.base import decompress_module  # pylint: disable=E0401
 
             decompress_module(layer)
             return layer

From b23967fd50bb74723160602756f56a53e1102f68 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 23 Mar 2026 19:49:33 +0800
Subject: [PATCH 3/8] fix CompressedLinear check so quantization_scheme detection is reachable

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/utils/weight_handler.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py
index bd2e56a7b..ef41d8175 100644
--- a/auto_round/utils/weight_handler.py
+++ b/auto_round/utils/weight_handler.py
@@ -581,7 +581,7 @@ class MXFP4Handler(WeightTypeHandler):
 
     def detect_layer(self, module: torch.nn.Module) -> bool:
         """Check if a module is an MXFP4 CompressedLinear layer."""
-        if module.__class__.__name__ != "CompressedLinear":
+        if module.__class__.__name__ == "CompressedLinear":
             if hasattr(module, "compressor") and module.compressor is not None:
                 compressor_name = module.compressor.__class__.__name__
                 return "MXFP4" in compressor_name
@@ -670,11 +670,10 @@ class MXFP8Handler(WeightTypeHandler):
 
     def detect_layer(self, module: torch.nn.Module) -> bool:
         """Check if a module is an MXFP8 CompressedLinear layer."""
-        if module.__class__.__name__ != "CompressedLinear":
-            return False
-        if hasattr(module, "compressor") and module.compressor is not None:
-            compressor_name = module.compressor.__class__.__name__
-            return "MXFP8" in compressor_name
+        if module.__class__.__name__ == "CompressedLinear":
+            if hasattr(module, "compressor") and module.compressor is not None:
+                compressor_name = module.compressor.__class__.__name__
+                return "MXFP8" in compressor_name
         if hasattr(module, "quantization_scheme"):
             from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
@@ -756,11 +755,10 @@ class NVFP4Handler(WeightTypeHandler):
 
     def detect_layer(self, module: torch.nn.Module) -> bool:
         """Check if a module is an NVFP4 CompressedLinear layer."""
-        if module.__class__.__name__ != "CompressedLinear":
-            return False
-        if hasattr(module, "compressor") and module.compressor is not None:
-            compressor_name = module.compressor.__class__.__name__
-            return "NVFP4" in compressor_name
+        if module.__class__.__name__ == "CompressedLinear":
+            if hasattr(module, "compressor") and module.compressor is not None:
+                compressor_name = module.compressor.__class__.__name__
+                return "NVFP4" in compressor_name
         if hasattr(module, "quantization_scheme"):
             from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 

From c9affc2c021ad9c40e616fbbc7bdf39f52d5680f Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 24 Mar 2026 10:04:28 +0800
Subject: [PATCH 4/8] use `type` instead of `data_type` in FP8 scheme check

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/utils/weight_handler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py
index ef41d8175..1f21a1014 100644
--- a/auto_round/utils/weight_handler.py
+++ b/auto_round/utils/weight_handler.py
@@ -494,9 +494,9 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
                 q_scheme = module.quantization_scheme
                 if (
                     q_scheme.weights.num_bits == 8
-                    and "float" in q_scheme.weights.data_type
+                    and q_scheme.weights.type == "float"
                     and q_scheme.input_activations.num_bits == 8
-                    and "float" in q_scheme.input_activations.data_type
+                    and q_scheme.input_activations.type == "float"
                 ):
                     return True
 

From 0c66d675ffc9b68616a99c513990fac885c66ff0 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 24 Mar 2026 11:13:45 +0800
Subject: [PATCH 5/8] require compressed quantization status for llmc compatibility

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/utils/weight_handler.py                       | 8 ++++----
 test/test_cpu/advanced/test_low_precision_input_model.py | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/auto_round/utils/weight_handler.py b/auto_round/utils/weight_handler.py
index 1f21a1014..92cdb504c 100644
--- a/auto_round/utils/weight_handler.py
+++ b/auto_round/utils/weight_handler.py
@@ -490,7 +490,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
         if hasattr(module, "quantization_scheme"):
             from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
-            if is_module_quantized(module):
+            if is_module_quantized(module) and module.quantization_status.value == "compressed":
                 q_scheme = module.quantization_scheme
                 if (
                     q_scheme.weights.num_bits == 8
@@ -588,7 +588,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
         if hasattr(module, "quantization_scheme"):
             from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
-            if is_module_quantized(module):
+            if is_module_quantized(module) and module.quantization_status.value == "compressed":
                 q_scheme = module.quantization_scheme
                 if (
                     q_scheme.weights.num_bits == 4
@@ -677,7 +677,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
         if hasattr(module, "quantization_scheme"):
             from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
-            if is_module_quantized(module):
+            if is_module_quantized(module) and module.quantization_status.value == "compressed":
                 q_scheme = module.quantization_scheme
                 if (
                     q_scheme.weights.num_bits == 8
@@ -762,7 +762,7 @@ def detect_layer(self, module: torch.nn.Module) -> bool:
         if hasattr(module, "quantization_scheme"):
             from compressed_tensors.quantization.utils import is_module_quantized  # pylint: disable=E0401
 
-            if is_module_quantized(module):
+            if is_module_quantized(module) and module.quantization_status.value == "compressed":
                 q_scheme = module.quantization_scheme
                 if (
                     q_scheme.weights.num_bits == 4
diff --git a/test/test_cpu/advanced/test_low_precision_input_model.py b/test/test_cpu/advanced/test_low_precision_input_model.py
index 99c146f4a..defee4c37 100644
--- a/test/test_cpu/advanced/test_low_precision_input_model.py
+++ b/test/test_cpu/advanced/test_low_precision_input_model.py
@@ -53,7 +53,6 @@ def test_mxfp4(self):
         assert (
             model.model.layers[0].mlp.up_proj.weight_packed.dtype == torch.uint8
         ), "Original weight is not in FP8 format"
-        print(model.model.layers[0].mlp.up_proj.quantization_scheme)
         assert hasattr(
             model.model.layers[0].mlp.up_proj, "quantization_scheme"
         ), "Model does not contain CompressedLinear layers"

From eaa62bdfacf1372377dca532f05d14e5de24479c Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 24 Mar 2026 13:32:29 +0800
Subject: [PATCH 6/8] remove compressed-tensors pin from test requirements.txt

---
 test/test_cpu/requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt
index cb1723616..76e405e7f 100644
--- a/test/test_cpu/requirements.txt
+++ b/test/test_cpu/requirements.txt
@@ -10,5 +10,4 @@ llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main
 lm_eval >= 0.4.10  # for transformers >= 5.0.0
 diffusers
 protobuf
-compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor
 transformers < 5.0.0

From c0137cfe4e2334059fcee4253b43d33ff74ebfd1 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 24 Mar 2026 15:20:42 +0800
Subject: [PATCH 7/8] restore compressed-tensors pin in test requirements.txt

---
 test/test_cpu/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt
index 76e405e7f..61376cd04 100644
--- a/test/test_cpu/requirements.txt
+++ b/test/test_cpu/requirements.txt
@@ -10,4 +10,5 @@ llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main
 lm_eval >= 0.4.10  # for transformers >= 5.0.0
 diffusers
 protobuf
+compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor 
 transformers < 5.0.0

From b2ceaa89f4c3c0943b4a0f952752bad230d2b598 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 24 Mar 2026 15:21:02 +0800
Subject: [PATCH 8/8] strip trailing whitespace from compressed-tensors pin

---
 test/test_cpu/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt
index 61376cd04..cb1723616 100644
--- a/test/test_cpu/requirements.txt
+++ b/test/test_cpu/requirements.txt
@@ -10,5 +10,5 @@ llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main
 lm_eval >= 0.4.10  # for transformers >= 5.0.0
 diffusers
 protobuf
-compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor 
+compressed-tensors==0.14.1a20260313 # temporary pin for llmcompressor
 transformers < 5.0.0