From 2bc369717eb113538fb61aa9a7e1d2f5d4f2a79c Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 30 Mar 2026 08:40:28 +0000
Subject: [PATCH 01/10] fix nextstep loading issue

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/utils/common.py | 1 +
 auto_round/utils/model.py  | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index a81dcefbd..22091d788 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -494,6 +494,7 @@ def __getitem__(self, key):
     "patch_merger",
     "pre_mm_projector_norm",
     "vision",
+    "image",
 ]
 
 
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index f0aec180a..5d86b18b7 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -557,6 +557,9 @@ def mllm_load_model(
                 cls = getattr(base_lib, architectures)
             else:
                 cls = AutoModelForCausalLM
+            # A special case for NextStep
+            if model_type == "nextstep":
+                cls = AutoModel
             try:
                 model_load_kwargs = {}
                 if model_subfolder is not None:

From 2bb744ac7a8c22468d449b8c446ca6819e80c56d Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 30 Mar 2026 13:34:14 +0000
Subject: [PATCH 02/10] support 6.0.0 gptqmodel

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round_extension/cuda/gptqmodel_marlin.py | 70 +++++++++++++++----
 1 file changed, 56 insertions(+), 14 deletions(-)

diff --git a/auto_round_extension/cuda/gptqmodel_marlin.py b/auto_round_extension/cuda/gptqmodel_marlin.py
index 16949cb0a..761589c3f 100644
--- a/auto_round_extension/cuda/gptqmodel_marlin.py
+++ b/auto_round_extension/cuda/gptqmodel_marlin.py
@@ -30,6 +30,9 @@ def get_marlin_layer():  ##use an ugly wrapper to  import gptqmodel on demand
     NEW_VERSION = False
     if Version(gptqmodel.__version__) >= Version("5.0.0"):
         NEW_VERSION = True
+    NEW_VERSION_6_0 = False
+    if Version(gptqmodel.__version__) >= Version("6.0.0"):
+        NEW_VERSION_6_0 = True
     from gptqmodel.models._const import DEVICE, PLATFORM  # pylint: disable=E0401
     from gptqmodel.nn_modules.qlinear import BaseQuantLinear  # pylint: disable=E0401
     from gptqmodel.utils.backend import BACKEND  # pylint: disable=E0401
@@ -244,20 +247,59 @@ def __init__(
                 # (since we have only one group per output channel)
                 desc_act = False
 
-            super().__init__(
-                bits=bits,
-                group_size=group_size,
-                sym=sym,
-                desc_act=desc_act,
-                in_features=in_features,
-                out_features=out_features,
-                bias=bias,
-                pack_dtype=pack_dtype,
-                backend=kwargs.pop("backend", BACKEND.MARLIN),
-                adapter=None,
-                register_buffers=False,
-                **kwargs,
-            )
+            backend = kwargs.pop("backend", BACKEND.MARLIN)
+            if NEW_VERSION_6_0:
+                # gptqmodel >= 6.0.0: BaseQuantLinear no longer accepts group_size/sym/desc_act/pack_dtype
+                # directly; they must be passed via validate_kwargs. Attributes are also set manually.
+                super().__init__(
+                    bits=bits,
+                    in_features=in_features,
+                    out_features=out_features,
+                    bias=bias,
+                    backend=backend,
+                    adapter=None,
+                    register_buffers=False,
+                    validate_kwargs={
+                        "group_size": group_size,
+                        "desc_act": desc_act,
+                        "sym": sym,
+                        "pack_dtype": pack_dtype,
+                    },
+                    **kwargs,
+                )
+                # Set attributes that intermediate classes (PackedQuantLinear /
+                # GPTQQuantLinear) would have set in the old API.
+                self.pack_dtype = pack_dtype
+                if pack_dtype == torch.int8:
+                    self.pack_dtype_bits = 8
+                elif pack_dtype == torch.int16:
+                    self.pack_dtype_bits = 16
+                elif pack_dtype == torch.int32:
+                    self.pack_dtype_bits = 32
+                elif pack_dtype == torch.int64:
+                    self.pack_dtype_bits = 64
+                else:
+                    raise ValueError(f"Unsupported pack_dtype: {pack_dtype}")
+                self.pack_factor = self.pack_dtype_bits // bits
+                self.group_size = group_size if group_size != -1 else in_features
+                self.requested_group_size = group_size
+                self.desc_act = desc_act
+                self.sym = sym
+            else:
+                super().__init__(
+                    bits=bits,
+                    group_size=group_size,
+                    sym=sym,
+                    desc_act=desc_act,
+                    in_features=in_features,
+                    out_features=out_features,
+                    bias=bias,
+                    pack_dtype=pack_dtype,
+                    backend=backend,
+                    adapter=None,
+                    register_buffers=False,
+                    **kwargs,
+                )
 
             # toggle fp32 mode depending on MARLIN or MARLIN_FP16 backend
             self.fp32 = True if self.backend in [BACKEND.MARLIN, BACKEND.AUTO] else False

From 576e13086c85acae2180564b39d16778c929eaab Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Fri, 3 Apr 2026 09:55:34 +0800
Subject: [PATCH 03/10] Enhance DiffusionCompressor with custom pipeline
 support and improve model loading for NextStep

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../compressors/diffusion/compressor.py       | 100 +++++++++++++-----
 auto_round/utils/common.py                    |   1 -
 auto_round/utils/model.py                     |  77 +++++++++++++-
 3 files changed, 146 insertions(+), 32 deletions(-)

diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index ea7eba39e..11d987728 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -56,6 +56,11 @@ class DiffusionCompressor(BaseCompressor):
                                 The more it is, the more closely it follows the prompt (default is 7.5).
         num_inference_steps (int): The reference number of denoising steps (default is 50).
         generator_seed (int): A sees that controls the initial noise from which an image is generated (default is None).
+        pipeline_fn (callable, optional): Custom callable to run the pipeline during calibration.
+            Signature: ``fn(pipe, prompts, *, guidance_scale, num_inference_steps, generator, **kwargs)``.
+            Use this to support models whose inference API differs from the standard convention
+            (e.g. NextStep). If ``None``, the standard ``pipe(prompts, ...)`` call is used unless
+            the loaded pipeline already exposes an ``_autoround_pipeline_fn`` attribute.
         scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations.
         layer_config (dict): Configuration for weight quantization (default is None).
         dataset: The path or name of the calib dataset.
@@ -102,9 +107,15 @@ def __init__(
         device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
         seed: int = 42,
+        pipeline_fn: callable = None,
         **kwargs,
     ):
         logger.warning("Diffusion model quantization is experimental and is only validated on Flux models.")
+        if dataset == "NeelNanda/pile-10k":
+            dataset = "coco2014"
+            logger.warning(
+                "Dataset 'NeelNanda/pile-10k' is not suitable for diffusion model quantization, use coco2014 dataset instead."
+            )
         model_dtype = kwargs.pop("model_dtype", None)
 
         self.guidance_scale = guidance_scale
@@ -120,6 +131,8 @@ def __init__(
 
         self.model = model
         self.pipe = pipe
+        # Use explicit pipeline_fn; fall back to whatever diffusion_load_model attached to the pipe
+        self.pipeline_fn = pipeline_fn or getattr(pipe, "_autoround_pipeline_fn", None)
 
         all_blocks = get_block_names(model)
         self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)
@@ -278,6 +291,64 @@ def _get_current_num_elm(
         current_input_ids = [input_ids["hidden_states"][i] for i in indices]
         return sum(id.numel() for id in current_input_ids)
 
+    def _run_pipeline(self, prompts: list) -> None:
+        """Execute one full diffusion pipeline forward pass for calibration input capture.
+
+        This drives all transformer blocks so that their intermediate inputs are recorded
+        by the hooks installed during calibration.
+
+        **Extending for custom models** – choose whichever approach is simpler:
+
+        * Pass a ``pipeline_fn`` to the constructor (no subclassing required).  The
+          callable receives ``(pipe, prompts, *, guidance_scale, num_inference_steps,
+          generator, **kwargs)`` and must trigger a full forward pass.
+        * Subclass :class:`DiffusionCompressor` and override this method directly for
+          full control over the inference logic.
+
+        Example – NextStep model::
+
+            def nextstep_fn(pipe, prompts, guidance_scale=7.5,
+                            num_inference_steps=28, generator=None,
+                            hw=(1024, 1024), **kwargs):
+                for prompt in (prompts if isinstance(prompts, list) else [prompts]):
+                    pipe.generate_image(
+                        prompt,
+                        cfg=guidance_scale,
+                        num_sampling_steps=num_inference_steps,
+                        hw=hw,
+                        **kwargs,
+                    )
+
+            compressor = DiffusionCompressor(
+                model="path/to/nextstep",
+                pipeline_fn=nextstep_fn,
+                pipeline_fn_kwargs={"hw": (512, 512)},
+            )
+
+        Args:
+            prompts (list[str]): Text prompts for the current calibration batch.
+        """
+        generator = (
+            None
+            if self.generator_seed is None
+            else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed)
+        )
+        if self.pipeline_fn is not None:
+            self.pipeline_fn(
+                self.pipe,
+                prompts,
+                guidance_scale=self.guidance_scale,
+                num_inference_steps=self.num_inference_steps,
+                generator=generator,
+            )
+        else:
+            self.pipe(
+                prompts,
+                guidance_scale=self.guidance_scale,
+                num_inference_steps=self.num_inference_steps,
+                generator=generator,
+            )
+
     def calib(self, nsamples, bs):
         """Perform calibration for quantization.
 
@@ -308,40 +379,13 @@ def calib(self, nsamples, bs):
         total_cnt = 0
 
         total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader))
-        if self.pipe.dtype != self.model.dtype:
-            self.pipe.to(self.model.dtype)
-
-        if (
-            hasattr(self.model, "hf_device_map")
-            and len(self.model.hf_device_map) > 0
-            and self.pipe.device != self.model.device
-            and torch.device(self.model.device).type in ["cuda", "xpu"]
-        ):
-            logger.error(
-                "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. "
-                "Please use model path for quantization or "
-                "move the pipeline object to GPU/XPU before passing them into API."
-            )
-            exit(-1)
 
-        if self.pipe.device != self.model.device:
-            self.pipe.to(self.model.device)
-        self.pipe.to(self.model.dtype)
         with tqdm(range(1, total + 1), desc="cache block inputs") as pbar:
             for ids, prompts in self.dataloader:
                 if isinstance(prompts, tuple):
                     prompts = list(prompts)
                 try:
-                    self.pipe(
-                        prompt=prompts,
-                        guidance_scale=self.guidance_scale,
-                        num_inference_steps=self.num_inference_steps,
-                        generator=(
-                            None
-                            if self.generator_seed is None
-                            else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed)
-                        ),
-                    )
+                    self._run_pipeline(prompts)
                 except NotImplementedError:
                     pass
                 except Exception as error:
diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index 22091d788..a81dcefbd 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -494,7 +494,6 @@ def __getitem__(self, key):
     "patch_merger",
     "pre_mm_projector_norm",
     "vision",
-    "image",
 ]
 
 
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index 5d86b18b7..5878d3df3 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -557,9 +557,6 @@ def mllm_load_model(
                 cls = getattr(base_lib, architectures)
             else:
                 cls = AutoModelForCausalLM
-            # A special case for NextStep
-            if model_type == "nextstep":
-                cls = AutoModel
             try:
                 model_load_kwargs = {}
                 if model_subfolder is not None:
@@ -669,6 +666,46 @@ def diffusion_load_model(
     if device_str is not None and "hpu" in device_str:
         torch_dtype = torch.bfloat16
 
+    try:
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+    except:
+        config = None
+
+    model_type = getattr(config, "model_type", "")
+    # A special case for NextStep
+    if model_type == "nextstep":
+        from models.gen_pipeline import NextStepPipeline  # pylint: disable=E0401
+        from transformers import AutoModel, AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True
+        )
+        model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True)
+        # The model is loaded onto the device because more than one block requires input data.
+        pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16)
+
+        def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs):
+            """Default pipeline_fn for NextStep models.
+
+            Maps standard :class:`DiffusionCompressor` parameters to NextStep's
+            ``generate_image`` API.  Pass a custom ``pipeline_fn`` to
+            :class:`DiffusionCompressor` to override defaults or supply
+            model-specific kwargs (e.g. ``hw``, ``positive_prompt``,
+            ``cfg_schedule``, ``timesteps_shift``).
+            """
+            for prompt in (prompts if isinstance(prompts, list) else [prompts]):
+                pipe.generate_image(
+                    prompt,
+                    cfg=guidance_scale,
+                    num_sampling_steps=num_inference_steps,
+                    **kwargs,
+                )
+
+        pipe._autoround_pipeline_fn = _nextstep_pipeline_fn
+        return pipe, pipe.model
+
     pipelines = LazyImport("diffusers.pipelines")
     if isinstance(pretrained_model_name_or_path, str):
         if torch_dtype == "auto":
@@ -732,6 +769,25 @@ def model_save_pretrained(model, save_directory, **kwargs):
 
     # non-meta model uses model.save_pretrained for model and config saving
     setattr(model, "save_pretrained", partial(model_save_pretrained, model))
+
+    if pipe.dtype != model.dtype:
+        pipe.to(model.dtype)
+    if pipe.device != model.device:
+        pipe.to(model.device)
+
+    if (
+        hasattr(model, "hf_device_map")
+        and len(model.hf_device_map) > 0
+        and pipe.device != model.device
+        and torch.device(model.device).type in ["cuda", "xpu"]
+    ):
+        logger.error(
+            "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. "
+            "Please use model path for quantization or "
+            "move the pipeline object to GPU/XPU before passing them into API."
+        )
+        exit(-1)
+
     return pipe, model.to(device)
 
 
@@ -797,6 +853,21 @@ def is_gguf_model(model_path: Union[str, torch.nn.Module]) -> bool:
 def is_diffusion_model(model_or_path: Union[str, object]) -> bool:
     from auto_round.utils.common import LazyImport
 
+    # First check if it's a known diffusion pipeline by config/model_type to avoid unnecessary imports and file checks for non-diffusion models, which can be time-consuming.
+    try:
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True)
+        model_type = getattr(config, "model_type", "")
+        # A special case for NextStep
+        if model_type == "nextstep":
+            return True
+    except:
+        logger.warning(
+            f"Failed to load config for {model_or_path}, trying to check model_index.json for diffusion pipeline."
+        )
+
+    # Then check if model_index.json exists for diffusion pipeline, which is a strong signal of being a diffusion pipeline.
     if isinstance(model_or_path, str):
         index_file = None
         if not os.path.isdir(model_or_path):

From bb22b290c17a491a75b987067d22ecd95e429eb9 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 7 Apr 2026 12:23:03 +0000
Subject: [PATCH 04/10] fix bug for nextstep

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/compressors/base.py                |   3 +-
 .../compressors/diffusion/compressor.py       |  50 +++++-
 auto_round/utils/model.py                     | 169 ++++++++++--------
 3 files changed, 141 insertions(+), 81 deletions(-)

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index a8735407e..a4500d556 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -2320,7 +2320,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
                             max_memory=new_max_memory,
                             no_split_module_classes=no_split_modules,
                         )
-                        self.model.tie_weights()
+                        if hasattr(self.model, "tie_weights") and callable(self.model.tie_weights):
+                            self.model.tie_weights()
                         device_map = infer_auto_device_map(
                             self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
                         )
diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index 11d987728..8339f4162 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -35,6 +35,7 @@
     get_block_names,
     merge_block_output_keys,
     wrap_block_forward_positional_to_kwargs,
+    copy_python_files_from_model_cache,
 )
 
 pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils")
@@ -207,7 +208,7 @@ def _get_current_q_output(
         device: str,
         cache_device: str = "cpu",
     ) -> torch.Tensor:
-        output_config = output_configs.get(block.__class__.__name__, [])
+        output_config = output_configs.get(block.__class__.__name__, ["hidden_states"])
         idx = None if "hidden_states" not in output_config else output_config.index("hidden_states")
         current_input_ids, current_input_others = self._sampling_inputs(
             input_ids,
@@ -251,7 +252,7 @@ def _get_block_outputs(
         """
 
         output = defaultdict(list)
-        output_config = output_configs.get(block.__class__.__name__, [])
+        output_config = output_configs.get(block.__class__.__name__, ["hidden_states"])
         if isinstance(input_ids, dict):
             nsamples = len(input_ids["hidden_states"])
         else:
@@ -269,6 +270,8 @@ def _get_block_outputs(
                 tmp_input_ids = hidden_states
 
             tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None)
+            if isinstance(tmp_output, torch.Tensor):
+                tmp_output = [tmp_output]
             assert len(output_config) == len(tmp_output)
             tmp_output = dict(zip(output_config, tmp_output))
 
@@ -379,6 +382,7 @@ def calib(self, nsamples, bs):
         total_cnt = 0
 
         total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader))
+        self._align_device_and_dtype()
 
         with tqdm(range(1, total + 1), desc="cache block inputs") as pbar:
             for ids, prompts in self.dataloader:
@@ -439,6 +443,12 @@ def _get_save_folder_name(self, format: OutputFormat) -> str:
         """
         # Replace special characters to make the folder name filesystem-safe
         sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-")
+        if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep":
+            # Use a subfolder only if there are multiple formats
+            if len(self.formats) > 1:
+                return os.path.join(self.orig_output_dir, sanitized_format)
+
+            return self.orig_output_dir
 
         # Use a subfolder only if there are multiple formats
         if len(self.formats) > 1:
@@ -467,6 +477,16 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
             return super().save_quantized(output_dir, format=format, inplace=inplace, **kwargs)
 
         compressed_model = None
+        if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep":
+            compressed_model = super().save_quantized(
+                output_dir=output_dir,
+                format=format,
+                inplace=inplace,
+                **kwargs,
+            )
+            self.pipe.tokenizer.save_pretrained(output_dir)
+            copy_python_files_from_model_cache(self.model, output_dir, copy_folders=["models", "vae", "utils"])
+            return compressed_model
         for name in self.pipe.components.keys():
             val = getattr(self.pipe, name)
             sub_module_path = (
@@ -487,3 +507,29 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
                 val.save_pretrained(sub_module_path)
         self.pipe.config.save_pretrained(output_dir)
         return compressed_model
+
+    def _align_device_and_dtype(self):
+        if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep":
+            return
+        if (
+            hasattr(self.model, "hf_device_map")
+            and len(self.model.hf_device_map) > 0
+            and type(self.pipe.device) != type(self.model.device)
+            and self.pipe.device != self.model.device
+            and torch.device(self.model.device).type in ["cuda", "xpu"]
+        ):
+            logger.error(
+                "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. "
+                "Please use model path for quantization or "
+                "move the pipeline object to GPU/XPU before passing them into API."
+            )
+            exit(-1)
+
+        if self.pipe.device != self.model.device:
+            self.pipe.to(self.model.device)
+        if self.pipe.dtype != self.model.dtype:
+            self.pipe.to(self.model.dtype)
+
+
+def save_next_step_diffusion():
+    ar.model.save_pretrained("nextstep_diffusion_model")
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index 5878d3df3..b08e537f3 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -676,34 +676,7 @@ def diffusion_load_model(
     model_type = getattr(config, "model_type", "")
     # A special case for NextStep
     if model_type == "nextstep":
-        from models.gen_pipeline import NextStepPipeline  # pylint: disable=E0401
-        from transformers import AutoModel, AutoTokenizer
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True
-        )
-        model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True)
-        # The model is loaded onto the device because more than one block requires input data.
-        pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16)
-
-        def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs):
-            """Default pipeline_fn for NextStep models.
-
-            Maps standard :class:`DiffusionCompressor` parameters to NextStep's
-            ``generate_image`` API.  Pass a custom ``pipeline_fn`` to
-            :class:`DiffusionCompressor` to override defaults or supply
-            model-specific kwargs (e.g. ``hw``, ``positive_prompt``,
-            ``cfg_schedule``, ``timesteps_shift``).
-            """
-            for prompt in (prompts if isinstance(prompts, list) else [prompts]):
-                pipe.generate_image(
-                    prompt,
-                    cfg=guidance_scale,
-                    num_sampling_steps=num_inference_steps,
-                    **kwargs,
-                )
-
-        pipe._autoround_pipeline_fn = _nextstep_pipeline_fn
+        pipe, model = load_next_step_diffusion(pretrained_model_name_or_path, device_str)
         return pipe, pipe.model
 
     pipelines = LazyImport("diffusers.pipelines")
@@ -769,25 +742,6 @@ def model_save_pretrained(model, save_directory, **kwargs):
 
     # non-meta model uses model.save_pretrained for model and config saving
     setattr(model, "save_pretrained", partial(model_save_pretrained, model))
-
-    if pipe.dtype != model.dtype:
-        pipe.to(model.dtype)
-    if pipe.device != model.device:
-        pipe.to(model.device)
-
-    if (
-        hasattr(model, "hf_device_map")
-        and len(model.hf_device_map) > 0
-        and pipe.device != model.device
-        and torch.device(model.device).type in ["cuda", "xpu"]
-    ):
-        logger.error(
-            "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. "
-            "Please use model path for quantization or "
-            "move the pipeline object to GPU/XPU before passing them into API."
-        )
-        exit(-1)
-
     return pipe, model.to(device)
 
 
@@ -1751,43 +1705,71 @@ def _copy_extra_model_files(src_dir: str, dst_dir: str):
 
 # Adapted from https://github.com/vllm-project/llm-compressor/blob/
 # 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144
-def copy_python_files_from_model_cache(model, save_path: str):
+def copy_python_files_from_model_cache(
+    model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False
+):
+    """Copy Python files (and optionally subdirectories) from the model cache to *save_path*.
+
+    Args:
+        model: The model whose ``config._name_or_path`` points to the source cache.
+        save_path (str): Destination directory.
+        copy_folders (bool | list[str] | tuple[str, ...]): Controls which subdirectories
+            are copied from the cache root to *save_path*:
+
+            * ``False`` (default) – no folders are copied.
+            * ``True`` – every subdirectory that does not already exist in *save_path*
+              is copied (e.g. all of ``vae``, ``scheduler``, …).
+            * A list/tuple of folder names (e.g. ``["vae", "scheduler"]``) – only the
+              named subdirectories are copied.
+    """
+    import shutil
+
+    from huggingface_hub import hf_hub_download
+
     config = model.config
-    if hasattr(config, "_name_or_path"):
-        import os
-        import shutil
+    if not hasattr(config, "_name_or_path"):
+        return
 
-        from huggingface_hub import hf_hub_download
+    if version.parse(transformers.__version__) < version.parse("5.0.0"):
+        from transformers.utils import TRANSFORMERS_CACHE
 
-        if version.parse(transformers.__version__) < version.parse("5.0.0"):
-            from transformers.utils import TRANSFORMERS_CACHE
+        cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE)
+    else:
+        from huggingface_hub.constants import HF_HUB_CACHE
+
+        cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE)
+    from transformers.utils import http_user_agent
+
+    cache_path = config._name_or_path
+    if not os.path.exists(cache_path):
+        user_agent = http_user_agent()
+        config_file_path = hf_hub_download(
+            repo_id=cache_path,
+            filename="config.json",
+            cache_dir=cache_dir,
+            force_download=False,
+            user_agent=user_agent,
+        )
+        cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
 
-            cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE)
-        else:
-            from huggingface_hub.constants import HF_HUB_CACHE
-
-            cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE)
-        from transformers.utils import http_user_agent
-
-        cache_path = config._name_or_path
-        if not os.path.exists(cache_path):
-            user_agent = http_user_agent()
-            config_file_path = hf_hub_download(
-                repo_id=cache_path,
-                filename="config.json",
-                cache_dir=cache_dir,
-                force_download=False,
-                user_agent=user_agent,
-            )
-            cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
+    for file in os.listdir(cache_path):
+        full_file_name = os.path.join(cache_path, file)
+        if file.endswith(".py") and os.path.isfile(full_file_name):
+            logger.debug(f"Transferring {full_file_name} to {save_path}")
+            shutil.copy(full_file_name, save_path)
 
-        for file in os.listdir(cache_path):
-            full_file_name = os.path.join(cache_path, file)
-            if file.endswith(".py") and os.path.isfile(full_file_name):
-                logger.debug(f"Transferring {full_file_name} to {save_path}")
-                shutil.copy(full_file_name, save_path)
+    _copy_extra_model_files(cache_path, save_path)
 
-        _copy_extra_model_files(cache_path, save_path)
+    if copy_folders is not False:
+        for entry in os.listdir(cache_path):
+            src_entry = os.path.join(cache_path, entry)
+            dst_entry = os.path.join(save_path, entry)
+            if not os.path.isdir(src_entry):
+                continue
+            if copy_folders is True or entry in copy_folders:
+                if not os.path.exists(dst_entry):
+                    logger.debug(f"Transferring folder {src_entry} to {save_path}")
+                    shutil.copytree(src_entry, dst_entry)
 
 
 def extract_block_names_to_str(quant_block_list):
@@ -1956,3 +1938,34 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs):
         return base_hook(m, hidden_states, *positional_inputs, **kwargs)
 
     return forward
+
+def load_next_step_diffusion(pretrained_model_name_or_path, device_str):
+    from models.gen_pipeline import NextStepPipeline  # pylint: disable=E0401
+    from transformers import AutoModel, AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True
+    )
+    model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True)
+    # The model is loaded onto the device because more than one block requires input data.
+    pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16)
+
+    def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs):
+        """Default pipeline_fn for NextStep models.
+
+        Maps standard :class:`DiffusionCompressor` parameters to NextStep's
+        ``generate_image`` API.  Pass a custom ``pipeline_fn`` to
+        :class:`DiffusionCompressor` to override defaults or supply
+        model-specific kwargs (e.g. ``hw``, ``positive_prompt``,
+        ``cfg_schedule``, ``timesteps_shift``).
+        """
+        for prompt in (prompts if isinstance(prompts, list) else [prompts]):
+            pipe.generate_image(
+                prompt,
+                cfg=guidance_scale,
+                num_sampling_steps=num_inference_steps,
+                **kwargs,
+            )
+
+    pipe._autoround_pipeline_fn = _nextstep_pipeline_fn
+    return pipe, model

From 87c403786694e0f3f1fc495686a0fbe8f34e8e1f Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 7 Apr 2026 13:46:26 +0000
Subject: [PATCH 05/10] set self.num_inference_steps=1 for calibration

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/compressors/diffusion/README.md     | 9 ++++++---
 auto_round/compressors/diffusion/compressor.py | 5 +++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/auto_round/compressors/diffusion/README.md b/auto_round/compressors/diffusion/README.md
index c5b307df5..c0f8eb534 100644
--- a/auto_round/compressors/diffusion/README.md
+++ b/auto_round/compressors/diffusion/README.md
@@ -64,9 +64,12 @@ auto-round \
 
 For diffusion models, currently we only validate quantizaion on the FLUX.1-dev, which involves quantizing the transformer component of the pipeline.
 
-| Model     | calibration dataset |
-|--------------|--------------|
-| black-forest-labs/FLUX.1-dev | COCO2014      |
+| Model         | calibration dataset |  Model Link  |
+|---------------|---------------------|--------------|
+| black-forest-labs/FLUX.1-dev  | COCO2014      | - |
+| Tongyi-MAI/Z-Image            | COCO2014      | - |
+| Tongyi-MAI/Z-Image-Turb       | COCO2014      | - |
+| stepfun-ai/NextStep-1.1       | COCO2014      | - |
 
 
 
diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index 8339f4162..b729f3f8e 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -368,6 +368,9 @@ def calib(self, nsamples, bs):
             "Diffusion model will catch nsamples * num_inference_steps inputs, "
             "you can reduce nsamples or num_inference_steps if OOM or take too much time."
         )
+        raw_num_inference_steps = self.num_inference_steps
+        self.num_inference_steps = 1
+        logger.info(f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}")
         if isinstance(self.dataset, str):
             dataset = self.dataset.replace(" ", "")
             self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader(
@@ -422,6 +425,8 @@ def calib(self, nsamples, bs):
                         self.inputs[k][key] = v[key][:max_len]
 
         # torch.cuda.empty_cache()
+        self.num_inference_steps = raw_num_inference_steps
+        logger.info(f"Restore num_inference_steps to {self.num_inference_steps} after calibration")
 
     def _should_stop_cache_forward(self, name: str) -> bool:
         """Determine whether current forward pass can stop after caching `name`."""

From 1c012007ebc01681ab1cb763d2711217d03514fc Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Wed, 8 Apr 2026 02:46:10 +0000
Subject: [PATCH 06/10] Remove unused function; add CUDA CI for diffusion
 tuning test; revert gptqmodel fix

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../compressors/diffusion/compressor.py       |  4 --
 auto_round_extension/cuda/gptqmodel_marlin.py | 70 ++++---------------
 test/test_cuda/models/test_diffusion.py       |  1 -
 3 files changed, 14 insertions(+), 61 deletions(-)

diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index b729f3f8e..5c2feebe1 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -534,7 +534,3 @@ def _align_device_and_dtype(self):
             self.pipe.to(self.model.device)
         if self.pipe.dtype != self.model.dtype:
             self.pipe.to(self.model.dtype)
-
-
-def save_next_step_diffusion():
-    ar.model.save_pretrained("nextstep_diffusion_model")
diff --git a/auto_round_extension/cuda/gptqmodel_marlin.py b/auto_round_extension/cuda/gptqmodel_marlin.py
index 761589c3f..16949cb0a 100644
--- a/auto_round_extension/cuda/gptqmodel_marlin.py
+++ b/auto_round_extension/cuda/gptqmodel_marlin.py
@@ -30,9 +30,6 @@ def get_marlin_layer():  ##use an ugly wrapper to  import gptqmodel on demand
     NEW_VERSION = False
     if Version(gptqmodel.__version__) >= Version("5.0.0"):
         NEW_VERSION = True
-    NEW_VERSION_6_0 = False
-    if Version(gptqmodel.__version__) >= Version("6.0.0"):
-        NEW_VERSION_6_0 = True
     from gptqmodel.models._const import DEVICE, PLATFORM  # pylint: disable=E0401
     from gptqmodel.nn_modules.qlinear import BaseQuantLinear  # pylint: disable=E0401
     from gptqmodel.utils.backend import BACKEND  # pylint: disable=E0401
@@ -247,59 +244,20 @@ def __init__(
                 # (since we have only one group per output channel)
                 desc_act = False
 
-            backend = kwargs.pop("backend", BACKEND.MARLIN)
-            if NEW_VERSION_6_0:
-                # gptqmodel >= 6.0.0: BaseQuantLinear no longer accepts group_size/sym/desc_act/pack_dtype
-                # directly; they must be passed via validate_kwargs. Attributes are also set manually.
-                super().__init__(
-                    bits=bits,
-                    in_features=in_features,
-                    out_features=out_features,
-                    bias=bias,
-                    backend=backend,
-                    adapter=None,
-                    register_buffers=False,
-                    validate_kwargs={
-                        "group_size": group_size,
-                        "desc_act": desc_act,
-                        "sym": sym,
-                        "pack_dtype": pack_dtype,
-                    },
-                    **kwargs,
-                )
-                # Set attributes that intermediate classes (PackedQuantLinear /
-                # GPTQQuantLinear) would have set in the old API.
-                self.pack_dtype = pack_dtype
-                if pack_dtype == torch.int8:
-                    self.pack_dtype_bits = 8
-                elif pack_dtype == torch.int16:
-                    self.pack_dtype_bits = 16
-                elif pack_dtype == torch.int32:
-                    self.pack_dtype_bits = 32
-                elif pack_dtype == torch.int64:
-                    self.pack_dtype_bits = 64
-                else:
-                    raise ValueError(f"Unsupported pack_dtype: {pack_dtype}")
-                self.pack_factor = self.pack_dtype_bits // bits
-                self.group_size = group_size if group_size != -1 else in_features
-                self.requested_group_size = group_size
-                self.desc_act = desc_act
-                self.sym = sym
-            else:
-                super().__init__(
-                    bits=bits,
-                    group_size=group_size,
-                    sym=sym,
-                    desc_act=desc_act,
-                    in_features=in_features,
-                    out_features=out_features,
-                    bias=bias,
-                    pack_dtype=pack_dtype,
-                    backend=backend,
-                    adapter=None,
-                    register_buffers=False,
-                    **kwargs,
-                )
+            super().__init__(
+                bits=bits,
+                group_size=group_size,
+                sym=sym,
+                desc_act=desc_act,
+                in_features=in_features,
+                out_features=out_features,
+                bias=bias,
+                pack_dtype=pack_dtype,
+                backend=kwargs.pop("backend", BACKEND.MARLIN),
+                adapter=None,
+                register_buffers=False,
+                **kwargs,
+            )
 
             # toggle fp32 mode depending on MARLIN or MARLIN_FP16 backend
             self.fp32 = True if self.backend in [BACKEND.MARLIN, BACKEND.AUTO] else False
diff --git a/test/test_cuda/models/test_diffusion.py b/test/test_cuda/models/test_diffusion.py
index 11ea7da54..a46293f26 100644
--- a/test/test_cuda/models/test_diffusion.py
+++ b/test/test_cuda/models/test_diffusion.py
@@ -49,7 +49,6 @@ def test_diffusion_rtn(self, tiny_flux_model_path):
         # skip model saving since it takes much time
         autoround.quantize()
 
-    @pytest.mark.skip_ci(reason="Tuning will OOM in CI; Only tiny model is suggested")  # skip this test in CI
     @require_optimum
     def test_diffusion_tune(self, tiny_flux_model_path):
         from diffusers import AutoPipelineForText2Image

From 240df28f3a1e646fd5fbff4be7dc46d0b37b972d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 02:44:52 +0000
Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/compressors/diffusion/compressor.py | 6 ++++--
 auto_round/utils/model.py                      | 5 ++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index 5c2feebe1..14685b5d7 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -29,13 +29,13 @@
 from auto_round.utils import (
     LazyImport,
     clear_memory,
+    copy_python_files_from_model_cache,
     diffusion_load_model,
     extract_block_names_to_str,
     find_matching_blocks,
     get_block_names,
     merge_block_output_keys,
     wrap_block_forward_positional_to_kwargs,
-    copy_python_files_from_model_cache,
 )
 
 pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils")
@@ -370,7 +370,9 @@ def calib(self, nsamples, bs):
         )
         raw_num_inference_steps = self.num_inference_steps
         self.num_inference_steps = 1
-        logger.info(f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}")
+        logger.info(
+            f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}"
+        )
         if isinstance(self.dataset, str):
             dataset = self.dataset.replace(" ", "")
             self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader(
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index b08e537f3..7b2c574f1 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -1705,9 +1705,7 @@ def _copy_extra_model_files(src_dir: str, dst_dir: str):
 
 # Adapted from https://github.com/vllm-project/llm-compressor/blob/
 # 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144
-def copy_python_files_from_model_cache(
-    model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False
-):
+def copy_python_files_from_model_cache(model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False):
     """Copy Python files (and optionally subdirectories) from the model cache to *save_path*.
 
     Args:
@@ -1939,6 +1937,7 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs):
 
     return forward
 
+
 def load_next_step_diffusion(pretrained_model_name_or_path, device_str):
     from models.gen_pipeline import NextStepPipeline  # pylint: disable=E0401
     from transformers import AutoModel, AutoTokenizer

From 04bb9100a11207882ac499a40050a93341921b1d Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Wed, 8 Apr 2026 02:52:25 +0000
Subject: [PATCH 08/10] fix bug

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/compressors/diffusion/compressor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index 14685b5d7..6699d6cb2 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -532,7 +532,6 @@ def _align_device_and_dtype(self):
             )
             exit(-1)
 
+        self.pipe.to(self.model.dtype)
         if self.pipe.device != self.model.device:
             self.pipe.to(self.model.device)
-        if self.pipe.dtype != self.model.dtype:
-            self.pipe.to(self.model.dtype)

From d84c2d24d2a9a3f890fdc87fabe71818190c6448 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Wed, 8 Apr 2026 06:44:35 +0000
Subject: [PATCH 09/10] Refactor device dispatching logic; remove unused
 function and update imports

Signed-off-by: Xin He <xin3.he@intel.com>
---
 auto_round/auto_scheme/delta_loss.py          |   2 +-
 auto_round/auto_scheme/utils.py               |  49 +------
 auto_round/compressors/base.py                |  12 +-
 .../compressors/diffusion/compressor.py       |   8 +-
 auto_round/eval/eval_cli.py                   |  14 +-
 auto_round/utils/device.py                    | 123 ++++++++++++++++--
 test/test_cuda/models/test_diffusion.py       |   4 +-
 7 files changed, 134 insertions(+), 78 deletions(-)

diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py
index 66e116e84..ec5320a96 100644
--- a/auto_round/auto_scheme/delta_loss.py
+++ b/auto_round/auto_scheme/delta_loss.py
@@ -27,7 +27,6 @@
     apply_quant_scheme,
     compute_avg_bits_for_scheme,
     compute_layer_bits,
-    dispatch_model_by_all_available_devices,
     parse_shared_layers,
     remove_quant_scheme,
 )
@@ -54,6 +53,7 @@
     set_module,
     set_non_auto_device_map,
     to_device,
+    dispatch_model_by_all_available_devices,
 )
 from auto_round.utils.device import MemoryMonitor
 from auto_round.utils.offload import OffloadManager
diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py
index 51b8da8e1..2b09f1459 100644
--- a/auto_round/auto_scheme/utils.py
+++ b/auto_round/auto_scheme/utils.py
@@ -29,6 +29,7 @@
     is_hpex_available,
     normalize_no_split_modules,
     parse_available_devices,
+    DEVICE_ENVIRON_VARIABLE_MAPPING
 )
 
 
@@ -213,54 +214,6 @@ def compute_layer_bits(
     return total_bits, avg_bits
 
 
-# Important Notice This dispatch does not follow dict device_map, just extract all available devices and use them
-def dispatch_model_by_all_available_devices(
-    model: torch.nn.Module, device_map: Union[str, int, dict, None]
-) -> torch.nn.Module:
-    if device_map is None:
-        device_map = 0
-
-    no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", []))
-    if device_map == "auto":
-        max_memory = get_balanced_memory(
-            model,
-            max_memory=None,
-            no_split_module_classes=no_split_modules,
-        )
-        device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules)
-        model = dispatch_model(model, device_map=device_map)
-        return model
-
-    devices = parse_available_devices(device_map)
-
-    if len(devices) == 1:
-        model.to(devices[0])
-        return model
-
-    max_memory = get_balanced_memory(
-        model,
-        max_memory=None,
-        no_split_module_classes=no_split_modules,
-    )
-
-    # Filter max_memory with devices
-    #  assume only one GPU model
-    new_max_memory = {}
-    for device in devices:
-        if ":" in device:
-            device = int(device.split(":")[-1])
-        elif device == "cpu":
-            device = "cpu"
-        elif isinstance(device, str):
-            device = 0
-        else:
-            raise ValueError(f"Unsupported device {device} in device_map: {device_map}")
-        new_max_memory[device] = max_memory[device]
-    model.tie_weights()
-    device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules)
-    model = dispatch_model(model, device_map=device_map)
-    return model
-
 
 def merge_lists_unionfind(list_of_lists):
     parent = {}
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index a4500d556..633301194 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -1573,7 +1573,11 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
                 block = convert_module_to_hp_if_necessary(block, dtype=self.amp_dtype, device=self.device)
                 update_block_global_scale_if_needed(block, self.data_type, self.group_size)
                 self._register_act_max_hook(block)
-                if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1:
+                if (
+                    is_auto_device_mapping(self.device_map)
+                    and len(self.device_list) > 1
+                    and not getattr(self, "is_diffusion", False)
+                ):
                     set_auto_device_map_for_block_with_tuning(
                         block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device
                     )
@@ -2980,7 +2984,11 @@ def _quantize_block(
         if auto_offload:
             # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights
             # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk
-            if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1:
+            if (
+                is_auto_device_mapping(self.device_map)
+                and len(self.device_list) > 1
+                and not getattr(self, "is_diffusion", False)
+            ):
                 card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning(
                     block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device
                 )
diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index 6699d6cb2..914fe95cc 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -36,6 +36,7 @@
     get_block_names,
     merge_block_output_keys,
     wrap_block_forward_positional_to_kwargs,
+    dispatch_model_by_all_available_devices,
 )
 
 pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils")
@@ -87,6 +88,7 @@ class DiffusionCompressor(BaseCompressor):
     act_dynamic: bool | None
     super_bits: int | None
     super_group_size: int | None
+    is_diffusion: bool = True
 
     def __init__(
         self,
@@ -176,6 +178,7 @@ def __init__(
             to_quant_block_names=to_quant_block_names,
             **kwargs,
         )
+        self._align_device_and_dtype()
 
     def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]:
         # flux transformer model's blocks will update hidden_states and encoder_hidden_states
@@ -387,7 +390,6 @@ def calib(self, nsamples, bs):
         total_cnt = 0
 
         total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader))
-        self._align_device_and_dtype()
 
         with tqdm(range(1, total + 1), desc="cache block inputs") as pbar:
             for ids, prompts in self.dataloader:
@@ -533,5 +535,5 @@ def _align_device_and_dtype(self):
             exit(-1)
 
         self.pipe.to(self.model.dtype)
-        if self.pipe.device != self.model.device:
-            self.pipe.to(self.model.device)
+
+        dispatch_model_by_all_available_devices(self.pipe, self.device_map)
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 76811ec3f..874615ad9 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -22,10 +22,11 @@
 from auto_round.utils import (
     dispatch_model_block_wise,
     get_device_and_parallelism,
-    get_device_str,
+    detect_device,
     get_model_dtype,
     is_diffusion_model,
     set_cuda_visible_devices,
+    DEVICE_ENVIRON_VARIABLE_MAPPING,
 )
 
 
@@ -285,19 +286,14 @@ def eval_with_vllm(args):
         logger.info(f"Overriding VLLM parameters with custom args: {custom_vllm_kwargs}")
         vllm_kwargs.update(custom_vllm_kwargs)
 
-    device = get_device_str()
-    environ_mapping = {
-        "cuda": "CUDA_VISIBLE_DEVICES",
-        "xpu": "ZE_AFFINITY_MASK",
-        "hpu": "HABANA_VISIBLE_MODULES",
-    }
+    device = detect_device()
     if "tensor_parallel_size" not in vllm_kwargs:
         # Parse device_map to determine tensor_parallel_size and set the relevant env var
         # Only accept formats like "0" or "0,1,2". If the environment variable is
         # already set externally, do not overwrite it — but still derive
         # `tensor_parallel_size` from the existing value.
-        assert device in environ_mapping, f"Device {device} not supported for vllm tensor parallelism."
-        environ_name = environ_mapping[device]
+        assert device in DEVICE_ENVIRON_VARIABLE_MAPPING, f"Device {device} not supported for vllm tensor parallelism."
+        environ_name = DEVICE_ENVIRON_VARIABLE_MAPPING[device]
         device_map = args.device_map
         device_ids = [d.strip() for d in str(device_map).split(",") if d.strip().isdigit()]
 
diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py
index 747a4eb2b..e6611ece9 100644
--- a/auto_round/utils/device.py
+++ b/auto_round/utils/device.py
@@ -34,6 +34,13 @@
 from auto_round.logger import logger
 from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module
 
+DEVICE_ENVIRON_VARIABLE_MAPPING = {
+    "cuda": "CUDA_VISIBLE_DEVICES",
+    "xpu": "ZE_AFFINITY_MASK",
+    "hpu": "HABANA_VISIBLE_MODULES",
+}
+
+
 # Note on HPU usage:
 # There are two modes available for enabling auto-round on HPU:
 # 1. Compile Mode
@@ -42,7 +49,6 @@
 #   The compile mode can speed up quantization process but still in experimental stage.
 # 2. Lazy Mode (By default)
 
-
 ################ Check available sys.module to decide behavior #################
 def is_package_available(package_name: str) -> bool:
     """Check if the package exists in the environment without importing.
@@ -1694,18 +1700,6 @@ def log_summary(self, msg: str = "", level: str = "info"):
         return summary
 
 
-def get_device_str():
-    """Get a string representation of the automatically detected device."""
-    if torch.cuda.is_available():
-        return "cuda"
-    elif torch.xpu.is_available():  # pragma: no cover
-        return "xpu"
-    elif is_hpex_available():  # pragma: no cover
-        return "hpu"
-    else:  # pragma: no cover
-        return "cpu"
-
-
 # Global singleton instance
 memory_monitor = MemoryMonitor()
 
@@ -1740,3 +1734,106 @@ def wrapper(*args, **kwargs):
         return wrapper
 
     return decorator
+
+
+# This function is designed for Auto Scheme and Diffusion Pipeline, 
+# which requires dispatching the whole model on all available devices.
+def dispatch_model_by_all_available_devices(
+    model: torch.nn.Module, device_map: Union[str, int, dict, None]
+) -> torch.nn.Module:
+    # Important Notice: This dispatch does not follow dict device_map, just extract all available devices and use them
+    device_type = detect_device()
+    if device_type in DEVICE_ENVIRON_VARIABLE_MAPPING:
+        existing_env = os.environ.get(DEVICE_ENVIRON_VARIABLE_MAPPING[device_type])
+        if existing_env is None:
+            logger.warning_once("`get_balanced_memory` is used here, but no environment variable "
+                + "is set to specify device visibility. This may lead to OOM issue even the memory "
+                + "is large enough.")
+
+    # Handle DiffusionPipeline: dispatch only the main sub-model (transformer / unet)
+    # across devices and move the remaining pipeline components to the primary device.
+    try:
+        from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+
+        if isinstance(model, DiffusionPipeline):
+            pipe = model
+            _device_map = 0 if device_map is None else device_map
+            devices = parse_available_devices(_device_map)
+            # Identify the main quantisable sub-model
+            main_attr = next(
+                (
+                    attr
+                    for attr in ("transformer", "unet")
+                    if isinstance(getattr(pipe, attr, None), torch.nn.Module)
+                ),
+                None,
+            )
+            if main_attr is None or len(devices) == 1:
+                # No identifiable main sub-model, or single target device:
+                # move the entire pipeline to the (first) device.
+                pipe.to(devices[0] if devices else "cuda:0")
+                return pipe
+            # Multi-device path: recursively dispatch the main sub-model,
+            # then move all remaining pipeline components to the primary device.
+            main_model = getattr(pipe, main_attr)
+            dispatched = dispatch_model_by_all_available_devices(main_model, _device_map)
+            setattr(pipe, main_attr, dispatched)
+            primary_device = devices[0]
+            for attr, component in pipe.components.items():
+                if attr == main_attr:
+                    continue
+                if isinstance(component, torch.nn.Module):
+                    try:
+                        component.to(primary_device)
+                    except Exception:
+                        pass
+            return pipe
+    except ImportError:
+        pass
+
+    if device_map is None:
+        device_map = 0
+
+    from auto_round.utils.common import normalize_no_split_modules
+
+    no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", []))
+    if device_map == "auto":
+        max_memory = get_balanced_memory(
+            model,
+            max_memory=None,
+            no_split_module_classes=no_split_modules,
+        )
+        device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules)
+        model = dispatch_model(model, device_map=device_map)
+        return model
+
+    devices = parse_available_devices(device_map)
+
+    if len(devices) == 1:
+        model.to(devices[0])
+        return model
+
+    max_memory = get_balanced_memory(
+        model,
+        max_memory=None,
+        no_split_module_classes=no_split_modules,
+    )
+
+    # Filter max_memory with devices
+    #  assume only one GPU model
+    new_max_memory = {}
+    for device in devices:
+        if ":" in device:
+            device = int(device.split(":")[-1])
+        elif device == "cpu":
+            device = "cpu"
+        elif isinstance(device, str):
+            device = 0
+        else:
+            raise ValueError(f"Unsupported device {device} in device_map: {device_map}")
+        new_max_memory[device] = max_memory[device]
+    if hasattr(model, "tie_weights") and callable(model.tie_weights):
+        model.tie_weights()
+    device_map = infer_auto_device_map(model, max_memory=new_max_memory, no_split_module_classes=no_split_modules)
+    model = dispatch_model(model, device_map=device_map)
+    return model
diff --git a/test/test_cuda/models/test_diffusion.py b/test/test_cuda/models/test_diffusion.py
index a46293f26..f7d265b00 100644
--- a/test/test_cuda/models/test_diffusion.py
+++ b/test/test_cuda/models/test_diffusion.py
@@ -50,7 +50,7 @@ def test_diffusion_rtn(self, tiny_flux_model_path):
         autoround.quantize()
 
     @require_optimum
-    def test_diffusion_tune(self, tiny_flux_model_path):
+    def test_diffusion_tune(self, tiny_flux_model_path, tmp_path):
         from diffusers import AutoPipelineForText2Image
 
         ## load the model
@@ -82,7 +82,7 @@ def test_diffusion_tune(self, tiny_flux_model_path):
             dataset=get_captions_dataset_path(),
         )
         # skip model saving since it takes much time
-        autoround.quantize()
+        autoround.quantize_and_save(tmp_path)
 
     @pytest.mark.skip_ci(reason="Download large model; Time-consuming")
     def test_diffusion_model_checker(self):

From 86328025ad0b9f4ee8ebeb828887ac81f381a2a6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 07:18:54 +0000
Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/auto_scheme/delta_loss.py           |  2 +-
 auto_round/auto_scheme/utils.py                |  3 +--
 auto_round/compressors/diffusion/compressor.py |  2 +-
 auto_round/eval/eval_cli.py                    |  4 ++--
 auto_round/utils/device.py                     | 15 +++++++--------
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py
index ec5320a96..7610e94af 100644
--- a/auto_round/auto_scheme/delta_loss.py
+++ b/auto_round/auto_scheme/delta_loss.py
@@ -44,6 +44,7 @@
     SUPPORTED_LAYER_TYPES,
     check_to_quantized,
     clear_memory,
+    dispatch_model_by_all_available_devices,
     get_block_names,
     get_major_device,
     get_module,
@@ -53,7 +54,6 @@
     set_module,
     set_non_auto_device_map,
     to_device,
-    dispatch_model_by_all_available_devices,
 )
 from auto_round.utils.device import MemoryMonitor
 from auto_round.utils.offload import OffloadManager
diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py
index 2b09f1459..b6b744a5a 100644
--- a/auto_round/auto_scheme/utils.py
+++ b/auto_round/auto_scheme/utils.py
@@ -21,6 +21,7 @@
 
 from auto_round.schemes import QuantizationScheme, preset_name_to_scheme
 from auto_round.utils import (
+    DEVICE_ENVIRON_VARIABLE_MAPPING,
     SUPPORTED_LAYER_TYPES,
     check_to_quantized,
     get_block_names,
@@ -29,7 +30,6 @@
     is_hpex_available,
     normalize_no_split_modules,
     parse_available_devices,
-    DEVICE_ENVIRON_VARIABLE_MAPPING
 )
 
 
@@ -214,7 +214,6 @@ def compute_layer_bits(
     return total_bits, avg_bits
 
 
-
 def merge_lists_unionfind(list_of_lists):
     parent = {}
 
diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index 914fe95cc..c338a1f1e 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -31,12 +31,12 @@
     clear_memory,
     copy_python_files_from_model_cache,
     diffusion_load_model,
+    dispatch_model_by_all_available_devices,
     extract_block_names_to_str,
     find_matching_blocks,
     get_block_names,
     merge_block_output_keys,
     wrap_block_forward_positional_to_kwargs,
-    dispatch_model_by_all_available_devices,
 )
 
 pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils")
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 874615ad9..71d142a74 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -20,13 +20,13 @@
 from transformers.utils.versions import require_version
 
 from auto_round.utils import (
+    DEVICE_ENVIRON_VARIABLE_MAPPING,
+    detect_device,
     dispatch_model_block_wise,
     get_device_and_parallelism,
-    detect_device,
     get_model_dtype,
     is_diffusion_model,
     set_cuda_visible_devices,
-    DEVICE_ENVIRON_VARIABLE_MAPPING,
 )
 
 
diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py
index e6611ece9..30ee33e49 100644
--- a/auto_round/utils/device.py
+++ b/auto_round/utils/device.py
@@ -49,6 +49,7 @@
 #   The compile mode can speed up quantization process but still in experimental stage.
 # 2. Lazy Mode (By default)
 
+
 ################ Check available sys.module to decide behavior #################
 def is_package_available(package_name: str) -> bool:
     """Check if the package exists in the environment without importing.
@@ -1736,7 +1737,7 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
-# This function is designed for Auto Scheme and Diffusion Pipeline, 
+# This function is designed for Auto Scheme and Diffusion Pipeline,
 # which requires dispatching the whole model on all available devices.
 def dispatch_model_by_all_available_devices(
     model: torch.nn.Module, device_map: Union[str, int, dict, None]
@@ -1746,9 +1747,11 @@ def dispatch_model_by_all_available_devices(
     if device_type in DEVICE_ENVIRON_VARIABLE_MAPPING:
         existing_env = os.environ.get(DEVICE_ENVIRON_VARIABLE_MAPPING[device_type])
         if existing_env is None:
-            logger.warning_once("`get_balanced_memory` is used here, but no environment variable "
+            logger.warning_once(
+                "`get_balanced_memory` is used here, but no environment variable "
                 + "is set to specify device visibility. This may lead to OOM issue even the memory "
-                + "is large enough.")
+                + "is large enough."
+            )
 
     # Handle DiffusionPipeline: dispatch only the main sub-model (transformer / unet)
     # across devices and move the remaining pipeline components to the primary device.
@@ -1761,11 +1764,7 @@ def dispatch_model_by_all_available_devices(
             devices = parse_available_devices(_device_map)
             # Identify the main quantisable sub-model
             main_attr = next(
-                (
-                    attr
-                    for attr in ("transformer", "unet")
-                    if isinstance(getattr(pipe, attr, None), torch.nn.Module)
-                ),
+                (attr for attr in ("transformer", "unet") if isinstance(getattr(pipe, attr, None), torch.nn.Module)),
                 None,
             )
             if main_attr is None or len(devices) == 1: