diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py
index 66e116e84..7610e94af 100644
--- a/auto_round/auto_scheme/delta_loss.py
+++ b/auto_round/auto_scheme/delta_loss.py
@@ -27,7 +27,6 @@
     apply_quant_scheme,
     compute_avg_bits_for_scheme,
     compute_layer_bits,
-    dispatch_model_by_all_available_devices,
     parse_shared_layers,
     remove_quant_scheme,
 )
@@ -45,6 +44,7 @@
     SUPPORTED_LAYER_TYPES,
     check_to_quantized,
     clear_memory,
+    dispatch_model_by_all_available_devices,
     get_block_names,
     get_major_device,
     get_module,
diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py
index 51b8da8e1..b6b744a5a 100644
--- a/auto_round/auto_scheme/utils.py
+++ b/auto_round/auto_scheme/utils.py
@@ -21,6 +21,7 @@
 
 from auto_round.schemes import QuantizationScheme, preset_name_to_scheme
 from auto_round.utils import (
+    DEVICE_ENVIRON_VARIABLE_MAPPING,
     SUPPORTED_LAYER_TYPES,
     check_to_quantized,
     get_block_names,
@@ -213,55 +214,6 @@ def compute_layer_bits(
     return total_bits, avg_bits
 
 
-# Important Notice This dispatch does not follow dict device_map, just extract all available devices and use them
-def dispatch_model_by_all_available_devices(
-    model: torch.nn.Module, device_map: Union[str, int, dict, None]
-) -> torch.nn.Module:
-    if device_map is None:
-        device_map = 0
-
-    no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", []))
-    if device_map == "auto":
-        max_memory = get_balanced_memory(
-            model,
-            max_memory=None,
-            no_split_module_classes=no_split_modules,
-        )
-        device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules)
-        model = dispatch_model(model, device_map=device_map)
-        return model
-
-    devices = parse_available_devices(device_map)
-
-    if len(devices) == 1:
-        model.to(devices[0])
-        return model
-
-    max_memory = get_balanced_memory(
-        model,
-        max_memory=None,
-        no_split_module_classes=no_split_modules,
-    )
-
-    # Filter max_memory with devices
-    #  assume only one GPU model
-    new_max_memory = {}
-    for device in devices:
-        if ":" in device:
-            device = int(device.split(":")[-1])
-        elif device == "cpu":
-            device = "cpu"
-        elif isinstance(device, str):
-            device = 0
-        else:
-            raise ValueError(f"Unsupported device {device} in device_map: {device_map}")
-        new_max_memory[device] = max_memory[device]
-    model.tie_weights()
-    device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules)
-    model = dispatch_model(model, device_map=device_map)
-    return model
-
-
 def merge_lists_unionfind(list_of_lists):
     parent = {}
 
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index a8735407e..633301194 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -1573,7 +1573,11 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
                 block = convert_module_to_hp_if_necessary(block, dtype=self.amp_dtype, device=self.device)
                 update_block_global_scale_if_needed(block, self.data_type, self.group_size)
                 self._register_act_max_hook(block)
-                if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1:
+                if (
+                    is_auto_device_mapping(self.device_map)
+                    and len(self.device_list) > 1
+                    and not getattr(self, "is_diffusion", False)
+                ):
                     set_auto_device_map_for_block_with_tuning(
                         block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device
                     )
@@ -2320,7 +2324,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l
                             max_memory=new_max_memory,
                             no_split_module_classes=no_split_modules,
                         )
-                        self.model.tie_weights()
+                        if hasattr(self.model, "tie_weights") and callable(self.model.tie_weights):
+                            self.model.tie_weights()
                         device_map = infer_auto_device_map(
                             self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules
                         )
@@ -2979,7 +2984,11 @@ def _quantize_block(
         if auto_offload:
             # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights
             # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk
-            if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1:
+            if (
+                is_auto_device_mapping(self.device_map)
+                and len(self.device_list) > 1
+                and not getattr(self, "is_diffusion", False)
+            ):
                 card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning(
                     block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device
                 )
diff --git a/auto_round/compressors/diffusion/README.md b/auto_round/compressors/diffusion/README.md
index c5b307df5..c0f8eb534 100644
--- a/auto_round/compressors/diffusion/README.md
+++ b/auto_round/compressors/diffusion/README.md
@@ -64,9 +64,12 @@ auto-round \
 
 For diffusion models, currently we only validate quantizaion on the FLUX.1-dev, which involves quantizing the transformer component of the pipeline.
 
-| Model     | calibration dataset |
-|--------------|--------------|
-| black-forest-labs/FLUX.1-dev | COCO2014      |
+| Model         | calibration dataset |  Model Link  |
+|---------------|---------------------|--------------|
+| black-forest-labs/FLUX.1-dev  | COCO2014      | - |
+| Tongyi-MAI/Z-Image            | COCO2014      | - |
+| Tongyi-MAI/Z-Image-Turb       | COCO2014      | - |
+| stepfun-ai/NextStep-1.1       | COCO2014      | - |
 
 
 
diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py
index ea7eba39e..c338a1f1e 100644
--- a/auto_round/compressors/diffusion/compressor.py
+++ b/auto_round/compressors/diffusion/compressor.py
@@ -29,7 +29,9 @@
 from auto_round.utils import (
     LazyImport,
     clear_memory,
+    copy_python_files_from_model_cache,
     diffusion_load_model,
+    dispatch_model_by_all_available_devices,
     extract_block_names_to_str,
     find_matching_blocks,
     get_block_names,
@@ -56,6 +58,11 @@ class DiffusionCompressor(BaseCompressor):
                                 The more it is, the more closely it follows the prompt (default is 7.5).
         num_inference_steps (int): The reference number of denoising steps (default is 50).
         generator_seed (int): A sees that controls the initial noise from which an image is generated (default is None).
+        pipeline_fn (callable, optional): Custom callable to run the pipeline during calibration.
+            Signature: ``fn(pipe, prompts, *, guidance_scale, num_inference_steps, generator, **kwargs)``.
+            Use this to support models whose inference API differs from the standard convention
+            (e.g. NextStep). If ``None``, the standard ``pipe(prompts, ...)`` call is used unless
+            the loaded pipeline already exposes an ``_autoround_pipeline_fn`` attribute.
         scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations.
         layer_config (dict): Configuration for weight quantization (default is None).
         dataset: The path or name of the calib dataset.
@@ -81,6 +88,7 @@ class DiffusionCompressor(BaseCompressor):
     act_dynamic: bool | None
     super_bits: int | None
     super_group_size: int | None
+    is_diffusion: bool = True
 
     def __init__(
         self,
@@ -102,9 +110,15 @@ def __init__(
         device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
         seed: int = 42,
+        pipeline_fn: callable = None,
         **kwargs,
     ):
         logger.warning("Diffusion model quantization is experimental and is only validated on Flux models.")
+        if dataset == "NeelNanda/pile-10k":
+            dataset = "coco2014"
+            logger.warning(
+                "Dataset 'NeelNanda/pile-10k' is not suitable for diffusion model quantization, use coco2014 dataset instead."
+            )
         model_dtype = kwargs.pop("model_dtype", None)
 
         self.guidance_scale = guidance_scale
@@ -120,6 +134,8 @@ def __init__(
 
         self.model = model
         self.pipe = pipe
+        # Use explicit pipeline_fn; fall back to whatever diffusion_load_model attached to the pipe
+        self.pipeline_fn = pipeline_fn or getattr(pipe, "_autoround_pipeline_fn", None)
 
         all_blocks = get_block_names(model)
         self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)
@@ -162,6 +178,7 @@ def __init__(
             to_quant_block_names=to_quant_block_names,
             **kwargs,
         )
+        self._align_device_and_dtype()
 
     def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]:
         # flux transformer model's blocks will update hidden_states and encoder_hidden_states
@@ -194,7 +211,7 @@ def _get_current_q_output(
         device: str,
         cache_device: str = "cpu",
     ) -> torch.Tensor:
-        output_config = output_configs.get(block.__class__.__name__, [])
+        output_config = output_configs.get(block.__class__.__name__, ["hidden_states"])
         idx = None if "hidden_states" not in output_config else output_config.index("hidden_states")
         current_input_ids, current_input_others = self._sampling_inputs(
             input_ids,
@@ -238,7 +255,7 @@ def _get_block_outputs(
         """
 
         output = defaultdict(list)
-        output_config = output_configs.get(block.__class__.__name__, [])
+        output_config = output_configs.get(block.__class__.__name__, ["hidden_states"])
         if isinstance(input_ids, dict):
             nsamples = len(input_ids["hidden_states"])
         else:
@@ -256,6 +273,8 @@ def _get_block_outputs(
                 tmp_input_ids = hidden_states
 
             tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None)
+            if isinstance(tmp_output, torch.Tensor):
+                tmp_output = [tmp_output]
             assert len(output_config) == len(tmp_output)
             tmp_output = dict(zip(output_config, tmp_output))
 
@@ -278,6 +297,64 @@ def _get_current_num_elm(
         current_input_ids = [input_ids["hidden_states"][i] for i in indices]
         return sum(id.numel() for id in current_input_ids)
 
+    def _run_pipeline(self, prompts: list) -> None:
+        """Execute one full diffusion pipeline forward pass for calibration input capture.
+
+        This drives all transformer blocks so that their intermediate inputs are recorded
+        by the hooks installed during calibration.
+
+        **Extending for custom models** – choose whichever approach is simpler:
+
+        * Pass a ``pipeline_fn`` to the constructor (no subclassing required).  The
+          callable receives ``(pipe, prompts, *, guidance_scale, num_inference_steps,
+          generator, **kwargs)`` and must trigger a full forward pass.
+        * Subclass :class:`DiffusionCompressor` and override this method directly for
+          full control over the inference logic.
+
+        Example – NextStep model::
+
+            def nextstep_fn(pipe, prompts, guidance_scale=7.5,
+                            num_inference_steps=28, generator=None,
+                            hw=(1024, 1024), **kwargs):
+                for prompt in (prompts if isinstance(prompts, list) else [prompts]):
+                    pipe.generate_image(
+                        prompt,
+                        cfg=guidance_scale,
+                        num_sampling_steps=num_inference_steps,
+                        hw=hw,
+                        **kwargs,
+                    )
+
+            compressor = DiffusionCompressor(
+                model="path/to/nextstep",
+                pipeline_fn=nextstep_fn,
+                pipeline_fn_kwargs={"hw": (512, 512)},
+            )
+
+        Args:
+            prompts (list[str]): Text prompts for the current calibration batch.
+        """
+        generator = (
+            None
+            if self.generator_seed is None
+            else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed)
+        )
+        if self.pipeline_fn is not None:
+            self.pipeline_fn(
+                self.pipe,
+                prompts,
+                guidance_scale=self.guidance_scale,
+                num_inference_steps=self.num_inference_steps,
+                generator=generator,
+            )
+        else:
+            self.pipe(
+                prompts,
+                guidance_scale=self.guidance_scale,
+                num_inference_steps=self.num_inference_steps,
+                generator=generator,
+            )
+
     def calib(self, nsamples, bs):
         """Perform calibration for quantization.
 
@@ -294,6 +371,11 @@ def calib(self, nsamples, bs):
             "Diffusion model will catch nsamples * num_inference_steps inputs, "
             "you can reduce nsamples or num_inference_steps if OOM or take too much time."
         )
+        raw_num_inference_steps = self.num_inference_steps
+        self.num_inference_steps = 1
+        logger.info(
+            f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}"
+        )
         if isinstance(self.dataset, str):
             dataset = self.dataset.replace(" ", "")
             self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader(
@@ -308,40 +390,13 @@ def calib(self, nsamples, bs):
         total_cnt = 0
 
         total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader))
-        if self.pipe.dtype != self.model.dtype:
-            self.pipe.to(self.model.dtype)
-
-        if (
-            hasattr(self.model, "hf_device_map")
-            and len(self.model.hf_device_map) > 0
-            and self.pipe.device != self.model.device
-            and torch.device(self.model.device).type in ["cuda", "xpu"]
-        ):
-            logger.error(
-                "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. "
-                "Please use model path for quantization or "
-                "move the pipeline object to GPU/XPU before passing them into API."
-            )
-            exit(-1)
 
-        if self.pipe.device != self.model.device:
-            self.pipe.to(self.model.device)
-        self.pipe.to(self.model.dtype)
         with tqdm(range(1, total + 1), desc="cache block inputs") as pbar:
             for ids, prompts in self.dataloader:
                 if isinstance(prompts, tuple):
                     prompts = list(prompts)
                 try:
-                    self.pipe(
-                        prompt=prompts,
-                        guidance_scale=self.guidance_scale,
-                        num_inference_steps=self.num_inference_steps,
-                        generator=(
-                            None
-                            if self.generator_seed is None
-                            else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed)
-                        ),
-                    )
+                    self._run_pipeline(prompts)
                 except NotImplementedError:
                     pass
                 except Exception as error:
@@ -374,6 +429,8 @@ def calib(self, nsamples, bs):
                         self.inputs[k][key] = v[key][:max_len]
 
         # torch.cuda.empty_cache()
+        self.num_inference_steps = raw_num_inference_steps
+        logger.info(f"Restore num_inference_steps to {self.num_inference_steps} after calibration")
 
     def _should_stop_cache_forward(self, name: str) -> bool:
         """Determine whether current forward pass can stop after caching `name`."""
@@ -395,6 +452,12 @@ def _get_save_folder_name(self, format: OutputFormat) -> str:
         """
         # Replace special characters to make the folder name filesystem-safe
         sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-")
+        if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep":
+            # Use a subfolder only if there are multiple formats
+            if len(self.formats) > 1:
+                return os.path.join(self.orig_output_dir, sanitized_format)
+
+            return self.orig_output_dir
 
         # Use a subfolder only if there are multiple formats
         if len(self.formats) > 1:
@@ -423,6 +486,16 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
             return super().save_quantized(output_dir, format=format, inplace=inplace, **kwargs)
 
         compressed_model = None
+        if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep":
+            compressed_model = super().save_quantized(
+                output_dir=output_dir,
+                format=format,
+                inplace=inplace,
+                **kwargs,
+            )
+            self.pipe.tokenizer.save_pretrained(output_dir)
+            copy_python_files_from_model_cache(self.model, output_dir, copy_folders=["models", "vae", "utils"])
+            return compressed_model
         for name in self.pipe.components.keys():
             val = getattr(self.pipe, name)
             sub_module_path = (
@@ -443,3 +516,24 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
                 val.save_pretrained(sub_module_path)
         self.pipe.config.save_pretrained(output_dir)
         return compressed_model
+
+    def _align_device_and_dtype(self):
+        if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep":
+            return
+        if (
+            hasattr(self.model, "hf_device_map")
+            and len(self.model.hf_device_map) > 0
+            and type(self.pipe.device) != type(self.model.device)
+            and self.pipe.device != self.model.device
+            and torch.device(self.model.device).type in ["cuda", "xpu"]
+        ):
+            logger.error(
+                "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. "
+                "Please use model path for quantization or "
+                "move the pipeline object to GPU/XPU before passing them into API."
+            )
+            exit(-1)
+
+        self.pipe.to(self.model.dtype)
+
+        dispatch_model_by_all_available_devices(self.pipe, self.device_map)
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 76811ec3f..71d142a74 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -20,9 +20,10 @@
 from transformers.utils.versions import require_version
 
 from auto_round.utils import (
+    DEVICE_ENVIRON_VARIABLE_MAPPING,
+    detect_device,
     dispatch_model_block_wise,
     get_device_and_parallelism,
-    get_device_str,
     get_model_dtype,
     is_diffusion_model,
     set_cuda_visible_devices,
@@ -285,19 +286,14 @@ def eval_with_vllm(args):
         logger.info(f"Overriding VLLM parameters with custom args: {custom_vllm_kwargs}")
         vllm_kwargs.update(custom_vllm_kwargs)
 
-    device = get_device_str()
-    environ_mapping = {
-        "cuda": "CUDA_VISIBLE_DEVICES",
-        "xpu": "ZE_AFFINITY_MASK",
-        "hpu": "HABANA_VISIBLE_MODULES",
-    }
+    device = detect_device()
     if "tensor_parallel_size" not in vllm_kwargs:
         # Parse device_map to determine tensor_parallel_size and set the relevant env var
         # Only accept formats like "0" or "0,1,2". If the environment variable is
         # already set externally, do not overwrite it — but still derive
         # `tensor_parallel_size` from the existing value.
-        assert device in environ_mapping, f"Device {device} not supported for vllm tensor parallelism."
-        environ_name = environ_mapping[device]
+        assert device in DEVICE_ENVIRON_VARIABLE_MAPPING, f"Device {device} not supported for vllm tensor parallelism."
+        environ_name = DEVICE_ENVIRON_VARIABLE_MAPPING[device]
         device_map = args.device_map
         device_ids = [d.strip() for d in str(device_map).split(",") if d.strip().isdigit()]
 
diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py
index 747a4eb2b..30ee33e49 100644
--- a/auto_round/utils/device.py
+++ b/auto_round/utils/device.py
@@ -34,6 +34,13 @@
 from auto_round.logger import logger
 from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module
 
+DEVICE_ENVIRON_VARIABLE_MAPPING = {
+    "cuda": "CUDA_VISIBLE_DEVICES",
+    "xpu": "ZE_AFFINITY_MASK",
+    "hpu": "HABANA_VISIBLE_MODULES",
+}
+
+
 # Note on HPU usage:
 # There are two modes available for enabling auto-round on HPU:
 # 1. Compile Mode
@@ -1694,18 +1701,6 @@ def log_summary(self, msg: str = "", level: str = "info"):
         return summary
 
 
-def get_device_str():
-    """Get a string representation of the automatically detected device."""
-    if torch.cuda.is_available():
-        return "cuda"
-    elif torch.xpu.is_available():  # pragma: no cover
-        return "xpu"
-    elif is_hpex_available():  # pragma: no cover
-        return "hpu"
-    else:  # pragma: no cover
-        return "cpu"
-
-
 # Global singleton instance
 memory_monitor = MemoryMonitor()
 
@@ -1740,3 +1735,104 @@ def wrapper(*args, **kwargs):
         return wrapper
 
     return decorator
+
+
+# This function is designed for Auto Scheme and Diffusion Pipeline,
+# which requires dispatching the whole model on all available devices.
+def dispatch_model_by_all_available_devices(
+    model: torch.nn.Module, device_map: Union[str, int, dict, None]
+) -> torch.nn.Module:
+    # Important Notice: This dispatch does not follow dict device_map, just extract all available devices and use them
+    device_type = detect_device()
+    if device_type in DEVICE_ENVIRON_VARIABLE_MAPPING:
+        existing_env = os.environ.get(DEVICE_ENVIRON_VARIABLE_MAPPING[device_type])
+        if existing_env is None:
+            logger.warning_once(
+                "`get_balanced_memory` is used here, but no environment variable "
+                + "is set to specify device visibility. This may lead to OOM issue even the memory "
+                + "is large enough."
+            )
+
+    # Handle DiffusionPipeline: dispatch only the main sub-model (transformer / unet)
+    # across devices and move the remaining pipeline components to the primary device.
+    try:
+        from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+
+        if isinstance(model, DiffusionPipeline):
+            pipe = model
+            _device_map = 0 if device_map is None else device_map
+            devices = parse_available_devices(_device_map)
+            # Identify the main quantisable sub-model
+            main_attr = next(
+                (attr for attr in ("transformer", "unet") if isinstance(getattr(pipe, attr, None), torch.nn.Module)),
+                None,
+            )
+            if main_attr is None or len(devices) == 1:
+                # No identifiable main sub-model, or single target device:
+                # move the entire pipeline to the (first) device.
+                pipe.to(devices[0] if devices else "cuda:0")
+                return pipe
+            # Multi-device path: recursively dispatch the main sub-model,
+            # then move all remaining pipeline components to the primary device.
+            main_model = getattr(pipe, main_attr)
+            dispatched = dispatch_model_by_all_available_devices(main_model, _device_map)
+            setattr(pipe, main_attr, dispatched)
+            primary_device = devices[0]
+            for attr, component in pipe.components.items():
+                if attr == main_attr:
+                    continue
+                if isinstance(component, torch.nn.Module):
+                    try:
+                        component.to(primary_device)
+                    except Exception:
+                        pass
+            return pipe
+    except ImportError:
+        pass
+
+    if device_map is None:
+        device_map = 0
+
+    from auto_round.utils.common import normalize_no_split_modules
+
+    no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", []))
+    if device_map == "auto":
+        max_memory = get_balanced_memory(
+            model,
+            max_memory=None,
+            no_split_module_classes=no_split_modules,
+        )
+        device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules)
+        model = dispatch_model(model, device_map=device_map)
+        return model
+
+    devices = parse_available_devices(device_map)
+
+    if len(devices) == 1:
+        model.to(devices[0])
+        return model
+
+    max_memory = get_balanced_memory(
+        model,
+        max_memory=None,
+        no_split_module_classes=no_split_modules,
+    )
+
+    # Filter max_memory with devices
+    #  assume only one GPU model
+    new_max_memory = {}
+    for device in devices:
+        if ":" in device:
+            device = int(device.split(":")[-1])
+        elif device == "cpu":
+            device = "cpu"
+        elif isinstance(device, str):
+            device = 0
+        else:
+            raise ValueError(f"Unsupported device {device} in device_map: {device_map}")
+        new_max_memory[device] = max_memory[device]
+    if hasattr(model, "tie_weights") and callable(model.tie_weights):
+        model.tie_weights()
+    device_map = infer_auto_device_map(model, max_memory=new_max_memory, no_split_module_classes=no_split_modules)
+    model = dispatch_model(model, device_map=device_map)
+    return model
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index f0aec180a..7b2c574f1 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -666,6 +666,19 @@ def diffusion_load_model(
     if device_str is not None and "hpu" in device_str:
         torch_dtype = torch.bfloat16
 
+    try:
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+    except:
+        config = None
+
+    model_type = getattr(config, "model_type", "")
+    # A special case for NextStep
+    if model_type == "nextstep":
+        pipe, model = load_next_step_diffusion(pretrained_model_name_or_path, device_str)
+        return pipe, pipe.model
+
     pipelines = LazyImport("diffusers.pipelines")
     if isinstance(pretrained_model_name_or_path, str):
         if torch_dtype == "auto":
@@ -794,6 +807,21 @@ def is_gguf_model(model_path: Union[str, torch.nn.Module]) -> bool:
 def is_diffusion_model(model_or_path: Union[str, object]) -> bool:
     from auto_round.utils.common import LazyImport
 
+    # First check if it's a known diffusion pipeline by config/model_type to avoid unnecessary imports and file checks for non-diffusion models, which can be time-consuming.
+    try:
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True)
+        model_type = getattr(config, "model_type", "")
+        # A special case for NextStep
+        if model_type == "nextstep":
+            return True
+    except:
+        logger.warning(
+            f"Failed to load config for {model_or_path}, trying to check model_index.json for diffusion pipeline."
+        )
+
+    # Then check if model_index.json exists for diffusion pipeline, which is a strong signal of being a diffusion pipeline.
     if isinstance(model_or_path, str):
         index_file = None
         if not os.path.isdir(model_or_path):
@@ -1677,43 +1705,69 @@ def _copy_extra_model_files(src_dir: str, dst_dir: str):
 
 # Adapted from https://github.com/vllm-project/llm-compressor/blob/
 # 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144
-def copy_python_files_from_model_cache(model, save_path: str):
+def copy_python_files_from_model_cache(model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False):
+    """Copy Python files (and optionally subdirectories) from the model cache to *save_path*.
+
+    Args:
+        model: The model whose ``config._name_or_path`` points to the source cache.
+        save_path (str): Destination directory.
+        copy_folders (bool | list[str] | tuple[str, ...]): Controls which subdirectories
+            are copied from the cache root to *save_path*:
+
+            * ``False`` (default) – no folders are copied.
+            * ``True`` – every subdirectory that does not already exist in *save_path*
+              is copied (e.g. all of ``vae``, ``scheduler``, …).
+            * A list/tuple of folder names (e.g. ``["vae", "scheduler"]``) – only the
+              named subdirectories are copied.
+    """
+    import shutil
+
+    from huggingface_hub import hf_hub_download
+
     config = model.config
-    if hasattr(config, "_name_or_path"):
-        import os
-        import shutil
+    if not hasattr(config, "_name_or_path"):
+        return
 
-        from huggingface_hub import hf_hub_download
+    if version.parse(transformers.__version__) < version.parse("5.0.0"):
+        from transformers.utils import TRANSFORMERS_CACHE
 
-        if version.parse(transformers.__version__) < version.parse("5.0.0"):
-            from transformers.utils import TRANSFORMERS_CACHE
+        cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE)
+    else:
+        from huggingface_hub.constants import HF_HUB_CACHE
+
+        cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE)
+    from transformers.utils import http_user_agent
+
+    cache_path = config._name_or_path
+    if not os.path.exists(cache_path):
+        user_agent = http_user_agent()
+        config_file_path = hf_hub_download(
+            repo_id=cache_path,
+            filename="config.json",
+            cache_dir=cache_dir,
+            force_download=False,
+            user_agent=user_agent,
+        )
+        cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
 
-            cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE)
-        else:
-            from huggingface_hub.constants import HF_HUB_CACHE
-
-            cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE)
-        from transformers.utils import http_user_agent
-
-        cache_path = config._name_or_path
-        if not os.path.exists(cache_path):
-            user_agent = http_user_agent()
-            config_file_path = hf_hub_download(
-                repo_id=cache_path,
-                filename="config.json",
-                cache_dir=cache_dir,
-                force_download=False,
-                user_agent=user_agent,
-            )
-            cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
+    for file in os.listdir(cache_path):
+        full_file_name = os.path.join(cache_path, file)
+        if file.endswith(".py") and os.path.isfile(full_file_name):
+            logger.debug(f"Transferring {full_file_name} to {save_path}")
+            shutil.copy(full_file_name, save_path)
 
-        for file in os.listdir(cache_path):
-            full_file_name = os.path.join(cache_path, file)
-            if file.endswith(".py") and os.path.isfile(full_file_name):
-                logger.debug(f"Transferring {full_file_name} to {save_path}")
-                shutil.copy(full_file_name, save_path)
+    _copy_extra_model_files(cache_path, save_path)
 
-        _copy_extra_model_files(cache_path, save_path)
+    if copy_folders is not False:
+        for entry in os.listdir(cache_path):
+            src_entry = os.path.join(cache_path, entry)
+            dst_entry = os.path.join(save_path, entry)
+            if not os.path.isdir(src_entry):
+                continue
+            if copy_folders is True or entry in copy_folders:
+                if not os.path.exists(dst_entry):
+                    logger.debug(f"Transferring folder {src_entry} to {save_path}")
+                    shutil.copytree(src_entry, dst_entry)
 
 
 def extract_block_names_to_str(quant_block_list):
@@ -1882,3 +1936,35 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs):
         return base_hook(m, hidden_states, *positional_inputs, **kwargs)
 
     return forward
+
+
+def load_next_step_diffusion(pretrained_model_name_or_path, device_str):
+    from models.gen_pipeline import NextStepPipeline  # pylint: disable=E0401
+    from transformers import AutoModel, AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True
+    )
+    model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True)
+    # The model is loaded onto the device because more than one block requires input data.
+    pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16)
+
+    def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs):
+        """Default pipeline_fn for NextStep models.
+
+        Maps standard :class:`DiffusionCompressor` parameters to NextStep's
+        ``generate_image`` API.  Pass a custom ``pipeline_fn`` to
+        :class:`DiffusionCompressor` to override defaults or supply
+        model-specific kwargs (e.g. ``hw``, ``positive_prompt``,
+        ``cfg_schedule``, ``timesteps_shift``).
+        """
+        for prompt in (prompts if isinstance(prompts, list) else [prompts]):
+            pipe.generate_image(
+                prompt,
+                cfg=guidance_scale,
+                num_sampling_steps=num_inference_steps,
+                **kwargs,
+            )
+
+    pipe._autoround_pipeline_fn = _nextstep_pipeline_fn
+    return pipe, model
diff --git a/test/test_cuda/models/test_diffusion.py b/test/test_cuda/models/test_diffusion.py
index 11ea7da54..f7d265b00 100644
--- a/test/test_cuda/models/test_diffusion.py
+++ b/test/test_cuda/models/test_diffusion.py
@@ -49,9 +49,8 @@ def test_diffusion_rtn(self, tiny_flux_model_path):
         # skip model saving since it takes much time
         autoround.quantize()
 
-    @pytest.mark.skip_ci(reason="Tuning will OOM in CI; Only tiny model is suggested")  # skip this test in CI
     @require_optimum
-    def test_diffusion_tune(self, tiny_flux_model_path):
+    def test_diffusion_tune(self, tiny_flux_model_path, tmp_path):
         from diffusers import AutoPipelineForText2Image
 
         ## load the model
@@ -83,7 +82,7 @@ def test_diffusion_tune(self, tiny_flux_model_path):
             dataset=get_captions_dataset_path(),
         )
         # skip model saving since it takes much time
-        autoround.quantize()
+        autoround.quantize_and_save(tmp_path)
 
     @pytest.mark.skip_ci(reason="Download large model; Time-consuming")
     def test_diffusion_model_checker(self):