From 2bc369717eb113538fb61aa9a7e1d2f5d4f2a79c Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 30 Mar 2026 08:40:28 +0000 Subject: [PATCH 01/10] fix nextstep loading issue Signed-off-by: Xin He --- auto_round/utils/common.py | 1 + auto_round/utils/model.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index a81dcefbd..22091d788 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -494,6 +494,7 @@ def __getitem__(self, key): "patch_merger", "pre_mm_projector_norm", "vision", + "image", ] diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f0aec180a..5d86b18b7 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -557,6 +557,9 @@ def mllm_load_model( cls = getattr(base_lib, architectures) else: cls = AutoModelForCausalLM + # A special case for NextStep + if model_type == "nextstep": + cls = AutoModel try: model_load_kwargs = {} if model_subfolder is not None: From 2bb744ac7a8c22468d449b8c446ca6819e80c56d Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 30 Mar 2026 13:34:14 +0000 Subject: [PATCH 02/10] support 6.0.0 gptqmodel Signed-off-by: Xin He --- auto_round_extension/cuda/gptqmodel_marlin.py | 70 +++++++++++++++---- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/auto_round_extension/cuda/gptqmodel_marlin.py b/auto_round_extension/cuda/gptqmodel_marlin.py index 16949cb0a..761589c3f 100644 --- a/auto_round_extension/cuda/gptqmodel_marlin.py +++ b/auto_round_extension/cuda/gptqmodel_marlin.py @@ -30,6 +30,9 @@ def get_marlin_layer(): ##use an ugly wrapper to import gptqmodel on demand NEW_VERSION = False if Version(gptqmodel.__version__) >= Version("5.0.0"): NEW_VERSION = True + NEW_VERSION_6_0 = False + if Version(gptqmodel.__version__) >= Version("6.0.0"): + NEW_VERSION_6_0 = True from gptqmodel.models._const import DEVICE, PLATFORM # pylint: disable=E0401 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # pylint: disable=E0401 from gptqmodel.utils.backend import BACKEND # pylint: disable=E0401 @@ -244,20 +247,59 @@ def __init__( # (since we have only one group per output channel) desc_act = False - super().__init__( - bits=bits, - group_size=group_size, - sym=sym, - desc_act=desc_act, - in_features=in_features, - out_features=out_features, - bias=bias, - pack_dtype=pack_dtype, - backend=kwargs.pop("backend", BACKEND.MARLIN), - adapter=None, - register_buffers=False, - **kwargs, - ) + backend = kwargs.pop("backend", BACKEND.MARLIN) + if NEW_VERSION_6_0: + # gptqmodel >= 6.0.0: BaseQuantLinear no longer accepts group_size/sym/desc_act/pack_dtype + # directly; they must be passed via validate_kwargs. Attributes are also set manually. + super().__init__( + bits=bits, + in_features=in_features, + out_features=out_features, + bias=bias, + backend=backend, + adapter=None, + register_buffers=False, + validate_kwargs={ + "group_size": group_size, + "desc_act": desc_act, + "sym": sym, + "pack_dtype": pack_dtype, + }, + **kwargs, + ) + # Set attributes that intermediate classes (PackedQuantLinear / + # GPTQQuantLinear) would have set in the old API. + self.pack_dtype = pack_dtype + if pack_dtype == torch.int8: + self.pack_dtype_bits = 8 + elif pack_dtype == torch.int16: + self.pack_dtype_bits = 16 + elif pack_dtype == torch.int32: + self.pack_dtype_bits = 32 + elif pack_dtype == torch.int64: + self.pack_dtype_bits = 64 + else: + raise ValueError(f"Unsupported pack_dtype: {pack_dtype}") + self.pack_factor = self.pack_dtype_bits // bits + self.group_size = group_size if group_size != -1 else in_features + self.requested_group_size = group_size + self.desc_act = desc_act + self.sym = sym + else: + super().__init__( + bits=bits, + group_size=group_size, + sym=sym, + desc_act=desc_act, + in_features=in_features, + out_features=out_features, + bias=bias, + pack_dtype=pack_dtype, + backend=backend, + adapter=None, + register_buffers=False, + **kwargs, + ) # toggle fp32 mode depending on MARLIN or MARLIN_FP16 backend self.fp32 = True if self.backend in [BACKEND.MARLIN, BACKEND.AUTO] else False From 576e13086c85acae2180564b39d16778c929eaab Mon Sep 17 00:00:00 2001 From: Xin He Date: Fri, 3 Apr 2026 09:55:34 +0800 Subject: [PATCH 03/10] Enhance DiffusionCompressor with custom pipeline support and improve model loading for NextStep Signed-off-by: Xin He --- .../compressors/diffusion/compressor.py | 100 +++++++++++++----- auto_round/utils/common.py | 1 - auto_round/utils/model.py | 77 +++++++++++++- 3 files changed, 146 insertions(+), 32 deletions(-) diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index ea7eba39e..11d987728 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -56,6 +56,11 @@ class DiffusionCompressor(BaseCompressor): The more it is, the more closely it follows the prompt (default is 7.5). num_inference_steps (int): The reference number of denoising steps (default is 50). generator_seed (int): A sees that controls the initial noise from which an image is generated (default is None). + pipeline_fn (callable, optional): Custom callable to run the pipeline during calibration. + Signature: ``fn(pipe, prompts, *, guidance_scale, num_inference_steps, generator, **kwargs)``. + Use this to support models whose inference API differs from the standard convention + (e.g. NextStep). If ``None``, the standard ``pipe(prompts, ...)`` call is used unless + the loaded pipeline already exposes an ``_autoround_pipeline_fn`` attribute. scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. layer_config (dict): Configuration for weight quantization (default is None). dataset: The path or name of the calib dataset. @@ -102,9 +107,15 @@ def __init__( device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, seed: int = 42, + pipeline_fn: callable = None, **kwargs, ): logger.warning("Diffusion model quantization is experimental and is only validated on Flux models.") + if dataset == "NeelNanda/pile-10k": + dataset = "coco2014" + logger.warning( + "Dataset 'NeelNanda/pile-10k' is not suitable for diffusion model quantization, use coco2014 dataset instead." + ) model_dtype = kwargs.pop("model_dtype", None) self.guidance_scale = guidance_scale @@ -120,6 +131,8 @@ def __init__( self.model = model self.pipe = pipe + # Use explicit pipeline_fn; fall back to whatever diffusion_load_model attached to the pipe + self.pipeline_fn = pipeline_fn or getattr(pipe, "_autoround_pipeline_fn", None) all_blocks = get_block_names(model) self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names) @@ -278,6 +291,64 @@ def _get_current_num_elm( current_input_ids = [input_ids["hidden_states"][i] for i in indices] return sum(id.numel() for id in current_input_ids) + def _run_pipeline(self, prompts: list) -> None: + """Execute one full diffusion pipeline forward pass for calibration input capture. + + This drives all transformer blocks so that their intermediate inputs are recorded + by the hooks installed during calibration. + + **Extending for custom models** – choose whichever approach is simpler: + + * Pass a ``pipeline_fn`` to the constructor (no subclassing required). The + callable receives ``(pipe, prompts, *, guidance_scale, num_inference_steps, + generator, **kwargs)`` and must trigger a full forward pass. + * Subclass :class:`DiffusionCompressor` and override this method directly for + full control over the inference logic. + + Example – NextStep model:: + + def nextstep_fn(pipe, prompts, guidance_scale=7.5, + num_inference_steps=28, generator=None, + hw=(1024, 1024), **kwargs): + for prompt in (prompts if isinstance(prompts, list) else [prompts]): + pipe.generate_image( + prompt, + cfg=guidance_scale, + num_sampling_steps=num_inference_steps, + hw=hw, + **kwargs, + ) + + compressor = DiffusionCompressor( + model="path/to/nextstep", + pipeline_fn=nextstep_fn, + pipeline_fn_kwargs={"hw": (512, 512)}, + ) + + Args: + prompts (list[str]): Text prompts for the current calibration batch. + """ + generator = ( + None + if self.generator_seed is None + else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) + ) + if self.pipeline_fn is not None: + self.pipeline_fn( + self.pipe, + prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=generator, + ) + else: + self.pipe( + prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=generator, + ) + def calib(self, nsamples, bs): """Perform calibration for quantization. @@ -308,40 +379,13 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) - if self.pipe.dtype != self.model.dtype: - self.pipe.to(self.model.dtype) - - if ( - hasattr(self.model, "hf_device_map") - and len(self.model.hf_device_map) > 0 - and self.pipe.device != self.model.device - and torch.device(self.model.device).type in ["cuda", "xpu"] - ): - logger.error( - "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " - "Please use model path for quantization or " - "move the pipeline object to GPU/XPU before passing them into API." - ) - exit(-1) - if self.pipe.device != self.model.device: - self.pipe.to(self.model.device) - self.pipe.to(self.model.dtype) with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: for ids, prompts in self.dataloader: if isinstance(prompts, tuple): prompts = list(prompts) try: - self.pipe( - prompt=prompts, - guidance_scale=self.guidance_scale, - num_inference_steps=self.num_inference_steps, - generator=( - None - if self.generator_seed is None - else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) - ), - ) + self._run_pipeline(prompts) except NotImplementedError: pass except Exception as error: diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index 22091d788..a81dcefbd 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -494,7 +494,6 @@ def __getitem__(self, key): "patch_merger", "pre_mm_projector_norm", "vision", - "image", ] diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 5d86b18b7..5878d3df3 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -557,9 +557,6 @@ def mllm_load_model( cls = getattr(base_lib, architectures) else: cls = AutoModelForCausalLM - # A special case for NextStep - if model_type == "nextstep": - cls = AutoModel try: model_load_kwargs = {} if model_subfolder is not None: @@ -669,6 +666,46 @@ def diffusion_load_model( if device_str is not None and "hpu" in device_str: torch_dtype = torch.bfloat16 + try: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) + except: + config = None + + model_type = getattr(config, "model_type", "") + # A special case for NextStep + if model_type == "nextstep": + from models.gen_pipeline import NextStepPipeline # pylint: disable=E0401 + from transformers import AutoModel, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True + ) + model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True) + # The model is loaded onto the device because more than one block requires input data. + pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16) + + def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs): + """Default pipeline_fn for NextStep models. + + Maps standard :class:`DiffusionCompressor` parameters to NextStep's + ``generate_image`` API. Pass a custom ``pipeline_fn`` to + :class:`DiffusionCompressor` to override defaults or supply + model-specific kwargs (e.g. ``hw``, ``positive_prompt``, + ``cfg_schedule``, ``timesteps_shift``). + """ + for prompt in (prompts if isinstance(prompts, list) else [prompts]): + pipe.generate_image( + prompt, + cfg=guidance_scale, + num_sampling_steps=num_inference_steps, + **kwargs, + ) + + pipe._autoround_pipeline_fn = _nextstep_pipeline_fn + return pipe, pipe.model + pipelines = LazyImport("diffusers.pipelines") if isinstance(pretrained_model_name_or_path, str): if torch_dtype == "auto": @@ -732,6 +769,25 @@ def model_save_pretrained(model, save_directory, **kwargs): # non-meta model uses model.save_pretrained for model and config saving setattr(model, "save_pretrained", partial(model_save_pretrained, model)) + + if pipe.dtype != model.dtype: + pipe.to(model.dtype) + if pipe.device != model.device: + pipe.to(model.device) + + if ( + hasattr(model, "hf_device_map") + and len(model.hf_device_map) > 0 + and pipe.device != model.device + and torch.device(model.device).type in ["cuda", "xpu"] + ): + logger.error( + "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " + "Please use model path for quantization or " + "move the pipeline object to GPU/XPU before passing them into API." + ) + exit(-1) + return pipe, model.to(device) @@ -797,6 +853,21 @@ def is_gguf_model(model_path: Union[str, torch.nn.Module]) -> bool: def is_diffusion_model(model_or_path: Union[str, object]) -> bool: from auto_round.utils.common import LazyImport + # First check if it's a known diffusion pipeline by config/model_type to avoid unnecessary imports and file checks for non-diffusion models, which can be time-consuming. + try: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True) + model_type = getattr(config, "model_type", "") + # A special case for NextStep + if model_type == "nextstep": + return True + except: + logger.warning( + f"Failed to load config for {model_or_path}, trying to check model_index.json for diffusion pipeline." + ) + + # Then check if model_index.json exists for diffusion pipeline, which is a strong signal of being a diffusion pipeline. if isinstance(model_or_path, str): index_file = None if not os.path.isdir(model_or_path): From bb22b290c17a491a75b987067d22ecd95e429eb9 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 7 Apr 2026 12:23:03 +0000 Subject: [PATCH 04/10] fix bug for nextstep Signed-off-by: Xin He --- auto_round/compressors/base.py | 3 +- .../compressors/diffusion/compressor.py | 50 +++++- auto_round/utils/model.py | 169 ++++++++++-------- 3 files changed, 141 insertions(+), 81 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a8735407e..a4500d556 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2320,7 +2320,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l max_memory=new_max_memory, no_split_module_classes=no_split_modules, ) - self.model.tie_weights() + if hasattr(self.model, "tie_weights") and callable(self.model.tie_weights): + self.model.tie_weights() device_map = infer_auto_device_map( self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules ) diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 11d987728..8339f4162 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -35,6 +35,7 @@ get_block_names, merge_block_output_keys, wrap_block_forward_positional_to_kwargs, + copy_python_files_from_model_cache, ) pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") @@ -207,7 +208,7 @@ def _get_current_q_output( device: str, cache_device: str = "cpu", ) -> torch.Tensor: - output_config = output_configs.get(block.__class__.__name__, []) + output_config = output_configs.get(block.__class__.__name__, ["hidden_states"]) idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") current_input_ids, current_input_others = self._sampling_inputs( input_ids, @@ -251,7 +252,7 @@ def _get_block_outputs( """ output = defaultdict(list) - output_config = output_configs.get(block.__class__.__name__, []) + output_config = output_configs.get(block.__class__.__name__, ["hidden_states"]) if isinstance(input_ids, dict): nsamples = len(input_ids["hidden_states"]) else: @@ -269,6 +270,8 @@ def _get_block_outputs( tmp_input_ids = hidden_states tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) + if isinstance(tmp_output, torch.Tensor): + tmp_output = [tmp_output] assert len(output_config) == len(tmp_output) tmp_output = dict(zip(output_config, tmp_output)) @@ -379,6 +382,7 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) + self._align_device_and_dtype() with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: for ids, prompts in self.dataloader: @@ -439,6 +443,12 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: """ # Replace special characters to make the folder name filesystem-safe sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") + if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep": + # Use a subfolder only if there are multiple formats + if len(self.formats) > 1: + return os.path.join(self.orig_output_dir, sanitized_format) + + return self.orig_output_dir # Use a subfolder only if there are multiple formats if len(self.formats) > 1: @@ -467,6 +477,16 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k return super().save_quantized(output_dir, format=format, inplace=inplace, **kwargs) compressed_model = None + if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep": + compressed_model = super().save_quantized( + output_dir=output_dir, + format=format, + inplace=inplace, + **kwargs, + ) + self.pipe.tokenizer.save_pretrained(output_dir) + copy_python_files_from_model_cache(self.model, output_dir, copy_folders=["models", "vae", "utils"]) + return compressed_model for name in self.pipe.components.keys(): val = getattr(self.pipe, name) sub_module_path = ( @@ -487,3 +507,29 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k val.save_pretrained(sub_module_path) self.pipe.config.save_pretrained(output_dir) return compressed_model + + def _align_device_and_dtype(self): + if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep": + return + if ( + hasattr(self.model, "hf_device_map") + and len(self.model.hf_device_map) > 0 + and type(self.pipe.device) != type(self.model.device) + and self.pipe.device != self.model.device + and torch.device(self.model.device).type in ["cuda", "xpu"] + ): + logger.error( + "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " + "Please use model path for quantization or " + "move the pipeline object to GPU/XPU before passing them into API." + ) + exit(-1) + + if self.pipe.device != self.model.device: + self.pipe.to(self.model.device) + if self.pipe.dtype != self.model.dtype: + self.pipe.to(self.model.dtype) + + +def save_next_step_diffusion(): + ar.model.save_pretrained("nextstep_diffusion_model") diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 5878d3df3..b08e537f3 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -676,34 +676,7 @@ def diffusion_load_model( model_type = getattr(config, "model_type", "") # A special case for NextStep if model_type == "nextstep": - from models.gen_pipeline import NextStepPipeline # pylint: disable=E0401 - from transformers import AutoModel, AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True - ) - model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True) - # The model is loaded onto the device because more than one block requires input data. - pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16) - - def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs): - """Default pipeline_fn for NextStep models. - - Maps standard :class:`DiffusionCompressor` parameters to NextStep's - ``generate_image`` API. Pass a custom ``pipeline_fn`` to - :class:`DiffusionCompressor` to override defaults or supply - model-specific kwargs (e.g. ``hw``, ``positive_prompt``, - ``cfg_schedule``, ``timesteps_shift``). - """ - for prompt in (prompts if isinstance(prompts, list) else [prompts]): - pipe.generate_image( - prompt, - cfg=guidance_scale, - num_sampling_steps=num_inference_steps, - **kwargs, - ) - - pipe._autoround_pipeline_fn = _nextstep_pipeline_fn + pipe, model = load_next_step_diffusion(pretrained_model_name_or_path, device_str) return pipe, pipe.model pipelines = LazyImport("diffusers.pipelines") @@ -769,25 +742,6 @@ def model_save_pretrained(model, save_directory, **kwargs): # non-meta model uses model.save_pretrained for model and config saving setattr(model, "save_pretrained", partial(model_save_pretrained, model)) - - if pipe.dtype != model.dtype: - pipe.to(model.dtype) - if pipe.device != model.device: - pipe.to(model.device) - - if ( - hasattr(model, "hf_device_map") - and len(model.hf_device_map) > 0 - and pipe.device != model.device - and torch.device(model.device).type in ["cuda", "xpu"] - ): - logger.error( - "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " - "Please use model path for quantization or " - "move the pipeline object to GPU/XPU before passing them into API." - ) - exit(-1) - return pipe, model.to(device) @@ -1751,43 +1705,71 @@ def _copy_extra_model_files(src_dir: str, dst_dir: str): # Adapted from https://github.com/vllm-project/llm-compressor/blob/ # 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144 -def copy_python_files_from_model_cache(model, save_path: str): +def copy_python_files_from_model_cache( + model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False +): + """Copy Python files (and optionally subdirectories) from the model cache to *save_path*. + + Args: + model: The model whose ``config._name_or_path`` points to the source cache. + save_path (str): Destination directory. + copy_folders (bool | list[str] | tuple[str, ...]): Controls which subdirectories + are copied from the cache root to *save_path*: + + * ``False`` (default) – no folders are copied. + * ``True`` – every subdirectory that does not already exist in *save_path* + is copied (e.g. all of ``vae``, ``scheduler``, …). + * A list/tuple of folder names (e.g. ``["vae", "scheduler"]``) – only the + named subdirectories are copied. + """ + import shutil + + from huggingface_hub import hf_hub_download + config = model.config - if hasattr(config, "_name_or_path"): - import os - import shutil + if not hasattr(config, "_name_or_path"): + return - from huggingface_hub import hf_hub_download + if version.parse(transformers.__version__) < version.parse("5.0.0"): + from transformers.utils import TRANSFORMERS_CACHE - if version.parse(transformers.__version__) < version.parse("5.0.0"): - from transformers.utils import TRANSFORMERS_CACHE + cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE) + else: + from huggingface_hub.constants import HF_HUB_CACHE + + cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE) + from transformers.utils import http_user_agent + + cache_path = config._name_or_path + if not os.path.exists(cache_path): + user_agent = http_user_agent() + config_file_path = hf_hub_download( + repo_id=cache_path, + filename="config.json", + cache_dir=cache_dir, + force_download=False, + user_agent=user_agent, + ) + cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) - cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE) - else: - from huggingface_hub.constants import HF_HUB_CACHE - - cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE) - from transformers.utils import http_user_agent - - cache_path = config._name_or_path - if not os.path.exists(cache_path): - user_agent = http_user_agent() - config_file_path = hf_hub_download( - repo_id=cache_path, - filename="config.json", - cache_dir=cache_dir, - force_download=False, - user_agent=user_agent, - ) - cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) + for file in os.listdir(cache_path): + full_file_name = os.path.join(cache_path, file) + if file.endswith(".py") and os.path.isfile(full_file_name): + logger.debug(f"Transferring {full_file_name} to {save_path}") + shutil.copy(full_file_name, save_path) - for file in os.listdir(cache_path): - full_file_name = os.path.join(cache_path, file) - if file.endswith(".py") and os.path.isfile(full_file_name): - logger.debug(f"Transferring {full_file_name} to {save_path}") - shutil.copy(full_file_name, save_path) + _copy_extra_model_files(cache_path, save_path) - _copy_extra_model_files(cache_path, save_path) + if copy_folders is not False: + for entry in os.listdir(cache_path): + src_entry = os.path.join(cache_path, entry) + dst_entry = os.path.join(save_path, entry) + if not os.path.isdir(src_entry): + continue + if copy_folders is True or entry in copy_folders: + if not os.path.exists(dst_entry): + logger.debug(f"Transferring folder {src_entry} to {save_path}") + shutil.copytree(src_entry, dst_entry) def extract_block_names_to_str(quant_block_list): @@ -1956,3 +1938,34 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): return base_hook(m, hidden_states, *positional_inputs, **kwargs) return forward + +def load_next_step_diffusion(pretrained_model_name_or_path, device_str): + from models.gen_pipeline import NextStepPipeline # pylint: disable=E0401 + from transformers import AutoModel, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True + ) + model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True) + # The model is loaded onto the device because more than one block requires input data. + pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16) + + def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs): + """Default pipeline_fn for NextStep models. + + Maps standard :class:`DiffusionCompressor` parameters to NextStep's + ``generate_image`` API. Pass a custom ``pipeline_fn`` to + :class:`DiffusionCompressor` to override defaults or supply + model-specific kwargs (e.g. ``hw``, ``positive_prompt``, + ``cfg_schedule``, ``timesteps_shift``). + """ + for prompt in (prompts if isinstance(prompts, list) else [prompts]): + pipe.generate_image( + prompt, + cfg=guidance_scale, + num_sampling_steps=num_inference_steps, + **kwargs, + ) + + pipe._autoround_pipeline_fn = _nextstep_pipeline_fn + return pipe, model From 87c403786694e0f3f1fc495686a0fbe8f34e8e1f Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 7 Apr 2026 13:46:26 +0000 Subject: [PATCH 05/10] set self.num_inference_steps=1 for calibration Signed-off-by: Xin He --- auto_round/compressors/diffusion/README.md | 9 ++++++--- auto_round/compressors/diffusion/compressor.py | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/diffusion/README.md b/auto_round/compressors/diffusion/README.md index c5b307df5..c0f8eb534 100644 --- a/auto_round/compressors/diffusion/README.md +++ b/auto_round/compressors/diffusion/README.md @@ -64,9 +64,12 @@ auto-round \ For diffusion models, currently we only validate quantizaion on the FLUX.1-dev, which involves quantizing the transformer component of the pipeline. -| Model | calibration dataset | -|--------------|--------------| -| black-forest-labs/FLUX.1-dev | COCO2014 | +| Model | calibration dataset | Model Link | +|---------------|---------------------|--------------| +| black-forest-labs/FLUX.1-dev | COCO2014 | - | +| Tongyi-MAI/Z-Image | COCO2014 | - | +| Tongyi-MAI/Z-Image-Turb | COCO2014 | - | +| stepfun-ai/NextStep-1.1 | COCO2014 | - | diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 8339f4162..b729f3f8e 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -368,6 +368,9 @@ def calib(self, nsamples, bs): "Diffusion model will catch nsamples * num_inference_steps inputs, " "you can reduce nsamples or num_inference_steps if OOM or take too much time." ) + raw_num_inference_steps = self.num_inference_steps + self.num_inference_steps = 1 + logger.info(f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}") if isinstance(self.dataset, str): dataset = self.dataset.replace(" ", "") self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader( @@ -422,6 +425,8 @@ def calib(self, nsamples, bs): self.inputs[k][key] = v[key][:max_len] # torch.cuda.empty_cache() + self.num_inference_steps = raw_num_inference_steps + logger.info(f"Restore num_inference_steps to {self.num_inference_steps} after calibration") def _should_stop_cache_forward(self, name: str) -> bool: """Determine whether current forward pass can stop after caching `name`.""" From 1c012007ebc01681ab1cb763d2711217d03514fc Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Apr 2026 02:46:10 +0000 Subject: [PATCH 06/10] Remove unused function; add CUDA CI for diffusion tuning test; revert gptqmodel fix Signed-off-by: Xin He --- .../compressors/diffusion/compressor.py | 4 -- auto_round_extension/cuda/gptqmodel_marlin.py | 70 ++++--------------- test/test_cuda/models/test_diffusion.py | 1 - 3 files changed, 14 insertions(+), 61 deletions(-) diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index b729f3f8e..5c2feebe1 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -534,7 +534,3 @@ def _align_device_and_dtype(self): self.pipe.to(self.model.device) if self.pipe.dtype != self.model.dtype: self.pipe.to(self.model.dtype) - - -def save_next_step_diffusion(): - ar.model.save_pretrained("nextstep_diffusion_model") diff --git a/auto_round_extension/cuda/gptqmodel_marlin.py b/auto_round_extension/cuda/gptqmodel_marlin.py index 761589c3f..16949cb0a 100644 --- a/auto_round_extension/cuda/gptqmodel_marlin.py +++ b/auto_round_extension/cuda/gptqmodel_marlin.py @@ -30,9 +30,6 @@ def get_marlin_layer(): ##use an ugly wrapper to import gptqmodel on demand NEW_VERSION = False if Version(gptqmodel.__version__) >= Version("5.0.0"): NEW_VERSION = True - NEW_VERSION_6_0 = False - if Version(gptqmodel.__version__) >= Version("6.0.0"): - NEW_VERSION_6_0 = True from gptqmodel.models._const import DEVICE, PLATFORM # pylint: disable=E0401 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # pylint: disable=E0401 from gptqmodel.utils.backend import BACKEND # pylint: disable=E0401 @@ -247,59 +244,20 @@ def __init__( # (since we have only one group per output channel) desc_act = False - backend = kwargs.pop("backend", BACKEND.MARLIN) - if NEW_VERSION_6_0: - # gptqmodel >= 6.0.0: BaseQuantLinear no longer accepts group_size/sym/desc_act/pack_dtype - # directly; they must be passed via validate_kwargs. Attributes are also set manually. - super().__init__( - bits=bits, - in_features=in_features, - out_features=out_features, - bias=bias, - backend=backend, - adapter=None, - register_buffers=False, - validate_kwargs={ - "group_size": group_size, - "desc_act": desc_act, - "sym": sym, - "pack_dtype": pack_dtype, - }, - **kwargs, - ) - # Set attributes that intermediate classes (PackedQuantLinear / - # GPTQQuantLinear) would have set in the old API. - self.pack_dtype = pack_dtype - if pack_dtype == torch.int8: - self.pack_dtype_bits = 8 - elif pack_dtype == torch.int16: - self.pack_dtype_bits = 16 - elif pack_dtype == torch.int32: - self.pack_dtype_bits = 32 - elif pack_dtype == torch.int64: - self.pack_dtype_bits = 64 - else: - raise ValueError(f"Unsupported pack_dtype: {pack_dtype}") - self.pack_factor = self.pack_dtype_bits // bits - self.group_size = group_size if group_size != -1 else in_features - self.requested_group_size = group_size - self.desc_act = desc_act - self.sym = sym - else: - super().__init__( - bits=bits, - group_size=group_size, - sym=sym, - desc_act=desc_act, - in_features=in_features, - out_features=out_features, - bias=bias, - pack_dtype=pack_dtype, - backend=backend, - adapter=None, - register_buffers=False, - **kwargs, - ) + super().__init__( + bits=bits, + group_size=group_size, + sym=sym, + desc_act=desc_act, + in_features=in_features, + out_features=out_features, + bias=bias, + pack_dtype=pack_dtype, + backend=kwargs.pop("backend", BACKEND.MARLIN), + adapter=None, + register_buffers=False, + **kwargs, + ) # toggle fp32 mode depending on MARLIN or MARLIN_FP16 backend self.fp32 = True if self.backend in [BACKEND.MARLIN, BACKEND.AUTO] else False diff --git a/test/test_cuda/models/test_diffusion.py b/test/test_cuda/models/test_diffusion.py index 11ea7da54..a46293f26 100644 --- a/test/test_cuda/models/test_diffusion.py +++ b/test/test_cuda/models/test_diffusion.py @@ -49,7 +49,6 @@ def test_diffusion_rtn(self, tiny_flux_model_path): # skip model saving since it takes much time autoround.quantize() - @pytest.mark.skip_ci(reason="Tuning will OOM in CI; Only tiny model is suggested") # skip this test in CI @require_optimum def test_diffusion_tune(self, tiny_flux_model_path): from diffusers import AutoPipelineForText2Image From 240df28f3a1e646fd5fbff4be7dc46d0b37b972d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 02:44:52 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/diffusion/compressor.py | 6 ++++-- auto_round/utils/model.py | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 5c2feebe1..14685b5d7 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -29,13 +29,13 @@ from auto_round.utils import ( LazyImport, clear_memory, + copy_python_files_from_model_cache, diffusion_load_model, extract_block_names_to_str, find_matching_blocks, get_block_names, merge_block_output_keys, wrap_block_forward_positional_to_kwargs, - copy_python_files_from_model_cache, ) pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") @@ -370,7 +370,9 @@ def calib(self, nsamples, bs): ) raw_num_inference_steps = self.num_inference_steps self.num_inference_steps = 1 - logger.info(f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}") + logger.info( + f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}" + ) if isinstance(self.dataset, str): dataset = self.dataset.replace(" ", "") self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader( diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index b08e537f3..7b2c574f1 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -1705,9 +1705,7 @@ def _copy_extra_model_files(src_dir: str, dst_dir: str): # Adapted from https://github.com/vllm-project/llm-compressor/blob/ # 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144 -def copy_python_files_from_model_cache( - model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False -): +def copy_python_files_from_model_cache(model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False): """Copy Python files (and optionally subdirectories) from the model cache to *save_path*. Args: @@ -1939,6 +1937,7 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): return forward + def load_next_step_diffusion(pretrained_model_name_or_path, device_str): from models.gen_pipeline import NextStepPipeline # pylint: disable=E0401 from transformers import AutoModel, AutoTokenizer From 04bb9100a11207882ac499a40050a93341921b1d Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Apr 2026 02:52:25 +0000 Subject: [PATCH 08/10] fix bug Signed-off-by: Xin He --- auto_round/compressors/diffusion/compressor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 14685b5d7..6699d6cb2 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -532,7 +532,6 @@ def _align_device_and_dtype(self): ) exit(-1) + self.pipe.to(self.model.dtype) if self.pipe.device != self.model.device: self.pipe.to(self.model.device) - if self.pipe.dtype != self.model.dtype: - self.pipe.to(self.model.dtype) From d84c2d24d2a9a3f890fdc87fabe71818190c6448 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 8 Apr 2026 06:44:35 +0000 Subject: [PATCH 09/10] Refactor device dispatching logic; remove unused function and update imports Signed-off-by: Xin He --- auto_round/auto_scheme/delta_loss.py | 2 +- auto_round/auto_scheme/utils.py | 49 +------ auto_round/compressors/base.py | 12 +- .../compressors/diffusion/compressor.py | 8 +- auto_round/eval/eval_cli.py | 14 +- auto_round/utils/device.py | 123 ++++++++++++++++-- test/test_cuda/models/test_diffusion.py | 4 +- 7 files changed, 134 insertions(+), 78 deletions(-) diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py index 66e116e84..ec5320a96 100644 --- a/auto_round/auto_scheme/delta_loss.py +++ b/auto_round/auto_scheme/delta_loss.py @@ -27,7 +27,6 @@ apply_quant_scheme, compute_avg_bits_for_scheme, compute_layer_bits, - dispatch_model_by_all_available_devices, parse_shared_layers, remove_quant_scheme, ) @@ -54,6 +53,7 @@ set_module, set_non_auto_device_map, to_device, + dispatch_model_by_all_available_devices, ) from auto_round.utils.device import MemoryMonitor from auto_round.utils.offload import OffloadManager diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py index 51b8da8e1..2b09f1459 100644 --- a/auto_round/auto_scheme/utils.py +++ b/auto_round/auto_scheme/utils.py @@ -29,6 +29,7 @@ is_hpex_available, normalize_no_split_modules, parse_available_devices, + DEVICE_ENVIRON_VARIABLE_MAPPING ) @@ -213,54 +214,6 @@ def compute_layer_bits( return total_bits, avg_bits -# Important Notice This dispatch does not follow dict device_map, just extract all available devices and use them -def dispatch_model_by_all_available_devices( - model: torch.nn.Module, device_map: Union[str, int, dict, None] -) -> torch.nn.Module: - if device_map is None: - device_map = 0 - - no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", [])) - if device_map == "auto": - max_memory = get_balanced_memory( - model, - max_memory=None, - no_split_module_classes=no_split_modules, - ) - device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) - model = dispatch_model(model, device_map=device_map) - return model - - devices = parse_available_devices(device_map) - - if len(devices) == 1: - model.to(devices[0]) - return model - - max_memory = get_balanced_memory( - model, - max_memory=None, - no_split_module_classes=no_split_modules, - ) - - # Filter max_memory with devices - # assume only one GPU model - new_max_memory = {} - for device in devices: - if ":" in device: - device = int(device.split(":")[-1]) - elif device == "cpu": - device = "cpu" - elif isinstance(device, str): - device = 0 - else: - raise ValueError(f"Unsupported device {device} in device_map: {device_map}") - new_max_memory[device] = max_memory[device] - model.tie_weights() - device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) - model = dispatch_model(model, device_map=device_map) - return model - def merge_lists_unionfind(list_of_lists): parent = {} diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a4500d556..633301194 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1573,7 +1573,11 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) block = convert_module_to_hp_if_necessary(block, dtype=self.amp_dtype, device=self.device) update_block_global_scale_if_needed(block, self.data_type, self.group_size) self._register_act_max_hook(block) - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: + if ( + is_auto_device_mapping(self.device_map) + and len(self.device_list) > 1 + and not getattr(self, "is_diffusion", False) + ): set_auto_device_map_for_block_with_tuning( block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device ) @@ -2980,7 +2984,11 @@ def _quantize_block( if auto_offload: # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: + if ( + is_auto_device_mapping(self.device_map) + and len(self.device_list) > 1 + and not getattr(self, "is_diffusion", False) + ): card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device ) diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 6699d6cb2..914fe95cc 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -36,6 +36,7 @@ get_block_names, merge_block_output_keys, wrap_block_forward_positional_to_kwargs, + dispatch_model_by_all_available_devices, ) pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") @@ -87,6 +88,7 @@ class DiffusionCompressor(BaseCompressor): act_dynamic: bool | None super_bits: int | None super_group_size: int | None + is_diffusion: bool = True def __init__( self, @@ -176,6 +178,7 @@ def __init__( to_quant_block_names=to_quant_block_names, **kwargs, ) + self._align_device_and_dtype() def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: # flux transformer model's blocks will update hidden_states and encoder_hidden_states @@ -387,7 +390,6 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) - self._align_device_and_dtype() with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: for ids, prompts in self.dataloader: @@ -533,5 +535,5 @@ def _align_device_and_dtype(self): exit(-1) self.pipe.to(self.model.dtype) - if self.pipe.device != self.model.device: - self.pipe.to(self.model.device) + + dispatch_model_by_all_available_devices(self.pipe, self.device_map) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 76811ec3f..874615ad9 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -22,10 +22,11 @@ from auto_round.utils import ( dispatch_model_block_wise, get_device_and_parallelism, - get_device_str, + detect_device, get_model_dtype, is_diffusion_model, set_cuda_visible_devices, + DEVICE_ENVIRON_VARIABLE_MAPPING, ) @@ -285,19 +286,14 @@ def eval_with_vllm(args): logger.info(f"Overriding VLLM parameters with custom args: {custom_vllm_kwargs}") vllm_kwargs.update(custom_vllm_kwargs) - device = get_device_str() - environ_mapping = { - "cuda": "CUDA_VISIBLE_DEVICES", - "xpu": "ZE_AFFINITY_MASK", - "hpu": "HABANA_VISIBLE_MODULES", - } + device = detect_device() if "tensor_parallel_size" not in vllm_kwargs: # Parse device_map to determine tensor_parallel_size and set the relevant env var # Only accept formats like "0" or "0,1,2". If the environment variable is # already set externally, do not overwrite it — but still derive # `tensor_parallel_size` from the existing value. - assert device in environ_mapping, f"Device {device} not supported for vllm tensor parallelism." - environ_name = environ_mapping[device] + assert device in DEVICE_ENVIRON_VARIABLE_MAPPING, f"Device {device} not supported for vllm tensor parallelism." + environ_name = DEVICE_ENVIRON_VARIABLE_MAPPING[device] device_map = args.device_map device_ids = [d.strip() for d in str(device_map).split(",") if d.strip().isdigit()] diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 747a4eb2b..e6611ece9 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -34,6 +34,13 @@ from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module +DEVICE_ENVIRON_VARIABLE_MAPPING = { + "cuda": "CUDA_VISIBLE_DEVICES", + "xpu": "ZE_AFFINITY_MASK", + "hpu": "HABANA_VISIBLE_MODULES", +} + + # Note on HPU usage: # There are two modes available for enabling auto-round on HPU: # 1. Compile Mode @@ -42,7 +49,6 @@ # The compile mode can speed up quantization process but still in experimental stage. # 2. Lazy Mode (By default) - ################ Check available sys.module to decide behavior ################# def is_package_available(package_name: str) -> bool: """Check if the package exists in the environment without importing. @@ -1694,18 +1700,6 @@ def log_summary(self, msg: str = "", level: str = "info"): return summary -def get_device_str(): - """Get a string representation of the automatically detected device.""" - if torch.cuda.is_available(): - return "cuda" - elif torch.xpu.is_available(): # pragma: no cover - return "xpu" - elif is_hpex_available(): # pragma: no cover - return "hpu" - else: # pragma: no cover - return "cpu" - - # Global singleton instance memory_monitor = MemoryMonitor() @@ -1740,3 +1734,106 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +# This function is designed for Auto Scheme and Diffusion Pipeline, +# which requires dispatching the whole model on all available devices. +def dispatch_model_by_all_available_devices( + model: torch.nn.Module, device_map: Union[str, int, dict, None] +) -> torch.nn.Module: + # Important Notice: This dispatch does not follow dict device_map, just extract all available devices and use them + device_type = detect_device() + if device_type in DEVICE_ENVIRON_VARIABLE_MAPPING: + existing_env = os.environ.get(DEVICE_ENVIRON_VARIABLE_MAPPING[device_type]) + if existing_env is None: + logger.warning_once("`get_balanced_memory` is used here, but no environment variable " + + "is set to specify device visibility. This may lead to OOM issue even the memory " + + "is large enough.") + + # Handle DiffusionPipeline: dispatch only the main sub-model (transformer / unet) + # across devices and move the remaining pipeline components to the primary device. + try: + from diffusers.pipelines.pipeline_utils import DiffusionPipeline + + if isinstance(model, DiffusionPipeline): + pipe = model + _device_map = 0 if device_map is None else device_map + devices = parse_available_devices(_device_map) + # Identify the main quantisable sub-model + main_attr = next( + ( + attr + for attr in ("transformer", "unet") + if isinstance(getattr(pipe, attr, None), torch.nn.Module) + ), + None, + ) + if main_attr is None or len(devices) == 1: + # No identifiable main sub-model, or single target device: + # move the entire pipeline to the (first) device. + pipe.to(devices[0] if devices else "cuda:0") + return pipe + # Multi-device path: recursively dispatch the main sub-model, + # then move all remaining pipeline components to the primary device. + main_model = getattr(pipe, main_attr) + dispatched = dispatch_model_by_all_available_devices(main_model, _device_map) + setattr(pipe, main_attr, dispatched) + primary_device = devices[0] + for attr, component in pipe.components.items(): + if attr == main_attr: + continue + if isinstance(component, torch.nn.Module): + try: + component.to(primary_device) + except Exception: + pass + return pipe + except ImportError: + pass + + if device_map is None: + device_map = 0 + + from auto_round.utils.common import normalize_no_split_modules + + no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", [])) + if device_map == "auto": + max_memory = get_balanced_memory( + model, + max_memory=None, + no_split_module_classes=no_split_modules, + ) + device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) + model = dispatch_model(model, device_map=device_map) + return model + + devices = parse_available_devices(device_map) + + if len(devices) == 1: + model.to(devices[0]) + return model + + max_memory = get_balanced_memory( + model, + max_memory=None, + no_split_module_classes=no_split_modules, + ) + + # Filter max_memory with devices + # assume only one GPU model + new_max_memory = {} + for device in devices: + if ":" in device: + device = int(device.split(":")[-1]) + elif device == "cpu": + device = "cpu" + elif isinstance(device, str): + device = 0 + else: + raise ValueError(f"Unsupported device {device} in device_map: {device_map}") + new_max_memory[device] = max_memory[device] + if hasattr(model, "tie_weights") and callable(model.tie_weights): + model.tie_weights() + device_map = infer_auto_device_map(model, max_memory=new_max_memory, no_split_module_classes=no_split_modules) + model = dispatch_model(model, device_map=device_map) + return model diff --git a/test/test_cuda/models/test_diffusion.py b/test/test_cuda/models/test_diffusion.py index a46293f26..f7d265b00 100644 --- a/test/test_cuda/models/test_diffusion.py +++ b/test/test_cuda/models/test_diffusion.py @@ -50,7 +50,7 @@ def test_diffusion_rtn(self, tiny_flux_model_path): autoround.quantize() @require_optimum - def test_diffusion_tune(self, tiny_flux_model_path): + def test_diffusion_tune(self, tiny_flux_model_path, tmp_path): from diffusers import AutoPipelineForText2Image ## load the model @@ -82,7 +82,7 @@ def test_diffusion_tune(self, tiny_flux_model_path): dataset=get_captions_dataset_path(), ) # skip model saving since it takes much time - autoround.quantize() + autoround.quantize_and_save(tmp_path) @pytest.mark.skip_ci(reason="Download large model; Time-consuming") def test_diffusion_model_checker(self): From 86328025ad0b9f4ee8ebeb828887ac81f381a2a6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 07:18:54 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/auto_scheme/delta_loss.py | 2 +- auto_round/auto_scheme/utils.py | 3 +-- auto_round/compressors/diffusion/compressor.py | 2 +- auto_round/eval/eval_cli.py | 4 ++-- auto_round/utils/device.py | 15 +++++++-------- 5 files changed, 12 insertions(+), 14 deletions(-) diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py index ec5320a96..7610e94af 100644 --- a/auto_round/auto_scheme/delta_loss.py +++ b/auto_round/auto_scheme/delta_loss.py @@ -44,6 +44,7 @@ SUPPORTED_LAYER_TYPES, check_to_quantized, clear_memory, + dispatch_model_by_all_available_devices, get_block_names, get_major_device, get_module, @@ -53,7 +54,6 @@ set_module, set_non_auto_device_map, to_device, - dispatch_model_by_all_available_devices, ) from auto_round.utils.device import MemoryMonitor from auto_round.utils.offload import OffloadManager diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py index 2b09f1459..b6b744a5a 100644 --- a/auto_round/auto_scheme/utils.py +++ b/auto_round/auto_scheme/utils.py @@ -21,6 +21,7 @@ from auto_round.schemes import QuantizationScheme, preset_name_to_scheme from auto_round.utils import ( + DEVICE_ENVIRON_VARIABLE_MAPPING, SUPPORTED_LAYER_TYPES, check_to_quantized, get_block_names, @@ -29,7 +30,6 @@ is_hpex_available, normalize_no_split_modules, parse_available_devices, - DEVICE_ENVIRON_VARIABLE_MAPPING ) @@ -214,7 +214,6 @@ def compute_layer_bits( return total_bits, avg_bits - def merge_lists_unionfind(list_of_lists): parent = {} diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index 914fe95cc..c338a1f1e 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -31,12 +31,12 @@ clear_memory, copy_python_files_from_model_cache, diffusion_load_model, + dispatch_model_by_all_available_devices, extract_block_names_to_str, find_matching_blocks, get_block_names, merge_block_output_keys, wrap_block_forward_positional_to_kwargs, - dispatch_model_by_all_available_devices, ) pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 874615ad9..71d142a74 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -20,13 +20,13 @@ from transformers.utils.versions import require_version from auto_round.utils import ( + DEVICE_ENVIRON_VARIABLE_MAPPING, + detect_device, dispatch_model_block_wise, get_device_and_parallelism, - detect_device, get_model_dtype, is_diffusion_model, set_cuda_visible_devices, - DEVICE_ENVIRON_VARIABLE_MAPPING, ) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index e6611ece9..30ee33e49 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -49,6 +49,7 @@ # The compile mode can speed up quantization process but still in experimental stage. # 2. Lazy Mode (By default) + ################ Check available sys.module to decide behavior ################# def is_package_available(package_name: str) -> bool: """Check if the package exists in the environment without importing. @@ -1736,7 +1737,7 @@ def wrapper(*args, **kwargs): return decorator -# This function is designed for Auto Scheme and Diffusion Pipeline, +# This function is designed for Auto Scheme and Diffusion Pipeline, # which requires dispatching the whole model on all available devices. def dispatch_model_by_all_available_devices( model: torch.nn.Module, device_map: Union[str, int, dict, None] @@ -1746,9 +1747,11 @@ def dispatch_model_by_all_available_devices( if device_type in DEVICE_ENVIRON_VARIABLE_MAPPING: existing_env = os.environ.get(DEVICE_ENVIRON_VARIABLE_MAPPING[device_type]) if existing_env is None: - logger.warning_once("`get_balanced_memory` is used here, but no environment variable " + logger.warning_once( + "`get_balanced_memory` is used here, but no environment variable " + "is set to specify device visibility. This may lead to OOM issue even the memory " - + "is large enough.") + + "is large enough." + ) # Handle DiffusionPipeline: dispatch only the main sub-model (transformer / unet) # across devices and move the remaining pipeline components to the primary device. @@ -1761,11 +1764,7 @@ def dispatch_model_by_all_available_devices( devices = parse_available_devices(_device_map) # Identify the main quantisable sub-model main_attr = next( - ( - attr - for attr in ("transformer", "unet") - if isinstance(getattr(pipe, attr, None), torch.nn.Module) - ), + (attr for attr in ("transformer", "unet") if isinstance(getattr(pipe, attr, None), torch.nn.Module)), None, ) if main_attr is None or len(devices) == 1: