diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py index 66e116e84..7610e94af 100644 --- a/auto_round/auto_scheme/delta_loss.py +++ b/auto_round/auto_scheme/delta_loss.py @@ -27,7 +27,6 @@ apply_quant_scheme, compute_avg_bits_for_scheme, compute_layer_bits, - dispatch_model_by_all_available_devices, parse_shared_layers, remove_quant_scheme, ) @@ -45,6 +44,7 @@ SUPPORTED_LAYER_TYPES, check_to_quantized, clear_memory, + dispatch_model_by_all_available_devices, get_block_names, get_major_device, get_module, diff --git a/auto_round/auto_scheme/utils.py b/auto_round/auto_scheme/utils.py index 51b8da8e1..b6b744a5a 100644 --- a/auto_round/auto_scheme/utils.py +++ b/auto_round/auto_scheme/utils.py @@ -21,6 +21,7 @@ from auto_round.schemes import QuantizationScheme, preset_name_to_scheme from auto_round.utils import ( + DEVICE_ENVIRON_VARIABLE_MAPPING, SUPPORTED_LAYER_TYPES, check_to_quantized, get_block_names, @@ -213,55 +214,6 @@ def compute_layer_bits( return total_bits, avg_bits -# Important Notice This dispatch does not follow dict device_map, just extract all available devices and use them -def dispatch_model_by_all_available_devices( - model: torch.nn.Module, device_map: Union[str, int, dict, None] -) -> torch.nn.Module: - if device_map is None: - device_map = 0 - - no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", [])) - if device_map == "auto": - max_memory = get_balanced_memory( - model, - max_memory=None, - no_split_module_classes=no_split_modules, - ) - device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) - model = dispatch_model(model, device_map=device_map) - return model - - devices = parse_available_devices(device_map) - - if len(devices) == 1: - model.to(devices[0]) - return model - - max_memory = get_balanced_memory( - model, - max_memory=None, - no_split_module_classes=no_split_modules, - ) - - # Filter max_memory with devices - # assume only one GPU model - new_max_memory = {} - for device in devices: - if ":" in device: - device = int(device.split(":")[-1]) - elif device == "cpu": - device = "cpu" - elif isinstance(device, str): - device = 0 - else: - raise ValueError(f"Unsupported device {device} in device_map: {device_map}") - new_max_memory[device] = max_memory[device] - model.tie_weights() - device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) - model = dispatch_model(model, device_map=device_map) - return model - - def merge_lists_unionfind(list_of_lists): parent = {} diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a8735407e..633301194 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1573,7 +1573,11 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) block = convert_module_to_hp_if_necessary(block, dtype=self.amp_dtype, device=self.device) update_block_global_scale_if_needed(block, self.data_type, self.group_size) self._register_act_max_hook(block) - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: + if ( + is_auto_device_mapping(self.device_map) + and len(self.device_list) > 1 + and not getattr(self, "is_diffusion", False) + ): set_auto_device_map_for_block_with_tuning( block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, self.device ) @@ -2320,7 +2324,8 @@ def try_cache_inter_data_gpucpu(self, block_names, nsamples, layer_names=None, l max_memory=new_max_memory, no_split_module_classes=no_split_modules, ) - self.model.tie_weights() + if hasattr(self.model, "tie_weights") and callable(self.model.tie_weights): + self.model.tie_weights() device_map = infer_auto_device_map( self.model, max_memory=new_max_memory, no_split_module_classes=no_split_modules ) @@ -2979,7 +2984,11 @@ def _quantize_block( if auto_offload: # card_0_in_high_risk indicates that card_0 memory is already in high usage (90%) w/o any weights # loss_device is used to calculate loss on the second device if available and card_0_in_high_risk - if is_auto_device_mapping(self.device_map) and len(self.device_list) > 1: + if ( + is_auto_device_mapping(self.device_map) + and len(self.device_list) > 1 + and not getattr(self, "is_diffusion", False) + ): card_0_in_high_risk, loss_device = set_auto_device_map_for_block_with_tuning( block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device ) diff --git a/auto_round/compressors/diffusion/README.md b/auto_round/compressors/diffusion/README.md index c5b307df5..c0f8eb534 100644 --- a/auto_round/compressors/diffusion/README.md +++ b/auto_round/compressors/diffusion/README.md @@ -64,9 +64,12 @@ auto-round \ For diffusion models, currently we only validate quantizaion on the FLUX.1-dev, which involves quantizing the transformer component of the pipeline. -| Model | calibration dataset | -|--------------|--------------| -| black-forest-labs/FLUX.1-dev | COCO2014 | +| Model | calibration dataset | Model Link | +|---------------|---------------------|--------------| +| black-forest-labs/FLUX.1-dev | COCO2014 | - | +| Tongyi-MAI/Z-Image | COCO2014 | - | +| Tongyi-MAI/Z-Image-Turb | COCO2014 | - | +| stepfun-ai/NextStep-1.1 | COCO2014 | - | diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py index ea7eba39e..c338a1f1e 100644 --- a/auto_round/compressors/diffusion/compressor.py +++ b/auto_round/compressors/diffusion/compressor.py @@ -29,7 +29,9 @@ from auto_round.utils import ( LazyImport, clear_memory, + copy_python_files_from_model_cache, diffusion_load_model, + dispatch_model_by_all_available_devices, extract_block_names_to_str, find_matching_blocks, get_block_names, @@ -56,6 +58,11 @@ class DiffusionCompressor(BaseCompressor): The more it is, the more closely it follows the prompt (default is 7.5). num_inference_steps (int): The reference number of denoising steps (default is 50). generator_seed (int): A sees that controls the initial noise from which an image is generated (default is None). + pipeline_fn (callable, optional): Custom callable to run the pipeline during calibration. + Signature: ``fn(pipe, prompts, *, guidance_scale, num_inference_steps, generator, **kwargs)``. + Use this to support models whose inference API differs from the standard convention + (e.g. NextStep). If ``None``, the standard ``pipe(prompts, ...)`` call is used unless + the loaded pipeline already exposes an ``_autoround_pipeline_fn`` attribute. scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. layer_config (dict): Configuration for weight quantization (default is None). dataset: The path or name of the calib dataset. @@ -81,6 +88,7 @@ class DiffusionCompressor(BaseCompressor): act_dynamic: bool | None super_bits: int | None super_group_size: int | None + is_diffusion: bool = True def __init__( self, @@ -102,9 +110,15 @@ def __init__( device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, seed: int = 42, + pipeline_fn: callable = None, **kwargs, ): logger.warning("Diffusion model quantization is experimental and is only validated on Flux models.") + if dataset == "NeelNanda/pile-10k": + dataset = "coco2014" + logger.warning( + "Dataset 'NeelNanda/pile-10k' is not suitable for diffusion model quantization, use coco2014 dataset instead." + ) model_dtype = kwargs.pop("model_dtype", None) self.guidance_scale = guidance_scale @@ -120,6 +134,8 @@ def __init__( self.model = model self.pipe = pipe + # Use explicit pipeline_fn; fall back to whatever diffusion_load_model attached to the pipe + self.pipeline_fn = pipeline_fn or getattr(pipe, "_autoround_pipeline_fn", None) all_blocks = get_block_names(model) self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names) @@ -162,6 +178,7 @@ def __init__( to_quant_block_names=to_quant_block_names, **kwargs, ) + self._align_device_and_dtype() def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: # flux transformer model's blocks will update hidden_states and encoder_hidden_states @@ -194,7 +211,7 @@ def _get_current_q_output( device: str, cache_device: str = "cpu", ) -> torch.Tensor: - output_config = output_configs.get(block.__class__.__name__, []) + output_config = output_configs.get(block.__class__.__name__, ["hidden_states"]) idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") current_input_ids, current_input_others = self._sampling_inputs( input_ids, @@ -238,7 +255,7 @@ def _get_block_outputs( """ output = defaultdict(list) - output_config = output_configs.get(block.__class__.__name__, []) + output_config = output_configs.get(block.__class__.__name__, ["hidden_states"]) if isinstance(input_ids, dict): nsamples = len(input_ids["hidden_states"]) else: @@ -256,6 +273,8 @@ def _get_block_outputs( tmp_input_ids = hidden_states tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) + if isinstance(tmp_output, torch.Tensor): + tmp_output = [tmp_output] assert len(output_config) == len(tmp_output) tmp_output = dict(zip(output_config, tmp_output)) @@ -278,6 +297,64 @@ def _get_current_num_elm( current_input_ids = [input_ids["hidden_states"][i] for i in indices] return sum(id.numel() for id in current_input_ids) + def _run_pipeline(self, prompts: list) -> None: + """Execute one full diffusion pipeline forward pass for calibration input capture. + + This drives all transformer blocks so that their intermediate inputs are recorded + by the hooks installed during calibration. + + **Extending for custom models** – choose whichever approach is simpler: + + * Pass a ``pipeline_fn`` to the constructor (no subclassing required). The + callable receives ``(pipe, prompts, *, guidance_scale, num_inference_steps, + generator, **kwargs)`` and must trigger a full forward pass. + * Subclass :class:`DiffusionCompressor` and override this method directly for + full control over the inference logic. + + Example – NextStep model:: + + def nextstep_fn(pipe, prompts, guidance_scale=7.5, + num_inference_steps=28, generator=None, + hw=(1024, 1024), **kwargs): + for prompt in (prompts if isinstance(prompts, list) else [prompts]): + pipe.generate_image( + prompt, + cfg=guidance_scale, + num_sampling_steps=num_inference_steps, + hw=hw, + **kwargs, + ) + + compressor = DiffusionCompressor( + model="path/to/nextstep", + pipeline_fn=nextstep_fn, + pipeline_fn_kwargs={"hw": (512, 512)}, + ) + + Args: + prompts (list[str]): Text prompts for the current calibration batch. + """ + generator = ( + None + if self.generator_seed is None + else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) + ) + if self.pipeline_fn is not None: + self.pipeline_fn( + self.pipe, + prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=generator, + ) + else: + self.pipe( + prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=generator, + ) + def calib(self, nsamples, bs): """Perform calibration for quantization. @@ -294,6 +371,11 @@ def calib(self, nsamples, bs): "Diffusion model will catch nsamples * num_inference_steps inputs, " "you can reduce nsamples or num_inference_steps if OOM or take too much time." ) + raw_num_inference_steps = self.num_inference_steps + self.num_inference_steps = 1 + logger.info( + f"Set num_inference_steps to 1 for calibration, original num_inference_steps is {raw_num_inference_steps}" + ) if isinstance(self.dataset, str): dataset = self.dataset.replace(" ", "") self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader( @@ -308,40 +390,13 @@ def calib(self, nsamples, bs): total_cnt = 0 total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) - if self.pipe.dtype != self.model.dtype: - self.pipe.to(self.model.dtype) - - if ( - hasattr(self.model, "hf_device_map") - and len(self.model.hf_device_map) > 0 - and self.pipe.device != self.model.device - and torch.device(self.model.device).type in ["cuda", "xpu"] - ): - logger.error( - "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " - "Please use model path for quantization or " - "move the pipeline object to GPU/XPU before passing them into API." - ) - exit(-1) - if self.pipe.device != self.model.device: - self.pipe.to(self.model.device) - self.pipe.to(self.model.dtype) with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: for ids, prompts in self.dataloader: if isinstance(prompts, tuple): prompts = list(prompts) try: - self.pipe( - prompt=prompts, - guidance_scale=self.guidance_scale, - num_inference_steps=self.num_inference_steps, - generator=( - None - if self.generator_seed is None - else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) - ), - ) + self._run_pipeline(prompts) except NotImplementedError: pass except Exception as error: @@ -374,6 +429,8 @@ def calib(self, nsamples, bs): self.inputs[k][key] = v[key][:max_len] # torch.cuda.empty_cache() + self.num_inference_steps = raw_num_inference_steps + logger.info(f"Restore num_inference_steps to {self.num_inference_steps} after calibration") def _should_stop_cache_forward(self, name: str) -> bool: """Determine whether current forward pass can stop after caching `name`.""" @@ -395,6 +452,12 @@ def _get_save_folder_name(self, format: OutputFormat) -> str: """ # Replace special characters to make the folder name filesystem-safe sanitized_format = format.get_backend_name().replace(":", "-").replace("_", "-") + if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep": + # Use a subfolder only if there are multiple formats + if len(self.formats) > 1: + return os.path.join(self.orig_output_dir, sanitized_format) + + return self.orig_output_dir # Use a subfolder only if there are multiple formats if len(self.formats) > 1: @@ -423,6 +486,16 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k return super().save_quantized(output_dir, format=format, inplace=inplace, **kwargs) compressed_model = None + if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep": + compressed_model = super().save_quantized( + output_dir=output_dir, + format=format, + inplace=inplace, + **kwargs, + ) + self.pipe.tokenizer.save_pretrained(output_dir) + copy_python_files_from_model_cache(self.model, output_dir, copy_folders=["models", "vae", "utils"]) + return compressed_model for name in self.pipe.components.keys(): val = getattr(self.pipe, name) sub_module_path = ( @@ -443,3 +516,24 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k val.save_pretrained(sub_module_path) self.pipe.config.save_pretrained(output_dir) return compressed_model + + def _align_device_and_dtype(self): + if hasattr(self.model, "config") and getattr(self.model.config, "model_type", None) == "nextstep": + return + if ( + hasattr(self.model, "hf_device_map") + and len(self.model.hf_device_map) > 0 + and type(self.pipe.device) != type(self.model.device) + and self.pipe.device != self.model.device + and torch.device(self.model.device).type in ["cuda", "xpu"] + ): + logger.error( + "Diffusion model is activated sequential model offloading, it will crash during moving to GPU/XPU. " + "Please use model path for quantization or " + "move the pipeline object to GPU/XPU before passing them into API." + ) + exit(-1) + + self.pipe.to(self.model.dtype) + + dispatch_model_by_all_available_devices(self.pipe, self.device_map) diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 76811ec3f..71d142a74 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -20,9 +20,10 @@ from transformers.utils.versions import require_version from auto_round.utils import ( + DEVICE_ENVIRON_VARIABLE_MAPPING, + detect_device, dispatch_model_block_wise, get_device_and_parallelism, - get_device_str, get_model_dtype, is_diffusion_model, set_cuda_visible_devices, @@ -285,19 +286,14 @@ def eval_with_vllm(args): logger.info(f"Overriding VLLM parameters with custom args: {custom_vllm_kwargs}") vllm_kwargs.update(custom_vllm_kwargs) - device = get_device_str() - environ_mapping = { - "cuda": "CUDA_VISIBLE_DEVICES", - "xpu": "ZE_AFFINITY_MASK", - "hpu": "HABANA_VISIBLE_MODULES", - } + device = detect_device() if "tensor_parallel_size" not in vllm_kwargs: # Parse device_map to determine tensor_parallel_size and set the relevant env var # Only accept formats like "0" or "0,1,2". If the environment variable is # already set externally, do not overwrite it — but still derive # `tensor_parallel_size` from the existing value. - assert device in environ_mapping, f"Device {device} not supported for vllm tensor parallelism." - environ_name = environ_mapping[device] + assert device in DEVICE_ENVIRON_VARIABLE_MAPPING, f"Device {device} not supported for vllm tensor parallelism." + environ_name = DEVICE_ENVIRON_VARIABLE_MAPPING[device] device_map = args.device_map device_ids = [d.strip() for d in str(device_map).split(",") if d.strip().isdigit()] diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 747a4eb2b..30ee33e49 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -34,6 +34,13 @@ from auto_round.logger import logger from auto_round.utils.model import check_to_quantized, get_block_names, get_layer_features, get_module +DEVICE_ENVIRON_VARIABLE_MAPPING = { + "cuda": "CUDA_VISIBLE_DEVICES", + "xpu": "ZE_AFFINITY_MASK", + "hpu": "HABANA_VISIBLE_MODULES", +} + + # Note on HPU usage: # There are two modes available for enabling auto-round on HPU: # 1. Compile Mode @@ -1694,18 +1701,6 @@ def log_summary(self, msg: str = "", level: str = "info"): return summary -def get_device_str(): - """Get a string representation of the automatically detected device.""" - if torch.cuda.is_available(): - return "cuda" - elif torch.xpu.is_available(): # pragma: no cover - return "xpu" - elif is_hpex_available(): # pragma: no cover - return "hpu" - else: # pragma: no cover - return "cpu" - - # Global singleton instance memory_monitor = MemoryMonitor() @@ -1740,3 +1735,104 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +# This function is designed for Auto Scheme and Diffusion Pipeline, +# which requires dispatching the whole model on all available devices. +def dispatch_model_by_all_available_devices( + model: torch.nn.Module, device_map: Union[str, int, dict, None] +) -> torch.nn.Module: + # Important Notice: This dispatch does not follow dict device_map, just extract all available devices and use them + device_type = detect_device() + if device_type in DEVICE_ENVIRON_VARIABLE_MAPPING: + existing_env = os.environ.get(DEVICE_ENVIRON_VARIABLE_MAPPING[device_type]) + if existing_env is None: + logger.warning_once( + "`get_balanced_memory` is used here, but no environment variable " + + "is set to specify device visibility. This may lead to OOM issue even the memory " + + "is large enough." + ) + + # Handle DiffusionPipeline: dispatch only the main sub-model (transformer / unet) + # across devices and move the remaining pipeline components to the primary device. + try: + from diffusers.pipelines.pipeline_utils import DiffusionPipeline + + if isinstance(model, DiffusionPipeline): + pipe = model + _device_map = 0 if device_map is None else device_map + devices = parse_available_devices(_device_map) + # Identify the main quantisable sub-model + main_attr = next( + (attr for attr in ("transformer", "unet") if isinstance(getattr(pipe, attr, None), torch.nn.Module)), + None, + ) + if main_attr is None or len(devices) == 1: + # No identifiable main sub-model, or single target device: + # move the entire pipeline to the (first) device. + pipe.to(devices[0] if devices else "cuda:0") + return pipe + # Multi-device path: recursively dispatch the main sub-model, + # then move all remaining pipeline components to the primary device. + main_model = getattr(pipe, main_attr) + dispatched = dispatch_model_by_all_available_devices(main_model, _device_map) + setattr(pipe, main_attr, dispatched) + primary_device = devices[0] + for attr, component in pipe.components.items(): + if attr == main_attr: + continue + if isinstance(component, torch.nn.Module): + try: + component.to(primary_device) + except Exception: + pass + return pipe + except ImportError: + pass + + if device_map is None: + device_map = 0 + + from auto_round.utils.common import normalize_no_split_modules + + no_split_modules = normalize_no_split_modules(getattr(model, "_no_split_modules", [])) + if device_map == "auto": + max_memory = get_balanced_memory( + model, + max_memory=None, + no_split_module_classes=no_split_modules, + ) + device_map = infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=no_split_modules) + model = dispatch_model(model, device_map=device_map) + return model + + devices = parse_available_devices(device_map) + + if len(devices) == 1: + model.to(devices[0]) + return model + + max_memory = get_balanced_memory( + model, + max_memory=None, + no_split_module_classes=no_split_modules, + ) + + # Filter max_memory with devices + # assume only one GPU model + new_max_memory = {} + for device in devices: + if ":" in device: + device = int(device.split(":")[-1]) + elif device == "cpu": + device = "cpu" + elif isinstance(device, str): + device = 0 + else: + raise ValueError(f"Unsupported device {device} in device_map: {device_map}") + new_max_memory[device] = max_memory[device] + if hasattr(model, "tie_weights") and callable(model.tie_weights): + model.tie_weights() + device_map = infer_auto_device_map(model, max_memory=new_max_memory, no_split_module_classes=no_split_modules) + model = dispatch_model(model, device_map=device_map) + return model diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f0aec180a..7b2c574f1 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -666,6 +666,19 @@ def diffusion_load_model( if device_str is not None and "hpu" in device_str: torch_dtype = torch.bfloat16 + try: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) + except: + config = None + + model_type = getattr(config, "model_type", "") + # A special case for NextStep + if model_type == "nextstep": + pipe, model = load_next_step_diffusion(pretrained_model_name_or_path, device_str) + return pipe, pipe.model + pipelines = LazyImport("diffusers.pipelines") if isinstance(pretrained_model_name_or_path, str): if torch_dtype == "auto": @@ -794,6 +807,21 @@ def is_gguf_model(model_path: Union[str, torch.nn.Module]) -> bool: def is_diffusion_model(model_or_path: Union[str, object]) -> bool: from auto_round.utils.common import LazyImport + # First check if it's a known diffusion pipeline by config/model_type to avoid unnecessary imports and file checks for non-diffusion models, which can be time-consuming. + try: + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True) + model_type = getattr(config, "model_type", "") + # A special case for NextStep + if model_type == "nextstep": + return True + except: + logger.warning( + f"Failed to load config for {model_or_path}, trying to check model_index.json for diffusion pipeline." + ) + + # Then check if model_index.json exists for diffusion pipeline, which is a strong signal of being a diffusion pipeline. if isinstance(model_or_path, str): index_file = None if not os.path.isdir(model_or_path): @@ -1677,43 +1705,69 @@ def _copy_extra_model_files(src_dir: str, dst_dir: str): # Adapted from https://github.com/vllm-project/llm-compressor/blob/ # 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144 -def copy_python_files_from_model_cache(model, save_path: str): +def copy_python_files_from_model_cache(model, save_path: str, copy_folders: bool | list[str] | tuple[str, ...] = False): + """Copy Python files (and optionally subdirectories) from the model cache to *save_path*. + + Args: + model: The model whose ``config._name_or_path`` points to the source cache. + save_path (str): Destination directory. + copy_folders (bool | list[str] | tuple[str, ...]): Controls which subdirectories + are copied from the cache root to *save_path*: + + * ``False`` (default) – no folders are copied. + * ``True`` – every subdirectory that does not already exist in *save_path* + is copied (e.g. all of ``vae``, ``scheduler``, …). + * A list/tuple of folder names (e.g. ``["vae", "scheduler"]``) – only the + named subdirectories are copied. + """ + import shutil + + from huggingface_hub import hf_hub_download + config = model.config - if hasattr(config, "_name_or_path"): - import os - import shutil + if not hasattr(config, "_name_or_path"): + return - from huggingface_hub import hf_hub_download + if version.parse(transformers.__version__) < version.parse("5.0.0"): + from transformers.utils import TRANSFORMERS_CACHE - if version.parse(transformers.__version__) < version.parse("5.0.0"): - from transformers.utils import TRANSFORMERS_CACHE + cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE) + else: + from huggingface_hub.constants import HF_HUB_CACHE + + cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE) + from transformers.utils import http_user_agent + + cache_path = config._name_or_path + if not os.path.exists(cache_path): + user_agent = http_user_agent() + config_file_path = hf_hub_download( + repo_id=cache_path, + filename="config.json", + cache_dir=cache_dir, + force_download=False, + user_agent=user_agent, + ) + cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) - cache_dir = os.environ.get("HF_HOME", TRANSFORMERS_CACHE) - else: - from huggingface_hub.constants import HF_HUB_CACHE - - cache_dir = os.environ.get("HF_HOME", HF_HUB_CACHE) - from transformers.utils import http_user_agent - - cache_path = config._name_or_path - if not os.path.exists(cache_path): - user_agent = http_user_agent() - config_file_path = hf_hub_download( - repo_id=cache_path, - filename="config.json", - cache_dir=cache_dir, - force_download=False, - user_agent=user_agent, - ) - cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) + for file in os.listdir(cache_path): + full_file_name = os.path.join(cache_path, file) + if file.endswith(".py") and os.path.isfile(full_file_name): + logger.debug(f"Transferring {full_file_name} to {save_path}") + shutil.copy(full_file_name, save_path) - for file in os.listdir(cache_path): - full_file_name = os.path.join(cache_path, file) - if file.endswith(".py") and os.path.isfile(full_file_name): - logger.debug(f"Transferring {full_file_name} to {save_path}") - shutil.copy(full_file_name, save_path) + _copy_extra_model_files(cache_path, save_path) - _copy_extra_model_files(cache_path, save_path) + if copy_folders is not False: + for entry in os.listdir(cache_path): + src_entry = os.path.join(cache_path, entry) + dst_entry = os.path.join(save_path, entry) + if not os.path.isdir(src_entry): + continue + if copy_folders is True or entry in copy_folders: + if not os.path.exists(dst_entry): + logger.debug(f"Transferring folder {src_entry} to {save_path}") + shutil.copytree(src_entry, dst_entry) def extract_block_names_to_str(quant_block_list): @@ -1882,3 +1936,35 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): return base_hook(m, hidden_states, *positional_inputs, **kwargs) return forward + + +def load_next_step_diffusion(pretrained_model_name_or_path, device_str): + from models.gen_pipeline import NextStepPipeline # pylint: disable=E0401 + from transformers import AutoModel, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True + ) + model = AutoModel.from_pretrained(pretrained_model_name_or_path, local_files_only=True, trust_remote_code=True) + # The model is loaded onto the device because more than one block requires input data. + pipe = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device_str, dtype=torch.bfloat16) + + def _nextstep_pipeline_fn(pipe, prompts, guidance_scale=7.5, num_inference_steps=28, generator=None, **kwargs): + """Default pipeline_fn for NextStep models. + + Maps standard :class:`DiffusionCompressor` parameters to NextStep's + ``generate_image`` API. Pass a custom ``pipeline_fn`` to + :class:`DiffusionCompressor` to override defaults or supply + model-specific kwargs (e.g. ``hw``, ``positive_prompt``, + ``cfg_schedule``, ``timesteps_shift``). + """ + for prompt in (prompts if isinstance(prompts, list) else [prompts]): + pipe.generate_image( + prompt, + cfg=guidance_scale, + num_sampling_steps=num_inference_steps, + **kwargs, + ) + + pipe._autoround_pipeline_fn = _nextstep_pipeline_fn + return pipe, model diff --git a/test/test_cuda/models/test_diffusion.py b/test/test_cuda/models/test_diffusion.py index 11ea7da54..f7d265b00 100644 --- a/test/test_cuda/models/test_diffusion.py +++ b/test/test_cuda/models/test_diffusion.py @@ -49,9 +49,8 @@ def test_diffusion_rtn(self, tiny_flux_model_path): # skip model saving since it takes much time autoround.quantize() - @pytest.mark.skip_ci(reason="Tuning will OOM in CI; Only tiny model is suggested") # skip this test in CI @require_optimum - def test_diffusion_tune(self, tiny_flux_model_path): + def test_diffusion_tune(self, tiny_flux_model_path, tmp_path): from diffusers import AutoPipelineForText2Image ## load the model @@ -83,7 +82,7 @@ def test_diffusion_tune(self, tiny_flux_model_path): dataset=get_captions_dataset_path(), ) # skip model saving since it takes much time - autoround.quantize() + autoround.quantize_and_save(tmp_path) @pytest.mark.skip_ci(reason="Download large model; Time-consuming") def test_diffusion_model_checker(self):