Problem Description
Although I set the number of CUDA devices to 4 and set device_map to auto, GPU memory is still insufficient (the GPU memory usage cannot be distributed evenly across devices) when quantizing the Qwen3-8B model with nblocks set to 16.
Reproduction Steps
-
run cmd:
CMD="CUDA_VISIBLE_DEVICES=3,4,5,6 auto-round
--model /models/Qwen3-8B
--scheme W2A16
--group_size 64
--iters 2000
--nsamples 512
--nblocks 16
--format fake
--enable_alg_ext
--tasks $TASKS
--device_map auto
--eval_bs 64
--output_dir $OUTDIR"
-
see error:
�[38;20m2026-03-30 08:30:28 INFO main.py L574: start to quantize /models/Qwen3-8B�[0m
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 115.52it/s]
�[38;20m2026-03-30 08:30:29 INFO base.py L486: using torch.bfloat16 for quantization tuning�[0m
�[38;20m2026-03-30 08:30:29 INFO base.py L779: 'enable_torch_compile' is set to False by default. Enabling it can reduce tuning cost by 20%, but it might throw an exception.�[0m
�[33;1m2026-03-30 08:30:29 WARNING base.py L511: using algorithm extension for quantization.�[0m
�[33;1m2026-03-30 08:30:29 WARNING base.py L1639: low_cpu_mem_usage is only supported when immediate_packing is True. Setting low_cpu_mem_usage to False.�[0m
�[38;20m2026-03-30 08:30:29 INFO base.py L1739: start to cache block inputs�[0m
�[38;20m2026-03-30 08:30:39 INFO base.py L1754: caching done�[0m
0%| | 0/3 [00:00<?, ?it/s]
Quantizing [1-16]/36: 0%| | 0/3 [00:06<?, ?it/s]/home/zanerma/.venv/lib/python3.10/site-packages/torch/autograd/graph.py:865: UserWarning: Flash Attention defaults to a non-deterministic algorithm. To explicitly enable determinism call torch.use_deterministic_algorithms(True, warn_only=False). (Triggered internally at /pytorch/aten/src/ATen/native/transformers/cuda/attention_backward.cu:114.)
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/home/zanerma/.venv/bin/auto-round", line 6, in
sys.exit(run())
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/main.py", line 783, in run
start()
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/main.py", line 516, in start
tune(args)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/main.py", line 722, in tune
model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 949, in quantize_and_save
model, _ = self.quantize()
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 1778, in quantize
self._quantize_blocks(
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 3140, in _quantize_blocks
q_input, input_ids = self._quantize_block(
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2978, in _quantize_block
output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2727, in _get_current_q_output
output_q = self.block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/utils.py", line 134, in block_forward
output = block(input_ids, *input_tuple, **input_others)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/wrapper.py", line 665, in forward
layer_outputs = decoder_layer(hidden_states, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in call
return super().call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 260, in forward
hidden_states, _ = self.self_attn(
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 200, in forward
query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 64, in forward
return self.weight * hidden_states.to(input_dtype)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 42.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 78.68 GiB is allocated by PyTorch, and 43.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Quantizing [1-16]/36: 0%| | 0/3 [04:31<?, ?it/s]
Environment Information
- OS: Ubuntu 22.04
- Python version: 3.10
- AutoRound version: 0.10.5
- Hardware: A100
Error Logs
CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 42.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 78.68 GiB is allocated by PyTorch, and 43.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management
Additional Context
No response
Problem Description
Although I set the number of CUDA devices to 4 and set device_map to auto, GPU memory is still insufficient (the GPU memory usage cannot be distributed evenly across devices) when quantizing the Qwen3-8B model with nblocks set to 16.
Reproduction Steps
run cmd:
CMD="CUDA_VISIBLE_DEVICES=3,4,5,6 auto-round
--model /models/Qwen3-8B
--scheme W2A16
--group_size 64
--iters 2000
--nsamples 512
--nblocks 16
--format fake
--enable_alg_ext
--tasks $TASKS
--device_map auto
--eval_bs 64
--output_dir $OUTDIR"
see error:
�[38;20m2026-03-30 08:30:28 INFO main.py L574: start to quantize /models/Qwen3-8B�[0m
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 115.52it/s]
�[38;20m2026-03-30 08:30:29 INFO base.py L486: using torch.bfloat16 for quantization tuning�[0m
�[38;20m2026-03-30 08:30:29 INFO base.py L779: 'enable_torch_compile' is set to False by default. Enabling it can reduce tuning cost by 20%, but it might throw an exception.�[0m
�[33;1m2026-03-30 08:30:29 WARNING base.py L511: using algorithm extension for quantization.�[0m
�[33;1m2026-03-30 08:30:29 WARNING base.py L1639: low_cpu_mem_usage is only supported when immediate_packing is True. Setting low_cpu_mem_usage to False.�[0m
�[38;20m2026-03-30 08:30:29 INFO base.py L1739: start to cache block inputs�[0m
�[38;20m2026-03-30 08:30:39 INFO base.py L1754: caching done�[0m
0%| | 0/3 [00:00<?, ?it/s]
Quantizing [1-16]/36: 0%| | 0/3 [00:06<?, ?it/s]/home/zanerma/.venv/lib/python3.10/site-packages/torch/autograd/graph.py:865: UserWarning: Flash Attention defaults to a non-deterministic algorithm. To explicitly enable determinism call torch.use_deterministic_algorithms(True, warn_only=False). (Triggered internally at /pytorch/aten/src/ATen/native/transformers/cuda/attention_backward.cu:114.)
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/home/zanerma/.venv/bin/auto-round", line 6, in
sys.exit(run())
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/main.py", line 783, in run
start()
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/main.py", line 516, in start
tune(args)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/main.py", line 722, in tune
model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 949, in quantize_and_save
model, _ = self.quantize()
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 1778, in quantize
self._quantize_blocks(
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 3140, in _quantize_blocks
q_input, input_ids = self._quantize_block(
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2978, in _quantize_block
output_q = self._get_current_q_output(block, input_ids, input_others, indices, device, loss_device)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/base.py", line 2727, in _get_current_q_output
output_q = self.block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/compressors/utils.py", line 134, in block_forward
output = block(input_ids, *input_tuple, **input_others)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/auto_round/wrapper.py", line 665, in forward
layer_outputs = decoder_layer(hidden_states, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in call
return super().call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 260, in forward
hidden_states, _ = self.self_attn(
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 200, in forward
query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1787, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zanerma/.venv/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 64, in forward
return self.weight * hidden_states.to(input_dtype)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 42.94 MiB is free. Including non-PyTorch memory, this process has 79.20 GiB memory in use. Of the allocated memory 78.68 GiB is allocated by PyTorch, and 43.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Quantizing [1-16]/36: 0%| | 0/3 [04:31<?, ?it/s]
Environment Information
Error Logs
Additional Context
No response