From fb3edd284ea9a3f073704fa1073a775132f7d4cc Mon Sep 17 00:00:00 2001 From: Xiang An Date: Wed, 27 Nov 2024 07:33:59 +0800 Subject: [PATCH] Fix the bug in multi-machine training. When training on multiple machines, the global rank can exceed 8, the maximum number of GPUs on a single node, so it is not a valid local device index. The rank should not be passed to `device_ids` at all, because `torch.cuda.set_device(device)` has already been called earlier, and DDP will then use the current device by default. --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 7cfee808..f1a955ee 100644 --- a/train.py +++ b/train.py @@ -146,7 +146,7 @@ def main(args): # Note that parameter initialization is done within the DiT constructor ema = deepcopy(model).to(device) # Create an EMA of the model for use after training requires_grad(ema, False) - model = DDP(model.to(device), device_ids=[rank]) + model = DDP(model.to(device)) diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device) logger.info(f"DiT Parameters: {sum(p.numel() for p in model.parameters()):,}")