From fb3edd284ea9a3f073704fa1073a775132f7d4cc Mon Sep 17 00:00:00 2001 From: Xiang An Date: Wed, 27 Nov 2024 07:33:59 +0800 Subject: [PATCH] Fix the bug in multi-machine training. When training on multiple machines, the global rank can exceed 8, the maximum number of GPUs on a single node, so it is not a valid local device index. The rank should not be passed to `device_ids` at all, because `torch.cuda.set_device(device)` has already been called earlier, and DDP will then use the current device by default. --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 7cfee808..f1a955ee 100644 --- a/train.py +++ b/train.py @@ -146,7 +146,7 @@ def main(args): # Note that parameter initialization is done within the DiT constructor ema = deepcopy(model).to(device) # Create an EMA of the model for use after training requires_grad(ema, False) - model = DDP(model.to(device), device_ids=[rank]) + model = DDP(model.to(device)) diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device) logger.info(f"DiT Parameters: {sum(p.numel() for p in model.parameters()):,}")