How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp


# Section banner; the leading "\n" separates it from the previous section's output.
print("\n### SECTION D: end-to-end Transformer (vanilla fp32 vs Apex fused + AMP) ###")
# Benchmark hyperparameters, in order: vocabulary size, model width, attention
# heads, number of blocks, sequence length, batch size, and timed training steps.
VOCAB, D, NHEAD, LAYERS, SEQ, BATCH, STEPS = 2000, 256, 4, 4, 128, 32, 60
class Block(torch.nn.Module):
    """Pre-norm Transformer block: self-attention then an MLP, each with a residual add.

    Args:
        d: Feature width used by the attention, the MLP, and both norms.
        nhead: Number of attention heads passed to ``MultiheadAttention``.
        norm_cls: Callable that builds a normalization module from ``d``
            (for example ``torch.nn.LayerNorm``).
    """

    def __init__(self, d, nhead, norm_cls):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(d, nhead, batch_first=True)
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(d, 4 * d),
            torch.nn.GELU(),
            torch.nn.Linear(4 * d, d),
        )
        self.n1, self.n2 = norm_cls(d), norm_cls(d)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual branches."""
        # Normalize before attention; the un-normalized x feeds the residual.
        h = self.n1(x)
        # MultiheadAttention returns (output, weights); only the output is used.
        x = x + self.attn(h, h, h, need_weights=False)[0]
        return x + self.ff(self.n2(x))
class TinyTransformer(torch.nn.Module):
    """Small Transformer stack: embedding, ``LAYERS`` blocks, final norm, vocab head.

    Sizes come from the module-level constants ``VOCAB``, ``D``, ``NHEAD`` and
    ``LAYERS``.

    Args:
        norm_cls: Callable that builds a normalization module from a feature
            width; used for every block and for the final norm.
    """

    def __init__(self, norm_cls):
        super().__init__()
        self.emb = torch.nn.Embedding(VOCAB, D)
        self.blocks = torch.nn.ModuleList(
            [Block(D, NHEAD, norm_cls) for _ in range(LAYERS)]
        )
        self.norm = norm_cls(D)
        self.head = torch.nn.Linear(D, VOCAB)

    def forward(self, idx):
        """Map token indices ``idx`` to per-position logits over the vocabulary."""
        x = self.emb(idx)
        for b in self.blocks:
            x = b(x)
        return self.head(self.norm(x))
# Seeded CPU generator so the synthetic token batch is reproducible; the batch
# is created on CPU and then moved to DEV.
g = torch.Generator(device="cpu").manual_seed(0)
knowledge = torch.randint(0, VOCAB, (BATCH, SEQ + 1), generator=g).to(DEV)
# Next-token setup: inputs drop the last token, targets drop the first.
inp, tgt = knowledge[:, :-1], knowledge[:, 1:]
lossfn = torch.nn.CrossEntropyLoss()
def run_training(use_apex):
    """Train a fresh ``TinyTransformer`` on the fixed batch and time it.

    Args:
        use_apex: When true, run under fp16 autocast with an enabled
            ``GradScaler``, and use ``FusedLayerNorm`` / ``FusedAdam`` where the
            ``APEX_OK``, ``HAS_FLN`` and ``HAS_AMP_C`` flags allow. When false,
            use ``torch.nn.LayerNorm`` and ``torch.optim.AdamW`` with autocast
            and the scaler disabled.

    Returns:
        Tuple ``(last_loss, tokens_per_second, elapsed_seconds)`` for the
        ``STEPS`` timed steps; the 5 warm-up steps are excluded.
    """
    # Same seed for both configurations so they start from identical weights.
    torch.manual_seed(0)
    norm_cls = (FusedLayerNorm if (use_apex and HAS_FLN and APEX_OK) else torch.nn.LayerNorm)
    model = TinyTransformer(norm_cls).to(DEV)
    if use_apex and HAS_AMP_C and APEX_OK:
        optimizer = FusedAdam(model.parameters(), lr=3e-4)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    # With enabled=False the scaler's scale/step/update calls are pass-throughs,
    # so one_step() is shared by both configurations.
    scaler = torch.amp.GradScaler("cuda", enabled=use_apex)

    def one_step():
        """Run one forward/backward/optimizer step and return the loss tensor."""
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_apex):
            logits = model(inp)
            loss = lossfn(logits.reshape(-1, VOCAB), tgt.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        return loss

    # Warm-up steps, not timed.
    for _ in range(5):
        one_step()
    # Drain queued GPU work so the timer brackets only the measured steps.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(STEPS):
        loss = one_step()
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return loss.item(), (STEPS * BATCH * SEQ) / dt, dt
# Baseline run: fp32, nn.LayerNorm, AdamW.
loss_v, tps_v, dt_v = run_training(use_apex=False)
print(f"  vanilla (fp32, nn.LayerNorm, AdamW)        : "
      f"{dt_v:5.2f}s  | {tps_v:9.0f} tok/s | last loss {loss_v:.3f}")
# Run the Apex + AMP configuration only if at least one fused kernel is usable.
if APEX_OK and (HAS_AMP_C or HAS_FLN):
    loss_a, tps_a, dt_a = run_training(use_apex=True)
    print(f"  apex   (fp16, FusedLayerNorm, FusedAdam)   : "
          f"{dt_a:5.2f}s  | {tps_a:9.0f} tok/s | last loss {loss_a:.3f}")
    print(f"  ----> speedup: {tps_a / tps_v:0.2f}x throughput")
else:
    print("  apex path SKIPPED (no fused kernels constructed)")
# Closing summary banner.
print("\n" + "=" * 78)
print("DONE. Key takeaways:")
print("  - FusedAdam/FusedLayerNorm/FusedRMSNorm are the still-relevant Apex items;")
print("    speedups grow with model size & parameter count (tiny demo understates it).")
print("  - apex.amp is deprecated -> choose torch.amp.autocast + torch.amp.GradScaler.")
print("  - FusedAdam composes cleanly with native torch.amp (Section D).")
print("  - On actual workloads, also try a larger model and bf16 autocast (no scaler wanted).")
print("=" * 78)



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *