Build Recurrent-Depth Transformers with OpenMythos for MLA, GQA, Sparse MoE, and Loop-Scaled Reasoning


def build_model(attn_type: str = "mla", max_loop_iters: int = 8) -> tuple:
    """Build a small OpenMythos model with the requested attention variant.

    Two attention variants are supported:
    MLA  — Multi-Latent Attention (compressed KV cache, DeepSeek-V2 style)
    GQA  — Grouped-Query Attention (fewer KV heads than Q heads)

    Args:
        attn_type: Either ``"mla"`` or ``"gqa"``; stored in the config and
            used to pick the variant-specific config fields.
        max_loop_iters: Forwarded unchanged to ``MythosConfig``.

    Returns:
        A ``(model, cfg)`` tuple: the ``OpenMythos`` instance (moved to the
        module-level ``device``) and the ``MythosConfig`` it was built from.

    Raises:
        ValueError: If ``attn_type`` is neither ``"mla"`` nor ``"gqa"``.
    """
    # Fail fast: previously any unknown string fell through to the MLA branch
    # while the bogus value was still written into the config.
    if attn_type not in ("mla", "gqa"):
        raise ValueError(
            f"attn_type must be 'mla' or 'gqa', got {attn_type!r}"
        )

    # Settings shared by both attention variants.
    base = dict(
        vocab_size=64,
        dim=128,
        n_heads=4,
        max_seq_len=32,
        max_loop_iters=max_loop_iters,
        prelude_layers=1,
        coda_layers=1,
        n_experts=4,
        n_shared_experts=1,
        n_experts_per_tok=2,
        expert_dim=64,
        lora_rank=8,
        attn_type=attn_type,
    )
    if attn_type == "gqa":
        # 2 KV heads against 4 query heads.
        cfg = MythosConfig(**base, n_kv_heads=2)
    else:
        # MLA keeps one KV head per query head and adds the low-rank /
        # per-head dimension settings.
        cfg = MythosConfig(
            **base,
            n_kv_heads=4,
            kv_lora_rank=32,
            q_lora_rank=32,
            qk_rope_head_dim=16,
            qk_nope_head_dim=16,
            v_head_dim=16,
        )
    model = OpenMythos(cfg).to(device)
    return model, cfg
# Instantiate one model per attention variant, keeping each config alongside it.
model_mla, cfg_mla = build_model(attn_type="mla")
model_gqa, cfg_gqa = build_model(attn_type="gqa")
def n_params(m):
    """Return the total element count over everything in ``m.parameters()``."""
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total
# Report the total parameter count of each variant; the ">10," format spec
# right-aligns the number in a 10-character field with thousands separators.
print(f"\n[MLA] params: {n_params(model_mla):>10,}")
print(f"[GQA] params: {n_params(model_gqa):>10,}")
def spectral_radius(model):
    """Return the largest absolute eigenvalue of the model's injection matrix A.

    A is read from ``model.recurrent.injection.get_A()``. A 1-D result is
    handled as a vector whose largest absolute entry is the answer; anything
    else is cast to float and passed to ``torch.linalg.eigvals``.

    Returns:
        The spectral radius as a Python float.
    """
    mat = model.recurrent.injection.get_A()
    mat = mat.detach().cpu()
    if mat.dim() != 1:
        # General case: magnitudes of the (possibly complex) eigenvalues.
        magnitudes = torch.linalg.eigvals(mat.float()).abs()
    else:
        # 1-D case: the entries themselves play the role of the eigenvalues.
        magnitudes = mat.abs()
    return magnitudes.max().item()
# Print the spectral radius of A for each variant to four decimal places.
# NOTE(review): the "< 1" requirement in the message is presumably the
# stability condition for the recurrent loop — confirm against OpenMythos docs.
print(f"\nρ(A) MLA: {spectral_radius(model_mla):.4f}   (must be < 1)")
print(f"ρ(A) GQA: {spectral_radius(model_gqa):.4f}   (must be < 1)")
# Smoke test on the MLA model: a (2, 16) batch of random integer token ids
# drawn from [0, vocab_size), created directly on `device`.
ids = torch.randint(0, cfg_mla.vocab_size, (2, 16), device=device)
with torch.no_grad():  # gradient tracking disabled for both calls below
   # Forward call with n_loops=4, then generate with max_new_tokens=4 and
   # n_loops=8 (equal to build_model's default max_loop_iters).
   logits = model_mla(ids, n_loops=4)
   gen    = model_mla.generate(ids, max_new_tokens=4, n_loops=8)
# Only the output shapes are reported, not the values.
print(f"\nForward logits shape:  {tuple(logits.shape)}")
print(f"Generation shape:      {tuple(gen.shape)}")



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *