
def build_model(attn_type: str = "mla", max_loop_iters: int = 8) -> tuple:
    """Construct a small OpenMythos model and the config it was built from.

    Two attention variants are configured here:

    * ``"gqa"`` -- Grouped-Query Attention: 2 KV heads shared by 4 query heads.
    * anything else (default ``"mla"``) -- Multi-head Latent Attention
      (compressed KV cache, DeepSeek-V2 style): 4 KV heads plus the low-rank
      and per-head dimension settings.

    ``attn_type`` is forwarded to ``MythosConfig`` unchanged in both cases.

    Args:
        attn_type: Attention variant selector; only ``"gqa"`` is special-cased.
        max_loop_iters: Forwarded to ``MythosConfig.max_loop_iters``.

    Returns:
        A ``(model, cfg)`` pair, with the model already moved to ``device``.
    """
    # Hyper-parameters common to both attention variants.
    shared = {
        "vocab_size": 64,
        "dim": 128,
        "n_heads": 4,
        "max_seq_len": 32,
        "max_loop_iters": max_loop_iters,
        "prelude_layers": 1,
        "coda_layers": 1,
        "n_experts": 4,
        "n_shared_experts": 1,
        "n_experts_per_tok": 2,
        "expert_dim": 64,
        "lora_rank": 8,
        "attn_type": attn_type,
    }

    # Variant-specific additions layered on top of the shared settings.
    if attn_type == "gqa":
        variant = {"n_kv_heads": 2}
    else:
        variant = {
            "n_kv_heads": 4,
            "kv_lora_rank": 32,
            "q_lora_rank": 32,
            "qk_rope_head_dim": 16,
            "qk_nope_head_dim": 16,
            "v_head_dim": 16,
        }

    cfg = MythosConfig(**shared, **variant)
    return OpenMythos(cfg).to(device), cfg
# Build one model per attention variant so the two can be compared below.
model_mla, cfg_mla = build_model(attn_type="mla")
model_gqa, cfg_gqa = build_model(attn_type="gqa")
def n_params(m):
    """Return the total number of elements across all parameters of ``m``.

    ``m`` must expose ``parameters()`` yielding objects with ``numel()``.
    """
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total
# Report the parameter count of each variant, one per line.
print(
    f"\n[MLA] params: {n_params(model_mla):>10,}",
    f"[GQA] params: {n_params(model_gqa):>10,}",
    sep="\n",
)
def spectral_radius(model):
    """Return the largest absolute eigenvalue of the model's injection matrix.

    The matrix comes from ``model.recurrent.injection.get_A()``. A 1-D result
    is reduced with a plain max-abs over its entries (presumably the diagonal
    of a diagonal matrix -- confirm against ``get_A``); anything else is
    cast to float and passed through ``torch.linalg.eigvals``.

    Returns:
        The spectral radius as a Python float.
    """
    mat = model.recurrent.injection.get_A().detach().cpu()
    magnitudes = (
        mat.abs()
        if mat.dim() == 1
        else torch.linalg.eigvals(mat.float()).abs()
    )
    return magnitudes.max().item()
# Report the spectral radius of the injection matrix for each variant.
print(
    f"\nρ(A) MLA: {spectral_radius(model_mla):.4f} (must be < 1)",
    f"ρ(A) GQA: {spectral_radius(model_gqa):.4f} (must be < 1)",
    sep="\n",
)
# Smoke test on the MLA model: a random batch of 2 sequences x 16 token ids.
ids = torch.randint(
    low=0,
    high=cfg_mla.vocab_size,
    size=(2, 16),
    device=device,
)

# Inference only, so gradient tracking is switched off for both calls.
with torch.no_grad():
    logits = model_mla(ids, n_loops=4)
    gen = model_mla.generate(ids, max_new_tokens=4, n_loops=8)

print(f"\nForward logits shape: {tuple(logits.shape)}")
print(f"Generation shape: {tuple(gen.shape)}")



