| { | |
| "att_dropout": 0.0, | |
| "att_heads": 16, | |
| "embed_dim": 1024, | |
| "ff_dim": 3072, | |
| "ff_dropout": 0.0, | |
| "head_dim": 128, | |
| "kv_heads": 8, | |
| "layer_types": [ | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful", | |
| "stateless", | |
| "stateful" | |
| ], | |
| "memory_gate_type": "linear", | |
| "num_layers": 28, | |
| "padding_idx": 151669, | |
| "rope_base": 1000000, | |
| "seq_len": 8192, | |
| "skip_stm": false, | |
| "stm_batch_size": 1, | |
| "stm_size": 4096, | |
| "tie_embeddings": true, | |
| "training_cache": true, | |
| "use_flash_attention": false, | |
| "use_memory_gate": true, | |
| "use_separate_memory_projections": true, | |
| "vocab_size": 151936 | |
| } |