{
  "decoder_config": {
    "att_groups": 4,
    "att_heads": 16,
    "att_query_groups": 8,
    "cross_att_type": "sqa",
    "dense_layer_dim": 1536,
    "embed_dim": 512,
    "ff_activation": "silu",
    "ff_dim": 192,
    "ff_dropout": 0.0,
    "final_stateless_layers_config": [
      "moe",
      "moe"
    ],
    "head_norm_type": "rms_norm",
    "moe_bias_mode": "global",
    "moe_grouped_gemm": true,
    "moe_shared_experts_bias_mode": "global",
    "moe_top_k": 10,
    "moe_use_cutlass_grouped_gemm": true,
    "moe_use_weighted_shared_experts": false,
    "num_experts": 384,
    "num_layers": 21,
    "num_shared_experts": 2,
    "rope_base": 100000,
    "router_amp": true,
    "router_dtype": "bfloat16",
    "self_att_type": "sqa",
    "seq_len": 8192,
    "shared_expert_dim": 384,
    "stateless_layers_config": [
      "dense",
      "moe"
    ],
    "stm_size": 4096,
    "use_attention_output_bias": false,
    "use_flash_attention": true,
    "use_gated": true,
    "use_gated_attention": true,
    "use_gated_cross_attention": false,
    "use_head_norm": true,
    "use_moe": true,
    "use_vectorized_moe": true,
    "vocab_size": 65536
  },
  "encoder_config": {
    "att_groups": 8,
    "att_heads": 16,
    "att_query_groups": 8,
    "cross_att_type": "sqa",
    "embed_dim": 512,
    "ff_activation": "silu",
    "ff_dim": 1536,
    "ff_dropout": 0.0,
    "num_layers": 21,
    "rope_base": 100000,
    "self_att_type": "sqa",
    "seq_len": 8192,
    "skip_memory_cross_attention": true,
    "stm_size": 4096,
    "use_attention_output_bias": false,
    "use_flash_attention": true,
    "use_gated": true,
    "use_gated_attention": true,
    "vocab_size": 65536
  },
  "memory_attention_config": {
    "att_groups": 8,
    "att_heads": 16,
    "att_query_groups": 8,
    "att_type": "sqa",
    "embed_dim": 512,
    "interlayer_att_groups": 8,
    "interlayer_att_query_groups": 8,
    "interlayer_att_type": "sqa",
    "norm_type": "classic-rms",
    "num_groups": 3,
    "num_layers": 21,
    "residual_gate_type": "elementwise",
    "residual_per_slot_gate": true,
    "rope_base": 100000,
    "seq_len": 8192,
    "stm_size": 4096,
    "use_flash_attention": false,
    "use_gated_residual": true,
    "use_tanh_residual_gate": false
  },
  "memory_attention_variant": "grouped-self-interlayer",
  "system_prompt_title": "SYSTEM INSTRUCTIONS",
  "tokenizer": null,
  "tokenizer_config": {
    "answer_token_id": 6,
    "bos_token_id": 2,
    "eos_token_id": 3,
    "internal_token_id": 8,
    "pad_token_id": 0,
    "query_token_id": 5,
    "think_token_id": 7,
    "tool_call_token_id": 9,
    "tool_use_token_id": 10
  }
}