{
  "affine_momentum": 0.9,
  "architectures": [
    "NeoLLMForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.1,
  "attn_res_num_blocks": 4,
  "auto_map": {
    "AutoConfig": "configuration_neollm.NeoLLMConfig",
    "AutoModel": "modeling_neollm.NeoLLMModel",
    "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
  },
  "bos_token_id": 1,
  "directional_routing_k": 4,
  "directional_routing_temp": 3.0,
  "dropout_rate": 0.1,
  "dtype": "bfloat16",
  "eos_token_id": 7,
  "fan_ratio": 0.125,
  "fan_ratio_ffn": 0.0625,
  "generator_d_seed": 128,
  "generator_k": 3,
  "generator_krank": 32,
  "generator_num_knots": 32,
  "generator_num_modes": 8,
  "generator_spline_degree": 2,
  "head_dim": 64,
  "hidden_act": "xielu",
  "hidden_size": 512,
  "iha_local_global_pattern": "LLLLG",
  "iha_num_pseudo_heads": 2,
  "iha_sliding_window": null,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "jtokm_aux_loss_weight": 0.0001,
  "jtokm_norm_eps": 1e-06,
  "jtokm_num_experts": 4,
  "jtokm_num_modes": 4,
  "jtokm_top_k": 2,
  "laurel_lr_rank": 32,
  "lucid_attention_eps": 1e-06,
  "max_position_embeddings": 512,
  "mea_component_key_value_heads": 4,
  "mea_groupnorm_eps": 1e-06,
  "model_type": "neollm",
  "momentum_gamma": 0.1,
  "num_attention_heads": 8,
  "num_hidden_layers": 12,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "partial_rotary_factor": 0.25,
  "polynorm_exclusive": false,
  "repo_d_p": 64,
  "repo_start_layer": 4,
  "rms_norm_eps": 1e-06,
  "rope_parameters": {
    "partial_rotary_factor": 0.25,
    "rope_theta": 10000.0,
    "rope_type": "default"
  },
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "5.6.2",
  "use_affine_scaled_attention": true,
  "use_attn_res": false,
  "use_cache": false,
  "use_directional_routing": false,
  "use_fan_residual": false,
  "use_hadamard_o_proj": true,
  "use_iha": true,
  "use_jtokm": false,
  "use_laurel": false,
  "use_laurel_lr": false,
  "use_laurel_rw": false,
  "use_lucid_attention": false,
  "use_mea_attention": false,
  "use_momentum_attention": true,
  "use_repo": true,
  "use_repo_grape": true,
  "use_spelling_bee_embeddings": true,
  "use_token_generator": false,
  "use_versatile_ffn": false,
  "use_xsa": true,
  "versatile_active_experts": 2,
  "versatile_aux_loss_weight": 1e-05,
  "versatile_gumbel_temp_decay": 0.99984,
  "versatile_gumbel_temp_end": 0.1,
  "versatile_gumbel_temp_start": 5.0,
  "versatile_max_depth": 2,
  "versatile_total_experts": 4,
  "vocab_size": 64402,
  "xsa_eps": 1e-06
}