---
# Mid-training run configuration (MoE transformer on RNA sequence data).
# NOTE(review): this file arrived flattened onto a single line; block structure
# below is reconstructed from key semantics — verify nesting against the
# consumer's config schema before relying on it.

# Dataset, masking/span-corruption, and augmentation settings.
data_config:
  cache_size: 10000
  dataset: train_30M_sqrt_s42
  enable_reverse_augmentation: true
  force_rebuild_index: false
  glm_probability: 0.333
  lineage_file: /rna-multiverse/data/training_data/lineage_greengenes.tsv
  max_samples: null  # no cap on sample count
  max_seq_length: 8192
  mode: mixed
  span_config:
    allow_overlap: false
    # coverage_probs / max_coverage_ratios are parallel 4-entry lists;
    # distribution_probs / span_distributions are parallel 3-entry lists
    # (presumably [mean, std] pairs for span lengths — confirm with consumer).
    coverage_probs:
      - 0.28
      - 0.3
      - 0.28
      - 0.14
    distribution_probs:
      - 0.3
      - 0.5
      - 0.2
    max_coverage_ratios:
      - 0.15
      - 0.25
      - 0.5
      - 0.8
    max_num_spans: 10
    span_distributions:
      - - 10
        - 5
      - - 20
        - 10
      - - 50
        - 20
  train_file: /rna-multiverse/data/cluster/sampling/training/train_30M_sqrt_s42_50only_new.fa
  use_chunked: true
  use_direction_tokens: true
  use_lineage_prefix: true

# Parallelism layout (data_parallel * expert_parallel should match world size).
distributed_config:
  backend: nccl
  data_parallel_size: 4
  expert_parallel_size: 4
  weight_parallel_size: 1

# Logging / experiment tracking.
logging_config:
  enable_wandb: true
  log_dir: /rna-multiverse/results/mid_training/mid_training_v1_from_v21_checkpoint_31006_20251130/logs
  wandb_project: rna-mid-training
  wandb_run_name: mid_training_v1_from_v21_checkpoint_31006

# Periodic memory housekeeping (frequencies are in steps — confirm with consumer).
memory_config:
  cleanup_frequency: 100
  enable_monitoring: true
  gc_frequency: 50

# Model architecture (Mixture-of-Experts transformer).
model_config:
  attention_dropout: 0.0
  dropout_ramp_steps: 0
  dropout_schedule: linear
  dropout_warmup_steps: 0
  eos_loss_weight: 10.0
  expert_capacity_factor: 1.5
  gradient_clip_norm: 0.0  # 0.0 presumably disables clipping — confirm
  hidden_dropout: 0.0
  hidden_size: 256
  initializer_range: 0.02
  intermediate_size: 768
  label_smoothing: 0.0
  max_position_embeddings: 8192
  moe_implementation: megablocks
  moe_world_size: 4
  num_attention_heads: 8
  num_experts: 8
  num_experts_per_tok: 2
  num_hidden_layers: 6
  num_key_value_heads: 8
  resid_dropout: 0.0
  # Was "1e-6": dotless exponents parse as *strings* under YAML 1.1 resolvers
  # (e.g. PyYAML). Normalized to the file's own float style.
  rms_norm_eps: 1.0e-06
  router_aux_loss_coef: 0.01
  use_cache: true
  vocab_size: 114

# Optimizer, schedule, and run bookkeeping.
training_config:
  adam_beta1: 0.9
  adam_beta2: 0.95
  # Was "1e-8": same string-vs-float trap as rms_norm_eps above.
  adam_epsilon: 1.0e-08
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 8
  dataloader_pin_memory: true
  fp16: false
  gradient_checkpointing: false
  learning_rate: 1.0e-05
  logging_steps: 30
  max_epochs: 1
  max_wall_time_hours: 50
  min_lr_ratio: 0.1
  output_dir: /rna-multiverse/results/mid_training/mid_training_v1_from_v21_checkpoint_31006_20251130
  per_device_train_batch_size: 32
  resume_from_pretrain: /rna-multiverse/results/experiments/scaling_tiny_6e18_v21_mixed_glm_stage1_20251122/checkpoint-31006
  run_name: mid_training_v1_from_v21_checkpoint_31006
  save_steps: 1000
  seed: 42
  warmup_steps: 1000
  weight_decay: 5.0e-06