# File size: 20,731 Bytes
# Run identifier. The same string is reused verbatim as wandb.name and as the
# final path component of save_folder further down in this file.
run_name: molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
# Model definition: everything nested under this key (formatter, LLM, vision
# backbone, preprocessors, action expert, robot normalisation stats).
model:
  model_name: molmoact
  # Prompt / message formatting options. The meaning of each key is defined by
  # the consuming code, which is not visible in this file.
  data_formatter:
    prompt_templates: uber_model_v2
    # qwen3 here is consistent with the Qwen3 tokenizer set in llm.tokenizer.
    message_format: qwen3
    system_prompt: demo_or_style_v2
    always_start_with_space: false
    default_inference_len: 65
    select_answer: best
    debug: false
    image_last: false
    format_message_list: null
    p_one_message: 0.0
    eval_system_prompt_mapping: null
    p_choice_content_in_mc: 1.0
    template_video_mc_questions: true
    pointing_format: html-v2
    points_decimal_places: 1
    # NOTE: "seperate" (sic) is the key's actual spelling here and in
    # use_seperate_count_without_pointing_style; it must match the consumer.
    use_seperate_non_pointing_qa_style: false
    timestamp_mode: 50-percent-seconds
    output_timestamp_mode: seconds
    seconds_decimal_places: 1
    p_multi_point_all_image: 0.5
    use_seperate_count_without_pointing_style: false
    sample_random_initial_point: true
  # Language-model backbone settings. Weights are initialised from init_path
  # (a qwen3-4b-instruct checkpoint) and the tokenizer is
  # Qwen/Qwen3-4B-Instruct-2507.
  llm:
    d_model: 2560
    # 32 query heads share 8 KV heads. head_dim is set explicitly:
    # n_heads * head_dim = 4096, which is not equal to d_model (2560).
    n_heads: 32
    n_kv_heads: 8
    head_dim: 128
    qkv_bias: false
    clip_qkv: null
    n_layers: 36
    mlp_ratio: 4
    # mlp_hidden_size is given explicitly (19456 = 2 * 9728) rather than being
    # mlp_ratio * d_model (10240).
    # NOTE(review): presumably the doubled width accounts for the swiglu gate;
    # confirm against the model code.
    mlp_hidden_size: 19456
    activation_type: swiglu
    block_type: sequential
    rope: true
    rope_full_precision: true
    rope_theta: 5000000.0
    # rope_type is "default"; every rope scaling parameter below is null.
    rope_type: default
    rope_factor: null
    rope_high_freq_factor: null
    rope_low_freq_factor: null
    rope_original_max_position_embeddings: null
    rope_attention_factor: null
    rope_beta_fast: null
    rope_beta_slow: null
    rope_mscale: null
    rope_mscale_all_dim: null
    rope_truncate: null
    attention_type: sdpa
    full_attention_layers: null
    sliding_attention_rope_scaling: false
    float32_attention: true
    attention_dropout: 0.0
    attention_layer_norm: true
    attention_layer_norm_type: qwen3
    # Residual dropout is 0.1 in general but 0.0 for response_residual_dropout.
    residual_dropout: 0.1
    response_residual_dropout: 0.0
    layer_norm_type: rms
    layer_norm_with_affine: true
    layer_norm_eps: 1.0e-06
    attention_layer_norm_with_affine: true
    max_sequence_length: 8192
    max_position_embeddings: null
    include_bias: false
    bias_for_layer_norm: null
    norm_after: false
    # NOTE(review): moe_* values are present although block_type is
    # "sequential"; presumably they are unused defaults for this dense model.
    # Confirm before relying on them.
    moe_num_experts: 8
    moe_top_k: 2
    moe_mlp_impl: sparse
    moe_log_expert_assignment: false
    moe_shared_expert: false
    moe_lbl_in_fp32: false
    moe_interleave: false
    moe_loss_weight: 0.1
    moe_zloss_weight: null
    moe_dropless: true
    moe_capacity_factor: 1.25
    embedding_dropout: 0.0
    scale_logits: false
    # vocab_size and embedding_size are both 151936; 128 additional vocab
    # entries are configured on top via additional_vocab_size.
    vocab_size: 151936
    additional_vocab_size: 128
    weight_tying: true
    embedding_size: 151936
    use_position_ids: true
    tokenizer:
      identifier: Qwen/Qwen3-4B-Instruct-2507
      tokenizer_dir: null
    init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt
    init_incremental: null
    new_embedding_init_range: 0.02
    initializer_range: 0.02
    normalize_input_embeds: false
    activation_checkpoint: whole_layer
    compile: blocks
    fix_pad_tokenizer: false
    init_std: 0.02
    init_fn: normal
    init_cutoff_factor: null
  # Vision tower (a SigLIP ViT) plus the pooling / projector / connector
  # options that sit on top of it. Short leaf lists are written in flow style;
  # the parsed values are unchanged.
  vision_backbone:
    vit:
      image_model_type: siglip
      # 378 / 14 = 27 patches per side and 27 * 27 = 729 = image_num_pos.
      image_default_input_size: [378, 378]
      image_patch_size: 14
      image_pos_patch_size: 14
      # 1152 / 16 heads = 72 = image_head_dim.
      image_emb_dim: 1152
      image_num_heads: 16
      image_num_key_value_heads: 16
      image_num_layers: 27
      image_head_dim: 72
      image_mlp_dim: 4304
      image_mlp_activations: gelu_pytorch_tanh
      image_dropout_rate: 0.0
      image_num_pos: 729
      image_norm_eps: 1.0e-06
      attention_dropout: 0.0
      residual_dropout: 0.0
      initializer_range: 0.02
      float32_attention: true
      attention_type: sdpa
      sdpa_backend: all
      activation_checkpointing: true
      init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
      resize_mode: siglip
      pad_value: 0.0
      normalize: siglip
    image_pooling_2d: attention_meanq
    pooling_attention_mask: true
    image_projector: mlp
    image_padding_embed: null
    # Negative layer indices.
    # NOTE(review): presumably counted from the end of the ViT stack; confirm
    # against the backbone code.
    vit_layers: [-3, -9]
    skip_unused_layers: true
    use_deepstack: false
    share_connector: false
    image_feature_dropout: 0.0
    connector_activation_checkpointing: true
    compile_vit: blocks
    pool_size_embeds: null
    compile_connector: null
    normalize_on_gpu: true
    use_image_augmentation: true
    use_resize_bottleneck: false
  # Multimodal preprocessing options. Top-level keys and the nested `image`
  # mapping carry separate crop / pooling / col-token settings.
  # NOTE(review): presumably the top-level ones apply to video frames and
  # `image` to still images; confirm against the preprocessor code.
  # Short leaf lists are written in flow style; parsed values are unchanged.
  mm_preprocessor:
    max_answer_len: null
    last_message_loss_only: false
    max_text_tokens: null
    loss_token_weighting: root_subsegments_root_tokens
    max_frames: 1
    frame_sample_mode: uniform_last_frame
    candidate_sampling_fps: [0.25, 0.5, 1.0, 2.0, 4.0, 6.0, 8.0, 16.0]
    cache_videos: true
    loading_method: torchcodec_exact
    # Single-element list, not a bare scalar.
    max_fps: [2.0]
    time_sampling: true
    time_mode: per-frame-compact
    subtitle_mode: frame_1
    max_crops: 1
    overlap_margins: [4.0, 4.0]
    use_col_tokens: false
    periodic_high_res_frame: null
    high_low_train_mode: local_rnd
    high_res_frame_sample_options: null
    # Integer keys mapping to weight lists whose length equals the key; each
    # list sums to 1.0.
    periodic_sample_rate_training:
      4: [0.9, 0.03, 0.03, 0.04]
      3: [0.6, 0.2, 0.2]
    skip_low_res_in_high_low: false
    pooling_w: 3
    pooling_h: 3
    high_res_pooling_w: null
    high_res_pooling_h: null
    query_based_resolution_selection: false
    max_queries_for_resolution_selection: 8
    use_frame_special_tokens: true
    frame_sel_clip_identifier: google/siglip2-so400m-patch14-384
    image_padding_mask: false
    max_subtitle_tokens: null
    image:
      crop_mode: resize
      use_col_tokens: true
      max_crops: 8
      high_res_max_crops: 24
      p_high_res: 0.0
      pooling_w: 2
      pooling_h: 2
      # Integers here, whereas the top-level overlap_margins uses floats.
      overlap_margins: [4, 4]
      max_images: 5
      max_multi_image_crops: 8
      multi_image_pooling_w: 2
      multi_image_pooling_h: 2
      use_single_crop_col_tokens: false
      use_single_crop_start_token: true
    topk: null
    prune_from_frame: 0
  # Model-level attention / context-parallel flags, followed by the action
  # head and flow-matching settings.
  bi_directional_attn: image_tokens
  shared_low_high_embedding: true
  debug: null
  cp_enabled: false
  apply_cp_to_vision_backbone: false
  # action_dim (20) equals action_expert.action_dim and the length of the
  # q01 / q99 action statistic lists in robot_preprocessor.
  action_dim: 20
  # action_horizon (16) is within action_expert.max_horizon (32).
  action_horizon: 16
  n_action_steps: 8
  n_obs_steps: 1
  action_expert:
    max_horizon: 32
    action_dim: 20
    hidden_size: 768
    # num_layers (36) equals llm.n_layers (36).
    # NOTE(review): presumably required by action_expert_layer_mode
    # "per_layer"; confirm against the model code.
    num_layers: 36
    num_heads: 8
    mlp_ratio: 4.0
    timestep_embed_dim: 256
    dropout: 0.0
    attn_dropout: 0.0
    context_layer_norm: true
  action_expert_layer_mode: per_layer
  flow_matching_num_steps: 10
  flow_matching_cutoff: 0.999
  flow_matching_beta_alpha: 1.0
  flow_matching_beta_beta: 1.5
  # Both values below are also passed explicitly on the command line recorded
  # in runtime_data.args.
  num_flow_timestamps: 8
  same_noise_per_time: false
  # Normalisation statistics and key names for robot state / action inputs.
  # state_norm_mode is min_max (min / max lists provided, 22 entries each);
  # action_norm_mode is quantiles (q01 / q99 lists provided, 20 entries each).
  robot_preprocessor:
    stats_by_repo:
      synthmanip:
        observation.state:
          min:
          - -4.904874324798584
          - -4.564780235290527
          - -3.5160739421844482
          - -2.356419563293457
          - -0.47234979271888733
          - -2.0865397453308105
          - -3.343071222305298
          - -5.8824052810668945
          - -1.7488995790481567
          - -2.967109203338623
          - -0.11299018561840057
          - -2.3546268939971924
          - -3.1416664123535156
          - -2.0946199893951416
          - -3.2890703678131104
          - -6.282893657684326
          - -1.7483078241348267
          - -2.967064142227173
          - -0.12049419432878494
          - -1.778153419494629
          - -1.7587945461273193
          - -1.5871200561523438
          max:
          - 17.08185577392578
          - 33.73189163208008
          - 3.2411913871765137
          - 2.356658697128296
          - 3.1416971683502197
          - 2.1008245944976807
          - 0.07229717075824738
          - 6.270575523376465
          - 2.0102994441986084
          - 2.9668161869049072
          - 0.021467044949531555
          - 2.3977394104003906
          - 0.34489157795906067
          - 2.0900635719299316
          - 0.07242166996002197
          - 6.27663516998291
          - 2.0076160430908203
          - 2.9636759757995605
          - 0.04509617015719414
          - 0.919683575630188
          - 1.6717331409454346
          - 1.1039749383926392
        action:
          # The 11th and 19th entries of q01 / q99 are -100.0 / 100.0, far
          # outside the range of the other entries.
          # NOTE(review): these look like placeholder bounds; confirm.
          q01:
          - -0.04400388523936272
          - -0.044572047889232635
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.037506889551877975
          - -0.03562070056796074
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.04800133779644966
          - -0.05000000074505806
          - -100.0
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.04927435144782066
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.0456085205078125
          - -0.05000000074505806
          - -100.0
          - -0.025820335373282433
          q99:
          - 0.04579437896609306
          - 0.04565873369574547
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.03847877308726311
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 100.0
          - 0.05000000074505806
          - 0.03608553484082222
          - 0.04896605759859085
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 100.0
          - 0.7379999756813049
    # default_repo_id names the only entry present under stats_by_repo.
    default_repo_id: synthmanip
    action_key: action
    state_keys:
    - observation.state
    action_norm_mode: quantiles
    state_norm_mode: min_max
  # Statistics and key names for the robot postprocessor. Every value in this
  # mapping is identical to the corresponding value in robot_preprocessor.
  # NOTE(review): presumably the two must stay in sync so outputs are
  # de-normalised with the same statistics; confirm against the consumer.
  robot_postprocessor:
    stats_by_repo:
      synthmanip:
        observation.state:
          # 22 entries each for min and max.
          min:
          - -4.904874324798584
          - -4.564780235290527
          - -3.5160739421844482
          - -2.356419563293457
          - -0.47234979271888733
          - -2.0865397453308105
          - -3.343071222305298
          - -5.8824052810668945
          - -1.7488995790481567
          - -2.967109203338623
          - -0.11299018561840057
          - -2.3546268939971924
          - -3.1416664123535156
          - -2.0946199893951416
          - -3.2890703678131104
          - -6.282893657684326
          - -1.7483078241348267
          - -2.967064142227173
          - -0.12049419432878494
          - -1.778153419494629
          - -1.7587945461273193
          - -1.5871200561523438
          max:
          - 17.08185577392578
          - 33.73189163208008
          - 3.2411913871765137
          - 2.356658697128296
          - 3.1416971683502197
          - 2.1008245944976807
          - 0.07229717075824738
          - 6.270575523376465
          - 2.0102994441986084
          - 2.9668161869049072
          - 0.021467044949531555
          - 2.3977394104003906
          - 0.34489157795906067
          - 2.0900635719299316
          - 0.07242166996002197
          - 6.27663516998291
          - 2.0076160430908203
          - 2.9636759757995605
          - 0.04509617015719414
          - 0.919683575630188
          - 1.6717331409454346
          - 1.1039749383926392
        action:
          # 20 entries each for q01 and q99; the 11th and 19th are
          # -100.0 / 100.0.
          # NOTE(review): these look like placeholder bounds; confirm.
          q01:
          - -0.04400388523936272
          - -0.044572047889232635
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.037506889551877975
          - -0.03562070056796074
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.04800133779644966
          - -0.05000000074505806
          - -100.0
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.04927435144782066
          - -0.05000000074505806
          - -0.05000000074505806
          - -0.0456085205078125
          - -0.05000000074505806
          - -100.0
          - -0.025820335373282433
          q99:
          - 0.04579437896609306
          - 0.04565873369574547
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.03847877308726311
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 100.0
          - 0.05000000074505806
          - 0.03608553484082222
          - 0.04896605759859085
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 0.05000000074505806
          - 100.0
          - 0.7379999756813049
    default_repo_id: synthmanip
    action_key: action
    state_keys:
    - observation.state
    action_norm_mode: quantiles
    state_norm_mode: min_max
# Distributed-parallelism layout. Replicate, context-parallel and
# tensor-parallel degrees are all 1; data_parallel_shard_degree is -1.
# NOTE(review): -1 presumably means "infer from world size"; confirm against
# the trainer.
parallelism:
  data_parallel_replicate_degree: 1
  enable_compiled_autograd: false
  data_parallel_shard_degree: -1
  fsdp_reshard_after_forward: default
  context_parallel_config:
    degree: 1
    attention_type: ulysses
    load_balancer: ulysses
    head_stride: 1
  tensor_parallel_config:
    degree: 1
    enable_async: false
  data_parallel_config:
    name: fsdp
    param_dtype: null
    # Gradient reduction dtype is pinned to float32 while param_dtype is null.
    reduce_dtype: float32
    num_replicas: null
    shard_degree: null
    wrapping_strategy: full
    prefetch_factor: 0
  context_parallel_rotate_method: allgather
# Global seed (distinct from data.seed, which is 50189) and run-mode flags.
seed: 6198
epoch: null
dry_run: false
# ft_* flags: llm is true, vit and connector are false, embedding is lm_head.
# NOTE(review): presumably "ft" means fine-tune, i.e. vit and connector are
# frozen; confirm against the trainer.
ft_llm: true
ft_vit: false
ft_connector: false
ft_embedding: lm_head
# AdamW settings with per-component overrides (connector, vit, llm,
# frame_selector, temporal_token_scorer, action_expert). The betas pairs are
# written in flow style; parsed values are unchanged.
optimizer:
  name: adamw
  learning_rate: 0.0001
  weight_decay: 0.01
  betas: [0.9, 0.95]
  # The base eps is 1e-05, whereas every per-component *_eps below is 1e-06.
  eps: 1.0e-05
  connector_learning_rate: 5.0e-06
  vit_learning_rate: 5.0e-06
  # 1e-05 matches the "llmlr_1e-5" suffix of run_name and the
  # --optimizer.llm_learning_rate override recorded in runtime_data.args.
  llm_learning_rate: 1.0e-05
  frame_selector_learning_rate: 0.0001
  temporal_token_scorer_learning_rate: 0.0001
  action_expert_learning_rate: 0.0001
  connector_weight_decay: 0.0
  vit_weight_decay: 0.0
  llm_weight_decay: 0.0
  frame_selector_weight_decay: 0.01
  temporal_token_scorer_weight_decay: 0.01
  action_expert_weight_decay: 0.0
  # All per-component betas are identical to the base betas.
  connector_betas: [0.9, 0.95]
  vit_betas: [0.9, 0.95]
  llm_betas: [0.9, 0.95]
  frame_selector_betas: [0.9, 0.95]
  temporal_token_scorer_betas: [0.9, 0.95]
  action_expert_betas: [0.9, 0.95]
  connector_eps: 1.0e-06
  vit_eps: 1.0e-06
  llm_eps: 1.0e-06
  frame_selector_eps: 1.0e-06
  temporal_token_scorer_eps: 1.0e-06
  action_expert_eps: 1.0e-06
  metrics_log_interval: -1
# Learning-rate schedule, expressed in steps, with per-component warmup
# lengths.
scheduler:
  name: multimodal
  units: steps
  t_warmup: 100
  t_max: null
  alpha_f: 0.1
  connector_t_warmup: 200
  vit_t_warmup: 200
  # The LLM warmup (2000) is 10x the other components' (200); it matches the
  # --scheduler.llm_t_warmup override recorded in runtime_data.args.
  llm_t_warmup: 2000
  frame_selector_t_warmup: 200
  temporal_token_scorer_t_warmup: 200
  action_expert_t_warmup: 200
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: 0.0
# Training data and dataloader settings.
data:
  dataset: null
  # Ten synthmanip tasks, all weighted 1.0. The count matches the ten
  # --data_paths entries and ten --dataset_sample_rates values recorded in
  # runtime_data.args.
  mixture:
    synthmanip/task_0: 1.0
    synthmanip/task_1: 1.0
    synthmanip/task_2: 1.0
    synthmanip/task_3: 1.0
    synthmanip/task_4: 1.0
    synthmanip/task_5: 1.0
    synthmanip/task_6: 1.0
    synthmanip/task_7: 1.0
    synthmanip/task_8: 1.0
    synthmanip/task_9: 1.0
  root_size_mixture: null
  kwargs_mixture: null
  split: train
  # Separate from the top-level seed (6198).
  seed: 50189
  pad: to_max
  # Matches --seq_len=1024 in runtime_data.args.
  sequence_length: 1024
  max_text_seq_len: null
  shuffle: true
  start_index: 0
  packing: null
  enable_variable_sized_token_pooling: true
  num_workers: 4
  drop_last: true
  pin_memory: true
  prefetch_factor: 4
  persistent_workers: false
  timeout: 300
# Auxiliary action-data loader settings (none configured) and dataloader
# restore behaviour.
action_data: null
action_loader_rate: null
action_batch_interval: 1
restore_dataloader: true
fast_forward_batches: null
# Evaluation: both evaluator lists are empty and eval_interval is 0, matching
# the --no_val flag recorded in runtime_data.args.
# NOTE(review): inf_eval_interval is 1000 but inf_evaluators is empty, so it
# presumably has no effect; confirm.
evaluators: []
eval_interval: 0
inf_evaluators: []
inf_eval_interval: 1000
eval_on_last_step: true
eval_on_load: false
eval_on: []
# Checkpoint output location; the last path component equals run_name.
save_folder: /weka/oe-training-default/snehalj/synthmanip_checkpoints/molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
checkpointer_config:
  save_thread_count: null
  load_thread_count: null
  pre_download: false
  work_dir: null
  throttle_uploads: false
canceled_check_interval: 50
# Save cadence and retention values; all three are also passed on the command
# line recorded in runtime_data.args.
save_interval: 2000
save_at: null
save_final_optim: false
save_num_checkpoints_to_keep: 3
checkpoint_retention_frequency: 10000
save_final_unsharded_checkpoint: false
save_interval_ephemeral: null
# NOTE(review): save_overwrite is true together with allow_resume true;
# confirm an accidental relaunch cannot clobber existing checkpoints.
save_overwrite: true
load_path: null
reset_optimizer_state: false
reset_trainer_state: false
# Same path as the first positional argument in runtime_data.args.
initial_model_checkpoint: /weka/oe-training-default/hqfang/molmo2_checkpoints/4b-cp/step2000-unsharded/
allow_resume: true
# Training length and batch geometry. max_duration equals stop_at (100000).
max_duration: 100000
# 1024 global = 8 per device * 128 (runtime_data.world_size).
# NOTE(review): this implies no gradient accumulation if every rank is a
# data-parallel rank; confirm.
global_train_batch_size: 1024
device_train_microbatch_size: 8
max_grad_norm: 1.0
multi_component_grad_norm: true
batch_divisor: global_batch
max_grad_norm_ratio: null
precision: amp_bf16
# Weights & Biases logging. project, entity and name mirror the --wandb.*
# overrides recorded in runtime_data.args; name equals run_name.
wandb:
  project: whirl-molmoflow-rby1
  entity: prior-ai2
  group: null
  name: molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
  # Single-element list written in flow style.
  tags: [watching]
  log_artifacts: false
  rank_zero_only: true
  log_interval: 20
  allow_resume: true
  finish_on_sigterm: true
# Logging cadence: beaker every 50, console every 20 (same as
# wandb.log_interval).
beaker_log_interval: 50
speed_monitor:
  window_size: 20
  gpu_flops_available: null
console_log_interval: 20
enable_timing_logs: false
gen1_gc_interval: 1
# Compile options (inductor backend, non-fullgraph, static shapes).
compile:
  mode: default
  fullgraph: false
  dynamic: false
  backend: inductor
activation_checkpointing: true
# FSDP options; fsdp2 is enabled with FULL_SHARD.
fsdp:
  fsdp2: true
  precision: pure
  use_orig_params: true
  wrapping_strategy: null
  sharding_strategy: FULL_SHARD
  hybrid_sharding_num_model_replicas: null
# Loss-related toggles. softmax_auxiliary_loss is false, so its scale below is
# presumably unused (confirm).
softmax_auxiliary_loss: false
softmax_auxiliary_loss_scale: 0.0001
response_logits_only: true
# Both loss weights are null.
# NOTE(review): frame_score_loss_type / frame_score_loss_target are
# presumably inactive as a result; confirm.
saliency_score_loss_wt: null
frame_score_loss_wt: null
frame_score_loss_type: mse
frame_score_loss_target: 0.7
time_limit: null
extra_steps_after_cancel: 0
python_profiling: false
torch_profiling: false
# Equals max_duration (100000).
stop_at: 100000
stop_after: null
fused_loss: false
compile_loss: true
# Launch-time provenance recorded for this run: command line, host, date,
# world size, and experiment / wandb identifiers.
runtime_data:
  # Folded block scalar with strip chomping: line breaks fold to single
  # spaces and there is no trailing newline, so the value is the same
  # one-line string that the original multi-line plain scalar produced.
  args: >-
    launch_scripts/train_synthmanip.py /weka/oe-training-default/hqfang/molmo2_checkpoints/4b-cp/step2000-unsharded/
    --data_paths /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/DoorOpeningDataGenConfig
    /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/DoorOpeningDataGenConfig
    /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/RBY1PickDataGenConfig
    /weka/prior/datasets/robomolmo/feb21_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/RBY1PickAndPlaceDataGenConfig
    /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/RBY1PickAndPlaceDataGenConfig
    /weka/prior/datasets/robomolmo/feb21_franka_and_rby1/RBY1PickAndPlaceDataGenConfig
    /weka/prior/datasets/robomolmo/feb23_open_datagen/RBY1OpenDataGenConfig /weka/prior/datasets/robomolmo/feb23_open_datagen_obja/RBY1OpenDataGenConfig
    --no_val --dataset_sample_rates 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 --stats_path=/weka/prior/datasets/robomolmo/rby1_multitask_norm_stats.yaml
    --action_preset RBY1_multitask --camera_preset RBY1_full_with_head_gopro --wandb.name=molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
    --wandb.entity=prior-ai2 --wandb.project=whirl-molmoflow-rby1 --seq_len=1024 --max_duration=100000
    --device_batch_size=8 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True
    --model.mm_preprocessor.max_subtitle_tokens=null --data.num_workers=4 --prefetch_factor=4
    --save_interval=2000 --save_num_checkpoints_to_keep=3 --checkpoint_retention_frequency=10000
    --save_folder=/weka/oe-training-default/snehalj/synthmanip_checkpoints/molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
    --exp_name=molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
    --data.packing=null --model.mm_preprocessor.image.max_images=5 --model.mm_preprocessor.image.crop_mode=resize
    --model.mm_preprocessor.max_frames=1 --model.same_noise_per_time=False --model.num_flow_timestamps=8
    --use_point_prompts_per_dataset 1 1 0 0 0 0 0 0 1 1 --randomize_prompts --point_prompt_camera=head_camera
    --max_points_in_conditioning_frame=1 --conditioning_frame=random_first_10 --cameras_to_warp
    head_camera --img_aug --ft_llm=True --scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-5
  hostname: jupiter-cs-aus-147.reviz.ai2.in
  # Quoted so the value stays a string under any loader; the text is
  # unchanged.
  date: '03/05/2026, 22:36'
  world_size: 128
  resuming_from: null
  beaker_experiment_id: 01KK0212A2CKWNFJEJHT7AZMW5
  beaker_experiment_url: https://beaker.org/ex/01KK0212A2CKWNFJEJHT7AZMW5
  wandb_id: t57qc9vl
  wandb_url: https://wandb.ai/prior-ai2/whirl-molmoflow-rby1/runs/t57qc9vl
# Distributed online-eval settings. distributed_eval_enabled is false, so the
# remaining keys are presumably inert for this run (confirm).
distributed_eval_enabled: false
# NOTE(review): the benchmark path and config class name reference Franka,
# while the run and its data paths are RBY1. Revisit before enabling
# distributed eval.
distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark
distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig
distributed_eval_task_horizon: 300
distributed_eval_num_worker_jobs: 1
distributed_eval_wandb_project: mjthor-online-eval
distributed_eval_workspace: ai2/robo-molmo
# Short leaf list written in flow style; same four entries in the same order.
distributed_eval_clusters: [ai2/saturn, ai2/neptune, ai2/rhea, ai2/ceres]
distributed_eval_priority: high
distributed_eval_preemptible: true