# Model Configuration for SAM2
# This file should be placed alongside the SAM2 checkpoint
# SAM 2 Hiera Large Configuration
#
# Every `_target_` is a dotted class path; the sibling keys are presumably the
# constructor arguments of that class (Hydra-style instantiation -- confirm
# against the loader that consumes this file).

model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: a Hiera trunk followed by an FPN neck.
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      # trunk.embed_dim (144) times 8, 4, 2, 1 -- widest entry first.
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]
      fpn_interp_model: nearest

  # Memory attention: num_layers copies of one layer spec that holds a
  # self-attention and a cross-attention module.
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      # Same settings as self_attention, plus rope_k_repeat and kv_in_dim.
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: true
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Equals memory_encoder.out_dim below; presumably the two must stay in
        # sync -- verify before changing either.
        kv_in_dim: 64
    num_layers: 4

  # Memory encoder: mask down-sampler, fuser and its own position encoding.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with a mantissa dot so YAML 1.1 loaders resolve a float
        # instead of the string "1e-6".
        layer_scale_init_value: 1.0e-6
        use_dwconv: true
      num_layers: 2

  # Scalar options set directly on the top-level model.
  num_maskmem: 7
  image_size: 1024
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  directly_add_no_mem_embed: true
  use_high_res_features_in_sam: true
  multimask_output_in_sam: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  iou_prediction_use_sigmoid: true
  memory_temporal_stride_for_eval: 1
  non_overlap_masks_for_mem_enc: true
  use_obj_ptrs_in_encoder: true
  max_obj_ptrs_in_encoder: 16
  add_tpos_enc_to_obj_ptrs: false
  proj_tpos_enc_in_obj_ptrs: false
  use_signed_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  soft_no_obj_ptr: false
  use_mlp_for_obj_ptr_proj: true
  no_obj_embed_spatial: true

  # pred_obj_scores, pred_obj_scores_mlp and use_multimask_token_for_obj_ptr
  # were listed a second time (all `true`) after the three keys below; they are
  # now set only once, above.
  # NOTE(review): SAM2Base appears to forward those three flags to the mask
  # decoder itself, so repeating them here would pass them twice -- confirm
  # against the installed sam2 version before re-adding them.
  sam_mask_decoder_extra_args:
    dynamic_multimask_via_stability: true
    dynamic_multimask_stability_delta: 0.05
    dynamic_multimask_stability_thresh: 0.98

  compile_image_encoder: false