# @package _global_
# Keep the directive above as the first line of the file: Hydra reads package
# directives from the leading header comments (confirm for the Hydra version in use).
#
# NOTE(review): this copy was recovered from a web-page scrape. The page banner
# ("Spaces: Running on Zero") and the table-cell pipes around every line were
# removed, and the nesting below was reconstructed from the `_target_` hierarchy.
# Verify the structure against the upstream config before relying on it.

# Model
# Each `_target_` names the class that the enclosing mapping's sibling keys are
# passed to (Hydra instantiate convention).
model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: a Hiera trunk followed by an FPN neck.
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 7, 2]
      global_att_blocks: [5, 7, 9]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  # Memory attention: `num_layers` copies of the `layer` definition below.
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: true
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # presumably must equal memory_encoder.out_dim (both are 64) - TODO confirm
        kv_in_dim: 64
    num_layers: 4

  # Memory encoder: mask downsampler + fuser, with its own position encoding.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with an explicit decimal point so that YAML 1.1 loaders
        # resolve it as a float rather than the string "1e-6".
        layer_scale_init_value: 1.0e-6
        use_dwconv: true  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  # SAM decoder
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: true
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  # HieraT does not currently support compilation, should always be set to false
  compile_image_encoder: false