Skip to content

Commit 0a76a87

Browse files
committed
add code from brunoaduarte's pull request HumanSignal#776; try debugging; add SAM2 configs locally
1 parent e99f357 commit 0a76a87

File tree

7 files changed: 511 additions and 16 deletions.

label_studio_ml/api.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ def _predict():
5353
@return:
5454
Predictions in LS format
5555
"""
56+
logger.error(f'got predict request')
5657
data = request.json
5758
tasks = data.get('tasks')
5859
label_config = data.get('label_config')
@@ -65,7 +66,8 @@ def _predict():
6566
label_config=label_config)
6667

6768
# model.use_label_config(label_config)
68-
69+
logger.error(f'Using model {model.__class__.__name__} for project {project_id}')
70+
logger.error(f'Making prediction for {len(tasks)} tasks with context: {context} and params: {params}')
6971
response = model.predict(tasks, context=context, **params)
7072

7173
# if there is no model version we will take the default
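(File path not captured in this view; this is one of the SAM2 config files added by the commit — Hiera trunk with embed_dim 112.)
Lines changed: 116 additions & 0 deletions
Original file line number | Diff line number | Diff line change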
@@ -0,0 +1,116 @@
1+
# @package _global_
2+
3+
# Model
4+
model:
5+
_target_: sam2.modeling.sam2_base.SAM2Base
6+
image_encoder:
7+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8+
scalp: 1
9+
trunk:
10+
_target_: sam2.modeling.backbones.hieradet.Hiera
11+
embed_dim: 112
12+
num_heads: 2
13+
neck:
14+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
15+
position_encoding:
16+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
17+
num_pos_feats: 256
18+
normalize: true
19+
scale: null
20+
temperature: 10000
21+
d_model: 256
22+
backbone_channel_list: [896, 448, 224, 112]
23+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
24+
fpn_interp_model: nearest
25+
26+
memory_attention:
27+
_target_: sam2.modeling.memory_attention.MemoryAttention
28+
d_model: 256
29+
pos_enc_at_input: true
30+
layer:
31+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
32+
activation: relu
33+
dim_feedforward: 2048
34+
dropout: 0.1
35+
pos_enc_at_attn: false
36+
self_attention:
37+
_target_: sam2.modeling.sam.transformer.RoPEAttention
38+
rope_theta: 10000.0
39+
feat_sizes: [64, 64]
40+
embedding_dim: 256
41+
num_heads: 1
42+
downsample_rate: 1
43+
dropout: 0.1
44+
d_model: 256
45+
pos_enc_at_cross_attn_keys: true
46+
pos_enc_at_cross_attn_queries: false
47+
cross_attention:
48+
_target_: sam2.modeling.sam.transformer.RoPEAttention
49+
rope_theta: 10000.0
50+
feat_sizes: [64, 64]
51+
rope_k_repeat: True
52+
embedding_dim: 256
53+
num_heads: 1
54+
downsample_rate: 1
55+
dropout: 0.1
56+
kv_in_dim: 64
57+
num_layers: 4
58+
59+
memory_encoder:
60+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
61+
out_dim: 64
62+
position_encoding:
63+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
64+
num_pos_feats: 64
65+
normalize: true
66+
scale: null
67+
temperature: 10000
68+
mask_downsampler:
69+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
70+
kernel_size: 3
71+
stride: 2
72+
padding: 1
73+
fuser:
74+
_target_: sam2.modeling.memory_encoder.Fuser
75+
layer:
76+
_target_: sam2.modeling.memory_encoder.CXBlock
77+
dim: 256
78+
kernel_size: 7
79+
padding: 3
80+
layer_scale_init_value: 1e-6
81+
use_dwconv: True # depth-wise convs
82+
num_layers: 2
83+
84+
num_maskmem: 7
85+
image_size: 1024
86+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
87+
sigmoid_scale_for_mem_enc: 20.0
88+
sigmoid_bias_for_mem_enc: -10.0
89+
use_mask_input_as_output_without_sam: true
90+
# Memory
91+
directly_add_no_mem_embed: true
92+
no_obj_embed_spatial: true
93+
# use high-resolution feature map in the SAM mask decoder
94+
use_high_res_features_in_sam: true
95+
# output 3 masks on the first click on initial conditioning frames
96+
multimask_output_in_sam: true
97+
# SAM heads
98+
iou_prediction_use_sigmoid: True
99+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
100+
use_obj_ptrs_in_encoder: true
101+
add_tpos_enc_to_obj_ptrs: true
102+
proj_tpos_enc_in_obj_ptrs: true
103+
use_signed_tpos_enc_to_obj_ptrs: true
104+
only_obj_ptrs_in_the_past_for_eval: true
105+
# object occlusion prediction
106+
pred_obj_scores: true
107+
pred_obj_scores_mlp: true
108+
fixed_no_obj_ptr: true
109+
# multimask tracking settings
110+
multimask_output_for_tracking: true
111+
use_multimask_token_for_obj_ptr: true
112+
multimask_min_pt_num: 0
113+
multimask_max_pt_num: 1
114+
use_mlp_for_obj_ptr_proj: true
115+
# Compilation flag
116+
compile_image_encoder: False
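(File path not captured in this view; this is one of the SAM2 config files added by the commit — Hiera trunk with embed_dim 144.)
Lines changed: 120 additions & 0 deletions
Original file line number | Diff line number | Diff line change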
@@ -0,0 +1,120 @@
1+
# @package _global_
2+
3+
# Model
4+
model:
5+
_target_: sam2.modeling.sam2_base.SAM2Base
6+
image_encoder:
7+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8+
scalp: 1
9+
trunk:
10+
_target_: sam2.modeling.backbones.hieradet.Hiera
11+
embed_dim: 144
12+
num_heads: 2
13+
stages: [2, 6, 36, 4]
14+
global_att_blocks: [23, 33, 43]
15+
window_pos_embed_bkg_spatial_size: [7, 7]
16+
window_spec: [8, 4, 16, 8]
17+
neck:
18+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
19+
position_encoding:
20+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
21+
num_pos_feats: 256
22+
normalize: true
23+
scale: null
24+
temperature: 10000
25+
d_model: 256
26+
backbone_channel_list: [1152, 576, 288, 144]
27+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
28+
fpn_interp_model: nearest
29+
30+
memory_attention:
31+
_target_: sam2.modeling.memory_attention.MemoryAttention
32+
d_model: 256
33+
pos_enc_at_input: true
34+
layer:
35+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
36+
activation: relu
37+
dim_feedforward: 2048
38+
dropout: 0.1
39+
pos_enc_at_attn: false
40+
self_attention:
41+
_target_: sam2.modeling.sam.transformer.RoPEAttention
42+
rope_theta: 10000.0
43+
feat_sizes: [64, 64]
44+
embedding_dim: 256
45+
num_heads: 1
46+
downsample_rate: 1
47+
dropout: 0.1
48+
d_model: 256
49+
pos_enc_at_cross_attn_keys: true
50+
pos_enc_at_cross_attn_queries: false
51+
cross_attention:
52+
_target_: sam2.modeling.sam.transformer.RoPEAttention
53+
rope_theta: 10000.0
54+
feat_sizes: [64, 64]
55+
rope_k_repeat: True
56+
embedding_dim: 256
57+
num_heads: 1
58+
downsample_rate: 1
59+
dropout: 0.1
60+
kv_in_dim: 64
61+
num_layers: 4
62+
63+
memory_encoder:
64+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
65+
out_dim: 64
66+
position_encoding:
67+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
68+
num_pos_feats: 64
69+
normalize: true
70+
scale: null
71+
temperature: 10000
72+
mask_downsampler:
73+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
74+
kernel_size: 3
75+
stride: 2
76+
padding: 1
77+
fuser:
78+
_target_: sam2.modeling.memory_encoder.Fuser
79+
layer:
80+
_target_: sam2.modeling.memory_encoder.CXBlock
81+
dim: 256
82+
kernel_size: 7
83+
padding: 3
84+
layer_scale_init_value: 1e-6
85+
use_dwconv: True # depth-wise convs
86+
num_layers: 2
87+
88+
num_maskmem: 7
89+
image_size: 1024
90+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
91+
sigmoid_scale_for_mem_enc: 20.0
92+
sigmoid_bias_for_mem_enc: -10.0
93+
use_mask_input_as_output_without_sam: true
94+
# Memory
95+
directly_add_no_mem_embed: true
96+
no_obj_embed_spatial: true
97+
# use high-resolution feature map in the SAM mask decoder
98+
use_high_res_features_in_sam: true
99+
# output 3 masks on the first click on initial conditioning frames
100+
multimask_output_in_sam: true
101+
# SAM heads
102+
iou_prediction_use_sigmoid: True
103+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
104+
use_obj_ptrs_in_encoder: true
105+
add_tpos_enc_to_obj_ptrs: true
106+
proj_tpos_enc_in_obj_ptrs: true
107+
use_signed_tpos_enc_to_obj_ptrs: true
108+
only_obj_ptrs_in_the_past_for_eval: true
109+
# object occlusion prediction
110+
pred_obj_scores: true
111+
pred_obj_scores_mlp: true
112+
fixed_no_obj_ptr: true
113+
# multimask tracking settings
114+
multimask_output_for_tracking: true
115+
use_multimask_token_for_obj_ptr: true
116+
multimask_min_pt_num: 0
117+
multimask_max_pt_num: 1
118+
use_mlp_for_obj_ptr_proj: true
119+
# Compilation flag
120+
compile_image_encoder: False
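(File path not captured in this view; this is one of the SAM2 config files added by the commit — Hiera trunk with embed_dim 96.)
Lines changed: 119 additions & 0 deletions
Original file line number | Diff line number | Diff line change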
@@ -0,0 +1,119 @@
1+
# @package _global_
2+
3+
# Model
4+
model:
5+
_target_: sam2.modeling.sam2_base.SAM2Base
6+
image_encoder:
7+
_target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8+
scalp: 1
9+
trunk:
10+
_target_: sam2.modeling.backbones.hieradet.Hiera
11+
embed_dim: 96
12+
num_heads: 1
13+
stages: [1, 2, 11, 2]
14+
global_att_blocks: [7, 10, 13]
15+
window_pos_embed_bkg_spatial_size: [7, 7]
16+
neck:
17+
_target_: sam2.modeling.backbones.image_encoder.FpnNeck
18+
position_encoding:
19+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20+
num_pos_feats: 256
21+
normalize: true
22+
scale: null
23+
temperature: 10000
24+
d_model: 256
25+
backbone_channel_list: [768, 384, 192, 96]
26+
fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27+
fpn_interp_model: nearest
28+
29+
memory_attention:
30+
_target_: sam2.modeling.memory_attention.MemoryAttention
31+
d_model: 256
32+
pos_enc_at_input: true
33+
layer:
34+
_target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35+
activation: relu
36+
dim_feedforward: 2048
37+
dropout: 0.1
38+
pos_enc_at_attn: false
39+
self_attention:
40+
_target_: sam2.modeling.sam.transformer.RoPEAttention
41+
rope_theta: 10000.0
42+
feat_sizes: [64, 64]
43+
embedding_dim: 256
44+
num_heads: 1
45+
downsample_rate: 1
46+
dropout: 0.1
47+
d_model: 256
48+
pos_enc_at_cross_attn_keys: true
49+
pos_enc_at_cross_attn_queries: false
50+
cross_attention:
51+
_target_: sam2.modeling.sam.transformer.RoPEAttention
52+
rope_theta: 10000.0
53+
feat_sizes: [64, 64]
54+
rope_k_repeat: True
55+
embedding_dim: 256
56+
num_heads: 1
57+
downsample_rate: 1
58+
dropout: 0.1
59+
kv_in_dim: 64
60+
num_layers: 4
61+
62+
memory_encoder:
63+
_target_: sam2.modeling.memory_encoder.MemoryEncoder
64+
out_dim: 64
65+
position_encoding:
66+
_target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67+
num_pos_feats: 64
68+
normalize: true
69+
scale: null
70+
temperature: 10000
71+
mask_downsampler:
72+
_target_: sam2.modeling.memory_encoder.MaskDownSampler
73+
kernel_size: 3
74+
stride: 2
75+
padding: 1
76+
fuser:
77+
_target_: sam2.modeling.memory_encoder.Fuser
78+
layer:
79+
_target_: sam2.modeling.memory_encoder.CXBlock
80+
dim: 256
81+
kernel_size: 7
82+
padding: 3
83+
layer_scale_init_value: 1e-6
84+
use_dwconv: True # depth-wise convs
85+
num_layers: 2
86+
87+
num_maskmem: 7
88+
image_size: 1024
89+
# apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90+
sigmoid_scale_for_mem_enc: 20.0
91+
sigmoid_bias_for_mem_enc: -10.0
92+
use_mask_input_as_output_without_sam: true
93+
# Memory
94+
directly_add_no_mem_embed: true
95+
no_obj_embed_spatial: true
96+
# use high-resolution feature map in the SAM mask decoder
97+
use_high_res_features_in_sam: true
98+
# output 3 masks on the first click on initial conditioning frames
99+
multimask_output_in_sam: true
100+
# SAM heads
101+
iou_prediction_use_sigmoid: True
102+
# cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103+
use_obj_ptrs_in_encoder: true
104+
add_tpos_enc_to_obj_ptrs: true
105+
proj_tpos_enc_in_obj_ptrs: true
106+
use_signed_tpos_enc_to_obj_ptrs: true
107+
only_obj_ptrs_in_the_past_for_eval: true
108+
# object occlusion prediction
109+
pred_obj_scores: true
110+
pred_obj_scores_mlp: true
111+
fixed_no_obj_ptr: true
112+
# multimask tracking settings
113+
multimask_output_for_tracking: true
114+
use_multimask_token_for_obj_ptr: true
115+
multimask_min_pt_num: 0
116+
multimask_max_pt_num: 1
117+
use_mlp_for_obj_ptr_proj: true
118+
# Compilation flag
119+
compile_image_encoder: False

0 commit comments

Comments
 (0)