Skip to content

Commit 508df64

Browse files
committed
Remove '224' from DINOv3 model names and enable dynamic_img_size=True by default. Add an assert in the DINOv3 RoPE module to avoid a TorchScript failure (hopefully no more failures after this).
1 parent 2b1e266 commit 508df64

File tree

2 files changed: 62 additions and 78 deletions.

timm/layers/pos_embed_sincos.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1088,6 +1088,7 @@ def get_embed(self, shape: Optional[List[int]] = None) -> torch.Tensor:
10881088
assert self.feat_shape is not None, 'feature shape must be cached on create'
10891089
rope_embed = self._create_embed(self.feat_shape)
10901090
else:
1091+
assert self.pos_embed_cached is not None
10911092
rope_embed = self.pos_embed_cached
10921093

10931094
return rope_embed

timm/models/eva.py

Lines changed: 61 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -1260,6 +1260,25 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
12601260
}
12611261

12621262

1263+
def _dinov3_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
1264+
"""Generate default configuration for DINOv3 models.
1265+
1266+
Args:
1267+
url: Model weights URL.
1268+
**kwargs: Additional configuration parameters.
1269+
1270+
Returns:
1271+
Model configuration dictionary.
1272+
"""
1273+
return {
1274+
'url': url,
1275+
'num_classes': 0, 'input_size': (3, 256, 256), 'pool_size': None,
1276+
'crop_pct': 1.0, 'interpolation': 'bicubic', 'min_input_size': (3, 128, 128),
1277+
'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
1278+
'first_conv': 'patch_embed.proj', 'classifier': 'head',
1279+
'license': 'dinov3', **kwargs
1280+
}
1281+
12631282
default_cfgs = generate_default_cfgs({
12641283

12651284
# EVA 01 CLIP fine-tuned on imagenet-1k
@@ -1614,89 +1633,43 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
16141633

16151634
# DINOv3 weights are under a specific license with redistribution terms, please see
16161635
# https://github.com/facebookresearch/dinov3/blob/main/LICENSE.md
1617-
'vit_small_patch16_dinov3_224.lvdm_1689m': _cfg(
1636+
'vit_small_patch16_dinov3.lvdm_1689m': _dinov3_cfg(
16181637
# hf_hub_id='timm/',
1619-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1620-
crop_pct=1.0,
1621-
num_classes=0,
1622-
license='dinov3',
16231638
),
1624-
'vit_small_patch16_dinov3_qkvb_224.lvdm_1689m': _cfg(
1639+
'vit_small_patch16_dinov3_qkvb.lvdm_1689m': _dinov3_cfg(
16251640
# hf_hub_id='timm/',
1626-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1627-
crop_pct=1.0,
1628-
num_classes=0,
1629-
license='dinov3',
16301641
),
1631-
'vit_small_plus_patch16_dinov3_224.lvdm_1689m': _cfg(
1642+
'vit_small_plus_patch16_dinov3.lvdm_1689m': _dinov3_cfg(
16321643
# hf_hub_id='timm/',
1633-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1634-
crop_pct=1.0,
1635-
num_classes=0,
1636-
license='dinov3',
16371644
),
1638-
'vit_small_plus_patch16_dinov3_qkvb_224.lvdm_1689m': _cfg(
1645+
'vit_small_plus_patch16_dinov3_qkvb.lvdm_1689m': _dinov3_cfg(
16391646
# hf_hub_id='timm/',
1640-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1641-
crop_pct=1.0,
1642-
num_classes=0,
1643-
license='dinov3',
16441647
),
1645-
'vit_base_patch16_dinov3_224.lvdm_1689m': _cfg(
1648+
'vit_base_patch16_dinov3.lvdm_1689m': _dinov3_cfg(
16461649
#hf_hub_id='timm/',
1647-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1648-
crop_pct=1.0,
1649-
num_classes=0,
1650-
license='dinov3',
16511650
),
1652-
'vit_base_patch16_dinov3_qkvb_224.lvdm_1689m': _cfg(
1651+
'vit_base_patch16_dinov3_qkvb.lvdm_1689m': _dinov3_cfg(
16531652
#hf_hub_id='timm/',
1654-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1655-
crop_pct=1.0,
1656-
num_classes=0,
1657-
license='dinov3',
16581653
),
1659-
'vit_large_patch16_dinov3_224.lvdm_1689m': _cfg(
1654+
'vit_large_patch16_dinov3.lvdm_1689m': _dinov3_cfg(
16601655
# hf_hub_id='timm/',
1661-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1662-
crop_pct=1.0,
1663-
num_classes=0,
1664-
license='dinov3',
16651656
),
1666-
'vit_large_patch16_dinov3_qkvb_224.lvdm_1689m': _cfg(
1657+
'vit_large_patch16_dinov3_qkvb.lvdm_1689m': _dinov3_cfg(
16671658
# hf_hub_id='timm/',
1668-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1669-
crop_pct=1.0,
1670-
num_classes=0,
1671-
license='dinov3',
16721659
),
1673-
'vit_large_patch16_dinov3_224.sat_493m': _cfg(
1660+
'vit_large_patch16_dinov3.sat_493m': _dinov3_cfg(
16741661
# hf_hub_id='timm/',
16751662
mean=(0.430, 0.411, 0.296), std=(0.213, 0.156, 0.143),
1676-
crop_pct=1.0,
1677-
num_classes=0,
1678-
license='dinov3',
16791663
),
1680-
'vit_huge_plus_patch16_dinov3_224.lvdm_1689m': _cfg(
1664+
'vit_huge_plus_patch16_dinov3.lvdm_1689m': _dinov3_cfg(
16811665
# hf_hub_id='timm/',
1682-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1683-
crop_pct=1.0,
1684-
num_classes=0,
1685-
license='dinov3',
16861666
),
1687-
'vit_7b_patch16_dinov3_224.lvdm_1689m': _cfg(
1667+
'vit_7b_patch16_dinov3.lvdm_1689m': _dinov3_cfg(
16881668
# hf_hub_id='timm/',
1689-
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
1690-
crop_pct=1.0,
1691-
num_classes=0,
1692-
license='dinov3',
16931669
),
1694-
'vit_7b_patch16_dinov3_224.sat_493m': _cfg(
1670+
'vit_7b_patch16_dinov3.sat_493m': _dinov3_cfg(
16951671
# hf_hub_id='timm/',
16961672
mean=(0.430, 0.411, 0.296), std=(0.213, 0.156, 0.143),
1697-
crop_pct=1.0,
1698-
num_classes=0,
1699-
license='dinov3',
17001673
),
17011674

17021675
})
@@ -2640,9 +2613,10 @@ def vit_large_patch16_rope_mixed_ape_224(pretrained: bool = False, **kwargs) ->
26402613

26412614

26422615
@register_model
2643-
def vit_small_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
2616+
def vit_small_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
26442617
model_args = dict(
26452618
patch_size=16,
2619+
dynamic_img_size=True,
26462620
embed_dim=384,
26472621
depth=12,
26482622
num_heads=6,
@@ -2658,14 +2632,15 @@ def vit_small_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
26582632
use_fc_norm=False,
26592633
norm_layer=partial(LayerNorm, eps=1e-5),
26602634
)
2661-
model = _create_eva('vit_small_patch16_dinov3_224', pretrained=pretrained, **dict(model_args, **kwargs))
2635+
model = _create_eva('vit_small_patch16_dinov3', pretrained=pretrained, **dict(model_args, **kwargs))
26622636
return model
26632637

26642638

26652639
@register_model
2666-
def vit_small_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva:
2640+
def vit_small_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
26672641
model_args = dict(
26682642
patch_size=16,
2643+
dynamic_img_size=True,
26692644
embed_dim=384,
26702645
depth=12,
26712646
num_heads=6,
@@ -2681,14 +2656,15 @@ def vit_small_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva
26812656
use_fc_norm=False,
26822657
norm_layer=partial(LayerNorm, eps=1e-5),
26832658
)
2684-
model = _create_eva('vit_small_patch16_dinov3_qkvb_224', pretrained=pretrained, **dict(model_args, **kwargs))
2659+
model = _create_eva('vit_small_patch16_dinov3_qkvb', pretrained=pretrained, **dict(model_args, **kwargs))
26852660
return model
26862661

26872662

26882663
@register_model
2689-
def vit_small_plus_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
2664+
def vit_small_plus_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
26902665
model_args = dict(
26912666
patch_size=16,
2667+
dynamic_img_size=True,
26922668
embed_dim=384,
26932669
depth=12,
26942670
num_heads=6,
@@ -2706,14 +2682,15 @@ def vit_small_plus_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva
27062682
use_fc_norm=False,
27072683
norm_layer=partial(LayerNorm, eps=1e-5),
27082684
)
2709-
model = _create_eva('vit_small_plus_patch16_dinov3_224', pretrained=pretrained, **dict(model_args, **kwargs))
2685+
model = _create_eva('vit_small_plus_patch16_dinov3', pretrained=pretrained, **dict(model_args, **kwargs))
27102686
return model
27112687

27122688

27132689
@register_model
2714-
def vit_small_plus_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva:
2690+
def vit_small_plus_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
27152691
model_args = dict(
27162692
patch_size=16,
2693+
dynamic_img_size=True,
27172694
embed_dim=384,
27182695
depth=12,
27192696
num_heads=6,
@@ -2731,14 +2708,15 @@ def vit_small_plus_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -
27312708
use_fc_norm=False,
27322709
norm_layer=partial(LayerNorm, eps=1e-5),
27332710
)
2734-
model = _create_eva('vit_small_plus_patch16_dinov3_qkvb_224', pretrained=pretrained, **dict(model_args, **kwargs))
2711+
model = _create_eva('vit_small_plus_patch16_dinov3_qkvb', pretrained=pretrained, **dict(model_args, **kwargs))
27352712
return model
27362713

27372714

27382715
@register_model
2739-
def vit_base_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
2716+
def vit_base_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
27402717
model_args = dict(
27412718
patch_size=16,
2719+
dynamic_img_size=True,
27422720
embed_dim=768,
27432721
depth=12,
27442722
num_heads=12,
@@ -2754,15 +2732,16 @@ def vit_base_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
27542732
use_fc_norm=False,
27552733
norm_layer=partial(LayerNorm, eps=1e-5),
27562734
)
2757-
model = _create_eva('vit_base_patch16_dinov3_224', pretrained=pretrained, **dict(model_args, **kwargs))
2735+
model = _create_eva('vit_base_patch16_dinov3', pretrained=pretrained, **dict(model_args, **kwargs))
27582736
return model
27592737

27602738

27612739
@register_model
2762-
def vit_base_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva:
2740+
def vit_base_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
27632741
# DINOv3 Base variant w/ qkv_bias enabled (zero'd in weights)
27642742
model_args = dict(
27652743
patch_size=16,
2744+
dynamic_img_size=True,
27662745
embed_dim=768,
27672746
depth=12,
27682747
num_heads=12,
@@ -2778,14 +2757,15 @@ def vit_base_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva:
27782757
use_fc_norm=False,
27792758
norm_layer=partial(LayerNorm, eps=1e-5),
27802759
)
2781-
model = _create_eva('vit_base_patch16_dinov3_qkvb_224', pretrained=pretrained, **dict(model_args, **kwargs))
2760+
model = _create_eva('vit_base_patch16_dinov3_qkvb', pretrained=pretrained, **dict(model_args, **kwargs))
27822761
return model
27832762

27842763

27852764
@register_model
2786-
def vit_large_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
2765+
def vit_large_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
27872766
model_args = dict(
27882767
patch_size=16,
2768+
dynamic_img_size=True,
27892769
embed_dim=1024,
27902770
depth=24,
27912771
num_heads=16,
@@ -2801,14 +2781,15 @@ def vit_large_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
28012781
use_fc_norm=False,
28022782
norm_layer=partial(LayerNorm, eps=1e-5),
28032783
)
2804-
model = _create_eva('vit_large_patch16_dinov3_224', pretrained=pretrained, **dict(model_args, **kwargs))
2784+
model = _create_eva('vit_large_patch16_dinov3', pretrained=pretrained, **dict(model_args, **kwargs))
28052785
return model
28062786

28072787

28082788
@register_model
2809-
def vit_large_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva:
2789+
def vit_large_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
28102790
model_args = dict(
28112791
patch_size=16,
2792+
dynamic_img_size=True,
28122793
embed_dim=768,
28132794
depth=24,
28142795
num_heads=16,
@@ -2824,14 +2805,15 @@ def vit_large_patch16_dinov3_qkvb_224(pretrained: bool = False, **kwargs) -> Eva
28242805
use_fc_norm=False,
28252806
norm_layer=partial(LayerNorm, eps=1e-5),
28262807
)
2827-
model = _create_eva('vit_large_patch16_dinov3_qkvb_224', pretrained=pretrained, **dict(model_args, **kwargs))
2808+
model = _create_eva('vit_large_patch16_dinov3_qkvb', pretrained=pretrained, **dict(model_args, **kwargs))
28282809
return model
28292810

28302811

28312812
@register_model
2832-
def vit_huge_plus_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
2813+
def vit_huge_plus_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
28332814
model_args = dict(
28342815
patch_size=16,
2816+
dynamic_img_size=True,
28352817
embed_dim=1280,
28362818
depth=32,
28372819
num_heads=20,
@@ -2850,14 +2832,15 @@ def vit_huge_plus_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
28502832
norm_layer=partial(LayerNorm, eps=1e-5),
28512833
)
28522834

2853-
model = _create_eva('vit_huge_plus_patch16_dinov3_224', pretrained=pretrained, **dict(model_args, **kwargs))
2835+
model = _create_eva('vit_huge_plus_patch16_dinov3', pretrained=pretrained, **dict(model_args, **kwargs))
28542836
return model
28552837

28562838

28572839
@register_model
2858-
def vit_7b_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
2840+
def vit_7b_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
28592841
model_args = dict(
28602842
patch_size=16,
2843+
dynamic_img_size=True,
28612844
embed_dim=4096,
28622845
depth=40,
28632846
num_heads=32,
@@ -2877,5 +2860,5 @@ def vit_7b_patch16_dinov3_224(pretrained: bool = False, **kwargs) -> Eva:
28772860
norm_layer=partial(LayerNorm, eps=1e-5),
28782861
)
28792862

2880-
model = _create_eva('vit_7b_patch16_dinov3_224', pretrained=pretrained, **dict(model_args, **kwargs))
2863+
model = _create_eva('vit_7b_patch16_dinov3', pretrained=pretrained, **dict(model_args, **kwargs))
28812864
return model

0 commit comments

Comments (0)