diff --git a/router/src/config.rs b/router/src/config.rs
index 4d5fcfa0639..20791e159a0 100644
--- a/router/src/config.rs
+++ b/router/src/config.rs
@@ -88,7 +88,12 @@ impl LlavaNext {
     pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
         let image_size = self.vision_config.image_size;
         let patch_size = self.vision_config.patch_size;
-        assert!(image_size % patch_size == 0);
+        if image_size % patch_size != 0 {
+            warn!(
+                "Image size {} is not divisible by patch size {}, will round down",
+                image_size, patch_size
+            );
+        }
         let npatches = image_size / patch_size;
         // Dimensions are intentionally swapped to be bug-compatible with
         // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
@@ -271,4 +276,26 @@ mod test {
         let slots = config.get_number_of_features(1067, 1600);
         assert_eq!(slots, 2144);
     }
+
+    #[test]
+    fn test_uneven_division() {
+        let config = LlavaNext {
+            text_config: TextConfig {},
+            vision_config: VisionConfig {
+                image_size: 337, // Intentionally uneven
+                patch_size: 14,
+            },
+            image_grid_pinpoints: vec![
+                (336, 672),
+                (672, 336),
+                (672, 672),
+                (1008, 336),
+                (336, 1008),
+            ],
+        };
+
+        // Should still work even with uneven division
+        let slots = config.get_number_of_features(640, 640);
+        assert_eq!(slots, 2928);
+    }
 }
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index db78341d1ed..b8b3341eac9 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -170,7 +170,10 @@ def get_number_of_features(height: int, width: int, config) -> int:
     image_size = config.vision_config.image_size
     patch_size = config.vision_config.patch_size
 
-    assert image_size % patch_size == 0
+    if image_size % patch_size != 0:
+        logger.warning(
+            f"Image size {image_size} is not divisible by patch size {patch_size}"
+        )
 
     npatches = image_size // patch_size
 
@@ -520,9 +523,9 @@ def forward(
         cuda_graph["input_lengths"].zero_()
         cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
         cuda_graph["cache_lengths"].zero_()
-        cuda_graph["cache_lengths"][
-            : cache_lengths_tensor.shape[0]
-        ] = cache_lengths_tensor
+        cuda_graph["cache_lengths"][: cache_lengths_tensor.shape[0]] = (
+            cache_lengths_tensor
+        )
 
         with self._forward_context(
             block_tables=cuda_graph["block_tables"],