|
@@ -5,22 +5,23 @@
 import traceback
 
 import numpy as np
-import requests
 from absl import app
 from absl import flags
 from keras import ops
 from transformers import AutoTokenizer
 from transformers import MistralForCausalLM
 
 from keras_hub.models import MistralBackbone
+from keras_hub.models import MistralCausalLM
 from keras_hub.models import MistralCausalLMPreprocessor
 from keras_hub.models import MistralTokenizer
-from keras_hub.utils.preset_utils import save_to_preset
 
 PRESET_MAP = {
     "mistral_7b_en": "mistralai/Mistral-7B-v0.1",
+    "mistral_0.3_7b_en": "mistralai/Mistral-7B-v0.3",
     "mistral_instruct_7b_en": "mistralai/Mistral-7B-Instruct-v0.1",
     "mistral_0.2_instruct_7b_en": "mistralai/Mistral-7B-Instruct-v0.2",
+    "mistral_0.3_instruct_7b_en": "mistralai/Mistral-7B-Instruct-v0.3",
 }
 
 FLAGS = flags.FLAGS
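With the tokenizer change in the hunk below, the manual download via `requests` is gone: `MistralTokenizer.from_preset` can pull the vocabulary straight from the Hugging Face Hub using an `hf://` handle. A minimal sketch of that loading path, using one of the newly added v0.3 checkpoints as an illustrative handle:

```python
from keras_hub.models import MistralTokenizer

# The "hf://" prefix tells KerasHub to resolve the handle against the
# Hugging Face Hub rather than the built-in preset registry.
tokenizer = MistralTokenizer.from_preset("hf://mistralai/Mistral-7B-v0.3")

# Round-trip a string as a quick sanity check of the vocabulary.
token_ids = tokenizer("The quick brown fox")
print(tokenizer.detokenize(token_ids))
```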
@@ -236,49 +237,43 @@ def main(_):
             rope_max_wavelength=hf_model.config.rope_theta,
             dtype="float32",
         )
-        keras_hub_model = MistralBackbone(**backbone_kwargs)
+        keras_hub_backbone = MistralBackbone(**backbone_kwargs)
 
-        # === Download the tokenizer from Huggingface model card ===
-        spm_path = (
-            f"https://huggingface.co/{hf_preset}/resolve/main/tokenizer.model"
-        )
-        response = requests.get(spm_path)
-        if not response.ok:
-            raise ValueError(f"Couldn't fetch {preset}'s tokenizer.")
-        tokenizer_path = os.path.join(temp_dir, "vocabulary.spm")
-        with open(tokenizer_path, "wb") as tokenizer_file:
-            tokenizer_file.write(response.content)
-        keras_hub_tokenizer = MistralTokenizer(tokenizer_path)
+        keras_hub_tokenizer = MistralTokenizer.from_preset(f"hf://{hf_preset}")
         print("\n-> Keras 3 model and tokenizer loaded.")
 
         # === Port the weights ===
-        convert_checkpoints(keras_hub_model, hf_model)
+        convert_checkpoints(keras_hub_backbone, hf_model)
         print("\n-> Weight transfer done.")
 
         # === Check that the models and tokenizers outputs match ===
         test_tokenizer(keras_hub_tokenizer, hf_tokenizer)
-        test_model(keras_hub_model, keras_hub_tokenizer, hf_model, hf_tokenizer)
+        test_model(
+            keras_hub_backbone, keras_hub_tokenizer, hf_model, hf_tokenizer
+        )
         print("\n-> Tests passed!")
 
         # === Save the model weights in float32 format ===
-        keras_hub_model.save_weights(os.path.join(temp_dir, "model.weights.h5"))
+        keras_hub_backbone.save_weights(
+            os.path.join(temp_dir, "model.weights.h5")
+        )
         print("\n-> Saved the model weights in float32")
 
-        del keras_hub_model, hf_model
+        del keras_hub_backbone, hf_model
        gc.collect()
 
         # === Save the weights again in float16 ===
         backbone_kwargs["dtype"] = "float16"
-        keras_hub_model = MistralBackbone(**backbone_kwargs)
-        keras_hub_model.load_weights(os.path.join(temp_dir, "model.weights.h5"))
-        save_to_preset(keras_hub_model, preset)
+        keras_hub_backbone = MistralBackbone(**backbone_kwargs)
+        keras_hub_backbone.load_weights(
+            os.path.join(temp_dir, "model.weights.h5")
+        )
+
+        preprocessor = MistralCausalLMPreprocessor(keras_hub_tokenizer)
+        keras_hub_model = MistralCausalLM(keras_hub_backbone, preprocessor)
+        keras_hub_model.save_to_preset(f"./{preset}")
         print("\n-> Saved the model preset in float16")
 
-        # === Save the tokenizer ===
-        save_to_preset(
-            keras_hub_tokenizer, preset, config_filename="tokenizer.json"
-        )
-        print("\n-> Saved the tokenizer")
     finally:
         shutil.rmtree(temp_dir)
 
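Instead of calling the standalone `save_to_preset` utility on the backbone and tokenizer separately, the script now wraps the float16 backbone in a `MistralCausalLM` task with its preprocessor and saves everything through the model's own `save_to_preset` method, so the output directory is a single self-contained preset. A minimal sketch of reloading and exercising the converted preset; the local path and prompt are illustrative assumptions (the path matches whatever `--preset` value was used for the conversion):

```python
from keras_hub.models import MistralCausalLM

# Load the task model back from the directory the conversion script wrote.
causal_lm = MistralCausalLM.from_preset("./mistral_7b_en")

# The bundled preprocessor lets generation run directly on raw strings.
print(causal_lm.generate("Keras is a", max_length=30))
```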
|
|