25 changes: 25 additions & 0 deletions examples/train/tiled_mlp/fsdp2.json
@@ -0,0 +1,25 @@
{
"compute_environment": "LOCAL_MACHINE",
"debug": false,
"distributed_type": "FSDP",
"downcast_bf16": "no",
"fsdp_config": {
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_cpu_ram_efficient_loading": true,
"fsdp_reshard_after_forward": true,
"fsdp_state_dict_type": "FULL_STATE_DICT",
"fsdp_activation_checkpointing": true,
"fsdp_version": 2
},
"machine_rank": 0,
"main_training_function": "main",
"mixed_precision": "bf16",
"num_machines": 1,
"num_processes": 2,
"rdzv_backend": "static",
"same_network": true,
"tpu_env": [],
"tpu_use_cluster": false,
"tpu_use_sudo": false,
"use_cpu": false
}
24 changes: 24 additions & 0 deletions examples/train/tiled_mlp/train_deepspeed.sh
@@ -0,0 +1,24 @@
CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=2 \
swift sft \
--model Qwen/Qwen3-4B \
--dataset swift/self-cognition#200 \
--train_type full \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--use_tiled_mlp true \
--tiled_mlp_num_shards 4 \
--deepspeed zero3
30 changes: 30 additions & 0 deletions examples/train/tiled_mlp/train_fsdp2.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# FSDP2 training with tiled MLP.
# Requires an accelerate config with "fsdp_version": 2 -- use the fsdp2.json in this
# directory or the one in examples/train/multi-gpu/fsdp2_lora/.
# Activation checkpointing is enabled via fsdp_activation_checkpointing in the
# accelerate config, so --gradient_checkpointing is set to false here.

accelerate launch --config_file fsdp2.json \
-m swift sft \
--model Qwen/Qwen3-4B \
--dataset swift/self-cognition#200 \
--train_type full \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 4 \
--learning_rate 1e-5 \
--gradient_checkpointing false \
--weight_decay 0.1 \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 1 \
--max_length 2048 \
--output_dir output \
--system 'You are a helpful assistant.' \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--use_tiled_mlp true \
--tiled_mlp_num_shards 4
4 changes: 4 additions & 0 deletions swift/llm/train/sft.py
@@ -51,6 +51,10 @@ def _prepare_generation_config(self):
    @RayHelper.function(group='default')
    def _prepare_model_tokenizer(self, **kwargs):
        args = self.args
        # Apply tiled MLP before model instantiation
        if getattr(args, 'use_tiled_mlp', False):
            from swift.plugin.tiled_mlp import apply_tiled_mlp
            apply_tiled_mlp(args.model_type, num_shards=getattr(args, 'tiled_mlp_num_shards', None))
        self.model, self.processor = args.get_model_processor(**kwargs)
        if args.sequence_parallel_size > 1:
            from swift.trainers.sequence_parallel import sequence_parallel
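The `swift/plugin/tiled_mlp.py` module itself is not shown in this diff. As a rough illustration of the technique these flags enable, below is a minimal, hypothetical sketch of a tiled SwiGLU MLP, assuming the plugin splits the sequence dimension into `num_shards` chunks and recomputes each chunk during backward so that only one chunk's intermediate activations are resident at a time. The class name matches the export added to `swift/plugin/__init__.py`, but the body here is an assumption, not the PR's actual implementation.

```python
# Hypothetical sketch only -- not the code added by this PR.
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class TiledSwiGLUMLP(nn.Module):
    """SwiGLU MLP whose forward runs in sequence-dimension shards.

    Tiling bounds the size of the (batch, seq, intermediate) activation:
    each shard is recomputed during backward, so at most one shard's
    intermediates are live at a time.
    """

    def __init__(self, hidden_size: int, intermediate_size: int, num_shards: int = 4):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()
        self.num_shards = num_shards

    def _mlp(self, shard: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act_fn(self.gate_proj(shard)) * self.up_proj(shard))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shards = x.chunk(self.num_shards, dim=1)  # split along the sequence dimension
        # checkpoint() recomputes each shard's intermediates in the backward pass.
        outputs = [checkpoint(self._mlp, shard, use_reentrant=False) for shard in shards]
        return torch.cat(outputs, dim=1)
```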
3 changes: 3 additions & 0 deletions swift/plugin/__init__.py
@@ -17,6 +17,7 @@
    from .rm_plugin import rm_plugins
    from .env import envs, Env
    from .context_manager import context_managers, ContextManager
    from .tiled_mlp import (TiledSwiGLUMLP, apply_tiled_mlp, is_fsdp2_enabled, is_fsdp1_enabled, get_tiled_mlp_mode)

else:
    _import_structure = {
@@ -34,6 +35,8 @@
        'rm_plugin': ['rm_plugins'],
        'env': ['envs', 'Env'],
        'context_manager': ['context_managers', 'ContextManager'],
        'tiled_mlp':
        ['TiledSwiGLUMLP', 'apply_tiled_mlp', 'is_fsdp2_enabled', 'is_fsdp1_enabled', 'get_tiled_mlp_mode'],
    }

    import sys
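A hypothetical usage of the new exports. The `apply_tiled_mlp` signature is inferred from the `sft.py` change above; the helper names come from this import list, but their exact behavior and return values are not shown in the diff, and the `'qwen3'` model type is an illustrative stand-in for `args.model_type`.

```python
# Hypothetical usage -- signatures inferred from this PR, not confirmed by it.
from swift.plugin import apply_tiled_mlp, is_fsdp2_enabled

# Patch the model's MLP implementation before the model is instantiated,
# mirroring the call added in swift/llm/train/sft.py.
apply_tiled_mlp('qwen3', num_shards=4)  # 'qwen3' stands in for args.model_type

# Presumably reports whether the current run was launched under FSDP2.
if is_fsdp2_enabled():
    print('FSDP2 detected; tiled MLP will run under fsdp_version 2.')
```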