
Commit 2a15001 · committed Jan 16, 2025 · 1 parent 5b094a8

support distributed checkpoint io

File tree

7 files changed: +781 −14 lines

‎colossalai/booster/plugin/hybrid_parallel_plugin.py

+3
@@ -78,6 +78,9 @@ def __init__(
         self.require_grad_sync = True
         self.overlap_allgather = overlap_allgather
         self.use_fp8 = use_fp8
+        self.param_origin_shape = {}
+        for name, param in module.named_parameters():
+            self.param_origin_shape[name] = param.shape

         shardformer = ShardFormer(shard_config)
         if custom_policy is not None:
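The original (unsharded) parameter shapes are recorded before ShardFormer rewrites the module, presumably so the distributed checkpoint loader can reassemble full tensors later. A minimal sketch of that idea, under that assumption (the helper below is hypothetical and not part of this commit):

import torch
from typing import List

def restore_full_param(shards: List[torch.Tensor], origin_shape: torch.Size) -> torch.Tensor:
    # Concatenate flattened shards, then view the result back to the shape
    # captured in `param_origin_shape` before sharding. Padding added by the
    # sharder (if any) is trimmed by slicing to the original element count.
    flat = torch.cat([s.reshape(-1) for s in shards])
    return flat[: origin_shape.numel()].reshape(origin_shape)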

‎colossalai/checkpoint_io/__init__.py

+2
@@ -1,6 +1,7 @@
 from .checkpoint_io_base import CheckpointIO
 from .general_checkpoint_io import GeneralCheckpointIO
 from .hybrid_parallel_checkpoint_io import HybridParallelCheckpointIO
+from .distributed_checkpoint_io import DistributedCheckpointIO
 from .index_file import CheckpointIndexFile
 from .moe_checkpoint import MoECheckpointIO

@@ -10,4 +11,5 @@
     "GeneralCheckpointIO",
     "HybridParallelCheckpointIO",
     "MoECheckpointIO",
+    "DistributedCheckpointIO",
 ]
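With the export in place, a Booster's default checkpoint IO can be swapped for the distributed one. The snippet below is adapted from the test added in this commit; the plugin arguments are just one of the tested configurations:

from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.checkpoint_io import DistributedCheckpointIO

plugin = HybridParallelPlugin(tp_size=2, pp_size=1, zero_stage=1, precision="fp16", initial_scale=1)
booster = Booster(plugin=plugin)

# Reuse the process groups already set up by the hybrid-parallel checkpoint IO.
hybrid_ckp = booster.checkpoint_io
booster.checkpoint_io = DistributedCheckpointIO(
    hybrid_ckp.global_dp_group, hybrid_ckp.pp_group, hybrid_ckp.tp_group, hybrid_ckp.sp_group, hybrid_ckp.use_zero
)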

‎colossalai/checkpoint_io/distributed_checkpoint_io.py

+633
Large diffs are not rendered by default.
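The 633-line implementation is not rendered on this page. Judging only from how the new test uses it, the class takes the data-, pipeline-, tensor- and sequence-parallel process groups plus a ZeRO flag, and exposes _sync_d2h / _sync_io to flush asynchronous saves. A rough interface sketch inferred from that usage, not the actual code:

import torch.distributed as dist

class _DistributedCheckpointIOSketch:
    """Rough stand-in for colossalai.checkpoint_io.DistributedCheckpointIO (names inferred)."""

    def __init__(self, dp_group: dist.ProcessGroup, pp_group: dist.ProcessGroup,
                 tp_group: dist.ProcessGroup, sp_group: dist.ProcessGroup, use_zero: bool):
        self.dp_group, self.pp_group, self.tp_group, self.sp_group = dp_group, pp_group, tp_group, sp_group
        self.use_zero = use_zero

    def _sync_d2h(self) -> None:
        # Presumably blocks until asynchronous device-to-host copies of shards finish.
        ...

    def _sync_io(self) -> None:
        # Presumably blocks until background writers have flushed shard files to disk.
        ...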

‎colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py

+5-5
@@ -126,7 +126,7 @@ def _model_sharder(
             buffer = buf if keep_vars else buf.detach()
             if pinned_state_dicts is not None:
                 if (prefix + name) not in pinned_state_dicts:
-                    pinned_state_dicts[prefix + name] = torch.empty_like(param_, pin_memory=True, device="cpu")
+                    pinned_state_dicts[prefix + name] = torch.empty_like(buffer, pin_memory=True, device="cpu")
                 pinned_state_dicts[prefix + name].copy_(buffer)
                 buffer = pinned_state_dicts[prefix + name]
             block, block_size = state_dict_sharder.append_param(prefix + name, buffer)

@@ -142,7 +142,7 @@ def _model_sharder(
         extra_state = model.get_extra_state()
         if pinned_state_dicts is not None:
             if extra_state_key not in pinned_state_dicts:
-                pinned_state_dicts[extra_state_key] = torch.empty_like(param_, pin_memory=True, device="cpu")
+                pinned_state_dicts[extra_state_key] = torch.empty_like(extra_state, pin_memory=True, device="cpu")
             pinned_state_dicts[extra_state_key].copy_(extra_state)
             extra_state = pinned_state_dicts[extra_state_key]
         block, block_size = state_dict_sharder.append_param(extra_state_key, extra_state)

@@ -298,9 +298,9 @@ def save_sharded_model(
         Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True)

         # Manage filenames of sharded weights and index file for each pipeline stage.
-        weights_name = weights_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-shard.bin")
-        weights_name = weights_name.replace(".safetensors", f"-stage-{self.pp_rank+1:05d}-shard.safetensors")
-        save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}.json")
+        weights_name = weights_name.replace(".bin", f"-stage-{self.pp_rank:05d}-shard.bin")
+        weights_name = weights_name.replace(".safetensors", f"-stage-{self.pp_rank:05d}-shard.safetensors")
+        save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank:05d}.json")
         save_index_file = os.path.join("tmp_index_files", save_index_file)
         if use_async:
             total_size, writers = async_save_state_dict_shards(
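The first two hunks fix the pinned staging buffer to match the tensor actually being copied (the buffer or extra_state, rather than the unrelated param_ from an earlier loop); the third makes the stage suffix 0-based. A small standalone illustration of the pinned-copy pattern used here (my own sketch, not code from the commit):

import torch

def stage_to_pinned(src: torch.Tensor, cache: dict, key: str) -> torch.Tensor:
    # Allocate a pinned CPU buffer shaped like the source tensor on first use, then
    # reuse it across saves. This sketch copies synchronously; the real sharder can
    # issue these copies asynchronously and flush them later.
    if key not in cache:
        cache[key] = torch.empty_like(src, pin_memory=True, device="cpu")
    cache[key].copy_(src)
    return cache[key]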

‎colossalai/checkpoint_io/utils.py

+5-8
@@ -854,14 +854,11 @@ def has_index_file(checkpoint_path: str) -> Tuple[bool, Optional[Path]]:
         # check if there is only one file ending with .index.json in this directory
         index_files = list(checkpoint_path.glob("*.index.*json"))

-        # if we found a .index.json file, make sure there is only one
-        if len(index_files) > 0:
-            assert (
-                len(index_files) == 1
-            ), f"Expected to find one .index.json file in {checkpoint_path}, but found {len(index_files)}"
-
         if len(index_files) == 1:
             return True, index_files[0]
+        elif len(index_files) > 1:
+            # Used for distributed checkpoint IO, where the metadata is stored across multiple files.
+            return True, checkpoint_path
         else:
             return False, None
     else:

@@ -943,8 +940,8 @@ def get_shard_filename(weights_name: str, idx: int):
     """
     get shard file name
     """
-    shard_file = weights_name.replace(".bin", f"-{idx+1:05d}.bin")
-    shard_file = shard_file.replace(".safetensors", f"-{idx+1:05d}.safetensors")
+    shard_file = weights_name.replace(".bin", f"-{idx:05d}.bin")
+    shard_file = shard_file.replace(".safetensors", f"-{idx:05d}.safetensors")
     return shard_file
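With the strict assert removed, a directory holding several *.index.*json files (one per rank, as the distributed checkpoint IO writes them) is now treated as a valid checkpoint and the directory itself is returned. The shard-name change simply makes the numeric suffix 0-based; a standalone restatement for illustration:

# Standalone copy of the updated helper, for illustration only.
def get_shard_filename(weights_name: str, idx: int) -> str:
    shard_file = weights_name.replace(".bin", f"-{idx:05d}.bin")
    return shard_file.replace(".safetensors", f"-{idx:05d}.safetensors")

assert get_shard_filename("model.safetensors", 0) == "model-00000.safetensors"  # previously model-00001.safetensors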

‎colossalai/shardformer/layer/parallel_module.py

-1
@@ -120,7 +120,6 @@ def _load_from_state_dict(
                     "received {}".format(key, type(input_param))
                 )
                 continue
-
             if is_distributed_tensor(param):
                 # shard the input param
                 device_mesh = get_device_mesh(param)
(New test file, +133; filename not shown in the rendered diff)

@@ -0,0 +1,133 @@
import pytest
import torch
import torch.distributed as dist
from packaging.version import Version
from torch.optim import Adam
from utils import shared_tempdir
from copy import deepcopy

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.d_tensor.api import clear_layout_converter
from colossalai.checkpoint_io import DistributedCheckpointIO
from colossalai.testing import (
    assert_close_loose,
    check_state_dict_equal,
    clear_cache_before_run,
    parameterize,
    rerun_if_address_is_in_use,
    spawn,
)
from tests.kit.model_zoo import model_zoo


TEST_CONFIGS = [
    (
        {"tp_size": 1, "pp_size": 2, "num_microbatches": 4, "zero_stage": 1, "precision": "fp16", "initial_scale": 1},
        {"tp_size": 2, "pp_size": 1, "num_microbatches": 4, "zero_stage": 1, "precision": "fp16", "initial_scale": 1},
    )
]


@parameterize("shard", [False, True])
@parameterize("model_name", ["transformers_llama_for_causal_lm"])
@parameterize("size_per_shard", [1])
@parameterize("test_config", TEST_CONFIGS)
@parameterize("use_async", [False, True])
@parameterize("low_cpu_mem_mode", [False, True])
@clear_cache_before_run()
def exam_state_dict(
    shard: bool, model_name: str, size_per_shard: int, test_config: dict, use_async: bool, low_cpu_mem_mode: bool
):
    (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
        iter(model_zoo.get_sub_registry(model_name).values())
    )
    criterion = loss_fn
    test_config_0, test_config_1 = test_config
    plugin_0 = HybridParallelPlugin(**test_config_0)
    booster_0 = Booster(plugin=plugin_0)
    hybrid_ckp_0 = booster_0.checkpoint_io
    booster_0.checkpoint_io = DistributedCheckpointIO(
        hybrid_ckp_0.global_dp_group, hybrid_ckp_0.pp_group, hybrid_ckp_0.tp_group, hybrid_ckp_0.sp_group, hybrid_ckp_0.use_zero
    )

    def _criterion(outputs, inputs):
        outputs = output_transform_fn(outputs)
        loss = criterion(outputs)
        return loss

    def _preprocess_data(data):
        if booster_0.plugin.stage_manager is not None:
            for k, v in data.items():
                if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
                    new_shape = [1] * v.dim()
                    new_shape[0] = 4
                    data[k] = v.to("cuda").repeat(*new_shape)
            return iter([data])
        else:
            return {k: v.cuda() for k, v in data.items()}

    model_0 = model_fn().cuda()
    optimizer_0 = Adam(model_0.parameters(), lr=1e-3)
    model_0, optimizer_0, criterion, _, _ = booster_0.boost(model_0, optimizer_0, criterion)

    data = data_gen_fn()
    model_0.train()
    if booster_0.plugin.stage_manager is not None:
        booster_0.execute_pipeline(_preprocess_data(data), model_0, _criterion, optimizer_0, return_loss=True)
    else:
        output = model_0(**_preprocess_data(data))
        loss = criterion(output)
        optimizer_0.backward(loss)

    optimizer_0.step()
    optimizer_0.zero_grad()
    with shared_tempdir() as tempdir:
        model_ckpt_path_0 = f"{tempdir}/model_0"

        booster_0.save_model(model_0, model_ckpt_path_0, shard=shard, size_per_shard=size_per_shard, use_async=use_async)
        booster_0.checkpoint_io._sync_d2h()
        booster_0.checkpoint_io._sync_io()
        dist.barrier()

        plugin_1 = HybridParallelPlugin(**test_config_1)
        booster_1 = Booster(plugin=plugin_1)
        hybrid_ckp_1 = booster_1.checkpoint_io
        booster_1.checkpoint_io = DistributedCheckpointIO(
            hybrid_ckp_1.global_dp_group, hybrid_ckp_1.pp_group, hybrid_ckp_1.tp_group, hybrid_ckp_1.sp_group, hybrid_ckp_1.use_zero
        )

        model_1 = model_fn().cuda()
        optimizer_1 = Adam(model_1.parameters(), lr=1e-3)
        model_1, optimizer_1, criterion, _, _ = booster_1.boost(model_1, optimizer_1, criterion)

        booster_1.load_model(model_1, model_ckpt_path_0, low_cpu_mem_mode=low_cpu_mem_mode)

        model_ckpt_path_1 = f"{tempdir}/model_1"
        booster_1.save_model(model_1, model_ckpt_path_1, shard=shard, size_per_shard=size_per_shard, use_async=use_async)
        booster_1.checkpoint_io._sync_d2h()
        booster_1.checkpoint_io._sync_io()
        dist.barrier()

        model_2 = model_fn().cuda()
        optimizer_2 = Adam(model_2.parameters(), lr=1e-3)
        model_2, optimizer_2, criterion, _, _ = booster_0.boost(model_2, optimizer_2, criterion)

        booster_0.load_model(model_2, model_ckpt_path_1, low_cpu_mem_mode=low_cpu_mem_mode)
        check_state_dict_equal(model_0.unwrap().state_dict(), model_2.unwrap().state_dict())

    dist.barrier()
    Randomizer.reset_index()
    clear_layout_converter()


def run_dist(rank, world_size, port):
    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    exam_state_dict()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
def test_hybrid_ckpIO(world_size):
    spawn(run_dist, world_size)


if __name__ == "__main__":
    test_hybrid_ckpIO(4)
