fix(train): allow zero-step training with bias adjustment (#5477)

njzjz-bot · njzjz · web-flow · commit f0cf2b9cf11b · 2026-06-20T15:07:04.000Z
Problem

- `numb_steps=0` is a valid no-optimization path that should save the initial checkpoint.
- When `change_bias_after_training` is enabled, the post-training bias adjustment still ran after zero steps and evaluated learning-rate/checkpoint metadata at step `-1`.

Change

- Skip post-training bias adjustment unless at least one training step has run.
- Keep the existing zero-step initial checkpoint save path for both PyTorch and Paddle backends.
- Add PT/PD regression tests that run zero-step training with `change_bias_after_training=true` and verify the saved `*-0` checkpoint metadata.

Notes

- `python3 -m pytest ...` could not run in this workspace because pytest is not installed in the available Python environment.
- `uvx ruff check deepmd/pd/train/training.py deepmd/pt/train/training.py source/tests/pd/test_training.py source/tests/pt/test_training.py` passed.
- `uvx ruff format --check deepmd/pd/train/training.py deepmd/pt/train/training.py source/tests/pd/test_training.py source/tests/pt/test_training.py` passed.
- Closes #4988.

Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5)

## Summary by CodeRabbit

* **Bug Fixes**
  * Prevented unintended bias-adjustment during zero-step PyTorch training so the initial checkpoint is created and recorded correctly.
* **Refactor**
  * Clarified the post-training bias-adjustment conditional in Paddle for readability (no behavior change).
* **Tests**
  * Added tests for zero-step training with bias-adjustment enabled for both Paddle and PyTorch, verifying initial checkpoint creation and training metadata.

[![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/deepmodeling/deepmd-kit/pull/5477?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)

---------

Co-authored-by: Jinzhe Zeng <jinzhe.zeng@ustc.edu.cn>
diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
@@ -1038,7 +1038,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
             if JIT:
                 break
 
-        if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0):
+        if (
+            self.change_bias_after_training
+            and self.num_steps > self.start_step
+            and (self.rank == 0 or dist.get_rank() == 0)
+        ):
             if not self.multi_task:
                 self.model = model_change_out_bias(
                     self.model,
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -1846,7 +1846,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
             if JIT:
                 break
 
-        if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0):
+        if (
+            self.change_bias_after_training
+            and self.num_steps > self.start_step
+            and (self.rank == 0 or dist.get_rank() == 0)
+        ):
             if not self.multi_task:
                 self.model = model_change_out_bias(
                     self.model,
diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py
@@ -9,8 +9,12 @@
 from pathlib import (
     Path,
 )
+from unittest.mock import (
+    patch,
+)
 
 import numpy as np
+import paddle
 
 from deepmd.pd.entrypoints.main import (
     get_trainer,
@@ -163,6 +167,33 @@ def setUp(self) -> None:
         self.config["training"]["save_freq"] = 1
         enable_prim(True)
 
+    @patch("deepmd.pd.train.training.model_change_out_bias")  # swap out the bias-adjustment helper as referenced by the Paddle training module
+    def test_zero_step_with_change_bias_saves_initial_checkpoint(
+        self, mocked_change_out_bias
+    ) -> None:  # zero-step run with change_bias_after_training=True: expects the "-0" checkpoint and no bias adjustment
+        def keep_model(model, *_args, **_kwargs):  # pass-through stand-in: hands back the model it was given
+            return model
+
+        mocked_change_out_bias.side_effect = keep_model
+        config = deepcopy(self.config)  # copy so the overrides below do not touch self.config
+        config["training"]["numb_steps"] = 0  # request no training steps
+        config["training"]["change_bias_after_training"] = True
+        trainer = get_trainer(config)
+        trainer.run()
+
+        expected_model = Path(trainer.save_ckpt + "-0.pd")  # checkpoint file name carrying the step-0 suffix
+        self.assertEqual(expected_model, trainer.latest_model)
+        self.assertTrue(expected_model.exists())
+        self.assertEqual(
+            expected_model,
+            Path(Path("checkpoint").read_text().strip()),  # the "checkpoint" file in the cwd must name the same path
+        )
+        checkpoint = paddle.load(str(expected_model))
+        train_infos = checkpoint["model"]["_extra_state"]["train_infos"]
+        self.assertEqual(0, train_infos["step"])  # stored step must be 0
+        self.assertEqual(0.0, train_infos["lr"])
+        mocked_change_out_bias.assert_not_called()  # the bias-adjustment helper must never be invoked
+
     def tearDown(self) -> None:
         DPTrainTest.tearDown(self)
 
diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py
@@ -265,6 +265,33 @@ def test_yaml_input(self) -> None:
         )
         self.assertTrue(Path("out.json").exists())
 
+    @patch("deepmd.pt.train.training.model_change_out_bias")  # swap out the bias-adjustment helper as referenced by the PyTorch training module
+    def test_zero_step_with_change_bias_saves_initial_checkpoint(
+        self, mocked_change_out_bias
+    ) -> None:  # zero-step run with change_bias_after_training=True: expects the "-0" checkpoint and no bias adjustment
+        def keep_model(model, *_args, **_kwargs):  # pass-through stand-in: hands back the model it was given
+            return model
+
+        mocked_change_out_bias.side_effect = keep_model
+        config = deepcopy(self.config)  # copy so the overrides below do not touch self.config
+        config["training"]["numb_steps"] = 0  # request no training steps
+        config["training"]["change_bias_after_training"] = True
+        trainer = get_trainer(config)
+        trainer.run()
+
+        expected_model = Path(trainer.save_ckpt + "-0.pt")  # checkpoint file name carrying the step-0 suffix
+        self.assertEqual(expected_model, trainer.latest_model)
+        self.assertTrue(expected_model.exists())
+        self.assertEqual(
+            expected_model,
+            Path(Path("checkpoint").read_text().strip()),  # the "checkpoint" file in the cwd must name the same path
+        )
+        checkpoint = torch.load(expected_model, map_location="cpu", weights_only=True)  # load onto CPU, tensors/plain data only
+        train_infos = checkpoint["model"]["_extra_state"]["train_infos"]
+        self.assertEqual(0, train_infos["step"])  # stored step must be 0
+        self.assertEqual(0.0, train_infos["lr"])
+        mocked_change_out_bias.assert_not_called()  # the bias-adjustment helper must never be invoked
+
     def tearDown(self) -> None:
         DPTrainTest.tearDown(self)
         for ff in ["out.json", "input.yaml"]: