Jianbing-D
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 12 additions & 5 deletions b/‎.github/CODEOWNERS‎
Lines changed: 12 additions & 5 deletions
diff --git a/‎.github/actions/action.yml‎
Lines changed: 18 additions & 7 deletions b/‎.github/actions/action.yml‎
Lines changed: 18 additions & 7 deletions
diff --git a/‎.github/copy-pr-bot.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.github/copy-pr-bot.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/pull_request_template.md‎
Lines changed: 64 additions & 0 deletions b/‎.github/pull_request_template.md‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎.github/workflows/_build_test_publish_wheel.yml‎
Lines changed: 156 additions & 0 deletions b/‎.github/workflows/_build_test_publish_wheel.yml‎
Lines changed: 156 additions & 0 deletions
@@ -1,31 +1,38 @@
-megatron/core @NVIDIA/core-adlr @NVIDIA/core-nemo
+# Order matters: the last matching pattern wins, so a more specific path
+# must come after the broader directory rule that also matches it.
+megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo
 
 megatron/core/models/gpt/ @NVIDIA/gpt
 
 megatron/core/models/multimodal/ @NVIDIA/multi-modal
 
 megatron/core/models/mamba/ @NVIDIA/hybrid-mamba
 
+megatron/core/distributed/fsdp/ @NVIDIA/megatron-fsdp
+
 megatron/core/dist_checkpointing/ @NVIDIA/dist-checkpointing
 
 megatron/core/optimizer/distrib_optimizer/ @NVIDIA/dist-optimizer
 
-megatron/core/inference/modelopt_support @NVIDIA/quantization-and-inference
-
-# megatron/core/datasets/ @NVIDIA/datasets
+megatron/core/datasets/ @NVIDIA/datasets
 
 megatron/core/pipeline_parallel/ @NVIDIA/pipeline-parallelism
 
 megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo
 
 megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-devtech
 
-# megatron/core/inference/ @NVIDIA/inference
+megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/megatron-fsdp
+
+megatron/core/inference/ @NVIDIA/inference
+
+megatron/core/inference/modelopt_support @NVIDIA/quantization-and-inference
 
 megatron/core/parallel_state.py @NVIDIA/core-nemo
 
 megatron/core/post_training/ @NVIDIA/post-training
-megatron/post_training @NVIDIA/post-training
+
+megatron/post_training/ @NVIDIA/post-training
 
 .gitlab/ @NVIDIA/ci
 .github/ @NVIDIA/ci
@@ -44,4 +51,4 @@ tests/unit_tests/ @NVIDIA/ci
 megatron/rl/ @NVIDIA/reinforcement-learning
 examples/rl/ @NVIDIA/reinforcement-learning
 test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
-train_rl.py @NVIDIA/reinforcement-learning
+train_rl.py @NVIDIA/reinforcement-learning
@@ -78,19 +78,19 @@ runs:
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
         pip install --no-cache-dir uv
-        uv sync --only-group test 
+        uv sync --only-group test
         uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
           --scope unit-tests \
           --model unit-tests \
-          --test-case '${{ inputs.test_case }}' \
+          --test-case "${{ inputs.test_case }}" \
           --environment dev \
           --platform dgx_h100 \
           --tag ${{ inputs.tag }} \
           --container-image ${{ inputs.container-image }}
 
         RUN_TEST_EOF
         )
-        echo "$cmd" | tee "job.sh"        
+        echo "$cmd" | tee "job.sh"
         echo "::endgroup::"
 
     - name: Get PR info
@@ -125,23 +125,34 @@ runs:
         #!/bin/bash
         set -euxo pipefail
 
+        # Pick the test scope from the has-run-tests-label step output;
+        # lightweight mode is enabled for either scope.
+        if [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
+          SCOPE=mr-github
+        else
+          SCOPE=mr-slim
+        fi
+        ARGS=(
+          --scope "$SCOPE"
+          --enable-lightweight-mode
+        )
+
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
         pip install --no-cache-dir uv
-        uv sync --only-group test 
+        uv sync --only-group test
         uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
-          --scope mr \
+          "${ARGS[@]}" \
           --model ${{ inputs.model }} \
           --test-case ${{ inputs.test_case }} \
           --environment dev \
           --platform dgx_h100 \
           --container-image ${{ inputs.container-image }} \
-          --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
-          --enable-lightweight-mode
+          --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts
 
         RUN_TEST_EOF
         )
-        echo "$cmd" | tee "job.sh"        
+        echo "$cmd" | tee "job.sh"
         echo "::endgroup::"
 
     - name: Set timeout
 
@@ -1,3 +1,80 @@
 enabled: true
 auto_sync_draft: false
 auto_sync_ready: true
+# Trustee logins, one per line so that additions and removals diff cleanly.
+trustees_override:
+  - "AAnoosheh"
+  - "ArEsKay3"
+  - "Autumn1998"
+  - "BestJuly"
+  - "BoxiangW"
+  - "ChenhanYu"
+  - "FDecaYed"
+  - "HaochenYuan"
+  - "ISEEKYAN"
+  - "JRD971000"
+  - "QiZhangNV"
+  - "ShriyaRishab"
+  - "Victarry"
+  - "Wohox"
+  - "ZhiyuLi-Nvidia"
+  - "aklife97"
+  - "ananthsub"
+  - "asolergi-nv"
+  - "buptzyb"
+  - "chtruong814"
+  - "cspades"
+  - "cuichenx"
+  - "deepakn94"
+  - "dimapihtar"
+  - "duncanriach"
+  - "erhoo82"
+  - "ericharper"
+  - "fanshiqing"
+  - "gautham-kollu"
+  - "hxbai"
+  - "jaredcasper"
+  - "jiemingz"
+  - "jkamalu"
+  - "jon-barker"
+  - "kanz-nv"
+  - "kevalmorabia97"
+  - "ko3n1g"
+  - "kunlunl"
+  - "kvareddy"
+  - "layalir"
+  - "lhb8125"
+  - "lmcafee-nvidia"
+  - "maanug-nv"
+  - "mathemakitten"
+  - "matthieule"
+  - "mehraakash"
+  - "mkhona-nvidia"
+  - "pablo-garay"
+  - "parthmannan"
+  - "pthombre"
+  - "rogerwaleffe"
+  - "sanandaraj5597"
+  - "santhnm2"
+  - "sbak5"
+  - "shanmugamr1992"
+  - "shifangx"
+  - "shjwudp"
+  - "sidsingh-nvidia"
+  - "skyw"
+  - "tdene"
+  - "theothermike"
+  - "thomasdhc"
+  - "trintamaki"
+  - "tylerpoon"
+  - "wdykas"
+  - "xiaoyao0115"
+  - "xuwchen"
+  - "yanring"
+  - "yaox12"
+  - "yaoyu-33"
+  - "yashaswikarnati"
+  - "yobibyte"
+  - "youngeunkwon0405"
+  - "yuzhongw-nvidia"
+  - "zhongbozhu"
@@ -0,0 +1,64 @@
+# What does this PR do ?
+<!-- Add a one line overview of what this PR aims to accomplish. -->
+
+:warning: For major changes (either in lines of code or in impact), please make sure to first share and discuss a design doc with the team.
+
+## Contribution process
+
+```mermaid
+flowchart LR
+    A[Pre-checks] --> B[PR Tests]
+    subgraph Code Review/Approval
+        C1[Expert Review] --> C2[Final Review]
+    end
+    B --> C1
+    C2 --> D[Merge]
+```
+
+### Pre-checks
+
+- [ ] I want this PR in a versioned release and have added the appropriate Milestone (e.g., `Core 0.8`)
+- [ ] I have added relevant unit tests
+- [ ] I have added relevant functional tests
+- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html)
+- [ ] I have added relevant documentation
+- [ ] I have run [autoformat.sh](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/autoformat.sh) on my PR
+
+### Code review
+
+The following process is enforced via the CODEOWNERS file for changes into `megatron/core`. For changes outside of `megatron/core`, it is up to the PR author whether or not to tag the Final Reviewer team.
+
+<details>
+<summary>For PRs into <code>main</code> branch</summary>
+
+#### (Step 1): Add PR label `Expert Review`
+
+#### (Step 2): Collect the expert reviewers' reviews
+
+1. Attach the `Expert Review` label when your PR is ready for review.
+2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon.
+
+:warning: Only proceed to the next step once all reviewers have approved, merge conflicts are resolved and the CI is passing.  
+Final Review might get declined if these requirements are not fulfilled.
+
+#### (Step 3): Final Review
+
+1. Add `Final Review` label
+2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon.
+
+#### (Optional Step 4): Cherry-pick into release branch
+
+If this PR also needs to be merged into `core_r*` release branches, after this PR has been merged, select `Cherry-pick` to open a new PR into the release branch.
+
+</details>
+
+<details>
+<summary>For PRs into <code>dev</code> branch</summary>
+The proposed review process for the <code>dev</code> branch is under active discussion.
+
+PRs are mergeable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`.
+</details>
+
+### Merging your PR
+
+Any member of [`core-adlr`](https://github.com/orgs/NVIDIA/teams/core-adlr) or [`core-nemo`](https://github.com/orgs/NVIDIA/teams/core-nemo) will be able to merge your PR.
@@ -0,0 +1,171 @@
+# Reusable workflow: builds the megatron-core and megatron-fsdp wheels in
+# manylinux containers, checks the installed version, uploads the wheels as
+# artifacts and publishes them with twine.
+on:
+  workflow_call:
+    secrets:
+      TWINE_USERNAME:
+        required: true
+      TWINE_PASSWORD:
+        required: true
+
+jobs:
+  build-and-test-wheels:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - PACKAGE: megatron-core
+            PLATFORM: arm64
+            IMAGE: quay.io/pypa/manylinux_2_28_aarch64
+          - PACKAGE: megatron-core
+            PLATFORM: amd64
+            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
+          - PACKAGE: megatron-fsdp
+            PLATFORM: amd64
+            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
+    runs-on: ${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
+    env:
+      PACKAGE: ${{ matrix.PACKAGE }}
+      IMAGE: ${{ matrix.IMAGE }}
+      PLATFORM: ${{ matrix.PLATFORM }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Build wheel
+        id: build-wheel
+        run: |
+          set -x
+
+          # Dry-run is hard-coded on: PRE_RELEASE in package_info.py gets a
+          # random ".devNNNNNN" suffix before the build (see the sed below).
+          PUBLISH_DRYRUN=yes
+
+          # ROOTDIR holds package_info.py; BUILD_DIR is where `python -m build` runs.
+          if [ "$PACKAGE" = "megatron-core" ]; then
+            ROOTDIR="megatron/core"
+            BUILD_DIR="."
+          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
+            ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp"
+            BUILD_DIR="megatron/core/distributed/fsdp/src"
+          else
+            echo "Unknown package: $PACKAGE"
+            exit 1
+          fi
+
+          if [ "$PUBLISH_DRYRUN" = "yes" ]; then
+            PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" "$ROOTDIR/package_info.py")
+            sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" "$ROOTDIR/package_info.py"
+          fi
+
+          pushd "$BUILD_DIR"
+            rm -f LICENSE
+            docker run --rm -v "$(pwd)":/workspace -w /workspace "$IMAGE" bash -c '\
+              for python_version in cp310 cp311 cp312 cp313; do \
+                /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools>=80.0.0" build; \
+              done && \
+              for python_version in cp310 cp311 cp312 cp313; do \
+                /opt/python/${python_version}-${python_version}/bin/python -m build; \
+              done \
+            '
+
+            # Wheels that are not *-none-any are run through auditwheel repair;
+            # the results in wheelhouse/ then replace the wheels in dist/.
+            PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl")
+            if [ -n "$PLATFORM_WHEELS" ]; then
+                echo "Found platform wheels to repair: $PLATFORM_WHEELS"
+                docker run --rm -v "$(pwd)":/workspace -w /workspace "$IMAGE" auditwheel repair $PLATFORM_WHEELS
+                docker run --rm -v "$(pwd)":/workspace -w /workspace "$IMAGE" rm -rf dist/*.whl
+                docker run --rm -v "$(pwd)":/workspace -w /workspace "$IMAGE" cp -a wheelhouse/* dist/
+            fi
+          popd
+
+          pushd "$ROOTDIR"
+            EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)")
+          popd
+
+          echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}"
+
+          # megatron-fsdp is built under its own BUILD_DIR; copy its output to
+          # the top-level dist/ that the following steps read from.
+          if [ "$PACKAGE" = "megatron-fsdp" ]; then
+            mkdir -p dist/
+            cp -a megatron/core/distributed/fsdp/src/dist/* dist/
+          fi
+
+          ls -al dist/
+
+      - name: Test wheels
+        run: |
+          ls -al dist/
+
+          if [ "$PACKAGE" = "megatron-core" ]; then
+            ROOTPATH="megatron.core"
+            WHEEL_PREFIX="megatron_core"
+          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
+            ROOTPATH="megatron_fsdp"
+            WHEEL_PREFIX="megatron_fsdp"
+          else
+            echo "Unknown package: $PACKAGE"
+            exit 1
+          fi
+
+          if [ "$PACKAGE" = "megatron-core" ]; then
+            # Install only the cp310 wheel whose arch tag matches PLATFORM.
+            if [ "$PLATFORM" = "arm64" ]; then
+              WHEEL_ARCH="aarch64"
+            else
+              WHEEL_ARCH="x86_64"
+            fi
+            for file in dist/"$WHEEL_PREFIX"*cp310*"$WHEEL_ARCH".whl; do
+              pip install --no-cache-dir "$file"
+            done
+          else
+            pip install --no-cache-dir dist/"$WHEEL_PREFIX"*.whl
+          fi
+
+          # Drop the checked-out sources before importing; presumably so the
+          # import below resolves to the installed wheel, not the source tree.
+          sudo rm -rf megatron/
+
+          RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)")
+          test "${{ steps.build-wheel.outputs.expected-release-number }}" = "$RELEASE_NUMBER"
+
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}
+          path: dist/
+
+  publish-wheels:
+    needs: [build-and-test-wheels]
+    runs-on: ubuntu-latest
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Underscore spelling: used as the dist/ filename prefix when uploading.
+          - PACKAGE: megatron_core
+          - PACKAGE: megatron_fsdp
+    env:
+      PACKAGE: ${{ matrix.PACKAGE }}
+    steps:
+      - name: Download wheels
+        uses: actions/download-artifact@v4
+        with:
+          path: dist/
+          merge-multiple: true
+
+      - name: Publish wheels
+        env:
+          # twine reads these variables itself, so the credentials are not
+          # repeated on the command line.
+          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+          TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
+        run: |
+          ls -al dist/"$PACKAGE"*
+          pip install twine
+          twine upload --non-interactive dist/"$PACKAGE"*