Commit de45ad4

Merge branch 'main' into numa_aware_affinity
2 parents: a12b774 + e2b2675

File tree

136 files changed: +6620, -2454 lines


constraints.txt

Lines changed: 1 addition & 4 deletions
@@ -1,5 +1,2 @@
-# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image
+# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
 # is updated.
-
-# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
-protobuf>=4.25.8

cpp/tensorrt_llm/common/customAllReduceUtils.h

Lines changed: 255 additions & 0 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu

Lines changed: 5 additions & 1 deletion
@@ -134,8 +134,12 @@ public:
         // corresponding CTA has not been launched.
         for (int flag_idx = blockIdx.x; flag_idx < kBarrierFlagCount; flag_idx += gridDim.x)
         {
-            st_flag(m_target_flag + flag_idx * NRanks, m_flag_value);
+            asm volatile(
+                "st.global.relaxed.sys.b32 [%1], %0;" ::"r"(m_flag_value), "l"(m_target_flag + flag_idx * NRanks));
         }
+        // Single release fence
+        asm volatile("fence.release.sys;");
+
         while (ld_flag(m_current_flag) == prev_flag(m_flag_value))
         {
         }
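
This hunk swaps a flag store with release semantics for relaxed system-scope stores capped by a single fence.release.sys, so the ordering cost is paid once per round instead of once per flag. A minimal device-side sketch of the same publish pattern; the names (signal_flags, flags, num_flags) are invented for illustration, not the kernel's members:

#include <cstdint>

// Publish num_flags flags with relaxed system-scope stores, then make them
// all visible with one release fence, mirroring the commit's pattern.
__device__ void signal_flags(uint32_t* flags, int num_flags, uint32_t value)
{
    for (int i = blockIdx.x; i < num_flags; i += gridDim.x)
    {
        // Relaxed store: no per-store ordering, cheaper than st.release.
        asm volatile("st.global.relaxed.sys.b32 [%1], %0;" ::"r"(value), "l"(flags + i));
    }
    // One system-scope release fence covers all of the stores above.
    asm volatile("fence.release.sys;");
}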

cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@
 #include <cstdint>
 #include <type_traits>
 
-namespace tensorrt_llm::kernels::moe_a2a
+namespace tensorrt_llm::kernels::mnnvl_throughput
 {
 
 #define ENABLE_DEBUG_PRINT 0
@@ -506,7 +506,7 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params)
     TLLM_CHECK(params.num_payloads > 0 && params.num_payloads <= kMaxPayloads);
 
     // Prepare kernel pointers struct
-    DispatchKernelPointers kernel_ptrs = {}; // Zero-initialize
+    DispatchKernelPointers kernel_ptrs = {};
 
     // Fill source data pointers and payload sizes
     for (int i = 0; i < params.num_payloads; i++)
@@ -958,4 +958,4 @@ void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv
         expert_ids, recv_counters, ep_size, max_tokens_per_rank, top_k, invalid_id);
 }
 
-} // namespace tensorrt_llm::kernels::moe_a2a
+} // namespace tensorrt_llm::kernels::mnnvl_throughput

cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h

Lines changed: 34 additions & 32 deletions
@@ -19,7 +19,7 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 
-namespace tensorrt_llm::kernels::moe_a2a
+namespace tensorrt_llm::kernels::mnnvl_throughput
 {
 
 // Configuration constants
@@ -91,7 +91,7 @@ struct MoeA2ADispatchParams
 
     // Token configuration
     int local_num_tokens;    // Number of tokens on this rank
-    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation
+    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation TODO: Rename to runtime_max_tokens_per_rank
     int top_k;               // Number of experts per token
 
     // Expert routing information
@@ -101,23 +101,22 @@ struct MoeA2ADispatchParams
     int num_payloads;                         // Number of different payload types
     PayloadDescriptor payloads[kMaxPayloads]; // Array of payload descriptors
 
-    // Receive buffers and synchronization
-    void* recv_buffers[kMaxRanks][kMaxPayloads]; // Per-rank receive buffers for each payload
+    // Local aux data
+    uint32_t* flag_val;       // The value of the flag for this round (stored on the local rank)
+    int* local_token_counter; // Atomic counter for completed tokens on this rank
+    int* send_counters;       // [ep_size] atomic counters - tracks tokens sent to each target rank
+    int* topk_target_ranks;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), target rank
+                              // per k, -1 for duplicates
+    int* topk_send_indices;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), dst index
+                              // per k, -1 for duplicates
 
-    // Synchronization
+    // Distributed aux data and recv buffers
+    int* recv_counters[kMaxRanks];         // tracks tokens received from each source rank. Each rank has [ep_size] counters
     uint32_t* completion_flags[kMaxRanks]; // If completion_flags[target_rank][source_rank] == *flag_val, then source
                                            // rank has signaled the target rank
-    uint32_t* flag_val; // The value of the flag for this round (stored on the local rank)
-
-    // Communication tracking
-    int* send_counters;            // [ep_size] atomic counters - tracks tokens sent to each target rank
-    int* recv_counters[kMaxRanks]; // tracks tokens received from each source rank. Each rank has [ep_size] counters
-    int* local_token_counter;      // Atomic counter for completed tokens on this rank
-
-    // Top-K compact routing info per local token (size: [local_num_tokens, top_k])
-    int* topk_target_ranks; // target rank per k, -1 for duplicates
-    int* topk_send_indices; // dst index per k, -1 for duplicates
+    void* recv_buffers[kMaxRanks][kMaxPayloads]; // Per-rank receive buffers for each payload
 
+    // CUDA stream
     cudaStream_t stream;
 };
 
@@ -137,30 +136,33 @@ struct MoeA2ACombineParams
 
     // Token configuration
     int local_num_tokens;    // Number of tokens on this rank
-    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation
+    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation TODO: Rename to runtime_max_tokens_per_rank
    int top_k;               // Number of experts per token
 
-    // Expert routing information
-    int const* recv_counters; // [ep_size] number of valid tokens per source rank for this target
-
-    // Top-K compact routing info per local token (size: [local_num_tokens, top_k])
-    int const* topk_target_ranks; // target rank per k, -1 for duplicates
-    int const* topk_send_indices; // dst index per k, -1 for duplicates
+    // Prepare-only field: original payload tensor pointer used to stage into workspace
+    void const* prepare_payload;
 
-    // Single payload information
-    void const* recv_buffers[kMaxRanks]; // Per-rank receive buffers (only for single payload)
-    void* output_data;                   // Output buffer [local_num_tokens, elements_per_token]
-    int elements_per_token;              // Number of elements per token
-    nvinfer1::DataType dtype;            // Data type for proper summation
+    // Output tensor
+    void* output_data; // Output buffer [local_num_tokens, elements_per_token]
+    // Payload information
+    int elements_per_token;   // Number of elements per token
+    nvinfer1::DataType dtype; // Data type for proper summation
+
+    // Local aux data
+    uint32_t* flag_val;       // The value of the flag for this round (stored on the local rank)
+    int* topk_target_ranks;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), target rank
+                              // per k, -1 for duplicates
+    int* topk_send_indices;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), dst index
+                              // per k, -1 for duplicates
+    int const* recv_counters; // [ep_size] number of valid tokens per source rank for this target
 
-    // Synchronization
+    // Distributed aux data and recv buffers
     uint32_t* completion_flags[kMaxRanks]; // If completion_flags[target_rank][source_rank] == *flag_val, then source
                                            // rank has signaled the target rank
-    uint32_t* flag_val; // The value of the flag for this round (stored on the local rank)
+    void const* recv_buffers[kMaxRanks];   // Per-rank receive buffers (only for single payload)
 
+    // CUDA stream
     cudaStream_t stream;
-    // Prepare-only field: original payload tensor pointer used to stage into workspace
-    void const* prepare_payload;
 };
 
 // Combine kernels
@@ -175,4 +177,4 @@ void moe_a2a_prepare_combine_launch(MoeA2ACombineParams const& params);
 void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv_counters, int32_t invalid_id,
     int ep_size, int max_tokens_per_rank, int top_k, cudaStream_t stream);
 
-} // namespace tensorrt_llm::kernels::moe_a2a
+} // namespace tensorrt_llm::kernels::mnnvl_throughput
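
The completion_flags comment describes a round-based handshake: each round picks a fresh *flag_val, every source rank writes that value into completion_flags[target][source], and a target proceeds once all ep_size slots match. A hedged device-side sketch of that protocol as the comments describe it; the function name, launch shape, and PTX choices are illustrative assumptions, not the actual dispatch kernel:

#include <cstdint>

// One thread per peer rank: signal every target, then wait for every source.
__device__ void barrier_round(uint32_t** completion_flags, // [rank][source]
    uint32_t const* flag_val, int my_rank, int ep_size)
{
    uint32_t const round = *flag_val; // fresh value chosen for this round
    if (threadIdx.x < ep_size)
    {
        // Tell target rank threadIdx.x that my_rank has finished sending.
        uint32_t* remote = completion_flags[threadIdx.x] + my_rank;
        asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(round), "l"(remote));

        // Spin until source rank threadIdx.x has signaled my_rank this round.
        uint32_t seen;
        do
        {
            asm volatile("ld.global.acquire.sys.b32 %0, [%1];"
                         : "=r"(seen)
                         : "l"(completion_flags[my_rank] + threadIdx.x));
        } while (seen != round);
    }
    __syncthreads();
}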

cpp/tensorrt_llm/kernels/customAllReduceKernels.h

Lines changed: 24 additions & 0 deletions
@@ -106,6 +106,30 @@ inline std::string toString(AllReduceFusionOp op)
     return oss.str();
 }
 
+inline std::ostream& operator<<(std::ostream& os, AllReduceStrategyType op)
+{
+    switch (op)
+    {
+    case AllReduceStrategyType::NCCL: os << "NCCL"; break;
+    case AllReduceStrategyType::MIN_LATENCY: os << "MIN_LATENCY"; break;
+    case AllReduceStrategyType::UB: os << "UB"; break;
+    case AllReduceStrategyType::AUTO: os << "AUTO"; break;
+    case AllReduceStrategyType::ONESHOT: os << "ONESHOT"; break;
+    case AllReduceStrategyType::TWOSHOT: os << "TWOSHOT"; break;
+    case AllReduceStrategyType::LOWPRECISION: os << "LOWPRECISION"; break;
+    case AllReduceStrategyType::MNNVL: os << "MNNVL"; break;
+    case AllReduceStrategyType::NCCL_SYMMETRIC: os << "NCCL_SYMMETRIC"; break;
+    }
+    return os;
+}
+
+inline std::string toString(AllReduceStrategyType op)
+{
+    std::ostringstream oss;
+    oss << op;
+    return oss.str();
+}
+
 struct AllReduceFusionParams
 {
     AllReduceFusionParams()
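
The new toString(AllReduceStrategyType) simply streams the enum through the added operator<<, so the two representations can never disagree. A self-contained illustration of the same pattern; the Strategy enum here is an invented stand-in, not the real AllReduceStrategyType:

#include <iostream>
#include <sstream>
#include <string>

// Stand-in for AllReduceStrategyType; the real header enumerates strategies.
enum class Strategy { NCCL, ONESHOT, TWOSHOT };

inline std::ostream& operator<<(std::ostream& os, Strategy s)
{
    switch (s)
    {
    case Strategy::NCCL: os << "NCCL"; break;
    case Strategy::ONESHOT: os << "ONESHOT"; break;
    case Strategy::TWOSHOT: os << "TWOSHOT"; break;
    }
    return os;
}

inline std::string toString(Strategy s)
{
    std::ostringstream oss;
    oss << s; // reuse the stream operator so string and stream output match
    return oss.str();
}

int main()
{
    std::cout << "selected strategy: " << Strategy::TWOSHOT << '\n'; // TWOSHOT
    std::cout << toString(Strategy::NCCL) << '\n';                   // NCCL
}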

cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ set_cuda_architectures(fb_gemm_src 89 90 100f 120f)
 # ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm)
 
 add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU})
-set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f)
+set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f 120f)
 
 set(GEMM_SWIGLU_SM90_SRC_CU
     ${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu)

cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh

Lines changed: 8 additions & 9 deletions
@@ -1622,16 +1622,15 @@ void gemm_dispatch_sm89(void* mat_a, void* mat_b, void* mat_d, float* scales_a,
     dim3 grid = dim3(grid_m, grid_n, grid_k);
     dim3 block = dim3(kThreadCount, 1, 1);
 
-    if (kSmemSize > (48 << 10))
-    {
-        cudaFuncSetAttribute(ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
-        auto result = cudaGetLastError();
-        TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel cannot launch: %s", cudaGetErrorString(result));
-    }
+    auto result = cudaFuncSetAttribute(ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel cannot launch: %s", cudaGetErrorString(result));
 
     ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>
         <<<grid, block, kSmemSize, stream>>>(shape_m, shape_n, shape_k, mat_a, mat_b, mat_d, scales_a, scales_b);
+
+    result = cudaGetLastError();
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel runtime error: %s", cudaGetErrorString(result));
 }
 
 void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b, __nv_bfloat16* mat_d, int ld_d,
@@ -1643,7 +1642,7 @@ void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b
     }
 #ifndef PLACEHOLDER_KERNELS
     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89)
+    if (arch == 89 || arch == 120)
     {
         gemm_dispatch_sm89(mat_a, mat_b, mat_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream);
         return;
@@ -1883,7 +1882,7 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_ma
     }
 
     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89)
+    if (arch == 89 || arch == 120)
     {
         strided_batch_gemm_dispatch_sm89(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d,
             scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream);
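
The refactor always requests the dynamic shared-memory limit, checks the status cudaFuncSetAttribute itself returns (rather than a later cudaGetLastError), and adds a post-launch cudaGetLastError check. A runnable sketch of that check/launch/check sequence, with a toy kernel and CHECK_CUDA macro standing in for the GEMM and TLLM_CHECK_WITH_INFO:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Toy kernel standing in for sm89_fp8_gemm_1d1d_impl.
__global__ void dummy_kernel(float* out)
{
    extern __shared__ float smem[];
    smem[threadIdx.x] = threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) *out = smem[0];
}

#define CHECK_CUDA(expr, msg)                                                \
    do                                                                       \
    {                                                                        \
        cudaError_t err_ = (expr);                                           \
        if (err_ != cudaSuccess)                                             \
        {                                                                    \
            std::fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err_)); \
            std::exit(1);                                                    \
        }                                                                    \
    } while (0)

int main()
{
    int const kSmemSize = 64 << 10; // above the 48 KiB default opt-in limit
    float* out;
    CHECK_CUDA(cudaMalloc(&out, sizeof(float)), "alloc");

    // Always request the limit and check the returned status, rather than
    // only doing so above 48 KiB and relying on cudaGetLastError afterwards.
    CHECK_CUDA(cudaFuncSetAttribute(dummy_kernel,
                   cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize),
        "set attribute");

    dummy_kernel<<<1, 256, kSmemSize>>>(out);
    CHECK_CUDA(cudaGetLastError(), "launch");         // launch-time errors
    CHECK_CUDA(cudaDeviceSynchronize(), "execution"); // runtime errors
    CHECK_CUDA(cudaFree(out), "free");
    return 0;
}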

cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu

Lines changed: 2 additions & 1 deletion
@@ -601,6 +601,8 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
         }
     }
     __syncthreads();
+    asm volatile("griddepcontrol.wait;");
+    asm volatile("griddepcontrol.launch_dependents;");
 
     if (warp_idx < 2)
     {
@@ -622,7 +624,6 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
         mma_computer.issue_mainloop();
         mma_computer.epi();
     }
-    asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
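
griddepcontrol.wait and griddepcontrol.launch_dependents are the PTX side of CUDA programmatic dependent launch (PDL): the first blocks until the upstream grid's writes are visible, the second lets dependent grids begin launching early. Hoisting launch_dependents from the kernel tail to just after __syncthreads() lets the next kernel start spinning up while the MMA main loop is still running. A hedged host-plus-device sketch of the mechanism, with invented producer/consumer kernels rather than the fused GEMM:

#include <cuda_runtime.h>

__global__ void producer(float* buf)
{
    buf[threadIdx.x] = threadIdx.x;
    // Allow dependent grids to start launching before this grid finishes.
    asm volatile("griddepcontrol.launch_dependents;");
}

__global__ void consumer(float const* buf, float* out)
{
    // Work that does not read buf could overlap with the producer here.
    // Block until the producer grid's memory writes are visible.
    asm volatile("griddepcontrol.wait;");
    out[threadIdx.x] = buf[threadIdx.x] * 2.0f;
}

void launch_pair(float* buf, float* out, cudaStream_t stream)
{
    producer<<<1, 256, 0, stream>>>(buf);

    // Opt the consumer into PDL so it can overlap with the producer's tail.
    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = 1;
    cfg.blockDim = 256;
    cfg.stream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, consumer, (float const*) buf, out);
}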
