Commit de45ad4

Merge branch 'main' into numa_aware_affinity
2 parents: a12b774 + e2b2675

File tree

136 files changed: +6620, -2454 lines


constraints.txt

Lines changed: 1 addition & 4 deletions
@@ -1,5 +1,2 @@
-# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image
+# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
 # is updated.
-
-# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
-protobuf>=4.25.8

cpp/tensorrt_llm/common/customAllReduceUtils.h

Lines changed: 255 additions & 0 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu

Lines changed: 5 additions & 1 deletion
@@ -134,8 +134,12 @@ public:
         // corresponding CTA has not been launched.
         for (int flag_idx = blockIdx.x; flag_idx < kBarrierFlagCount; flag_idx += gridDim.x)
         {
-            st_flag(m_target_flag + flag_idx * NRanks, m_flag_value);
+            asm volatile(
+                "st.global.relaxed.sys.b32 [%1], %0;" ::"r"(m_flag_value), "l"(m_target_flag + flag_idx * NRanks));
         }
+        // Single release fence
+        asm volatile("fence.release.sys;");
+
         while (ld_flag(m_current_flag) == prev_flag(m_flag_value))
         {
         }
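
This hunk swaps a flag store with release semantics for relaxed system-scope stores capped by a single fence.release.sys, so the ordering cost is paid once per round instead of once per flag. A minimal device-side sketch of the same publish pattern; the names (signal_flags, flags, num_flags) are invented for illustration, not the kernel's members:

#include <cstdint>

// Publish num_flags flags with relaxed system-scope stores, then make them
// all visible with one release fence, mirroring the commit's pattern.
__device__ void signal_flags(uint32_t* flags, int num_flags, uint32_t value)
{
    for (int i = blockIdx.x; i < num_flags; i += gridDim.x)
    {
        // Relaxed store: no per-store ordering, cheaper than st.release.
        asm volatile("st.global.relaxed.sys.b32 [%1], %0;" ::"r"(value), "l"(flags + i));
    }
    // One system-scope release fence covers all of the stores above.
    asm volatile("fence.release.sys;");
}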

cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@
 #include <cstdint>
 #include <type_traits>
 
-namespace tensorrt_llm::kernels::moe_a2a
+namespace tensorrt_llm::kernels::mnnvl_throughput
 {
 
 #define ENABLE_DEBUG_PRINT 0
@@ -506,7 +506,7 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params)
     TLLM_CHECK(params.num_payloads > 0 && params.num_payloads <= kMaxPayloads);
 
     // Prepare kernel pointers struct
-    DispatchKernelPointers kernel_ptrs = {}; // Zero-initialize
+    DispatchKernelPointers kernel_ptrs = {};
 
     // Fill source data pointers and payload sizes
     for (int i = 0; i < params.num_payloads; i++)
@@ -958,4 +958,4 @@ void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv
         expert_ids, recv_counters, ep_size, max_tokens_per_rank, top_k, invalid_id);
 }
 
-} // namespace tensorrt_llm::kernels::moe_a2a
+} // namespace tensorrt_llm::kernels::mnnvl_throughput

cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h

Lines changed: 34 additions & 32 deletions
@@ -19,7 +19,7 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 
-namespace tensorrt_llm::kernels::moe_a2a
+namespace tensorrt_llm::kernels::mnnvl_throughput
 {
 
 // Configuration constants
@@ -91,7 +91,7 @@ struct MoeA2ADispatchParams
 
     // Token configuration
     int local_num_tokens;    // Number of tokens on this rank
-    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation
+    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation TODO: Rename to runtime_max_tokens_per_rank
     int top_k;               // Number of experts per token
 
     // Expert routing information
@@ -101,23 +101,22 @@ struct MoeA2ADispatchParams
     int num_payloads;                         // Number of different payload types
     PayloadDescriptor payloads[kMaxPayloads]; // Array of payload descriptors
 
-    // Receive buffers and synchronization
-    void* recv_buffers[kMaxRanks][kMaxPayloads]; // Per-rank receive buffers for each payload
+    // Local aux data
+    uint32_t* flag_val;       // The value of the flag for this round (stored on the local rank)
+    int* local_token_counter; // Atomic counter for completed tokens on this rank
+    int* send_counters;       // [ep_size] atomic counters - tracks tokens sent to each target rank
+    int* topk_target_ranks;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), target rank
+                              // per k, -1 for duplicates
+    int* topk_send_indices;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), dst index
+                              // per k, -1 for duplicates
 
-    // Synchronization
+    // Distributed aux data and recv buffers
+    int* recv_counters[kMaxRanks];         // tracks tokens received from each source rank. Each rank has [ep_size] counters
     uint32_t* completion_flags[kMaxRanks]; // If completion_flags[target_rank][source_rank] == *flag_val, then source
                                            // rank has signaled the target rank
-    uint32_t* flag_val; // The value of the flag for this round (stored on the local rank)
-
-    // Communication tracking
-    int* send_counters;            // [ep_size] atomic counters - tracks tokens sent to each target rank
-    int* recv_counters[kMaxRanks]; // tracks tokens received from each source rank. Each rank has [ep_size] counters
-    int* local_token_counter;      // Atomic counter for completed tokens on this rank
-
-    // Top-K compact routing info per local token (size: [local_num_tokens, top_k])
-    int* topk_target_ranks; // target rank per k, -1 for duplicates
-    int* topk_send_indices; // dst index per k, -1 for duplicates
+    void* recv_buffers[kMaxRanks][kMaxPayloads]; // Per-rank receive buffers for each payload
 
+    // CUDA stream
     cudaStream_t stream;
 };
 
@@ -137,30 +136,33 @@ struct MoeA2ACombineParams
 
     // Token configuration
     int local_num_tokens;    // Number of tokens on this rank
-    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation
+    int max_tokens_per_rank; // Maximum tokens per rank for pre-allocation TODO: Rename to runtime_max_tokens_per_rank
    int top_k;               // Number of experts per token
 
-    // Expert routing information
-    int const* recv_counters; // [ep_size] number of valid tokens per source rank for this target
-
-    // Top-K compact routing info per local token (size: [local_num_tokens, top_k])
-    int const* topk_target_ranks; // target rank per k, -1 for duplicates
-    int const* topk_send_indices; // dst index per k, -1 for duplicates
+    // Prepare-only field: original payload tensor pointer used to stage into workspace
+    void const* prepare_payload;
 
-    // Single payload information
-    void const* recv_buffers[kMaxRanks]; // Per-rank receive buffers (only for single payload)
-    void* output_data;                   // Output buffer [local_num_tokens, elements_per_token]
-    int elements_per_token;              // Number of elements per token
-    nvinfer1::DataType dtype;            // Data type for proper summation
+    // Output tensor
+    void* output_data; // Output buffer [local_num_tokens, elements_per_token]
+    // Payload information
+    int elements_per_token;   // Number of elements per token
+    nvinfer1::DataType dtype; // Data type for proper summation
+
+    // Local aux data
+    uint32_t* flag_val;       // The value of the flag for this round (stored on the local rank)
+    int* topk_target_ranks;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), target rank
+                              // per k, -1 for duplicates
+    int* topk_send_indices;   // Top-K compact routing info per local token (size: [local_num_tokens, top_k]), dst index
+                              // per k, -1 for duplicates
+    int const* recv_counters; // [ep_size] number of valid tokens per source rank for this target
 
-    // Synchronization
+    // Distributed aux data and recv buffers
     uint32_t* completion_flags[kMaxRanks]; // If completion_flags[target_rank][source_rank] == *flag_val, then source
                                            // rank has signaled the target rank
-    uint32_t* flag_val; // The value of the flag for this round (stored on the local rank)
+    void const* recv_buffers[kMaxRanks];   // Per-rank receive buffers (only for single payload)
 
+    // CUDA stream
     cudaStream_t stream;
-    // Prepare-only field: original payload tensor pointer used to stage into workspace
-    void const* prepare_payload;
 };
 
 // Combine kernels
@@ -175,4 +177,4 @@ void moe_a2a_prepare_combine_launch(MoeA2ACombineParams const& params);
 void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv_counters, int32_t invalid_id,
     int ep_size, int max_tokens_per_rank, int top_k, cudaStream_t stream);
 
-} // namespace tensorrt_llm::kernels::moe_a2a
+} // namespace tensorrt_llm::kernels::mnnvl_throughput
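
The completion_flags comment describes a round-based handshake: each round picks a fresh *flag_val, every source rank writes that value into completion_flags[target][source], and a target proceeds once all ep_size slots match. A hedged device-side sketch of that protocol as the comments describe it; the function name, launch shape, and PTX choices are illustrative assumptions, not the actual dispatch kernel:

#include <cstdint>

// One thread per peer rank: signal every target, then wait for every source.
__device__ void barrier_round(uint32_t** completion_flags, // [rank][source]
    uint32_t const* flag_val, int my_rank, int ep_size)
{
    uint32_t const round = *flag_val; // fresh value chosen for this round
    if (threadIdx.x < ep_size)
    {
        // Tell target rank threadIdx.x that my_rank has finished sending.
        uint32_t* remote = completion_flags[threadIdx.x] + my_rank;
        asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(round), "l"(remote));

        // Spin until source rank threadIdx.x has signaled my_rank this round.
        uint32_t seen;
        do
        {
            asm volatile("ld.global.acquire.sys.b32 %0, [%1];"
                         : "=r"(seen)
                         : "l"(completion_flags[my_rank] + threadIdx.x));
        } while (seen != round);
    }
    __syncthreads();
}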

cpp/tensorrt_llm/kernels/customAllReduceKernels.h

Lines changed: 24 additions & 0 deletions
@@ -106,6 +106,30 @@ inline std::string toString(AllReduceFusionOp op)
     return oss.str();
 }
 
+inline std::ostream& operator<<(std::ostream& os, AllReduceStrategyType op)
+{
+    switch (op)
+    {
+    case AllReduceStrategyType::NCCL: os << "NCCL"; break;
+    case AllReduceStrategyType::MIN_LATENCY: os << "MIN_LATENCY"; break;
+    case AllReduceStrategyType::UB: os << "UB"; break;
+    case AllReduceStrategyType::AUTO: os << "AUTO"; break;
+    case AllReduceStrategyType::ONESHOT: os << "ONESHOT"; break;
+    case AllReduceStrategyType::TWOSHOT: os << "TWOSHOT"; break;
+    case AllReduceStrategyType::LOWPRECISION: os << "LOWPRECISION"; break;
+    case AllReduceStrategyType::MNNVL: os << "MNNVL"; break;
+    case AllReduceStrategyType::NCCL_SYMMETRIC: os << "NCCL_SYMMETRIC"; break;
+    }
+    return os;
+}
+
+inline std::string toString(AllReduceStrategyType op)
+{
+    std::ostringstream oss;
+    oss << op;
+    return oss.str();
+}
+
 struct AllReduceFusionParams
 {
     AllReduceFusionParams()
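
The new toString(AllReduceStrategyType) simply streams the enum through the added operator<<, so the two representations can never disagree. A self-contained illustration of the same pattern; the Strategy enum here is an invented stand-in, not the real AllReduceStrategyType:

#include <iostream>
#include <sstream>
#include <string>

// Stand-in for AllReduceStrategyType; the real header enumerates strategies.
enum class Strategy { NCCL, ONESHOT, TWOSHOT };

inline std::ostream& operator<<(std::ostream& os, Strategy s)
{
    switch (s)
    {
    case Strategy::NCCL: os << "NCCL"; break;
    case Strategy::ONESHOT: os << "ONESHOT"; break;
    case Strategy::TWOSHOT: os << "TWOSHOT"; break;
    }
    return os;
}

inline std::string toString(Strategy s)
{
    std::ostringstream oss;
    oss << s; // reuse the stream operator so string and stream output match
    return oss.str();
}

int main()
{
    std::cout << "selected strategy: " << Strategy::TWOSHOT << '\n'; // TWOSHOT
    std::cout << toString(Strategy::NCCL) << '\n';                   // NCCL
}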

cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -205,7 +205,7 @@ set_cuda_architectures(fb_gemm_src 89 90 100f 120f)
 # ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm)
 
 add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU})
-set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f)
+set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f 120f)
 
 set(GEMM_SWIGLU_SM90_SRC_CU
     ${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu)

cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh

Lines changed: 8 additions & 9 deletions
@@ -1622,16 +1622,15 @@ void gemm_dispatch_sm89(void* mat_a, void* mat_b, void* mat_d, float* scales_a,
     dim3 grid = dim3(grid_m, grid_n, grid_k);
     dim3 block = dim3(kThreadCount, 1, 1);
 
-    if (kSmemSize > (48 << 10))
-    {
-        cudaFuncSetAttribute(ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
-        auto result = cudaGetLastError();
-        TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel cannot launch: %s", cudaGetErrorString(result));
-    }
+    auto result = cudaFuncSetAttribute(ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel cannot launch: %s", cudaGetErrorString(result));
 
     ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>
         <<<grid, block, kSmemSize, stream>>>(shape_m, shape_n, shape_k, mat_a, mat_b, mat_d, scales_a, scales_b);
+
+    result = cudaGetLastError();
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel runtime error: %s", cudaGetErrorString(result));
 }
 
 void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b, __nv_bfloat16* mat_d, int ld_d,
@@ -1643,7 +1642,7 @@ void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b
     }
 #ifndef PLACEHOLDER_KERNELS
     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89)
+    if (arch == 89 || arch == 120)
     {
         gemm_dispatch_sm89(mat_a, mat_b, mat_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream);
         return;
@@ -1883,7 +1882,7 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_ma
     }
 
     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89)
+    if (arch == 89 || arch == 120)
     {
         strided_batch_gemm_dispatch_sm89(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d,
             scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream);
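
The refactor always requests the dynamic shared-memory limit, checks the status cudaFuncSetAttribute itself returns (rather than a later cudaGetLastError), and adds a post-launch cudaGetLastError check. A runnable sketch of that check/launch/check sequence, with a toy kernel and CHECK_CUDA macro standing in for the GEMM and TLLM_CHECK_WITH_INFO:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Toy kernel standing in for sm89_fp8_gemm_1d1d_impl.
__global__ void dummy_kernel(float* out)
{
    extern __shared__ float smem[];
    smem[threadIdx.x] = threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) *out = smem[0];
}

#define CHECK_CUDA(expr, msg)                                                \
    do                                                                       \
    {                                                                        \
        cudaError_t err_ = (expr);                                           \
        if (err_ != cudaSuccess)                                             \
        {                                                                    \
            std::fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err_)); \
            std::exit(1);                                                    \
        }                                                                    \
    } while (0)

int main()
{
    int const kSmemSize = 64 << 10; // above the 48 KiB default opt-in limit
    float* out;
    CHECK_CUDA(cudaMalloc(&out, sizeof(float)), "alloc");

    // Always request the limit and check the returned status, rather than
    // only doing so above 48 KiB and relying on cudaGetLastError afterwards.
    CHECK_CUDA(cudaFuncSetAttribute(dummy_kernel,
                   cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize),
        "set attribute");

    dummy_kernel<<<1, 256, kSmemSize>>>(out);
    CHECK_CUDA(cudaGetLastError(), "launch");         // launch-time errors
    CHECK_CUDA(cudaDeviceSynchronize(), "execution"); // runtime errors
    CHECK_CUDA(cudaFree(out), "free");
    return 0;
}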

cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu

Lines changed: 2 additions & 1 deletion
@@ -601,6 +601,8 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
         }
     }
     __syncthreads();
+    asm volatile("griddepcontrol.wait;");
+    asm volatile("griddepcontrol.launch_dependents;");
 
     if (warp_idx < 2)
     {
@@ -622,7 +624,6 @@ __global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
         mma_computer.issue_mainloop();
         mma_computer.epi();
     }
-    asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
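
griddepcontrol.wait and griddepcontrol.launch_dependents are the PTX side of CUDA programmatic dependent launch (PDL): the first blocks until the upstream grid's writes are visible, the second lets dependent grids begin launching early. Hoisting launch_dependents from the kernel tail to just after __syncthreads() lets the next kernel start spinning up while the MMA main loop is still running. A hedged host-plus-device sketch of the mechanism, with invented producer/consumer kernels rather than the fused GEMM:

#include <cuda_runtime.h>

__global__ void producer(float* buf)
{
    buf[threadIdx.x] = threadIdx.x;
    // Allow dependent grids to start launching before this grid finishes.
    asm volatile("griddepcontrol.launch_dependents;");
}

__global__ void consumer(float const* buf, float* out)
{
    // Work that does not read buf could overlap with the producer here.
    // Block until the producer grid's memory writes are visible.
    asm volatile("griddepcontrol.wait;");
    out[threadIdx.x] = buf[threadIdx.x] * 2.0f;
}

void launch_pair(float* buf, float* out, cudaStream_t stream)
{
    producer<<<1, 256, 0, stream>>>(buf);

    // Opt the consumer into PDL so it can overlap with the producer's tail.
    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = 1;
    cfg.blockDim = 256;
    cfg.stream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, consumer, (float const*) buf, out);
}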
