Make router_fusion adapt to large num_of_expert (>2048) #2582
```diff
@@ -147,6 +147,8 @@ void fused_score_for_moe_aux_loss_forward_kernel_launcher(
   size_t shared_memory_size = num_experts * num_token_per_block * sizeof(DataType)  // logits
                               + topk * num_token_per_block * sizeof(DataType)       // topk_logits
                               + topk * num_token_per_block * sizeof(int);           // topk_indices
+  cudaFuncSetAttribute(fused_score_for_moe_aux_loss_forward_kernel<DataType>,
+                       cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size);
```
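For the large-expert case named in the title, this size quickly exceeds the default dynamic shared memory budget: with 2048 experts, 4-byte logits, and an illustrative num_token_per_block of 16 (a value assumed here, not taken from the code), the logits buffer alone is 2048 × 16 × 4 B = 128 KB, well above the 48 KB a kernel may use without opting in. That is why the launcher must raise cudaFuncAttributeMaxDynamicSharedMemorySize before the launch below.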
Contributor comment on lines +150 to +151: Missing error handling for `cudaFuncSetAttribute`. A suggested change was attached.
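As an illustration of what such error handling could look like (a sketch, not the reviewer's attached suggestion; it assumes the `NVTE_CHECK_CUDA` macro from the repository's logging utilities is usable in this file, otherwise any `cudaSuccess` check serves the same purpose):

```cpp
// Sketch only: propagate a failure from cudaFuncSetAttribute instead of ignoring it.
// NVTE_CHECK_CUDA(...) is assumed to wrap the call and report the CUDA error string on failure.
NVTE_CHECK_CUDA(cudaFuncSetAttribute(fused_score_for_moe_aux_loss_forward_kernel<DataType>,
                                     cudaFuncAttributeMaxDynamicSharedMemorySize,
                                     shared_memory_size));
```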
```diff
   fused_score_for_moe_aux_loss_forward_kernel<DataType>
       <<<grid_size, kThreadsPerBlock, shared_memory_size, stream>>>(
           logits, num_tokens, num_experts, topk, score_function, scores, routing_map,
```
```diff
@@ -283,6 +285,8 @@ void fused_score_for_moe_aux_loss_backward_kernel_launcher(
                               +
                               num_experts * num_token_per_block * sizeof(DataType)   // act_from_fwd
                               + num_experts * num_token_per_block * sizeof(DataType); // comp_buf
+  cudaFuncSetAttribute(fused_score_for_moe_aux_loss_backward_kernel<DataType>,
+                       cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size);
```
Contributor comment on lines +288 to +289: Following the pattern used elsewhere in the codebase (e.g., ln_fwd_cuda_kernel.cu), this call should be conditional; the attribute only needs to be set when exceeding the 48 KB default limit. A suggested change was attached.
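A minimal sketch of the conditional form described here, also folding in the error check raised in the next comment; the 48 KB threshold constant and the macro usage are assumptions, not copied from ln_fwd_cuda_kernel.cu:

```cpp
// Sketch only: opt in to a larger dynamic shared memory carve-out only when the
// request exceeds the 48 KB available by default, and check the result.
if (shared_memory_size >= 48 * 1024) {
  NVTE_CHECK_CUDA(cudaFuncSetAttribute(fused_score_for_moe_aux_loss_backward_kernel<DataType>,
                                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                                       shared_memory_size));
}
```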
Contributor comment on lines +288 to +289: Missing error handling for `cudaFuncSetAttribute` here as well. A suggested change was attached. Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!
```diff
   fused_score_for_moe_aux_loss_backward_kernel<DataType>
       <<<grid_size, kThreadsPerBlock, shared_memory_size, stream>>>(
           intermediate_output, grad_scores, num_tokens, num_experts, topk, score_function,
```
The second file in the diff receives the same pair of additions in its forward and backward launchers:
```diff
@@ -253,6 +253,8 @@ void fused_topk_with_score_function_forward_kernel_launcher(
     shared_memory_size += num_groups * num_token_per_block * sizeof(DataType);   // group_scores
     shared_memory_size += num_experts * num_token_per_block * sizeof(DataType);  // maksed_scores
   }
+  cudaFuncSetAttribute(fused_topk_with_score_function_forward_kernel<DataType, BiasType>,
+                       cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size);
```
Contributor comment on lines +256 to +257: Following the pattern used elsewhere in the codebase (e.g., ln_fwd_cuda_kernel.cu), this call should be conditional; the attribute only needs to be set when exceeding the 48 KB default limit. A suggested change was attached.
Contributor comment on lines +256 to +257: Missing error handling for `cudaFuncSetAttribute` here as well. A suggested change was attached.
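Beyond checking the return code, a launcher sized for very large num_experts could also verify that the request fits the device's opt-in maximum at all, so an impossible configuration fails with a clear message rather than a launch error. The device-attribute query below is standard CUDA; the NVTE_* macros are again assumed from the repository's utilities, and this is a sketch rather than part of the PR:

```cpp
// Sketch only: compare the requested dynamic shared memory against the device's
// opt-in limit (cudaDevAttrMaxSharedMemoryPerBlockOptin) before setting the attribute.
int device = 0;
NVTE_CHECK_CUDA(cudaGetDevice(&device));
int max_optin_smem = 0;
NVTE_CHECK_CUDA(cudaDeviceGetAttribute(&max_optin_smem,
                                       cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
NVTE_CHECK(shared_memory_size <= static_cast<size_t>(max_optin_smem),
           "Requested dynamic shared memory exceeds the device limit for this kernel.");
```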
```diff
   fused_topk_with_score_function_forward_kernel<DataType, BiasType>
       <<<grid_size, kThreadsPerBlock, shared_memory_size, stream>>>(
           logits, num_tokens, num_experts, topk, use_pre_softmax, num_groups, group_topk,
```
```diff
@@ -444,6 +446,8 @@ void fused_topk_with_score_function_backward_kernel_launcher(
                               num_experts * num_token_per_block * sizeof(DataType)   // act_from_fwd
                               + num_experts * num_token_per_block * sizeof(DataType) // comp_buf
                               + num_experts * num_token_per_block * sizeof(bool);    // routing_map
+  cudaFuncSetAttribute(fused_topk_with_score_function_backward_kernel<DataType>,
+                       cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size);
```
Contributor comment on lines +449 to +450: Same as above: following the ln_fwd_cuda_kernel.cu pattern, this call should be conditional, since the attribute only needs to be set when exceeding the 48 KB default limit. A suggested change was attached.
Contributor comment on lines +449 to +450: Missing error handling for `cudaFuncSetAttribute` here as well. A suggested change was attached.
```diff
   fused_topk_with_score_function_backward_kernel<DataType>
       <<<grid_size, kThreadsPerBlock, shared_memory_size, stream>>>(
           routing_map, intermediate_output, grad_probs, num_tokens, num_experts, topk,
```