@@ -311,16 +311,17 @@ void GetBlockShapeAndSplitKVBlock(
     if (mla_backend && group_size <= 64) {
       const int set_chunk_size = get_mla_dec_chunk_size(bsz);
 
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      CUDA_CHECK(cudaMemsetAsync(
           decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
 
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      CUDA_CHECK(cudaMemsetAsync(
           decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));
 
       int device;
-      cudaGetDevice(&device);
+      CUDA_CHECK(cudaGetDevice(&device));
       int sm_cout;
-      cudaDeviceGetAttribute(&sm_cout, cudaDevAttrMultiProcessorCount, device);
+      CUDA_CHECK(cudaDeviceGetAttribute(
+          &sm_cout, cudaDevAttrMultiProcessorCount, device));
       constexpr int config_size =
           12;  // search space for chunk size:[64, 128, 256, ... 131072]
 
@@ -341,16 +342,14 @@ void GetBlockShapeAndSplitKVBlock(
           decoder_chunk_size_device.copy_to(paddle::CPUPlace(), false);
       const int chunk_size = decoder_chunk_size_cpu.data<int>()[0];
 
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_batch_ids.data<int>(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
+      CUDA_CHECK(cudaMemsetAsync(decoder_batch_ids.data<int>(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
+      CUDA_CHECK(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
 
       split_block_for_mla<<<1, 32, 0, stream>>>(
           seq_lens_this_time.data<int>(),
@@ -362,17 +361,15 @@ void GetBlockShapeAndSplitKVBlock(
           chunk_size);
 
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_batch_ids.data<int>(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
-                          0,
-                          decoder_batch_ele_num * sizeof(int32_t),
-                          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+      CUDA_CHECK(cudaMemsetAsync(decoder_batch_ids.data<int>(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
+      CUDA_CHECK(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
+                                 0,
+                                 decoder_batch_ele_num * sizeof(int32_t),
+                                 stream));
+      CUDA_CHECK(cudaMemsetAsync(
           decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));
 
       split_q_block<<<1, 32, 0, stream>>>(
@@ -391,8 +388,6 @@ void GetBlockShapeAndSplitKVBlock(
 #endif
       decoder_num_blocks_cpu.copy_(
           decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
-          decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
     }
   }
 
@@ -401,19 +396,17 @@ void GetBlockShapeAndSplitKVBlock(
     const uint32_t max_tile_size_per_bs_kv =
         div_up(max_enc_dec_len_this_time, block_size);
     const uint32_t kv_batch_shape = bsz * max_tile_size_per_bs_kv;
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+    CUDA_CHECK(cudaMemsetAsync(
         kv_batch_ids.data<int>(), 0, kv_batch_shape * sizeof(int32_t), stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(kv_tile_ids_per_batch.data<int>(),
-                        0,
-                        kv_batch_shape * sizeof(int32_t),
-                        stream));
+    CUDA_CHECK(cudaMemsetAsync(kv_tile_ids_per_batch.data<int>(),
+                               0,
+                               kv_batch_shape * sizeof(int32_t),
+                               stream));
     auto kv_num_blocks_x =
         GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
 
     split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
         seq_lens_decoder.data<int>(),
-        // sequence_lengths->data<int>(),
         seq_lens_encoder.data<int>(),
         kv_batch_ids.data<int>(),
         kv_tile_ids_per_batch.data<int>(),
@@ -428,16 +421,14 @@ void GetBlockShapeAndSplitKVBlock(
     const uint32_t encoder_max_tile_size_per_bs_q =
         div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
     const uint32_t encoder_batch_shape = bsz * encoder_max_tile_size_per_bs_q;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(encoder_batch_ids.data<int>(),
-                        0,
-                        encoder_batch_shape * sizeof(int32_t),
-                        stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(encoder_tile_ids_per_batch.data<int>(),
-                        0,
-                        encoder_batch_shape * sizeof(int32_t),
-                        stream));
+    CUDA_CHECK(cudaMemsetAsync(encoder_batch_ids.data<int>(),
+                               0,
+                               encoder_batch_shape * sizeof(int32_t),
+                               stream));
+    CUDA_CHECK(cudaMemsetAsync(encoder_tile_ids_per_batch.data<int>(),
+                               0,
+                               encoder_batch_shape * sizeof(int32_t),
+                               stream));
     auto encoder_num_blocks_x =
         GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
     split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(),
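Note on the pattern: the diff swaps Paddle's PADDLE_ENFORCE_GPU_SUCCESS wrapper for a CUDA_CHECK macro and additionally wraps the previously unchecked cudaGetDevice and cudaDeviceGetAttribute calls, so every CUDA runtime return code in this function is now inspected. The macro's definition lies outside these hunks; below is a minimal sketch of the conventional shape, assuming the project defines it along these lines (the identifier status_ and the abort-on-failure policy are illustrative, not taken from the source).

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical CUDA_CHECK sketch; the repository's actual definition is not
// shown in this diff. The do/while(0) wrapper keeps the macro usable as a
// single statement, e.g. inside an unbraced if/else.
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t status_ = (call);                                     \
    if (status_ != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error %d (%s) at %s:%d\n",           \
                   static_cast<int>(status_),                         \
                   cudaGetErrorString(status_), __FILE__, __LINE__);  \
      std::abort();                                                   \
    }                                                                 \
  } while (0)

With a definition of this shape, the call sites read exactly as in the hunks above, e.g. CUDA_CHECK(cudaGetDevice(&device));.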