diff --git a/src/alge/cs_gradient_cuda.cu b/src/alge/cs_gradient_cuda.cu index 011a6ed4bf..73d7aa0794 100644 --- a/src/alge/cs_gradient_cuda.cu +++ b/src/alge/cs_gradient_cuda.cu @@ -1838,7 +1838,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_face_cells); - // _compute_reconstruct_v_b_face_v2<<<gridsize, blocksize, 0, stream>>> + // _compute_reconstruct_v_b_face_v2<<<gridsize, blocksize, 0, stream>>> // ( n_b_faces * 3, // coefb_d, // coefa_d, @@ -1849,7 +1849,34 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // grad_d, // b_f_face_normal, // b_face_cells); + + // _compute_reconstruct_v_b_face_new<<<gridsize, blocksize, 0, stream>>> + // ( n_b_faces, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_face_cells, + // cell_b_faces_idx); + + _compute_reconstruct_v_b_face_new_v2<<<gridsize, blocksize, 0, stream>>> + ( n_b_faces * 3, + coefb_d, + coefa_d, + pvar_d, + inc, + diipb, + r_grad_d, + grad_d, + b_f_face_normal, + b_face_cells, + cell_b_faces_idx); + /*************************************Kernels Scatter conflict free************************************/ // _compute_reconstruct_v_b_face_cf<<<gridsize, blocksize, 0, stream>>> // ( n_b_faces, @@ -1863,7 +1890,7 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, // b_f_face_normal, // b_face_cells); - // _compute_reconstruct_v_b_face_v2_cf<<<gridsize, blocksize, 0, stream>>> + // _compute_reconstruct_v_b_face_v2_cf<<<gridsize, blocksize, 0, stream>>> // ( n_b_faces * 3, // coefb_d, // coefa_d, @@ -1938,19 +1965,19 @@ cs_reconstruct_vector_gradient_cuda(const cs_mesh_t *m, /*************************************Kernels Gather shared memory***************************************/ - _compute_reconstruct_v_b_face_gather_v5<<<gridsize, blocksize, 0, stream>>> - ( n_b_cells, - coefb_d, - coefa_d, - pvar_d, - inc, - diipb, - r_grad_d, - grad_d, - b_f_face_normal, - b_cells, - cell_b_faces, - cell_b_faces_idx); + // _compute_reconstruct_v_b_face_gather_v5<<<gridsize, blocksize, 0, stream>>> + // ( n_b_cells, + // coefb_d, + // coefa_d, + // pvar_d, + // inc, + // diipb, + // r_grad_d, + // grad_d, + // b_f_face_normal, + // b_cells, + // cell_b_faces, + // cell_b_faces_idx);
CS_CUDA_CHECK(cudaEventRecord(b_faces_2, stream)); diff --git a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh index 025d370cb5..0092c7e212 100644 --- a/src/alge/cs_reconstruct_vector_gradient_scatter.cuh +++ b/src/alge/cs_reconstruct_vector_gradient_scatter.cuh @@ -123,6 +123,126 @@ _compute_reconstruct_v_b_face(cs_lnum_t n_b_faces, } } + +/* One thread per boundary face: accumulates the face's reconstruction contribution into grad[] of its adjacent cell; atomicAdd is used only when cell_b_faces_idx (presumably a per-cell CSR index of boundary faces -- TODO confirm) shows the cell has more than one boundary face, i.e. concurrent updates are possible. Expects a 1D launch with >= n_b_faces threads. */ +__global__ static void +_compute_reconstruct_v_b_face_new(cs_lnum_t n_b_faces, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t *restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t f_id = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_id >= n_b_faces){ + return; + } + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac, g_cij; + + c_id = b_face_cells[f_id]; + + int need_atomic = cell_b_faces_idx[c_id+1] - cell_b_faces_idx[c_id] - 1; /* nonzero if the cell has several boundary faces */ + + for (cs_lnum_t i = 0; i < 3; i++) { + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + r_grad[c_id][k][1] * diipb[f_id][1] + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + for (cs_lnum_t j = 0; j < 3; j++){ + g_cij = (pfac + rfac) * b_f_face_normal[f_id][j]; + if (need_atomic) { + atomicAdd(&grad[c_id][i][j], g_cij); + } + else { + grad[c_id][i][j] += g_cij; /* single writer for this cell: plain add is safe */ + } + } + } +} + + +/* Same contribution as _compute_reconstruct_v_b_face_new, but one thread per (face, gradient-row) pair: launch with >= n_b_faces * 3 threads and pass n_b_faces * 3 as the first argument. */ + +__global__ static void +_compute_reconstruct_v_b_face_new_v2(cs_lnum_t n_b_faces, + const cs_real_33_t *restrict coefbv, + const cs_real_3_t *restrict coefav, + const cs_real_3_t *restrict pvar, + int inc, + const cs_real_3_t 
*restrict diipb, + const cs_real_33_t *restrict r_grad, + cs_real_33_t *restrict grad, + const cs_real_3_t *restrict b_f_face_normal, + const cs_lnum_t *restrict b_face_cells, + const cs_lnum_t *restrict cell_b_faces_idx) +{ + cs_lnum_t f_idt = blockIdx.x * blockDim.x + threadIdx.x; + + if(f_idt >= n_b_faces){ + return; + } + + size_t f_id = f_idt / 3; + size_t i = f_idt % 3; /* gradient row handled by this thread */ + + cs_lnum_t c_id; + cs_real_t pfac, rfac, vecfac, g_cij; + + // if (coupled_faces[f_id * cpl_stride]) + // return; + + c_id = b_face_cells[f_id]; + + int need_atomic = cell_b_faces_idx[c_id+1] - cell_b_faces_idx[c_id] - 1; /* nonzero if the cell has several boundary faces */ + + pfac = inc*coefav[f_id][i]; + + for (cs_lnum_t k = 0; k < 3; k++){ + pfac += coefbv[f_id][i][k] * pvar[c_id][k]; + } + + pfac -= pvar[c_id][i]; + + /* Reconstruction part */ + rfac = 0.; + for (cs_lnum_t k = 0; k < 3; k++) { + vecfac = r_grad[c_id][k][0] * diipb[f_id][0] + r_grad[c_id][k][1] * diipb[f_id][1] + r_grad[c_id][k][2] * diipb[f_id][2]; + rfac += coefbv[f_id][i][k] * vecfac; + } + + for (cs_lnum_t j = 0; j < 3; j++){ + g_cij = (pfac + rfac) * b_f_face_normal[f_id][j]; + if (need_atomic) { + atomicAdd(&grad[c_id][i][j], g_cij); + } + else { + grad[c_id][i][j] += g_cij; /* different i => disjoint grad rows, no race among the 3 threads of this face */ + } + } + +} + __global__ static void _compute_reconstruct_correction(cs_lnum_t n_cells, cs_lnum_t has_dc,