From d45527e5d08eb58acf430bbe4401b270e96ffcf7 Mon Sep 17 00:00:00 2001 From: Fredrik Jansson Date: Fri, 17 May 2024 14:45:06 +0200 Subject: [PATCH] modfftw: fix zeroing of unused parts of p201 one more fix of handling the un-used part of the FFTW data. Note: there may be several processors with no elements to transform. Don't rely on being the last in a row or column, instead determine the number of own elements (jonx_me, jony_me) and use those where appropriate. --- src/modfftw.f90 | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/modfftw.f90 b/src/modfftw.f90 index 12fc5f60..020f9d70 100644 --- a/src/modfftw.f90 +++ b/src/modfftw.f90 @@ -41,8 +41,8 @@ module modfftw integer :: method integer :: konx, kony integer :: iony, jonx - integer :: konx_last, kony_last - integer :: iony_last, jonx_last + integer :: konx_me, kony_me + integer :: iony_me, jonx_me real(pois_r), dimension(:), allocatable :: bufin, bufout interface fftw_plan_many_r2r_if procedure :: d_fftw_plan_many_r2r @@ -136,10 +136,13 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe) jonx = jonx + 1 endif - konx_last = kmax - konx*(nprocx-1) - kony_last = kmax - kony*(nprocy-1) - iony_last = itot - iony*(nprocy-1) - jonx_last = jtot - jonx*(nprocx-1) + ! how many data elements are in use in the current process? + ! if the number of elements is not divisible by nprocx or nprocy, + ! one process may have fewer elements, and some processes may have *no* elements + konx_me = max(min(konx, kmax - konx*myidx), 0) + kony_me = max(min(kony, kmax - kony*myidy), 0) + iony_me = max(min(iony, itot - iony*myidy), 0) + jonx_me = max(min(jonx, jtot - jonx*myidx), 0) ! Allocate communication buffers for the transpose functions sz = max( imax * jmax * konx * nprocx, & ! transpose a1 @@ -258,13 +261,9 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe) allocate(xyrt(iony,jonx)) allocate(d(iony,jonx,kmax)) ps = 1 - pe = iony + pe = iony_me qs = 1 - qe = jonx - - ! special case for final task in x or y, which may have fewer elements - if (myidx == nprocx-1) qe = jonx_last - if (myidy == nprocy-1) pe = iony_last + qe = jonx_me else if (method == 2) then @@ -660,9 +659,10 @@ subroutine fftwf(p, Fp) call transpose_a2(p210, p201) ! zero the unused part, avoinds SIGFPE from the FFT (Debug mode) - if (myidx == nprocx-1 .and. konx_last < konx) p201(:,konx_last+1:, :) = 0 - if (myidy == nprocy-1 .and. iony_last < iony) p201(:,:,iony_last+1:) = 0 - ! p201(jtot,konx,iony) + ! indexing: p201(jtot,konx,iony) + if (konx_me < konx) p201(:,konx_me+1:, :) = 0 + if (iony_me < iony) p201(:,:,iony_me+1:) = 0 + call fftw_execute_r2r_if(plany, p201_flat, p201_flat)