-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FFTW Poisson solver: fixes for non-trivial tile geometry
Improve the handling of cases where the Poisson solver data is unevenly distributed over the tiles (because of divisibility, jtot not divisible by nprocx or itot not divisible by nprocy). * p201_flat was declared with wrong size (tripped debug build) * pe,qe may be less on the last tile in x or y, if so the tridiagonal solver may operate on uninitialized data (tripped debug build) * before FFT, zero the unused data on the last tile, to avoid operating on uninitialized data (tripped debug build) * increase size of xrt,yrt and pad with zeroes. Last tile may access outside the old bounds when the data is unevenly distributed (tripped debug build) * when filling xrt, yrt, don't assume symmetry around center. This will be needed for open boundaries, and handles odd sizes correctly.
- Loading branch information
Showing
1 changed file
with
25 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,8 @@ module modfftw | |
integer :: method | ||
integer :: konx, kony | ||
integer :: iony, jonx | ||
integer :: konx_last, kony_last | ||
integer :: iony_last, jonx_last | ||
real(pois_r), dimension(:), allocatable :: bufin, bufout | ||
interface fftw_plan_many_r2r_if | ||
procedure :: d_fftw_plan_many_r2r | ||
|
@@ -134,14 +136,20 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe) | |
jonx = jonx + 1 | ||
endif | ||
|
||
konx_last = kmax - konx*(nprocx-1) | ||
kony_last = kmax - kony*(nprocy-1) | ||
iony_last = itot - iony*(nprocy-1) | ||
jonx_last = jtot - jonx*(nprocx-1) | ||
|
||
! Allocate communication buffers for the transpose functions | ||
sz = max( imax * jmax * konx * nprocx, & ! transpose a1 | ||
iony * jmax * konx * nprocy, & ! transpose a2 | ||
iony * jonx * konx * nprocx ) ! transpose a3 | ||
|
||
allocate(bufin (sz)) | ||
allocate(bufout(sz)) | ||
|
||
bufin = 0 | ||
bufout = 0 | ||
! Allocate temporary arrays to hold transposed matrix | ||
|
||
! calculate memory size as an long int (size_t) | ||
|
@@ -164,7 +172,7 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe) | |
p210(1:itot,1:jmax,1:konx) => fptr(1:itot*jmax*konx) | ||
p210_flat(1:itot*jmax*konx)=> fptr(1:itot*jmax*konx) | ||
p201(1:jtot,1:konx,1:iony) => fptr(1:jtot*konx*iony) | ||
p201_flat(1:itot*konx*iony)=> fptr(1:itot*konx*iony) | ||
p201_flat(1:jtot*konx*iony)=> fptr(1:jtot*konx*iony) | ||
Fp(1:iony,1:jonx,1:kmax) => fptr(1:iony*jonx*kmax) | ||
|
||
! Prepare 1d FFT transforms | ||
|
@@ -254,6 +262,10 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe) | |
qs = 1 | ||
qe = jonx | ||
|
||
! special case for final task in x or y, which may have fewer elements | ||
if (myidx == nprocx-1) qe = jonx_last | ||
if (myidy == nprocy-1) pe = iony_last | ||
|
||
else if (method == 2) then | ||
|
||
! Make an array with the halo zone sliced off on the top and the left | ||
|
@@ -647,6 +659,10 @@ subroutine fftwf(p, Fp) | |
call fftw_execute_r2r_if(planx, p210_flat, p210_flat) | ||
|
||
call transpose_a2(p210, p201) | ||
! zero the unused part, avoinds SIGFPE from the FFT (Debug mode) | ||
! p201(jtot,konx,iony) | ||
if (myidx == nprocx-1) p201(:,konx_last+1:, :) = 0 | ||
if (myidy == nprocy-1) p201(:,:,iony_last+1:) = 0 | ||
call fftw_execute_r2r_if(plany, p201_flat, p201_flat) | ||
|
||
call transpose_a3(p201, Fp) | ||
|
@@ -692,7 +708,9 @@ subroutine fftwinit_factors(xyrt) | |
|
||
integer :: i,j,iv,jv | ||
real(pois_r) :: fac | ||
real(pois_r) :: xrt(itot), yrt(jtot) | ||
real(pois_r) :: xrt(nprocy*iony), yrt(nprocx*jonx) | ||
xrt = 0 | ||
yrt = 0 | ||
|
||
! Generate Eigenvalues xrt and yrt resulting from d**2/dx**2 F = a**2 F | ||
|
||
|
@@ -712,33 +730,25 @@ subroutine fftwinit_factors(xyrt) | |
|
||
! I --> direction | ||
fac = 1./(2.*itot) | ||
do i=2,(itot/2) | ||
do i=2,itot | ||
xrt(i)=-4.*dxi*dxi*(sin(float(2*(i-1))*pi*fac))**2 | ||
xrt(itot - i + 2) = xrt(i) | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
fjansson
Author
Contributor
|
||
end do | ||
xrt(1) = 0. | ||
if (mod(itot,2) == 0) then | ||
! Nyquist frequency | ||
xrt(1 + itot/2) = -4.*dxi*dxi | ||
endif | ||
|
||
! J --> direction | ||
fac = 1./(2.*jtot) | ||
do j=2,(jtot/2) | ||
do j=2,jtot | ||
yrt(j)=-4.*dyi*dyi*(sin(float(2*(j-1))*pi*fac))**2 | ||
yrt(jtot - j + 2) = yrt(j) | ||
end do | ||
yrt(1) = 0. | ||
if (mod(jtot,2) == 0) then | ||
! Nyquist frequency | ||
yrt(1 + jtot/2) = -4.*dyi*dyi | ||
endif | ||
|
||
! Combine I and J directions | ||
! Note that: | ||
! 1. MPI starts counting at 0 so it should be myidy * jmax | ||
! 2. real data, ie. no halo, starts at index 2 in the array xyrt(2,2) <-> xrt(1), yrt(1) | ||
|
||
! the last tile in x or y may have fewer elements than the rest | ||
! filling xyrt will then access xrt or yrt beyond itot or jtot. xrt and yrt are padded above with zeroes. | ||
if (method == 1) then | ||
! nprocx /= 1, nprocy /= 1 | ||
! we will be working on matrix Fp(1:iony,1:jonx,1:kmax) | ||
|
@fjansson I have not worked on this part of the code, but I wonder if it is correct to replace the mirroring here. That's one thing that stands out. Otherwise I don't see really strange things (I haven't tested things, this is just me looking quickly at this commit).