diff --git a/src/modfftw.f90 b/src/modfftw.f90
index 91fd950d..e9c24dbe 100644
--- a/src/modfftw.f90
+++ b/src/modfftw.f90
@@ -41,6 +41,8 @@ module modfftw
   integer                         :: method
   integer                         :: konx, kony
   integer                         :: iony, jonx
+  integer                         :: konx_last, kony_last
+  integer                         :: iony_last, jonx_last
   real(pois_r), dimension(:), allocatable :: bufin, bufout
   interface fftw_plan_many_r2r_if
     procedure :: d_fftw_plan_many_r2r
@@ -134,6 +136,11 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
       jonx = jonx + 1
     endif
 
+    konx_last = kmax - konx*(nprocx-1)
+    kony_last = kmax - kony*(nprocy-1)
+    iony_last = itot - iony*(nprocy-1)
+    jonx_last = jtot - jonx*(nprocx-1)
+
 ! Allocate communication buffers for the transpose functions
     sz = max( imax * jmax * konx * nprocx, & ! transpose a1
               iony * jmax * konx * nprocy, & ! transpose a2
@@ -141,7 +148,8 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
 
     allocate(bufin (sz))
     allocate(bufout(sz))
-
+    bufin = 0
+    bufout = 0
 ! Allocate temporary arrays to hold transposed matrix
 
     ! calculate memory size as an long int (size_t)
@@ -164,7 +172,7 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
     p210(1:itot,1:jmax,1:konx) => fptr(1:itot*jmax*konx)
     p210_flat(1:itot*jmax*konx)=> fptr(1:itot*jmax*konx)
     p201(1:jtot,1:konx,1:iony) => fptr(1:jtot*konx*iony)
-    p201_flat(1:itot*konx*iony)=> fptr(1:itot*konx*iony)
+    p201_flat(1:jtot*konx*iony)=> fptr(1:jtot*konx*iony)
     Fp(1:iony,1:jonx,1:kmax) => fptr(1:iony*jonx*kmax)
 
     ! Prepare 1d FFT transforms
@@ -254,6 +262,10 @@ subroutine fftwinit(p, Fp, d, xyrt, ps,pe,qs,qe)
     qs = 1
     qe = jonx
 
+    ! special case for final task in x or y, which may have fewer elements
+    if (myidx == nprocx-1) qe = jonx_last
+    if (myidy == nprocy-1) pe = iony_last
+
     else if (method == 2) then
 
     ! Make an array with the halo zone sliced off on the top and the left
@@ -647,6 +659,10 @@ subroutine fftwf(p, Fp)
       call fftw_execute_r2r_if(planx, p210_flat, p210_flat)
 
       call transpose_a2(p210, p201)
+      ! zero the unused part, avoinds SIGFPE from the FFT (Debug mode)
+      ! p201(jtot,konx,iony)
+      if (myidx == nprocx-1) p201(:,konx_last+1:, :) = 0
+      if (myidy == nprocy-1) p201(:,:,iony_last+1:)  = 0
       call fftw_execute_r2r_if(plany, p201_flat, p201_flat)
 
       call transpose_a3(p201, Fp)
@@ -692,7 +708,9 @@ subroutine fftwinit_factors(xyrt)
 
     integer :: i,j,iv,jv
     real(pois_r)    :: fac
-    real(pois_r)    :: xrt(itot), yrt(jtot)
+    real(pois_r)    :: xrt(nprocy*iony), yrt(nprocx*jonx)
+    xrt = 0
+    yrt = 0
 
   ! Generate Eigenvalues xrt and yrt resulting from d**2/dx**2 F = a**2 F
 
@@ -712,33 +730,25 @@ subroutine fftwinit_factors(xyrt)
 
   ! I --> direction
     fac = 1./(2.*itot)
-    do i=2,(itot/2)
+    do i=2,itot
       xrt(i)=-4.*dxi*dxi*(sin(float(2*(i-1))*pi*fac))**2
-      xrt(itot - i + 2) = xrt(i)
     end do
     xrt(1) = 0.
-    if (mod(itot,2) == 0) then
-      ! Nyquist frequency
-      xrt(1 + itot/2) = -4.*dxi*dxi
-    endif
 
   ! J --> direction
     fac = 1./(2.*jtot)
-    do j=2,(jtot/2)
+    do j=2,jtot
       yrt(j)=-4.*dyi*dyi*(sin(float(2*(j-1))*pi*fac))**2
-      yrt(jtot - j + 2) = yrt(j)
     end do
     yrt(1) = 0.
-    if (mod(jtot,2) == 0) then
-      ! Nyquist frequency
-      yrt(1 + jtot/2) = -4.*dyi*dyi
-    endif
 
   ! Combine I and J directions
   ! Note that:
   ! 1. MPI starts counting at 0 so it should be myidy * jmax
   ! 2. real data, ie. no halo, starts at index 2 in the array xyrt(2,2) <-> xrt(1), yrt(1)
 
+  ! the last tile in x or y may have fewer elements than the rest
+  ! filling xyrt will then access xrt or yrt beyond itot or jtot. xrt and yrt are padded above with zeroes.
     if (method == 1) then
       ! nprocx /= 1, nprocy /= 1
       ! we will be working on matrix Fp(1:iony,1:jonx,1:kmax)