Parallel Spectral Numerical Methods/GPU programs for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrodinger and sine Gordon equations
This section includes the programs taken from a conference paper by Cloutier, Muite and Rigge[1]. The main purpose is to give example programs which show how to use graphics processing units (GPUs) to solve partial differential equations using Fourier methods. For further background on GPUs and programming models for GPUs see Cloutier, Muite and Rigge[2]. It should be noted that the algorithms used for the sine Gordon equation are very similar to those for the Klein Gordon equation discussed elsewhere in this tutorial. For consistency with the rest of the tutorial, only programs using CUDA Fortran and OpenACC extensions to Fortran are included although Cloutier, Muite and Rigge[3] also has CUDA C programs. GPUs enable acceleration of Fourier pseudospectral codes by factors of 10 compared to OpenMP parallelizations on a single 8 core node.
These programs use the Crank-Nicolson method.
|
( |
!-------------------------------------------------------------------- ! ! PURPOSE ! ! This program numerically solves the 2D incompressible Navier-Stokes ! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and ! Crank-Nicolson timestepping. The numerical solution is compared to ! the exact Taylor-Green Vortex Solution. ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! Periodic free-slip boundary conditions and Initial conditions: ! u(x,y,0)=sin(2*pi*x)cos(2*pi*y) ! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y) ! Analytical Solution (subscript denote derivatives): ! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re) ! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re) ! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re) ! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re) ! omega=v_x-u_y ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! nplots = number of plots produced ! plotgap = number of timesteps inbetween plots ! Re = dimensionless Renold's number ! ReInv = 1/Re for optimization ! dt = timestep size ! dtInv = 1/dt for optimization ! tol = determines when convergences is reached ! numthreads = number of CPUs used in calculation ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! time = times at which data is saved ! chg = error at each iteration ! .. Arrays (gpu) .. ! omeg_d = vorticity in real space ! omeghat_d = 2D Fourier transform of vorticity ! at next iterate ! omegoldhat_d = 2D Fourier transform of vorticity at previous ! iterate ! nloldhat_d = nonlinear term in Fourier space ! at previous iterate ! psihat_d = 2D Fourier transform of streamfunction ! at next iteration ! temp1_d/temp2_d/temp3_d = reusable complex/real space used for ! calculations. This reduces number of ! arrays stored. ! .. Vectors (gpu) .. ! kx_d = fourier frequencies in x direction ! ky_d = fourier frequencies in y direction ! x_d = x locations ! y_d = y locations ! name_config = array to store filename for data to be saved ! REFERENCES ! ! ACKNOWLEDGEMENTS ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! This program has not been fully optimized. !-------------------------------------------------------------------- module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy interface cufftExecD2Z subroutine cufftExecD2Z(plan, idata, odata) & & bind(C,name='cufftExecD2Z') use iso_c_binding use precision integer(c_int), value :: plan real(fp_kind), device :: idata(1:nx,1:ny) complex(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecD2Z end interface cufftExecD2Z interface cufftExecZ2D subroutine cufftExecZ2D(plan, idata, odata) & & bind(C,name='cufftExecZ2D') use iso_c_binding use precision integer(c_int),value:: plan complex(fp_kind),device:: idata(1:nx,1:ny) real(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecZ2D end interface cufftExecZ2D end module cufft PROGRAM main use precision use cufft ! declare variables IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=4096 INTEGER(kind=4), PARAMETER :: Ny=4096 INTEGER(kind=8) :: temp=10000000 REAL(fp_kind), PARAMETER :: dt=0.000125d0 !dt=0.000002d0 REAL(fp_kind), PARAMETER :: dtInv=1.0d0/REAL(dt,kind(0d0)) REAL(fp_kind), PARAMETER & :: pi=3.14159265358979323846264338327950288419716939937510d0 REAL(fp_kind), PARAMETER :: Re=1.0d0 REAL(fp_kind), PARAMETER :: ReInv=1.0d0/REAL(Re,kind(0d0)) REAL(fp_kind), PARAMETER :: tol=0.1d0**10 REAL(fp_kind) :: scalemodes,chg INTEGER(kind=4), PARAMETER :: nplots=1,plotgap=20 REAL(fp_kind),DIMENSION(:), ALLOCATABLE :: x,y REAL(fp_kind),DIMENSION(:,:), ALLOCATABLE :: omeg,omegexact INTEGER(kind=4) :: i,j,n,t, AllocateStatus INTEGER(kind=4) :: planz2d,pland2z, kersize !variables used for saving data and timing INTEGER(kind=4) :: start, finish, count_rate,count, iol CHARACTER*100 :: name_config ! Declare variables for GPU REAL(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omeg_d,nl_d, temp2_d,& temp3_d COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: omegoldhat_d, nloldhat_d,& omeghat_d, nlhat_d, psihat_d,& temp1_d COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d REAL(kind=8),DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d kersize=min(Nx,256) PRINT *,'Program starting' PRINT *,'Grid:',Nx,'X',Ny PRINT *,'dt:',dt ALLOCATE(x(1:Nx),y(1:Ny),omeg(1:Nx,1:Ny),omegexact(1:Nx,1:Ny),& stat=AllocateStatus) IF (AllocateStatus .ne. 0) STOP PRINT *,'Allocated CPU arrays' ALLOCATE(kx_d(1:Nx/2+1),ky_d(1:Ny),x_d(1:Nx),y_d(1:Ny),omeg_d(1:Nx,1:Ny),& omegoldhat_d(1:Nx/2+1,1:Ny),nloldhat_d(1:Nx/2+1,1:Ny),& omeghat_d(1:Nx/2+1,1:Ny),nl_d(1:Nx,1:Ny),& nlhat_d(1:Nx/2+1,1:Ny),psihat_d(1:Nx/2+1,1:Ny),temp1_d(1:Nx/2+1,1:Ny),& temp2_d(1:Nx,1:Ny),temp3_d(1:Nx,1:Ny),stat=AllocateStatus) IF (AllocateStatus .ne. 0) STOP PRINT *,'Allocated GPU arrays' CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z) CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D) PRINT *,'Setup FFTs' ! setup fourier frequencies !$cuf kernel do <<< *,* >>> DO i=1,Nx/2+1 kx_d(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind=fp_kind) END DO kx_d(1+Nx/2)=0.0d0 !$cuf kernel do <<< *,* >>> DO i=1,Nx x_d(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind) END DO !$cuf kernel do <<< *,* >>> DO j=1,Ny/2+1 ky_d(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind=fp_kind) END DO ky_d(1+Ny/2)=0.0d0 !$cuf kernel do <<< *,* >>> DO j = 1,Ny/2 -1 ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2) END DO !$cuf kernel do <<< *, * >>> DO j=1,Ny y_d(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind) END DO scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind) PRINT *,'Setup grid and fourier frequencies' !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx omeg_d(i,j)=4.0d0*pi*sin(2.0d0*pi*x_d(i))*sin(2.0d0*pi*y_d(j))!+0.01d0*cos(2.0d0*pi*y_d(j)) END DO END DO CALL cufftExecD2Z(pland2z,omeg_d,omeghat_d) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !get initial nonlinear term using omeghat to find psihat, u, and v! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14) END DO END DO !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=omeghat_d(i,j)*kx_d(i) END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j) END DO END DO !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=omeghat_d(i,j)*ky_d(j) END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y !combine to get full nonlinear term in real space !$cuf kernel do <<< *,* >>> DO j=1,Ny DO i=1,Nx nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes END DO END DO CALL cufftExecD2Z(pland2z,nl_d,nlhat_d) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! temp2_d=omeg_d !omegacheck PRINT *,'Got initial data, starting timestepping' CALL system_clock(start,count_rate) DO t=1,nplots DO n=1,plotgap chg=1.0d0 nloldhat_d=nlhat_d omegoldhat_d=omeghat_d DO WHILE (chg>tol) !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 omeghat_d(i,j)=((dtInv+0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)))& *omegoldhat_d(i,j) - 0.5d0*(nloldhat_d(i,j)+nlhat_d(i,j)))& /(dtInv-0.5d0*ReInv*(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j))) END DO END DO !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 psihat_d(i,j)=-omeghat_d(i,j)/(kx_d(i)*kx_d(i)+ky_d(j)*ky_d(j)+0.10d0**14) END DO END DO CALL cufftExecZ2D(planz2d,omeghat_d,omeg_d) !check for convergence chg=0.0d0 !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx chg=chg+(omeg_d(i,j)-temp2_d(i,j))*(omeg_d(i,j)-temp2_d(i,j))& *scalemodes*scalemodes END DO END DO !!!!!!!!!!!!!!!! !nonlinear term! !!!!!!!!!!!!!!!! !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=psihat_d(i,j)*ky_d(j)*scalemodes END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !u !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=omeghat_d(i,j)*kx_d(i) END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_x !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx nl_d(i,j)=temp3_d(i,j)*temp2_d(i,j) END DO END DO !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=-psihat_d(i,j)*kx_d(i)*scalemodes END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp3_d) !v !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 temp1_d(i,j)=omeghat_d(i,j)*ky_d(j) END DO END DO CALL cufftExecZ2D(planz2d,temp1_d,temp2_d) !omega_y !combine to get full nonlinear term in real space !$cuf kernel do(2) <<< (2,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx nl_d(i,j)=(nl_d(i,j)+temp3_d(i,j)*temp2_d(i,j))*scalemodes END DO END DO CALL cufftExecD2Z(pland2z,nl_d,nlhat_d) !!!!!!!!!!!!!!!! temp2_d=omeg_d !save omegacheck END DO END DO !PRINT *, t*plotgap*dt END DO CALL system_clock(finish,count_rate) PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),& 'for Time stepping' ! Copy grid back to host x=x_d y=y_d omeg=omeg_d !get exact omega DO j=1,Ny DO i=1,Nx omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*& sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt) END DO END DO !compute max error PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact)) temp=temp+1 write(name_config,'(a,i0,a)') 'omega',temp,'.datbin' INQUIRE(iolength=iol) omeg(1,1) OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol) count = 1 DO j=1,Ny DO i=1,Nx WRITE(11,rec=count) omeg(i,j)*scalemodes count=count+1 END DO END DO CLOSE(11) CALL cufftDestroy(planz2d) CALL cufftDestroy(pland2z) PRINT *,'Destroyed fft plan' DEALLOCATE(x,y,omeg,omegexact,stat=AllocateStatus) IF (AllocateStatus .ne. 0) STOP PRINT *,'Deallocated CPU memory' DEALLOCATE(kx_d,ky_d,x_d,y_d,& omeg_d,omegoldhat_d, nloldhat_d,omeghat_d,& nl_d, nlhat_d,temp1_d,temp2_d,temp3_d,& psihat_d,stat=AllocateStatus) IF (AllocateStatus .ne. 0) STOP PRINT *,'Deallocated GPU memory' PRINT *,'Program execution complete' END PROGRAM main
|
( |
!-------------------------------------------------------------------- ! ! PURPOSE ! ! This program numerically solves the 2D incompressible Navier-Stokes ! on a Square Domain [0,1]x[0,1] using pseudo-spectral methods and ! Crank-Nicolson timestepping. The numerical solution is compared to ! the exact Taylor-Green Vortex Solution. ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! Periodic free-slip boundary conditions and Initial conditions: ! u(x,y,0)=sin(2*pi*x)cos(2*pi*y) ! v(x,y,0)=-cos(2*pi*x)sin(2*pi*y) ! Analytical Solution (subscript denote derivatives): ! u(x,y,t)=sin(2*pi*x)cos(2*pi*y)exp(-8*pi^2*t/Re) ! v(x,y,t)=-cos(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re) ! u_y(x,y,t)=-2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re) ! v_x(x,y,t)=2*pi*sin(2*pi*x)sin(2*pi*y)exp(-8*pi^2*t/Re) ! omega=v_x-u_y ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! nplots = number of plots produced ! plotgap = number of timesteps inbetween plots ! Re = dimensionless Renold's number ! ReInv = 1/Re for optimization ! dt = timestep size ! dtInv = 1/dt for optimization ! tol = determines when convergences is reached ! scalemodes = 1/(Nx*Ny), scaling after preforming FFTs ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! time = times at which data is saved ! chg = error at each iteration ! .. Arrays .. ! omeg = vorticity in real space ! omeghat = 2D Fourier transform of vorticity ! at next iterate ! omegoldhat = 2D Fourier transform of vorticity at previous ! iterate ! nl = nonlinear term ! nlhat = nonlinear term in Fourier space ! nloldhat = nonlinear term in Fourier space ! at previous iterate ! omegexact = taylor-green vorticity at ! at final step ! psihat = 2D Fourier transform of streamfunction ! at next iteration ! temp1/temp2/temp3= reusable complex/real space used for ! calculations. This reduces number of ! arrays stored. ! .. Vectors .. ! kx = fourier frequencies in x direction ! ky = fourier frequencies in y direction ! x = x locations ! y = y locations ! name_config = array to store filename for data to be saved ! REFERENCES ! ! ACKNOWLEDGEMENTS ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! Check that the initial iterate is consistent with the ! boundary conditions for the domain specified !-------------------------------------------------------------------- ! External libraries required ! Cuda FFT ! OpenACC ! FFTW3 -- Fastest Fourier Transform in the West ! (http://www.fftw.org/) ! OpenMP module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy interface cufftExecD2Z subroutine cufftExecD2Z(plan, idata, odata) & & bind(C,name='cufftExecD2Z') use iso_c_binding use precision integer(c_int), value :: plan real(fp_kind), device :: idata(1:nx,1:ny) complex(fp_kind),device :: odata(1:nx/2+1,1:ny) end subroutine cufftExecD2Z end interface cufftExecD2Z interface cufftExecZ2D subroutine cufftExecZ2D(plan, idata, odata) & & bind(C,name='cufftExecZ2D') use iso_c_binding use precision integer(c_int),value:: plan complex(fp_kind),device:: idata(1:nx/2+1,1:ny) real(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecZ2D end interface cufftExecZ2D end module cufft PROGRAM main USE precision USE cufft USE openacc IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=512 INTEGER(kind=4), PARAMETER :: Ny=512 REAL(kind=8), PARAMETER :: dt=0.000125d0 REAL(kind=8), PARAMETER :: dtInv=1.0d0/REAL(dt,kind(0d0)) REAL(kind=8), PARAMETER & :: pi=3.14159265358979323846264338327950288419716939937510d0 REAL(kind=8), PARAMETER :: Re=1.0d0 REAL(kind=8), PARAMETER :: ReInv=1.0d0/REAL(Re,kind(0d0)) REAL(kind=8), PARAMETER :: tol=0.1d0**10 REAL(kind=8) :: scalemodes REAL(kind=8) :: chg INTEGER(kind=4), PARAMETER :: nplots=1, plotgap=20 COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y,time REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: omeg,nl, temp2, temp3, omegexact COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: omegoldhat, nloldhat,& omeghat,nlhat, psihat,temp1 INTEGER(kind=4) :: i,j,n,t, allocatestatus INTEGER(kind=4) :: pland2z,planz2d INTEGER(kind=4) :: count, iol CHARACTER*100 :: name_config INTEGER(kind=4) :: start, finish, count_rate INTEGER(kind=4) :: ierr,vecsize,gangsize INTEGER(kind=8) :: planfxy,planbxy vecsize=32 gangsize=16 PRINT *,'Grid:',Nx,'X',Ny PRINT *,'dt:',dt ALLOCATE(time(1:nplots+1),kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),& omeg(1:Nx,1:Ny),omegoldhat(1:Nx/2+1,1:Ny),& nloldhat(1:Nx/2+1,1:Ny),temp3(1:Nx,1:Ny),omeghat(1:Nx/2+1,1:Ny),& nl(1:Nx,1:Ny),nlhat(1:Nx/2+1,1:Ny), psihat(1:Nx/2+1,1:Ny),& temp1(1:Nx/2+1,1:Ny),omegexact(1:Nx,1:Ny),temp2(1:Nx,1:Ny),& stat=AllocateStatus) IF (AllocateStatus .ne. 0) STOP PRINT *,'allocated space' CALL cufftPlan2D(pland2z,nx,ny,CUFFT_D2Z) CALL cufftPlan2D(planz2d,nx,ny,CUFFT_Z2D) PRINT *,'Setup 2D FFTs' ! setup fourier frequencies in x-direction !$acc data copy(kx,ky,x,y,time,temp3,omeg,nl,temp1,temp2,omegoldhat,nloldhat,omeghat,nlhat,psihat) PRINT *, 'Copied arrays over to device' !$acc kernels loop DO i=1,Nx/2+1 kx(i)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0)) END DO !$acc end kernels kx(1+Nx/2)=0.0d0 !$acc kernels loop DO i = 1,Nx/2 -1 kx(i+1+Nx/2)=-kx(1-i+Nx/2) END DO !$acc end kernels !$acc kernels loop DO i=1,Nx x(i)=REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)) END DO !$acc end kernels ! setup fourier frequencies in y-direction !$acc kernels loop DO j=1,Ny/2+1 ky(j)= 2.0d0*pi*cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0)) END DO !$acc end kernels ky(1+Ny/2)=0.0d0 !$acc kernels loop DO j = 1,Ny/2 -1 ky(j+1+Ny/2)=-ky(1-j+Ny/2) END DO !$acc end kernels !$acc kernels loop DO j=1,Ny y(j)=REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)) END DO !$acc end kernels scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0)) PRINT *,'Setup grid and fourier frequencies' !initial data !$acc kernels loop DO j=1,Ny DO i=1,NX omeg(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*sin(2.0d0*pi*y(j))!+0.01d0*cos(2.0d0*pi*y(j)) END DO END DO !$acc end kernels !\hat{\omega^{n,k}} CALL cufftExecD2Z(pland2z,omeg,omeghat) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !get initial nonlinear term using omeghat, psihat, u, and v! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !\hat{\psi^{n+1,k+1}} !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14) END DO END DO !$acc end kernels !\omega^{n+1,k+1} CALL cufftExecZ2D(planz2d,omeghat,omeg) !get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=psihat(i,j)*ky(j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) !u ! \hat{\omega_x^{n,k}} !$acc kernels loop DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=omeghat(i,j)*kx(i) END DO END DO !$acc end kernels ! \omega_x^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! first part nonlinear term in real space !$acc kernels loop DO j=1,Ny DO i=1,Nx nl(i,j)=temp3(i,j)*temp2(i,j) END DO END DO !$acc end kernels !get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) !v ! \hat{\omega_y^{n,k}} !$acc kernels loop DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=omeghat(i,j)*ky(j) END DO END DO !$acc end kernels ! \omega_y^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! get the rest of nonlinear term in real space !$acc kernels loop DO j=1,Ny DO i=1,Nx nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes END DO END DO !$acc end kernels ! transform nonlinear term into fourier space CALL cufftExecD2Z(pland2z,nl,nlhat) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=omeg(i,j) END DO END DO !$acc end kernels PRINT *,'Got initial data, starting timestepping' time(1)=0.0d0 CALL system_clock(start,count_rate) DO t=1,nplots DO n=1,plotgap chg=1.0d0 ! save old values(__^{n,k} terms in equation) !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 nloldhat(i,j)=nlhat(i,j) END DO END DO !$acc end kernels !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 omegoldhat(i,j)=omeghat(i,j) END DO END DO !$acc end kernels DO WHILE (chg>tol) !Crank-Nicolson timestepping to get \hat{\omega^{n+1,k+1}} !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 omeghat(i,j)=( (dtInv+0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j)))& *omegoldhat(i,j) - 0.5d0*(nloldhat(i,j)+nlhat(i,j)))/& (dtInv-0.5d0*ReInv*(kx(i)*kx(i)+ky(j)*ky(j))) END DO END DO !$acc end kernels CALL cufftExecZ2D(planz2d,omeghat,omeg) ! check for convergence chg=0.0d0 !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx chg=chg+(omeg(i,j)-temp2(i,j))*(omeg(i,j)-temp2(i,j))& *scalemodes*scalemodes END DO END DO !$acc end kernels !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !get nonlinear term using omeghat, psihat, u, and v! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !\hat{\psi^{n+1,k+1}} !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 psihat(i,j)=-omeghat(i,j)/(kx(i)*kx(i)+ky(j)*ky(j) + 0.1d0**14) END DO END DO !$acc end kernels !\omega^{n+1,k+1} CALL cufftExecZ2D(planz2d,omeghat,omeg) !get \hat{\psi_y^{n+1,k+1}} used to get u, NOTE: u=\psi_y !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=psihat(i,j)*ky(j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) !u ! \hat{\omega_x^{n,k}} !$acc kernels loop DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=omeghat(i,j)*kx(i) END DO END DO !$acc end kernels ! \omega_x^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! first part nonlinear term in real space !$acc kernels loop DO j=1,Ny DO i=1,Nx nl(i,j)=temp3(i,j)*temp2(i,j) END DO END DO !$acc end kernels !get \hat{\psi_x^{n+1,k+1}} used to get v, NOTE: v=-\psi_x !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=-psihat(i,j)*kx(i)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2D(planz2d,temp1,temp3) ! \hat{\omega_y^{n,k}} !$acc kernels loop DO j=1,Ny DO i=1,Nx/2+1 temp1(i,j)=omeghat(i,j)*ky(j) END DO END DO !$acc end kernels ! \omega_y^{n,k} CALL cufftExecZ2D(planz2d,temp1,temp2) ! get the rest of nonlinear term in real space !$acc kernels loop DO j=1,Ny DO i=1,Nx nl(i,j)=(nl(i,j)+temp3(i,j)*temp2(i,j))*scalemodes END DO END DO !$acc end kernels ! transform nonlinear term into fourier space CALL cufftExecD2Z(pland2z,nl,nlhat) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !\omega^{n+1,k+1} is saved for next iteration !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx temp2(i,j)=omeg(i,j) END DO END DO !$acc end kernels END DO END DO time(t+1)=time(t)+dt*plotgap !PRINT *, time(t+1) END DO CALL system_clock(finish,count_rate) PRINT*,'Program took ',REAL(finish-start)/REAL(count_rate),& 'for Time stepping' !get exact omega !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx omegexact(i,j)=4.0d0*pi*sin(2.0d0*pi*x(i))*& sin(2.0d0*pi*y(j))*exp(-8.0d0*ReInv*pi**2*nplots*plotgap*dt) END DO END DO !$acc end kernels !$acc end data !compute max error PRINT *,'Max Error:',maxval(abs(omeg*scalemodes-omegexact)) !!!!!!!!!!!!!!!!!!!!!!!! !copy over data to disk! !!!!!!!!!!!!!!!!!!!!!!!! write(name_config,'(a,i0,a)') 'omega',1,'.datbin' INQUIRE(iolength=iol) omeg(1,1) OPEN(unit=11,FILE=name_config,form="unformatted", access="direct",recl=iol) count = 1 DO j=1,Ny DO i=1,Nx WRITE(11,rec=count) omeg(i,j)*scalemodes count=count+1 END DO END DO CLOSE(11) name_config = 'time.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO i=1,Nplots+1 WRITE(11,*) time(i) END DO CLOSE(11) name_config = 'xcoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO i=1,Nx WRITE(11,*) x(i) END DO CLOSE(11) name_config = 'ycoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny WRITE(11,*) y(j) END DO CLOSE(11) !!!!!!!!!!!!!!!!!!!!!!!! CALL cufftDestroy(pland2z) CALL cufftDestroy(planz2d) DEALLOCATE(time,temp1,temp2,temp3,kx,ky,x,y,& omeg,omegoldhat,omegexact, nloldhat,& omeghat,nl, nlhat, psihat,& stat=AllocateStatus) IF (AllocateStatus .ne. 0) STOP PRINT *,'Program execution complete' END PROGRAM main
2D Cubic Nonlinear Schrödinger Equations
These programs use splitting.
|
( |
!--------------------------------------------------------------------
!
! PURPOSE
!
! This program solves nonlinear Schrodinger equation in 2 dimensions
! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0
! using a second order time spectral splitting scheme
!
! The boundary conditions are u(x=0,y)=u(x=2*L*\pi,y)
! and u(x,y=0)=u(x,y=2*L*\pi)
! The initial condition is u=exp(-x^2-y^2)
!
! AUTHORS
!
! B. Cloutier, B.K. Muite, P. Rigge
! 4 June 2012
!
! .. Parameters ..
! Nx = number of modes in x - power of 2 for FFT
! Ny = number of modes in y - power of 2 for FFT
! Nt = number of timesteps to take
! Tmax = maximum simulation time
! plotgap = number of timesteps between plots
! pi = 3.14159265358979323846264338327950288419716939937510d0
! L = width of box
! ES = +1 for focusing and -1 for defocusing
! .. Scalars ..
! i = loop counter in x direction
! j = loop counter in y direction
! n = loop counter for timesteps direction
! allocatestatus = error indicator during allocation
! start = variable to record start time of program
! finish = variable to record end time of program
! count_rate = variable for clock count rate
! plan = fft plan
! dt = timestep
! InMass = initial mass
! FiMass = final mass
! InEner = initial energy
! FiEner = final energy
! .. Arrays ..
! u = approximate solution
! v = Fourier transform of approximate solution
! u_d = approximate solution on device
! v_d = Fourier transform of approximate solution on device
! temp1_d = temporary array used to find mass and energy
! temp2_d = temporary array used to find mass and energy
! .. Vectors ..
! kx = fourier frequencies in x direction
! ky = fourier frequencies in y direction
! x = x locations
! y = y locations
! time = times at which save data
! name_config = array to store filename for data to be saved
!
! REFERENCES
!
! ACKNOWLEDGEMENTS
!
! This program is based on example code to demonstrate usage of Fortran and
! CUDA FFT routines taken from
! http://cudamusing.blogspot.com/2010/05/calling-cufft-from-cuda-fortran.html
!
! and
!
! http://cudamusing.blogspot.com/search?q=cublas
!
! ACCURACY
!
! ERROR INDICATORS AND WARNINGS
!
! FURTHER COMMENTS
! Check that the initial iterate is consistent with the
! boundary conditions for the domain specified
!--------------------------------------------------------------------
! External routines required
!
! External libraries required
! cufft -- Cuda FFT library
!
!
! Define the INTERFACE to the NVIDIA CUFFT routines
!
module precision
! Precision control
integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision
integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single
end module precision
module cufft
integer, public :: CUFFT_FORWARD = -1
integer, public :: CUFFT_INVERSE = 1
integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved)
integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real
integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved
integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex
integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double
integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftPlan2d
subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d')
use iso_c_binding
integer(c_int):: plan
integer(c_int),value:: nx, ny, type
end subroutine cufftPlan2d
end interface cufftPlan2d
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftDestroy(cufftHandle plan)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftDestroy
subroutine cufftDestroy(plan) bind(C,name='cufftDestroy')
use iso_c_binding
integer(c_int),value:: plan
end subroutine cufftDestroy
end interface cufftDestroy
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!
! cufftExecZ2Z(cufftHandle plan,
! cufftDoubleComplex *idata,
! cufftDoubleComplex *odata,
! int direction;
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
interface cufftExecZ2Z
subroutine cufftExecZ2Z(plan, idata, odata, direction) &
& bind(C,name='cufftExecZ2Z')
use iso_c_binding
use precision
integer(c_int),value:: direction
integer(c_int),value:: plan
complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata
end subroutine cufftExecZ2Z
end interface cufftExecZ2Z
end module cufft
PROGRAM main
use precision
use cufft
! Declare host variables and scalars
IMPLICIT NONE
INTEGER(kind=4), PARAMETER :: Nx=1024
INTEGER(kind=4), PARAMETER :: Ny=1024
INTEGER(kind=4), PARAMETER :: Nt=20
INTEGER(kind=4), PARAMETER :: plotgap=5
REAL(fp_kind), PARAMETER &
:: pi=3.14159265358979323846264338327950288419716939937510d0
REAL(fp_kind), PARAMETER :: Lx=5.0d0
REAL(fp_kind), PARAMETER :: Ly=5.0d0
REAL(fp_kind), PARAMETER :: Es=1.0d0
REAL(fp_kind) :: dt=0.10d0**5
REAL(fp_kind) :: scalemodes
COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x,y
COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u
REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time
INTEGER(kind=4) :: i,j,k,n,modes,AllocateStatus,plan, kersize
INTEGER(kind=4) :: start, finish, count_rate
CHARACTER*100 :: name_config
! Declare variables for GPU
COMPLEX(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: kx_d,ky_d
REAL(fp_kind), DEVICE, DIMENSION(:), ALLOCATABLE :: x_d,y_d
COMPLEX(fp_kind), DEVICE, DIMENSION(:,:), ALLOCATABLE :: u_d,v_d,temp1_d,temp2_d
REAL(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: time_d
! start timing
PRINT *,'Program starting'
PRINT *,'Grid: ',Nx,'X',Ny
! allocate arrays on the host
ALLOCATE(x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),time(1:Nt+1),&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated CPU arrays'
! allocate arrays on the device
ALLOCATE(kx_d(1:Nx),ky_d(1:Nx),x_d(1:Nx),y_d(1:Nx),&
u_d(1:Nx,1:Ny),v_d(1:Nx,1:Ny),time_d(1:Nt+1),&
temp1_d(1:Nx,1:Ny),temp2_d(1:Nx,1:Ny),stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'Allocated GPU arrays'
kersize=min(Nx,256)
! set up ffts
CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z)
PRINT *,'Setup FFTs'
! setup fourier frequencies
!$cuf kernel do <<< *, * >>>
DO i=1,1+Nx/2
kx_d(i)= cmplx(0.0d0,1.0d0)*(i-1.0d0)/Lx
END DO
kx_d(1+Nx/2)=0.0d0
!$cuf kernel do <<< *, * >>>
DO i = 1,Nx/2 -1
kx_d(i+1+Nx/2)=-kx_d(1-i+Nx/2)
END DO
!$cuf kernel do <<< *, * >>>
DO i=1,Nx
x_d(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind=fp_kind))*pi*Lx
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,1+Ny/2
ky_d(j)= cmplx(0.0d0,1.0d0)*(j-1.0d0)/Ly
END DO
ky_d(1+Ny/2)=0.0d0
!$cuf kernel do <<< *, * >>>
DO j = 1,Ny/2 -1
ky_d(j+1+Ny/2)=-ky_d(1-j+Ny/2)
END DO
!$cuf kernel do <<< *, * >>>
DO j=1,Ny
y_d(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind=fp_kind))*pi*Ly
END DO
scalemodes=1.0d0/REAL(Nx*Ny,kind=fp_kind)
PRINT *,'Setup grid and fourier frequencies'
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(-1.0d0*(x_d(i)**2+y_d(j)**2))
END DO
END DO
! transform initial data
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
PRINT *,'Got initial data'
! get initial mass
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InMass=temp2_d(1,1)
! Get initial energy
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
InEner=temp2_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
InEner=InEner+temp1_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
InEner=InEner+temp1_d(1,1)
! start timing
CALL system_clock(start,count_rate)
! Do first half time step
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*( kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j) )&
*cmplx(0.0d0,0.50d0) )*v_d(i,j)
END DO
END DO
PRINT *,'Starting timestepping'
time(1)=0.0d0
DO n=1,Nt
time_d(n+1)=n*dt
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,1.0d0))*v_d(i,j)
END DO
END DO
END DO
! transform back final data and do another half time step
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=Es*u_d(i,j)*conjg(u_d(i,j))*scalemodes**2
END DO
END DO
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v_d(i,j))&
*u_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,u_d,v_d,CUFFT_FORWARD)
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
v_d(i,j)=exp(dt*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))&
*cmplx(0.0d0,0.50d0))*v_d(i,j)
END DO
END DO
CALL cufftExecZ2Z(plan,v_d,u_d,CUFFT_INVERSE)
! normalize
!$cuf kernel do(2) <<< (1,*),(kersize,1) >>>
DO j=1,Ny
DO i=1,Nx
u_d(i,j)=u_d(i,j)*scalemodes
END DO
END DO
CALL system_clock(finish,count_rate)
PRINT*,'Program took ',&
REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),'s for execution'
PRINT *,'Finished time stepping'
! calculate final mass
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=abs(u_d(i,j))**2
END DO
END DO
! Use FFT to get initial mass
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiMass=temp2_d(1,1)
PRINT*,'Initial mass',InMass
PRINT*,'Final mass',FiMass
PRINT*,'Final Mass/Initial Mass', &
ABS(REAL(FiMass,kind=fp_kind)/REAL(InMass,kind=fp_kind))
! Get final energy
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp1_d(i,j)=-ES*0.25d0*abs(u_d(i,j))**4
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp1_d,temp2_d,CUFFT_FORWARD)
FiEner=temp2_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=kx_d(i)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
FiEner=FiEner+temp1_d(1,1)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=ky_d(j)*v_d(i,j)*scalemodes
END DO
END DO
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_INVERSE)
!$cuf kernel do <<< *,* >>>
DO j=1,Ny
DO i=1,Nx
temp2_d(i,j)=0.5d0*abs(temp1_d(i,j))**2
END DO
END DO
! Use FFT to find mean
CALL cufftExecZ2Z(plan,temp2_d,temp1_d,CUFFT_FORWARD)
FiEner=FiEner+temp1_d(1,1)
PRINT*,'Initial energy',InEner
PRINT*,'Final energy',FiEner
PRINT*,'Final Energy/Initial Energy', &
ABS(REAL(FiEner,kind=fp_kind)/REAL(InEner,kind=fp_kind))
! Copy results back to host
u=u_d
time=time_d
x=x_d
y=y_d
name_config = 'ufinal.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
DO i=1,Nx
WRITE(11,*) abs(u(i,j))**2
END DO
END DO
CLOSE(11)
name_config = 'tdata.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,1+Nt/plotgap
WRITE(11,*) time(j)
END DO
CLOSE(11)
name_config = 'xcoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO i=1,Nx
WRITE(11,*) x(i)
END DO
CLOSE(11)
name_config = 'ycoord.dat'
OPEN(unit=11,FILE=name_config,status="UNKNOWN")
REWIND(11)
DO j=1,Ny
WRITE(11,*) y(j)
END DO
CLOSE(11)
PRINT *,'Saved data'
! Destroy the plan
CALL cufftDestroy(plan)
DEALLOCATE(kx_d,ky_d,x_d,y_d,&
u_d,v_d,time_d,&
temp1_d,temp2_d,&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
DEALLOCATE(x,y,u,time,&
stat=AllocateStatus)
IF (allocatestatus .ne. 0) STOP
PRINT *,'deallocated memory'
PRINT *,'Program execution complete'
END PROGRAM main
|
( |
!-------------------------------------------------------------------- ! ! ! PURPOSE ! ! This program solves nonlinear Schrodinger equation in 2 dimensions ! i*u_t+Es*|u|^2u+u_{xx}+u_{yy}=0 ! using a second order time spectral splitting scheme ! ! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y), ! u(x,y=0)=u(x,y=2*Ly*\pi) ! The initial condition is u=exp(-x^2-y^2) ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! Nt = number of timesteps to take ! Tmax = maximum simulation time ! plotgap = number of timesteps between plots ! FFTW_IN_PLACE = value for FFTW input ! FFTW_MEASURE = value for FFTW input ! FFTW_EXHAUSTIVE = value for FFTW input ! FFTW_PATIENT = value for FFTW input ! FFTW_ESTIMATE = value for FFTW input ! FFTW_FORWARD = value for FFTW input ! FFTW_BACKWARD = value for FFTW input ! pi = 3.14159265358979323846264338327950288419716939937510d0 ! Lx = width of box in x direction ! Ly = width of box in y direction ! ES = +1 for focusing and -1 for defocusing ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! numthreads = number of openmp threads ! ierr = error return code ! start = variable to record start time of program ! finish = variable to record end time of program ! count_rate = variable for clock count rate ! planfxy = Forward 2d fft plan ! planbxy = Backward 2d fft plan ! dt = timestep ! InMass = initial mass ! FiMass = final mass ! InEner = initial energy ! FiEner = final energy ! .. Arrays .. ! u = approximate solution ! v = Fourier transform of approximate solution ! temp1 = temporary field ! temp2 = temporary field ! .. Vectors .. ! kx = fourier frequencies in x direction ! ky = fourier frequencies in y direction ! x = x locations ! y = y locations ! time = times at which save data ! name_config = array to store filename for data to be saved ! ! REFERENCES ! ! This program is based on example code to demonstrate usage of Fortran and ! CUDA FFT routines taken from ! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html ! ! and ! ! http://cudamusing.blogspot.com/search?q=cublas ! ! ACKNOWLEDGEMENTS ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! Check that the initial iterate is consistent with the ! boundary conditions for the domain specified !-------------------------------------------------------------------- ! External routines required ! precision ! cufft ! ! External libraries required ! CuFFT -- Cuda FFT Library ! OpenACC ! ! Define the INTERFACE to the NVIDIA CUFFT routines ! module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecZ2Z(cufftHandle plan, ! cufftDoubleComplex *idata, ! cufftDoubleComplex *odata, ! int direction; ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecZ2Z subroutine cufftExecZ2Z(plan, idata, odata, direction) & & bind(C,name='cufftExecZ2Z') use iso_c_binding use precision integer(c_int),value:: direction integer(c_int),value:: plan complex(fp_kind),device,dimension(1:nx,1:ny):: idata,odata end subroutine cufftExecZ2Z end interface cufftExecZ2Z end module cufft PROGRAM main USE precision USE cufft USE openacc ! Declare variables IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=128 INTEGER(kind=4), PARAMETER :: Ny=128 INTEGER(kind=4), PARAMETER :: Nt=20 INTEGER(kind=4), PARAMETER :: plotgap=20 REAL(fp_kind), PARAMETER :: & pi=3.14159265358979323846264338327950288419716939937510d0 REAL(fp_kind), PARAMETER :: Lx=5.0d0 REAL(fp_kind), PARAMETER :: Ly=5.0d0 REAL(fp_kind), PARAMETER :: Es=1.0d0 REAL(fp_kind) :: dt=0.10d0**5 REAL(fp_kind) :: scalemodes COMPLEX(fp_kind) :: InMass,FiMass,InEner,FiEner COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE :: kx COMPLEX(fp_kind), DIMENSION(:), ALLOCATABLE :: ky REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: x REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: y COMPLEX(fp_kind), DIMENSION(:,:), ALLOCATABLE :: u,v,temp1,temp2 REAL(fp_kind), DIMENSION(:), ALLOCATABLE :: time INTEGER(kind=4) :: i,j,k,n,allocatestatus,ierr, vecsize,gangsize REAL(fp_kind) :: start_time,stop_time INTEGER(kind=4) :: plan CHARACTER*100 :: name_config vecsize=32 gangsize=16 PRINT *,'Program starting' PRINT *,'Grid: ',Nx,'X',Ny ALLOCATE(kx(1:Nx),ky(1:Nx),x(1:Nx),y(1:Nx),u(1:Nx,1:Ny),& v(1:Nx,1:Ny),temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),& time(1:1+Nt/plotgap),stat=allocatestatus) IF (allocatestatus .ne. 0) stop PRINT *,'allocated memory' !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) ! set up ffts CALL cufftPlan2D(plan,nx,ny,CUFFT_Z2Z) PRINT *,'Setup FFTs' ! setup fourier frequencies !$acc kernels loop DO i=1,1+Nx/2 kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx END DO !$acc end kernels kx(1+Nx/2)=0.0d0 !$acc kernels loop DO i = 1,Nx/2 -1 kx(i+1+Nx/2)=-kx(1-i+Nx/2) END DO !$acc end kernels !$acc kernels loop DO i=1,Nx x(i)=(-1.0d0+2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)) )*pi*Lx END DO !$acc end kernels !$acc kernels loop DO j=1,1+Ny/2 ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly END DO !$acc end kernels ky(1+Ny/2)=0.0d0 !$acc kernels loop DO j = 1,Ny/2 -1 ky(j+1+Ny/2)=-ky(1-j+Ny/2) END DO !$acc end kernels !$acc kernels loop DO j=1,Ny y(j)=(-1.0d0+2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)) )*pi*Ly END DO !$acc end kernels scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0)) PRINT *,'Setup grid and fourier frequencies' !$acc kernels loop DO j=1,Ny DO i=1,Nx u(i,j)=exp(-1.0d0*(x(i)**2 +y(j)**2)) END DO END DO !$acc end kernels ! transform initial data CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD) PRINT *,'Got initial data' ! get initial mass !$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=abs(u(i,j))**2 END DO END DO !$acc end kernels ! Use FFT to get initial mass CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD) !$acc end data InMass=temp2(1,1) ! Get initial energy !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4 END DO END DO !$acc end kernels ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD) !$acc end data InEner=temp2(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=kx(i)*v(i,j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD) !$acc end data InEner=InEner+temp1(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=ky(j)*v(i,j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD) !$acc end data InEner=InEner+temp1(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) CALL cpu_time(start_time) ! transform initial data and do first half time step !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))& *cmplx(0.0d0,1.0d0))*v(i,j) END DO END DO !$acc end kernels PRINT *,'Got initial data, starting timestepping' time(1)=0.0d0 DO n=1,Nt CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE) !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2 END DO END DO !$acc end kernels !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx u(i,j)=exp(cmplx(0.0d0,-1.0d0)*dt*v(i,j))& *u(i,j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD) !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=exp(dt*(kx(i)*kx(i) + ky(j)*ky(j))& *cmplx(0.0d0,1.0d0))*v(i,j) END DO END DO !$acc end kernels IF (mod(n,plotgap)==0) then time(1+n/plotgap)=n*dt PRINT *,'time',n*dt END IF END DO ! transform back final data and do another half time step CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE) !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=Es*u(i,j)*conjg(u(i,j))*scalemodes**2 END DO END DO !$acc end kernels !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx u(i,j)=exp(cmplx(0,-1)*dt*v(i,j))*u(i,j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,u,v,CUFFT_FORWARD) !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx v(i,j)=exp(0.5d0*dt*(kx(i)*kx(i) + ky(j)*ky(j))& *cmplx(0.0d0,1.0d0))*v(i,j) END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,v,u,CUFFT_INVERSE) !$acc kernels loop gang(gangsize), vector(vecsize) DO j=1,Ny DO i=1,Nx u(i,j)=u(i,j)*scalemodes END DO END DO !$acc end kernels PRINT *,'Finished time stepping' CALL cpu_time(stop_time) !$acc end data PRINT*,'Program took ',stop_time-start_time,& 'for Time stepping' !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) ! calculate final mass !$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=abs(u(i,j))**2 END DO END DO !$acc end kernels ! Use FFT to get initial mass CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD) !$acc end data FiMass=temp2(1,1) ! Get final energy !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp1(i,j)=-ES*0.25d0*abs(u(i,j))**4 END DO END DO !$acc end kernels ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp1,temp2,CUFFT_FORWARD) !$acc end data FiEner=temp2(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=kx(i)*v(i,j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD) !$acc end data FiEner=FiEner+temp1(1,1) !$acc data copy(InMass,FiMass,InEner,FiEner,kx,ky,x,y,u,v,temp1,temp2,time) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=ky(j)*v(i,j)*scalemodes END DO END DO !$acc end kernels CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_INVERSE) !$acc kernels loop DO j=1,Ny DO i=1,Nx temp2(i,j)=0.5d0*abs(temp1(i,j))**2 END DO END DO !$acc end kernels ! Use FFT to find mean CALL cufftExecZ2Z(plan,temp2,temp1,CUFFT_FORWARD) !$acc end data FiEner=FiEner+temp1(1,1) PRINT *,'Results copied back to host' PRINT*,'Initial mass',InMass PRINT*,'Final mass',FiMass PRINT*,'Final Mass/Initial Mass', & ABS(REAL(FiMass,kind(0d0))/REAL(InMass,kind(0d0))) PRINT*,'Initial energy',InEner PRINT*,'Final energy',FiEner PRINT*,'Final Energy/Initial Energy', & ABS(REAL(FiEner,kind(0d0))/REAL(InEner,kind(0d0))) name_config = 'ufinal.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny DO i=1,Nx WRITE(11,*) abs(u(i,j))**2 END DO END DO CLOSE(11) name_config = 'tdata.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,1+Nt/plotgap WRITE(11,*) time(j) END DO CLOSE(11) name_config = 'xcoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO i=1,Nx WRITE(11,*) x(i) END DO CLOSE(11) name_config = 'ycoord.dat' OPEN(unit=11,FILE=name_config,status="UNKNOWN") REWIND(11) DO j=1,Ny WRITE(11,*) y(j) END DO CLOSE(11) PRINT *,'Saved data' CALL cufftDestroy(plan) DEALLOCATE(u,v,temp1,temp2,time,kx,ky,x,y,stat=allocatestatus) IF (allocatestatus .ne. 0) STOP PRINT *,'Deallocated memory' PRINT *,'Program execution complete' END PROGRAM main
2D sine-Gordon Equations
These programs use a semi-explicit method that is similar to that used for the Klein-Gordon equation. Only the main program is included here, and the auxiliary subroutines can be downloaded from Cloutier, Muite and Rigge[4]
|
( |
!-------------------------------------------------------------------- ! ! ! PURPOSE ! ! This program solves nonlinear sine-Gordon equation in 2 dimensions ! u_{tt}-u_{xx}-u_{yy}=-sin(u) ! using a second order implicit-explicit time stepping scheme. ! ! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y), ! u(x,y=0)=u(x,y=2*Ly*\pi) ! The initial condition is set in initialdata.f90 ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! Nt = number of timesteps to take ! plotgap = number of timesteps between plots ! FFTW_IN_PLACE = value for FFTW input ! FFTW_MEASURE = value for FFTW input ! FFTW_EXHAUSTIVE = value for FFTW input ! FFTW_PATIENT = value for FFTW input ! FFTW_ESTIMATE = value for FFTW input ! FFTW_FORWARD = value for FFTW input ! FFTW_BACKWARD = value for FFTW input ! pi = 3.1415926535... ! Lx = width of box in x direction ! Ly = width of box in y direction ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! start = variable to record start time of program ! finish = variable to record end time of program ! count_rate = variable for clock count rate ! planfxy = Forward 2d fft plan (FFTW) ! planbxy = Backward 2d fft plan (FFTW) ! planf = Forward 2d fft plan (CUFFT) ! planb = Backward 2d fft plan (CUFFT) ! dt = timestep ! ierr = error code ! plotnum = number of plot ! .. Arrays .. ! u = approximate solution ! uold = approximate solution ! u_d = approximate solution (on GPU) ! v_d = Fourier transform of approximate solution (on GPU) ! uold_d = approximate solution (on GPU) ! vold_d = Fourier transform of approximate solution (on GPU) ! nonlinhat_d = Fourier transform of nonlinear term, sin(u) (on GPU) ! temp1 = extra space for energy computation ! temp2 = extra space for energy computation ! savearray = temp array to save out to disk ! .. Vectors .. ! kx = Fourier frequencies in x direction ! ky = Fourier frequencies in y direction ! kx_d = Fourier frequencies in x direction (on GPU) ! ky_d = Fourier frequencies in y direction (on GPU) ! x = x locations ! y = y locations ! time = times at which save data ! en = total energy ! enstr = strain energy ! enpot = potential energy ! enkin = kinetic energy ! name_config = array to store filename for data to be saved ! ! REFERENCES ! ! ACKNOWLEDGEMENTS ! ! This program is based on example code to demonstrate usage of Fortran and ! CUDA FFT routines taken from ! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html ! ! and ! ! http://cudamusing.blogspot.com/search?q=cublas ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! Check that the initial iterate is consistent with the ! boundary conditions for the domain specified !-------------------------------------------------------------------- ! External routines required ! getgrid.f90 -- Get initial grid of points ! initialdata.f90 -- Get initial data ! enercalc.f90 -- Subroutine to calculate the energy ! savedata.f90 -- Save initial data ! External libraries required ! Cuda FFT -- http://developer.nvidia.com/cufft ! FFTW3 -- Fastest Fourier Transform in the West ! (http://www.fftw.org/) module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision ! integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecD2Z(cufftHandle plan, ! cufftDoubleReal *idata, ! cufftDoubleComplex *odata) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecD2Z subroutine cufftExecD2Z(plan, idata, odata) & & bind(C,name='cufftExecD2Z') use iso_c_binding use precision integer(c_int), value :: plan real(fp_kind), device :: idata(1:nx,1:ny) complex(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecD2Z end interface cufftExecD2Z !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecD2Z(cufftHandle plan, ! cufftDoubleComplex *idata, ! cufftDoubleReal *odata) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecZ2D subroutine cufftExecZ2D(plan, idata, odata) & & bind(C,name='cufftExecZ2D') use iso_c_binding use precision integer(c_int),value :: plan complex(fp_kind),device:: idata(1:nx,1:ny) real(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecZ2D end interface cufftExecZ2D end module cufft PROGRAM sg2d USE precision USE cudafor USE cufft ! Declare variables IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=1024 INTEGER(kind=4), PARAMETER :: Ny=Nx INTEGER(kind=4), PARAMETER :: Nt=500 INTEGER(kind=4), PARAMETER :: plotgap=Nt+1 REAL(kind=8), PARAMETER :: & pi=3.14159265358979323846264338327950288419716939937510d0 REAL(kind=8), PARAMETER :: Lx=5.0d0 REAL(kind=8), PARAMETER :: Ly=5.0d0 REAL(kind=8) :: dt=0.001d0 COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2 REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en REAL(kind=8) :: scalemodes INTEGER(kind=4) :: ierr,i,j,n,allocatestatus INTEGER(kind=4) :: start, finish, count_rate, plotnum INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, & FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64 INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1 INTEGER(kind=8) :: planfxy,planbxy CHARACTER*100 :: name_config INTEGER(kind=4) :: planf,planb,kersize ! GPU variables COMPLEX(fp_kind),DEVICE,DIMENSION(:), ALLOCATABLE :: kx_d,ky_d COMPLEX(fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: vold_d,v_d,nonlinhat_d REAL (fp_kind),DEVICE,DIMENSION(:,:), ALLOCATABLE :: uold_d,u_d ! print run information PRINT *,"Nx=", Nx PRINT *,"Ny=", Ny PRINT *,"Nt=", Nt PRINT *,"Lx=", Lx PRINT *,"Ly=", Ly PRINT *,"dt=", dt kersize=min(Nx,256) ALLOCATE(kx(1:Nx),ky(1:Ny),kx_d(1:Nx),ky_d(1:Ny),x(1:Nx),y(1:Ny),& u(1:Nx,1:Ny),uold(1:Nx,1:Ny),u_d(1:Nx,1:Ny),uold_d(1:Nx,1:Ny),& v_d(1:Nx/2+1,1:Ny),vold_d(1:Nx/2+1,1:Ny),& savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),& enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),& nonlinhat_d(1:Nx/2+1,1:Ny),& temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),& stat=allocatestatus) IF (allocatestatus .ne. 0) stop PRINT *,'allocated arrays' scalemodes=1.0d0/REAL(Nx*Ny,kind(0d0)) ! set up cuda ffts call cufftPlan2D(planf,nx,ny,CUFFT_D2Z) call cufftPlan2D(planb,nx,ny,CUFFT_Z2D) ! set up fftw ffts CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE) CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE) PRINT *,'Setup FFTs' ! setup grid, wave numbers CALL getgrid(Nx,Ny,Lx,Ly,pi,name_config,x,y,kx,ky) kx_d=kx ky_d=ky PRINT *,'Got grid and fourier frequencies' CALL initialdata(Nx,Ny,x,y,u,uold) u_d=u uold_d=uold plotnum=1 name_config = 'data/u' savearray=REAL(u) ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking PRINT *,'data saved' CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),& enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold) call cufftExecD2Z(planf,u_d,v_d) call cufftExecD2Z(planf,uold_d,vold_d) PRINT *,'Got initial data, starting timestepping' time(plotnum)=0.0d0 CALL system_clock(start,count_rate) DO n=1,Nt !$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx uold_d(i,j)=u_d(i,j) END DO END DO !$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx u_d(i,j)=sin(u_d(i,j)) END DO END DO call cufftExecD2Z(planf,u_d,nonlinhat_d) !$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 nonlinhat_d(i,j)=scalemodes*( 0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))& *(2.0d0*v_d(i,j)+vold_d(i,j))& +(2.0d0*v_d(i,j)-vold_d(i,j))/(dt*dt)& -nonlinhat_d(i,j) )& /(1/(dt*dt)-0.25*(kx_d(i)*kx_d(i) + ky_d(j)*ky_d(j))) END DO END DO !$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 vold_d(i,j)=v_d(i,j) END DO END DO !$cuf kernel do(2) <<< (1,*), (kersize,1) >>> DO j=1,Ny DO i=1,Nx/2+1 v_d(i,j)=nonlinhat_d(i,j)/scalemodes END DO END DO call cufftExecZ2D(planb,nonlinhat_d,u_d) IF (mod(n,plotgap)==0) then plotnum=plotnum+1 time(plotnum)=n*dt PRINT *,'time',n*dt u=u_d uold=uold_d ! savearray=REAL(u,kind(0d0)) ! disabled for benchmarking ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),& enpot(plotnum),en(plotnum),kx,ky,temp1,temp2,u,uold) END IF END DO CALL system_clock(finish,count_rate) PRINT *,'Finished time stepping' u=u_d uold=uold_d ! compute energy at the end CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),& enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u,uold) PRINT*,'Program took ',& REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),& 'for Time stepping' CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),& enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1)) ! Save times at which output was made in text format PRINT *,'Saved data' call cufftDestroy(planf) call cufftDestroy(planb) PRINT *,'Destroy CUFFT Plans' call dfftw_destroy_plan_(planbxy) call dfftw_destroy_plan_(planfxy) PRINT *,'Destroy FFTW Plans' DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,& stat=allocatestatus) IF (allocatestatus .ne. 0) STOP PRINT *,'Deallocated host arrays' DEALLOCATE(uold_d,vold_d,u_d,v_d,nonlinhat_d,& kx_d,ky_d,& stat=allocatestatus) IF (allocatestatus .ne. 0) STOP PRINT *,'Deallocated gpu arrays' PRINT *,'Program execution complete' END PROGRAM sg2d
|
( |
!-------------------------------------------------------------------- ! ! ! PURPOSE ! ! This program solves nonlinear sine-Gordon equation in 2 dimensions ! u_{tt}-u_{xx}-u_{yy}=-sin(u) ! using a second order implicit-explicit time stepping scheme. ! ! The boundary conditions are u(x=0,y)=u(2*Lx*\pi,y), ! u(x,y=0)=u(x,y=2*Ly*\pi) ! The initial condition is set in initialdata.f90 ! ! AUTHORS ! ! B. Cloutier, B.K. Muite, P. Rigge ! 4 June 2012 ! ! .. Parameters .. ! Nx = number of modes in x - power of 2 for FFT ! Ny = number of modes in y - power of 2 for FFT ! Nt = number of timesteps to take ! plotgap = number of timesteps between plots ! FFTW_IN_PLACE = value for FFTW input ! FFTW_MEASURE = value for FFTW input ! FFTW_EXHAUSTIVE = value for FFTW input ! FFTW_PATIENT = value for FFTW input ! FFTW_ESTIMATE = value for FFTW input ! FFTW_FORWARD = value for FFTW input ! FFTW_BACKWARD = value for FFTW input ! pi = 3.1415926535... ! Lx = width of box in x direction ! Ly = width of box in y direction ! .. Scalars .. ! i = loop counter in x direction ! j = loop counter in y direction ! n = loop counter for timesteps direction ! allocatestatus = error indicator during allocation ! start = variable to record start time of program ! finish = variable to record end time of program ! count_rate = variable for clock count rate ! planfxy = Forward 2d fft plan (FFTW) ! planbxy = Backward 2d fft plan (FFTW) ! planf = Forward 2d fft plan (CUFFT) ! planb = Backward 2d fft plan (CUFFT) ! dt = timestep ! ierr = error code ! plotnum = number of plot ! .. Arrays .. ! u = approximate solution ! uold = approximate solution ! v = Fourier transform of approximate solution ! vold = Fourier transform of approximate solution ! nonlinhat = Fourier transform of nonlinear term, sin(u) ! temp1 = extra space for energy computation ! temp2 = extra space for energy computation ! savearray = temp array to save out to disk ! .. Vectors .. ! kx = fourier frequencies in x direction ! ky = fourier frequencies in y direction ! x = x locations ! y = y locations ! time = times at which save data ! en = total energy ! enstr = strain energy ! enpot = potential energy ! enkin = kinetic energy ! name_config = array to store filename for data to be saved ! ! REFERENCES ! ! ACKNOWLEDGEMENTS ! ! This program is based on example code to demonstrate usage of Fortran and ! CUDA FFT routines taken from ! http://cudamusing.blogspot.com/2010/05/CALLing-cufft-from-cuda-fortran.html ! ! and ! ! http://cudamusing.blogspot.com/search?q=cublas ! ! ACCURACY ! ! ERROR INDICATORS AND WARNINGS ! ! FURTHER COMMENTS ! Check that the initial iterate is consistent with the ! boundary conditions for the domain specified !-------------------------------------------------------------------- ! External routines required ! getgrid.f90 -- Get initial grid of points ! initialdata.f90 -- Get initial data ! enercalc.f90 -- Subroutine to calculate the energy ! savedata.f90 -- Save initial data ! External libraries required ! Cuda FFT ! OpenACC ! FFTW3 -- Fastest Fourier Transform in the West ! (http://www.fftw.org/) ! OpenMP module precision ! Precision control integer, parameter, public :: Single = kind(0.0) ! Single precision integer, parameter, public :: Double = kind(0.0d0) ! Double precision ! integer, parameter, public :: fp_kind = Double !integer, parameter, public :: fp_kind = Single end module precision module cufft integer, public :: CUFFT_FORWARD = -1 integer, public :: CUFFT_INVERSE = 1 integer, public :: CUFFT_R2C = Z'2a' ! Real to Complex (interleaved) integer, public :: CUFFT_C2R = Z'2c' ! Complex (interleaved) to Real integer, public :: CUFFT_C2C = Z'29' ! Complex to Complex, interleaved integer, public :: CUFFT_D2Z = Z'6a' ! Double to Double-Complex integer, public :: CUFFT_Z2D = Z'6c' ! Double-Complex to Double integer, public :: CUFFT_Z2Z = Z'69' ! Double-Complex to Double-Complex !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftPlan2d(cufftHandle *plan, int nx,int ny, cufftType type,int batch) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftPlan2d subroutine cufftPlan2d(plan, nx, ny, type) bind(C,name='cufftPlan2d') use iso_c_binding integer(c_int):: plan integer(c_int),value:: nx, ny, type end subroutine cufftPlan2d end interface cufftPlan2d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftDestroy(cufftHandle plan) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftDestroy subroutine cufftDestroy(plan) bind(C,name='cufftDestroy') use iso_c_binding integer(c_int),value:: plan end subroutine cufftDestroy end interface cufftDestroy !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecD2Z(cufftHandle plan, ! cufftDoubleReal *idata, ! cufftDoubleComplex *odata) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecD2Z subroutine cufftExecD2Z(plan, idata, odata) & & bind(C,name='cufftExecD2Z') use iso_c_binding use precision integer(c_int), value :: plan real(fp_kind), device :: idata(1:nx,1:ny) complex(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecD2Z end interface cufftExecD2Z !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! ! cufftExecD2Z(cufftHandle plan, ! cufftDoubleComplex *idata, ! cufftDoubleReal *odata) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! interface cufftExecZ2D subroutine cufftExecZ2D(plan, idata, odata) & & bind(C,name='cufftExecZ2D') use iso_c_binding use precision integer(c_int),value :: plan complex(fp_kind),device:: idata(1:nx,1:ny) real(fp_kind),device :: odata(1:nx,1:ny) end subroutine cufftExecZ2D end interface cufftExecZ2D end module cufft PROGRAM sg2d USE precision USE cufft USE openacc ! Declare variables IMPLICIT NONE INTEGER(kind=4), PARAMETER :: Nx=1024 INTEGER(kind=4), PARAMETER :: Ny=Nx INTEGER(kind=4), PARAMETER :: Nt=500 INTEGER(kind=4), PARAMETER :: plotgap=Nt+1 REAL(kind=8), PARAMETER :: & pi=3.14159265358979323846264338327950288419716939937510d0 REAL(kind=8), PARAMETER :: Lx=5.0d0 REAL(kind=8), PARAMETER :: Ly=5.0d0 REAL(kind=8) :: dt=0.001d0 COMPLEX(kind=8), DIMENSION(:), ALLOCATABLE :: kx,ky REAL(kind=8), DIMENSION(:), ALLOCATABLE :: x,y REAL (kind=8), DIMENSION(:,:), ALLOCATABLE :: u,uold COMPLEX(kind=8), DIMENSION(:,:), ALLOCATABLE :: temp1,temp2,v,vold,nonlinhat REAL(kind=8), DIMENSION(:,:), ALLOCATABLE :: savearray REAL(kind=8), DIMENSION(:), ALLOCATABLE :: time,enkin,enstr,enpot,en INTEGER(kind=4) :: ierr,i,j,n,allocatestatus INTEGER(kind=4) :: start, finish, count_rate, plotnum INTEGER(kind=4), PARAMETER :: FFTW_IN_PLACE = 8, FFTW_MEASURE = 0, & FFTW_EXHAUSTIVE = 8, FFTW_PATIENT = 32, FFTW_ESTIMATE = 64 INTEGER(kind=4),PARAMETER :: FFTW_FORWARD = -1, FFTW_BACKWARD=1 INTEGER(kind=8) :: planfxy,planbxy CHARACTER*100 :: name_config INTEGER(kind=4) :: planf,planb ! print run information PRINT *,"Nx=", Nx PRINT *,"Ny=", Ny PRINT *,"Nt=", Nt PRINT *,"Lx=", Lx PRINT *,"Ly=", Ly PRINT *,"dt=", dt ALLOCATE(kx(1:Nx),ky(1:Ny),x(1:Nx),y(1:Ny),u(1:Nx,1:Ny),uold(1:Nx,1:Ny),& v(1:Nx/2+1,1:Ny),vold(1:Nx/2+1,1:Ny),nonlinhat(1:Nx/2+1,1:Ny),& savearray(1:Nx,1:Ny),time(1:1+Nt/plotgap),enkin(1:1+Nt/plotgap+1),& enstr(1:1+Nt/plotgap+1),enpot(1:1+Nt/plotgap+1),en(1:1+Nt/plotgap),& temp1(1:Nx,1:Ny),temp2(1:Nx,1:Ny),& stat=allocatestatus) IF (allocatestatus .ne. 0) stop PRINT *,'allocated arrays' ! set up cuda ffts call cufftPlan2D(planf,nx,ny,CUFFT_D2Z) call cufftPlan2D(planb,nx,ny,CUFFT_Z2D) ! set up fftw ffts CALL dfftw_plan_dft_2d_(planfxy,Nx,Ny,u,temp2,FFTW_FORWARD,FFTW_ESTIMATE) CALL dfftw_plan_dft_2d_(planbxy,Nx,Ny,temp2,u,FFTW_BACKWARD,FFTW_ESTIMATE) PRINT *,'Setup FFTs' ! setup grid, wave numbers !$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u) !$acc kernels loop DO i=1,1+Nx/2 kx(i)= cmplx(0.0d0,1.0d0)*REAL(i-1,kind(0d0))/Lx END DO !$acc end kernels kx(1+Nx/2)=0.0d0 !$acc kernels loop DO i = 1,Nx/2 -1 kx(i+1+Nx/2)=-kx(1-i+Nx/2) END DO !$acc end kernels !$acc kernels loop DO i=1,Nx x(i)=(-1.0d0 + 2.0d0*REAL(i-1,kind(0d0))/REAL(Nx,kind(0d0)))*pi*Lx END DO !$acc end kernels !$acc kernels loop DO j=1,1+Ny/2 ky(j)= cmplx(0.0d0,1.0d0)*REAL(j-1,kind(0d0))/Ly END DO !$acc end kernels ky(1+Ny/2)=0.0d0 !$acc kernels loop DO j = 1,Ny/2 -1 ky(j+1+Ny/2)=-ky(1-j+Ny/2) END DO !$acc end kernels !$acc kernels loop DO j=1,Ny y(j)=(-1.0d0 + 2.0d0*REAL(j-1,kind(0d0))/REAL(Ny,kind(0d0)))*pi*Ly END DO !$acc end kernels PRINT *,'Got grid and fourier frequencies' !$acc kernels loop DO j=1,Ny DO i=1,Nx u(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2)) END DO END DO !$acc end kernels !$acc kernels loop DO j=1,Ny DO i=1,Nx uold(i,j)=0.5d0*exp(-1.0d0*(x(i)**2 +y(j)**2)) END DO END DO !$acc end kernels savearray=REAL(u) plotnum=1 name_config = 'data/u' ! CALL savedata(Nx,Ny,plotnum,name_config,savearray) ! disabled for benchmarking PRINT *,'data saved' !$acc end data CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum),enstr(plotnum),& enpot(plotnum),en(plotnum),kx(1:Nx),ky(1:Ny),temp1,temp2,& u(1:Nx,1:Ny),uold(1:Nx,1:Ny)) !$acc data copy(x, y, kx, ky, vold, v, nonlinhat, uold, u) call cufftExecD2Z(planf,u,v) call cufftExecD2Z(planf,uold,vold) PRINT *,'Got initial data, starting timestepping' time(plotnum)=0.0d0 CALL system_clock(start,count_rate) DO n=1,Nt !$acc kernels loop DO j=1,Ny DO i=1,Nx uold(i,j)=u(i,j) u(i,j)=sin(u(i,j)) END DO END DO !$acc end kernels call cufftExecD2Z(planf,u,nonlinhat) !$acc kernels loop DO j=1,Ny DO i=1,Nx/2+1 nonlinhat(i,j)=( 0.25*(kx(i)*kx(i) + ky(j)*ky(j))& *(2.0d0*v(i,j)+vold(i,j))+(2.0d0*v(i,j)-vold(i,j))/(dt*dt)& -nonlinhat(i,j) )/(1/(dt*dt)-0.25*(kx(i)*kx(i) + ky(j)*ky(j))) vold(i,j)=v(i,j) v(i,j)=nonlinhat(i,j) ! prescale nonlinhat nonlinhat(i,j)=nonlinhat(i,j)/REAL(Nx*Ny,kind(0d0)) END DO END DO !$acc end kernels call cufftExecZ2D(planb,nonlinhat,u) END DO CALL system_clock(finish,count_rate) !$acc end data PRINT *,'Finished time stepping' ! compute energy at the end ! savearray=REAL(u(1:Nx,1:Ny),kind(0d0)) ! disabled for benchmarking ! CALL savedata(Nx,Ny,plotnum+1,name_config,savearray) CALL enercalc(Nx,Ny,planfxy,planbxy,dt,enkin(plotnum+1),enstr(plotnum+1),& enpot(plotnum+1),en(plotnum+1),kx,ky,temp1,temp2,u(1:Nx,1:Ny),uold(1:Nx,1:Ny)) PRINT*,'Program took ',& REAL(finish-start,kind(0d0))/REAL(count_rate,kind(0d0)),& 'for Time stepping' CALL saveresults(Nt,plotgap,time(1:1+n/plotgap),en(1:1+n/plotgap+1),& enstr(1:1+n/plotgap+1),enkin(1:1+n/plotgap+1),enpot(1:1+n/plotgap+1)) ! Save times at which output was made in text format PRINT *,'Saved data' call cufftDestroy(planf) call cufftDestroy(planb) PRINT *,'Destroy CUFFT Plans' call dfftw_destroy_plan_(planbxy) call dfftw_destroy_plan_(planfxy) PRINT *,'Destroy FFTW Plans' DEALLOCATE(kx,ky,x,y,u,uold,time,enkin,enstr,enpot,en,savearray,temp1,temp2,& stat=allocatestatus) IF (allocatestatus .ne. 0) STOP PRINT *,'Deallocated host arrays' PRINT *,'Program execution complete' END PROGRAM sg2d
References
B., Cloutier; B.K., Muite; P., Rigge. "Performance of FORTRAN and C GPU Extensions for a Benchmark Suite of Fourier Pseudospectral Algorithms". Proceedings of the Symposium on Application Accelerators in High Performance computing. IEEE. http://arxiv.org/abs/1206.3215.