In the following run-able code example, the kernel is taken from an actual project, where this kernel is executed in high frequency and accounts for about 15% of the total runtime. The usage of the shared memory has reduced 5% of its runtime. I am looking for suggestions to further optimize it. Any comments are most appreciated!
MODULE type
! Real numbers are simple or double precision
INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6, 37) ! REAL32
INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307) ! REAL64
INTEGER, PARAMETER :: RP = DP
!
END MODULE
!
!
!
MODULE test_kernel
!
USE cudafor
USE type
!
REAL(RP), ALLOCATABLE, DIMENSION(:,:), DEVICE :: phizi, phizix, phiziy, phiziz, phizit
REAL(RP), ALLOCATABLE, DIMENSION(:,:,:), DEVICE :: etapmext1, etapmext2
REAL(RP), ALLOCATABLE, DIMENSION(:,:), DEVICE :: phim_ext, phimx_ext, phimy_ext, phimz_ext, phit_ext
REAL(RP), ALLOCATABLE, DIMENSION(:,:), DEVICE :: phim2_ext, phimx2_ext, phimy2_ext, phimz2_ext, phit2_ext
REAL(RP), ALLOCATABLE, DIMENSION(:,:) :: phit_ext_h
!
INTEGER, DEVICE :: tile_size_x, tile_size_y
!
CONTAINS
!-- kernel using shared mem
attributes(global) subroutine fillphim1(m, n, jj)
!
IMPLICIT NONE
!
INTEGER, VALUE :: m, n, jj
!
! local variables
!
INTEGER, DEVICE :: i1, i2, j1, j2
REAL(RP), DIMENSION(tile_size_x,tile_size_y), SHARED :: phizi_s, phizix_s, phiziy_s, phiziz_s, phizit_s
REAL(RP), DIMENSION(tile_size_x,tile_size_y), SHARED :: etapmext1_s, etapmext2_s
i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
i2 = (blockIdx%y-1)*blockDim%y + threadIdx%y
if (i1>m .or. i2>n) return
j1 = threadIdx%x
j2 = threadIdx%y
phizi_s( j1, j2) = phizi(i1,i2)
phizix_s( j1, j2) = phizix(i1,i2)
phiziy_s( j1, j2) = phiziy(i1,i2)
phiziz_s( j1, j2) = phiziz(i1,i2)
phizit_s( j1, j2) = phizit(i1,i2)
etapmext1_s(j1, j2) = etapmext1(i1,i2,jj)
etapmext2_s(j1, j2) = etapmext2(i1,i2,jj)
!CALL syncthreads()
phim_ext(i1,i2) = phim_ext(i1,i2) - phizi_s(j1,j2) * etapmext1_s(j1,j2)
phimx_ext(i1,i2) = phimx_ext(i1,i2) - phizix_s(j1,j2) * etapmext1_s(j1,j2)
phimy_ext(i1,i2) = phimy_ext(i1,i2) - phiziy_s(j1,j2) * etapmext1_s(j1,j2)
phimz_ext(i1,i2) = phimz_ext(i1,i2) - phiziz_s(j1,j2) * etapmext1_s(j1,j2)
phit_ext(i1,i2) = phit_ext(i1,i2) - phizit_s(j1,j2) * etapmext1_s(j1,j2)
!
phim2_ext(i1,i2) = phim2_ext(i1,i2) + phizi_s(j1,j2) * etapmext2_s(j1,j2)
phimx2_ext(i1,i2) = phimx2_ext(i1,i2) + phizix_s(j1,j2) * etapmext2_s(j1,j2)
phimy2_ext(i1,i2) = phimy2_ext(i1,i2) + phiziy_s(j1,j2) * etapmext2_s(j1,j2)
phimz2_ext(i1,i2) = phimz2_ext(i1,i2) + phiziz_s(j1,j2) * etapmext2_s(j1,j2)
phit2_ext(i1,i2) = phit2_ext(i1,i2) + phizit_s(j1,j2) * etapmext2_s(j1,j2)
!
end subroutine
!
END MODULE
!
!
!
PROGRAM test
!
USE test_kernel
USE cudafor
USE type
!
IMPLICIT NONE
!
! local
!
INTEGER :: istat, i1, i2, i3, j, k, M_HOSvel, isize1
INTEGER :: tile_size_x_h, tile_size_y_h
INTEGER :: md1, md2
REAL :: timeElapsed
TYPE(dim3) :: gridV, tBlockV
TYPE(cudaEvent) :: startEvent, stopEvent
REAL(RP), parameter :: tiny = 0.000000001_RP
!
M_HOSvel = 5
md1 = 1024
md2 = 1024
j = 3
!
tile_size_x_h = 16
tile_size_y_h = 16
tile_size_x = tile_size_x_h
tile_size_y = tile_size_y_h
!
isize1 = tile_size_x_h*tile_size_y_h *7*sizeof(0.0_rp)
!
tBlockV = dim3(tile_size_x_h,tile_size_y_h,1)
gridV = dim3(ceiling(real(md1)/tBlockV%x), &
ceiling(real(md2)/tBlockV%y), 1)
!
istat = cudaSetDevice(0)
!
istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)
!
ALLOCATE(etapmext1(md1,md2,M_HOSvel+1), etapmext2(md1,md2,M_HOSvel+1), source=0.1_RP)
ALLOCATE(phizi(md1,md2), phizix(md1,md2), phiziy(md1,md2), phiziz(md1,md2), phizit(md1,md2), source=0.2_RP)
ALLOCATE(phim_ext(md1,md2), phimx_ext(md1,md2), phimy_ext(md1,md2), phimz_ext(md1,md2), phit_ext(md1,md2), source=0.0_RP)
ALLOCATE(phim2_ext(md1,md2), phimx2_ext(md1,md2), phimy2_ext(md1,md2), phimz2_ext(md1,md2), phit2_ext(md1,md2), source=0.0_RP)
!
i3 = 10
!
print*, 'warming up ...'
do i2 = 1, i3
CALL fillphim1<<<gridV,tBlockV,isize1>>>(md1, md2, j+1)
enddo
print*, "----------------------------------------------------------------
"
!
istat = cudaDeviceSynchronize()
!
!-- re-run the tests =======================================================
!
istat = cudaEventRecord(startEvent,0)
print*, 'Starting kernel1 ...'
i3 = 500
do i2 = 1, i3
CALL fillphim1<<<gridV,tBlockV,isize1>>>(md1, md2, j+1)
enddo
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(timeElapsed, startEvent, stopEvent)
WRITE(*,'(A,F24.12,A)') ' Kernel1 finished in: ', timeElapsed/1000.0_rp, '(sec)'
!
istat = cudaDeviceSynchronize()
!
!-- test if the results are correct
!
phit_ext_h = phit_ext
do i2 = 1, md2
do i1 = 1, md1
if (ABS(phit_ext_h(i1,i2) + 10.2_RP) > tiny) then
print*, 'kernel mistake !'
stop 1
endif
enddo
enddo
!
istat = cudaDeviceSynchronize()
!
istat = cudaGetLastError()
IF (istat /= cudaSuccess) THEN
write(*,*) 'Async kernel error:', cudaGetErrorString(istat)
write(*,*) 'Something bad happened!'
STOP 1
ENDIF
!
istat = cudaDeviceReset()
!
END PROGRAM
question from:
https://stackoverflow.com/questions/65912277/how-to-further-optimize-this-cuda-kernel-function