optimization - how to further optimize this CUDA kernel function

Question

Welcome To Ask or Share your Answers For Others

optimization - how to further optimize this CUDA kernel function

posted Oct 7, 2021 in Technique[技术] by 深蓝 (71.8m points)

optimization - how to further optimize this CUDA kernel function

In the following run-able code example, the kernel is taken from an actual project, where this kernel is executed in high frequency and accounts for about 15% of the total runtime. The usage of the shared memory has reduced 5% of its runtime. I am looking for suggestions to further optimize it. Any comments are most appreciated!

MODULE type

! Real numbers are simple or double precision
INTEGER, PARAMETER :: SP = SELECTED_REAL_KIND(6,   37)     ! REAL32
INTEGER, PARAMETER :: DP = SELECTED_REAL_KIND(15, 307)     ! REAL64
INTEGER, PARAMETER :: RP = DP
!
END MODULE
!
!
!
MODULE test_kernel
!
USE cudafor
USE type
!
REAL(RP), ALLOCATABLE, DIMENSION(:,:),   DEVICE    :: phizi, phizix, phiziy, phiziz, phizit
REAL(RP), ALLOCATABLE, DIMENSION(:,:,:), DEVICE    :: etapmext1, etapmext2
REAL(RP), ALLOCATABLE, DIMENSION(:,:),   DEVICE    :: phim_ext, phimx_ext, phimy_ext, phimz_ext, phit_ext
REAL(RP), ALLOCATABLE, DIMENSION(:,:),   DEVICE    :: phim2_ext, phimx2_ext, phimy2_ext, phimz2_ext, phit2_ext

REAL(RP), ALLOCATABLE, DIMENSION(:,:)              :: phit_ext_h
!
INTEGER, DEVICE    :: tile_size_x, tile_size_y
!
CONTAINS

!-- kernel using shared mem
attributes(global) subroutine fillphim1(m, n, jj)
!
IMPLICIT NONE
!
INTEGER, VALUE :: m, n, jj
!
! local variables
!
INTEGER, DEVICE                                         :: i1, i2, j1, j2
REAL(RP), DIMENSION(tile_size_x,tile_size_y), SHARED    :: phizi_s, phizix_s, phiziy_s, phiziz_s, phizit_s
REAL(RP), DIMENSION(tile_size_x,tile_size_y), SHARED    :: etapmext1_s, etapmext2_s

i1 = (blockIdx%x-1)*blockDim%x + threadIdx%x
i2 = (blockIdx%y-1)*blockDim%y + threadIdx%y

if (i1>m .or. i2>n) return
j1 = threadIdx%x
j2 = threadIdx%y

phizi_s(    j1, j2) = phizi(i1,i2)
phizix_s(   j1, j2) = phizix(i1,i2)
phiziy_s(   j1, j2) = phiziy(i1,i2)
phiziz_s(   j1, j2) = phiziz(i1,i2)
phizit_s(   j1, j2) = phizit(i1,i2)
etapmext1_s(j1, j2) = etapmext1(i1,i2,jj)
etapmext2_s(j1, j2) = etapmext2(i1,i2,jj)

!CALL syncthreads()

phim_ext(i1,i2)  = phim_ext(i1,i2)  - phizi_s(j1,j2)  * etapmext1_s(j1,j2)
phimx_ext(i1,i2) = phimx_ext(i1,i2) - phizix_s(j1,j2) * etapmext1_s(j1,j2)
phimy_ext(i1,i2) = phimy_ext(i1,i2) - phiziy_s(j1,j2) * etapmext1_s(j1,j2)
phimz_ext(i1,i2) = phimz_ext(i1,i2) - phiziz_s(j1,j2) * etapmext1_s(j1,j2)
phit_ext(i1,i2)  = phit_ext(i1,i2)  - phizit_s(j1,j2) * etapmext1_s(j1,j2)
!
phim2_ext(i1,i2) = phim2_ext(i1,i2)   + phizi_s(j1,j2)  * etapmext2_s(j1,j2)
phimx2_ext(i1,i2) = phimx2_ext(i1,i2) + phizix_s(j1,j2) * etapmext2_s(j1,j2)
phimy2_ext(i1,i2) = phimy2_ext(i1,i2) + phiziy_s(j1,j2) * etapmext2_s(j1,j2)
phimz2_ext(i1,i2) = phimz2_ext(i1,i2) + phiziz_s(j1,j2) * etapmext2_s(j1,j2)
phit2_ext(i1,i2) = phit2_ext(i1,i2)   + phizit_s(j1,j2) * etapmext2_s(j1,j2)
!
end subroutine
!
END MODULE
!
!
!
PROGRAM test
!
USE test_kernel
USE cudafor
USE type
!
IMPLICIT NONE
!
! local
!
INTEGER             :: istat, i1, i2, i3, j, k, M_HOSvel, isize1
INTEGER             :: tile_size_x_h, tile_size_y_h
INTEGER             :: md1, md2
REAL                :: timeElapsed
TYPE(dim3)          :: gridV,  tBlockV
TYPE(cudaEvent)     :: startEvent, stopEvent
REAL(RP), parameter :: tiny = 0.000000001_RP
!
M_HOSvel = 5
md1      = 1024
md2      = 1024
j        = 3
!
tile_size_x_h = 16
tile_size_y_h = 16
tile_size_x   = tile_size_x_h
tile_size_y   = tile_size_y_h
!
isize1        = tile_size_x_h*tile_size_y_h *7*sizeof(0.0_rp)
!
tBlockV = dim3(tile_size_x_h,tile_size_y_h,1)
gridV   = dim3(ceiling(real(md1)/tBlockV%x), &
           ceiling(real(md2)/tBlockV%y), 1)
!
istat = cudaSetDevice(0)
!
istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)
!
ALLOCATE(etapmext1(md1,md2,M_HOSvel+1), etapmext2(md1,md2,M_HOSvel+1), source=0.1_RP)
ALLOCATE(phizi(md1,md2), phizix(md1,md2), phiziy(md1,md2), phiziz(md1,md2), phizit(md1,md2), source=0.2_RP)

ALLOCATE(phim_ext(md1,md2), phimx_ext(md1,md2), phimy_ext(md1,md2), phimz_ext(md1,md2), phit_ext(md1,md2), source=0.0_RP)
ALLOCATE(phim2_ext(md1,md2), phimx2_ext(md1,md2), phimy2_ext(md1,md2), phimz2_ext(md1,md2), phit2_ext(md1,md2), source=0.0_RP)
!
i3 = 10
!
print*, 'warming up ...'
do i2 = 1, i3
    CALL fillphim1<<<gridV,tBlockV,isize1>>>(md1, md2, j+1)
enddo
print*, "----------------------------------------------------------------
"
!
istat = cudaDeviceSynchronize()
!
!-- re-run the tests =======================================================
!
istat = cudaEventRecord(startEvent,0)
print*, 'Starting kernel1 ...'
i3 = 500
do i2 = 1, i3
    CALL fillphim1<<<gridV,tBlockV,isize1>>>(md1, md2, j+1)
enddo
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(timeElapsed, startEvent, stopEvent)
WRITE(*,'(A,F24.12,A)') '   Kernel1 finished in: ', timeElapsed/1000.0_rp, '(sec)'
!
istat = cudaDeviceSynchronize()
!
!-- test if the results are correct
!
phit_ext_h = phit_ext
do i2 = 1, md2
    do i1 = 1, md1
        if (ABS(phit_ext_h(i1,i2) + 10.2_RP) > tiny) then
            print*, 'kernel mistake !'
            stop 1
        endif
    enddo
enddo
!
istat = cudaDeviceSynchronize()
!
istat = cudaGetLastError()
IF (istat /= cudaSuccess) THEN
    write(*,*) 'Async kernel error:', cudaGetErrorString(istat)
    write(*,*) 'Something bad happened!'
    STOP 1
ENDIF
!
istat = cudaDeviceReset()
!
END PROGRAM

question from:https://stackoverflow.com/questions/65912277/how-to-further-optimize-this-cuda-kernel-function

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

Categories

optimization - how to further optimize this CUDA kernel function

optimization - how to further optimize this CUDA kernel function

Please log in or register to add a comment.

Please log in or register to reply this article.

1 Reply

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags