我使用的顯卡型號是 Tesla C2050，使用的編譯器是 PGI Fortran 11.8 版。
以下是我寫的CODE
未使用 CUDA 的版本：
!> Host-side helper module: element-wise integer increment.
module simpleOps_m
  implicit none
  private
  public :: inc
contains
  !> Add the scalar `b` to each of the first `nx` elements of `a`, in place.
  !!
  !! @param a   array to update (explicit shape, length `nx`)
  !! @param b   value added to every element
  !! @param nx  number of elements in `a`
  subroutine inc(a, b, nx)
    integer, intent(in)    :: nx
    integer, intent(inout) :: a(nx)
    integer, intent(in)    :: b

    ! Whole-array form of the original element loop; a zero or negative
    ! nx gives a zero-size array and the statement is a no-op.
    a = a + b
  end subroutine inc
end module simpleOps_m
!> Host-only driver: fills an array with 1, adds 3 to every element via
!> `inc`, and prints 'success' when every element equals 4.
program incTest
  use simpleOps_m
  implicit none
  ! NOTE(review): the posted literal was 10241024500, which does not fit a
  ! default (presumably 32-bit) integer; it looks like 1024*1024*500 with
  ! the '*' characters stripped by the forum markup -- confirm.
  integer, parameter :: nx = 1024*1024*500
  ! Allocatable rather than a fixed-size array: nx default integers is a very
  ! large object for static/stack storage.
  integer, allocatable :: a(:)
  integer :: b

  allocate(a(nx))
  a = 1   ! array assignment
  b = 3
  call inc(a, b, nx)
  if (all(a == 4)) then
    write(*,*) 'success'
  end if
  deallocate(a)
end program incTest
總耗時2.3x秒
加入 CUDA 的版本：
!> Device-side helper module: element-wise integer increment kernel.
module simpleOps_m
contains
  !> CUDA Fortran kernel: add `b` to elements of `a`, in place.
  !!
  !! Each thread handles at most `n` elements. With T = blockDim%x*gridDim%x
  !! threads in a 1-D launch, the kernel touches indices 1 .. min(n*T, size(a)),
  !! each exactly once. This is the same index set as giving every thread `n`
  !! consecutive elements, but here a thread's k-th element is `tid + k*T`,
  !! so neighbouring threads touch neighbouring elements on each step.
  !! NOTE(review): that ordering is intended to give coalesced global-memory
  !! access; confirm the effect with a profiler on the target GPU.
  !!
  !! @param a  device array to update (assumed shape)
  !! @param b  value added to every element (passed by value)
  !! @param n  maximum number of elements processed per thread (by value)
  attributes(global) subroutine inc_gpu(a, b, n)
    implicit none
    integer, intent(inout) :: a(:)
    integer, intent(in), value :: b
    integer, intent(in), value :: n
    integer :: tid, stride, idx, k, nx

    nx = size(a)
    ! 1-based global thread index for a 1-D grid of 1-D blocks.
    tid = (blockIdx%x - 1)*blockDim%x + threadIdx%x
    ! Total number of threads in the launch.
    stride = blockDim%x*gridDim%x

    do k = 0, n - 1
      idx = tid + k*stride
      ! idx only grows with k, so nothing further is in range.
      if (idx > nx) exit
      a(idx) = a(idx) + b
    end do
  end subroutine inc_gpu
end module simpleOps_m
!> CUDA Fortran driver: fills a host array with 1, copies it to the device,
!> launches `inc_gpu` to add 3 to every element, copies the result back and
!> prints 'success' when every element equals 4.
program incTest
  use cudafor
  use simpleOps_m
  implicit none
  ! NOTE(review): the posted literal was 10241024500, which does not fit a
  ! default (presumably 32-bit) integer; it looks like 1024*1024*500 with
  ! the '*' characters stripped by the forum markup -- confirm.
  integer, parameter :: nx = 1024*1024*500
  ! Launch configuration: `grid` blocks of `tPB` threads each.
  integer, parameter :: grid = 1024*50, tPB = 1024
  type(dim3) :: dimGrid, dimBlock
  integer :: n                              ! elements handled per thread
  integer :: b
  integer, allocatable :: a(:)              ! host copy
  integer, device, allocatable :: a_d(:)    ! device copy

  allocate(a(nx))
  allocate(a_d(nx))
  a = 1   ! array assignment
  b = 3

  ! Elements per thread = ceiling(nx / (tPB*grid)).
  if (mod(nx, tPB*grid) == 0) then
    n = nx/(tPB*grid)
  else
    n = nx/(tPB*grid) + 1
  end if

  dimGrid  = dim3(grid, 1, 1)
  dimBlock = dim3(tPB, 1, 1)

  ! Assignments between a host array and a `device` array copy the whole
  ! array across.
  ! NOTE(review): these two full-array copies are included in any wall-clock
  ! timing of this program and presumably dominate the trivial kernel; time
  ! the kernel separately (e.g. with CUDA events) to confirm.
  a_d = a
  call inc_gpu<<<dimGrid, dimBlock>>>(a_d, b, n)
  a = a_d

  if (all(a == 4)) then
    write(*,*) 'success'
  end if
  deallocate(a, a_d)
end program incTest
總耗時3.3x秒有時還會到4秒多
純粹只是跑一個迴圈的code,結果卻變慢了!!
拜託各位大大救救我
到底是我的code寫錯了還是我的程式出了問題!!!
假如是我code寫錯幫幫忙指正一下,感謝各位大大