请问下面这个kernel有什么问题?

我删掉最后6行(S00=P的六行)之后,和不删的情况相比,两种情况最后给出的结果(FP数组)不太一样?

attributes(global) SUBROUTINE CALPTENSOR_KERNEL(IM,ITYP,XP,NAPDEV,IA0, &
                                                NPRT,mxKVOIS,KVOIS,INDI,mvl, &
                                                cra011,cra021,cra031, &
                                                cra012,cra022,cra032, &
                                                cra013,cra023,cra033, &
                                                CSI, KT, FPOTR, FPOTB, &
                                                DEN,FP, S00)
!*** PURPOSE: to calculate the forces on atoms, and also the virial tensor.
!
!    For every atom IC = 1..NPRT handled by this launch, the kernel walks the
!    neighbour list of the atom, accumulates the three force components into
!    FP(1:3,IC) and six products SEP(a)*SEP(b)*FORTOT/2 into S00(1:6,IC), in
!    the order xx, yy, zz, yz, zx, xy.
!
!    INPUT:  IM       - extent of ITYP, DEN and of the second dimension of XP
!            ITYP     - type of each atom, indexed by the whole-box atom id
!            XP       - positions, indexed by the whole-box atom id
!            NAPDEV   - extent of the per-device arrays KVOIS, INDI, mvl, FP
!            IA0      - offset added to the device-local id IC to obtain the
!                       whole-box id (IC0 = IC + IA0)
!            NPRT     - number of atoms processed by this launch
!            mxKVOIS  - first extent of INDI and mvl
!            KVOIS    - number of neighbours of each device-local atom
!            INDI     - neighbour list; entries are whole-box atom ids
!            mvl      - per-neighbour bit flags selecting which of the cra0ab
!                       vectors are subtracted from / added to the separation
!            cra0ab   - components of three shift vectors (cra011,cra021,cra031),
!                       (cra012,cra022,cra032), (cra013,cra023,cra033);
!                       presumably the box vectors used for periodic images -
!                       TODO confirm against the caller
!            CSI      - factor converting sqrt(r) into a (real) table position
!            KT       - first extent of the tables FPOTR and FPOTB
!            FPOTR    - table used for the pair term (divided by r**2 below)
!            FPOTB    - table used for the density-dependent term (divided by r)
!            DEN      - per-atom factor multiplying the FPOTB term; presumably
!                       dF/drho of each atom - TODO confirm against the caller
!    OUTPUT: FP       - accumulated force components of each device-local atom
!            S00      - accumulated tensor components of each device-local atom
!
!    NOTE: the multiplication operators and the "*" of the assumed-size
!          declarations had been lost from the pasted listing; they are
!          restored here.
!
!    NOTE(review): two places can read or write outside the arrays without any
!          runtime diagnostic and are worth checking when results change after
!          an apparently unrelated edit:
!            - the tables are assumed-size and are indexed with KK+1;
!            - IC runs to NPRT while KVOIS/INDI/mvl/FP are dimensioned NAPDEV,
!              and S00 is assumed-size, so the host allocation must provide at
!              least NPRT columns. Confirm both on the host side.
    use gpu, only:mp_BLOCKSIZE
    implicit none
    !--- DUMMY VARIABLES
    integer,      value,  intent(in)::IM
    real(KINDDF), value,  intent(in)::cra011,cra021,cra031,cra012,cra022,cra032,cra013,cra023,cra033
    integer,      device, intent(in)::ITYP(IM)
    real(KINDDF), device, intent(in)::XP(3,IM)
    integer,      value,  intent(in)::NAPDEV,IA0,NPRT,mxKVOIS
    integer,      device, intent(in)::KVOIS(NAPDEV)
    integer,      device, intent(in)::INDI(mxKVOIS,NAPDEV)
    integer(1),   device::mvl(mxKVOIS,NAPDEV)
    real(KINDDF), value,  intent(in)::CSI
    integer,      value,  intent(in)::KT
    real(KINDDF), device, intent(in)::FPOTR(KT,*)
    real(KINDDF), device, intent(in)::FPOTB(KT,*)
    real(KINDDF), device, intent(in)::DEN(IM)
    !--- Output:
    real(KINDDF), device, intent(out)::FP(3,NAPDEV)
    real(KINDDF), device, intent(out)::S00(6,*)
    !--- Local variables
    integer::J,K,KK, IW, IIW, KTABij, KTABji, ITYPI, ITYPJ
    integer::MVFLAG
    real(KINDDF)::FORTOT,FPX,FPY,FPZ, SK, DK, R2, R, DENKI, DENKJ
    real(KINDDF)::SEPX, SEPY, SEPZ, POSX, POSY, POSZ
    real(KINDDF)::P1, P2, P3, P4, P5, P6
    integer::IC,IC0, ITB, IBG, NTB, NTG, ITG

    !--- Note: blockdim%y = blockdim%z = griddim%y = griddim%z = 1, which means that
    !---       the range of threadidx%y, threadidx%z, blockidx%y, blockidx%z is [1,1]
    ITB = threadidx%x          ! ID of thread in one block
    IBG = blockidx%x           ! ID of block in one grid
    NTB = blockdim%x           ! size of block
    NTG = NTB*griddim%x        ! number of threads in one grid
    ITG = (IBG-1)*NTB+ITB      ! the id of thread in one grid

    !--- Each thread handles atoms ITG, ITG+NTG, ITG+2*NTG, ... up to NPRT
    do IC=ITG, NPRT, NTG
       ! IC  - the id of the atom on the device
       !       NOTE: IC is not the id of the same atom in the whole box
       ! IC0 - the id of the same atom in the whole box
       !       NOTE: XP(:,I) is the position of the Ith atom in the whole box
       IC0   = IC+IA0
       POSX  = XP(1,IC0)
       POSY  = XP(2,IC0)
       POSZ  = XP(3,IC0)
       ITYPI = ITYP(IC0)
       DENKI = DEN(IC0)

       !--- number of neighbours of atom IC and reset of the accumulators
       IIW = KVOIS(IC)
       FPX = 0.0d0
       FPY = 0.0d0
       FPZ = 0.0d0
       P1  = 0.0d0
       P2  = 0.0d0
       P3  = 0.0d0
       P4  = 0.0d0
       P5  = 0.0d0
       P6  = 0.0d0

       DO IW=1, IIW, 1
          !--- NOTE: the particle index in the neighbour list is the index of
          !---       the particle in the whole box
          J = INDI(IW,IC)

          !--- separation between particle IC and its IWth neighbour
          SEPX = POSX - XP(1,J)
          SEPY = POSY - XP(2,J)
          SEPZ = POSZ - XP(3,J)

          !--- apply the shifts selected by the bit flags of this neighbour.
          !    The flag is read once and widened to default integer so that
          !    both arguments of iand have the same kind.
          MVFLAG = mvl(IW,IC)
          if(MVFLAG > 0) then
             if(iand(MVFLAG, 1) == 1) then
                SEPX = SEPX - cra011
                SEPY = SEPY - cra021
                SEPZ = SEPZ - cra031
             end if
             if(iand(MVFLAG, 2) == 2) then
                SEPX = SEPX + cra011
                SEPY = SEPY + cra021
                SEPZ = SEPZ + cra031
             end if
             if(iand(MVFLAG, 4) == 4) then
                SEPX = SEPX - cra012
                SEPY = SEPY - cra022
                SEPZ = SEPZ - cra032
             end if
             if(iand(MVFLAG, 8) == 8) then
                SEPX = SEPX + cra012
                SEPY = SEPY + cra022
                SEPZ = SEPZ + cra032
             end if
             if(iand(MVFLAG, 16) == 16) then
                SEPX = SEPX - cra013
                SEPY = SEPY - cra023
                SEPZ = SEPZ - cra033
             end if
             if(iand(MVFLAG, 32) == 32) then
                SEPX = SEPX + cra013
                SEPY = SEPY + cra023
                SEPZ = SEPZ + cra033
             end if
          end if

          R2 = SEPX*SEPX+SEPY*SEPY+SEPZ*SEPZ

          !--- only neighbours closer than the squared cutoff of this type pair contribute
          ITYPJ = ITYP(J)
          if(R2 < dcm_dtr2(ITYPI,ITYPJ)) then
             KTABij = dcm_KPAIR(ITYPI,ITYPJ)
             KTABji = dcm_KPAIR(ITYPJ,ITYPI)

             !--- table position: integer part KK, fractional part DK used
             !    for linear interpolation between entries KK and KK+1
             R  = DSQRT(R2)
             SK = DSQRT(R)*CSI
             KK = int(SK)
             DK = SK-KK
             DENKJ = DEN(J)

             ! NOTE(review): entry KK+1 of the assumed-size tables is read
             !               here; nothing in this kernel guarantees that it
             !               exists (nor that KK >= 1) - confirm on the host.
             FORTOT= (FPOTR(KTABij, KK) + DK*(FPOTR(KTABij, KK+1) - FPOTR(KTABij, KK)))/R2 + &
                     ((FPOTB(KTABji, KK) + DK*(FPOTB(KTABji, KK+1) - FPOTB(KTABji, KK)))*DENKJ + &
                      (FPOTB(KTABij, KK) + DK*(FPOTB(KTABij, KK+1) - FPOTB(KTABij, KK)))*DENKI)/R
             ! = -{∂φ/∂r[ij]+(∂F/∂ρ[i])*(∂ρ[i]/∂r[ij])+(∂F/∂ρ[j])*(∂ρ[j]/∂r[ij])}/r[ij]

             FPX = FPX + FORTOT*SEPX
             FPY = FPY + FORTOT*SEPY
             FPZ = FPZ + FORTOT*SEPZ

             !--- the tensor terms carry a factor 1/2
             FORTOT = FORTOT*5.0d-1
             P1 = P1+SEPX*SEPX*FORTOT
             P2 = P2+SEPY*SEPY*FORTOT
             P3 = P3+SEPZ*SEPZ*FORTOT
             P4 = P4+SEPY*SEPZ*FORTOT
             P5 = P5+SEPZ*SEPX*FORTOT
             P6 = P6+SEPX*SEPY*FORTOT
          end if
       ENDDO

       !--- store the results of atom IC
       FP(1,IC) = FPX
       FP(2,IC) = FPY
       FP(3,IC) = FPZ
       S00(1,IC) = P1
       S00(2,IC) = P2
       S00(3,IC) = P3
       S00(4,IC) = P4
       S00(5,IC) = P5
       S00(6,IC) = P6
    END DO
    RETURN
END SUBROUTINE CALPTENSOR_KERNEL

删除掉(S00=P的六行)赋值语句,应该是不会影响前面的FP数组的结果的,但不知道为什么删除前后运行后的FP的结果不一样。
愁呀,不知道该怎么弄:(

@GuangyuanKan @LGZ 麻烦二位帮忙看一下

总觉得gpu的内存里有什么错误,device数据混乱?

希望大神出现:)

在kernel里面是不需要加device关键字的吧?这个关键字应该是在host端用;
如果语法没错的话,从逻辑上来说,S00只是单纯的赋值操作,不会影响其他数据的结果,你可以尝试下把函数参数的S00形参以及S00的参数申明都去掉试试:)

我觉得可以把外层循环去掉,然后观察去掉S00赋值语句是否还存在影响,如果存在的话,说明这个问题可能是逻辑上的问题

版主,能否向您请教一下,CUDA Fortran 怎么装?我已经装好了基于 Visual Studio 2012 的 CUDA 8.0,但是它的编程语言是 C++,我希望用 Fortran 来写,我应该怎么处理呢?可以直接装上 PGI Visual Fortran 编译器么?编译器怎么获取呢?小生是初学者,感谢楼主的耐心指教!:):):slight_smile:

建议使用PGI编译器社区版或专业版。