///////////////////////////// Aderivative的定义////////////////////////////////////
ADerivative = newfloat*[m_receiverm_tnum]; //30300
for( i=0; i<m_receiver*m_tnum; ++i)
{
ADerivative=new float[m_GridnumX*m_GridnumZ];//30*50
}
for( i=0; i<m_receiver*m_tnum; ++i)
for( j=0; j<m_GridnumX*m_GridnumZ; ++j)
{
ADerivative[j]=0.0;
}
/////////////////////////////////////////////////////////////////////////////
#defineM 30*300
#define N 30*50
float *g_ADerivative[N];
float *g_C1[N];
cudaMalloc((void**)&g_ADerivative,NMsizeof(float));
/////////// Aderivative是 new 的 不连续的地址,可不可以直接cudamemcpy////////////
cudaMemcpy(g_ADerivative,ADerivative,m_receiverm_tnumm_GridnumXm_GridnumZsizeof(float),cudaMemcpyHostToDevice);
////////////////////Aderivative的转置 * Aderivative////////////////////////////////
MatrixMutil0(g_ADerivative,g_C1,m_receiver*m_tnum,m_GridnumX*m_GridnumZ,1,BLOCK);
void MatrixMutil0(float **Mx,float **result,intMX,int MZ,floatalpha,int thread)
{
dim3 THread(thread,thread);
dim3 BLock(MZ/thread,MZ/thread);
g_MatrixMutil0<<<BLock,THread>>>(Mx,result,MX,MZ,alpha);
cudaError_t cudaStatus;
cudaStatus = cudaDeviceSynchronize();
if(cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronizereturned error code %d after Kernel!\n", cudaStatus);}
}
global void g_MatrixMutil0(float**Mx,float **result,intMX,int MZ,floatalpha) {
int x =blockIdx.x * blockDim.x + threadIdx.x;
int y =blockIdx.y * blockDim.y + threadIdx.y;
int tx =threadIdx.x;
int ty =threadIdx.y;
int bx =blockIdx.x;
int by =blockIdx.y;
floatsub;
inti,j,k,a,b;
for(i=0;i<MX/30;i++)
{
float*Asub = GetSubMatrix(Mx,i,bx);
//获取指向当前矩阵N的子矩阵的指针Nsub
float*A_sub = GetSubMatrix(Mx,i,by);
__shared__ float AT[BLOCK][BLOCK];
__shared__ float A[BLOCK][BLOCK];
AT[ty][tx]=*(A_sub+tx*BLOCK+ty);
A[ty][tx] =*(Asub+tx*BLOCK+ty);
__syncthreads();
for(k=0;k<BLOCK;k++)
{
sub+=AT[ty][k]*A[k][tx]*alpha;
}
__syncthreads();
}
result[y][x]=sub;
}
device float* GetSubMatrix( float**A,int row ,int col)
{
return&A[row*BLOCK][col*BLOCK];
}
程序运行报错:cudaDeviceSynchronize returnederror code 30 after Kernel!
但是有结果,结果不正确。
程序是数据传输问题,还是矩阵自乘部分的问题。烦请大神ice和横扫千军指导,谢谢!!