求助，关于cuda值传递的问题

system · 2013 年5 月 22 日 12:53

正在做一个关于 turbo 译码的程序，需要两个 kernel，两者的内容和作用基本相似。编译能够正常通过，但是输出结果不是想要的。仔细查看了 kernel_0，并将中间变量传回 host 打印输出后，发现计算到 G 这一步就已经错了：即使在 kernel 中把 G 直接赋为常数值，传回 host 后打印出来的结果要么全为 0.000，要么为 0.000、1.000、2.000、3.000。我采用的是地址传递（向 kernel 传入设备指针），百思不得其解为什么会出现这样的错误，求高手帮助。（为了调试错误，源代码改动了很多，但与问题相关的很少，已经用红色标注出来。）这里先谢过了。

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "random.h"
#include "random.cpp"
#include <helper_functions.h>
#include <helper_cuda.h>

// includes CUDA
#include <cuda_runtime.h>

// Sizing constants. With the values below,
// blocksPerGrid * SUBFRAME_LENGTH == N (96 * 64 == 6144).
#define threadsPerBlock 8
#define blocksPerGrid 96
#define SUBFRAME_LENGTH 64
// Number of message bits per frame.
#define N 6144
// Number of encoder states; equals 2^memory for memory == 3.
#define NUMSTATES 8
// Number of encoder state bits.
// NOTE(review): a macro called `memory` would collide with the standard
// <memory> header if that header were ever included after this point.
#define memory 3
//const int SEED=1000;
// Host-side noise parameter; main() sets it and passes No/2 to gaussian().
double No;
//void runTest(int argc, char **argv);
// Forward declarations of the host helpers defined after main().
double gaussian(double variance);
void inttobool(unsigned state, bool *array, unsigned size);
void booltoint(bool *array, unsigned size, unsigned *state);
void createencodetable();
void encode(bool *mesg,bool *parity,unsigned size);
bool add(bool a, bool b);
//int rand(void);
//void srand(unsigned int n);
// Encoder tables, allocated and filled by createencodetable().
// tostate[input][state] is the next state for the given input bit.
unsigned *tostate[2];

// fromstate[input][next_state] is the state that leads to next_state.
unsigned *fromstate[2];

// output[input][state] is the parity bit emitted for the given input bit.
bool *output[2];
// Random source used by main() and gaussian(); the Random class comes from
// random.h / random.cpp.
Random before;
extern "C"

/**
 * turbo_kernel_0 - first decoder kernel, currently in a debugging state.
 *
 * Most of the real expressions are replaced by constants so that intermediate
 * values can be copied back to the host and inspected; the intended
 * expressions are kept as "real:" comments next to each assignment.
 *
 * Launch layout: 1-D blocks; the shared arrays are sized for at most
 * threadsPerBlock threads per block. main() in this file launches <<<1, 4>>>.
 *
 * G       device buffer of at least 4 floats; elements 0..3 receive 5.0
 * Le_dev  device buffer of N floats; elements 0..7 are set to 1.0 and the
 *         buffer is read back through the interleaver index
 * La_dev  device buffer of N floats; overwritten at
 *         threadIdx.x + 8*ls + SUBFRAME_LENGTH*blockIdx.x
 * dev_y, dev_yp  referenced only by the commented-out expressions
 *
 * NOTE(review): Alpha is written only for k == 0 but read for every k, so
 * L0_cache / L1_cache are built from uninitialised shared memory until the
 * forward recursion is restored.
 * NOTE(review): A and B are per-thread arrays, so the commented-out
 * recursions, which read other threads' elements, would need them in shared
 * memory.
 */
__global__ void turbo_kernel_0(float *G, float *Le_dev, float *La_dev, float *dev_y, float *dev_yp)
{
   // Lookup tables used only by the commented-out "real" expressions.
   const int m[4] = {1, 1, -1, -1};
   const int n[4] = {1, -1, 1, -1};
   const int G_idx[8] = {3, 1, 2, 0, 0, 2, 1, 3};
   const int G_idy[8] = {3, 2, 2, 3, 3, 2, 2, 3};
   (void)m; (void)n; (void)G_idx; (void)G_idy;

   // Tables selecting the Alpha and G entries that feed L0_cache / L1_cache.
   const int G_idz[8] = {3, 3, 3, 3, 2, 2, 2, 2};
   const int G_idw[8] = {0, 3, 4, 7, 1, 2, 5, 6};
   const int G_idu[8] = {1, 1, 1, 1, 0, 0, 0, 0};
   const int G_idv[8] = {1, 2, 5, 6, 0, 3, 4, 7};

   // Fixed: the declaration of A had no element type.
   float A[threadsPerBlock];
   float B[threadsPerBlock];

   __shared__ float Alpha[threadsPerBlock * SUBFRAME_LENGTH];
   __shared__ float Beta[threadsPerBlock];
   __shared__ float L0_cache[threadsPerBlock * 8 * 8];
   __shared__ float L1_cache[threadsPerBlock * 8 * 8];

   // Channel constant used by the "real" expressions; float math only, so the
   // kernel does not silently promote to double.
   const float dB = -1.0f;
   const float No = 1.0f / powf(10.0f, dB / 10.0f);
   const float Lc = (4.0f * 1.0f) / No;
   (void)Lc;

   const int cacheIndex = threadIdx.x;
   const int frameBase = (SUBFRAME_LENGTH) * blockIdx.x;

   if (cacheIndex == 0)
   {
      A[cacheIndex] = 2.0f;
      B[cacheIndex] = 0.0f;
   }
   else
   {
      A[cacheIndex] = -100.0f;
      B[cacheIndex] = -100.0f;
   }
   __syncthreads();

   int k = 0;

   // G holds only 4 values (m[] and n[] have 4 entries and the host allocates
   // 4 floats), so threads 4 and above must not write to it.
   if (cacheIndex < 4)
      G[cacheIndex] = 5.0f; // real: 0.5 * La_dev[k + frameBase] * m[cacheIndex] + 0.5 * Lc * m[cacheIndex] * dev_y[k + frameBase] + 0.5 * Lc * n[cacheIndex] * dev_yp[k + frameBase]
   __syncthreads();

   A[cacheIndex] = 2.0f; // real: max(A[(2 * cacheIndex) % 8] + G[G_idx[cacheIndex]], A[(2 * cacheIndex) % 8 + 1] + G[3 - G_idx[cacheIndex]])
   __syncthreads();

   Alpha[cacheIndex + k * (threadsPerBlock)] = A[cacheIndex];
   __syncthreads();

   for (int l = (SUBFRAME_LENGTH - 1) / 8; l >= 0; l--)
   {
      const int ls = (SUBFRAME_LENGTH - 1) / 8 - l;
      const int writeBase = 8 * 8 * ls;                  // group offset used when filling the caches
      const int readBase = ls * 8 * (threadsPerBlock);   // group offset used when reducing them

      for (int j = 7; j >= 0; j--)
      {
         const int step = 8 * l + j;

         // real: if (cacheIndex < 4) G[cacheIndex] = 0.5 * La_dev[step + frameBase] * m[cacheIndex] + 0.5 * Lc * m[cacheIndex] * dev_y[step + frameBase] + 0.5 * Lc * n[cacheIndex] * dev_yp[step + frameBase];
         // Fixed: with that assignment commented out, the dangling
         // `if (cacheIndex < 4)` guarded the barrier itself. The barrier must
         // be reached by every thread of the block.
         __syncthreads();

         B[cacheIndex] = 3.0f; // real: max(B[cacheIndex] + G[G_idy[cacheIndex]], B[(cacheIndex + 4) % 8] + G[3 - G_idy[cacheIndex]])
         __syncthreads();

         Beta[cacheIndex] = B[cacheIndex];
         __syncthreads();

         const int s = 7 - j;
         const int slot = cacheIndex + s * (threadsPerBlock) + writeBase;
         L0_cache[slot] = Alpha[G_idw[cacheIndex] + step * (threadsPerBlock)] + G[G_idz[cacheIndex]] + Beta[cacheIndex];
         L1_cache[slot] = Alpha[G_idv[cacheIndex] + step * (threadsPerBlock)] + G[G_idu[cacheIndex]] + Beta[cacheIndex];
         __syncthreads();
      }

      // Each thread takes the maximum of the 8 consecutive cache entries
      // starting at cacheIndex * 8 within this group.
      float r0 = L0_cache[cacheIndex * 8 + readBase];
      float r1 = L1_cache[cacheIndex * 8 + readBase];
      for (int p = 1; p < 8; p++)
      {
         const float c0 = L0_cache[p + cacheIndex * 8 + readBase];
         const float c1 = L1_cache[p + cacheIndex * 8 + readBase];
         if (r0 < c0)
            r0 = c0;
         if (r1 < c1)
            r1 = c1;
      }
      (void)r0; (void)r1; // consumed only by the "real" expression below
      __syncthreads();

      const int pos = cacheIndex + 8 * ls + frameBase;

      Le_dev[ls] = 1.0f; // real: Le_dev[pos] = r0 - r1 - La_dev[pos] - Lc * dev_y[pos]
      __syncthreads();

      // Interleaver index for this position. A scalar replaces the former
      // per-thread `int h[N]` array, which cost 24 KB of zero-filled local
      // memory per thread to hold one value.
      const int f1 = 7;
      const int f2 = 16;
      const int h = (((f1 + f2 * pos) % N) * pos) % N;
      __syncthreads();

      La_dev[pos] = Le_dev[h];
   }
}

// Program main
/*
 * Host driver: builds a random N-bit message, encodes it twice (plain and
 * interleaved), adds noise from gaussian(), runs turbo_kernel_0 on the device,
 * prints the 4 debug values of G and counts the bits that differ from the
 * original message.
 *
 * Changes compared to the posted version:
 *  - the kernel launch is followed by getLastCudaError() and a checked
 *    cudaDeviceSynchronize(); without them a kernel that never ran is only
 *    visible as stale values in G
 *  - G is cleared before the launch and freed at the end
 *  - arrays allocated with new[] are released with delete[]
 *  - prototypes that were re-declared inside the function are removed
 */
int main()
{
   float *dev_y, *dev_yp1, *dev_yp2, *Le_dev, *La_dev, *G, *La_temp;
   float *Le, *La, *y, *yp1, *yp2, *L, *Lf;
   int *h, *u;
   bool *mesg, *mesg_inv, *parity1, *parity2, *uf;

   unsigned int mem_size = sizeof(float) * N;

   y = new float[N];
   yp1 = new float[N];
   yp2 = new float[N];
   Le = new float[N];
   La = new float[N];
   La_temp = new float[N];
   float La_temp1[4];
   Lf = new float[N];
   L = new float[N];
   u = new int[N];
   uf = new bool[N];
   h = new int[N];
   mesg = new bool[N];
   mesg_inv = new bool[N];
   parity1 = new bool[N];
   parity2 = new bool[N];

system · 2013 年5 月 22 日 12:56

   checkCudaErrors(cudaMalloc((void **) &G, sizeof(float) * 4));
   // Start from a known value so that stale device memory is never printed.
   checkCudaErrors(cudaMemset(G, 0, sizeof(float) * 4));
   checkCudaErrors(cudaMalloc((void **) &dev_y, mem_size));
   checkCudaErrors(cudaMalloc((void **) &dev_yp1, mem_size));
   checkCudaErrors(cudaMalloc((void **) &dev_yp2, mem_size));
   checkCudaErrors(cudaMalloc((void **) &Le_dev, mem_size));
   checkCudaErrors(cudaMalloc((void **) &La_dev, mem_size));

   printf(" Starting…\n\n");

   int p1[N], p2[N];

   float dB = -1.0;
   No = 1/pow(10.0,dB/10.0);

   // Random message bits and their integer copy.
   // NOTE(review): u is taken before encode() overwrites the last `memory`
   // bits of mesg, so y is built from the unterminated tail - confirm intended.
   for (int i = 0; i < N; i++)
      mesg[i] = before.boolrandom();
   for (int i = 0; i < N; i++)
      u[i] = mesg[i] ? 1 : 0;

   createencodetable();

   // Interleaver permutation, same formula as in the kernel.
   const int f1 = 7;
   const int f2 = 16;
   for (int i = 0; i < N; i++)
   {
      parity1[i] = false;
      parity2[i] = false;
      h[i] = (((f1 + f2 * i) % N) * i) % N;
   }

   encode(mesg, parity1, N);

   for (int i = 0; i < N; i++)
      mesg_inv[i] = mesg[h[i]];

   encode(mesg_inv, parity2, N);

   for (int i = 0; i < N; i++)
      p1[i] = parity1[i] ? 1 : 0;
   for (int i = 0; i < N; i++)
      p2[i] = parity2[i] ? 1 : 0;

   // Noisy observations; the order of the gaussian() calls is kept so the
   // random sequence is unchanged.
   for (int i = 0; i < N; i++)
   {
      y[i] = u[i] + gaussian(No/2);
      yp1[i] = p1[i] + gaussian(No/2);
      yp2[i] = p2[i] + gaussian(No/2);
      La[i] = 0.0;
      Le[i] = 0.0;
   }

   // copy host memory to device
   checkCudaErrors(cudaMemcpy(Le_dev, Le, mem_size, cudaMemcpyHostToDevice));
   checkCudaErrors(cudaMemcpy(La_dev, La, mem_size, cudaMemcpyHostToDevice));
   checkCudaErrors(cudaMemcpy(dev_y, y, mem_size, cudaMemcpyHostToDevice));
   checkCudaErrors(cudaMemcpy(dev_yp1, yp1, mem_size, cudaMemcpyHostToDevice));
   checkCudaErrors(cudaMemcpy(dev_yp2, yp2, mem_size, cudaMemcpyHostToDevice));

   turbo_kernel_0<<< 1, 4 >>>(G, Le_dev, La_dev, dev_y, dev_yp1);
   // A launch does not return an error code. Launch failures (for example a
   // binary built for the wrong architecture) surface here, and execution
   // faults surface at the synchronize.
   getLastCudaError("turbo_kernel_0 launch failed");
   checkCudaErrors(cudaDeviceSynchronize());

   checkCudaErrors(cudaMemcpy(La_temp1, G, sizeof(float) * 4, cudaMemcpyDeviceToHost));
   for (int i = 0; i < 4; i++)
      printf("t = %f\n ",La_temp1[i]);
   checkCudaErrors(cudaMemcpy(La_temp, La_dev, sizeof(float) * N, cudaMemcpyDeviceToHost));

   // turbo_kernel_1<<< blocksPerGrid, threadsPerBlock >>>(Le_dev, La_dev, dev_yp2);

   checkCudaErrors(cudaMemcpy(Le, Le_dev, mem_size, cudaMemcpyDeviceToHost));

   // Combine the two device outputs and scatter them through the interleaver.
   for (int i = 0; i < N; i++)
   {
      Lf[i] = 0.0;
      L[i] = Le[i] + La_temp[i];
      Lf[h[i]] = L[i];
   }

   // Hard decision.
   for (int i = 0; i < N; i++)
      uf[i] = (Lf[i] > 0);

   int sum_error = 0, sum = 0;
   for (int i = 0; i < N; i++)
   {
      sum = sum + 1;
      if (mesg[i] != uf[i])
         sum_error = sum_error + 1;
   }
   printf("sum = %d ", sum);
   printf("sum_error = %d \n ", sum_error);

   // Everything above was allocated with new[], so it is released with delete[].
   delete[] y;
   delete[] yp1;
   delete[] yp2;
   delete[] uf;
   delete[] Le;
   delete[] La;
   delete[] La_temp;
   delete[] Lf;
   delete[] L;
   delete[] u;
   delete[] h;
   delete[] mesg;
   delete[] mesg_inv;
   delete[] parity1;
   delete[] parity2;

   checkCudaErrors(cudaFree(G));
   checkCudaErrors(cudaFree(dev_y));
   checkCudaErrors(cudaFree(dev_yp1));
   checkCudaErrors(cudaFree(dev_yp2));
   checkCudaErrors(cudaFree(Le_dev));
   checkCudaErrors(cudaFree(La_dev));

   return 0;
}

// Unpacks the low `size` bits of `state` into `array`, least significant bit
// first: array[x] is true when bit x of `state` is set.
// Fixed: the posted code assigned to the pointer parameter itself instead of
// to array[x].
void inttobool(unsigned state, bool *array, unsigned size)
{
    for (unsigned x = 0; x < size; x++)
    {
        array[x] = (state & 1u) != 0;
        state >>= 1; // move the next bit into the lowest position
    }
}

// Packs `size` booleans into an unsigned integer, array[x] becoming bit x, and
// stores the result in *state. Inverse of inttobool().
// Fixed: the posted code tested the pointer instead of array[x]; the loop
// index and the shifted constant are now unsigned to match `size`.
void booltoint(bool *array, unsigned size, unsigned *state)
{
    *state = 0;

    for (unsigned x = 0; x < size; x++)
    {
        if (array[x])
            *state |= (1u << x);
    }
}

// Modulo-2 addition (exclusive OR) of two bits.
bool add(bool a, bool b)
{
    return a != b;
}

// Returns an approximately normal random value built from 24 draws of
// before.doublerandom().
// The draws are summed, centred by subtracting 0.5 per draw, and scaled by
// sqrt(variance / 2).
// NOTE(review): presumably doublerandom() is uniform on [0, 1), which would
// give the result the requested variance - confirm against random.h.
double gaussian(double variance)
{
    const int draws = 24;
    const double scale = sqrt(variance / 2.0);

    double total = 0;
    int remaining = draws;
    while (remaining-- > 0)
        total += before.doublerandom();

    return scale * (total - 0.5 * draws);
}

void createencodetable()
{

//int NUMSTATES = 8;
//int memory = 3;
bool *boolstate;
bool *newstate;
// int intstate, input;

// [2] = input, [16] = current state, tostate[2][16] = next state
//unsigned *tostate[2];
// [2] = last input, [16] = current state, fromstate[2][16] = previous state
//unsigned *fromstate[2];
// [2] = input, [16] = current state, output[2][16] = output of encoder

//bool *output[2];

// create arrays used by encode and decode

output[0] = new bool[NUMSTATES];
output[1] = new bool[NUMSTATES];
fromstate[0] = new unsigned[NUMSTATES];
fromstate[1] = new unsigned[NUMSTATES];
tostate[0] = new unsigned[NUMSTATES];
tostate[1] = new unsigned[NUMSTATES];

boolstate = new bool[memory];
newstate = new bool[memory];

for (unsigned input=0;input<2;input++)
for (unsigned intstate=0;intstate<NUMSTATES;intstate++)
{
bool boolinput = (input == 0) ? false : true;

inttobool(intstate,boolstate,memory);

// calculate output due to the input
output[input][intstate] = add(boolinput,boolstate[0]);
output[input][intstate] = add(output[input][intstate],boolstate[2]);//sequence?
output[input][intstate] = add(output[input][intstate],boolstate[1]);
output[input][intstate] = add(output[input][intstate],boolstate[2]);
//output[input][intstate] = add(output[input][intstate],boolstate[3]);

// calculate new states
//newstate[3] = boolstate[2];
newstate[2] = boolstate[1];
newstate[1] = boolstate[0];
newstate[0] = add(add(boolinput,boolstate[1]),boolstate[2]);//£¿£¿
// from s’ to s
booltoint (newstate,memory,&tostate[input][intstate]);
// from s to s’
fromstate[input][tostate[input][intstate]] = intstate;//now to future to now
}

delete boolstate;
delete newstate;
}

// Runs the encoder over `size` message bits, starting from state 0, and writes
// one parity bit per message bit into `parity`.
// Reads the global tostate / output tables, so createencodetable() must have
// been called first.
// The last `memory` bits of `mesg` are OVERWRITTEN in place: each is set to
// the low bit of tostate[0][state], which makes the low bit of the next state
// zero.
// Fixed: the posted code assigned to the `mesg` and `parity` pointers instead
// of mesg[x] and parity[x]. The tail test is written as x + memory >= size so
// it cannot underflow when size < memory.
void encode(bool *mesg,bool *parity,unsigned size)
{
    unsigned state = 0;

    for (unsigned x = 0; x < size; x++)
    {
        // Tail section: replace the message bit with the forcing bit.
        if (x + memory >= size)
            mesg[x] = (tostate[0][state] & 1) != 0;

        // Do not rely on bool converting to exactly 0 or 1.
        const int uk = mesg[x] ? 1 : 0;

        parity[x] = output[uk][state]; // parity bit for this message bit
        state = tostate[uk][state];    // advance the encoder
    }
}
/*
void encode(bool *mesg,bool *parity,unsigned size, bool force)
{

unsigned state=0;

for (int x=0;x<size;x++)
{
// force the encoder to zero state at the end// how can force the encoder to zero state!!!
if (x>=size-memory && force)
{
if (tostate[0][state]&1)//odd,is 1
mesg = true;//??
else
mesg = false;
}

// can’t assume the bool type has an intrinsic value of 0 or 1
// may differ from platform to platform
int uk = mesg ? 1 : 0;

// calculate output due to new mesg bit
parity = output[uk][state];
// calculate the new state
state = tostate[uk][state];
}
}

*/

system · 2013 年5 月 22 日 13:07

不知道怎么回事，设置的红色没有显示出来，这里解释一下：39 行声明指针 G，83 行给 G 指向的数组赋值为 5.0，2 楼第一句给其分配空间，后面是关键的 kernel 调用和打印输出，如下：

turbo_kernel_0<<< 1, 4 >>>(G, Le_dev, La_dev, dev_y, dev_yp1);
checkCudaErrors( cudaMemcpy(La_temp1, G, sizeof(float) * 4, cudaMemcpyDeviceToHost)); 
for(int i = 0; i < 4; i++)
printf("t = %f\n ",La_temp1[i]);

system · 2013 年5 月 22 日 13:23

LZ您好，您的代码大致扫过，因为实在是太冗长了，未能整体理解。

根据您1#的叙述和3#的补充，重点看了下和G有关的地方，并未发现异常用法。所以无法给您建议了。

请您继续寻找和尝试，祝您好运~

system · 2013 年5 月 22 日 13:32

谢谢版主耐心回复，代码确实太冗长，G只是作为下面进一步计算的一个中间变量，只是为了测试而将其输出，下面的所有其他代码与这个问题无关暂无需了解。测试的时候将常数5.0赋值给它（实际代码中是一个很长的表达式），很纳闷一直没有输出5.0，所以觉得应该是值传递的问题，但具体是哪里出问题了也不知道，所以将整个代码贴了出来。感谢！

system · 2013 年5 月 22 日 13:34

LZ您好，但是您G相关的申请空间，kernel内赋值，memcpy回拷似乎都是没错的。

所以您说的现象真心十分怪异了。

暂时没有其他发现，祝您好运~

system · 2013 年5 月 22 日 13:37

我会继续尝试，同时希望高手能指点迷津，不胜感激~

system · 2013 年5 月 22 日 13:39

谢谢，也祝您编码顺利~:P