正在做一个关于turbo译码的程序,需要两个kernel,两个的内容和作用基本相似,编译能够正常通过,但是输出结果不是想要的。仔细查看了kernel_0,并将中间变量传到host打印输出后,发现里面的计算到G后就错了,将G赋为常数值,传递到host中或者全为0.000,或者为0.000,1.000,2.000,3.000,我是采用地址传递,百思不得其解为什么会出现这样错误,求高手帮助.(为了调试错误,源代码改动了很多,但与问题相关的很少,已经用红色标注出来)这里先谢过了
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include "random.h"
#include "random.cpp"
#include <helper_functions.h>
#include <helper_cuda.h>
// includes CUDA
#include <cuda_runtime.h>
#define threadsPerBlock 8
#define blocksPerGrid 96
#define SUBFRAME_LENGTH 64
#define N 6144
#define NUMSTATES 8
#define memory 3
//const int SEED=1000;
double No;
//void runTest(int argc, char **argv);
double gaussian(double variance);
void inttobool(unsigned state, bool *array, unsigned size);
void booltoint(bool *array, unsigned size, unsigned *state);
void createencodetable();
void encode(bool *mesg,bool *parity,unsigned size);
bool add(bool a, bool b);
//int rand(void);
//void srand(unsigned int n);
unsigned *tostate[2];
unsigned *fromstate[2];
bool *output[2];
Random before;
extern "C"
__global__ void turbo_kernel_0(float *G, float *Le_dev, float *La_dev, float *dev_y, float *dev_yp)
{
const int m[4] = {1, 1, -1, -1};
const int n[4] = {1, -1, 1, -1};
const int G_idx[8] = {3, 1, 2, 0, 0, 2, 1, 3};
const int G_idy[8] = {3, 2, 2, 3, 3, 2, 2, 3};
//float Lc = 5.0;
A[threadsPerBlock];
float B[threadsPerBlock];
int h[N]={};
//__shared__ float G[threadsPerBlock/2];
__shared__ float Alpha[threadsPerBlock * SUBFRAME_LENGTH];
__shared__ float Beta[threadsPerBlock];
__shared__ float L0_cache[threadsPerBlock * 8 * 8];
__shared__ float L1_cache[threadsPerBlock * 8 * 8];
float r0[threadsPerBlock] = {};
float r1[threadsPerBlock] = {};
float dB = -1.0;
float No = 1/pow(10.0,dB/10.0);
float Lc = (4.0*1.0)/No;
//int i = blockDim.x * blockIdx.x + threadIdx.x;//blockDim.x = 8; i<8,it is enough
//int i = blockDim.x * blockIdx.x + threadIdx.x;
int cacheIndex = threadIdx.x;
if(cacheIndex == 0)
{
A[cacheIndex]= 2.0;
B[cacheIndex]= 0.0;
}
else
{
A[cacheIndex]= -100.0;
B[cacheIndex]= -100.0;
}
__syncthreads();
int k = 0;
G[cacheIndex] = 5.0;// 0.5 * La_dev[k + (SUBFRAME_LENGTH ) * blockIdx.x] * m[cacheIndex] + 0.5 * Lc * m[cacheIndex] * dev_y[k + (SUBFRAME_LENGTH ) * blockIdx.x] + 0.5 * Lc * n[cacheIndex] * dev_yp[k + (SUBFRAME_LENGTH ) * blockIdx.x];//G[i] will be missing?
__syncthreads();
__syncthreads();
A[cacheIndex] = 2.0;//(A[(2 * cacheIndex) % 8] + G[G_idx[cacheIndex]]) > (A[(2 * cacheIndex) % 8 + 1] + G[3 - G_idx[cacheIndex]]) ? (A[(2 * cacheIndex) % 8] + G[G_idx[cacheIndex]]) : (A[(2 * cacheIndex) % 8 + 1] + G[3 - G_idx[cacheIndex]]);
__syncthreads();
Alpha[cacheIndex + k * (threadsPerBlock)] = A[cacheIndex] ;
__syncthreads();
for(int l = (SUBFRAME_LENGTH - 1)/8; l>= 0; l--)
{
int ls = (SUBFRAME_LENGTH - 1)/8 - l;
for(int j = 7; j >= 0; j--)
{
int k = 8 * l + j;
if(cacheIndex < 4)
// G[cacheIndex] = 4.0;//0.5 * La_dev[k + (SUBFRAME_LENGTH )* blockIdx.x] * m[cacheIndex] + 0.5 * Lc * m[cacheIndex] * dev_y[k + (SUBFRAME_LENGTH )* blockIdx.x] + 0.5 * Lc* n[cacheIndex] * dev_yp[k + (SUBFRAME_LENGTH )* blockIdx.x];
__syncthreads();
B[cacheIndex] = 3.0;//(B[cacheIndex] + G[G_idy[cacheIndex]]) > (B[(cacheIndex + 4) % 8] + G[3 - G_idy[cacheIndex]]) ? (B[cacheIndex] + G[G_idy[cacheIndex]]) : (B[(cacheIndex + 4) % 8] + G[3 - G_idy[cacheIndex]]);
__syncthreads();
Beta[cacheIndex] = B[cacheIndex];
__syncthreads();
const int G_idz[8] = {3, 3, 3, 3, 2, 2, 2, 2};
const int G_idw[8] = {0, 3, 4, 7, 1, 2, 5, 6};
const int G_idu[8] = {1, 1, 1, 1, 0, 0, 0, 0};
const int G_idv[8] = {1, 2, 5, 6, 0, 3, 4, 7};
int s = 7 - j;
L0_cache[cacheIndex+ s * (threadsPerBlock ) + 8 * 8 * ls] = Alpha[G_idw[cacheIndex] + k * (threadsPerBlock )] + G[G_idz[cacheIndex]] + Beta[cacheIndex];//sequence of A and B? ;from back to front
L1_cache[cacheIndex+ s * (threadsPerBlock ) + 8 * 8 * ls] = Alpha[G_idv[cacheIndex] + k * (threadsPerBlock )] + G[G_idu[cacheIndex]] + Beta[cacheIndex];
//L0_cache[cacheIndex+ s * (threadsPerBlock )] = Alpha[G_idw[cacheIndex] + k * (threadsPerBlock )] + G[G_idz[cacheIndex]] + Beta[cacheIndex];//sequence of A and B?
// L1_cache[cacheIndex+ s * (threadsPerBlock )] = Alpha[G_idv[cacheIndex] + k * (threadsPerBlock )] + G[G_idu[cacheIndex]] + Beta[cacheIndex];
__syncthreads();
}
r0[0] = L0_cache[ ls * 8 * (threadsPerBlock )];
r1[0] = L1_cache[ ls * 8 * (threadsPerBlock )];
r0[1] = L0_cache[ 8 + ls * 8 * (threadsPerBlock )];
r1[1] = L1_cache[ 8 + ls * 8 * (threadsPerBlock )];
r0[2] = L0_cache[ 16 + ls * 8 * (threadsPerBlock )];
r1[2] = L1_cache[ 16 + ls * 8 * (threadsPerBlock )];
r0[3] = L0_cache[ 24 + ls * 8 * (threadsPerBlock )];
r1[3] = L1_cache[ 24 + ls * 8 * (threadsPerBlock )];
r0[4] = L0_cache[ 32 + ls * 8 * (threadsPerBlock )];
r1[4] = L1_cache[ 32 + ls * 8 * (threadsPerBlock )];
r0[5] = L0_cache[ 40 + ls * 8 * (threadsPerBlock )];
r1[5] = L1_cache[ 40 + ls * 8 * (threadsPerBlock )];
r0[6] = L0_cache[ 48 + ls * 8 * (threadsPerBlock )];
r1[6] = L1_cache[ 48 + ls * 8 * (threadsPerBlock )];
r0[7] = L0_cache[ 56 + ls * 8 * (threadsPerBlock )];
r1[7] = L1_cache[ 56 + ls * 8 * (threadsPerBlock )];
__syncthreads();
for(int p = 1; p < 8; p++)
{
if(r0[cacheIndex] < L0_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )])
r0[cacheIndex] = L0_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )];
if(r1[cacheIndex] < L1_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )])
r1[cacheIndex] = L1_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )];
}
// r0[cacheIndex] = r0[cacheIndex] > L0_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )] ? r0[cacheIndex] : L0_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )];
// r1[cacheIndex] = r1[cacheIndex] > L1_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )] ? r1[cacheIndex] : L1_cache[p + cacheIndex * 8 + ls * 8 * (threadsPerBlock )];
/*
r0[cacheIndex] = L0_cache[cacheIndex + s * (threadsPerBlock )];
r1[cacheIndex] = L1_cache[cacheIndex + s * (threadsPerBlock )];
for(int p = 1; p < 8; p++)
{
int Index = (cacheIndex + p) & 7;
r0[cacheIndex] = r0[cacheIndex] > L0_cache[Index + s * (threadsPerBlock )] ? r0[cacheIndex] : L0_cache[Index + s * (threadsPerBlock )];
r1[cacheIndex] = r1[cacheIndex] > L1_cache[Index + s * (threadsPerBlock )] ? r1[cacheIndex] : L1_cache[Index + s * (threadsPerBlock )];
}
*/
__syncthreads();
Le_dev[ls] = 1.0;//r0[cacheIndex] - r1[cacheIndex] - La_dev[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x] - Lc * dev_y[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x];
//La_dev[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x] = r0[cacheIndex] - r1[cacheIndex] - La_dev[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x] - Lc * dev_y[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x];
__syncthreads();
int f1 = 7;
int f2 = 16;
h[cacheIndex + 8 * ls + (SUBFRAME_LENGTH )* blockIdx.x] = (((f1 + f2 * (cacheIndex + 8* ls + (SUBFRAME_LENGTH ) * blockIdx.x) ) % N) * (cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x))% N ;//need to be put into sm?
__syncthreads();
La_dev[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x] = Le_dev[h[cacheIndex + 8 * ls + (SUBFRAME_LENGTH ) * blockIdx.x]];
}
}
// Program main
int main()
{
void inttobool(unsigned state, bool *array, unsigned size);
void booltoint(bool *array, unsigned size, unsigned *state);
bool add(bool a, bool b);
void createencodetable();
void encode(bool *mesg,bool *parity,unsigned size);
float *dev_y, *dev_yp1, *dev_yp2, *Le_dev, *La_dev, *G, *La_temp;
float *Le, *La, *y, *yp1, *yp2, *L, *Lf ;
int *h, *u;
bool *mesg, *mesg_inv, *parity1, *parity2, *uf;
//d_idata;
// float Lc = 5.0;
unsigned int mem_size = sizeof(float) * N;
y = new float[N];
yp1 = new float[N];
yp2 = new float[N];
//yp2 = (float *) malloc(mem_size);
Le = new float[N];
La = new float[N];
La_temp = new float[N];
float La_temp1[4];
// La_temp= (float *) malloc(sizeof(float) * 4);
Lf = new float[N];
L = new float[N];
u = new int[N];
uf = new bool[N];
h = new int[N];
mesg = new bool[N];
mesg_inv = new bool[N];
parity1 = new bool[N];
parity2 = new bool[N];