同一个文件,在32位linux下执行结果如图“在32为linux的执行结果.jpeg”所示,可以正常运行,但在64位linux下可以编译成功,但是执行时间却是0,kernel没有运行?。求各位大神给看看,小弟先谢谢了
//text.cu
//
#include <stdio.h>
#include <cuda.h>
#define READACCESSFLAG 1
#define WRITEACCESSFLAG 2
typedef struct
{
int width;
int height;
int* dev_data;
int* rwFlag;
}swap_Mem;
const int threadsPerBlock = 256;
global void Handle(swap_Mem input, swap_Mem output)
{
shared int share_mem[threadsPerBlock];
int tid = threadIdx.x + blockDim.x*blockIdx.x;
int flag = 0;
while(1)
{
if(atomicCAS((input.rwFlag), READACCESSFLAG, READACCESSFLAG)==READACCESSFLAG)
{
share_mem[tid] = input.dev_data[tid];
__syncthreads();
atomicExch((input.rwFlag),WRITEACCESSFLAG);
share_mem[tid] += 1;
__syncthreads();
flag = 1;
}
if(flag && atomicCAS((output.rwFlag), WRITEACCESSFLAG, WRITEACCESSFLAG)==WRITEACCESSFLAG)
{
output.dev_data[tid] = share_mem[tid];
__syncthreads();
atomicExch((output.rwFlag),READACCESSFLAG);
break;
}
}
}
int main()
{
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
int input[threadsPerBlock] = {0};
int initFlag = READACCESSFLAG;
int flag = WRITEACCESSFLAG;
swap_Mem inputdata;
inputdata.width = threadsPerBlock;
inputdata.height = 1;
inputdata.dev_data = input;
inputdata.rwFlag = &initFlag;
swap_Mem d_mem_1;
swap_Mem d_mem_2;
int size;
d_mem_1.width = inputdata.width;
d_mem_1.height = inputdata.height;
size = inputdata.width*inputdata.height;
cudaMalloc((void**)&d_mem_1.dev_data, size*sizeof(int));
cudaMalloc((void**)&d_mem_1.rwFlag, sizeof(int));
cudaMemcpy(d_mem_1.dev_data,
inputdata.dev_data,
size*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(d_mem_1.rwFlag,
inputdata.rwFlag,
sizeof(int),
cudaMemcpyHostToDevice);
d_mem_2.width = inputdata.width;
d_mem_2.height = inputdata.height;
size = inputdata.width*inputdata.height;
cudaMalloc((void**)&d_mem_2.dev_data, size*sizeof(int));
cudaMalloc((void**)&d_mem_2.rwFlag, sizeof(int));
cudaMemcpy(d_mem_2.rwFlag,
&flag,
sizeof(int),
cudaMemcpyHostToDevice);
Handle<<<1, threadsPerBlock>>>(d_mem_1, d_mem_2);
cudaMemcpy(inputdata.dev_data,
d_mem_2.dev_data,
size*sizeof(int),
cudaMemcpyDeviceToHost);
cudaMemcpy(inputdata.rwFlag,
d_mem_2.rwFlag,
sizeof(int),
cudaMemcpyDeviceToHost);
cudaFree(d_mem_1.dev_data);
cudaFree(d_mem_2.dev_data);
cudaFree(d_mem_1.rwFlag);
cudaFree(d_mem_2.rwFlag);
float elapseTime;
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapseTime, start, stop);
printf("Time to computation: %3.2f ms\n", elapseTime);
printf("The flag = %d\n", *(inputdata.rwFlag));
printf("The input = %d\n", inputdata.dev_data[10]);
return 0;
}