#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include <cuda.h>
#include <cutil_inline.h>
#include <cuda_runtime.h>
/*
 * Kernel: writes x + y into element (x, y) of a pitched 2D int array.
 *
 * d_data - base pointer of the 2D allocation
 * pitch  - distance between the starts of consecutive rows, in BYTES
 *          (the row address is computed with char* arithmetic below)
 * row    - number of rows to fill
 * col    - number of int elements to fill in each row
 *
 * Each thread handles the single element addressed by its global (x, y)
 * index; threads whose index falls outside col x row do nothing.
 */
__global__ void calculate(int *d_data, int pitch, int row, int col)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if ( x < col && y < row )
    {
        // Step to the start of row y in bytes (rows may be padded), then
        // index the x-th int inside that row.
        int *d = (int*)((char*)d_data + (size_t)y * pitch) + x;
        *d = x + y;
    }
}
/*
 * Host driver: allocates a row x col int matrix in pitched device memory,
 * launches `calculate` to fill it, copies the result back into a tightly
 * packed pinned host buffer and prints it row by row.
 *
 * Returns 0 on success; CUDA API failures are reported by cutilSafeCall.
 */
int main()
{
    const int row = 16;
    const int col = 8;
    // Number of meaningful bytes in one row (without any pitch padding).
    const size_t row_bytes = col * sizeof(int);

    int *h_data;
    cutilSafeCall( cudaMallocHost((void**)&h_data, row_bytes * row) );

    int *d_data;
    size_t pitch;
    cutilSafeCall( cudaMallocPitch((void**)&d_data, &pitch, row_bytes, row) );

    // One thread per matrix element, all in a single block.
    dim3 block(col, row);
    dim3 grid(1, 1);
    calculate<<<grid, block>>>(d_data, (int)pitch, row, col);
    // A kernel launch returns no status itself; pick up launch errors here.
    cutilSafeCall( cudaGetLastError() );

    // Device rows are `pitch` bytes apart while host rows are packed at
    // `row_bytes`. A flat cudaMemcpy of pitch*row bytes would overrun h_data
    // and scramble the layout, so copy row-wise with cudaMemcpy2D. The width
    // argument is in BYTES, not elements.
    cutilSafeCall( cudaMemcpy2D(h_data, row_bytes, d_data, pitch,
                                row_bytes, row, cudaMemcpyDeviceToHost) );

    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%d ", h_data[i*col+j]);
        }
        printf("\n");
    }

    cutilSafeCall( cudaFree(d_data) );
    cutilSafeCall( cudaFreeHost(h_data) );
    return 0;
}