2D线性内存数据访问error

#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include <cuda.h>
#include <cutil_inline.h>
#include <cuda_runtime.h>

global void calculate(int *d_data, int pitch, int row, int col)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

if ( x < col && y < row )
{

	int * d = ((int*)((char*)d_data+y*pitch)+x);
	*d = x+y;
}

}

int main()
{
int row = 16;
int col = 8;
int *h_data;

int mem_size = row*col*sizeof(int);

cutilSafeCall( cudaMallocHost((void**)&h_data, mem_size) );



int *d_data;
size_t pitch;
cutilSafeCall( cudaMallocPitch((void**)&d_data, &pitch, col*sizeof(int), row) );
	
dim3 block(col, row);
dim3 grid(1, 1);

calculate<<<grid, block>>>(d_data, pitch, row, col);

cutilSafeCall( cudaMemcpy(h_data, d_data, pitch*row, cudaMemcpyDeviceToHost) );
//cutilSafeCall( cudaMemcpy2D(h_data, sizeof(int)*col, d_data, pitch, col, row, cudaMemcpyDeviceToHost) );

for(int i = 0; i < row; i++)
{
	for(int j = 0 ; j < col; j++)
	{
		printf("%d  ", h_data[i*col+j]);
	}
	printf("\n");
}

cutilSafeCall( cudaFree(d_data) );
cutilSafeCall( cudaFreeHost(h_data) );


return 0;

}

能不能更详细点,如那些地方出问题了,问题是什么?

这段程序你可以跑跑,结果是错误的,对应的位置的数据并不是i+j