#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
// Input matrices A (dev_a) and B (dev_b), stored row-major in device constant
// memory. 144 = 12 * 12, the matrix dimension used by main(). Because these are
// __constant__ symbols rather than cudaMalloc'd buffers, the host must fill
// them with cudaMemcpyToSymbol (plain cudaMemcpy does not accept a symbol) and
// must never pass them to cudaFree.
__constant__ int dev_a[144], dev_b[144];
// Computes C = A * B for square, row-major int matrices, one output element
// per thread. A and B are read from the constant-memory arrays dev_a / dev_b.
//
// Parameters:
//   dev_c      - device pointer to the size*size output matrix (row-major).
//   size       - matrix dimension; size*size must not exceed the capacity of
//                dev_a / dev_b.
//   TILE_WIDTH - edge length of the square thread block; it is used as the
//                block stride when deriving Row/Col, so it must equal
//                blockDim.x and blockDim.y of the launch.
//
// Launch layout: 2D grid of TILE_WIDTH x TILE_WIDTH blocks covering the
// size x size output. Threads that fall outside the matrix do nothing.
__global__ void addKernel(int *dev_c, int size, int TILE_WIDTH)
{
    int Row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int Col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    // Guard the grid tail: when size is not a multiple of TILE_WIDTH the last
    // blocks overshoot the matrix and must not read or write out of bounds.
    if (Row >= size || Col >= size)
    {
        return;
    }

    // Dot product of row `Row` of A with column `Col` of B.
    int pvalue = 0;
    for (int k = 0; k < size; ++k)
    {
        int Mdelement = dev_a[Row * size + k];
        int Ndelement = dev_b[k * size + Col];
        pvalue += Mdelement * Ndelement;
    }
    dev_c[Row * size + Col] = pvalue;
}
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess. Unchecked failures otherwise surface later
// as mysteriously wrong (e.g. all-zero) results.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckErr_));                    \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Multiplies two 12x12 int matrices on the GPU and prints the product, one
// matrix row per output line. The inputs are uploaded into the constant-memory
// arrays dev_a / dev_b; the result is produced in a cudaMalloc'd buffer.
// Returns EXIT_SUCCESS, or exits with EXIT_FAILURE on the first CUDA error.
int main()
{
    const int arraySize = 12;
    const int elementCount = arraySize * arraySize;
    const size_t matrixBytes = elementCount * sizeof(int);

    // A: every row is 1..12.
    const int a[arraySize * arraySize] = {
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};

    // B: every row is 10..120 in steps of 10.
    const int b[arraySize * arraySize] = {
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120,
        10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120};

    int c[arraySize * arraySize] = { 1 };
    int *dev_c = NULL;
    int TILE_WIDTH = 4;

    CUDA_CHECK(cudaMalloc((void**)&dev_c, matrixBytes));

    // dev_a / dev_b are constant-memory symbols, not ordinary device pointers.
    // A plain cudaMemcpy to them fails (and, unchecked, leaves them zeroed),
    // which is why the inputs appeared to be all 0. Symbols must be written
    // with cudaMemcpyToSymbol.
    CUDA_CHECK(cudaMemcpyToSymbol(dev_a, a, matrixBytes));
    CUDA_CHECK(cudaMemcpyToSymbol(dev_b, b, matrixBytes));

    // Ceil-divide so the grid covers the whole matrix; this equals
    // arraySize / TILE_WIDTH when the size is a multiple of the tile width,
    // as it is here (12 / 4 = 3).
    const int tilesPerSide = (arraySize + TILE_WIDTH - 1) / TILE_WIDTH;
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid(tilesPerSide, tilesPerSide);
    addKernel<<<dimGrid, dimBlock>>>(dev_c, arraySize, TILE_WIDTH);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors

    // Blocking copy: also waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(c, dev_c, matrixBytes, cudaMemcpyDeviceToHost));

    // Sanity check that the upload of A really reached constant memory. The
    // read-back goes into its own buffer (so the product in c is not
    // overwritten) and uses cudaMemcpyFromSymbol, the symbol counterpart of a
    // device-to-host cudaMemcpy.
    int aReadback[arraySize * arraySize];
    CUDA_CHECK(cudaMemcpyFromSymbol(aReadback, dev_a, matrixBytes));
    for (int i = 0; i < elementCount; ++i)
    {
        if (aReadback[i] != a[i])
        {
            fprintf(stderr, "dev_a read-back mismatch at index %d\n", i);
            break;
        }
    }

    // Print the product, starting a new line after each matrix row.
    for (int i = 0; i < elementCount; ++i)
    {
        printf("%6d", c[i]);
        if ((i + 1) % arraySize == 0)
        {
            printf("\n");
        }
    }

    // Only dev_c came from cudaMalloc; the __constant__ arrays are statically
    // allocated and must not be passed to cudaFree.
    CUDA_CHECK(cudaFree(dev_c));
    return EXIT_SUCCESS;
}
// (forum-markup residue "[/i]" removed; it was not part of the program)