// Hand-written matrix multiplication
#include<stdio.h>
#include<stdlib.h>
#include"cuda_runtime.h"
#define N 10 // matrix dimension: the program computes c = a * b on N x N matrices

// Kernel that writes the product of two n x n matrices into dev_c.
// All three pointers are indexed as flat arrays of n * n floats
// (element (row, col) lives at row * n + col). Defined later in this file.
__global__ void mult(float *dev_a, float *dev_b, float *dev_c, int n);
// Reports a failed CUDA runtime call and terminates the program.
// `what` names the step being checked so the message identifies the call site.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two N x N matrices, multiplies them on the device with
// the `mult` kernel, prints the N x N result, and releases all buffers.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or CUDA call fails.
int main(void)
{
    const size_t bytes = sizeof(float) * N * N; // size of one N x N float matrix
    int i, j;
    float *a, *b, *c;             // host matrices
    float *dev_a, *dev_b, *dev_c; // device matrices

    a = (float *)malloc(bytes);
    b = (float *)malloc(bytes);
    c = (float *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a); // free(NULL) is a no-op, so this is safe for whichever ones failed
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

    // Every element of row i holds i in a and i*i in b.
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
        {
            a[i * N + j] = (float)i;
            b[i * N + j] = (float)(i * i);
        }

    // Upload the two inputs; dev_c is write-only for the kernel and needs no upload.
    checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // One block with one thread per output element. This requires N * N to
    // stay within the device's threads-per-block limit.
    dim3 blocks(1, 1);
    dim3 threads(N, N);
    mult<<<blocks, threads>>>(dev_a, dev_b, dev_c, N);
    checkCuda(cudaGetLastError(), "mult kernel launch");        // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "mult kernel execution"); // faults inside the kernel

    // c is a host buffer, so the transfer direction is device -> host.
    checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%f\t ", c[i * N + j]);
        printf("\n");
    }

    // Runs the shell command "pause" so the output stays visible before exit.
    system("pause");

    free(a);
    free(b);
    free(c);
    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");
    return 0;
}
// Computes dev_c = dev_a * dev_b for n x n matrices stored as flat arrays
// where element (row, col) is at index row * n + col.
//
// Launch layout: 2D grid of 2D blocks; each thread produces one output
// element, with x selecting the column and y selecting the row. The launch
// must cover at least n threads in each dimension; surplus threads do nothing.
// Uses no shared memory.
//
// dev_a, dev_b: input matrices (read only here), n * n floats each.
// dev_c:        output matrix, n * n floats; every element is overwritten.
// n:            matrix dimension.
__global__ void mult(float *dev_a, float *dev_b, float *dev_c, int n)
{
    // Including the block offset keeps the index correct for multi-block
    // launches; for a single-block launch it reduces to threadIdx.
    int x = blockIdx.x * blockDim.x + threadIdx.x; // output column
    int y = blockIdx.y * blockDim.y + threadIdx.y; // output row

    // Threads outside the matrix must not touch memory.
    if (x >= n || y >= n)
        return;

    // Dot product of row y of dev_a with column x of dev_b. The bound is the
    // runtime dimension n, so the kernel is correct for any size passed in.
    float temp = 0.0f;
    for (int i = 0; i < n; i++)
        temp += dev_a[y * n + i] * dev_b[i * n + x];

    dev_c[y * n + x] = temp;
}