写了一个小的cu测试程序,计算16x16像素块对应的sad值(对应像素相减的绝对值和),用nsight运行发现核函数占用了不少的寄存器,代码如下
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define SIZE 16
typedef unsigned char pixel;
__device__ int x264_pixel_sad_16x16(pixel *pix1, pixel *pix2, int i_stride_pix )
{
int i_sum = 0;
int y;
int r1 = ((int)pix1) & 7;
int r2 = ((int)pix2) & 7;
for (y = 0; y < 16; y++)
{
__int64 p1 = *((__int64*)(pix1));
__int64 p2 = *((__int64*)(pix1+8));
__int64 p3 = 0;
__int64 q1 = *((__int64*)(pix2));
__int64 q2 = *((__int64*)(pix2+8));
__int64 q3 = 0;
if (r1 != 0) p3 = *((__int64*)(pix1-r1+16));
if (r2 != 0) q3 = *((__int64*)(pix2-r2+16));
p1 = (p1>>r1*8 & ((__int64)1<<(64-r1*8))-1) | (p2&(((__int64)1<<r1*8)-1))<<(64-r1*8);
p2 = (p2>>r1*8 & ((__int64)1<<(64-r1*8))-1) | (p3&(((__int64)1<<r1*8)-1))<<(64-r1*8);
q1 = (q1>>r2*8 & ((__int64)1<<(64-r2*8))-1) | (q2&(((__int64)1<<r2*8)-1))<<(64-r2*8);
q2 = (q2>>r2*8 & ((__int64)1<<(64-r2*8))-1) | (q3&(((__int64)1<<r2*8)-1))<<(64-r2*8);
i_sum += abs((int)(p1 &0xff) - (int)(q1 &0xff));
i_sum += abs((int)(p1>>8 &0xff) - (int)(q1>>8 &0xff));
i_sum += abs((int)(p1>>16&0xff) - (int)(q1>>16&0xff));
i_sum += abs((int)(p1>>24&0xff) - (int)(q1>>24&0xff));
i_sum += abs((int)(p1>>32&0xff) - (int)(q1>>32&0xff));
i_sum += abs((int)(p1>>40&0xff) - (int)(q1>>40&0xff));
i_sum += abs((int)(p1>>48&0xff) - (int)(q1>>48&0xff));
i_sum += abs((int)(p1>>56&0xff) - (int)(q1>>56&0xff));
i_sum += abs((int)(p2 &0xff) - (int)(q2 &0xff));
i_sum += abs((int)(p2>>8 &0xff) - (int)(q2>>8 &0xff));
i_sum += abs((int)(p2>>16&0xff) - (int)(q2>>16&0xff));
i_sum += abs((int)(p2>>24&0xff) - (int)(q2>>24&0xff));
i_sum += abs((int)(p2>>32&0xff) - (int)(q2>>32&0xff));
i_sum += abs((int)(p2>>40&0xff) - (int)(q2>>40&0xff));
i_sum += abs((int)(p2>>48&0xff) - (int)(q2>>48&0xff));
i_sum += abs((int)(p2>>56&0xff) - (int)(q2>>56&0xff));
pix1 += i_stride_pix;
pix2 += i_stride_pix;
}
return i_sum;
}
__device__ int x264_pixel_sad_16x16_L( pixel *pix1, pixel *pix2, int i_stride )
{
int i_sum = 0;
int y;
int r1 = ((int)pix1) & 3;
int r2 = ((int)pix2) & 3;
for (y = 0; y < 16; y++)
{
int p1 = *((int*)(pix1-r1));
int p2 = *((int*)(pix1-r1+4));
int p3 = *((int*)(pix1-r1+8));
int p4 = *((int*)(pix1-r1+12));
int p5 = *((int*)(pix1-r1+16));
int q1 = *((int*)(pix2-r2));
int q2 = *((int*)(pix2-r2+4));
int q3 = *((int*)(pix2-r2+8));
int q4 = *((int*)(pix2-r2+12));
int q5 = *((int*)(pix2-r2+16));
p1 = (p1>>r1*8 & (1<<(32-r1*8))-1) | (p2&((1<<r1*8)-1))<<(32-r1*8);
p2 = (p2>>r1*8 & (1<<(32-r1*8))-1) | (p3&((1<<r1*8)-1))<<(32-r1*8);
p3 = (p3>>r1*8 & (1<<(32-r1*8))-1) | (p4&((1<<r1*8)-1))<<(32-r1*8);
p4 = (p4>>r1*8 & (1<<(32-r1*8))-1) | (p5&((1<<r1*8)-1))<<(32-r1*8);
q1 = (q1>>r2*8 & (1<<(32-r2*8))-1) | (q2&((1<<r2*8)-1))<<(32-r2*8);
q2 = (q2>>r2*8 & (1<<(32-r2*8))-1) | (q3&((1<<r2*8)-1))<<(32-r2*8);
q3 = (q3>>r2*8 & (1<<(32-r2*8))-1) | (q4&((1<<r2*8)-1))<<(32-r2*8);
q4 = (q4>>r2*8 & (1<<(32-r2*8))-1) | (q5&((1<<r2*8)-1))<<(32-r2*8);
i_sum += abs((p1 &0xff) - (q1 &0xff));
i_sum += abs((p1>>8 &0xff) - (q1>>8 &0xff));
i_sum += abs((p1>>16&0xff) - (q1>>16&0xff));
i_sum += abs((p1>>24&0xff) - (q1>>24&0xff));
i_sum += abs((p2 &0xff) - (q2 &0xff));
i_sum += abs((p2>>8 &0xff) - (q2>>8 &0xff));
i_sum += abs((p2>>16&0xff) - (q2>>16&0xff));
i_sum += abs((p2>>24&0xff) - (q2>>24&0xff));
i_sum += abs((p3 &0xff) - (q3 &0xff));
i_sum += abs((p3>>8 &0xff) - (q3>>8 &0xff));
i_sum += abs((p3>>16&0xff) - (q3>>16&0xff));
i_sum += abs((p3>>24&0xff) - (q3>>24&0xff));
i_sum += abs((p4 &0xff) - (q4 &0xff));
i_sum += abs((p4>>8 &0xff) - (q4>>8 &0xff));
i_sum += abs((p4>>16&0xff) - (q4>>16&0xff));
i_sum += abs((p4>>24&0xff) - (q4>>24&0xff));
pix1 += i_stride;
pix2 += i_stride;
}
return i_sum;
}
__device__ int x264_pixel_sad_16x16_LL( pixel *pix1, pixel *pix2, int i_stride_pix )
{
int i_sum = 0;
int x, y;
for( y = 0; y < 16; y++ )
{
for( x = 0; x < 16; x++ )
{
i_sum += abs( pix1[x] - pix2[x] );
}
pix1 += i_stride_pix;
pix2 += i_stride_pix;
}
return i_sum;
}
__global__ void cal_sad(int *sad, pixel * fenc, pixel * fref, int w, int h)
{
int id = blockIdx.x*w+threadIdx.x;
int offset = blockIdx.x*w*16*16 + threadIdx.x*16;
sad[id] = x264_pixel_sad_16x16(fenc+offset, fref+offset, w*16);
}
__global__ void cal_sad_L(int *sad, pixel * fenc, pixel * fref, int w, int h)
{
int id = blockIdx.x*w+threadIdx.x;
int offset = blockIdx.x*w*16*16 + threadIdx.x*16;
sad[id] = x264_pixel_sad_16x16_L(fenc+offset, fref+offset, w*16);
}
__global__ void cal_sad_LL(int *sad, pixel * fenc, pixel * fref, int w, int h)
{
int id = blockIdx.x*w+threadIdx.x;
int offset = blockIdx.x*w*16*16 + threadIdx.x*16;
sad[id] = x264_pixel_sad_16x16_LL(fenc+offset, fref+offset, w*16);
}
int main()
{
cudaError_t err;
int w = 256, h = 64;
pixel* fenc_c, *fref_c;
pixel* fenc_g, *fref_g;
int *sad_c, *sad_g, *sad_s;
int size = w*h*SIZE*SIZE;
cudaHostAlloc(&fenc_c, size, cudaHostAllocDefault);
cudaHostAlloc(&fref_c, size, cudaHostAllocDefault);
cudaHostAlloc(&sad_c, w*h*sizeof(int), cudaHostAllocDefault);
cudaHostAlloc(&sad_s, w*h*sizeof(int), cudaHostAllocDefault);
cudaMalloc(&fenc_g, size);
cudaMalloc(&fref_g, size);
cudaMalloc(&sad_g, w*h*sizeof(int));
int i, j, k;
for (i = 0; i < size; i++)
{
fenc_c[i] = rand();
fref_c[i] = rand();
}
cudaMemcpy(fenc_g, fenc_c, size, cudaMemcpyHostToDevice);
cudaMemcpy(fref_g, fref_c, size, cudaMemcpyHostToDevice);
cal_sad_LL<<< h, w >>>(sad_g, fenc_g, fref_g, w, h);
cal_sad_L<<< h, w >>>(sad_g, fenc_g, fref_g, w, h);
cal_sad<<< h, w >>>(sad_g, fenc_g, fref_g, w, h);
cudaMemcpy(sad_c, sad_g, w*h*sizeof(int), cudaMemcpyDeviceToHost);
for (i = 0; i < w*h; i++)
{
int offset = (i>>8)*w*16*16 + (i&0xff)*16;
sad_s[i] = 0;
for (j = 0; j < 16; j++)
for (k = 0; k < 16; k++)
sad_s[i] += abs((int)fenc_c[offset+j*256*16+k] - (int)fref_c[offset+j*256*16+k]);
}
for (i = 0; i < w*h; i++)
if (sad_c[i] != sad_s[i])
break;
if (i < w*h)
printf("%d diff\n", i);
else
printf("same\n");
err = cudaDeviceReset();
return 0;
}
这里直接写出一些nsight给出的值
[attach]3170[/attach]
cal_sad {64, 1, 1} {256, 1, 1} 239.530(us) 75.00% 35
cal_sad_L {64, 1, 1} {256, 1, 1}, 365.536(us) 75.00% 39
cal_sad_LL {64, 1, 1} {256, 1, 1}, 1137.984(us) 100.00% 25
最后一列是每个线程的寄存器数量,倒数第三列是执行时间
程序里的 cal_sad是每次都8个字节的方式、cal_sad_L是每次读4个字节的方式,r1,r2是为了让都值时的对齐,这里因为数据比较整齐都为0,很多时候不为0。
环境
cuda sdk5.0 + vs2010 + nsight3.0 + gtx650ti
我的疑问是代码比较简单,不应该能占用那么多的寄存器。
- 每个线程占用寄存器数量比较多,是什么原因造成的?
- 有什么办法能控制每个线程的寄存器数吗?(不是设置一个上限,那样可能会影响效率)寄存器这块怎样的选择性能比较好。
- 现在学cuda编程,ptx有必要学吗?