代码如下
__global__ void BBBBBBBBBBBBBBBBBB(cufftComplex* Rx, cufftComplex* steerVector, float* sound_field ,float* sound_central)
{
unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
unsigned int j = threadIdx.y + blockDim.y * blockIdx.y;
int width = blockDim.x * gridDim.x;
int index = j * (width) + i;
float distance = 0.0f;
cufftComplex* steer_vector = steerVector + index * ARRAY_CHANNEL_NUM;
// cufftComplex steer_vector_conj[ARRAY_CHANNEL_NUM];
cufftComplex temp_vector[ARRAY_CHANNEL_NUM];
cufftComplex Rx_vector[ARRAY_CHANNEL_NUM];
cufftComplex soundcomplex{.0f, .0f};
for (int k1 = 0; k1 < ARRAY_CHANNEL_NUM; k1++)
{
Rx_vector[k1].x = 0.0f;
Rx_vector[k1].y = 0.0f;
for (int k2 = 0; k2 < ARRAY_CHANNEL_NUM; k2++)
{
Rx_vector[k2].x = Rx[ARRAY_CHANNEL_NUM * k2 + k1].x;
Rx_vector[k2].y = Rx[ARRAY_CHANNEL_NUM * k2 + k1].y;
}
temp_vector[k1].x = 0.0f;
temp_vector[k1].y = 0.0f;
for (int k3 = 0; k3 < ARRAY_CHANNEL_NUM; k3++)
{
// temp_vector[k1].x += steer_vector_conj[k3].x * Rx_vector[k3].x - steer_vector_conj[k3].y * Rx_vector[k3].y;
// temp_vector[k1].y += steer_vector_conj[k3].x * Rx_vector[k3].y + steer_vector_conj[k3].y * Rx_vector[k3].x;
temp_vector[k1].x += steer_vector[k3].x * Rx_vector[k3].x - (-steer_vector[k3].y) * Rx_vector[k3].y;
temp_vector[k1].y += steer_vector[k3].x * (-Rx_vector[k3].y) + (-steer_vector[k3].y) * Rx_vector[k3].x;
}
soundcomplex.x += temp_vector[k1].x * steer_vector[k1].x - temp_vector[k1].y * steer_vector[k1].y;
soundcomplex.y += temp_vector[k1].x * steer_vector[k1].y + temp_vector[k1].y * steer_vector[k1].x;
}
sound_field[j * 640 + i] = 0.0f;
int k = 0;
// 不注释需要94个寄存器, 注释掉只要11个寄存器
// sound_field[j * 640 + i] = temp_vector[k].x * steer_vector[k].y + temp_vector[k].y * steer_vector[k].x;
}
操作系统Windows10
cuda10.2
编译参数
nvcc.exe -D___CUDACC__ --use_fast_math -lcuda -lcudadevrt -lcudart -lcufft -lcublas --machine 64 -arch=compute_61 -code=sm_61 --ptxas-options=-v --compile -cudart static -D_MBCS -Xcompiler /wd4819,/EHsc,/W3,/nologo,/O2,/Zi -Xcompiler /MD -c -o ...