// Row filtering pass: each thread computes one element of one output row.
//
//   * Output columns x <  Width/2 accumulate FWidth taps weighted by LF[].
//   * Output columns x >= Width/2 accumulate FWidth taps weighted by HF[].
//
// Tap j of output column x samples texData on row y at column
//     (2*x + j + Width - preoffset) mod Width,
// so consecutive output columns start two input columns apart and the
// sampled column wraps around at Width.
// NOTE(review): LF/HF look like the low-/high-pass taps of a wavelet-style
// analysis step with periodic extension -- confirm against their definitions.
//
// Parameters:
//   f         - output buffer; element (x, y) is written to f[y*lwidth + x].
//   lwidth    - row stride of f, in elements.
//   Width     - number of output columns and wrap period of the sampled column.
//   FWidth    - number of taps read from LF[] / HF[].
//   preoffset - subtracted from the first sampled column of every output.
//
// Launch layout: 2D grid, one thread per output element. Threads with
// x >= Width exit without writing, so the grid may overshoot Width in x.
// There is no row-count parameter, so y is NOT bounds-checked: the grid's
// y extent must not exceed the number of rows allocated in f.
__global__ void convolutionRow(float* f, int lwidth, int Width, int FWidth, int preoffset)
{
    const int x = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
    const int y = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;

    // Grid-tail guard. It also guarantees Width >= 1 below, so the modulo
    // can never divide by zero.
    if (x >= Width)
        return;

    // +0.5f addresses the texel centre.
    const float iy = (float)y + 0.5f;

    // Reduce the first sampled column once instead of taking a modulo per tap
    // (integer % is expensive on the GPU). C++ '%' keeps the sign of the
    // dividend, so fold a negative remainder (preoffset > 2*x + Width) back
    // into [0, Width) to keep the wrap-around periodic.
    int m = (2 * x + Width - preoffset) % Width;
    if (m < 0)
        m += Width;

    float tmp = 0.0f;

    // The branch is hoisted outside the tap loop; only the warp that straddles
    // Width/2 diverges.
    if (x < Width / 2) {
        for (int j = 0; j < FWidth; j++) {
            tmp += tex2D(texData, (float)m + 0.5f, iy) * LF[j];
            // Advance to the next column; a compare-and-reset replaces the modulo.
            if (++m == Width)
                m = 0;
        }
    } else {
        for (int j = 0; j < FWidth; j++) {
            tmp += tex2D(texData, (float)m + 0.5f, iy) * HF[j];
            if (++m == Width)
                m = 0;
        }
    }

    f[y * lwidth + x] = tmp;

    // No block barrier is needed: the kernel uses no shared memory and threads
    // never exchange data.
}
这个函数还能继续优化吗?现在的执行效率并不高。