#pragma unroll为什么没有作用?

请问,在程序中用了#pragma unroll,想把for循环全部展开,但是编译器提示:“Advisory: Loop was not unrolled, inline assembly”,并且PTX文件中有:“// Loop body line 24, nesting depth: 1, iterations: 9”,好像并没有展开循环?但是,不是说“#pragma unroll后未指明任何数值时,如果循环次数固定,那么整个循环将完全展开”吗?另外,我的程序好像是直接编译成cubin文件的,为什么加上--keep之后中间文件里会有.ptx呢?

请提供源代码

// Round loop from the question: it runs 9 times (round = 1..9) and the pragma
// below requests an unroll factor of 9, i.e. full unrolling.
// `round` is only a trip counter; the body never reads it.
// NOTE(review): the shape resembles a table-lookup cipher round (e.g. an AES
// T-table round) -- confirm with the author.
#pragma unroll 9
for(int round=1; round < 10; round++)
{
// Step the round-key pointer forward by four elements for this iteration.
rk += 4;
// Four table lookups through the texture reference texRefdT, indexed by the
// top byte, the two middle bytes and the low byte of s[ins0..ins3] respectively.
// NOTE(review): "top byte" assumes s[] holds 32-bit unsigned words -- the PTX
// listing in this thread uses shr.u32; confirm against the declarations.
t = tex1Dfetch(texRefdT, (s[ins0] >> 24) );
t1 = tex1Dfetch(texRefdT, (s[ins1] >> 16) & 0xff);
// Fold t1 in, rotated right by 8 bits (under the same 32-bit unsigned assumption).
t ^= (t1 >> 8) | (t1 << 24);
t2 = tex1Dfetch(texRefdT, (s[ins2] >> 8 ) & 0xff);
// Fold t2 in, rotated by 16 bits.
t ^= (t2 >> 16) | (t2 << 16);
t3 = tex1Dfetch(texRefdT, (s[ins3] ) & 0xff);
// Fold t3 in, rotated right by 24 bits (equivalently left by 8).
t ^= (t3 >> 24) | (t3 << 8);
// Mix in the word rk currently points at.
t ^= (*rk);
// Store the result; the index threadBi is not modified inside the loop, so
// every iteration writes the same element.
shared[threadBi] = t;
}
生成的.ptx文件中,对应的循环体是:
// Excerpt of the compiler-generated PTX for the loop body only; the loop head
// label ($Lt_0_4610) and the back-edge branch are not part of this listing.
// The "iterations: 9" note below shows the loop is still emitted as a loop.
// Loop body line 24, nesting depth: 1, iterations: 9
// --- t = tex1Dfetch(texRefdT, s[ins0] >> 24) ---
// Load one 32-bit word from shared memory and keep its top byte as the index.
ld.shared.u32 %r41, [%r37+0];
shr.u32 %r42, %r41, 24;
// The fetch takes a 4-element coordinate vector; the three unused elements are zeroed.
mov.s32 %r43, 0;
mov.s32 %r44, 0;
mov.s32 %r45, 0;
// 1D fetch returning four components; only the first (%r46) is used afterwards.
tex.1d.v4.u32.s32 {%r46,%r47,%r48,%r49},[texRefdT,{%r42,%r43,%r44,%r45}];
// Part of loop body line 24, head labeled $Lt_0_4610
.loc 16 406 0
// %r50 = t
mov.s32 %r50, %r46;
// --- t1 = tex1Dfetch(texRefdT, (s[ins1] >> 16) & 0xff) ---
// shl 8 followed by shr.u32 24 isolates bits 16..23, i.e. (x >> 16) & 0xff.
ld.shared.u32 %r51, [%r38+0];
shl.b32 %r52, %r51, 8;
shr.u32 %r53, %r52, 24;
mov.s32 %r54, 0;
mov.s32 %r55, 0;
mov.s32 %r56, 0;
tex.1d.v4.u32.s32 {%r57,%r58,%r59,%r60},[texRefdT,{%r53,%r54,%r55,%r56}];
// Part of loop body line 24, head labeled $Lt_0_4610
.loc 16 407 0
// %r61 = t1
mov.s32 %r61, %r57;
// --- t2 = tex1Dfetch(texRefdT, (s[ins2] >> 8) & 0xff) ---
// shl 16 followed by shr.u32 24 isolates bits 8..15.
ld.shared.u32 %r62, [%r39+0];
shl.b32 %r63, %r62, 16;
shr.u32 %r64, %r63, 24;
mov.s32 %r65, 0;
mov.s32 %r66, 0;
mov.s32 %r67, 0;
tex.1d.v4.u32.s32 {%r68,%r69,%r70,%r71},[texRefdT,{%r64,%r65,%r66,%r67}];
// Part of loop body line 24, head labeled $Lt_0_4610
.loc 16 409 0
// %r72 = t2
mov.s32 %r72, %r68;
// --- t3 = tex1Dfetch(texRefdT, s[ins3] & 0xff) ---
// Low byte selected directly with a mask.
ld.shared.s32 %r73, [%r40+0];
and.b32 %r74, %r73, 255;
mov.s32 %r75, 0;
mov.s32 %r76, 0;
mov.s32 %r77, 0;
tex.1d.v4.u32.s32 {%r78,%r79,%r80,%r81},[texRefdT,{%r74,%r75,%r76,%r77}];
// Part of loop body line 24, head labeled $Lt_0_4610
.loc 16 411 0
// %r82 = t3
mov.s32 %r82, %r78;
.loc 16 414 0
// Load from constant memory at byte offset 16 from %r32; this matches
// "rk += 4; *rk" if rk points to 4-byte elements (TODO confirm the element type).
ld.const.u32 %r83, [%r32+16];
// %r86 = (t3 >> 24) | (t3 << 8)
shr.u32 %r84, %r82, 24;
shl.b32 %r85, %r82, 8;
or.b32 %r86, %r84, %r85;
// %r87 = *rk ^ rot(t3)
xor.b32 %r87, %r83, %r86;
// %r90 = (t2 >> 16) | (t2 << 16)
shr.u32 %r88, %r72, 16;
shl.b32 %r89, %r72, 16;
or.b32 %r90, %r88, %r89;
// %r93 = (t1 >> 8) | (t1 << 24)
shr.u32 %r91, %r61, 8;
shl.b32 %r92, %r61, 24;
or.b32 %r93, %r91, %r92;
// The XOR chain is combined in a different order than in the source
// (XOR is associative and commutative); %r96 is the final value of t.
xor.b32 %r94, %r50, %r93;
xor.b32 %r95, %r90, %r94;
xor.b32 %r96, %r87, %r95;
// shared[threadBi] = t
st.shared.u32 [%r13+0], %r96;
请问这样的代码是不是很不好呀?没有多少计算,却需要频繁地访存……
其中s是放在shared中的,rk指向constant,其它变量都是寄存器,我总担心生成的.ptx与我想象的不一样……

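你这计算已经相当多了。
关于nvcc所谓的编译器伪指令(如#pragma unroll)以及相应的Advisory提示,一般可以忽略。提示信息本身已经说明了没有展开的原因:循环体中含有内联汇编(inline assembly),这很可能来自tex1Dfetch的内部实现(有待确认)。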

[ 本帖最后由 yyfn风辰 于 2010-5-17 21:24 编辑 ]