static int name( uint8_t *pix1, int i_stride_pix1, \
uint8_t *pix2, int i_stride_pix2 ) \
{ \
int i_sum = 0; \
for( int y = 0; y < ly; y++ ) \
{ \
for( int x = 0; x < lx; x++ ) \
{ \
i_sum += abs( pix1[x] - pix2[x] ); \
} \
pix1 += i_stride_pix1; \
pix2 += i_stride_pix2; \
} \
return i_sum; \
}
maple汇编
1386 mov w4, w1
1387 mov w1, #0
1388 mov w6, #0
1389 sxtw x7, w3
1390 sxtw x8, w4
1391 .L.4__4:
1392 mov w5, #0 <===== "mov x5, #0 x5为iv"
1393 .L.4__2:
1394 sxtw x3, w5 <====== "冗余"
1395 ldrb w4, [x0,x3]
1396 ldrb w3, [x2,x3]
1397 subs w3, w4, w3
1398 cneg w3, w3, MI
1399 add w1, w1, w3
1400 add w5, w5, #1
1401 cmp w5, #16
1402 blt .L.4__2
1403 .L.4__1:
1404 add x0, x0, x8
1405 add x2, x2, x7
1406 add w6, w6, #1
1407 cmp w6, #16
1408 blt .L.4__4
1409 .L.4__3:
1410 mov w0, w1
1411 .L.4__12:
1412 ret
maple IR
func &x264_pixel_sad_16x16 static (reg %7 <* u8>, reg %11 i32, reg %8 <* u8>, reg %10 i32) i32 {
1335 funcid 4
1336
1337 LOC 2 61
1338 regassign i32 %4 (constval i32 0)
1339 regassign i32 %5 (constval i32 0)
1340 regassign u64 %1 (cvt u64 i32 (regread i32 %10))
1341 regassign u64 %2 (cvt u64 i32 (regread i32 %11))
1342 @@4 regassign i32 %6 (constval i32 0) <====== "%6为 iv 可直接变为 i64"
1343 @@2 regassign u64 %3 (cvt u64 i32 (regread i32 %6)) <====== "冗余"
1344 regassign i32 %4 (add i32 (
1345 regread i32 %4,
1346 abs i32 (sub i32 (
1347 iread u32 <* u8> 0 (add u64 (regread ptr %7, regread u64 %3)),
1348 iread u32 <* u8> 0 (add u64 (regread ptr %8, regread u64 %3))))))
1349 regassign i32 %6 (add i32 (regread i32 %6, constval i32 1))
1350 brtrue @@2 (lt i32 i32 (regread i32 %6, constval i32 16))
1351 @@1 regassign ptr %7 (add u64 (regread ptr %7, regread u64 %2))
1352 regassign ptr %8 (add u64 (regread ptr %8, regread u64 %1))
1353 regassign i32 %5 (add i32 (regread i32 %5, constval i32 1))
1354 brtrue @@4 (lt i32 i32 (regread i32 %5, constval i32 16))
1355 @@3 return (regread i32 %4)
1356 }
gcc汇编
18: mov x5, #0x0 // #0
20: ldrb w4, [x7, x5]
ldrb w6, [x2, x5]
subs w4, w4, w6
cneg w4, w4, mi // mi = first
add w0, w0, w4
add x5, x5, #0x1
cmp x5, #0x10
b.ne 20
add x7, x7, x1
add x2, x2, x3
subs w8, w8, #0x1
b.ne 18
我们之前在FW已经讨论过,从spec角度看,这现在已经成为一个非常明显的性能问题。我们可能需要一个系统性的解决方案,来判定在什么情况下extension是不需要的。
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。
Looking at the code generated with OAC, the inner loop has the same number of instructions as GCC's, so the inner loop is not the issue; it is the outer loop that needs further optimization. This has to do with how Maple IR generates loop checks and performs induction-variable optimization. There are no extensions seen in the function.
.L.107__8:
mov w9, #0
mov w5, #0
cmp w9, w6
bge .L.107__5
.L.107__6:
ldrb w9, [x0,w5,SXTW]
ldrb w10, [x2,w5,SXTW]
add w5, w5, #1
subs w9, w9, w10
cneg w9, w9, MI
cmp w5, w6
add w1, w1, w9
blt .L.107__6
.L.107__5:
add w7, w7, #1
add x0, x0, x3
add x2, x2, x4
cmp w7, w8
blt .L.107__8
.L.107__7:
I can see the sxtw in the code generated from the test.mpl file. Removing this sxtw in mplcg would be extremely difficult: it is inside the loop, so its operand has multiple definition points. The definition in the loop body is the induction variable's +1 increment on every iteration. To get rid of the sxtw, mplcg would need to determine the maximum value of the IV.
The best solution is to prevent the cvt from reaching mplcg in the first place.
When looking at this problem, please pass -strengthreduction to mplme. When the cvt is applied to an IV, it cannot be moved out of the loop without strength reduction. This issue may disappear once strength reduction is turned on.
登录 后才可以发表评论