1.4K Star 7.6K Fork 1.4K

GVP方舟编译器 / OpenArkCompiler

 / 详情

【spec性能分析】iv冗余的sxtw

待办的
成员
创建于  
2021-06-02 15:56
  • 源码:spec 525 pixel.c 函数:x264_pixel_sad_16x16
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            i_sum += abs( pix1[x] - pix2[x] );      \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

maple汇编

1386   mov w4, w1
1387   mov w1, #0
1388   mov w6, #0
1389   sxtw  x7, w3
1390   sxtw  x8, w4
1391 .L.4__4:
1392   mov w5, #0   <===== "mov x5, #0  x5为iv"
1393 .L.4__2:
1394   sxtw  x3, w5   <====== "冗余"
1395   ldrb  w4, [x0,x3]
1396   ldrb  w3, [x2,x3]
1397   subs  w3, w4, w3
1398   cneg  w3, w3, MI
1399   add w1, w1, w3
1400   add w5, w5, #1
1401   cmp w5, #16
1402   blt .L.4__2
1403 .L.4__1:
1404   add x0, x0, x8
1405   add x2, x2, x7
1406   add w6, w6, #1
1407   cmp w6, #16
1408   blt .L.4__4
1409 .L.4__3:
1410   mov w0, w1
1411 .L.4__12:
1412   ret

maple IR

func &x264_pixel_sad_16x16 static (reg %7 <* u8>, reg %11 i32, reg %8 <* u8>, reg %10 i32) i32 {
1335   funcid 4
1336
1337 LOC 2 61
1338   regassign i32 %4 (constval i32 0)
1339   regassign i32 %5 (constval i32 0)
1340   regassign u64 %1 (cvt u64 i32 (regread i32 %10))
1341   regassign u64 %2 (cvt u64 i32 (regread i32 %11))
1342 @@4   regassign i32 %6 (constval i32 0)   <====== "%6为 iv 可直接变为 i64"
1343 @@2   regassign u64 %3 (cvt u64 i32 (regread i32 %6))    <====== "冗余"
1344   regassign i32 %4 (add i32 (
1345       regread i32 %4,
1346       abs i32 (sub i32 (
1347         iread u32 <* u8> 0 (add u64 (regread ptr %7, regread u64 %3)),
1348         iread u32 <* u8> 0 (add u64 (regread ptr %8, regread u64 %3))))))
1349   regassign i32 %6 (add i32 (regread i32 %6, constval i32 1))
1350   brtrue @@2 (lt i32 i32 (regread i32 %6, constval i32 16))
1351 @@1   regassign ptr %7 (add u64 (regread ptr %7, regread u64 %2))
1352   regassign ptr %8 (add u64 (regread ptr %8, regread u64 %1))
1353   regassign i32 %5 (add i32 (regread i32 %5, constval i32 1))
1354   brtrue @@4 (lt i32 i32 (regread i32 %5, constval i32 16))
1355 @@3   return (regread i32 %4)
1356 }

gcc汇编

18:   mov    x5, #0x0                        // #0
20:   ldrb   w4, [x7, x5]
      ldrb   w6, [x2, x5]
      subs   w4, w4, w6
      cneg   w4, w4, mi  // mi = first
      add    w0, w0, w4
      add    x5, x5, #0x1
      cmp    x5, #0x10
      b.ne   20
      add    x7, x7, x1
      add    x2, x2, x3
      subs   w8, w8, #0x1
      b.ne   18

评论 (4)

Leo Young 创建了任务
yi_jiang 负责人设置为williambillchen
yi_jiang 添加协作者Alfred Huang
yi_jiang 优先级设置为主要
展开全部操作日志

我们之前在FW已经讨论过,从spec角度看现在已经成为一个非常明显的性能问题,我们可能需要一个系统的解决方案,什么情况下extension是不需要的。

Looking at the code generated with OAC, the inner loop has the same number of instructions, but the outer loop needs further optimization. This has to do with how Maple IR generates the loop checks and how induction variable optimization is applied. So the inner loop is not an issue; the outer loop is what needs to be optimized. No extensions are seen in the function.
.L.107__8:
mov w9, #0
mov w5, #0
cmp w9, w6
bge .L.107__5
.L.107__6:
ldrb w9, [x0,w5,SXTW]
ldrb w10, [x2,w5,SXTW]
add w5, w5, #1
subs w9, w9, w10
cneg w9, w9, MI
cmp w5, w6
add w1, w1, w9
blt .L.107__6
.L.107__5:
add w7, w7, #1
add x0, x0, x3
add x2, x2, x4
cmp w7, w8
blt .L.107__8
.L.107__7:

I can see the sxtw from the test.mpl file. Removing this sxtw in mplcg would be extremely difficult: it is inside the loop, so its operand has multiple definition points. The definition in the loop body is the induction variable's +1 increment on every iteration. To get rid of the sxtw, mplcg would need to determine the maximum value of the IV.
The best solution is not to have the convert going into mplcg.

When looking at this problem, please pass -strengthreduction to mplme. When the cvt is applied to an IV, it cannot be moved out of the loop without strength reduction. This issue may disappear once strength reduction is turned on.

登录 后才可以发表评论

状态
负责人
里程碑
Pull Requests
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
开始日期   -   截止日期
-
置顶选项
优先级
参与者(5)
C++
1
https://gitee.com/openarkcompiler/OpenArkCompiler.git
git@gitee.com:openarkcompiler/OpenArkCompiler.git
openarkcompiler
OpenArkCompiler
OpenArkCompiler

搜索帮助