[CANN 5.0.3.alpha002][TIK Compiler] Code generation error

## Problem description

The TIK compiler generates incorrect code in the following case.

```python3
import te.tik as tik

def reshape(inst, im_in_ub, im_in_reshaped_ub, dim):
  """Scatter every second row of `im_in_ub` into `im_in_reshaped_ub`.

  Rows are handled in groups of eight: one `scatter_vector_mov` with mask 128
  is emitted per full group (inside a TIK `for_range` loop), followed by one
  more `scatter_vector_mov` with mask `tail * 16` for the remaining
  `dim % 8` rows.  The two calls therefore request different masks whenever
  `dim` is not a multiple of 8.

  Args:
    inst: TIK instance used to emit the loop and the vector instructions.
    im_in_ub: Source tensor, indexed as [row, 0].  Assumed to have shape
      (2*dim + 2, 16).
    im_in_reshaped_ub: Destination tensor, indexed as [0, row, 0].
    dim: Python int (it is used in `range()` and in Python-level `if`s).
  """
  # Full groups of 8 rows.  The guard avoids emitting an empty for_range.
  if dim // 8 > 0:
    with inst.for_range(0, dim // 8) as i_rw:
      src_list = [
          im_in_ub[2 * i_scatter + i_rw * 8 * 2, 0] for i_scatter in range(8)
      ]
      dst_list = [
          im_in_reshaped_ub[0, 8 * i_rw + i_scatter, 0]
          for i_scatter in range(8)
      ]
      inst.scatter_vector_mov(128, dst_list, src_list, 4, (16 * dim) // 16,
                              16 // 16)

  # Remaining rows (fewer than 8), addressed right after the last full group.
  tail = dim % 8
  if tail > 0:
    i_rw = dim // 8
    src_list = [
        im_in_ub[2 * i_scatter + i_rw * 8 * 2, 0] for i_scatter in range(tail)
    ]
    dst_list = [
        im_in_reshaped_ub[0, 8 * i_rw + i_scatter, 0]
        for i_scatter in range(tail)
    ]
    inst.scatter_vector_mov(tail * 16, dst_list, src_list, 4, (16 * dim) // 16,
                            16 // 16)

# Reproducer driver: load the source into UB, run `reshape` ten times inside
# a TIK loop, store the reshaped buffer, then build the kernel and keep the
# generated CCE file.
tik_instance = tik.Tik(tik.Dprofile('v100', 'cloud'))

dim = 12

# Tensor shapes used below.
src_shape = (2 * dim + 2, 16)
src_reshaped_ub_shape = (4, dim, 16)
dst_shape = (4, dim, 16)

# Global-memory tensors (kernel input / output).
src_gm = tik_instance.Tensor('float16', src_shape, tik.scope_gm, 'src_gm')
dst_gm = tik_instance.Tensor('float16', dst_shape, tik.scope_gm, 'dst_gm')

# Unified-buffer tensors (working copies).
src_ub = tik_instance.Tensor('float16', src_shape, tik.scope_ubuf, 'src_ub')
src_reshaped_ub = tik_instance.Tensor('float16', src_reshaped_ub_shape,
                                      tik.scope_ubuf, 'reshaped_ub')

# Burst lengths passed to data_move: 26 for the load, 48 for the store.
load_burst = (2 * dim + 2) * 16 // 16
store_burst = (4 * dim) * 16 // 16

tik_instance.data_move(src_ub, src_gm, 0, 1, load_burst, 0, 0)
with tik_instance.for_range(0, 10):
  reshape(tik_instance, src_ub, src_reshaped_ub, dim)
tik_instance.data_move(dst_gm, src_reshaped_ub, 0, 1, store_burst, 0, 0)

# Keep the intermediate CCE file so the generated code can be inspected.
build_config = {'save_temp_cce_file': True}
tik_instance.BuildCCE(kernel_name='test',
                      inputs=[src_gm],
                      outputs=[dst_gm],
                      config=build_config)
```

Specifically, this code works in the simulator but fails on the device.
Looking at the generated CCE code, the vector mask is set only once, by the instruction `set_vector_mask(0x0, 0xffffffffffffffff)` emitted before the loop. However, the two vector instructions require **two different masks**: the first `scatter_vector_mov` is called with mask `128`, the second with mask `tail * 16 = 64`.
The generated CCE code is shown below.

```
#ifdef __CCE_KT_TEST__
#define __aicore__ 
#else
#define __aicore__ [aicore]
#endif

// Generated kernel for the TIK reproducer: copies src_gm into src_ub, issues
// two scatter_vector_mov_f16 instructions per iteration of a 10-iteration
// loop, then copies reshaped_ub out to dst_gm.
// Comments marked NOTE(review) are reviewer annotations, not compiler output.
extern "C" __global__ __aicore__ void test__kernel0(__gm__ uint8_t* __restrict__ src_gm, __gm__ uint8_t* __restrict__ dst_gm) {
set_atomic_none();
__ubuf__   uint8_t* src_ub = (__ubuf__  uint8_t *)get_imm(0);
  __ubuf__   uint8_t* reshaped_ub = (__ubuf__  uint8_t *)get_imm(832);
  // "aicore arch: Ascend910"
  copy_gm_to_ubuf(((__ubuf__ half *)src_ub), ((__gm__ half *)src_gm), 0, 1, 26, 0, 0);
  set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
  // NOTE(review): this is the only set_vector_mask in the kernel, and it sits
  // outside the loop. Presumably (0x0, 0xffffffffffffffff) enables only the
  // low 64 lanes, i.e. the mask of the tail call (tail * 16 = 64) - confirm.
  set_vector_mask(0x0, 0xffffffffffffffff);
  wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
  for (int32_t i = 0; i < 10; ++i) {
    pipe_barrier(PIPE_V);
    // NOTE(review): first scatter move - 8 destination / 8 source addresses,
    // matching the mask=128 call in the script. No set_vector_mask precedes it.
    uint64_t va_reg_array_1[8] = {((uint64_t)((__ubuf__ half *)reshaped_ub)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 16)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 32)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 48)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 64)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 80)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 96)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 112))};set_va_reg_sb(VA3, va_reg_array_1);
    uint64_t va_reg_array_2[8] = {((uint64_t)((__ubuf__ half *)src_ub)), ((uint64_t)((__ubuf__ half *)src_ub + 32)), ((uint64_t)((__ubuf__ half *)src_ub + 64)), ((uint64_t)((__ubuf__ half *)src_ub + 96)), ((uint64_t)((__ubuf__ half *)src_ub + 128)), ((uint64_t)((__ubuf__ half *)src_ub + 160)), ((uint64_t)((__ubuf__ half *)src_ub + 192)), ((uint64_t)((__ubuf__ half *)src_ub + 224))};set_va_reg_sb(VA4, va_reg_array_2);
    scatter_vector_mov_f16(VA3, VA4, (int64_t)288230376151777292);
    // NOTE(review): second scatter move - the tail call (4 distinct addresses,
    // the remaining 4 entries repeat the first one). It runs with the same
    // mask as the first move because no set_vector_mask separates them.
    uint64_t va_reg_array_3[8] = {((uint64_t)((__ubuf__ half *)reshaped_ub + 128)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 144)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 160)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 176)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 128)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 128)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 128)), ((uint64_t)((__ubuf__ half *)reshaped_ub + 128))};set_va_reg_sb(VA3, va_reg_array_3);
    uint64_t va_reg_array_4[8] = {((uint64_t)((__ubuf__ half *)src_ub + 256)), ((uint64_t)((__ubuf__ half *)src_ub + 288)), ((uint64_t)((__ubuf__ half *)src_ub + 320)), ((uint64_t)((__ubuf__ half *)src_ub + 352)), ((uint64_t)((__ubuf__ half *)src_ub + 256)), ((uint64_t)((__ubuf__ half *)src_ub + 256)), ((uint64_t)((__ubuf__ half *)src_ub + 256)), ((uint64_t)((__ubuf__ half *)src_ub + 256))};set_va_reg_sb(VA4, va_reg_array_4);
    scatter_vector_mov_f16(VA3, VA4, (int64_t)288230376151777292);
  }
  set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
  wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
  copy_ubuf_to_gm(((__gm__ half *)dst_gm), ((__ubuf__ half *)reshaped_ub), 0, 1, 48, 0, 0);
  pipe_barrier(PIPE_ALL);
  pipe_barrier(PIPE_ALL);

}

```

## Fix suggestion

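The problem seems to be introduced by the pass `SequenceSprInsn`, which hoists the `set_vector_mask` instruction out of the loop body. As a result, the mask is no longer updated between the two vector instructions, even though they need different masks.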

Ascend/modelzoo
暂停

内容风险标识

评论 (10)

Ascend/modelzoo暂停 .gitee-modal { width: 500px !important; }

内容风险标识