67 Star 273 Fork 129

MindSpore / akg

 / 详情

Precision error when enabling buffer stitch for Sub and Reduce with shape (1024, 768)

DONE
Bug-Report
创建于  
2021-04-13 19:09
name about labels
Bug Report Use this template for reporting a bug kind/bug

Environment

  • Hardware Environment(Ascend/GPU/CPU):

device gpu

  • Software Environment:
    -- MindSpore version (source or binary):
    -- Python version (e.g., Python 3.7.5):
    -- OS platform and distribution (e.g., Linux Ubuntu 16.04):
    -- GCC/Compiler version (if compiled from source):

Related testcase

{
    "composite": true,
    "composite_graph": "3979.3979",
    "id": 4031,
    "input_desc": [
        [
            {
                "data_type": "float16",
                "format": "DefaultFormat",
                "name": "input_0",
                "shape": [
                    1024,
                    768
                ],
                "tensor_name": "output_0_6"
            }
        ],
        [
            {
                "data_type": "float32",
                "format": "DefaultFormat",
                "name": "input_0",
                "shape": [
                    1024,
                    1
                ],
                "tensor_name": "output_0_9"
            }
        ]
    ],
    "op": "Fused_Cast_LessEqual_Cast_Add_Mul_Cast_Mul_Add_Cast_Cast_ReduceSum_Cast_Mul_Sub__more_split_18218696765951249700",
    "op_desc": [
        {
            "attr": [
                {
                    "data_type": "str",
                    "name": "dst_type",
                    "value": "float16"
                }
            ],
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float32",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            1
                        ],
                        "tensor_name": "output_0_9"
                    }
                ]
            ],
            "name": "Cast",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        1
                    ],
                    "tensor_name": "output_0_10"
                }
            ]
        },
        {
            "attr": null,
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            1
                        ],
                        "tensor_name": "output_0_10"
                    }
                ],
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_1",
                        "shape": [
                            1
                        ],
                        "tensor_name": "input_17",
                        "value": 0.0013017654418945312
                    }
                ]
            ],
            "name": "Mul",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        1
                    ],
                    "tensor_name": "output_0_11"
                }
            ]
        },
        {
            "attr": null,
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_6"
                    }
                ],
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_1",
                        "shape": [
                            1024,
                            1
                        ],
                        "tensor_name": "output_0_11"
                    }
                ]
            ],
            "name": "Sub",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        768
                    ],
                    "tensor_name": "output_0_12"
                }
            ]
        },
        {
            "attr": null,
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_12"
                    }
                ],
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_1",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_12"
                    }
                ]
            ],
            "name": "Mul",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        768
                    ],
                    "tensor_name": "output_0_13"
                }
            ]
        },
        {
            "attr": [
                {
                    "data_type": "str",
                    "name": "dst_type",
                    "value": "float32"
                }
            ],
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_13"
                    }
                ]
            ],
            "name": "Cast",
            "output_desc": [
                {
                    "data_type": "float32",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        768
                    ],
                    "tensor_name": "output_0_14"
                }
            ]
        },
        {
            "attr": [
                {
                    "data_type": "str",
                    "name": "stitch",
                    "value": "common"
                },
                {
                    "data_type": "listInt",
                    "name": "axis",
                    "value": [
                        1
                    ]
                },
                {
                    "data_type": "bool",
                    "name": "keep_dims",
                    "value": true
                }
            ],
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float32",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_14"
                    }
                ]
            ],
            "name": "ReduceSum",
            "output_desc": [
                {
                    "data_type": "float32",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        1
                    ],
                    "tensor_name": "output_0_15"
                }
            ]
        }
    ],
    "output_desc": [
        {
            "data_type": "float32",
            "format": "DefaultFormat",
            "name": "output_0",
            "shape": [
                1024,
                1
            ],
            "tensor_name": "output_0_15"
        },
        {
            "data_type": "float16",
            "format": "DefaultFormat",
            "name": "output_0",
            "shape": [
                1024,
                768
            ],
            "tensor_name": "output_0_12"
        },
        {
            "data_type": "float16",
            "format": "DefaultFormat",
            "name": "output_0",
            "shape": [
                1024,
                1
            ],
            "tensor_name": "output_0_11"
        }
    ],
    "platform": "AKG",
    "process": "cuda"
}

Steps to reproduce the issue

  1. Save the above testcase as "p2.info" under the directory test/st/composite
  2. $ python test_composite.py -af p2.info

Describe the current behavior

Precision errors are observed in the outputs of the Sub and ReduceSum operators when buffer stitch is enabled.

Describe the expected behavior

The test passes with correct precision.

Related log / screenshot

Failing generated CUDA kernel:

// Generated CUDA kernel copied verbatim from the failing log in this issue.
// It implements the stitched subgraph: Cast(fp32->fp16) -> Mul(scalar
// 1.301765e-03) -> Sub(broadcast over the 768 axis) -> Mul(square) ->
// Cast(fp16->fp32) -> ReduceSum(axis=1, keep_dims) on a (1024, 768) input.
// Each 256-thread block handles 2 rows (blockIdx.y selects the row pair);
// 3 x 256 threads cover the 768 columns of each row.
// NOTE(review): code below is kept byte-identical to the dump so the reported
// Sub/ReduceSum precision error stays reproducible; only comments were added.
extern "C" __global__ void Fused_Cast_LessEqual_Cast_Add_Mul_Cast_Mul_Add_Cast_Cast_ReduceSum_Cast_Mul_Sub__more_split_18218696765951249700_kernel0( half* __restrict__ output_0_6,  float* __restrict__ output_0_9,  float* __restrict__ T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red,  half* __restrict__ T_subtract_output_0_6_T_multiply_T_cast_output_0_9,  half* __restrict__ T_multiply_T_cast_output_0_9) {
   // Per-thread copy of the two fp32 inputs (one per row handled by this block).
   float output_0_9_local[2];
  // Buffer-stitch staging in shared memory: Mul result (1 half per row) and
  // Sub result (768 halfs per row x 2 rows = 1536).
  __shared__ half T_multiply_T_cast_output_0_9_shared[2];
  __shared__ half T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[1536];
  // Per-row ReduceSum results (filled via AkgReduce below).
  __shared__ float T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[2];
   // Per-thread partial sum fed into AkgReduce.
   float acc_0[1];
  // Scratch buffer used by AkgReduce for the cross-thread reduction.
  __shared__ float red_buf0[256];
  // Load the two fp32 values for this block's row pair. Every thread loads the
  // same two global addresses, so all threads hold identical values.
  for (int cc1 = 0; cc1 < 2; ++cc1) {
    output_0_9_local[cc1] = output_0_9[((((int)blockIdx.y) * 2) + cc1)];
  }
  for (int cc2 = 0; cc2 < 2; ++cc2) {
    for (int cc3 = 0; cc3 < 3; ++cc3) {
      // Mul: all 256 threads redundantly store the same per-row value, and the
      // store is repeated for each cc3 iteration (same-value write race --
      // presumably benign since the value is identical across threads; verify).
      T_multiply_T_cast_output_0_9_shared[cc2] = (((half)output_0_9_local[cc2]) * __float2half_rn(1.301765e-03f));
      // Sub: fp16 broadcast subtract into the stitch buffer. Reads the Mul
      // value this same thread just wrote, so no inter-thread sync is needed
      // for this particular read.
      T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc2 * 768) + (cc3 * 256)) + ((int)threadIdx.x))] = (output_0_6[((((((int)blockIdx.y) * 1536) + (cc2 * 768)) + (cc3 * 256)) + ((int)threadIdx.x))] - T_multiply_T_cast_output_0_9_shared[cc2]);
      // Zero-initialize the per-row reduction slot (thread 0 only, once).
      if ((((int)threadIdx.x) == 0) && (cc3 == 0)) {
        T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[cc2] = 0.000000e+00f;
      }
    }
  }
  // Make the stitched Sub results and the zeroed accumulators visible to all
  // threads before the reduction reads them.
  __syncthreads();
  for (int cc21 = 0; cc21 < 2; ++cc21) {
    acc_0[0] = 0.000000e+00f;
    // Square each Sub result (fp16 multiply, then cast to fp32) and accumulate
    // this thread's 3 column chunks -- matches the graph's Mul -> Cast order,
    // so the squaring itself is done in half precision.
    for (int cc31 = 0; cc31 < 3; ++cc31) {
      acc_0[0] = (acc_0[0] + ((float)(T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc21 * 768) + (cc31 * 256)) + ((int)threadIdx.x))] * T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc21 * 768) + (cc31 * 256)) + ((int)threadIdx.x))])));
    }
    // Cross-thread sum of the 256 partials into the per-row slot.
    // NOTE(review): red_buf0 is reused across the two cc21 iterations with no
    // visible __syncthreads between them -- presumably AkgReduce synchronizes
    // internally; if it does not, this reuse is a candidate source of the
    // reported precision error. Confirm against AkgReduce's implementation.
    (void)akg_reduce::AkgReduce<float,akg_reduce::SumOp, 256, 1, akg_reduce::REDUCE2D_X>(akg_reduce::SumOp(), &(T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[cc21]), red_buf0, acc_0[0], 256);
  }
  __syncthreads();
  // Threads 0 and 1 write back the two per-row outputs: the fp32 ReduceSum
  // result and the fp16 Mul result.
  if (((int)threadIdx.x) <= 1) {
    T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red[((((int)blockIdx.y) * 2) + ((int)threadIdx.x))] = T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[((int)threadIdx.x)];
    T_multiply_T_cast_output_0_9[((((int)blockIdx.y) * 2) + ((int)threadIdx.x))] = T_multiply_T_cast_output_0_9_shared[((int)threadIdx.x)];
  }
  // All threads write the fp16 Sub output (also a kernel output) to global.
  for (int cc22 = 0; cc22 < 2; ++cc22) {
    for (int cc32 = 0; cc32 < 3; ++cc32) {
      T_subtract_output_0_6_T_multiply_T_cast_output_0_9[((((((int)blockIdx.y) * 1536) + (cc22 * 768)) + (cc32 * 256)) + ((int)threadIdx.x))] = T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc22 * 768) + (cc32 * 256)) + ((int)threadIdx.x))];
    }
  }
  __syncthreads();
}

评论 (1)

yangsijia 创建了Bug-Report
yangsijia 负责人设置为yangsijia
yangsijia 关联仓库设置为MindSpore/akg
yangsijia 添加了
 
kind/bug
标签
展开全部操作日志

Hey dabaiji, Welcome to MindSpore Community.
All of the projects in MindSpore Community are maintained by @mindspore-ci-bot.
That means the developers can comment below every pull request or issue to trigger Bot Commands.
Please follow instructions at https://gitee.com/mindspore/community/blob/master/command.md to find the details.

anyrenwei 关联分支设置为master
yangsijia 任务状态TODO 修改为DONE

登录 后才可以发表评论

状态
负责人
项目
里程碑
Pull Requests
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
开始日期   -   截止日期
-
置顶选项
优先级
预计工期 (小时)
参与者(2)
5518576 mindspore ci 1587902139 6572207 bigwhitej 1586479632
Python
1
https://gitee.com/mindspore/akg.git
git@gitee.com:mindspore/akg.git
mindspore
akg
akg

搜索帮助