67 Star 273 Fork 129

MindSpore / akg

 / 详情

Precision error when enabling buffer stitch for Sub and Reduce with shape (1024, 768)

DONE
Bug-Report
创建于  
2021-04-13 19:09
name about labels
Bug Report Use this template for reporting a bug kind/bug

Environment

  • Hardware Environment(Ascend/GPU/CPU):

device gpu

  • Software Environment:
    -- MindSpore version (source or binary):
    -- Python version (e.g., Python 3.7.5):
    -- OS platform and distribution (e.g., Linux Ubuntu 16.04):
    -- GCC/Compiler version (if compiled from source):

Related testcase

{
    "composite": true,
    "composite_graph": "3979.3979",
    "id": 4031,
    "input_desc": [
        [
            {
                "data_type": "float16",
                "format": "DefaultFormat",
                "name": "input_0",
                "shape": [
                    1024,
                    768
                ],
                "tensor_name": "output_0_6"
            }
        ],
        [
            {
                "data_type": "float32",
                "format": "DefaultFormat",
                "name": "input_0",
                "shape": [
                    1024,
                    1
                ],
                "tensor_name": "output_0_9"
            }
        ]
    ],
    "op": "Fused_Cast_LessEqual_Cast_Add_Mul_Cast_Mul_Add_Cast_Cast_ReduceSum_Cast_Mul_Sub__more_split_18218696765951249700",
    "op_desc": [
        {
            "attr": [
                {
                    "data_type": "str",
                    "name": "dst_type",
                    "value": "float16"
                }
            ],
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float32",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            1
                        ],
                        "tensor_name": "output_0_9"
                    }
                ]
            ],
            "name": "Cast",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        1
                    ],
                    "tensor_name": "output_0_10"
                }
            ]
        },
        {
            "attr": null,
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            1
                        ],
                        "tensor_name": "output_0_10"
                    }
                ],
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_1",
                        "shape": [
                            1
                        ],
                        "tensor_name": "input_17",
                        "value": 0.0013017654418945312
                    }
                ]
            ],
            "name": "Mul",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        1
                    ],
                    "tensor_name": "output_0_11"
                }
            ]
        },
        {
            "attr": null,
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_6"
                    }
                ],
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_1",
                        "shape": [
                            1024,
                            1
                        ],
                        "tensor_name": "output_0_11"
                    }
                ]
            ],
            "name": "Sub",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        768
                    ],
                    "tensor_name": "output_0_12"
                }
            ]
        },
        {
            "attr": null,
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_12"
                    }
                ],
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_1",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_12"
                    }
                ]
            ],
            "name": "Mul",
            "output_desc": [
                {
                    "data_type": "float16",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        768
                    ],
                    "tensor_name": "output_0_13"
                }
            ]
        },
        {
            "attr": [
                {
                    "data_type": "str",
                    "name": "dst_type",
                    "value": "float32"
                }
            ],
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float16",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_13"
                    }
                ]
            ],
            "name": "Cast",
            "output_desc": [
                {
                    "data_type": "float32",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        768
                    ],
                    "tensor_name": "output_0_14"
                }
            ]
        },
        {
            "attr": [
                {
                    "data_type": "str",
                    "name": "stitch",
                    "value": "common"
                },
                {
                    "data_type": "listInt",
                    "name": "axis",
                    "value": [
                        1
                    ]
                },
                {
                    "data_type": "bool",
                    "name": "keep_dims",
                    "value": true
                }
            ],
            "impl_path": "",
            "input_desc": [
                [
                    {
                        "data_type": "float32",
                        "format": "DefaultFormat",
                        "name": "input_0",
                        "shape": [
                            1024,
                            768
                        ],
                        "tensor_name": "output_0_14"
                    }
                ]
            ],
            "name": "ReduceSum",
            "output_desc": [
                {
                    "data_type": "float32",
                    "format": "DefaultFormat",
                    "name": "output_0",
                    "shape": [
                        1024,
                        1
                    ],
                    "tensor_name": "output_0_15"
                }
            ]
        }
    ],
    "output_desc": [
        {
            "data_type": "float32",
            "format": "DefaultFormat",
            "name": "output_0",
            "shape": [
                1024,
                1
            ],
            "tensor_name": "output_0_15"
        },
        {
            "data_type": "float16",
            "format": "DefaultFormat",
            "name": "output_0",
            "shape": [
                1024,
                768
            ],
            "tensor_name": "output_0_12"
        },
        {
            "data_type": "float16",
            "format": "DefaultFormat",
            "name": "output_0",
            "shape": [
                1024,
                1
            ],
            "tensor_name": "output_0_11"
        }
    ],
    "platform": "AKG",
    "process": "cuda"
}

Steps to reproduce the issue

  1. Save the above testcase as "p2.info" under the directory test/st/composite
  2. $ python test_composite.py -af p2.info

Describe the current behavior

Precision errors are observed in the outputs of the Sub and ReduceSum operators when buffer stitch is enabled.

Describe the expected behavior

The test passes with correct precision.

Related log / screenshot

Failing generated CUDA kernel:

// Generated CUDA kernel copied verbatim from the failing log in this issue.
// It implements the stitched subgraph: Cast(fp32->fp16) -> Mul(scalar
// 1.301765e-03) -> Sub(broadcast over the 768 axis) -> Mul(square) ->
// Cast(fp16->fp32) -> ReduceSum(axis=1, keep_dims) on a (1024, 768) input.
// Each 256-thread block handles 2 rows (blockIdx.y selects the row pair);
// 3 x 256 threads cover the 768 columns of each row.
// NOTE(review): code below is kept byte-identical to the dump so the reported
// Sub/ReduceSum precision error stays reproducible; only comments were added.
extern "C" __global__ void Fused_Cast_LessEqual_Cast_Add_Mul_Cast_Mul_Add_Cast_Cast_ReduceSum_Cast_Mul_Sub__more_split_18218696765951249700_kernel0( half* __restrict__ output_0_6,  float* __restrict__ output_0_9,  float* __restrict__ T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red,  half* __restrict__ T_subtract_output_0_6_T_multiply_T_cast_output_0_9,  half* __restrict__ T_multiply_T_cast_output_0_9) {
   // Per-thread copy of the two fp32 inputs (one per row handled by this block).
   float output_0_9_local[2];
  // Buffer-stitch staging in shared memory: Mul result (1 half per row) and
  // Sub result (768 halfs per row x 2 rows = 1536).
  __shared__ half T_multiply_T_cast_output_0_9_shared[2];
  __shared__ half T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[1536];
  // Per-row ReduceSum results (filled via AkgReduce below).
  __shared__ float T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[2];
   // Per-thread partial sum fed into AkgReduce.
   float acc_0[1];
  // Scratch buffer used by AkgReduce for the cross-thread reduction.
  __shared__ float red_buf0[256];
  // Load the two fp32 values for this block's row pair. Every thread loads the
  // same two global addresses, so all threads hold identical values.
  for (int cc1 = 0; cc1 < 2; ++cc1) {
    output_0_9_local[cc1] = output_0_9[((((int)blockIdx.y) * 2) + cc1)];
  }
  for (int cc2 = 0; cc2 < 2; ++cc2) {
    for (int cc3 = 0; cc3 < 3; ++cc3) {
      // Mul: all 256 threads redundantly store the same per-row value, and the
      // store is repeated for each cc3 iteration (same-value write race --
      // presumably benign since the value is identical across threads; verify).
      T_multiply_T_cast_output_0_9_shared[cc2] = (((half)output_0_9_local[cc2]) * __float2half_rn(1.301765e-03f));
      // Sub: fp16 broadcast subtract into the stitch buffer. Reads the Mul
      // value this same thread just wrote, so no inter-thread sync is needed
      // for this particular read.
      T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc2 * 768) + (cc3 * 256)) + ((int)threadIdx.x))] = (output_0_6[((((((int)blockIdx.y) * 1536) + (cc2 * 768)) + (cc3 * 256)) + ((int)threadIdx.x))] - T_multiply_T_cast_output_0_9_shared[cc2]);
      // Zero-initialize the per-row reduction slot (thread 0 only, once).
      if ((((int)threadIdx.x) == 0) && (cc3 == 0)) {
        T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[cc2] = 0.000000e+00f;
      }
    }
  }
  // Make the stitched Sub results and the zeroed accumulators visible to all
  // threads before the reduction reads them.
  __syncthreads();
  for (int cc21 = 0; cc21 < 2; ++cc21) {
    acc_0[0] = 0.000000e+00f;
    // Square each Sub result (fp16 multiply, then cast to fp32) and accumulate
    // this thread's 3 column chunks -- matches the graph's Mul -> Cast order,
    // so the squaring itself is done in half precision.
    for (int cc31 = 0; cc31 < 3; ++cc31) {
      acc_0[0] = (acc_0[0] + ((float)(T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc21 * 768) + (cc31 * 256)) + ((int)threadIdx.x))] * T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc21 * 768) + (cc31 * 256)) + ((int)threadIdx.x))])));
    }
    // Cross-thread sum of the 256 partials into the per-row slot.
    // NOTE(review): red_buf0 is reused across the two cc21 iterations with no
    // visible __syncthreads between them -- presumably AkgReduce synchronizes
    // internally; if it does not, this reuse is a candidate source of the
    // reported precision error. Confirm against AkgReduce's implementation.
    (void)akg_reduce::AkgReduce<float,akg_reduce::SumOp, 256, 1, akg_reduce::REDUCE2D_X>(akg_reduce::SumOp(), &(T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[cc21]), red_buf0, acc_0[0], 256);
  }
  __syncthreads();
  // Threads 0 and 1 write back the two per-row outputs: the fp32 ReduceSum
  // result and the fp16 Mul result.
  if (((int)threadIdx.x) <= 1) {
    T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red[((((int)blockIdx.y) * 2) + ((int)threadIdx.x))] = T_cast_T_multiply_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_T_subtract_output_0_6_T_multiply_T_cast_output_0_9_red_shared[((int)threadIdx.x)];
    T_multiply_T_cast_output_0_9[((((int)blockIdx.y) * 2) + ((int)threadIdx.x))] = T_multiply_T_cast_output_0_9_shared[((int)threadIdx.x)];
  }
  // All threads write the fp16 Sub output (also a kernel output) to global.
  for (int cc22 = 0; cc22 < 2; ++cc22) {
    for (int cc32 = 0; cc32 < 3; ++cc32) {
      T_subtract_output_0_6_T_multiply_T_cast_output_0_9[((((((int)blockIdx.y) * 1536) + (cc22 * 768)) + (cc32 * 256)) + ((int)threadIdx.x))] = T_subtract_output_0_6_T_multiply_T_cast_output_0_9_shared[(((cc22 * 768) + (cc32 * 256)) + ((int)threadIdx.x))];
    }
  }
  __syncthreads();
}

评论 (1)

yangsijia 创建了Bug-Report
yangsijia 负责人设置为yangsijia
yangsijia 关联仓库设置为MindSpore/akg
yangsijia 添加了
 
kind/bug
标签
展开全部操作日志

Hey dabaiji, Welcome to MindSpore Community.
All of the projects in MindSpore Community are maintained by @mindspore-ci-bot.
That means the developers can comment below every pull request or issue to trigger Bot Commands.
Please follow instructions at https://gitee.com/mindspore/community/blob/master/command.md to find the details.

anyrenwei 关联分支设置为master
yangsijia 任务状态TODO 修改为DONE

登录 后才可以发表评论

状态
负责人
项目
里程碑
Pull Requests
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
开始日期   -   截止日期
-
置顶选项
优先级
预计工期 (小时)
参与者(2)
5518576 mindspore ci 1587902139 6572207 bigwhitej 1586479632
Python
1
https://gitee.com/mindspore/akg.git
git@gitee.com:mindspore/akg.git
mindspore
akg
akg

搜索帮助