diff --git a/.jenkins/check/config/filter_cppcheck.txt b/.jenkins/check/config/filter_cppcheck.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6d50c4e70093b55c9895e3befac9dc28885ffc8a 100644 --- a/.jenkins/check/config/filter_cppcheck.txt +++ b/.jenkins/check/config/filter_cppcheck.txt @@ -0,0 +1 @@ +"ms_custom_ops/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp" "" \ No newline at end of file diff --git a/.jenkins/check/config/whitelizard.txt b/.jenkins/check/config/whitelizard.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..41aca3a66b1a683979c5029323fcbd26ce6aded7 100644 --- a/.jenkins/check/config/whitelizard.txt +++ b/.jenkins/check/config/whitelizard.txt @@ -0,0 +1,2 @@ +ms_custom_ops/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp:KernelGridSample::grid_sample_0_kernel +ms_custom_ops/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py:grid_sample \ No newline at end of file diff --git a/docs/map_from_buildin_to_custom.md b/docs/map_from_buildin_to_custom.md index 40cbe07ed6586afbb8185d5890188d8e41c7edc8..4cae6cfc4b4145b1aa7cb1d395644b79a1436b75 100644 --- a/docs/map_from_buildin_to_custom.md +++ b/docs/map_from_buildin_to_custom.md @@ -3,6 +3,7 @@ | ops.auto_generate.format_cast | [ms_custom_ops.trans_data](../ops/c_api/trans_data/trans_data.md) | 两者均进行ND和NZ的相互转换。format_cast依赖ms内置白名单;trans_data不使用白名单但有使用约束,详见trans_data文档。同一权重只能选用其中一种进行转换,建议网络中统一采用同一种算子,两者不兼容。 | | ops.auto_generate.mla | [ms_custom_ops.mla](../ops/c_api/mla/mla_doc.md) | 新增了input_format参数,用于指定输入参数的format | | ops.auto_generate.reshape_and_cache| [ms_custom_ops.reshape_and_cache](../ops/c_api/reshape_and_cache/reshape_and_cache.md) | 新增cache_mode参数,用于指定Atlas 训练系列cache的format是ND还是NZ; 新增head_num,cache_mode为NZ的时候必须提供,辅助计算。| +| ops.grid_sample | [ms_custom_ops.grid_sample](../ops/ascendc/grid_sample/grid_sample.md) | 支持(N, H, W, C)格式的输入,仅支持 Atlas 推理系列 | | ops.moe_init_routing_v2 | [ms_custom_ops.moe_init_routing_v2](../ops/c_api/moe_init_routing_v2/moe_init_routing_v2.md) | 接口一致,仅支持 
Atlas 推理系列 | |ops.auto_generate.moe_gating_group_topk|[ms_custom_ops.moe_gating_group_topk](../ops/c_api/moe_gating_group_topk/moe_gating_group_topk.md) |接口一致| | ops.auto_generate.group_topk | [ms_custom_ops.group_topk](../ops/c_api/group_topk/group_topk_doc.md) | 副作用接口,将不再支持左值输出 | diff --git a/docs/op_list.md b/docs/op_list.md index a7a8ed889a0ce6dd41f062b9b8da95c785f4b360..9156d505660042fe6887ad0b37d5f2f00ca1a6bf 100644 --- a/docs/op_list.md +++ b/docs/op_list.md @@ -6,7 +6,8 @@ 1. [apply_rotary_pos_emb_atb](../ops/c_api/apply_rotary_pos_emb_atb/apply_rotary_pos_emb_atb.md) 1. [fa_update](../ops/c_api/fa_update/fa_update_doc.md) 1. [flash_attention_encoder](../ops/c_api/flash_attention_encoder/flash_attention_encoder.md) -1. [fused_add_topk_div](../ops/c_api/fused_add_topk_div/fused_add_topk_div.md) +1. [fused_add_topk_div](../ops/c_api/fused_add_topk_div/fused_add_topk_div_doc.md) +1. [grid_sample](../ops/ascendc/grid_sample/grid_sample.md) 1. [group_topk](../ops/c_api/group_topk/group_topk_doc.md) 1. [grouped_matmul](../ops/c_api/grouped_matmul/grouped_matmul_doc.md) 1. [grouped_matmul_w4](../ops/c_api/grouped_matmul_w4/grouped_matmul_w4_doc.md) diff --git a/ops/ascendc/grid_sample/grid_sample.cc b/ops/ascendc/grid_sample/grid_sample.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea923786110bfaa562cf531ed4537136daf8e3fe --- /dev/null +++ b/ops/ascendc/grid_sample/grid_sample.cc @@ -0,0 +1,196 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// ============================================================================= +// GRAPH MODE IMPLEMENTATION +// ============================================================================= + +#include +#include +#include +#include +#include + +#include "ops/framework/aclnn/graphmode/aclnn_kernel_mod.h" +#include "ops/framework/utils.h" +#include "mindspore/include/custom_op_api.h" + +namespace ms_custom_ops { +constexpr size_t kCAligned = 8; +enum class GridSampleInputIndex : size_t { + kGridSampleInputIndex = 0, + kGridSampleGridIndex, + kGridSampleModeIndex, + kGridSamplePaddingModeIndex, + kGridSampleAlignCornersIndex, + kGridSampleInputsNum, +}; + +static void GridSampleCheckInputsShape(const std::string &op_name, const std::vector &input_shape, + const std::vector &grid_shape, int64_t mode, + int64_t padding_mode, bool align_corners) { + auto any_dim = abstract::Shape::kShapeDimAny; + MS_CHECK_VALUE(mode == 0, + CheckAndConvertUtils::FormatCommMsg(op_name, ", mode only supports 0, but got ", mode)); + MS_CHECK_VALUE( + padding_mode == 1, + CheckAndConvertUtils::FormatCommMsg(op_name, ", padding_mode only supports 1, but got ", padding_mode)); + MS_CHECK_VALUE(align_corners == false, + CheckAndConvertUtils::FormatCommMsg(op_name, ", align_corners only supports false, but got ", + align_corners)); + if (input_shape.size() != kDim4 || grid_shape.size() != kDim4) { + MS_LOG(EXCEPTION) << op_name << ", the dim of inputs should be input.dim=grid.dim=4, " + << "but got input.dim=" << input_shape.size() + << ", grid.dim=" << grid_shape.size(); + } + auto input_n = input_shape[kIndex0]; + auto grid_n = grid_shape[kIndex0]; + MS_CHECK_VALUE(input_n == grid_n || input_n == any_dim || grid_n == any_dim, + CheckAndConvertUtils::FormatCommMsg( + op_name, ", input.dim0 should be equal grid.dim0,", + " but got input.shape=", input_shape, ", grid.shape=", 
grid_shape)); + MS_CHECK_VALUE( + grid_shape[kIndex3] == kDim2 || grid_shape[kIndex3] == any_dim, + CheckAndConvertUtils::FormatCommMsg( + op_name, ", grid.shape should be equals (N, H_OUT, W_OUT, 2), but got grid.shape=", grid_shape)); + auto c_in = input_shape[kIndex3]; + MS_CHECK_VALUE( + c_in % kCAligned == 0, + CheckAndConvertUtils::FormatCommMsg( + op_name, ", c should be aligned with ", kCAligned, ", but got c=", c_in)); +} + +static void GridSampleCheckInputsType(const std::string &op_name, const TypeId &input_dtype, + const TypeId &grid_dtype) { + if (input_dtype != kNumberTypeFloat32) { + MS_LOG(EXCEPTION) << op_name << ", the dtype of 'input' should be " << TypeIdToString(kNumberTypeFloat32) + << ", but got input.dtype=" << TypeIdToString(input_dtype); + } + if (grid_dtype != kNumberTypeFloat32) { + MS_LOG(EXCEPTION) << op_name << ", the dtype of 'grid' should be " << TypeIdToString(kNumberTypeFloat32) + << ", but got grid.dtype=" << TypeIdToString(grid_dtype); + } +} + +class OPS_API GridSampleOpFuncImpl : public OpFuncImpl { + public: + ShapeArray InferShape(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const override { + MS_EXCEPTION_IF_NULL(primitive); + if (input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)] + ->IsDynamicRank() || + input_infos[static_cast(GridSampleInputIndex::kGridSampleGridIndex)] + ->IsDynamicRank()) { + return {input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)]->GetShape()}; + } + auto op_name = primitive->name(); + auto input_shape = + input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)]->GetShape(); + auto grid_shape = + input_infos[static_cast(GridSampleInputIndex::kGridSampleGridIndex)]->GetShape(); + auto mode = + input_infos[static_cast(GridSampleInputIndex::kGridSampleModeIndex)] + ->GetScalarValueWithCheck(); + auto padding_mode = input_infos[static_cast(GridSampleInputIndex::kGridSamplePaddingModeIndex)] + ->GetScalarValueWithCheck(); + auto 
align_corners = input_infos[static_cast(GridSampleInputIndex::kGridSampleAlignCornersIndex)] + ->GetScalarValueWithCheck(); + GridSampleCheckInputsShape(op_name, input_shape, grid_shape, mode, padding_mode, align_corners); + auto output_shape = grid_shape; + output_shape[kIndex3] = input_shape[kIndex3]; + return {output_shape}; + } + + std::vector InferType(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const override { + auto op_name = primitive->name(); + auto input_dtype = + input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)]->GetType(); + auto grid_dtype = + input_infos[static_cast(GridSampleInputIndex::kGridSampleGridIndex)]->GetType(); + GridSampleCheckInputsType(op_name, input_dtype, grid_dtype); + return {input_dtype}; + } + + bool GeneralInferRegistered() const override { return true; } +}; + +class GridSample : public AclnnCustomKernelMod { + public: + GridSample() : AclnnCustomKernelMod(std::move("aclnnGridSample")) {} + ~GridSample() = default; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override { + MS_EXCEPTION_IF_NULL(stream_ptr); + RunOp( + stream_ptr, workspace, inputs[static_cast(GridSampleInputIndex::kGridSampleInputIndex)], + inputs[static_cast(GridSampleInputIndex::kGridSampleGridIndex)], + outputs[0]); + return true; + } + void GetWorkSpaceInfo(const std::vector &inputs, + const std::vector &outputs) override { + GetWorkspaceForResize(inputs[static_cast(GridSampleInputIndex::kGridSampleInputIndex)], + inputs[static_cast(GridSampleInputIndex::kGridSampleGridIndex)], + outputs[0]); + return; + } + + private: + DEFINE_GET_WORKSPACE_FOR_RESIZE(); +}; +} // namespace ms_custom_ops + +REG_GRAPH_MODE_OP(grid_sample, ms_custom_ops::GridSampleOpFuncImpl, + ms_custom_ops::GridSample); + +// ============================================================================= +// PYBOOST MODE IMPLEMENTATION +// 
============================================================================= + +namespace ms_custom_ops { +constexpr size_t kGridSampleOutputNum = 1; + +std::vector grid_sample_custom(const ms::Tensor &input, const ms::Tensor &grid, + const int64_t mode, const int64_t padding_mode, + const bool align_corners) { + std::string op_name = "grid_sample"; + auto runner = std::make_shared(op_name); + MS_EXCEPTION_IF_NULL(runner); + GridSampleCheckInputsShape(op_name, input.shape(), grid.shape(), mode, padding_mode, align_corners); + GridSampleCheckInputsType(op_name, input.data_type(), grid.data_type()); + auto input_shape = input.shape(); + auto grid_shape = grid.shape(); + auto output_shape = grid_shape; + output_shape[kIndex3] = input_shape[kIndex3]; + auto out = ms::Tensor(input.data_type(), output_shape); + runner->SetLaunchFunc(LAUNCH_ACLNN_FUNC(aclnnGridSample, input, grid, out)); + runner->Run({input, grid}, {out}); + return {out}; +} +} // namespace ms_custom_ops + +auto pyboost_grid_sample(const ms::Tensor &input, const ms::Tensor &grid, const int64_t mode, + const int64_t padding_mode, const bool align_corners) { + return ms::pynative::PyboostRunner::Call( + ms_custom_ops::grid_sample_custom, input, grid, mode, padding_mode, align_corners); +} + +MS_CUSTOM_OPS_EXTENSION_MODULE(m) { + m.def("grid_sample", + &pyboost_grid_sample, + "GridSample", pybind11::arg("input"), pybind11::arg("grid"), pybind11::arg("mode") = 0, + pybind11::arg("padding_mode") = 1, pybind11::arg("align_corners") = false); +} diff --git a/ops/ascendc/grid_sample/grid_sample.md b/ops/ascendc/grid_sample/grid_sample.md new file mode 100644 index 0000000000000000000000000000000000000000..aa5d8a0674969f97d1db81a2c9fb0acfacc5a809 --- /dev/null +++ b/ops/ascendc/grid_sample/grid_sample.md @@ -0,0 +1,60 @@ +# grid_sample算子 + +## 描述 + +提供一个输入tensor以及一个对应的grid网格,然后根据grid中每个位置提供的坐标信息,将input中对应位置的像素值填充到网格指定的位置,得到最终的输出。 + +## 输入参数 + +| Name | DType | Shape | Optional | Inplace | Format | Description |
+|---------------------|-----------------|----------------------------------------|----------|---------|--------|--------------------------------------------------------| +| input | Tensor(float32) | 4维[n, h_in, w_in, c] | No | No | ND | 输入Tensor,shape为(n, h_in, w_in, c) | +| grid | Tensor(float32) | 4维[n, h_out, w_out, 2] | No | No | ND | 用于采样的网格 | +| mode | Int | No | Yes | No | int | 插值模式,只支持0, 表示 "bilinear",每个输出像素是最接近的四个输入像素的加权平均值,使用双线性插值计算。
默认0 | +| padding_mode | Int | No | Yes | No | int | padding_mode, 只支持 1,表示"border", 对越界位置用边界值填充。 默认 1 | +| align_corners | Bool | No | Yes | No | Bool | 表示设定特征图坐标与特征值的对应方式,设定为true时,特征值位于像素中心。设定为false时,特征值位于像素的角点。只支持false。默认false。 | + +## 输出参数 + +| Name | DType | Shape | Description | +|------|-------|-------|-------------| +| output | Tensor(float32) | [n, h_out, w_out, c] | 采样得到的输出tensor,数据类型float32 | + +## 支持产品 + +- Atlas 推理系列产品 + +## 特殊说明 + +1. 当前仅支持(n,h,w,c)格式的输入,不支持(n,c,h,w)格式。 +2. input的最后一维(c)需要是8的倍数 + +## 使用示例 + +### 基本使用示例 + +```python + +import mindspore as ms +import numpy as np +import ms_custom_ops +from mindspore import context, Tensor + +ms.set_context(device_target="Ascend", mode=context.GRAPH_MODE) +ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + +n_in = 1 +c_in = 512 +h_in = 24 +w_in = 24 +h_out = 64 +w_out = 1 +interpolation_mode = 0 +padding_mode = 1 +align_corners = False +np_input = np.random.random((n_in, h_in, w_in, c_in)).astype(np.float32) +np_grid = np.random.uniform(-1, 1, (n_in, h_out, w_out, 2)).astype(np.float32) +input_data = Tensor(np_input) +grid = Tensor(np_grid) +output_data = ms_custom_ops.grid_sample(input_data, grid, interpolation_mode, padding_mode, align_corners) +``` diff --git a/ops/ascendc/grid_sample/grid_sample_op.yaml b/ops/ascendc/grid_sample/grid_sample_op.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40b2fe07d78a54755d126dd6bf06e812a9da809e --- /dev/null +++ b/ops/ascendc/grid_sample/grid_sample_op.yaml @@ -0,0 +1,19 @@ +#operator grid_sample +grid_sample: + args: + input: + dtype: tensor + grid: + dtype: tensor + mode: + dtype: int + default: 0 + padding_mode: + dtype: int + default: 1 + align_corners: + dtype: bool + default: False + returns: + output: + dtype: tensor diff --git a/ops/ascendc/grid_sample/op_host/grid_sample.cpp b/ops/ascendc/grid_sample/op_host/grid_sample.cpp new file mode 100644 index
0000000000000000000000000000000000000000..92acd0beba6d25123fc2f55d81e060fd454a74ec --- /dev/null +++ b/ops/ascendc/grid_sample/op_host/grid_sample.cpp @@ -0,0 +1,107 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "grid_sample_tiling.h" // NOLINT(build/include_subdir) +#include "register/op_def_registry.h" +#include "graph/utils/type_utils.h" +#include "tiling/platform/platform_ascendc.h" + +namespace { +constexpr uint32_t kIndex0 = 0; +constexpr uint32_t kIndex1 = 1; +constexpr uint32_t kDim0{0}; +constexpr uint32_t kDim1{1}; +constexpr uint32_t kDim2{2}; +constexpr uint32_t kDim3{3}; +} + +namespace optiling { +constexpr uint32_t kCoreNum = 8; +static ge::graphStatus GridSampleTiling(gert::TilingContext *context) { + GridSampleTilingData tiling; + uint32_t tiling_key{0}; + + auto input_shape = context->GetInputShape(0)->GetOriginShape(); + auto grid_shape = context->GetInputShape(1)->GetOriginShape(); + + int32_t n_in = input_shape.GetDim(kDim0); + float h_in = static_cast(input_shape.GetDim(kDim1)); + float w_in = static_cast(input_shape.GetDim(kDim2)); + int32_t c_in = input_shape.GetDim(kDim3); + + int32_t h_out = grid_shape.GetDim(kDim1); + int32_t w_out = grid_shape.GetDim(kDim2); + + tiling.set_h_in(h_in); + tiling.set_w_in(w_in); + tiling.set_h_out(h_out); + tiling.set_w_out(w_out); + tiling.set_n_in(n_in); + tiling.set_c_in(c_in); + + context->SetBlockDim(kCoreNum); + 
context->SetTilingKey(tiling_key); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + size_t *currentWorkspace = context->GetWorkspaceSizes(kDim1); + currentWorkspace[kIndex0] = 0; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static ge::graphStatus GridSampleInferShape(gert::InferShapeContext *context) { + const gert::Shape *input_shape = context->GetInputShape(kIndex0); + const gert::Shape *grid_shape = context->GetInputShape(kIndex1); + gert::Shape *out_shape = context->GetOutputShape(kIndex0); + *out_shape = *grid_shape; + (*out_shape)[kDim3] = (*input_shape)[kDim3]; + return GRAPH_SUCCESS; +} +static graphStatus GridSampleInferDataType(gert::InferDataTypeContext *context) { + const auto inputDataType = context->GetInputDataType(kIndex0); + context->SetOutputDataType(kIndex0, inputDataType); + return ge::GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class GridSample : public OpDef { + public: + explicit GridSample(const char *name) : OpDef(name) { + this->Input("input") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("grid") + .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .ParamType(REQUIRED) + .AutoContiguous(); + this->Output("output") + .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT}) + .ParamType(REQUIRED) + .Format({ge::FORMAT_ND}); + + this->SetInferShape(ge::GridSampleInferShape).SetInferDataType(ge::GridSampleInferDataType); + this->AICore().SetTiling(optiling::GridSampleTiling).AddConfig("ascend310p"); + } +}; +OP_ADD(GridSample); +} // namespace ops diff --git a/ops/ascendc/grid_sample/op_host/grid_sample_tiling.h b/ops/ascendc/grid_sample/op_host/grid_sample_tiling.h new file mode 100644 index 
0000000000000000000000000000000000000000..f49ddb6ce61ea79b7570f960ba8f7065ea9c2847 --- /dev/null +++ b/ops/ascendc/grid_sample/op_host/grid_sample_tiling.h @@ -0,0 +1,32 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GRID_SAMPLE_TILING_H +#define GRID_SAMPLE_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(GridSampleTilingData) +TILING_DATA_FIELD_DEF(float, h_in); +TILING_DATA_FIELD_DEF(float, w_in); +TILING_DATA_FIELD_DEF(int32_t, h_out); +TILING_DATA_FIELD_DEF(int32_t, w_out); +TILING_DATA_FIELD_DEF(int32_t, n_in); +TILING_DATA_FIELD_DEF(int32_t, c_in); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(GridSample, GridSampleTilingData) +} // namespace optiling +#endif // GRID_SAMPLE_TILING_H diff --git a/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp b/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2637726adb7ecb50804a359efe1f611f8d34ddda --- /dev/null +++ b/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp @@ -0,0 +1,1452 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel_operator.h" // NOLINT(build/include_subdir) + +// NOLINTBEGIN +class KernelGridSample { + public: + __aicore__ inline KernelGridSample() {} + __aicore__ inline void Init(GM_ADDR input, GM_ADDR grid, GM_ADDR output, GridSampleTilingData *tiling) { + this->input = input; + this->grid = grid; + this->output = output; + this->h_in = tiling->h_in; + this->w_in = tiling->w_in; + this->h_out = tiling->h_out; + this->w_out = tiling->w_out; + this->n_in = tiling->n_in; + this->c_in = tiling->c_in; + } + __aicore__ inline void Process() { + grid_sample_0_kernel(this->input, this->grid, this->output, this->h_in, this->w_in, + this->h_out, this->w_out, this->n_in, this->c_in); + } + + private: + __aicore__ void grid_sample_0_kernel(__gm__ uint8_t *__restrict__ input_data, __gm__ uint8_t *__restrict__ grid_data, __gm__ uint8_t *__restrict__ output_data, float h_in, float w_in, int32_t h_out, int32_t w_out, int32_t n_in, int32_t c_in) { + AscendC::TPipe pipe; + AscendC::TBuf vecin_buff; + pipe.InitBuffer(vecin_buff, 262144); + AscendC::TBuf gm_buff; + pipe.InitBuffer(gm_buff, 4294967295); + AscendC::TBuf vecout_buff; + pipe.InitBuffer(vecout_buff, 262144); + AscendC::TBuf co2_buff; + pipe.InitBuffer(co2_buff, 262144); + AscendC::TBuf co1_buff; + pipe.InitBuffer(co1_buff, 262144); + AscendC::TBuf a2_buff; + pipe.InitBuffer(a2_buff, 65536); + AscendC::TBuf a1_buff; + pipe.InitBuffer(a1_buff, 1048576); + AscendC::TBuf b2_buff; + pipe.InitBuffer(b2_buff, 65536); + AscendC::TBuf b1_buff; + pipe.InitBuffer(b1_buff, 1048576); + 
AscendC::DataCopyEnhancedParams enhanceParams_relu({AscendC::BlockMode::BLOCK_MODE_MATRIX,AscendC::DeqScale::DEQ_NONE, 0, 0, true, pad_t::PAD_NONE, 0}); + AscendC::DataCopyEnhancedParams enhanceParams({AscendC::BlockMode::BLOCK_MODE_MATRIX,AscendC::DeqScale::DEQ_NONE, 0, 0, false, pad_t::PAD_NONE, 0}); + uint8_t padList[4] = {0, 0, 0, 0}; + AscendC::GlobalTensorgrid_data_ascendc; + AscendC::GlobalTensorinput_data_ascendc; + AscendC::GlobalTensoroutput_data_ascendc; + int32_t res_2; + int32_t total_outputs_34; + int32_t block_num_50; + int32_t res_58; + int32_t res_64; + int32_t block_idx_3; + int32_t percore_blocks_14; + int32_t res_66; + int32_t res_36; + int32_t res_67; + int32_t pixel_num_17; + int32_t output_res_52; + bool res_69; + int32_t res_73; + int32_t res_74; + bool res_75; + int32_t res_76; + int32_t res_77; + int32_t res_79; + int32_t res_80; + int32_t c_res_num_23; + int32_t in_stride_2_26; + int32_t out_stride_2_30; + bool res_38; + bool res_83; + int32_t res_18; + int32_t res_46; + bool res_84; + float const2_6; + float const5_9; + float const6_11; + int32_t c_chunk_num_21; + int32_t res_27; + float const1_32; + int32_t res_68; + int32_t res_54; + int32_t in_stride_1_55; + int32_t out_stride_1_56; + float const4_40; + int32_t const7_41; + int32_t const8_42; + int32_t c_res_start_44; + float const3_48; + int32_t start_idx_71; + int32_t c_res_chunk_num_60; + int32_t in_stride_0_61; + int32_t out_stride_0_62; + int32_t tile_num_86; + int32_t x_w_90; + int32_t x_e_91; + int32_t y_n_92; + int32_t y_s_93; + float nw_weight_94; + float ne_weight_95; + float sw_weight_96; + float se_weight_97; + int32_t res_102; + int32_t idx_103; + AscendC::LocalTensorxys_ub_107_ascendc; + AscendC::LocalTensorxys_ub_110_ascendc; + AscendC::LocalTensorub_108_ascendc; + AscendC::LocalTensorub_109_ascendc; + AscendC::LocalTensorxys_ub_trans_111_ascendc; + AscendC::LocalTensorxs_ub_116_ascendc; + AscendC::LocalTensorys_ub_121_ascendc; + AscendC::LocalTensorxs_ub_122_ascendc; + 
AscendC::LocalTensorys_ub_123_ascendc; + AscendC::LocalTensorxs_ub_124_ascendc; + AscendC::LocalTensorys_ub_125_ascendc; + AscendC::LocalTensorxs_ub_127_ascendc; + AscendC::LocalTensorys_ub_129_ascendc; + AscendC::LocalTensorxs_ub_130_ascendc; + AscendC::LocalTensorys_ub_131_ascendc; + AscendC::LocalTensorxs_ub_w_int_132_ascendc; + AscendC::LocalTensorys_ub_n_int_133_ascendc; + AscendC::LocalTensorxs_ub_w_134_ascendc; + AscendC::LocalTensorys_ub_n_137_ascendc; + AscendC::LocalTensorys_ub_s_int_139_ascendc; + AscendC::LocalTensorxs_ub_e_int_136_ascendc; + AscendC::LocalTensorxs_ub_e_141_ascendc; + AscendC::LocalTensore_weights_142_ascendc; + AscendC::LocalTensorys_ub_s_145_ascendc; + AscendC::LocalTensorys_ub_s_int_147_ascendc; + AscendC::LocalTensors_weights_146_ascendc; + AscendC::LocalTensorxs_ub_e_int_143_ascendc; + AscendC::LocalTensorn_weights_148_ascendc; + AscendC::LocalTensorw_weights_150_ascendc; + AscendC::LocalTensorse_weights_149_ascendc; + AscendC::LocalTensorne_weights_151_ascendc; + AscendC::LocalTensornw_weights_152_ascendc; + AscendC::LocalTensorsw_weights_153_ascendc; + AscendC::LocalTensorub_158_ascendc; + int32_t inner_idx_163; + int32_t res_164; + int32_t res_165; + int32_t hw_190; + int32_t n_191; + int32_t w_192; + int32_t h_193; + AscendC::LocalTensorub_162_ascendc; + AscendC::LocalTensorub_169_ascendc; + AscendC::LocalTensorub_173_ascendc; + AscendC::LocalTensorub_177_ascendc; + AscendC::LocalTensorub_181_ascendc; + AscendC::LocalTensorub_185_ascendc; + AscendC::LocalTensorub_189_ascendc; + __ubuf__ float * ub_158 = (__ubuf__ float *)((uintptr_t)(416)); + __ubuf__ int32_t * ub_162 = (__ubuf__ int32_t *)((uintptr_t)(128)); + __ubuf__ int32_t * ub_169 = (__ubuf__ int32_t *)((uintptr_t)(192)); + __ubuf__ int32_t * ub_173 = (__ubuf__ int32_t *)((uintptr_t)(224)); + __ubuf__ int32_t * ub_177 = (__ubuf__ int32_t *)((uintptr_t)(288)); + __ubuf__ float * ub_181 = (__ubuf__ float *)((uintptr_t)(2144)); + __ubuf__ float * ub_185 = (__ubuf__ float 
*)((uintptr_t)(256)); + __ubuf__ float * ub_189 = (__ubuf__ float *)((uintptr_t)(384)); + int32_t res_194; + int32_t res_195; + int32_t res_197; + int32_t res_198; + int32_t res_208; + int32_t res_209; + int32_t res_196; + int32_t res_199; + int32_t res_210; + int32_t res_220; + int32_t res_221; + int32_t res_223; + int32_t res_202; + int32_t res_212; + int32_t res_219; + int32_t res_225; + int32_t res_226; + int32_t res_228; + int32_t res_230; + int32_t res_231; + int32_t res_233; + AscendC::LocalTensorinput_ub_nw_236_ascendc; + int32_t res_203; + int32_t res_204; + int32_t res_205; + int32_t res_222; + int32_t res_207; + int32_t res_227; + int32_t res_232; + int32_t res_213; + int32_t res_214; + int32_t res_215; + int32_t res_224; + int32_t res_217; + int32_t res_229; + int32_t res_234; + AscendC::LocalTensorinput_ub_ne_238_ascendc; + AscendC::LocalTensorinput_ub_se_242_ascendc; + AscendC::LocalTensorinput_ub_sw_240_ascendc; + AscendC::LocalTensorinput_ub_243_ascendc; + AscendC::LocalTensorub_244_ascendc; + AscendC::LocalTensorub_246_ascendc; + AscendC::LocalTensorub_245_ascendc; + AscendC::LocalTensorinput_ub_247_ascendc; + AscendC::LocalTensorinput_ub_248_ascendc; + AscendC::LocalTensorinput_ub_249_ascendc; + int32_t res_251; + int32_t res_252; + int32_t res_257; + int32_t res_258; + int32_t res_267; + int32_t res_268; + int32_t res_253; + int32_t res_259; + int32_t res_269; + int32_t res_277; + int32_t res_278; + int32_t res_280; + int32_t res_282; + int32_t res_283; + int32_t res_285; + int32_t res_256; + int32_t res_261; + int32_t res_271; + int32_t res_287; + int32_t res_288; + int32_t res_290; + int32_t res_292; + int32_t res_293; + int32_t res_295; + AscendC::LocalTensorinput_ub_nw_298_ascendc; + int32_t res_262; + int32_t res_263; + int32_t res_264; + int32_t res_279; + int32_t res_284; + int32_t res_266; + int32_t res_289; + int32_t res_294; + int32_t res_272; + int32_t res_273; + int32_t res_274; + int32_t res_281; + int32_t res_286; + int32_t res_276; 
+ int32_t res_291; + int32_t res_296; + AscendC::LocalTensorinput_ub_ne_300_ascendc; + AscendC::LocalTensorinput_ub_se_304_ascendc; + AscendC::LocalTensorinput_ub_sw_302_ascendc; + AscendC::LocalTensorinput_ub_305_ascendc; + AscendC::LocalTensorub_306_ascendc; + AscendC::LocalTensorub_308_ascendc; + AscendC::LocalTensorub_307_ascendc; + AscendC::LocalTensorinput_ub_309_ascendc; + AscendC::LocalTensorinput_ub_310_ascendc; + AscendC::LocalTensorinput_ub_311_ascendc; + int32_t res_num_88; + bool res_314; + int32_t res_315; + int32_t idx__316; + AscendC::LocalTensorxys_ub__320_ascendc; + AscendC::LocalTensorxys_ub__323_ascendc; + AscendC::LocalTensorub_321_ascendc; + AscendC::LocalTensorub_322_ascendc; + AscendC::LocalTensorxys_ub_trans__324_ascendc; + AscendC::LocalTensorxs_ub__329_ascendc; + AscendC::LocalTensorys_ub__334_ascendc; + AscendC::LocalTensorxs_ub__335_ascendc; + AscendC::LocalTensorys_ub__336_ascendc; + AscendC::LocalTensorxs_ub__337_ascendc; + AscendC::LocalTensorys_ub__338_ascendc; + AscendC::LocalTensorxs_ub__340_ascendc; + AscendC::LocalTensorys_ub__342_ascendc; + AscendC::LocalTensorxs_ub__343_ascendc; + AscendC::LocalTensorys_ub__344_ascendc; + AscendC::LocalTensorxs_ub_w_int__345_ascendc; + AscendC::LocalTensorys_ub_n_int__346_ascendc; + AscendC::LocalTensorxs_ub_w__347_ascendc; + AscendC::LocalTensorys_ub_n__350_ascendc; + AscendC::LocalTensorys_ub_s_int__352_ascendc; + AscendC::LocalTensorxs_ub_e_int__349_ascendc; + AscendC::LocalTensorxs_ub_e__354_ascendc; + AscendC::LocalTensore_weight__355_ascendc; + AscendC::LocalTensorys_ub_s__358_ascendc; + AscendC::LocalTensorys_ub_s_int__360_ascendc; + AscendC::LocalTensors_weight__359_ascendc; + AscendC::LocalTensorxs_ub_e_int__356_ascendc; + AscendC::LocalTensorn_weight__361_ascendc; + AscendC::LocalTensorw_weight__363_ascendc; + AscendC::LocalTensorse_weight__362_ascendc; + AscendC::LocalTensorne_weight__364_ascendc; + AscendC::LocalTensorsw_weight__365_ascendc; + 
AscendC::LocalTensornw_weight__366_ascendc; + AscendC::LocalTensorub_371_ascendc; + int32_t inner_idx_388; + int32_t res_389; + int32_t res_390; + int32_t hw_408; + int32_t n_409; + int32_t w_413; + int32_t h_414; + AscendC::LocalTensorub_375_ascendc; + AscendC::LocalTensorub_379_ascendc; + AscendC::LocalTensorub_383_ascendc; + AscendC::LocalTensorub_387_ascendc; + AscendC::LocalTensorub_394_ascendc; + AscendC::LocalTensorub_398_ascendc; + AscendC::LocalTensorub_402_ascendc; + float dst_403; + __ubuf__ float * ub_371 = (__ubuf__ float *)((uintptr_t)(160)); + float dst_404; + __ubuf__ float * ub_375 = (__ubuf__ float *)((uintptr_t)(320)); + float dst_405; + __ubuf__ float * ub_379 = (__ubuf__ float *)((uintptr_t)(352)); + float dst_406; + __ubuf__ float * ub_383 = (__ubuf__ float *)((uintptr_t)(2112)); + int32_t dst_407; + __ubuf__ int32_t * ub_387 = (__ubuf__ int32_t *)((uintptr_t)(0)); + int32_t dst_410; + __ubuf__ int32_t * ub_394 = (__ubuf__ int32_t *)((uintptr_t)(32)); + int32_t dst_411; + __ubuf__ int32_t * ub_398 = (__ubuf__ int32_t *)((uintptr_t)(64)); + int32_t dst_412; + __ubuf__ int32_t * ub_402 = (__ubuf__ int32_t *)((uintptr_t)(96)); + int32_t res_421; + int32_t res_422; + int32_t res_426; + int32_t res_427; + int32_t res_436; + int32_t res_437; + int32_t res_423; + int32_t res_428; + int32_t res_438; + int32_t res_442; + int32_t res_443; + int32_t res_445; + int32_t res_425; + int32_t res_430; + int32_t res_440; + int32_t res_447; + int32_t res_448; + int32_t res_450; + int32_t res_452; + int32_t res_453; + int32_t res_455; + AscendC::LocalTensorinput_ub_nw_457_ascendc; + int32_t res_431; + int32_t res_432; + int32_t res_433; + int32_t res_444; + int32_t res_435; + int32_t res_449; + int32_t res_454; + int32_t res_415; + int32_t res_416; + int32_t res_417; + int32_t res_441; + int32_t res_420; + int32_t res_446; + int32_t res_451; + AscendC::LocalTensorinput_ub_ne_459_ascendc; + AscendC::LocalTensorinput_ub_se_463_ascendc; + 
AscendC::LocalTensorinput_ub_sw_461_ascendc; + AscendC::LocalTensorinput_ub_464_ascendc; + AscendC::LocalTensorub_465_ascendc; + AscendC::LocalTensorub_467_ascendc; + AscendC::LocalTensorub_466_ascendc; + AscendC::LocalTensorinput_ub_468_ascendc; + AscendC::LocalTensorinput_ub_469_ascendc; + AscendC::LocalTensorinput_ub_470_ascendc; + int32_t res_476; + int32_t res_477; + int32_t res_481; + int32_t res_482; + int32_t res_491; + int32_t res_492; + int32_t res_472; + int32_t res_478; + int32_t res_483; + int32_t res_498; + int32_t res_499; + int32_t res_501; + int32_t res_503; + int32_t res_504; + int32_t res_506; + int32_t res_475; + int32_t res_480; + int32_t res_485; + int32_t res_508; + int32_t res_509; + int32_t res_511; + int32_t res_513; + int32_t res_514; + int32_t res_516; + AscendC::LocalTensorinput_ub_nw_519_ascendc; + int32_t res_486; + int32_t res_487; + int32_t res_488; + int32_t res_500; + int32_t res_505; + int32_t res_490; + int32_t res_510; + int32_t res_515; + int32_t res_493; + int32_t res_494; + int32_t res_495; + int32_t res_502; + int32_t res_507; + int32_t res_497; + int32_t res_512; + int32_t res_517; + AscendC::LocalTensorinput_ub_ne_521_ascendc; + AscendC::LocalTensorinput_ub_se_525_ascendc; + AscendC::LocalTensorinput_ub_sw_523_ascendc; + AscendC::LocalTensorinput_ub_526_ascendc; + AscendC::LocalTensorub_527_ascendc; + AscendC::LocalTensorub_529_ascendc; + AscendC::LocalTensorub_528_ascendc; + AscendC::LocalTensorinput_ub_530_ascendc; + AscendC::LocalTensorinput_ub_531_ascendc; + AscendC::LocalTensorinput_ub_532_ascendc; + res_2 = n_in * h_out; + total_outputs_34 = res_2 * w_out; + block_num_50 = total_outputs_34 / (int32_t)8; + res_58 = block_num_50 + (int32_t)8; + res_64 = res_58 - (int32_t)1; + block_idx_3 = (int32_t)block_idx; + percore_blocks_14 = (int32_t)0 + (int32_t)0; + res_66 = res_64 / (int32_t)8; + res_36 = block_idx_3 + (int32_t)1; + percore_blocks_14 = res_66; + res_67 = res_36 * percore_blocks_14; + pixel_num_17 = (int32_t)0 
+ (int32_t)0; + output_res_52 = int(total_outputs_34) % int((int32_t)8); + res_69 = res_67 < block_num_50; + AscendC::PipeBarrier(); + if (res_69) { + res_73 = percore_blocks_14 * (int32_t)8; + pixel_num_17 = res_73; + } else { + res_74 = block_idx_3 * percore_blocks_14; + res_75 = res_74 < block_num_50; + AscendC::PipeBarrier(); + if (res_75) { + res_76 = block_idx_3 * percore_blocks_14; + res_77 = block_num_50 - res_76; + res_79 = res_77 * (int32_t)8; + res_80 = res_79 + output_res_52; + pixel_num_17 = res_80; + } else { + pixel_num_17 = (int32_t)0; + } + } + c_res_num_23 = int(c_in) % int((int32_t)512); + in_stride_2_26 = (int32_t)0 + (int32_t)0; + out_stride_2_30 = (int32_t)0 + (int32_t)0; + res_38 = block_idx_3 == (int32_t)0; + AscendC::PipeBarrier(); + res_83 = pixel_num_17 < (int32_t)1; + res_18 = (int32_t)(w_in); + res_46 = c_res_num_23 + (int32_t)8; + in_stride_2_26 = c_in; + out_stride_2_30 = c_in; + res_84 = res_38 && res_83; + const2_6 = float(h_in) * float((float)0.5000000000); + const5_9 = float(w_in) - float((float)1.0000000000); + const6_11 = float(h_in) - float((float)1.0000000000); + c_chunk_num_21 = c_in / (int32_t)512; + res_27 = (int32_t)(h_in); + const1_32 = float(w_in) * float((float)0.5000000000); + res_68 = block_idx_3 * percore_blocks_14; + AscendC::PipeBarrier(); + res_54 = res_46 - (int32_t)1; + in_stride_1_55 = res_18 * in_stride_2_26; + out_stride_1_56 = w_out * out_stride_2_30; + if (res_84) { + pixel_num_17 = total_outputs_34; + } + AscendC::PipeBarrier(); + const4_40 = float(const2_6) - float((float)0.5000000000); + const7_41 = (int32_t)(const5_9); + const8_42 = (int32_t)(const6_11); + c_res_start_44 = c_chunk_num_21 * (int32_t)512; + const3_48 = float(const1_32) - float((float)0.5000000000); + start_idx_71 = res_68 * (int32_t)8; + c_res_chunk_num_60 = res_54 / (int32_t)8; + in_stride_0_61 = res_27 * in_stride_1_55; + out_stride_0_62 = h_out * out_stride_1_56; + tile_num_86 = pixel_num_17 / (int32_t)8; + AscendC::PipeBarrier(); + 
for (int dynamic_loop_var_0_100 = 0; dynamic_loop_var_0_100 < tile_num_86; dynamic_loop_var_0_100 += 1) { + res_102 = dynamic_loop_var_0_100 * (int32_t)8; + idx_103 = start_idx_71 + res_102; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub_107_ascendc = vecin_buff.GetWithOffset(16, 0); + grid_data_ascendc.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(grid_data)); + AscendC::DataCopy(xys_ub_107_ascendc, grid_data_ascendc[(idx_103) * 2], {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub_110_ascendc = vecout_buff.GetWithOffset(128, 2112); + ub_108_ascendc = vecin_buff.GetWithOffset(256, 64); + ub_109_ascendc = vecin_buff.GetWithOffset(256, 1088); + AscendC::DataCopy(xys_ub_110_ascendc, xys_ub_107_ascendc, {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_108_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), {1,2,1,16,AscendC::TransposeType::TRANSPOSE_NHWC2NCHW}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_109_ascendc.ReinterpretCast(), ub_108_ascendc.ReinterpretCast(), ub_108_ascendc.ReinterpretCast(), {1,16,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + pipe_barrier(PIPE_V); + AscendC::DataCopy(xys_ub_110_ascendc, ub_109_ascendc, {(uint16_t)(1), (uint16_t)(16), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); + xys_ub_trans_111_ascendc = vecin_buff.GetWithOffset(128, 0); + xys_ub_110_ascendc = vecin_buff.GetWithOffset(128, 2112); + AscendC::Transpose(xys_ub_trans_111_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), {1,8,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + AscendC::PipeBarrier(); + AscendC::PipeBarrier(); + xs_ub_116_ascendc = vecout_buff.GetWithOffset(8, 2112); + AscendC::DataCopy(xs_ub_116_ascendc, xys_ub_trans_111_ascendc, {(uint16_t)(1), (uint16_t)(1), 
(uint16_t)(0), (uint16_t)(0)}); + ys_ub_121_ascendc = vecout_buff.GetWithOffset(8, 2144); + AscendC::DataCopy(ys_ub_121_ascendc, xys_ub_trans_111_ascendc[8], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::PipeBarrier(); + xs_ub_122_ascendc = vecin_buff.GetWithOffset(8, 0); + xs_ub_116_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(xs_ub_122_ascendc, xs_ub_116_ascendc, const1_32, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_123_ascendc = vecin_buff.GetWithOffset(8, 32); + ys_ub_121_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ys_ub_123_ascendc, ys_ub_121_ascendc, const2_6, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_124_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_124_ascendc, xs_ub_122_ascendc, const3_48, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_125_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_125_ascendc, ys_ub_123_ascendc, const4_40, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_127_ascendc = vecin_buff.GetWithOffset(8, 0); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(xs_ub_127_ascendc, xs_ub_124_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + 
AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_129_ascendc = vecin_buff.GetWithOffset(8, 32); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(ys_ub_129_ascendc, ys_ub_125_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_130_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub_130_ascendc, xs_ub_127_ascendc, const5_9, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_131_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub_131_ascendc, ys_ub_129_ascendc, const6_11, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w_int_132_ascendc = vecin_buff.GetWithOffset(8, 0); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w_int_132_ascendc, xs_ub_130_ascendc, AscendC::RoundMode::CAST_TRUNC, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n_int_133_ascendc = vecin_buff.GetWithOffset(8, 32); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n_int_133_ascendc, ys_ub_131_ascendc, AscendC::RoundMode::CAST_TRUNC, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w_134_ascendc = 
vecin_buff.GetWithOffset(8, 192); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w_134_ascendc, xs_ub_w_int_132_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n_137_ascendc = vecin_buff.GetWithOffset(8, 256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n_137_ascendc, ys_ub_n_int_133_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int_139_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s_int_139_ascendc, ys_ub_n_int_133_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + xs_ub_e_int_136_ascendc = vecin_buff.GetWithOffset(8, 64); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e_int_136_ascendc, xs_ub_w_int_132_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e_141_ascendc = vecin_buff.GetWithOffset(8, 320); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e_141_ascendc, xs_ub_w_134_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + e_weights_142_ascendc = vecin_buff.GetWithOffset(8, 224); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(e_weights_142_ascendc, xs_ub_130_ascendc, 
xs_ub_w_134_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_145_ascendc = vecin_buff.GetWithOffset(8, 352); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s_145_ascendc, ys_ub_n_137_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int_147_ascendc = vecin_buff.GetWithOffset(8, 160); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub_s_int_147_ascendc, ys_ub_s_int_139_ascendc, const8_42, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + s_weights_146_ascendc = vecin_buff.GetWithOffset(8, 288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(s_weights_146_ascendc, ys_ub_131_ascendc, ys_ub_n_137_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e_int_143_ascendc = vecin_buff.GetWithOffset(8, 96); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub_e_int_143_ascendc, xs_ub_e_int_136_ascendc, const7_41, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + n_weights_148_ascendc = vecin_buff.GetWithOffset(8, 192); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(n_weights_148_ascendc, ys_ub_s_145_ascendc, ys_ub_131_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 
18446744073709551615ULL); + w_weights_150_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(w_weights_150_ascendc, xs_ub_e_141_ascendc, xs_ub_130_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + se_weights_149_ascendc = vecin_buff.GetWithOffset(8, 64); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(se_weights_149_ascendc, e_weights_142_ascendc, s_weights_146_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ne_weights_151_ascendc = vecin_buff.GetWithOffset(8, 352); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(ne_weights_151_ascendc, e_weights_142_ascendc, n_weights_148_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + nw_weights_152_ascendc = vecin_buff.GetWithOffset(8, 320); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(nw_weights_152_ascendc, w_weights_150_ascendc, n_weights_148_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + sw_weights_153_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(sw_weights_153_ascendc, w_weights_150_ascendc, s_weights_146_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + for (int dynamic_loop_var_1_155 = 0; dynamic_loop_var_1_155 < 8; dynamic_loop_var_1_155 += 
1) { + ub_158_ascendc = vecout_buff.GetWithOffset(1, 416); + AscendC::DataCopy(ub_158_ascendc, se_weights_149_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + inner_idx_163 = idx_103 + dynamic_loop_var_1_155; + res_164 = h_out * w_out; + res_165 = h_out * w_out; + hw_190 = int(inner_idx_163) % int(res_164); + n_191 = inner_idx_163 / res_165; + w_192 = int(hw_190) % int(w_out); + h_193 = hw_190 / w_out; + ub_162_ascendc = vecout_buff.GetWithOffset(1, 128); + AscendC::DataCopy(ub_162_ascendc, xs_ub_w_int_132_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_169_ascendc = vecout_buff.GetWithOffset(1, 192); + AscendC::DataCopy(ub_169_ascendc, xs_ub_e_int_143_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_173_ascendc = vecout_buff.GetWithOffset(1, 224); + AscendC::DataCopy(ub_173_ascendc, ys_ub_n_int_133_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_177_ascendc = vecout_buff.GetWithOffset(1, 288); + AscendC::DataCopy(ub_177_ascendc, ys_ub_s_int_147_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_181_ascendc = vecout_buff.GetWithOffset(1, 2144); + AscendC::DataCopy(ub_181_ascendc, nw_weights_152_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_185_ascendc = vecout_buff.GetWithOffset(1, 256); + AscendC::DataCopy(ub_185_ascendc, ne_weights_151_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_189_ascendc = vecout_buff.GetWithOffset(1, 384); + AscendC::DataCopy(ub_189_ascendc, sw_weights_153_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + se_weight_97 = *(__ubuf__ float *)ub_158; + x_w_90 = *(__ubuf__ 
int32_t *)ub_162; + x_e_91 = *(__ubuf__ int32_t *)ub_169; + y_n_92 = *(__ubuf__ int32_t *)ub_173; + y_s_93 = *(__ubuf__ int32_t *)ub_177; + nw_weight_94 = *(__ubuf__ float *)ub_181; + ne_weight_95 = *(__ubuf__ float *)ub_185; + sw_weight_96 = *(__ubuf__ float *)ub_189; + AscendC::PipeBarrier(); + for (int dynamic_loop_var_2_200 = 0; dynamic_loop_var_2_200 < c_chunk_num_21; dynamic_loop_var_2_200 += 1) { + res_194 = n_191 * in_stride_0_61; + res_195 = y_n_92 * in_stride_1_55; + res_197 = n_191 * in_stride_0_61; + res_198 = y_n_92 * in_stride_1_55; + res_208 = n_191 * in_stride_0_61; + res_209 = y_s_93 * in_stride_1_55; + res_196 = x_w_90 * in_stride_2_26; + res_199 = x_e_91 * in_stride_2_26; + res_210 = x_e_91 * in_stride_2_26; + res_220 = res_194 + res_195; + res_221 = res_197 + res_198; + res_223 = res_208 + res_209; + res_202 = dynamic_loop_var_2_200 * (int32_t)512; + res_212 = dynamic_loop_var_2_200 * (int32_t)512; + res_219 = dynamic_loop_var_2_200 * (int32_t)512; + res_225 = res_220 + res_196; + res_226 = res_221 + res_199; + res_228 = res_223 + res_210; + res_230 = res_225 + res_219; + res_231 = res_226 + res_202; + res_233 = res_228 + res_212; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_236_ascendc = vecin_buff.GetWithOffset(512, 10336); + input_data_ascendc.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(input_data)); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_236_ascendc[n * 512], input_data_ascendc[res_230 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + res_203 = n_191 * in_stride_0_61; + res_204 = y_s_93 * in_stride_1_55; + res_205 = x_w_90 * in_stride_2_26; + res_222 = res_203 + res_204; + res_207 = dynamic_loop_var_2_200 * (int32_t)512; + res_227 = res_222 + res_205; + res_232 = res_227 + res_207; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_213 = n_191 * out_stride_0_62; + res_214 = h_193 * out_stride_1_56; + res_215 = w_192 * out_stride_2_30; + res_224 = res_213 + 
res_214; + res_217 = dynamic_loop_var_2_200 * (int32_t)512; + res_229 = res_224 + res_215; + res_234 = res_229 + res_217; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_238_ascendc = vecin_buff.GetWithOffset(512, 14432); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_238_ascendc[n * 512], input_data_ascendc[res_231 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_242_ascendc = vecin_buff.GetWithOffset(512, 2144); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_242_ascendc[n * 512], input_data_ascendc[res_233 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_240_ascendc = vecin_buff.GetWithOffset(512, 6240); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_240_ascendc[n * 512], input_data_ascendc[res_232 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_243_ascendc = vecin_buff.GetWithOffset(512, 12384); + Muls(input_ub_243_ascendc, input_ub_nw_236_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + ub_244_ascendc = vecin_buff.GetWithOffset(512, 16480); + Muls(ub_244_ascendc, input_ub_ne_238_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + ub_246_ascendc = vecin_buff.GetWithOffset(512, 4192); + Muls(ub_246_ascendc, input_ub_se_242_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + ub_245_ascendc = vecin_buff.GetWithOffset(512, 8288); + Muls(ub_245_ascendc, input_ub_sw_240_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + input_ub_247_ascendc = vecin_buff.GetWithOffset(512, 2144); + Add(input_ub_247_ascendc, input_ub_243_ascendc, ub_244_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_248_ascendc = 
vecin_buff.GetWithOffset(512, 10368); + Add(input_ub_248_ascendc, input_ub_247_ascendc, ub_245_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_249_ascendc = vecin_buff.GetWithOffset(512, 2144); + Add(input_ub_249_ascendc, input_ub_248_ascendc, ub_246_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + output_data_ascendc.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(output_data)); + input_ub_249_ascendc = vecout_buff.GetWithOffset(512, 2144); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_234 + n * 512], input_ub_249_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + for (int dynamic_loop_var_3_254 = 0; dynamic_loop_var_3_254 < c_res_chunk_num_60; dynamic_loop_var_3_254 += 1) { + res_251 = n_191 * in_stride_0_61; + res_252 = y_n_92 * in_stride_1_55; + res_257 = n_191 * in_stride_0_61; + res_258 = y_n_92 * in_stride_1_55; + res_267 = n_191 * in_stride_0_61; + res_268 = y_s_93 * in_stride_1_55; + res_253 = x_w_90 * in_stride_2_26; + res_259 = x_e_91 * in_stride_2_26; + res_269 = x_e_91 * in_stride_2_26; + res_277 = res_251 + res_252; + res_278 = res_257 + res_258; + res_280 = res_267 + res_268; + res_282 = res_277 + res_253; + res_283 = res_278 + res_259; + res_285 = res_280 + res_269; + res_256 = dynamic_loop_var_3_254 * (int32_t)8; + res_261 = dynamic_loop_var_3_254 * (int32_t)8; + res_271 = dynamic_loop_var_3_254 * (int32_t)8; + res_287 = res_282 + c_res_start_44; + res_288 = res_283 + c_res_start_44; + res_290 = res_285 + c_res_start_44; + res_292 = res_287 + res_256; + res_293 = res_288 + res_261; + res_295 = res_290 + res_271; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_298_ascendc = vecin_buff.GetWithOffset(8, 288); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_298_ascendc[n * 8], input_data_ascendc[res_292 + n * 
884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + res_262 = n_191 * in_stride_0_61; + res_263 = y_s_93 * in_stride_1_55; + res_264 = x_w_90 * in_stride_2_26; + res_279 = res_262 + res_263; + res_284 = res_279 + res_264; + res_266 = dynamic_loop_var_3_254 * (int32_t)8; + res_289 = res_284 + c_res_start_44; + res_294 = res_289 + res_266; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_272 = n_191 * out_stride_0_62; + res_273 = h_193 * out_stride_1_56; + res_274 = w_192 * out_stride_2_30; + res_281 = res_272 + res_273; + res_286 = res_281 + res_274; + res_276 = dynamic_loop_var_3_254 * (int32_t)8; + res_291 = res_286 + c_res_start_44; + res_296 = res_291 + res_276; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_300_ascendc = vecin_buff.GetWithOffset(8, 416); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_300_ascendc[n * 8], input_data_ascendc[res_293 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_304_ascendc = vecin_buff.GetWithOffset(8, 128); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_304_ascendc[n * 8], input_data_ascendc[res_295 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_302_ascendc = vecin_buff.GetWithOffset(8, 224); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_302_ascendc[n * 8], input_data_ascendc[res_294 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_305_ascendc = vecin_buff.GetWithOffset(8, 384); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(input_ub_305_ascendc, input_ub_nw_298_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + ub_306_ascendc = 
vecin_buff.GetWithOffset(8, 448); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_306_ascendc, input_ub_ne_300_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ub_308_ascendc = vecin_buff.GetWithOffset(8, 192); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_308_ascendc, input_ub_se_304_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + ub_307_ascendc = vecin_buff.GetWithOffset(8, 256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_307_ascendc, input_ub_sw_302_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_309_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_309_ascendc, input_ub_305_ascendc, ub_306_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_310_ascendc = vecin_buff.GetWithOffset(8, 384); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_310_ascendc, input_ub_309_ascendc, ub_307_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_311_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_311_ascendc, input_ub_310_ascendc, 
ub_308_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_311_ascendc = vecout_buff.GetWithOffset(8, 128); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_296 + n * 8], input_ub_311_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + AscendC::PipeBarrier(); + } + AscendC::PipeBarrier(); + } + res_num_88 = int(pixel_num_17) % int((int32_t)8); + res_314 = res_num_88 > (int32_t)0; + AscendC::PipeBarrier(); + AscendC::PipeBarrier(); + if (res_314) { + res_315 = start_idx_71 + pixel_num_17; + idx__316 = res_315 - res_num_88; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub__320_ascendc = vecin_buff.GetWithOffset(16, 2144); + AscendC::DataCopy(xys_ub__320_ascendc, grid_data_ascendc[(idx__316) * 2], {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub__323_ascendc = vecout_buff.GetWithOffset(128, 4256); + ub_321_ascendc = vecin_buff.GetWithOffset(256, 2208); + ub_322_ascendc = vecin_buff.GetWithOffset(256, 3232); + AscendC::DataCopy(xys_ub__323_ascendc, xys_ub__320_ascendc, {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_321_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), {1,2,1,16,AscendC::TransposeType::TRANSPOSE_NHWC2NCHW}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_322_ascendc.ReinterpretCast(), ub_321_ascendc.ReinterpretCast(), ub_321_ascendc.ReinterpretCast(), {1,16,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + pipe_barrier(PIPE_V); + AscendC::DataCopy(xys_ub__323_ascendc, ub_322_ascendc, {(uint16_t)(1), (uint16_t)(16), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); 
+ xys_ub_trans__324_ascendc = vecin_buff.GetWithOffset(128, 2144); + xys_ub__323_ascendc = vecin_buff.GetWithOffset(128, 4256); + AscendC::Transpose(xys_ub_trans__324_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), {1,8,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + AscendC::PipeBarrier(); + AscendC::PipeBarrier(); + xs_ub__329_ascendc = vecout_buff.GetWithOffset(8, 4256); + AscendC::DataCopy(xs_ub__329_ascendc, xys_ub_trans__324_ascendc, {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ys_ub__334_ascendc = vecout_buff.GetWithOffset(8, 4288); + AscendC::DataCopy(ys_ub__334_ascendc, xys_ub_trans__324_ascendc[8], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::PipeBarrier(); + xs_ub__335_ascendc = vecin_buff.GetWithOffset(8, 2144); + xs_ub__329_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(xs_ub__335_ascendc, xs_ub__329_ascendc, const1_32, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__336_ascendc = vecin_buff.GetWithOffset(8, 2176); + ys_ub__334_ascendc = vecin_buff.GetWithOffset(8, 4288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ys_ub__336_ascendc, ys_ub__334_ascendc, const2_6, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub__337_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub__337_ascendc, xs_ub__335_ascendc, const3_48, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__338_ascendc = vecin_buff.GetWithOffset(8, 
4288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub__338_ascendc, ys_ub__336_ascendc, const4_40, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub__340_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(xs_ub__340_ascendc, xs_ub__337_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__342_ascendc = vecin_buff.GetWithOffset(8, 2176); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(ys_ub__342_ascendc, ys_ub__338_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub__343_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub__343_ascendc, xs_ub__340_ascendc, const5_9, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__344_ascendc = vecin_buff.GetWithOffset(8, 4288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub__344_ascendc, ys_ub__342_ascendc, const6_11, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w_int__345_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w_int__345_ascendc, xs_ub__343_ascendc, AscendC::RoundMode::CAST_TRUNC, 
AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n_int__346_ascendc = vecin_buff.GetWithOffset(8, 2176); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n_int__346_ascendc, ys_ub__344_ascendc, AscendC::RoundMode::CAST_TRUNC, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w__347_ascendc = vecin_buff.GetWithOffset(8, 2336); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w__347_ascendc, xs_ub_w_int__345_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n__350_ascendc = vecin_buff.GetWithOffset(8, 2400); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n__350_ascendc, ys_ub_n_int__346_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int__352_ascendc = vecin_buff.GetWithOffset(8, 2272); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s_int__352_ascendc, ys_ub_n_int__346_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + xs_ub_e_int__349_ascendc = vecin_buff.GetWithOffset(8, 2208); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e_int__349_ascendc, xs_ub_w_int__345_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + 
AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e__354_ascendc = vecin_buff.GetWithOffset(8, 2464); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e__354_ascendc, xs_ub_w__347_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + e_weight__355_ascendc = vecin_buff.GetWithOffset(8, 2368); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(e_weight__355_ascendc, xs_ub__343_ascendc, xs_ub_w__347_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s__358_ascendc = vecin_buff.GetWithOffset(8, 2496); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s__358_ascendc, ys_ub_n__350_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int__360_ascendc = vecin_buff.GetWithOffset(8, 2304); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub_s_int__360_ascendc, ys_ub_s_int__352_ascendc, const8_42, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + s_weight__359_ascendc = vecin_buff.GetWithOffset(8, 2432); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(s_weight__359_ascendc, ys_ub__344_ascendc, ys_ub_n__350_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e_int__356_ascendc = 
vecin_buff.GetWithOffset(8, 2240); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub_e_int__356_ascendc, xs_ub_e_int__349_ascendc, const7_41, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + n_weight__361_ascendc = vecin_buff.GetWithOffset(8, 2336); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(n_weight__361_ascendc, ys_ub_s__358_ascendc, ys_ub__344_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + w_weight__363_ascendc = vecin_buff.GetWithOffset(8, 2272); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(w_weight__363_ascendc, xs_ub_e__354_ascendc, xs_ub__343_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + se_weight__362_ascendc = vecin_buff.GetWithOffset(8, 2208); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(se_weight__362_ascendc, e_weight__355_ascendc, s_weight__359_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ne_weight__364_ascendc = vecin_buff.GetWithOffset(8, 2496); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(ne_weight__364_ascendc, e_weight__355_ascendc, n_weight__361_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + sw_weight__365_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(sw_weight__365_ascendc, 
w_weight__363_ascendc, s_weight__359_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + nw_weight__366_ascendc = vecin_buff.GetWithOffset(8, 2464); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(nw_weight__366_ascendc, w_weight__363_ascendc, n_weight__361_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + for (int dynamic_loop_var_4_368 = 0; dynamic_loop_var_4_368 < res_num_88; dynamic_loop_var_4_368 += 1) { + ub_371_ascendc = vecout_buff.GetWithOffset(1, 160); + AscendC::DataCopy(ub_371_ascendc, nw_weight__366_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + inner_idx_388 = idx__316 + dynamic_loop_var_4_368; + res_389 = h_out * w_out; + res_390 = h_out * w_out; + hw_408 = int(inner_idx_388) % int(res_389); + n_409 = inner_idx_388 / res_390; + w_413 = int(hw_408) % int(w_out); + h_414 = hw_408 / w_out; + ub_375_ascendc = vecout_buff.GetWithOffset(1, 320); + AscendC::DataCopy(ub_375_ascendc, ne_weight__364_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_379_ascendc = vecout_buff.GetWithOffset(1, 352); + AscendC::DataCopy(ub_379_ascendc, sw_weight__365_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_383_ascendc = vecout_buff.GetWithOffset(1, 2112); + AscendC::DataCopy(ub_383_ascendc, se_weight__362_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_387_ascendc = vecout_buff.GetWithOffset(1, 0); + AscendC::DataCopy(ub_387_ascendc, xs_ub_w_int__345_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_394_ascendc = 
vecout_buff.GetWithOffset(1, 32); + AscendC::DataCopy(ub_394_ascendc, xs_ub_e_int__356_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_398_ascendc = vecout_buff.GetWithOffset(1, 64); + AscendC::DataCopy(ub_398_ascendc, ys_ub_n_int__346_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_402_ascendc = vecout_buff.GetWithOffset(1, 96); + AscendC::DataCopy(ub_402_ascendc, ys_ub_s_int__360_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + dst_403 = *(__ubuf__ float *)ub_371; + dst_404 = *(__ubuf__ float *)ub_375; + dst_405 = *(__ubuf__ float *)ub_379; + dst_406 = *(__ubuf__ float *)ub_383; + dst_407 = *(__ubuf__ int32_t *)ub_387; + dst_410 = *(__ubuf__ int32_t *)ub_394; + dst_411 = *(__ubuf__ int32_t *)ub_398; + dst_412 = *(__ubuf__ int32_t *)ub_402; + nw_weight_94 = dst_403; + ne_weight_95 = dst_404; + sw_weight_96 = dst_405; + se_weight_97 = dst_406; + x_w_90 = dst_407; + x_e_91 = dst_410; + y_n_92 = dst_411; + y_s_93 = dst_412; + AscendC::PipeBarrier(); + for (int dynamic_loop_var_5_418 = 0; dynamic_loop_var_5_418 < c_chunk_num_21; dynamic_loop_var_5_418 += 1) { + res_421 = n_409 * in_stride_0_61; + res_422 = y_n_92 * in_stride_1_55; + res_426 = n_409 * in_stride_0_61; + res_427 = y_n_92 * in_stride_1_55; + res_436 = n_409 * in_stride_0_61; + res_437 = y_s_93 * in_stride_1_55; + res_423 = x_w_90 * in_stride_2_26; + res_428 = x_e_91 * in_stride_2_26; + res_438 = x_e_91 * in_stride_2_26; + res_442 = res_421 + res_422; + res_443 = res_426 + res_427; + res_445 = res_436 + res_437; + res_425 = dynamic_loop_var_5_418 * (int32_t)512; + res_430 = dynamic_loop_var_5_418 * (int32_t)512; + res_440 = dynamic_loop_var_5_418 * (int32_t)512; + res_447 = res_442 + res_423; + res_448 = res_443 + res_428; + res_450 = res_445 + res_438; + res_452 = res_447 + res_425; + res_453 = 
res_448 + res_430; + res_455 = res_450 + res_440; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_457_ascendc = vecin_buff.GetWithOffset(512, 12448); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_457_ascendc[n * 512], input_data_ascendc[res_452 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + res_431 = n_409 * in_stride_0_61; + res_432 = y_s_93 * in_stride_1_55; + res_433 = x_w_90 * in_stride_2_26; + res_444 = res_431 + res_432; + res_435 = dynamic_loop_var_5_418 * (int32_t)512; + res_449 = res_444 + res_433; + res_454 = res_449 + res_435; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_415 = n_409 * out_stride_0_62; + res_416 = h_414 * out_stride_1_56; + res_417 = w_413 * out_stride_2_30; + res_441 = res_415 + res_416; + res_420 = dynamic_loop_var_5_418 * (int32_t)512; + res_446 = res_441 + res_417; + res_451 = res_446 + res_420; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_459_ascendc = vecin_buff.GetWithOffset(512, 16544); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_459_ascendc[n * 512], input_data_ascendc[res_453 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_463_ascendc = vecin_buff.GetWithOffset(512, 0); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_463_ascendc[n * 512], input_data_ascendc[res_455 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_461_ascendc = vecin_buff.GetWithOffset(512, 8352); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_461_ascendc[n * 512], input_data_ascendc[res_454 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_464_ascendc = vecin_buff.GetWithOffset(512, 14496); + Muls(input_ub_464_ascendc, input_ub_nw_457_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::SetFlag(0); + 
AscendC::WaitFlag(0); + ub_465_ascendc = vecin_buff.GetWithOffset(512, 18592); + Muls(ub_465_ascendc, input_ub_ne_459_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + ub_467_ascendc = vecin_buff.GetWithOffset(512, 4288); + Muls(ub_467_ascendc, input_ub_se_463_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + ub_466_ascendc = vecin_buff.GetWithOffset(512, 10400); + Muls(ub_466_ascendc, input_ub_sw_461_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + input_ub_468_ascendc = vecin_buff.GetWithOffset(512, 0); + Add(input_ub_468_ascendc, input_ub_464_ascendc, ub_465_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_469_ascendc = vecin_buff.GetWithOffset(512, 12480); + Add(input_ub_469_ascendc, input_ub_468_ascendc, ub_466_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_470_ascendc = vecin_buff.GetWithOffset(512, 0); + Add(input_ub_470_ascendc, input_ub_469_ascendc, ub_467_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_470_ascendc = vecout_buff.GetWithOffset(512, 0); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_451 + n * 512], input_ub_470_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + for (int dynamic_loop_var_6_473 = 0; dynamic_loop_var_6_473 < c_res_chunk_num_60; dynamic_loop_var_6_473 += 1) { + res_476 = n_409 * in_stride_0_61; + res_477 = y_n_92 * in_stride_1_55; + res_481 = n_409 * in_stride_0_61; + res_482 = y_n_92 * in_stride_1_55; + res_491 = n_409 * in_stride_0_61; + res_492 = y_s_93 * in_stride_1_55; + res_472 = x_e_91 * in_stride_2_26; + res_478 = x_w_90 * in_stride_2_26; + res_483 = x_e_91 * in_stride_2_26; + res_498 = res_476 + res_477; + res_499 = res_481 + res_482; + res_501 
= res_491 + res_492; + res_503 = res_498 + res_478; + res_504 = res_499 + res_483; + res_506 = res_501 + res_472; + res_475 = dynamic_loop_var_6_473 * (int32_t)8; + res_480 = dynamic_loop_var_6_473 * (int32_t)8; + res_485 = dynamic_loop_var_6_473 * (int32_t)8; + res_508 = res_503 + c_res_start_44; + res_509 = res_504 + c_res_start_44; + res_511 = res_506 + c_res_start_44; + res_513 = res_508 + res_480; + res_514 = res_509 + res_485; + res_516 = res_511 + res_475; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_519_ascendc = vecin_buff.GetWithOffset(8, 2432); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_519_ascendc[n * 8], input_data_ascendc[res_513 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + res_486 = n_409 * in_stride_0_61; + res_487 = y_s_93 * in_stride_1_55; + res_488 = x_w_90 * in_stride_2_26; + res_500 = res_486 + res_487; + res_505 = res_500 + res_488; + res_490 = dynamic_loop_var_6_473 * (int32_t)8; + res_510 = res_505 + c_res_start_44; + res_515 = res_510 + res_490; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_493 = n_409 * out_stride_0_62; + res_494 = h_414 * out_stride_1_56; + res_495 = w_413 * out_stride_2_30; + res_502 = res_493 + res_494; + res_507 = res_502 + res_495; + res_497 = dynamic_loop_var_6_473 * (int32_t)8; + res_512 = res_507 + c_res_start_44; + res_517 = res_512 + res_497; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_521_ascendc = vecin_buff.GetWithOffset(8, 2528); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_521_ascendc[n * 8], input_data_ascendc[res_514 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_525_ascendc = vecin_buff.GetWithOffset(8, 2112); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_525_ascendc[n * 8], input_data_ascendc[res_516 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_523_ascendc = 
vecin_buff.GetWithOffset(8, 2336); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_523_ascendc[n * 8], input_data_ascendc[res_515 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_526_ascendc = vecin_buff.GetWithOffset(8, 2400); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(input_ub_526_ascendc, input_ub_nw_519_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + ub_527_ascendc = vecin_buff.GetWithOffset(8, 2560); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_527_ascendc, input_ub_ne_521_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ub_529_ascendc = vecin_buff.GetWithOffset(8, 2272); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_529_ascendc, input_ub_se_525_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + ub_528_ascendc = vecin_buff.GetWithOffset(8, 2368); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_528_ascendc, input_ub_sw_523_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_530_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_530_ascendc, input_ub_526_ascendc, ub_527_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); 
+ AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_531_ascendc = vecin_buff.GetWithOffset(8, 2400); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_531_ascendc, input_ub_530_ascendc, ub_528_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_532_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_532_ascendc, input_ub_531_ascendc, ub_529_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_532_ascendc = vecout_buff.GetWithOffset(8, 2112); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_517 + n * 8], input_ub_532_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + AscendC::PipeBarrier(); + } + } + } + __aicore__ inline void CopyIn(int32_t index) { + } + __aicore__ inline void Compute() { + } + __aicore__ inline void CopyOut(int32_t index) { + } + + private: + GM_ADDR input; + GM_ADDR grid; + GM_ADDR output; + float h_in; + float w_in; + int32_t h_out; + int32_t w_out; + int32_t n_in; + int32_t c_in; +}; + +extern "C" __global__ __aicore__ void grid_sample(GM_ADDR input, GM_ADDR grid, GM_ADDR out, + GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tilingData, tiling); + KernelGridSample op; + op.Init(input, grid, out, &tilingData); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void grid_sample_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *input, uint8_t *grid, + uint8_t *output, uint8_t *workspace, uint8_t *tiling) { + 
grid_sample<<>>(input, grid, output, workspace, tiling); +} +#endif +// NOLINTEND diff --git a/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py b/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py new file mode 100644 index 0000000000000000000000000000000000000000..6996e3b5003967199814ec51f818e4fa56ceecc8 --- /dev/null +++ b/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py @@ -0,0 +1,404 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# pylint: skip-file +import numpy as np +import os +import sys +from swft.core import * +from swft.api import * + +OP_NAME = 'grid_sample' +os.system(f"mkdir -p temp/{OP_NAME}") +os.system(f"mkdir -p temp/{OP_NAME}/input") +os.system(f"mkdir -p temp/{OP_NAME}/output") + +N = 1 +C = 1536 # C should be 8 aligned +H_IN = 24 +W_IN = 24 +H_OUT = 10764 +W_OUT= 1 + +PER_LOOP_PIXEL_NUM = 8 +BLOCK_LOOP_NUM = 8 # 32Bytes / sizeof(type) +CORE_NUM = 8 +ALIGN_CORNERS = False +PADDING_MODE = 1 +CHANNEL_LOOP_CHUNK = 512 # C should be 512 aligned +CHANNEL_LOOP_CHUNK2 = 8 # chunk size for residual +INTERPOLATION_MODE = 0 + +# Numpy Test +# =============================================================================== +def bilinear_interpolate(input_tensor, x, y, H, W): + """双线性插值""" + x1 = int(np.floor(x)) + y1 = int(np.floor(y)) + x2 = min(x1 + 1, W - 1) + y2 = min(y1 + 1, H - 1) + + # 边界检查 + if x1 < 0 or x1 >= W or y1 < 0 or y1 >= H: + return np.zeros(input_tensor.shape[0], dtype=np.float32) + + # 计算权重 + wx = x - x1 + wy = y - y1 + + # 双线性插值 + result = (input_tensor[y1, x1, :] * (1 - wx) * (1 - wy) + + input_tensor[y1, x2, :] * wx * (1 - wy) + + input_tensor[y2, x1, :] * (1 - wx) * wy + + input_tensor[y2, x2, :] * wx * wy) + + return result + +def gen_data(): + np.random.seed(0) + # input_data = np.arange(N * H_IN * W_IN * C).reshape(N, H_IN, W_IN, C).astype(np.float32) + # input_data = np.full([N, H_IN, W_IN, C], 100.0, dtype=np.float32) + # input_data = np.repeat(np.arange(1, N * H_IN * W_IN + 1).reshape(N, H_IN, W_IN, 1), C, axis=3).astype(np.float32) + input_data = np.random.uniform(-1, 1, [N, H_IN, W_IN, C]).astype(np.float32) + grid_data = np.random.uniform(-1, 1, [N, H_OUT, W_OUT, 2]).astype(np.float32) + output_data = np.zeros([N, H_OUT, W_OUT, C]).astype(np.float32) + + for n in range(N): + for h in range(H_OUT): + for w in range(W_OUT): + # 获取归一化坐标 + x_norm = grid_data[n, h, w, 0] + y_norm 
= grid_data[n, h, w, 1] + + # 映射到input坐标空间 + if ALIGN_CORNERS: + x = (x_norm + 1) * (W_IN - 1) / 2 + y = (y_norm + 1) * (H_IN - 1) / 2 + else: + x = (x_norm + 1) * W_IN / 2 - 0.5 + y = (y_norm + 1) * H_IN / 2 - 0.5 + + # 边界处理 + if PADDING_MODE == 0: # zeros + if x < 0 or x >= W_IN or y < 0 or y >= H_IN: + output_data[n, h, w, :] = 0 + continue + elif PADDING_MODE == 1: # border + x = np.clip(x, 0, W_IN - 1) + y = np.clip(y, 0, H_IN - 1) + + # 插值采样 + if INTERPOLATION_MODE == 0: # bilinear + output_data[n, h, w, :] = bilinear_interpolate( + input_data[n], x, y, H_IN, W_IN) + elif INTERPOLATION_MODE == 1: # nearest + x_idx = int(x) + y_idx = int(y) + x_idx = np.clip(x_idx, 0, W_IN - 1) + y_idx = np.clip(y_idx, 0, H_IN - 1) + output_data[n, h, w, :] = input_data[n, y_idx, x_idx, :] + + h_in = np.array([H_IN], dtype=np.float32) + w_in = np.array([W_IN], dtype=np.float32) + h_out = np.array([H_OUT], dtype=np.int32) + w_out = np.array([W_OUT], dtype=np.int32) + n_in = np.array([N], dtype=np.int32) + c_in = np.array([C], dtype=np.int32) + + # 保存测试数据 + h_in.tofile(f"./temp/{OP_NAME}/input/h_in.bin") + w_in.tofile(f"./temp/{OP_NAME}/input/w_in.bin") + h_out.tofile(f"./temp/{OP_NAME}/input/h_out.bin") + w_out.tofile(f"./temp/{OP_NAME}/input/w_out.bin") + n_in.tofile(f"./temp/{OP_NAME}/input/n_in.bin") + c_in.tofile(f"./temp/{OP_NAME}/input/c_in.bin") + input_data.tofile(f"./temp/{OP_NAME}/input/input_data.bin") + grid_data.tofile(f"./temp/{OP_NAME}/input/grid_data.bin") + output_data.tofile(f"./temp/{OP_NAME}/output/output_golden.bin") + +# OP Impl +# =============================================================================== + + +@sub_kernel(core_num=CORE_NUM) +def grid_sample(input_data, grid_data, output_data, h_in, w_in, h_out, w_out, n_in, c_in): + block_idx = get_block_idx() + + # calc number of output this rank need to compute + total_outputs = n_in * h_out * w_out + # 将total_outputs按照 Block_Loop_NUM 分组 + block_num = total_outputs // BLOCK_LOOP_NUM + # 多余的 outputs 
+ output_res = total_outputs % BLOCK_LOOP_NUM + percore_blocks = Scalar("INT32", 0) + percore_blocks = (block_num + CORE_NUM - 1) // CORE_NUM + pixel_num = Scalar("INT32", 0) + if (block_idx + 1) * percore_blocks < block_num: + pixel_num = percore_blocks * BLOCK_LOOP_NUM + elif block_idx * percore_blocks < block_num: + pixel_num = (block_num - block_idx * percore_blocks) * BLOCK_LOOP_NUM + output_res + else: + pixel_num = Scalar("INT32", 0) + start_idx = block_idx * percore_blocks * BLOCK_LOOP_NUM + if (block_idx == 0) and pixel_num < 1: + pixel_num = total_outputs + + max_pixel_num = PER_LOOP_PIXEL_NUM # 32Byte aligned + tile_num = pixel_num // max_pixel_num + res_num = pixel_num % max_pixel_num + + c_chunk_num = c_in // CHANNEL_LOOP_CHUNK + c_res_num = c_in % CHANNEL_LOOP_CHUNK + + # C should be 8 aligned + c_res_chunk_num = (c_res_num + CHANNEL_LOOP_CHUNK2 - 1) // CHANNEL_LOOP_CHUNK2 + c_res_start = c_chunk_num * CHANNEL_LOOP_CHUNK + + in_stride_2 = Scalar("INT32", 0) + in_stride_2 = c_in + in_stride_1 = w_in.astype("INT32") * in_stride_2 + in_stride_0 = h_in.astype("INT32") * in_stride_1 + + out_stride_2 = Scalar("INT32", 0) + out_stride_2 = c_in + out_stride_1 = w_out * out_stride_2 + out_stride_0 = h_out * out_stride_1 + + # consts + const1 = w_in * 0.5 + const2 = h_in * 0.5 + const3 = const1 - 0.5 + const4 = const2 - 0.5 + const5 = w_in - 1 + const6 = h_in - 1 + const7 = const5.astype("INT32") + const8 = const6.astype("INT32") + for i in dynamic_loop(tile_num): + idx = start_idx + i * max_pixel_num + xys_ub = slice_to_ub(grid_data, [idx, 0], [max_pixel_num, 2]) + xys_ub_trans = transpose(xys_ub, [1, 0]) + xs_ub = slice_to_ub(xys_ub_trans, [0, 0], [1, max_pixel_num]) + ys_ub = slice_to_ub(xys_ub_trans, [1, 0], [1, max_pixel_num]) + + # align corners is false + xs_ub = vmuls(xs_ub, const1) + ys_ub = vmuls(ys_ub, const2) + xs_ub = vadds(xs_ub, const3) + ys_ub = vadds(ys_ub, const4) + + # border clip + xs_ub = vmaxs(xs_ub, Scalar("FP32", 0)) + ys_ub = 
vmaxs(ys_ub, Scalar("FP32", 0)) + xs_ub = vmins(xs_ub, const5) + ys_ub = vmins(ys_ub, const6) + + # bilinear interpolation + xs_ub_w_int = vconv(xs_ub, "INT32", "z") + xs_ub_w = vconv(xs_ub_w_int, "FP32") + xs_ub_e_int = vadds(xs_ub_w_int, Scalar("INT32", 1)) + xs_ub_e_int = vmins(xs_ub_e_int, const7) + xs_ub_e = vadds(xs_ub_w, Scalar("FP32", 1)) + + ys_ub_n_int = vconv(ys_ub, "INT32", "z") + ys_ub_n = vconv(ys_ub_n_int, "FP32") + ys_ub_s_int = vadds(ys_ub_n_int, Scalar("INT32", 1)) + ys_ub_s_int = vmins(ys_ub_s_int, const8) + ys_ub_s = vadds(ys_ub_n, Scalar("FP32", 1)) + + w_weights = vsub(xs_ub_e, xs_ub) + e_weights = vsub(xs_ub, xs_ub_w) + n_weights = vsub(ys_ub_s, ys_ub) + s_weights = vsub(ys_ub, ys_ub_n) + + nw_weights = vmul(w_weights, n_weights) + ne_weights = vmul(e_weights, n_weights) + sw_weights = vmul(w_weights, s_weights) + se_weights = vmul(e_weights, s_weights) + + # n/e may be out of range, but weight will be zero + for j in dynamic_loop(max_pixel_num): + inner_idx = (idx + j) + hw = inner_idx % (h_out * w_out) + w = hw % w_out + h = hw // w_out + n = inner_idx // (h_out * w_out) + + x_w = move_to_scalar(xs_ub_w_int[0, j]) + x_e = move_to_scalar(xs_ub_e_int[0, j]) + y_n = move_to_scalar(ys_ub_n_int[0, j]) + y_s = move_to_scalar(ys_ub_s_int[0, j]) + + nw_weight = move_to_scalar(nw_weights[0, j]) + ne_weight = move_to_scalar(ne_weights[0, j]) + sw_weight = move_to_scalar(sw_weights[0, j]) + se_weight = move_to_scalar(se_weights[0, j]) + + + for idx_c in dynamic_loop(c_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_se = 
slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + for idx_c in dynamic_loop(c_res_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_se = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + # for residual + if res_num > 0: + idx_ = start_idx + pixel_num - res_num + xys_ub_ = slice_to_ub(grid_data, [idx_, 0], [max_pixel_num, 2]) + xys_ub_trans_ = transpose(xys_ub_, [1, 0]) + xs_ub_ = slice_to_ub(xys_ub_trans_, [0, 0], [1, max_pixel_num]) + ys_ub_ = slice_to_ub(xys_ub_trans_, [1, 0], [1, max_pixel_num]) + + # align corners is false + xs_ub_ = vmuls(xs_ub_, 
const1) + ys_ub_ = vmuls(ys_ub_, const2) + xs_ub_ = vadds(xs_ub_, const3) + ys_ub_ = vadds(ys_ub_, const4) + + # border clip + xs_ub_ = vmaxs(xs_ub_, Scalar("FP32", 0)) + ys_ub_ = vmaxs(ys_ub_, Scalar("FP32", 0)) + xs_ub_ = vmins(xs_ub_, const5) + ys_ub_ = vmins(ys_ub_, const6) + + # bilinear interpolation + xs_ub_w_int_ = vconv(xs_ub_, "INT32", "z") + xs_ub_w_ = vconv(xs_ub_w_int_, "FP32") + xs_ub_e_int_ = vadds(xs_ub_w_int_, Scalar("INT32", 1)) + xs_ub_e_int_ = vmins(xs_ub_e_int_, const7) + xs_ub_e_ = vadds(xs_ub_w_, Scalar("FP32", 1)) + + ys_ub_n_int_ = vconv(ys_ub_, "INT32", "z") + ys_ub_n_ = vconv(ys_ub_n_int_, "FP32") + ys_ub_s_int_ = vadds(ys_ub_n_int_, Scalar("INT32", 1)) + ys_ub_s_int_ = vmins(ys_ub_s_int_, const8) + ys_ub_s_ = vadds(ys_ub_n_, Scalar("FP32", 1)) + + w_weight_ = vsub(xs_ub_e_, xs_ub_) + e_weight_ = vsub(xs_ub_, xs_ub_w_) + n_weight_ = vsub(ys_ub_s_, ys_ub_) + s_weight_ = vsub(ys_ub_, ys_ub_n_) + + nw_weight_ = vmul(w_weight_, n_weight_) + ne_weight_ = vmul(e_weight_, n_weight_) + sw_weight_ = vmul(w_weight_, s_weight_) + se_weight_ = vmul(e_weight_, s_weight_) + + # n/e may be out of range, but weight will be zero + + for j in dynamic_loop(res_num): + inner_idx = (idx_ + j) + hw = inner_idx % (h_out * w_out) + w = hw % w_out + h = hw // w_out + n = inner_idx // (h_out * w_out) + + x_w = move_to_scalar(xs_ub_w_int_[0, j]) + x_e = move_to_scalar(xs_ub_e_int_[0, j]) + y_n = move_to_scalar(ys_ub_n_int_[0, j]) + y_s = move_to_scalar(ys_ub_s_int_[0, j]) + + nw_weight = move_to_scalar(nw_weight_[0, j]) + ne_weight = move_to_scalar(ne_weight_[0, j]) + sw_weight = move_to_scalar(sw_weight_[0, j]) + se_weight = move_to_scalar(se_weight_[0, j]) + + for idx_c in dynamic_loop(c_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + idx_c * 
CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_se = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + for idx_c in dynamic_loop(c_res_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_se = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + +if __name__ == "__main__": + set_context("310P") + gen_data() + input_data = Tensor("GM", "FP32", [N * H_IN * W_IN * C], "ND", False) + grid_data = Tensor("GM", 
"FP32", [N * H_OUT * W_OUT, 2], "ND", False) + output_data = Tensor("GM", "FP32", [N * H_OUT * W_OUT * C], "ND", False) + h_in = Scalar("FP32") + w_in = Scalar("FP32") + h_out = Scalar("INT32") + w_out = Scalar("INT32") + n_in = Scalar("INT32") + c_in = Scalar("INT32") + compile_func(grid_sample, globals())(input_data, grid_data, output_data, h_in, w_in, h_out, w_out, n_in, c_in) + # compile_func(grid_sample_floor, globals())(input_data, grid_data, output_data, h_in, w_in, h_out, w_out, n_in, c_in) + compile_kernel(f"./temp/{OP_NAME}/{OP_NAME}.cce", OP_NAME) + exec_kernel(OP_NAME, locals(), prefix_path="temp", inputs=[ + 'input_data', 'grid_data', 'h_in', 'w_in', 'h_out', 'w_out', 'n_in', 'c_in'], + outputs=['output_data'], device_id=1) + script_dir = os.path.dirname(os.path.abspath(__file__)) + return_code_1 = os.system( + f'python3 {script_dir}/../verify_result.py ./temp/{OP_NAME}/output/output_data_actual.bin ./temp/{OP_NAME}/output/output_golden.bin float32 4e-2 1e-2 4e-3') + sys.exit(return_code_1 >> 8) diff --git a/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc b/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc index 550281daeaf7f52660c567920e6c7ac9ea2fa714..f30de74da83a7e81ee2c9839eb4345bfdf6bac6d 100644 --- a/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc +++ b/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc @@ -17,11 +17,11 @@ #include "ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.h" namespace { - constexpr size_t kMemAlignSize{512}; - constexpr size_t kAlignBytes{32}; - size_t GetAlignedSize(size_t size) { - return (size + kMemAlignSize + kAlignBytes -1) / kMemAlignSize * kMemAlignSize; - } +constexpr size_t kMemAlignSize{512}; +constexpr size_t kAlignBytes{32}; +size_t GetAlignedSize(size_t size) { + return (size + kMemAlignSize + kAlignBytes -1) / kMemAlignSize * kMemAlignSize; +} } namespace ms_custom_ops { @@ -68,6 +68,7 @@ void 
InternalPyboostRunner::GetOrCreateKernel(const TensorList &inputs, const Te size_t InternalPyboostRunner::CalcWorkspace() { MS_EXCEPTION_IF_NULL(internal_op_); auto workspace_size_list = internal_op_->GetWorkspaceSize(); + // all workspace will be aligned, like graph_mode return std::accumulate(workspace_size_list.begin(), workspace_size_list.end(), 0, [](size_t acc, size_t size){ return acc + GetAlignedSize(size); diff --git a/tests/st/test_custom_grid_sample.py b/tests/st/test_custom_grid_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..a2d36d893c8b47b0c1168216b406f5880b9a5126 --- /dev/null +++ b/tests/st/test_custom_grid_sample.py @@ -0,0 +1,236 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ============================================================================
"""Tests for ms_custom_ops.grid_sample (NHWC layout, Atlas inference series)."""

import logging
import numpy as np
import pytest

import mindspore as ms
from mindspore import Tensor, nn, context, Profiler
from mindspore.profiler import ProfilerLevel, ProfilerActivity, AicoreMetrics
import ms_custom_ops

def bilinear_interpolate(input_tensor, x, y, h, w):
    """Bilinearly interpolate one pixel vector from an (h, w, c) image.

    Args:
        input_tensor: numpy array of shape (h, w, c).
        x, y: continuous sample coordinates in input pixel space.
        h, w: spatial extents of ``input_tensor``.

    Returns:
        A length-c float32 vector of interpolated channel values, or a zero
        vector when the top-left neighbour falls outside the image.
    """
    x1 = int(np.floor(x))
    y1 = int(np.floor(y))
    # Clamp the bottom/right neighbours so the 2x2 patch stays in bounds.
    x2 = min(x1 + 1, w - 1)
    y2 = min(y1 + 1, h - 1)

    # Out-of-range guard. Only reachable with zeros padding; border padding
    # clips coordinates before this helper is called.
    if x1 < 0 or x1 >= w or y1 < 0 or y1 >= h:
        # Fixed: return a channel-sized vector; shape[0] was the height h.
        return np.zeros(input_tensor.shape[-1], dtype=np.float32)

    # Interpolation weights (fractional parts of the coordinates).
    wx = x - x1
    wy = y - y1

    # Weighted sum of the four neighbouring pixels.
    result = (input_tensor[y1, x1, :] * (1 - wx) * (1 - wy) +
              input_tensor[y1, x2, :] * wx * (1 - wy) +
              input_tensor[y2, x1, :] * (1 - wx) * wy +
              input_tensor[y2, x2, :] * wx * wy)
    return result


def golden_grid_sample(input_data, grid_data, align_corners, padding_mode, interpolation_mode):
    """NumPy reference implementation of NHWC grid_sample.

    Args:
        input_data: (n, h_in, w_in, c) float array.
        grid_data: (n, h_out, w_out, 2) array of normalized (x, y) in [-1, 1].
        align_corners: bool, coordinate mapping convention.
        padding_mode: 0 = zeros, 1 = border.
        interpolation_mode: 0 = bilinear, 1 = nearest.

    Returns:
        (n, h_out, w_out, c) float32 array of sampled values.
    """
    batch, h_in, w_in, c = input_data.shape
    _, h_out, w_out, _ = grid_data.shape
    output_data = np.zeros([batch, h_out, w_out, c]).astype(np.float32)

    # Renamed loop indices (was `for n in range(n)`) to avoid shadowing.
    for ni in range(batch):
        for oh in range(h_out):
            for ow in range(w_out):
                # Normalized grid coordinates for this output pixel.
                x_norm = grid_data[ni, oh, ow, 0]
                y_norm = grid_data[ni, oh, ow, 1]

                # Map from [-1, 1] to input pixel space.
                if align_corners:
                    x = (x_norm + 1) * (w_in - 1) / 2
                    y = (y_norm + 1) * (h_in - 1) / 2
                else:
                    x = (x_norm + 1) * w_in / 2 - 0.5
                    y = (y_norm + 1) * h_in / 2 - 0.5

                # Boundary handling.
                if padding_mode == 0:  # zeros
                    if x < 0 or x >= w_in or y < 0 or y >= h_in:
                        output_data[ni, oh, ow, :] = 0
                        continue
                elif padding_mode == 1:  # border
                    x = np.clip(x, 0, w_in - 1)
                    y = np.clip(y, 0, h_in - 1)

                # Interpolated sampling.
                if interpolation_mode == 0:  # bilinear
                    output_data[ni, oh, ow, :] = bilinear_interpolate(
                        input_data[ni], x, y, h_in, w_in)
                elif interpolation_mode == 1:  # nearest
                    x_idx = int(x)
                    y_idx = int(y)
                    x_idx = np.clip(x_idx, 0, w_in - 1)
                    y_idx = np.clip(y_idx, 0, h_in - 1)
                    output_data[ni, oh, ow, :] = input_data[ni, y_idx, x_idx, :]
    return output_data


class GridSampleNet(nn.Cell):
    """Thin Cell wrapper that forwards to ms_custom_ops.grid_sample."""

    def construct(self, input_data, grid, interpolation_mode, padding_mode, align_corners):
        return ms_custom_ops.grid_sample(input_data, grid, interpolation_mode, padding_mode, align_corners)


def run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                    n, c, h_in, w_in, h_out, w_out, is_profiler=False):
    """Run the custom op on random data and compare with the NumPy golden.

    When ``is_profiler`` is True, profile 50 runs instead of checking results.
    """
    np_input = np.random.random((n, h_in, w_in, c)).astype(input_dtype)
    np_grid = np.random.uniform(-1, 1, (n, h_out, w_out, 2)).astype(grid_dtype)
    input_data = Tensor(np_input)
    grid = Tensor(np_grid)
    np_output = golden_grid_sample(np_input, np_grid, align_corners, padding_mode, interpolation_mode)
    if is_profiler is False:
        output_data = net(input_data, grid, interpolation_mode, padding_mode, align_corners)
        np.testing.assert_allclose(np_output, output_data.asnumpy(), rtol=1e-4, atol=1e-4, err_msg=" grid_sample ")
    else:
        profiler = Profiler(profiler_level=ProfilerLevel.Level2,
                            activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
                            aic_metrics=AicoreMetrics.AiCoreNone)
        for _ in range(50):
            # Result is intentionally discarded; only timing is collected.
            net(input_data, grid, interpolation_mode, padding_mode, align_corners)
        profiler.analyse()


@pytest.mark.level0
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [0])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("c", [1536, 8])
@pytest.mark.parametrize("h_in,w_in", [(24, 24)])
@pytest.mark.parametrize("h_out,w_out", [(1024, 1)])
def test_grid_sample(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                     n, c, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test correctness of grid_sample operator.
    Expectation: should pass for all testcases.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"})
    net = GridSampleNet()
    run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode,
                    interpolation_mode, n, c, h_in, w_in, h_out, w_out)


@pytest.mark.level1
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [0])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("c", [17])
@pytest.mark.parametrize("h_in,w_in", [(24, 24)])
@pytest.mark.parametrize("h_out,w_out", [(1024, 1)])
def test_grid_sample_align_c(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                             n, c, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test align c.
    Expectation: Unsupported c correctly rejected.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"})
    net = GridSampleNet()
    with pytest.raises(ValueError, match="c should be aligned with"):
        run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                        n, c, h_in, w_in, h_out, w_out)
    logging.info(
        "Unsupported c correctly rejected: c=%s", c
    )


@pytest.mark.level1
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [1])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("c", [16])
@pytest.mark.parametrize("h_in,w_in", [(2, 2)])
@pytest.mark.parametrize("h_out,w_out", [(2, 2)])
def test_grid_sample_unsupported_mode(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode,
                                      interpolation_mode, n, c, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test unsupported mode.
    Expectation: Unsupported mode correctly rejected.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0"})
    net = GridSampleNet()
    with pytest.raises(ValueError, match="mode only supports"):
        run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                        n, c, h_in, w_in, h_out, w_out)
    logging.info("Unsupported mode correctly rejected")


@pytest.mark.level1
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [0])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("h_in,w_in", [(24, 24)])
@pytest.mark.parametrize("h_out,w_out", [(1024, 1)])
def test_grid_sample_3d_input(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode,
                              interpolation_mode, n, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test 3d input.
    Expectation: 3d input correctly rejected.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"})
    net = GridSampleNet()
    with pytest.raises(RuntimeError, match="dim of inputs should"):
        # Deliberately rank-3 input (missing the channel axis).
        np_input = np.random.random((n, h_in, w_in)).astype(input_dtype)
        np_grid = np.random.uniform(-1, 1, (n, h_out, w_out, 2)).astype(grid_dtype)
        input_data = Tensor(np_input)
        grid = Tensor(np_grid)
        output_data = net(input_data, grid, interpolation_mode, padding_mode, align_corners)
        # Force sync so the kernel-side shape check actually fires.
        _ = output_data.asnumpy()
    logging.info("3d input correctly rejected")