diff --git a/.jenkins/check/config/filter_cppcheck.txt b/.jenkins/check/config/filter_cppcheck.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6d50c4e70093b55c9895e3befac9dc28885ffc8a 100644 --- a/.jenkins/check/config/filter_cppcheck.txt +++ b/.jenkins/check/config/filter_cppcheck.txt @@ -0,0 +1 @@ +"ms_custom_ops/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp" "" \ No newline at end of file diff --git a/.jenkins/check/config/whitelizard.txt b/.jenkins/check/config/whitelizard.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..41aca3a66b1a683979c5029323fcbd26ce6aded7 100644 --- a/.jenkins/check/config/whitelizard.txt +++ b/.jenkins/check/config/whitelizard.txt @@ -0,0 +1,2 @@ +ms_custom_ops/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp:KernelGridSample::grid_sample_0_kernel +ms_custom_ops/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py:grid_sample \ No newline at end of file diff --git a/docs/map_from_buildin_to_custom.md b/docs/map_from_buildin_to_custom.md index 40cbe07ed6586afbb8185d5890188d8e41c7edc8..4cae6cfc4b4145b1aa7cb1d395644b79a1436b75 100644 --- a/docs/map_from_buildin_to_custom.md +++ b/docs/map_from_buildin_to_custom.md @@ -3,6 +3,7 @@ | ops.auto_generate.format_cast | [ms_custom_ops.trans_data](../ops/c_api/trans_data/trans_data.md) | 两者均进行ND和NZ的相互转换。format_cast依赖ms内置白名单;trans_data不使用白名单但有使用约束,详见trans_data文档。同一权重只能选用其中一种进行转换,建议网络中统一采用同一种算子,两者不兼容。 | | ops.auto_generate.mla | [ms_custom_ops.mla](../ops/c_api/mla/mla_doc.md) | 新增了input_format参数,用于指定输入参数的format | | ops.auto_generate.reshape_and_cache| [ms_custom_ops.reshape_and_cache](../ops/c_api/reshape_and_cache/reshape_and_cache.md) | 新增cache_mode参数,用于指定Atlas 训练系列cache的format是ND还是NZ; 新增head_num,cache_mode为NZ的时候必须提供,辅助计算。| +| ops.grid_sample | [ms_custom_ops.grid_sample](../ops/ascendc/grid_sample/grid_sample.md) | 支持(N, H, W, C)格式的输入,仅支持 Atlas 推理系列 | | ops.moe_init_routing_v2 | [ms_custom_ops.moe_init_routing_v2](../ops/c_api/moe_init_routing_v2/moe_init_routing_v2.md) | 接口一致,仅支持 
Atlas 推理系列 | |ops.auto_generate.moe_gating_group_topk|[ms_custom_ops.moe_gating_group_topk](../ops/c_api/moe_gating_group_topk/moe_gating_group_topk.md) |接口一致| | ops.auto_generate.group_topk | [ms_custom_ops.group_topk](../ops/c_api/group_topk/group_topk_doc.md) | 副作用接口,将不再支持左值输出 | diff --git a/docs/op_list.md b/docs/op_list.md index a7a8ed889a0ce6dd41f062b9b8da95c785f4b360..9156d505660042fe6887ad0b37d5f2f00ca1a6bf 100644 --- a/docs/op_list.md +++ b/docs/op_list.md @@ -6,7 +6,8 @@ 1. [apply_rotary_pos_emb_atb](../ops/c_api/apply_rotary_pos_emb_atb/apply_rotary_pos_emb_atb.md) 1. [fa_update](../ops/c_api/fa_update/fa_update_doc.md) 1. [flash_attention_encoder](../ops/c_api/flash_attention_encoder/flash_attention_encoder.md) -1. [fused_add_topk_div](../ops/c_api/fused_add_topk_div/fused_add_topk_div.md) +1. [fused_add_topk_div](../ops/c_api/fused_add_topk_div/fused_add_topk_div_doc.md) +1. [grid_sample](../ops/ascendc/grid_sample/grid_sample.md) 1. [group_topk](../ops/c_api/group_topk/group_topk_doc.md) 1. [grouped_matmul](../ops/c_api/grouped_matmul/grouped_matmul_doc.md) 1. [grouped_matmul_w4](../ops/c_api/grouped_matmul_w4/grouped_matmul_w4_doc.md) diff --git a/ops/ascendc/grid_sample/grid_sample.cc b/ops/ascendc/grid_sample/grid_sample.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea923786110bfaa562cf531ed4537136daf8e3fe --- /dev/null +++ b/ops/ascendc/grid_sample/grid_sample.cc @@ -0,0 +1,196 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// ============================================================================= +// GRAPH MODE IMPLEMENTATION +// ============================================================================= + +#include +#include +#include +#include +#include + +#include "ops/framework/aclnn/graphmode/aclnn_kernel_mod.h" +#include "ops/framework/utils.h" +#include "mindspore/include/custom_op_api.h" + +namespace ms_custom_ops { +constexpr size_t kCAligned = 8; +enum class GridSampleInputIndex : size_t { + kGridSampleInputIndex = 0, + kGridSampleGridIndex, + kGridSampleModeIndex, + kGridSamplePaddingModeIndex, + kGridSampleAlignCornersIndex, + kGridSampleInputsNum, +}; + +static void GridSampleCheckInputsShape(const std::string &op_name, const std::vector &input_shape, + const std::vector &grid_shape, int64_t mode, + int64_t padding_mode, bool align_corners) { + auto any_dim = abstract::Shape::kShapeDimAny; + MS_CHECK_VALUE(mode == 0, + CheckAndConvertUtils::FormatCommMsg(op_name, ", mode only supports 0, but got ", mode)); + MS_CHECK_VALUE( + padding_mode == 1, + CheckAndConvertUtils::FormatCommMsg(op_name, ", padding_mode only supports 1, but got ", padding_mode)); + MS_CHECK_VALUE(align_corners == false, + CheckAndConvertUtils::FormatCommMsg(op_name, ", align_corners only supports false, but got ", + align_corners)); + if (input_shape.size() != kDim4 || grid_shape.size() != kDim4) { + MS_LOG(EXCEPTION) << op_name << ", the dim of inputs should be input.dim=grid.dim=4, " + << "but got input.dim=" << input_shape.size() + << ", grid.dim=" << grid_shape.size(); + } + auto input_n = input_shape[kIndex0]; + auto grid_n = grid_shape[kIndex0]; + MS_CHECK_VALUE(input_n == grid_n || input_n == any_dim || grid_n == any_dim, + CheckAndConvertUtils::FormatCommMsg( + op_name, ", input.dim0 should be equal grid.dim0,", + " but got input.shape=", input_shape, ", grid.shape=", 
grid_shape)); + MS_CHECK_VALUE( + grid_shape[kIndex3] == kDim2 || grid_shape[kIndex3] == any_dim, + CheckAndConvertUtils::FormatCommMsg( + op_name, ", grid.shape should be equals (N, H_OUT, W_OUT, 2), but got grid.shape=", grid_shape)); + auto c_in = input_shape[kIndex3]; + MS_CHECK_VALUE( + c_in % kCAligned == 0, + CheckAndConvertUtils::FormatCommMsg( + op_name, ", c should be aligned with ", kCAligned, ", but got c=", c_in)); +} + +static void GridSampleCheckInputsType(const std::string &op_name, const TypeId &input_dtype, + const TypeId &grid_dtype) { + if (input_dtype != kNumberTypeFloat32) { + MS_LOG(EXCEPTION) << op_name << ", the dtype of 'input' should be " << TypeIdToString(kNumberTypeFloat32) + << ", but got input.dtype=" << TypeIdToString(input_dtype); + } + if (grid_dtype != kNumberTypeFloat32) { + MS_LOG(EXCEPTION) << op_name << ", the dtype of 'grid' should be " << TypeIdToString(kNumberTypeFloat32) + << ", but got grid.dtype=" << TypeIdToString(grid_dtype); + } +} + +class OPS_API GridSampleOpFuncImpl : public OpFuncImpl { + public: + ShapeArray InferShape(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const override { + MS_EXCEPTION_IF_NULL(primitive); + if (input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)] + ->IsDynamicRank() || + input_infos[static_cast(GridSampleInputIndex::kGridSampleGridIndex)] + ->IsDynamicRank()) { + return {input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)]->GetShape()}; + } + auto op_name = primitive->name(); + auto input_shape = + input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)]->GetShape(); + auto grid_shape = + input_infos[static_cast(GridSampleInputIndex::kGridSampleGridIndex)]->GetShape(); + auto mode = + input_infos[static_cast(GridSampleInputIndex::kGridSampleModeIndex)] + ->GetScalarValueWithCheck(); + auto padding_mode = input_infos[static_cast(GridSampleInputIndex::kGridSamplePaddingModeIndex)] + ->GetScalarValueWithCheck(); + auto 
align_corners = input_infos[static_cast(GridSampleInputIndex::kGridSampleAlignCornersIndex)] + ->GetScalarValueWithCheck(); + GridSampleCheckInputsShape(op_name, input_shape, grid_shape, mode, padding_mode, align_corners); + auto output_shape = grid_shape; + output_shape[kIndex3] = input_shape[kIndex3]; + return {output_shape}; + } + + std::vector InferType(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const override { + auto op_name = primitive->name(); + auto input_dtype = + input_infos[static_cast(GridSampleInputIndex::kGridSampleInputIndex)]->GetType(); + auto grid_dtype = + input_infos[static_cast(GridSampleInputIndex::kGridSampleGridIndex)]->GetType(); + GridSampleCheckInputsType(op_name, input_dtype, grid_dtype); + return {input_dtype}; + } + + bool GeneralInferRegistered() const override { return true; } +}; + +class GridSample : public AclnnCustomKernelMod { + public: + GridSample() : AclnnCustomKernelMod(std::move("aclnnGridSample")) {} + ~GridSample() = default; + + bool Launch(const std::vector &inputs, const std::vector &workspace, + const std::vector &outputs, void *stream_ptr) override { + MS_EXCEPTION_IF_NULL(stream_ptr); + RunOp( + stream_ptr, workspace, inputs[static_cast(GridSampleInputIndex::kGridSampleInputIndex)], + inputs[static_cast(GridSampleInputIndex::kGridSampleGridIndex)], + outputs[0]); + return true; + } + void GetWorkSpaceInfo(const std::vector &inputs, + const std::vector &outputs) override { + GetWorkspaceForResize(inputs[static_cast(GridSampleInputIndex::kGridSampleInputIndex)], + inputs[static_cast(GridSampleInputIndex::kGridSampleGridIndex)], + outputs[0]); + return; + } + + private: + DEFINE_GET_WORKSPACE_FOR_RESIZE(); +}; +} // namespace ms_custom_ops + +REG_GRAPH_MODE_OP(grid_sample, ms_custom_ops::GridSampleOpFuncImpl, + ms_custom_ops::GridSample); + +// ============================================================================= +// PYBOOST MODE IMPLEMENTATION +// 
============================================================================= + +namespace ms_custom_ops { +constexpr size_t kGridSampleOutputNum = 1; + +std::vector grid_sample_custom(const ms::Tensor &input, const ms::Tensor &grid, + const int64_t mode, const int64_t padding_mode, + const bool align_corners) { + std::string op_name = "grid_sample"; + auto runner = std::make_shared(op_name); + MS_EXCEPTION_IF_NULL(runner); + GridSampleCheckInputsShape(op_name, input.shape(), grid.shape(), mode, padding_mode, align_corners); + GridSampleCheckInputsType(op_name, input.data_type(), grid.data_type()); + auto input_shape = input.shape(); + auto grid_shape = grid.shape(); + auto output_shape = grid_shape; + output_shape[kIndex3] = input_shape[kIndex3]; + auto out = ms::Tensor(input.data_type(), output_shape); + runner->SetLaunchFunc(LAUNCH_ACLNN_FUNC(aclnnGridSample, input, grid, out)); + runner->Run({input, grid}, {out}); + return {out}; +} +} // namespace ms_custom_ops + +auto pyboost_grid_sample(const ms::Tensor &input, const ms::Tensor &grid, const int64_t mode, + const int64_t padding_mode, const bool align_corners) { + return ms::pynative::PyboostRunner::Call( + ms_custom_ops::grid_sample_custom, input, grid, mode, padding_mode, align_corners); +} + +MS_CUSTOM_OPS_EXTENSION_MODULE(m) { + m.def("grid_sample", + &pyboost_grid_sample, + "GridSample", pybind11::arg("input"), pybind11::arg("grid"), pybind11::arg("mode") = 0, + pybind11::arg("padding_mode") = 1, pybind11::arg("align_corners") = false); +} diff --git a/ops/ascendc/grid_sample/grid_sample.md b/ops/ascendc/grid_sample/grid_sample.md new file mode 100644 index 0000000000000000000000000000000000000000..aa5d8a0674969f97d1db81a2c9fb0acfacc5a809 --- /dev/null +++ b/ops/ascendc/grid_sample/grid_sample.md @@ -0,0 +1,60 @@ +# grid_sample算子 + +## 描述 + +提供一个输入tensor以及一个对应的grid网格,然后根据grid中每个位置提供的坐标信息,将input中对应位置的像素值填充到网格指定的位置,得到最终的输出。 + +## 输入参数 + +| Name | DType | Shape | Optional | Inplace | Format | Description |
+|---------------------|-----------------|----------------------------------------|----------|---------|--------|--------------------------------------------------------| +| input | Tensor(float32) | 4维[n, h_in, w_in, c] | No | No | ND | 输入Tensor,shape为(n, h_in, w_in, c) | +| grid | Tensor(float32) | 4维[n, h_out, w_out, 2] | No | No | ND | 用于采样的网格 | +| mode | Int | No | Yes | No | int | 插值模式,只支持0, 表示 "bilinear",每个输出像素是最接近的四个输入像素的加权平均值,使用双线性插值计算。
默认0 | +| padding_mode | Int | No | Yes | No | int | padding_mode, 只支持 1,表示"border", 对越界位置用边界值填充。 默认 1 | +| align_corners | Bool | No | Yes | No | Bool | 表示设定特征图坐标与特征值的对应方式,设定为true时,特征值位于像素中心。设定为false时,特征值位于像素的角点。只支持false。默认false。 | + +## 输出参数 + +| Name | DType | Shape | Description | +|------|-------|-------|-------------| +| output | Tensor(float32) | [n, h_out, w_out, c] | 采样得到的输出tensor,数据类型float32 | + +## 支持产品 + +- Atlas 推理系列产品 + +## 特殊说明 + +1. 当前仅支持(n,h,w,c)格式的输入,不支持(n,c,h,w)格式。 +2. input的最后一维(c)需要是8的倍数 + +## 使用示例 + +### 基本使用示例 + +```python + +import mindspore as ms +import numpy as np +import ms_custom_ops +from mindspore import context, Tensor + +ms.set_context(device_target="Ascend", mode=context.GRAPH_MODE) +ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + +n_in = 1 +c_in = 512 +h_in = 24 +w_in = 24 +h_out = 64 +w_out = 1 +interpolation_mode = 0 +padding_mode = 1 +align_corners = False +np_input = np.random.random((n_in, h_in, w_in, c_in)).astype(np.float32) +np_grid = np.random.uniform(-1, 1, (n_in, h_out, w_out, 2)).astype(np.float32) +input_data = Tensor(np_input) +grid = Tensor(np_grid) +output_data = ms_custom_ops.grid_sample(input_data, grid, interpolation_mode, padding_mode, align_corners) +``` diff --git a/ops/ascendc/grid_sample/grid_sample_op.yaml b/ops/ascendc/grid_sample/grid_sample_op.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40b2fe07d78a54755d126dd6bf06e812a9da809e --- /dev/null +++ b/ops/ascendc/grid_sample/grid_sample_op.yaml @@ -0,0 +1,19 @@ +#operator grid_sample +grid_sample: + args: + input: + dtype: tensor + grid: + dtype: tensor + mode: + dtype: int + default: 0 + padding_mode: + dtype: int + default: 1 + align_corners: + dtype: bool + default: False + returns: + output: + dtype: tensor diff --git a/ops/ascendc/grid_sample/op_host/grid_sample.cpp b/ops/ascendc/grid_sample/op_host/grid_sample.cpp new file mode 100644 index
0000000000000000000000000000000000000000..92acd0beba6d25123fc2f55d81e060fd454a74ec --- /dev/null +++ b/ops/ascendc/grid_sample/op_host/grid_sample.cpp @@ -0,0 +1,107 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "grid_sample_tiling.h" // NOLINT(build/include_subdir) +#include "register/op_def_registry.h" +#include "graph/utils/type_utils.h" +#include "tiling/platform/platform_ascendc.h" + +namespace { +constexpr uint32_t kIndex0 = 0; +constexpr uint32_t kIndex1 = 1; +constexpr uint32_t kDim0{0}; +constexpr uint32_t kDim1{1}; +constexpr uint32_t kDim2{2}; +constexpr uint32_t kDim3{3}; +} + +namespace optiling { +constexpr uint32_t kCoreNum = 8; +static ge::graphStatus GridSampleTiling(gert::TilingContext *context) { + GridSampleTilingData tiling; + uint32_t tiling_key{0}; + + auto input_shape = context->GetInputShape(0)->GetOriginShape(); + auto grid_shape = context->GetInputShape(1)->GetOriginShape(); + + int32_t n_in = input_shape.GetDim(kDim0); + float h_in = static_cast(input_shape.GetDim(kDim1)); + float w_in = static_cast(input_shape.GetDim(kDim2)); + int32_t c_in = input_shape.GetDim(kDim3); + + int32_t h_out = grid_shape.GetDim(kDim1); + int32_t w_out = grid_shape.GetDim(kDim2); + + tiling.set_h_in(h_in); + tiling.set_w_in(w_in); + tiling.set_h_out(h_out); + tiling.set_w_out(w_out); + tiling.set_n_in(n_in); + tiling.set_c_in(c_in); + + context->SetBlockDim(kCoreNum); + 
context->SetTilingKey(tiling_key); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + size_t *currentWorkspace = context->GetWorkspaceSizes(kDim1); + currentWorkspace[kIndex0] = 0; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static ge::graphStatus GridSampleInferShape(gert::InferShapeContext *context) { + const gert::Shape *input_shape = context->GetInputShape(kIndex0); + const gert::Shape *grid_shape = context->GetInputShape(kIndex1); + gert::Shape *out_shape = context->GetOutputShape(kIndex0); + *out_shape = *grid_shape; + (*out_shape)[kDim3] = (*input_shape)[kDim3]; + return GRAPH_SUCCESS; +} +static graphStatus GridSampleInferDataType(gert::InferDataTypeContext *context) { + const auto inputDataType = context->GetInputDataType(kIndex0); + context->SetOutputDataType(kIndex0, inputDataType); + return ge::GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class GridSample : public OpDef { + public: + explicit GridSample(const char *name) : OpDef(name) { + this->Input("input") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}) + .AutoContiguous(); + this->Input("grid") + .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .ParamType(REQUIRED) + .AutoContiguous(); + this->Output("output") + .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT}) + .ParamType(REQUIRED) + .Format({ge::FORMAT_ND}); + + this->SetInferShape(ge::GridSampleInferShape).SetInferDataType(ge::GridSampleInferDataType); + this->AICore().SetTiling(optiling::GridSampleTiling).AddConfig("ascend310p"); + } +}; +OP_ADD(GridSample); +} // namespace ops diff --git a/ops/ascendc/grid_sample/op_host/grid_sample_tiling.h b/ops/ascendc/grid_sample/op_host/grid_sample_tiling.h new file mode 100644 index 
0000000000000000000000000000000000000000..f49ddb6ce61ea79b7570f960ba8f7065ea9c2847 --- /dev/null +++ b/ops/ascendc/grid_sample/op_host/grid_sample_tiling.h @@ -0,0 +1,32 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef GRID_SAMPLE_TILING_H +#define GRID_SAMPLE_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(GridSampleTilingData) +TILING_DATA_FIELD_DEF(float, h_in); +TILING_DATA_FIELD_DEF(float, w_in); +TILING_DATA_FIELD_DEF(int32_t, h_out); +TILING_DATA_FIELD_DEF(int32_t, w_out); +TILING_DATA_FIELD_DEF(int32_t, n_in); +TILING_DATA_FIELD_DEF(int32_t, c_in); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(GridSample, GridSampleTilingData) +} // namespace optiling +#endif // GRID_SAMPLE_TILING_H diff --git a/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp b/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2637726adb7ecb50804a359efe1f611f8d34ddda --- /dev/null +++ b/ops/ascendc/grid_sample/op_kernel/grid_sample.cpp @@ -0,0 +1,1452 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "kernel_operator.h" // NOLINT(build/include_subdir) + +// NOLINTBEGIN +class KernelGridSample { + public: + __aicore__ inline KernelGridSample() {} + __aicore__ inline void Init(GM_ADDR input, GM_ADDR grid, GM_ADDR output, GridSampleTilingData *tiling) { + this->input = input; + this->grid = grid; + this->output = output; + this->h_in = tiling->h_in; + this->w_in = tiling->w_in; + this->h_out = tiling->h_out; + this->w_out = tiling->w_out; + this->n_in = tiling->n_in; + this->c_in = tiling->c_in; + } + __aicore__ inline void Process() { + grid_sample_0_kernel(this->input, this->grid, this->output, this->h_in, this->w_in, + this->h_out, this->w_out, this->n_in, this->c_in); + } + + private: + __aicore__ void grid_sample_0_kernel(__gm__ uint8_t *__restrict__ input_data, __gm__ uint8_t *__restrict__ grid_data, __gm__ uint8_t *__restrict__ output_data, float h_in, float w_in, int32_t h_out, int32_t w_out, int32_t n_in, int32_t c_in) { + AscendC::TPipe pipe; + AscendC::TBuf vecin_buff; + pipe.InitBuffer(vecin_buff, 262144); + AscendC::TBuf gm_buff; + pipe.InitBuffer(gm_buff, 4294967295); + AscendC::TBuf vecout_buff; + pipe.InitBuffer(vecout_buff, 262144); + AscendC::TBuf co2_buff; + pipe.InitBuffer(co2_buff, 262144); + AscendC::TBuf co1_buff; + pipe.InitBuffer(co1_buff, 262144); + AscendC::TBuf a2_buff; + pipe.InitBuffer(a2_buff, 65536); + AscendC::TBuf a1_buff; + pipe.InitBuffer(a1_buff, 1048576); + AscendC::TBuf b2_buff; + pipe.InitBuffer(b2_buff, 65536); + AscendC::TBuf b1_buff; + pipe.InitBuffer(b1_buff, 1048576); + 
AscendC::DataCopyEnhancedParams enhanceParams_relu({AscendC::BlockMode::BLOCK_MODE_MATRIX,AscendC::DeqScale::DEQ_NONE, 0, 0, true, pad_t::PAD_NONE, 0}); + AscendC::DataCopyEnhancedParams enhanceParams({AscendC::BlockMode::BLOCK_MODE_MATRIX,AscendC::DeqScale::DEQ_NONE, 0, 0, false, pad_t::PAD_NONE, 0}); + uint8_t padList[4] = {0, 0, 0, 0}; + AscendC::GlobalTensorgrid_data_ascendc; + AscendC::GlobalTensorinput_data_ascendc; + AscendC::GlobalTensoroutput_data_ascendc; + int32_t res_2; + int32_t total_outputs_34; + int32_t block_num_50; + int32_t res_58; + int32_t res_64; + int32_t block_idx_3; + int32_t percore_blocks_14; + int32_t res_66; + int32_t res_36; + int32_t res_67; + int32_t pixel_num_17; + int32_t output_res_52; + bool res_69; + int32_t res_73; + int32_t res_74; + bool res_75; + int32_t res_76; + int32_t res_77; + int32_t res_79; + int32_t res_80; + int32_t c_res_num_23; + int32_t in_stride_2_26; + int32_t out_stride_2_30; + bool res_38; + bool res_83; + int32_t res_18; + int32_t res_46; + bool res_84; + float const2_6; + float const5_9; + float const6_11; + int32_t c_chunk_num_21; + int32_t res_27; + float const1_32; + int32_t res_68; + int32_t res_54; + int32_t in_stride_1_55; + int32_t out_stride_1_56; + float const4_40; + int32_t const7_41; + int32_t const8_42; + int32_t c_res_start_44; + float const3_48; + int32_t start_idx_71; + int32_t c_res_chunk_num_60; + int32_t in_stride_0_61; + int32_t out_stride_0_62; + int32_t tile_num_86; + int32_t x_w_90; + int32_t x_e_91; + int32_t y_n_92; + int32_t y_s_93; + float nw_weight_94; + float ne_weight_95; + float sw_weight_96; + float se_weight_97; + int32_t res_102; + int32_t idx_103; + AscendC::LocalTensorxys_ub_107_ascendc; + AscendC::LocalTensorxys_ub_110_ascendc; + AscendC::LocalTensorub_108_ascendc; + AscendC::LocalTensorub_109_ascendc; + AscendC::LocalTensorxys_ub_trans_111_ascendc; + AscendC::LocalTensorxs_ub_116_ascendc; + AscendC::LocalTensorys_ub_121_ascendc; + AscendC::LocalTensorxs_ub_122_ascendc; + 
AscendC::LocalTensorys_ub_123_ascendc; + AscendC::LocalTensorxs_ub_124_ascendc; + AscendC::LocalTensorys_ub_125_ascendc; + AscendC::LocalTensorxs_ub_127_ascendc; + AscendC::LocalTensorys_ub_129_ascendc; + AscendC::LocalTensorxs_ub_130_ascendc; + AscendC::LocalTensorys_ub_131_ascendc; + AscendC::LocalTensorxs_ub_w_int_132_ascendc; + AscendC::LocalTensorys_ub_n_int_133_ascendc; + AscendC::LocalTensorxs_ub_w_134_ascendc; + AscendC::LocalTensorys_ub_n_137_ascendc; + AscendC::LocalTensorys_ub_s_int_139_ascendc; + AscendC::LocalTensorxs_ub_e_int_136_ascendc; + AscendC::LocalTensorxs_ub_e_141_ascendc; + AscendC::LocalTensore_weights_142_ascendc; + AscendC::LocalTensorys_ub_s_145_ascendc; + AscendC::LocalTensorys_ub_s_int_147_ascendc; + AscendC::LocalTensors_weights_146_ascendc; + AscendC::LocalTensorxs_ub_e_int_143_ascendc; + AscendC::LocalTensorn_weights_148_ascendc; + AscendC::LocalTensorw_weights_150_ascendc; + AscendC::LocalTensorse_weights_149_ascendc; + AscendC::LocalTensorne_weights_151_ascendc; + AscendC::LocalTensornw_weights_152_ascendc; + AscendC::LocalTensorsw_weights_153_ascendc; + AscendC::LocalTensorub_158_ascendc; + int32_t inner_idx_163; + int32_t res_164; + int32_t res_165; + int32_t hw_190; + int32_t n_191; + int32_t w_192; + int32_t h_193; + AscendC::LocalTensorub_162_ascendc; + AscendC::LocalTensorub_169_ascendc; + AscendC::LocalTensorub_173_ascendc; + AscendC::LocalTensorub_177_ascendc; + AscendC::LocalTensorub_181_ascendc; + AscendC::LocalTensorub_185_ascendc; + AscendC::LocalTensorub_189_ascendc; + __ubuf__ float * ub_158 = (__ubuf__ float *)((uintptr_t)(416)); + __ubuf__ int32_t * ub_162 = (__ubuf__ int32_t *)((uintptr_t)(128)); + __ubuf__ int32_t * ub_169 = (__ubuf__ int32_t *)((uintptr_t)(192)); + __ubuf__ int32_t * ub_173 = (__ubuf__ int32_t *)((uintptr_t)(224)); + __ubuf__ int32_t * ub_177 = (__ubuf__ int32_t *)((uintptr_t)(288)); + __ubuf__ float * ub_181 = (__ubuf__ float *)((uintptr_t)(2144)); + __ubuf__ float * ub_185 = (__ubuf__ float 
*)((uintptr_t)(256)); + __ubuf__ float * ub_189 = (__ubuf__ float *)((uintptr_t)(384)); + int32_t res_194; + int32_t res_195; + int32_t res_197; + int32_t res_198; + int32_t res_208; + int32_t res_209; + int32_t res_196; + int32_t res_199; + int32_t res_210; + int32_t res_220; + int32_t res_221; + int32_t res_223; + int32_t res_202; + int32_t res_212; + int32_t res_219; + int32_t res_225; + int32_t res_226; + int32_t res_228; + int32_t res_230; + int32_t res_231; + int32_t res_233; + AscendC::LocalTensorinput_ub_nw_236_ascendc; + int32_t res_203; + int32_t res_204; + int32_t res_205; + int32_t res_222; + int32_t res_207; + int32_t res_227; + int32_t res_232; + int32_t res_213; + int32_t res_214; + int32_t res_215; + int32_t res_224; + int32_t res_217; + int32_t res_229; + int32_t res_234; + AscendC::LocalTensorinput_ub_ne_238_ascendc; + AscendC::LocalTensorinput_ub_se_242_ascendc; + AscendC::LocalTensorinput_ub_sw_240_ascendc; + AscendC::LocalTensorinput_ub_243_ascendc; + AscendC::LocalTensorub_244_ascendc; + AscendC::LocalTensorub_246_ascendc; + AscendC::LocalTensorub_245_ascendc; + AscendC::LocalTensorinput_ub_247_ascendc; + AscendC::LocalTensorinput_ub_248_ascendc; + AscendC::LocalTensorinput_ub_249_ascendc; + int32_t res_251; + int32_t res_252; + int32_t res_257; + int32_t res_258; + int32_t res_267; + int32_t res_268; + int32_t res_253; + int32_t res_259; + int32_t res_269; + int32_t res_277; + int32_t res_278; + int32_t res_280; + int32_t res_282; + int32_t res_283; + int32_t res_285; + int32_t res_256; + int32_t res_261; + int32_t res_271; + int32_t res_287; + int32_t res_288; + int32_t res_290; + int32_t res_292; + int32_t res_293; + int32_t res_295; + AscendC::LocalTensorinput_ub_nw_298_ascendc; + int32_t res_262; + int32_t res_263; + int32_t res_264; + int32_t res_279; + int32_t res_284; + int32_t res_266; + int32_t res_289; + int32_t res_294; + int32_t res_272; + int32_t res_273; + int32_t res_274; + int32_t res_281; + int32_t res_286; + int32_t res_276; 
+ int32_t res_291; + int32_t res_296; + AscendC::LocalTensorinput_ub_ne_300_ascendc; + AscendC::LocalTensorinput_ub_se_304_ascendc; + AscendC::LocalTensorinput_ub_sw_302_ascendc; + AscendC::LocalTensorinput_ub_305_ascendc; + AscendC::LocalTensorub_306_ascendc; + AscendC::LocalTensorub_308_ascendc; + AscendC::LocalTensorub_307_ascendc; + AscendC::LocalTensorinput_ub_309_ascendc; + AscendC::LocalTensorinput_ub_310_ascendc; + AscendC::LocalTensorinput_ub_311_ascendc; + int32_t res_num_88; + bool res_314; + int32_t res_315; + int32_t idx__316; + AscendC::LocalTensorxys_ub__320_ascendc; + AscendC::LocalTensorxys_ub__323_ascendc; + AscendC::LocalTensorub_321_ascendc; + AscendC::LocalTensorub_322_ascendc; + AscendC::LocalTensorxys_ub_trans__324_ascendc; + AscendC::LocalTensorxs_ub__329_ascendc; + AscendC::LocalTensorys_ub__334_ascendc; + AscendC::LocalTensorxs_ub__335_ascendc; + AscendC::LocalTensorys_ub__336_ascendc; + AscendC::LocalTensorxs_ub__337_ascendc; + AscendC::LocalTensorys_ub__338_ascendc; + AscendC::LocalTensorxs_ub__340_ascendc; + AscendC::LocalTensorys_ub__342_ascendc; + AscendC::LocalTensorxs_ub__343_ascendc; + AscendC::LocalTensorys_ub__344_ascendc; + AscendC::LocalTensorxs_ub_w_int__345_ascendc; + AscendC::LocalTensorys_ub_n_int__346_ascendc; + AscendC::LocalTensorxs_ub_w__347_ascendc; + AscendC::LocalTensorys_ub_n__350_ascendc; + AscendC::LocalTensorys_ub_s_int__352_ascendc; + AscendC::LocalTensorxs_ub_e_int__349_ascendc; + AscendC::LocalTensorxs_ub_e__354_ascendc; + AscendC::LocalTensore_weight__355_ascendc; + AscendC::LocalTensorys_ub_s__358_ascendc; + AscendC::LocalTensorys_ub_s_int__360_ascendc; + AscendC::LocalTensors_weight__359_ascendc; + AscendC::LocalTensorxs_ub_e_int__356_ascendc; + AscendC::LocalTensorn_weight__361_ascendc; + AscendC::LocalTensorw_weight__363_ascendc; + AscendC::LocalTensorse_weight__362_ascendc; + AscendC::LocalTensorne_weight__364_ascendc; + AscendC::LocalTensorsw_weight__365_ascendc; + 
AscendC::LocalTensornw_weight__366_ascendc; + AscendC::LocalTensorub_371_ascendc; + int32_t inner_idx_388; + int32_t res_389; + int32_t res_390; + int32_t hw_408; + int32_t n_409; + int32_t w_413; + int32_t h_414; + AscendC::LocalTensorub_375_ascendc; + AscendC::LocalTensorub_379_ascendc; + AscendC::LocalTensorub_383_ascendc; + AscendC::LocalTensorub_387_ascendc; + AscendC::LocalTensorub_394_ascendc; + AscendC::LocalTensorub_398_ascendc; + AscendC::LocalTensorub_402_ascendc; + float dst_403; + __ubuf__ float * ub_371 = (__ubuf__ float *)((uintptr_t)(160)); + float dst_404; + __ubuf__ float * ub_375 = (__ubuf__ float *)((uintptr_t)(320)); + float dst_405; + __ubuf__ float * ub_379 = (__ubuf__ float *)((uintptr_t)(352)); + float dst_406; + __ubuf__ float * ub_383 = (__ubuf__ float *)((uintptr_t)(2112)); + int32_t dst_407; + __ubuf__ int32_t * ub_387 = (__ubuf__ int32_t *)((uintptr_t)(0)); + int32_t dst_410; + __ubuf__ int32_t * ub_394 = (__ubuf__ int32_t *)((uintptr_t)(32)); + int32_t dst_411; + __ubuf__ int32_t * ub_398 = (__ubuf__ int32_t *)((uintptr_t)(64)); + int32_t dst_412; + __ubuf__ int32_t * ub_402 = (__ubuf__ int32_t *)((uintptr_t)(96)); + int32_t res_421; + int32_t res_422; + int32_t res_426; + int32_t res_427; + int32_t res_436; + int32_t res_437; + int32_t res_423; + int32_t res_428; + int32_t res_438; + int32_t res_442; + int32_t res_443; + int32_t res_445; + int32_t res_425; + int32_t res_430; + int32_t res_440; + int32_t res_447; + int32_t res_448; + int32_t res_450; + int32_t res_452; + int32_t res_453; + int32_t res_455; + AscendC::LocalTensorinput_ub_nw_457_ascendc; + int32_t res_431; + int32_t res_432; + int32_t res_433; + int32_t res_444; + int32_t res_435; + int32_t res_449; + int32_t res_454; + int32_t res_415; + int32_t res_416; + int32_t res_417; + int32_t res_441; + int32_t res_420; + int32_t res_446; + int32_t res_451; + AscendC::LocalTensorinput_ub_ne_459_ascendc; + AscendC::LocalTensorinput_ub_se_463_ascendc; + 
AscendC::LocalTensorinput_ub_sw_461_ascendc; + AscendC::LocalTensorinput_ub_464_ascendc; + AscendC::LocalTensorub_465_ascendc; + AscendC::LocalTensorub_467_ascendc; + AscendC::LocalTensorub_466_ascendc; + AscendC::LocalTensorinput_ub_468_ascendc; + AscendC::LocalTensorinput_ub_469_ascendc; + AscendC::LocalTensorinput_ub_470_ascendc; + int32_t res_476; + int32_t res_477; + int32_t res_481; + int32_t res_482; + int32_t res_491; + int32_t res_492; + int32_t res_472; + int32_t res_478; + int32_t res_483; + int32_t res_498; + int32_t res_499; + int32_t res_501; + int32_t res_503; + int32_t res_504; + int32_t res_506; + int32_t res_475; + int32_t res_480; + int32_t res_485; + int32_t res_508; + int32_t res_509; + int32_t res_511; + int32_t res_513; + int32_t res_514; + int32_t res_516; + AscendC::LocalTensorinput_ub_nw_519_ascendc; + int32_t res_486; + int32_t res_487; + int32_t res_488; + int32_t res_500; + int32_t res_505; + int32_t res_490; + int32_t res_510; + int32_t res_515; + int32_t res_493; + int32_t res_494; + int32_t res_495; + int32_t res_502; + int32_t res_507; + int32_t res_497; + int32_t res_512; + int32_t res_517; + AscendC::LocalTensorinput_ub_ne_521_ascendc; + AscendC::LocalTensorinput_ub_se_525_ascendc; + AscendC::LocalTensorinput_ub_sw_523_ascendc; + AscendC::LocalTensorinput_ub_526_ascendc; + AscendC::LocalTensorub_527_ascendc; + AscendC::LocalTensorub_529_ascendc; + AscendC::LocalTensorub_528_ascendc; + AscendC::LocalTensorinput_ub_530_ascendc; + AscendC::LocalTensorinput_ub_531_ascendc; + AscendC::LocalTensorinput_ub_532_ascendc; + res_2 = n_in * h_out; + total_outputs_34 = res_2 * w_out; + block_num_50 = total_outputs_34 / (int32_t)8; + res_58 = block_num_50 + (int32_t)8; + res_64 = res_58 - (int32_t)1; + block_idx_3 = (int32_t)block_idx; + percore_blocks_14 = (int32_t)0 + (int32_t)0; + res_66 = res_64 / (int32_t)8; + res_36 = block_idx_3 + (int32_t)1; + percore_blocks_14 = res_66; + res_67 = res_36 * percore_blocks_14; + pixel_num_17 = (int32_t)0 
+ (int32_t)0; + output_res_52 = int(total_outputs_34) % int((int32_t)8); + res_69 = res_67 < block_num_50; + AscendC::PipeBarrier(); + if (res_69) { + res_73 = percore_blocks_14 * (int32_t)8; + pixel_num_17 = res_73; + } else { + res_74 = block_idx_3 * percore_blocks_14; + res_75 = res_74 < block_num_50; + AscendC::PipeBarrier(); + if (res_75) { + res_76 = block_idx_3 * percore_blocks_14; + res_77 = block_num_50 - res_76; + res_79 = res_77 * (int32_t)8; + res_80 = res_79 + output_res_52; + pixel_num_17 = res_80; + } else { + pixel_num_17 = (int32_t)0; + } + } + c_res_num_23 = int(c_in) % int((int32_t)512); + in_stride_2_26 = (int32_t)0 + (int32_t)0; + out_stride_2_30 = (int32_t)0 + (int32_t)0; + res_38 = block_idx_3 == (int32_t)0; + AscendC::PipeBarrier(); + res_83 = pixel_num_17 < (int32_t)1; + res_18 = (int32_t)(w_in); + res_46 = c_res_num_23 + (int32_t)8; + in_stride_2_26 = c_in; + out_stride_2_30 = c_in; + res_84 = res_38 && res_83; + const2_6 = float(h_in) * float((float)0.5000000000); + const5_9 = float(w_in) - float((float)1.0000000000); + const6_11 = float(h_in) - float((float)1.0000000000); + c_chunk_num_21 = c_in / (int32_t)512; + res_27 = (int32_t)(h_in); + const1_32 = float(w_in) * float((float)0.5000000000); + res_68 = block_idx_3 * percore_blocks_14; + AscendC::PipeBarrier(); + res_54 = res_46 - (int32_t)1; + in_stride_1_55 = res_18 * in_stride_2_26; + out_stride_1_56 = w_out * out_stride_2_30; + if (res_84) { + pixel_num_17 = total_outputs_34; + } + AscendC::PipeBarrier(); + const4_40 = float(const2_6) - float((float)0.5000000000); + const7_41 = (int32_t)(const5_9); + const8_42 = (int32_t)(const6_11); + c_res_start_44 = c_chunk_num_21 * (int32_t)512; + const3_48 = float(const1_32) - float((float)0.5000000000); + start_idx_71 = res_68 * (int32_t)8; + c_res_chunk_num_60 = res_54 / (int32_t)8; + in_stride_0_61 = res_27 * in_stride_1_55; + out_stride_0_62 = h_out * out_stride_1_56; + tile_num_86 = pixel_num_17 / (int32_t)8; + AscendC::PipeBarrier(); + 
for (int dynamic_loop_var_0_100 = 0; dynamic_loop_var_0_100 < tile_num_86; dynamic_loop_var_0_100 += 1) { + res_102 = dynamic_loop_var_0_100 * (int32_t)8; + idx_103 = start_idx_71 + res_102; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub_107_ascendc = vecin_buff.GetWithOffset(16, 0); + grid_data_ascendc.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(grid_data)); + AscendC::DataCopy(xys_ub_107_ascendc, grid_data_ascendc[(idx_103) * 2], {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub_110_ascendc = vecout_buff.GetWithOffset(128, 2112); + ub_108_ascendc = vecin_buff.GetWithOffset(256, 64); + ub_109_ascendc = vecin_buff.GetWithOffset(256, 1088); + AscendC::DataCopy(xys_ub_110_ascendc, xys_ub_107_ascendc, {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_108_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), {1,2,1,16,AscendC::TransposeType::TRANSPOSE_NHWC2NCHW}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_109_ascendc.ReinterpretCast(), ub_108_ascendc.ReinterpretCast(), ub_108_ascendc.ReinterpretCast(), {1,16,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + pipe_barrier(PIPE_V); + AscendC::DataCopy(xys_ub_110_ascendc, ub_109_ascendc, {(uint16_t)(1), (uint16_t)(16), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); + xys_ub_trans_111_ascendc = vecin_buff.GetWithOffset(128, 0); + xys_ub_110_ascendc = vecin_buff.GetWithOffset(128, 2112); + AscendC::Transpose(xys_ub_trans_111_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), xys_ub_110_ascendc.ReinterpretCast(), {1,8,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + AscendC::PipeBarrier(); + AscendC::PipeBarrier(); + xs_ub_116_ascendc = vecout_buff.GetWithOffset(8, 2112); + AscendC::DataCopy(xs_ub_116_ascendc, xys_ub_trans_111_ascendc, {(uint16_t)(1), (uint16_t)(1), 
(uint16_t)(0), (uint16_t)(0)}); + ys_ub_121_ascendc = vecout_buff.GetWithOffset(8, 2144); + AscendC::DataCopy(ys_ub_121_ascendc, xys_ub_trans_111_ascendc[8], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::PipeBarrier(); + xs_ub_122_ascendc = vecin_buff.GetWithOffset(8, 0); + xs_ub_116_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(xs_ub_122_ascendc, xs_ub_116_ascendc, const1_32, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_123_ascendc = vecin_buff.GetWithOffset(8, 32); + ys_ub_121_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ys_ub_123_ascendc, ys_ub_121_ascendc, const2_6, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_124_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_124_ascendc, xs_ub_122_ascendc, const3_48, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_125_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_125_ascendc, ys_ub_123_ascendc, const4_40, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_127_ascendc = vecin_buff.GetWithOffset(8, 0); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(xs_ub_127_ascendc, xs_ub_124_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + 
AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_129_ascendc = vecin_buff.GetWithOffset(8, 32); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(ys_ub_129_ascendc, ys_ub_125_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_130_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub_130_ascendc, xs_ub_127_ascendc, const5_9, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_131_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub_131_ascendc, ys_ub_129_ascendc, const6_11, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w_int_132_ascendc = vecin_buff.GetWithOffset(8, 0); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w_int_132_ascendc, xs_ub_130_ascendc, AscendC::RoundMode::CAST_TRUNC, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n_int_133_ascendc = vecin_buff.GetWithOffset(8, 32); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n_int_133_ascendc, ys_ub_131_ascendc, AscendC::RoundMode::CAST_TRUNC, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w_134_ascendc = 
vecin_buff.GetWithOffset(8, 192); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w_134_ascendc, xs_ub_w_int_132_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n_137_ascendc = vecin_buff.GetWithOffset(8, 256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n_137_ascendc, ys_ub_n_int_133_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int_139_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s_int_139_ascendc, ys_ub_n_int_133_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + xs_ub_e_int_136_ascendc = vecin_buff.GetWithOffset(8, 64); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e_int_136_ascendc, xs_ub_w_int_132_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e_141_ascendc = vecin_buff.GetWithOffset(8, 320); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e_141_ascendc, xs_ub_w_134_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + e_weights_142_ascendc = vecin_buff.GetWithOffset(8, 224); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(e_weights_142_ascendc, xs_ub_130_ascendc, 
xs_ub_w_134_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_145_ascendc = vecin_buff.GetWithOffset(8, 352); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s_145_ascendc, ys_ub_n_137_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int_147_ascendc = vecin_buff.GetWithOffset(8, 160); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub_s_int_147_ascendc, ys_ub_s_int_139_ascendc, const8_42, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + s_weights_146_ascendc = vecin_buff.GetWithOffset(8, 288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(s_weights_146_ascendc, ys_ub_131_ascendc, ys_ub_n_137_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e_int_143_ascendc = vecin_buff.GetWithOffset(8, 96); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub_e_int_143_ascendc, xs_ub_e_int_136_ascendc, const7_41, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + n_weights_148_ascendc = vecin_buff.GetWithOffset(8, 192); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(n_weights_148_ascendc, ys_ub_s_145_ascendc, ys_ub_131_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 
18446744073709551615ULL); + w_weights_150_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(w_weights_150_ascendc, xs_ub_e_141_ascendc, xs_ub_130_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + se_weights_149_ascendc = vecin_buff.GetWithOffset(8, 64); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(se_weights_149_ascendc, e_weights_142_ascendc, s_weights_146_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ne_weights_151_ascendc = vecin_buff.GetWithOffset(8, 352); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(ne_weights_151_ascendc, e_weights_142_ascendc, n_weights_148_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + nw_weights_152_ascendc = vecin_buff.GetWithOffset(8, 320); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(nw_weights_152_ascendc, w_weights_150_ascendc, n_weights_148_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + sw_weights_153_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(sw_weights_153_ascendc, w_weights_150_ascendc, s_weights_146_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + for (int dynamic_loop_var_1_155 = 0; dynamic_loop_var_1_155 < 8; dynamic_loop_var_1_155 += 
1) { + ub_158_ascendc = vecout_buff.GetWithOffset(1, 416); + AscendC::DataCopy(ub_158_ascendc, se_weights_149_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + inner_idx_163 = idx_103 + dynamic_loop_var_1_155; + res_164 = h_out * w_out; + res_165 = h_out * w_out; + hw_190 = int(inner_idx_163) % int(res_164); + n_191 = inner_idx_163 / res_165; + w_192 = int(hw_190) % int(w_out); + h_193 = hw_190 / w_out; + ub_162_ascendc = vecout_buff.GetWithOffset(1, 128); + AscendC::DataCopy(ub_162_ascendc, xs_ub_w_int_132_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_169_ascendc = vecout_buff.GetWithOffset(1, 192); + AscendC::DataCopy(ub_169_ascendc, xs_ub_e_int_143_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_173_ascendc = vecout_buff.GetWithOffset(1, 224); + AscendC::DataCopy(ub_173_ascendc, ys_ub_n_int_133_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_177_ascendc = vecout_buff.GetWithOffset(1, 288); + AscendC::DataCopy(ub_177_ascendc, ys_ub_s_int_147_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_181_ascendc = vecout_buff.GetWithOffset(1, 2144); + AscendC::DataCopy(ub_181_ascendc, nw_weights_152_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_185_ascendc = vecout_buff.GetWithOffset(1, 256); + AscendC::DataCopy(ub_185_ascendc, ne_weights_151_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_189_ascendc = vecout_buff.GetWithOffset(1, 384); + AscendC::DataCopy(ub_189_ascendc, sw_weights_153_ascendc[(dynamic_loop_var_1_155)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + se_weight_97 = *(__ubuf__ float *)ub_158; + x_w_90 = *(__ubuf__ 
int32_t *)ub_162; + x_e_91 = *(__ubuf__ int32_t *)ub_169; + y_n_92 = *(__ubuf__ int32_t *)ub_173; + y_s_93 = *(__ubuf__ int32_t *)ub_177; + nw_weight_94 = *(__ubuf__ float *)ub_181; + ne_weight_95 = *(__ubuf__ float *)ub_185; + sw_weight_96 = *(__ubuf__ float *)ub_189; + AscendC::PipeBarrier(); + for (int dynamic_loop_var_2_200 = 0; dynamic_loop_var_2_200 < c_chunk_num_21; dynamic_loop_var_2_200 += 1) { + res_194 = n_191 * in_stride_0_61; + res_195 = y_n_92 * in_stride_1_55; + res_197 = n_191 * in_stride_0_61; + res_198 = y_n_92 * in_stride_1_55; + res_208 = n_191 * in_stride_0_61; + res_209 = y_s_93 * in_stride_1_55; + res_196 = x_w_90 * in_stride_2_26; + res_199 = x_e_91 * in_stride_2_26; + res_210 = x_e_91 * in_stride_2_26; + res_220 = res_194 + res_195; + res_221 = res_197 + res_198; + res_223 = res_208 + res_209; + res_202 = dynamic_loop_var_2_200 * (int32_t)512; + res_212 = dynamic_loop_var_2_200 * (int32_t)512; + res_219 = dynamic_loop_var_2_200 * (int32_t)512; + res_225 = res_220 + res_196; + res_226 = res_221 + res_199; + res_228 = res_223 + res_210; + res_230 = res_225 + res_219; + res_231 = res_226 + res_202; + res_233 = res_228 + res_212; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_236_ascendc = vecin_buff.GetWithOffset(512, 10336); + input_data_ascendc.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(input_data)); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_236_ascendc[n * 512], input_data_ascendc[res_230 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + res_203 = n_191 * in_stride_0_61; + res_204 = y_s_93 * in_stride_1_55; + res_205 = x_w_90 * in_stride_2_26; + res_222 = res_203 + res_204; + res_207 = dynamic_loop_var_2_200 * (int32_t)512; + res_227 = res_222 + res_205; + res_232 = res_227 + res_207; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_213 = n_191 * out_stride_0_62; + res_214 = h_193 * out_stride_1_56; + res_215 = w_192 * out_stride_2_30; + res_224 = res_213 + 
res_214; + res_217 = dynamic_loop_var_2_200 * (int32_t)512; + res_229 = res_224 + res_215; + res_234 = res_229 + res_217; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_238_ascendc = vecin_buff.GetWithOffset(512, 14432); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_238_ascendc[n * 512], input_data_ascendc[res_231 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_242_ascendc = vecin_buff.GetWithOffset(512, 2144); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_242_ascendc[n * 512], input_data_ascendc[res_233 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_240_ascendc = vecin_buff.GetWithOffset(512, 6240); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_240_ascendc[n * 512], input_data_ascendc[res_232 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_243_ascendc = vecin_buff.GetWithOffset(512, 12384); + Muls(input_ub_243_ascendc, input_ub_nw_236_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + ub_244_ascendc = vecin_buff.GetWithOffset(512, 16480); + Muls(ub_244_ascendc, input_ub_ne_238_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + ub_246_ascendc = vecin_buff.GetWithOffset(512, 4192); + Muls(ub_246_ascendc, input_ub_se_242_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + ub_245_ascendc = vecin_buff.GetWithOffset(512, 8288); + Muls(ub_245_ascendc, input_ub_sw_240_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + input_ub_247_ascendc = vecin_buff.GetWithOffset(512, 2144); + Add(input_ub_247_ascendc, input_ub_243_ascendc, ub_244_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_248_ascendc = 
vecin_buff.GetWithOffset(512, 10368); + Add(input_ub_248_ascendc, input_ub_247_ascendc, ub_245_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_249_ascendc = vecin_buff.GetWithOffset(512, 2144); + Add(input_ub_249_ascendc, input_ub_248_ascendc, ub_246_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + output_data_ascendc.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(output_data)); + input_ub_249_ascendc = vecout_buff.GetWithOffset(512, 2144); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_234 + n * 512], input_ub_249_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + for (int dynamic_loop_var_3_254 = 0; dynamic_loop_var_3_254 < c_res_chunk_num_60; dynamic_loop_var_3_254 += 1) { + res_251 = n_191 * in_stride_0_61; + res_252 = y_n_92 * in_stride_1_55; + res_257 = n_191 * in_stride_0_61; + res_258 = y_n_92 * in_stride_1_55; + res_267 = n_191 * in_stride_0_61; + res_268 = y_s_93 * in_stride_1_55; + res_253 = x_w_90 * in_stride_2_26; + res_259 = x_e_91 * in_stride_2_26; + res_269 = x_e_91 * in_stride_2_26; + res_277 = res_251 + res_252; + res_278 = res_257 + res_258; + res_280 = res_267 + res_268; + res_282 = res_277 + res_253; + res_283 = res_278 + res_259; + res_285 = res_280 + res_269; + res_256 = dynamic_loop_var_3_254 * (int32_t)8; + res_261 = dynamic_loop_var_3_254 * (int32_t)8; + res_271 = dynamic_loop_var_3_254 * (int32_t)8; + res_287 = res_282 + c_res_start_44; + res_288 = res_283 + c_res_start_44; + res_290 = res_285 + c_res_start_44; + res_292 = res_287 + res_256; + res_293 = res_288 + res_261; + res_295 = res_290 + res_271; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_298_ascendc = vecin_buff.GetWithOffset(8, 288); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_298_ascendc[n * 8], input_data_ascendc[res_292 + n * 
884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + res_262 = n_191 * in_stride_0_61; + res_263 = y_s_93 * in_stride_1_55; + res_264 = x_w_90 * in_stride_2_26; + res_279 = res_262 + res_263; + res_284 = res_279 + res_264; + res_266 = dynamic_loop_var_3_254 * (int32_t)8; + res_289 = res_284 + c_res_start_44; + res_294 = res_289 + res_266; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_272 = n_191 * out_stride_0_62; + res_273 = h_193 * out_stride_1_56; + res_274 = w_192 * out_stride_2_30; + res_281 = res_272 + res_273; + res_286 = res_281 + res_274; + res_276 = dynamic_loop_var_3_254 * (int32_t)8; + res_291 = res_286 + c_res_start_44; + res_296 = res_291 + res_276; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_300_ascendc = vecin_buff.GetWithOffset(8, 416); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_300_ascendc[n * 8], input_data_ascendc[res_293 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_304_ascendc = vecin_buff.GetWithOffset(8, 128); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_304_ascendc[n * 8], input_data_ascendc[res_295 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_302_ascendc = vecin_buff.GetWithOffset(8, 224); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_302_ascendc[n * 8], input_data_ascendc[res_294 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_305_ascendc = vecin_buff.GetWithOffset(8, 384); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(input_ub_305_ascendc, input_ub_nw_298_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + ub_306_ascendc = 
vecin_buff.GetWithOffset(8, 448); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_306_ascendc, input_ub_ne_300_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ub_308_ascendc = vecin_buff.GetWithOffset(8, 192); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_308_ascendc, input_ub_se_304_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + ub_307_ascendc = vecin_buff.GetWithOffset(8, 256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_307_ascendc, input_ub_sw_302_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_309_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_309_ascendc, input_ub_305_ascendc, ub_306_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_310_ascendc = vecin_buff.GetWithOffset(8, 384); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_310_ascendc, input_ub_309_ascendc, ub_307_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_311_ascendc = vecin_buff.GetWithOffset(8, 128); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_311_ascendc, input_ub_310_ascendc, 
ub_308_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_311_ascendc = vecout_buff.GetWithOffset(8, 128); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_296 + n * 8], input_ub_311_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + AscendC::PipeBarrier(); + } + AscendC::PipeBarrier(); + } + res_num_88 = int(pixel_num_17) % int((int32_t)8); + res_314 = res_num_88 > (int32_t)0; + AscendC::PipeBarrier(); + AscendC::PipeBarrier(); + if (res_314) { + res_315 = start_idx_71 + pixel_num_17; + idx__316 = res_315 - res_num_88; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub__320_ascendc = vecin_buff.GetWithOffset(16, 2144); + AscendC::DataCopy(xys_ub__320_ascendc, grid_data_ascendc[(idx__316) * 2], {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + xys_ub__323_ascendc = vecout_buff.GetWithOffset(128, 4256); + ub_321_ascendc = vecin_buff.GetWithOffset(256, 2208); + ub_322_ascendc = vecin_buff.GetWithOffset(256, 3232); + AscendC::DataCopy(xys_ub__323_ascendc, xys_ub__320_ascendc, {(uint16_t)(1), (uint16_t)(2), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_321_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), {1,2,1,16,AscendC::TransposeType::TRANSPOSE_NHWC2NCHW}); + pipe_barrier(PIPE_V); + AscendC::Transpose(ub_322_ascendc.ReinterpretCast(), ub_321_ascendc.ReinterpretCast(), ub_321_ascendc.ReinterpretCast(), {1,16,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + pipe_barrier(PIPE_V); + AscendC::DataCopy(xys_ub__323_ascendc, ub_322_ascendc, {(uint16_t)(1), (uint16_t)(16), (uint16_t)(0), (uint16_t)(0)}); + pipe_barrier(PIPE_V); + AscendC::PipeBarrier(); 
+ xys_ub_trans__324_ascendc = vecin_buff.GetWithOffset(128, 2144); + xys_ub__323_ascendc = vecin_buff.GetWithOffset(128, 4256); + AscendC::Transpose(xys_ub_trans__324_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), xys_ub__323_ascendc.ReinterpretCast(), {1,8,1,16,AscendC::TransposeType::TRANSPOSE_NCHW2NHWC}); + AscendC::PipeBarrier(); + AscendC::PipeBarrier(); + xs_ub__329_ascendc = vecout_buff.GetWithOffset(8, 4256); + AscendC::DataCopy(xs_ub__329_ascendc, xys_ub_trans__324_ascendc, {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ys_ub__334_ascendc = vecout_buff.GetWithOffset(8, 4288); + AscendC::DataCopy(ys_ub__334_ascendc, xys_ub_trans__324_ascendc[8], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::PipeBarrier(); + xs_ub__335_ascendc = vecin_buff.GetWithOffset(8, 2144); + xs_ub__329_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(xs_ub__335_ascendc, xs_ub__329_ascendc, const1_32, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__336_ascendc = vecin_buff.GetWithOffset(8, 2176); + ys_ub__334_ascendc = vecin_buff.GetWithOffset(8, 4288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ys_ub__336_ascendc, ys_ub__334_ascendc, const2_6, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub__337_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub__337_ascendc, xs_ub__335_ascendc, const3_48, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__338_ascendc = vecin_buff.GetWithOffset(8, 
4288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub__338_ascendc, ys_ub__336_ascendc, const4_40, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub__340_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(xs_ub__340_ascendc, xs_ub__337_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__342_ascendc = vecin_buff.GetWithOffset(8, 2176); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Maxs(ys_ub__342_ascendc, ys_ub__338_ascendc, (float)0.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub__343_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub__343_ascendc, xs_ub__340_ascendc, const5_9, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub__344_ascendc = vecin_buff.GetWithOffset(8, 4288); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub__344_ascendc, ys_ub__342_ascendc, const6_11, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w_int__345_ascendc = vecin_buff.GetWithOffset(8, 2144); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w_int__345_ascendc, xs_ub__343_ascendc, AscendC::RoundMode::CAST_TRUNC, 
AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n_int__346_ascendc = vecin_buff.GetWithOffset(8, 2176); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n_int__346_ascendc, ys_ub__344_ascendc, AscendC::RoundMode::CAST_TRUNC, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_w__347_ascendc = vecin_buff.GetWithOffset(8, 2336); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(xs_ub_w__347_ascendc, xs_ub_w_int__345_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_n__350_ascendc = vecin_buff.GetWithOffset(8, 2400); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + AscendC::Cast(ys_ub_n__350_ascendc, ys_ub_n_int__346_ascendc, AscendC::RoundMode::CAST_NONE, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int__352_ascendc = vecin_buff.GetWithOffset(8, 2272); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s_int__352_ascendc, ys_ub_n_int__346_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + xs_ub_e_int__349_ascendc = vecin_buff.GetWithOffset(8, 2208); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e_int__349_ascendc, xs_ub_w_int__345_ascendc, (int32_t)1, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + 
AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e__354_ascendc = vecin_buff.GetWithOffset(8, 2464); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(xs_ub_e__354_ascendc, xs_ub_w__347_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + e_weight__355_ascendc = vecin_buff.GetWithOffset(8, 2368); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(e_weight__355_ascendc, xs_ub__343_ascendc, xs_ub_w__347_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s__358_ascendc = vecin_buff.GetWithOffset(8, 2496); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Adds(ys_ub_s__358_ascendc, ys_ub_n__350_ascendc, (float)1.0000000000, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ys_ub_s_int__360_ascendc = vecin_buff.GetWithOffset(8, 2304); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(ys_ub_s_int__360_ascendc, ys_ub_s_int__352_ascendc, const8_42, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + s_weight__359_ascendc = vecin_buff.GetWithOffset(8, 2432); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(s_weight__359_ascendc, ys_ub__344_ascendc, ys_ub_n__350_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + xs_ub_e_int__356_ascendc = 
vecin_buff.GetWithOffset(8, 2240); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mins(xs_ub_e_int__356_ascendc, xs_ub_e_int__349_ascendc, const7_41, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + n_weight__361_ascendc = vecin_buff.GetWithOffset(8, 2336); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(n_weight__361_ascendc, ys_ub_s__358_ascendc, ys_ub__344_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + w_weight__363_ascendc = vecin_buff.GetWithOffset(8, 2272); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Sub(w_weight__363_ascendc, xs_ub_e__354_ascendc, xs_ub__343_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + se_weight__362_ascendc = vecin_buff.GetWithOffset(8, 2208); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(se_weight__362_ascendc, e_weight__355_ascendc, s_weight__359_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ne_weight__364_ascendc = vecin_buff.GetWithOffset(8, 2496); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(ne_weight__364_ascendc, e_weight__355_ascendc, n_weight__361_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + sw_weight__365_ascendc = vecin_buff.GetWithOffset(8, 4256); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(sw_weight__365_ascendc, 
w_weight__363_ascendc, s_weight__359_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + nw_weight__366_ascendc = vecin_buff.GetWithOffset(8, 2464); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Mul(nw_weight__366_ascendc, w_weight__363_ascendc, n_weight__361_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + for (int dynamic_loop_var_4_368 = 0; dynamic_loop_var_4_368 < res_num_88; dynamic_loop_var_4_368 += 1) { + ub_371_ascendc = vecout_buff.GetWithOffset(1, 160); + AscendC::DataCopy(ub_371_ascendc, nw_weight__366_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + inner_idx_388 = idx__316 + dynamic_loop_var_4_368; + res_389 = h_out * w_out; + res_390 = h_out * w_out; + hw_408 = int(inner_idx_388) % int(res_389); + n_409 = inner_idx_388 / res_390; + w_413 = int(hw_408) % int(w_out); + h_414 = hw_408 / w_out; + ub_375_ascendc = vecout_buff.GetWithOffset(1, 320); + AscendC::DataCopy(ub_375_ascendc, ne_weight__364_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_379_ascendc = vecout_buff.GetWithOffset(1, 352); + AscendC::DataCopy(ub_379_ascendc, sw_weight__365_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_383_ascendc = vecout_buff.GetWithOffset(1, 2112); + AscendC::DataCopy(ub_383_ascendc, se_weight__362_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_387_ascendc = vecout_buff.GetWithOffset(1, 0); + AscendC::DataCopy(ub_387_ascendc, xs_ub_w_int__345_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_394_ascendc = 
vecout_buff.GetWithOffset(1, 32); + AscendC::DataCopy(ub_394_ascendc, xs_ub_e_int__356_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_398_ascendc = vecout_buff.GetWithOffset(1, 64); + AscendC::DataCopy(ub_398_ascendc, ys_ub_n_int__346_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + ub_402_ascendc = vecout_buff.GetWithOffset(1, 96); + AscendC::DataCopy(ub_402_ascendc, ys_ub_s_int__360_ascendc[(dynamic_loop_var_4_368)], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + dst_403 = *(__ubuf__ float *)ub_371; + dst_404 = *(__ubuf__ float *)ub_375; + dst_405 = *(__ubuf__ float *)ub_379; + dst_406 = *(__ubuf__ float *)ub_383; + dst_407 = *(__ubuf__ int32_t *)ub_387; + dst_410 = *(__ubuf__ int32_t *)ub_394; + dst_411 = *(__ubuf__ int32_t *)ub_398; + dst_412 = *(__ubuf__ int32_t *)ub_402; + nw_weight_94 = dst_403; + ne_weight_95 = dst_404; + sw_weight_96 = dst_405; + se_weight_97 = dst_406; + x_w_90 = dst_407; + x_e_91 = dst_410; + y_n_92 = dst_411; + y_s_93 = dst_412; + AscendC::PipeBarrier(); + for (int dynamic_loop_var_5_418 = 0; dynamic_loop_var_5_418 < c_chunk_num_21; dynamic_loop_var_5_418 += 1) { + res_421 = n_409 * in_stride_0_61; + res_422 = y_n_92 * in_stride_1_55; + res_426 = n_409 * in_stride_0_61; + res_427 = y_n_92 * in_stride_1_55; + res_436 = n_409 * in_stride_0_61; + res_437 = y_s_93 * in_stride_1_55; + res_423 = x_w_90 * in_stride_2_26; + res_428 = x_e_91 * in_stride_2_26; + res_438 = x_e_91 * in_stride_2_26; + res_442 = res_421 + res_422; + res_443 = res_426 + res_427; + res_445 = res_436 + res_437; + res_425 = dynamic_loop_var_5_418 * (int32_t)512; + res_430 = dynamic_loop_var_5_418 * (int32_t)512; + res_440 = dynamic_loop_var_5_418 * (int32_t)512; + res_447 = res_442 + res_423; + res_448 = res_443 + res_428; + res_450 = res_445 + res_438; + res_452 = res_447 + res_425; + res_453 = 
res_448 + res_430; + res_455 = res_450 + res_440; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_457_ascendc = vecin_buff.GetWithOffset(512, 12448); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_457_ascendc[n * 512], input_data_ascendc[res_452 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + res_431 = n_409 * in_stride_0_61; + res_432 = y_s_93 * in_stride_1_55; + res_433 = x_w_90 * in_stride_2_26; + res_444 = res_431 + res_432; + res_435 = dynamic_loop_var_5_418 * (int32_t)512; + res_449 = res_444 + res_433; + res_454 = res_449 + res_435; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_415 = n_409 * out_stride_0_62; + res_416 = h_414 * out_stride_1_56; + res_417 = w_413 * out_stride_2_30; + res_441 = res_415 + res_416; + res_420 = dynamic_loop_var_5_418 * (int32_t)512; + res_446 = res_441 + res_417; + res_451 = res_446 + res_420; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_459_ascendc = vecin_buff.GetWithOffset(512, 16544); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_459_ascendc[n * 512], input_data_ascendc[res_453 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_463_ascendc = vecin_buff.GetWithOffset(512, 0); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_463_ascendc[n * 512], input_data_ascendc[res_455 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_461_ascendc = vecin_buff.GetWithOffset(512, 8352); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_461_ascendc[n * 512], input_data_ascendc[res_454 + n * 884736], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_464_ascendc = vecin_buff.GetWithOffset(512, 14496); + Muls(input_ub_464_ascendc, input_ub_nw_457_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::SetFlag(0); + 
AscendC::WaitFlag(0); + ub_465_ascendc = vecin_buff.GetWithOffset(512, 18592); + Muls(ub_465_ascendc, input_ub_ne_459_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + ub_467_ascendc = vecin_buff.GetWithOffset(512, 4288); + Muls(ub_467_ascendc, input_ub_se_463_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + ub_466_ascendc = vecin_buff.GetWithOffset(512, 10400); + Muls(ub_466_ascendc, input_ub_sw_461_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 8, 8}); + AscendC::PipeBarrier(); + input_ub_468_ascendc = vecin_buff.GetWithOffset(512, 0); + Add(input_ub_468_ascendc, input_ub_464_ascendc, ub_465_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_469_ascendc = vecin_buff.GetWithOffset(512, 12480); + Add(input_ub_469_ascendc, input_ub_468_ascendc, ub_466_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::PipeBarrier(); + input_ub_470_ascendc = vecin_buff.GetWithOffset(512, 0); + Add(input_ub_470_ascendc, input_ub_469_ascendc, ub_467_ascendc, AscendC::MASK_PLACEHOLDER, 8, {1, 1, 1, 8, 8, 8}); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_470_ascendc = vecout_buff.GetWithOffset(512, 0); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_451 + n * 512], input_ub_470_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(64), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + for (int dynamic_loop_var_6_473 = 0; dynamic_loop_var_6_473 < c_res_chunk_num_60; dynamic_loop_var_6_473 += 1) { + res_476 = n_409 * in_stride_0_61; + res_477 = y_n_92 * in_stride_1_55; + res_481 = n_409 * in_stride_0_61; + res_482 = y_n_92 * in_stride_1_55; + res_491 = n_409 * in_stride_0_61; + res_492 = y_s_93 * in_stride_1_55; + res_472 = x_e_91 * in_stride_2_26; + res_478 = x_w_90 * in_stride_2_26; + res_483 = x_e_91 * in_stride_2_26; + res_498 = res_476 + res_477; + res_499 = res_481 + res_482; + res_501 
= res_491 + res_492; + res_503 = res_498 + res_478; + res_504 = res_499 + res_483; + res_506 = res_501 + res_472; + res_475 = dynamic_loop_var_6_473 * (int32_t)8; + res_480 = dynamic_loop_var_6_473 * (int32_t)8; + res_485 = dynamic_loop_var_6_473 * (int32_t)8; + res_508 = res_503 + c_res_start_44; + res_509 = res_504 + c_res_start_44; + res_511 = res_506 + c_res_start_44; + res_513 = res_508 + res_480; + res_514 = res_509 + res_485; + res_516 = res_511 + res_475; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_nw_519_ascendc = vecin_buff.GetWithOffset(8, 2432); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_nw_519_ascendc[n * 8], input_data_ascendc[res_513 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + res_486 = n_409 * in_stride_0_61; + res_487 = y_s_93 * in_stride_1_55; + res_488 = x_w_90 * in_stride_2_26; + res_500 = res_486 + res_487; + res_505 = res_500 + res_488; + res_490 = dynamic_loop_var_6_473 * (int32_t)8; + res_510 = res_505 + c_res_start_44; + res_515 = res_510 + res_490; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + res_493 = n_409 * out_stride_0_62; + res_494 = h_414 * out_stride_1_56; + res_495 = w_413 * out_stride_2_30; + res_502 = res_493 + res_494; + res_507 = res_502 + res_495; + res_497 = dynamic_loop_var_6_473 * (int32_t)8; + res_512 = res_507 + c_res_start_44; + res_517 = res_512 + res_497; + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_ne_521_ascendc = vecin_buff.GetWithOffset(8, 2528); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_ne_521_ascendc[n * 8], input_data_ascendc[res_514 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_se_525_ascendc = vecin_buff.GetWithOffset(8, 2112); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_se_525_ascendc[n * 8], input_data_ascendc[res_516 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + input_ub_sw_523_ascendc = 
vecin_buff.GetWithOffset(8, 2336); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(input_ub_sw_523_ascendc[n * 8], input_data_ascendc[res_515 + n * 884736], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_526_ascendc = vecin_buff.GetWithOffset(8, 2400); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(input_ub_526_ascendc, input_ub_nw_519_ascendc, nw_weight_94, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + ub_527_ascendc = vecin_buff.GetWithOffset(8, 2560); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_527_ascendc, input_ub_ne_521_ascendc, ne_weight_95, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + ub_529_ascendc = vecin_buff.GetWithOffset(8, 2272); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_529_ascendc, input_ub_se_525_ascendc, se_weight_97, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + ub_528_ascendc = vecin_buff.GetWithOffset(8, 2368); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Muls(ub_528_ascendc, input_ub_sw_523_ascendc, sw_weight_96, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 8, 8}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_530_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_530_ascendc, input_ub_526_ascendc, ub_527_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); 
+ AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_531_ascendc = vecin_buff.GetWithOffset(8, 2400); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_531_ascendc, input_ub_530_ascendc, ub_528_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::PipeBarrier(); + input_ub_532_ascendc = vecin_buff.GetWithOffset(8, 2112); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(0ULL, (1ULL << (8)) - 1); + Add(input_ub_532_ascendc, input_ub_531_ascendc, ub_529_ascendc, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 0, 0, 0}); + AscendC::SetMaskNorm(); + AscendC::SetVectorMask(18446744073709551615ULL, 18446744073709551615ULL); + AscendC::SetFlag(0); + AscendC::WaitFlag(0); + input_ub_532_ascendc = vecout_buff.GetWithOffset(8, 2112); + for (int n = 0; n < 1; ++n) { + AscendC::DataCopy(output_data_ascendc[res_517 + n * 8], input_ub_532_ascendc[n * 16533504], {(uint16_t)(1), (uint16_t)(1), (uint16_t)(0), (uint16_t)(0)}); + } + AscendC::PipeBarrier(); + } + AscendC::PipeBarrier(); + } + } + } + __aicore__ inline void CopyIn(int32_t index) { + } + __aicore__ inline void Compute() { + } + __aicore__ inline void CopyOut(int32_t index) { + } + + private: + GM_ADDR input; + GM_ADDR grid; + GM_ADDR output; + float h_in; + float w_in; + int32_t h_out; + int32_t w_out; + int32_t n_in; + int32_t c_in; +}; + +extern "C" __global__ __aicore__ void grid_sample(GM_ADDR input, GM_ADDR grid, GM_ADDR out, + GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tilingData, tiling); + KernelGridSample op; + op.Init(input, grid, out, &tilingData); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void grid_sample_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *input, uint8_t *grid, + uint8_t *output, uint8_t *workspace, uint8_t *tiling) { + 
grid_sample<<>>(input, grid, output, workspace, tiling); +} +#endif +// NOLINTEND diff --git a/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py b/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py new file mode 100644 index 0000000000000000000000000000000000000000..6996e3b5003967199814ec51f818e4fa56ceecc8 --- /dev/null +++ b/ops/ascendc/grid_sample/op_kernel/gridsample_nhwc.py @@ -0,0 +1,404 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# pylint: skip-file +import numpy as np +import os +import sys +from swft.core import * +from swft.api import * + +OP_NAME = 'grid_sample' +os.system(f"mkdir -p temp/{OP_NAME}") +os.system(f"mkdir -p temp/{OP_NAME}/input") +os.system(f"mkdir -p temp/{OP_NAME}/output") + +N = 1 +C = 1536 # C should be 8 aligned +H_IN = 24 +W_IN = 24 +H_OUT = 10764 +W_OUT= 1 + +PER_LOOP_PIXEL_NUM = 8 +BLOCK_LOOP_NUM = 8 # 32Bytes / sizeof(type) +CORE_NUM = 8 +ALIGN_CORNERS = False +PADDING_MODE = 1 +CHANNEL_LOOP_CHUNK = 512 # C should be 512 aligned +CHANNEL_LOOP_CHUNK2 = 8 # chunk size for residual +INTERPOLATION_MODE = 0 + +# Numpy Test +# =============================================================================== +def bilinear_interpolate(input_tensor, x, y, H, W): + """双线性插值""" + x1 = int(np.floor(x)) + y1 = int(np.floor(y)) + x2 = min(x1 + 1, W - 1) + y2 = min(y1 + 1, H - 1) + + # 边界检查 + if x1 < 0 or x1 >= W or y1 < 0 or y1 >= H: + return np.zeros(input_tensor.shape[0], dtype=np.float32) + + # 计算权重 + wx = x - x1 + wy = y - y1 + + # 双线性插值 + result = (input_tensor[y1, x1, :] * (1 - wx) * (1 - wy) + + input_tensor[y1, x2, :] * wx * (1 - wy) + + input_tensor[y2, x1, :] * (1 - wx) * wy + + input_tensor[y2, x2, :] * wx * wy) + + return result + +def gen_data(): + np.random.seed(0) + # input_data = np.arange(N * H_IN * W_IN * C).reshape(N, H_IN, W_IN, C).astype(np.float32) + # input_data = np.full([N, H_IN, W_IN, C], 100.0, dtype=np.float32) + # input_data = np.repeat(np.arange(1, N * H_IN * W_IN + 1).reshape(N, H_IN, W_IN, 1), C, axis=3).astype(np.float32) + input_data = np.random.uniform(-1, 1, [N, H_IN, W_IN, C]).astype(np.float32) + grid_data = np.random.uniform(-1, 1, [N, H_OUT, W_OUT, 2]).astype(np.float32) + output_data = np.zeros([N, H_OUT, W_OUT, C]).astype(np.float32) + + for n in range(N): + for h in range(H_OUT): + for w in range(W_OUT): + # 获取归一化坐标 + x_norm = grid_data[n, h, w, 0] + y_norm 
= grid_data[n, h, w, 1] + + # 映射到input坐标空间 + if ALIGN_CORNERS: + x = (x_norm + 1) * (W_IN - 1) / 2 + y = (y_norm + 1) * (H_IN - 1) / 2 + else: + x = (x_norm + 1) * W_IN / 2 - 0.5 + y = (y_norm + 1) * H_IN / 2 - 0.5 + + # 边界处理 + if PADDING_MODE == 0: # zeros + if x < 0 or x >= W_IN or y < 0 or y >= H_IN: + output_data[n, h, w, :] = 0 + continue + elif PADDING_MODE == 1: # border + x = np.clip(x, 0, W_IN - 1) + y = np.clip(y, 0, H_IN - 1) + + # 插值采样 + if INTERPOLATION_MODE == 0: # bilinear + output_data[n, h, w, :] = bilinear_interpolate( + input_data[n], x, y, H_IN, W_IN) + elif INTERPOLATION_MODE == 1: # nearest + x_idx = int(x) + y_idx = int(y) + x_idx = np.clip(x_idx, 0, W_IN - 1) + y_idx = np.clip(y_idx, 0, H_IN - 1) + output_data[n, h, w, :] = input_data[n, y_idx, x_idx, :] + + h_in = np.array([H_IN], dtype=np.float32) + w_in = np.array([W_IN], dtype=np.float32) + h_out = np.array([H_OUT], dtype=np.int32) + w_out = np.array([W_OUT], dtype=np.int32) + n_in = np.array([N], dtype=np.int32) + c_in = np.array([C], dtype=np.int32) + + # 保存测试数据 + h_in.tofile(f"./temp/{OP_NAME}/input/h_in.bin") + w_in.tofile(f"./temp/{OP_NAME}/input/w_in.bin") + h_out.tofile(f"./temp/{OP_NAME}/input/h_out.bin") + w_out.tofile(f"./temp/{OP_NAME}/input/w_out.bin") + n_in.tofile(f"./temp/{OP_NAME}/input/n_in.bin") + c_in.tofile(f"./temp/{OP_NAME}/input/c_in.bin") + input_data.tofile(f"./temp/{OP_NAME}/input/input_data.bin") + grid_data.tofile(f"./temp/{OP_NAME}/input/grid_data.bin") + output_data.tofile(f"./temp/{OP_NAME}/output/output_golden.bin") + +# OP Impl +# =============================================================================== + + +@sub_kernel(core_num=CORE_NUM) +def grid_sample(input_data, grid_data, output_data, h_in, w_in, h_out, w_out, n_in, c_in): + block_idx = get_block_idx() + + # calc number of output this rank need to compute + total_outputs = n_in * h_out * w_out + # 将total_outputs按照 Block_Loop_NUM 分组 + block_num = total_outputs // BLOCK_LOOP_NUM + # 多余的 outputs 
+ output_res = total_outputs % BLOCK_LOOP_NUM + percore_blocks = Scalar("INT32", 0) + percore_blocks = (block_num + CORE_NUM - 1) // CORE_NUM + pixel_num = Scalar("INT32", 0) + if (block_idx + 1) * percore_blocks < block_num: + pixel_num = percore_blocks * BLOCK_LOOP_NUM + elif block_idx * percore_blocks < block_num: + pixel_num = (block_num - block_idx * percore_blocks) * BLOCK_LOOP_NUM + output_res + else: + pixel_num = Scalar("INT32", 0) + start_idx = block_idx * percore_blocks * BLOCK_LOOP_NUM + if (block_idx == 0) and pixel_num < 1: + pixel_num = total_outputs + + max_pixel_num = PER_LOOP_PIXEL_NUM # 32Byte aligned + tile_num = pixel_num // max_pixel_num + res_num = pixel_num % max_pixel_num + + c_chunk_num = c_in // CHANNEL_LOOP_CHUNK + c_res_num = c_in % CHANNEL_LOOP_CHUNK + + # C should be 8 aligned + c_res_chunk_num = (c_res_num + CHANNEL_LOOP_CHUNK2 - 1) // CHANNEL_LOOP_CHUNK2 + c_res_start = c_chunk_num * CHANNEL_LOOP_CHUNK + + in_stride_2 = Scalar("INT32", 0) + in_stride_2 = c_in + in_stride_1 = w_in.astype("INT32") * in_stride_2 + in_stride_0 = h_in.astype("INT32") * in_stride_1 + + out_stride_2 = Scalar("INT32", 0) + out_stride_2 = c_in + out_stride_1 = w_out * out_stride_2 + out_stride_0 = h_out * out_stride_1 + + # consts + const1 = w_in * 0.5 + const2 = h_in * 0.5 + const3 = const1 - 0.5 + const4 = const2 - 0.5 + const5 = w_in - 1 + const6 = h_in - 1 + const7 = const5.astype("INT32") + const8 = const6.astype("INT32") + for i in dynamic_loop(tile_num): + idx = start_idx + i * max_pixel_num + xys_ub = slice_to_ub(grid_data, [idx, 0], [max_pixel_num, 2]) + xys_ub_trans = transpose(xys_ub, [1, 0]) + xs_ub = slice_to_ub(xys_ub_trans, [0, 0], [1, max_pixel_num]) + ys_ub = slice_to_ub(xys_ub_trans, [1, 0], [1, max_pixel_num]) + + # align corners is false + xs_ub = vmuls(xs_ub, const1) + ys_ub = vmuls(ys_ub, const2) + xs_ub = vadds(xs_ub, const3) + ys_ub = vadds(ys_ub, const4) + + # border clip + xs_ub = vmaxs(xs_ub, Scalar("FP32", 0)) + ys_ub = 
vmaxs(ys_ub, Scalar("FP32", 0)) + xs_ub = vmins(xs_ub, const5) + ys_ub = vmins(ys_ub, const6) + + # bilinear interpolation + xs_ub_w_int = vconv(xs_ub, "INT32", "z") + xs_ub_w = vconv(xs_ub_w_int, "FP32") + xs_ub_e_int = vadds(xs_ub_w_int, Scalar("INT32", 1)) + xs_ub_e_int = vmins(xs_ub_e_int, const7) + xs_ub_e = vadds(xs_ub_w, Scalar("FP32", 1)) + + ys_ub_n_int = vconv(ys_ub, "INT32", "z") + ys_ub_n = vconv(ys_ub_n_int, "FP32") + ys_ub_s_int = vadds(ys_ub_n_int, Scalar("INT32", 1)) + ys_ub_s_int = vmins(ys_ub_s_int, const8) + ys_ub_s = vadds(ys_ub_n, Scalar("FP32", 1)) + + w_weights = vsub(xs_ub_e, xs_ub) + e_weights = vsub(xs_ub, xs_ub_w) + n_weights = vsub(ys_ub_s, ys_ub) + s_weights = vsub(ys_ub, ys_ub_n) + + nw_weights = vmul(w_weights, n_weights) + ne_weights = vmul(e_weights, n_weights) + sw_weights = vmul(w_weights, s_weights) + se_weights = vmul(e_weights, s_weights) + + # n/e may be out of range, but weight will be zero + for j in dynamic_loop(max_pixel_num): + inner_idx = (idx + j) + hw = inner_idx % (h_out * w_out) + w = hw % w_out + h = hw // w_out + n = inner_idx // (h_out * w_out) + + x_w = move_to_scalar(xs_ub_w_int[0, j]) + x_e = move_to_scalar(xs_ub_e_int[0, j]) + y_n = move_to_scalar(ys_ub_n_int[0, j]) + y_s = move_to_scalar(ys_ub_s_int[0, j]) + + nw_weight = move_to_scalar(nw_weights[0, j]) + ne_weight = move_to_scalar(ne_weights[0, j]) + sw_weight = move_to_scalar(sw_weights[0, j]) + se_weight = move_to_scalar(se_weights[0, j]) + + + for idx_c in dynamic_loop(c_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_se = 
slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + for idx_c in dynamic_loop(c_res_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_se = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + # for residual + if res_num > 0: + idx_ = start_idx + pixel_num - res_num + xys_ub_ = slice_to_ub(grid_data, [idx_, 0], [max_pixel_num, 2]) + xys_ub_trans_ = transpose(xys_ub_, [1, 0]) + xs_ub_ = slice_to_ub(xys_ub_trans_, [0, 0], [1, max_pixel_num]) + ys_ub_ = slice_to_ub(xys_ub_trans_, [1, 0], [1, max_pixel_num]) + + # align corners is false + xs_ub_ = vmuls(xs_ub_, 
const1) + ys_ub_ = vmuls(ys_ub_, const2) + xs_ub_ = vadds(xs_ub_, const3) + ys_ub_ = vadds(ys_ub_, const4) + + # border clip + xs_ub_ = vmaxs(xs_ub_, Scalar("FP32", 0)) + ys_ub_ = vmaxs(ys_ub_, Scalar("FP32", 0)) + xs_ub_ = vmins(xs_ub_, const5) + ys_ub_ = vmins(ys_ub_, const6) + + # bilinear interpolation + xs_ub_w_int_ = vconv(xs_ub_, "INT32", "z") + xs_ub_w_ = vconv(xs_ub_w_int_, "FP32") + xs_ub_e_int_ = vadds(xs_ub_w_int_, Scalar("INT32", 1)) + xs_ub_e_int_ = vmins(xs_ub_e_int_, const7) + xs_ub_e_ = vadds(xs_ub_w_, Scalar("FP32", 1)) + + ys_ub_n_int_ = vconv(ys_ub_, "INT32", "z") + ys_ub_n_ = vconv(ys_ub_n_int_, "FP32") + ys_ub_s_int_ = vadds(ys_ub_n_int_, Scalar("INT32", 1)) + ys_ub_s_int_ = vmins(ys_ub_s_int_, const8) + ys_ub_s_ = vadds(ys_ub_n_, Scalar("FP32", 1)) + + w_weight_ = vsub(xs_ub_e_, xs_ub_) + e_weight_ = vsub(xs_ub_, xs_ub_w_) + n_weight_ = vsub(ys_ub_s_, ys_ub_) + s_weight_ = vsub(ys_ub_, ys_ub_n_) + + nw_weight_ = vmul(w_weight_, n_weight_) + ne_weight_ = vmul(e_weight_, n_weight_) + sw_weight_ = vmul(w_weight_, s_weight_) + se_weight_ = vmul(e_weight_, s_weight_) + + # n/e may be out of range, but weight will be zero + + for j in dynamic_loop(res_num): + inner_idx = (idx_ + j) + hw = inner_idx % (h_out * w_out) + w = hw % w_out + h = hw // w_out + n = inner_idx // (h_out * w_out) + + x_w = move_to_scalar(xs_ub_w_int_[0, j]) + x_e = move_to_scalar(xs_ub_e_int_[0, j]) + y_n = move_to_scalar(ys_ub_n_int_[0, j]) + y_s = move_to_scalar(ys_ub_s_int_[0, j]) + + nw_weight = move_to_scalar(nw_weight_[0, j]) + ne_weight = move_to_scalar(ne_weight_[0, j]) + sw_weight = move_to_scalar(sw_weight_[0, j]) + se_weight = move_to_scalar(se_weight_[0, j]) + + for idx_c in dynamic_loop(c_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + idx_c * 
CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + input_ub_se = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + idx_c * CHANNEL_LOOP_CHUNK], + [CHANNEL_LOOP_CHUNK]) + + for idx_c in dynamic_loop(c_res_chunk_num): + input_ub_nw = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_ne = slice_to_ub(input_data, [n * in_stride_0 + y_n * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_sw = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_w * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + input_ub_se = slice_to_ub(input_data, [n * in_stride_0 + y_s * in_stride_1 + x_e * in_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + input_ub = vmuls(input_ub_nw, nw_weight) + input_ub = vadd(input_ub, vmuls(input_ub_ne, ne_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_sw, sw_weight)) + input_ub = vadd(input_ub, vmuls(input_ub_se, se_weight)) + + insert_to_gm(output_data, input_ub, [n * out_stride_0 + h * out_stride_1 + w * out_stride_2 + c_res_start + idx_c * CHANNEL_LOOP_CHUNK2], + [CHANNEL_LOOP_CHUNK2]) + + +if __name__ == "__main__": + set_context("310P") + gen_data() + input_data = Tensor("GM", "FP32", [N * H_IN * W_IN * C], "ND", False) + grid_data = Tensor("GM", 
"FP32", [N * H_OUT * W_OUT, 2], "ND", False) + output_data = Tensor("GM", "FP32", [N * H_OUT * W_OUT * C], "ND", False) + h_in = Scalar("FP32") + w_in = Scalar("FP32") + h_out = Scalar("INT32") + w_out = Scalar("INT32") + n_in = Scalar("INT32") + c_in = Scalar("INT32") + compile_func(grid_sample, globals())(input_data, grid_data, output_data, h_in, w_in, h_out, w_out, n_in, c_in) + # compile_func(grid_sample_floor, globals())(input_data, grid_data, output_data, h_in, w_in, h_out, w_out, n_in, c_in) + compile_kernel(f"./temp/{OP_NAME}/{OP_NAME}.cce", OP_NAME) + exec_kernel(OP_NAME, locals(), prefix_path="temp", inputs=[ + 'input_data', 'grid_data', 'h_in', 'w_in', 'h_out', 'w_out', 'n_in', 'c_in'], + outputs=['output_data'], device_id=1) + script_dir = os.path.dirname(os.path.abspath(__file__)) + return_code_1 = os.system( + f'python3 {script_dir}/../verify_result.py ./temp/{OP_NAME}/output/output_data_actual.bin ./temp/{OP_NAME}/output/output_golden.bin float32 4e-2 1e-2 4e-3') + sys.exit(return_code_1 >> 8) diff --git a/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc b/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc index 550281daeaf7f52660c567920e6c7ac9ea2fa714..f30de74da83a7e81ee2c9839eb4345bfdf6bac6d 100644 --- a/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc +++ b/ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.cc @@ -17,11 +17,11 @@ #include "ops/framework/ms_kernels_internal/pyboost/internal_pyboost_runner.h" namespace { - constexpr size_t kMemAlignSize{512}; - constexpr size_t kAlignBytes{32}; - size_t GetAlignedSize(size_t size) { - return (size + kMemAlignSize + kAlignBytes -1) / kMemAlignSize * kMemAlignSize; - } +constexpr size_t kMemAlignSize{512}; +constexpr size_t kAlignBytes{32}; +size_t GetAlignedSize(size_t size) { + return (size + kMemAlignSize + kAlignBytes -1) / kMemAlignSize * kMemAlignSize; +} } namespace ms_custom_ops { @@ -68,6 +68,7 @@ void 
InternalPyboostRunner::GetOrCreateKernel(const TensorList &inputs, const Te size_t InternalPyboostRunner::CalcWorkspace() { MS_EXCEPTION_IF_NULL(internal_op_); auto workspace_size_list = internal_op_->GetWorkspaceSize(); + // all workspace will be aligned, like graph_mode return std::accumulate(workspace_size_list.begin(), workspace_size_list.end(), 0, [](size_t acc, size_t size){ return acc + GetAlignedSize(size); diff --git a/tests/st/test_custom_grid_sample.py b/tests/st/test_custom_grid_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..a2d36d893c8b47b0c1168216b406f5880b9a5126 --- /dev/null +++ b/tests/st/test_custom_grid_sample.py @@ -0,0 +1,236 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ============================================================================
"""Tests for ms_custom_ops.grid_sample (NHWC layout, Atlas inference series)."""

import logging
import numpy as np
import pytest

import mindspore as ms
from mindspore import Tensor, nn, context, Profiler
from mindspore.profiler import ProfilerLevel, ProfilerActivity, AicoreMetrics
import ms_custom_ops

def bilinear_interpolate(input_tensor, x, y, h, w):
    """Bilinearly interpolate one pixel vector from an (h, w, c) image.

    Args:
        input_tensor: numpy array of shape (h, w, c).
        x, y: continuous sample coordinates in input pixel space.
        h, w: spatial extents of ``input_tensor``.

    Returns:
        A length-c float32 vector of interpolated channel values, or a zero
        vector when the top-left neighbour falls outside the image.
    """
    x1 = int(np.floor(x))
    y1 = int(np.floor(y))
    # Clamp the bottom/right neighbours so the 2x2 patch stays in bounds.
    x2 = min(x1 + 1, w - 1)
    y2 = min(y1 + 1, h - 1)

    # Out-of-range guard. Only reachable with zeros padding; border padding
    # clips coordinates before this helper is called.
    if x1 < 0 or x1 >= w or y1 < 0 or y1 >= h:
        # Fixed: return a channel-sized vector; shape[0] was the height h.
        return np.zeros(input_tensor.shape[-1], dtype=np.float32)

    # Interpolation weights (fractional parts of the coordinates).
    wx = x - x1
    wy = y - y1

    # Weighted sum of the four neighbouring pixels.
    result = (input_tensor[y1, x1, :] * (1 - wx) * (1 - wy) +
              input_tensor[y1, x2, :] * wx * (1 - wy) +
              input_tensor[y2, x1, :] * (1 - wx) * wy +
              input_tensor[y2, x2, :] * wx * wy)
    return result


def golden_grid_sample(input_data, grid_data, align_corners, padding_mode, interpolation_mode):
    """NumPy reference implementation of NHWC grid_sample.

    Args:
        input_data: (n, h_in, w_in, c) float array.
        grid_data: (n, h_out, w_out, 2) array of normalized (x, y) in [-1, 1].
        align_corners: bool, coordinate mapping convention.
        padding_mode: 0 = zeros, 1 = border.
        interpolation_mode: 0 = bilinear, 1 = nearest.

    Returns:
        (n, h_out, w_out, c) float32 array of sampled values.
    """
    batch, h_in, w_in, c = input_data.shape
    _, h_out, w_out, _ = grid_data.shape
    output_data = np.zeros([batch, h_out, w_out, c]).astype(np.float32)

    # Renamed loop indices (was `for n in range(n)`) to avoid shadowing.
    for ni in range(batch):
        for oh in range(h_out):
            for ow in range(w_out):
                # Normalized grid coordinates for this output pixel.
                x_norm = grid_data[ni, oh, ow, 0]
                y_norm = grid_data[ni, oh, ow, 1]

                # Map from [-1, 1] to input pixel space.
                if align_corners:
                    x = (x_norm + 1) * (w_in - 1) / 2
                    y = (y_norm + 1) * (h_in - 1) / 2
                else:
                    x = (x_norm + 1) * w_in / 2 - 0.5
                    y = (y_norm + 1) * h_in / 2 - 0.5

                # Boundary handling.
                if padding_mode == 0:  # zeros
                    if x < 0 or x >= w_in or y < 0 or y >= h_in:
                        output_data[ni, oh, ow, :] = 0
                        continue
                elif padding_mode == 1:  # border
                    x = np.clip(x, 0, w_in - 1)
                    y = np.clip(y, 0, h_in - 1)

                # Interpolated sampling.
                if interpolation_mode == 0:  # bilinear
                    output_data[ni, oh, ow, :] = bilinear_interpolate(
                        input_data[ni], x, y, h_in, w_in)
                elif interpolation_mode == 1:  # nearest
                    x_idx = int(x)
                    y_idx = int(y)
                    x_idx = np.clip(x_idx, 0, w_in - 1)
                    y_idx = np.clip(y_idx, 0, h_in - 1)
                    output_data[ni, oh, ow, :] = input_data[ni, y_idx, x_idx, :]
    return output_data


class GridSampleNet(nn.Cell):
    """Thin Cell wrapper that forwards to ms_custom_ops.grid_sample."""

    def construct(self, input_data, grid, interpolation_mode, padding_mode, align_corners):
        return ms_custom_ops.grid_sample(input_data, grid, interpolation_mode, padding_mode, align_corners)


def run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                    n, c, h_in, w_in, h_out, w_out, is_profiler=False):
    """Run the custom op on random data and compare with the NumPy golden.

    When ``is_profiler`` is True, profile 50 runs instead of checking results.
    """
    np_input = np.random.random((n, h_in, w_in, c)).astype(input_dtype)
    np_grid = np.random.uniform(-1, 1, (n, h_out, w_out, 2)).astype(grid_dtype)
    input_data = Tensor(np_input)
    grid = Tensor(np_grid)
    np_output = golden_grid_sample(np_input, np_grid, align_corners, padding_mode, interpolation_mode)
    if is_profiler is False:
        output_data = net(input_data, grid, interpolation_mode, padding_mode, align_corners)
        np.testing.assert_allclose(np_output, output_data.asnumpy(), rtol=1e-4, atol=1e-4, err_msg=" grid_sample ")
    else:
        profiler = Profiler(profiler_level=ProfilerLevel.Level2,
                            activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],
                            aic_metrics=AicoreMetrics.AiCoreNone)
        for _ in range(50):
            # Result is intentionally discarded; only timing is collected.
            net(input_data, grid, interpolation_mode, padding_mode, align_corners)
        profiler.analyse()


@pytest.mark.level0
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [0])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("c", [1536, 8])
@pytest.mark.parametrize("h_in,w_in", [(24, 24)])
@pytest.mark.parametrize("h_out,w_out", [(1024, 1)])
def test_grid_sample(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                     n, c, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test correctness of grid_sample operator.
    Expectation: should pass for all testcases.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"})
    net = GridSampleNet()
    run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode,
                    interpolation_mode, n, c, h_in, w_in, h_out, w_out)


@pytest.mark.level1
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [0])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("c", [17])
@pytest.mark.parametrize("h_in,w_in", [(24, 24)])
@pytest.mark.parametrize("h_out,w_out", [(1024, 1)])
def test_grid_sample_align_c(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                             n, c, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test align c.
    Expectation: Unsupported c correctly rejected.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"})
    net = GridSampleNet()
    with pytest.raises(ValueError, match="c should be aligned with"):
        run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                        n, c, h_in, w_in, h_out, w_out)
    logging.info(
        "Unsupported c correctly rejected: c=%s", c
    )


@pytest.mark.level1
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [1])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("c", [16])
@pytest.mark.parametrize("h_in,w_in", [(2, 2)])
@pytest.mark.parametrize("h_out,w_out", [(2, 2)])
def test_grid_sample_unsupported_mode(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode,
                                      interpolation_mode, n, c, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test unsupported mode.
    Expectation: Unsupported mode correctly rejected.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0"})
    net = GridSampleNet()
    with pytest.raises(ValueError, match="mode only supports"):
        run_grid_sample(net, input_dtype, grid_dtype, align_corners, padding_mode, interpolation_mode,
                        n, c, h_in, w_in, h_out, w_out)
    logging.info("Unsupported mode correctly rejected")


@pytest.mark.level1
@pytest.mark.env_onecard
@pytest.mark.platform_ascend310p
@pytest.mark.parametrize("exec_mode", [context.GRAPH_MODE, context.PYNATIVE_MODE])
@pytest.mark.parametrize("input_dtype", [np.float32])
@pytest.mark.parametrize("grid_dtype", [np.float32])
@pytest.mark.parametrize("align_corners", [False])
@pytest.mark.parametrize("padding_mode", [1])
@pytest.mark.parametrize("interpolation_mode", [0])
@pytest.mark.parametrize("n", [1])
@pytest.mark.parametrize("h_in,w_in", [(24, 24)])
@pytest.mark.parametrize("h_out,w_out", [(1024, 1)])
def test_grid_sample_3d_input(exec_mode, input_dtype, grid_dtype, align_corners, padding_mode,
                              interpolation_mode, n, h_in, w_in, h_out, w_out):
    """
    Feature: test grid_sample operator.
    Description: test 3d input.
    Expectation: 3d input correctly rejected.
    """
    ms.set_context(device_target="Ascend", mode=exec_mode)
    ms.set_context(jit_config={"jit_level": "O0", "infer_boost": "on"})
    net = GridSampleNet()
    with pytest.raises(RuntimeError, match="dim of inputs should"):
        # Deliberately rank-3 input (missing the channel axis).
        np_input = np.random.random((n, h_in, w_in)).astype(input_dtype)
        np_grid = np.random.uniform(-1, 1, (n, h_out, w_out, 2)).astype(grid_dtype)
        input_data = Tensor(np_input)
        grid = Tensor(np_grid)
        output_data = net(input_data, grid, interpolation_mode, padding_mode, align_corners)
        # Force sync so the kernel-side shape check actually fires.
        _ = output_data.asnumpy()
    logging.info("3d input correctly rejected")