diff --git a/akg-mlir/python/akg_mlir/exec_tools/py_benchmark.py b/akg-mlir/python/akg_mlir/exec_tools/py_benchmark.py index 265c5da34039c737b57ac1b6c4a699e57fb8175d..6a9034bd818708773e2a615394ecaced12d87b57 100644 --- a/akg-mlir/python/akg_mlir/exec_tools/py_benchmark.py +++ b/akg-mlir/python/akg_mlir/exec_tools/py_benchmark.py @@ -184,20 +184,22 @@ def _transform_data_to_ctypes_ascend(data, data_shape = np.array(device_shape[data_idx]) data_bytes = d.nbytes is_numpy_bf16 = False + is_numpy_output = False if isinstance(d, int): data_ctypes.append(ctypes.c_int(d)) elif isinstance(d, np.ndarray): + if data_idx in output_idx_set: + is_numpy_output = True if d.dtype.name == "bfloat16": d = d.astype(np.float32) data[data_idx] = d is_numpy_bf16 = True ascend_tensor_obj = akgAscendLaunch.AscendTensorObjStructPyTorch() - is_output = data_idx in output_idx_set ascend_tensor_obj.tensor_info = d ascend_tensor_obj.shape_info = data_shape ascend_tensor_obj.nbytes = data_bytes - ascend_tensor_obj.is_output = is_output + ascend_tensor_obj.is_output = is_numpy_output ascend_tensor_obj.is_bf16 = is_numpy_bf16 data_ctypes.append(ascend_tensor_obj) diff --git a/akg-mlir/python/kernel.py b/akg-mlir/python/kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..80a6fe7df08adbbfdab4b48dd694a5fdf5972cb3 --- /dev/null +++ b/akg-mlir/python/kernel.py @@ -0,0 +1,158 @@ +# Copyright 2023-2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Module for akg support ascend_npu_ir test """ +import os +import re +import ctypes +import subprocess +import numpy as np + +from akg import akgAscendLaunch +from akg.message import get_npucompiler_path +from akg.utils.dynamic_utils import get_device_shape + +def _transform_data_to_ctypes_ascend(data, + kernel_name, + output_indexes, + is_dyn_shape=False, + backend="ascend", + is_profile_params=False, + ): + """ transform tensor input data to ctypes for ascend """ + data_ctypes = [] + if len(data) == 0: + # dynamic shape info cannot generate inputs while compilation + return data_ctypes + + device_shape, _, _ = get_device_shape( + data, kernel_name, is_dyn_shape and not is_profile_params) + + output_idx_set = [] + for output_idx in output_indexes: + if output_idx >= 0: + output_idx_set.append(output_idx) + else: + output_idx_set.append(output_idx + len(data)) + output_idx_set = set(output_idx_set) + for data_idx, d in enumerate(data): + data_shape = np.array(device_shape[data_idx]) + data_bytes = d.nbytes + is_numpy_bf16 = False + is_numpy_output = False + if isinstance(d, int): + data_ctypes.append(ctypes.c_int(d)) + elif isinstance(d, np.ndarray): + if data_idx in output_idx_set: + is_numpy_output = True + if d.dtype.name == "bfloat16": + d = d.astype(np.float32) + data[data_idx] = d + is_numpy_bf16 = True + + ascend_tensor_obj = akgAscendLaunch.AscendTensorObjStructPyTorch() + ascend_tensor_obj.tensor_info = d + ascend_tensor_obj.shape_info = data_shape + ascend_tensor_obj.nbytes = data_bytes + ascend_tensor_obj.is_output = is_numpy_output + ascend_tensor_obj.is_bf16 = is_numpy_bf16 + data_ctypes.append(ascend_tensor_obj) + + return data_ctypes + +class Kernel: + """ Kernel for support ascend_npu_ir """ + def __init__(self, kernel_meta=None): + self.kernel_name = kernel_meta.get('kernel_name') + self.dynamic = kernel_meta.get('dynamic') + self.device_id = kernel_meta.get('device_index') + self.base_dir = os.path.dirname(os.path.abspath(__file__)) + self.output_so_dir = os.path.join(self.base_dir, "data/") + backend = kernel_meta.get('backend') + self.backend = backend if backend is not None else "ascend" + num_outputs = kernel_meta.get('num_outputs') + self.output_indexes = self._get_output_index(num_outputs) + + def _get_output_index(self, num_outputs: int): + return [-i for i in range(1, num_outputs + 1)] + + def compile(self, input_mlir: str): + """ Compile .mlir file to .so file. 
""" + mlir_file_name = f"{self.kernel_name}_out.mlir" + mlir_file_path = os.path.join(self.base_dir, mlir_file_name) + os.makedirs(self.output_so_dir, exist_ok=True) + output_so_path = os.path.join(self.output_so_dir, f"{self.kernel_name}.so") + if not self.dynamic: + pattern = r'(\{[^{}]*\{[^{}]*)<[^<>]*>' + replacement = r'\1' + input_mlir = re.sub(pattern, replacement, input_mlir, count=1) + + try: + with open(mlir_file_path, "w", encoding="utf-8") as f: + f.write(input_mlir) + + bishengir_compile_path = get_npucompiler_path() + compile_cmd = [ + bishengir_compile_path, + mlir_file_path, + "-enable-hfusion-compile=true", + "-enable-hivm-compile=true", + "-enable-bin-relocation=false", + "-block-dim=40", + "-enable-auto-multi-buffer=true", + "-o", + output_so_path, + ] + print(f"exec command: {compile_cmd}") + result = subprocess.run( + compile_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ) + print(f"compile finish, lib.so save to {os.path.abspath(output_so_path)}") + return (result.stdout, result.stderr) + except Exception as compile_err: + raise Exception(f"compile MLIR failed, error message: {str(compile_err)}") from compile_err + finally: + if os.path.exists(mlir_file_path): + os.remove(mlir_file_path) + + def run(self, *args, **kwargs): + """ launch .so file by akg_ascend_backend """ + so_file_path = os.path.join(self.output_so_dir, f"lib{self.kernel_name}.so") + if not os.path.exists(so_file_path): + raise FileNotFoundError(f"can not find lib{self.kernel_name}.so in path: {so_file_path}") + n = len(args) + try: + input_for_mod_ctypes = _transform_data_to_ctypes_ascend( + args[:n-1], + self.kernel_name, + self.output_indexes, + self.dynamic, + self.backend + ) + + akgAscendLaunch.akg_ascend_run( + self.output_so_dir, + self.kernel_name, + self.device_id, + self.dynamic, + *input_for_mod_ctypes + ) + print(f"success launch kernel: {self.kernel_name}") + return None + except Exception as running_err: + raise Exception(f"exec {self.kernel_name}.so error, error msg: {str(running_err)}") from running_err + \ No newline at end of file