diff --git a/test/test_network_ops/test_abs.py b/test/test_network_ops/test_abs.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf44ee4bc0cda0ca2958634cd735651558f60b5d
--- /dev/null
+++ b/test/test_network_ops/test_abs.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAbs(TestCase):
+    """Checks torch.abs on the npu_input tensors against the cpu_input reference."""
+
+    def cpu_op_exec(self, input1):
+        """Return torch.abs(input1) as a numpy array."""
+        return torch.abs(input1).numpy()
+
+    def npu_op_exec(self, input1):
+        """Return torch.abs(input1) as a numpy array after moving it to "cpu"."""
+        return torch.abs(input1).to("cpu").numpy()
+
+    def test_abs_shape_format_fp16(self, device):
+        """float16 cases: the reference is computed in float32, then cast back."""
+        # NOTE(review): 0 and 3 look like storage-format ids understood by create_common_tensor - confirm.
+        formats = [0, 3]
+        shapes = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        # One [dtype, format, shape] descriptor per (format, shape) pair.
+        cases = [[np.float16, fmt, shape] for fmt in formats for shape in shapes]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, -10, 10)
+            expected = self.cpu_op_exec(cpu_input.to(torch.float32))
+            actual = self.npu_op_exec(npu_input)
+            # Bring the float32 reference back to the dtype under test.
+            expected = expected.astype(np.float16)
+            self.assertRtolEqual(expected, actual)
+
+    def test_abs_shape_format_fp32(self, device):
+        """float32 cases: both results are compared without any dtype conversion."""
+        formats = [0, 3]
+        shapes = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        # One [dtype, format, shape] descriptor per (format, shape) pair.
+        cases = [[np.float32, fmt, shape] for fmt in formats for shape in shapes]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, -10, 10)
+            expected = self.cpu_op_exec(cpu_input)
+            actual = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(expected, actual)
+
+instantiate_device_type_tests(TestAbs, globals(), except_for="cpu")  # NOTE(review): presumably generates the device-specific test classes; "cpu" is excluded
+if __name__ == "__main__":
+    run_tests()  # executed only when this file is run as a script
diff --git a/test/test_network_ops/test_acos.py b/test/test_network_ops/test_acos.py
new file mode 100644
index 0000000000000000000000000000000000000000..36e9ffa8931ca0e8487451c78b1014ad0b3b7748
--- /dev/null
+++ b/test/test_network_ops/test_acos.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAcos(TestCase):
+    """Checks torch.acos on the npu_input tensors against the cpu_input reference."""
+
+    def cpu_op_exec(self, input_para):
+        """Return torch.acos(input_para) as a numpy array."""
+        return torch.acos(input_para).numpy()
+
+    def npu_op_exec(self, input_para):
+        """Return torch.acos(input_para) as a numpy array after moving it to "cpu"."""
+        return torch.acos(input_para).to("cpu").numpy()
+
+    def test_acos_common_shape_format(self, device):
+        """float32 cases; create_common_tensor gets -1 and 1 (presumably the value range)."""
+        cases = [
+            [[np.float32, -1, 1]],
+            [[np.float32, -1, (64, 10)]],
+            [[np.float32, -1, (32, 1, 3)]],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case[0], -1, 1)
+            expected = self.cpu_op_exec(cpu_input)
+            actual = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(expected, actual)
+
+    def test_acos_float16_shape_format(self, device):
+        """float16 cases: the reference is computed in float32, then cast back."""
+
+        def cpu_op_exec_fp16(input_para):
+            # Compute acos in float32 and hand back a float16 numpy array.
+            reference = torch.acos(input_para.to(torch.float32)).numpy()
+            return reference.astype(np.float16)
+
+        cases = [
+            [[np.float16, -1, 1]],
+            [[np.float16, -1, (64, 10)]],
+            [[np.float16, -1, (31, 1, 3)]],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case[0], -1, 1)
+            expected = cpu_op_exec_fp16(cpu_input)
+            actual = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(expected, actual)
+
+instantiate_device_type_tests(TestAcos, globals(), except_for="cpu")  # NOTE(review): presumably generates the device-specific test classes; "cpu" is excluded
+if __name__ == "__main__":
+    run_tests()  # executed only when this file is run as a script
+        
diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..d39d76da23a99f0208628e924d1e630eb62beb21
--- /dev/null
+++ b/test/test_network_ops/test_adaptive_avg_pool1d.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import torch.nn as nn
+import numpy as np
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAdaptiveAvgPool1d(TestCase):
+    """Checks nn.AdaptiveAvgPool1d on the npu_input tensors against the CPU result."""
+
+    def cpu_op_exec(self, input1, output_size):
+        """Return nn.AdaptiveAvgPool1d(output_size) applied to input1."""
+        return nn.AdaptiveAvgPool1d(output_size)(input1)
+
+    def npu_op_exec(self, input1, output_size):
+        """Apply the module after .npu() and return the result moved to the CPU."""
+        return nn.AdaptiveAvgPool1d(output_size).npu()(input1).cpu()
+
+    def test_AdaptiveAvgPool1d_shape_format_fp16(self, device):
+        """float16 cases: the reference is pooled in float32 and converted with .half()."""
+        cases = [
+            [np.float16, 0, (64, 10, 16)],
+            [np.float16, -1, (256, 2048, 8)],
+            [np.float16, 3, (32, 16, 16)],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, 1, 10)
+            for output_size in (4, 3):
+                expected = self.cpu_op_exec(cpu_input.float(), output_size).half()
+                actual = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(expected, actual, prec16=0.002)
+
+    def test_AdaptiveAvgPool1d_shape_format_fp32(self, device):
+        """float32 cases, compared with 0.001 as the third assertRtolEqual argument."""
+        cases = [
+            [np.float32, 0, (64, 10, 16)],
+            [np.float32, -1, (256, 2048, 8)],
+            [np.float32, 3, (32, 16, 16)],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, 1, 10)
+            for output_size in (4, 3, 1):
+                expected = self.cpu_op_exec(cpu_input, output_size)
+                actual = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(expected, actual, 0.001)
+
+instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu")  # NOTE(review): presumably generates the device-specific test classes; "cpu" is excluded
+if __name__ == "__main__":
+    run_tests()  # executed only when this file is run as a script
diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f60db63eaa31789211d6214053428f6566ba607
--- /dev/null
+++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import numpy as np
+from torch.nn import functional as F
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAdaptiveAvgPool2dBackward(TestCase):
+    """Compares AdaptiveAvgPool2d output and input gradient between cpu and npu tensors."""
+
+    def cpu_op_exec(self, input_x, input_grad):
+        """Pool input_x, backpropagate the output, return (output, input_x.grad)."""
+        input_x.requires_grad_(True)
+        pool = torch.nn.AdaptiveAvgPool2d(input_grad)  # input_grad is used as the output size
+        is_half = input_x.dtype == torch.half  # half inputs are pooled in float32
+        pooled = pool(input_x.float()).half() if is_half else pool(input_x)
+        pooled.backward(pooled)  # the output itself is passed as the gradient
+        return pooled.detach(), input_x.grad
+
+    def npu_op_exec(self, input_x, input_grad):
+        """Same steps without the half special case; both results are moved to the CPU."""
+        input_x.requires_grad_(True)
+        pooled = torch.nn.AdaptiveAvgPool2d(input_grad)(input_x)
+        pooled.backward(pooled)
+        return pooled.detach().cpu(), input_x.grad.cpu()
+
+    def test_adaptiveAvgPool2d_backward_1(self, device):
+        """float32 randn input of shape (1, 8, 9) with output size (2, 3)."""
+        cpu_input = torch.randn((1, 8, 9), dtype=torch.float32)
+        npu_input = cpu_input.npu()
+        output_size = np.array((2, 3))
+        expected = self.cpu_op_exec(cpu_input, output_size)
+        actual = self.npu_op_exec(npu_input, output_size)
+        for cpu_part, npu_part in zip(expected, actual):
+            self.assertRtolEqual(cpu_part, npu_part)
+
+    def test_adaptiveAvgPool2d_backward_2(self, device):
+        """float32 randn input of shape (1, 3, 3, 3) with output size (2, 2)."""
+        cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32)
+        npu_input = cpu_input.npu()
+        output_size = np.array((2, 2))
+        expected = self.cpu_op_exec(cpu_input, output_size)
+        actual = self.npu_op_exec(npu_input, output_size)
+        for cpu_part, npu_part in zip(expected, actual):
+            self.assertRtolEqual(cpu_part, npu_part)
+
+    def test_adaptiveAvgPool2d_backward_fp16(self, device):
+        """float16 uniform(0, 1) input of shape (1, 3, 6, 6) with output size (5, 5)."""
+        cpu_input = torch.from_numpy(np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16))
+        npu_input = cpu_input.npu()
+        output_size = np.array((5, 5))
+        expected = self.cpu_op_exec(cpu_input, output_size)
+        actual = self.npu_op_exec(npu_input, output_size)
+        for cpu_part, npu_part in zip(expected, actual):
+            self.assertRtolEqual(cpu_part, npu_part)
+        
+instantiate_device_type_tests(TestAdaptiveAvgPool2dBackward, globals(), except_for="cpu")  # NOTE(review): presumably generates the device-specific test classes; "cpu" is excluded
+if __name__ == "__main__":
+    run_tests()  # executed only when this file is run as a script
diff --git a/test/test_network_ops/test_addbmm.py b/test/test_network_ops/test_addbmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9fe853f909d2eb1c19035d9519d408012a49eeb
--- /dev/null
+++ b/test/test_network_ops/test_addbmm.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestAddbmm(TestCase):
+    """Checks torch.addbmm (plain, out=, in-place, permuted batch2) between cpu and npu tensors."""
+
+    def generate_scalar(self, dtype, min_d, max_d):
+        """Draw a random scalar for the dtype name "float32" or "int32"; ValueError otherwise."""
+        if dtype == "float32":
+            return np.random.uniform(min_d, max_d)
+        if dtype == "int32":
+            return np.random.randint(min_d, max_d)
+        raise ValueError("unsupported dtype for generate_scalar: {!r}".format(dtype))
+
+    def cpu_op_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Return torch.addbmm(..., beta=scalar1, alpha=scalar2) as a numpy array."""
+        output = torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2)
+        return output.numpy()
+
+    def npu_op_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Same computation with all three tensors moved to "npu" first."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        output = torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2)
+        return output.to("cpu").numpy()
+
+    def npu_op_exec_out(self, input1, input2, input3, scalar1, scalar2, input4):
+        """Run the out= variant, writing the result into input4 moved to "npu"."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        output = input4.to("npu")
+        torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2, out=output)
+        return output.to("cpu").numpy()
+
+    def npu_op_exec_inplace(self, input1, input2, input3, scalar1, scalar2):
+        """Run the in-place variant addbmm_ on input1 and return it as a numpy array."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        input1.addbmm_(input2, input3, beta=scalar1, alpha=scalar2)
+        return input1.to("cpu").numpy()
+
+    def cpu_op_transpose_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Like cpu_op_exec, with the last two dims of input3 swapped first."""
+        input3_t = input3.permute(0, 2, 1)
+        output = torch.addbmm(input1, input2, input3_t, beta=scalar1, alpha=scalar2)
+        return output.numpy()
+
+    def npu_op_transpose_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Like npu_op_exec, with the last two dims of input3 swapped first."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        # Tensor.permute keeps this a torch op; np.transpose would go through numpy.
+        input3_t = input3.permute(0, 2, 1)
+        output = torch.addbmm(input1, input2, input3_t, beta=scalar1, alpha=scalar2)
+        return output.to("cpu").numpy()
+
+    def test_addbmm(self, device):
+        """addbmm, its out= variant and addbmm_ for float32 and int32 inputs."""
+        shape_format = [
+            [[np.float32, 0, [3, 5]], [np.float32, 0, [10, 3, 4]], [np.float32, 0, [10, 4, 5]], "float32"],
+            [[np.int32, 0, [3, 5]], [np.int32, 0, [10, 3, 4]], [np.int32, 0, [10, 4, 5]], "int32"]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[2], 0, 100)
+            cpu_input4, npu_input4 = create_common_tensor(item[0], 0, 100)
+
+            scalar1 = self.generate_scalar(item[3], 0, 10)
+            scalar2 = self.generate_scalar(item[3], 0, 10)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3, scalar1, scalar2)
+            npu_output1 = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3, scalar1, scalar2, npu_input4)
+            # The in-place call stays last because it may overwrite npu_input1.
+            npu_output2 = self.npu_op_exec_inplace(npu_input1, npu_input2, npu_input3, scalar1, scalar2)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output1)
+            self.assertRtolEqual(cpu_output, npu_output2)
+
+    def test_addbmm_transpose(self, device):
+        """addbmm with batch2 created as [10, 5, 7] and permuted to [10, 7, 5]."""
+        shape_format = [
+            [[np.float32, 0, [4, 5]], [np.float32, 0, [10, 4, 7]], [np.float32, 0, [10, 5, 7]], "float32"],
+            [[np.int32, 0, [4, 5]], [np.int32, 0, [10, 4, 7]], [np.int32, 0, [10, 5, 7]], "int32"]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[2], 0, 100)
+
+            scalar1 = self.generate_scalar(item[3], 0, 10)
+            scalar2 = self.generate_scalar(item[3], 0, 10)
+
+            cpu_transpose_output = self.cpu_op_transpose_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2)
+            npu_transpose_output = self.npu_op_transpose_exec(npu_input1, npu_input2, npu_input3, scalar1, scalar2)
+            self.assertRtolEqual(cpu_transpose_output, npu_transpose_output)
+
+
+instantiate_device_type_tests(TestAddbmm, globals(), except_for='cpu')  # NOTE(review): presumably generates the device-specific test classes; 'cpu' is excluded
+if __name__ == "__main__":
+    run_tests()  # executed only when this file is run as a script
\ No newline at end of file
diff --git a/test/test_network_ops/test_bmm.py b/test/test_network_ops/test_bmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..de204834bb3e7644060ebc02e4125fdd1713ebba
--- /dev/null
+++ b/test/test_network_ops/test_bmm.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestBatchMatMul(TestCase):
+    """Checks torch.bmm on the npu_input tensors against the cpu_input reference."""
+
+    def cpu_op_exec(self, input1, input2):
+        """Return torch.bmm(input1, input2) as a numpy array."""
+        return torch.bmm(input1, input2).numpy()
+
+    def npu_op_exec(self, input1, input2):
+        """Return torch.bmm(input1, input2) as a numpy array after moving it to "cpu"."""
+        return torch.bmm(input1, input2).to("cpu").numpy()
+
+    def bmm_auto_list_exec(self, shape):
+        """Compare torch.bmm between the cpu and npu tensors of each operand pair.
+
+        Each element of shape is indexed as [descriptor1, descriptor2]; both
+        descriptors go to create_common_tensor with the arguments 0 and 10.
+        """
+        for pair in shape:
+            cpu_input1, npu_input1 = create_common_tensor(pair[0], 0, 10)
+            cpu_input2, npu_input2 = create_common_tensor(pair[1], 0, 10)
+            # float16 operands are converted so the reference runs in float32.
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            expected = self.cpu_op_exec(cpu_input1, cpu_input2)
+            actual = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(expected.astype(actual.dtype), actual)
+
+    def test_batchmatmul_shape_format_fp16_3d(self, device):
+        """float16 (1, 3, 2) x (1, 2, 3) for every pairing of the formats 0, 3, 29."""
+        formats = [0, 3, 29]
+        lhs_shapes = [(1, 3, 2)]
+        rhs_shapes = [(1, 2, 3)]
+        lhs = [[np.float16, fmt, dims] for fmt in formats for dims in lhs_shapes]
+        rhs = [[np.float16, fmt, dims] for fmt in formats for dims in rhs_shapes]
+        cases = [[left, right] for left in lhs for right in rhs]
+        self.bmm_auto_list_exec(cases)
+
+    def test_batchmatmul_shape_format_fp32_3d(self, device):
+        """float32 (1, 3, 2) x (1, 2, 3) for every pairing of the formats 0, 3, 29."""
+        formats = [0, 3, 29]
+        lhs_shapes = [(1, 3, 2)]
+        rhs_shapes = [(1, 2, 3)]
+        lhs = [[np.float32, fmt, dims] for fmt in formats for dims in lhs_shapes]
+        rhs = [[np.float32, fmt, dims] for fmt in formats for dims in rhs_shapes]
+        cases = [[left, right] for left in lhs for right in rhs]
+        self.bmm_auto_list_exec(cases)
+
+instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu")  # NOTE(review): presumably generates the device-specific test classes; "cpu" is excluded
+if __name__ == "__main__":
+    run_tests()  # executed only when this file is run as a script
diff --git a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..65bf701255f5037a0cea14fa559918e697c1a23b
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
@@ -0,0 +1,55 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+// Issues the "Abs" operator with `self` as input and `result` as output and
+// returns `result`. Nothing about `result` is validated here.
+// Parameter order is (output, input).
+at::Tensor& abs_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) {
+  OpCommand absCommand;
+  absCommand.Name("Abs").Input(self).Output(result).Run();
+  return result;
+}
+
+// out= variant of abs. `result` is first passed through
+// OpPreparation::CheckOut against `self`; the pipe is then given CheckMemory,
+// the abs_out_npu_nocheck callback and `result` as the tensor to call it on.
+at::Tensor& NPUNativeFunctions::abs_out(const at::Tensor& self, at::Tensor& result) {
+  OpPreparation::CheckOut({self}, result, self);
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self}, {result})
+      .Func([&self](at::Tensor& out) { abs_out_npu_nocheck(out, self); })
+      .Call(result);
+}
+
+// Functional abs: the pipe is asked for an output via ApplyOutputSameAs(self).
+at::Tensor NPUNativeFunctions::abs(const at::Tensor& self) {
+  OpPipeWithApplyOut pipe;
+  return pipe.ApplyOutputSameAs(self)
+      .Func([&self](at::Tensor& out) { abs_out_npu_nocheck(out, self); }).Call();
+}
+
+at::Tensor& NPUNativeFunctions::abs_(at::Tensor& self) {  // in-place abs
+  abs_out(self, self);  // `self` serves as both the input and the out tensor
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..731ac93999f1e00034c98866c3fc7d8457491b6f
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
@@ -0,0 +1,56 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+// Issues the "Acos" operator with `self` as input and `result` as output and
+// returns `result`. Nothing about `result` is validated here.
+// Parameter order is (input, output).
+at::Tensor& acos_out_npu_nocheck(const at::Tensor& self, at::Tensor& result) {
+  OpCommand acosCommand;
+  acosCommand.Name("Acos").Input(self).Output(result).Run();
+
+  return result;
+}
+
+// out= variant of acos. `result` is passed through OpPreparation::CheckOut
+// against `self`; the pipe is then given CheckMemory, the
+// acos_out_npu_nocheck callback and `result` as the tensor to call it on.
+at::Tensor& NPUNativeFunctions::acos_out(const at::Tensor& self, at::Tensor& result) {
+  OpPipeWithDefinedOut pipe;
+  OpPreparation::CheckOut({self}, result, self);
+  return pipe.CheckMemory({self}, {result})
+      .Func([&self](at::Tensor& out) { acos_out_npu_nocheck(self, out); })
+      .Call(result);
+}
+
+// Functional acos: the pipe is asked for an output via ApplyOutputSameAs(self).
+at::Tensor NPUNativeFunctions::acos(const at::Tensor& self) {
+  OpPipeWithApplyOut pipe;
+  return pipe.ApplyOutputSameAs(self)
+      .Func([&self](at::Tensor& out) { acos_out_npu_nocheck(self, out); }).Call();
+}
+
+at::Tensor& NPUNativeFunctions::acos_(at::Tensor& self) {  // in-place acos
+  acos_out(self, self);  // `self` serves as both the input and the out tensor
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dff97430b145785997c9b7187b90b2f6f534445a
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+// Writes bmm(alpha * batch1, batch2), reduced by at::sum_to to the shape
+// {batch1.size(1), batch2.size(2)}, plus beta * self into `result`.
+at::Tensor& NPUNativeFunctions::addbmm_out(
+    const at::Tensor& self,
+    const at::Tensor& batch1,
+    const at::Tensor& batch2,
+    at::Scalar beta,
+    at::Scalar alpha,
+    at::Tensor& result) {
+  at::Tensor scaledProduct = at::bmm(at::mul(batch1, alpha), batch2);
+  int64_t reducedShape[2] = {batch1.size(1), batch2.size(2)};
+  // add_out(result, a, b, s) stores a + s * b, i.e. the reduced product + beta * self.
+  at::add_out(result, at::sum_to(scaledProduct, reducedShape), self, beta);
+  return result;
+}
+
+// Functional addbmm: sizes the output with addbmm_npu_output_size, creates it
+// through OpPreparation::ApplyTensor and delegates the math to addbmm_out.
+at::Tensor NPUNativeFunctions::addbmm(
+    const at::Tensor& self,
+    const at::Tensor& batch1,
+    const at::Tensor& batch2,
+    at::Scalar beta,
+    at::Scalar alpha) {
+  auto resultShape = addbmm_npu_output_size(self, batch1, batch2, beta, alpha);
+  at::Tensor result = OpPreparation::ApplyTensor(self, resultShape);
+  // addbmm_out fills `result`; its return value is not needed here.
+  addbmm_out(self, batch1, batch2, beta, alpha, result);
+  return result;
+}
+
+at::Tensor& NPUNativeFunctions::addbmm_(  // in-place addbmm: the result ends up in `self`
+    at::Tensor& self,
+    const at::Tensor& batch1,
+    const at::Tensor& batch2,
+    at::Scalar beta,
+    at::Scalar alpha) {
+  OpPreparation::CheckMemory({self, batch1, batch2}, {self});
+  if (NpuUtils::check_match(&self)) {
+    addbmm_out(self, batch1, batch2, beta, alpha, self);  // `self` is both input and output
+  } else {  // check_match failed: compute on the tensor from format_contiguous instead
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    at::Tensor updated = addbmm_out(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf);
+    NpuUtils::format_fresh_view(self, updated);  // hand the outcome back to `self`
+  }
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..38aceb87fd3faf9805639c1659673666e757ab15
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "c10/npu/OptionsManager.h"
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& NPUNativeFunctions::bmm_out(const at::Tensor& self, const at::Tensor& mat2, at::Tensor& result) {
+  at::Tensor contiguousResult = result.is_contiguous() ? result : result.contiguous();  // the operator writes here
+  // Operands flagged by is_transpose_last_two_dims are passed on unchanged.
+  at::Tensor contiguousSelf = self;
+  at::Tensor contiguousMat2 = mat2;
+  bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self);
+  bool isMat2T = CalcuOpUtil::is_transpose_last_two_dims(mat2);
+  // Unflagged operands are replaced by format_contiguous_add_copy_optimize's result.
+  if(!isSelfT){
+    contiguousSelf = NpuUtils::format_contiguous_add_copy_optimize(self);
+  }
+  if(!isMat2T){
+    contiguousMat2 = NpuUtils::format_contiguous_add_copy_optimize(mat2);
+  }
+  // NOTE(review): std::tie references the local `pass`; confirm InputWithFunc never reads it after the lambda returns.
+  auto func1 = [&contiguousSelf]() {
+      bool pass = false;
+      return std::tie(pass, contiguousSelf);
+  };
+  auto func2 = [&contiguousMat2]() {
+      bool pass = false;
+      return std::tie(pass, contiguousMat2);
+  };
+
+  // Run "BatchMatMul"; adj_x1 / adj_x2 receive the transposed flags computed above.
+  OpCommand cmd;
+  cmd.Name("BatchMatMul")
+      .InputWithFunc(func1)
+      .InputWithFunc(func2)
+      .Output(contiguousResult)
+      .Attr("adj_x1", isSelfT)
+      .Attr("adj_x2", isMat2T)
+      .Run();
+  // A non-contiguous `result` was computed into a separate tensor: copy it back.
+  if (!result.is_contiguous()) {
+    result.copy_(contiguousResult);
+  }
+  return result;
+}
+
+// Functional bmm: allocates a {self.size(0), self.size(1), mat2.size(2)}
+// output with at::empty_with_format and lets bmm_out fill it.
+at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat2) {
+  auto outputSize = {self.size(0), self.size(1), mat2.size(2)};
+
+  // Check whether the mm output is requested as NCHW. To be removed once the
+  // overall strategy for NLP models has been decided.
+  const auto dtype = self.scalar_type();
+  const bool isFloatOrHalf = dtype == at::ScalarType::Float || dtype == at::ScalarType::Half;
+  const bool useFractalNz = isFloatOrHalf && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable();
+
+  // Float/Half outputs get FRACTAL_NZ unless the switch is enabled; everything else gets ND.
+  at::Tensor result = at::empty_with_format(
+      outputSize, self.options(), useFractalNz ? ACL_FORMAT_FRACTAL_NZ : ACL_FORMAT_ND);
+
+  // bmm_out writes the product into `result`.
+  NPUNativeFunctions::bmm_out(self, mat2, result);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae50870d0f727c27fd9ee0009c638350eac84796
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
@@ -0,0 +1,42 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+// Fails via TORCH_CHECK unless `x` holds exactly one element; the message
+// names the function and the argument and reports the actual element count.
+static void check1d(const char* function_name, const char* argument_name, at::IntArrayRef x) {
+  const auto count = x.size();
+  TORCH_CHECK(
+      count == 1,
+      function_name, "() argument '", argument_name,
+      "' should contain one int (got ", count, ")");
+}
+
+// 1-D adaptive average pooling expressed through adaptive_avg_pool2d: a dim
+// is inserted at index 2, pooled to {1, output_size[0]}, then squeezed away.
+at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, at::IntArrayRef output_size) {
+  // `self` is checked to be 3-D and `output_size` to hold exactly one value.
+  at::checkDim("adaptive_avg_pool1d", at::TensorArg(self, "self", 1), 3);
+  check1d("adaptive_avg_pool1d", "output_size", output_size);
+  at::Tensor expanded = self.unsqueeze(2);
+  auto pooled = NPUNativeFunctions::adaptive_avg_pool2d(expanded, {1, output_size[0]});
+  return pooled.squeeze(2);
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file