From 4e9a3846351ea48ab87b432fd39ea4b59d7bfae5 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Tue, 25 Jan 2022 16:31:14 +0800
Subject: [PATCH 01/11] abs, acos, adaptive_avg_pool1d

---
 test/test_network_ops/test_abs.py             | 62 ++++++++++++++++
 test/test_network_ops/test_acos.py            | 68 ++++++++++++++++++
 .../test_adaptive_avg_pool1d.py               | 64 +++++++++++++++++
 .../test_adaptive_avg_pool2d_backward.py      | 71 +++++++++++++++++++
 torch_npu/csrc/aten/npu_native_functions.yaml |  7 ++
 torch_npu/csrc/aten/ops/AbsKernelNpu.cpp      | 55 ++++++++++++++
 torch_npu/csrc/aten/ops/AcosKernelNpu.cpp     | 56 +++++++++++++++
 .../pooling/AdaptiveAvgPool1dKernelNpu.cpp    | 42 +++++++++++
 8 files changed, 425 insertions(+)
 create mode 100644 test/test_network_ops/test_abs.py
 create mode 100644 test/test_network_ops/test_acos.py
 create mode 100644 test/test_network_ops/test_adaptive_avg_pool1d.py
 create mode 100644 test/test_network_ops/test_adaptive_avg_pool2d_backward.py
 create mode 100644 torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
 create mode 100644 torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
 create mode 100644 torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp

diff --git a/test/test_network_ops/test_abs.py b/test/test_network_ops/test_abs.py
new file mode 100644
index 0000000000..5e10d1d907
--- /dev/null
+++ b/test/test_network_ops/test_abs.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAbs(TestCase):
+    """Compares torch.abs on NPU tensors with the CPU result.
+
+    Each case is a [dtype, format, shape] list for create_common_tensor.
+    """
+
+    def cpu_op_exec(self, input):
+        """Return abs of a CPU tensor as a numpy array."""
+        return torch.abs(input).numpy()
+
+    def npu_op_exec(self, input):
+        """Return abs of the given tensor, copied to CPU, as numpy."""
+        return torch.abs(input).to("cpu").numpy()
+
+    def test_abs_shape_format_fp16(self, device):
+        """fp16 inputs; the CPU reference is computed in fp32."""
+        shapes = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        for fmt in (0, 3):
+            for shape in shapes:
+                cpu_input, npu_input = create_common_tensor(
+                    [np.float16, fmt, shape], -10, 10)
+                # Reference runs in fp32 and is cast back to fp16.
+                expected = self.cpu_op_exec(cpu_input.to(torch.float32))
+                actual = self.npu_op_exec(npu_input)
+                self.assertRtolEqual(expected.astype(np.float16), actual)
+
+    def test_abs_shape_format_fp32(self, device):
+        """fp32 inputs, compared without any dtype round trip."""
+        # Same shape/format grid as the fp16 test.
+        shapes = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        for fmt in (0, 3):
+            for shape in shapes:
+                cpu_input, npu_input = create_common_tensor(
+                    [np.float32, fmt, shape], -10, 10)
+                expected = self.cpu_op_exec(cpu_input)
+                actual = self.npu_op_exec(npu_input)
+                self.assertRtolEqual(expected, actual)
+
+instantiate_device_type_tests(TestAbs, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_network_ops/test_acos.py b/test/test_network_ops/test_acos.py
new file mode 100644
index 0000000000..36e9ffa893
--- /dev/null
+++ b/test/test_network_ops/test_acos.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAcos(TestCase):
+    """Compares torch.acos on NPU tensors with the CPU result."""
+
+    def cpu_op_exec(self, input_para):
+        """Return acos of a CPU tensor as a numpy array."""
+        return torch.acos(input_para).numpy()
+
+    def npu_op_exec(self, input_para):
+        """Return acos of the given tensor, copied to CPU, as numpy."""
+        return torch.acos(input_para).to("cpu").numpy()
+
+    def test_acos_common_shape_format(self, device):
+        """fp32 cases, compared directly."""
+        # NOTE(review): -1, 1 presumably bound the generated values; confirm.
+        cases = [
+            [np.float32, -1, 1],
+            [np.float32, -1, (64, 10)],
+            [np.float32, -1, (32, 1, 3)],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, -1, 1)
+            expected = self.cpu_op_exec(cpu_input)
+            actual = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(expected, actual)
+
+    def test_acos_float16_shape_format(self, device):
+        """fp16 cases; the CPU reference is computed in fp32."""
+        def cpu_op_exec_fp16(input_para):
+            # Run acos in fp32 on CPU, then cast the result to fp16.
+            upcast = input_para.to(torch.float32)
+            return torch.acos(upcast).numpy().astype(np.float16)
+
+        cases = [
+            [np.float16, -1, 1],
+            [np.float16, -1, (64, 10)],
+            [np.float16, -1, (31, 1, 3)],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, -1, 1)
+            expected = cpu_op_exec_fp16(cpu_input)
+            actual = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(expected, actual)
+
+instantiate_device_type_tests(TestAcos, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+        
diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py
new file mode 100644
index 0000000000..56ecbe229c
--- /dev/null
+++ b/test/test_network_ops/test_adaptive_avg_pool1d.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import torch.nn as nn
+import numpy as np
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAdaptiveAvgPool1d(TestCase):
+    def cpu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool1d(output_size)
+        output= m(input)
+        return output.numpy()
+
+    def npu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool1d(output_size).npu()
+        output = m(input)
+        return output.cpu().numpy()
+    
+    def test_AdaptiveAvgPool1d_shape_format_fp16(self, device):
+        shape_format = [
+                [np.float16, 0, (64, 10, 16)],
+                [np.float16, -1, (256, 2048, 8)],
+                [np.float16, 3, (32, 16, 16)]
+        ]
+        output_list = [(4), (3)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 10)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(cpu_output, npu_output, prec16=0.002)
+
+    def test_AdaptiveAvgPool1d_shape_format_fp32(self, device):
+        """fp32 cases: CPU vs NPU AdaptiveAvgPool1d, output sizes 4, 3, 1."""
+        cases = [
+            [np.float32, 0, (64, 10, 16)],
+            [np.float32, -1, (256, 2048, 8)],
+            [np.float32, 3, (32, 16, 16)],
+        ]
+        for case in cases:
+            cpu_input, npu_input = create_common_tensor(case, 1, 10)
+            for output_size in (4, 3, 1):
+                expected = self.cpu_op_exec(cpu_input, output_size)
+                actual = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(expected, actual, 0.001)
+
+instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
new file mode 100644
index 0000000000..8bdefbf7fc
--- /dev/null
+++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import numpy as np
+from torch.nn import functional as F
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
+class TestAdaptiveAvgPool2dBackward(TestCase):
+
+    def cpu_op_exec(self, input_x, input_grad):
+        input_x.requires_grad_(True)
+        m = torch.nn.AdaptiveAvgPool2d(input_grad)
+        output = m(input_x)
+        output.backward(output)
+        out = input_x.grad
+        return out
+
+    def npu_op_exec(self, input_x, input_grad):
+        input_x.requires_grad_(True)
+        m = torch.nn.AdaptiveAvgPool2d(input_grad)
+        output = m(input_x)
+        output.backward(output)
+        out = input_x.grad.cpu()
+        return out
+
+    def test_adaptiveAvgPool2d_backward_1(self, device):
+        cpu_input = torch.randn((1, 8, 9), dtype=torch.float32)
+        npu_input = cpu_input
+        output_size = np.array((2, 3))
+        cpu_output = self.cpu_op_exec(cpu_input, output_size)
+        npu_output = self.npu_op_exec(npu_input, output_size)
+        self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+        
+    def test_adaptiveAvgPool2d_backward_2(self, device):
+        cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32)
+        npu_input = cpu_input
+        output_size = np.array((2, 2))
+        cpu_output = self.cpu_op_exec(cpu_input, output_size)
+        npu_output = self.npu_op_exec(npu_input, output_size)
+        
+        self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+
+    def test_adaptiveAvgPool2d_backward_fp16(self, device):
+        input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16)
+        cpu_input = torch.from_numpy(input_x)
+        npu_input = cpu_input
+        output_size = np.array((5, 5))
+        cpu_input = cpu_input.to(torch.float32)
+        cpu_output = self.cpu_op_exec(cpu_input, output_size)
+        npu_output = self.npu_op_exec(npu_input, output_size)
+        cpu_output = cpu_output.to(torch.float16)
+        self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+        
+instantiate_device_type_tests(TestAdaptiveAvgPool2dBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 412fb9da7c..a9c96fdc0b 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -1,6 +1,13 @@
 backend: NPU
 cpp_namespace: at_npu::native
 supported:
+  - abs
+  - abs_
+  - abs.out
+  - acos
+  - acos_
+  - acos.out
+  - adaptive_avg_pool1d
   - add.Tensor
   - add.Scalar
   - add_.Tensor
diff --git a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
new file mode 100644
index 0000000000..de2d514e40
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
@@ -0,0 +1,55 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& abs_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) {  // Runs the "Abs" op: self in, result out.
+  OpCommand cmd;  // NOTE: argument order here is (result, self); acos_out_npu_nocheck takes (self, result).
+  cmd.Name("Abs")
+     .Input(self)
+     .Output(result)
+     .Run();
+  return result;  // the same reference the caller passed in
+}
+
+at::Tensor& NPUNativeFunctions::abs_out(const at::Tensor& self, at::Tensor& result) {  // out= variant: writes into the caller-supplied result.
+  OpPreparation::CheckOut(  // NOTE(review): presumably validates/prepares result against self; confirm in OpPreparation.
+      {self},
+      result,
+      self);
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self}, {result})  // NOTE(review): CheckMemory semantics are not visible here; confirm.
+   .Func([&self](at::Tensor& result){abs_out_npu_nocheck(result, self);})  // the op itself is delegated to the nocheck helper
+   .Call(result);
+}
+
+at::Tensor NPUNativeFunctions::abs(const at::Tensor& self) {
+  OpPipeWithApplyOut pipe;  // Functional variant: the pipe supplies the output tensor instead of the caller.
+  return pipe.ApplyOutputSameAs(self)  // NOTE(review): presumably creates an output matching self; confirm.
+    .Func([&self](at::Tensor& result) {abs_out_npu_nocheck(result, self);})  // same nocheck helper as abs_out
+    .Call();
+}
+
+at::Tensor& NPUNativeFunctions::abs_(at::Tensor& self) {
+  abs_out_npu(self, self);
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
new file mode 100644
index 0000000000..09acb183d6
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
@@ -0,0 +1,56 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& acos_out_npu_nocheck(const at::Tensor& self, at::Tensor& result) {  // Runs the "Acos" op: self in, result out.
+  OpCommand cmd;  // NOTE: argument order here is (self, result); abs_out_npu_nocheck takes (result, self).
+  cmd.Name("Acos")
+     .Input(self)
+     .Output(result)
+     .Run();
+
+  return result;  // the same reference the caller passed in
+}
+
+at::Tensor& NPUNativeFunctions::acos_out(const at::Tensor& self, at::Tensor& result) {  // out= variant: writes into the caller-supplied result.
+  OpPipeWithDefinedOut pipe;
+  OpPreparation::CheckOut(  // NOTE(review): presumably validates/prepares result against self; confirm in OpPreparation.
+      {self},
+      result,
+      self);  // runs before the pipe is used, as in abs_out
+  return pipe.CheckMemory({self}, {result})  // NOTE(review): CheckMemory semantics are not visible here; confirm.
+   .Func([&self](at::Tensor& result){acos_out_npu_nocheck(self, result);})  // the op itself is delegated to the nocheck helper
+   .Call(result);
+}
+
+at::Tensor NPUNativeFunctions::acos(const at::Tensor& self) {
+  OpPipeWithApplyOut pipe;  // Functional variant: the pipe supplies the output tensor instead of the caller.
+  return pipe.ApplyOutputSameAs(self)  // NOTE(review): presumably creates an output matching self; confirm.
+    .Func([&self](at::Tensor& result) {acos_out_npu_nocheck(self, result);})  // same nocheck helper as acos_out
+    .Call();
+}
+
+at::Tensor& NPUNativeFunctions::acos_(at::Tensor& self) {
+  acos_out_npu(self, self);
+  return self;
+}
+
+} // namespace native
+} // namespace at
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
new file mode 100644
index 0000000000..d30939a8b4
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
@@ -0,0 +1,42 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+static void check1d(
+    const char* function_name,
+    const char* argument_name,
+    IntArrayRef x) {
+  TORCH_CHECK(
+      x.size() == 1,
+      function_name, "() argument '", argument_name,
+      "' should contain one int (got ", x.size(), ")");
+}
+
+at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, IntArrayRef output_size) {
+  checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3);
+  check1d("adaptive_avg_pool1d", "output_size", output_size);
+// construct the output tensor of the NPU
+  auto output = NPUNativeFunctions::adaptive_avg_pool2d(
+      self.unsqueeze(2),
+      {1, output_size[0]});
+
+  return output.squeeze(2);
+}
+
+} // namespace native
+} // namespace at
\ No newline at end of file
-- 
Gitee


From bc3b850b19ba5196065b2cb5bff5486e539fb535 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Tue, 25 Jan 2022 17:31:54 +0800
Subject: [PATCH 02/11] abs, acos, adaptive_avg_pool1d, fix bug

---
 torch_npu/csrc/aten/ops/AbsKernelNpu.cpp                      | 2 +-
 torch_npu/csrc/aten/ops/AcosKernelNpu.cpp                     | 4 ++--
 .../csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
index de2d514e40..65bf701255 100644
--- a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp
@@ -47,7 +47,7 @@ at::Tensor NPUNativeFunctions::abs(const at::Tensor& self) {
 }
 
 at::Tensor& NPUNativeFunctions::abs_(at::Tensor& self) {
-  abs_out_npu(self, self);
+  abs_out(self, self);
   return self;
 }
 
diff --git a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
index 09acb183d6..731ac93999 100644
--- a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp
@@ -48,9 +48,9 @@ at::Tensor NPUNativeFunctions::acos(const at::Tensor& self) {
 }
 
 at::Tensor& NPUNativeFunctions::acos_(at::Tensor& self) {
-  acos_out_npu(self, self);
+  acos_out(self, self);
   return self;
 }
 
 } // namespace native
-} // namespace at
\ No newline at end of file
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
index d30939a8b4..64a2e08fb4 100644
--- a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
@@ -39,4 +39,4 @@ at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, IntAr
 }
 
 } // namespace native
-} // namespace at
\ No newline at end of file
+} // namespace at_npu
\ No newline at end of file
-- 
Gitee


From 0cd1567f240caca872f9990f23088212c21b98ae Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Tue, 25 Jan 2022 17:35:21 +0800
Subject: [PATCH 03/11] adaptive_avg_pool1d: qualify IntArrayRef with at::

---
 .../csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
index 64a2e08fb4..19090b9af8 100644
--- a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
@@ -20,14 +20,14 @@ namespace native {
 static void check1d(
     const char* function_name,
     const char* argument_name,
-    IntArrayRef x) {
+    at::IntArrayRef x) {
   TORCH_CHECK(
       x.size() == 1,
       function_name, "() argument '", argument_name,
       "' should contain one int (got ", x.size(), ")");
 }
 
-at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, IntArrayRef output_size) {
+at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, at::IntArrayRef output_size) {
   checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3);
   check1d("adaptive_avg_pool1d", "output_size", output_size);
 // construct the output tensor of the NPU
-- 
Gitee


From 691f4bf4aad3d9f4b036a8020c86a763b27e59f0 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Tue, 25 Jan 2022 17:38:19 +0800
Subject: [PATCH 04/11] adaptive_avg_pool1d: qualify checkDim and TensorArg with at::

---
 torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
index 19090b9af8..ae50870d0f 100644
--- a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp
@@ -28,7 +28,7 @@ static void check1d(
 }
 
 at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, at::IntArrayRef output_size) {
-  checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3);
+  at::checkDim("adaptive_avg_pool1d", at::TensorArg(self, "self", 1), 3);
   check1d("adaptive_avg_pool1d", "output_size", output_size);
 // construct the output tensor of the NPU
   auto output = NPUNativeFunctions::adaptive_avg_pool2d(
-- 
Gitee


From 79340b0c1c0d3f282127941f91e1ffa9b5d9126f Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 16:14:48 +0800
Subject: [PATCH 05/11] ut fix: fp32 CPU reference for fp16 pooling tests; drop abs/acos/adaptive_avg_pool1d from supported list

---
 .../test_adaptive_avg_pool1d.py               |  2 +-
 .../test_adaptive_avg_pool2d_backward.py      | 19 ++++++++++++-------
 torch_npu/csrc/aten/npu_native_functions.yaml |  7 -------
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py
index 56ecbe229c..dadff381e2 100644
--- a/test/test_network_ops/test_adaptive_avg_pool1d.py
+++ b/test/test_network_ops/test_adaptive_avg_pool1d.py
@@ -41,7 +41,7 @@ class TestAdaptiveAvgPool1d(TestCase):
         for item in shape_format:
             cpu_input, npu_input = create_common_tensor(item, 1, 10)
             for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                cpu_output = self.cpu_op_exec(cpu_input.float(), output_size).half()
                 npu_output = self.npu_op_exec(npu_input, output_size)
                 self.assertRtolEqual(cpu_output, npu_output, prec16=0.002)
 
diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
index 8bdefbf7fc..095fdd2188 100644
--- a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
+++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
@@ -25,9 +25,12 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
     def cpu_op_exec(self, input_x, input_grad):
         input_x.requires_grad_(True)
         m = torch.nn.AdaptiveAvgPool2d(input_grad)
-        output = m(input_x)
+        if input_x.dtype == torch.half:
+            output = m(input_x.float()).half()
+        else:
+            output = m(input_x)
         output.backward(output)
-        out = input_x.grad
+        out = output.detach(), input_x.grad
         return out
 
     def npu_op_exec(self, input_x, input_grad):
@@ -35,7 +38,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
         m = torch.nn.AdaptiveAvgPool2d(input_grad)
         output = m(input_x)
         output.backward(output)
-        out = input_x.grad.cpu()
+        out = output.detach().cpu(), input_x.grad.cpu()
         return out
 
     def test_adaptiveAvgPool2d_backward_1(self, device):
@@ -44,7 +47,8 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
         output_size = np.array((2, 3))
         cpu_output = self.cpu_op_exec(cpu_input, output_size)
         npu_output = self.npu_op_exec(npu_input, output_size)
-        self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+        self.assertRtolEqual(cpu_output[0], npu_output[0])
+        self.assertRtolEqual(cpu_output[1], npu_output[1])
         
     def test_adaptiveAvgPool2d_backward_2(self, device):
         cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32)
@@ -52,8 +56,8 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
         output_size = np.array((2, 2))
         cpu_output = self.cpu_op_exec(cpu_input, output_size)
         npu_output = self.npu_op_exec(npu_input, output_size)
-        
-        self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+        self.assertRtolEqual(cpu_output[0], npu_output[0])
+        self.assertRtolEqual(cpu_output[1], npu_output[1])
 
     def test_adaptiveAvgPool2d_backward_fp16(self, device):
         input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16)
@@ -64,7 +68,8 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
         cpu_output = self.cpu_op_exec(cpu_input, output_size)
         npu_output = self.npu_op_exec(npu_input, output_size)
         cpu_output = cpu_output.to(torch.float16)
-        self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+        self.assertRtolEqual(cpu_output[0], npu_output[0])
+        self.assertRtolEqual(cpu_output[1], npu_output[1])
         
 instantiate_device_type_tests(TestAdaptiveAvgPool2dBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index a9c96fdc0b..412fb9da7c 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -1,13 +1,6 @@
 backend: NPU
 cpp_namespace: at_npu::native
 supported:
-  - abs
-  - abs_
-  - abs.out
-  - acos
-  - acos_
-  - acos.out
-  - adaptive_avg_pool1d
   - add.Tensor
   - add.Scalar
   - add_.Tensor
-- 
Gitee


From 5a4d42268aa8de81932e046db5810bf189ed84b9 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 16:42:04 +0800
Subject: [PATCH 06/11] ut fix: return tensors from pool1d helpers; move pool2d backward inputs to NPU

---
 test/test_network_ops/test_adaptive_avg_pool1d.py         | 4 ++--
 .../test_network_ops/test_adaptive_avg_pool2d_backward.py | 8 +++-----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py
index dadff381e2..5775934385 100644
--- a/test/test_network_ops/test_adaptive_avg_pool1d.py
+++ b/test/test_network_ops/test_adaptive_avg_pool1d.py
@@ -24,12 +24,12 @@ class TestAdaptiveAvgPool1d(TestCase):
     def cpu_op_exec(self, input, output_size):
         m = nn.AdaptiveAvgPool1d(output_size)
         output= m(input)
-        return output.numpy()
+        return output
 
     def npu_op_exec(self, input, output_size):
         m = nn.AdaptiveAvgPool1d(output_size).npu()
         output = m(input)
-        return output.cpu().numpy()
+        return output.cpu()
     
     def test_AdaptiveAvgPool1d_shape_format_fp16(self, device):
         shape_format = [
diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
index 095fdd2188..5f60db63ea 100644
--- a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
+++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py
@@ -43,7 +43,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
 
     def test_adaptiveAvgPool2d_backward_1(self, device):
         cpu_input = torch.randn((1, 8, 9), dtype=torch.float32)
-        npu_input = cpu_input
+        npu_input = cpu_input.npu()
         output_size = np.array((2, 3))
         cpu_output = self.cpu_op_exec(cpu_input, output_size)
         npu_output = self.npu_op_exec(npu_input, output_size)
@@ -52,7 +52,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
         
     def test_adaptiveAvgPool2d_backward_2(self, device):
         cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32)
-        npu_input = cpu_input
+        npu_input = cpu_input.npu()
         output_size = np.array((2, 2))
         cpu_output = self.cpu_op_exec(cpu_input, output_size)
         npu_output = self.npu_op_exec(npu_input, output_size)
@@ -62,12 +62,10 @@ class TestAdaptiveAvgPool2dBackward(TestCase):
     def test_adaptiveAvgPool2d_backward_fp16(self, device):
         input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16)
         cpu_input = torch.from_numpy(input_x)
-        npu_input = cpu_input
+        npu_input = cpu_input.npu()
         output_size = np.array((5, 5))
-        cpu_input = cpu_input.to(torch.float32)
         cpu_output = self.cpu_op_exec(cpu_input, output_size)
         npu_output = self.npu_op_exec(npu_input, output_size)
-        cpu_output = cpu_output.to(torch.float16)
         self.assertRtolEqual(cpu_output[0], npu_output[0])
         self.assertRtolEqual(cpu_output[1], npu_output[1])
         
-- 
Gitee


From 935347f46a94a881cbf0c0235f4fd396ba5108d9 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 20:51:02 +0800
Subject: [PATCH 07/11] addbmm, bmm

---
 test/test_network_ops/test_addbmm.py        | 128 ++++++++++++++++++++
 test/test_network_ops/test_bmm.py           |  73 +++++++++++
 torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp |  70 +++++++++++
 torch_npu/csrc/aten/ops/BmmKernelNpu.cpp    |  88 ++++++++++++++
 4 files changed, 359 insertions(+)
 create mode 100644 test/test_network_ops/test_addbmm.py
 create mode 100644 test/test_network_ops/test_bmm.py
 create mode 100644 torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
 create mode 100644 torch_npu/csrc/aten/ops/BmmKernelNpu.cpp

diff --git a/test/test_network_ops/test_addbmm.py b/test/test_network_ops/test_addbmm.py
new file mode 100644
index 0000000000..c9fe853f90
--- /dev/null
+++ b/test/test_network_ops/test_addbmm.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestAddbmm(TestCase):
+    """Check torch.addbmm on the NPU device against the CPU result."""
+
+    def generate_scalar(self, dtype, min_d, max_d):
+        """Return a random scalar; dtype is the string "float32" or "int32"."""
+        if dtype == "float32":
+            return np.random.uniform(min_d, max_d)
+        if dtype == "int32":
+            return np.random.randint(min_d, max_d)
+        raise ValueError("unsupported scalar dtype: {}".format(dtype))
+
+    def cpu_op_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Compute addbmm on the given tensors and return a NumPy array."""
+        return torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2).numpy()
+
+    def npu_op_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Compute addbmm on "npu" and return the result as a NumPy array."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        output = torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2)
+        return output.to("cpu").numpy()
+
+    def npu_op_exec_out(self, input1, input2, input3, scalar1, scalar2, input4):
+        """Compute addbmm on "npu" with out=input4 and return a NumPy array."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        output = input4.to("npu")
+        torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2, out=output)
+        return output.to("cpu").numpy()
+
+    def npu_op_exec_inplace(self, input1, input2, input3, scalar1, scalar2):
+        """Apply input1.addbmm_ on "npu" and return input1 as a NumPy array."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        input1.addbmm_(input2, input3, beta=scalar1, alpha=scalar2)
+        return input1.to("cpu").numpy()
+
+    def cpu_op_transpose_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Compute addbmm with input3's last two dims swapped; return NumPy."""
+        input3_t = input3.permute(0, 2, 1)
+        return torch.addbmm(input1, input2, input3_t, beta=scalar1, alpha=scalar2).numpy()
+
+    def npu_op_transpose_exec(self, input1, input2, input3, scalar1, scalar2):
+        """Same as cpu_op_transpose_exec, but computed on "npu"."""
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        # Use Tensor.permute so the transposed view is built on the device tensor itself.
+        input3_t = input3.permute(0, 2, 1)
+        output = torch.addbmm(input1, input2, input3_t, beta=scalar1, alpha=scalar2)
+        return output.to("cpu").numpy()
+
+    def test_addbmm(self, device):
+        """addbmm, its out= form and addbmm_ must all match the CPU output."""
+        shape_format = [
+            [[np.float32, 0, [3, 5]], [np.float32, 0, [10, 3, 4]], [np.float32, 0, [10, 4, 5]], "float32"],
+            [[np.int32, 0, [3, 5]], [np.int32, 0, [10, 3, 4]], [np.int32, 0, [10, 4, 5]], "int32"]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[2], 0, 100)
+            cpu_input4, npu_input4 = create_common_tensor(item[0], 0, 100)
+
+            scalar1 = self.generate_scalar(item[3], 0, 10)
+            scalar2 = self.generate_scalar(item[3], 0, 10)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3, scalar1, scalar2)
+
+            # The in-place call goes last: addbmm_ writes into its first argument.
+            npu_output1 = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3, scalar1, scalar2, npu_input4)
+            npu_output2 = self.npu_op_exec_inplace(npu_input1, npu_input2, npu_input3, scalar1, scalar2)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output1)
+            self.assertRtolEqual(cpu_output, npu_output2)
+
+    def test_addbmm_transpose(self, device):
+        """addbmm with batch2 given as a permuted view must match the CPU output."""
+        shape_format = [
+            [[np.float32, 0, [4, 5]], [np.float32, 0, [10, 4, 7]], [np.float32, 0, [10, 5, 7]], "float32"],
+            [[np.int32, 0, [4, 5]], [np.int32, 0, [10, 4, 7]], [np.int32, 0, [10, 5, 7]], "int32"]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[2], 0, 100)
+
+            scalar1 = self.generate_scalar(item[3], 0, 10)
+            scalar2 = self.generate_scalar(item[3], 0, 10)
+
+            cpu_transpose_output = self.cpu_op_transpose_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2)
+            npu_transpose_output = self.npu_op_transpose_exec(npu_input1, npu_input2, npu_input3, scalar1, scalar2)
+
+            self.assertRtolEqual(cpu_transpose_output, npu_transpose_output)
+
+
+instantiate_device_type_tests(TestAddbmm, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_bmm.py b/test/test_network_ops/test_bmm.py
new file mode 100644
index 0000000000..5c5147b975
--- /dev/null
+++ b/test/test_network_ops/test_bmm.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+class TestBatchMatMul(TestCase):
+  def cpu_op_exec(self, input1, input2):
+      output = torch.bmm(input1, input2)
+      output = output.numpy()
+      return output
+
+  def npu_op_exec(self, input1, input2):
+      output = torch.bmm(input1, input2)
+      output = output.to("cpu")
+      output = output.numpy()
+      return output
+
+  def bmm_auto_list_exec(self, shape):
+      for item in shape:
+          cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
+          cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10)
+          if cpu_input1.dtype == torch.float16:
+              cpu_input1 = cpu_input1.to(torch.float32)
+          if cpu_input2.dtype == torch.float16:
+              cpu_input2 = cpu_input2.to(torch.float32)
+          cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+          npu_output = self.npu_op_exec(npu_input1, npu_input2)
+          cpu_output = cpu_output.astype(npu_output.dtype)
+          self.assertRtolEqual(cpu_output, npu_output)
+
+  def test_batchmatmul_shape_format_fp16_3d(self, device):
+      format_list = [0, 3, 29]
+      shape_list = [(1, 3, 2)]
+      shape_format1 = [[np.float16, i, j]
+                        for i in format_list for j in shape_list]
+      format_list = [0, 3, 29]
+      shape_list = [(1, 2, 3)]
+      shape_format2 = [[np.float16, i, j]
+                        for i in format_list for j in shape_list]
+      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
+      self.bmm_auto_list_exec(shape_format)
+
+  def test_batchmatmul_shape_format_fp32_3d(self, device):
+      format_list = [0, 3, 29]
+      shape_list = [(1, 3, 2)]
+      shape_format1 = [[np.float32, i, j]
+                        for i in format_list for j in shape_list]
+      format_list = [0, 3, 29]
+      shape_list = [(1, 2, 3)]
+      shape_format2 = [[np.float32, i, j]
+                        for i in format_list for j in shape_list]
+      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
+      self.bmm_auto_list_exec(shape_format)
+
+instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
new file mode 100644
index 0000000000..484467bcdf
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& NPUNativeFunctions::addbmm_out( 
+    const at::Tensor& self, 
+    const at::Tensor& batch1, 
+    const at::Tensor& batch2,
+    at::Scalar beta,
+    at::Scalar alpha,
+    at::Tensor& result) {
+  at::Tensor MulResult = at::mul(batch1, alpha);
+  at::Tensor bmmResult = at::bmm(MulResult,batch2);
+  int64_t dim[2] = {batch1.size(1), batch2.size(2)};
+  at::Tensor sumResult = at::sum_to(bmmResult, dim);
+  // sumResult + self*beta
+  at::add_out(result, sumResult, self, beta); 
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::addbmm(
+    const at::Tensor& self,
+    const at::Tensor& batch1,
+    const at::Tensor& batch2,
+    at::Scalar beta,
+    at::Scalar alpha) {
+  // calculate the output size
+  auto outputSize = addbmm_npu_output_size(self, batch1, batch2, beta, alpha);
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+  // calculate the output result of the NPU
+  addbmm_out(self, batch1, batch2, beta, alpha, result);
+  return result;
+}
+
+Tensor& NPUNativeFunctions::addbmm_(
+    at::Tensor& self,
+    const at::Tensor& batch1,
+    const at::Tensor& batch2,
+    at::Scalar beta,
+    at::Scalar alpha) {
+  OpPreparation::CheckMemory({self, batch1, batch2}, {self});
+  if (!NpuUtils::check_match(&self)) {
+    at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    at::Tensor result = addbmm_out_npu(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    addbmm_out(self, batch1, batch2, beta, alpha, self);
+  }
+  return self;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
new file mode 100644
index 0000000000..c93fbe3cc8
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
@@ -0,0 +1,88 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "c10/npu/OptionsManager.h"
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor& NPUNativeFunctions::bmm_out(const at::Tensor& self, const at::Tensor& mat2, at::Tensor& result) {
+  at::Tensor contiguousResult = result.is_contiguous() ? result : result.contiguous();
+
+  at::Tensor contiguousSelf = self;
+  at::Tensor contiguousMat2 = mat2;
+  bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self);
+  bool isMat2T = CalcuOpUtil::is_transpose_last_two_dims(mat2);
+
+  if(!isSelfT){
+    contiguousSelf = NpuUtils::format_contiguous_add_copy_optimize(self);
+  }
+  if(!isMat2T){
+    contiguousMat2 = NpuUtils::format_contiguous_add_copy_optimize(mat2);
+  }
+
+  auto func1 = [&contiguousSelf]() {
+      bool pass = false;
+      return std::tie(pass, contiguousSelf);
+  };
+  auto func2 = [&contiguousMat2]() {
+      bool pass = false;
+      return std::tie(pass, contiguousMat2);
+  };
+
+  // executing the NPU operator
+  OpCommand cmd;
+  cmd.Name("BatchMatMul")
+      .InputWithFunc(func1)
+      .InputWithFunc(func2)
+      .Output(contiguousResult)
+      .Attr("adj_x1", isSelfT)
+      .Attr("adj_x2", isMat2T)
+      .Run();
+
+  if (!result.is_contiguous()) {
+    result.copy_(contiguousResult);
+  }
+  return result;
+}
+
+at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat2) {
+  // calculate the output size
+  auto outputSize = {self.size(0), self.size(1), mat2.size(2)};
+
+  // construct the output tensor of the NPU
+  at::Tensor result;
+
+  // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去
+  if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) &&
+      !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) {
+    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
+  } else {
+    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
+  }
+
+  // calculate the output result of the NPU
+  NPUNativeFunctions::bmm_out(self, mat2, result);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at_npu
-- 
Gitee


From a683cd88207dec00c2f2f58f42762b80ccbb5c16 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 20:57:55 +0800
Subject: [PATCH 08/11] addbmm, bmm

---
 torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 2 +-
 torch_npu/csrc/aten/ops/BmmKernelNpu.cpp    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
index 484467bcdf..f5e57820c4 100644
--- a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
@@ -43,7 +43,7 @@ at::Tensor NPUNativeFunctions::addbmm(
   // calculate the output size
   auto outputSize = addbmm_npu_output_size(self, batch1, batch2, beta, alpha);
   // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+  at::Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   // calculate the output result of the NPU
   addbmm_out(self, batch1, batch2, beta, alpha, result);
   return result;
diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
index c93fbe3cc8..38aceb87fd 100644
--- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp
@@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat
   // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去
   if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) &&
       !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) {
-    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
+    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
+    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
   }
 
   // calculate the output result of the NPU
-- 
Gitee


From 131dc05d11685bef358b9d121d8804ed3591ded1 Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 21:01:10 +0800
Subject: [PATCH 09/11] addbmm, bmm

---
 torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
index f5e57820c4..98f82b8428 100644
--- a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
@@ -49,7 +49,7 @@ at::Tensor NPUNativeFunctions::addbmm(
   return result;
 }
 
-Tensor& NPUNativeFunctions::addbmm_(
+at::Tensor& NPUNativeFunctions::addbmm_(
     at::Tensor& self,
     const at::Tensor& batch1,
     const at::Tensor& batch2,
-- 
Gitee


From 8c1cec8a1d7916f75e790f3f0f146ba4a5351dab Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 21:03:02 +0800
Subject: [PATCH 10/11] addbmm, bmm

---
 torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
index 98f82b8428..dff97430b1 100644
--- a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp
@@ -58,7 +58,7 @@ at::Tensor& NPUNativeFunctions::addbmm_(
   OpPreparation::CheckMemory({self, batch1, batch2}, {self});
   if (!NpuUtils::check_match(&self)) {
     at::Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    at::Tensor result = addbmm_out_npu(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf);
+    at::Tensor result = addbmm_out(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf);
     NpuUtils::format_fresh_view(self, result);
   } else {
     addbmm_out(self, batch1, batch2, beta, alpha, self);
-- 
Gitee


From aa9f05b17f19c16cd4a5cc387535482fdd43be0b Mon Sep 17 00:00:00 2001
From: wangxiao <wangxiao99@huawei.com>
Date: Wed, 26 Jan 2022 21:19:44 +0800
Subject: [PATCH 11/11] clean code

---
 test/test_network_ops/test_abs.py             |  8 +-
 .../test_adaptive_avg_pool1d.py               |  8 +-
 test/test_network_ops/test_bmm.py             | 86 +++++++++----------
 3 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/test/test_network_ops/test_abs.py b/test/test_network_ops/test_abs.py
index 5e10d1d907..cf44ee4bc0 100644
--- a/test/test_network_ops/test_abs.py
+++ b/test/test_network_ops/test_abs.py
@@ -20,13 +20,13 @@ from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type
 from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
 
 class TestAbs(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.abs(input)
+    def cpu_op_exec(self, input1):
+        output = torch.abs(input1)
         output = output.numpy()
         return output
 
-    def npu_op_exec(self, input):
-        output = torch.abs(input)
+    def npu_op_exec(self, input1):
+        output = torch.abs(input1)
         output = output.to("cpu")
         output = output.numpy()
         return output
diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py
index 5775934385..d39d76da23 100644
--- a/test/test_network_ops/test_adaptive_avg_pool1d.py
+++ b/test/test_network_ops/test_adaptive_avg_pool1d.py
@@ -21,14 +21,14 @@ from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type
 from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
 
 class TestAdaptiveAvgPool1d(TestCase):
-    def cpu_op_exec(self, input, output_size):
+    def cpu_op_exec(self, input1, output_size):
         m = nn.AdaptiveAvgPool1d(output_size)
-        output= m(input)
+        output = m(input1)
         return output
 
-    def npu_op_exec(self, input, output_size):
+    def npu_op_exec(self, input1, output_size):
         m = nn.AdaptiveAvgPool1d(output_size).npu()
-        output = m(input)
+        output = m(input1)
         return output.cpu()
     
     def test_AdaptiveAvgPool1d_shape_format_fp16(self, device):
diff --git a/test/test_network_ops/test_bmm.py b/test/test_network_ops/test_bmm.py
index 5c5147b975..de204834bb 100644
--- a/test/test_network_ops/test_bmm.py
+++ b/test/test_network_ops/test_bmm.py
@@ -20,53 +20,53 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests
 from torch_npu.testing.util_test import create_common_tensor
 
 class TestBatchMatMul(TestCase):
-  def cpu_op_exec(self, input1, input2):
-      output = torch.bmm(input1, input2)
-      output = output.numpy()
-      return output
+    def cpu_op_exec(self, input1, input2):
+        output = torch.bmm(input1, input2)
+        output = output.numpy()
+        return output
 
-  def npu_op_exec(self, input1, input2):
-      output = torch.bmm(input1, input2)
-      output = output.to("cpu")
-      output = output.numpy()
-      return output
+    def npu_op_exec(self, input1, input2):
+        output = torch.bmm(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
 
-  def bmm_auto_list_exec(self, shape):
-      for item in shape:
-          cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
-          cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10)
-          if cpu_input1.dtype == torch.float16:
-              cpu_input1 = cpu_input1.to(torch.float32)
-          if cpu_input2.dtype == torch.float16:
-              cpu_input2 = cpu_input2.to(torch.float32)
-          cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-          npu_output = self.npu_op_exec(npu_input1, npu_input2)
-          cpu_output = cpu_output.astype(npu_output.dtype)
-          self.assertRtolEqual(cpu_output, npu_output)
+    def bmm_auto_list_exec(self, shape):
+        for item in shape:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
 
-  def test_batchmatmul_shape_format_fp16_3d(self, device):
-      format_list = [0, 3, 29]
-      shape_list = [(1, 3, 2)]
-      shape_format1 = [[np.float16, i, j]
-                        for i in format_list for j in shape_list]
-      format_list = [0, 3, 29]
-      shape_list = [(1, 2, 3)]
-      shape_format2 = [[np.float16, i, j]
-                        for i in format_list for j in shape_list]
-      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
-      self.bmm_auto_list_exec(shape_format)
+    def test_batchmatmul_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(1, 3, 2)]
+        shape_format1 = [[np.float16, i, j]
+                            for i in format_list for j in shape_list]
+        format_list = [0, 3, 29]
+        shape_list = [(1, 2, 3)]
+        shape_format2 = [[np.float16, i, j]
+                            for i in format_list for j in shape_list]
+        shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
+        self.bmm_auto_list_exec(shape_format)
 
-  def test_batchmatmul_shape_format_fp32_3d(self, device):
-      format_list = [0, 3, 29]
-      shape_list = [(1, 3, 2)]
-      shape_format1 = [[np.float32, i, j]
-                        for i in format_list for j in shape_list]
-      format_list = [0, 3, 29]
-      shape_list = [(1, 2, 3)]
-      shape_format2 = [[np.float32, i, j]
-                        for i in format_list for j in shape_list]
-      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
-      self.bmm_auto_list_exec(shape_format)
+    def test_batchmatmul_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(1, 3, 2)]
+        shape_format1 = [[np.float32, i, j]
+                            for i in format_list for j in shape_list]
+        format_list = [0, 3, 29]
+        shape_list = [(1, 2, 3)]
+        shape_format2 = [[np.float32, i, j]
+                            for i in format_list for j in shape_list]
+        shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
+        self.bmm_auto_list_exec(shape_format)
 
 instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu")
 if __name__ == "__main__":
-- 
Gitee