From 4e9a3846351ea48ab87b432fd39ea4b59d7bfae5 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Tue, 25 Jan 2022 16:31:14 +0800 Subject: [PATCH 01/11] abs, acos, adaptive_avg_pool1d --- test/test_network_ops/test_abs.py | 62 ++++++++++++++++ test/test_network_ops/test_acos.py | 68 ++++++++++++++++++ .../test_adaptive_avg_pool1d.py | 64 +++++++++++++++++ .../test_adaptive_avg_pool2d_backward.py | 71 +++++++++++++++++++ torch_npu/csrc/aten/npu_native_functions.yaml | 7 ++ torch_npu/csrc/aten/ops/AbsKernelNpu.cpp | 55 ++++++++++++++ torch_npu/csrc/aten/ops/AcosKernelNpu.cpp | 56 +++++++++++++++ .../pooling/AdaptiveAvgPool1dKernelNpu.cpp | 42 +++++++++++ 8 files changed, 425 insertions(+) create mode 100644 test/test_network_ops/test_abs.py create mode 100644 test/test_network_ops/test_acos.py create mode 100644 test/test_network_ops/test_adaptive_avg_pool1d.py create mode 100644 test/test_network_ops/test_adaptive_avg_pool2d_backward.py create mode 100644 torch_npu/csrc/aten/ops/AbsKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/AcosKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp diff --git a/test/test_network_ops/test_abs.py b/test/test_network_ops/test_abs.py new file mode 100644 index 0000000000..5e10d1d907 --- /dev/null +++ b/test/test_network_ops/test_abs.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE + +class TestAbs(TestCase): + def cpu_op_exec(self, input): + output = torch.abs(input) + output = output.numpy() + return output + + def npu_op_exec(self, input): + output = torch.abs(input) + output = output.to("cpu") + output = output.numpy() + return output + + def test_abs_shape_format_fp16(self, device): + format_list = [0, 3] + shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, -10, 10) + cpu_input = cpu_input.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_abs_shape_format_fp32(self, device): + format_list = [0, 3] + shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, -10, 10) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestAbs, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_acos.py b/test/test_network_ops/test_acos.py new file mode 100644 index 0000000000..36e9ffa893 --- /dev/null +++ b/test/test_network_ops/test_acos.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE + +class TestAcos(TestCase): + def cpu_op_exec(self, input_para): + output = torch.acos(input_para) + output = output.numpy() + return output + + def npu_op_exec(self, input_para): + output = torch.acos(input_para) + output = output.to("cpu") + output = output.numpy() + return output + + def test_acos_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, 1]], + [[np.float32, -1, (64, 10)]], + [[np.float32, -1, (32, 1, 3)]] + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], -1, 1) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec(npu_input) + self.assertRtolEqual(cpu_output, npu_output) + + def test_acos_float16_shape_format(self, device): + def cpu_op_exec_fp16(input_para): + input_para = input_para.to(torch.float32) + output = torch.acos(input_para) + output = output.numpy() + output = output.astype(np.float16) + return output + + shape_format = [ + [[np.float16, -1, 1]], + [[np.float16, -1, (64, 10)]], + [[np.float16, -1, (31, 1, 3)]] + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], -1, 1) + cpu_output = 
cpu_op_exec_fp16(cpu_input) + npu_output = self.npu_op_exec(npu_input) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestAcos, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() + diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py new file mode 100644 index 0000000000..56ecbe229c --- /dev/null +++ b/test/test_network_ops/test_adaptive_avg_pool1d.py @@ -0,0 +1,64 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import torch_npu +import torch.nn as nn +import numpy as np +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE + +class TestAdaptiveAvgPool1d(TestCase): + def cpu_op_exec(self, input, output_size): + m = nn.AdaptiveAvgPool1d(output_size) + output= m(input) + return output.numpy() + + def npu_op_exec(self, input, output_size): + m = nn.AdaptiveAvgPool1d(output_size).npu() + output = m(input) + return output.cpu().numpy() + + def test_AdaptiveAvgPool1d_shape_format_fp16(self, device): + shape_format = [ + [np.float16, 0, (64, 10, 16)], + [np.float16, -1, (256, 2048, 8)], + [np.float16, 3, (32, 16, 16)] + ] + output_list = [(4), (3)] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 1, 10) + for output_size in output_list: + cpu_output = self.cpu_op_exec(cpu_input, output_size) + npu_output = self.npu_op_exec(npu_input, output_size) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) + + def test_AdaptiveAvgPool1d_shape_format_fp32(self, device): + shape_format = [ + [np.float32, 0, (64, 10, 16)], + [np.float32, -1, (256, 2048, 8)], + [np.float32, 3, (32, 16, 16)] + ] + output_list = [(4), (3), (1)] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item, 1, 10) + for output_size in output_list: + cpu_output = self.cpu_op_exec(cpu_input, output_size) + npu_output = self.npu_op_exec(npu_input, output_size) + self.assertRtolEqual(cpu_output, npu_output, 0.001) + +instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py new file mode 100644 index 0000000000..8bdefbf7fc --- 
/dev/null +++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py @@ -0,0 +1,71 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch_npu +import numpy as np +from torch.nn import functional as F +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE + +class TestAdaptiveAvgPool2dBackward(TestCase): + + def cpu_op_exec(self, input_x, input_grad): + input_x.requires_grad_(True) + m = torch.nn.AdaptiveAvgPool2d(input_grad) + output = m(input_x) + output.backward(output) + out = input_x.grad + return out + + def npu_op_exec(self, input_x, input_grad): + input_x.requires_grad_(True) + m = torch.nn.AdaptiveAvgPool2d(input_grad) + output = m(input_x) + output.backward(output) + out = input_x.grad.cpu() + return out + + def test_adaptiveAvgPool2d_backward_1(self, device): + cpu_input = torch.randn((1, 8, 9), dtype=torch.float32) + npu_input = cpu_input + output_size = np.array((2, 3)) + cpu_output = self.cpu_op_exec(cpu_input, output_size) + npu_output = self.npu_op_exec(npu_input, output_size) + self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) + + def test_adaptiveAvgPool2d_backward_2(self, device): + cpu_input = torch.randn((1, 3, 3, 3), 
dtype=torch.float32) + npu_input = cpu_input + output_size = np.array((2, 2)) + cpu_output = self.cpu_op_exec(cpu_input, output_size) + npu_output = self.npu_op_exec(npu_input, output_size) + + self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) + + def test_adaptiveAvgPool2d_backward_fp16(self, device): + input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16) + cpu_input = torch.from_numpy(input_x) + npu_input = cpu_input + output_size = np.array((5, 5)) + cpu_input = cpu_input.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input, output_size) + npu_output = self.npu_op_exec(npu_input, output_size) + cpu_output = cpu_output.to(torch.float16) + self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) + +instantiate_device_type_tests(TestAdaptiveAvgPool2dBackward, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 412fb9da7c..a9c96fdc0b 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -1,6 +1,13 @@ backend: NPU cpp_namespace: at_npu::native supported: + - abs + - abs_ + - abs.out + - acos + - acos_ + - acos.out + - adaptive_avg_pool1d - add.Tensor - add.Scalar - add_.Tensor diff --git a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp new file mode 100644 index 0000000000..de2d514e40 --- /dev/null +++ b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& abs_out_npu_nocheck(at::Tensor& result, const at::Tensor& self) { + OpCommand cmd; + cmd.Name("Abs") + .Input(self) + .Output(result) + .Run(); + return result; +} + +at::Tensor& NPUNativeFunctions::abs_out(const at::Tensor& self, at::Tensor& result) { + OpPreparation::CheckOut( + {self}, + result, + self); + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){abs_out_npu_nocheck(result, self);}) + .Call(result); +} + +at::Tensor NPUNativeFunctions::abs(const at::Tensor& self) { + OpPipeWithApplyOut pipe; + return pipe.ApplyOutputSameAs(self) + .Func([&self](at::Tensor& result) {abs_out_npu_nocheck(result, self);}) + .Call(); +} + +at::Tensor& NPUNativeFunctions::abs_(at::Tensor& self) { + abs_out_npu(self, self); + return self; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp new file mode 100644 index 0000000000..09acb183d6 --- /dev/null +++ b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp @@ -0,0 +1,56 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& acos_out_npu_nocheck(const at::Tensor& self, at::Tensor& result) { + OpCommand cmd; + cmd.Name("Acos") + .Input(self) + .Output(result) + .Run(); + + return result; +} + +at::Tensor& NPUNativeFunctions::acos_out(const at::Tensor& self, at::Tensor& result) { + OpPipeWithDefinedOut pipe; + OpPreparation::CheckOut( + {self}, + result, + self); + return pipe.CheckMemory({self}, {result}) + .Func([&self](at::Tensor& result){acos_out_npu_nocheck(self, result);}) + .Call(result); +} + +at::Tensor NPUNativeFunctions::acos(const at::Tensor& self) { + OpPipeWithApplyOut pipe; + return pipe.ApplyOutputSameAs(self) + .Func([&self](at::Tensor& result) {acos_out_npu_nocheck(self, result);}) + .Call(); +} + +at::Tensor& NPUNativeFunctions::acos_(at::Tensor& self) { + acos_out_npu(self, self); + return self; +} + +} // namespace native +} // namespace at \ No newline at end of file diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp new file mode 100644 index 0000000000..d30939a8b4 --- /dev/null +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp @@ -0,0 +1,42 @@ +// Copyright (c) 2020, Huawei Technologies.All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +static void check1d( + const char* function_name, + const char* argument_name, + IntArrayRef x) { + TORCH_CHECK( + x.size() == 1, + function_name, "() argument '", argument_name, + "' should contain one int (got ", x.size(), ")"); +} + +at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, IntArrayRef output_size) { + checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3); + check1d("adaptive_avg_pool1d", "output_size", output_size); +// construct the output tensor of the NPU + auto output = NPUNativeFunctions::adaptive_avg_pool2d( + self.unsqueeze(2), + {1, output_size[0]}); + + return output.squeeze(2); +} + +} // namespace native +} // namespace at \ No newline at end of file -- Gitee From bc3b850b19ba5196065b2cb5bff5486e539fb535 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Tue, 25 Jan 2022 17:31:54 +0800 Subject: [PATCH 02/11] abs, acos, adaptive_avg_pool1d, fix bug --- torch_npu/csrc/aten/ops/AbsKernelNpu.cpp | 2 +- torch_npu/csrc/aten/ops/AcosKernelNpu.cpp | 4 ++-- .../csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp index de2d514e40..65bf701255 100644 --- a/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AbsKernelNpu.cpp @@ -47,7 +47,7 @@ at::Tensor NPUNativeFunctions::abs(const 
at::Tensor& self) { } at::Tensor& NPUNativeFunctions::abs_(at::Tensor& self) { - abs_out_npu(self, self); + abs_out(self, self); return self; } diff --git a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp index 09acb183d6..731ac93999 100644 --- a/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AcosKernelNpu.cpp @@ -48,9 +48,9 @@ at::Tensor NPUNativeFunctions::acos(const at::Tensor& self) { } at::Tensor& NPUNativeFunctions::acos_(at::Tensor& self) { - acos_out_npu(self, self); + acos_out(self, self); return self; } } // namespace native -} // namespace at \ No newline at end of file +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp index d30939a8b4..64a2e08fb4 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp @@ -39,4 +39,4 @@ at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, IntAr } } // namespace native -} // namespace at \ No newline at end of file +} // namespace at_npu \ No newline at end of file -- Gitee From 0cd1567f240caca872f9990f23088212c21b98ae Mon Sep 17 00:00:00 2001 From: wangxiao Date: Tue, 25 Jan 2022 17:35:21 +0800 Subject: [PATCH 03/11] adaptive_avg_pool1d, fix bug --- .../csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp index 64a2e08fb4..19090b9af8 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp @@ -20,14 +20,14 @@ namespace native { static void check1d( const char* function_name, const char* argument_name, - IntArrayRef x) { + 
at::IntArrayRef x) { TORCH_CHECK( x.size() == 1, function_name, "() argument '", argument_name, "' should contain one int (got ", x.size(), ")"); } -at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, IntArrayRef output_size) { +at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, at::IntArrayRef output_size) { checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3); check1d("adaptive_avg_pool1d", "output_size", output_size); // construct the output tensor of the NPU -- Gitee From 691f4bf4aad3d9f4b036a8020c86a763b27e59f0 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Tue, 25 Jan 2022 17:38:19 +0800 Subject: [PATCH 04/11] adaptive_avg_pool1d, fix bug --- torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp index 19090b9af8..ae50870d0f 100644 --- a/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/pooling/AdaptiveAvgPool1dKernelNpu.cpp @@ -28,7 +28,7 @@ static void check1d( } at::Tensor NPUNativeFunctions::adaptive_avg_pool1d(const at::Tensor& self, at::IntArrayRef output_size) { - checkDim("adaptive_avg_pool1d", TensorArg(self, "self", 1), 3); + at::checkDim("adaptive_avg_pool1d", at::TensorArg(self, "self", 1), 3); check1d("adaptive_avg_pool1d", "output_size", output_size); // construct the output tensor of the NPU auto output = NPUNativeFunctions::adaptive_avg_pool2d( -- Gitee From 79340b0c1c0d3f282127941f91e1ffa9b5d9126f Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 16:14:48 +0800 Subject: [PATCH 05/11] ut fix --- .../test_adaptive_avg_pool1d.py | 2 +- .../test_adaptive_avg_pool2d_backward.py | 19 ++++++++++++------- torch_npu/csrc/aten/npu_native_functions.yaml | 7 ------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git 
a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py index 56ecbe229c..dadff381e2 100644 --- a/test/test_network_ops/test_adaptive_avg_pool1d.py +++ b/test/test_network_ops/test_adaptive_avg_pool1d.py @@ -41,7 +41,7 @@ class TestAdaptiveAvgPool1d(TestCase): for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 1, 10) for output_size in output_list: - cpu_output = self.cpu_op_exec(cpu_input, output_size) + cpu_output = self.cpu_op_exec(cpu_input.float(), output_size).half() npu_output = self.npu_op_exec(npu_input, output_size) self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py index 8bdefbf7fc..095fdd2188 100644 --- a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py +++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py @@ -25,9 +25,12 @@ class TestAdaptiveAvgPool2dBackward(TestCase): def cpu_op_exec(self, input_x, input_grad): input_x.requires_grad_(True) m = torch.nn.AdaptiveAvgPool2d(input_grad) - output = m(input_x) + if input_x.dtype == torch.half: + output = m(input_x.float()).half() + else: + output = m(input_x) output.backward(output) - out = input_x.grad + out = output.detach(), input_x.grad return out def npu_op_exec(self, input_x, input_grad): @@ -35,7 +38,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase): m = torch.nn.AdaptiveAvgPool2d(input_grad) output = m(input_x) output.backward(output) - out = input_x.grad.cpu() + out = output.detach().cpu(), input_x.grad.cpu() return out def test_adaptiveAvgPool2d_backward_1(self, device): @@ -44,7 +47,8 @@ class TestAdaptiveAvgPool2dBackward(TestCase): output_size = np.array((2, 3)) cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) + 
self.assertRtolEqual(cpu_output[0], npu_output[0]) + self.assertRtolEqual(cpu_output[1], npu_output[1]) def test_adaptiveAvgPool2d_backward_2(self, device): cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32) @@ -52,8 +56,8 @@ class TestAdaptiveAvgPool2dBackward(TestCase): output_size = np.array((2, 2)) cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - - self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) + self.assertRtolEqual(cpu_output[0], npu_output[0]) + self.assertRtolEqual(cpu_output[1], npu_output[1]) def test_adaptiveAvgPool2d_backward_fp16(self, device): input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16) @@ -64,7 +68,8 @@ class TestAdaptiveAvgPool2dBackward(TestCase): cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) cpu_output = cpu_output.to(torch.float16) - self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) + self.assertRtolEqual(cpu_output[0], npu_output[0]) + self.assertRtolEqual(cpu_output[1], npu_output[1]) instantiate_device_type_tests(TestAdaptiveAvgPool2dBackward, globals(), except_for="cpu") if __name__ == "__main__": diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index a9c96fdc0b..412fb9da7c 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -1,13 +1,6 @@ backend: NPU cpp_namespace: at_npu::native supported: - - abs - - abs_ - - abs.out - - acos - - acos_ - - acos.out - - adaptive_avg_pool1d - add.Tensor - add.Scalar - add_.Tensor -- Gitee From 5a4d42268aa8de81932e046db5810bf189ed84b9 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 16:42:04 +0800 Subject: [PATCH 06/11] ut fix --- test/test_network_ops/test_adaptive_avg_pool1d.py | 4 ++-- .../test_network_ops/test_adaptive_avg_pool2d_backward.py | 8 +++----- 2 files changed, 5 insertions(+), 7 
deletions(-) diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py index dadff381e2..5775934385 100644 --- a/test/test_network_ops/test_adaptive_avg_pool1d.py +++ b/test/test_network_ops/test_adaptive_avg_pool1d.py @@ -24,12 +24,12 @@ class TestAdaptiveAvgPool1d(TestCase): def cpu_op_exec(self, input, output_size): m = nn.AdaptiveAvgPool1d(output_size) output= m(input) - return output.numpy() + return output def npu_op_exec(self, input, output_size): m = nn.AdaptiveAvgPool1d(output_size).npu() output = m(input) - return output.cpu().numpy() + return output.cpu() def test_AdaptiveAvgPool1d_shape_format_fp16(self, device): shape_format = [ diff --git a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py index 095fdd2188..5f60db63ea 100644 --- a/test/test_network_ops/test_adaptive_avg_pool2d_backward.py +++ b/test/test_network_ops/test_adaptive_avg_pool2d_backward.py @@ -43,7 +43,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase): def test_adaptiveAvgPool2d_backward_1(self, device): cpu_input = torch.randn((1, 8, 9), dtype=torch.float32) - npu_input = cpu_input + npu_input = cpu_input.npu() output_size = np.array((2, 3)) cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) @@ -52,7 +52,7 @@ class TestAdaptiveAvgPool2dBackward(TestCase): def test_adaptiveAvgPool2d_backward_2(self, device): cpu_input = torch.randn((1, 3, 3, 3), dtype=torch.float32) - npu_input = cpu_input + npu_input = cpu_input.npu() output_size = np.array((2, 2)) cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) @@ -62,12 +62,10 @@ class TestAdaptiveAvgPool2dBackward(TestCase): def test_adaptiveAvgPool2d_backward_fp16(self, device): input_x = np.random.uniform(0, 1, (1, 3, 6, 6)).astype(np.float16) cpu_input = torch.from_numpy(input_x) - npu_input = cpu_input + 
npu_input = cpu_input.npu() output_size = np.array((5, 5)) - cpu_input = cpu_input.to(torch.float32) cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - cpu_output = cpu_output.to(torch.float16) self.assertRtolEqual(cpu_output[0], npu_output[0]) self.assertRtolEqual(cpu_output[1], npu_output[1]) -- Gitee From 935347f46a94a881cbf0c0235f4fd396ba5108d9 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 20:51:02 +0800 Subject: [PATCH 07/11] addbmm, bmm --- test/test_network_ops/test_addbmm.py | 128 ++++++++++++++++++++ test/test_network_ops/test_bmm.py | 73 +++++++++++ torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 70 +++++++++++ torch_npu/csrc/aten/ops/BmmKernelNpu.cpp | 88 ++++++++++++++ 4 files changed, 359 insertions(+) create mode 100644 test/test_network_ops/test_addbmm.py create mode 100644 test/test_network_ops/test_bmm.py create mode 100644 torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp create mode 100644 torch_npu/csrc/aten/ops/BmmKernelNpu.cpp diff --git a/test/test_network_ops/test_addbmm.py b/test/test_network_ops/test_addbmm.py new file mode 100644 index 0000000000..c9fe853f90 --- /dev/null +++ b/test/test_network_ops/test_addbmm.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestAddbmm(TestCase): + def generate_scalar(self, dtype, min_d, max_d): + if dtype == "float32": + scalar = np.random.uniform(min_d, max_d) + if dtype == "int32": + scalar = np.random.randint(min_d, max_d) + return scalar + + def cpu_op_exec(self, input1, input2, input3, scalar1, scalar2): + output = torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2) + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2, input3, scalar1, scalar2): + input1 = input1.to("npu") + input2 = input2.to("npu") + input3 = input3.to("npu") + output = torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_out(self, input1, input2, input3, scalar1, scalar2, input4): + input1 = input1.to("npu") + input2 = input2.to("npu") + input3 = input3.to("npu") + output = input4.to("npu") + torch.addbmm(input1, input2, input3, beta=scalar1, alpha=scalar2, out=output) + output = output.to("cpu") + output = output.numpy() + return output + + def npu_op_exec_inplace(self, input1, input2, input3, scalar1, scalar2): + input1 = input1.to("npu") + input2 = input2.to("npu") + input3 = input3.to("npu") + input1.addbmm_(input2, input3, beta=scalar1, alpha=scalar2) + output = input1.to("cpu") + output = output.numpy() + return output + + + def cpu_op_transpose_exec(self, input1, input2, input3, scalar1, scalar2): + input3_t = np.transpose(input3,(0,2,1)) + output = torch.addbmm(input1, input2, input3_t, beta=scalar1, alpha=scalar2) + output = output.numpy() + return output + + def npu_op_transpose_exec(self, input1, input2, input3, scalar1, scalar2): + input1 = input1.to("npu") + input2 = input2.to("npu") + 
input3 = input3.to("npu") + input3_t = np.transpose(input3,(0,2,1)) + output = torch.addbmm(input1, input2, input3_t, beta=scalar1, alpha=scalar2) + output = output.to("cpu") + output = output.numpy() + return output + + def test_addbmm(self, device): + shape_format = [ + [[np.float32, 0, [3, 5]], [np.float32, 0, [10, 3, 4]], [np.float32, 0, [10, 4, 5]], "float32"], + [[np.int32, 0, [3, 5]], [np.int32, 0, [10, 3, 4]], [np.int32, 0, [10, 4, 5]], "int32"] + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input3, npu_input3 = create_common_tensor(item[2], 0, 100) + cpu_input4, npu_input4 = create_common_tensor(item[0], 0, 100) + + scalar1 = self.generate_scalar(item[3], 0, 10) + scalar2 = self.generate_scalar(item[3], 0, 10) + + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2) + npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3, scalar1, scalar2) + + npu_output1 = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3, scalar1, scalar2, npu_input4) + npu_output2 = self.npu_op_exec_inplace(npu_input1, npu_input2, npu_input3, scalar1, scalar2) + + self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output1) + self.assertRtolEqual(cpu_output, npu_output2) + + def test_addbmm_transpose(self, device): + shape_format = [ + [[np.float32, 0, [4, 5]], [np.float32, 0, [10, 4, 7]], [np.float32, 0, [10, 5, 7]], "float32"], + [[np.int32, 0, [4, 5]], [np.int32, 0, [10, 4, 7]], [np.int32, 0, [10, 5, 7]], "int32"] + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100) + cpu_input3, npu_input3 = create_common_tensor(item[2], 0, 100) + + scalar1 = self.generate_scalar(item[3], 0, 10) + scalar2 = self.generate_scalar(item[3], 0, 10) + + cpu_transpose_output = 
self.cpu_op_transpose_exec(cpu_input1, cpu_input2, cpu_input3, scalar1, scalar2) + npu_transpose_output = self.npu_op_transpose_exec(npu_input1, npu_input2, npu_input3, scalar1, scalar2) + + self.assertRtolEqual(cpu_transpose_output, npu_transpose_output) + + +instantiate_device_type_tests(TestAddbmm, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_network_ops/test_bmm.py b/test/test_network_ops/test_bmm.py new file mode 100644 index 0000000000..5c5147b975 --- /dev/null +++ b/test/test_network_ops/test_bmm.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import torch_npu +import numpy as np + +from torch_npu.testing.common_utils import TestCase, run_tests +from torch_npu.testing.common_device_type import instantiate_device_type_tests +from torch_npu.testing.util_test import create_common_tensor + +class TestBatchMatMul(TestCase): + def cpu_op_exec(self, input1, input2): + output = torch.bmm(input1, input2) + output = output.numpy() + return output + + def npu_op_exec(self, input1, input2): + output = torch.bmm(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output + + def bmm_auto_list_exec(self, shape): + for item in shape: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_batchmatmul_shape_format_fp16_3d(self, device): + format_list = [0, 3, 29] + shape_list = [(1, 3, 2)] + shape_format1 = [[np.float16, i, j] + for i in format_list for j in shape_list] + format_list = [0, 3, 29] + shape_list = [(1, 2, 3)] + shape_format2 = [[np.float16, i, j] + for i in format_list for j in shape_list] + shape_format = [[i, j] for i in shape_format1 for j in shape_format2] + self.bmm_auto_list_exec(shape_format) + + def test_batchmatmul_shape_format_fp32_3d(self, device): + format_list = [0, 3, 29] + shape_list = [(1, 3, 2)] + shape_format1 = [[np.float32, i, j] + for i in format_list for j in shape_list] + format_list = [0, 3, 29] + shape_list = [(1, 2, 3)] + shape_format2 = [[np.float32, i, j] + for i in format_list for j in shape_list] + shape_format = [[i, j] for i in shape_format1 for j in shape_format2] + 
self.bmm_auto_list_exec(shape_format) + +instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp new file mode 100644 index 0000000000..484467bcdf --- /dev/null +++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp @@ -0,0 +1,70 @@ +// Copyright (c) 2020, Huawei Technologies.All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::addbmm_out( + const at::Tensor& self, + const at::Tensor& batch1, + const at::Tensor& batch2, + at::Scalar beta, + at::Scalar alpha, + at::Tensor& result) { + at::Tensor MulResult = at::mul(batch1, alpha); + at::Tensor bmmResult = at::bmm(MulResult,batch2); + int64_t dim[2] = {batch1.size(1), batch2.size(2)}; + at::Tensor sumResult = at::sum_to(bmmResult, dim); + // sumResult + self*beta + at::add_out(result, sumResult, self, beta); + return result; +} + +at::Tensor NPUNativeFunctions::addbmm( + const at::Tensor& self, + const at::Tensor& batch1, + const at::Tensor& batch2, + at::Scalar beta, + at::Scalar alpha) { + // calculate the output size + auto outputSize = addbmm_npu_output_size(self, batch1, batch2, beta, alpha); + // construct the output tensor of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); + // calculate the output result of the NPU + addbmm_out(self, batch1, batch2, beta, alpha, result); + return result; +} + +Tensor& NPUNativeFunctions::addbmm_( + at::Tensor& self, + const at::Tensor& batch1, + const at::Tensor& batch2, + at::Scalar beta, + at::Scalar alpha) { + OpPreparation::CheckMemory({self, batch1, batch2}, {self}); + if (!NpuUtils::check_match(&self)) { + at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); + at::Tensor result = addbmm_out_npu(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + } else { + addbmm_out(self, batch1, batch2, beta, alpha, self); + } + return self; +} + +} // namespace native +} // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp new file mode 100644 index 0000000000..c93fbe3cc8 --- /dev/null +++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp @@ -0,0 +1,88 @@ 
+// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "c10/npu/OptionsManager.h" + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" + +namespace at_npu { +namespace native { + +at::Tensor& NPUNativeFunctions::bmm_out(const at::Tensor& self, const at::Tensor& mat2, at::Tensor& result) { + at::Tensor contiguousResult = result.is_contiguous() ? 
result : result.contiguous(); + + at::Tensor contiguousSelf = self; + at::Tensor contiguousMat2 = mat2; + bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self); + bool isMat2T = CalcuOpUtil::is_transpose_last_two_dims(mat2); + + if(!isSelfT){ + contiguousSelf = NpuUtils::format_contiguous_add_copy_optimize(self); + } + if(!isMat2T){ + contiguousMat2 = NpuUtils::format_contiguous_add_copy_optimize(mat2); + } + + auto func1 = [&contiguousSelf]() { + bool pass = false; + return std::tie(pass, contiguousSelf); + }; + auto func2 = [&contiguousMat2]() { + bool pass = false; + return std::tie(pass, contiguousMat2); + }; + + // executing the NPU operator + OpCommand cmd; + cmd.Name("BatchMatMul") + .InputWithFunc(func1) + .InputWithFunc(func2) + .Output(contiguousResult) + .Attr("adj_x1", isSelfT) + .Attr("adj_x2", isMat2T) + .Run(); + + if (!result.is_contiguous()) { + result.copy_(contiguousResult); + } + return result; +} + +at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat2) { + // calculate the output size + auto outputSize = {self.size(0), self.size(1), mat2.size(2)}; + + // construct the output tensor of the NPU + at::Tensor result; + + // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去 + if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) && + !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) { + result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); + } else { + result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); + } + + // calculate the output result of the NPU + NPUNativeFunctions::bmm_out(self, mat2, result); + + return result; +} + +} // namespace native +} // namespace at_npu -- Gitee From a683cd88207dec00c2f2f58f42762b80ccbb5c16 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 20:57:55 +0800 Subject: [PATCH 08/11] addbmm, bmm --- torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 2 +- 
torch_npu/csrc/aten/ops/BmmKernelNpu.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp index 484467bcdf..f5e57820c4 100644 --- a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp @@ -43,7 +43,7 @@ at::Tensor NPUNativeFunctions::addbmm( // calculate the output size auto outputSize = addbmm_npu_output_size(self, batch1, batch2, beta, alpha); // construct the output tensor of the NPU - Tensor result = OpPreparation::ApplyTensor(self, outputSize); + at::Tensor result = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU addbmm_out(self, batch1, batch2, beta, alpha, result); return result; diff --git a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp index c93fbe3cc8..38aceb87fd 100644 --- a/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/BmmKernelNpu.cpp @@ -73,9 +73,9 @@ at::Tensor NPUNativeFunctions::bmm(const at::Tensor& self, const at::Tensor& mat // 检查是否指定mm输出为NCHW。待NLP模型总体策略制定后删去 if ((self.scalar_type() == at::ScalarType::Float || self.scalar_type() == at::ScalarType::Half) && !c10::npu::OptionsManager::CheckSwitchMMOutputEnable()) { - result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); + result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = NPUNativeFunctions::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); + result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); } // calculate the output result of the NPU -- Gitee From 131dc05d11685bef358b9d121d8804ed3591ded1 Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 21:01:10 +0800 Subject: [PATCH 09/11] addbmm, bmm --- torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp index f5e57820c4..98f82b8428 100644 --- a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp @@ -49,7 +49,7 @@ at::Tensor NPUNativeFunctions::addbmm( return result; } -Tensor& NPUNativeFunctions::addbmm_( +at::Tensor& NPUNativeFunctions::addbmm_( at::Tensor& self, const at::Tensor& batch1, const at::Tensor& batch2, -- Gitee From 8c1cec8a1d7916f75e790f3f0f146ba4a5351dab Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 21:03:02 +0800 Subject: [PATCH 10/11] addbmm, bmm --- torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp index 98f82b8428..dff97430b1 100644 --- a/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/AddbmmKernelNpu.cpp @@ -58,7 +58,7 @@ at::Tensor& NPUNativeFunctions::addbmm_( OpPreparation::CheckMemory({self, batch1, batch2}, {self}); if (!NpuUtils::check_match(&self)) { at::Tensor contiguousSelf = NpuUtils::format_contiguous(self); - at::Tensor result = addbmm_out_npu(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf); + at::Tensor result = addbmm_out(contiguousSelf, batch1, batch2, beta, alpha, contiguousSelf); NpuUtils::format_fresh_view(self, result); } else { addbmm_out(self, batch1, batch2, beta, alpha, self); -- Gitee From aa9f05b17f19c16cd4a5cc387535482fdd43be0b Mon Sep 17 00:00:00 2001 From: wangxiao Date: Wed, 26 Jan 2022 21:19:44 +0800 Subject: [PATCH 11/11] clean code --- test/test_network_ops/test_abs.py | 8 +- .../test_adaptive_avg_pool1d.py | 8 +- test/test_network_ops/test_bmm.py | 86 +++++++++---------- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/test/test_network_ops/test_abs.py b/test/test_network_ops/test_abs.py index 5e10d1d907..cf44ee4bc0 100644 --- a/test/test_network_ops/test_abs.py 
+++ b/test/test_network_ops/test_abs.py @@ -20,13 +20,13 @@ from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE class TestAbs(TestCase): - def cpu_op_exec(self, input): - output = torch.abs(input) + def cpu_op_exec(self, input1): + output = torch.abs(input1) output = output.numpy() return output - def npu_op_exec(self, input): - output = torch.abs(input) + def npu_op_exec(self, input1): + output = torch.abs(input1) output = output.to("cpu") output = output.numpy() return output diff --git a/test/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_network_ops/test_adaptive_avg_pool1d.py index 5775934385..d39d76da23 100644 --- a/test/test_network_ops/test_adaptive_avg_pool1d.py +++ b/test/test_network_ops/test_adaptive_avg_pool1d.py @@ -21,14 +21,14 @@ from torch_npu.testing.common_device_type import Dtypes, instantiate_device_type from torch_npu.testing.util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE class TestAdaptiveAvgPool1d(TestCase): - def cpu_op_exec(self, input, output_size): + def cpu_op_exec(self, input1, output_size): m = nn.AdaptiveAvgPool1d(output_size) - output= m(input) + output = m(input1) return output - def npu_op_exec(self, input, output_size): + def npu_op_exec(self, input1, output_size): m = nn.AdaptiveAvgPool1d(output_size).npu() - output = m(input) + output = m(input1) return output.cpu() def test_AdaptiveAvgPool1d_shape_format_fp16(self, device): diff --git a/test/test_network_ops/test_bmm.py b/test/test_network_ops/test_bmm.py index 5c5147b975..de204834bb 100644 --- a/test/test_network_ops/test_bmm.py +++ b/test/test_network_ops/test_bmm.py @@ -20,53 +20,53 @@ from torch_npu.testing.common_device_type import instantiate_device_type_tests from torch_npu.testing.util_test import create_common_tensor class TestBatchMatMul(TestCase): - def 
cpu_op_exec(self, input1, input2): - output = torch.bmm(input1, input2) - output = output.numpy() - return output + def cpu_op_exec(self, input1, input2): + output = torch.bmm(input1, input2) + output = output.numpy() + return output - def npu_op_exec(self, input1, input2): - output = torch.bmm(input1, input2) - output = output.to("cpu") - output = output.numpy() - return output + def npu_op_exec(self, input1, input2): + output = torch.bmm(input1, input2) + output = output.to("cpu") + output = output.numpy() + return output - def bmm_auto_list_exec(self, shape): - for item in shape: - cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10) - cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10) - if cpu_input1.dtype == torch.float16: - cpu_input1 = cpu_input1.to(torch.float32) - if cpu_input2.dtype == torch.float16: - cpu_input2 = cpu_input2.to(torch.float32) - cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - cpu_output = cpu_output.astype(npu_output.dtype) - self.assertRtolEqual(cpu_output, npu_output) + def bmm_auto_list_exec(self, shape): + for item in shape: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10) + cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10) + if cpu_input1.dtype == torch.float16: + cpu_input1 = cpu_input1.to(torch.float32) + if cpu_input2.dtype == torch.float16: + cpu_input2 = cpu_input2.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2) + npu_output = self.npu_op_exec(npu_input1, npu_input2) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) - def test_batchmatmul_shape_format_fp16_3d(self, device): - format_list = [0, 3, 29] - shape_list = [(1, 3, 2)] - shape_format1 = [[np.float16, i, j] - for i in format_list for j in shape_list] - format_list = [0, 3, 29] - shape_list = [(1, 2, 3)] - shape_format2 = [[np.float16, i, j] - for i in format_list for j in shape_list] - 
shape_format = [[i, j] for i in shape_format1 for j in shape_format2] - self.bmm_auto_list_exec(shape_format) + def test_batchmatmul_shape_format_fp16_3d(self, device): + format_list = [0, 3, 29] + shape_list = [(1, 3, 2)] + shape_format1 = [[np.float16, i, j] + for i in format_list for j in shape_list] + format_list = [0, 3, 29] + shape_list = [(1, 2, 3)] + shape_format2 = [[np.float16, i, j] + for i in format_list for j in shape_list] + shape_format = [[i, j] for i in shape_format1 for j in shape_format2] + self.bmm_auto_list_exec(shape_format) - def test_batchmatmul_shape_format_fp32_3d(self, device): - format_list = [0, 3, 29] - shape_list = [(1, 3, 2)] - shape_format1 = [[np.float32, i, j] - for i in format_list for j in shape_list] - format_list = [0, 3, 29] - shape_list = [(1, 2, 3)] - shape_format2 = [[np.float32, i, j] - for i in format_list for j in shape_list] - shape_format = [[i, j] for i in shape_format1 for j in shape_format2] - self.bmm_auto_list_exec(shape_format) + def test_batchmatmul_shape_format_fp32_3d(self, device): + format_list = [0, 3, 29] + shape_list = [(1, 3, 2)] + shape_format1 = [[np.float32, i, j] + for i in format_list for j in shape_list] + format_list = [0, 3, 29] + shape_list = [(1, 2, 3)] + shape_format2 = [[np.float32, i, j] + for i in format_list for j in shape_list] + shape_format = [[i, j] for i in shape_format1 for j in shape_format2] + self.bmm_auto_list_exec(shape_format) instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu") if __name__ == "__main__": -- Gitee