From 9d66efbaacf01d4a3c9061e4e7f602f984f4f8f7 Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Fri, 11 Feb 2022 18:05:36 +0800
Subject: [PATCH 1/4] Port the npu_linear operator to 1.8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_network_ops/test_npu_linear.py      | 65 +++++++++++++++
 .../test_npu_linear_backward.py               | 81 +++++++++++++++++++
 torch_npu/csrc/aten/npu_native_functions.yaml |  2 +
 .../csrc/aten/ops/LinearBackwardKernelNpu.cpp | 69 ++++++++++++++++
 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp   | 48 +++++++++++
 5 files changed, 265 insertions(+)
 create mode 100644 test/test_network_ops/test_npu_linear.py
 create mode 100644 test/test_network_ops/test_npu_linear_backward.py
 create mode 100644 torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
 create mode 100644 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp

diff --git a/test/test_network_ops/test_npu_linear.py b/test/test_network_ops/test_npu_linear.py
new file mode 100644
index 0000000000..72a9f82618
--- /dev/null
+++ b/test/test_network_ops/test_npu_linear.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+
+class TestNpuLinear(TestCase):
+    def cpu_op_exec(self, x, weight, bias):
+        output = torch.nn.functional.linear(x, weight, bias)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, x, weight, bias):
+        output = torch_npu.npu_linear(x, weight, bias)
+        output = output.cpu().numpy()
+        return output
+
+    def test_npu_linear_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
+            [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
+
+    def test_npu_linear_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
+            [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float()).astype(np.float16)
+            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
new file mode 100644
index 0000000000..468f10c72b
--- /dev/null
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+
+class TestNpuLinearBackward(TestCase):
+    def cpu_op_exec(self, x, weight, bias):
+        x.requires_grad = True
+        weight.requires_grad = True
+        bias.requires_grad = True
+        output = torch.nn.functional.linear(x, weight, bias)
+        loss = output.sum()
+        loss.backward()
+        return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()
+
+    def npu_op_exec(self, x, weight, bias):
+        x.requires_grad = True
+        weight.requires_grad = True
+        bias.requires_grad = True
+        output = torch_npu.npu_linear(x, weight, bias)
+        loss = output.sum()
+        loss.backward()
+        return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()
+
+    def test_npu_linear_backward_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
+            [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
+            self.assertRtolEqual(cpu_x_grad, npu_x_grad)
+            self.assertRtolEqual(cpu_w_grad, npu_w_grad)
+            self.assertRtolEqual(cpu_b_grad, npu_b_grad)
+
+    def test_npu_linear_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
+            [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(
+                cpu_x.float(), cpu_w.float(), cpu_b.float())
+            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output.astype(np.float16), npu_output)
+            self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad)
+            self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad)
+            self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad)
+
+
+instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index f5c9f3adca..5a07f4650b 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -1913,6 +1913,8 @@ custom:
   - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, int begin_mask=0, int end_mask=0, int ellipsis_mask=0, int new_axis_mask=0, int shrink_axis_mask=0, *, Tensor(a!) out) -> Tensor(a!)
   - func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
     variants: function, method
+  - func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+  - func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor)
 custom_autograd:
   - func: npu_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
   - func: npu_convolution_transpose(Tensor input, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
diff --git a/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
new file mode 100644
index 0000000000..2c14e73e39
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor linear_backward_out_npu(
+    at::Tensor& result,
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    bool transpose_x1,
+    bool transpose_x2) {
+  int64_t offset_x = 0;
+  OpCommand cmd;
+  cmd.Name("MatMulV2")
+      .Input(input)
+      .Input(weight)
+      .Output(result)
+      .Attr("transpose_x1", transpose_x1)
+      .Attr("transpose_x2", transpose_x2)
+      .Attr("offset_x", offset_x)
+      .Run();
+  return result;
+}
+
+std::tuple<at::Tensor, at::Tensor> NPUNativeFunctions::npu_linear_backward(
+    const at::Tensor& grad,
+    const at::Tensor& input,
+    const at::Tensor& weight) {
+  c10::SmallVector<int64_t, SIZE> inputGradOutputSize = {
+      grad.size(0),
+      weight.size(1)};
+  c10::SmallVector<int64_t, SIZE> weightGradOutputSize = {
+      grad.size(1),
+      input.size(1)};
+  at::Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize);
+  at::Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize);
+
+  if (CalcuOpUtil::get_tensor_npu_format(grad) == CalcuOpUtil::get_tensor_npu_format(weight)) {
+    linear_backward_out_npu(inputGrad, grad, weight, false, false);
+    linear_backward_out_npu(weightGrad, grad, input, true, false);
+  } else {
+    at::Tensor gradFormatcast = OpPreparation::ApplyTensor(grad, grad.sizes());
+    gradFormatcast = NPUNativeFunctions::npu_format_cast(grad, CalcuOpUtil::get_tensor_npu_format(weight));
+    linear_backward_out_npu(inputGrad, gradFormatcast, weight, false, false);
+    linear_backward_out_npu(weightGrad, gradFormatcast, input, true, false);
+  }
+
+  return std::tie(inputGrad, weightGrad);
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
new file mode 100644
index 0000000000..3cdc4cfd1d
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor NPUNativeFunctions::npu_linear(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& bias) {
+  c10::SmallVector<int64_t, SIZE> outputSize = {input.size(0), weight.size(0)};
+  at::Tensor output = OpPreparation::ApplyTensor(input, outputSize);
+
+  int64_t offset_x = 0;
+  OpCommand cmd;
+  cmd.Name("MatMulV2")
+      .Input(input)
+      .Input(weight);
+  if (bias.defined()) {
+    cmd.Input(bias);
+  }
+  cmd.Output(output)
+      .Attr("transpose_x1", false)
+      .Attr("transpose_x2", true)
+      .Attr("offset_x", offset_x)
+      .Run();
+
+  return output;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
-- 
Gitee
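Patch 1 implements both directions of npu_linear on top of the CANN MatMulV2 operator. The forward kernel sets transpose_x2 to true because weight is stored as [out_features, in_features], so it computes output = input @ weight.T + bias; the backward kernel reuses the same operator with different transpose flags to produce grad_input = grad @ weight and grad_weight = grad.T @ input, matching inputGradOutputSize and weightGradOutputSize above. A plain-PyTorch sketch of that contract, usable as a CPU oracle alongside the tests (the reference function names below are illustrative, not part of the patch):

    import torch

    def npu_linear_reference(x, weight, bias=None):
        # x: [n, k], weight: [m, k] -> output: [n, m]
        # Mirrors MatMulV2 with transpose_x1=false, transpose_x2=true.
        output = x.matmul(weight.t())
        return output + bias if bias is not None else output

    def npu_linear_backward_reference(grad, x, weight):
        # grad: [n, m]. Mirrors the two linear_backward_out_npu calls:
        #   grad_input  [n, k] = grad   @ weight  (transpose_x1=false, transpose_x2=false)
        #   grad_weight [m, k] = grad.T @ x       (transpose_x1=true,  transpose_x2=false)
        return grad.matmul(weight), grad.t().matmul(x)
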
From 8d2b392d681f3d947e1f565441c857f392247110 Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Mon, 14 Feb 2022 17:53:07 +0800
Subject: [PATCH 2/4] Modify the npu_linear operator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 torch_npu/csrc/aten/npu_native_functions.yaml | 13 +--
 .../csrc/aten/ops/LinearBackwardKernelNpu.cpp | 69 -------------
 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp   | 97 ++++++++++++++++++-
 3 files changed, 102 insertions(+), 77 deletions(-)
 delete mode 100644 torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp

diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 5a07f4650b..dcadd5d297 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -898,9 +898,6 @@ supported:
   - var_mean.names_dim
   - view_as
   - where.self
-  - where.ScalarSelf
-  - where.ScalarOther
-  - where.Scalar
   - where
   - _s_where
   - norm_except_dim
@@ -1099,8 +1096,6 @@ supported:
   - scatter.value
   - scatter.dimname_src
   - scatter.dimname_value
-  - scatter_.reduce
-  - scatter_.value_reduce
   - scatter_add_
   - scatter_add
   - scatter_add.dimname
@@ -1890,6 +1885,8 @@ custom:
     variants: function, method
   - func: one_(Tensor(a!) self) -> Tensor(a!)
     variants: method, function
+  - func: npu_bert_apply_adam(Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor var, Tensor m, Tensor v)
+  - func: npu_bert_apply_adam.out(Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay, *, Tensor(a!) var, Tensor(b!) m, Tensor(c!) v) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   - func: npu_conv_transpose2d_backward(Tensor input, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   - func: npu_conv_transpose3d_backward(Tensor input, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   - func: npu_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[] stride, int[] padding, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
@@ -1913,8 +1910,12 @@ custom:
   - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, int begin_mask=0, int end_mask=0, int ellipsis_mask=0, int new_axis_mask=0, int shrink_axis_mask=0, *, Tensor(a!) out) -> Tensor(a!)
   - func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
     variants: function, method
-  - func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+  - func: npu_softmax_cross_entropy_with_logits(Tensor self, Tensor labels) -> Tensor
+    variants: function, method
+  - func: npu_softmax_cross_entropy_with_logits_backward(Tensor grad, Tensor self, Tensor labels) -> Tensor
+    variants: function, method
   - func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor)
 custom_autograd:
   - func: npu_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
   - func: npu_convolution_transpose(Tensor input, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
+  - func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
diff --git a/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
deleted file mode 100644
index 2c14e73e39..0000000000
--- a/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "torch_npu/csrc/framework/utils/OpAdapter.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-
-namespace at_npu {
-namespace native {
-
-at::Tensor linear_backward_out_npu(
-    at::Tensor& result,
-    const at::Tensor& input,
-    const at::Tensor& weight,
-    bool transpose_x1,
-    bool transpose_x2) {
-  int64_t offset_x = 0;
-  OpCommand cmd;
-  cmd.Name("MatMulV2")
-      .Input(input)
-      .Input(weight)
-      .Output(result)
-      .Attr("transpose_x1", transpose_x1)
-      .Attr("transpose_x2", transpose_x2)
-      .Attr("offset_x", offset_x)
-      .Run();
-  return result;
-}
-
-std::tuple<at::Tensor, at::Tensor> NPUNativeFunctions::npu_linear_backward(
-    const at::Tensor& grad,
-    const at::Tensor& input,
-    const at::Tensor& weight) {
-  c10::SmallVector<int64_t, SIZE> inputGradOutputSize = {
-      grad.size(0),
-      weight.size(1)};
-  c10::SmallVector<int64_t, SIZE> weightGradOutputSize = {
-      grad.size(1),
-      input.size(1)};
-  at::Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize);
-  at::Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize);
-
-  if (CalcuOpUtil::get_tensor_npu_format(grad) == CalcuOpUtil::get_tensor_npu_format(weight)) {
-    linear_backward_out_npu(inputGrad, grad, weight, false, false);
-    linear_backward_out_npu(weightGrad, grad, input, true, false);
-  } else {
-    at::Tensor gradFormatcast = OpPreparation::ApplyTensor(grad, grad.sizes());
-    gradFormatcast = NPUNativeFunctions::npu_format_cast(grad, CalcuOpUtil::get_tensor_npu_format(weight));
-    linear_backward_out_npu(inputGrad, gradFormatcast, weight, false, false);
-    linear_backward_out_npu(weightGrad, gradFormatcast, input, true, false);
-  }
-
-  return std::tie(inputGrad, weightGrad);
-}
-
-} // namespace native
-} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
index 3cdc4cfd1d..e4b8ff73f4 100644
--- a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
@@ -15,15 +15,21 @@
 // limitations under the License.
#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include namespace at_npu { namespace native { +using torch::autograd::Function; +using torch::autograd::AutogradContext; +using tensor_list = std::vector; -at::Tensor NPUNativeFunctions::npu_linear( +at::Tensor linear_npu( const at::Tensor& input, const at::Tensor& weight, - const at::Tensor& bias) { + const c10::optional & bias_opt) { + const at::Tensor& bias = c10::value_or_else(bias_opt, [] {return at::Tensor();}); c10::SmallVector outputSize = {input.size(0), weight.size(0)}; at::Tensor output = OpPreparation::ApplyTensor(input, outputSize); @@ -44,5 +50,92 @@ at::Tensor NPUNativeFunctions::npu_linear( return output; } +at::Tensor linear_backward_out_npu( + at::Tensor& result, + const at::Tensor& input, + const at::Tensor& weight, + bool transpose_x1, + bool transpose_x2) { + int64_t offset_x = 0; + OpCommand cmd; + cmd.Name("MatMulV2") + .Input(input) + .Input(weight) + .Output(result) + .Attr("transpose_x1", transpose_x1) + .Attr("transpose_x2", transpose_x2) + .Attr("offset_x", offset_x) + .Run(); + return result; +} + +tuple NPUNativeFunctions::npu_linear_backward( + const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& weight) { + c10::SmallVector inputGradOutputSize = { + grad.size(0), + weight.size(1)}; + c10::SmallVector weightGradOutputSize = { + grad.size(1), + input.size(1)}; + at::Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize); + at::Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize); + + if (CalcuOpUtil::get_tensor_npu_format(grad) == CalcuOpUtil::get_tensor_npu_format(weight)) { + linear_backward_out_npu(inputGrad, grad, weight, false, false); + linear_backward_out_npu(weightGrad, grad, input, true, false); + } else { + at::Tensor gradFormatcast = OpPreparation::ApplyTensor(grad, grad.sizes()); + gradFormatcast = NPUNativeFunctions::npu_format_cast(grad, CalcuOpUtil::get_tensor_npu_format(weight)); + linear_backward_out_npu(inputGrad, gradFormatcast, weight, false, false); + linear_backward_out_npu(weightGrad, gradFormatcast, input, true, false); + } + + return std::tie(inputGrad, weightGrad); +} + +class NPULinearFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, + const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt) { + ctx->saved_data["bias_has_value"] = (bias_opt.has_value() == true) ? 
bias_opt.value().requires_grad() : false; + + at::AutoNonVariableTypeMode g; + ctx->save_for_backward({input, weight}); + return linear_npu(input, weight, bias_opt); + } + + static tensor_list backward(AutogradContext *ctx, + tensor_list grad_outputs) { + auto bias_has_value = ctx->saved_data["bias_has_value"].toBool(); + auto saved = ctx->get_saved_variables(); + auto input = saved[0]; + auto weight = saved[1]; + + tuple result = NPUNativeFunctions::npu_linear_backward(grad_outputs[0], input, weight); + + tensor_list output; + if (bias_has_value) { + output = {std::get<0>(result), + std::get<1>(result), + grad_outputs[0]}; + } else { + output = {std::get<0>(result), + std::get<1>(result), + at::Tensor()}; + } + return output; + } +}; + +at::Tensor NPUNativeFunctions::npu_linear(const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt) { + return NPULinearFunction::apply(input, weight, bias_opt); +} + } // namespace native } // namespace at_npu \ No newline at end of file -- Gitee From bf67bb8f1f3b3482fc4f9f3b231e69b77ba0130c Mon Sep 17 00:00:00 2001 From: hxf12345677 Date: Tue, 15 Feb 2022 02:18:28 +0000 Subject: [PATCH 3/4] =?UTF-8?q?npu=5Flinear=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_npu_linear_backward.py | 31 ++++++++++--------- torch_npu/csrc/aten/ops/LinearKernelNpu.cpp | 3 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py index 468f10c72b..86b64ba0c2 100644 --- a/test/test_network_ops/test_npu_linear_backward.py +++ b/test/test_network_ops/test_npu_linear_backward.py @@ -29,7 +29,8 @@ class TestNpuLinearBackward(TestCase): output = torch.nn.functional.linear(x, weight, bias) loss = output.sum() loss.backward() - return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy() + list1 = [output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()] + return list1 def npu_op_exec(self, x, weight, bias): x.requires_grad = True @@ -38,7 +39,8 @@ class TestNpuLinearBackward(TestCase): output = torch_npu.npu_linear(x, weight, bias) loss = output.sum() loss.backward() - return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy() + list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()] + return list2 def test_npu_linear_backward_shape_format_fp32(self, device): shape_format = [ @@ -50,12 +52,12 @@ class TestNpuLinearBackward(TestCase): cpu_x, npu_x = create_common_tensor(item[0], -2, 2) cpu_w, npu_w = create_common_tensor(item[1], -2, 2) cpu_b, npu_b = create_common_tensor(item[2], -2, 2) - cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) - npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b) - self.assertRtolEqual(cpu_output, npu_output, 0.0002) - self.assertRtolEqual(cpu_x_grad, npu_x_grad) - self.assertRtolEqual(cpu_w_grad, npu_w_grad) - self.assertRtolEqual(cpu_b_grad, npu_b_grad) + getlist1 = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) + getlist2 = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(getlist1[0], getlist2[0], 0.0002) + self.assertRtolEqual(getlist1[1], getlist2[1]) + self.assertRtolEqual(getlist1[2], getlist2[2]) + self.assertRtolEqual(getlist1[3], getlist2[3]) def test_npu_linear_shape_format_fp16(self, device): 
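Patch 2 folds the backward kernel into LinearKernelNpu.cpp and moves npu_linear from the custom section to custom_autograd, routing it through a torch::autograd::Function subclass so the hand-written backward participates in autograd. A Python analogue of NPULinearFunction, offered only as a sketch: it substitutes plain matmuls for the NPU kernels, and it reduces the bias gradient with sum(0), the shape a 1-D bias requires, whereas the C++ class hands grad_outputs[0] back unreduced.

    import torch

    class LinearFunctionSketch(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, weight, bias=None):
            # Same bookkeeping as NPULinearFunction::forward: record whether the
            # bias wants a gradient and save input/weight for the backward pass.
            ctx.bias_has_value = bias is not None and bias.requires_grad
            ctx.save_for_backward(x, weight)
            out = x.matmul(weight.t())
            return out + bias if bias is not None else out

        @staticmethod
        def backward(ctx, grad):
            x, weight = ctx.saved_tensors
            grad_input = grad.matmul(weight)    # first result of npu_linear_backward
            grad_weight = grad.t().matmul(x)    # second result of npu_linear_backward
            grad_bias = grad.sum(0) if ctx.bias_has_value else None
            return grad_input, grad_weight, grad_bias
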
From bf67bb8f1f3b3482fc4f9f3b231e69b77ba0130c Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Tue, 15 Feb 2022 02:18:28 +0000
Subject: [PATCH 3/4] Modify npu_linear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_npu_linear_backward.py             | 31 ++++++++++---------
 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp |  3 +-
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
index 468f10c72b..86b64ba0c2 100644
--- a/test/test_network_ops/test_npu_linear_backward.py
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -29,7 +29,8 @@ class TestNpuLinearBackward(TestCase):
         output = torch.nn.functional.linear(x, weight, bias)
         loss = output.sum()
         loss.backward()
-        return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()
+        list1 = [output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()]
+        return list1
 
     def npu_op_exec(self, x, weight, bias):
         x.requires_grad = True
@@ -38,7 +39,8 @@ class TestNpuLinearBackward(TestCase):
         output = torch_npu.npu_linear(x, weight, bias)
         loss = output.sum()
         loss.backward()
-        return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()
+        list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()]
+        return list2
 
     def test_npu_linear_backward_shape_format_fp32(self, device):
         shape_format = [
@@ -50,12 +52,12 @@ class TestNpuLinearBackward(TestCase):
             cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
             cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
             cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
-            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
-            self.assertRtolEqual(cpu_x_grad, npu_x_grad)
-            self.assertRtolEqual(cpu_w_grad, npu_w_grad)
-            self.assertRtolEqual(cpu_b_grad, npu_b_grad)
+            getlist1 = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            getlist2 = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(getlist1[0], getlist2[0], 0.0002)
+            self.assertRtolEqual(getlist1[1], getlist2[1])
+            self.assertRtolEqual(getlist1[2], getlist2[2])
+            self.assertRtolEqual(getlist1[3], getlist2[3])
 
     def test_npu_linear_shape_format_fp16(self, device):
         shape_format = [
@@ -67,13 +69,12 @@ class TestNpuLinearBackward(TestCase):
             cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
             cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
             cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(
-                cpu_x.float(), cpu_w.float(), cpu_b.float())
-            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output.astype(np.float16), npu_output)
-            self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad)
-            self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad)
-            self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad)
+            getlist1 = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float())
+            getlist2 = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(getlist1[0].astype(np.float16), getlist2[0])
+            self.assertRtolEqual(getlist1[1].astype(np.float16), getlist2[1])
+            self.assertRtolEqual(getlist1[2].astype(np.float16), getlist2[2])
+            self.assertRtolEqual(getlist1[3].astype(np.float16), getlist2[3])
 
 
 instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
diff --git a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
index e4b8ff73f4..ffe290ab1e 100644
--- a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
@@ -14,10 +14,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <torch/csrc/autograd/custom_function.h>
+
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include <torch/csrc/autograd/custom_function.h>
 
 namespace at_npu {
 namespace native {
-- 
Gitee

From 69155121b91f673eb2d5e82ac77e29218027f260 Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Tue, 15 Feb 2022 02:23:35 +0000
Subject: [PATCH 4/4] Modify npu_linear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_network_ops/test_npu_linear_backward.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
index 86b64ba0c2..0874c99356 100644
--- a/test/test_network_ops/test_npu_linear_backward.py
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -39,7 +39,8 @@ class TestNpuLinearBackward(TestCase):
         output = torch_npu.npu_linear(x, weight, bias)
         loss = output.sum()
         loss.backward()
-        list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()]
+        list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(),
+                 bias.grad.cpu().numpy()]
         return list2
 
     def test_npu_linear_backward_shape_format_fp32(self, device):
-- 
Gitee
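With the series applied, npu_linear is registered under custom_autograd and both kernels live in LinearKernelNpu.cpp, so a single call drives the MatMulV2 forward and the custom backward. A minimal usage sketch modeled on the tests above, assuming an NPU device is reachable and torch_npu was built with these patches:

    import torch
    import torch_npu

    x = torch.rand(123, 456).npu()
    w = torch.rand(789, 456).npu()
    b = torch.rand(789).npu()
    for t in (x, w, b):
        t.requires_grad = True

    out = torch_npu.npu_linear(x, w, b)  # shape [123, 789], equals x @ w.T + b
    out.sum().backward()                 # gradients flow through npu_linear_backward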