From 9d66efbaacf01d4a3c9061e4e7f602f984f4f8f7 Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Fri, 11 Feb 2022 18:05:36 +0800
Subject: [PATCH 1/4] Port the npu_linear operator to 1.8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_network_ops/test_npu_linear.py      | 65 +++++++++++++++
 .../test_npu_linear_backward.py               | 81 +++++++++++++++++++
 torch_npu/csrc/aten/npu_native_functions.yaml |  2 +
 .../csrc/aten/ops/LinearBackwardKernelNpu.cpp | 69 ++++++++++++++++
 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp   | 48 +++++++++++
 5 files changed, 265 insertions(+)
 create mode 100644 test/test_network_ops/test_npu_linear.py
 create mode 100644 test/test_network_ops/test_npu_linear_backward.py
 create mode 100644 torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
 create mode 100644 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp

diff --git a/test/test_network_ops/test_npu_linear.py b/test/test_network_ops/test_npu_linear.py
new file mode 100644
index 0000000000..72a9f82618
--- /dev/null
+++ b/test/test_network_ops/test_npu_linear.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+
+class TestNpuLinear(TestCase):
+    def cpu_op_exec(self, x, weight, bias):
+        output = torch.nn.functional.linear(x, weight, bias)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, x, weight, bias):
+        output = torch_npu.npu_linear(x, weight, bias)
+        output = output.cpu().numpy()
+        return output
+
+    def test_npu_linear_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
+            [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
+
+    def test_npu_linear_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
+            [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float()).astype(np.float16)
+            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
new file mode 100644
index 0000000000..468f10c72b
--- /dev/null
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+import numpy as np
+
+from torch_npu.testing.common_utils import TestCase, run_tests
+from torch_npu.testing.common_device_type import instantiate_device_type_tests
+from torch_npu.testing.util_test import create_common_tensor
+
+
+class TestNpuLinearBackward(TestCase):
+    def cpu_op_exec(self, x, weight, bias):
+        x.requires_grad = True
+        weight.requires_grad = True
+        bias.requires_grad = True
+        output = torch.nn.functional.linear(x, weight, bias)
+        loss = output.sum()
+        loss.backward()
+        return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()
+
+    def npu_op_exec(self, x, weight, bias):
+        x.requires_grad = True
+        weight.requires_grad = True
+        bias.requires_grad = True
+        output = torch_npu.npu_linear(x, weight, bias)
+        loss = output.sum()
+        loss.backward()
+        return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()
+
+    def test_npu_linear_backward_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
+            [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
+            self.assertRtolEqual(cpu_x_grad, npu_x_grad)
+            self.assertRtolEqual(cpu_w_grad, npu_w_grad)
+            self.assertRtolEqual(cpu_b_grad, npu_b_grad)
+
+    def test_npu_linear_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
+            [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(
+                cpu_x.float(), cpu_w.float(), cpu_b.float())
+            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output.astype(np.float16), npu_output)
+            self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad)
+            self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad)
+            self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad)
+
+
+instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index f5c9f3adca..5a07f4650b 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -1913,6 +1913,8 @@ custom:
   - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, int begin_mask=0, int end_mask=0, int ellipsis_mask=0, int new_axis_mask=0, int shrink_axis_mask=0, *, Tensor(a!) out) -> Tensor(a!)
   - func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
     variants: function, method
+  - func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+  - func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor)
 custom_autograd:
   - func: npu_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
   - func: npu_convolution_transpose(Tensor input, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
diff --git a/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
new file mode 100644
index 0000000000..2c14e73e39
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor linear_backward_out_npu(
+    at::Tensor& result,
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    bool transpose_x1,
+    bool transpose_x2) {
+  int64_t offset_x = 0;
+  OpCommand cmd;
+  cmd.Name("MatMulV2")
+      .Input(input)
+      .Input(weight)
+      .Output(result)
+      .Attr("transpose_x1", transpose_x1)
+      .Attr("transpose_x2", transpose_x2)
+      .Attr("offset_x", offset_x)
+      .Run();
+  return result;
+}
+
+std::tuple<at::Tensor, at::Tensor> NPUNativeFunctions::npu_linear_backward(
+    const at::Tensor& grad,
+    const at::Tensor& input,
+    const at::Tensor& weight) {
+  c10::SmallVector<int64_t, SIZE> inputGradOutputSize = {
+      grad.size(0),
+      weight.size(1)};
+  c10::SmallVector<int64_t, SIZE> weightGradOutputSize = {
+      grad.size(1),
+      input.size(1)};
+  at::Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize);
+  at::Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize);
+
+  if (CalcuOpUtil::get_tensor_npu_format(grad) == CalcuOpUtil::get_tensor_npu_format(weight)) {
+    linear_backward_out_npu(inputGrad, grad, weight, false, false);
+    linear_backward_out_npu(weightGrad, grad, input, true, false);
+  } else {
+    at::Tensor gradFormatcast = OpPreparation::ApplyTensor(grad, grad.sizes());
+    gradFormatcast = NPUNativeFunctions::npu_format_cast(grad, CalcuOpUtil::get_tensor_npu_format(weight));
+    linear_backward_out_npu(inputGrad, gradFormatcast, weight, false, false);
+    linear_backward_out_npu(weightGrad, gradFormatcast, input, true, false);
+  }
+
+  return std::tie(inputGrad, weightGrad);
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
new file mode 100644
index 0000000000..3cdc4cfd1d
--- /dev/null
+++ b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+
+namespace at_npu {
+namespace native {
+
+at::Tensor NPUNativeFunctions::npu_linear(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& bias) {
+  c10::SmallVector<int64_t, SIZE> outputSize = {input.size(0), weight.size(0)};
+  at::Tensor output = OpPreparation::ApplyTensor(input, outputSize);
+
+  int64_t offset_x = 0;
+  OpCommand cmd;
+  cmd.Name("MatMulV2")
+      .Input(input)
+      .Input(weight);
+  if (bias.defined()) {
+    cmd.Input(bias);
+  }
+  cmd.Output(output)
+      .Attr("transpose_x1", false)
+      .Attr("transpose_x2", true)
+      .Attr("offset_x", offset_x)
+      .Run();
+
+  return output;
+}
+
+} // namespace native
+} // namespace at_npu
\ No newline at end of file
-- 
Gitee
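Patch 1 implements both directions of npu_linear on top of the CANN MatMulV2 operator. The forward kernel sets transpose_x2 to true because weight is stored as [out_features, in_features], so it computes output = input @ weight.T + bias; the backward kernel reuses the same operator with different transpose flags to produce grad_input = grad @ weight and grad_weight = grad.T @ input, matching inputGradOutputSize and weightGradOutputSize above. A plain-PyTorch sketch of that contract, usable as a CPU oracle alongside the tests (the reference function names below are illustrative, not part of the patch):

    import torch

    def npu_linear_reference(x, weight, bias=None):
        # x: [n, k], weight: [m, k] -> output: [n, m]
        # Mirrors MatMulV2 with transpose_x1=false, transpose_x2=true.
        output = x.matmul(weight.t())
        return output + bias if bias is not None else output

    def npu_linear_backward_reference(grad, x, weight):
        # grad: [n, m]. Mirrors the two linear_backward_out_npu calls:
        #   grad_input  [n, k] = grad   @ weight  (transpose_x1=false, transpose_x2=false)
        #   grad_weight [m, k] = grad.T @ x       (transpose_x1=true,  transpose_x2=false)
        return grad.matmul(weight), grad.t().matmul(x)
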
From 8d2b392d681f3d947e1f565441c857f392247110 Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Mon, 14 Feb 2022 17:53:07 +0800
Subject: [PATCH 2/4] Modify the npu_linear operator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 torch_npu/csrc/aten/npu_native_functions.yaml | 13 +--
 .../csrc/aten/ops/LinearBackwardKernelNpu.cpp | 69 -------------
 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp   | 97 ++++++++++++++++++-
 3 files changed, 102 insertions(+), 77 deletions(-)
 delete mode 100644 torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp

diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml
index 5a07f4650b..dcadd5d297 100644
--- a/torch_npu/csrc/aten/npu_native_functions.yaml
+++ b/torch_npu/csrc/aten/npu_native_functions.yaml
@@ -898,9 +898,6 @@ supported:
   - var_mean.names_dim
   - view_as
   - where.self
-  - where.ScalarSelf
-  - where.ScalarOther
-  - where.Scalar
   - where
   - _s_where
   - norm_except_dim
@@ -1099,8 +1096,6 @@ supported:
   - scatter.value
   - scatter.dimname_src
   - scatter.dimname_value
-  - scatter_.reduce
-  - scatter_.value_reduce
   - scatter_add_
   - scatter_add
   - scatter_add.dimname
@@ -1890,6 +1885,8 @@ custom:
     variants: function, method
   - func: one_(Tensor(a!) self) -> Tensor(a!)
     variants: method, function
+  - func: npu_bert_apply_adam(Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor var, Tensor m, Tensor v)
+  - func: npu_bert_apply_adam.out(Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay, *, Tensor(a!) var, Tensor(b!) m, Tensor(c!) v) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   - func: npu_conv_transpose2d_backward(Tensor input, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   - func: npu_conv_transpose3d_backward(Tensor input, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   - func: npu_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[] stride, int[] padding, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
@@ -1913,8 +1910,12 @@ custom:
   - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, int begin_mask=0, int end_mask=0, int ellipsis_mask=0, int new_axis_mask=0, int shrink_axis_mask=0, *, Tensor(a!) out) -> Tensor(a!)
   - func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
     variants: function, method
-  - func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+  - func: npu_softmax_cross_entropy_with_logits(Tensor self, Tensor labels) -> Tensor
+    variants: function, method
+  - func: npu_softmax_cross_entropy_with_logits_backward(Tensor grad, Tensor self, Tensor labels) -> Tensor
+    variants: function, method
   - func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor)
 custom_autograd:
   - func: npu_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
   - func: npu_convolution_transpose(Tensor input, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
+  - func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
diff --git a/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
deleted file mode 100644
index 2c14e73e39..0000000000
--- a/torch_npu/csrc/aten/ops/LinearBackwardKernelNpu.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "torch_npu/csrc/framework/utils/OpAdapter.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-
-namespace at_npu {
-namespace native {
-
-at::Tensor linear_backward_out_npu(
-    at::Tensor& result,
-    const at::Tensor& input,
-    const at::Tensor& weight,
-    bool transpose_x1,
-    bool transpose_x2) {
-  int64_t offset_x = 0;
-  OpCommand cmd;
-  cmd.Name("MatMulV2")
-      .Input(input)
-      .Input(weight)
-      .Output(result)
-      .Attr("transpose_x1", transpose_x1)
-      .Attr("transpose_x2", transpose_x2)
-      .Attr("offset_x", offset_x)
-      .Run();
-  return result;
-}
-
-std::tuple<at::Tensor, at::Tensor> NPUNativeFunctions::npu_linear_backward(
-    const at::Tensor& grad,
-    const at::Tensor& input,
-    const at::Tensor& weight) {
-  c10::SmallVector<int64_t, SIZE> inputGradOutputSize = {
-      grad.size(0),
-      weight.size(1)};
-  c10::SmallVector<int64_t, SIZE> weightGradOutputSize = {
-      grad.size(1),
-      input.size(1)};
-  at::Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize);
-  at::Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize);
-
-  if (CalcuOpUtil::get_tensor_npu_format(grad) == CalcuOpUtil::get_tensor_npu_format(weight)) {
-    linear_backward_out_npu(inputGrad, grad, weight, false, false);
-    linear_backward_out_npu(weightGrad, grad, input, true, false);
-  } else {
-    at::Tensor gradFormatcast = OpPreparation::ApplyTensor(grad, grad.sizes());
-    gradFormatcast = NPUNativeFunctions::npu_format_cast(grad, CalcuOpUtil::get_tensor_npu_format(weight));
-    linear_backward_out_npu(inputGrad, gradFormatcast, weight, false, false);
-    linear_backward_out_npu(weightGrad, gradFormatcast, input, true, false);
-  }
-
-  return std::tie(inputGrad, weightGrad);
-}
-
-} // namespace native
-} // namespace at_npu
\ No newline at end of file
diff --git a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
index 3cdc4cfd1d..e4b8ff73f4 100644
--- a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
@@ -15,15 +15,21 @@
 // limitations under the License.
#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include namespace at_npu { namespace native { +using torch::autograd::Function; +using torch::autograd::AutogradContext; +using tensor_list = std::vector; -at::Tensor NPUNativeFunctions::npu_linear( +at::Tensor linear_npu( const at::Tensor& input, const at::Tensor& weight, - const at::Tensor& bias) { + const c10::optional & bias_opt) { + const at::Tensor& bias = c10::value_or_else(bias_opt, [] {return at::Tensor();}); c10::SmallVector outputSize = {input.size(0), weight.size(0)}; at::Tensor output = OpPreparation::ApplyTensor(input, outputSize); @@ -44,5 +50,92 @@ at::Tensor NPUNativeFunctions::npu_linear( return output; } +at::Tensor linear_backward_out_npu( + at::Tensor& result, + const at::Tensor& input, + const at::Tensor& weight, + bool transpose_x1, + bool transpose_x2) { + int64_t offset_x = 0; + OpCommand cmd; + cmd.Name("MatMulV2") + .Input(input) + .Input(weight) + .Output(result) + .Attr("transpose_x1", transpose_x1) + .Attr("transpose_x2", transpose_x2) + .Attr("offset_x", offset_x) + .Run(); + return result; +} + +tuple NPUNativeFunctions::npu_linear_backward( + const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& weight) { + c10::SmallVector inputGradOutputSize = { + grad.size(0), + weight.size(1)}; + c10::SmallVector weightGradOutputSize = { + grad.size(1), + input.size(1)}; + at::Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize); + at::Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize); + + if (CalcuOpUtil::get_tensor_npu_format(grad) == CalcuOpUtil::get_tensor_npu_format(weight)) { + linear_backward_out_npu(inputGrad, grad, weight, false, false); + linear_backward_out_npu(weightGrad, grad, input, true, false); + } else { + at::Tensor gradFormatcast = OpPreparation::ApplyTensor(grad, grad.sizes()); + gradFormatcast = NPUNativeFunctions::npu_format_cast(grad, CalcuOpUtil::get_tensor_npu_format(weight)); + linear_backward_out_npu(inputGrad, gradFormatcast, weight, false, false); + linear_backward_out_npu(weightGrad, gradFormatcast, input, true, false); + } + + return std::tie(inputGrad, weightGrad); +} + +class NPULinearFunction : public torch::autograd::Function { +public: + static at::Tensor forward(AutogradContext *ctx, + const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt) { + ctx->saved_data["bias_has_value"] = (bias_opt.has_value() == true) ? 
bias_opt.value().requires_grad() : false; + + at::AutoNonVariableTypeMode g; + ctx->save_for_backward({input, weight}); + return linear_npu(input, weight, bias_opt); + } + + static tensor_list backward(AutogradContext *ctx, + tensor_list grad_outputs) { + auto bias_has_value = ctx->saved_data["bias_has_value"].toBool(); + auto saved = ctx->get_saved_variables(); + auto input = saved[0]; + auto weight = saved[1]; + + tuple result = NPUNativeFunctions::npu_linear_backward(grad_outputs[0], input, weight); + + tensor_list output; + if (bias_has_value) { + output = {std::get<0>(result), + std::get<1>(result), + grad_outputs[0]}; + } else { + output = {std::get<0>(result), + std::get<1>(result), + at::Tensor()}; + } + return output; + } +}; + +at::Tensor NPUNativeFunctions::npu_linear(const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt) { + return NPULinearFunction::apply(input, weight, bias_opt); +} + } // namespace native } // namespace at_npu \ No newline at end of file -- Gitee From bf67bb8f1f3b3482fc4f9f3b231e69b77ba0130c Mon Sep 17 00:00:00 2001 From: hxf12345677 Date: Tue, 15 Feb 2022 02:18:28 +0000 Subject: [PATCH 3/4] =?UTF-8?q?npu=5Flinear=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_npu_linear_backward.py | 31 ++++++++++--------- torch_npu/csrc/aten/ops/LinearKernelNpu.cpp | 3 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py index 468f10c72b..86b64ba0c2 100644 --- a/test/test_network_ops/test_npu_linear_backward.py +++ b/test/test_network_ops/test_npu_linear_backward.py @@ -29,7 +29,8 @@ class TestNpuLinearBackward(TestCase): output = torch.nn.functional.linear(x, weight, bias) loss = output.sum() loss.backward() - return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy() + list1 = [output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()] + return list1 def npu_op_exec(self, x, weight, bias): x.requires_grad = True @@ -38,7 +39,8 @@ class TestNpuLinearBackward(TestCase): output = torch_npu.npu_linear(x, weight, bias) loss = output.sum() loss.backward() - return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy() + list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()] + return list2 def test_npu_linear_backward_shape_format_fp32(self, device): shape_format = [ @@ -50,12 +52,12 @@ class TestNpuLinearBackward(TestCase): cpu_x, npu_x = create_common_tensor(item[0], -2, 2) cpu_w, npu_w = create_common_tensor(item[1], -2, 2) cpu_b, npu_b = create_common_tensor(item[2], -2, 2) - cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) - npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b) - self.assertRtolEqual(cpu_output, npu_output, 0.0002) - self.assertRtolEqual(cpu_x_grad, npu_x_grad) - self.assertRtolEqual(cpu_w_grad, npu_w_grad) - self.assertRtolEqual(cpu_b_grad, npu_b_grad) + getlist1 = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) + getlist2 = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(getlist1[0], getlist2[0], 0.0002) + self.assertRtolEqual(getlist1[1], getlist2[1]) + self.assertRtolEqual(getlist1[2], getlist2[2]) + self.assertRtolEqual(getlist1[3], getlist2[3]) def test_npu_linear_shape_format_fp16(self, device): 
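Patch 2 folds the backward kernel into LinearKernelNpu.cpp and moves npu_linear from the custom section to custom_autograd, routing it through a torch::autograd::Function subclass so the hand-written backward participates in autograd. A Python analogue of NPULinearFunction, offered only as a sketch: it substitutes plain matmuls for the NPU kernels, and it reduces the bias gradient with sum(0), the shape a 1-D bias requires, whereas the C++ class hands grad_outputs[0] back unreduced.

    import torch

    class LinearFunctionSketch(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, weight, bias=None):
            # Same bookkeeping as NPULinearFunction::forward: record whether the
            # bias wants a gradient and save input/weight for the backward pass.
            ctx.bias_has_value = bias is not None and bias.requires_grad
            ctx.save_for_backward(x, weight)
            out = x.matmul(weight.t())
            return out + bias if bias is not None else out

        @staticmethod
        def backward(ctx, grad):
            x, weight = ctx.saved_tensors
            grad_input = grad.matmul(weight)    # first result of npu_linear_backward
            grad_weight = grad.t().matmul(x)    # second result of npu_linear_backward
            grad_bias = grad.sum(0) if ctx.bias_has_value else None
            return grad_input, grad_weight, grad_bias
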
From bf67bb8f1f3b3482fc4f9f3b231e69b77ba0130c Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Tue, 15 Feb 2022 02:18:28 +0000
Subject: [PATCH 3/4] Modify npu_linear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_npu_linear_backward.py             | 31 ++++++++++---------
 torch_npu/csrc/aten/ops/LinearKernelNpu.cpp |  3 +-
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
index 468f10c72b..86b64ba0c2 100644
--- a/test/test_network_ops/test_npu_linear_backward.py
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -29,7 +29,8 @@ class TestNpuLinearBackward(TestCase):
         output = torch.nn.functional.linear(x, weight, bias)
         loss = output.sum()
         loss.backward()
-        return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()
+        list1 = [output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()]
+        return list1
 
     def npu_op_exec(self, x, weight, bias):
         x.requires_grad = True
@@ -38,7 +39,8 @@ class TestNpuLinearBackward(TestCase):
         output = torch_npu.npu_linear(x, weight, bias)
         loss = output.sum()
         loss.backward()
-        return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()
+        list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()]
+        return list2
 
     def test_npu_linear_backward_shape_format_fp32(self, device):
         shape_format = [
@@ -50,12 +52,12 @@ class TestNpuLinearBackward(TestCase):
             cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
             cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
             cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
-            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
-            self.assertRtolEqual(cpu_x_grad, npu_x_grad)
-            self.assertRtolEqual(cpu_w_grad, npu_w_grad)
-            self.assertRtolEqual(cpu_b_grad, npu_b_grad)
+            getlist1 = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            getlist2 = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(getlist1[0], getlist2[0], 0.0002)
+            self.assertRtolEqual(getlist1[1], getlist2[1])
+            self.assertRtolEqual(getlist1[2], getlist2[2])
+            self.assertRtolEqual(getlist1[3], getlist2[3])
 
     def test_npu_linear_shape_format_fp16(self, device):
         shape_format = [
@@ -67,13 +69,12 @@ class TestNpuLinearBackward(TestCase):
             cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
             cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
             cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(
-                cpu_x.float(), cpu_w.float(), cpu_b.float())
-            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output.astype(np.float16), npu_output)
-            self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad)
-            self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad)
-            self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad)
+            getlist1 = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float())
+            getlist2 = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(getlist1[0].astype(np.float16), getlist2[0])
+            self.assertRtolEqual(getlist1[1].astype(np.float16), getlist2[1])
+            self.assertRtolEqual(getlist1[2].astype(np.float16), getlist2[2])
+            self.assertRtolEqual(getlist1[3].astype(np.float16), getlist2[3])
 
 
 instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
diff --git a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
index e4b8ff73f4..ffe290ab1e 100644
--- a/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
+++ b/torch_npu/csrc/aten/ops/LinearKernelNpu.cpp
@@ -14,10 +14,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <torch/csrc/autograd/custom_function.h>
+
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
 #include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include <torch/csrc/autograd/custom_function.h>
 
 namespace at_npu {
 namespace native {
-- 
Gitee

From 69155121b91f673eb2d5e82ac77e29218027f260 Mon Sep 17 00:00:00 2001
From: hxf12345677
Date: Tue, 15 Feb 2022 02:23:35 +0000
Subject: [PATCH 4/4] Modify npu_linear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 test/test_network_ops/test_npu_linear_backward.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_network_ops/test_npu_linear_backward.py b/test/test_network_ops/test_npu_linear_backward.py
index 86b64ba0c2..0874c99356 100644
--- a/test/test_network_ops/test_npu_linear_backward.py
+++ b/test/test_network_ops/test_npu_linear_backward.py
@@ -39,7 +39,8 @@ class TestNpuLinearBackward(TestCase):
         output = torch_npu.npu_linear(x, weight, bias)
         loss = output.sum()
         loss.backward()
-        list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()]
+        list2 = [output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(),
+                 bias.grad.cpu().numpy()]
         return list2
 
     def test_npu_linear_backward_shape_format_fp32(self, device):
-- 
Gitee
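With the series applied, npu_linear is registered under custom_autograd and both kernels live in LinearKernelNpu.cpp, so a single call drives the MatMulV2 forward and the custom backward. A minimal usage sketch modeled on the tests above, assuming an NPU device is reachable and torch_npu was built with these patches:

    import torch
    import torch_npu

    x = torch.rand(123, 456).npu()
    w = torch.rand(789, 456).npu()
    b = torch.rand(789).npu()
    for t in (x, w, b):
        t.requires_grad = True

    out = torch_npu.npu_linear(x, w, b)  # shape [123, 789], equals x @ w.T + b
    out.sum().backward()                 # gradients flow through npu_linear_backward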