From f333dbcab9e469af6ae70207d7377714fafbdbdd Mon Sep 17 00:00:00 2001
From: suyafeng s00639171
Date: Tue, 22 Aug 2023 00:40:57 +0800
Subject: [PATCH 1/2] adjust order

---
 torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
index 97cb537680..43cdee1779 100644
--- a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
+++ b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
@@ -183,8 +183,8 @@ std::tuple matmul_backward(const at::Tensor &grad,
     return std::make_tuple(at::Tensor(), at::Tensor());
   }
   // backward mat1 and mat2 separately
-  auto self_grad = matmul_mat1_backward(self, other, grad);
   auto other_grad = matmul_mat2_backward(self, other, grad);
+  auto self_grad = matmul_mat1_backward(self, other, grad);
 
   // strip added dim: (5,1)->(5)
   if (other.dim() == 1 && other_grad.size(-1) == 1) {
--
Gitee

From 2f370d91a55ba870dcca87f7f19fc617b3e35e23 Mon Sep 17 00:00:00 2001
From: suyafeng s00639171
Date: Wed, 23 Aug 2023 05:15:44 +0800
Subject: [PATCH 2/2] rewrite from PC

---
 test/test_network_ops/test_matmul.py          | 120 ++++++++++++++-
 .../aten/ops/op_api/MatmulKernelNpuOpApi.cpp  | 144 +++++++++++++++---
 2 files changed, 240 insertions(+), 24 deletions(-)

diff --git a/test/test_network_ops/test_matmul.py b/test/test_network_ops/test_matmul.py
index 7ade8c135f..cb7b8162a0 100644
--- a/test/test_network_ops/test_matmul.py
+++ b/test/test_network_ops/test_matmul.py
@@ -19,7 +19,7 @@ import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
 
-
+torch_npu.npu.set_compile_mode(jit_compile=False)
 PrescsionTableFP16 = [
     [2, 1e2, 0.005], [2, 1e3, 0.005], [2, 1e4, 0.005], [2, 1e5, 0.005], [2, 1e6, 0.005],
     [10, 1e2, 0.005], [10, 1e3, 0.01], [10, 1e4, 0.02], [10, 1e5, 0.0305], [10, 1e6, 0.04],
@@ -74,6 +74,8 @@ class TestMatMul(TestCase):
 
         return cpu_output.detach().numpy(), input1.grad.numpy(), input2.grad.numpy()
 
+        # return cpu_output.detach().numpy()
+
     def op_exec_npu(self, mat1, mat2):
         input1 = mat1
         input2 = mat2
@@ -86,7 +88,9 @@ class TestMatMul(TestCase):
         npu_output = npu_output.cpu()
         return npu_output.detach().cpu().numpy(), input1.grad.cpu().numpy(), input2.grad.cpu().numpy()
 
-    def matmul_backward_result(self, shape_format):
+        # return npu_output.detach().cpu().numpy()
+
+    def matmul_backward_result(self, shape_format, transpose_mat1=False, transpose_mat2=False):
         for item in shape_format:
             mat1_cpu, mat1_npu = create_common_tensor(item[0], -10, 10)
             if mat1_cpu.dtype == torch.float16:
@@ -94,9 +98,39 @@ class TestMatMul(TestCase):
                 mat1_cpu = mat1_cpu.to(torch.float32)
             mat2_cpu, mat2_npu = create_common_tensor(item[1], -10, 10)
             if mat2_cpu.dtype == torch.float16:
                 mat2_cpu = mat2_cpu.to(torch.float32)
+
+            if transpose_mat1:
+                mat1_cpu = mat1_cpu.transpose(-2, -1)
+                mat1_npu = mat1_npu.transpose(-2, -1)
+
+            if transpose_mat2:
+                mat2_cpu = mat2_cpu.transpose(-2, -1)
+                mat2_npu = mat2_npu.transpose(-2, -1)
+
+            print("steven =========== mat1_cpu.shape ", mat1_cpu.shape)
+            print("steven =========== mat2_cpu.shape ", mat2_cpu.shape)
+
             cpu_output, cpu_mat1_grad, cpu_mat2_grad = self.op_exec_cpu(mat1_cpu, mat2_cpu)
             npu_output, npu_mat1_grad, npu_mat2_grad = self.op_exec_npu(mat1_npu, mat2_npu)
+            # cpu_output = self.op_exec_cpu(mat1_cpu, mat2_cpu)
+            # npu_output = self.op_exec_npu(mat1_npu, mat2_npu)
+
+            # print("steven =========== cpu_output ", cpu_output.flatten()[0:3])
print("steven =========== npu_output ", npu_output.flatten()[0:3]) + # # + # print("steven =========== cpu_mat1_grad.shape ", cpu_mat1_grad.shape) + # print("steven =========== npu_mat1_grad.shape ", npu_mat1_grad.shape) + # + # print("steven =========== cpu_mat1_grad ", cpu_mat1_grad.flatten()[0:3]) + # print("steven =========== npu_mat1_grad ", npu_mat1_grad.flatten()[0:3]) + # # + # print("steven =========== cpu_mat2_grad.shape ", cpu_mat2_grad.shape) + # print("steven =========== npu_mat2_grad.shape ", npu_mat2_grad.shape) + + # print("steven =========== cpu_mat2_grad ", cpu_mat2_grad.flatten()[0:3]) + # print("steven =========== npu_mat2_grad ", npu_mat2_grad.flatten()[0:3]) + self.assertRtolEqualMatmul(cpu_output.astype(npu_output.dtype), npu_output) self.assertRtolEqualMatmul(cpu_mat1_grad.astype(npu_mat1_grad.dtype), npu_mat1_grad) self.assertRtolEqualMatmul(cpu_mat2_grad.astype(npu_mat2_grad.dtype), npu_mat2_grad) @@ -198,5 +232,87 @@ class TestMatMul(TestCase): self.matmul_backward_result(shape_format) torch.npu.matmul.allow_hf32 = False + def test_matmul_backward_big_memory(self): + torch.npu.matmul.allow_hf32 = True + shape_format = [ + [[np.float16, 2, [8192, 4, 5120]], [np.float16, 2, [5120, 3416]]], + ] + self.matmul_backward_result(shape_format) + torch.npu.matmul.allow_hf32 = False + + def test_matmul_backward_transpose_right(self): + torch.npu.matmul.allow_hf32 = True + shape_format = [ + [[np.float16, 2, [5, 4, 2]], [np.float16, 2, [3, 2]]], + [[np.float16, 2, [3, 2, 1, 2]], [np.float16, 2, [3, 2]]], + [[np.float16, 2, [4, 2]], [np.float16, 2, [3, 2]]], + [[np.float16, 2, [10, 2]], [np.float16, 2, [4, 3, 2]]], + ] + self.matmul_backward_result(shape_format, False, True) + torch.npu.matmul.allow_hf32 = False + + def test_matmul_backward_transpose_left(self): + torch.npu.matmul.allow_hf32 = True + shape_format = [ + [[np.float16, 2, [2, 3, 4]], [np.float16, 2, [3, 2]]], # 8, 4 + [[np.float16, 2, [3, 2, 3, 4]], [np.float16, 2, [3, 2]]],#8, 4 + [[np.float16, 2, [3, 4]], [np.float16, 2, [3, 2]]],#2,2 + [[np.float16, 2, [3, 4]], [np.float16, 2, [5, 3, 2]]], ### !!!!!!!!!!!! 8, 10 + ] + self.matmul_backward_result(shape_format, True, False) + torch.npu.matmul.allow_hf32 = False + + def test_matmul_backward_transpose_both_bmm(self): + torch.npu.matmul.allow_hf32 = True + shape_format = [ + [[np.float16, 2, [5, 3, 4]], [np.float16, 2, [5, 2, 3]]], # 8, 4 + [[np.float16, 2, [5, 2, 3, 4]], [np.float16, 2, [2, 2, 3]]],#8, 4 + ] + self.matmul_backward_result(shape_format, True, True) + torch.npu.matmul.allow_hf32 = False + + + def test_matmul_backward_transpose_both(self): + torch.npu.matmul.allow_hf32 = True + shape_format = [ + [[np.float16, 2, [2, 3, 4]], [np.float16, 2, [2,3]]], + [[np.float16, 2, [3, 2, 3, 4]], [np.float16, 2, [2,3]]], + [[np.float16, 2, [3, 4]], [np.float16, 2, [2,3]]], + [[np.float16, 2, [3, 4]], [np.float16, 2, [5, 2, 3]]], ### !!!!!!!!!!!! 
+        ]
+        self.matmul_backward_result(shape_format, True, True)
+        torch.npu.matmul.allow_hf32 = False
+
+    # def test_matmul_mm(self):
+    #
+    #     # input image [64,64,3]
+    #     mat1 = torch.randn(3, 4).t().to(torch.float32)
+    #     mat2 = torch.randn(5, 3, 2).to(torch.float32)
+    #
+    #     cpu_grad = torch.matmul(mat1, mat2)
+    #
+    #     mat3 = mat2.reshape(mat2.size(-2), -1)
+    #
+    #     cpu_grad = cpu_grad.transpose(-2, -1)
+    #     cpu_grad = cpu_grad.reshape(-1, cpu_grad.size(-1))
+    #
+    #     cpu_output = torch.mm(mat3, cpu_grad).cpu().detach().numpy()
+    #
+    #     mat1 = mat1.npu()
+    #     mat2 = mat2.npu()
+    #     npu_grad = torch.matmul(mat1, mat2)
+    #     mat3 = mat2.reshape(mat2.size(-2), -1)
+    #
+    #     npu_grad = npu_grad.transpose(-2, -1)
+    #     npu_grad = npu_grad.reshape(-1, npu_grad.size(-1))
+    #
+    #     npu_output = torch.mm(mat3, npu_grad).cpu().detach().numpy()
+    #
+    #     self.assertRtolEqualMatmul(cpu_output, npu_output)
+    #
+    #     print("++++++++++++")
+
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
index 43cdee1779..c59bb0edca 100644
--- a/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
+++ b/torch_npu/csrc/aten/ops/op_api/MatmulKernelNpuOpApi.cpp
@@ -19,8 +19,11 @@
 #include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
 #include "torch_npu/csrc/framework/utils/OpAdapter.h"
 
+#include <iostream>
+
 namespace at_npu {
 namespace native {
+using namespace std;
 
 const int8_t ALLOW_FP32_DOWN_PRECISION = 1;
 const int8_t KEEP_DTYPE = 0;
@@ -90,12 +93,25 @@ static c10::SmallVector get_output_size(const at::Tensor &tensor1
 static inline void matmul_implement_npu(at::Tensor &out, const at::Tensor &self,
                                         const at::Tensor &mat2) {
+
+  cout << " sssssss " << " matmul_implement_npu "
+       << " self.sizes(): " << self.sizes()
+       << " mat2.sizes(): " << mat2.sizes()
+       << " out.sizes(): " << out.sizes()
+       << endl;
+
   // allow dicrease precision
   int8_t cube_math_type = ALLOW_FP32_DOWN_PRECISION;
   EXEC_NPU_CMD(aclnnMatmul, self, mat2, out, cube_math_type);
   return;
 }
 
+// If the input is column-major, return the grad in column-major order for efficiency.
+static inline bool is_column_major(const at::Tensor &mat) {
+  bool row_major = (mat.stride(-1) == 1 && mat.stride(-2) == mat.size(-1));
+  return false == row_major;
+}
+
 at::Tensor matmul_mat1_backward(const at::Tensor self, const at::Tensor other,
                                 const at::Tensor grad_output) {
   /*mat1_grad = grad * mat2^T*/
@@ -119,19 +135,63 @@ at::Tensor matmul_mat1_backward(const at::Tensor self, const at::Tensor other,
   }
 
   at::Tensor output;
-  if (mat1.dim() == 2 && mat2.dim() > 2) { // mm
-    output = OpPreparation::ApplyTensorWithoutFormat(mat1.sizes(), grad.options());
-    mat2 = mat2.transpose(-2, -1);
-    mat2 = mat2.reshape({-1, mat2.size(-1)});
-    grad = grad.view({grad.size(-2), -1});
-    matmul_implement_npu(output, grad, mat2);
-    output = output.reshape(self.sizes());
+  if (mat1.dim() == 2) { // mm
+
+    // For the second operand: transpose first, then merge the k axis.
+    if (is_column_major(mat1) && mat2.dim() == 2) {
+    //if (is_column_major(mat1)) {
+      output = OpPreparation::ApplyTensorWithoutFormat(mat1.t().sizes(), grad.options());
+      // column-major: compute (mat2 * grad^T)^T
+      cout << " sssssss " << " mat1_back column_major "
+           << " mat1.sizes(): " << mat1.sizes()
+           << " mat2.sizes(): " << mat2.sizes()
+           << " grad.sizes(): " << grad.sizes()
+           << endl;
+
+      grad = grad.transpose(-2, -1);
+      grad = grad.reshape({-1, grad.size(-1)});
+      mat2 = mat2.reshape({mat2.size(-2), -1}); // column-contiguous, fold along the column direction
+      matmul_implement_npu(output, mat2, grad);
+      output = output.t();
+      output = output.reshape(self.sizes());
+
+    } else {
+      output = OpPreparation::ApplyTensorWithoutFormat(mat1.sizes(), grad.options());
+
+      cout << " sssssss " << " mat1_back "
+           << " mat1.sizes(): " << mat1.sizes()
+           << " mat2.sizes(): " << mat2.sizes()
+           << " grad.sizes(): " << grad.sizes()
+           << endl;
+
+      // grad * mat2^T: transpose first, then merge the k axis
+      mat2 = mat2.transpose(-2, -1);
+      mat2 = mat2.reshape({-1, mat2.size(-1)});
+      grad = grad.reshape({grad.size(-2), -1});
+      matmul_implement_npu(output, grad, mat2);
+      output = output.reshape(self.sizes());
+    }
   } else { // bmm
-    mat2 = mat2.transpose(-2, -1);
-    auto expend_sizes = get_output_size(grad, mat2);
-    output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
-    matmul_implement_npu(output, grad, mat2);
+    cout << " sssssss " << " mat1_back bmm 111" << endl;
+    if (is_column_major(mat1)) { // (mat2 * grad^T)^T
+      cout << " sssssss " << " mat1_back column_major bmm "
+           << " mat1.sizes(): " << mat1.sizes()
+           << " mat2.sizes(): " << mat2.sizes()
+           << " grad.sizes(): " << grad.sizes()
+           << endl;
+      grad = grad.transpose(-2, -1);
+      auto expend_sizes = get_output_size(mat2, grad);
+      output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
+      matmul_implement_npu(output, mat2, grad);
+      output = output.transpose(-2, -1);
+    } else { // grad * mat2^T
+      mat2 = mat2.transpose(-2, -1);
+      auto expend_sizes = get_output_size(grad, mat2);
+      output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, grad.options());
+      matmul_implement_npu(output, grad, mat2);
+    }
+
   }
 
   return output;
@@ -159,18 +219,58 @@
   }
 
   at::Tensor output;
-  if (mat2.dim() == 2 && mat1.dim() > 2) { // mm
-    output = OpPreparation::ApplyTensorWithoutFormat(mat2.sizes(), mat1.options());
-    mat1 = mat1.reshape({-1, mat1.size(-1)});
-    grad = grad.reshape({-1, grad.size(-1)});
-    mat1 = mat1.transpose(-2, -1);
-    matmul_implement_npu(output, mat1, grad);
-    output = output.reshape(other.sizes());
+  if (mat2.dim() == 2) { // mm
+    if (is_column_major(mat2)) {
+      output = OpPreparation::ApplyTensorWithoutFormat(mat2.t().sizes(), mat1.options());
+      // column-major: compute (grad^T * mat1)^T
+      cout << " sssssss " << " mat2_back column_major "
+           << " mat1.sizes(): " << mat1.sizes()
+           << " mat2.sizes(): " << mat2.sizes()
+           << " grad.sizes(): " << grad.sizes()
+           << endl;
+
+      grad = grad.reshape({-1, grad.size(-1)});
+      mat1 = mat1.reshape({-1, mat1.size(-1)});
+      grad = grad.transpose(-2, -1);
+
+      matmul_implement_npu(output, grad, mat1);
+
+      // cout << "output: " << output << endl;
+      output = output.t();
+
+      // cout << "output2: " << output << endl;
+      output = output.reshape(other.sizes());
+
+    } else {
+      // mat1^T * grad: merge the k axis first, then transpose
+      output = OpPreparation::ApplyTensorWithoutFormat(mat2.sizes(), mat1.options());
+      mat1 = mat1.reshape({-1, mat1.size(-1)});
+      grad = grad.reshape({-1, grad.size(-1)});
+      mat1 = mat1.transpose(-2, -1);
+      matmul_implement_npu(output, mat1, grad);
+      output = output.reshape(other.sizes());
+    }
+
   } else { // bmm
+    cout << " sssssss " << " mat2_back bmm 222" << endl;
+    if (is_column_major(mat2)) { // (grad^T * mat1)^T
+      cout << " sssssss " << " mat2_back column_major bmm "
+           << " mat1.sizes(): " << mat1.sizes()
+           << " mat2.sizes(): " << mat2.sizes()
+           << " grad.sizes(): " << grad.sizes()
+           << endl;
+      grad = grad.transpose(-2, -1);
+      auto expend_sizes = get_output_size(grad, mat1);
+      output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, mat1.options());
+      matmul_implement_npu(output, grad, mat1);
+      output = output.transpose(-2, -1);
+
+    } else { // mat1^T * grad
+      mat1 = mat1.transpose(-2, -1);
+      auto expend_sizes = get_output_size(mat1, grad);
+      output = OpPreparation::ApplyTensorWithoutFormat(expend_sizes, mat1.options());
+      matmul_implement_npu(output, mat1, grad);
+    }
   }
 
   return output;
--
Gitee
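
Note on the gradient identities used by the column-major branches above (a standalone sketch, not part of the patch; tensor names are illustrative): for out = mat1 @ mat2, the backward pass needs mat1_grad = grad * mat2^T and mat2_grad = mat1^T * grad. The column-major paths instead compute (mat2 * grad^T)^T and (grad^T * mat1)^T, which are mathematically identical but leave the result in the transposed layout of the corresponding input. A minimal CPU-only PyTorch check of that equivalence:

import torch

# Plain-PyTorch sanity check, independent of the NPU kernels in the patch.
mat1 = torch.randn(4, 3)
mat2 = torch.randn(3, 5)
grad = torch.randn(4, 5)   # upstream gradient, same shape as mat1 @ mat2

mat1_grad = grad @ mat2.t()               # grad * mat2^T (row-major path)
mat1_grad_cm = (mat2 @ grad.t()).t()      # (mat2 * grad^T)^T (column-major path)

mat2_grad = mat1.t() @ grad               # mat1^T * grad (row-major path)
mat2_grad_cm = (grad.t() @ mat1).t()      # (grad^T * mat1)^T (column-major path)

print(torch.allclose(mat1_grad, mat1_grad_cm))  # True
print(torch.allclose(mat2_grad, mat2_grad_cm))  # True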