diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index 8e59829b5732c061d84fec33f3401d83ca9257a3..021418f12d15c708b799729a39fd351db043dd69 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -21,3 +21,4 @@ from .ops.npu_bounding_box_encode import npu_bounding_box_encode
 from .ops.npu_batch_nms import npu_batch_nms
 from .ops.npu_confusion_transpose import npu_confusion_transpose
 from .ops.npu_broadcast import npu_broadcast
+from .ops.npu_moe_tutel import npu_moe_tutel
diff --git a/ads/common/ops/csrc/MoeTutelOpApi.cpp b/ads/common/ops/csrc/MoeTutelOpApi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6ec9fa354b375b96b8cc2f61bba993cbed69a3f0
--- /dev/null
+++ b/ads/common/ops/csrc/MoeTutelOpApi.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
+#include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/aten/CustomFunctions.h"
+#include "functions.h"
+#include "common.h"
+#include "OpApiCommon.h"
+
+using npu_preparation = at_npu::native::OpPreparation;
+using torch::autograd::Function;
+using torch::autograd::AutogradContext;
+using tensor_tuple = std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>;
+
+namespace {
+inline void npu_moe_tutel_check(
+    const at::Tensor& self,
+    const at::Tensor& gates,
+    const at::Tensor& indices,
+    const at::Tensor& locations)
+{
+    TORCH_CHECK(self.dim() == 2, "The dim of input tensor [x] should equal to (sample, hidden).");
+    TORCH_CHECK(gates.dim() == 2, "The dim of gates tensor [x] should equal to (batch, sample).");
+    TORCH_CHECK(self.sizes()[0] == gates.sizes()[1], "input's sample size should equal to gates's samples size.");
+    TORCH_CHECK((gates.sizes() == indices.sizes()) && (indices.sizes() == locations.sizes()),
+        "Shape of gates should match shape of indices and locations.");
+}
+} // namespace
+
+at::Tensor npu_moe_tutel(
+    const at::Tensor& self,
+    const at::Tensor& gates,
+    const at::Tensor& indices,
+    const at::Tensor& locations,
+    int64_t capacity)
+{
+    npu_moe_tutel_check(self, gates, indices, locations);
+    auto gates_size = gates.sizes();
+    auto self_size = self.sizes();
+    auto output_size = {gates_size[0], capacity, self_size[1]};
+    at::Tensor result = at::zeros(output_size, self.options());
+    EXEC_NPU_CMD(aclnnMoeTutelDispatch, self, gates, indices, locations, capacity, result);
+    return result;
+}
+
+at::Tensor npu_moe_tutel_data_backward(
+    const at::Tensor& y_grad,
+    const at::Tensor& gates,
+    const at::Tensor& indices,
+    const at::Tensor& locations)
+{
+    auto gates_size = gates.sizes();
+    auto grad_size = y_grad.sizes();
+    auto output_size = {gates_size[1], grad_size[2]};
+    at::Tensor result = at::zeros(output_size, y_grad.options());
+    EXEC_NPU_CMD(aclnnMoeTutelCombineX, y_grad, gates, indices, locations, result);
+    return result;
+}
+
+at::Tensor npu_moe_tutel_gate_backward(
+    const at::Tensor& self,
+    const at::Tensor& y_grad,
+    const at::Tensor& indices,
+    const at::Tensor& locations)
+{
+    at::Tensor result = at::zeros(indices.sizes(), y_grad.options());
+    EXEC_NPU_CMD(aclnnMoeTutelCombineGates, self, y_grad, indices, locations, result);
+    return result;
+}
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index 0afa2edce5b9de5e41969fbd8b7d91855377f509..f713b98400c6d2168727ff675f61b2c73a086b74 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -116,5 +116,21 @@ at::Tensor npu_conv_transpose2d(
     int64_t groups);
 at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size);
 at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result);
+at::Tensor npu_moe_tutel(
+    const at::Tensor &self,
+    const at::Tensor &gates,
+    const at::Tensor &indices,
+    const at::Tensor &locations,
+    int64_t capacity);
+at::Tensor npu_moe_tutel_data_backward(
+    const at::Tensor &y_grad,
+    const at::Tensor &gates,
+    const at::Tensor &indices,
+    const at::Tensor &locations);
+at::Tensor npu_moe_tutel_gate_backward(
+    const at::Tensor &self,
+    const at::Tensor &y_grad,
+    const at::Tensor &indices,
+    const at::Tensor &locations);
 
 #endif // __FUNCTIONS_H__
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index b8ebe3f5250e9add59301ffab14216dcf2b18539..c91e4093ea2c93a19f213e43557b3cdd6d25cb1e 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -66,4 +66,9 @@ void init_common(pybind11::module &m)
 
     // npu_broadcast
     m.def("npu_broadcast", &npu_broadcast);
+
+    // npu_moe_tutel
+    m.def("npu_moe_tutel", &npu_moe_tutel, "npu_moe_tutel NPU version");
+    m.def("npu_moe_tutel_data_backward", &npu_moe_tutel_data_backward, "npu_moe_tutel_data_backward NPU version");
+    m.def("npu_moe_tutel_gate_backward", &npu_moe_tutel_gate_backward, "npu_moe_tutel_gate_backward NPU version");
 }
diff --git a/ads/common/ops/npu_moe_tutel.py b/ads/common/ops/npu_moe_tutel.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba98c3574f4de65ed8c014d40a6981bce529a606
--- /dev/null
+++ b/ads/common/ops/npu_moe_tutel.py
@@ -0,0 +1,27 @@
+import torch
+from torch.autograd import Function
+from torch.nn import Module
+
+import torch_npu
+import ads_c
+
+
+class MoeTutelFunction(Function):
+    @staticmethod
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    def forward(ctx, x, gates, indices, locations, capacity):
+        result = ads_c.npu_moe_tutel(x, gates, indices, locations, capacity)
+        ctx.save_for_backward(x, gates, indices, locations)
+        return result
+
+    @staticmethod
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    # 'pylint: disable=too-many-return-arguments,huawei-too-many-return-arguments
+    def backward(ctx, y_grad):
+        x0, gates, indices, locations = ctx.saved_tensors
+        x_grad = ads_c.npu_moe_tutel_data_backward(y_grad, gates, indices, locations)
+        gates_grad = ads_c.npu_moe_tutel_gate_backward(x0, y_grad, indices, locations)
+        return x_grad, gates_grad, None, None, None
+
+
+npu_moe_tutel = MoeTutelFunction.apply
diff --git a/tests/test_npu_moe_tutel.py b/tests/test_npu_moe_tutel.py
new file mode 100644
index 0000000000000000000000000000000000000000..8723ced23c99a89456458a9e98d16309387c7760
--- /dev/null
+++ b/tests/test_npu_moe_tutel.py
@@ -0,0 +1,88 @@
+import unittest
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
+
+
+class TestMoeTutel(TestCase):
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    def cpu_to_exec(self, x, gates, indices, locations, capacity, batch_size, sample_size, hidden, dtype):
+        result = torch.zeros([batch_size, capacity, hidden]).to(dtype)
+        for tensor_idx in range(batch_size):
+            for i in range(sample_size):
+                if locations[tensor_idx, i] < capacity and indices[tensor_idx, i] >= 0:
+                    result[int(indices[tensor_idx, i]), int(locations[tensor_idx, i]), :] = gates[tensor_idx, i] * x[i,
+                                                                                                                   :]
+        return result
+
+    def npu_to_exec(self, x, gates, indices, locations, capacity):
+        out = ads.common.npu_moe_tutel(x, gates, indices, locations, capacity)
+        return out.cpu()
+
+    def gen_data(self, shape, dtype):
+        cpu_input = torch.rand(shape, dtype=dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_gates(self, shape, dtype):
+        cpu_input = torch.rand(shape).bool().to(dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_indices(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.zeros((1, sample_size)).int()
+        indices = torch.ones((1, sample_size)).int()
+        for i in range(batch_size - 1):
+            cpu_input = torch.cat((cpu_input, torch.mul(indices, torch.tensor(i + 1, dtype=torch.int32))), 0)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_locations(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.arange(0, sample_size).reshape(1, sample_size).int()
+        cpu_input = cpu_input.repeat(batch_size, 1)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `MoeTutel` is only supported on 910B, skip this ut!")
+    def test_moe_tutel(self):
+        dtype_list = [torch.float16, torch.float32, torch.bfloat16]
+        shape_list = [
+            [[2, 5], [5, 16], 6],
+            [[3, 6], [6, 16], 6],
+            [[4, 7], [7, 32], 12],
+            [[5, 8], [8, 32], 12],
+            [[2, 16384], [16384, 32], 16384],
+        ]
+        items = [
+            [shape, dtype]
+            for shape in shape_list
+            for dtype in dtype_list
+        ]
+        for shape, dtype in items:
+            capacity = shape[2]
+            batch_size = shape[0][0]
+            sample_size = shape[0][1]
+            hidden = shape[1][1]
+            cpu_x, npu_x = self.gen_data(shape[1], dtype)
+            cpu_gates, npu_gates = self.gen_data_gates(shape[0], dtype)
+            cpu_indices, npu_indices = self.gen_data_indices(shape[0])
+            cpu_locations, npu_locations = self.gen_data_locations(shape[0])
+            cpu_out = self.cpu_to_exec(cpu_x, cpu_gates, cpu_indices, cpu_locations, capacity, batch_size, sample_size,
+                                       hidden, dtype)
+            npu_out = self.npu_to_exec(npu_x, npu_gates, npu_indices, npu_locations, capacity)
+            if dtype == torch.bfloat16 or dtype == torch.float16:
+                npu_out = npu_out.to(torch.float32)
+                cpu_out = cpu_out.to(torch.float32)
+            self.assertRtolEqual(npu_out.numpy(), cpu_out.numpy())
+
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/tests/test_npu_moe_tutel_backward.py b/tests/test_npu_moe_tutel_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee401777c0b2976d82fc19a78de73d0187a2b960
--- /dev/null
+++ b/tests/test_npu_moe_tutel_backward.py
@@ -0,0 +1,100 @@
+import unittest
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
+
+
+class TestMoeTutel(TestCase):
+    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+    def cpu_to_exec(self, x, gates, indices, locations, capacity, batch_size, sample_size, hidden, dtype):
+        x.requires_grad = True
+        gates.requires_grad = True
+        out = torch.zeros([batch_size, capacity, hidden]).to(dtype)
+        for tensor_idx in range(batch_size):
+            for i in range(sample_size):
+                if locations[tensor_idx, i] < capacity and indices[tensor_idx, i] >= 0:
+                    out[indices[tensor_idx, i], locations[tensor_idx, i], :] = gates[tensor_idx, i] * x[i, :]
+        out.backward(torch.ones_like(out))
+        x_grad = x.grad
+        gates_grad = gates.grad
+        return x_grad, gates_grad
+
+    def npu_to_exec(self, x, gates, indices, locations, capacity):
+        x.requires_grad = True
+        gates.requires_grad = True
+        out = ads.common.npu_moe_tutel(x, gates, indices, locations, capacity)
+        out.backward(torch.ones_like(out))
+        x_grad = x.grad
+        gates_grad = gates.grad
+        return x_grad, gates_grad
+
+    def gen_data(self, shape, dtype):
+        cpu_input = torch.rand(shape, dtype=dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_gates(self, shape, dtype):
+        cpu_input = torch.rand(shape).bool().to(dtype)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_indices(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.zeros((1, sample_size)).int()
+        indices = torch.ones((1, sample_size)).int()
+        for i in range(batch_size - 1):
+            cpu_input = torch.cat((cpu_input, torch.mul(indices, torch.tensor(i + 1, dtype=torch.int32))), 0)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    def gen_data_locations(self, shape):
+        batch_size = shape[0]
+        sample_size = shape[1]
+        cpu_input = torch.arange(0, sample_size).reshape(1, sample_size).int()
+        cpu_input = cpu_input.repeat(batch_size, 1)
+        npu_input = cpu_input.npu()
+        return cpu_input, npu_input
+
+    @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `MoeTutel` is only supported on 910B, skip this ut!")
+    def test_moe_tutel(self):
+        dtype_list = [torch.float16, torch.float32, torch.bfloat16]
+        shape_list = [
+            [[2, 5], [5, 16], 6],
+            [[3, 6], [6, 16], 6],
+            [[4, 7], [7, 32], 12],
+            [[5, 8], [8, 32], 12],
+            [[2, 16384], [16384, 32], 16384],
+        ]
+        items = [
+            [shape, dtype]
+            for shape in shape_list
+            for dtype in dtype_list
+        ]
+        for shape, dtype in items:
+            capacity = shape[2]
+            batch_size = shape[0][0]
+            sample_size = shape[0][1]
+            hidden = shape[1][1]
+            cpu_x, npu_x = self.gen_data(shape[1], dtype)
+            cpu_gates, npu_gates = self.gen_data_gates(shape[0], dtype)
+            cpu_indices, npu_indices = self.gen_data_indices(shape[0])
+            cpu_locations, npu_locations = self.gen_data_locations(shape[0])
+            cpu_grad1, cpu_grad2 = self.cpu_to_exec(cpu_x, cpu_gates, cpu_indices, cpu_locations, capacity, batch_size,
+                                                    sample_size, hidden, dtype)
+            npu_grad1, npu_grad2 = self.npu_to_exec(npu_x, npu_gates, npu_indices, npu_locations, capacity)
+            if dtype == torch.bfloat16 or dtype == torch.float16:
+                cpu_grad1 = cpu_grad1.to(torch.float32)
+                cpu_grad2 = cpu_grad2.to(torch.float32)
+                npu_grad1 = npu_grad1.to(torch.float32)
+                npu_grad2 = npu_grad2.to(torch.float32)
+            self.assertRtolEqual(npu_grad1.detach().cpu().numpy(), cpu_grad1.numpy())
+            self.assertRtolEqual(npu_grad2.detach().cpu().numpy(), cpu_grad2.numpy())
+
+
+if __name__ == '__main__':
+    run_tests()