diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..602507e06123f48e0d3bfa8dfc4cb3fe365a1d0b --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.cc @@ -0,0 +1,181 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/dsp/ft04/matmulfusion.h" +#include +#include +#include "src/litert/kernel_registry.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" + +using mindspore::kernel::KERNEL_ARCH::kDSP; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_MatMulFusion; + +namespace mindspore::kernel { + +int MatMulFusionDSPKernel::Prepare() { return RET_OK; } + +int MatMulFusionDSPKernel::CheckSpecs() { + // inputs: A, B, (optional) bias; output: C + if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 && in_tensors_.size() != INPUT_TENSOR_SIZE_3) { + MS_LOG(WARNING) << "MatMulFusion expects 2 or 3 inputs, got " << in_tensors_.size(); + return RET_ERROR; + } + if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) { + MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size(); + return RET_ERROR; + } + int M = 0, N = 0, K = 0; + if (GetMNK(&M, &N, &K) != RET_OK) { + MS_LOG(WARNING) << "MatMulFusion shape inference failed."; + return RET_ERROR; + } + // Bias check if present + if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) { + auto bias_shape = in_tensors_[2]->shape(); + if (bias_shape.size() != 2 || bias_shape[0] != M || bias_shape[1] != N) { + MS_LOG(WARNING) << "Bias shape mismatch MxN: got " << bias_shape; + return RET_ERROR; + } + } + // Output shape check + auto out_shape = out_tensors_[0]->shape(); + if (out_shape.size() != 2 || out_shape[0] != M || out_shape[1] != N) { + MS_LOG(WARNING) << "Output shape mismatch expected (" << M << "," << N << ")"; + return RET_ERROR; + } + return RET_OK; +} + +int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const { + if (M == nullptr || N == nullptr || K == nullptr) return RET_ERROR; + const auto &a_shape = in_tensors_[0]->shape(); + const auto &b_shape = in_tensors_[1]->shape(); + if (a_shape.size() != 2 || b_shape.size() != 2) { + MS_LOG(WARNING) << "A/B must be rank-2"; + return RET_ERROR; + } + int aM = a_shape[0]; + int aK = a_shape[1]; + int bK = b_shape[0]; + int bN = b_shape[1]; + if (aK != bK) { + MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK; + return RET_ERROR; + } + *M = aM; + *K = aK; + *N = bN; + return RET_OK; +} + +int MatMulFusionDSPKernel::GetActTypeCode(int *code) const { + if (code == nullptr) return RET_ERROR; + // Map ActType (nnacl) -> DSP activation code used in DSP functions (NONE=0, RELU=1, RELU6=2) + int act = 0; // default NONE + auto *param = reinterpret_cast(op_parameter_); + if (param != nullptr) { + switch (param->act_type_) { + case ActType_Relu: + act = 1; + break; + case ActType_Relu6: + act = 2; + break; // DSP uses 2 for RELU6, nnacl uses enum value 3 + default: + act = 0; + break; + } + } + *code = act; + return RET_OK; +} + +int MatMulFusionDSPKernel::RunFp32() { + kernel_name_ = "fp_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunFp16() { + kernel_name_ = "hp_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunInt32() { + kernel_name_ = "i32_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunInt16() { + kernel_name_ = "i16_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} +int MatMulFusionDSPKernel::RunComplex64() { + kernel_name_ = "c64_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::Run() { + int M = 0, N = 0, K = 0; + if (GetMNK(&M, &N, &K) != RET_OK) { + MS_LOG(ERROR) << "MatMulFusion GetMNK failed"; + return RET_ERROR; + } + int act_code = 0; + (void)GetActTypeCode(&act_code); // default 0 if not set + + auto allocator = dsp_runtime_->GetAllocator(); + uint64_t a_ptr = allocator->GetDeviceMemPtr(in_tensors_[0]->data()); + uint64_t b_ptr = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); + uint64_t out_ptr = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); + uint64_t bias_ptr = 0; + if (in_tensors_.size() == INPUT_TENSOR_SIZE_3) { + bias_ptr = allocator->GetDeviceMemPtr(in_tensors_[2]->data()); + } + // Arg order must match DSP symbol prototype: A,B,C,bias,M,N,K,act_type + SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, static_cast(M), static_cast(N), + static_cast(K), static_cast(act_code)}); + + int ret = RET_ERROR; + auto dtype = in_tensors_[0]->data_type(); + if (dtype == kNumberTypeFloat32) { + ret = RunFp32(); + } else if (dtype == kNumberTypeFloat16) { + ret = RunFp16(); + } else if (dtype == kNumberTypeInt32) { + ret = RunInt32(); + } else if (dtype == kNumberTypeInt16) { + ret = RunInt16(); + } else if (dtype == kNumberTypeComplex64) { + ret = RunComplex64(); + } else { + MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast(dtype); + return RET_ERROR; + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "MatMulFusion DSP run failed"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeFloat16, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator) + +} // namespace mindspore::kernel diff --git a/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h new file mode 100644 index 0000000000000000000000000000000000000000..1a487f089c878e328cd1a9200b923716978b91d0 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft04/matmulfusion.h @@ -0,0 +1,51 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ + +#include +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +class MatMulFusionDSPKernel : public DSPKernel { + public: + using DSPKernel::DSPKernel; + ~MatMulFusionDSPKernel() override = default; + + int Prepare() override; + int CheckSpecs() override; + int Run() override; + + private: + int RunFp32(); + int RunFp16(); + int RunInt32(); + int RunInt16(); + int RunComplex64(); + + // helpers + int GetMNK(int *M, int *N, int *K) const; + int GetActTypeCode(int *code) const; + + private: + std::string kernel_name_; + uint64_t core_mask_{0xF}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_KERNEL_MATMULFUSION_H_ diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.cc b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..d17ab5738380822ad3a301dac924baaf5e73d298 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.cc @@ -0,0 +1,223 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/dsp/ft78/matmulfusion.h" +#include +#include +#include +#include "src/litert/kernel_registry.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" + +using mindspore::kernel::KERNEL_ARCH::kDSP; +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; +using mindspore::schema::PrimitiveType_MatMulFusion; + +namespace mindspore::kernel { + +int MatMulFusionDSPKernel::Prepare() { return RET_OK; } + +int MatMulFusionDSPKernel::CheckSpecs() { + if (in_tensors_.size() != INPUT_TENSOR_SIZE_2 && in_tensors_.size() != INPUT_TENSOR_SIZE_3 && + in_tensors_.size() != INPUT_TENSOR_SIZE_4) { + MS_LOG(WARNING) << "MatMulFusion expects 2, 3 or 4 inputs, got " << in_tensors_.size(); + return RET_ERROR; + } + + if (out_tensors_.size() != OUTPUT_TENSOR_SIZE_1) { + MS_LOG(WARNING) << "MatMulFusion expects 1 output, got " << out_tensors_.size(); + return RET_ERROR; + } + int M = 0; + int N = 0; + int K = 0; + if (GetMNK(&M, &N, &K) != RET_OK) { + MS_LOG(WARNING) << "MatMulFusion shape inference failed."; + return RET_ERROR; + } + + if (in_tensors_.size() == INPUT_TENSOR_SIZE_3 || in_tensors_.size() == INPUT_TENSOR_SIZE_4) { + auto bias_shape = in_tensors_[INPUT_TENSOR_SIZE_2]->shape(); + if (bias_shape.size() != 2 || bias_shape[0] != M || bias_shape[1] != N) { + MS_LOG(WARNING) << "Bias shape mismatch MxN."; + return RET_ERROR; + } + } + + auto out_shape = out_tensors_[0]->shape(); + if (out_shape.size() != 2 || out_shape[0] != M || out_shape[1] != N) { + MS_LOG(WARNING) << "Output shape mismatch expected (" << M << "," << N << ")."; + return RET_ERROR; + } + return RET_OK; +} + +int MatMulFusionDSPKernel::GetMNK(int *M, int *N, int *K) const { + if (M == nullptr || N == nullptr || K == nullptr) { + return RET_ERROR; + } + const auto &a_shape = in_tensors_[0]->shape(); + const auto &b_shape = in_tensors_[1]->shape(); + if (a_shape.size() != 2 || b_shape.size() != 2) { + MS_LOG(WARNING) << "A/B must be rank-2."; + return RET_ERROR; + } + int aM = a_shape[0]; + int aK = a_shape[1]; + int bK = b_shape[0]; + int bN = b_shape[1]; + if (aK != bK) { + MS_LOG(WARNING) << "Inner dimension mismatch: " << aK << " vs " << bK; + return RET_ERROR; + } + *M = aM; + *K = aK; + *N = bN; + return RET_OK; +} + +int MatMulFusionDSPKernel::GetActTypeCode(int *code) const { + if (code == nullptr) { + return RET_ERROR; + } + int act = 0; + auto *param = reinterpret_cast(op_parameter_); + if (param != nullptr) { + switch (param->act_type_) { + case ActType_Relu: + act = 1; + break; + case ActType_Relu6: + act = 2; + break; + default: + act = 0; + break; + } + } + *code = act; + return RET_OK; +} + +int MatMulFusionDSPKernel::RunFp32() { + kernel_name_ = "fp_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::RunFp64() { + kernel_name_ = "dp_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::RunInt32() { + kernel_name_ = "i32_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::RunInt16() { + kernel_name_ = "i16_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::RunInt8() { + kernel_name_ = "i8_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::RunComplex64() { + kernel_name_ = "c64_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::RunComplex128() { + kernel_name_ = "c128_matmulfusion_s"; + return dsp_runtime_->RunKernel(kernel_name_, kernel_args_, core_mask_); +} + +int MatMulFusionDSPKernel::Run() { + int M = 0; + int N = 0; + int K = 0; + if (GetMNK(&M, &N, &K) != RET_OK) { + MS_LOG(ERROR) << "MatMulFusion GetMNK failed"; + return RET_ERROR; + } + int act_code = 0; + (void)GetActTypeCode(&act_code); + + auto allocator = dsp_runtime_->GetAllocator(); + uint64_t a_ptr = allocator->GetDeviceMemPtr(in_tensors_[0]->data()); + uint64_t b_ptr = allocator->GetDeviceMemPtr(in_tensors_[1]->data()); + uint64_t out_ptr = allocator->GetDeviceMemPtr(out_tensors_[0]->data()); + uint64_t bias_ptr = 0; + if (in_tensors_.size() >= INPUT_TENSOR_SIZE_3) { + bias_ptr = allocator->GetDeviceMemPtr(in_tensors_[INPUT_TENSOR_SIZE_2]->data()); + } + + if (in_tensors_.size() == INPUT_TENSOR_SIZE_4) { + uint64_t mnk_ptr = allocator->GetDeviceMemPtr(in_tensors_[INPUT_TENSOR_SIZE_4 - 1]->data()); + SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, mnk_ptr}); + } else { + SetKernelArg({a_ptr, b_ptr, out_ptr, bias_ptr, static_cast(M), static_cast(N), + static_cast(K), static_cast(act_code)}); + } + + auto dtype = in_tensors_[0]->data_type(); + int ret = RET_ERROR; + switch (dtype) { + case kNumberTypeFloat32: + ret = RunFp32(); + break; + case kNumberTypeFloat64: + ret = RunFp64(); + break; + case kNumberTypeInt32: + ret = RunInt32(); + break; + case kNumberTypeInt16: + ret = RunInt16(); + break; + case kNumberTypeInt8: + ret = RunInt8(); + break; + case kNumberTypeComplex64: + ret = RunComplex64(); + break; + case kNumberTypeComplex128: + ret = RunComplex128(); + break; + default: + MS_LOG(ERROR) << "MatMulFusion unsupported dtype: " << static_cast(dtype); + return RET_ERROR; + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "MatMulFusion DSP run failed"; + return RET_ERROR; + } + return RET_OK; +} + +REG_KERNEL(kDSP, kNumberTypeFloat32, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeFloat64, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt32, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt16, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeInt8, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeComplex64, PrimitiveType_MatMulFusion, DSPKernelCreator) +REG_KERNEL(kDSP, kNumberTypeComplex128, PrimitiveType_MatMulFusion, DSPKernelCreator) + +} // namespace mindspore::kernel diff --git a/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.h b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.h new file mode 100644 index 0000000000000000000000000000000000000000..905db563b7569308cb620c921a7fdb40b06b5214 --- /dev/null +++ b/mindspore-lite/src/litert/kernel/dsp/ft78/matmulfusion.h @@ -0,0 +1,52 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_MATMULFUSION_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_MATMULFUSION_H_ + +#include +#include +#include "src/litert/kernel/dsp/dsp_kernel.h" + +namespace mindspore::kernel { +class MatMulFusionDSPKernel : public DSPKernel { + public: + using DSPKernel::DSPKernel; + ~MatMulFusionDSPKernel() override = default; + + int Prepare() override; + int CheckSpecs() override; + int Run() override; + + private: + int RunFp32(); + int RunFp64(); + int RunInt32(); + int RunInt16(); + int RunInt8(); + int RunComplex64(); + int RunComplex128(); + + int GetMNK(int *M, int *N, int *K) const; + int GetActTypeCode(int *code) const; + + private: + std::string kernel_name_; + uint64_t core_mask_{0xff}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_DSP_FT78_MATMULFUSION_H_ diff --git a/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc new file mode 100644 index 0000000000000000000000000000000000000000..597856545417702914bea083d7f080e363cfbb60 --- /dev/null +++ b/mindspore-lite/test/ut/src/runtime/kernel/dsp/matmulfusion_tests.cc @@ -0,0 +1,1031 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "ut/src/runtime/kernel/dsp/dsp_test.h" +#include "include/api/context.h" +#include "include/api/data_type.h" +#include "include/api/model.h" +#include "schema/inner/model_generated.h" +#include "src/litert/kernel_registry.h" +#include "src/litert/kernel/cpu/nnacl_c/matmul_parameter.h" + +namespace mindspore::lite::dsp::test { + +class TestDSP_MatMulFusion : public DSPCommonTest {}; + +static void FillFloat(float *data, int size, float base = 0.1f) { + for (int i = 0; i < size; ++i) { + data[i] = base * static_cast((i % 10)); + } +} + +typedef uint16_t float16_t_u; +static inline float16_t_u Fp32ToFp16Bits(float v) { + uint32_t bits; + std::memcpy(&bits, &v, sizeof(bits)); + uint32_t sign = (bits >> 31) & 0x1; + int32_t exponent = ((bits >> 23) & 0xFF) - 127 + 15; + uint32_t mantissa = bits & 0x007FFFFF; + uint16_t result; + if (exponent <= 0) { + if (exponent < -10) { + result = static_cast(sign << 15); + } else { + mantissa |= 0x00800000; + int shift = 14 - exponent; + uint32_t mantissa_shifted = mantissa >> shift; + uint32_t remainder = mantissa & ((1U << shift) - 1); + if (remainder > (1U << (shift - 1)) || (remainder == (1U << (shift - 1)) && (mantissa_shifted & 1))) { + mantissa_shifted++; + } + result = static_cast((sign << 15) | (mantissa_shifted & 0x3FF)); + } + } else if (exponent == 0xFF - 127 + 15) { + result = static_cast((sign << 15) | (mantissa == 0 ? 0x7C00 : 0x7E00)); + } else if (exponent > 30) { + result = static_cast((sign << 15) | 0x7C00); + } else { + uint32_t mantissa_rounded = mantissa >> 13; + uint32_t remainder = mantissa & 0x1FFF; + if (remainder > 0x1000 || (remainder == 0x1000 && (mantissa_rounded & 1))) { + mantissa_rounded++; + if (mantissa_rounded == 0x400) { + mantissa_rounded = 0; + exponent++; + if (exponent > 30) return static_cast((sign << 15) | 0x7C00); + } + } + result = static_cast((sign << 15) | (static_cast(exponent) << 10) | (mantissa_rounded & 0x3FF)); + } + return result; +} + +#ifdef SUPPORT_FT78 +static inline int GetActCode(int act_type) { + if (act_type == ActType_Relu) { + return 1; + } + if (act_type == ActType_Relu6) { + return 2; + } + return 0; +} + +static lite::Tensor *CreateFT78ParamTensor(const std::shared_ptr &allocator, int M, int N, int K, + int act_code) { + std::vector param_shape = {4}; + auto tensor = new lite::Tensor(kNumberTypeInt32, param_shape, NHWC, lite::Category::CONST_TENSOR); + tensor->MallocData(allocator); + auto data = reinterpret_cast(tensor->MutableData()); + data[0] = M; + data[1] = N; + data[2] = K; + data[3] = act_code; + return tensor; +} +#endif + +#ifndef SUPPORT_FT78 +// Large size tests (M=N=K=256) across dtypes +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + FillFloat(reinterpret_cast(t_A->MutableData()), M * K, 0.02f); + FillFloat(reinterpret_cast(t_B->MutableData()), K * N, 0.03f); + FillFloat(reinterpret_cast(t_bias->MutableData()), M * N, 0.005f); + std::memset(t_out->MutableData(), 0, M * N * sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + std::vector expect(M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.f; + for (int k = 0; k < K; ++k) { + sum += A[m * K + k] * B[k * N + n]; + } + sum += bias[m * N + n]; + expect[m * N + n] = sum > 0.f ? sum : 0.f; + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp16_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeFloat16, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeFloat16, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeFloat16, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeFloat16, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A16 = reinterpret_cast(t_A->MutableData()); + auto B16 = reinterpret_cast(t_B->MutableData()); + auto bias16 = reinterpret_cast(t_bias->MutableData()); + auto C16 = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A16[i] = Fp32ToFp16Bits(0.01f * static_cast(i % 13)); + } + for (int i = 0; i < K * N; ++i) { + B16[i] = Fp32ToFp16Bits(0.02f * static_cast(i % 17)); + } + for (int i = 0; i < M * N; ++i) { + bias16[i] = Fp32ToFp16Bits(0.003f * static_cast(i % 11)); + } + std::memset(C16, 0, M * N * sizeof(uint16_t)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat16, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + auto Fp16ToFp32 = [&](uint16_t h) { + uint32_t sign = (h & 0x8000) << 16; + uint32_t exp = (h & 0x7C00) >> 10; + uint32_t frac = (h & 0x03FF); + uint32_t fexp, ffrac; + if (exp == 0) { + if (frac == 0) { + fexp = 0; + ffrac = 0; + } else { + int shift = 0; + while ((frac & 0x0200) == 0) { + frac <<= 1; + ++shift; + } + frac &= 0x03FF; + fexp = 127 - 15 - shift; + ffrac = frac << 13; + } + } else if (exp == 0x1F) { + fexp = 255; + ffrac = frac << 13; + } else { + fexp = exp - 15 + 127; + ffrac = frac << 13; + } + uint32_t bits = sign | (fexp << 23) | ffrac; + float out; + std::memcpy(&out, &bits, sizeof(out)); + return out; + }; + std::vector expect_fp32(M * N, 0.f); + std::vector actual_fp32(M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.f; + for (int k = 0; k < K; ++k) { + float a = Fp16ToFp32(A16[m * K + k]); + float b = Fp16ToFp32(B16[k * N + n]); + sum += a * b; + } + sum += Fp16ToFp32(bias16[m * N + n]); + expect_fp32[m * N + n] = sum > 0.f ? sum : 0.f; + actual_fp32[m * N + n] = Fp16ToFp32(C16[m * N + n]); + } + } + ASSERT_EQ(0, CompareOutputData(actual_fp32.data(), expect_fp32.data(), M * N, 5e-2)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = (i % 11) - 5; + } + for (int i = 0; i < K * N; ++i) { + B[i] = (i % 13) - 6; + } + for (int i = 0; i < M * N; ++i) { + bias[i] = (i % 9) - 4; + } + std::memset(C, 0, M * N * sizeof(int32_t)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(M * N, 0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + int64_t sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * B[k * N + n]; + } + sum += static_cast(bias[m * N + n]); + expect[m * N + n] = static_cast(sum > 0 ? sum : 0); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt16, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt16, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt16, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt16, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = static_cast((i % 21) - 10); + } + for (int i = 0; i < K * N; ++i) { + B[i] = static_cast((i % 19) - 9); + } + for (int i = 0; i < M * N; ++i) { + bias[i] = static_cast(i % 15); + } + std::memset(C, 0, M * N * sizeof(int16_t)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(M * N, 0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + int64_t sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * B[k * N + n]; + } + sum += static_cast(bias[m * N + n]); + sum = sum > 0 ? sum : 0; + if (sum > std::numeric_limits::max()) sum = std::numeric_limits::max(); + expect[m * N + n] = static_cast(sum); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); // complex64 stored as interleaved real,imag + for (int i = 0; i < M * K; ++i) { + A[2 * i] = 0.01f * (i % 17); + A[2 * i + 1] = 0.02f * (i % 19); + } + for (int i = 0; i < K * N; ++i) { + B[2 * i] = 0.03f * (i % 23); + B[2 * i + 1] = 0.01f * (i % 29); + } + for (int i = 0; i < M * N; ++i) { + bias[2 * i] = 0.002f * (i % 31); + bias[2 * i + 1] = 0.001f * (i % 37); + } + std::memset(C, 0, M * N * 2 * sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(2 * M * N, 0.f); + std::vector actual(2 * M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float real = 0.f; + float imag = 0.f; + for (int k = 0; k < K; ++k) { + float ar = A[2 * (m * K + k)]; + float ai = A[2 * (m * K + k) + 1]; + float br = B[2 * (k * N + n)]; + float bi = B[2 * (k * N + n) + 1]; + real += ar * br - ai * bi; + imag += ar * bi + ai * br; + } + real += bias[2 * (m * N + n)]; + imag += bias[2 * (m * N + n) + 1]; + if (real < 0.f) real = 0.f; + expect[2 * (m * N + n)] = real; + expect[2 * (m * N + n) + 1] = imag; + actual[2 * (m * N + n)] = C[2 * (m * N + n)]; + actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1]; + } + } + ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; +} +#endif + +#ifdef SUPPORT_FT78 +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp32_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeFloat32, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeFloat32, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeFloat32, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeFloat32, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + FillFloat(reinterpret_cast(t_A->MutableData()), M * K, 0.02f); + FillFloat(reinterpret_cast(t_B->MutableData()), K * N, 0.03f); + FillFloat(reinterpret_cast(t_bias->MutableData()), M * N, 0.005f); + std::memset(t_out->MutableData(), 0, M * N * sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat32, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + std::vector expect(M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float sum = 0.f; + for (int k = 0; k < K; ++k) { + sum += A[m * K + k] * B[k * N + n]; + } + sum += bias[m * N + n]; + expect[m * N + n] = sum > 0.f ? sum : 0.f; + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-3)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int32_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt32, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt32, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt32, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt32, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = (i % 11) - 5; + } + for (int i = 0; i < K * N; ++i) { + B[i] = (i % 13) - 6; + } + for (int i = 0; i < M * N; ++i) { + bias[i] = (i % 9) - 4; + } + std::memset(C, 0, M * N * sizeof(int32_t)); + std::vector inputs_{t_A, t_B, t_bias}; + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt32, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(M * N, 0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + int64_t sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * B[k * N + n]; + } + sum += static_cast(bias[m * N + n]); + expect[m * N + n] = static_cast(sum > 0 ? sum : 0); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int16_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt16, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt16, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt16, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt16, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = static_cast((i % 21) - 10); + } + for (int i = 0; i < K * N; ++i) { + B[i] = static_cast((i % 19) - 9); + } + for (int i = 0; i < M * N; ++i) { + bias[i] = static_cast(i % 15); + } + std::memset(C, 0, M * N * sizeof(int16_t)); + std::vector inputs_{t_A, t_B, t_bias}; + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt16, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(M * N, 0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + int64_t sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * B[k * N + n]; + } + sum += static_cast(bias[m * N + n]); + sum = sum > 0 ? sum : 0; + if (sum > std::numeric_limits::max()) sum = std::numeric_limits::max(); + expect[m * N + n] = static_cast(sum); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.f)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex64_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeComplex64, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeComplex64, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeComplex64, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeComplex64, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[2 * i] = 0.01f * (i % 17); + A[2 * i + 1] = 0.02f * (i % 19); + } + for (int i = 0; i < K * N; ++i) { + B[2 * i] = 0.03f * (i % 23); + B[2 * i + 1] = 0.01f * (i % 29); + } + for (int i = 0; i < M * N; ++i) { + bias[2 * i] = 0.002f * (i % 31); + bias[2 * i + 1] = 0.001f * (i % 37); + } + std::memset(C, 0, M * N * 2 * sizeof(float)); + std::vector inputs_{t_A, t_B, t_bias}; + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex64, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(2 * M * N, 0.f); + std::vector actual(2 * M * N, 0.f); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + float real = 0.f; + float imag = 0.f; + for (int k = 0; k < K; ++k) { + float ar = A[2 * (m * K + k)]; + float ai = A[2 * (m * K + k) + 1]; + float br = B[2 * (k * N + n)]; + float bi = B[2 * (k * N + n) + 1]; + real += ar * br - ai * bi; + imag += ar * bi + ai * br; + } + real += bias[2 * (m * N + n)]; + imag += bias[2 * (m * N + n) + 1]; + if (real < 0.f) real = 0.f; + expect[2 * (m * N + n)] = real; + expect[2 * (m * N + n) + 1] = imag; + actual[2 * (m * N + n)] = C[2 * (m * N + n)]; + actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1]; + } + } + ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 5e-2)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Fp64_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeFloat64, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeFloat64, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeFloat64, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeFloat64, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = 0.015 * static_cast(i % 13); + } + for (int i = 0; i < K * N; ++i) { + B[i] = 0.018 * static_cast(i % 17); + } + for (int i = 0; i < M * N; ++i) { + bias[i] = 0.004 * static_cast(i % 19); + } + std::fill_n(C, M * N, 0.0); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + std::vector inputs_{t_A, t_B, t_bias}; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeFloat64, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(M * N, 0.0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + double sum = 0.0; + for (int k = 0; k < K; ++k) { + sum += A[m * K + k] * B[k * N + n]; + } + sum += bias[m * N + n]; + expect[m * N + n] = sum > 0.0 ? sum : 0.0; + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 1e-6)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Int8_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 32, K = 32, N = 32; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeInt8, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeInt8, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeInt8, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeInt8, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[i] = static_cast((i % 7) - 3); + } + for (int i = 0; i < K * N; ++i) { + B[i] = static_cast((i % 9) - 4); + } + for (int i = 0; i < M * N; ++i) { + bias[i] = static_cast(i % 5 - 2); + } + std::fill_n(C, M * N, static_cast(0)); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + std::vector inputs_{t_A, t_B, t_bias}; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeInt8, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(M * N, static_cast(0)); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + int32_t sum = 0; + for (int k = 0; k < K; ++k) { + sum += static_cast(A[m * K + k]) * static_cast(B[k * N + n]); + } + sum += static_cast(bias[m * N + n]); + sum = sum < 0 ? 0 : sum; + if (sum > std::numeric_limits::max()) { + sum = std::numeric_limits::max(); + } + expect[m * N + n] = static_cast(sum); + } + } + ASSERT_EQ(0, CompareOutputData(C, expect.data(), M * N, 0.0f)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} + +TEST_F(TestDSP_MatMulFusion, MatMulFusion_Complex128_Large_BiasRelu_FT78) { + InitDSPRuntime(); + const int M = 256, K = 256, N = 256; + std::vector a_shape = {M, K}; + std::vector b_shape = {K, N}; + std::vector out_shape = {M, N}; + std::vector bias_shape = {M, N}; + auto t_A = new lite::Tensor(kNumberTypeComplex128, a_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_B = new lite::Tensor(kNumberTypeComplex128, b_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_bias = new lite::Tensor(kNumberTypeComplex128, bias_shape, NHWC, lite::Category::CONST_TENSOR); + auto t_out = new lite::Tensor(kNumberTypeComplex128, out_shape, NHWC, lite::Category::CONST_TENSOR); + t_A->MallocData(allocator_); + t_B->MallocData(allocator_); + t_bias->MallocData(allocator_); + t_out->MallocData(allocator_); + auto A = reinterpret_cast(t_A->MutableData()); + auto B = reinterpret_cast(t_B->MutableData()); + auto bias = reinterpret_cast(t_bias->MutableData()); + auto C = reinterpret_cast(t_out->MutableData()); + for (int i = 0; i < M * K; ++i) { + A[2 * i] = 0.01f * (i % 17); + A[2 * i + 1] = 0.02f * (i % 19); + } + for (int i = 0; i < K * N; ++i) { + B[2 * i] = 0.03f * (i % 23); + B[2 * i + 1] = 0.01f * (i % 29); + } + for (int i = 0; i < M * N; ++i) { + bias[2 * i] = 0.002f * (i % 31); + bias[2 * i + 1] = 0.001f * (i % 37); + } + std::memset(C, 0, M * N * 2 * sizeof(double)); + std::vector inputs_{t_A, t_B, t_bias}; + std::vector outputs_{t_out}; + auto ctx = new lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx->Init()); + auto *param = new MatMulParameter(); + param->op_parameter_.type_ = static_cast(schema::PrimitiveType_MatMulFusion); + param->act_type_ = ActType_Relu; + param->has_bias_ = true; + param->row_ = M; + param->col_ = N; + param->deep_ = K; + int act_code = GetActCode(param->act_type_); + auto t_param = CreateFT78ParamTensor(allocator_, M, N, K, act_code); + inputs_.push_back(t_param); + kernel::KernelKey key = {kernel::KERNEL_ARCH::kDSP, kNumberTypeComplex128, NHWC, schema::PrimitiveType_MatMulFusion}; + auto creator = KernelRegistry::GetInstance()->GetCreator(key); + ASSERT_NE(creator, nullptr); + auto kernel = creator(inputs_, outputs_, reinterpret_cast(param), ctx, key); + ASSERT_NE(kernel, nullptr); + ASSERT_EQ(kernel->Prepare(), lite::RET_OK); + ASSERT_EQ(kernel->Run(), lite::RET_OK); + std::vector expect(2 * M * N, 0.0); + std::vector actual(2 * M * N, 0.0); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + double real = 0.0; + double imag = 0.0; + for (int k = 0; k < K; ++k) { + double ar = A[2 * (m * K + k)]; + double ai = A[2 * (m * K + k) + 1]; + double br = B[2 * (k * N + n)]; + double bi = B[2 * (k * N + n) + 1]; + real += ar * br - ai * bi; + imag += ar * bi + ai * br; + } + real += bias[2 * (m * N + n)]; + imag += bias[2 * (m * N + n) + 1]; + if (real < 0.0) { + real = 0.0; + } + expect[2 * (m * N + n)] = real; + expect[2 * (m * N + n) + 1] = imag; + actual[2 * (m * N + n)] = C[2 * (m * N + n)]; + actual[2 * (m * N + n) + 1] = C[2 * (m * N + n) + 1]; + } + } + ASSERT_EQ(0, CompareOutputData(actual.data(), expect.data(), 2 * M * N, 1e-3)); + UninitDSPRuntime(); + delete ctx; + delete kernel; + delete t_A; + delete t_B; + delete t_bias; + delete t_param; + delete t_out; +} +#endif + +} // namespace mindspore::lite::dsp::test