From faaf47cff5a61f30a9b4a72fc50a7f6ce21e4a00 Mon Sep 17 00:00:00 2001 From: sunsuodong Date: Sat, 19 Feb 2022 21:50:38 -0800 Subject: [PATCH] support arm64 fp16 Signed-off-by: sunsuodong --- .../kernel_compiler/cpu/nnacl/BUILD.gn | 30 ++++ mindspore/lite/BUILD.gn | 132 ++++++++---------- 2 files changed, 85 insertions(+), 77 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/BUILD.gn b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/BUILD.gn index 0637531d21..e5ca589411 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/BUILD.gn +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/BUILD.gn @@ -16,6 +16,9 @@ import("//build/ohos.gni") ohos_source_set("nnacl_o") { sources = [ + "assembly/opt/MatmulDpInt8.S", + "assembly/opt/MatmulDpInt8Opt.S", + "assembly/opt/MatmulOptR4Int8.S", "base/arithmetic_base.c", "base/batch_to_space_base.c", "base/broadcast_to.c", @@ -376,6 +379,25 @@ ohos_source_set("nnacl_o") { "assembly/arm64/TiledC4MatmulFp32.S", "assembly/arm64/WinogradTransLeft.S", "assembly/arm64/WinogradTransRight.S", + "assembly/fp16/ConvDwFp16Border.S", + "assembly/fp16/ConvDwFp16Center.S", + "assembly/fp16/ConvDwFp16Row.S", + "assembly/fp16/DeconvDwFp16Border.S", + "assembly/fp16/DeconvDwFp16Center.S", + "assembly/fp16/Float16ToFloat32.S", + "assembly/fp16/Float32ToFloat16.S", + "assembly/fp16/MatVecMulFp16.S", + "assembly/fp16/Matmul12X16Fp16.S", + "assembly/fp16/MatmulBaseFp16Neon.S", + "assembly/fp16/MatmulFp16.S", + "assembly/fp16/MatmulFp16Opt.S", + "assembly/fp16/MatmulWinogradFp16.S", + "assembly/fp16/PostFuncBiasReluC4Fp16.S", + "assembly/fp16/PostFuncBiasReluC8Fp16.S", + "assembly/fp16/TiledC4MatmulFp16.S", + "assembly/fp16/VecMatmulFp16.S", + "assembly/fp16/WinogradTransLeftFp16.S", + "assembly/fp16/WinogradTransRightFp16.S", "fp16/activation_fp16.c", "fp16/arg_min_max_fp16.c", "fp16/arithmetic_fp16.c", @@ -452,5 +474,13 @@ ohos_source_set("nnacl_o") { ] } + cflags_c = [ + "-march=armv8.2-a+dotprod+fp16", + "-fomit-frame-pointer", + "-fstrict-aliasing", + "-ffast-math", + ] + asmflags = [ "-march=armv8.2-a+dotprod+fp16" ] + part_name = "mindspore" } diff --git a/mindspore/lite/BUILD.gn b/mindspore/lite/BUILD.gn index 45d16454a1..94dc444842 100644 --- a/mindspore/lite/BUILD.gn +++ b/mindspore/lite/BUILD.gn @@ -530,6 +530,61 @@ ohos_shared_library("mindspore_lib") { "tools/converter/quantizer/fse_decoder.cc", ] + if (target_cpu == "arm64") { + sources += [ + "src/runtime/kernel/arm/fp16/activation_fp16.cc", + "src/runtime/kernel/arm/fp16/addn_fp16.cc", + "src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc", + "src/runtime/kernel/arm/fp16/arithmetic_fp16.cc", + "src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc", + "src/runtime/kernel/arm/fp16/batchnorm_fp16.cc", + "src/runtime/kernel/arm/fp16/biasadd_fp16.cc", + "src/runtime/kernel/arm/fp16/cast_fp16.cc", + "src/runtime/kernel/arm/fp16/common_fp16.cc", + "src/runtime/kernel/arm/fp16/concat_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_fp16.cc", + "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc", + "src/runtime/kernel/arm/fp16/crop_fp16.cc", + "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc", + "src/runtime/kernel/arm/fp16/deconvolution_fp16.cc", + "src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc", + "src/runtime/kernel/arm/fp16/depth_to_space_fp16.cc", + "src/runtime/kernel/arm/fp16/exp_fp16.cc", + "src/runtime/kernel/arm/fp16/fill_fp16.cc", + "src/runtime/kernel/arm/fp16/fullconnection_fp16.cc", + "src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.cc", + "src/runtime/kernel/arm/fp16/gather_fp16.cc", + "src/runtime/kernel/arm/fp16/group_convolution_fp16.cc", + "src/runtime/kernel/arm/fp16/gru_fp16.cc", + "src/runtime/kernel/arm/fp16/instance_norm_fp16.cc", + "src/runtime/kernel/arm/fp16/layer_norm_fp16.cc", + "src/runtime/kernel/arm/fp16/layout_transform_fp16.cc", + "src/runtime/kernel/arm/fp16/log_softmax_fp16.cc", + "src/runtime/kernel/arm/fp16/lstm_fp16.cc", + "src/runtime/kernel/arm/fp16/matmul_base_fp16.cc", + "src/runtime/kernel/arm/fp16/matmul_fp16.cc", + "src/runtime/kernel/arm/fp16/pad_fp16.cc", + "src/runtime/kernel/arm/fp16/pooling_fp16.cc", + "src/runtime/kernel/arm/fp16/power_fp16.cc", + "src/runtime/kernel/arm/fp16/prelu_fp16.cc", + "src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc", + "src/runtime/kernel/arm/fp16/ragged_range_fp16.cc", + "src/runtime/kernel/arm/fp16/reduce_fp16.cc", + "src/runtime/kernel/arm/fp16/resize_fp16.cc", + "src/runtime/kernel/arm/fp16/scale_fp16.cc", + "src/runtime/kernel/arm/fp16/slice_fp16.cc", + "src/runtime/kernel/arm/fp16/softmax_fp16.cc", + "src/runtime/kernel/arm/fp16/stack_fp16.cc", + "src/runtime/kernel/arm/fp16/transpose_fp16.cc", + "src/runtime/kernel/arm/fp16/where_fp16.cc", + ] + } + include_dirs = [ "./", "../../", @@ -573,10 +628,6 @@ ohos_shared_library("mindspore_lib") { "../core/mindrt/:mindrt_o", ] - if (target_cpu == "arm64") { - deps += [ ":arm_fp16_cc_o" ] - } - output_name = "libmindspore-lite.huawei" output_extension = "so" @@ -584,79 +635,6 @@ ohos_shared_library("mindspore_lib") { part_name = "mindspore" } -ohos_source_set("arm_fp16_cc_o") { - sources = [ - "src/runtime/kernel/arm/fp16/activation_fp16.cc", - "src/runtime/kernel/arm/fp16/addn_fp16.cc", - "src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc", - "src/runtime/kernel/arm/fp16/arithmetic_fp16.cc", - "src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc", - "src/runtime/kernel/arm/fp16/batchnorm_fp16.cc", - "src/runtime/kernel/arm/fp16/biasadd_fp16.cc", - "src/runtime/kernel/arm/fp16/cast_fp16.cc", - "src/runtime/kernel/arm/fp16/common_fp16.cc", - "src/runtime/kernel/arm/fp16/concat_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_depthwise_3x3_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_fp16.cc", - "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc", - "src/runtime/kernel/arm/fp16/crop_fp16.cc", - "src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc", - "src/runtime/kernel/arm/fp16/deconvolution_fp16.cc", - "src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc", - "src/runtime/kernel/arm/fp16/depth_to_space_fp16.cc", - "src/runtime/kernel/arm/fp16/exp_fp16.cc", - "src/runtime/kernel/arm/fp16/fill_fp16.cc", - "src/runtime/kernel/arm/fp16/fullconnection_fp16.cc", - "src/runtime/kernel/arm/fp16/fused_batchnorm_fp16.cc", - "src/runtime/kernel/arm/fp16/gather_fp16.cc", - "src/runtime/kernel/arm/fp16/group_convolution_fp16.cc", - "src/runtime/kernel/arm/fp16/gru_fp16.cc", - "src/runtime/kernel/arm/fp16/instance_norm_fp16.cc", - "src/runtime/kernel/arm/fp16/layer_norm_fp16.cc", - "src/runtime/kernel/arm/fp16/layout_transform_fp16.cc", - "src/runtime/kernel/arm/fp16/log_softmax_fp16.cc", - "src/runtime/kernel/arm/fp16/lstm_fp16.cc", - "src/runtime/kernel/arm/fp16/matmul_base_fp16.cc", - "src/runtime/kernel/arm/fp16/matmul_fp16.cc", - "src/runtime/kernel/arm/fp16/pad_fp16.cc", - "src/runtime/kernel/arm/fp16/pooling_fp16.cc", - "src/runtime/kernel/arm/fp16/power_fp16.cc", - "src/runtime/kernel/arm/fp16/prelu_fp16.cc", - "src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc", - "src/runtime/kernel/arm/fp16/ragged_range_fp16.cc", - "src/runtime/kernel/arm/fp16/reduce_fp16.cc", - "src/runtime/kernel/arm/fp16/resize_fp16.cc", - "src/runtime/kernel/arm/fp16/scale_fp16.cc", - "src/runtime/kernel/arm/fp16/slice_fp16.cc", - "src/runtime/kernel/arm/fp16/softmax_fp16.cc", - "src/runtime/kernel/arm/fp16/stack_fp16.cc", - "src/runtime/kernel/arm/fp16/transpose_fp16.cc", - "src/runtime/kernel/arm/fp16/where_fp16.cc", - "src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_self_grad.cc", - "src/runtime/kernel/arm/fp16_grad/bias_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/bn_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_filter.cc", - "src/runtime/kernel/arm/fp16_grad/convolution_fp16_grad_input.cc", - "src/runtime/kernel/arm/fp16_grad/dropout_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/layernorm_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/neg_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/pooling_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/resize_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/strided_slice_fp16_grad.cc", - "src/runtime/kernel/arm/fp16_grad/unsorted_segment_sum_fp16.cc", - ] - - include_dirs = [ "../ccsrc/backend/kernel_compiler/cpu/" ] - - part_name = "mindspore" -} - action("third_party") { script = "get_thirdparty.sh" outputs = [ "$root_out_dir/ai/mindspore/log.txt" ] -- Gitee